diff --git a/Detectors/TRD/workflow/src/TRDGlobalTrackingSpec.cxx b/Detectors/TRD/workflow/src/TRDGlobalTrackingSpec.cxx index 424657ac19426..375fa732007cc 100644 --- a/Detectors/TRD/workflow/src/TRDGlobalTrackingSpec.cxx +++ b/Detectors/TRD/workflow/src/TRDGlobalTrackingSpec.cxx @@ -51,6 +51,8 @@ #include "GPUTRDTrackletWord.h" #include "GPUTRDInterfaces.h" #include "GPUTRDGeometry.h" +#include "GPUConstantMem.h" +#include "GPUTRDTrackerKernels.h" #ifdef ENABLE_UPGRADES #include "ITS3Reconstruction/IOUtils.h" diff --git a/GPU/GPUTracking/Base/GPUProcessor.cxx b/GPU/GPUTracking/Base/GPUProcessor.cxx index 8a18f71d535e3..82627fb00723c 100644 --- a/GPU/GPUTracking/Base/GPUProcessor.cxx +++ b/GPU/GPUTracking/Base/GPUProcessor.cxx @@ -14,7 +14,7 @@ #include "GPUProcessor.h" #include "GPUReconstruction.h" -#include "GPUReconstructionDeviceBase.h" +#include "GPUSettings.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index acca74e57a80e..c79c743e96ce5 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -30,7 +30,9 @@ #include "GPUROOTDumpCore.h" #include "GPUConfigDump.h" #include "GPUChainTracking.h" +#include "GPUConstantMem.h" #include "GPUCommonHelpers.h" +#include "GPUSettings.h" #include "GPUMemoryResource.h" #include "GPUChain.h" @@ -75,10 +77,10 @@ constexpr GPUReconstruction::GeometryType GPUReconstruction::geometryType; static ptrdiff_t ptrDiff(void* a, void* b) { return (char*)a - (char*)b; } -GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHostConstantMem(new GPUConstantMem), mDeviceBackendSettings(cfg) +GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHostConstantMem(new GPUConstantMem), mGRPSettings(new GPUSettingsGRP), mDeviceBackendSettings(new GPUSettingsDeviceBackend(cfg)), mProcessingSettings(new GPUSettingsProcessing) { if (cfg.master) { - if 
(cfg.master->mDeviceBackendSettings.deviceType != cfg.deviceType) { + if (cfg.master->GetDeviceBackendSettings().deviceType != cfg.deviceType) { throw std::invalid_argument("device type of master and slave GPUReconstruction does not match"); } if (cfg.master->mMaster) { @@ -87,7 +89,7 @@ GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHos mMaster = cfg.master; cfg.master->mSlaves.emplace_back(this); } - param().SetDefaults(&mGRPSettings); + param().SetDefaults(mGRPSettings.get()); mMemoryScalers.reset(new GPUMemorySizeScalers); for (uint32_t i = 0; i < NSECTORS; i++) { processors()->tpcTrackers[i].SetSector(i); // TODO: Move to a better place @@ -148,7 +150,7 @@ int32_t GPUReconstruction::Init() if (InitDevice()) { return 1; } - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { mHostMemoryPoolEnd = (char*)mHostMemoryBase + mHostMemorySize; mDeviceMemoryPoolEnd = (char*)mDeviceMemoryBase + mDeviceMemorySize; } else { @@ -213,7 +215,7 @@ static uint32_t getDefaultNThreads() int32_t GPUReconstruction::InitPhaseBeforeDevice() { - if (mProcessingSettings.printSettings) { + if (GetProcessingSettings().printSettings) { if (mSlaves.size() || mMaster) { printf("\nConfig Dump %s\n", mMaster ? "Slave" : "Master"); } @@ -223,7 +225,7 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() break; } } - GPUConfigDump::dumpConfig(&param().rec, &mProcessingSettings, chTrk ? chTrk->GetQAConfig() : nullptr, chTrk ? chTrk->GetEventDisplayConfig() : nullptr, &mDeviceBackendSettings, &mRecoSteps); + GPUConfigDump::dumpConfig(&param().rec, mProcessingSettings.get(), chTrk ? 
chTrk->GetEventDisplayConfig() : nullptr, mDeviceBackendSettings.get(), &mRecoSteps); } mRecoSteps.stepsGPUMask &= mRecoSteps.steps; mRecoSteps.stepsGPUMask &= AvailableGPURecoSteps(); @@ -231,95 +233,95 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() mRecoSteps.stepsGPUMask.set((uint8_t)0); } - if (mProcessingSettings.forceMemoryPoolSize >= 1024 || mProcessingSettings.forceHostMemoryPoolSize >= 1024) { - mProcessingSettings.memoryAllocationStrategy = GPUMemoryResource::ALLOCATION_GLOBAL; + if (GetProcessingSettings().forceMemoryPoolSize >= 1024 || GetProcessingSettings().forceHostMemoryPoolSize >= 1024) { + mProcessingSettings->memoryAllocationStrategy = GPUMemoryResource::ALLOCATION_GLOBAL; } - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_AUTO) { - mProcessingSettings.memoryAllocationStrategy = IsGPU() ? GPUMemoryResource::ALLOCATION_GLOBAL : GPUMemoryResource::ALLOCATION_INDIVIDUAL; + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_AUTO) { + mProcessingSettings->memoryAllocationStrategy = IsGPU() ? 
GPUMemoryResource::ALLOCATION_GLOBAL : GPUMemoryResource::ALLOCATION_INDIVIDUAL; } - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { - mProcessingSettings.forceMemoryPoolSize = mProcessingSettings.forceHostMemoryPoolSize = 0; + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { + mProcessingSettings->forceMemoryPoolSize = mProcessingSettings->forceHostMemoryPoolSize = 0; } - if (mProcessingSettings.debugLevel >= 4) { - mProcessingSettings.keepAllMemory = true; + if (GetProcessingSettings().debugLevel >= 4) { + mProcessingSettings->keepAllMemory = true; } - if (mProcessingSettings.debugLevel >= 5 && mProcessingSettings.allocDebugLevel < 2) { - mProcessingSettings.allocDebugLevel = 2; + if (GetProcessingSettings().debugLevel >= 5 && GetProcessingSettings().allocDebugLevel < 2) { + mProcessingSettings->allocDebugLevel = 2; } - if (mProcessingSettings.eventDisplay || mProcessingSettings.keepAllMemory) { - mProcessingSettings.keepDisplayMemory = true; + if (GetProcessingSettings().eventDisplay || GetProcessingSettings().keepAllMemory) { + mProcessingSettings->keepDisplayMemory = true; } - if (mProcessingSettings.debugLevel < 6) { - mProcessingSettings.debugMask = 0; + if (GetProcessingSettings().debugLevel < 6) { + mProcessingSettings->debugMask = 0; } - if (mProcessingSettings.debugLevel < 1) { - mProcessingSettings.deviceTimers = false; + if (GetProcessingSettings().debugLevel < 1) { + mProcessingSettings->deviceTimers = false; } - if (mProcessingSettings.debugLevel > 0) { - mProcessingSettings.recoTaskTiming = true; + if (GetProcessingSettings().debugLevel > 0) { + mProcessingSettings->recoTaskTiming = true; } - if (mProcessingSettings.deterministicGPUReconstruction == -1) { - mProcessingSettings.deterministicGPUReconstruction = mProcessingSettings.debugLevel >= 6; + if (GetProcessingSettings().deterministicGPUReconstruction == -1) { + 
mProcessingSettings->deterministicGPUReconstruction = GetProcessingSettings().debugLevel >= 6; } - if (mProcessingSettings.deterministicGPUReconstruction) { + if (GetProcessingSettings().deterministicGPUReconstruction) { #ifndef GPUCA_DETERMINISTIC_MODE GPUError("WARNING, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!"); #endif - mProcessingSettings.overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU; + mProcessingSettings->overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU; param().rec.tpc.nWaysOuter = true; if (param().rec.tpc.looperInterpolationInExtraPass == -1) { param().rec.tpc.looperInterpolationInExtraPass = 0; } - if (mProcessingSettings.createO2Output > 1) { - mProcessingSettings.createO2Output = 1; + if (GetProcessingSettings().createO2Output > 1) { + mProcessingSettings->createO2Output = 1; } - mProcessingSettings.rtc.deterministic = 1; + mProcessingSettings->rtc.deterministic = 1; } else { #ifdef GPUCA_DETERMINISTIC_MODE GPUError("WARNING, compiled with GPUCA_DETERMINISTIC_MODE but deterministicGPUReconstruction not set, only compile-time determinism and deterministic math enforced, not fully deterministic!"); #endif } - if (mProcessingSettings.deterministicGPUReconstruction && mProcessingSettings.debugLevel >= 6) { - mProcessingSettings.nTPCClustererLanes = 1; + if (GetProcessingSettings().deterministicGPUReconstruction && GetProcessingSettings().debugLevel >= 6) { + mProcessingSettings->nTPCClustererLanes = 1; } - if (mProcessingSettings.createO2Output > 1 && mProcessingSettings.runQA && mProcessingSettings.qcRunFraction == 100.f) { - mProcessingSettings.createO2Output = 1; + if (GetProcessingSettings().createO2Output > 1 && GetProcessingSettings().runQA && GetProcessingSettings().qcRunFraction == 100.f) { + mProcessingSettings->createO2Output = 1; } - if (!mProcessingSettings.createO2Output || 
!IsGPU()) { - mProcessingSettings.clearO2OutputFromGPU = false; + if (!GetProcessingSettings().createO2Output || !IsGPU()) { + mProcessingSettings->clearO2OutputFromGPU = false; } if (!(mRecoSteps.stepsGPUMask & GPUDataTypes::RecoStep::TPCMerging)) { - mProcessingSettings.mergerSortTracks = false; + mProcessingSettings->mergerSortTracks = false; } - if (mProcessingSettings.debugLevel > 3 || !IsGPU() || mProcessingSettings.deterministicGPUReconstruction) { - mProcessingSettings.delayedOutput = false; + if (GetProcessingSettings().debugLevel > 3 || !IsGPU() || GetProcessingSettings().deterministicGPUReconstruction) { + mProcessingSettings->delayedOutput = false; } - if (!mProcessingSettings.rtc.enable) { - mProcessingSettings.rtc.optConstexpr = false; + if (!GetProcessingSettings().rtc.enable) { + mProcessingSettings->rtc.optConstexpr = false; } - mMemoryScalers->factor = mProcessingSettings.memoryScalingFactor; - mMemoryScalers->conservative = mProcessingSettings.conservativeMemoryEstimate; - mMemoryScalers->returnMaxVal = mProcessingSettings.forceMaxMemScalers != 0; - if (mProcessingSettings.forceMaxMemScalers > 1) { - mMemoryScalers->rescaleMaxMem(mProcessingSettings.forceMaxMemScalers); + mMemoryScalers->factor = GetProcessingSettings().memoryScalingFactor; + mMemoryScalers->conservative = GetProcessingSettings().conservativeMemoryEstimate; + mMemoryScalers->returnMaxVal = GetProcessingSettings().forceMaxMemScalers != 0; + if (GetProcessingSettings().forceMaxMemScalers > 1) { + mMemoryScalers->rescaleMaxMem(GetProcessingSettings().forceMaxMemScalers); } - if (mProcessingSettings.nHostThreads != -1 && mProcessingSettings.ompThreads != -1) { + if (GetProcessingSettings().nHostThreads != -1 && GetProcessingSettings().ompThreads != -1) { GPUFatal("Must not use both nHostThreads and ompThreads at the same time!"); - } else if (mProcessingSettings.ompThreads != -1) { - mProcessingSettings.nHostThreads = mProcessingSettings.ompThreads; + } else if 
(GetProcessingSettings().ompThreads != -1) { + mProcessingSettings->nHostThreads = GetProcessingSettings().ompThreads; GPUWarning("You are using the deprecated ompThreads option, please switch to nHostThreads!"); } - if (mProcessingSettings.nHostThreads <= 0) { - mProcessingSettings.nHostThreads = internal::getDefaultNThreads(); + if (GetProcessingSettings().nHostThreads <= 0) { + mProcessingSettings->nHostThreads = internal::getDefaultNThreads(); } else { - mProcessingSettings.autoAdjustHostThreads = false; + mProcessingSettings->autoAdjustHostThreads = false; } - mMaxHostThreads = mProcessingSettings.nHostThreads; + mMaxHostThreads = GetProcessingSettings().nHostThreads; if (mMaster == nullptr) { mThreading = std::make_shared(); mThreading->control = std::make_unique(tbb::global_control::max_allowed_parallelism, mMaxHostThreads); @@ -330,26 +332,26 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() } mMaxBackendThreads = std::max(mMaxBackendThreads, mMaxHostThreads); if (IsGPU()) { - mNStreams = std::max(mProcessingSettings.nStreams, 3); + mNStreams = std::max(GetProcessingSettings().nStreams, 3); } - if (mProcessingSettings.nTPCClustererLanes == -1) { - mProcessingSettings.nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max(1, std::min(GPUCA_NSECTORS, mProcessingSettings.inKernelParallel ? (mMaxHostThreads >= 4 ? std::min(mMaxHostThreads / 2, mMaxHostThreads >= 32 ? GPUCA_NSECTORS : 4) : 1) : mMaxHostThreads)); + if (GetProcessingSettings().nTPCClustererLanes == -1) { + mProcessingSettings->nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max(1, std::min(GPUCA_NSECTORS, GetProcessingSettings().inKernelParallel ? (mMaxHostThreads >= 4 ? std::min(mMaxHostThreads / 2, mMaxHostThreads >= 32 ? 
GPUCA_NSECTORS : 4) : 1) : mMaxHostThreads)); } - if (mProcessingSettings.overrideClusterizerFragmentLen == -1) { - mProcessingSettings.overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mMaxHostThreads / mProcessingSettings.nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST; + if (GetProcessingSettings().overrideClusterizerFragmentLen == -1) { + mProcessingSettings->overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mMaxHostThreads / GetProcessingSettings().nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST; } - if (mProcessingSettings.nTPCClustererLanes > GPUCA_NSECTORS) { - GPUError("Invalid value for nTPCClustererLanes: %d", mProcessingSettings.nTPCClustererLanes); - mProcessingSettings.nTPCClustererLanes = GPUCA_NSECTORS; + if (GetProcessingSettings().nTPCClustererLanes > GPUCA_NSECTORS) { + GPUError("Invalid value for nTPCClustererLanes: %d", GetProcessingSettings().nTPCClustererLanes); + mProcessingSettings->nTPCClustererLanes = GPUCA_NSECTORS; } - if (mProcessingSettings.doublePipeline && (mChains.size() != 1 || mChains[0]->SupportsDoublePipeline() == false || !IsGPU() || mProcessingSettings.memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_GLOBAL)) { + if (GetProcessingSettings().doublePipeline && (mChains.size() != 1 || mChains[0]->SupportsDoublePipeline() == false || !IsGPU() || GetProcessingSettings().memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_GLOBAL)) { GPUError("Must use double pipeline mode only with exactly one chain that must support it"); return 1; } - if (mMaster == nullptr && mProcessingSettings.doublePipeline) { + if (mMaster == nullptr && GetProcessingSettings().doublePipeline) { mPipelineContext.reset(new GPUReconstructionPipelineContext); } @@ -367,16 +369,16 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() mDeviceMemorySize += memPrimary; mHostMemorySize += memPageLocked; } - if 
(mProcessingSettings.forceMemoryPoolSize && mProcessingSettings.forceMemoryPoolSize <= 2 && CanQueryMaxMemory()) { - mDeviceMemorySize = mProcessingSettings.forceMemoryPoolSize; - } else if (mProcessingSettings.forceMemoryPoolSize > 2) { - mDeviceMemorySize = mProcessingSettings.forceMemoryPoolSize; + if (GetProcessingSettings().forceMemoryPoolSize && GetProcessingSettings().forceMemoryPoolSize <= 2 && CanQueryMaxMemory()) { + mDeviceMemorySize = GetProcessingSettings().forceMemoryPoolSize; + } else if (GetProcessingSettings().forceMemoryPoolSize > 2) { + mDeviceMemorySize = GetProcessingSettings().forceMemoryPoolSize; if (!IsGPU() || mOutputControl.useInternal()) { mHostMemorySize = mDeviceMemorySize; } } - if (mProcessingSettings.forceHostMemoryPoolSize) { - mHostMemorySize = mProcessingSettings.forceHostMemoryPoolSize; + if (GetProcessingSettings().forceHostMemoryPoolSize) { + mHostMemorySize = GetProcessingSettings().forceHostMemoryPoolSize; } for (uint32_t i = 0; i < mProcessors.size(); i++) { @@ -399,7 +401,7 @@ int32_t GPUReconstruction::InitPhasePermanentMemory() int32_t GPUReconstruction::InitPhaseAfterDevice() { - if (mProcessingSettings.forceMaxMemScalers <= 1 && mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { + if (GetProcessingSettings().forceMaxMemScalers <= 1 && GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { mMemoryScalers->rescaleMaxMem(IsGPU() ? 
mDeviceMemorySize : mHostMemorySize); } for (uint32_t i = 0; i < mChains.size(); i++) { @@ -446,7 +448,7 @@ int32_t GPUReconstruction::Exit() mChains.clear(); // Make sure we destroy a possible ITS GPU tracker before we call the destructors mHostConstantMem.reset(); // Reset these explicitly before the destruction of other members unloads the library - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { for (uint32_t i = 0; i < mMemoryResources.size(); i++) { if (mMemoryResources[i].mReuse >= 0) { continue; @@ -481,9 +483,38 @@ void GPUReconstruction::ComputeReuseMax(GPUProcessor* proc) } } +int16_t GPUReconstruction::RegisterMemoryAllocationHelper(GPUProcessor* proc, void* (GPUProcessor::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re) +{ + if (!(type & (GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU))) { + if ((type & GPUMemoryResource::MEMORY_SCRATCH) && !GetProcessingSettings().keepDisplayMemory) { // keepAllMemory --> keepDisplayMemory + type |= (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU ? 
GPUMemoryResource::MEMORY_HOST : GPUMemoryResource::MEMORY_GPU); + } else { + type |= GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU; + } + } + if (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU) { + type &= ~GPUMemoryResource::MEMORY_GPU; + } + mMemoryResources.emplace_back(proc, setPtr, (GPUMemoryResource::MemoryType)type, name); + if (mMemoryResources.size() >= 32768) { + throw std::bad_alloc(); + } + uint16_t retVal = mMemoryResources.size() - 1; + if (re.type != GPUMemoryReuse::NONE && !GetProcessingSettings().disableMemoryReuse) { + const auto& it = mMemoryReuse1to1.find(re.id); + if (it == mMemoryReuse1to1.end()) { + mMemoryReuse1to1[re.id] = {proc, retVal}; + } else { + mMemoryResources[retVal].mReuse = it->second.res[0]; + it->second.res.emplace_back(retVal); + } + } + return retVal; +} + size_t GPUReconstruction::AllocateRegisteredMemory(GPUProcessor* proc, bool resetCustom) { - if (mProcessingSettings.debugLevel >= 5) { + if (GetProcessingSettings().debugLevel >= 5) { GPUInfo("Allocating memory %p", (void*)proc); } size_t total = 0; @@ -496,7 +527,7 @@ size_t GPUReconstruction::AllocateRegisteredMemory(GPUProcessor* proc, bool rese } } } - if (mProcessingSettings.debugLevel >= 5) { + if (GetProcessingSettings().debugLevel >= 5) { GPUInfo("Allocating memory done"); } return total; @@ -504,7 +535,7 @@ size_t GPUReconstruction::AllocateRegisteredMemory(GPUProcessor* proc, bool rese size_t GPUReconstruction::AllocateRegisteredPermanentMemory() { - if (mProcessingSettings.debugLevel >= 5) { + if (GetProcessingSettings().debugLevel >= 5) { GPUInfo("Allocating Permanent Memory"); } int32_t total = 0; @@ -515,7 +546,7 @@ size_t GPUReconstruction::AllocateRegisteredPermanentMemory() } mHostMemoryPermanent = mHostMemoryPool; mDeviceMemoryPermanent = mDeviceMemoryPool; - if (mProcessingSettings.debugLevel >= 5) { + if (GetProcessingSettings().debugLevel >= 5) { GPUInfo("Permanent Memory Done"); } return total; @@ -534,7 +565,7 @@ size_t 
GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res, GPUError("Insufficient reuse memory %lu < %lu (%s) (%s)", mMemoryResources[res->mReuse].mSize, retVal, res->mName, device); throw std::bad_alloc(); } - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << "Reused (" << device << ") " << res->mName << ": " << retVal << "\n"; } return retVal; @@ -568,7 +599,7 @@ size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res, std::cerr << "Memory pool size exceeded (" << device << ") (" << res->mName << ": " << (memorypoolend ? (memorysize + ptrDiff(memorypool, memorypoolend)) : ptrDiff(memorypool, memorybase)) << " > " << memorysize << "\n"; throw std::bad_alloc(); } - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << "Allocated (" << device << ") " << res->mName << ": " << retVal << " - available: " << (memorypoolend ? ptrDiff(memorypoolend, memorypool) : (memorysize - ptrDiff(memorypool, memorybase))) << "\n"; } return retVal; @@ -576,7 +607,7 @@ size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res, void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool) { - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && (control == nullptr || control->useInternal())) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && (control == nullptr || control->useInternal())) { if (!(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) { if (res->mPtrDevice && res->mReuse < 0) { operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)); @@ -593,7 +624,7 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res, } res->mPtr = GPUProcessor::alignPointer(res->mPtrDevice); 
res->SetPointers(res->mPtr); - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << "\n"; } if (res->mType & GPUMemoryResource::MEMORY_STACK) { @@ -612,13 +643,13 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res, if (IsGPU() && res->mOverrideSize < GPUCA_BUFFER_ALIGNMENT) { res->mOverrideSize = GPUCA_BUFFER_ALIGNMENT; } - if ((!IsGPU() || (res->mType & GPUMemoryResource::MEMORY_HOST) || mProcessingSettings.keepDisplayMemory) && !(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) { // keepAllMemory --> keepDisplayMemory + if ((!IsGPU() || (res->mType & GPUMemoryResource::MEMORY_HOST) || GetProcessingSettings().keepDisplayMemory) && !(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) { // keepAllMemory --> keepDisplayMemory if (control && control->useExternal()) { if (control->allocator) { res->mSize = std::max((size_t)res->SetPointers((void*)1) - 1, res->mOverrideSize); res->mPtr = control->allocator(CAMath::nextMultipleOf(res->mSize)); res->mSize = std::max(ptrDiff(res->SetPointers(res->mPtr), res->mPtr), res->mOverrideSize); - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << "Allocated (from callback) " << res->mName << ": " << res->mSize << "\n"; } } else { @@ -676,7 +707,7 @@ void* GPUReconstruction::AllocateUnmanagedMemory(size_t size, int32_t type) if (type != GPUMemoryResource::MEMORY_HOST && (!IsGPU() || type != GPUMemoryResource::MEMORY_GPU)) { throw std::runtime_error("Requested invalid memory typo for unmanaged allocation"); } - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { mUnmanagedChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]); return 
GPUProcessor::alignPointer(mUnmanagedChunks.back().get()); } else { @@ -689,7 +720,7 @@ void* GPUReconstruction::AllocateUnmanagedMemory(size_t size, int32_t type) throw std::bad_alloc(); } UpdateMaxMemoryUsed(); - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << "Allocated (unmanaged " << (type == GPUMemoryResource::MEMORY_GPU ? "gpu" : "host") << "): " << size << " - available: " << ptrDiff(poolend, pool) << "\n"; } return retVal; @@ -711,7 +742,7 @@ void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size) throw std::bad_alloc(); } UpdateMaxMemoryUsed(); - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << "Allocated (volatile GPU): " << size << " - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n"; } @@ -773,10 +804,10 @@ void GPUReconstruction::FreeRegisteredMemory(int16_t ires) void GPUReconstruction::FreeRegisteredMemory(GPUMemoryResource* res) { - if (mProcessingSettings.allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) { + if (GetProcessingSettings().allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) { std::cout << "Freeing " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n"; } - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && res->mReuse < 0) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && res->mReuse < 0) { operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)); } res->mPtr = nullptr; @@ -789,7 +820,7 @@ void GPUReconstruction::ReturnVolatileDeviceMemory() mDeviceMemoryPool = mVolatileMemoryStart; mVolatileMemoryStart = nullptr; } - if (mProcessingSettings.allocDebugLevel >= 2) { + if (GetProcessingSettings().allocDebugLevel >= 2) { std::cout << "Freed (volatile GPU) - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) 
<< "\n"; } } @@ -807,7 +838,7 @@ void GPUReconstruction::PushNonPersistentMemory(uint64_t tag) void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag) { - if (mProcessingSettings.keepDisplayMemory || mProcessingSettings.disableMemoryReuse) { + if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().disableMemoryReuse) { return; } if (mNonPersistentMemoryStack.size() == 0) { @@ -816,7 +847,7 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag) if (tag != 0 && std::get<3>(mNonPersistentMemoryStack.back()) != tag) { GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<3>(mNonPersistentMemoryStack.back())).c_str()); } - if ((mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) && (IsGPU() || mProcessingSettings.forceHostMemoryPoolSize)) { + if ((GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) { printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<3>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size()); PrintMemoryOverview(); printf("%76s", ""); @@ -872,7 +903,7 @@ void GPUReconstruction::ClearAllocatedMemory(bool clearOutputs) mNonPersistentMemoryStack.clear(); mNonPersistentIndividualAllocations.clear(); mVolatileMemoryStart = nullptr; - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { mHostMemoryPool = GPUProcessor::alignPointer(mHostMemoryPermanent); mDeviceMemoryPool = GPUProcessor::alignPointer(mDeviceMemoryPermanent); mHostMemoryPoolEnd = mHostMemoryPoolBlocked ? 
mHostMemoryPoolBlocked : ((char*)mHostMemoryBase + mHostMemorySize); @@ -895,7 +926,7 @@ void GPUReconstruction::PrintMemoryMax() void GPUReconstruction::PrintMemoryOverview() { - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { printf("Memory Allocation: Host %'13zd / %'13zu (Permanent %'13zd, Data %'13zd, Scratch %'13zd), Device %'13zd / %'13zu, (Permanent %'13zd, Data %'13zd, Scratch %'13zd) %zu chunks\n", ptrDiff(mHostMemoryPool, mHostMemoryBase) + ptrDiff((char*)mHostMemoryBase + mHostMemorySize, mHostMemoryPoolEnd), mHostMemorySize, ptrDiff(mHostMemoryPermanent, mHostMemoryBase), ptrDiff(mHostMemoryPool, mHostMemoryPermanent), ptrDiff((char*)mHostMemoryBase + mHostMemorySize, mHostMemoryPoolEnd), ptrDiff(mDeviceMemoryPool, mDeviceMemoryBase) + ptrDiff((char*)mDeviceMemoryBase + mDeviceMemorySize, mDeviceMemoryPoolEnd), mDeviceMemorySize, ptrDiff(mDeviceMemoryPermanent, mDeviceMemoryBase), ptrDiff(mDeviceMemoryPool, mDeviceMemoryPermanent), ptrDiff((char*)mDeviceMemoryBase + mDeviceMemorySize, mDeviceMemoryPoolEnd), @@ -934,7 +965,7 @@ void GPUReconstruction::PrintMemoryStatistics() int32_t GPUReconstruction::registerMemoryForGPU(const void* ptr, size_t size) { - if (mProcessingSettings.noGPUMemoryRegistration) { + if (GetProcessingSettings().noGPUMemoryRegistration) { return 0; } int32_t retVal = registerMemoryForGPU_internal(ptr, size); @@ -946,7 +977,7 @@ int32_t GPUReconstruction::registerMemoryForGPU(const void* ptr, size_t size) int32_t GPUReconstruction::unregisterMemoryForGPU(const void* ptr) { - if (mProcessingSettings.noGPUMemoryRegistration) { + if (GetProcessingSettings().noGPUMemoryRegistration) { return 0; } const auto& pos = mRegisteredMemoryPtrs.find(ptr); @@ -982,10 +1013,10 @@ int32_t GPUReconstruction::getGeneralStepNum(GeneralStep step, bool validCheck) void GPUReconstruction::RunPipelineWorker() { - 
if (!mInitialized || !mProcessingSettings.doublePipeline || mMaster != nullptr || !mSlaves.size()) { + if (!mInitialized || !GetProcessingSettings().doublePipeline || mMaster != nullptr || !mSlaves.size()) { throw std::invalid_argument("Cannot start double pipeline mode"); } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Pipeline worker started"); } bool terminate = false; @@ -1011,7 +1042,7 @@ void GPUReconstruction::RunPipelineWorker() } q->c.notify_one(); } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Pipeline worker ended"); } } @@ -1107,7 +1138,7 @@ void GPUReconstruction::DumpSettings(const char* dir) std::string f; f = dir; f += "settings.dump"; - DumpStructToFile(&mGRPSettings, f.c_str()); + DumpStructToFile(mGRPSettings.get(), f.c_str()); for (uint32_t i = 0; i < mChains.size(); i++) { mChains[i]->DumpSettings(dir); } @@ -1121,11 +1152,11 @@ void GPUReconstruction::UpdateDynamicSettings(const GPUSettingsRecDynamic* d) void GPUReconstruction::UpdateSettings(const GPUSettingsGRP* g, const GPUSettingsProcessing* p, const GPUSettingsRecDynamic* d) { if (g) { - mGRPSettings = *g; + *mGRPSettings = *g; } if (p) { - mProcessingSettings.debugLevel = p->debugLevel; - mProcessingSettings.resetTimers = p->resetTimers; + mProcessingSettings->debugLevel = p->debugLevel; + mProcessingSettings->resetTimers = p->resetTimers; } GPURecoStepConfiguration* w = nullptr; if (mRecoSteps.steps.isSet(GPUDataTypes::RecoStep::TPCdEdx)) { @@ -1142,11 +1173,11 @@ int32_t GPUReconstruction::ReadSettings(const char* dir) std::string f; f = dir; f += "settings.dump"; - new (&mGRPSettings) GPUSettingsGRP; - if (ReadStructFromFile(f.c_str(), &mGRPSettings)) { + new (mGRPSettings.get()) GPUSettingsGRP; + if (ReadStructFromFile(f.c_str(), mGRPSettings.get())) { return 1; } - param().UpdateSettings(&mGRPSettings); + param().UpdateSettings(mGRPSettings.get()); for (uint32_t i = 0; 
i < mChains.size(); i++) { mChains[i]->ReadSettings(dir); } @@ -1173,9 +1204,9 @@ void GPUReconstruction::SetSettings(const GPUSettingsGRP* grp, const GPUSettings GPUError("Cannot update settings while initialized"); throw std::runtime_error("Settings updated while initialized"); } - mGRPSettings = *grp; + *mGRPSettings = *grp; if (proc) { - mProcessingSettings = *proc; + *mProcessingSettings = *proc; } if (workflow) { mRecoSteps.steps = workflow->steps; @@ -1183,7 +1214,7 @@ void GPUReconstruction::SetSettings(const GPUSettingsGRP* grp, const GPUSettings mRecoSteps.inputs = workflow->inputs; mRecoSteps.outputs = workflow->outputs; } - param().SetDefaults(&mGRPSettings, rec, proc, workflow); + param().SetDefaults(mGRPSettings.get(), rec, proc, workflow); } void GPUReconstruction::SetOutputControl(void* ptr, size_t size) @@ -1193,10 +1224,14 @@ void GPUReconstruction::SetOutputControl(void* ptr, size_t size) SetOutputControl(outputControl); } -void GPUReconstruction::SetInputControl(void* ptr, size_t size) -{ - mInputControl.set(ptr, size); -} +void GPUReconstruction::SetInputControl(void* ptr, size_t size) { mInputControl.set(ptr, size); } +GPUReconstruction::DeviceType GPUReconstruction::GetDeviceType() const { return (DeviceType)GetDeviceBackendSettings().deviceType; } +const GPUParam& GPUReconstruction::GetParam() const { return mHostConstantMem->param; } +void GPUReconstruction::SetResetTimers(bool reset) { mProcessingSettings->resetTimers = reset; } +void GPUReconstruction::SetDebugLevelTmp(int32_t level) { mProcessingSettings->debugLevel = level; } +GPUParam& GPUReconstruction::param() { return mHostConstantMem->param; } +const GPUTrackingInOutPointers GPUReconstruction::GetIOPtrs() const { return mHostConstantMem->ioPtrs; } +const GPUCalibObjectsConst& GPUReconstruction::GetCalib() const { return processors()->calibObjects; } ThrustVolatileAllocator::ThrustVolatileAllocator(GPUReconstruction* r) { diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h 
b/GPU/GPUTracking/Base/GPUReconstruction.h index 23fb6e4d9ff06..b6256f7f8ad82 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -25,13 +25,13 @@ #include #include -#include "GPUTRDDef.h" -#include "GPUParam.h" -#include "GPUSettings.h" -#include "GPUOutputControl.h" +#include "GPUDataTypes.h" #include "GPUMemoryResource.h" -#include "GPUConstantMem.h" -#include "GPULogging.h" +#include "GPUOutputControl.h" + +/*#include "GPUParam.h" +#include "GPUSettings.h" +#include "GPULogging.h"*/ namespace o2::its { @@ -49,6 +49,13 @@ struct GPUReconstructionThreading; class GPUROOTDumpCore; class ThrustVolatileAllocator; struct GPUDefParameters; +class GPUMemoryResource; +struct GPUSettingsDeviceBackend; +struct GPUSettingsGRP; +struct GPUSettingsProcessing; +struct GPUSettingsRec; +struct GPUSettingsRecDynamic; +struct GPUMemoryReuse; namespace gpu_reconstruction_kernels { @@ -186,18 +193,20 @@ class GPUReconstruction bool slavesExist() { return mSlaves.size() || mMaster; } // Getters / setters for parameters - DeviceType GetDeviceType() const { return (DeviceType)mDeviceBackendSettings.deviceType; } + DeviceType GetDeviceType() const; bool IsGPU() const { return GetDeviceType() != DeviceType::INVALID_DEVICE && GetDeviceType() != DeviceType::CPU; } - const GPUParam& GetParam() const { return mHostConstantMem->param; } + const GPUParam& GetParam() const; const GPUConstantMem& GetConstantMem() const { return *mHostConstantMem; } - const GPUSettingsGRP& GetGRPSettings() const { return mGRPSettings; } - const GPUSettingsDeviceBackend& GetDeviceBackendSettings() { return mDeviceBackendSettings; } - const GPUSettingsProcessing& GetProcessingSettings() const { return mProcessingSettings; } + const GPUTrackingInOutPointers GetIOPtrs() const; + const GPUSettingsGRP& GetGRPSettings() const { return *mGRPSettings; } + const GPUSettingsDeviceBackend& GetDeviceBackendSettings() const { return *mDeviceBackendSettings; } + const 
GPUSettingsProcessing& GetProcessingSettings() const { return *mProcessingSettings; } + const GPUCalibObjectsConst& GetCalib() const; bool IsInitialized() const { return mInitialized; } void SetSettings(float solenoidBzNominalGPU, const GPURecoStepConfiguration* workflow = nullptr); void SetSettings(const GPUSettingsGRP* grp, const GPUSettingsRec* rec = nullptr, const GPUSettingsProcessing* proc = nullptr, const GPURecoStepConfiguration* workflow = nullptr); - void SetResetTimers(bool reset) { mProcessingSettings.resetTimers = reset; } // May update also after Init() - void SetDebugLevelTmp(int32_t level) { mProcessingSettings.debugLevel = level; } // Temporarily, before calling SetSettings() + void SetResetTimers(bool reset); // May update also after Init() + void SetDebugLevelTmp(int32_t level); // Temporarily, before calling SetSettings() void UpdateSettings(const GPUSettingsGRP* g, const GPUSettingsProcessing* p = nullptr, const GPUSettingsRecDynamic* d = nullptr); void UpdateDynamicSettings(const GPUSettingsRecDynamic* d); void SetOutputControl(const GPUOutputControl& v) { mOutputControl = v; } @@ -272,6 +281,7 @@ class GPUReconstruction size_t ReadData(FILE* fp, const T** entries, S* num, std::unique_ptr* mem, InOutPointerType type, T** nonConstPtrs = nullptr); template T* AllocateIOMemoryHelper(size_t n, const T*& ptr, std::unique_ptr& u); + int16_t RegisterMemoryAllocationHelper(GPUProcessor* proc, void* (GPUProcessor::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re); // Private helper functions to dump / load flat objects template @@ -292,17 +302,17 @@ class GPUReconstruction // Pointers to tracker classes GPUConstantMem* processors() { return mHostConstantMem.get(); } const GPUConstantMem* processors() const { return mHostConstantMem.get(); } - GPUParam& param() { return mHostConstantMem->param; } + GPUParam& param(); std::unique_ptr mHostConstantMem; GPUConstantMem* mDeviceConstantMem = nullptr; // Settings - GPUSettingsGRP 
mGRPSettings; // Global Run Parameters - GPUSettingsDeviceBackend mDeviceBackendSettings; // Processing Parameters (at constructor level) - GPUSettingsProcessing mProcessingSettings; // Processing Parameters (at init level) - GPUOutputControl mOutputControl; // Controls the output of the individual components - GPUOutputControl mInputControl; // Prefefined input memory location for reading standalone dumps - std::unique_ptr mMemoryScalers; // Scalers how much memory will be needed + std::unique_ptr mGRPSettings; // Global Run Parameters + std::unique_ptr mDeviceBackendSettings; // Processing Parameters (at constructor level) + std::unique_ptr mProcessingSettings; // Processing Parameters (at init level) + GPUOutputControl mOutputControl; // Controls the output of the individual components + GPUOutputControl mInputControl; // Prefefined input memory location for reading standalone dumps + std::unique_ptr mMemoryScalers; // Scalers how much memory will be needed GPURecoStepConfiguration mRecoSteps; @@ -392,35 +402,6 @@ class GPUReconstruction static GPUReconstruction* GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg); }; -template -inline T* GPUReconstruction::AllocateIOMemoryHelper(size_t n, const T*& ptr, std::unique_ptr& u) -{ - if (n == 0) { - u.reset(nullptr); - return nullptr; - } - T* retVal; - if (mInputControl.useExternal()) { - u.reset(nullptr); - mInputControl.checkCurrent(); - GPUProcessor::computePointerWithAlignment(mInputControl.ptrCurrent, retVal, n); - if ((size_t)((char*)mInputControl.ptrCurrent - (char*)mInputControl.ptrBase) > mInputControl.size) { - throw std::bad_alloc(); - } - } else { - u.reset(new T[n]); - retVal = u.get(); - if (mProcessingSettings.registerStandaloneInputMemory) { - if (registerMemoryForGPU(u.get(), n * sizeof(T))) { - GPUError("Error registering memory for GPU: %p - %ld bytes\n", (void*)u.get(), (int64_t)(n * sizeof(T))); - throw std::bad_alloc(); - } - } - } - ptr = retVal; - return retVal; -} - template 
inline T* GPUReconstruction::AddChain(Args... args) { @@ -431,31 +412,7 @@ inline T* GPUReconstruction::AddChain(Args... args) template inline int16_t GPUReconstruction::RegisterMemoryAllocation(T* proc, void* (T::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re) { - if (!(type & (GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU))) { - if ((type & GPUMemoryResource::MEMORY_SCRATCH) && !mProcessingSettings.keepDisplayMemory) { // keepAllMemory --> keepDisplayMemory - type |= (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU ? GPUMemoryResource::MEMORY_HOST : GPUMemoryResource::MEMORY_GPU); - } else { - type |= GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU; - } - } - if (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU) { - type &= ~GPUMemoryResource::MEMORY_GPU; - } - mMemoryResources.emplace_back(proc, static_cast(setPtr), (GPUMemoryResource::MemoryType)type, name); - if (mMemoryResources.size() >= 32768) { - throw std::bad_alloc(); - } - uint16_t retVal = mMemoryResources.size() - 1; - if (re.type != GPUMemoryReuse::NONE && !mProcessingSettings.disableMemoryReuse) { - const auto& it = mMemoryReuse1to1.find(re.id); - if (it == mMemoryReuse1to1.end()) { - mMemoryReuse1to1[re.id] = {proc, retVal}; - } else { - mMemoryResources[retVal].mReuse = it->second.res[0]; - it->second.res.emplace_back(retVal); - } - } - return retVal; + return RegisterMemoryAllocationHelper(proc, static_cast(setPtr), type, name, re); } template @@ -471,7 +428,7 @@ inline void GPUReconstruction::SetupGPUProcessor(T* proc, bool allocate) { static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derived class"); if (allocate) { - proc->SetMaxData(mHostConstantMem->ioPtrs); + proc->SetMaxData(GetIOPtrs()); } if (proc->mGPUProcessorType != GPUProcessor::PROCESSOR_TYPE_DEVICE && proc->mLinkedProcessor) { std::memcpy((void*)proc->mLinkedProcessor, (const void*)proc, sizeof(*proc)); diff --git 
a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 39507beda8a55..2453ce4a2328f 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -30,19 +30,18 @@ #include "GPUTRDTrackletLabels.h" #include "GPUMemoryResource.h" #include "GPUConstantMem.h" +#include "GPULogging.h" #include "GPUMemorySizeScalers.h" +#include "GPUReconstructionProcessingKernels.inc" + #include #include -#define GPUCA_LOGGING_PRINTF -#include "GPULogging.h" - #ifndef _WIN32 #include #endif using namespace o2::gpu; -using namespace o2::gpu::gpu_reconstruction_kernels; constexpr GPUReconstructionCPU::krnlRunRange GPUReconstructionCPU::krnlRunRangeNone; constexpr GPUReconstructionCPU::krnlEvent GPUReconstructionCPU::krnlEventNone; @@ -55,7 +54,7 @@ GPUReconstructionCPU::~GPUReconstructionCPU() } template -inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... 
args) { auto& x = _xyz.x; auto& y = _xyz.y; @@ -67,7 +66,7 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetu } int32_t nThreads = getNKernelHostThreads(false); if (nThreads > 1) { - if (mProcessingSettings.debugLevel >= 5) { + if (GetProcessingSettings().debugLevel >= 5) { printf("Running %d Threads\n", mThreading->activeThreads->max_concurrency()); } tbb::this_task_arena::isolate([&] { @@ -89,7 +88,7 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetu } template <> -inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) { int32_t nThreads = std::max(1, std::min(size / (16 * 1024 * 1024), getNKernelHostThreads(true))); if (nThreads > 1) { @@ -110,7 +109,7 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal -void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args) +void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs& args) { #pragma GCC diagnostic push #if defined(__clang__) @@ -121,14 +120,14 @@ void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs -gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu) +GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu) { if (gpu == -1) { gpu = IsGPU(); } const auto num = GetKernelNum(); const auto* p = gpu ? mParDevice : mParCPU; - gpu_reconstruction_kernels::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]}; + GPUReconstructionProcessing::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]}; if (ret.nThreads == 0) { ret.nThreads = gpu ? 
mThreadCount : 1u; } @@ -138,9 +137,9 @@ gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProper return ret; } -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ - template void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args); \ - template krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu); +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ + template void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs& args); \ + template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu); #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL @@ -169,7 +168,7 @@ size_t GPUReconstructionCPU::TransferMemoryResourcesHelper(GPUProcessor* proc, i if (!(res.mType & GPUMemoryResource::MEMORY_GPU) || (res.mType & GPUMemoryResource::MEMORY_CUSTOM_TRANSFER)) { continue; } - if (!mProcessingSettings.keepAllMemory && !all && (res.mType & exc) && !(res.mType & inc)) { + if (!GetProcessingSettings().keepAllMemory && !all && (res.mType & exc) && !(res.mType & inc)) { continue; } if (toGPU) { @@ -197,7 +196,7 @@ int32_t GPUReconstructionCPU::InitDevice() { mActiveHostKernelThreads = mMaxHostThreads; mThreading->activeThreads = std::make_unique(mActiveHostKernelThreads); - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { if (mMaster == nullptr) { if (mDeviceMemorySize > mHostMemorySize) { mHostMemorySize = mDeviceMemorySize; @@ -207,7 +206,7 @@ int32_t GPUReconstructionCPU::InitDevice() mHostMemoryPermanent = mHostMemoryBase; ClearAllocatedMemory(); } - if (mProcessingSettings.inKernelParallel) { + if (GetProcessingSettings().inKernelParallel) { mBlockCount = mMaxHostThreads; } mProcShadow.mProcessorsProc = processors(); @@ -216,7 +215,7 @@ int32_t 
GPUReconstructionCPU::InitDevice() int32_t GPUReconstructionCPU::ExitDevice() { - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { if (mMaster == nullptr) { operator delete(mHostMemoryBase, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)); } @@ -232,13 +231,13 @@ int32_t GPUReconstructionCPU::RunChains() mStatNEvents++; mNEventsProcessed++; - if (mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) { + if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) { printf("Allocated memory when starting processing %34s", ""); PrintMemoryOverview(); } mTimerTotal.Start(); const std::clock_t cpuTimerStart = std::clock(); - if (mProcessingSettings.doublePipeline) { + if (GetProcessingSettings().doublePipeline) { int32_t retVal = EnqueuePipeline(); if (retVal) { return retVal; @@ -259,7 +258,7 @@ int32_t GPUReconstructionCPU::RunChains() } mTimerTotal.Stop(); mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC; - if (mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) { + if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) { printf("Allocated memory when ending processing %36s", ""); PrintMemoryOverview(); } @@ -281,7 +280,7 @@ int32_t GPUReconstructionCPU::RunChains() for (int32_t j = 0; j < mTimers[i]->num; j++) { HighResTimer& timer = mTimers[i]->timer[j]; time += timer.GetElapsedTime(); - if (mProcessingSettings.resetTimers) { + if (GetProcessingSettings().resetTimers) { timer.Reset(); } } @@ -297,7 +296,7 @@ int32_t GPUReconstructionCPU::RunChains() snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count); } printf("Execution Time: Task (%c %8ux): %50s Time: 
%'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth); - if (mProcessingSettings.resetTimers) { + if (GetProcessingSettings().resetTimers) { mTimers[i]->count = 0; mTimers[i]->memSize = 0; } @@ -317,7 +316,7 @@ int32_t GPUReconstructionCPU::RunChains() printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", GPUDataTypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost); } - if (mProcessingSettings.resetTimers) { + if (GetProcessingSettings().resetTimers) { mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0; mTimersRecoSteps[i].timerToGPU.Reset(); mTimersRecoSteps[i].timerToHost.Reset(); @@ -340,7 +339,7 @@ int32_t GPUReconstructionCPU::RunChains() } else if (GetProcessingSettings().debugLevel >= 0) { GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str()); } - if (mProcessingSettings.resetTimers) { + if (GetProcessingSettings().resetTimers) { mStatNEvents = 0; mStatCPUTime = 0; mTimerTotal.Reset(); @@ -366,7 +365,7 @@ void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, cons if (!((size_t)¶m().occupancyTotal - (size_t)¶m().occupancyMap == sizeof(param().occupancyMap) && sizeof(param().occupancyMap) == sizeof(size_t) && sizeof(param().occupancyTotal) < sizeof(size_t))) { throw std::runtime_error("occupancy data not consecutive in GPUParam"); } - const auto threadContext = GetThreadContext(); + const auto holdContext = GetThreadContext(); size_t tmp[2] = {(size_t)mapGPU, 0}; memcpy(&tmp[1], &occupancyTotal, sizeof(occupancyTotal)); 
WriteToConstantMemory((char*)&processors()->param.occupancyMap - (char*)processors(), &tmp, sizeof(param().occupancyMap) + sizeof(param().occupancyTotal), stream); diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index b37bf2b75f01c..d0d8b05c4af0e 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -16,14 +16,9 @@ #define GPURECONSTRUCTIONICPU_H #include "GPUReconstructionProcessing.h" -#include "GPUConstantMem.h" #include #include -#include "GPUGeneralKernels.h" -#include "GPUReconstructionKernelIncludes.h" -#include "GPUReconstructionKernels.h" - namespace Ort { struct SessionOptions; @@ -32,20 +27,7 @@ struct SessionOptions; namespace o2::gpu { -class GPUReconstructionCPUBackend : public GPUReconstructionProcessing -{ - public: - ~GPUReconstructionCPUBackend() override = default; - - protected: - GPUReconstructionCPUBackend(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing(cfg) {} - template - void runKernelBackend(const gpu_reconstruction_kernels::krnlSetupArgs& args); - template - void runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args); -}; - -class GPUReconstructionCPU : public GPUReconstructionKernels +class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface { friend GPUReconstruction* GPUReconstruction::GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg); friend class GPUChain; @@ -55,10 +37,10 @@ class GPUReconstructionCPU : public GPUReconstructionKernels - void runKernel(krnlSetup&& setup, Args&&... 
args); template - gpu_reconstruction_kernels::krnlProperties getKernelProperties(int gpu = -1); + krnlProperties getKernelProperties(int gpu = -1); + template + void runKernelBackend(const krnlSetupArgs& args); virtual int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false); int32_t GPUStuck() { return mGPUStuck; } @@ -75,21 +57,10 @@ class GPUReconstructionCPU : public GPUReconstructionKernels, bool cpuFallback, double& timer, krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \ - { \ - krnlSetupArgs args(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward)); \ - const uint32_t num = GetKernelNum(); \ - if (cpuFallback) { \ - GPUReconstructionCPU::runKernelImpl(num, &args); \ - } else { \ - runKernelImpl(num, &args); \ - } \ - } -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL + GPUReconstructionCPU(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing::KernelInterface(cfg) {} + + template + void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); int32_t registerMemoryForGPU_internal(const void* ptr, size_t size) override { return 0; } int32_t unregisterMemoryForGPU_internal(const void* ptr) override { return 0; } @@ -132,72 +103,10 @@ class GPUReconstructionCPU : public GPUReconstructionKernels + void runKernelInterface(krnlSetup&& setup, Args const&... args); }; -template -inline void GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args) -{ - HighResTimer* t = nullptr; - GPUDataTypes::RecoStep myStep = S::GetRecoStep() == GPUDataTypes::RecoStep::NoRecoStep ? setup.x.step : S::GetRecoStep(); - if (myStep == GPUDataTypes::RecoStep::NoRecoStep) { - throw std::runtime_error("Failure running general kernel without defining RecoStep"); - } - int32_t cpuFallback = IsGPU() ? (setup.x.device == krnlDeviceType::CPU ? 
2 : (mRecoSteps.stepsGPUMask & myStep) != myStep) : 0; - uint32_t& nThreads = setup.x.nThreads; - uint32_t& nBlocks = setup.x.nBlocks; - const uint32_t stream = setup.x.stream; - auto prop = getKernelProperties(); - const int32_t autoThreads = cpuFallback ? 1 : prop.nThreads; - const int32_t autoBlocks = cpuFallback ? 1 : (prop.forceBlocks ? prop.forceBlocks : (prop.minBlocks * mBlockCount)); - if (nBlocks == (uint32_t)-1) { - nBlocks = (nThreads + autoThreads - 1) / autoThreads; - nThreads = autoThreads; - } else if (nBlocks == (uint32_t)-2) { - nBlocks = nThreads; - nThreads = autoThreads; - } else if (nBlocks == (uint32_t)-3) { - nBlocks = autoBlocks; - nThreads = autoThreads; - } else if ((int32_t)nThreads < 0) { - nThreads = cpuFallback ? 1 : -nThreads; - } - if (nThreads > GPUCA_MAX_THREADS) { - throw std::runtime_error("GPUCA_MAX_THREADS exceeded"); - } - if (mProcessingSettings.debugLevel >= 3) { - GPUInfo("Running kernel %s (Stream %d, Index %d, Grid %d/%d) on %s", GetKernelName(), stream, setup.y.index, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : cpuFallback ? "CPU (fallback)" : mDeviceName.c_str()); - } - if (nThreads == 0 || nBlocks == 0) { - return; - } - if (mProcessingSettings.debugLevel >= 1) { - t = &getKernelTimer(myStep, !IsGPU() || cpuFallback ? getHostThreadIndex() : stream); - if ((!mProcessingSettings.deviceTimers || !IsGPU() || cpuFallback) && (mNActiveThreadsOuterLoop < 2 || getHostThreadIndex() == 0)) { - t->Start(); - } - } - double deviceTimerTime = 0.; - runKernelImplWrapper(gpu_reconstruction_kernels::classArgument(), cpuFallback, deviceTimerTime, std::forward(setup), std::forward(args)...); - if (GPUDebug(GetKernelName(), stream, mProcessingSettings.serializeGPU & 1)) { - throw std::runtime_error("kernel failure"); - } - if (mProcessingSettings.debugLevel >= 1) { - if (t) { - if (deviceTimerTime != 0.) 
{ - t->AddTime(deviceTimerTime); - if (t->IsRunning()) { - t->Abort(); - } - } else if (t->IsRunning()) { - t->Stop(); - } - } - if (CheckErrorCodes(cpuFallback) && !mProcessingSettings.ignoreNonFatalGPUErrors) { - throw std::runtime_error("kernel error code"); - } - } -} - } // namespace o2::gpu #endif diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPUKernels.h b/GPU/GPUTracking/Base/GPUReconstructionCPUKernels.h new file mode 100644 index 0000000000000..837516a93b6ae --- /dev/null +++ b/GPU/GPUTracking/Base/GPUReconstructionCPUKernels.h @@ -0,0 +1,98 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionCPUKernels.h +/// \author David Rohr + +#ifndef GPURECONSTRUCTIONICPUKERNELS_H +#define GPURECONSTRUCTIONICPUKERNELS_H + +#include "GPUReconstructionCPU.h" +#include "GPUSettings.h" +#include "GPULogging.h" + +namespace o2::gpu +{ + +template +inline void GPUReconstructionCPU::runKernelInterface(krnlSetup&& setup, Args const&... args) +{ + HighResTimer* t = nullptr; + GPUDataTypes::RecoStep myStep = S::GetRecoStep() == GPUDataTypes::RecoStep::NoRecoStep ? setup.x.step : S::GetRecoStep(); + if (myStep == GPUDataTypes::RecoStep::NoRecoStep) { + throw std::runtime_error("Failure running general kernel without defining RecoStep"); + } + int32_t cpuFallback = IsGPU() ? (setup.x.device == krnlDeviceType::CPU ? 
2 : (mRecoSteps.stepsGPUMask & myStep) != myStep) : 0; + uint32_t& nThreads = setup.x.nThreads; + uint32_t& nBlocks = setup.x.nBlocks; + const uint32_t stream = setup.x.stream; + auto prop = getKernelProperties(); + const int32_t autoThreads = cpuFallback ? 1 : prop.nThreads; + const int32_t autoBlocks = cpuFallback ? 1 : (prop.forceBlocks ? prop.forceBlocks : (prop.minBlocks * mBlockCount)); + if (nBlocks == (uint32_t)-1) { + nBlocks = (nThreads + autoThreads - 1) / autoThreads; + nThreads = autoThreads; + } else if (nBlocks == (uint32_t)-2) { + nBlocks = nThreads; + nThreads = autoThreads; + } else if (nBlocks == (uint32_t)-3) { + nBlocks = autoBlocks; + nThreads = autoThreads; + } else if ((int32_t)nThreads < 0) { + nThreads = cpuFallback ? 1 : -nThreads; + } + if (nThreads > GPUCA_MAX_THREADS) { + throw std::runtime_error("GPUCA_MAX_THREADS exceeded"); + } + if (GetProcessingSettings().debugLevel >= 3) { + GPUInfo("Running kernel %s (Stream %d, Index %d, Grid %d/%d) on %s", GetKernelName(), stream, setup.y.index, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : (cpuFallback ? "CPU (fallback)" : mDeviceName.c_str())); + } + if (nThreads == 0 || nBlocks == 0) { + return; + } + if (GetProcessingSettings().debugLevel >= 1) { + t = &getKernelTimer(myStep, !IsGPU() || cpuFallback ? 
getHostThreadIndex() : stream); + if ((!GetProcessingSettings().deviceTimers || !IsGPU() || cpuFallback) && (mNActiveThreadsOuterLoop < 2 || getHostThreadIndex() == 0)) { + t->Start(); + } + } + double deviceTimerTime = 0.; + krnlSetupArgs argPack{{}, {{setup.x, setup.y, setup.z}, deviceTimerTime}, {args...}}; + const uint32_t num = GetKernelNum(); + if (cpuFallback) { + GPUReconstructionCPU::runKernelVirtual(num, &argPack); + } else { + runKernelVirtual(num, &argPack); + } + + if (GPUDebug(GetKernelName(), stream, GetProcessingSettings().serializeGPU & 1)) { + throw std::runtime_error("kernel failure"); + } + if (GetProcessingSettings().debugLevel >= 1) { + if (t) { + if (deviceTimerTime != 0.) { + t->AddTime(deviceTimerTime); + if (t->IsRunning()) { + t->Abort(); + } + } else if (t->IsRunning()) { + t->Stop(); + } + } + if (CheckErrorCodes(cpuFallback) && !GetProcessingSettings().ignoreNonFatalGPUErrors) { + throw std::runtime_error("kernel error code"); + } + } +} + +} // namespace o2::gpu + +#endif diff --git a/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx b/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx index e12ca7ec601ad..2dec88393f632 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx @@ -25,7 +25,7 @@ #include "GPUO2DataTypes.h" #include "GPUDataTypes.h" #include "GPUTPCGeometry.h" -#include "AliHLTTPCRawCluster.h" +#include "AliHLTTPCRawCluster.h" // TODO: Is this still needed at all, or can it be removed? 
#include "GPUParam.h" #include "GPULogging.h" #include diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx index b389e99a0b2bb..9962bdf3922c1 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx @@ -14,6 +14,7 @@ #include "GPUReconstructionDeviceBase.h" #include "GPUReconstructionIncludes.h" +#include "GPUConstantMem.h" #include "GPUTPCTracker.h" @@ -93,21 +94,21 @@ int32_t GPUReconstructionDeviceBase::InitDevice() // CPU_SET(0, &mask); // sched_setaffinity(0, sizeof(mask), &mask); - if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { GPUError("Individual memory allocation strategy unsupported for device\n"); return (1); } - if (mProcessingSettings.nStreams > GPUCA_MAX_STREAMS) { - GPUError("Too many straems requested %d > %d\n", mProcessingSettings.nStreams, GPUCA_MAX_STREAMS); + if (GetProcessingSettings().nStreams > GPUCA_MAX_STREAMS) { + GPUError("Too many straems requested %d > %d\n", GetProcessingSettings().nStreams, GPUCA_MAX_STREAMS); return (1); } void* semLock = nullptr; - if (mProcessingSettings.globalInitMutex && GetGlobalLock(semLock)) { + if (GetProcessingSettings().globalInitMutex && GetGlobalLock(semLock)) { return (1); } - if (mProcessingSettings.deviceTimers) { + if (GetProcessingSettings().deviceTimers) { AddGPUEvents(mDebugEvents); } @@ -117,7 +118,7 @@ int32_t GPUReconstructionDeviceBase::InitDevice() return (1); } - if (mProcessingSettings.globalInitMutex) { + if (GetProcessingSettings().globalInitMutex) { ReleaseGlobalLock(semLock); } @@ -129,7 +130,7 @@ int32_t GPUReconstructionDeviceBase::InitDevice() mProcShadow.mMemoryResProcessors = RegisterMemoryAllocation(&mProcShadow, &GPUProcessorProcessors::SetPointersDeviceProcessor, 
GPUMemoryResource::MEMORY_PERMANENT | GPUMemoryResource::MEMORY_HOST, "Processors"); AllocateRegisteredMemory(mProcShadow.mMemoryResProcessors); - if (mMaster == nullptr || mProcessingSettings.debugLevel >= 2) { + if (mMaster == nullptr || GetProcessingSettings().debugLevel >= 2) { GPUInfo("GPU Tracker initialization successfull"); // Verbosity reduced because GPU backend will print GPUImportant message! } @@ -186,13 +187,15 @@ void GPUReconstructionDeviceBase::runConstantRegistrators() size_t GPUReconstructionDeviceBase::TransferMemoryInternal(GPUMemoryResource* res, int32_t stream, deviceEvent* ev, deviceEvent* evList, int32_t nEvents, bool toGPU, const void* src, void* dst) { if (!(res->Type() & GPUMemoryResource::MEMORY_GPU)) { - if (mProcessingSettings.debugLevel >= 4) { + if (GetProcessingSettings().debugLevel >= 4) { GPUInfo("Skipped transfer of non-GPU memory resource: %s", res->Name()); } return 0; } - if (mProcessingSettings.debugLevel >= 3 && (strcmp(res->Name(), "ErrorCodes") || mProcessingSettings.debugLevel >= 4)) { + if (GetProcessingSettings().debugLevel >= 3 && (strcmp(res->Name(), "ErrorCodes") || GetProcessingSettings().debugLevel >= 4)) { GPUInfo("Copying to %s: %s - %ld bytes", toGPU ? 
"GPU" : "Host", res->Name(), (int64_t)res->Size()); } return GPUMemCpy(dst, src, res->Size(), stream, toGPU, ev, evList, nEvents); } + +const GPUParam* GPUReconstructionDeviceBase::DeviceParam() const { return &mDeviceConstantMem->param; } diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h index f0e19f588e0f1..c8288f978f6ae 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h +++ b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h @@ -22,16 +22,12 @@ namespace o2::gpu { -#if !(defined(__CLING__) || defined(__ROOTCLING__) || defined(G__ROOT)) -extern template class GPUReconstructionKernels; -#endif - class GPUReconstructionDeviceBase : public GPUReconstructionCPU { public: ~GPUReconstructionDeviceBase() override; - const GPUParam* DeviceParam() const { return &mDeviceConstantMem->param; } + const GPUParam* DeviceParam() const; struct deviceConstantMemRegistration { deviceConstantMemRegistration(void* (*reg)()) { @@ -51,8 +47,6 @@ class GPUReconstructionDeviceBase : public GPUReconstructionCPU int32_t unregisterMemoryForGPU_internal(const void* ptr) override; void unregisterRemainingRegisteredMemory(); - virtual const GPUTPCTracker* CPUTracker(int32_t iSector) { return &processors()->tpcTrackers[iSector]; } - int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false) override = 0; size_t TransferMemoryInternal(GPUMemoryResource* res, int32_t stream, deviceEvent* ev, deviceEvent* evList, int32_t nEvents, bool toGPU, const void* src, void* dst) override; size_t GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) override = 0; diff --git a/GPU/GPUTracking/Base/GPUReconstructionIO.h b/GPU/GPUTracking/Base/GPUReconstructionIO.h index 2208c15846e09..810ebfffe1703 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionIO.h +++ 
b/GPU/GPUTracking/Base/GPUReconstructionIO.h @@ -16,10 +16,40 @@ #define GPURECONSTRUCTIONIO_H #include "GPUReconstruction.h" +#include "GPUSettings.h" namespace o2::gpu { +template +inline T* GPUReconstruction::AllocateIOMemoryHelper(size_t n, const T*& ptr, std::unique_ptr& u) +{ + if (n == 0) { + u.reset(nullptr); + return nullptr; + } + T* retVal; + if (mInputControl.useExternal()) { + u.reset(nullptr); + mInputControl.checkCurrent(); + GPUProcessor::computePointerWithAlignment(mInputControl.ptrCurrent, retVal, n); + if ((size_t)((char*)mInputControl.ptrCurrent - (char*)mInputControl.ptrBase) > mInputControl.size) { + throw std::bad_alloc(); + } + } else { + u.reset(new T[n]); + retVal = u.get(); + if (GetProcessingSettings().registerStandaloneInputMemory) { + if (registerMemoryForGPU(u.get(), n * sizeof(T))) { + GPUError("Error registering memory for GPU: %p - %ld bytes\n", (void*)u.get(), (int64_t)(n * sizeof(T))); + throw std::bad_alloc(); + } + } + } + ptr = retVal; + return retVal; +} + template inline uint32_t GPUReconstruction::DumpData(FILE* fp, const T* const* entries, const S* num, InOutPointerType type) { @@ -38,7 +68,7 @@ inline uint32_t GPUReconstruction::DumpData(FILE* fp, const T* const* entries, c fwrite(entries[i], sizeof(*entries[i]), num[i], fp); } } - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Dumped %ld %s", (int64_t)numTotal, IOTYPENAMES[type]); } return numTotal; @@ -72,7 +102,7 @@ inline size_t GPUReconstruction::ReadData(FILE* fp, const T** entries, S* num, s numTotal += num[i]; } (void)r; - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Read %ld %s", (int64_t)numTotal, IOTYPENAMES[type]); } return numTotal; @@ -112,7 +142,7 @@ inline std::unique_ptr GPUReconstruction::ReadFlatObjectFromFile(const char* r = fread((void*)retVal.get(), 1, size[0], fp); r = fread(buf, 1, size[1], fp); fclose(fp); - if 
(mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Read %ld bytes from %s", (int64_t)r, file); } retVal->clearInternalBufferPtr(); @@ -151,7 +181,7 @@ inline std::unique_ptr GPUReconstruction::ReadStructFromFile(const char* file std::unique_ptr newObj(new T); r = fread(newObj.get(), 1, size, fp); fclose(fp); - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Read %ld bytes from %s", (int64_t)r, file); } return newObj; @@ -172,7 +202,7 @@ inline int32_t GPUReconstruction::ReadStructFromFile(const char* file, T* obj) } r = fread(obj, 1, size, fp); fclose(fp); - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Read %ld bytes from %s", (int64_t)r, file); } return 0; diff --git a/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h b/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h index b3f6c6ec817fd..2b16dfb32fe14 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h +++ b/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h @@ -65,7 +65,7 @@ // GPU Host wrappers for kernel #define GPUCA_KRNL_HOST(x_class, ...) \ GPUCA_KRNLGPU(x_class, __VA_ARGS__) \ - template <> class GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::backendInternal { \ + template <> class GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::backendInternal { \ public: \ template \ static inline void runKernelBackendMacro(const krnlSetupTime& _xyz, T* me, const Args&... args) \ diff --git a/GPU/GPUTracking/Base/GPUReconstructionKernels.h b/GPU/GPUTracking/Base/GPUReconstructionKernels.h deleted file mode 100644 index 7f500d471de1f..0000000000000 --- a/GPU/GPUTracking/Base/GPUReconstructionKernels.h +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. 
-// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file GPUReconstructionKernels.h -/// \author David Rohr - -#ifndef GPURECONSTRUCTIONKERNELS_H -#define GPURECONSTRUCTIONKERNELS_H - -#include "GPUReconstruction.h" - -namespace o2::gpu -{ - -namespace gpu_reconstruction_kernels -{ - -template -struct classArgument { - using t = T; - static constexpr int32_t i = I; -}; - -struct krnlExec { - constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto) : nBlocks(b), nThreads(t), stream(s), device(d), step(GPUDataTypes::RecoStep::NoRecoStep) {} - constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUDataTypes::RecoStep st) : nBlocks(b), nThreads(t), stream(s), device(GPUReconstruction::krnlDeviceType::Auto), step(st) {} - constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUReconstruction::krnlDeviceType d, GPUDataTypes::RecoStep st) : nBlocks(b), nThreads(t), stream(s), device(d), step(st) {} - uint32_t nBlocks; - uint32_t nThreads; - int32_t stream; - GPUReconstruction::krnlDeviceType device; - GPUDataTypes::RecoStep step; -}; -struct krnlRunRange { - constexpr krnlRunRange() = default; - constexpr krnlRunRange(uint32_t v) : index(v) {} - uint32_t index = 0; -}; -struct krnlEvent { - constexpr krnlEvent(deviceEvent* e = nullptr, deviceEvent* el = nullptr, int32_t n = 1) : ev(e), evList(el), nEvents(n) {} - deviceEvent* ev; - deviceEvent* evList; - int32_t nEvents; -}; - -struct krnlProperties { - krnlProperties(int32_t t = 0, int32_t b = 1, int32_t b2 = 0) : nThreads(t), minBlocks(b), forceBlocks(b2) {} - uint32_t nThreads; - uint32_t minBlocks; - uint32_t forceBlocks; - uint32_t 
total() { return forceBlocks ? forceBlocks : (nThreads * minBlocks); } -}; - -struct krnlSetup { - krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {} - krnlExec x; - krnlRunRange y; - krnlEvent z; -}; - -struct krnlSetupTime : public krnlSetup { - double& t; -}; - -template -struct krnlSetupArgs : public gpu_reconstruction_kernels::classArgument { - krnlSetupArgs(const krnlExec& xx, const krnlRunRange& yy, const krnlEvent& zz, double& tt, const Args&... args) : s{{xx, yy, zz}, tt}, v(args...) {} - const krnlSetupTime s; - std::tuple sizeof(void*)), const Args&, const Args>::type...> v; -}; - -} // namespace gpu_reconstruction_kernels - -template -class GPUReconstructionKernels : public T -{ - public: - GPUReconstructionKernels(const GPUSettingsDeviceBackend& cfg) : T(cfg) {} - - protected: - using deviceEvent = gpu_reconstruction_kernels::deviceEvent; - using krnlExec = gpu_reconstruction_kernels::krnlExec; - using krnlRunRange = gpu_reconstruction_kernels::krnlRunRange; - using krnlEvent = gpu_reconstruction_kernels::krnlEvent; - using krnlSetup = gpu_reconstruction_kernels::krnlSetup; - using krnlSetupTime = gpu_reconstruction_kernels::krnlSetupTime; - template - using krnlSetupArgs = gpu_reconstruction_kernels::krnlSetupArgs; - - virtual void runKernelImpl(const int num, const void* args) - { - switch (num) { // clang-format off -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, x_num) \ - case x_num: { \ - const auto& args2 = *(const krnlSetupArgs*)args; \ - T::template runKernelBackend(args2); \ - break; \ - } -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL - } // clang-format on - } -}; - -} // namespace o2::gpu - -#endif diff --git a/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx b/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx index aa01d26446b56..89517c612403b 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx +++ 
b/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx @@ -24,6 +24,7 @@ #include "GPUReconstruction.h" #include "GPUReconstructionAvailableBackends.h" +#include "GPUSettings.h" #include "utils/qlibload.h" diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx index d02309f66c762..a511102a492ef 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx @@ -15,6 +15,9 @@ #include "GPUReconstructionProcessing.h" #include "GPUReconstructionThreading.h" #include "GPUDefParametersLoad.inc" +#include "GPUReconstructionKernelIncludes.h" +#include "GPUSettings.h" +#include "GPULogging.h" using namespace o2::gpu; @@ -41,7 +44,7 @@ GPUReconstructionProcessing::~GPUReconstructionProcessing() int32_t GPUReconstructionProcessing::getNKernelHostThreads(bool splitCores) { int32_t nThreads = 0; - if (mProcessingSettings.inKernelParallel == 2 && mNActiveThreadsOuterLoop) { + if (GetProcessingSettings().inKernelParallel == 2 && mNActiveThreadsOuterLoop) { if (splitCores) { nThreads = mMaxHostThreads / mNActiveThreadsOuterLoop; nThreads += (uint32_t)getHostThreadIndex() < mMaxHostThreads % mNActiveThreadsOuterLoop; @@ -50,7 +53,7 @@ int32_t GPUReconstructionProcessing::getNKernelHostThreads(bool splitCores) } nThreads = std::max(1, nThreads); } else { - nThreads = mProcessingSettings.inKernelParallel ? mMaxHostThreads : 1; + nThreads = GetProcessingSettings().inKernelParallel ? mMaxHostThreads : 1; } return nThreads; } @@ -59,7 +62,7 @@ void GPUReconstructionProcessing::SetNActiveThreads(int32_t n) { mActiveHostKernelThreads = std::max(1, n < 0 ? 
mMaxHostThreads : std::min(n, mMaxHostThreads)); mThreading->activeThreads = std::make_unique(mActiveHostKernelThreads); - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Set number of active parallel kernels threads on host to %d (%d requested)", mActiveHostKernelThreads, n); } } @@ -80,12 +83,12 @@ void GPUReconstructionProcessing::runParallelOuterLoop(bool doGPU, uint32_t nThr uint32_t GPUReconstructionProcessing::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max) { - if (condition && mProcessingSettings.inKernelParallel != 1) { - mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min(max, mMaxHostThreads) : mMaxHostThreads; + if (condition && GetProcessingSettings().inKernelParallel != 1) { + mNActiveThreadsOuterLoop = GetProcessingSettings().inKernelParallel == 2 ? std::min(max, mMaxHostThreads) : mMaxHostThreads; } else { mNActiveThreadsOuterLoop = 1; } - if (mProcessingSettings.debugLevel >= 5) { + if (GetProcessingSettings().debugLevel >= 5) { printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop); } return mNActiveThreadsOuterLoop; @@ -132,9 +135,9 @@ uint32_t GPUReconstructionProcessing::getNextTimerId() return id.fetch_add(1); } -std::unique_ptr GPUReconstructionProcessing::GetThreadContext() +std::unique_ptr GPUReconstructionProcessing::GetThreadContext() { - return std::make_unique(); + return std::make_unique(); } gpu_reconstruction_kernels::threadContext::threadContext() = default; diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h index 4ce8bc1b42743..9e611e57148c6 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h @@ -16,7 +16,6 @@ #define GPURECONSTRUCTIONPROCESSING_H #include "GPUReconstruction.h" -#include "GPUReconstructionKernelIncludes.h" #include "utils/timer.h" #include @@ -32,7 +31,7 @@ namespace 
o2::gpu struct GPUDefParameters; -namespace gpu_reconstruction_kernels +namespace gpu_reconstruction_kernels // TODO: Get rid of this namespace { struct deviceEvent { constexpr deviceEvent() = default; @@ -72,6 +71,86 @@ class GPUReconstructionProcessing : public GPUReconstruction public: ~GPUReconstructionProcessing() override; + using deviceEvent = gpu_reconstruction_kernels::deviceEvent; + using threadContext = gpu_reconstruction_kernels::threadContext; + + struct RecoStepTimerMeta { + HighResTimer timerToGPU; + HighResTimer timerToHost; + HighResTimer timerTotal; + double timerCPU = 0.; + size_t bytesToGPU = 0; + size_t bytesToHost = 0; + uint32_t countToGPU = 0; + uint32_t countToHost = 0; + }; + + template + struct kernelInterfaceArguments { + using t = T; + static constexpr int32_t i = I; + }; + + struct krnlExec { + constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto) : nBlocks(b), nThreads(t), stream(s), device(d), step(GPUDataTypes::RecoStep::NoRecoStep) {} + constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUDataTypes::RecoStep st) : nBlocks(b), nThreads(t), stream(s), device(GPUReconstruction::krnlDeviceType::Auto), step(st) {} + constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUReconstruction::krnlDeviceType d, GPUDataTypes::RecoStep st) : nBlocks(b), nThreads(t), stream(s), device(d), step(st) {} + uint32_t nBlocks; + uint32_t nThreads; + int32_t stream; + GPUReconstruction::krnlDeviceType device; + GPUDataTypes::RecoStep step; + }; + struct krnlRunRange { + constexpr krnlRunRange() = default; + constexpr krnlRunRange(uint32_t v) : index(v) {} + uint32_t index = 0; + }; + struct krnlEvent { + constexpr krnlEvent(deviceEvent* e = nullptr, deviceEvent* el = nullptr, int32_t n = 1) : ev(e), evList(el), nEvents(n) {} + deviceEvent* ev; + deviceEvent* evList; + int32_t nEvents; + }; + + struct krnlProperties { + krnlProperties(int32_t t = 0, int32_t b = 1, int32_t 
b2 = 0) : nThreads(t), minBlocks(b), forceBlocks(b2) {} + uint32_t nThreads; + uint32_t minBlocks; + uint32_t forceBlocks; + uint32_t total() { return forceBlocks ? forceBlocks : (nThreads * minBlocks); } + }; + + struct krnlSetup { + krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {} + krnlExec x; + krnlRunRange y; + krnlEvent z; + }; + + struct krnlSetupTime : public krnlSetup { + double& t; + }; + + template + struct krnlSetupArgs : public kernelInterfaceArguments { + const krnlSetupTime s; + std::tuple sizeof(void*)), const Args&, const Args>::type...> v; + }; + + template + class KernelInterface : public S + { + public: + template + KernelInterface(const Args&... args) : S(args...) + { + } + + protected: + virtual void runKernelVirtual(const int num, const void* args); + }; + // Threading int32_t getNKernelHostThreads(bool splitCores); uint32_t getNActiveThreadsOuterLoop() const { return mNActiveThreadsOuterLoop; } @@ -94,23 +173,12 @@ class GPUReconstructionProcessing : public GPUReconstruction template void AddGPUEvents(T*& events); - virtual std::unique_ptr GetThreadContext() override; + virtual std::unique_ptr GetThreadContext() override; - struct RecoStepTimerMeta { - HighResTimer timerToGPU; - HighResTimer timerToHost; - HighResTimer timerTotal; - double timerCPU = 0.; - size_t bytesToGPU = 0; - size_t bytesToHost = 0; - uint32_t countToGPU = 0; - uint32_t countToHost = 0; - }; const GPUDefParameters& getGPUParameters(bool doGPU) const override { return *(doGPU ? 
mParDevice : mParCPU); } protected: GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg); - using deviceEvent = gpu_reconstruction_kernels::deviceEvent; static const std::vector mKernelNames; @@ -181,7 +249,7 @@ HighResTimer& GPUReconstructionProcessing::getTimer(const char* name, int32_t nu static int32_t id = getNextTimerId(); timerMeta* timer = getTimerById(id); if (timer == nullptr) { - int32_t max = std::max({mMaxHostThreads, mProcessingSettings.nStreams}); + int32_t max = std::max({mMaxHostThreads, GPUCA_MAX_STREAMS}); timer = insertTimer(id, name, J, max, 1, RecoStep::NoRecoStep); } if (num == -1) { diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc b/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc new file mode 100644 index 0000000000000..49d02515372b8 --- /dev/null +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc @@ -0,0 +1,41 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUReconstructionProcessingKernels.h +/// \author David Rohr + +#ifndef GPURECONSTRUCTIONPROCESSINGKERNELS_H +#define GPURECONSTRUCTIONPROCESSINGKERNELS_H + +#include "GPUReconstructionProcessing.h" +#include "GPUReconstructionKernelIncludes.h" + +namespace o2::gpu +{ + +template +void GPUReconstructionProcessing::KernelInterface::runKernelVirtual(const int num, const void* args) +{ + switch (num) { // clang-format off +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, x_num) \ + case x_num: { \ + const auto& args2 = *(const krnlSetupArgs*)args; \ + ((T*)this)->template runKernelBackend(args2); \ + break; \ + } +#include "GPUReconstructionKernelList.h" +#undef GPUCA_KRNL + } // clang-format on +} + +} // namespace o2::gpu + +#endif // GPURECONSTRUCTIONPROCESSINGKERNELS_H diff --git a/GPU/GPUTracking/Base/GPUReconstructionTimeframe.cxx b/GPU/GPUTracking/Base/GPUReconstructionTimeframe.cxx index 4693a1eff24f2..b25b93e957b15 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionTimeframe.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionTimeframe.cxx @@ -14,6 +14,8 @@ #include "GPUReconstructionTimeframe.h" #include "GPUReconstruction.h" +#include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" #include "display/GPUDisplayInterface.h" #include "GPUQA.h" #include "AliHLTTPCClusterMCData.h" diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index fe2906caace80..970b331ea99fb 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -24,6 +24,7 @@ #include "GPUParamRTC.h" #include "GPUReconstructionCUDAHelpers.inc" #include "GPUDefParametersLoad.inc" +#include "GPUReconstructionProcessingKernels.inc" #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 1 #include "utils/qGetLdBinarySymbols.h" @@ -53,16 +54,23 @@ __global__ void dummyInitKernel(void*) {} #include 
"GPUReconstructionIncludesITS.h" -GPUReconstructionCUDABackend::GPUReconstructionCUDABackend(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionDeviceBase(cfg, sizeof(GPUReconstructionDeviceBase)) +GPUReconstructionCUDA::GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing::KernelInterface(cfg, sizeof(GPUReconstructionDeviceBase)) { if (mMaster == nullptr) { mInternals = new GPUReconstructionCUDAInternals; *mParDevice = o2::gpu::internal::GPUDefParametersLoad(); } - mDeviceBackendSettings.deviceType = DeviceType::CUDA; + mDeviceBackendSettings->deviceType = DeviceType::CUDA; +#ifndef __HIPCC__ // CUDA + mRtcSrcExtension = ".cu"; + mRtcBinExtension = ".fatbin"; +#else // HIP + mRtcSrcExtension = ".hip"; + mRtcBinExtension = ".o"; +#endif } -GPUReconstructionCUDABackend::~GPUReconstructionCUDABackend() +GPUReconstructionCUDA::~GPUReconstructionCUDA() { Exit(); // Make sure we destroy everything (in particular the ITS tracker) before we exit CUDA if (mMaster == nullptr) { @@ -71,23 +79,11 @@ GPUReconstructionCUDABackend::~GPUReconstructionCUDABackend() } static_assert(sizeof(cudaError_t) <= sizeof(int64_t) && cudaSuccess == 0); -int32_t GPUReconstructionCUDABackend::GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const +int32_t GPUReconstructionCUDA::GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const { return internal::GPUReconstructionCUDAChkErr(error, file, line); } -GPUReconstructionCUDA::GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionKernels(cfg) -{ -#ifndef __HIPCC__ // CUDA - mRtcSrcExtension = ".cu"; - mRtcBinExtension = ".fatbin"; -#else // HIP - mRtcSrcExtension = ".hip"; - mRtcBinExtension = ".o"; -#endif -} -GPUReconstructionCUDA::~GPUReconstructionCUDA() = default; - GPUReconstruction* GPUReconstruction_Create_CUDA(const GPUSettingsDeviceBackend& cfg) { return new GPUReconstructionCUDA(cfg); } void 
GPUReconstructionCUDA::GetITSTraits(std::unique_ptr* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) @@ -109,7 +105,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() constexpr int32_t reqVerMaj = 2; constexpr int32_t reqVerMin = 0; #endif - if (mProcessingSettings.rtc.enable && mProcessingSettings.rtctech.runTest == 2) { + if (GetProcessingSettings().rtc.enable && GetProcessingSettings().rtctech.runTest == 2) { mWarpSize = GPUCA_WARP_SIZE; genAndLoadRTC(); exit(0); @@ -123,14 +119,14 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUError("Error getting CUDA Device Count"); return (1); } - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Available CUDA devices:"); } std::vector devicesOK(count, false); std::vector devMemory(count, 0); bool contextCreated = false; for (int32_t i = 0; i < count; i++) { - if (mProcessingSettings.debugLevel >= 4) { + if (GetProcessingSettings().debugLevel >= 4) { GPUInfo("Examining device %d", i); } size_t free, total; @@ -139,14 +135,14 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() #else // HIP if (GPUChkErrI(hipSetDevice(i))) { #endif - if (mProcessingSettings.debugLevel >= 4) { + if (GetProcessingSettings().debugLevel >= 4) { GPUWarning("Couldn't create context for device %d. Skipping it.", i); } continue; } contextCreated = true; if (GPUChkErrI(cudaMemGetInfo(&free, &total))) { - if (mProcessingSettings.debugLevel >= 4) { + if (GetProcessingSettings().debugLevel >= 4) { GPUWarning("Error obtaining CUDA memory info about device %d! 
Skipping it.", i); } GPUChkErr(cudaDeviceReset()); @@ -156,13 +152,13 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUChkErr(cudaDeviceReset()); contextCreated = false; } - if (mProcessingSettings.debugLevel >= 4) { + if (GetProcessingSettings().debugLevel >= 4) { GPUInfo("Obtained current memory usage for device %d", i); } if (GPUChkErrI(cudaGetDeviceProperties(&deviceProp, i))) { continue; } - if (mProcessingSettings.debugLevel >= 4) { + if (GetProcessingSettings().debugLevel >= 4) { GPUInfo("Obtained device properties for device %d", i); } int32_t deviceOK = true; @@ -179,7 +175,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() } deviceSpeed = (double)deviceProp.multiProcessorCount * (double)deviceProp.clockRate * (double)deviceProp.warpSize * (double)free * (double)deviceProp.major * (double)deviceProp.major; - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUImportant("Device %s%2d: %s (Rev: %d.%d - Mem Avail %lu / %lu)%s %s", deviceOK ? " " : "[", i, deviceProp.name, deviceProp.major, deviceProp.minor, free, (size_t)deviceProp.totalGlobalMem, deviceOK ? " " : " ]", deviceOK ? 
"" : deviceFailure); } if (!deviceOK) { @@ -191,7 +187,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() bestDevice = i; bestDeviceSpeed = deviceSpeed; } else { - if (mProcessingSettings.debugLevel >= 2 && mProcessingSettings.deviceNum < 0) { + if (GetProcessingSettings().debugLevel >= 2 && GetProcessingSettings().deviceNum < 0) { GPUInfo("Skipping: Speed %f < %f\n", deviceSpeed, bestDeviceSpeed); } } @@ -204,15 +200,15 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUImportant("Requiring Revision %d.%d, Mem: %lu", reqVerMaj, reqVerMin, std::max(mDeviceMemorySize, REQUIRE_MIN_MEMORY)); #endif noDevice = true; - } else if (mProcessingSettings.deviceNum > -1) { - if (mProcessingSettings.deviceNum >= (signed)count) { - GPUError("Requested device ID %d does not exist", mProcessingSettings.deviceNum); + } else if (GetProcessingSettings().deviceNum > -1) { + if (GetProcessingSettings().deviceNum >= (signed)count) { + GPUError("Requested device ID %d does not exist", GetProcessingSettings().deviceNum); noDevice = true; - } else if (!devicesOK[mProcessingSettings.deviceNum]) { - GPUError("Unsupported device requested (%d)", mProcessingSettings.deviceNum); + } else if (!devicesOK[GetProcessingSettings().deviceNum]) { + GPUError("Unsupported device requested (%d)", GetProcessingSettings().deviceNum); noDevice = true; } else { - bestDevice = mProcessingSettings.deviceNum; + bestDevice = GetProcessingSettings().deviceNum; } } if (noDevice) { @@ -225,7 +221,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUChkErrI(cudaGetDeviceProperties(&deviceProp, mDeviceId)); - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Using CUDA Device %s with Properties:", deviceProp.name); GPUInfo("\ttotalGlobalMem = %ld", (uint64_t)deviceProp.totalGlobalMem); GPUInfo("\tsharedMemPerBlock = %ld", (uint64_t)deviceProp.sharedMemPerBlock); @@ -244,7 +240,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() 
GPUInfo("\ttextureAlignment = %ld", (uint64_t)deviceProp.textureAlignment); GPUInfo(" "); } - if (deviceProp.warpSize != GPUCA_WARP_SIZE && !mProcessingSettings.rtc.enable) { + if (deviceProp.warpSize != GPUCA_WARP_SIZE && !GetProcessingSettings().rtc.enable) { throw std::runtime_error("Invalid warp size on GPU"); } mWarpSize = deviceProp.warpSize; @@ -280,7 +276,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUChkErrI(cudaDeviceReset()); return (1); } - if (GPUChkErrI(cudaDeviceSetLimit(cudaLimitMallocHeapSize, mProcessingSettings.deterministicGPUReconstruction ? std::max(1024 * 1024 * 1024, GPUCA_GPU_HEAP_SIZE) : GPUCA_GPU_HEAP_SIZE))) { + if (GPUChkErrI(cudaDeviceSetLimit(cudaLimitMallocHeapSize, GetProcessingSettings().deterministicGPUReconstruction ? std::max(1024 * 1024 * 1024, GPUCA_GPU_HEAP_SIZE) : GPUCA_GPU_HEAP_SIZE))) { GPUError("Error setting CUDA stack size"); GPUChkErrI(cudaDeviceReset()); return (1); @@ -302,7 +298,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() mDeviceMemorySize = mDeviceMemorySize * 2 / 3; // Leave 1/3 of GPU memory for event display } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Allocating memory on GPU"); } if (mDeviceMemorySize > deviceProp.totalGlobalMem || GPUChkErrI(cudaMalloc(&mDeviceMemoryBase, mDeviceMemorySize))) { @@ -312,7 +308,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUChkErrI(cudaDeviceReset()); return (1); } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Allocating memory on Host"); } if (GPUChkErrI(cudaMallocHost(&mHostMemoryBase, mHostMemorySize))) { @@ -320,7 +316,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUChkErrI(cudaDeviceReset()); return (1); } - if (mProcessingSettings.debugLevel >= 1) { + if (GetProcessingSettings().debugLevel >= 1) { GPUInfo("Memory ptrs: GPU (%ld bytes): %p - Host (%ld bytes): %p", (int64_t)mDeviceMemorySize, 
mDeviceMemoryBase, (int64_t)mHostMemorySize, mHostMemoryBase); memset(mHostMemoryBase, 0xDD, mHostMemorySize); if (GPUChkErrI(cudaMemset(mDeviceMemoryBase, 0xDD, mDeviceMemorySize))) { @@ -344,7 +340,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() hipLaunchKernelGGL(HIP_KERNEL_NAME(dummyInitKernel), dim3(mBlockCount), dim3(256), 0, 0, mDeviceMemoryBase); #endif - if (mProcessingSettings.rtc.enable) { + if (GetProcessingSettings().rtc.enable) { genAndLoadRTC(); } #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 1 @@ -419,17 +415,17 @@ void GPUReconstructionCUDA::genAndLoadRTC() throw std::runtime_error("Runtime compilation failed"); } for (uint32_t i = 0; i < nCompile; i++) { - if (mProcessingSettings.rtctech.runTest != 2) { + if (GetProcessingSettings().rtctech.runTest != 2) { mInternals->kernelModules.emplace_back(std::make_unique()); GPUChkErr(cuModuleLoad(mInternals->kernelModules.back().get(), (filename + "_" + std::to_string(i) + mRtcBinExtension).c_str())); } remove((filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str()); remove((filename + "_" + std::to_string(i) + mRtcBinExtension).c_str()); } - if (mProcessingSettings.rtctech.runTest == 2) { + if (GetProcessingSettings().rtctech.runTest == 2) { return; } - loadKernelModules(mProcessingSettings.rtc.compilePerKernel); + loadKernelModules(GetProcessingSettings().rtc.compilePerKernel); } int32_t GPUReconstructionCUDA::ExitDevice_Runtime() @@ -472,7 +468,7 @@ int32_t GPUReconstructionCUDA::ExitDevice_Runtime() size_t GPUReconstructionCUDA::GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev, deviceEvent* evList, int32_t nEvents) { - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { stream = -1; } if (stream == -1) { @@ -490,7 +486,7 @@ size_t GPUReconstructionCUDA::GPUMemCpy(void* dst, const void* src, size_t size, if (ev) { GPUChkErr(cudaEventRecord(ev->get(), 
mInternals->Streams[stream == -1 ? 0 : stream])); } - if (mProcessingSettings.serializeGPU & 2) { + if (GetProcessingSettings().serializeGPU & 2) { GPUDebug(("GPUMemCpy " + std::to_string(toGPU)).c_str(), stream, true); } return size; @@ -512,7 +508,7 @@ size_t GPUReconstructionCUDA::WriteToConstantMemory(size_t offset, const void* s if (ev && stream != -1) { GPUChkErr(cudaEventRecord(ev->get(), mInternals->Streams[stream])); } - if (mProcessingSettings.serializeGPU & 2) { + if (GetProcessingSettings().serializeGPU & 2) { GPUDebug("WriteToConstantMemory", stream, true); } return size; @@ -521,7 +517,7 @@ size_t GPUReconstructionCUDA::WriteToConstantMemory(size_t offset, const void* s void GPUReconstructionCUDA::ReleaseEvent(deviceEvent ev) {} void GPUReconstructionCUDA::RecordMarker(deviceEvent* ev, int32_t stream) { GPUChkErr(cudaEventRecord(ev->get(), mInternals->Streams[stream])); } -std::unique_ptr GPUReconstructionCUDA::GetThreadContext() +std::unique_ptr GPUReconstructionCUDA::GetThreadContext() { GPUChkErr(cudaSetDevice(mDeviceId)); return GPUReconstructionProcessing::GetThreadContext(); @@ -565,14 +561,14 @@ int32_t GPUReconstructionCUDA::GPUDebug(const char* state, int32_t stream, bool GPUError("CUDA Error %s while running (%s) (Stream %d)", cudaGetErrorString(cuErr), state, stream); return (1); } - if (!force && mProcessingSettings.debugLevel <= 0) { + if (!force && GetProcessingSettings().debugLevel <= 0) { return (0); } if (GPUChkErrI(stream == -1 ? 
cudaDeviceSynchronize() : cudaStreamSynchronize(mInternals->Streams[stream]))) { GPUError("CUDA Error while synchronizing (%s) (Stream %d)", state, stream); return (1); } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("GPU Sync Done"); } return (0); @@ -580,7 +576,7 @@ int32_t GPUReconstructionCUDA::GPUDebug(const char* state, int32_t stream, bool int32_t GPUReconstructionCUDA::registerMemoryForGPU_internal(const void* ptr, size_t size) { - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Registering %zu bytes of memory for GPU", size); } return GPUChkErrI(cudaHostRegister((void*)ptr, size, cudaHostRegisterDefault)); @@ -591,7 +587,7 @@ int32_t GPUReconstructionCUDA::unregisterMemoryForGPU_internal(const void* ptr) return GPUChkErrI(cudaHostUnregister((void*)ptr)); } -void GPUReconstructionCUDABackend::PrintKernelOccupancies() +void GPUReconstructionCUDA::PrintKernelOccupancies() { int32_t maxBlocks = 0, threads = 0, suggestedBlocks = 0, nRegs = 0, sMem = 0; GPUChkErr(cudaSetDevice(mDeviceId)); @@ -612,7 +608,7 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel) GPUFatal("kernel numbers out of sync"); \ } \ mInternals->kernelFunctions.emplace_back(new CUfunction); \ - if (mProcessingSettings.debugLevel >= 3) { \ + if (GetProcessingSettings().debugLevel >= 3) { \ GPUInfo("Loading kernel %s (j = %u)", GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), j); \ } \ GPUChkErr(cuModuleGetFunction(mInternals->kernelFunctions.back().get(), *mInternals->kernelModules[perKernel ? 
j : 0], GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))))); \ @@ -680,8 +676,3 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options #endif // ORT_ROCM_BUILD } #endif // __HIPCC__ - -namespace o2::gpu -{ -template class GPUReconstructionKernels; -} diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index 2fc4d14bba491..ed75100dfe351 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -16,6 +16,7 @@ #define GPURECONSTRUCTIONCUDA_H #include "GPUReconstructionDeviceBase.h" +#include "GPUCommonAlgorithm.h" #include #include @@ -34,38 +35,31 @@ namespace o2::gpu { struct GPUReconstructionCUDAInternals; -class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase +class GPUReconstructionCUDA : public GPUReconstructionProcessing::KernelInterface { public: - ~GPUReconstructionCUDABackend() override; - - protected: - GPUReconstructionCUDABackend(const GPUSettingsDeviceBackend& cfg); + GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg); + ~GPUReconstructionCUDA() override; void PrintKernelOccupancies() override; virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override; template void runKernelBackend(const krnlSetupArgs& args); - template - void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); template friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp); + + protected: GPUReconstructionCUDAInternals* mInternals; -}; -class GPUReconstructionCUDA : public GPUReconstructionKernels -{ - public: - ~GPUReconstructionCUDA() override; - GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg); + template + void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... 
args); - protected: int32_t InitDevice_Runtime() override; int32_t ExitDevice_Runtime() override; - std::unique_ptr GetThreadContext() override; + std::unique_ptr GetThreadContext() override; void SynchronizeGPU() override; int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false) override; void SynchronizeStream(int32_t stream) override; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index abcd47ca01c90..5706f32e73e96 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -37,8 +37,8 @@ QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_no_fast_math); int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) { std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") + - std::string(mProcessingSettings.rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") + - GPUParamRTC::generateRTCCode(param(), mProcessingSettings.rtc.optConstexpr); + std::string(GetProcessingSettings().rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") + + GPUParamRTC::generateRTCCode(param(), GetProcessingSettings().rtc.optConstexpr); if (filename == "") { filename = "/tmp/o2cagpu_rtc_"; } @@ -53,12 +53,12 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) kernelsall += kernels[i] + "\n"; } - std::string baseCommand = (mProcessingSettings.rtctech.prependCommand != "" ? (mProcessingSettings.rtctech.prependCommand + " ") : ""); + std::string baseCommand = (GetProcessingSettings().rtctech.prependCommand != "" ? (GetProcessingSettings().rtctech.prependCommand + " ") : ""); baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? 
std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len)); - baseCommand += std::string(" ") + (mProcessingSettings.rtctech.overrideArchitecture != "" ? mProcessingSettings.rtctech.overrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len)); + baseCommand += std::string(" ") + (GetProcessingSettings().rtctech.overrideArchitecture != "" ? GetProcessingSettings().rtctech.overrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len)); - if (mProcessingSettings.rtctech.loadLaunchBoundsFromFile.size()) { - FILE* fp = fopen(mProcessingSettings.rtctech.loadLaunchBoundsFromFile.c_str(), "rb"); + if (GetProcessingSettings().rtctech.loadLaunchBoundsFromFile.size()) { + FILE* fp = fopen(GetProcessingSettings().rtctech.loadLaunchBoundsFromFile.c_str(), "rb"); if (fp == nullptr) { throw std::runtime_error("Cannot open launch bounds parameter module file"); } @@ -75,12 +75,12 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true) + "#define GPUCA_WARP_SIZE " + std::to_string(mWarpSize) + "\n"; - if (mProcessingSettings.rtctech.printLaunchBounds || mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().rtctech.printLaunchBounds || GetProcessingSettings().debugLevel >= 3) { GPUInfo("RTC Launch Bounds:\n%s", launchBounds.c_str()); } char shasource[21], shaparam[21], shacmd[21], shakernels[21], shabounds[21]; - if (mProcessingSettings.rtc.cacheOutput) { + if (GetProcessingSettings().rtc.cacheOutput) { o2::framework::internal::SHA1(shasource, _binary_GPUReconstructionCUDArtc_src_start, _binary_GPUReconstructionCUDArtc_src_len); o2::framework::internal::SHA1(shaparam, rtcparam.c_str(), 
rtcparam.size()); o2::framework::internal::SHA1(shacmd, baseCommand.c_str(), baseCommand.size()); @@ -88,16 +88,16 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) o2::framework::internal::SHA1(shabounds, launchBounds.c_str(), launchBounds.size()); } - nCompile = mProcessingSettings.rtc.compilePerKernel ? kernels.size() : 1; + nCompile = GetProcessingSettings().rtc.compilePerKernel ? kernels.size() : 1; bool cacheLoaded = false; int32_t fd = 0; - if (mProcessingSettings.rtc.cacheOutput) { - if (mProcessingSettings.rtctech.cacheFolder != ".") { - std::filesystem::create_directories(mProcessingSettings.rtctech.cacheFolder); + if (GetProcessingSettings().rtc.cacheOutput) { + if (GetProcessingSettings().rtctech.cacheFolder != ".") { + std::filesystem::create_directories(GetProcessingSettings().rtctech.cacheFolder); } - if (mProcessingSettings.rtctech.cacheMutex) { + if (GetProcessingSettings().rtctech.cacheMutex) { mode_t mask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; - fd = open((mProcessingSettings.rtctech.cacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask); + fd = open((GetProcessingSettings().rtctech.cacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask); if (fd == -1) { throw std::runtime_error("Error opening rtc cache mutex lock file"); } @@ -107,7 +107,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } } - FILE* fp = fopen((mProcessingSettings.rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "rb"); + FILE* fp = fopen((GetProcessingSettings().rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "rb"); char sharead[20]; if (fp) { size_t len; @@ -116,7 +116,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) if (fread(sharead, 1, 20, fp) != 20) { throw std::runtime_error("Cache file corrupt"); } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { char 
shaprint1[41], shaprint2[41]; for (uint32_t i = 0; i < 20; i++) { sprintf(shaprint1 + 2 * i, "%02X ", shacmp[i]); @@ -124,7 +124,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } GPUInfo("SHA for %s: expected %s, read %s", name, shaprint1, shaprint2); } - if (!mProcessingSettings.rtctech.ignoreCacheValid && memcmp(sharead, shacmp, 20)) { + if (!GetProcessingSettings().rtctech.ignoreCacheValid && memcmp(sharead, shacmp, 20)) { GPUInfo("Cache file content outdated (%s)", name); return 1; } @@ -142,7 +142,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) if (fread(&cachedSettings, sizeof(cachedSettings), 1, fp) != 1) { throw std::runtime_error("Cache file corrupt"); } - if (!mProcessingSettings.rtctech.ignoreCacheValid && !(cachedSettings == mProcessingSettings.rtc)) { + if (!GetProcessingSettings().rtctech.ignoreCacheValid && !(cachedSettings == GetProcessingSettings().rtc)) { GPUInfo("Cache file content outdated (rtc parameters)"); break; } @@ -172,13 +172,13 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } } if (!cacheLoaded) { - if (mProcessingSettings.debugLevel >= 0) { + if (GetProcessingSettings().debugLevel >= 0) { GPUInfo("Starting CUDA RTC Compilation"); } HighResTimer rtcTimer; rtcTimer.ResetStart(); tbb::parallel_for(0, nCompile, [&](auto i) { - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { printf("Compiling %s\n", (filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str()); } FILE* fp = fopen((filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str(), "w+b"); @@ -187,10 +187,10 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } std::string kernel = "extern \"C\" {"; - kernel += mProcessingSettings.rtc.compilePerKernel ? kernels[i] : kernelsall; + kernel += GetProcessingSettings().rtc.compilePerKernel ? 
kernels[i] : kernelsall; kernel += "}"; - bool deterministic = mProcessingSettings.rtc.deterministic || (mProcessingSettings.rtc.compilePerKernel && o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end()); + bool deterministic = GetProcessingSettings().rtc.deterministic || (GetProcessingSettings().rtc.compilePerKernel && o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end()); const std::string deterministicStr = std::string(deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n"); if (fwrite(deterministicStr.c_str(), 1, deterministicStr.size(), fp) != deterministicStr.size() || @@ -206,26 +206,26 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) command += std::string(" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len); } command += " -c " + filename + "_" + std::to_string(i) + mRtcSrcExtension + " -o " + filename + "_" + std::to_string(i) + mRtcBinExtension; - if (mProcessingSettings.debugLevel < 0) { + if (GetProcessingSettings().debugLevel < 0) { command += " &> /dev/null"; - } else if (mProcessingSettings.debugLevel < 2) { + } else if (GetProcessingSettings().debugLevel < 2) { command += " > /dev/null"; } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { printf("Running command %s\n", command.c_str()); } if (system(command.c_str())) { - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { printf("Source code file: %s", filename.c_str()); } throw std::runtime_error("Error during CUDA compilation"); } // clang-format off }, tbb::simple_partitioner()); // clang-format on - if (mProcessingSettings.debugLevel >= 0) { + if (GetProcessingSettings().debugLevel >= 0) { GPUInfo("RTC Compilation 
finished (%f seconds)", rtcTimer.GetCurrentElapsedTime()); } - if (mProcessingSettings.rtc.cacheOutput) { - FILE* fp = fopen((mProcessingSettings.rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "w+b"); + if (GetProcessingSettings().rtc.cacheOutput) { + FILE* fp = fopen((GetProcessingSettings().rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "w+b"); if (fp == nullptr) { throw std::runtime_error("Cannot open cache file for writing"); } @@ -236,7 +236,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) fwrite(shacmd, 1, 20, fp) != 20 || fwrite(shakernels, 1, 20, fp) != 20 || fwrite(shabounds, 1, 20, fp) != 20 || - fwrite(&mProcessingSettings.rtc, sizeof(mProcessingSettings.rtc), 1, fp) != 1) { + fwrite(&GetProcessingSettings().rtc, sizeof(GetProcessingSettings().rtc), 1, fp) != 1) { throw std::runtime_error("Error writing cache file"); } @@ -263,7 +263,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) fclose(fp); } } - if (mProcessingSettings.rtc.cacheOutput && mProcessingSettings.rtctech.cacheMutex) { + if (GetProcessingSettings().rtc.cacheOutput && GetProcessingSettings().rtctech.cacheMutex) { if (lockf(fd, F_ULOCK, 0)) { throw std::runtime_error("Error unlocking RTC cache mutex file"); } diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h index f3fc21243ef0e..0813c9d22ea09 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h @@ -44,7 +44,7 @@ struct GPUReconstructionCUDAInternals { class GPUDebugTiming { public: - GPUDebugTiming(bool d, gpu_reconstruction_kernels::deviceEvent* t, cudaStream_t* s, const gpu_reconstruction_kernels::krnlSetupTime& x, GPUReconstructionCUDABackend* r) : mDeviceTimers(t), mStreams(s), mXYZ(x), mRec(r), mDo(d) + GPUDebugTiming(bool d, GPUReconstructionProcessing::deviceEvent* t, cudaStream_t* s, const 
GPUReconstructionProcessing::krnlSetupTime& x, GPUReconstructionCUDA* r) : mDeviceTimers(t), mStreams(s), mXYZ(x), mRec(r), mDo(d) { if (mDo) { if (mDeviceTimers) { @@ -71,10 +71,10 @@ class GPUDebugTiming } private: - gpu_reconstruction_kernels::deviceEvent* mDeviceTimers; + GPUReconstructionProcessing::deviceEvent* mDeviceTimers; cudaStream_t* mStreams; - const gpu_reconstruction_kernels::krnlSetupTime& mXYZ; - GPUReconstructionCUDABackend* mRec; + const GPUReconstructionProcessing::krnlSetupTime& mXYZ; + GPUReconstructionCUDA* mRec; HighResTimer mTimer; bool mDo; }; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index d53f7cbd81ca9..11a62bcec2318 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -22,21 +22,17 @@ using namespace o2::gpu; #include "GPUReconstructionIncludesDeviceAll.h" +#include "GPUReconstructionCUDAKernelsSpecialize.inc" + #if defined(__HIPCC__) && defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM) __global__ void gGPUConstantMemBuffer_dummy(int32_t* p) { *p = *(int32_t*)&gGPUConstantMemBuffer; } #endif -template <> -inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) -{ - GPUChkErr(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream])); -} - template -inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +inline void GPUReconstructionCUDA::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... 
args) { #if !defined(GPUCA_KERNEL_COMPILE_MODE) || GPUCA_KERNEL_COMPILE_MODE != 1 - if (!mProcessingSettings.rtc.enable) { + if (!GetProcessingSettings().rtc.enable) { backendInternal::runKernelBackendMacro(_xyz, this, args...); } else #endif @@ -56,7 +52,7 @@ inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSet } template -void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs& args) +void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& args) { auto& x = args.s.x; auto& z = args.s.z; @@ -66,7 +62,7 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs 0, (deviceEvent*)mDebugEvents, mInternals->Streams, args.s, this); + GPUDebugTiming timer(GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0, (deviceEvent*)mDebugEvents, mInternals->Streams, args.s, this); std::apply([this, &args](auto&... vals) { this->runKernelBackendInternal(args.s, vals...); }, args.v); } GPUChkErr(cudaGetLastError()); @@ -79,7 +75,7 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs(const krnlSetupArgs& args); +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& args); #else // ---------- COMPILE_MODE = onefile | rdc ---------- #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2 #define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc @@ -87,7 +83,7 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs(const krnlSetupArgs& args); + template void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& args); #ifndef __HIPCC__ // CUDA version #define GPUCA_KRNL_CALL(x_class, ...) 
\ diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc new file mode 100644 index 0000000000000..899c2e240cd94 --- /dev/null +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc @@ -0,0 +1,138 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionCUDAKernelsSpecialize.inc +/// \author David Rohr + +#if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) + +namespace o2::gpu::internal +{ +namespace // anonymous +{ +struct MergeBorderTracks_compMax { + GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) + { + return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax); + } +}; +struct MergeBorderTracks_compMin { + GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) + { + return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? 
(a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin); + } +}; + +struct GPUTPCGMMergerSortTracks_comp { + const GPUTPCGMMergedTrack* const mCmp; + GPUhd() GPUTPCGMMergerSortTracks_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {} + GPUd() bool operator()(const int32_t aa, const int32_t bb) + { + const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; + const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; + if (a.CCE() != b.CCE()) { + return a.CCE() > b.CCE(); + } + if (a.Legs() != b.Legs()) { + return a.Legs() > b.Legs(); + } + GPUCA_DETERMINISTIC_CODE( // clang-format off + if (a.NClusters() != b.NClusters()) { + return a.NClusters() > b.NClusters(); + } if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { + return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); + } if (a.GetParam().GetY() != b.GetParam().GetY()) { + return a.GetParam().GetY() > b.GetParam().GetY(); + } + return aa > bb; + , // !GPUCA_DETERMINISTIC_CODE + return a.NClusters() > b.NClusters(); + ) // clang-format on + } +}; + +struct GPUTPCGMMergerSortTracksQPt_comp { + const GPUTPCGMMergedTrack* const mCmp; + GPUhd() GPUTPCGMMergerSortTracksQPt_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {} + GPUd() bool operator()(const int32_t aa, const int32_t bb) + { + const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; + const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; + GPUCA_DETERMINISTIC_CODE( // clang-format off + if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { + return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); + } if (a.GetParam().GetY() != b.GetParam().GetY()) { + return a.GetParam().GetY() > b.GetParam().GetY(); + } + return a.GetParam().GetZ() > b.GetParam().GetZ(); + , // !GPUCA_DETERMINISTIC_CODE + return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); + ) // clang-format on + } +}; + +struct GPUTPCGMMergerMergeLoopers_comp { + GPUd() bool operator()(const 
MergeLooperParam& a, const MergeLooperParam& b) + { + return CAMath::Abs(a.refz) < CAMath::Abs(b.refz); + } +}; + +struct GPUTPCGMO2OutputSort_comp { + GPUd() bool operator()(const GPUTPCGMMerger::tmpSort& a, const GPUTPCGMMerger::tmpSort& b) + { + return (a.y > b.y); + } +}; + +} // anonymous namespace +} // namespace o2::gpu::internal + +template <> +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax) +{ + if (cmpMax) { + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMax()); + } else { + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMin()); + } +} + +template <> +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +{ + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); +} + +template <> +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +{ + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); +} + +template <> +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +{ + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.LooperCandidates(), processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp()); +} + +template <> +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +{ + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, 
mProcessorsShadow->tpcMerger.TrackSortO2(), processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp()); +} +#endif // GPUCA_SPECIALIZE_THRUST_SORTS + +template <> +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +{ + GPUChkErr(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream])); +} diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index d7adb222d547b..d29a6afb60899 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -24,7 +24,7 @@ message(STATUS "Building GPUTracking with HIP support ${TMP_TARGET}") if(NOT DEFINED GPUCA_HIP_HIPIFY_FROM_CUDA OR "${GPUCA_HIP_HIPIFY_FROM_CUDA}") set(GPUCA_HIP_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/hipify) file(MAKE_DIRECTORY ${GPUCA_HIP_SOURCE_DIR}) - set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu GPUReconstructionCUDARTCCalls.cu) + set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDAKernelsSpecialize.inc GPUReconstructionCUDArtc.cu GPUReconstructionCUDARTCCalls.cu) set(GPUCA_HIP_LOCAL_FILE_LIST GPUReconstructionHIPIncludesSystem.h) set(HIP_SOURCES "") foreach(file ${GPUCA_HIP_FILE_LIST}) diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx index e276f83413bbc..28c809dd4a09a 100644 
--- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx @@ -13,6 +13,7 @@ /// \author David Rohr #include "GPUReconstructionOCLIncludesHost.h" +#include "GPUReconstructionProcessingKernels.inc" #include "GPUDefParametersLoad.inc" #include @@ -33,16 +34,16 @@ QGET_LD_BINARY_SYMBOLS(GPUReconstructionOCLCode_spirv); GPUReconstruction* GPUReconstruction_Create_OCL(const GPUSettingsDeviceBackend& cfg) { return new GPUReconstructionOCL(cfg); } -GPUReconstructionOCLBackend::GPUReconstructionOCLBackend(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionDeviceBase(cfg, sizeof(GPUReconstructionDeviceBase)) +GPUReconstructionOCL::GPUReconstructionOCL(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing::KernelInterface(cfg, sizeof(GPUReconstructionDeviceBase)) { if (mMaster == nullptr) { mInternals = new GPUReconstructionOCLInternals; *mParDevice = o2::gpu::internal::GPUDefParametersLoad(); } - mDeviceBackendSettings.deviceType = DeviceType::OCL; + mDeviceBackendSettings->deviceType = DeviceType::OCL; } -GPUReconstructionOCLBackend::~GPUReconstructionOCLBackend() +GPUReconstructionOCL::~GPUReconstructionOCL() { Exit(); // Make sure we destroy everything (in particular the ITS tracker) before we exit if (mMaster == nullptr) { @@ -51,7 +52,7 @@ GPUReconstructionOCLBackend::~GPUReconstructionOCLBackend() } static_assert(sizeof(cl_int) <= sizeof(int64_t) && CL_SUCCESS == 0); -int32_t GPUReconstructionOCLBackend::GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const +int32_t GPUReconstructionOCL::GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const { // Check for OPENCL Error and in the case of an error display the corresponding error string if (error != CL_SUCCESS) { @@ -60,7 +61,7 @@ int32_t GPUReconstructionOCLBackend::GPUChkErrInternal(const int64_t error, cons return error != CL_SUCCESS; } -int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() 
+int32_t GPUReconstructionOCL::InitDevice_Runtime() { if (mMaster == nullptr) { cl_int ocl_error; @@ -71,7 +72,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() if (num_platforms == 0) { GPUErrorReturn("No OpenCL Platform found"); } - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("%d OpenCL Platforms found", num_platforms); } @@ -118,17 +119,17 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() cl_uint deviceCount, bestDevice = (cl_uint)-1, bestPlatform = (cl_uint)-1; for (uint32_t iPlatform = 0; iPlatform < num_platforms; iPlatform++) { - if (mProcessingSettings.oclPlatformNum >= 0) { - if (mProcessingSettings.oclPlatformNum >= (int32_t)num_platforms) { + if (GetProcessingSettings().oclPlatformNum >= 0) { + if (GetProcessingSettings().oclPlatformNum >= (int32_t)num_platforms) { GPUErrorReturn("Invalid platform specified"); } - iPlatform = mProcessingSettings.oclPlatformNum; + iPlatform = GetProcessingSettings().oclPlatformNum; } std::string platformUsageInfo; bool platformCompatible = false; queryPlatform(platforms[iPlatform]); if (clGetDeviceIDs(platforms[iPlatform], CL_DEVICE_TYPE_ALL, 0, nullptr, &deviceCount) != CL_SUCCESS) { - if (mProcessingSettings.oclPlatformNum >= 0) { + if (GetProcessingSettings().oclPlatformNum >= 0) { GPUErrorReturn("No device in requested platform or error obtaining device count"); } platformUsageInfo += " - no devices"; @@ -139,32 +140,32 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() } } - if (mProcessingSettings.oclPlatformNum >= 0 || mProcessingSettings.debugLevel >= 2) { - GPUInfo("%s Platform %d: (%s %s) %s %s (Compatible: %s)%s", mProcessingSettings.oclPlatformNum >= 0 ? "Enforced" : "Available", iPlatform, platform_profile.c_str(), platform_version.c_str(), platform_vendor.c_str(), platform_name.c_str(), platformCompatible ? "yes" : "no", mProcessingSettings.debugLevel >= 2 ? 
platformUsageInfo.c_str() : ""); + if (GetProcessingSettings().oclPlatformNum >= 0 || GetProcessingSettings().debugLevel >= 2) { + GPUInfo("%s Platform %d: (%s %s) %s %s (Compatible: %s)%s", GetProcessingSettings().oclPlatformNum >= 0 ? "Enforced" : "Available", iPlatform, platform_profile.c_str(), platform_version.c_str(), platform_vendor.c_str(), platform_name.c_str(), platformCompatible ? "yes" : "no", GetProcessingSettings().debugLevel >= 2 ? platformUsageInfo.c_str() : ""); } - if (platformCompatible || mProcessingSettings.oclPlatformNum >= 0 || (mProcessingSettings.oclPlatformNum == -2 && deviceCount)) { + if (platformCompatible || GetProcessingSettings().oclPlatformNum >= 0 || (GetProcessingSettings().oclPlatformNum == -2 && deviceCount)) { if (deviceCount > devices.size()) { devices.resize(deviceCount); } if (clGetDeviceIDs(platforms[iPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices.data(), nullptr) != CL_SUCCESS) { - if (mProcessingSettings.oclPlatformNum >= 0) { + if (GetProcessingSettings().oclPlatformNum >= 0) { GPUErrorReturn("Error getting OpenCL devices"); } continue; } for (uint32_t i = 0; i < deviceCount; i++) { - if (mProcessingSettings.deviceNum >= 0) { - if (mProcessingSettings.deviceNum >= (signed)deviceCount) { - GPUErrorReturn("Requested device ID %d does not exist", mProcessingSettings.deviceNum); + if (GetProcessingSettings().deviceNum >= 0) { + if (GetProcessingSettings().deviceNum >= (signed)deviceCount) { + GPUErrorReturn("Requested device ID %d does not exist", GetProcessingSettings().deviceNum); } - i = mProcessingSettings.deviceNum; + i = GetProcessingSettings().deviceNum; } bool deviceOK = true; queryDevice(devices[i]); std::string deviceFailure; - if (mProcessingSettings.gpuDeviceOnly && ((device_type & CL_DEVICE_TYPE_CPU) || !(device_type & CL_DEVICE_TYPE_GPU))) { + if (GetProcessingSettings().gpuDeviceOnly && ((device_type & CL_DEVICE_TYPE_CPU) || !(device_type & CL_DEVICE_TYPE_GPU))) { deviceOK = false; deviceFailure += " - No 
GPU device"; } @@ -193,12 +194,12 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() } double bestDeviceSpeed = -1, deviceSpeed = (double)device_freq * (double)device_shaders; - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo(" Device %s%2d: %s %s (Frequency %d, Shaders %d, %d bit) (Speed Value: %ld)%s %s", deviceOK ? " " : "[", i, device_vendor.c_str(), device_name.c_str(), (int32_t)device_freq, (int32_t)device_shaders, (int32_t)device_nbits, (int64_t)deviceSpeed, deviceOK ? " " : " ]", deviceOK ? "" : deviceFailure.c_str()); } if (!deviceOK) { - if (mProcessingSettings.deviceNum >= 0) { - GPUInfo("Unsupported device requested on platform %d: (%d)", iPlatform, mProcessingSettings.deviceNum); + if (GetProcessingSettings().deviceNum >= 0) { + GPUInfo("Unsupported device requested on platform %d: (%d)", iPlatform, GetProcessingSettings().deviceNum); break; } continue; @@ -209,12 +210,12 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() bestDeviceSpeed = deviceSpeed; mOclVersion = platform_version_f; } - if (mProcessingSettings.deviceNum >= 0) { + if (GetProcessingSettings().deviceNum >= 0) { break; } } } - if (mProcessingSettings.oclPlatformNum >= 0) { + if (GetProcessingSettings().oclPlatformNum >= 0) { break; } } @@ -238,7 +239,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() deviceVersion = query(clGetDeviceInfo, mInternals->device, CL_DEVICE_VERSION); int versionMajor, versionMinor; sscanf(deviceVersion.c_str(), "OpenCL %d.%d", &versionMajor, &versionMinor); - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Using OpenCL platform %d / device %d: %s %s with properties:", bestPlatform, bestDevice, device_vendor.c_str(), device_name.c_str()); GPUInfo("\tVersion = %s", deviceVersion); GPUInfo("\tFrequency = %d", (int32_t)device_freq); @@ -271,7 +272,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() return 1; } - if 
(mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("OpenCL program and kernels loaded successfully"); } @@ -289,21 +290,21 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() } if (device_type & CL_DEVICE_TYPE_CPU) { - if (mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel >= 2) { GPUInfo("Disabling device timers for CPU device"); } - mProcessingSettings.deviceTimers = 0; + mProcessingSettings->deviceTimers = 0; } for (int32_t i = 0; i < mNStreams; i++) { #ifdef CL_VERSION_2_0 cl_queue_properties prop = 0; - if (versionMajor >= 2 && IsGPU() && mProcessingSettings.deviceTimers) { + if (versionMajor >= 2 && IsGPU() && GetProcessingSettings().deviceTimers) { prop |= CL_QUEUE_PROFILING_ENABLE; } mInternals->command_queue[i] = clCreateCommandQueueWithProperties(mInternals->context, mInternals->device, &prop, &ocl_error); - if (mProcessingSettings.deviceTimers && ocl_error == CL_INVALID_QUEUE_PROPERTIES) { + if (GetProcessingSettings().deviceTimers && ocl_error == CL_INVALID_QUEUE_PROPERTIES) { GPUError("GPU device timers not supported by OpenCL platform, disabling"); - mProcessingSettings.deviceTimers = 0; + mProcessingSettings->deviceTimers = 0; prop = 0; mInternals->command_queue[i] = clCreateCommandQueueWithProperties(mInternals->context, mInternals->device, &prop, &ocl_error); } @@ -351,7 +352,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() GPUErrorReturn("Error obtaining device memory ptr"); } - if (mProcessingSettings.debugLevel >= 2) { + if (GetProcessingSettings().debugLevel >= 2) { GPUInfo("Mapping hostmemory"); } mHostMemoryBase = clEnqueueMapBuffer(mInternals->command_queue[0], mInternals->mem_host, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, mHostMemorySize, 0, nullptr, nullptr, &ocl_error); @@ -362,7 +363,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() mDeviceMemoryBase = 
((void**)mHostMemoryBase)[0]; mDeviceConstantMem = (GPUConstantMem*)((void**)mHostMemoryBase)[1]; - if (mProcessingSettings.debugLevel >= 1) { + if (GetProcessingSettings().debugLevel >= 1) { GPUInfo("Memory ptrs: GPU (%ld bytes): %p - Host (%ld bytes): %p", (int64_t)mDeviceMemorySize, mDeviceMemoryBase, (int64_t)mHostMemorySize, mHostMemoryBase); memset(mHostMemoryBase, 0xDD, mHostMemorySize); } @@ -386,7 +387,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() return (0); } -int32_t GPUReconstructionOCLBackend::ExitDevice_Runtime() +int32_t GPUReconstructionOCL::ExitDevice_Runtime() { // Uninitialize OPENCL SynchronizeGPU(); @@ -418,12 +419,12 @@ int32_t GPUReconstructionOCLBackend::ExitDevice_Runtime() return (0); } -size_t GPUReconstructionOCLBackend::GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev, deviceEvent* evList, int32_t nEvents) +size_t GPUReconstructionOCL::GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev, deviceEvent* evList, int32_t nEvents) { if (evList == nullptr) { nEvents = 0; } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { stream = -1; } if (stream == -1) { @@ -440,33 +441,33 @@ size_t GPUReconstructionOCLBackend::GPUMemCpy(void* dst, const void* src, size_t } else { GPUChkErr(clEnqueueReadBuffer(mInternals->command_queue[stream == -1 ? 
0 : stream], mInternals->mem_gpu, stream == -1, (char*)src - (char*)mDeviceMemoryBase, size, dst, nEvents, evList->getEventList(), ev->getEventList())); } - if (mProcessingSettings.serializeGPU & 2) { + if (GetProcessingSettings().serializeGPU & 2) { GPUDebug(("GPUMemCpy " + std::to_string(toGPU)).c_str(), stream, true); } return size; } -size_t GPUReconstructionOCLBackend::WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream, deviceEvent* ev) +size_t GPUReconstructionOCL::WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream, deviceEvent* ev) { if (stream == -1) { SynchronizeGPU(); } GPUChkErr(clEnqueueWriteBuffer(mInternals->command_queue[stream == -1 ? 0 : stream], mInternals->mem_constant, stream == -1, offset, size, src, 0, nullptr, ev->getEventList())); - if (mProcessingSettings.serializeGPU & 2) { + if (GetProcessingSettings().serializeGPU & 2) { GPUDebug("WriteToConstantMemory", stream, true); } return size; } -void GPUReconstructionOCLBackend::ReleaseEvent(deviceEvent ev) { GPUChkErr(clReleaseEvent(ev.get())); } +void GPUReconstructionOCL::ReleaseEvent(deviceEvent ev) { GPUChkErr(clReleaseEvent(ev.get())); } -void GPUReconstructionOCLBackend::RecordMarker(deviceEvent* ev, int32_t stream) { GPUChkErr(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev->getEventList())); } +void GPUReconstructionOCL::RecordMarker(deviceEvent* ev, int32_t stream) { GPUChkErr(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev->getEventList())); } -int32_t GPUReconstructionOCLBackend::DoStuckProtection(int32_t stream, deviceEvent event) +int32_t GPUReconstructionOCL::DoStuckProtection(int32_t stream, deviceEvent event) { - if (mProcessingSettings.stuckProtection) { + if (GetProcessingSettings().stuckProtection) { cl_int tmp = 0; - for (int32_t i = 0; i <= mProcessingSettings.stuckProtection / 50; i++) { + for (int32_t i = 0; i <= GetProcessingSettings().stuckProtection 
/ 50; i++) { usleep(50); clGetEventInfo(event.get(), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(tmp), &tmp, nullptr); if (tmp == CL_COMPLETE) { @@ -483,25 +484,25 @@ int32_t GPUReconstructionOCLBackend::DoStuckProtection(int32_t stream, deviceEve return 0; } -void GPUReconstructionOCLBackend::SynchronizeGPU() +void GPUReconstructionOCL::SynchronizeGPU() { for (int32_t i = 0; i < mNStreams; i++) { GPUChkErr(clFinish(mInternals->command_queue[i])); } } -void GPUReconstructionOCLBackend::SynchronizeStream(int32_t stream) { GPUChkErr(clFinish(mInternals->command_queue[stream])); } +void GPUReconstructionOCL::SynchronizeStream(int32_t stream) { GPUChkErr(clFinish(mInternals->command_queue[stream])); } -void GPUReconstructionOCLBackend::SynchronizeEvents(deviceEvent* evList, int32_t nEvents) { GPUChkErr(clWaitForEvents(nEvents, evList->getEventList())); } +void GPUReconstructionOCL::SynchronizeEvents(deviceEvent* evList, int32_t nEvents) { GPUChkErr(clWaitForEvents(nEvents, evList->getEventList())); } -void GPUReconstructionOCLBackend::StreamWaitForEvents(int32_t stream, deviceEvent* evList, int32_t nEvents) +void GPUReconstructionOCL::StreamWaitForEvents(int32_t stream, deviceEvent* evList, int32_t nEvents) { if (nEvents) { GPUChkErr(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], nEvents, evList->getEventList(), nullptr)); } } -bool GPUReconstructionOCLBackend::IsEventDone(deviceEvent* evList, int32_t nEvents) +bool GPUReconstructionOCL::IsEventDone(deviceEvent* evList, int32_t nEvents) { cl_int eventdone; for (int32_t i = 0; i < nEvents; i++) { @@ -513,10 +514,10 @@ bool GPUReconstructionOCLBackend::IsEventDone(deviceEvent* evList, int32_t nEven return true; } -int32_t GPUReconstructionOCLBackend::GPUDebug(const char* state, int32_t stream, bool force) +int32_t GPUReconstructionOCL::GPUDebug(const char* state, int32_t stream, bool force) { // Wait for OPENCL-Kernel to finish and check for OPENCL errors afterwards, in case of debugmode - if (!force && 
mProcessingSettings.debugLevel <= 0) { + if (!force && GetProcessingSettings().debugLevel <= 0) { return (0); } for (int32_t i = 0; i < mNStreams; i++) { @@ -524,13 +525,13 @@ int32_t GPUReconstructionOCLBackend::GPUDebug(const char* state, int32_t stream, GPUError("OpenCL Error while synchronizing (%s) (Stream %d/%d)", state, stream, i); } } - if (mProcessingSettings.debugLevel >= 3) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("GPU Sync Done"); } return (0); } -int32_t GPUReconstructionOCLBackend::GetOCLPrograms() +int32_t GPUReconstructionOCL::GetOCLPrograms() { cl_int ocl_error; @@ -571,7 +572,7 @@ int32_t GPUReconstructionOCLBackend::GetOCLPrograms() return AddKernels(); } -const char* GPUReconstructionOCLBackend::convertErrorToString(int32_t errorcode) +const char* GPUReconstructionOCL::convertErrorToString(int32_t errorcode) { static const std::map error_map = { {CL_SUCCESS, "CL_SUCCESS"}, diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h index abde42f01f073..091bc0409630d 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h @@ -27,14 +27,16 @@ namespace o2::gpu { struct GPUReconstructionOCLInternals; -class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase +class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface { public: - ~GPUReconstructionOCLBackend() override; + GPUReconstructionOCL(const GPUSettingsDeviceBackend& cfg); + ~GPUReconstructionOCL() override; - protected: - GPUReconstructionOCLBackend(const GPUSettingsDeviceBackend& cfg); + template + void runKernelBackend(const krnlSetupArgs& args); + protected: int32_t InitDevice_Runtime() override; int32_t ExitDevice_Runtime() override; @@ -61,8 +63,6 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase GPUReconstructionOCLInternals* mInternals; float mOclVersion; - template - void 
runKernelBackend(const krnlSetupArgs& args); template S& getKernelObject(); @@ -78,7 +78,6 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase int32_t AddKernels(); }; -using GPUReconstructionOCL = GPUReconstructionKernels; } // namespace o2::gpu #endif diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h index 0bb2f25093789..919791948d6c3 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h @@ -54,7 +54,7 @@ struct GPUReconstructionOCLInternals { } // namespace o2::gpu template -inline int64_t GPUReconstructionOCLBackend::OCLsetKernelParameters_helper(cl_kernel& kernel, int32_t i, const T& firstParameter, const Args&... restOfParameters) +inline int64_t GPUReconstructionOCL::OCLsetKernelParameters_helper(cl_kernel& kernel, int32_t i, const T& firstParameter, const Args&... restOfParameters) { int64_t retVal = clSetKernelArg(kernel, i, sizeof(T), &firstParameter); if (retVal) { @@ -67,12 +67,12 @@ inline int64_t GPUReconstructionOCLBackend::OCLsetKernelParameters_helper(cl_ker } template -inline int64_t GPUReconstructionOCLBackend::OCLsetKernelParameters(cl_kernel& kernel, const Args&... args) +inline int64_t GPUReconstructionOCL::OCLsetKernelParameters(cl_kernel& kernel, const Args&... args) { return OCLsetKernelParameters_helper(kernel, 0, args...); } -inline int64_t GPUReconstructionOCLBackend::clExecuteKernelA(cl_command_queue queue, cl_kernel krnl, size_t local_size, size_t global_size, cl_event* pEvent, cl_event* wait, cl_int nWaitEvents) +inline int64_t GPUReconstructionOCL::clExecuteKernelA(cl_command_queue queue, cl_kernel krnl, size_t local_size, size_t global_size, cl_event* pEvent, cl_event* wait, cl_int nWaitEvents) { return clEnqueueNDRangeKernel(queue, krnl, 1, nullptr, &global_size, &local_size, wait == nullptr ? 
0 : nWaitEvents, wait, pEvent); } diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx index cca634fba65fc..72c68428149dd 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx @@ -13,16 +13,12 @@ /// \author David Rohr #include "GPUReconstructionOCLIncludesHost.h" +#include "GPUReconstructionKernelIncludes.h" -template <> -inline void GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) -{ - cl_int4 val0 = {0, 0, 0, 0}; - GPUChkErr(clEnqueueFillBuffer(mInternals->command_queue[_xyz.x.stream], mInternals->mem_gpu, &val0, sizeof(val0), (char*)ptr - (char*)mDeviceMemoryBase, (size + sizeof(val0) - 1) & ~(sizeof(val0) - 1), _xyz.z.evList == nullptr ? 0 : _xyz.z.nEvents, _xyz.z.evList->getEventList(), _xyz.z.ev->getEventList())); -} +#include "GPUReconstructionOCLKernelsSpecialize.inc" template -inline void GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... 
args) { cl_kernel k = getKernelObject(); auto& x = _xyz.x; @@ -33,14 +29,14 @@ inline void GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetu cl_event ev; cl_event* evr; bool tmpEvent = false; - if (z.ev == nullptr && mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0) { + if (z.ev == nullptr && GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0) { evr = &ev; tmpEvent = true; } else { evr = (cl_event*)z.ev; } GPUChkErr(clExecuteKernelA(mInternals->command_queue[x.stream], k, x.nThreads, x.nThreads * x.nBlocks, evr, (cl_event*)z.evList, z.nEvents)); - if (mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0) { + if (GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0) { cl_ulong time_start, time_end; GPUChkErr(clWaitForEvents(1, evr)); GPUChkErr(clGetEventProfilingInfo(*evr, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, nullptr)); @@ -53,13 +49,13 @@ inline void GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetu } template -void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs& args) +void GPUReconstructionOCL::runKernelBackend(const krnlSetupArgs& args) { std::apply([this, &args](auto&... vals) { runKernelBackendInternal(args.s, vals...); }, args.v); } template -int32_t GPUReconstructionOCLBackend::AddKernel() +int32_t GPUReconstructionOCL::AddKernel() { std::string name(GetKernelName()); std::string kname("krnl_" + name); @@ -75,12 +71,12 @@ int32_t GPUReconstructionOCLBackend::AddKernel() } template -S& GPUReconstructionOCLBackend::getKernelObject() +S& GPUReconstructionOCL::getKernelObject() { return mInternals->kernels[GetKernelNum()]; } -int32_t GPUReconstructionOCLBackend::AddKernels() +int32_t GPUReconstructionOCL::AddKernels() { #define GPUCA_KRNL(x_class, ...) 
\ if (AddKernel()) { \ @@ -91,6 +87,6 @@ int32_t GPUReconstructionOCLBackend::AddKernels() return 0; } -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs& args); +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionOCL::runKernelBackend(const krnlSetupArgs& args); #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc new file mode 100644 index 0000000000000..1b860e47a4243 --- /dev/null +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc @@ -0,0 +1,20 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionOCLKernelsSpecialize.inc +/// \author David Rohr + +template <> +inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +{ + cl_int4 val0 = {0, 0, 0, 0}; + GPUChkErr(clEnqueueFillBuffer(mInternals->command_queue[_xyz.x.stream], mInternals->mem_gpu, &val0, sizeof(val0), (char*)ptr - (char*)mDeviceMemoryBase, (size + sizeof(val0) - 1) & ~(sizeof(val0) - 1), _xyz.z.evList == nullptr ? 
0 : _xyz.z.nEvents, _xyz.z.evList->getEventList(), _xyz.z.ev->getEventList())); +} diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index e82799b9e59c3..0cd302cc0be94 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -113,7 +113,7 @@ set(HDRS_INSTALL Base/GPUReconstructionIO.h Base/GPUReconstructionIncludesITS.h Base/GPUReconstructionKernelMacros.h - Base/GPUReconstructionKernels.h + Base/GPUReconstructionCPUKernels.h DataCompression/GPUTPCClusterRejection.h DataCompression/GPUTPCCompressionKernels.inc DataCompression/TPCClusterDecompressionCore.inc @@ -127,6 +127,7 @@ set(HDRS_INSTALL DataTypes/GPUTRDDef.h DataTypes/GPUTRDInterfaceO2Track.h DataTypes/GPUTriggerOutputs.h + DataTypes/GPUKernelClassesFwd.h Debug/GPUROOTDump.h Definitions/GPUDefConstantsAndSettings.h Definitions/GPUDefParametersWrapper.h @@ -137,6 +138,7 @@ set(HDRS_INSTALL Definitions/GPULogging.h Definitions/GPUSettingsList.h Global/GPUChainTrackingDefs.h + Global/GPUChainTrackingGetters.inc Global/GPUErrorCodes.h Merger/GPUTPCGMBorderTrack.h Merger/GPUTPCGMMergedTrack.h @@ -217,11 +219,11 @@ set(SRCS_NO_H ${SRCS_NO_H} set(HDRS_INSTALL ${HDRS_INSTALL} ITS/GPUITSTrack.h - TPCClusterFinder/Array2D.h + TPCClusterFinder/CfArray2D.h TPCClusterFinder/CfConsts.h TPCClusterFinder/CfFragment.h TPCClusterFinder/CfUtils.h - TPCClusterFinder/ChargePos.h + TPCClusterFinder/CfChargePos.h Definitions/clusterFinderDefs.h TPCClusterFinder/PackedCharge.h TPCClusterFinder/GPUTPCCFChainContext.h) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx index 8a22545314252..2a0c5b58d8a83 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx @@ -17,6 +17,7 @@ #include "GPUO2DataTypes.h" #include "GPUMemorySizeScalers.h" #include "GPUDefParametersRuntime.h" +#include "GPUConstantMem.h" using namespace o2::gpu; @@ -123,7 
+124,7 @@ void GPUTPCCompression::SetMaxData(const GPUTrackingInOutPointers& io) mMaxClusters = io.clustersNative->nClustersTotal; mMaxClusterFactorBase1024 = mMaxClusters > 100000000 ? mRec->MemoryScalers()->NTPCUnattachedHitsBase1024(mRec->GetParam().rec.tpc.rejectionStrategy) : 1024; mMaxClustersInCache = mMaxClusters * mMaxClusterFactorBase1024 / 1024; - mMaxTrackClusters = mRec->GetConstantMem().tpcMerger.NOutputTrackClusters(); + mMaxTrackClusters = mRec->GetConstantMem().tpcMerger.NOutputTrackClusters(); // TODO: Why is this not using ioPtrs? Could remove GPUConstantMem.h include mMaxTracks = mRec->GetConstantMem().tpcMerger.NOutputTracks(); if (mMaxClusters % 16) { mMaxClusters += 16 - (mMaxClusters % 16); diff --git a/GPU/GPUTracking/DataTypes/GPUKernelClassesFwd.h b/GPU/GPUTracking/DataTypes/GPUKernelClassesFwd.h new file mode 100644 index 0000000000000..405eb339dea3b --- /dev/null +++ b/GPU/GPUTracking/DataTypes/GPUKernelClassesFwd.h @@ -0,0 +1,40 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUKernelClassesFwd.h +/// \author David Rohr + +#ifndef GPUKERNELCLASSESFWDN_H +#define GPUKERNELCLASSESFWDN_H + +#include "GPUTRDDef.h" + +namespace o2::gpu +{ +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) 
class GPUCA_M_FIRST(GPUCA_M_STRIP(x_class)); +#include "GPUReconstructionKernelList.h" +#undef GPUCA_KRNL + +struct GPUTPCClusterOccupancyMapBin; +namespace gputpcgmmergertypes +{ +struct GPUTPCGMBorderRange; +} +struct GPUTPCLinearLabels; +struct CfChargePos; +} // namespace o2::gpu + +namespace o2::tpc +{ +struct ClusterNative; +} // namespace o2::tpc + +#endif diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 1e99e3b73736f..5df324fcba648 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -16,11 +16,14 @@ #define GPUCHAIN_H #include "GPUReconstructionCPU.h" +#include "GPUReconstructionCPUKernels.h" +#include "GPUKernelClassesFwd.h" #include namespace o2::gpu { + class GPUChain { friend class GPUReconstruction; @@ -30,10 +33,10 @@ class GPUChain using GeneralStep = GPUReconstruction::GeneralStep; using InOutPointerType = GPUReconstruction::InOutPointerType; using GeometryType = GPUReconstruction::GeometryType; - using krnlRunRange = gpu_reconstruction_kernels::krnlRunRange; - using krnlExec = gpu_reconstruction_kernels::krnlExec; - using krnlEvent = gpu_reconstruction_kernels::krnlEvent; - using deviceEvent = gpu_reconstruction_kernels::deviceEvent; + using krnlRunRange = GPUReconstructionProcessing::krnlRunRange; + using krnlExec = GPUReconstructionProcessing::krnlExec; + using krnlEvent = GPUReconstructionProcessing::krnlEvent; + using deviceEvent = GPUReconstructionProcessing::deviceEvent; static constexpr krnlRunRange krnlRunRangeNone{0}; static constexpr krnlEvent krnlEventNone = krnlEvent{nullptr, nullptr, 0}; @@ -56,20 +59,20 @@ class GPUChain virtual void DumpSettings(const char* dir = "") {} virtual void ReadSettings(const char* dir = "") {} - const GPUParam& GetParam() const { return mRec->mHostConstantMem->param; } - const GPUSettingsGRP& GetGRPSettings() const { return mRec->mGRPSettings; } - const GPUCalibObjectsConst& calib() const { return processors()->calibObjects; } + 
const GPUParam& GetParam() const { return mRec->GetParam(); } + const GPUSettingsGRP& GetGRPSettings() const { return mRec->GetGRPSettings(); } + const GPUCalibObjectsConst& GetCalib() const { return mRec->GetCalib(); } GPUReconstruction* rec() { return mRec; } const GPUReconstruction* rec() const { return mRec; } - inline const GPUConstantMem* GetProcessors() { return mRec->processors(); } + inline const GPUConstantMem* GetProcessors() const { return mRec->processors(); } // Make functions from GPUReconstruction*** available GPUReconstruction::RecoStepField GetRecoSteps() const { return mRec->GetRecoSteps(); } GPUReconstruction::RecoStepField GetRecoStepsGPU() const { return mRec->GetRecoStepsGPU(); } GPUReconstruction::InOutTypeField GetRecoStepsInputs() const { return mRec->GetRecoStepsInputs(); } GPUReconstruction::InOutTypeField GetRecoStepsOutputs() const { return mRec->GetRecoStepsOutputs(); } - inline const GPUSettingsDeviceBackend& GetDeviceBackendSettings() const { return mRec->mDeviceBackendSettings; } - inline const GPUSettingsProcessing& GetProcessingSettings() const { return mRec->mProcessingSettings; } + inline const GPUSettingsDeviceBackend& GetDeviceBackendSettings() const { return mRec->GetDeviceBackendSettings(); } + inline const GPUSettingsProcessing& GetProcessingSettings() const { return mRec->GetProcessingSettings(); } protected: GPUReconstructionCPU* mRec; @@ -102,7 +105,7 @@ class GPUChain } inline bool IsEventDone(deviceEvent* evList, int32_t nEvents = 1) { return mRec->IsEventDone(evList, nEvents); } inline void RecordMarker(deviceEvent* ev, int32_t stream) { mRec->RecordMarker(ev, stream); } - virtual inline std::unique_ptr GetThreadContext() { return mRec->GetThreadContext(); } + virtual inline std::unique_ptr GetThreadContext() { return mRec->GetThreadContext(); } inline void SynchronizeGPU() { mRec->SynchronizeGPU(); } inline void ReleaseEvent(deviceEvent ev, bool doGPU = true) { @@ -171,13 +174,16 @@ class GPUChain { 
mRec->ReadStructFromFile(file, obj); } + template - inline void runKernel(gpu_reconstruction_kernels::krnlSetup&& setup, Args&&... args) + requires(sizeof(S) >= 0) // Yields better incomplete type errors than calling runKernelCallInterface directly + inline void runKernel(GPUReconstructionProcessing::krnlSetup&& setup, Args const&... args) { - return mRec->runKernel(std::forward(setup), std::forward(args)...); + runKernelCallInterface(std::forward(setup), args...); } + template - gpu_reconstruction_kernels::krnlProperties getKernelProperties() + GPUReconstructionProcessing::krnlProperties getKernelProperties() { return mRec->getKernelProperties(); } @@ -233,6 +239,16 @@ class GPUChain private: template void timeCpy(RecoStep step, int32_t toGPU, S T::*func, Args... args); + +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ + template \ + requires(std::is_same_v && I == S::GPUCA_M_FIRST(GPUCA_M_SHIFT(GPUCA_M_STRIP(x_class), defaultKernel))) \ + inline void runKernelCallInterface(GPUReconstructionProcessing::krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \ + { \ + mRec->runKernelInterface(std::forward(setup) GPUCA_M_STRIP(x_forward)); \ + } +#include "GPUReconstructionKernelList.h" +#undef GPUCA_KRNL }; template @@ -243,7 +259,7 @@ inline void GPUChain::timeCpy(RecoStep step, int32_t toGPU, S T::*func, Args... 
} HighResTimer* timer = nullptr; size_t* bytes = nullptr; - if (mRec->mProcessingSettings.debugLevel >= 1 && toGPU >= 0) { // Todo: time special cases toGPU < 0 + if (mRec->GetProcessingSettings().debugLevel >= 1 && toGPU >= 0) { // Todo: time special cases toGPU < 0 int32_t id = mRec->getRecoStepNum(step, false); if (id != -1) { auto& tmp = mRec->mTimersRecoSteps[id]; diff --git a/GPU/GPUTracking/Global/GPUChainITS.cxx b/GPU/GPUTracking/Global/GPUChainITS.cxx index 640b92a0eb0f4..eeead79b1840b 100644 --- a/GPU/GPUTracking/Global/GPUChainITS.cxx +++ b/GPU/GPUTracking/Global/GPUChainITS.cxx @@ -13,6 +13,7 @@ /// \author David Rohr #include "GPUChainITS.h" +#include "GPUConstantMem.h" #include "DataFormatsITS/TrackITS.h" #include "ITStracking/ExternalAllocator.h" #include "GPUReconstructionIncludesITS.h" diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 43fa49ff74817..a3f9b996e070d 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -18,6 +18,8 @@ #include #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" +#include "GPUReconstructionIO.h" #include "GPUChainTrackingDefs.h" #include "GPUTPCClusterData.h" #include "GPUTPCSectorOutCluster.h" @@ -755,7 +757,7 @@ int32_t GPUChainTracking::RunChain() } } - if (GetProcessingSettings().trdTrackModelO2 ? 
runRecoStep(RecoStep::TRDTracking, &GPUChainTracking::RunTRDTracking) : runRecoStep(RecoStep::TRDTracking, &GPUChainTracking::RunTRDTracking)) { + if (runRecoStep(RecoStep::TRDTracking, &GPUChainTracking::RunTRDTracking)) { return 1; } diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h index 5779cec31130c..8664652b549e3 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.h +++ b/GPU/GPUTracking/Global/GPUChainTracking.h @@ -24,6 +24,12 @@ #include #include +namespace o2::dataformats +{ +template +class ConstMCTruthContainer; +} // namespace o2::dataformats + namespace o2::trd { class GeometryFlat; @@ -39,6 +45,9 @@ class CalibdEdxContainer; namespace o2::base { class MatLayerCylSet; +template +class PropagatorImpl; +using Propagator = PropagatorImpl; } // namespace o2::base namespace o2::gpu @@ -55,6 +64,8 @@ struct GPUChainTrackingFinalContext; struct GPUTPCCFChainContext; struct GPUNewCalibValues; struct GPUTriggerOutputs; +struct CfFragment; +class GPUTPCClusterFinder; class GPUChainTracking : public GPUChain { @@ -137,11 +148,6 @@ class GPUChainTracking : public GPUChain void ConvertZSFilter(bool zs12bit); // Getters for external usage of tracker classes - GPUTRDTrackerGPU* GetTRDTrackerGPU() { return &processors()->trdTrackerGPU; } - GPUTPCTracker* GetTPCSectorTrackers() { return processors()->tpcTrackers; } - const GPUTPCTracker* GetTPCSectorTrackers() const { return processors()->tpcTrackers; } - const GPUTPCGMMerger& GetTPCMerger() const { return processors()->tpcMerger; } - GPUTPCGMMerger& GetTPCMerger() { return processors()->tpcMerger; } GPUDisplayInterface* GetEventDisplay() { return mEventDisplay.get(); } const GPUQA* GetQA() const { return mQAFromForeignChain ? mQAFromForeignChain->mQA.get() : mQA.get(); } GPUQA* GetQA() { return mQAFromForeignChain ? 
mQAFromForeignChain->mQA.get() : mQA.get(); } @@ -155,7 +161,6 @@ class GPUChainTracking : public GPUChain int32_t ForwardTPCDigits(); int32_t RunTPCTrackingSectors(); int32_t RunTPCTrackingMerger(bool synchronizeOutput = true); - template int32_t RunTRDTracking(); template int32_t DoTRDGPUTracking(T* externalInstance = nullptr); @@ -164,22 +169,22 @@ class GPUChainTracking : public GPUChain int32_t RunRefit(); // Getters / setters for parameters - const CorrectionMapsHelper* GetTPCTransformHelper() const { return processors()->calibObjects.fastTransformHelper; } - const TPCPadGainCalib* GetTPCPadGainCalib() const { return processors()->calibObjects.tpcPadGain; } - const TPCZSLinkMapping* GetTPCZSLinkMapping() const { return processors()->calibObjects.tpcZSLinkMapping; } - const o2::tpc::CalibdEdxContainer* GetdEdxCalibContainer() const { return processors()->calibObjects.dEdxCalibContainer; } - const o2::base::MatLayerCylSet* GetMatLUT() const { return processors()->calibObjects.matLUT; } - const GPUTRDGeometry* GetTRDGeometry() const { return (GPUTRDGeometry*)processors()->calibObjects.trdGeometry; } - const o2::base::Propagator* GetO2Propagator() const { return processors()->calibObjects.o2Propagator; } + const CorrectionMapsHelper* GetTPCTransformHelper() const; + const TPCPadGainCalib* GetTPCPadGainCalib() const; + const TPCZSLinkMapping* GetTPCZSLinkMapping() const; + const o2::tpc::CalibdEdxContainer* GetdEdxCalibContainer() const; + const o2::base::MatLayerCylSet* GetMatLUT() const; + const GPUTRDGeometry* GetTRDGeometry() const; + const o2::base::Propagator* GetO2Propagator() const; const o2::base::Propagator* GetDeviceO2Propagator(); void SetTPCFastTransform(std::unique_ptr&& tpcFastTransform, std::unique_ptr&& tpcTransformHelper); void SetMatLUT(std::unique_ptr&& lut); void SetTRDGeometry(std::unique_ptr&& geo); - void SetMatLUT(const o2::base::MatLayerCylSet* lut) { processors()->calibObjects.matLUT = lut; } - void SetTRDGeometry(const 
o2::trd::GeometryFlat* geo) { processors()->calibObjects.trdGeometry = geo; } + void SetMatLUT(const o2::base::MatLayerCylSet* lut); + void SetTRDGeometry(const o2::trd::GeometryFlat* geo); void SetO2Propagator(const o2::base::Propagator* prop); - void SetCalibObjects(const GPUCalibObjectsConst& obj) { processors()->calibObjects = obj; } - void SetCalibObjects(const GPUCalibObjects& obj) { memcpy((void*)&processors()->calibObjects, (const void*)&obj, sizeof(obj)); } + void SetCalibObjects(const GPUCalibObjectsConst& obj); + void SetCalibObjects(const GPUCalibObjects& obj); void SetUpdateCalibObjects(const GPUCalibObjectsConst& obj, const GPUNewCalibValues& vals); void SetSubOutputControl(int32_t i, GPUOutputControl* v) { mSubOutputControls[i] = v; } void SetFinalInputCallback(std::function v) { mWaitForFinalInputs = v; } @@ -298,6 +303,8 @@ class GPUChainTracking : public GPUChain void RunTPCTrackingMerger_Resolve(int8_t useOrigTrackParam, int8_t mergeAll, GPUReconstruction::krnlDeviceType deviceType); void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function allocator, bool applyClusterCuts); bool NeedTPCClustersOnGPU(); + template + int32_t RunTRDTrackingInternal(); uint32_t StreamForSector(uint32_t sector) const; std::mutex mMutexUpdateCalib; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 7db0ba66305e9..981d565852d28 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -19,11 +19,20 @@ #include "GPUMemorySizeScalers.h" #include "GPUTrackingInputProvider.h" #include "GPUNewCalibValues.h" -#include - -#ifdef GPUCA_O2_LIB -#include "CommonDataFormat/InteractionRecord.h" -#endif +#include "GPUConstantMem.h" +#include "CfChargePos.h" +#include "CfArray2D.h" +#include "GPUGeneralKernels.h" +#include "GPUTPCCFStreamCompaction.h" +#include "GPUTPCCFChargeMapFiller.h" +#include 
"GPUTPCCFDecodeZS.h" +#include "GPUTPCCFCheckPadBaseline.h" +#include "GPUTPCCFPeakFinder.h" +#include "GPUTPCCFNoiseSuppression.h" +#include "GPUTPCCFDeconvolution.h" +#include "GPUTPCCFClusterizer.h" +#include "GPUTPCCFGather.h" +#include "GPUTPCCFMCLabelFlattener.h" #include "GPUTriggerOutputs.h" #include "GPUHostDataTypes.h" #include "GPUTPCCFChainContext.h" @@ -32,18 +41,24 @@ #include "DataFormatsTPC/Digit.h" #include "DataFormatsTPC/Constants.h" #include "TPCBase/RDHUtils.h" +#include "GPULogging.h" + +#ifdef GPUCA_HAS_ONNX +#include "GPUTPCNNClusterizerKernels.h" +#include "GPUTPCNNClusterizerHost.h" +#endif + +#ifdef GPUCA_O2_LIB +#include "CommonDataFormat/InteractionRecord.h" +#endif #include "utils/strtag.h" +#include #ifndef GPUCA_NO_VC #include #endif -#ifdef GPUCA_HAS_ONNX -#include "GPUTPCNNClusterizerKernels.h" -#include "GPUTPCNNClusterizerHost.h" -#endif - using namespace o2::gpu; using namespace o2::tpc; using namespace o2::tpc::constants; @@ -791,7 +806,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) using ChargeMapType = decltype(*clustererShadow.mPchargeMap); using PeakMapType = decltype(*clustererShadow.mPpeakMap); - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!! 
+ runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType)); if (fragment.index == 0) { runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy)); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 8fb6fc4771658..fc07a91004c5f 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -19,6 +19,9 @@ #include "GPUTPCCFChainContext.h" #include "TPCClusterDecompressor.h" #include "GPUDefParametersRuntime.h" +#include "GPUConstantMem.h" // TODO: Try to get rid of as many GPUConstantMem includes as possible! 
+#include "GPUTPCCompressionKernels.h" +#include "GPUTPCDecompressionKernels.h" #include "utils/strtag.h" #include diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx index c42d9622f5332..5d05cd6a97776 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx @@ -15,6 +15,8 @@ #include "GPUChainTracking.h" #include "GPUTrackingInputProvider.h" #include "GPUMemorySizeScalers.h" +#include "GPUConstantMem.h" +#include "GPUTPCClusterFilter.h" #include #include #include @@ -23,8 +25,6 @@ #include "bitmapfile.h" #endif -#include "GPUTPCClusterFilter.h" - #define PROFILE_MAX_SIZE (100 * 1024 * 1024) using namespace o2::gpu; @@ -209,7 +209,7 @@ void GPUChainTracking::PrintDebugOutput() void GPUChainTracking::PrintOutputStat() { int32_t nTracks = 0, nAttachedClusters = 0, nAttachedClustersFitted = 0, nAdjacentClusters = 0; - uint32_t nCls = GetProcessingSettings().doublePipeline ? mIOPtrs.clustersNative->nClustersTotal : GetTPCMerger().NMaxClusters(); + uint32_t nCls = GetProcessingSettings().doublePipeline ? mIOPtrs.clustersNative->nClustersTotal : processors()->tpcMerger.NMaxClusters(); if (GetProcessingSettings().createO2Output > 1) { nTracks = mIOPtrs.nOutputTracksTPCO2; nAttachedClusters = mIOPtrs.nMergedTrackHits; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDefs.h b/GPU/GPUTracking/Global/GPUChainTrackingDefs.h index dc1a665e6052c..e02419955001a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingDefs.h +++ b/GPU/GPUTracking/Global/GPUChainTrackingDefs.h @@ -9,7 +9,7 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. 
-/// \file GPUChainTracking.h +/// \file GPUChainTrackingDefs.h /// \author David Rohr #ifndef GPUCHAINTRACKINGDEFS_H diff --git a/GPU/GPUTracking/Global/GPUChainTrackingGetters.inc b/GPU/GPUTracking/Global/GPUChainTrackingGetters.inc new file mode 100644 index 0000000000000..5b72a8f23c242 --- /dev/null +++ b/GPU/GPUTracking/Global/GPUChainTrackingGetters.inc @@ -0,0 +1,36 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUChainTrackingGetters.inc +/// \author David Rohr + +#ifndef GPUCHAINTRACKINGGETTERS_INC_H +#define GPUCHAINTRACKINGGETTERS_INC_H + +#include "GPUChainTracking.h" +#include "GPUConstantMem.h" + +namespace o2::gpu +{ +inline const CorrectionMapsHelper* GPUChainTracking::GetTPCTransformHelper() const { return processors()->calibObjects.fastTransformHelper; } +inline const TPCPadGainCalib* GPUChainTracking::GetTPCPadGainCalib() const { return processors()->calibObjects.tpcPadGain; } +inline const TPCZSLinkMapping* GPUChainTracking::GetTPCZSLinkMapping() const { return processors()->calibObjects.tpcZSLinkMapping; } +inline const o2::tpc::CalibdEdxContainer* GPUChainTracking::GetdEdxCalibContainer() const { return processors()->calibObjects.dEdxCalibContainer; } +inline const o2::base::MatLayerCylSet* GPUChainTracking::GetMatLUT() const { return processors()->calibObjects.matLUT; } +inline const GPUTRDGeometry* GPUChainTracking::GetTRDGeometry() const { return (GPUTRDGeometry*)processors()->calibObjects.trdGeometry; } +inline const 
o2::base::Propagator* GPUChainTracking::GetO2Propagator() const { return processors()->calibObjects.o2Propagator; } +inline void GPUChainTracking::SetMatLUT(const o2::base::MatLayerCylSet* lut) { processors()->calibObjects.matLUT = lut; } +inline void GPUChainTracking::SetTRDGeometry(const o2::trd::GeometryFlat* geo) { processors()->calibObjects.trdGeometry = geo; } +inline void GPUChainTracking::SetCalibObjects(const GPUCalibObjectsConst& obj) { processors()->calibObjects = obj; } +inline void GPUChainTracking::SetCalibObjects(const GPUCalibObjects& obj) { memcpy((void*)&processors()->calibObjects, (const void*)&obj, sizeof(obj)); } +} // namespace o2::gpu + +#endif diff --git a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx index 4f7846b852b98..5e7672022b3ff 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx @@ -34,6 +34,7 @@ #include "GPUTrackingInputProvider.h" #include "TPCZSLinkMapping.h" #include "GPUTriggerOutputs.h" +#include "GPUConstantMem.h" #include "SimulationDataFormat/MCCompLabel.h" #include "SimulationDataFormat/MCTruthContainer.h" diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx index a647c213660c9..163f08634ef86 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx @@ -17,6 +17,11 @@ #include "GPUDefParametersRuntime.h" #include "GPUO2DataTypes.h" #include "GPUQA.h" +#include "GPUTPCGMMerger.h" +#include "GPUConstantMem.h" +#include "GPUTPCGMMergerGPU.h" +#include "GPUTPCGMO2Output.h" +#include "GPUTPCGlobalDebugSortKernels.h" #include "utils/strtag.h" #include diff --git a/GPU/GPUTracking/Global/GPUChainTrackingRefit.cxx b/GPU/GPUTracking/Global/GPUChainTrackingRefit.cxx index 8d1efd7011227..4662b5464f710 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingRefit.cxx +++ 
b/GPU/GPUTracking/Global/GPUChainTrackingRefit.cxx @@ -13,8 +13,12 @@ /// \author David Rohr #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" #include "GPULogging.h" #include "GPUO2DataTypes.h" +#include "GPUTrackingRefit.h" +#include "GPUConstantMem.h" +#include "GPUTrackingRefitKernel.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx index 962b0922eeecc..635641c00ae14 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx @@ -20,6 +20,16 @@ #include "GPUTrackingInputProvider.h" #include "GPUTPCClusterOccupancyMap.h" #include "GPUDefParametersRuntime.h" +#include "GPUTPCExtrapolationTracking.h" +#include "GPUTPCCreateOccupancyMap.h" +#include "GPUTPCCreateTrackingData.h" +#include "GPUTPCNeighboursFinder.h" +#include "GPUTPCNeighboursCleaner.h" +#include "GPUTPCStartHitsFinder.h" +#include "GPUTPCStartHitsSorter.h" +#include "GPUTPCTrackletConstructor.h" +#include "GPUTPCTrackletSelector.h" +#include "GPUTPCSectorDebugSortKernels.h" #include "utils/strtag.h" #include diff --git a/GPU/GPUTracking/Global/GPUChainTrackingTRD.cxx b/GPU/GPUTracking/Global/GPUChainTrackingTRD.cxx index 0f17bbcc26842..f9011131803e3 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingTRD.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingTRD.cxx @@ -21,13 +21,19 @@ #include "GPUTRDTracker.h" #include "GPUTrackingInputProvider.h" #include "GPUTRDTrackerKernels.h" +#include "GPUConstantMem.h" #include "utils/strtag.h" using namespace o2::gpu; using namespace o2::trd; -template int32_t GPUChainTracking::RunTRDTracking() +{ + return GetProcessingSettings().trdTrackModelO2 ? 
RunTRDTrackingInternal() : RunTRDTrackingInternal(); +} + +template +int32_t GPUChainTracking::RunTRDTrackingInternal() { auto& Tracker = processors()->getTRDTracker(); if (!Tracker.IsInitialized()) { @@ -189,9 +195,7 @@ int32_t GPUChainTracking::DoTRDGPUTracking(T* externalInstance) return (0); } -template int32_t GPUChainTracking::RunTRDTracking(); template int32_t GPUChainTracking::DoTRDGPUTracking(GPUTRDTrackerGPU*); template int32_t GPUChainTracking::DoTRDGPUTracking(GPUTRDTracker*); -template int32_t GPUChainTracking::RunTRDTracking(); template int32_t GPUChainTracking::DoTRDGPUTracking(GPUTRDTracker*); template int32_t GPUChainTracking::DoTRDGPUTracking(GPUTRDTrackerGPU*); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx b/GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx index db5e5ae3aeb75..c9d4d269f070c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx @@ -19,9 +19,13 @@ #include "GPUTPCClusterData.h" #include "GPUReconstructionConvert.h" #include "GPUMemorySizeScalers.h" +#include "GPUTPCConvert.h" #include "AliHLTTPCRawCluster.h" +#include "GPUConstantMem.h" +#include "GPUTPCConvertKernel.h" #include "DataFormatsTPC/ClusterNative.h" +#include "DataFormatsTPC/ZeroSuppression.h" #include "CommonDataFormat/InteractionRecord.h" #include "utils/strtag.h" diff --git a/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx b/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx index a5457bf3f2f23..7ef9baa903fbe 100644 --- a/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx +++ b/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx @@ -18,6 +18,9 @@ #include "GPUReconstruction.h" #include "GPUTPCClusterOccupancyMap.h" #include "GPUErrors.h" +#include "GPUParam.h" +#include "DataFormatsTPC/ClusterNative.h" +#include "GPUTRDSpacePoint.h" using namespace o2::gpu; using namespace o2::tpc; diff --git a/GPU/GPUTracking/Interface/GPUO2Interface.cxx 
b/GPU/GPUTracking/Interface/GPUO2Interface.cxx index 4dac56afed671..81eb2c285192b 100644 --- a/GPU/GPUTracking/Interface/GPUO2Interface.cxx +++ b/GPU/GPUTracking/Interface/GPUO2Interface.cxx @@ -15,6 +15,7 @@ #include "GPUO2Interface.h" #include "GPUReconstruction.h" #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" #include "GPUChainITS.h" #include "GPUMemorySizeScalers.h" #include "GPUOutputControl.h" @@ -23,6 +24,7 @@ #include "GPUParam.inc" #include "GPUQA.h" #include "GPUOutputControl.h" +#include "DetectorsBase/Propagator.h" #include #include #include diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx index 82b21e2045b8e..b6241ad36b5de 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx @@ -736,46 +736,15 @@ GPUd() void GPUTPCGMMerger::MergeBorderTracks<1>(int32_t nBlocks, int32_t nThrea #endif } -#if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) // Specialize MergeBorderTracks<3> -namespace o2::gpu::internal -{ -namespace // anonymous -{ -struct MergeBorderTracks_compMax { - GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) - { - return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax); - } -}; -struct MergeBorderTracks_compMin { - GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) - { - return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? 
(a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin); - } -}; -} // anonymous namespace -} // namespace o2::gpu::internal - -template <> -inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax) -{ - if (cmpMax) { - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMax()); - } else { - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMin()); - } -} -#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize MergeBorderTracks<3> - template <> GPUd() void GPUTPCGMMerger::MergeBorderTracks<3>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUTPCGMBorderRange* range, int32_t N, int32_t cmpMax) { #ifndef GPUCA_SPECIALIZE_THRUST_SORTS if (iThread == 0) { if (cmpMax) { - GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return a.fMax < b.fMax; }); + GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax); }); } else { - GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return a.fMin < b.fMin; }); + GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? 
(a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin); }); } } #endif @@ -1783,74 +1752,6 @@ GPUd() void GPUTPCGMMerger::PrepareClustersForFit0(int32_t nBlocks, int32_t nThr } } -#if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) // Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt -namespace o2::gpu::internal -{ -namespace // anonymous -{ -struct GPUTPCGMMergerSortTracks_comp { - const GPUTPCGMMergedTrack* const mCmp; - GPUhd() GPUTPCGMMergerSortTracks_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {} - GPUd() bool operator()(const int32_t aa, const int32_t bb) - { - const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; - const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; - if (a.CCE() != b.CCE()) { - return a.CCE() > b.CCE(); - } - if (a.Legs() != b.Legs()) { - return a.Legs() > b.Legs(); - } - GPUCA_DETERMINISTIC_CODE( // clang-format off - if (a.NClusters() != b.NClusters()) { - return a.NClusters() > b.NClusters(); - } if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { - return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); - } if (a.GetParam().GetY() != b.GetParam().GetY()) { - return a.GetParam().GetY() > b.GetParam().GetY(); - } - return aa > bb; - , // !GPUCA_DETERMINISTIC_CODE - return a.NClusters() > b.NClusters(); - ) // clang-format on - } -}; - -struct GPUTPCGMMergerSortTracksQPt_comp { - const GPUTPCGMMergedTrack* const mCmp; - GPUhd() GPUTPCGMMergerSortTracksQPt_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {} - GPUd() bool operator()(const int32_t aa, const int32_t bb) - { - const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; - const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; - GPUCA_DETERMINISTIC_CODE( // clang-format off - if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { - return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); - } if (a.GetParam().GetY() != b.GetParam().GetY()) { 
- return a.GetParam().GetY() > b.GetParam().GetY(); - } - return a.GetParam().GetZ() > b.GetParam().GetZ(); - , // !GPUCA_DETERMINISTIC_CODE - return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); - ) // clang-format on - } -}; -} // anonymous namespace -} // namespace o2::gpu::internal - -template <> -inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) -{ - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); -} - -template <> -inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) -{ - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); -} -#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt - GPUd() void GPUTPCGMMerger::SortTracks(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { #ifndef GPUCA_SPECIALIZE_THRUST_SORTS @@ -2050,27 +1951,6 @@ GPUd() void GPUTPCGMMerger::MergeLoopersSort(int32_t nBlocks, int32_t nThreads, #endif } -#if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) // Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt -namespace o2::gpu::internal -{ -namespace // anonymous -{ -struct GPUTPCGMMergerMergeLoopers_comp { - GPUd() bool operator()(const MergeLooperParam& a, const MergeLooperParam& b) - { - return CAMath::Abs(a.refz) < CAMath::Abs(b.refz); - } -}; -} // anonymous namespace -} // namespace o2::gpu::internal - -template <> -inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const 
krnlSetupTime& _xyz) -{ - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.LooperCandidates(), processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp()); -} -#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt - GPUd() void GPUTPCGMMerger::MergeLoopersMain(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { const MergeLooperParam* params = mLooperCandidates; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx index 1e4cc633eb4ca..2f8fbecadce5f 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx @@ -36,6 +36,7 @@ #include "GPUTPCClusterOccupancyMap.h" #include "GPUTrackingRefit.h" #include "CorrectionMapsHelper.h" +#include "GPUConstantMem.h" using namespace o2::gpu; using namespace gputpcgmmergertypes; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx index 13b34a0a64a84..ea219a02a1887 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx @@ -93,21 +93,6 @@ GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks, #endif } -#if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) // Specialize GPUTPCGMO2Output::Thread -struct GPUTPCGMO2OutputSort_comp { - GPUd() bool operator()(const GPUTPCGMMerger::tmpSort& a, const GPUTPCGMMerger::tmpSort& b) - { - return (a.y > b.y); - } -}; - -template <> -inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal(const krnlSetupTime& _xyz) -{ - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSortO2(), processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp()); -} -#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMO2Output::Thread - template <> 
GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger) { diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTrackerDump.cxx b/GPU/GPUTracking/SectorTracker/GPUTPCTrackerDump.cxx index 7d83ff9abd91c..e66ad71783dbd 100644 --- a/GPU/GPUTracking/SectorTracker/GPUTPCTrackerDump.cxx +++ b/GPU/GPUTracking/SectorTracker/GPUTPCTrackerDump.cxx @@ -16,6 +16,7 @@ #include "GPUReconstruction.h" #include "GPUTPCHitId.h" #include "GPUTPCTrack.h" +#include "GPULogging.h" #include #include diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index b32db2bfebf11..d4c83f92a2157 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -17,11 +17,24 @@ #include "GPUReconstructionTimeframe.h" #include "GPUReconstructionConvert.h" #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" #include "GPUTPCDef.h" #include "GPUQA.h" +#include "GPUParam.h" #include "display/GPUDisplayInterface.h" #include "genEvents.h" +#include "TPCFastTransform.h" +#include "CorrectionMapsHelper.h" +#include "GPUTPCGMMergedTrack.h" +#include "GPUSettings.h" +#include "GPUConstantMem.h" + +#include "GPUO2DataTypes.h" +#include "GPUChainITS.h" + +#include "DataFormatsTPC/CompressedClusters.h" + #include #include #include @@ -32,6 +45,7 @@ #include #include #include +#include #ifndef _WIN32 #include @@ -48,15 +62,6 @@ #include "utils/qmaths_helpers.h" #include "utils/vecpod.h" -#include "TPCFastTransform.h" -#include "CorrectionMapsHelper.h" -#include "GPUTPCGMMergedTrack.h" -#include "GPUSettings.h" -#include - -#include "GPUO2DataTypes.h" -#include "GPUChainITS.h" - using namespace o2::gpu; // #define BROKEN_EVENTS @@ -915,7 +920,7 @@ int32_t main(int argc, char** argv) nEventsProcessed++; if (configStandalone.timeFrameTime) { - double 
nClusters = chainTracking->GetTPCMerger().NMaxClusters(); + double nClusters = chainTracking->GetProcessors()->tpcMerger.NMaxClusters(); if (nClusters > 0) { const int32_t nOrbits = 32; const double colRate = 50000; diff --git a/GPU/GPUTracking/Standalone/tools/createGeo.C b/GPU/GPUTracking/Standalone/tools/createGeo.C index 307d687f716d1..c454978177ad6 100644 --- a/GPU/GPUTracking/Standalone/tools/createGeo.C +++ b/GPU/GPUTracking/Standalone/tools/createGeo.C @@ -20,6 +20,7 @@ #include "GPUO2Interface.h" #include "GPUReconstruction.h" #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" using namespace o2::gpu; diff --git a/GPU/GPUTracking/Standalone/tools/createLUT.C b/GPU/GPUTracking/Standalone/tools/createLUT.C index 7bb4edbf89f18..b4a053aa46c66 100644 --- a/GPU/GPUTracking/Standalone/tools/createLUT.C +++ b/GPU/GPUTracking/Standalone/tools/createLUT.C @@ -18,6 +18,7 @@ #include "GPUO2Interface.h" #include "GPUReconstruction.h" #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" using namespace o2::gpu; diff --git a/GPU/GPUTracking/TPCClusterFinder/Array2D.h b/GPU/GPUTracking/TPCClusterFinder/CfArray2D.h similarity index 81% rename from GPU/GPUTracking/TPCClusterFinder/Array2D.h rename to GPU/GPUTracking/TPCClusterFinder/CfArray2D.h index b62176fdc4365..3c8bcf94da4b3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/Array2D.h +++ b/GPU/GPUTracking/TPCClusterFinder/CfArray2D.h @@ -16,22 +16,22 @@ #define O2_GPU_ARRAY2D_H #include "clusterFinderDefs.h" -#include "ChargePos.h" +#include "CfChargePos.h" namespace o2::gpu { template -class AbstractArray2D +class AbstractCfArray2D { public: - GPUdi() explicit AbstractArray2D(T* d) : data(d) {} + GPUdi() explicit AbstractCfArray2D(T* d) : data(d) {} - GPUdi() T& operator[](const ChargePos& p) { return data[Layout::idx(p)]; } - GPUdi() const T& operator[](const ChargePos& p) const { return data[Layout::idx(p)]; } + GPUdi() T& operator[](const CfChargePos& p) { return data[Layout::idx(p)]; 
} + GPUdi() const T& operator[](const CfChargePos& p) const { return data[Layout::idx(p)]; } - GPUdi() void safeWrite(const ChargePos& p, const T& v) + GPUdi() void safeWrite(const CfChargePos& p, const T& v) { if (data != nullptr) { (*this)[p] = v; @@ -52,7 +52,7 @@ class TilingLayout WidthInTiles = (TPC_NUM_OF_PADS + Width - 1) / Width, }; - GPUdi() static tpccf::SizeT idx(const ChargePos& p) + GPUdi() static tpccf::SizeT idx(const CfChargePos& p) { const tpccf::SizeT tilePad = p.gpad / Width; const tpccf::SizeT tileTime = p.timePadded / Height; @@ -72,7 +72,7 @@ class TilingLayout class LinearLayout { public: - GPUdi() static tpccf::SizeT idx(const ChargePos& p) + GPUdi() static tpccf::SizeT idx(const CfChargePos& p) { return TPC_NUM_OF_PADS * p.timePadded + p.gpad; } @@ -119,7 +119,7 @@ using TPCMapMemoryLayout = LinearLayout; #endif template -using Array2D = AbstractArray2D>; +using CfArray2D = AbstractCfArray2D>; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h b/GPU/GPUTracking/TPCClusterFinder/CfChargePos.h similarity index 80% rename from GPU/GPUTracking/TPCClusterFinder/ChargePos.h rename to GPU/GPUTracking/TPCClusterFinder/CfChargePos.h index cdd489e0ef938..bf6ce2fc804ba 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h +++ b/GPU/GPUTracking/TPCClusterFinder/CfChargePos.h @@ -9,7 +9,7 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. 
-/// \file ChargePos.h +/// \file CfChargePos.h /// \author Felix Weiglhofer #ifndef O2_GPU_CHARGE_POS_H @@ -22,20 +22,20 @@ namespace o2::gpu #define INVALID_TIME_BIN (-GPUCF_PADDING_TIME - 1) -struct ChargePos { +struct CfChargePos { tpccf::GlobalPad gpad; tpccf::TPCFragmentTime timePadded; - GPUdDefault() ChargePos() = default; + GPUdDefault() CfChargePos() = default; - constexpr GPUhdi() ChargePos(tpccf::Row row, tpccf::Pad pad, tpccf::TPCFragmentTime t) + constexpr GPUhdi() CfChargePos(tpccf::Row row, tpccf::Pad pad, tpccf::TPCFragmentTime t) : gpad(tpcGlobalPadIdx(row, pad)), timePadded(t + GPUCF_PADDING_TIME) { } - GPUdi() ChargePos(const tpccf::GlobalPad& p, const tpccf::TPCFragmentTime& t) : gpad(p), timePadded(t) {} + GPUdi() CfChargePos(const tpccf::GlobalPad& p, const tpccf::TPCFragmentTime& t) : gpad(p), timePadded(t) {} - GPUdi() ChargePos delta(const tpccf::Delta2& d) const + GPUdi() CfChargePos delta(const tpccf::Delta2& d) const { return {tpccf::GlobalPad(gpad + d.x), tpccf::TPCFragmentTime(timePadded + d.y)}; } @@ -56,7 +56,7 @@ struct ChargePos { } }; -inline constexpr ChargePos INVALID_CHARGE_POS{255, 255, INVALID_TIME_BIN}; +inline constexpr CfChargePos INVALID_CHARGE_POS{255, 255, INVALID_TIME_BIN}; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/CfUtils.h b/GPU/GPUTracking/TPCClusterFinder/CfUtils.h index 75dcc166abd9b..96f4893c74af3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/CfUtils.h +++ b/GPU/GPUTracking/TPCClusterFinder/CfUtils.h @@ -17,7 +17,7 @@ #include "clusterFinderDefs.h" #include "GPUCommonAlgorithm.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "CfConsts.h" namespace o2::gpu @@ -169,14 +169,14 @@ class CfUtils template static GPUdi() void blockLoad( - const Array2D& map, + const CfArray2D& map, uint32_t wgSize, uint32_t elems, uint16_t ll, uint32_t offset, uint32_t N, GPUconstexprref() const tpccf::Delta2* neighbors, - const ChargePos* posBcast, + const CfChargePos* posBcast, GPUgeneric() T* buf) 
{ #if defined(GPUCA_GPUCODE) @@ -186,7 +186,7 @@ class CfUtils tpccf::Delta2 d = neighbors[x + offset]; for (uint32_t i = y; i < wgSize; i += (elems / N)) { - ChargePos readFrom = posBcast[i]; + CfChargePos readFrom = posBcast[i]; uint32_t writeTo = N * i + x; buf[writeTo] = map[readFrom.delta(d)]; } @@ -196,7 +196,7 @@ class CfUtils return; } - ChargePos readFrom = posBcast[ll]; + CfChargePos readFrom = posBcast[ll]; GPUbarrier(); @@ -213,14 +213,14 @@ class CfUtils template static GPUdi() void condBlockLoad( - const Array2D& map, + const CfArray2D& map, uint16_t wgSize, uint16_t elems, uint16_t ll, uint16_t offset, uint16_t N, GPUconstexprref() const tpccf::Delta2* neighbors, - const ChargePos* posBcast, + const CfChargePos* posBcast, const uint8_t* aboveThreshold, GPUgeneric() T* buf) { @@ -230,7 +230,7 @@ class CfUtils uint16_t x = ll % N; tpccf::Delta2 d = neighbors[x + offset]; for (uint32_t i = y; i < wgSize; i += (elems / N)) { - ChargePos readFrom = posBcast[i]; + CfChargePos readFrom = posBcast[i]; uint8_t above = aboveThreshold[i]; uint32_t writeTo = N * i + x; T v(0); @@ -247,7 +247,7 @@ class CfUtils return; } - ChargePos readFrom = posBcast[ll]; + CfChargePos readFrom = posBcast[ll]; uint8_t above = aboveThreshold[ll]; GPUbarrier(); diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx index 622da856af805..a80283b91c940 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx @@ -58,7 +58,7 @@ GPUd() Charge ClusterAccumulator::updateOuter(PackedCharge charge, Delta2 d) return q; } -GPUd() void ClusterAccumulator::finalize(const ChargePos& pos, const Charge q, TPCTime timeOffset) +GPUd() void ClusterAccumulator::finalize(const CfChargePos& pos, const Charge q, TPCTime timeOffset) { mQtot += q; @@ -75,7 +75,7 @@ GPUd() void ClusterAccumulator::finalize(const ChargePos& pos, const Charge q, T mTimeMean += 
timeOffset + pos.time(); } -GPUd() bool ClusterAccumulator::toNative(const ChargePos& pos, const Charge q, tpc::ClusterNative& cn, const GPUParam& param, const Array2D& chargeMap) +GPUd() bool ClusterAccumulator::toNative(const CfChargePos& pos, const Charge q, tpc::ClusterNative& cn, const GPUParam& param, const CfArray2D& chargeMap) { Pad pad = pos.pad(); diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index 90d977372b201..fb208ca0150d4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -17,7 +17,7 @@ #include "clusterFinderDefs.h" #include "PackedCharge.h" -#include "Array2D.h" +#include "CfArray2D.h" namespace o2 { @@ -30,7 +30,7 @@ struct ClusterNative; namespace gpu { -struct ChargePos; +struct CfChargePos; struct GPUParam; class GPUTPCGeometry; @@ -52,8 +52,8 @@ class ClusterAccumulator mSplitInTime = splitInTime; } - GPUd() void finalize(const ChargePos&, const tpccf::Charge, tpccf::TPCTime); - GPUd() bool toNative(const ChargePos&, const tpccf::Charge, tpc::ClusterNative&, const GPUParam&, const Array2D&); + GPUd() void finalize(const CfChargePos&, const tpccf::Charge, tpccf::TPCTime); + GPUd() bool toNative(const CfChargePos&, const tpccf::Charge, tpc::ClusterNative&, const GPUParam&, const CfArray2D&); private: float mQtot = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx index 8dbc5804f8fb8..d2ca3d419c138 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx @@ -13,7 +13,7 @@ /// \author Felix Weiglhofer #include "GPUTPCCFChargeMapFiller.h" -#include "ChargePos.h" +#include "CfChargePos.h" #include "DataFormatsTPC/Digit.h" #include "TPCPadGainCalib.h" @@ -23,14 +23,14 @@ using namespace o2::gpu::tpccf; template <> GPUdii() 
void GPUTPCCFChargeMapFiller::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { - Array2D indexMap(clusterer.mPindexMap); + CfArray2D indexMap(clusterer.mPindexMap); fillIndexMapImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer.mPmemory->fragment, clusterer.mPdigits, indexMap, clusterer.mPmemory->counters.nDigitsInFragment); } GPUd() void GPUTPCCFChargeMapFiller::fillIndexMapImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, const CfFragment& fragment, const tpc::Digit* digits, - Array2D& indexMap, + CfArray2D& indexMap, size_t maxDigit) { size_t idx = get_global_id(0); @@ -39,21 +39,21 @@ GPUd() void GPUTPCCFChargeMapFiller::fillIndexMapImpl(int32_t nBlocks, int32_t n } CPU_ONLY(idx += fragment.digitsStart); CPU_ONLY(tpc::Digit digit = digits[idx]); - CPU_ONLY(ChargePos pos(digit.getRow(), digit.getPad(), fragment.toLocal(digit.getTimeStamp()))); + CPU_ONLY(CfChargePos pos(digit.getRow(), digit.getPad(), fragment.toLocal(digit.getTimeStamp()))); CPU_ONLY(indexMap.safeWrite(pos, idx)); } template <> GPUdii() void GPUTPCCFChargeMapFiller::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); fillFromDigitsImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, clusterer.mPmemory->counters.nPositions, clusterer.mPdigits, clusterer.mPpositions, chargeMap); } GPUd() void GPUTPCCFChargeMapFiller::fillFromDigitsImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& clusterer, const CfFragment& fragment, size_t digitNum, const tpc::Digit* digits, - ChargePos* positions, - Array2D& chargeMap) + CfChargePos* positions, + CfArray2D& chargeMap) { size_t 
idx = get_global_id(0); if (idx >= digitNum) { @@ -61,7 +61,7 @@ GPUd() void GPUTPCCFChargeMapFiller::fillFromDigitsImpl(int32_t nBlocks, int32_t } tpc::Digit digit = digits[fragment.digitsStart + idx]; - ChargePos pos(digit.getRow(), digit.getPad(), fragment.toLocal(digit.getTimeStamp())); + CfChargePos pos(digit.getRow(), digit.getPad(), fragment.toLocal(digit.getTimeStamp())); positions[idx] = pos; float q = digit.getChargeFloat(); q *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(clusterer.mISector, digit.getRow(), digit.getPad()); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.h index f7aab78c33bd1..800ba786c2105 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFChargeMapFiller.h @@ -19,7 +19,7 @@ #include "GPUGeneralKernels.h" #include "GPUConstantMem.h" #include "GPUTPCClusterFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" namespace o2::tpc @@ -30,7 +30,7 @@ class Digit; namespace o2::gpu { -struct ChargePos; +struct CfChargePos; class GPUTPCCFChargeMapFiller : public GPUKernelTemplate { @@ -55,9 +55,9 @@ class GPUTPCCFChargeMapFiller : public GPUKernelTemplate template GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... 
args); - static GPUd() void fillIndexMapImpl(int32_t, int32_t, int32_t, int32_t, const CfFragment&, const tpc::Digit*, Array2D&, size_t); + static GPUd() void fillIndexMapImpl(int32_t, int32_t, int32_t, int32_t, const CfFragment&, const tpc::Digit*, CfArray2D&, size_t); - static GPUd() void fillFromDigitsImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, size_t, const tpc::Digit*, ChargePos*, Array2D&); + static GPUd() void fillFromDigitsImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, size_t, const tpc::Digit*, CfChargePos*, CfArray2D&); private: static GPUd() size_t findTransition(int32_t, const tpc::Digit*, size_t, size_t); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx index 1e76860331de6..ec084c308312e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx @@ -13,7 +13,7 @@ /// \author Felix Weiglhofer #include "GPUTPCCFCheckPadBaseline.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" #include "GPUTPCGeometry.h" #include "clusterFinderDefs.h" @@ -33,10 +33,10 @@ template <> GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { const CfFragment& fragment = clusterer.mPmemory->fragment; - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); int32_t basePad = iBlock * PadsPerCacheline; - ChargePos basePos = padToChargePos(basePad, clusterer); + CfChargePos basePos = padToCfChargePos(basePad, clusterer); if (not basePos.valid()) { return; @@ -55,7 +55,7 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread bool handlePad = localTimeBin == 0; for (tpccf::TPCFragmentTime t = 
fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += NumOfCachedTimebins) { - const ChargePos pos = basePos.delta({localPadId, int16_t(t + localTimeBin)}); + const CfChargePos pos = basePos.delta({localPadId, int16_t(t + localTimeBin)}); smem.charges[localPadId][localTimeBin] = (pos.valid()) ? chargeMap[pos].unpack() : 0; GPUbarrier(); if (handlePad) { @@ -150,7 +150,7 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread #endif } -GPUd() ChargePos GPUTPCCFCheckPadBaseline::padToChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer) +GPUd() CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer) { constexpr GPUTPCGeometry geo; @@ -161,12 +161,12 @@ GPUd() ChargePos GPUTPCCFCheckPadBaseline::padToChargePos(int32_t& pad, const GP if (0 <= padInRow && padInRow < CAMath::nextMultipleOf(npads)) { int32_t cachelineOffset = padInRow % PadsPerCacheline; pad -= cachelineOffset; - return ChargePos{r, Pad(padInRow - cachelineOffset), 0}; + return CfChargePos{r, Pad(padInRow - cachelineOffset), 0}; } padOffset += npads; } - return ChargePos{0, 0, INVALID_TIME_BIN}; + return CfChargePos{0, 0, INVALID_TIME_BIN}; } GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int32_t pad, const GPUTPCClusterFinder& clusterer, int32_t totalCharges, int32_t consecCharges, Charge maxCharge) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h index d6daa6803ca39..2403aa6d29ecd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h @@ -52,7 +52,7 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer); private: - GPUd() static ChargePos padToChargePos(int32_t& pad, const 
GPUTPCClusterFinder&); + GPUd() static CfChargePos padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder&); GPUd() static void updatePadBaseline(int32_t pad, const GPUTPCClusterFinder&, int32_t totalCharges, int32_t consecCharges, tpccf::Charge maxCharge); }; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx index 2131347decec6..c9c6b157499f2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx @@ -30,7 +30,7 @@ using namespace o2::gpu::tpccf; template <> GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC) { - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h index 79f3325ed9ad2..466d13d3254de 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h @@ -19,7 +19,7 @@ #include "GPUGeneralKernels.h" #include "GPUConstantMem.h" #include "GPUTPCClusterFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" namespace o2::tpc @@ -38,7 +38,7 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate public: static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFClusterizer); struct GPUSharedMemory { - ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + CfChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; }; @@ -57,16 +57,16 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate template GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t); - static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*); + static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const CfArray2D&, const CfChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*); - static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); + static GPUd() void buildCluster(const GPUSettingsRec&, const CfArray2D&, 
CfChargePos, CfChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*); private: - static GPUd() void updateClusterInner(const GPUSettingsRec&, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); + static GPUd() void updateClusterInner(const GPUSettingsRec&, uint16_t, uint16_t, const PackedCharge*, const CfChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); - static GPUd() void updateClusterOuter(uint16_t, uint16_t, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); + static GPUd() void updateClusterOuter(uint16_t, uint16_t, uint16_t, uint16_t, const PackedCharge*, const CfChargePos&, ClusterAccumulator*, MCLabelAccumulator*); }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc index 8a6b73be8bd8d..e32abbf37584f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc @@ -19,8 +19,8 @@ GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t processorType& clusterer, const CfFragment& fragment, GPUSharedMemory& smem, - const Array2D& chargeMap, - const ChargePos* filteredPeakPositions, + const CfArray2D& chargeMap, + const CfChargePos* filteredPeakPositions, const GPUSettingsRec& calib, MCLabelAccumulator* labelAcc, uint32_t clusternum, @@ -34,7 +34,7 @@ GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t // For certain configurations dummy work items are added, so the total // number of work items is dividable by 64. // These dummy items also compute the last cluster but discard the result. 
- ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; + CfChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; Charge charge = chargeMap[pos].unpack(); ClusterAccumulator pc; @@ -94,7 +94,7 @@ GPUdii() void GPUTPCCFClusterizer::updateClusterInner( uint16_t lid, uint16_t N, const PackedCharge* buf, - const ChargePos& pos, + const CfChargePos& pos, ClusterAccumulator* cluster, MCLabelAccumulator* labelAcc, uint8_t* innerAboveThreshold) @@ -125,7 +125,7 @@ GPUdii() void GPUTPCCFClusterizer::updateClusterOuter( uint16_t M, uint16_t offset, const PackedCharge* buf, - const ChargePos& pos, + const CfChargePos& pos, ClusterAccumulator* cluster, MCLabelAccumulator* labelAcc) { @@ -144,9 +144,9 @@ GPUdii() void GPUTPCCFClusterizer::updateClusterOuter( GPUdii() void GPUTPCCFClusterizer::buildCluster( const GPUSettingsRec& calib, - const Array2D& chargeMap, - ChargePos pos, - ChargePos* posBcast, + const CfArray2D& chargeMap, + CfChargePos pos, + CfChargePos* posBcast, PackedCharge* buf, uint8_t* innerAboveThreshold, ClusterAccumulator* myCluster, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx index 6662b93eccb78..312085d2947ab 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx @@ -15,7 +15,7 @@ #include "GPUTPCCFDecodeZS.h" #include "GPUCommonMath.h" #include "GPUTPCClusterFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" #include "CfUtils.h" #include "CommonConstants/LHCConstants.h" @@ -53,8 +53,8 @@ GPUdii() void GPUTPCCFDecodeZS::decode(GPUTPCClusterFinder& clusterer, GPUShared if (zs.count[endpoint] == 0) { return; } - ChargePos* positions = clusterer.mPpositions; - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfChargePos* positions = clusterer.mPpositions; + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); const size_t 
nDigits = clusterer.mPzsOffsets[iBlock].offset; if (iThread == 0) { const int32_t region = endpoint / 2; @@ -175,7 +175,7 @@ GPUdii() void GPUTPCCFDecodeZS::decode(GPUTPCClusterFinder& clusterer, GPUShared TPCTime globalTime = timeBin + l; bool inFragment = fragment.contains(globalTime); Row row = rowOffset + m; - ChargePos pos(row, Pad(pad), inFragment ? fragment.toLocal(globalTime) : INVALID_TIME_BIN); + CfChargePos pos(row, Pad(pad), inFragment ? fragment.toLocal(globalTime) : INVALID_TIME_BIN); positions[nDigitsTmp++] = pos; if (inFragment) { @@ -552,7 +552,7 @@ GPUd() o2::tpc::PadPos GPUTPCCFDecodeZSLinkBase::GetPadAndRowFromFEC(processorTy GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, float charge, PadPos padAndRow, TPCFragmentTime localTime, size_t positionOffset) { const uint32_t sector = clusterer.mISector; - ChargePos* positions = clusterer.mPpositions; + CfChargePos* positions = clusterer.mPpositions; #ifdef GPUCA_CHECK_TPCZS_CORRUPTION if (padAndRow.getRow() >= GPUCA_ROW_COUNT) { positions[positionOffset] = INVALID_CHARGE_POS; @@ -560,9 +560,9 @@ GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, floa return; } #endif - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - ChargePos pos(padAndRow.getRow(), padAndRow.getPad(), localTime); + CfChargePos pos(padAndRow.getRow(), padAndRow.getPad(), localTime); positions[positionOffset] = pos; charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad()); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx index dab8123698abf..429d51685e504 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx @@ -15,7 +15,7 @@ #include "GPUTPCCFDeconvolution.h" #include 
"CfConsts.h" #include "CfUtils.h" -#include "ChargePos.h" +#include "CfChargePos.h" #include "GPUDefMacros.h" using namespace o2::gpu; @@ -24,15 +24,15 @@ using namespace o2::gpu::tpccf; template <> GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D isPeakMap(clusterer.mPpeakMap); GPUTPCCFDeconvolution::deconvolutionImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, isPeakMap, chargeMap, clusterer.mPpositions, clusterer.mPmemory->counters.nPositions); } GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, - const Array2D& peakMap, - Array2D& chargeMap, - const ChargePos* positions, + const CfArray2D& peakMap, + CfArray2D& chargeMap, + const CfChargePos* positions, const uint32_t digitnum) { SizeT idx = get_global_id(0); @@ -40,7 +40,7 @@ GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t bool iamDummy = (idx >= digitnum); idx = iamDummy ? 
digitnum - 1 : idx; - ChargePos pos = positions[idx]; + CfChargePos pos = positions[idx]; bool iamPeak = CfUtils::isPeak(peakMap[pos]); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h index 78fcc8ba1785a..e971a042e95a4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h @@ -20,7 +20,7 @@ #include "GPUGeneralKernels.h" #include "GPUConstantMem.h" #include "GPUTPCClusterFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" namespace o2::gpu @@ -31,7 +31,7 @@ class GPUTPCCFDeconvolution : public GPUKernelTemplate public: static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFDeconvolution); struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64 { - ChargePos posBcast1[SCRATCH_PAD_WORK_GROUP_SIZE]; + CfChargePos posBcast1[SCRATCH_PAD_WORK_GROUP_SIZE]; uint8_t aboveThresholdBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; uint8_t buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_COUNT_N]; }; @@ -51,7 +51,7 @@ class GPUTPCCFDeconvolution : public GPUKernelTemplate GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... 
args); private: - static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const Array2D&, Array2D&, const ChargePos*, const uint32_t); + static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D&, CfArray2D&, const CfChargePos*, const uint32_t); static GPUdi() uint8_t countPeaksInner(uint16_t, const uint8_t*, uint8_t*); static GPUdi() uint8_t countPeaksOuter(uint16_t, uint8_t, const uint8_t*); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx index f3a914cbfcaee..4dfa50d9439e4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx @@ -13,10 +13,10 @@ /// \author Felix Weiglhofer #include "GPUTPCCFNoiseSuppression.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "CfConsts.h" #include "CfUtils.h" -#include "ChargePos.h" +#include "CfChargePos.h" using namespace o2::gpu; using namespace o2::gpu::tpccf; @@ -24,29 +24,29 @@ using namespace o2::gpu::tpccf; template <> GPUdii() void GPUTPCCFNoiseSuppression::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D isPeakMap(clusterer.mPpeakMap); noiseSuppressionImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, clusterer.Param().rec, chargeMap, isPeakMap, clusterer.mPpeakPositions, clusterer.mPmemory->counters.nPeaks, clusterer.mPisPeak); } template <> GPUdii() void GPUTPCCFNoiseSuppression::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { - Array2D isPeakMap(clusterer.mPpeakMap); + CfArray2D 
isPeakMap(clusterer.mPpeakMap); updatePeaksImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer.mPpeakPositions, clusterer.mPisPeak, clusterer.mPmemory->counters.nPeaks, isPeakMap); } GPUdii() void GPUTPCCFNoiseSuppression::noiseSuppressionImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, const GPUSettingsRec& calibration, - const Array2D& chargeMap, - const Array2D& peakMap, - const ChargePos* peakPositions, + const CfArray2D& chargeMap, + const CfArray2D& peakMap, + const CfChargePos* peakPositions, const uint32_t peaknum, uint8_t* isPeakPredicate) { SizeT idx = get_global_id(0); - ChargePos pos = peakPositions[CAMath::Min(idx, (SizeT)(peaknum - 1))]; + CfChargePos pos = peakPositions[CAMath::Min(idx, (SizeT)(peaknum - 1))]; Charge charge = chargeMap[pos].unpack(); uint64_t minimas, bigger, peaksAround; @@ -75,10 +75,10 @@ GPUdii() void GPUTPCCFNoiseSuppression::noiseSuppressionImpl(int32_t nBlocks, in } GPUd() void GPUTPCCFNoiseSuppression::updatePeaksImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, - const ChargePos* peakPositions, + const CfChargePos* peakPositions, const uint8_t* isPeak, const uint32_t peakNum, - Array2D& peakMap) + CfArray2D& peakMap) { SizeT idx = get_global_id(0); @@ -86,7 +86,7 @@ GPUd() void GPUTPCCFNoiseSuppression::updatePeaksImpl(int32_t nBlocks, int32_t n return; } - ChargePos pos = peakPositions[idx]; + CfChargePos pos = peakPositions[idx]; uint8_t peak = isPeak[idx]; @@ -164,12 +164,12 @@ GPUdi() bool GPUTPCCFNoiseSuppression::keepPeak( } GPUd() void GPUTPCCFNoiseSuppression::findMinimaAndPeaks( - const Array2D& chargeMap, - const Array2D& peakMap, + const CfArray2D& chargeMap, + const CfArray2D& peakMap, const GPUSettingsRec& calibration, float q, - const ChargePos& pos, - ChargePos* posBcast, + const CfChargePos& pos, + CfChargePos* posBcast, PackedCharge* buf, uint64_t* minimas, uint64_t* bigger, diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h index 71236bc317443..59196da11079b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h @@ -19,13 +19,13 @@ #include "GPUGeneralKernels.h" #include "GPUConstantMem.h" #include "GPUTPCClusterFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" namespace o2::gpu { -struct ChargePos; +struct CfChargePos; class GPUTPCCFNoiseSuppression : public GPUKernelTemplate { @@ -38,7 +38,7 @@ class GPUTPCCFNoiseSuppression : public GPUKernelTemplate static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFNoiseSuppression_noiseSuppression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks)); struct GPUSharedMemory { - ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + CfChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_NOISE_N]; }; @@ -57,9 +57,9 @@ class GPUTPCCFNoiseSuppression : public GPUKernelTemplate GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... 
args); private: - static GPUd() void noiseSuppressionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const GPUSettingsRec&, const Array2D&, const Array2D&, const ChargePos*, const uint32_t, uint8_t*); + static GPUd() void noiseSuppressionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const GPUSettingsRec&, const CfArray2D&, const CfArray2D&, const CfChargePos*, const uint32_t, uint8_t*); - static GPUd() void updatePeaksImpl(int32_t, int32_t, int32_t, int32_t, const ChargePos*, const uint8_t*, const uint32_t, Array2D&); + static GPUd() void updatePeaksImpl(int32_t, int32_t, int32_t, int32_t, const CfChargePos*, const uint8_t*, const uint32_t, CfArray2D&); static GPUdi() void checkForMinima(const float, const float, const float, PackedCharge, int32_t, uint64_t*, uint64_t*); @@ -69,7 +69,7 @@ class GPUTPCCFNoiseSuppression : public GPUKernelTemplate static GPUdi() bool keepPeak(uint64_t, uint64_t); - static GPUd() void findMinimaAndPeaks(const Array2D&, const Array2D&, const GPUSettingsRec&, float, const ChargePos&, ChargePos*, PackedCharge*, uint64_t*, uint64_t*, uint64_t*); + static GPUd() void findMinimaAndPeaks(const CfArray2D&, const CfArray2D&, const GPUSettingsRec&, float, const CfChargePos&, CfChargePos*, PackedCharge*, uint64_t*, uint64_t*, uint64_t*); }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.cxx index 1de922f716c14..6749ab8e8485e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.cxx @@ -14,7 +14,7 @@ #include "GPUTPCCFPeakFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "CfUtils.h" #include "PackedCharge.h" #include "TPCPadGainCalib.h" @@ -25,19 +25,19 @@ using namespace o2::gpu::tpccf; template <> GPUdii() void GPUTPCCFPeakFinder::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, 
processorType& clusterer) { - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D isPeakMap(clusterer.mPpeakMap); findPeaksImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, chargeMap, clusterer.mPpadIsNoisy, clusterer.mPpositions, clusterer.mPmemory->counters.nPositions, clusterer.Param().rec, *clusterer.GetConstantMem()->calibObjects.tpcPadGain, clusterer.mPisPeak, isPeakMap); } GPUdii() bool GPUTPCCFPeakFinder::isPeak( GPUSharedMemory& smem, Charge q, - const ChargePos& pos, + const CfChargePos& pos, uint16_t N, - const Array2D& chargeMap, + const CfArray2D& chargeMap, const GPUSettingsRec& calib, - ChargePos* posBcast, + CfChargePos* posBcast, PackedCharge* buf) { uint16_t ll = get_local_id(0); @@ -91,21 +91,21 @@ GPUdii() bool GPUTPCCFPeakFinder::isPeak( } GPUd() void GPUTPCCFPeakFinder::findPeaksImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, - const Array2D& chargeMap, + const CfArray2D& chargeMap, const uint8_t* padHasLostBaseline, - const ChargePos* positions, + const CfChargePos* positions, SizeT digitnum, const GPUSettingsRec& calib, const TPCPadGainCalib& gainCorrection, // Only used for globalPad() function uint8_t* isPeakPredicate, - Array2D& peakMap) + CfArray2D& peakMap) { SizeT idx = get_global_id(0); // For certain configurations dummy work items are added, so the total // number of work items is dividable by 64. // These dummy items also compute the last digit but discard the result. - ChargePos pos = positions[CAMath::Min(idx, (SizeT)(digitnum - 1))]; + CfChargePos pos = positions[CAMath::Min(idx, (SizeT)(digitnum - 1))]; Charge charge = pos.valid() ? 
chargeMap[pos].unpack() : Charge(0); bool hasLostBaseline = padHasLostBaseline[gainCorrection.globalPad(pos.row(), pos.pad())]; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.h index ec17d98322239..e480518ddc9dd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.h @@ -19,20 +19,20 @@ #include "GPUConstantMem.h" #include "clusterFinderDefs.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" namespace o2::gpu { -struct ChargePos; +struct CfChargePos; class GPUTPCCFPeakFinder : public GPUKernelTemplate { public: static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFPeakFinder); struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64 { - ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + CfChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_SEARCH_N]; }; @@ -51,9 +51,9 @@ class GPUTPCCFPeakFinder : public GPUKernelTemplate GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... 
args); private: - static GPUd() void findPeaksImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const Array2D&, const uint8_t*, const ChargePos*, tpccf::SizeT, const GPUSettingsRec&, const TPCPadGainCalib&, uint8_t*, Array2D&); + static GPUd() void findPeaksImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D&, const uint8_t*, const CfChargePos*, tpccf::SizeT, const GPUSettingsRec&, const TPCPadGainCalib&, uint8_t*, CfArray2D&); - static GPUd() bool isPeak(GPUSharedMemory&, tpccf::Charge, const ChargePos&, uint16_t, const Array2D&, const GPUSettingsRec&, ChargePos*, PackedCharge*); + static GPUd() bool isPeak(GPUSharedMemory&, tpccf::Charge, const CfChargePos&, uint16_t, const CfArray2D&, const GPUSettingsRec&, CfChargePos*, PackedCharge*); }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx index efed3643800b6..1da5a1158a8c2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx @@ -15,7 +15,7 @@ #include "GPUTPCCFStreamCompaction.h" #include "GPUCommonAlgorithm.h" -#include "ChargePos.h" +#include "CfChargePos.h" #include "CfUtils.h" using namespace o2::gpu; @@ -92,7 +92,7 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread -GPUdii() void GPUTPCCFStreamCompaction::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t stage, ChargePos* in, ChargePos* out) +GPUdii() void GPUTPCCFStreamCompaction::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t stage, CfChargePos* in, CfChargePos* out) { uint32_t nElems = CompactionElems(clusterer, stage); SizeT bufferSize = (stage) ? 
clusterer.mNMaxClusters : clusterer.mNMaxPeaks; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx index 613c4ad9e5fa6..051391f12cc6d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx @@ -16,12 +16,14 @@ #include "GPUReconstruction.h" #include "GPUMemorySizeScalers.h" #include "GPUHostDataTypes.h" +#include "GPUSettings.h" +#include "DataFormatsTPC/ClusterNative.h" #include "DataFormatsTPC/ZeroSuppression.h" #include "DataFormatsTPC/Digit.h" -#include "ChargePos.h" -#include "Array2D.h" +#include "CfChargePos.h" +#include "CfArray2D.h" using namespace o2::gpu; using namespace o2::tpc; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 3e9ea2c6f608b..96efe08be6dc6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -46,7 +46,7 @@ namespace o2::gpu struct GPUTPCClusterMCInterimArray; struct TPCPadGainCalib; -struct ChargePos; +struct CfChargePos; class GPUTPCClusterFinder : public GPUProcessor { @@ -98,9 +98,9 @@ class GPUTPCClusterFinder : public GPUProcessor MinMaxCN* mMinMaxCN = nullptr; uint8_t* mPpadIsNoisy = nullptr; tpc::Digit* mPdigits = nullptr; // input digits, only set if ZS is skipped - ChargePos* mPpositions = nullptr; - ChargePos* mPpeakPositions = nullptr; - ChargePos* mPfilteredPeakPositions = nullptr; + CfChargePos* mPpositions = nullptr; + CfChargePos* mPpeakPositions = nullptr; + CfChargePos* mPfilteredPeakPositions = nullptr; uint8_t* mPisPeak = nullptr; uint32_t* mPclusterPosInRow = nullptr; // store the index where the corresponding cluster is stored in a bucket. // Required when MC are enabled to write the mc data to the correct position. 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx index a9fbc1b5f40e0..da30375149b7c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx @@ -14,8 +14,10 @@ #include "GPUTPCClusterFinder.h" #include "GPUReconstruction.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "DataFormatsTPC/Digit.h" +#include "DataFormatsTPC/ClusterNative.h" +#include "GPUSettings.h" using namespace o2::gpu; using namespace o2::gpu::tpccf; @@ -37,7 +39,7 @@ void GPUTPCClusterFinder::DumpDigits(std::ostream& out) void GPUTPCClusterFinder::DumpChargeMap(std::ostream& out, std::string_view title) { out << "\nClusterer - " << title << " - Sector " << mISector << " - Fragment " << mPmemory->fragment.index << "\n"; - Array2D map(mPchargeMap); + CfArray2D map(mPchargeMap); out << std::hex; @@ -71,7 +73,7 @@ void GPUTPCClusterFinder::DumpPeakMap(std::ostream& out, std::string_view title) { out << "\nClusterer - " << title << " - Sector " << mISector << " - Fragment " << mPmemory->fragment.index << "\n"; - Array2D map(mPpeakMap); + CfArray2D map(mPpeakMap); out << std::hex; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index fe3202fe7b439..092af2ea393c5 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -15,6 +15,7 @@ #include "GPUReconstruction.h" #include "ML/3rdparty/GPUORTFloat16.h" #include "GPUTPCNNClusterizer.h" +#include "GPUSettings.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index da490b0f94d58..022642f9f142e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h 
@@ -15,7 +15,7 @@ #ifndef O2_GPUTPCNNCLUSTERIZER_H #define O2_GPUTPCNNCLUSTERIZER_H -#include "ChargePos.h" +#include "CfChargePos.h" #include "GPUProcessor.h" namespace o2::OrtDataType diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 2cf9ab2037007..512bc1d3bb09b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -43,7 +43,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; @@ -58,9 +58,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); - ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D isPeakMap(clusterer.mPpeakMap); + CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); @@ -75,7 +75,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); - ChargePos peak = clusterer.mPfilteredPeakPositions[base_idx + batchStart]; + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfArray2D isPeakMap(clusterer.mPpeakMap); + CfChargePos peak = 
clusterer.mPfilteredPeakPositions[base_idx + batchStart]; int row = static_cast(peak.row()), pad = static_cast(peak.pad()); if (clustererNN.nnClusterizerAddIndexData && transient_index == (clustererNN.nnClusterizerElementSize - 1)) { uint top_idx = (base_idx + 1) * clustererNN.nnClusterizerElementSize; for (uint16_t i = 0; i < 8; i++) { Delta2 d = cfconsts::InnerNeighbors[i]; - ChargePos tmp_pos = peak.delta(d); + CfChargePos tmp_pos = peak.delta(d); clustererNN.clusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; } @@ -161,7 +161,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(chargeMap[peak].unpack()); int t = (rest_1 % (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputTime; - ChargePos tmp_pos(row + r, pad + p, time + t); + CfChargePos tmp_pos(row + r, pad + p, time + t); if (dtype == 0) { clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { @@ -227,8 +227,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; float central_charge = static_cast(chargeMap[peak].unpack()); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); @@ -322,8 +322,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; float central_charge = 
static_cast(chargeMap[peak].unpack()); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index 27cfba2487aed..dc7f537c6c1e8 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -19,7 +19,7 @@ #include "GPUGeneralKernels.h" #include "GPUConstantMem.h" #include "GPUTPCClusterFinder.h" -#include "Array2D.h" +#include "CfArray2D.h" #include "PackedCharge.h" #include "GPUTPCNNClusterizer.h" @@ -47,7 +47,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer); struct GPUSharedMemory { // Regular cluster finder - ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + CfChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; }; diff --git a/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.cxx b/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.cxx index 19ef7aa9ecd0d..e58edae208115 100644 --- a/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.cxx @@ -26,7 +26,7 @@ MCLabelAccumulator::MCLabelAccumulator(GPUTPCClusterFinder& clusterer) { } -void MCLabelAccumulator::collect(const ChargePos& pos, Charge q) +void MCLabelAccumulator::collect(const CfChargePos& pos, Charge q) { if (q == 0 || !engaged()) { return; diff --git a/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.h index 176fbea02befe..35c24bfeb5f18 100644 --- a/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/MCLabelAccumulator.h @@ -16,7 +16,7 @@ #define 
O2_GPU_MC_LABEL_ACCUMULATOR_H #include "clusterFinderDefs.h" -#include "Array2D.h" +#include "CfArray2D.h" #include #include @@ -44,14 +44,14 @@ class MCLabelAccumulator public: MCLabelAccumulator(GPUTPCClusterFinder&); - void collect(const ChargePos&, tpccf::Charge); + void collect(const CfChargePos&, tpccf::Charge); bool engaged() const { return mLabels != nullptr && mOutput != nullptr; } void commit(tpccf::Row, uint32_t, uint32_t); private: - Array2D mIndexMap; + CfArray2D mIndexMap; const o2::dataformats::ConstMCLabelContainerView* mLabels = nullptr; GPUTPCClusterMCInterimArray* mOutput = nullptr; diff --git a/GPU/GPUTracking/TPCConvert/GPUTPCConvert.cxx b/GPU/GPUTracking/TPCConvert/GPUTPCConvert.cxx index 3d6b45c372ea0..899149d320bda 100644 --- a/GPU/GPUTracking/TPCConvert/GPUTPCConvert.cxx +++ b/GPU/GPUTracking/TPCConvert/GPUTPCConvert.cxx @@ -17,6 +17,7 @@ #include "GPUTPCClusterData.h" #include "GPUReconstruction.h" #include "GPUO2DataTypes.h" +#include "GPUParam.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx index c633f10adae38..2f754d2416bc1 100644 --- a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx +++ b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx @@ -22,6 +22,7 @@ #include "GPUTRDTrackerDebug.h" #include "GPUCommonMath.h" #include "GPUCommonAlgorithm.h" +#include "GPUConstantMem.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/display/GPUDisplay.cxx b/GPU/GPUTracking/display/GPUDisplay.cxx index c2b74489e6250..5b0960919da15 100644 --- a/GPU/GPUTracking/display/GPUDisplay.cxx +++ b/GPU/GPUTracking/display/GPUDisplay.cxx @@ -13,6 +13,7 @@ /// \author David Rohr #include "GPUDisplay.h" +#include "frontend/GPUDisplayInfo.inc" #include "GPUTPCDef.h" @@ -204,7 +205,7 @@ int32_t GPUDisplay::DrawGLScene() int32_t retVal = 0; if (mChain) { mIOPtrs = &mChain->mIOPtrs; - mCalib = &mChain->calib(); + mCalib = &mChain->GetCalib(); } if (!mIOPtrs) { mNCollissions = 0; 
diff --git a/GPU/GPUTracking/display/GPUDisplay.h b/GPU/GPUTracking/display/GPUDisplay.h index bb270cda23565..dbd90020698b2 100644 --- a/GPU/GPUTracking/display/GPUDisplay.h +++ b/GPU/GPUTracking/display/GPUDisplay.h @@ -15,12 +15,10 @@ #ifndef GPUDISPLAY_H #define GPUDISPLAY_H -#include "GPUSettings.h" #include "frontend/GPUDisplayFrontend.h" #include "backend/GPUDisplayBackend.h" #include "GPUDisplayInterface.h" -#include "GPUChainTracking.h" #include "../utils/vecpod.h" #include "../utils/qsem.h" @@ -34,6 +32,7 @@ namespace o2::gpu class GPUTPCTracker; struct GPUParam; class GPUQA; +class GPUTRDGeometry; class GPUDisplay : public GPUDisplayInterface { @@ -77,7 +76,7 @@ class GPUDisplay : public GPUDisplayInterface int32_t& drawTextFontSize() { return mDrawTextFontSize; } private: - static constexpr int32_t NSECTORS = GPUChainTracking::NSECTORS; + static constexpr int32_t NSECTORS = GPUCA_NSECTORS; static constexpr float GL_SCALE_FACTOR = (1.f / 100.f); static constexpr const int32_t N_POINTS_TYPE = 15; @@ -157,16 +156,7 @@ class GPUDisplay : public GPUDisplayInterface void insertVertexList(std::pair*, vecpod*>& vBuf, size_t first, size_t last); void insertVertexList(int32_t iSector, size_t first, size_t last); template - void SetInfo(Args... args) - { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wformat-security" -#pragma GCC diagnostic ignored "-Wformat-truncation" - snprintf(mInfoText2, 1024, args...); -#pragma GCC diagnostic pop - GPUInfo("%s", mInfoText2); - mInfoText2Timer.ResetStart(); - } + void SetInfo(Args... 
args); void PrintGLHelpText(float colorValue); void calcXYZ(const float*); void mAnimationCloseAngle(float& newangle, float lastAngle); diff --git a/GPU/GPUTracking/display/backend/GPUDisplayBackend.cxx b/GPU/GPUTracking/display/backend/GPUDisplayBackend.cxx index ded8803801fb7..98d2593c27950 100644 --- a/GPU/GPUTracking/display/backend/GPUDisplayBackend.cxx +++ b/GPU/GPUTracking/display/backend/GPUDisplayBackend.cxx @@ -34,6 +34,7 @@ #endif #include "GPUDisplay.h" +#include "GPULogging.h" #include using namespace o2::gpu; diff --git a/GPU/GPUTracking/display/backend/GPUDisplayBackendOpenGL.cxx b/GPU/GPUTracking/display/backend/GPUDisplayBackendOpenGL.cxx index 3ee3384c8e118..25ae5e1f8055f 100644 --- a/GPU/GPUTracking/display/backend/GPUDisplayBackendOpenGL.cxx +++ b/GPU/GPUTracking/display/backend/GPUDisplayBackendOpenGL.cxx @@ -27,6 +27,8 @@ #include "GPUDisplayBackendOpenGL.h" #include "shaders/GPUDisplayShaders.h" #include "GPUDisplay.h" +#include "GPULogging.h" +#include "GPUParam.h" #define OPENGL_EMULATE_MULTI_DRAW 0 diff --git a/GPU/GPUTracking/display/backend/GPUDisplayBackendVulkan.cxx b/GPU/GPUTracking/display/backend/GPUDisplayBackendVulkan.cxx index 2324c194d04b9..93c19356ac062 100644 --- a/GPU/GPUTracking/display/backend/GPUDisplayBackendVulkan.cxx +++ b/GPU/GPUTracking/display/backend/GPUDisplayBackendVulkan.cxx @@ -19,6 +19,8 @@ VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE #include "GPUCommonDef.h" #include "GPUDisplayBackendVulkan.h" #include "GPUDisplay.h" +#include "GPULogging.h" +#include "GPUParam.h" #include diff --git a/GPU/GPUTracking/display/frontend/GPUDisplayFrontend.cxx b/GPU/GPUTracking/display/frontend/GPUDisplayFrontend.cxx index 590d8648eb5bb..22970c3228815 100644 --- a/GPU/GPUTracking/display/frontend/GPUDisplayFrontend.cxx +++ b/GPU/GPUTracking/display/frontend/GPUDisplayFrontend.cxx @@ -30,6 +30,9 @@ #include "GPUDisplayFrontendWayland.h" #endif +#include "GPULogging.h" +#include + #ifdef GPUCA_BUILD_EVENT_DISPLAY_QT 
#include "GPUDisplayGUIWrapper.h" #else diff --git a/GPU/GPUTracking/display/frontend/GPUDisplayFrontendWayland.cxx b/GPU/GPUTracking/display/frontend/GPUDisplayFrontendWayland.cxx index ad3b620ba8f55..d0aae2ffaad02 100644 --- a/GPU/GPUTracking/display/frontend/GPUDisplayFrontendWayland.cxx +++ b/GPU/GPUTracking/display/frontend/GPUDisplayFrontendWayland.cxx @@ -18,6 +18,7 @@ #include "GPUDisplayGUIWrapper.h" #include "GPUDisplay.h" #include "GPULogging.h" +#include "GPUParam.h" #include #include #include diff --git a/GPU/GPUTracking/display/frontend/GPUDisplayInfo.inc b/GPU/GPUTracking/display/frontend/GPUDisplayInfo.inc new file mode 100644 index 0000000000000..b6ac78b31f315 --- /dev/null +++ b/GPU/GPUTracking/display/frontend/GPUDisplayInfo.inc @@ -0,0 +1,36 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUDisplayInfo.inc +/// \author David Rohr + +#ifndef GPUDISPLAYINFO_INC_H +#define GPUDISPLAYINFO_INC_H + +#include "GPUDisplay.h" +#include "GPULogging.h" + +namespace o2::gpu +{ +template +void GPUDisplay::SetInfo(Args... 
args) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-security" +#pragma GCC diagnostic ignored "-Wformat-truncation" + snprintf(mInfoText2, 1024, args...); +#pragma GCC diagnostic pop + GPUInfo("%s", mInfoText2); + mInfoText2Timer.ResetStart(); +} +} // namespace o2::gpu + +#endif diff --git a/GPU/GPUTracking/display/frontend/GPUDisplayKeys.cxx b/GPU/GPUTracking/display/frontend/GPUDisplayKeys.cxx index 32ff6c73e110c..54258857a244c 100644 --- a/GPU/GPUTracking/display/frontend/GPUDisplayKeys.cxx +++ b/GPU/GPUTracking/display/frontend/GPUDisplayKeys.cxx @@ -13,6 +13,7 @@ /// \author David Rohr #include "GPUDisplay.h" +#include "frontend/GPUDisplayInfo.inc" using namespace o2::gpu; diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayAnimation.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayAnimation.cxx index 6c0595b073cd0..cc9ec2e766c4d 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayAnimation.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayAnimation.cxx @@ -13,6 +13,8 @@ /// \author David Rohr #include "GPUDisplay.h" +#include "frontend/GPUDisplayInfo.inc" +#include "GPUCommonMath.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayBackendOpenGLMagneticField.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayBackendOpenGLMagneticField.cxx index b04c93ab8496e..ef94628baeb38 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayBackendOpenGLMagneticField.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayBackendOpenGLMagneticField.cxx @@ -31,6 +31,7 @@ #include "backend/GPUDisplayBackendOpenGL.h" #include "shaders/GPUDisplayShaders.h" #include "GPUDisplay.h" +#include "GPULogging.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx index ca9fd6be01703..9d188d03c7b69 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx @@ -13,6 
+13,9 @@ /// \author David Rohr #include "GPUDisplay.h" +#include "GPUChainTracking.h" +#include "GPULogging.h" +#include "GPUParam.h" #ifndef _WIN32 #include "bitmapfile.h" diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayInterpolation.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayInterpolation.cxx index 644995929acb7..4dacaec2fbca5 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayInterpolation.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayInterpolation.cxx @@ -14,6 +14,7 @@ #include #include "GPUDisplay.h" +#include "GPULogging.h" using namespace o2::gpu; diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayLoader.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayLoader.cxx index ee50f32e3c1ac..d31ee206f35e2 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayLoader.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayLoader.cxx @@ -16,6 +16,7 @@ #include "frontend/GPUDisplayFrontend.h" #include "GPUDisplayInterface.h" +#include #include #include diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayROOT.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayROOT.cxx index dbeefc7bf9b07..07a05e585d422 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayROOT.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayROOT.cxx @@ -17,6 +17,9 @@ #endif #include "GPUDisplay.h" +#include "GPULogging.h" +#include "GPUConstantMem.h" +#include "GPUChainTracking.h" using namespace o2::gpu; #ifndef GPUCA_NO_ROOT diff --git a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx index 24668c576d795..e1d63ea1a21e4 100644 --- a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx +++ b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx @@ -28,6 +28,7 @@ #include "GPUTPCMCInfo.h" #include "GPUParam.inc" #include "GPUCommonMath.h" +#include "GPUChainTracking.h" #include @@ -43,7 +44,7 @@ using namespace o2::gpu; #define GET_CID(sector, i) (mParam->par.earlyTpcTransform ? 
mIOPtrs->clusterData[sector][i].id : (mIOPtrs->clustersNative->clusterOffset[sector][0] + i)) const GPUTRDGeometry* GPUDisplay::trdGeometry() { return (GPUTRDGeometry*)mCalib->trdGeometry; } -const GPUTPCTracker& GPUDisplay::sectorTracker(int32_t iSector) { return mChain->GetTPCSectorTrackers()[iSector]; } +const GPUTPCTracker& GPUDisplay::sectorTracker(int32_t iSector) { return mChain->GetProcessors()->tpcTrackers[iSector]; } inline void GPUDisplay::insertVertexList(std::pair*, vecpod*>& vBuf, size_t first, size_t last) { diff --git a/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx b/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx index 0a780732273db..ab4c0abd7b60e 100644 --- a/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx +++ b/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx @@ -17,6 +17,7 @@ #endif #include "GPUDisplay.h" +#include "frontend/GPUDisplayInfo.inc" #include "GPUO2DataTypes.h" #include "GPUTPCClusterData.h" #include "GPUTPCConvertImpl.h" diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 7e3ddf868af2a..202ea47d1f3bf 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -128,7 +128,7 @@ o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanStart" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanUp" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t nElems) o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanTop" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t nElems) o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanDown" "= TPCCLUSTERFINDER" LB int32_t iBuf "uint32_t" offset int32_t nElems) -o2_gpu_add_kernel("GPUTPCCFStreamCompaction, compactDigits" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t stage ChargePos* in ChargePos* out) +o2_gpu_add_kernel("GPUTPCCFStreamCompaction, compactDigits" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t stage CfChargePos* in CfChargePos* out) o2_gpu_add_kernel("GPUTPCCFDecodeZS" "= TPCCLUSTERFINDER" LB int32_t firstHBF) 
o2_gpu_add_kernel("GPUTPCCFDecodeZSLink" "GPUTPCCFDecodeZS" LB int32_t firstHBF) o2_gpu_add_kernel("GPUTPCCFDecodeZSDenseLink" "GPUTPCCFDecodeZS" LB int32_t firstHBF) diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index ba7aeb3800a5e..6a2623fb6e09d 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -38,6 +38,7 @@ #include "GPUTPCDef.h" #include "GPUTPCTrackingData.h" #include "GPUChainTracking.h" +#include "GPUChainTrackingGetters.inc" #include "GPUTPCTrack.h" #include "GPUTPCTracker.h" #include "GPUTPCGMMergedTrack.h" @@ -1702,7 +1703,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } } - uint32_t nCl = clNative ? clNative->nClustersTotal : mTracking->GetTPCMerger().NMaxClusters(); + uint32_t nCl = clNative ? clNative->nClustersTotal : mTracking->GetProcessors()->tpcMerger.NMaxClusters(); mClusterCounts.nTotal += nCl; if (mQATasks & taskClusterCounts) { for (uint32_t i = 0; i < nCl; i++) { diff --git a/GPU/GPUTracking/qa/genEvents.cxx b/GPU/GPUTracking/qa/genEvents.cxx index 2e1bc1c5c64b2..9c2220f9ef748 100644 --- a/GPU/GPUTracking/qa/genEvents.cxx +++ b/GPU/GPUTracking/qa/genEvents.cxx @@ -37,6 +37,7 @@ #include "GPUTPCGMPropagator.h" #include "GPUTPCGMMerger.h" #include "GPUChainTracking.h" +#include "GPUConstantMem.h" #include "../utils/qconfig.h" @@ -169,7 +170,7 @@ int32_t genEvents::GenerateEvent(const GPUParam& param, char* filename) GPUTPCGMPropagator prop; { prop.SetToyMCEventsFlag(kTRUE); - const GPUTPCGMMerger& merger = mRec->GetTPCMerger(); + const GPUTPCGMMerger& merger = mRec->GetProcessors()->tpcMerger; prop.SetPolynomialField(&merger.Param().polynomialField); }