diff --git a/GPU/Common/GPUCommonDefAPI.h b/GPU/Common/GPUCommonDefAPI.h
index f7efbf7e976d4..b029038a3b521 100644
--- a/GPU/Common/GPUCommonDefAPI.h
+++ b/GPU/Common/GPUCommonDefAPI.h
@@ -43,7 +43,7 @@
   #define GPUhd()                                   // Host and device function, inlined during GPU compilation to avoid symbol clashes in host code
   #define GPUhdi() inline                           // Host and device function, to-be-inlined on host and device
   #define GPUhdni()                                 // Host and device function, not to-be-inlined automatically
-  #define GPUg() INVALID_TRIGGER_ERROR_NO_HOST_CODE // GPU kernel
+  #define GPUg() INVALID_TRIGGER_ERROR_NO_GPU_CODE  // GPU kernel
   #define GPUshared()                               // shared memory variable declaration
   #define GPUglobal()                               // global memory variable declaration (only used for kernel input pointers)
   #define GPUconstant()                             // constant memory variable declaraion
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
index 9b569d3e88f3c..39507beda8a55 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -16,7 +16,7 @@
 #include "GPUReconstructionIncludes.h"
 #include "GPUReconstructionThreading.h"
 #include "GPUChain.h"
-#include "GPUDefParameters.h"
+#include "GPUDefParametersRuntime.h"
 #include "GPUTPCClusterData.h"
 #include "GPUTPCSectorOutCluster.h"
 #include "GPUTPCGMMergedTrack.h"
diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt
index e17f1fcd7091e..3655eaf66055e 100644
--- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt
+++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt
@@ -17,7 +17,7 @@ if(DEFINED CUDA_COMPUTETARGET)
 endif()
 message(STATUS "Building GPUTracking with CUDA support ${TMP_TARGET}")
 
-set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu)
+set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDARTCCalls.cu)
 set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesSystem.h)
 # -------------------------------- Prepare RTC -------------------------------------------------------
 enable_language(ASM)
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
index f475929d49d50..47a9b675d27f6 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -111,6 +111,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime()
   constexpr int32_t reqVerMin = 0;
 #endif
   if (mProcessingSettings.rtc.enable && mProcessingSettings.rtctech.runTest == 2) {
+    mWarpSize = GPUCA_WARP_SIZE;
     genAndLoadRTC();
     exit(0);
   }
@@ -244,16 +245,12 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime()
       GPUInfo("\ttextureAlignment = %ld", (uint64_t)deviceProp.textureAlignment);
       GPUInfo(" ");
     }
-    if (deviceProp.warpSize != GPUCA_WARP_SIZE) {
+    if (deviceProp.warpSize != GPUCA_WARP_SIZE && !mProcessingSettings.rtc.enable) {
       throw std::runtime_error("Invalid warp size on GPU");
     }
+    mWarpSize = deviceProp.warpSize;
     mBlockCount = deviceProp.multiProcessorCount;
     mMaxBackendThreads = std::max<int32_t>(mMaxBackendThreads, deviceProp.maxThreadsPerBlock * mBlockCount);
-#ifndef __HIPCC__ // CUDA
-    mWarpSize = 32;
-#else // HIP
-    mWarpSize = 64;
-#endif
     mDeviceName = deviceProp.name;
     mDeviceName += " (CUDA GPU)";
 
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
index ac5920f769f25..3441c6b9a4fd6 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
@@ -45,8 +45,6 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase
   template <class T, int32_t I = 0, typename... Args>
   void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);
 
-  void getRTCKernelCalls(std::vector<std::string>& kernels);
-
   template <class T, class S>
   friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp);
   GPUReconstructionCUDAInternals* mInternals;
@@ -91,6 +89,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels<GPUReconstructionC
 
  private:
   int32_t genRTC(std::string& filename, uint32_t& nCompile);
+  void getRTCKernelCalls(std::vector<std::string>& kernels);
   void genAndLoadRTC();
   void loadKernelModules(bool perKernel);
   const char *mRtcSrcExtension = ".src", *mRtcBinExtension = ".o";
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
index 5f481d2cb9058..abcd47ca01c90 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
@@ -73,7 +73,8 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
     }
     fclose(fp);
   }
-  const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true);
+  const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true) +
+                                   "#define GPUCA_WARP_SIZE " + std::to_string(mWarpSize) + "\n";
   if (mProcessingSettings.rtctech.printLaunchBounds || mProcessingSettings.debugLevel >= 3) {
     GPUInfo("RTC Launch Bounds:\n%s", launchBounds.c_str());
   }
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesSystem.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesSystem.h
index 1cb3679fc30dc..3f072059a9ad7 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesSystem.h
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesSystem.h
@@ -12,8 +12,8 @@
 /// \file GPUReconstructionCUDAIncludesSystem.h
 /// \author David Rohr
 
-#ifndef O2_GPU_GPURECONSTRUCTIONCUDAINCLUDES_H
-#define O2_GPU_GPURECONSTRUCTIONCUDAINCLUDES_H
+#ifndef O2_GPU_GPURECONSTRUCTIONCUDAINCLUDESSYSTEM_H
+#define O2_GPU_GPURECONSTRUCTIONCUDAINCLUDESSYSTEM_H
 
 #include <cstdint>
 #include <type_traits>
@@ -32,4 +32,4 @@
 #include <sm_20_atomic_functions.h>
 #include <cuda_fp16.h>
 
-#endif
+#endif // O2_GPU_GPURECONSTRUCTIONCUDAINCLUDESSYSTEM_H
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
index f8efd8428f035..cf08785e6b3d5 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
@@ -118,14 +118,3 @@ static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstS
   return retVal;
 });
 #endif
-
-void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& kernels)
-{
-#undef GPUCA_KRNL_LB
-#undef __launch_bounds__
-#define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__)));
-#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_RTC_LB_, GPUCA_M_KRNL_NAME(x_class))), GPUCA_M_STRIP(x_attributes)), __VA_ARGS__)
-#include "GPUReconstructionKernelList.h"
-#undef GPUCA_KRNL
-#undef GPUCA_KRNL_LB
-}
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDARTCCalls.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDARTCCalls.cu
new file mode 100644
index 0000000000000..571428dc39e21
--- /dev/null
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDARTCCalls.cu
@@ -0,0 +1,32 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUReconstructionCUDARTCCalls.cu
+/// \author David Rohr
+
+#define GPUCA_GPUCODE_HOSTONLY
+#define GPUCA_GPUCODE_NO_LAUNCH_BOUNDS
+
+#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_STRIP(args))
+
+#include "GPUReconstructionCUDAIncludesSystem.h"
+#include "GPUReconstructionCUDADef.h"
+#include "GPUReconstructionCUDA.h"
+
+using namespace o2::gpu;
+
+void GPUReconstructionCUDA::getRTCKernelCalls(std::vector<std::string>& kernels)
+{
+#undef GPUCA_KRNL
+#define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__)));
+#undef __launch_bounds__
+#include "GPUReconstructionKernelList.h"
+}
diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt
index 9a9b1e36a167c..3a03a054d4a7e 100644
--- a/GPU/GPUTracking/Base/hip/CMakeLists.txt
+++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt
@@ -24,20 +24,30 @@ message(STATUS "Building GPUTracking with HIP support ${TMP_TARGET}")
 if(NOT DEFINED GPUCA_HIP_HIPIFY_FROM_CUDA OR "${GPUCA_HIP_HIPIFY_FROM_CUDA}")
   set(GPUCA_HIP_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/hipify)
   file(MAKE_DIRECTORY ${GPUCA_HIP_SOURCE_DIR})
-  set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu)
+  set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu GPUReconstructionCUDARTCCalls.cu)
   set(GPUCA_HIP_LOCAL_FILE_LIST GPUReconstructionHIPIncludesSystem.h)
   set(HIP_SOURCES "")
   foreach(file ${GPUCA_HIP_FILE_LIST})
     get_filename_component(ABS_CUDA_SORUCE ../cuda/${file} ABSOLUTE)
     get_filename_component(CUDA_SOURCE ${file} NAME)
+    get_filename_component(CUDA_SOURCE_EXT ${file} EXT)
     string(REPLACE ".cu" ".hip" HIP_SOURCE1 ${CUDA_SOURCE})
     string(REPLACE "CUDA" "HIP" HIP_SOURCE ${HIP_SOURCE1})
-    add_custom_command(
-      OUTPUT ${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE}
-      COMMAND ${hip_HIPIFY_PERL_EXECUTABLE} --quiet-warnings ${ABS_CUDA_SORUCE} | sed -e 's/CUDA/HIP/g' -e 's/cuda/hip/g' > ${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE}
-      DEPENDS ${ABS_CUDA_SORUCE}
-      COMMENT "Hippifying ${HIP_SOURCE}"
-    )
+    if(CUDA_SOURCE_EXT STREQUAL ".cu" OR CUDA_SOURCE_EXT STREQUAL ".h")
+      add_custom_command(
+        OUTPUT ${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE}
+        COMMAND ${hip_HIPIFY_PERL_EXECUTABLE} --quiet-warnings ${ABS_CUDA_SORUCE} | sed -e 's/CUDA/HIP/g' -e 's/cuda/hip/g' > ${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE}
+        DEPENDS ${ABS_CUDA_SORUCE}
+        COMMENT "Hippifying ${HIP_SOURCE}"
+      )
+    else()
+      add_custom_command(
+        OUTPUT ${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE}
+        COMMAND sed -e 's/CUDA/HIP/g' -e 's/cuda/hip/g' ${ABS_CUDA_SORUCE} > ${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE}
+        DEPENDS ${ABS_CUDA_SORUCE}
+        COMMENT "Generating HIP source ${HIP_SOURCE}"
+      )
+    endif()
     list(APPEND HIP_SOURCES "${GPUCA_HIP_SOURCE_DIR}/${HIP_SOURCE}")
   endforeach()
   foreach(file ${GPUCA_HIP_LOCAL_FILE_LIST})
@@ -61,7 +71,7 @@ else()
   get_filename_component(GPUCA_HIP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} ABSOLUTE)
 endif()
 
-set(SRCS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.hip ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPKernels.hip)
+set(SRCS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.hip ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPKernels.hip ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPRTCCalls.hip)
 set(SRCS_CXX ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPGenRTC.cxx)
 set(HDRS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPInternals.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPHelpers.inc ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPDef.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPIncludesSystem.h)
 
diff --git a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesSystem.h b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesSystem.h
index cfe1121ef1089..1a3a1ff0108af 100644
--- a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesSystem.h
+++ b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesSystem.h
@@ -12,8 +12,8 @@
 /// \file GPUReconstructionHIPIncludesSystem.h
 /// \author David Rohr
 
-#ifndef O2_GPU_RECONSTRUCTIONHIPINCLUDES_H
-#define O2_GPU_RECONSTRUCTIONHIPINCLUDES_H
+#ifndef O2_GPU_RECONSTRUCTIONHIPINCLUDESSYSTEM_H
+#define O2_GPU_RECONSTRUCTIONHIPINCLUDESSYSTEM_H
 
 #include <hip/hip_runtime.h>
 #include <hip/hip_ext.h>
@@ -25,4 +25,4 @@
 #include <thrust/device_ptr.h>
 #pragma GCC diagnostic pop
 
-#endif
+#endif // O2_GPU_RECONSTRUCTIONHIPINCLUDESSYSTEM_H
diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index 44a630fe19f48..f428d982394e0 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -129,7 +129,9 @@ set(HDRS_INSTALL
     DataTypes/GPUTriggerOutputs.h
     Debug/GPUROOTDump.h
     Definitions/GPUDefConstantsAndSettings.h
-    Definitions/GPUDefParametersDefault.h
+    Definitions/GPUDefParametersWrapper.h
+    Definitions/GPUDefParametersConstants.h
+    Definitions/GPUDefParametersDefaults.h
     Definitions/GPUDef.h
     Definitions/GPUDefMacros.h
     Definitions/GPULogging.h
@@ -234,7 +236,7 @@ set(TEMPLATE_HEADER_LIST Base/GPUReconstructionKernelList.template.h
                          Base/GPUReconstructionKernelIncludes.template.h
                          Base/GPUReconstructionIncludesDeviceAll.template.h
                          cmake/GPUNoFastMathKernels.template.h
-                         Definitions/GPUDefParameters.template.h
+                         Definitions/GPUDefParametersRuntime.template.h
                          Definitions/GPUDefParametersLoad.template.inc)
 set(GENERATED_HEADERS_LIST "")
 
@@ -258,7 +260,7 @@ add_custom_command(
 )
 list(APPEND GENERATED_HEADERS_LIST ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUDefParametersLoadPrepare.h)
 
-set(HDRS_INSTALL ${HDRS_INSTALL} ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUReconstructionKernelList.h ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUDefParameters.h ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUDefParametersLoad.inc ${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly/GPUDefParametersLoadPrepare.h)
+set(HDRS_INSTALL ${HDRS_INSTALL} ${GENERATED_HEADERS_LIST})
 include(kernels.cmake)
 
 # Optional sources depending on optional dependencies
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx
index 445c03113cd39..5dbbf63ca8264 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx
@@ -258,6 +258,9 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
         break;
       }
       if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionDifferences) {
+#ifdef GPUCA_GPUCODE
+        static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached) * 2 <= GPUCA_TPC_COMP_CHUNK_SIZE);
+#endif
         if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) {
           CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
         } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) {
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.h
index b0bb8a6c12ecc..81817abf1e6d6 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.h
@@ -72,15 +72,19 @@ class GPUTPCCompressionGatherKernels : public GPUKernelTemplate
   using Vec64 = uint64_t;
   using Vec128 = uint4;
 
-  struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64<uint32_t, GPUCA_GET_THREAD_COUNT(GPUCA_LB_COMPRESSION_GATHER)> {
+  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_buffered32));
+  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_buffered64));
+  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_buffered128));
+  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_multiBlock));
+  struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64<uint32_t, GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)> {
     union {
-      uint32_t warpOffset[GPUCA_GET_WARP_COUNT(GPUCA_LB_COMPRESSION_GATHER)];
-      Vec32 buf32[GPUCA_GET_WARP_COUNT(GPUCA_LB_COMPRESSION_GATHER)][GPUCA_WARP_SIZE];
-      Vec64 buf64[GPUCA_GET_WARP_COUNT(GPUCA_LB_COMPRESSION_GATHER)][GPUCA_WARP_SIZE];
-      Vec128 buf128[GPUCA_GET_WARP_COUNT(GPUCA_LB_COMPRESSION_GATHER)][GPUCA_WARP_SIZE];
+      uint32_t warpOffset[GPUCA_GET_WARP_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)];
+      Vec32 buf32[GPUCA_GET_WARP_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)][GPUCA_WARP_SIZE];
+      Vec64 buf64[GPUCA_GET_WARP_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)][GPUCA_WARP_SIZE];
+      Vec128 buf128[GPUCA_GET_WARP_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)][GPUCA_WARP_SIZE];
       struct {
-        uint32_t sizes[GPUCA_GET_WARP_COUNT(GPUCA_LB_COMPRESSION_GATHER)][GPUCA_WARP_SIZE];
-        uint32_t srcOffsets[GPUCA_GET_WARP_COUNT(GPUCA_LB_COMPRESSION_GATHER)][GPUCA_WARP_SIZE];
+        uint32_t sizes[GPUCA_GET_WARP_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)][GPUCA_WARP_SIZE];
+        uint32_t srcOffsets[GPUCA_GET_WARP_COUNT(GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered)][GPUCA_WARP_SIZE];
       } unbuffered;
     };
 
diff --git a/GPU/GPUTracking/Definitions/GPUDef.h b/GPU/GPUTracking/Definitions/GPUDef.h
index 404f35f971c94..c77b9ce159306 100644
--- a/GPU/GPUTracking/Definitions/GPUDef.h
+++ b/GPU/GPUTracking/Definitions/GPUDef.h
@@ -18,7 +18,7 @@
 
 #include "GPUCommonDef.h"
 #include "GPUDefConstantsAndSettings.h"
-#include "GPUDefParametersDefault.h"
+#include "GPUDefParametersWrapper.h"
 #include "GPUCommonRtypes.h"
 
 // Macros for masking ptrs in OpenCL kernel calls as uint64_t (The API only allows us to pass buffer objects)
diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersConstants.h b/GPU/GPUTracking/Definitions/GPUDefParametersConstants.h
new file mode 100644
index 0000000000000..3a16d02ecf7c6
--- /dev/null
+++ b/GPU/GPUTracking/Definitions/GPUDefParametersConstants.h
@@ -0,0 +1,87 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUDefParametersConstants.h
+/// \author David Rohr
+
+// This file contains compile-time constants, independent from the backend
+
+#ifndef GPUDEFPARAMETERSCONSTANTS_H
+#define GPUDEFPARAMETERSCONSTANTS_H
+// clang-format off
+
+#define GPUCA_THREAD_COUNT_SCAN 512 // TODO: WARNING!!! Must not be GPUTYPE-dependent right now! // TODO: Fix!
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  #define GPUCA_SPECIALIZE_THRUST_SORTS
+#endif
+
+#define GPUCA_MAX_THREADS 1024
+#define GPUCA_MAX_STREAMS 36
+
+#if defined(GPUCA_GPUCODE)
+  #define GPUCA_SORT_STARTHITS                                         // Sort the start hits when running on GPU
+#endif
+
+#define GPUCA_ROWALIGNMENT 16                                          // Align of Row Hits and Grid
+#define GPUCA_BUFFER_ALIGNMENT 64                                      // Alignment of buffers obtained from SetPointers
+#define GPUCA_MEMALIGN (64 * 1024)                                     // Alignment of allocated memory blocks
+
+// Default maximum numbers
+#define GPUCA_MAX_CLUSTERS           ((size_t)     1024 * 1024 * 1024) // Maximum number of TPC clusters
+#define GPUCA_MAX_TRD_TRACKLETS      ((size_t)             128 * 1024) // Maximum number of TRD tracklets
+#define GPUCA_MAX_ITS_FIT_TRACKS     ((size_t)              96 * 1024) // Max number of tracks for ITS track fit
+#define GPUCA_MEMORY_SIZE            ((size_t) 6 * 1024 * 1024 * 1024) // Size of memory allocated on Device
+#define GPUCA_HOST_MEMORY_SIZE       ((size_t) 1 * 1024 * 1024 * 1024) // Size of memory allocated on Host
+#define GPUCA_GPU_STACK_SIZE         ((size_t)               8 * 1024) // Stack size per GPU thread
+#define GPUCA_GPU_HEAP_SIZE          ((size_t)       16 * 1025 * 1024) // Stack size per GPU thread
+
+#ifdef GPUCA_GPUCODE
+  #ifndef GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP
+     #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 6
+  #endif
+  #ifndef GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE
+     #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 12
+  #endif
+  #ifndef GPUCA_ALTERNATE_BORDER_SORT
+     #define GPUCA_ALTERNATE_BORDER_SORT 0
+  #endif
+  #ifndef GPUCA_SORT_BEFORE_FIT
+     #define GPUCA_SORT_BEFORE_FIT 0
+  #endif
+  #ifndef GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION
+     #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 0
+  #endif
+  #ifndef GPUCA_COMP_GATHER_KERNEL
+     #define GPUCA_COMP_GATHER_KERNEL 0
+  #endif
+  #ifndef GPUCA_COMP_GATHER_MODE
+     #define GPUCA_COMP_GATHER_MODE 2
+  #endif
+#else
+  #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 0
+  #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 0
+  #define GPUCA_ALTERNATE_BORDER_SORT 0
+  #define GPUCA_SORT_BEFORE_FIT 0
+  #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 0
+  #define GPUCA_THREAD_COUNT_FINDER 1
+  #define GPUCA_COMP_GATHER_KERNEL 0
+  #define GPUCA_COMP_GATHER_MODE 0
+#endif
+#ifndef GPUCA_DEDX_STORAGE_TYPE
+  #define GPUCA_DEDX_STORAGE_TYPE float
+#endif
+#ifndef GPUCA_MERGER_INTERPOLATION_ERROR_TYPE
+  #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE float
+#endif
+
+// clang-format on
+#endif // GPUDEFPARAMETERSCONSTANTS_H
diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
similarity index 75%
rename from GPU/GPUTracking/Definitions/GPUDefParametersDefault.h
rename to GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
index 44f3eb299d4c1..ce703e2ceba4a 100644
--- a/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h
+++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
@@ -9,28 +9,22 @@
 // granted to it by virtue of its status as an Intergovernmental Organization
 // or submit itself to any jurisdiction.
 
-/// \file GPUDefParametersDefault.h
+/// \file GPUDefParametersDefaults.h
 /// \author David Rohr
 
-// This files contains compile-time constants affecting the GPU performance.
-// Many of these constants are GPU-architecture specific.
-// This file also contains all constants describing memory limitations, essentially limiting the total number of tracks, etc.
-// Compile-time constants affecting the tracking algorithms / results are located in GPUDefConstantsAndSettings.h
+// This file contains compile-time constants affecting the GPU performance.
 
-#ifndef GPUDEFPARAMETERSDEFAULT_H
-#define GPUDEFPARAMETERSDEFAULT_H
+#if !defined(GPUDEFPARAMETERSDEFAULTS_H) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS) // Avoid including for RTC generation besides normal include protection.
+#define GPUDEFPARAMETERSDEFAULTS_H
 // clang-format off
 
-#include "GPUCommonDef.h"
-#include "GPUDefMacros.h"
-
 // Launch bound definition, 3 optional parameters: maxThreads per block, minBlocks per multiprocessor, force number of blocks (not passed to compiler as launch bounds)
 
 // GPU Run Configuration
 #ifdef GPUCA_GPUCODE
 #if defined(GPUCA_GPUTYPE_MI2xx)
   #define GPUCA_WARP_SIZE 64
-  #define GPUCA_THREAD_COUNT 256
+  #define GPUCA_THREAD_COUNT_DEFAULT 256
   #define GPUCA_LB_GPUTPCCreateTrackingData 256
   #define GPUCA_LB_GPUTPCStartHitsSorter 512, 1
   #define GPUCA_LB_GPUTPCStartHitsFinder 1024
@@ -81,19 +75,9 @@
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 448
   #define GPUCA_LB_COMPRESSION_GATHER 1024
-  #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
-  #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
-  #define GPUCA_ALTERNATE_BORDER_SORT 1
-  #define GPUCA_SORT_BEFORE_FIT 1
-  #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1
-  #define GPUCA_NO_ATOMIC_PRECHECK 1
-  #define GPUCA_DEDX_STORAGE_TYPE uint16_t
-  #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half
-  #define GPUCA_COMP_GATHER_KERNEL 4
-  #define GPUCA_COMP_GATHER_MODE 3
 #elif defined(GPUCA_GPUTYPE_VEGA)
   #define GPUCA_WARP_SIZE 64
-  #define GPUCA_THREAD_COUNT 256
+  #define GPUCA_THREAD_COUNT_DEFAULT 256
   #define GPUCA_LB_GPUTPCCreateTrackingData 128
   #define GPUCA_LB_GPUTPCStartHitsSorter 1024, 2
   #define GPUCA_LB_GPUTPCStartHitsFinder 1024
@@ -144,19 +128,9 @@
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 512
   #define GPUCA_LB_COMPRESSION_GATHER 1024
-  #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
-  #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
-  #define GPUCA_ALTERNATE_BORDER_SORT 1
-  #define GPUCA_SORT_BEFORE_FIT 1
-  #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1
-  #define GPUCA_NO_ATOMIC_PRECHECK 1
-  #define GPUCA_DEDX_STORAGE_TYPE uint16_t
-  #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half
-  #define GPUCA_COMP_GATHER_KERNEL 4
-  #define GPUCA_COMP_GATHER_MODE 3
 #elif defined(GPUCA_GPUTYPE_AMPERE)
   #define GPUCA_WARP_SIZE 32
-  #define GPUCA_THREAD_COUNT 512
+  #define GPUCA_THREAD_COUNT_DEFAULT 512
   #define GPUCA_LB_GPUTPCCreateTrackingData 384
   #define GPUCA_LB_GPUTPCStartHitsSorter 512, 1
   #define GPUCA_LB_GPUTPCStartHitsFinder 512
@@ -207,19 +181,9 @@
   #define GPUCA_LB_GPUTPCCFDeconvolution 384
   #define GPUCA_LB_GPUTPCCFClusterizer 448
   #define GPUCA_LB_COMPRESSION_GATHER 1024
-  #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
-  #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
-  #define GPUCA_ALTERNATE_BORDER_SORT 1
-  #define GPUCA_SORT_BEFORE_FIT 1
-  #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1
-  #define GPUCA_NO_ATOMIC_PRECHECK 1
-  #define GPUCA_DEDX_STORAGE_TYPE uint16_t
-  #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half
-  #define GPUCA_COMP_GATHER_KERNEL 4
-  #define GPUCA_COMP_GATHER_MODE 3
 #elif defined(GPUCA_GPUTYPE_TURING)
   #define GPUCA_WARP_SIZE 32
-  #define GPUCA_THREAD_COUNT 512
+  #define GPUCA_THREAD_COUNT_DEFAULT 512
   #define GPUCA_LB_GPUTPCCreateTrackingData 256
   #define GPUCA_LB_GPUTPCStartHitsSorter 512, 1
   #define GPUCA_LB_GPUTPCStartHitsFinder 512
@@ -262,16 +226,6 @@
   #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 32, 1
   #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 32, 1
   #define GPUCA_LB_COMPRESSION_GATHER 1024
-  #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
-  #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
-  #define GPUCA_ALTERNATE_BORDER_SORT 1
-  #define GPUCA_SORT_BEFORE_FIT 1
-  #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1
-  #define GPUCA_NO_ATOMIC_PRECHECK 1
-  #define GPUCA_COMP_GATHER_KERNEL 4
-  #define GPUCA_COMP_GATHER_MODE 3
-  #define GPUCA_DEDX_STORAGE_TYPE uint16_t
-  #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half
   // #define GPUCA_USE_TEXTURES
 #elif defined(GPUCA_GPUTYPE_OPENCL)
 #else
@@ -281,8 +235,11 @@
 
 #ifdef GPUCA_GPUCODE
   // Default settings for GPU, if not already set for selected GPU type
-  #ifndef GPUCA_THREAD_COUNT
-    #define GPUCA_THREAD_COUNT 256
+  #ifndef GPUCA_WARP_SIZE
+    #define GPUCA_WARP_SIZE 32
+  #endif
+  #ifndef GPUCA_THREAD_COUNT_DEFAULT
+    #define GPUCA_THREAD_COUNT_DEFAULT 256
   #endif
   #ifndef GPUCA_LB_GPUTPCCreateTrackingData
     #define GPUCA_LB_GPUTPCCreateTrackingData 256
@@ -486,25 +443,21 @@
     #define GPUCA_LB_GPUTrackingRefitKernel_mode1asTrackParCov 256
   #endif
   #ifndef GPUCA_LB_GPUMemClean16
-    #define GPUCA_LB_GPUMemClean16 GPUCA_THREAD_COUNT, 1
+    #define GPUCA_LB_GPUMemClean16 GPUCA_THREAD_COUNT_DEFAULT, 1
   #endif
   #ifndef GPUCA_LB_GPUitoa
-    #define GPUCA_LB_GPUitoa GPUCA_THREAD_COUNT, 1
+    #define GPUCA_LB_GPUitoa GPUCA_THREAD_COUNT_DEFAULT, 1
   #endif
-  #define GPUCA_GET_THREAD_COUNT(...) GPUCA_M_FIRST(__VA_ARGS__)
-
   // These kernel launch-bounds are derrived from one of the constants set above
   #define GPUCA_LB_GPUTPCCFNoiseSuppression_noiseSuppression GPUCA_LB_GPUTPCCFNoiseSuppression
   #define GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks GPUCA_LB_GPUTPCCFNoiseSuppression
 
-  #ifdef GPUCA_HAS_ONNX
   #define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels
   #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN GPUCA_LB_GPUTPCNNClusterizerKernels
   #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels
   #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels
   #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels
   #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizerKernels
-  #endif
 
   #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_THREAD_COUNT_SCAN
   #define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_THREAD_COUNT_SCAN
@@ -516,105 +469,7 @@
   #define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered64 GPUCA_LB_COMPRESSION_GATHER
   #define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered128 GPUCA_LB_COMPRESSION_GATHER
   #define GPUCA_LB_GPUTPCCompressionGatherKernels_multiBlock GPUCA_LB_COMPRESSION_GATHER
-#else
-  #define GPUCA_GET_THREAD_COUNT(...) 1 // On the host, a thread is a block, and we run 1 "device thread" per block.
-#endif
-
-#define GPUCA_GET_WARP_COUNT(...) (GPUCA_GET_THREAD_COUNT(__VA_ARGS__) / GPUCA_WARP_SIZE)
-
-#define GPUCA_THREAD_COUNT_SCAN 512 // TODO: WARNING!!! Must not be GPUTYPE-dependent right now! // TODO: Fix!
-
-#if defined(__CUDACC__) || defined(__HIPCC__)
-  #define GPUCA_SPECIALIZE_THRUST_SORTS
-#endif
-
-#ifndef GPUCA_NEIGHBORSFINDER_REGS
-  #define GPUCA_NEIGHBORSFINDER_REGS NONE, 0
-#endif
-#ifdef GPUCA_GPUCODE
-  #ifndef GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP
-     #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 6
-  #endif
-  #ifndef GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE
-     #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 12
-  #endif
-  #ifndef GPUCA_ALTERNATE_BORDER_SORT
-     #define GPUCA_ALTERNATE_BORDER_SORT 0
-  #endif
-  #ifndef GPUCA_SORT_BEFORE_FIT
-     #define GPUCA_SORT_BEFORE_FIT 0
-  #endif
-  #ifndef GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION
-     #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 0
-  #endif
-  #ifndef GPUCA_COMP_GATHER_KERNEL
-     #define GPUCA_COMP_GATHER_KERNEL 0
-  #endif
-  #ifndef GPUCA_COMP_GATHER_MODE
-     #define GPUCA_COMP_GATHER_MODE 2
-  #endif
-#else
-  #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 0
-  #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 0
-  #define GPUCA_ALTERNATE_BORDER_SORT 0
-  #define GPUCA_SORT_BEFORE_FIT 0
-  #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 0
-  #define GPUCA_THREAD_COUNT_FINDER 1
-  #define GPUCA_COMP_GATHER_KERNEL 0
-  #define GPUCA_COMP_GATHER_MODE 0
-#endif
-#ifndef GPUCA_DEDX_STORAGE_TYPE
-  #define GPUCA_DEDX_STORAGE_TYPE float
-#endif
-#ifndef GPUCA_MERGER_INTERPOLATION_ERROR_TYPE
-  #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE float
-#endif
-#define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE_A GPUCA_DETERMINISTIC_CODE(float, GPUCA_MERGER_INTERPOLATION_ERROR_TYPE)
-#define GPUCA_DEDX_STORAGE_TYPE_A GPUCA_DETERMINISTIC_CODE(float, GPUCA_DEDX_STORAGE_TYPE)
-
-#ifndef GPUCA_WARP_SIZE
-  #ifdef GPUCA_GPUCODE
-    #define GPUCA_WARP_SIZE 32
-  #else
-    #define GPUCA_WARP_SIZE 1
-  #endif
-#endif
-
-#define GPUCA_MAX_THREADS 1024
-#define GPUCA_MAX_STREAMS 36
-
-#define GPUCA_SORT_STARTHITS_GPU                                       // Sort the start hits when running on GPU
-#define GPUCA_ROWALIGNMENT 16                                          // Align of Row Hits and Grid
-#define GPUCA_BUFFER_ALIGNMENT 64                                      // Alignment of buffers obtained from SetPointers
-#define GPUCA_MEMALIGN (64 * 1024)                                     // Alignment of allocated memory blocks
-
-// #define GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE                       // Output Profiling Data for Tracklet Constructor Tracklet Scheduling
-
-// Default maximum numbers
-#define GPUCA_MAX_CLUSTERS           ((size_t)     1024 * 1024 * 1024) // Maximum number of TPC clusters
-#define GPUCA_MAX_TRD_TRACKLETS      ((size_t)             128 * 1024) // Maximum number of TRD tracklets
-#define GPUCA_MAX_ITS_FIT_TRACKS     ((size_t)              96 * 1024) // Max number of tracks for ITS track fit
-#define GPUCA_MEMORY_SIZE            ((size_t) 6 * 1024 * 1024 * 1024) // Size of memory allocated on Device
-#define GPUCA_HOST_MEMORY_SIZE       ((size_t) 1 * 1024 * 1024 * 1024) // Size of memory allocated on Host
-#define GPUCA_GPU_STACK_SIZE         ((size_t)               8 * 1024) // Stack size per GPU thread
-#define GPUCA_GPU_HEAP_SIZE          ((size_t)       16 * 1025 * 1024) // Stack size per GPU thread
-
-// #define GPUCA_KERNEL_DEBUGGER_OUTPUT
-
-// Some assertions to make sure the parameters are not invalid
-#if defined(GPUCA_GPUCODE)
-  static_assert(GPUCA_MAXN >= GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP, "Invalid GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP");
-  static_assert(GPUCA_ROW_COUNT >= GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE, "Invalid GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE");
-  static_assert(GPUCA_M_FIRST(GPUCA_LB_GPUTPCCompressionKernels_step1unattached) * 2 <= GPUCA_TPC_COMP_CHUNK_SIZE, "Invalid GPUCA_TPC_COMP_CHUNK_SIZE");
-#endif
-
-// Derived parameters
-#ifdef GPUCA_USE_TEXTURES
-  #define GPUCA_TEXTURE_FETCH_CONSTRUCTOR                              // Fetch data through texture cache
-#endif
-#if defined(GPUCA_SORT_STARTHITS_GPU) && defined(GPUCA_GPUCODE)
-  #define GPUCA_SORT_STARTHITS
 #endif
 
 // clang-format on
-#endif // GPUDEFPARAMETERSDEFAULT_H
+#endif // GPUDEFPARAMETERSDEFAULTS_H
diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc b/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc
index c17244572ee0c..938cedbdacc93 100644
--- a/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc
+++ b/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc
@@ -18,7 +18,7 @@
 #define GPUCA_M_LB_EMPTY_1(...) __VA_ARGS__
 #define GPUCA_M_LB_EMPTY0(...) GPUCA_M_CAT(GPUCA_M_LB_EMPTY_, __VA_OPT__(1))(__VA_ARGS__)
 
-#include "GPUDefParameters.h"
+#include "GPUDefParametersRuntime.h"
 #include "GPUDefMacros.h"
 #include <sstream>
 #include <string>
@@ -37,17 +37,17 @@ static GPUDefParameters GPUDefParametersLoad()
   };
 }
 
-#define GPUCA_EXPORT_KERNEL(name)                                                                                  \
-  if (par.par_LB_maxThreads[i] > 0) {                                                                              \
-    o << "#define GPUCA_" << (forRTC ? "RTC_" : "") << "LB_" << GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \
-    if (par.par_LB_minBlocks[i] > 0) {                                                                             \
-      o << ", " << par.par_LB_minBlocks[i];                                                                        \
-    }                                                                                                              \
-    if (!forRTC && par.par_LB_forceBlocks[i] > 0) {                                                                \
-      o << ", " << par.par_LB_forceBlocks[i];                                                                      \
-    }                                                                                                              \
-    o << "\n";                                                                                                     \
-  }                                                                                                                \
+#define GPUCA_EXPORT_KERNEL(name)                                                  \
+  if (par.par_LB_maxThreads[i] > 0) {                                              \
+    o << "#define GPUCA_LB_" << GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \
+    if (par.par_LB_minBlocks[i] > 0) {                                             \
+      o << ", " << par.par_LB_minBlocks[i];                                        \
+    }                                                                              \
+    if (!forRTC && par.par_LB_forceBlocks[i] > 0) {                                \
+      o << ", " << par.par_LB_forceBlocks[i];                                      \
+    }                                                                              \
+    o << "\n";                                                                     \
+  }                                                                                \
   i++;
 
 static std::string GPUDefParametersExport(const GPUDefParameters& par, bool forRTC)
diff --git a/GPU/GPUTracking/Definitions/GPUDefParameters.template.h b/GPU/GPUTracking/Definitions/GPUDefParametersRuntime.template.h
similarity index 87%
rename from GPU/GPUTracking/Definitions/GPUDefParameters.template.h
rename to GPU/GPUTracking/Definitions/GPUDefParametersRuntime.template.h
index 731cb76b89193..f3537c058a824 100644
--- a/GPU/GPUTracking/Definitions/GPUDefParameters.template.h
+++ b/GPU/GPUTracking/Definitions/GPUDefParametersRuntime.template.h
@@ -9,11 +9,11 @@
 // granted to it by virtue of its status as an Intergovernmental Organization
 // or submit itself to any jurisdiction.
 
-/// \file GPUDefParameters.h
+/// \file GPUDefParametersRuntime.h
 /// \author David Rohr
 
-#ifndef GPUDEFPARAMETERS_H
-#define GPUDEFPARAMETERS_H
+#ifndef GPUDEFPARAMETERSRUNTIME_H
+#define GPUDEFPARAMETERSRUNTIME_H
 
 namespace o2::gpu
 {
@@ -24,4 +24,4 @@ struct GPUDefParameters {  // clang-format off
 };  // clang-format on
 }  // namespace o2::gpu
 
-#endif
+#endif  // GPUDEFPARAMETERSRUNTIME_H
diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersWrapper.h b/GPU/GPUTracking/Definitions/GPUDefParametersWrapper.h
new file mode 100644
index 0000000000000..8d8815d8a8044
--- /dev/null
+++ b/GPU/GPUTracking/Definitions/GPUDefParametersWrapper.h
@@ -0,0 +1,104 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUDefParametersWrapper.h
+/// \author David Rohr
+
+// Wrapper file to load all compile-time parameters (architecture / rtc - dependent ones, and constant ones)
+// Compile-time constants affecting the tracking algorithms / results are located in GPUDefConstantsAndSettings.h
+
+#ifndef GPUDEFPARAMETERSWRAPPER_H
+#define GPUDEFPARAMETERSWRAPPER_H
+// clang-format off
+
+#include "GPUCommonDef.h"
+#include "GPUDefMacros.h"
+
+#ifdef GPUCA_GPUCODE
+#if defined(GPUCA_GPUTYPE_MI2xx)
+  #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
+  #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
+  #define GPUCA_ALTERNATE_BORDER_SORT 1
+  #define GPUCA_SORT_BEFORE_FIT 1
+  #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1
+  #define GPUCA_NO_ATOMIC_PRECHECK 1
+  #define GPUCA_DEDX_STORAGE_TYPE uint16_t
+  #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half
+  #define GPUCA_COMP_GATHER_KERNEL 4
+  #define GPUCA_COMP_GATHER_MODE 3
+#elif defined(GPUCA_GPUTYPE_VEGA)
+  #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
+  #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
+  #define GPUCA_ALTERNATE_BORDER_SORT 1
+  #define GPUCA_SORT_BEFORE_FIT 1
+  #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1
+  #define GPUCA_NO_ATOMIC_PRECHECK 1
+  #define GPUCA_DEDX_STORAGE_TYPE uint16_t
+  #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half
+  #define GPUCA_COMP_GATHER_KERNEL 4
+  #define GPUCA_COMP_GATHER_MODE 3
+#elif defined(GPUCA_GPUTYPE_AMPERE)
+  #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
+  #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
+  #define GPUCA_ALTERNATE_BORDER_SORT 1
+  #define GPUCA_SORT_BEFORE_FIT 1
+  #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1
+  #define GPUCA_NO_ATOMIC_PRECHECK 1
+  #define GPUCA_DEDX_STORAGE_TYPE uint16_t
+  #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half
+  #define GPUCA_COMP_GATHER_KERNEL 4
+  #define GPUCA_COMP_GATHER_MODE 3
+#elif defined(GPUCA_GPUTYPE_TURING)
+  #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
+  #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
+  #define GPUCA_ALTERNATE_BORDER_SORT 1
+  #define GPUCA_SORT_BEFORE_FIT 1
+  #define GPUCA_MERGER_SPLIT_LOOP_INTERPOLATION 1
+  #define GPUCA_NO_ATOMIC_PRECHECK 1
+  #define GPUCA_COMP_GATHER_KERNEL 4
+  #define GPUCA_COMP_GATHER_MODE 3
+  #define GPUCA_DEDX_STORAGE_TYPE uint16_t
+  #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE half
+#endif
+#endif
+
+#ifdef GPUCA_GPUCODE
+#include "GPUDefParametersDefaults.h"
+#endif
+#include "GPUDefParametersConstants.h"
+
+namespace o2::gpu
+{
+#if defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS)
+  GPUhdi() static constexpr uint32_t GPUCA_GET_THREAD_COUNT(uint32_t val, ...) { return val; }
+  GPUhdi() static constexpr uint32_t GPUCA_GET_WARP_COUNT(uint32_t val, ...) { return val / GPUCA_WARP_SIZE; }
+#else
+  static constexpr uint32_t GPUCA_WARP_SIZE = 1; // On the host, a thread is a block is a warp, and we run 1 "device thread" per block.
+  #define GPUCA_GET_THREAD_COUNT(...) 1          // This must be a define not a constexpr function
+  #define GPUCA_GET_WARP_COUNT(...) 1            // since launch bound constants are not defined in host-code, and must evaluate to 1!
+#endif
+
+#define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE_A GPUCA_DETERMINISTIC_CODE(float, GPUCA_MERGER_INTERPOLATION_ERROR_TYPE)
+#define GPUCA_DEDX_STORAGE_TYPE_A GPUCA_DETERMINISTIC_CODE(float, GPUCA_DEDX_STORAGE_TYPE)
+
+// #define GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE                       // Output Profiling Data for Tracklet Constructor Tracklet Scheduling
+
+// #define GPUCA_KERNEL_DEBUGGER_OUTPUT
+
+// Derived parameters
+#ifdef GPUCA_USE_TEXTURES
+  #define GPUCA_TEXTURE_FETCH_CONSTRUCTOR                              // Fetch data through texture cache
+#endif
+
+} // namespace o2::gpu
+
+// clang-format on
+#endif // GPUDEFPARAMETERSWRAPPER_H
diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h b/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h
index 41b5eb8a4ffb8..1bf5000cfbe5c 100644
--- a/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h
+++ b/GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h
@@ -41,6 +41,7 @@ class GPUTPCNeighboursFinder : public GPUKernelTemplate
     int32_t mIRowUp; // next row number
     int32_t mIRowDn; // previous row number
 #if GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP > 0
+    static_assert(GPUCA_MAXN >= GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP);
     float mA1[GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNeighboursFinder)];
     float mA2[GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNeighboursFinder)];
     calink mB[GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNeighboursFinder)];
diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.h b/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.h
index 5009c672b030e..f487931bdaf4b 100644
--- a/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.h
+++ b/GPU/GPUTracking/SectorTracker/GPUTPCTrackletSelector.h
@@ -37,6 +37,7 @@ class GPUTPCTrackletSelector : public GPUKernelTemplate
     int32_t mNTracklets;    // n of tracklets
     int32_t mReserved;      // for alignment reasons
 #if GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
+    static_assert(GPUCA_ROW_COUNT >= GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE);
     GPUTPCHitId mHits[GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCTrackletSelector)];
 #endif // GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
   };
diff --git a/GPU/GPUTracking/Standalone/tools/dumpGPUDefParam.C b/GPU/GPUTracking/Standalone/tools/dumpGPUDefParam.C
index 4a72b0cef31a3..785c049816252 100644
--- a/GPU/GPUTracking/Standalone/tools/dumpGPUDefParam.C
+++ b/GPU/GPUTracking/Standalone/tools/dumpGPUDefParam.C
@@ -16,14 +16,15 @@
 // ROOT_INCLUDE_PATH="`pwd`/include" root -l -q -b src/GPU/GPUTracking/Standalone/tools/dumpGPUDefParam.C'()'
 
 // Logic for testing to load the default parameters
-/*#define GPUCA_GPUCODE
+/* #define GPUCA_GPUCODE
 #define GPUCA_GPUTYPE_AMPERE
 #define GPUCA_MAXN 40
 #define GPUCA_ROW_COUNT 152
 #define GPUCA_TPC_COMP_CHUNK_SIZE 1024
-#include "GPUDefParametersDefault.h"*/
+#include "GPUDefParametersConstants.h"
+#include "GPUDefParametersDefaults.h" */
 
-// Load file that sets GPUDefParameters
+// Alternatively, logic to load file that sets GPUDefParameters
 #include "testParam.h"
 
 #include "GPUDefParametersLoad.inc"
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h
index f5d8f533df651..71236bc317443 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFNoiseSuppression.h
@@ -34,7 +34,9 @@ class GPUTPCCFNoiseSuppression : public GPUKernelTemplate
     noiseSuppression = 0,
     updatePeaks = 1,
   };
-  static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFNoiseSuppression);
+  static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFNoiseSuppression_noiseSuppression);
+  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFNoiseSuppression_noiseSuppression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks));
+
   struct GPUSharedMemory {
     ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE];
     PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_NOISE_N];
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h
index 25d3588be6d17..a72907fe55e89 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h
@@ -37,6 +37,13 @@ class GPUTPCCFStreamCompaction : public GPUKernelTemplate
 
   struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64<int32_t, GPUCA_THREAD_COUNT_SCAN> {
   };
+#if defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS)
+  static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanStart));
+  static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanUp));
+  static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanTop));
+  static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanDown));
+  static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_compactDigits));
+#endif
 
   typedef GPUTPCClusterFinder processorType;
   GPUhdi() static processorType* Processor(GPUConstantMem& processors)
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index e6c1dc508d6e4..a1d641fdb0b93 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -37,7 +37,13 @@ class MCLabelAccumulator;
 class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
 {
  public:
-  static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels);
+  // Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below
+  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
+  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
+  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
+  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
+  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
+  static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer);
   struct GPUSharedMemory {
     // Regular cluster finder
     ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE];