diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
index d95a57c8f2063..969dd06d6297e 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -56,7 +56,7 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
 }
 
 template
-inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
+inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
 {
   auto& x = _xyz.x;
   auto& y = _xyz.y;
@@ -90,11 +90,10 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
       }
     }
   }
-  return 0;
 }
 
 template <>
-inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
+inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
 {
   int32_t nnThreads = std::max(1, std::min(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));
   if (nnThreads > 1) {
@@ -112,13 +111,12 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal
 
 template
-int32_t GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args)
+void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args)
 {
-  return std::apply([this, &args](auto&... vals) { return runKernelBackendInternal(args.s, vals...); }, args.v);
+  std::apply([this, &args](auto&... vals) { runKernelBackendInternal(args.s, vals...); }, args.v);
 }
 
 template
@@ -127,8 +125,8 @@ krnlProperties GPUReconstructionCPUBackend::getKernelPropertiesBackend()
   return krnlProperties{1, 1};
 }
 
-#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \
-  template int32_t GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args); \
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \
+  template void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args); \
   template krnlProperties GPUReconstructionCPUBackend::getKernelPropertiesBackend();
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h
index b6225999c68a0..7901c34866c66 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h
@@ -35,9 +35,9 @@ class GPUReconstructionCPUBackend : public GPUReconstructionProcessing
 protected:
   GPUReconstructionCPUBackend(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing(cfg) {}
   template
-  int32_t runKernelBackend(const gpu_reconstruction_kernels::krnlSetupArgs& args);
+  void runKernelBackend(const gpu_reconstruction_kernels::krnlSetupArgs& args);
   template
-  int32_t runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args);
+  void runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args);
   template
   gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend();
 };
@@ -53,7 +53,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels
-  int32_t runKernel(krnlSetup&& setup, Args&&... args);
+  void runKernel(krnlSetup&& setup, Args&&... args);
   template
   const gpu_reconstruction_kernels::krnlProperties getKernelProperties()
   {
@@ -77,14 +77,14 @@ class GPUReconstructionCPU : public GPUReconstructionKernels
-#define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types) \
-  inline int32_t runKernelImplWrapper(gpu_reconstruction_kernels::classArgument, bool cpuFallback, double& timer, krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \
-  { \
-    if (cpuFallback) { \
-      return GPUReconstructionCPU::runKernelImpl(krnlSetupArgs(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward))); \
-    } else { \
-      return runKernelImpl(krnlSetupArgs(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward))); \
-    } \
+#define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types) \
+  inline void runKernelImplWrapper(gpu_reconstruction_kernels::classArgument, bool cpuFallback, double& timer, krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \
+  { \
+    if (cpuFallback) { \
+      GPUReconstructionCPU::runKernelImpl(krnlSetupArgs(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward))); \
+    } else { \
+      runKernelImpl(krnlSetupArgs(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward))); \
+    } \
   }
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
@@ -131,7 +131,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels
-inline int32_t GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args)
+inline void GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args)
 {
   HighResTimer* t = nullptr;
   GPUCA_RECO_STEP myStep = S::GetRecoStep() == GPUCA_RECO_STEP::NoRecoStep ? setup.x.step : S::GetRecoStep();
@@ -164,7 +164,7 @@ inline int32_t GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args
     GPUInfo("Running kernel %s (Stream %d, Range %d/%d, Grid %d/%d) on %s", GetKernelName(), stream, setup.y.start, setup.y.num, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : cpuFallback ? "CPU (fallback)" : mDeviceName.c_str());
   }
   if (nThreads == 0 || nBlocks == 0) {
-    return 0;
+    return;
   }
   if (mProcessingSettings.debugLevel >= 1) {
     t = &getKernelTimer(myStep, !IsGPU() || cpuFallback ? getHostThreadIndex() : stream);
@@ -173,7 +173,7 @@ inline int32_t GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args
     }
   }
   double deviceTimerTime = 0.;
-  int32_t retVal = runKernelImplWrapper(gpu_reconstruction_kernels::classArgument(), cpuFallback, deviceTimerTime, std::forward(setup), std::forward(args)...);
+  runKernelImplWrapper(gpu_reconstruction_kernels::classArgument(), cpuFallback, deviceTimerTime, std::forward(setup), std::forward(args)...);
   if (GPUDebug(GetKernelName(), stream, mProcessingSettings.serializeGPU & 1)) {
     throw std::runtime_error("kernel failure");
   }
@@ -192,7 +192,6 @@ inline int32_t GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args
       throw std::runtime_error("kernel error code");
     }
   }
-  return retVal;
 }
 
 } // namespace o2::gpu
diff --git a/GPU/GPUTracking/Base/GPUReconstructionKernels.h b/GPU/GPUTracking/Base/GPUReconstructionKernels.h
index e95a59df6cfd5..d541e36a06af9 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionKernels.h
+++ b/GPU/GPUTracking/Base/GPUReconstructionKernels.h
@@ -99,9 +99,9 @@ class GPUReconstructionKernels : public T
   using krnlSetupArgs = gpu_reconstruction_kernels::krnlSetupArgs;
 
 #define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types) \
-  virtual int32_t runKernelImpl(const krnlSetupArgs& args) \
+  virtual void runKernelImpl(const krnlSetupArgs& args) \
   { \
-    return T::template runKernelBackend(args); \
+    T::template runKernelBackend(args); \
   } \
   virtual gpu_reconstruction_kernels::krnlProperties getKernelPropertiesImpl(gpu_reconstruction_kernels::classArgument) \
   { \
diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx
index 18662870ed45e..51da17fe58628 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx
@@ -45,9 +45,14 @@ void GPUReconstructionProcessing::SetNActiveThreads(int32_t n)
 
 void GPUReconstructionProcessing::runParallelOuterLoop(bool doGPU, uint32_t nThreads, std::function lambda)
 {
-  tbb::task_arena(SetAndGetNActiveThreadsOuterLoop(!doGPU, nThreads)).execute([&] {
-    tbb::parallel_for(0, nThreads, lambda, tbb::simple_partitioner());
-  });
+  uint32_t nThreadsAdjusted = SetAndGetNActiveThreadsOuterLoop(!doGPU, nThreads);
+  if (nThreadsAdjusted > 1) {
+    tbb::task_arena(nThreadsAdjusted).execute([&] {
+      tbb::parallel_for(0, nThreads, lambda, tbb::simple_partitioner());
+    });
+  } else {
+    lambda(0);
+  }
 }
 
 namespace o2::gpu
diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt
index 5bc1e6e4e6783..a33234db49a27 100644
--- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt
+++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt
@@ -18,7 +18,7 @@ endif()
 message(STATUS "Building GPUTracking with CUDA support ${TMP_TARGET}")
 
 set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu)
-set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludes.h CUDAThrustHelpers.h)
+set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h CUDAThrustHelpers.h)
 # -------------------------------- Prepare RTC -------------------------------------------------------
 enable_language(ASM)
 if(ALIGPU_BUILD_TYPE STREQUAL "O2")
@@ -67,7 +67,7 @@ set(GPU_RTC_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionCUDArtc)
 # cmake-format: off
 add_custom_command(
     OUTPUT ${GPU_RTC_BIN}.src
-    COMMAND cat ${GPUDIR}/Base/cuda/GPUReconstructionCUDAIncludes.h > ${GPU_RTC_BIN}.src
+    COMMAND cp ${GPUDIR}/Base/cuda/GPUReconstructionCUDAIncludesHost.h ${GPU_RTC_BIN}.src
     COMMAND ${CMAKE_CXX_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_INCLUDES} -std=c++${CMAKE_CUDA_STANDARD} -D__CUDA_ARCH__=${RTC_CUDA_ARCH} -D__CUDACC__ -x c++ -nostdinc -E ${GPU_RTC_SRC} >> ${GPU_RTC_BIN}.src
     MAIN_DEPENDENCY ${GPU_RTC_SRC}
     IMPLICIT_DEPENDS CXX ${GPU_RTC_SRC}
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
index b195b375b4503..3c118f402dc4f 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
+++ 
b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -13,8 +13,7 @@ /// \author David Rohr #define GPUCA_GPUCODE_HOSTONLY -#include "GPUReconstructionCUDADef.h" -#include "GPUReconstructionCUDAIncludes.h" +#include "GPUReconstructionCUDAIncludesHost.h" #include diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index f14696a92a5b0..ee2f069028d74 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -42,7 +42,7 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase void PrintKernelOccupancies() override; template - int32_t runKernelBackend(const krnlSetupArgs& args); + void runKernelBackend(const krnlSetupArgs& args); template void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); template diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDADef.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDADef.h index 845ccc9ec09b1..7f77925ca3aaa 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDADef.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDADef.h @@ -9,7 +9,7 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. -/// \file GPUReconstructionCUDDef.h +/// \file GPUReconstructionCUDADef.h /// \author David Rohr #ifndef O2_GPU_GPURECONSTRUCTIONCUDADEF_H diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu index 3862a3a476324..6bcafe565e930 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu @@ -12,8 +12,7 @@ /// \file GPUReconstructionCUDAExternalProvider.cu /// \author David Rohr -#include "GPUReconstructionCUDADef.h" -#include "GPUReconstructionCUDAIncludes.h" +#include "GPUReconstructionCUDAIncludesHost.h" #include "GPUReconstructionCUDA.h" #include "GPUReconstructionCUDAInternals.h" diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index 3bd3afc0ffc23..1a4721035818e 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -32,7 +32,9 @@ QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_arch); int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) { - std::string rtcparam = std::string(mProcessingSettings.rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") + GPUParamRTC::generateRTCCode(param(), mProcessingSettings.rtc.optConstexpr); + std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") + + std::string(mProcessingSettings.rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) 
__VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") + + GPUParamRTC::generateRTCCode(param(), mProcessingSettings.rtc.optConstexpr); if (filename == "") { filename = "/tmp/o2cagpu_rtc_"; } diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludes.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesHost.h similarity index 94% rename from GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludes.h rename to GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesHost.h index ae79494ded496..e3e26e6482fc4 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludes.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesHost.h @@ -32,4 +32,8 @@ #include #include +#ifndef GPUCA_RTC_CODE +#include "GPUReconstructionCUDADef.h" +#endif + #endif diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index 2e695b49ebb6c..c22aff4aab28c 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -12,8 +12,7 @@ /// \file GPUReconstructionCUDAKernels.cu /// \author David Rohr -#include "GPUReconstructionCUDADef.h" -#include "GPUReconstructionCUDAIncludes.h" +#include "GPUReconstructionCUDAIncludesHost.h" #include "GPUReconstructionCUDA.h" #include "GPUReconstructionCUDAInternals.h" @@ -67,7 +66,7 @@ inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSet } template -int32_t GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs& args) +void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs& args) { auto& x = args.s.x; auto& z = args.s.z; @@ -84,7 +83,6 @@ int32_t GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgsStreams[x.stream])); } - return 0; } #undef GPUCA_KRNL_REG @@ -93,7 +91,7 @@ int32_t GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs(const krnlSetupArgs& args); + template void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs& args); #else #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2 #define GPUCA_KRNL_DEFONLY @@ -102,7 +100,7 @@ int32_t GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs(const krnlSetupArgs& args); + template void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs& args); #ifndef __HIPCC__ // CUDA version #define GPUCA_KRNL_CALL_single(x_class, ...) \ GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))<<mInternals->Streams[x.stream]>>>(GPUCA_CONSMEM_CALL y.start, args...); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu index c22b873961e09..bcf61eb07383f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu @@ -13,8 +13,7 @@ /// \author David Rohr #define GPUCA_GPUCODE_COMPILEKERNELS -#include "GPUReconstructionCUDAIncludes.h" -#include "GPUReconstructionCUDADef.h" +#include "GPUReconstructionCUDAIncludesHost.h" #define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))) #define GPUCA_KRNL(...) GPUCA_KRNL_WRAP(GPUCA_KRNL_LOAD_, __VA_ARGS__) #define GPUCA_KRNL_LOAD_single(...) 
GPUCA_KRNLGPU_SINGLE(__VA_ARGS__); diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index 10fbfa8d21ddf..f6e420d5b9656 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -25,7 +25,7 @@ if(NOT DEFINED GPUCA_HIP_HIPIFY_FROM_CUDA OR "${GPUCA_HIP_HIPIFY_FROM_CUDA}") set(GPUCA_HIP_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/hipify) file(MAKE_DIRECTORY ${GPUCA_HIP_SOURCE_DIR}) set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAkernel.template.cu CUDAThrustHelpers.h GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu) - set(GPUCA_HIP_LOCAL_FILE_LIST GPUReconstructionHIPIncludes.h) + set(GPUCA_HIP_LOCAL_FILE_LIST GPUReconstructionHIPIncludesHost.h) set(HIP_SOURCES "") foreach(file ${GPUCA_HIP_FILE_LIST}) get_filename_component(ABS_CUDA_SORUCE ../cuda/${file} ABSOLUTE) @@ -63,7 +63,7 @@ endif() set(SRCS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.hip ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPKernels.hip) set(SRCS_CXX ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPGenRTC.cxx) -set(HDRS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPInternals.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPDef.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPIncludes.h ${GPUCA_HIP_SOURCE_DIR}/HIPThrustHelpers.h) +set(HDRS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPInternals.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPDef.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPIncludesHost.h ${GPUCA_HIP_SOURCE_DIR}/HIPThrustHelpers.h) # -------------------------------- Prepare RTC ------------------------------------------------------- enable_language(ASM) @@ -104,7 +104,7 @@ set(GPU_RTC_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionHIPrtc) # cmake-format: off add_custom_command( OUTPUT ${GPU_RTC_BIN}.src - COMMAND cat ${GPUDIR}/Base/hip/GPUReconstructionHIPIncludes.h > ${GPU_RTC_BIN}.src + COMMAND cp ${GPUDIR}/Base/hip/GPUReconstructionHIPIncludesHost.h ${GPU_RTC_BIN}.src COMMAND ${CMAKE_CXX_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_INCLUDES} -std=c++${CMAKE_HIP_STANDARD} -D__HIPCC__ -D__HIP_DEVICE_COMPILE__ -x c++ -nostdinc -E ${GPU_RTC_SRC} >> ${GPU_RTC_BIN}.src MAIN_DEPENDENCY ${GPU_RTC_SRC} IMPLICIT_DEPENDS CXX ${GPU_RTC_SRC} diff --git a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludes.h b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesHost.h similarity index 89% rename from GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludes.h rename to GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesHost.h index 94d3e46b8f462..7117dd0c718c6 100644 --- a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludes.h +++ b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesHost.h @@ -9,7 +9,7 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. 
-/// \file GPUReconstructionHIPInclude.h +/// \file GPUReconstructionHIPIncludesHost.h /// \author David Rohr #ifndef O2_GPU_RECONSTRUCTIONHIPINCLUDES_H @@ -27,4 +27,8 @@ #include #pragma GCC diagnostic pop +#ifndef GPUCA_RTC_CODE +#include "GPUReconstructionHIPDef.h" +#endif + #endif diff --git a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip index 0ecaf7a83b18c..ddbc9285763a9 100644 --- a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip +++ b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip @@ -13,8 +13,7 @@ /// \author David Rohr #define GPUCA_GPUCODE_COMPILEKERNELS -#include "GPUReconstructionHIPIncludes.h" -#include "GPUReconstructionHIPDef.h" +#include "GPUReconstructionHIPIncludesHost.h" #define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))) #define GPUCA_KRNL(...) GPUCA_KRNL_WRAP(GPUCA_KRNL_LOAD_, __VA_ARGS__) #define GPUCA_KRNL_LOAD_single(...) GPUCA_KRNLGPU_SINGLE(__VA_ARGS__); diff --git a/GPU/GPUTracking/Base/opencl/CMakeLists.txt b/GPU/GPUTracking/Base/opencl/CMakeLists.txt index d6aa945fc77b7..89d2f386f768f 100644 --- a/GPU/GPUTracking/Base/opencl/CMakeLists.txt +++ b/GPU/GPUTracking/Base/opencl/CMakeLists.txt @@ -36,8 +36,8 @@ set(OCL_DEFINECL "-D$ -using namespace o2::gpu; - -#include -#include -#include -#include +static_assert(std::is_convertible::value, "OpenCL event type incompatible to deviceEvent"); #define GPUErrorReturn(...) \ { \ @@ -32,11 +24,6 @@ using namespace o2::gpu; return (1); \ } -#define GPUCA_KRNL(x_class, x_attributes, ...) GPUCA_KRNL_PROP(x_class, x_attributes) -#define GPUCA_KRNL_BACKEND_CLASS GPUReconstructionOCLBackend -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL - #include "utils/qGetLdBinarySymbols.h" QGET_LD_BINARY_SYMBOLS(GPUReconstructionOCLCode_src); #ifdef OPENCL_ENABLED_SPIRV @@ -67,7 +54,7 @@ int32_t GPUReconstructionOCLBackend::GPUFailedMsgAI(const int64_t error, const c if (error == CL_SUCCESS) { return (0); } - GPUError("OCL Error: %ld / %s (%s:%d)", error, opencl_error_string(error), file, line); + GPUError("OCL Error: %ld / %s (%s:%d)", error, convertErrorToString(error), file, line); return 1; } @@ -457,7 +444,11 @@ size_t GPUReconstructionOCLBackend::GPUMemCpy(void* dst, const void* src, size_t if (stream == -1) { SynchronizeGPU(); } - if (toGPU == -2) { + if (size == 0) { + if (ev || nEvents) { // Workaround for OCL runtimes, which can throw an error in case size = 0 + GPUFailedMsg(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream == -1 ? 0 : stream], nEvents, evList->getEventList(), ev->getEventList())); + } + } else if (toGPU == -2) { GPUFailedMsg(clEnqueueCopyBuffer(mInternals->command_queue[stream == -1 ? 0 : stream], mInternals->mem_gpu, mInternals->mem_gpu, (char*)src - (char*)mDeviceMemoryBase, (char*)dst - (char*)mDeviceMemoryBase, size, nEvents, evList->getEventList(), ev->getEventList())); } else if (toGPU) { GPUFailedMsg(clEnqueueWriteBuffer(mInternals->command_queue[stream == -1 ? 0 : stream], mInternals->mem_gpu, stream == -1, (char*)dst - (char*)mDeviceMemoryBase, size, src, nEvents, evList->getEventList(), ev->getEventList())); @@ -554,20 +545,6 @@ int32_t GPUReconstructionOCLBackend::GPUDebug(const char* state, int32_t stream, return (0); } -template -int32_t GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs& args) -{ - cl_kernel k = args.s.y.num > 1 ? 
getKernelObject() : getKernelObject(); - return std::apply([this, &args, &k](auto&... vals) { return runKernelBackendInternal(args.s, k, vals...); }, args.v); -} - -template -S& GPUReconstructionOCLBackend::getKernelObject() -{ - static uint32_t krnl = FindKernel(MULTI ? 2 : 1); - return mInternals->kernels[krnl].first; -} - int32_t GPUReconstructionOCLBackend::GetOCLPrograms() { cl_int ocl_error; @@ -606,20 +583,75 @@ int32_t GPUReconstructionOCLBackend::GetOCLPrograms() return 1; } -#define GPUCA_KRNL(...) \ - GPUCA_KRNL_WRAP(GPUCA_KRNL_LOAD_, __VA_ARGS__) -#define GPUCA_KRNL_LOAD_single(x_class, ...) \ - if (AddKernel(false)) { \ - return 1; \ - } -#define GPUCA_KRNL_LOAD_multi(x_class, ...) \ - if (AddKernel(true)) { \ - return 1; \ - } -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL -#undef GPUCA_KRNL_LOAD_single -#undef GPUCA_KRNL_LOAD_multi + return AddKernels(); +} - return 0; +const char* GPUReconstructionOCLBackend::convertErrorToString(int32_t errorcode) +{ + static const std::map error_map = { + {CL_SUCCESS, "CL_SUCCESS"}, + {CL_DEVICE_NOT_FOUND, "CL_DEVICE_NOT_FOUND"}, + {CL_DEVICE_NOT_AVAILABLE, "CL_DEVICE_NOT_AVAILABLE"}, + {CL_COMPILER_NOT_AVAILABLE, "CL_COMPILER_NOT_AVAILABLE"}, + {CL_MEM_OBJECT_ALLOCATION_FAILURE, "CL_MEM_OBJECT_ALLOCATION_FAILURE"}, + {CL_OUT_OF_RESOURCES, "CL_OUT_OF_RESOURCES"}, + {CL_OUT_OF_HOST_MEMORY, "CL_OUT_OF_HOST_MEMORY"}, + {CL_PROFILING_INFO_NOT_AVAILABLE, "CL_PROFILING_INFO_NOT_AVAILABLE"}, + {CL_MEM_COPY_OVERLAP, "CL_MEM_COPY_OVERLAP"}, + {CL_IMAGE_FORMAT_MISMATCH, "CL_IMAGE_FORMAT_MISMATCH"}, + {CL_IMAGE_FORMAT_NOT_SUPPORTED, "CL_IMAGE_FORMAT_NOT_SUPPORTED"}, + {CL_BUILD_PROGRAM_FAILURE, "CL_BUILD_PROGRAM_FAILURE"}, + {CL_MAP_FAILURE, "CL_MAP_FAILURE"}, + {CL_MISALIGNED_SUB_BUFFER_OFFSET, "CL_MISALIGNED_SUB_BUFFER_OFFSET"}, + {CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST, "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"}, + {CL_COMPILE_PROGRAM_FAILURE, "CL_COMPILE_PROGRAM_FAILURE"}, + {CL_LINKER_NOT_AVAILABLE, "CL_LINKER_NOT_AVAILABLE"}, + {CL_LINK_PROGRAM_FAILURE, "CL_LINK_PROGRAM_FAILURE"}, + {CL_DEVICE_PARTITION_FAILED, "CL_DEVICE_PARTITION_FAILED"}, + {CL_KERNEL_ARG_INFO_NOT_AVAILABLE, "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"}, + {CL_INVALID_VALUE, "CL_INVALID_VALUE"}, + {CL_INVALID_DEVICE_TYPE, "CL_INVALID_DEVICE_TYPE"}, + {CL_INVALID_PLATFORM, "CL_INVALID_PLATFORM"}, + {CL_INVALID_DEVICE, "CL_INVALID_DEVICE"}, + {CL_INVALID_CONTEXT, "CL_INVALID_CONTEXT"}, + {CL_INVALID_QUEUE_PROPERTIES, "CL_INVALID_QUEUE_PROPERTIES"}, + {CL_INVALID_COMMAND_QUEUE, "CL_INVALID_COMMAND_QUEUE"}, + {CL_INVALID_HOST_PTR, "CL_INVALID_HOST_PTR"}, + {CL_INVALID_MEM_OBJECT, "CL_INVALID_MEM_OBJECT"}, + {CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"}, + {CL_INVALID_IMAGE_SIZE, "CL_INVALID_IMAGE_SIZE"}, + {CL_INVALID_SAMPLER, "CL_INVALID_SAMPLER"}, + {CL_INVALID_BINARY, "CL_INVALID_BINARY"}, + {CL_INVALID_BUILD_OPTIONS, "CL_INVALID_BUILD_OPTIONS"}, + {CL_INVALID_PROGRAM, "CL_INVALID_PROGRAM"}, + {CL_INVALID_PROGRAM_EXECUTABLE, "CL_INVALID_PROGRAM_EXECUTABLE"}, + {CL_INVALID_KERNEL_NAME, "CL_INVALID_KERNEL_NAME"}, + {CL_INVALID_KERNEL_DEFINITION, "CL_INVALID_KERNEL_DEFINITION"}, + {CL_INVALID_KERNEL, "CL_INVALID_KERNEL"}, + {CL_INVALID_ARG_INDEX, "CL_INVALID_ARG_INDEX"}, + {CL_INVALID_ARG_VALUE, "CL_INVALID_ARG_VALUE"}, + {CL_INVALID_ARG_SIZE, "CL_INVALID_ARG_SIZE"}, + {CL_INVALID_KERNEL_ARGS, "CL_INVALID_KERNEL_ARGS"}, + {CL_INVALID_WORK_DIMENSION, "CL_INVALID_WORK_DIMENSION"}, + {CL_INVALID_WORK_GROUP_SIZE, 
"CL_INVALID_WORK_GROUP_SIZE"}, + {CL_INVALID_WORK_ITEM_SIZE, "CL_INVALID_WORK_ITEM_SIZE"}, + {CL_INVALID_GLOBAL_OFFSET, "CL_INVALID_GLOBAL_OFFSET"}, + {CL_INVALID_EVENT_WAIT_LIST, "CL_INVALID_EVENT_WAIT_LIST"}, + {CL_INVALID_EVENT, "CL_INVALID_EVENT"}, + {CL_INVALID_OPERATION, "CL_INVALID_OPERATION"}, + {CL_INVALID_GL_OBJECT, "CL_INVALID_GL_OBJECT"}, + {CL_INVALID_BUFFER_SIZE, "CL_INVALID_BUFFER_SIZE"}, + {CL_INVALID_MIP_LEVEL, "CL_INVALID_MIP_LEVEL"}, + {CL_INVALID_GLOBAL_WORK_SIZE, "CL_INVALID_GLOBAL_WORK_SIZE"}, + {CL_INVALID_PROPERTY, "CL_INVALID_PROPERTY"}, + {CL_INVALID_IMAGE_DESCRIPTOR, "CL_INVALID_IMAGE_DESCRIPTOR"}, + {CL_INVALID_COMPILER_OPTIONS, "CL_INVALID_COMPILER_OPTIONS"}, + {CL_INVALID_LINKER_OPTIONS, "CL_INVALID_LINKER_OPTIONS"}, + {CL_INVALID_DEVICE_PARTITION_COUNT, "CL_INVALID_DEVICE_PARTITION_COUNT"}, + {CL_INVALID_PIPE_SIZE, "CL_INVALID_PIPE_SIZE"}, + {CL_INVALID_DEVICE_QUEUE, "CL_INVALID_DEVICE_QUEUE"}, + {CL_INVALID_SPEC_ID, "CL_INVALID_SPEC_ID"}, + {CL_MAX_SIZE_RESTRICTION_EXCEEDED, "CL_MAX_SIZE_RESTRICTION_EXCEEDED"}}; + auto entry = error_map.find(errorcode); + return (entry != error_map.end()) ? entry->second : "Unknown Errorcode"; } diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h index fadb393277758..15015cdcb43c5 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h @@ -59,8 +59,8 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase int32_t AddKernel(bool multi = false); template uint32_t FindKernel(int32_t num); - template - int32_t runKernelBackendInternal(const krnlSetupTime& _xyz, K& k, const Args&... args); + template + void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); template gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend(); @@ -68,11 +68,20 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase float mOclVersion; template - int32_t runKernelBackend(const krnlSetupArgs& args); + void runKernelBackend(const krnlSetupArgs& args); template S& getKernelObject(); int32_t GetOCLPrograms(); + + private: + static const char* convertErrorToString(int32_t errorCode); + template + static inline int64_t OCLsetKernelParameters_helper(cl_kernel& kernel, int32_t i, const T& firstParameter, const Args&... restOfParameters); + template + static int64_t OCLsetKernelParameters(cl_kernel& kernel, const Args&... args); + static int64_t clExecuteKernelA(cl_command_queue queue, cl_kernel krnl, size_t local_size, size_t global_size, cl_event* pEvent = nullptr, cl_event* wait = nullptr, cl_int nWaitEvents = 1); + int32_t AddKernels(); }; using GPUReconstructionOCL = GPUReconstructionKernels; diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h new file mode 100644 index 0000000000000..aec5708a80f3c --- /dev/null +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h @@ -0,0 +1,82 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". 
+// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionOCLIncludesHost.h +/// \author David Rohr + +#ifndef GPURECONSTRUCTIONOCLINCLUDESHOST_H +#define GPURECONSTRUCTIONOCLINCLUDESHOST_H + +#define GPUCA_GPUTYPE_OPENCL +#define __OPENCL_HOST__ + +#define CL_TARGET_OPENCL_VERSION 220 +#include +#include +#include +#include +#include +#include "GPULogging.h" + +#include "GPUReconstructionOCL.h" +#include "GPUReconstructionIncludes.h" + +using namespace o2::gpu; + +#include +#include +#include +#include + +#define GPUFailedMsg(x) GPUFailedMsgA(x, __FILE__, __LINE__) +#define GPUFailedMsgI(x) GPUFailedMsgAI(x, __FILE__, __LINE__) + +namespace o2::gpu +{ +struct GPUReconstructionOCLInternals { + cl_platform_id platform; + cl_device_id device; + cl_context context; + cl_command_queue command_queue[GPUCA_MAX_STREAMS]; + cl_mem mem_gpu; + cl_mem mem_constant; + cl_mem mem_host; + cl_program program; + + std::vector> kernels; +}; +} // namespace o2::gpu + +template +inline int64_t GPUReconstructionOCLBackend::OCLsetKernelParameters_helper(cl_kernel& kernel, int32_t i, const T& firstParameter, const Args&... restOfParameters) +{ + int64_t retVal = clSetKernelArg(kernel, i, sizeof(T), &firstParameter); + if (retVal) { + return retVal; + } + if constexpr (sizeof...(restOfParameters) > 0) { + return OCLsetKernelParameters_helper(kernel, i + 1, restOfParameters...); + } + return 0; +} + +template +inline int64_t GPUReconstructionOCLBackend::OCLsetKernelParameters(cl_kernel& kernel, const Args&... args) +{ + return OCLsetKernelParameters_helper(kernel, 0, args...); +} + +inline int64_t GPUReconstructionOCLBackend::clExecuteKernelA(cl_command_queue queue, cl_kernel krnl, size_t local_size, size_t global_size, cl_event* pEvent, cl_event* wait, cl_int nWaitEvents) +{ + return clEnqueueNDRangeKernel(queue, krnl, 1, nullptr, &global_size, &local_size, wait == nullptr ? 0 : nWaitEvents, wait, pEvent); +} + +#endif // GPURECONSTRUCTIONOCLINCLUDESHOST_H diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLInternals.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLInternals.h deleted file mode 100644 index b47c612b192d7..0000000000000 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLInternals.h +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. 
- -/// \file GPUReconstructionOCLInternals.h -/// \author David Rohr, Sergey Gorbunov - -// All OpenCL-header related stuff goes here, so we can run CING over GPUReconstructionOCL - -#ifndef GPUTPCGPUTRACKEROPENCLINTERNALS_H -#define GPUTPCGPUTRACKEROPENCLINTERNALS_H - -#define CL_TARGET_OPENCL_VERSION 220 -#include -#include -#include -#include -#include -#include "GPULogging.h" - -namespace o2::gpu -{ - -static const char* opencl_error_string(int32_t errorcode) -{ - switch (errorcode) { - case CL_SUCCESS: - return "Success!"; - case CL_DEVICE_NOT_FOUND: - return "Device not found."; - case CL_DEVICE_NOT_AVAILABLE: - return "Device not available"; - case CL_COMPILER_NOT_AVAILABLE: - return "Compiler not available"; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - return "Memory object allocation failure"; - case CL_OUT_OF_RESOURCES: - return "Out of resources"; - case CL_OUT_OF_HOST_MEMORY: - return "Out of host memory"; - case CL_PROFILING_INFO_NOT_AVAILABLE: - return "Profiling information not available"; - case CL_MEM_COPY_OVERLAP: - return "Memory copy overlap"; - case CL_IMAGE_FORMAT_MISMATCH: - return "Image format mismatch"; - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - return "Image format not supported"; - case CL_BUILD_PROGRAM_FAILURE: - return "Program build failure"; - case CL_MAP_FAILURE: - return "Map failure"; - case CL_INVALID_VALUE: - return "Invalid value"; - case CL_INVALID_DEVICE_TYPE: - return "Invalid device type"; - case CL_INVALID_PLATFORM: - return "Invalid platform"; - case CL_INVALID_DEVICE: - return "Invalid device"; - case CL_INVALID_CONTEXT: - return "Invalid context"; - case CL_INVALID_QUEUE_PROPERTIES: - return "Invalid queue properties"; - case CL_INVALID_COMMAND_QUEUE: - return "Invalid command queue"; - case CL_INVALID_HOST_PTR: - return "Invalid host pointer"; - case CL_INVALID_MEM_OBJECT: - return "Invalid memory object"; - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "Invalid image format descriptor"; - case CL_INVALID_IMAGE_SIZE: - return "Invalid image size"; - case CL_INVALID_SAMPLER: - return "Invalid sampler"; - case CL_INVALID_BINARY: - return "Invalid binary"; - case CL_INVALID_BUILD_OPTIONS: - return "Invalid build options"; - case CL_INVALID_PROGRAM: - return "Invalid program"; - case CL_INVALID_PROGRAM_EXECUTABLE: - return "Invalid program executable"; - case CL_INVALID_KERNEL_NAME: - return "Invalid kernel name"; - case CL_INVALID_KERNEL_DEFINITION: - return "Invalid kernel definition"; - case CL_INVALID_KERNEL: - return "Invalid kernel"; - case CL_INVALID_ARG_INDEX: - return "Invalid argument index"; - case CL_INVALID_ARG_VALUE: - return "Invalid argument value"; - case CL_INVALID_ARG_SIZE: - return "Invalid argument size"; - case CL_INVALID_KERNEL_ARGS: - return "Invalid kernel arguments"; - case CL_INVALID_WORK_DIMENSION: - return "Invalid work dimension"; - case CL_INVALID_WORK_GROUP_SIZE: - return "Invalid work group size"; - case CL_INVALID_WORK_ITEM_SIZE: - return "Invalid work item size"; - case CL_INVALID_GLOBAL_OFFSET: - return "Invalid global offset"; - case CL_INVALID_EVENT_WAIT_LIST: - return "Invalid event wait list"; - case CL_INVALID_EVENT: - return "Invalid event"; - case CL_INVALID_OPERATION: - return "Invalid operation"; - case CL_INVALID_GL_OBJECT: - return "Invalid OpenGL object"; - case CL_INVALID_BUFFER_SIZE: - return "Invalid buffer size"; - case CL_INVALID_MIP_LEVEL: - return "Invalid mip-map level"; - default: - return "Unknown Errorcode"; - } -} - -#define GPUFailedMsg(x) GPUFailedMsgA(x, __FILE__, __LINE__) -#define 
GPUFailedMsgI(x) GPUFailedMsgAI(x, __FILE__, __LINE__) - -static inline int64_t OCLsetKernelParameters_helper(cl_kernel& k, int32_t i) -{ - return 0; -} - -template -static inline int64_t OCLsetKernelParameters_helper(cl_kernel& kernel, int32_t i, const T& firstParameter, const Args&... restOfParameters) -{ - int64_t retVal = clSetKernelArg(kernel, i, sizeof(T), &firstParameter); - if (retVal) { - return retVal; - } - return OCLsetKernelParameters_helper(kernel, i + 1, restOfParameters...); -} - -template -static inline int64_t OCLsetKernelParameters(cl_kernel& kernel, const Args&... args) -{ - return OCLsetKernelParameters_helper(kernel, 0, args...); -} - -static inline int64_t clExecuteKernelA(cl_command_queue queue, cl_kernel krnl, size_t local_size, size_t global_size, cl_event* pEvent, cl_event* wait = nullptr, cl_int nWaitEvents = 1) -{ - return clEnqueueNDRangeKernel(queue, krnl, 1, nullptr, &global_size, &local_size, wait == nullptr ? 0 : nWaitEvents, wait, pEvent); -} - -struct GPUReconstructionOCLInternals { - cl_platform_id platform; - cl_device_id device; - cl_context context; - cl_command_queue command_queue[GPUCA_MAX_STREAMS]; - cl_mem mem_gpu; - cl_mem mem_constant; - cl_mem mem_host; - cl_program program; - - std::vector> kernels; -}; - -template -inline int32_t GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, K& k, const Args&... args) -{ - auto& x = _xyz.x; - auto& y = _xyz.y; - auto& z = _xyz.z; - if (y.num <= 1) { - GPUFailedMsg(OCLsetKernelParameters(k, mInternals->mem_gpu, mInternals->mem_constant, y.start, args...)); - } else { - GPUFailedMsg(OCLsetKernelParameters(k, mInternals->mem_gpu, mInternals->mem_constant, y.start, y.num, args...)); - } - - cl_event ev; - cl_event* evr; - bool tmpEvent = false; - if (z.ev == nullptr && mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0) { - evr = &ev; - tmpEvent = true; - } else { - evr = (cl_event*)z.ev; - } - GPUFailedMsg(clExecuteKernelA(mInternals->command_queue[x.stream], k, x.nThreads, x.nThreads * x.nBlocks, evr, (cl_event*)z.evList, z.nEvents)); - if (mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0) { - cl_ulong time_start, time_end; - GPUFailedMsg(clWaitForEvents(1, evr)); - GPUFailedMsg(clGetEventProfilingInfo(*evr, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, nullptr)); - GPUFailedMsg(clGetEventProfilingInfo(*evr, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, nullptr)); - _xyz.t = (time_end - time_start) * 1.e-9f; - if (tmpEvent) { - GPUFailedMsg(clReleaseEvent(ev)); - } - } - return 0; -} - -template -int32_t GPUReconstructionOCLBackend::AddKernel(bool multi) -{ - std::string name(GetKernelName()); - if (multi) { - name += "_multi"; - } - std::string kname("krnl_" + name); - - cl_int ocl_error; - cl_kernel krnl = clCreateKernel(mInternals->program, kname.c_str(), &ocl_error); - if (GPUFailedMsgI(ocl_error)) { - GPUError("Error creating OPENCL Kernel: %s", name.c_str()); - return 1; - } - mInternals->kernels.emplace_back(krnl, name); - return 0; -} - -template -inline uint32_t GPUReconstructionOCLBackend::FindKernel(int32_t num) -{ - std::string name(GetKernelName()); - if (num > 1) { - name += "_multi"; - } - - for (uint32_t k = 0; k < mInternals->kernels.size(); k++) { - if (mInternals->kernels[k].second == name) { - return (k); - } - } - GPUError("Could not find OpenCL kernel %s", name.c_str()); - throw ::std::runtime_error("Requested unsupported OpenCL kernel"); -} - -static_assert(std::is_convertible::value, 
"OpenCL event type incompatible to deviceEvent"); -} // namespace o2::gpu - -#endif diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx new file mode 100644 index 0000000000000..8a1c8a6525c0d --- /dev/null +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx @@ -0,0 +1,133 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionOCLKernels.cxx +/// \author David Rohr + +#include "GPUReconstructionOCLIncludesHost.h" + +template <> +inline void GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +{ + cl_int4 val0 = {0, 0, 0, 0}; + GPUFailedMsg(clEnqueueFillBuffer(mInternals->command_queue[_xyz.x.stream], mInternals->mem_gpu, &val0, sizeof(val0), (char*)ptr - (char*)mDeviceMemoryBase, (size + sizeof(val0) - 1) & ~(sizeof(val0) - 1), _xyz.z.evList == nullptr ? 0 : _xyz.z.nEvents, _xyz.z.evList->getEventList(), _xyz.z.ev->getEventList())); +} + +template +inline void GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +{ + cl_kernel k = _xyz.y.num > 1 ? getKernelObject() : getKernelObject(); + auto& x = _xyz.x; + auto& y = _xyz.y; + auto& z = _xyz.z; + if (y.num <= 1) { + GPUFailedMsg(OCLsetKernelParameters(k, mInternals->mem_gpu, mInternals->mem_constant, y.start, args...)); + } else { + GPUFailedMsg(OCLsetKernelParameters(k, mInternals->mem_gpu, mInternals->mem_constant, y.start, y.num, args...)); + } + + cl_event ev; + cl_event* evr; + bool tmpEvent = false; + if (z.ev == nullptr && mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0) { + evr = &ev; + tmpEvent = true; + } else { + evr = (cl_event*)z.ev; + } + GPUFailedMsg(clExecuteKernelA(mInternals->command_queue[x.stream], k, x.nThreads, x.nThreads * x.nBlocks, evr, (cl_event*)z.evList, z.nEvents)); + if (mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0) { + cl_ulong time_start, time_end; + GPUFailedMsg(clWaitForEvents(1, evr)); + GPUFailedMsg(clGetEventProfilingInfo(*evr, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, nullptr)); + GPUFailedMsg(clGetEventProfilingInfo(*evr, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, nullptr)); + _xyz.t = (time_end - time_start) * 1.e-9f; + if (tmpEvent) { + GPUFailedMsg(clReleaseEvent(ev)); + } + } +} + +template +void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs& args) +{ + std::apply([this, &args](auto&... 
vals) { runKernelBackendInternal(args.s, vals...); }, args.v); +} + +template +inline uint32_t GPUReconstructionOCLBackend::FindKernel(int32_t num) +{ + std::string name(GetKernelName()); + if (num > 1) { + name += "_multi"; + } + + for (uint32_t k = 0; k < mInternals->kernels.size(); k++) { + if (mInternals->kernels[k].second == name) { + return (k); + } + } + GPUError("Could not find OpenCL kernel %s", name.c_str()); + throw ::std::runtime_error("Requested unsupported OpenCL kernel"); +} + +template +int32_t GPUReconstructionOCLBackend::AddKernel(bool multi) +{ + std::string name(GetKernelName()); + if (multi) { + name += "_multi"; + } + std::string kname("krnl_" + name); + + cl_int ocl_error; + cl_kernel krnl = clCreateKernel(mInternals->program, kname.c_str(), &ocl_error); + if (GPUFailedMsgI(ocl_error)) { + GPUError("Error creating OPENCL Kernel: %s", name.c_str()); + return 1; + } + mInternals->kernels.emplace_back(krnl, name); + return 0; +} + +template +S& GPUReconstructionOCLBackend::getKernelObject() +{ + static uint32_t krnl = FindKernel(MULTI ? 2 : 1); + return mInternals->kernels[krnl].first; +} + +int32_t GPUReconstructionOCLBackend::AddKernels() +{ +#define GPUCA_KRNL(...) \ + GPUCA_KRNL_WRAP(GPUCA_KRNL_LOAD_, __VA_ARGS__) +#define GPUCA_KRNL_LOAD_single(x_class, ...) \ + if (AddKernel(false)) { \ + return 1; \ + } +#define GPUCA_KRNL_LOAD_multi(x_class, ...) \ + if (AddKernel(true)) { \ + return 1; \ + } +#include "GPUReconstructionKernelList.h" +#undef GPUCA_KRNL +#undef GPUCA_KRNL_LOAD_single +#undef GPUCA_KRNL_LOAD_multi + + return 0; +} + +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \ + GPUCA_KRNL_PROP(x_class, x_attributes) \ + template void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs& args); +#define GPUCA_KRNL_BACKEND_CLASS GPUReconstructionOCLBackend +#include "GPUReconstructionKernelList.h" +#undef GPUCA_KRNL diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index a7c582b79d964..e017d9b60a269 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -171,7 +171,7 @@ class GPUChain mRec->ReadStructFromFile(file, obj); } template - inline int32_t runKernel(gpu_reconstruction_kernels::krnlSetup&& setup, Args&&... args) + inline void runKernel(gpu_reconstruction_kernels::krnlSetup&& setup, Args&&... args) { return mRec->runKernel(std::forward(setup), std::forward(args)...); }
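
Notes on the main patterns introduced above, with simplified, self-contained C++ sketches; any identifier marked as hypothetical is illustrative only and not repository code.

The patch drops the int32_t return value from the whole runKernel / runKernelImpl / runKernelBackend chain; launch problems now surface exclusively through exceptions, as in the retained throw std::runtime_error("kernel failure") and "kernel error code" paths of GPUReconstructionCPU::runKernel. A minimal sketch of that calling convention, with made-up function names standing in for the real launcher and debug check:

#include <cstdio>
#include <stdexcept>

// Hypothetical stand-in for a device-side error query such as GPUDebug().
static bool launchFailed(const char* /*kernelName*/) { return false; }

// New-style launcher: no error code, failures are reported by throwing.
static void runKernelSketch(const char* name)
{
  if (launchFailed(name)) {
    throw std::runtime_error("kernel failure");
  }
}

int main()
{
  try {
    runKernelSketch("ExampleKernel");
  } catch (const std::runtime_error& e) {
    std::fprintf(stderr, "%s\n", e.what());
    return 1;
  }
  return 0;
}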
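
GPUReconstructionProcessing::runParallelOuterLoop now bypasses TBB entirely when SetAndGetNActiveThreadsOuterLoop reports a single active thread, invoking the loop body inline instead of spinning up a task_arena. A standalone sketch of that pattern, with the thread-count negotiation replaced by a plain parameter:

#include <cstdint>
#include <cstdio>
#include <functional>
#include <tbb/parallel_for.h>
#include <tbb/partitioner.h>
#include <tbb/task_arena.h>

// Run lambda(0..nThreads-1), but skip the TBB machinery when only one
// thread is requested (the pattern adopted in runParallelOuterLoop).
void runOuterLoop(uint32_t nThreads, std::function<void(uint32_t)> lambda)
{
  if (nThreads > 1) {
    tbb::task_arena(nThreads).execute([&] {
      tbb::parallel_for<uint32_t>(0, nThreads, lambda, tbb::simple_partitioner());
    });
  } else {
    lambda(0); // single-threaded fallback, no arena or scheduler involvement
  }
}

int main()
{
  runOuterLoop(4, [](uint32_t i) { std::printf("chunk %u\n", i); });
  runOuterLoop(1, [](uint32_t i) { std::printf("serial chunk %u\n", i); });
}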
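
The CUDA/HIP include headers are renamed to *IncludesHost.h and now pull in the matching *Def.h themselves, but only when GPUCA_RTC_CODE is not defined; genRTC prepends #define GPUCA_RTC_CODE to the generated parameter code, and CMake now copies the header verbatim (cp instead of cat) as the first part of the RTC input. A trimmed sketch of that guard layout, using hypothetical file names in place of GPUReconstructionCUDAIncludesHost.h / GPUReconstructionCUDADef.h:

// ExampleIncludesHost.h -- shared by the normal host build and the RTC input
#ifndef EXAMPLE_INCLUDES_HOST_H
#define EXAMPLE_INCLUDES_HOST_H

#include <cstdint>
#include <vector>

#ifndef GPUCA_RTC_CODE  // the RTC pass defines this first and supplies its own
#include "ExampleDef.h" // definitions, so the host-side Def header is skipped
#endif

#endif // EXAMPLE_INCLUDES_HOST_H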
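
In the OpenCL backend, the opencl_error_string switch (which returned prose such as "Invalid kernel arguments") is replaced by convertErrorToString, a static std::map from error codes to their CL_* macro names with an "Unknown Errorcode" fallback. A reduced version of the lookup, listing only a handful of the codes:

#define CL_TARGET_OPENCL_VERSION 220
#include <CL/cl.h>
#include <cstdint>
#include <map>

// Reduced error-name lookup; the real table enumerates all CL_* error codes.
const char* clErrorName(int32_t errorcode)
{
  static const std::map<int32_t, const char*> names = {
    {CL_SUCCESS, "CL_SUCCESS"},
    {CL_DEVICE_NOT_FOUND, "CL_DEVICE_NOT_FOUND"},
    {CL_OUT_OF_RESOURCES, "CL_OUT_OF_RESOURCES"},
    {CL_INVALID_KERNEL_ARGS, "CL_INVALID_KERNEL_ARGS"}};
  auto entry = names.find(errorcode);
  return entry != names.end() ? entry->second : "Unknown Errorcode";
}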
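
OCLsetKernelParameters_helper moves into GPUReconstructionOCLIncludesHost.h and drops the separate zero-argument base-case overload: the recursion now terminates with if constexpr on the remaining parameter pack. The same idea as a standalone helper, renamed so it is not mistaken for the repository function:

#define CL_TARGET_OPENCL_VERSION 220
#include <CL/cl.h>
#include <cstdint>

// Set kernel argument i to firstParameter, then recurse over the rest of the
// pack; the if constexpr check replaces the old empty base-case overload.
template <typename T, typename... Args>
int64_t setKernelParametersHelper(cl_kernel& kernel, int32_t i, const T& firstParameter, const Args&... restOfParameters)
{
  int64_t retVal = clSetKernelArg(kernel, i, sizeof(T), &firstParameter);
  if (retVal) {
    return retVal;
  }
  if constexpr (sizeof...(restOfParameters) > 0) {
    return setKernelParametersHelper(kernel, i + 1, restOfParameters...);
  }
  return 0;
}

template <typename... Args>
int64_t setKernelParameters(cl_kernel& kernel, const Args&... args)
{
  return setKernelParametersHelper(kernel, 0, args...);
}

// usage (hypothetical buffers): setKernelParameters(kernel, gpuBuffer, constantBuffer, firstItem);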
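
GPUMemCpy now special-cases size == 0: some OpenCL runtimes reject zero-byte copy or write calls, so the transfer is skipped and, if the caller asked for events, a marker is enqueued via clEnqueueMarkerWithWaitList so that downstream event bookkeeping still works. A minimal sketch of that guard, with the buffer and offset handling simplified and the function name hypothetical:

#define CL_TARGET_OPENCL_VERSION 220
#include <CL/cl.h>
#include <cstddef>

// Skip zero-byte copies, but still produce the requested completion event
// through a marker (workaround for runtimes that error out on size == 0).
cl_int copyOrMark(cl_command_queue queue, cl_mem dst, cl_mem src, size_t offset, size_t size,
                  cl_uint nWaitEvents, const cl_event* waitList, cl_event* ev)
{
  if (size == 0) {
    if (ev || nWaitEvents) {
      return clEnqueueMarkerWithWaitList(queue, nWaitEvents, waitList, ev);
    }
    return CL_SUCCESS;
  }
  return clEnqueueCopyBuffer(queue, src, dst, offset, offset, size, nWaitEvents, waitList, ev);
}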