diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index d96d5aad74622..9b6562d8e77ee 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -273,6 +273,7 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
     if (mProcessingSettings.createO2Output > 1) {
       mProcessingSettings.createO2Output = 1;
     }
+    mProcessingSettings.rtc.deterministic = 1;
   }
   if (mProcessingSettings.deterministicGPUReconstruction && mProcessingSettings.debugLevel >= 6) {
     mProcessingSettings.nTPCClustererLanes = 1;
diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx
index 51c48ebbfc0b2..bae95ac8d3f38 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx
@@ -57,17 +57,24 @@ void GPUReconstructionProcessing::runParallelOuterLoop(bool doGPU, uint32_t nThr
   }
 }
 
-namespace o2::gpu
-{
-namespace // anonymous
+uint32_t GPUReconstructionProcessing::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max) // Stores the outer loop thread count in mNActiveThreadsOuterLoop and returns it
 {
-static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation
-} // anonymous namespace
-} // namespace o2::gpu
+  if (condition && mProcessingSettings.inKernelParallel != 1) {
+    mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mMaxHostThreads) : mMaxHostThreads; // inKernelParallel == 2 caps the count at max, other values use all mMaxHostThreads
+  } else {
+    mNActiveThreadsOuterLoop = 1; // condition is false or inKernelParallel == 1: single thread
+  }
+  if (mProcessingSettings.debugLevel >= 5) {
+    printf("Running %u threads in outer loop\n", mNActiveThreadsOuterLoop); // %u since the counter is a uint32_t
+  }
+  return mNActiveThreadsOuterLoop;
+}
+
+std::atomic_flag GPUReconstructionProcessing::mTimerFlag = ATOMIC_FLAG_INIT; // Static member, i.e. a single flag for all instances; insertTimer() and getTimerById() spin on it around their mTimers accesses
 
 GPUReconstructionProcessing::timerMeta* GPUReconstructionProcessing::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step)
 {
-  while (timerFlag.test_and_set()) {
+  while (mTimerFlag.test_and_set()) {
   }
   if (mTimers.size() <= id) {
     mTimers.resize(id + 1);
@@ -81,20 +88,20 @@ GPUReconstructionProcessing::timerMeta* GPUReconstructionProcessing::insertTimer
     mTimers[id]->count++;
   }
   timerMeta* retVal = mTimers[id].get();
-  timerFlag.clear();
+  mTimerFlag.clear();
   return retVal;
 }
 
 GPUReconstructionProcessing::timerMeta* GPUReconstructionProcessing::getTimerById(uint32_t id, bool increment)
 {
   timerMeta* retVal = nullptr;
-  while (timerFlag.test_and_set()) {
+  while (mTimerFlag.test_and_set()) { // Busy-wait until the flag could be set; it guards the mTimers accesses below
   }
   if (mTimers.size() > id && mTimers[id]) {
     retVal = mTimers[id].get();
     retVal->count += increment;
   }
-  timerFlag.clear();
+  mTimerFlag.clear(); // Release the flag; retVal is still nullptr if no timer exists for id
   return retVal;
 }
 
@@ -104,19 +111,6 @@ uint32_t GPUReconstructionProcessing::getNextTimerId()
   return id.fetch_add(1);
 }
 
-uint32_t GPUReconstructionProcessing::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max)
-{
-  if (condition && mProcessingSettings.inKernelParallel != 1) {
-    mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mMaxHostThreads) : mMaxHostThreads;
-  } else {
-    mNActiveThreadsOuterLoop = 1;
-  }
-  if (mProcessingSettings.debugLevel >= 5) {
-    printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop);
-  }
-  return mNActiveThreadsOuterLoop;
-}
-
 std::unique_ptr<gpu_reconstruction_kernels::threadContext> GPUReconstructionProcessing::GetThreadContext()
 {
   return std::make_unique<gpu_reconstruction_kernels::threadContext>();
@@ -124,3 +118,39 @@ std::unique_ptr<gpu_reconstruction_kernels::threadContext> GPUReconstructionProc
 
 gpu_reconstruction_kernels::threadContext::threadContext() = default;
 gpu_reconstruction_kernels::threadContext::~threadContext() = default;
+
+template <class T, int32_t I>
+uint32_t GPUReconstructionProcessing::GetKernelNum(int32_t k) // Returns the number stored for kernel <T, I>; the first call stores it
+{
+  static int32_t num = k; // Initialized exactly once per <T, I>: k of the first call is kept, k of every later call is ignored
+  if (num < 0) { // The first call passed a negative k (the declared default is -1); that value stays stored, so later calls throw as well
+    throw std::runtime_error("Internal Error - Kernel Number not Set");
+  }
+  return num;
+}
+
+namespace o2::gpu::internal
+{
+static std::vector<std::string> initKernelNames() // Builds the kernel name table and, as a side effect, stores each kernel's number via GetKernelNum()
+{
+  std::vector<std::string> retVal;
+#define GPUCA_KRNL(x_class, ...)                                                            \
+  GPUReconstructionProcessing::GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>(retVal.size()); \
+  retVal.emplace_back(GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)));
+#include "GPUReconstructionKernelList.h" // Presumably invokes GPUCA_KRNL once per kernel, so a kernel's number equals the index of its name in retVal
+#undef GPUCA_KRNL
+  return retVal;
+}
+} // namespace o2::gpu::internal
+
+const std::vector<std::string> GPUReconstructionProcessing::mKernelNames = o2::gpu::internal::initKernelNames(); // Filled during static initialization; the initializer also registers the kernel numbers
+
+#define GPUCA_KRNL(x_class, ...)                                                                        \
+  template uint32_t GPUReconstructionProcessing::GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>(int32_t); \
+  template <>                                                                                           \
+  const char* GPUReconstructionProcessing::GetKernelName<GPUCA_M_KRNL_TEMPLATE(x_class)>()              \
+  {                                                                                                     \
+    return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class));                                                     \
+  }
+#include "GPUReconstructionKernelList.h" // Presumably one GPUCA_KRNL per kernel: explicit GetKernelNum instantiation plus GetKernelName specialization for each
+#undef GPUCA_KRNL
diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h
index 43560616782db..b0466efceac24 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h
+++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h
@@ -20,6 +20,7 @@
 
 #include "utils/timer.h"
 #include <functional>
+#include <atomic>
 
 namespace o2::gpu
 {
@@ -74,7 +75,10 @@ class GPUReconstructionProcessing : public GPUReconstruction
 
   // Interface to query name of a kernel
   template <class T, int32_t I>
-  constexpr static const char* GetKernelName();
+  static const char* GetKernelName();
+  const std::string& GetKernelName(int32_t i) const { return mKernelNames.at(i); } // Name of kernel number i; throws std::out_of_range for an invalid (including negative) index
+  template <class T, int32_t I = 0>
+  static uint32_t GetKernelNum(int32_t k = -1);
 
   // Public queries for timers
   auto& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)]; }
@@ -100,6 +104,8 @@ class GPUReconstructionProcessing : public GPUReconstruction
   GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg) : GPUReconstruction(cfg) {}
   using deviceEvent = gpu_reconstruction_kernels::deviceEvent;
 
+  static const std::vector<std::string> mKernelNames;
+
   int32_t mActiveHostKernelThreads = 0;  // Number of currently active threads on the host for kernels
   uint32_t mNActiveThreadsOuterLoop = 1; // Number of threads currently running an outer loop
 
@@ -130,6 +136,8 @@ class GPUReconstructionProcessing : public GPUReconstruction
   uint32_t getNextTimerId();
   timerMeta* getTimerById(uint32_t id, bool increment = true);
   timerMeta* insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step);
+
+  static std::atomic_flag mTimerFlag;
 };
 
 template <class T>
@@ -174,15 +182,6 @@ HighResTimer& GPUReconstructionProcessing::getTimer(const char* name, int32_t nu
   return timer->timer[num];
 }
 
-#define GPUCA_KRNL(x_class, ...)                                                                     \
-  template <>                                                                                        \
-  constexpr const char* GPUReconstructionProcessing::GetKernelName<GPUCA_M_KRNL_TEMPLATE(x_class)>() \
-  {                                                                                                  \
-    return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class));                                                  \
-  }
-#include "GPUReconstructionKernelList.h"
-#undef GPUCA_KRNL
-
 } // namespace o2::gpu
 
 #endif
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
index 175fd205153ea..f87d5c8189cdc 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -610,7 +610,7 @@ void GPUReconstructionCUDABackend::PrintKernelOccupancies()
     GPUChkErr(cuOccupancyMaxActiveBlocksPerMultiprocessor(&maxBlocks, *mInternals->kernelFunctions[i], threads, 0));
     GPUChkErr(cuFuncGetAttribute(&nRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, *mInternals->kernelFunctions[i]));
     GPUChkErr(cuFuncGetAttribute(&sMem, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, *mInternals->kernelFunctions[i]));
-    GPUInfo("Kernel: %50s Block size: %4d, Maximum active blocks: %3d, Suggested blocks: %3d, Regs: %3d, smem: %3d", mInternals->kernelNames[i].c_str(), threads, maxBlocks, suggestedBlocks, nRegs, sMem);
+    GPUInfo("Kernel: %50s Block size: %4d, Maximum active blocks: %3d, Suggested blocks: %3d, Regs: %3d, smem: %3d", GetKernelName(i).c_str(), threads, maxBlocks, suggestedBlocks, nRegs, sMem);
   }
 }
 
@@ -618,9 +618,10 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel)
 {
   uint32_t j = 0;
 #define GPUCA_KRNL(x_class, ...)                                                                                                                                                        \
-  getRTCkernelNum<false, GPUCA_M_KRNL_TEMPLATE(x_class)>(mInternals->kernelFunctions.size());                                                                                           \
+  if (GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>() != j) {                                                                                                                            \
+    GPUFatal("kernel numbers out of sync");                                                                                                                                             \
+  }                                                                                                                                                                                     \
   mInternals->kernelFunctions.emplace_back(new CUfunction);                                                                                                                             \
-  mInternals->kernelNames.emplace_back(GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))));                                                                                    \
   if (mProcessingSettings.debugLevel >= 3) {                                                                                                                                            \
     GPUInfo("Loading kernel %s (j = %u)", GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), j);                                                                              \
   }                                                                                                                                                                                     \
@@ -628,7 +629,6 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel)
   j++;
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
-
   if (j != mInternals->kernelModules.size()) {
     GPUFatal("Did not load all kernels (%u < %u)", j, (uint32_t)mInternals->kernelModules.size());
   }
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
index 30bbc76d4c415..a98b14a873ca0 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
@@ -46,11 +46,6 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase
   void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);
   template <class T, int32_t I = 0>
   gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend();
-  template <class T, int32_t I>
-  class backendInternal;
-
-  template <bool multi, class T, int32_t I = 0>
-  static int32_t getRTCkernelNum(int32_t k = -1);
 
   void getRTCKernelCalls(std::vector<std::string>& kernels);
 
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
index e789dc9b9ebc3..51d3bd4044e8d 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
@@ -31,11 +31,12 @@ QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command);
 QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_arch);
 QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_no_fast_math);
 
+#include "GPUNoFastMathKernels.h"
+
 int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
 {
   std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") +
                          std::string(mProcessingSettings.rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") +
-                         std::string(mProcessingSettings.rtc.deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n") +
                          GPUParamRTC::generateRTCCode(param(), mProcessingSettings.rtc.optConstexpr);
   if (filename == "") {
     filename = "/tmp/o2cagpu_rtc_";
@@ -54,7 +55,6 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
   std::string baseCommand = (mProcessingSettings.RTCprependCommand != "" ? (mProcessingSettings.RTCprependCommand + " ") : "");
   baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len));
   baseCommand += std::string(" ") + (mProcessingSettings.RTCoverrideArchitecture != "" ? mProcessingSettings.RTCoverrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
-  baseCommand += mProcessingSettings.rtc.deterministic ? (std::string(" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len)) : std::string("");
 
   char shasource[21], shaparam[21], shacmd[21], shakernels[21];
   if (mProcessingSettings.rtc.cacheOutput) {
@@ -169,13 +169,20 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
       kernel += mProcessingSettings.rtc.compilePerKernel ? kernels[i] : kernelsall;
       kernel += "}";
 
-      if (fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() ||
+      bool deterministic = mProcessingSettings.rtc.deterministic || (mProcessingSettings.rtc.compilePerKernel && o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end()); // i identifies a single kernel only when compiling per kernel, otherwise file i contains all kernels
+      const std::string deterministicStr = std::string(deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n");
+
+      if (fwrite(deterministicStr.c_str(), 1, deterministicStr.size(), fp) != deterministicStr.size() ||
+          fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() ||
           fwrite(_binary_GPUReconstructionCUDArtc_src_start, 1, _binary_GPUReconstructionCUDArtc_src_len, fp) != _binary_GPUReconstructionCUDArtc_src_len ||
           fwrite(kernel.c_str(), 1, kernel.size(), fp) != kernel.size()) {
         throw std::runtime_error("Error writing file");
       }
       fclose(fp);
       std::string command = baseCommand;
+      if (deterministic) {
+        command += std::string(" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len);
+      }
       command += " -c " + filename + "_" + std::to_string(i) + mRtcSrcExtension + " -o " + filename + "_" + std::to_string(i) + mRtcBinExtension;
       if (mProcessingSettings.debugLevel < 0) {
         command += " &> /dev/null";
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h
index c85d98d85420e..f3fc21243ef0e 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h
@@ -30,7 +30,6 @@ namespace o2::gpu
 struct GPUReconstructionCUDAInternals {
   std::vector<std::unique_ptr<CUmodule>> kernelModules;     // module for RTC compilation
   std::vector<std::unique_ptr<CUfunction>> kernelFunctions; // vector of ptrs to RTC kernels
-  std::vector<std::string> kernelNames;                     // names of kernels
   cudaStream_t Streams[GPUCA_MAX_STREAMS];                  // Pointer to array of CUDA Streams
 
   static void getArgPtrs(const void** pArgs) {}
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
index f1f459fe021bc..4b3f8a767226c 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
@@ -55,7 +55,7 @@ inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSet
 #endif
     pArgs[arg_offset] = &y.index;
     GPUReconstructionCUDAInternals::getArgPtrs(&pArgs[arg_offset + 1], args...);
-    GPUChkErr(cuLaunchKernel(*mInternals->kernelFunctions[getRTCkernelNum<false, T, I>()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr));
+    GPUChkErr(cuLaunchKernel(*mInternals->kernelFunctions[GetKernelNum<T, I>()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr));
   }
 }
 
@@ -111,22 +111,6 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs<T, I, Ar
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
 
-template <bool multi, class T, int32_t I>
-int32_t GPUReconstructionCUDABackend::getRTCkernelNum(int32_t k)
-{
-  static int32_t num = k;
-  if (num < 0) {
-    throw std::runtime_error("Invalid kernel");
-  }
-  return num;
-}
-
-#define GPUCA_KRNL(x_class, ...)                                                                                    \
-  template int32_t GPUReconstructionCUDABackend::getRTCkernelNum<false, GPUCA_M_KRNL_TEMPLATE(x_class)>(int32_t k); \
-  template int32_t GPUReconstructionCUDABackend::getRTCkernelNum<true, GPUCA_M_KRNL_TEMPLATE(x_class)>(int32_t k);
-#include "GPUReconstructionKernelList.h"
-#undef GPUCA_KRNL
-
 void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& kernels)
 {
 #define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__)));
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx
index e92205b9864e6..e724f0f2cbfcd 100644
--- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx
+++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx
@@ -399,7 +399,7 @@ int32_t GPUReconstructionOCLBackend::ExitDevice_Runtime()
       clReleaseMemObject(mInternals->mem_gpu);
       clReleaseMemObject(mInternals->mem_constant);
       for (uint32_t i = 0; i < mInternals->kernels.size(); i++) {
-        clReleaseKernel(mInternals->kernels[i].first);
+        clReleaseKernel(mInternals->kernels[i]);
       }
       mInternals->kernels.clear();
     }
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h
index 2abae229c74bb..29951cd43f167 100644
--- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h
+++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h
@@ -56,8 +56,6 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase
 
   template <class T, int32_t I = 0>
   int32_t AddKernel();
-  template <class T, int32_t I = 0>
-  uint32_t FindKernel();
   template <class T, int32_t I = 0, typename... Args>
   void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);
   template <class T, int32_t I = 0>
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h
index 97316cf9aa32e..0bb2f25093789 100644
--- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h
+++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h
@@ -49,7 +49,7 @@ struct GPUReconstructionOCLInternals {
   cl_mem mem_host;
   cl_program program;
 
-  std::vector<std::pair<cl_kernel, std::string>> kernels;
+  std::vector<cl_kernel> kernels;
 };
 } // namespace o2::gpu
 
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx
index ce6b6553ae1f7..fff69038c056f 100644
--- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx
+++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx
@@ -58,20 +58,6 @@ void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs<T, I, Arg
   std::apply([this, &args](auto&... vals) { runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);
 }
 
-template <class T, int32_t I>
-inline uint32_t GPUReconstructionOCLBackend::FindKernel()
-{
-  std::string name(GetKernelName<T, I>());
-
-  for (uint32_t k = 0; k < mInternals->kernels.size(); k++) {
-    if (mInternals->kernels[k].second == name) {
-      return (k);
-    }
-  }
-  GPUError("Could not find OpenCL kernel %s", name.c_str());
-  throw ::std::runtime_error("Requested unsupported OpenCL kernel");
-}
-
 template <class T, int32_t I>
 int32_t GPUReconstructionOCLBackend::AddKernel()
 {
@@ -84,15 +70,14 @@ int32_t GPUReconstructionOCLBackend::AddKernel()
     GPUError("Error creating OPENCL Kernel: %s", name.c_str());
     return 1;
   }
-  mInternals->kernels.emplace_back(krnl, name);
+  mInternals->kernels.emplace_back(krnl);
   return 0;
 }
 
 template <class S, class T, int32_t I>
 S& GPUReconstructionOCLBackend::getKernelObject()
 {
-  static uint32_t krnl = FindKernel<T, I>();
-  return mInternals->kernels[krnl].first;
+  return mInternals->kernels[GetKernelNum<T, I>()]; // NOTE(review): presumes AddKernels() appended the kernels in kernel-number order and none is missing - confirm
 }
 
 int32_t GPUReconstructionOCLBackend::AddKernels()
diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index ba2b9d05a3192..631f9f0edff4f 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -242,6 +242,10 @@ file(GENERATE
      OUTPUT include_gpu_onthefly/GPUReconstructionIncludesDeviceAll.h
      INPUT Base/GPUReconstructionIncludesDeviceAll.template.h
 )
+file(GENERATE
+     OUTPUT include_gpu_onthefly/GPUNoFastMathKernels.h
+     INPUT cmake/GPUNoFastMathKernels.template.h
+) # Evaluates the generator expression in the template, which expands the O2_GPU_KERNEL_NO_FAST_MATH property of O2_GPU_KERNELS
 if(NOT ALIGPU_BUILD_TYPE STREQUAL "O2")
   include_directories(${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly)
 endif()
diff --git a/GPU/GPUTracking/cmake/GPUNoFastMathKernels.template.h b/GPU/GPUTracking/cmake/GPUNoFastMathKernels.template.h
new file mode 100644
index 0000000000000..dac93277d5ec9
--- /dev/null
+++ b/GPU/GPUTracking/cmake/GPUNoFastMathKernels.template.h
@@ -0,0 +1,23 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUNoFastMathKernels.h
+/// \author David Rohr
+#pragma once
+#include <unordered_set>
+#include <string>
+
+namespace o2::gpu::internal
+{
+// clang-format off
+static const std::unordered_set<std::string> noFastMathKernels = {$<JOIN:$<LIST:TRANSFORM,$<LIST:TRANSFORM,$<LIST:REMOVE_DUPLICATES,$<TARGET_PROPERTY:O2_GPU_KERNELS,O2_GPU_KERNEL_NO_FAST_MATH>>,APPEND,">,PREPEND,">,$<COMMA> >}; // Quoted, de-duplicated kernel names from the O2_GPU_KERNEL_NO_FAST_MATH property, inserted by CMake
+// clang-format on
+} // namespace o2::gpu::internal
diff --git a/GPU/GPUTracking/cmake/kernel_helpers.cmake b/GPU/GPUTracking/cmake/kernel_helpers.cmake
index e63b915640e8a..99699cc72e940 100644
--- a/GPU/GPUTracking/cmake/kernel_helpers.cmake
+++ b/GPU/GPUTracking/cmake/kernel_helpers.cmake
@@ -17,6 +17,7 @@ define_property(TARGET PROPERTY O2_GPU_KERNELS)
 define_property(TARGET PROPERTY O2_GPU_KERNEL_NAMES)
 define_property(TARGET PROPERTY O2_GPU_KERNEL_INCLUDES)
 define_property(TARGET PROPERTY O2_GPU_KERNEL_FILES)
+define_property(TARGET PROPERTY O2_GPU_KERNEL_NO_FAST_MATH)
 set(O2_GPU_KERNEL_WRAPPER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/GPU/include_gpu_onthefly")
 file(MAKE_DIRECTORY ${O2_GPU_KERNEL_WRAPPER_FOLDER})
 set(O2_GPU_BASE_DIR "${CMAKE_CURRENT_LIST_DIR}/../")
@@ -144,24 +145,23 @@ function(o2_gpu_kernel_file_list list)
 endfunction()
 
 function(o2_gpu_kernel_set_deterministic)
-  if(NOT GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_GPU})
-    list(LENGTH ARGV n)
-    math(EXPR n "${n} - 1")
-    foreach(i RANGE 0 ${n})
-      if(CUDA_ENABLED AND (NOT DEFINED GPUCA_CUDA_COMPILE_MODE OR GPUCA_CUDA_COMPILE_MODE STREQUAL "perkernel"))
-        set_source_files_properties("${O2_GPU_KERNEL_WRAPPER_FOLDER}/krnl_${ARGV${i}}.cu"
-                                    TARGET_DIRECTORY O2::GPUTrackingCUDA
-                                    PROPERTIES
-                                    COMPILE_FLAGS "${GPUCA_CUDA_NO_FAST_MATH_FLAGS}"
-                                    COMPILE_DEFINITIONS "GPUCA_DETERMINISTIC_MODE")
-      endif()
-      if(HIP_ENABLED AND (NOT DEFINED GPUCA_HIP_COMPILE_MODE OR GPUCA_HIP_COMPILE_MODE STREQUAL "perkernel"))
-        set_source_files_properties("${O2_GPU_KERNEL_WRAPPER_FOLDER}/krnl_${ARGV${i}}.hip"
-                                    TARGET_DIRECTORY O2::GPUTrackingHIP
-                                    PROPERTIES
-                                    COMPILE_FLAGS "${GPUCA_CXX_NO_FAST_MATH_FLAGS}"
-                                    COMPILE_DEFINITIONS "GPUCA_DETERMINISTIC_MODE")
-      endif()
-    endforeach()
-  endif()
+  # Every argument is a kernel name: it is appended to O2_GPU_KERNEL_NO_FAST_MATH of O2_GPU_KERNELS and,
+  # for per-kernel CUDA / HIP builds, its wrapper source gets the no-fast-math flags and GPUCA_DETERMINISTIC_MODE.
+  foreach(kernel IN LISTS ARGV) # IN LISTS simply does not iterate when the function is called without arguments
+    set_property(TARGET O2_GPU_KERNELS APPEND PROPERTY O2_GPU_KERNEL_NO_FAST_MATH "${kernel}")
+    if(CUDA_ENABLED AND (NOT DEFINED GPUCA_CUDA_COMPILE_MODE OR GPUCA_CUDA_COMPILE_MODE STREQUAL "perkernel"))
+      set_source_files_properties("${O2_GPU_KERNEL_WRAPPER_FOLDER}/krnl_${kernel}.cu"
+                                  TARGET_DIRECTORY O2::GPUTrackingCUDA
+                                  PROPERTIES
+                                  COMPILE_FLAGS "${GPUCA_CUDA_NO_FAST_MATH_FLAGS}"
+                                  COMPILE_DEFINITIONS "GPUCA_DETERMINISTIC_MODE")
+    endif()
+    if(HIP_ENABLED AND (NOT DEFINED GPUCA_HIP_COMPILE_MODE OR GPUCA_HIP_COMPILE_MODE STREQUAL "perkernel"))
+      set_source_files_properties("${O2_GPU_KERNEL_WRAPPER_FOLDER}/krnl_${kernel}.hip"
+                                  TARGET_DIRECTORY O2::GPUTrackingHIP
+                                  PROPERTIES
+                                  COMPILE_FLAGS "${GPUCA_CXX_NO_FAST_MATH_FLAGS}"
+                                  COMPILE_DEFINITIONS "GPUCA_DETERMINISTIC_MODE")
+    endif()
+  endforeach()
 endfunction()