From 8d4d4dc0250818497dd8edd4aa5bce21dfdb1f68 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Thu, 27 Mar 2025 22:19:37 +0100
Subject: [PATCH 1/4] GPU RTC: Do not store preprocessed launch-bounds for RTC

---
 GPU/Common/GPUCommonAlgorithm.h               |  2 +-
 GPU/Common/GPUCommonAlgorithmThrust.h         | 13 +++-
 GPU/GPUTracking/Base/GPUParam.cxx             |  4 +-
 GPU/GPUTracking/Base/cuda/CMakeLists.txt      |  6 +-
 .../Base/cuda/GPUReconstructionCUDA.cu        |  3 +-
 .../GPUReconstructionCUDAExternalProvider.cu  |  3 +-
 .../Base/cuda/GPUReconstructionCUDAGenRTC.cxx | 61 ++++++++++---------
 ... => GPUReconstructionCUDAIncludesSystem.h} |  6 +-
 .../Base/cuda/GPUReconstructionCUDAKernels.cu | 21 ++++---
 .../GPUReconstructionCUDAkernel.template.cu   |  3 +-
 .../Base/cuda/GPUReconstructionCUDArtc.cu     |  2 +-
 GPU/GPUTracking/Base/hip/CMakeLists.txt       |  8 +--
 ...h => GPUReconstructionHIPIncludesSystem.h} |  6 +-
 .../GPUReconstructionHIPkernel.template.hip   |  3 +-
 GPU/GPUTracking/Base/opencl/CMakeLists.txt    |  2 +-
 .../GPUDefParametersLoad.template.inc         | 24 ++++----
 GPU/GPUTracking/Definitions/GPUSettingsList.h |  2 +-
 17 files changed, 91 insertions(+), 78 deletions(-)
 rename GPU/GPUTracking/Base/cuda/{GPUReconstructionCUDAIncludesHost.h => GPUReconstructionCUDAIncludesSystem.h} (91%)
 rename GPU/GPUTracking/Base/hip/{GPUReconstructionHIPIncludesHost.h => GPUReconstructionHIPIncludesSystem.h} (89%)
diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h
index 5c19dda27f593..417c9e0d1f8c1 100644
--- a/GPU/Common/GPUCommonAlgorithm.h
+++ b/GPU/Common/GPUCommonAlgorithm.h
@@ -214,7 +214,7 @@ typedef GPUCommonAlgorithm CAAlgo;
 
 } // namespace o2::gpu
 
-#if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_HOSTONLY)
+#if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_HOSTONLY)
 
 #include "GPUCommonAlgorithmThrust.h"
 
diff --git a/GPU/Common/GPUCommonAlgorithmThrust.h b/GPU/Common/GPUCommonAlgorithmThrust.h
index 049071227a58e..7af3138d45490 100644
--- a/GPU/Common/GPUCommonAlgorithmThrust.h
+++ b/GPU/Common/GPUCommonAlgorithmThrust.h
@@ -15,6 +15,7 @@
 #ifndef GPUCOMMONALGORITHMTHRUST_H
 #define GPUCOMMONALGORITHMTHRUST_H
 
+#ifndef GPUCA_GPUCODE_COMPILEKERNELS
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wshadow"
 #include <thrust/sort.h>
@@ -25,14 +26,19 @@
 #include "GPUCommonDef.h"
 #include "GPUCommonHelpers.h"
 
+#ifndef __HIPCC__ // CUDA
+#include <cub/cub.cuh>
+#else // HIP
+#include <hipcub/hipcub.hpp>
+#endif
+#endif // GPUCA_GPUCODE_COMPILEKERNELS
+
 #ifndef __HIPCC__ // CUDA
 #define GPUCA_THRUST_NAMESPACE thrust::cuda
 #define GPUCA_CUB_NAMESPACE cub
-#include <cub/cub.cuh>
 #else // HIP
 #define GPUCA_THRUST_NAMESPACE thrust::hip
 #define GPUCA_CUB_NAMESPACE hipcub
-#include <hipcub/hipcub.hpp>
 #endif
 
 namespace o2::gpu
@@ -90,6 +96,7 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co
   thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd, comp);
 }
 
+#ifndef GPUCA_GPUCODE_COMPILEKERNELS
 template <class T, class S>
 GPUhi() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp)
 {
@@ -105,6 +112,8 @@ GPUhi() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begi
   GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream]));
 #endif
 }
+#endif // #ifndef GPUCA_GPUCODE_COMPILEKERNELS
+
 } // namespace o2::gpu
 
 #undef GPUCA_THRUST_NAMESPACE
diff --git a/GPU/GPUTracking/Base/GPUParam.cxx b/GPU/GPUTracking/Base/GPUParam.cxx
index bbca150df405a..b835e1b198eea 100644
--- a/GPU/GPUTracking/Base/GPUParam.cxx
+++ b/GPU/GPUTracking/Base/GPUParam.cxx
@@ -193,12 +193,10 @@ void GPUParamRTC::setFrom(const GPUParam& param)
 
 std::string GPUParamRTC::generateRTCCode(const GPUParam& param, bool useConstexpr)
 {
-  return "#ifndef GPUCA_GPUCODE_DEVICE\n"
-         "#include <string>\n"
+  return "#include <string>\n"
          "#include <vector>\n"
          "#include <cstdint>\n"
          "#include <cstddef>\n"
-         "#endif\n"
          "namespace o2::gpu { class GPUDisplayFrontendInterface; }\n" +
          qConfigPrintRtc(std::make_tuple(&param.rec.tpc, &param.rec.trd, &param.rec, &param.par), useConstexpr);
 }
diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt
index de54f09fdc2e1..ae3f0ecaf69dc 100644
--- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt
+++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt
@@ -18,7 +18,7 @@ endif()
 message(STATUS "Building GPUTracking with CUDA support ${TMP_TARGET}")
 
 set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu)
-set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h)
+set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesSystem.h)
 # -------------------------------- Prepare RTC -------------------------------------------------------
 enable_language(ASM)
 if(ALIGPU_BUILD_TYPE STREQUAL "O2")
@@ -67,8 +67,8 @@ set(GPU_RTC_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionCUDArtc)
 # cmake-format: off
 add_custom_command(
     OUTPUT ${GPU_RTC_BIN}.src
-    COMMAND cp ${GPUDIR}/Base/cuda/GPUReconstructionCUDAIncludesHost.h ${GPU_RTC_BIN}.src
-    COMMAND ${CMAKE_CXX_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_INCLUDES} -std=c++${CMAKE_CUDA_STANDARD} -D__CUDA_ARCH__=${RTC_CUDA_ARCH} -D__CUDACC__ -x c++ -nostdinc -E ${GPU_RTC_SRC} >> ${GPU_RTC_BIN}.src
+    COMMAND cp ${GPUDIR}/Base/cuda/GPUReconstructionCUDAIncludesSystem.h ${GPU_RTC_BIN}.src
+    COMMAND ${CMAKE_CXX_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_INCLUDES} -std=c++${CMAKE_CUDA_STANDARD} -D__CUDA_ARCH__=${RTC_CUDA_ARCH} -D__CUDACC__ -x c++ -nostdinc -E -P ${GPU_RTC_SRC} >> ${GPU_RTC_BIN}.src
     MAIN_DEPENDENCY ${GPU_RTC_SRC}
     IMPLICIT_DEPENDS CXX ${GPU_RTC_SRC}
     COMMAND_EXPAND_LISTS
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
index 8790d7718f517..3ca759626619b 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -19,7 +19,8 @@
 #include "GPUDefParametersDefault.h"
 #include "GPUDefParametersLoad.inc"
 
-#include "GPUReconstructionCUDAIncludesHost.h"
+#include "GPUReconstructionCUDAIncludesSystem.h"
+#include "GPUReconstructionCUDADef.h"
 #include <cuda_profiler_api.h>
 
 #include "GPUReconstructionCUDA.h"
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu
index 534f5e8606897..9a23fcb460aa4 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu
@@ -12,7 +12,8 @@
 /// \file GPUReconstructionCUDAExternalProvider.cu
 /// \author David Rohr
 
-#include "GPUReconstructionCUDAIncludesHost.h"
+#include "GPUReconstructionCUDAIncludesSystem.h"
+#include "GPUReconstructionCUDADef.h"
 
 #include "GPUReconstructionCUDA.h"
 #include "GPUReconstructionCUDAInternals.h"
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
index 51d3bd4044e8d..31d4f54c00d10 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
@@ -13,6 +13,10 @@
 /// \author David Rohr
 
 #define GPUCA_GPUCODE_HOSTONLY
+#define GPUCA_DEF_PARAMETERS_LOAD_DEFAULTS
+#include "GPUDefParametersDefault.h"
+#include "GPUDefParametersLoad.inc"
+
 #include "GPUReconstructionCUDA.h"
 #include "GPUParamRTC.h"
 #include "GPUDefMacros.h"
@@ -55,13 +59,15 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
   std::string baseCommand = (mProcessingSettings.RTCprependCommand != "" ? (mProcessingSettings.RTCprependCommand + " ") : "");
   baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len));
   baseCommand += std::string(" ") + (mProcessingSettings.RTCoverrideArchitecture != "" ? mProcessingSettings.RTCoverrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
+  const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true);
 
-  char shasource[21], shaparam[21], shacmd[21], shakernels[21];
+  char shasource[21], shaparam[21], shacmd[21], shakernels[21], shabounds[21];
   if (mProcessingSettings.rtc.cacheOutput) {
     o2::framework::internal::SHA1(shasource, _binary_GPUReconstructionCUDArtc_src_start, _binary_GPUReconstructionCUDArtc_src_len);
     o2::framework::internal::SHA1(shaparam, rtcparam.c_str(), rtcparam.size());
     o2::framework::internal::SHA1(shacmd, baseCommand.c_str(), baseCommand.size());
     o2::framework::internal::SHA1(shakernels, kernelsall.c_str(), kernelsall.size());
+    o2::framework::internal::SHA1(shabounds, launchBounds.c_str(), launchBounds.size());
   }
 
   nCompile = mProcessingSettings.rtc.compilePerKernel ? kernels.size() : 1;
@@ -88,32 +94,29 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
     if (fp) {
       size_t len;
       while (true) {
-        if (fread(sharead, 1, 20, fp) != 20) {
-          throw std::runtime_error("Cache file corrupt");
-        }
-        if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shasource, 20)) {
-          GPUInfo("Cache file content outdated (source)");
-          break;
-        }
-        if (fread(sharead, 1, 20, fp) != 20) {
-          throw std::runtime_error("Cache file corrupt");
-        }
-        if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shaparam, 20)) {
-          GPUInfo("Cache file content outdated (param)");
-          break;
-        }
-        if (fread(sharead, 1, 20, fp) != 20) {
-          throw std::runtime_error("Cache file corrupt");
-        }
-        if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shacmd, 20)) {
-          GPUInfo("Cache file content outdated (commandline)");
-          break;
-        }
-        if (fread(sharead, 1, 20, fp) != 20) {
-          throw std::runtime_error("Cache file corrupt");
-        }
-        if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shakernels, 20)) {
-          GPUInfo("Cache file content outdated (kernel definitions)");
+        auto checkSHA = [&](const char* shacmp, const char* name) {
+          if (fread(sharead, 1, 20, fp) != 20) {
+            throw std::runtime_error("Cache file corrupt");
+          }
+          if (mProcessingSettings.debugLevel >= 3) {
+            char shaprint1[41], shaprint2[41];
+            for (uint32_t i = 0; i < 20; i++) {
+              sprintf(shaprint1 + 2 * i, "%02X ", shacmp[i]);
+              sprintf(shaprint2 + 2 * i, "%02X ", sharead[i]);
+            }
+            GPUInfo("SHA for %s: expected %s, read %s", name, shaprint1, shaprint2);
+          }
+          if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shacmp, 20)) {
+            GPUInfo("Cache file content outdated (%s)", name);
+            return 1;
+          }
+          return 0;
+        };
+        if (checkSHA(shasource, "source") ||
+            checkSHA(shaparam, "param") ||
+            checkSHA(shacmd, "command line") ||
+            checkSHA(shakernels, "kernel definitions") ||
+            checkSHA(shabounds, "launch bounds")) {
           break;
         }
         GPUSettingsProcessingRTC cachedSettings;
@@ -169,11 +172,12 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
       kernel += mProcessingSettings.rtc.compilePerKernel ? kernels[i] : kernelsall;
       kernel += "}";
 
-      bool deterministic = mProcessingSettings.rtc.deterministic || o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end();
+      bool deterministic = mProcessingSettings.rtc.deterministic || (mProcessingSettings.rtc.compilePerKernel && o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end());
       const std::string deterministicStr = std::string(deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n");
 
       if (fwrite(deterministicStr.c_str(), 1, deterministicStr.size(), fp) != deterministicStr.size() ||
           fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() ||
+          fwrite(launchBounds.c_str(), 1, launchBounds.size(), fp) != launchBounds.size() ||
           fwrite(_binary_GPUReconstructionCUDArtc_src_start, 1, _binary_GPUReconstructionCUDArtc_src_len, fp) != _binary_GPUReconstructionCUDArtc_src_len ||
           fwrite(kernel.c_str(), 1, kernel.size(), fp) != kernel.size()) {
         throw std::runtime_error("Error writing file");
@@ -213,6 +217,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
           fwrite(shaparam, 1, 20, fp) != 20 ||
           fwrite(shacmd, 1, 20, fp) != 20 ||
           fwrite(shakernels, 1, 20, fp) != 20 ||
+          fwrite(shabounds, 1, 20, fp) != 20 ||
           fwrite(&mProcessingSettings.rtc, sizeof(mProcessingSettings.rtc), 1, fp) != 1) {
         throw std::runtime_error("Error writing cache file");
       }
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesHost.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesSystem.h
similarity index 91%
rename from GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesHost.h
rename to GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesSystem.h
index e3e26e6482fc4..1cb3679fc30dc 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesHost.h
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesSystem.h
@@ -9,7 +9,7 @@
 // granted to it by virtue of its status as an Intergovernmental Organization
 // or submit itself to any jurisdiction.
 
-/// \file GPUReconstructionCUDAIncludes.h
+/// \file GPUReconstructionCUDAIncludesSystem.h
 /// \author David Rohr
 
 #ifndef O2_GPU_GPURECONSTRUCTIONCUDAINCLUDES_H
@@ -32,8 +32,4 @@
 #include <sm_20_atomic_functions.h>
 #include <cuda_fp16.h>
 
-#ifndef GPUCA_RTC_CODE
-#include "GPUReconstructionCUDADef.h"
-#endif
-
 #endif
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
index ac79dd7576e48..f8efd8428f035 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
@@ -12,7 +12,8 @@
 /// \file GPUReconstructionCUDAKernels.cu
 /// \author David Rohr
 
-#include "GPUReconstructionCUDAIncludesHost.h"
+#include "GPUReconstructionCUDAIncludesSystem.h"
+#include "GPUReconstructionCUDADef.h"
 
 #include "GPUReconstructionCUDA.h"
 #include "GPUReconstructionCUDAInternals.h"
@@ -108,13 +109,6 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs<T, I, Ar
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
 
-void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& kernels)
-{
-#define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__)));
-#include "GPUReconstructionKernelList.h"
-#undef GPUCA_KRNL
-}
-
 #ifndef GPUCA_NO_CONSTANT_MEMORY
 static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {
   void* retVal = nullptr;
@@ -124,3 +118,14 @@ static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstS
   return retVal;
 });
 #endif
+
+void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& kernels)
+{
+#undef GPUCA_KRNL_LB
+#undef __launch_bounds__
+#define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__)));
+#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_RTC_LB_, GPUCA_M_KRNL_NAME(x_class))), GPUCA_M_STRIP(x_attributes)), __VA_ARGS__)
+#include "GPUReconstructionKernelList.h"
+#undef GPUCA_KRNL
+#undef GPUCA_KRNL_LB
+}
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu
index 3140c6b9158ad..847011a70f7f9 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu
@@ -13,9 +13,10 @@
 /// \author David Rohr
 
 #define GPUCA_GPUCODE_COMPILEKERNELS
-#include "GPUReconstructionCUDAIncludesHost.h"
+#include "GPUReconstructionCUDAIncludesSystem.h"
 #define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
 #define GPUCA_KRNL(...) GPUCA_KRNLGPU(__VA_ARGS__);
+#include "GPUReconstructionCUDADef.h"
 #include "GPUReconstructionKernelMacros.h"
 
 // clang-format off
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDArtc.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDArtc.cu
index 919b5c11477ef..50a568ab345cf 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDArtc.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDArtc.cu
@@ -16,7 +16,7 @@
 #define GPUCA_GPUCODE_COMPILEKERNELS
 #define GPUCA_RTC_SPECIAL_CODE(...) GPUCA_RTC_SPECIAL_CODE(__VA_ARGS__)
 #define GPUCA_DETERMINISTIC_CODE(...) GPUCA_DETERMINISTIC_CODE(__VA_ARGS__)
-// GPUReconstructionCUDAIncludesHost.h auto-prependended without preprocessor running
+// GPUReconstructionCUDAIncludesSystem.h prependended without preprocessor running
 #include "GPUReconstructionCUDADef.h"
 #include "GPUReconstructionIncludesDeviceAll.h"
 
diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt
index 43259decef956..eaa87d69c0de6 100644
--- a/GPU/GPUTracking/Base/hip/CMakeLists.txt
+++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt
@@ -25,7 +25,7 @@ if(NOT DEFINED GPUCA_HIP_HIPIFY_FROM_CUDA OR "${GPUCA_HIP_HIPIFY_FROM_CUDA}")
   set(GPUCA_HIP_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/hipify)
   file(MAKE_DIRECTORY ${GPUCA_HIP_SOURCE_DIR})
   set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDAkernel.template.cu GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu)
-  set(GPUCA_HIP_LOCAL_FILE_LIST GPUReconstructionHIPIncludesHost.h)
+  set(GPUCA_HIP_LOCAL_FILE_LIST GPUReconstructionHIPIncludesSystem.h)
   set(HIP_SOURCES "")
   foreach(file ${GPUCA_HIP_FILE_LIST})
     get_filename_component(ABS_CUDA_SORUCE ../cuda/${file} ABSOLUTE)
@@ -63,7 +63,7 @@ endif()
 
 set(SRCS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.hip ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPKernels.hip)
 set(SRCS_CXX ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPGenRTC.cxx)
-set(HDRS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPInternals.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPHelpers.inc ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPDef.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPIncludesHost.h)
+set(HDRS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPInternals.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPHelpers.inc ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPDef.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPIncludesSystem.h)
 
 # -------------------------------- Prepare RTC -------------------------------------------------------
 enable_language(ASM)
@@ -104,8 +104,8 @@ set(GPU_RTC_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionHIPrtc)
 # cmake-format: off
 add_custom_command(
     OUTPUT ${GPU_RTC_BIN}.src
-    COMMAND cp ${GPUDIR}/Base/hip/GPUReconstructionHIPIncludesHost.h ${GPU_RTC_BIN}.src
-    COMMAND ${CMAKE_CXX_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_INCLUDES} -std=c++${CMAKE_HIP_STANDARD} -D__HIPCC__ -D__HIP_DEVICE_COMPILE__ -x c++ -nostdinc -E ${GPU_RTC_SRC} >> ${GPU_RTC_BIN}.src
+    COMMAND cp ${GPUDIR}/Base/hip/GPUReconstructionHIPIncludesSystem.h ${GPU_RTC_BIN}.src
+    COMMAND ${CMAKE_CXX_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_INCLUDES} -std=c++${CMAKE_HIP_STANDARD} -D__HIPCC__ -D__HIP_DEVICE_COMPILE__ -x c++ -nostdinc -E -P ${GPU_RTC_SRC} >> ${GPU_RTC_BIN}.src
     MAIN_DEPENDENCY ${GPU_RTC_SRC}
     IMPLICIT_DEPENDS CXX ${GPU_RTC_SRC}
     DEPENDS ${MODULE}_HIPIFIED
diff --git a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesHost.h b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesSystem.h
similarity index 89%
rename from GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesHost.h
rename to GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesSystem.h
index 5506e3925bf80..cfe1121ef1089 100644
--- a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesHost.h
+++ b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesSystem.h
@@ -9,7 +9,7 @@
 // granted to it by virtue of its status as an Intergovernmental Organization
 // or submit itself to any jurisdiction.
 
-/// \file GPUReconstructionHIPIncludesHost.h
+/// \file GPUReconstructionHIPIncludesSystem.h
 /// \author David Rohr
 
 #ifndef O2_GPU_RECONSTRUCTIONHIPINCLUDES_H
@@ -25,8 +25,4 @@
 #include <thrust/device_ptr.h>
 #pragma GCC diagnostic pop
 
-#ifndef GPUCA_RTC_CODE
-#include "GPUReconstructionHIPDef.h"
-#endif
-
 #endif
diff --git a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip
index 427938a3bd704..d5ac1d14c2d9e 100644
--- a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip
+++ b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip
@@ -13,9 +13,10 @@
 /// \author David Rohr
 
 #define GPUCA_GPUCODE_COMPILEKERNELS
-#include "GPUReconstructionHIPIncludesHost.h"
+#include "GPUReconstructionHIPIncludesSystem.h"
 #define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
 #define GPUCA_KRNL(...) GPUCA_KRNLGPU(__VA_ARGS__);
+#include "GPUReconstructionHIPDef.h"
 #include "GPUReconstructionKernelMacros.h"
 
 // clang-format off
diff --git a/GPU/GPUTracking/Base/opencl/CMakeLists.txt b/GPU/GPUTracking/Base/opencl/CMakeLists.txt
index 381c9c050ca09..7ab70553958ef 100644
--- a/GPU/GPUTracking/Base/opencl/CMakeLists.txt
+++ b/GPU/GPUTracking/Base/opencl/CMakeLists.txt
@@ -76,7 +76,7 @@ if(OPENCL_ENABLED) # BUILD OpenCL source code for runtime compilation target
               ${OCL_DEFINECL}
               -cl-no-stdinc
               -nostdinc
-              -E ${CL_SRC} > ${CL_BIN}.src
+              -E -P ${CL_SRC} > ${CL_BIN}.src
       MAIN_DEPENDENCY ${CL_SRC}
       IMPLICIT_DEPENDS CXX ${CL_SRC}
       COMMAND_EXPAND_LISTS
diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc b/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc
index 953750b6f925b..73f7b5155fbdd 100644
--- a/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc
+++ b/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc
@@ -31,20 +31,20 @@ static GPUDefParameters GPUDefParametersLoad()
   };
 }
 
-#define GPUCA_EXPORT_KERNEL(name)                                      \
-  if (par.par_LB_maxThreads[i] > 0) {                                  \
-    o << "#define " GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \
-    if (par.par_LB_minBlocks[i] > 0) {                                 \
-      o << ", " << par.par_LB_minBlocks[i];                            \
-    }                                                                  \
-    if (par.par_LB_forceBlocks[i] > 0) {                               \
-      o << ", " << par.par_LB_forceBlocks[i];                          \
-    }                                                                  \
-    o << "\n";                                                         \
-  }                                                                    \
+#define GPUCA_EXPORT_KERNEL(name)                                                                                  \
+  if (par.par_LB_maxThreads[i] > 0) {                                                                              \
+    o << "#define GPUCA_" << (forRTC ? "RTC_" : "") << "LB_" << GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \
+    if (par.par_LB_minBlocks[i] > 0) {                                                                             \
+      o << ", " << par.par_LB_minBlocks[i];                                                                        \
+    }                                                                                                              \
+    if (par.par_LB_forceBlocks[i] > 0) {                                                                           \
+      o << ", " << par.par_LB_forceBlocks[i];                                                                      \
+    }                                                                                                              \
+    o << "\n";                                                                                                     \
+  }                                                                                                                \
   i++;
 
-static std::string GPUDefParametersExport(const GPUDefParameters& par)
+static std::string GPUDefParametersExport(const GPUDefParameters& par, bool forRTC)
 {
   std::stringstream o; // clang-format off
   int32_t i = 0;
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index e0c5f845a475e..7fdc7054628a6 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -203,7 +203,6 @@ AddHelp("help", 'h')
 EndConfig()
 
 #ifndef __OPENCL__
-// Settings steering the processing once the device was selected, only available on the host
 BeginSubConfig(GPUSettingsProcessingRTC, rtc, configStandalone.proc, "RTC", 0, "Processing settings", proc_rtc)
 AddOption(cacheOutput, bool, false, "", 0, "Cache RTC compilation results")
 AddOption(optConstexpr, bool, true, "", 0, "Replace constant variables by static constexpr expressions")
@@ -253,6 +252,7 @@ AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then
 AddHelp("help", 'h')
 EndConfig()
 
+// Settings steering the processing once the device was selected, only available on the host
 BeginSubConfig(GPUSettingsProcessing, proc, configStandalone, "PROC", 0, "Processing settings", proc)
 AddOption(deviceNum, int32_t, -1, "gpuDevice", 0, "Set GPU device to use (-1: automatic, -2: for round-robin usage in timeslice-pipeline)")
 AddOption(gpuDeviceOnly, bool, false, "", 0, "Use only GPU as device (i.e. no CPU for OpenCL)")

From 11b291b282be3cf8698b2ba7c28f275a497d575e Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Thu, 27 Mar 2025 23:59:52 +0100
Subject: [PATCH 2/4] GPU RTC: Disable CUDA compiler diagnostic about unused
 variables

---
 GPU/GPUTracking/Base/cuda/CMakeLists.txt | 2 +-
 GPU/GPUTracking/Base/hip/CMakeLists.txt  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt
index ae3f0ecaf69dc..843fc3464e151 100644
--- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt
+++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt
@@ -84,7 +84,7 @@ add_custom_target(${MODULE}_CUDA_SRC_CHK ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}
 
 add_custom_command(
     OUTPUT ${GPU_RTC_BIN}.command
-    COMMAND echo -n "${CMAKE_CUDA_COMPILER} -forward-unknown-to-host-compiler ${GPU_RTC_DEFINES} ${GPU_RTC_FLAGS_SEPARATED} -x cu -fatbin" > ${GPU_RTC_BIN}.command
+    COMMAND echo -n "${CMAKE_CUDA_COMPILER} -forward-unknown-to-host-compiler ${GPU_RTC_DEFINES} ${GPU_RTC_FLAGS_SEPARATED} -x cu -fatbin -Xcudafe --diag_suppress=177" > ${GPU_RTC_BIN}.command
     COMMAND_EXPAND_LISTS VERBATIM
     COMMENT "Preparing CUDA RTC command file ${GPU_RTC_BIN}.command"
 )
diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt
index eaa87d69c0de6..5796c0c48686b 100644
--- a/GPU/GPUTracking/Base/hip/CMakeLists.txt
+++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt
@@ -122,7 +122,7 @@ add_custom_target(${MODULE}_HIP_SRC_CHK ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/
 
 add_custom_command(
     OUTPUT ${GPU_RTC_BIN}.command
-    COMMAND echo -n "${CMAKE_HIP_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_FLAGS_SEPARATED} -x hip --cuda-device-only" > ${GPU_RTC_BIN}.command
+    COMMAND echo -n "${CMAKE_HIP_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_FLAGS_SEPARATED} -x hip --cuda-device-only -Wno-unused-const-variable" > ${GPU_RTC_BIN}.command
     COMMAND_EXPAND_LISTS VERBATIM
     COMMENT "Preparing HIP RTC command file ${GPU_RTC_BIN}.command"
 )

From ca0c33bb108e9757f0d7aab379d34a4c425d99a3 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Fri, 28 Mar 2025 00:00:11 +0100
Subject: [PATCH 3/4] GPU TPC Clusterizer: remove unused constexpr debug
 variables

---
 GPU/GPUTracking/TPCClusterFinder/CfConsts.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/CfConsts.h b/GPU/GPUTracking/TPCClusterFinder/CfConsts.h
index 62695f2ae30a5..c600b8f3ea8f8 100644
--- a/GPU/GPUTracking/TPCClusterFinder/CfConsts.h
+++ b/GPU/GPUTracking/TPCClusterFinder/CfConsts.h
@@ -35,11 +35,6 @@ GPUconstexpr() tpccf::Delta2 InnerNeighbors[8] =
     {1, 0},
     {1, 1}};
 
-GPUconstexpr() bool InnerTestEq[8] =
-  {
-    true, true, true, true,
-    false, false, false, false};
-
 GPUconstexpr() tpccf::Delta2 OuterNeighbors[16] =
   {
     {-2, -1},

From 7614845e0359088b40393b163833bf4c90e8dd50 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Fri, 28 Mar 2025 11:46:53 +0100
Subject: [PATCH 4/4] GPU RTC: Split options into technical and code-creation
 ones, add option to print launch-bounds used for RTC

---
 .../Base/cuda/GPUReconstructionCUDA.cu        |  6 ++---
 .../Base/cuda/GPUReconstructionCUDAGenRTC.cxx | 25 +++++++++++--------
 GPU/GPUTracking/Definitions/GPUSettingsList.h | 23 ++++++++++++++---
 .../GPUTrackingLinkDef_O2_DataTypes.h         |  1 +
 prodtests/full-system-test/dpl-workflow.sh    |  8 +++---
 5 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
index 3ca759626619b..4cfdf7febabd7 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -114,7 +114,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime()
   constexpr int32_t reqVerMaj = 2;
   constexpr int32_t reqVerMin = 0;
 #endif
-  if (mProcessingSettings.rtc.enable && mProcessingSettings.rtc.runTest == 2) {
+  if (mProcessingSettings.rtc.enable && mProcessingSettings.rtctech.runTest == 2) {
     genAndLoadRTC();
     exit(0);
   }
@@ -433,14 +433,14 @@ void GPUReconstructionCUDA::genAndLoadRTC()
     throw std::runtime_error("Runtime compilation failed");
   }
   for (uint32_t i = 0; i < nCompile; i++) {
-    if (mProcessingSettings.rtc.runTest != 2) {
+    if (mProcessingSettings.rtctech.runTest != 2) {
       mInternals->kernelModules.emplace_back(std::make_unique<CUmodule>());
       GPUChkErr(cuModuleLoad(mInternals->kernelModules.back().get(), (filename + "_" + std::to_string(i) + mRtcBinExtension).c_str()));
     }
     remove((filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str());
     remove((filename + "_" + std::to_string(i) + mRtcBinExtension).c_str());
   }
-  if (mProcessingSettings.rtc.runTest == 2) {
+  if (mProcessingSettings.rtctech.runTest == 2) {
     return;
   }
   loadKernelModules(mProcessingSettings.rtc.compilePerKernel);
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
index 31d4f54c00d10..c6bbc26977f09 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
@@ -56,10 +56,13 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
     kernelsall += kernels[i] + "\n";
   }
 
-  std::string baseCommand = (mProcessingSettings.RTCprependCommand != "" ? (mProcessingSettings.RTCprependCommand + " ") : "");
+  std::string baseCommand = (mProcessingSettings.rtctech.prependCommand != "" ? (mProcessingSettings.rtctech.prependCommand + " ") : "");
   baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len));
-  baseCommand += std::string(" ") + (mProcessingSettings.RTCoverrideArchitecture != "" ? mProcessingSettings.RTCoverrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
+  baseCommand += std::string(" ") + (mProcessingSettings.rtctech.overrideArchitecture != "" ? mProcessingSettings.rtctech.overrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
   const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true);
+  if (mProcessingSettings.rtctech.printLaunchBounds || mProcessingSettings.debugLevel >= 3) {
+    GPUInfo("RTC Launch Bounds:\n%s", launchBounds.c_str());
+  }
 
   char shasource[21], shaparam[21], shacmd[21], shakernels[21], shabounds[21];
   if (mProcessingSettings.rtc.cacheOutput) {
@@ -74,12 +77,12 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
   bool cacheLoaded = false;
   int32_t fd = 0;
   if (mProcessingSettings.rtc.cacheOutput) {
-    if (mProcessingSettings.RTCcacheFolder != ".") {
-      std::filesystem::create_directories(mProcessingSettings.RTCcacheFolder);
+    if (mProcessingSettings.rtctech.cacheFolder != ".") {
+      std::filesystem::create_directories(mProcessingSettings.rtctech.cacheFolder);
     }
-    if (mProcessingSettings.rtc.cacheMutex) {
+    if (mProcessingSettings.rtctech.cacheMutex) {
       mode_t mask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
-      fd = open((mProcessingSettings.RTCcacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask);
+      fd = open((mProcessingSettings.rtctech.cacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask);
       if (fd == -1) {
         throw std::runtime_error("Error opening rtc cache mutex lock file");
       }
@@ -89,7 +92,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
       }
     }
 
-    FILE* fp = fopen((mProcessingSettings.RTCcacheFolder + "/rtc.cuda.cache").c_str(), "rb");
+    FILE* fp = fopen((mProcessingSettings.rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "rb");
     char sharead[20];
     if (fp) {
       size_t len;
@@ -106,7 +109,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
             }
             GPUInfo("SHA for %s: expected %s, read %s", name, shaprint1, shaprint2);
           }
-          if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shacmp, 20)) {
+          if (!mProcessingSettings.rtctech.ignoreCacheValid && memcmp(sharead, shacmp, 20)) {
             GPUInfo("Cache file content outdated (%s)", name);
             return 1;
           }
@@ -124,7 +127,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
         if (fread(&cachedSettings, sizeof(cachedSettings), 1, fp) != 1) {
           throw std::runtime_error("Cache file corrupt");
         }
-        if (!mProcessingSettings.rtc.ignoreCacheValid && !(cachedSettings == mProcessingSettings.rtc)) {
+        if (!mProcessingSettings.rtctech.ignoreCacheValid && !(cachedSettings == mProcessingSettings.rtc)) {
           GPUInfo("Cache file content outdated (rtc parameters)");
           break;
         }
@@ -207,7 +210,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
       GPUInfo("RTC Compilation finished (%f seconds)", rtcTimer.GetCurrentElapsedTime());
     }
     if (mProcessingSettings.rtc.cacheOutput) {
-      FILE* fp = fopen((mProcessingSettings.RTCcacheFolder + "/rtc.cuda.cache").c_str(), "w+b");
+      FILE* fp = fopen((mProcessingSettings.rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "w+b");
       if (fp == nullptr) {
         throw std::runtime_error("Cannot open cache file for writing");
       }
@@ -245,7 +248,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
       fclose(fp);
     }
   }
-  if (mProcessingSettings.rtc.cacheOutput && mProcessingSettings.rtc.cacheMutex) {
+  if (mProcessingSettings.rtc.cacheOutput && mProcessingSettings.rtctech.cacheMutex) {
     if (lockf(fd, F_ULOCK, 0)) {
       throw std::runtime_error("Error unlocking RTC cache mutex file");
     }
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 7fdc7054628a6..d3a3fbaff16ff 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -35,6 +35,9 @@ BeginNamespace(gpu)
 
 // Settings concerning the reconstruction, stored as parameters in GPU constant memory
 // There must be no bool in here, use int8_t, as sizeof(bool) is compiler dependent and fails on GPUs!!!!!!
+// Split in different blocks for global and per Detector
+
+// Reconstruction parameters for TPC, no bool in here !!!
 BeginSubConfig(GPUSettingsRecTPC, tpc, configStandalone.rec, "RECTPC", 0, "Reconstruction settings", rec_tpc)
 AddOptionRTC(rejectQPtB5, float, 1.f / GPUCA_MIN_TRACK_PTB5_REJECT_DEFAULT, "", 0, "QPt threshold to reject clusters of TPC tracks (Inverse Pt, scaled to B=0.5T!!!)")
 AddOptionRTC(hitPickUpFactor, float, 1.f, "", 0, "multiplier for the combined cluster+track error during track following")
@@ -161,6 +164,7 @@ AddOptionArray(PID_remap, int8_t, 9, (0, 1, 2, 3, 4, 5, 6, 7, 8), "", 0, "Remap
 AddHelp("help", 'h')
 EndConfig()
 
+// Reconstruction parameters for TRD, no bool in here !!!
 BeginSubConfig(GPUSettingsRecTRD, trd, configStandalone.rec, "RECTRD", 0, "Reconstruction settings", rec_trd)
 AddOptionRTC(minTrackPt, float, .5f, "", 0, "Min Pt for tracks to be propagated through the TRD")
 AddOptionRTC(maxChi2, float, 20.f, "", 0, "Max chi2 for TRD tracklets to be matched to a track")
@@ -182,11 +186,12 @@ AddOptionRTC(pileupBwdNBC, uint8_t, 80, "", 0, "Pre-trigger Pile-up integration
 AddHelp("help", 'h')
 EndConfig()
 
-// Dynamic settings, must NOT use AddOptionRTC(...) !!!
+// Dynamic reconstruction parameters, no bool in here!!!, must NOT use AddOptionRTC(...) !!!
 BeginSubConfig(GPUSettingsRecDynamic, dyn, configStandalone.rec, "RECDYN", 0, "Reconstruction settings", rec_dyn)
 AddHelp("help", 'h')
 EndConfig()
 
+// Global reconstruction parameters, no bool in here !!!
 BeginSubConfig(GPUSettingsRec, rec, configStandalone, "REC", 0, "Reconstruction settings", rec)
 AddOptionRTC(maxTrackQPtB5, float, 1.f / GPUCA_MIN_TRACK_PTB5_DEFAULT, "", 0, "required max Q/Pt (==min Pt) of tracks")
 AddOptionRTC(fwdTPCDigitsAsClusters, uint8_t, 0, "", 0, "Forward TPC digits as clusters (if they pass the ZS threshold)")
@@ -203,6 +208,7 @@ AddHelp("help", 'h')
 EndConfig()
 
 #ifndef __OPENCL__
+// Parameters that might affect the RTC code (if these change, the cache cannot be used)
 BeginSubConfig(GPUSettingsProcessingRTC, rtc, configStandalone.proc, "RTC", 0, "Processing settings", proc_rtc)
 AddOption(cacheOutput, bool, false, "", 0, "Cache RTC compilation results")
 AddOption(optConstexpr, bool, true, "", 0, "Replace constant variables by static constexpr expressions")
@@ -210,12 +216,22 @@ AddOption(optSpecialCode, int8_t, -1, "", 0, "Insert GPUCA_RTC_SPECIAL_CODE spec
 AddOption(deterministic, bool, false, "", 0, "Compile RTC in deterministic mode, with NO_FAST_MATH flags and GPUCA_DETERMINISTIC_MODE define")
 AddOption(compilePerKernel, bool, true, "", 0, "Run one RTC compilation per kernel")
 AddOption(enable, bool, false, "", 0, "Use RTC to optimize GPU code")
+AddHelp("help", 'h')
+EndConfig()
+
+// Technical parameters for RunTimeCompilation, which do not change the RTC code
+BeginSubConfig(GPUSettingsProcessingRTCtechnical, rtctech, configStandalone.proc, "RTCTECH", 0, "Processing settings", proc_rtctech)
 AddOption(runTest, int32_t, 0, "", 0, "Do not run the actual benchmark, but just test RTC compilation (1 full test, 2 test only compilation)")
 AddOption(cacheMutex, bool, true, "", 0, "Use a file lock to serialize access to the cache folder")
 AddOption(ignoreCacheValid, bool, false, "", 0, "If set, allows to use RTC cached code files even if they are not valid for the current source code / parameters")
+AddOption(printLaunchBounds, bool, false, "", 0, "Print launch bounds used for RTC code as debugging option")
+AddOption(cacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored")
+AddOption(prependCommand, std::string, "", "", 0, "Prepend RTC compilation commands by this string")
+AddOption(overrideArchitecture, std::string, "", "", 0, "Override arhcitecture part of RTC compilation command line") // Part of cmdLine, so checked against the cache
 AddHelp("help", 'h')
 EndConfig()
 
+// Parameters that steer reconstruction that do not go to the device, or only in derrived form.
 BeginSubConfig(GPUSettingsProcessingParam, param, configStandalone.proc, "PARAM", 0, "Processing settings", proc_param)
 AddOptionArray(tpcErrorParamY, float, 4, (0.06f, 0.24f, 0.12f, 0.1f), "", 0, "TPC Cluster Y Error Parameterization")
 AddOptionArray(tpcErrorParamZ, float, 4, (0.06f, 0.24f, 0.15f, 0.1f), "", 0, "TPC Cluster Z Error Parameterization")
@@ -223,6 +239,7 @@ AddOption(tpcTriggerHandling, bool, true, "", 0, "Enable TPC trigger handling")
 AddHelp("help", 'h')
 EndConfig()
 
+// Settings steering the processing of NN Clusterization
 BeginSubConfig(GPUSettingsProcessingNNclusterizer, nn, configStandalone.proc, "NN", 0, "Processing settings for neural network clusterizer", proc_nn)
 AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural network clusterizer should be used.")
 AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)")
@@ -320,9 +337,6 @@ AddOption(tpcMaxAttachedClustersPerSectorRow, uint32_t, 51000, "", 0, "Maximum n
 AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding")
 AddOption(tpcApplyCFCutsAtDecoding, bool, false, "", 0, "Apply cluster cuts from clusterization during decoding of compressed clusters")
 AddOption(tpcApplyClusterFilterOnCPU, uint8_t, 0, "", 0, "Apply custom cluster filter of GPUTPCClusterFilter class, 0: off, 1: debug, 2: PbPb23")
-AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored")
-AddOption(RTCprependCommand, std::string, "", "", 0, "Prepend RTC compilation commands by this string")
-AddOption(RTCoverrideArchitecture, std::string, "", "", 0, "Override arhcitecture part of RTC compilation command line")
 AddOption(oclPlatformNum, int32_t, -1, "", 0, "Platform to use, in case the backend provides multiple platforms (OpenCL only, -1 = auto-select, -2 query all platforms (also incompatible))")
 AddOption(oclCompileFromSources, bool, false, "", 0, "Compile OpenCL binary from included source code instead of using included spirv code")
 AddOption(oclOverrideSourceBuildFlags, std::string, "", "", 0, "Override OCL build flags for compilation from source, put a space for empty options")
@@ -330,6 +344,7 @@ AddOption(printSettings, bool, false, "", 0, "Print all settings when initializi
 AddOption(tpcFreeAllocatedMemoryAfterProcessing, bool, false, "", 0, "Clean all memory allocated by TPC when TPC processing done, only data written to external output resources will remain")
 AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr)
 AddSubConfig(GPUSettingsProcessingRTC, rtc)
+AddSubConfig(GPUSettingsProcessingRTCtechnical, rtctech)
 AddSubConfig(GPUSettingsProcessingParam, param)
 AddSubConfig(GPUSettingsProcessingNNclusterizer, nn)
 AddHelp("help", 'h')
diff --git a/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h b/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h
index 35ebbabe41672..46fd50464c69b 100644
--- a/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h
+++ b/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h
@@ -30,6 +30,7 @@
 #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessing + ;
 #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingParam + ;
 #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingRTC + ;
+#pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingRTCtechnical + ;
 #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingNNclusterizer + ;
 #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplay + ;
 #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplayLight + ;
diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh
index 5259bbf951d73..bc92a686ab889 100755
--- a/prodtests/full-system-test/dpl-workflow.sh
+++ b/prodtests/full-system-test/dpl-workflow.sh
@@ -327,11 +327,11 @@ if has_detector_calib PHS && workflow_has_parameter CALIB; then
 fi
 
 [[ ${O2_GPU_DOUBLE_PIPELINE:-$EPNSYNCMODE} == 1 && $GPUTYPE != "CPU" ]] && GPU_CONFIG+=" --enableDoublePipeline"
-[[ ${O2_GPU_RTC:-$EPNSYNCMODE} == 1 ]] && GPU_CONFIG_KEY+="GPU_proc_rtc.enable=1;GPU_proc_rtc.cacheOutput=1;GPU_proc.RTCprependCommand=/usr/bin/env TMPDIR=/tmp /usr/bin/taskset -c 0-191;"
-[[ ${O2_GPU_RTC:-$EPNSYNCMODE} == 1 && $EPNSYNCMODE == 1 ]] && GPU_CONFIG_KEY+="GPU_proc.RTCcacheFolder=/var/tmp/o2_gpu_rtc_cache;"
+[[ ${O2_GPU_RTC:-$EPNSYNCMODE} == 1 ]] && GPU_CONFIG_KEY+="GPU_proc_rtc.enable=1;GPU_proc_rtc.cacheOutput=1;GPU_proc.RTCTECH.prependCommand=/usr/bin/env TMPDIR=/tmp /usr/bin/taskset -c 0-191;"
+[[ ${O2_GPU_RTC:-$EPNSYNCMODE} == 1 && $EPNSYNCMODE == 1 ]] && GPU_CONFIG_KEY+="GPU_proc.RTCTECH.cacheFolder=/var/tmp/o2_gpu_rtc_cache;"
 if [[ ${O2_GPU_RTC:-$EPNSYNCMODE} == 1 ]] && [[ ( ${ALICE_O2_FST:-0} == 1 && ${FST_TMUX_NO_EPN:-0} == 0 ) || $EPNSYNCMODE == 1 ]]; then
-  [[ ${EPN_NODE_MI100:-0} == 0 ]] && GPU_CONFIG_KEY+="GPU_proc.RTCoverrideArchitecture=--offload-arch=gfx906;"
-  [[ ${EPN_NODE_MI100:-0} == 1 ]] && GPU_CONFIG_KEY+="GPU_proc.RTCoverrideArchitecture=--offload-arch=gfx908;"
+  [[ ${EPN_NODE_MI100:-0} == 0 ]] && GPU_CONFIG_KEY+="GPU_proc.RTCTECH.overrideArchitecture=--offload-arch=gfx906;"
+  [[ ${EPN_NODE_MI100:-0} == 1 ]] && GPU_CONFIG_KEY+="GPU_proc.RTCTECH.overrideArchitecture=--offload-arch=gfx908;"
 fi
 
 ( workflow_has_parameter AOD || [[ -z "$DISABLE_ROOT_OUTPUT" ]] || needs_root_output o2-emcal-cell-writer-workflow ) && has_detector EMC && RAW_EMC_SUBSPEC=" --subspecification 1 "