From e972381c7a39cc91b945de343a4b7aa484bf609d Mon Sep 17 00:00:00 2001 From: David Rohr Date: Sun, 20 Apr 2025 23:01:33 +0200 Subject: [PATCH 1/2] GPU: Get rid of backendInternal additional wrapper --- GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 18 +----- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 5 +- .../GPUReconstructionProcessingKernels.inc | 9 ++- .../Base/cuda/GPUReconstructionCUDA.cu | 3 +- .../Base/cuda/GPUReconstructionCUDA.h | 7 +-- .../Base/cuda/GPUReconstructionCUDAKernels.cu | 60 +++++++++---------- ...GPUReconstructionCUDAKernelsSpecialize.inc | 12 ++-- .../Base/opencl/GPUReconstructionOCL.cxx | 2 +- .../Base/opencl/GPUReconstructionOCL.h | 4 +- .../opencl/GPUReconstructionOCLKernels.cxx | 14 +---- .../GPUReconstructionOCLKernelsSpecialize.inc | 2 +- 11 files changed, 59 insertions(+), 77 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 2453ce4a2328f..5f80a56e9e64e 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -54,7 +54,7 @@ GPUReconstructionCPU::~GPUReconstructionCPU() } template -inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +inline void GPUReconstructionCPU::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args) { auto& x = _xyz.x; auto& y = _xyz.y; @@ -88,7 +88,7 @@ inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& } template <> -inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +inline void GPUReconstructionCPU::runKernelBackend(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) { int32_t nThreads = std::max(1, std::min(size / (16 * 1024 * 1024), getNKernelHostThreads(true))); if (nThreads > 1) { @@ -108,17 +108,6 @@ inline void GPUReconstructionCPU::runKernelBackendInternal(con } } -template -void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs& args) -{ -#pragma GCC diagnostic push -#if defined(__clang__) -#pragma GCC diagnostic ignored "-Wunused-lambda-capture" // this is not alway captured below -#endif - std::apply([this, &args](auto&... vals) { runKernelBackendInternal(args.s, vals...); }, args.v); -#pragma GCC diagnostic push -} - template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu) { @@ -137,8 +126,7 @@ GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelPrope return ret; } -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ - template void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs& args); \ +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu); #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index d0d8b05c4af0e..d93d1335d45c5 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -40,7 +40,7 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface template krnlProperties getKernelProperties(int gpu = -1); template - void runKernelBackend(const krnlSetupArgs& args); + void runKernelBackend(const krnlSetupTime& _xyz, const Args&... 
args); virtual int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false); int32_t GPUStuck() { return mGPUStuck; } @@ -59,9 +59,6 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface GPUReconstructionCPU(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing::KernelInterface(cfg) {} - template - void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); - int32_t registerMemoryForGPU_internal(const void* ptr, size_t size) override { return 0; } int32_t unregisterMemoryForGPU_internal(const void* ptr) override { return 0; } diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc b/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc index 49d02515372b8..b303cb7c8d39c 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc @@ -21,6 +21,11 @@ namespace o2::gpu { +#pragma GCC diagnostic push +#if defined(__clang__) +#pragma GCC diagnostic ignored "-Wunused-lambda-capture" // this is not alway captured below +#endif + template void GPUReconstructionProcessing::KernelInterface::runKernelVirtual(const int num, const void* args) { @@ -28,7 +33,7 @@ void GPUReconstructionProcessing::KernelInterface::runKernelVirtual(const #define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, x_num) \ case x_num: { \ const auto& args2 = *(const krnlSetupArgs*)args; \ - ((T*)this)->template runKernelBackend(args2); \ + std::apply([this, &args2](auto&... vals) { ((T*)this)->template runKernelBackend(args2.s, vals...); }, args2.v); \ break; \ } #include "GPUReconstructionKernelList.h" @@ -36,6 +41,8 @@ void GPUReconstructionProcessing::KernelInterface::runKernelVirtual(const } // clang-format on } +#pragma GCC diagnostic push + } // namespace o2::gpu #endif // GPURECONSTRUCTIONPROCESSINGKERNELS_H diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 970b331ea99fb..c40c607396f3f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -24,7 +24,8 @@ #include "GPUParamRTC.h" #include "GPUReconstructionCUDAHelpers.inc" #include "GPUDefParametersLoad.inc" -#include "GPUReconstructionProcessingKernels.inc" +#include "GPUReconstructionKernelIncludes.h" +#include "GPUConstantMem.h" #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 1 #include "utils/qGetLdBinarySymbols.h" diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index ed75100dfe351..36dcdffb1c6d6 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -45,7 +45,9 @@ class GPUReconstructionCUDA : public GPUReconstructionProcessing::KernelInterfac virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override; template - void runKernelBackend(const krnlSetupArgs& args); + void runKernelBackend(const krnlSetupTime& _xyz, const Args&... args); + template + void runKernelBackendTimed(const krnlSetupTime& _xyz, const Args&... 
args); template friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp); @@ -53,9 +55,6 @@ class GPUReconstructionCUDA : public GPUReconstructionProcessing::KernelInterfac protected: GPUReconstructionCUDAInternals* mInternals; - template - void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); - int32_t InitDevice_Runtime() override; int32_t ExitDevice_Runtime() override; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index 11a62bcec2318..3267e1d5c67f6 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -23,13 +23,15 @@ using namespace o2::gpu; #include "GPUReconstructionIncludesDeviceAll.h" #include "GPUReconstructionCUDAKernelsSpecialize.inc" +#include "GPUReconstructionProcessingKernels.inc" +template void GPUReconstructionProcessing::KernelInterface::runKernelVirtual(const int num, const void* args); #if defined(__HIPCC__) && defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM) __global__ void gGPUConstantMemBuffer_dummy(int32_t* p) { *p = *(int32_t*)&gGPUConstantMemBuffer; } #endif template -inline void GPUReconstructionCUDA::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +inline void GPUReconstructionCUDA::runKernelBackendTimed(const krnlSetupTime& _xyz, const Args&... args) { #if !defined(GPUCA_KERNEL_COMPILE_MODE) || GPUCA_KERNEL_COMPILE_MODE != 1 if (!GetProcessingSettings().rtc.enable) { @@ -52,18 +54,18 @@ inline void GPUReconstructionCUDA::runKernelBackendInternal(const krnlSetupTime& } template -void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& args) +inline void GPUReconstructionCUDA::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args) { - auto& x = args.s.x; - auto& z = args.s.z; + auto& x = _xyz.x; + auto& z = _xyz.z; if (z.evList) { for (int32_t k = 0; k < z.nEvents; k++) { GPUChkErr(cudaStreamWaitEvent(mInternals->Streams[x.stream], ((cudaEvent_t*)z.evList)[k], 0)); } } { - GPUDebugTiming timer(GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0, (deviceEvent*)mDebugEvents, mInternals->Streams, args.s, this); - std::apply([this, &args](auto&... vals) { this->runKernelBackendInternal(args.s, vals...); }, args.v); + GPUDebugTiming timer(GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0, (deviceEvent*)mDebugEvents, mInternals->Streams, _xyz, this); + runKernelBackendTimed(_xyz, args...); } GPUChkErr(cudaGetLastError()); if (z.ev) { @@ -74,31 +76,29 @@ void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& #undef GPUCA_KRNL_REG #define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))) -#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 1 // ---------- COMPILE_MODE = perkernel ---------- -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& args); -#else // ---------- COMPILE_MODE = onefile | rdc ---------- -#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2 -#define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc -#endif - -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) 
\ - GPUCA_KRNL_HOST(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) \ - template void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& args); - -#ifndef __HIPCC__ // CUDA version -#define GPUCA_KRNL_CALL(x_class, ...) \ - GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))<<mInternals->Streams[x.stream]>>>(GPUCA_CONSMEM_CALL y.index, args...); -#else // HIP version -#undef GPUCA_KRNL_CUSTOM -#define GPUCA_KRNL_CUSTOM(args) GPUCA_M_STRIP(args) -#define GPUCA_KRNL_CALL(x_class, ...) \ - hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->Streams[x.stream], GPUCA_CONSMEM_CALL y.index, args...); -#endif // __HIPCC__ - +// clang-format off +#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE != 1 // ---------- COMPILE_MODE = perkernel ---------- + #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2 + #define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc + #endif + + #define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ + GPUCA_KRNL_HOST(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) + + #ifndef __HIPCC__ // CUDA version + #define GPUCA_KRNL_CALL(x_class, ...) \ + GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))<<mInternals->Streams[x.stream]>>>(GPUCA_CONSMEM_CALL y.index, args...); + #else // HIP version + #undef GPUCA_KRNL_CUSTOM + #define GPUCA_KRNL_CUSTOM(args) GPUCA_M_STRIP(args) + #define GPUCA_KRNL_CALL(x_class, ...) \ + hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->Streams[x.stream], GPUCA_CONSMEM_CALL y.index, args...); + #endif // __HIPCC__ + + #include "GPUReconstructionKernelList.h" + #undef GPUCA_KRNL #endif // ---------- COMPILE_MODE = onefile | rdc ---------- - -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL +// clang-format on #ifndef GPUCA_NO_CONSTANT_MEMORY static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() { diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc index 899c2e240cd94..8796f063abdc5 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc @@ -97,7 +97,7 @@ struct GPUTPCGMO2OutputSort_comp { } // namespace o2::gpu::internal template <> -inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax) +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax) { if (cmpMax) { GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMax()); @@ -107,32 +107,32 @@ inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInter } template <> -inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz) { GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), 
GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); } template <> -inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz) { GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); } template <> -inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz) { GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.LooperCandidates(), processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp()); } template <> -inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz) { GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSortO2(), processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp()); } #endif // GPUCA_SPECIALIZE_THRUST_SORTS template <> -inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) { GPUChkErr(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream])); } diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx index 28c809dd4a09a..949dd6195b262 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx @@ -13,8 +13,8 @@ /// \author David Rohr #include "GPUReconstructionOCLIncludesHost.h" -#include "GPUReconstructionProcessingKernels.inc" #include "GPUDefParametersLoad.inc" +#include "GPUConstantMem.h" #include diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h index 091bc0409630d..958d5186bf41a 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h @@ -34,7 +34,7 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface ~GPUReconstructionOCL() override; template - void runKernelBackend(const krnlSetupArgs& args); + void runKernelBackend(const krnlSetupTime& _xyz, const Args&... args); protected: int32_t InitDevice_Runtime() override; @@ -57,8 +57,6 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface template int32_t AddKernel(); - template - void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... 
args); GPUReconstructionOCLInternals* mInternals; float mOclVersion; diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx index 72c68428149dd..655df5404276b 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx @@ -16,9 +16,11 @@ #include "GPUReconstructionKernelIncludes.h" #include "GPUReconstructionOCLKernelsSpecialize.inc" +#include "GPUReconstructionProcessingKernels.inc" +template void GPUReconstructionProcessing::KernelInterface::runKernelVirtual(const int num, const void* args); template -inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +inline void GPUReconstructionOCL::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args) { cl_kernel k = getKernelObject(); auto& x = _xyz.x; @@ -48,12 +50,6 @@ inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& } } -template -void GPUReconstructionOCL::runKernelBackend(const krnlSetupArgs& args) -{ - std::apply([this, &args](auto&... vals) { runKernelBackendInternal(args.s, vals...); }, args.v); -} - template int32_t GPUReconstructionOCL::AddKernel() { @@ -86,7 +82,3 @@ int32_t GPUReconstructionOCL::AddKernels() #undef GPUCA_KRNL return 0; } - -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionOCL::runKernelBackend(const krnlSetupArgs& args); -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc index 1b860e47a4243..d5b0338aecbd9 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc @@ -13,7 +13,7 @@ /// \author David Rohr template <> -inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +inline void GPUReconstructionOCL::runKernelBackend(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) { cl_int4 val0 = {0, 0, 0, 0}; GPUChkErr(clEnqueueFillBuffer(mInternals->command_queue[_xyz.x.stream], mInternals->mem_gpu, &val0, sizeof(val0), (char*)ptr - (char*)mDeviceMemoryBase, (size + sizeof(val0) - 1) & ~(sizeof(val0) - 1), _xyz.z.evList == nullptr ? 0 : _xyz.z.nEvents, _xyz.z.evList->getEventList(), _xyz.z.ev->getEventList())); From 4f3e811d2f5e552fdc60e412c974fe2b1e793a6f Mon Sep 17 00:00:00 2001 From: David Rohr Date: Sun, 20 Apr 2025 23:47:46 +0200 Subject: [PATCH 2/2] GPU: Fix onefile compile mode after all the refactoring --- .../Base/GPUReconstructionKernelMacros.h | 14 -------------- .../Base/cuda/GPUReconstructionCUDA.h | 2 ++ .../Base/cuda/GPUReconstructionCUDAKernels.cu | 17 +++++++++++++---- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h b/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h index 2b16dfb32fe14..a03d9de13ef8f 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h +++ b/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h @@ -62,20 +62,6 @@ } #endif -// GPU Host wrappers for kernel -#define GPUCA_KRNL_HOST(x_class, ...) 
\ - GPUCA_KRNLGPU(x_class, __VA_ARGS__) \ - template <> class GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::backendInternal { \ - public: \ - template \ - static inline void runKernelBackendMacro(const krnlSetupTime& _xyz, T* me, const Args&... args) \ - { \ - auto& x = _xyz.x; \ - auto& y = _xyz.y; \ - GPUCA_KRNL_CALL(x_class, __VA_ARGS__) \ - } \ - }; - #endif // GPUCA_GPUCODE #define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class))), GPUCA_M_STRIP(x_attributes)), __VA_ARGS__) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index 36dcdffb1c6d6..6c126d153d8ae 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -48,6 +48,8 @@ class GPUReconstructionCUDA : public GPUReconstructionProcessing::KernelInterfac void runKernelBackend(const krnlSetupTime& _xyz, const Args&... args); template void runKernelBackendTimed(const krnlSetupTime& _xyz, const Args&... args); + template + struct kernelBackendMacro; template friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index 3267e1d5c67f6..e6ed94bba2cec 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -35,7 +35,7 @@ inline void GPUReconstructionCUDA::runKernelBackendTimed(const krnlSetupTime& _x { #if !defined(GPUCA_KERNEL_COMPILE_MODE) || GPUCA_KERNEL_COMPILE_MODE != 1 if (!GetProcessingSettings().rtc.enable) { - backendInternal::runKernelBackendMacro(_xyz, this, args...); + kernelBackendMacro::run(_xyz, this, args...); } else #endif { @@ -82,9 +82,6 @@ inline void GPUReconstructionCUDA::runKernelBackend(const krnlSetupTime& _xyz, c #define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc #endif - #define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ - GPUCA_KRNL_HOST(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) - #ifndef __HIPCC__ // CUDA version #define GPUCA_KRNL_CALL(x_class, ...) \ GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))<<mInternals->Streams[x.stream]>>>(GPUCA_CONSMEM_CALL y.index, args...); @@ -95,6 +92,18 @@ inline void GPUReconstructionCUDA::runKernelBackend(const krnlSetupTime& _xyz, c hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->Streams[x.stream], GPUCA_CONSMEM_CALL y.index, args...); #endif // __HIPCC__ + #define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ + GPUCA_KRNLGPU(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) \ + template <> struct GPUReconstructionCUDA::kernelBackendMacro { \ + template \ + static inline void run(const GPUReconstructionProcessing::krnlSetupTime& _xyz, auto* me, const Args&... args) \ + { \ + auto& x = _xyz.x; \ + auto& y = _xyz.y; \ + GPUCA_KRNL_CALL(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) \ + } \ + }; + #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL #endif // ---------- COMPILE_MODE = onefile | rdc ----------
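
For context on the calling convention both patches converge on: runKernelVirtual now unpacks the argument tuple stored in krnlSetupArgs exactly once via std::apply and forwards it straight to the variadic runKernelBackend of the concrete backend, so the former runKernelBackendInternal layer and the krnlSetupArgs wrapper overload are no longer needed. The sketch below is a minimal, self-contained approximation of that pattern, not the O2 code: krnlSetupTime is flattened here (the real one wraps the x/y/z launch sub-structures), BackendCPU is a hypothetical stand-in for GPUReconstructionCPU/CUDA/OCL, and the kernel launch itself is replaced by a printout.

// Minimal sketch (not the O2 implementation) of the dispatch pattern used in the
// patch: the argument tuple is expanded once with std::apply and passed directly
// to the variadic runKernelBackend of the concrete backend.
#include <cstdint>
#include <cstdio>
#include <tuple>
#include <utility>

struct krnlSetupTime { // simplified, flattened launch configuration
  int32_t nBlocks = 1;
  int32_t nThreads = 1;
  int32_t stream = 0;
};

template <typename... Args>
struct krnlSetupArgs { // launch configuration plus kernel arguments as a tuple
  krnlSetupTime s;
  std::tuple<Args...> v;
};

struct KernelInterface {
  template <typename T, typename... Args>
  void runKernelVirtual(T* backend, const krnlSetupArgs<Args...>& args)
  {
    // Single unpacking point: expand the tuple and call the backend with plain
    // variadic arguments, as the refactored runKernelVirtual does.
    std::apply([backend, &args](const auto&... vals) { backend->runKernelBackend(args.s, vals...); }, args.v);
  }
};

struct BackendCPU : KernelInterface { // hypothetical stand-in for a concrete backend
  template <typename... Args>
  void runKernelBackend(const krnlSetupTime& x, const Args&... args)
  {
    // A real backend would launch the kernel here (CPU loop, CUDA <<<...>>>, clEnqueue...).
    std::printf("launch: %d blocks x %d threads on stream %d with %zu argument(s)\n",
                x.nBlocks, x.nThreads, x.stream, sizeof...(Args));
    ((void)args, ...); // placeholder body: silence unused-parameter warnings
  }
};

int main()
{
  BackendCPU cpu;
  krnlSetupArgs<void*, uint64_t> clearArgs{{32, 256, 0}, {nullptr, 1024}};
  cpu.runKernelVirtual(&cpu, clearArgs); // dispatch through the shared interface
}

Keeping the std::apply unpacking in runKernelVirtual means each backend only has to provide the plain variadic entry point, which is what makes the per-backend runKernelBackendInternal wrappers removable.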