From e972381c7a39cc91b945de343a4b7aa484bf609d Mon Sep 17 00:00:00 2001 From: David Rohr Date: Sun, 20 Apr 2025 23:01:33 +0200 Subject: [PATCH 1/2] GPU: Get rid of backendInternal additional wrapper --- GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 18 +----- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 5 +- .../GPUReconstructionProcessingKernels.inc | 9 ++- .../Base/cuda/GPUReconstructionCUDA.cu | 3 +- .../Base/cuda/GPUReconstructionCUDA.h | 7 +-- .../Base/cuda/GPUReconstructionCUDAKernels.cu | 60 +++++++++---------- ...GPUReconstructionCUDAKernelsSpecialize.inc | 12 ++-- .../Base/opencl/GPUReconstructionOCL.cxx | 2 +- .../Base/opencl/GPUReconstructionOCL.h | 4 +- .../opencl/GPUReconstructionOCLKernels.cxx | 14 +---- .../GPUReconstructionOCLKernelsSpecialize.inc | 2 +- 11 files changed, 59 insertions(+), 77 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 2453ce4a2328f..5f80a56e9e64e 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -54,7 +54,7 @@ GPUReconstructionCPU::~GPUReconstructionCPU() } template -inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +inline void GPUReconstructionCPU::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args) { auto& x = _xyz.x; auto& y = _xyz.y; @@ -88,7 +88,7 @@ inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& } template <> -inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +inline void GPUReconstructionCPU::runKernelBackend(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) { int32_t nThreads = std::max(1, std::min(size / (16 * 1024 * 1024), getNKernelHostThreads(true))); if (nThreads > 1) { @@ -108,17 +108,6 @@ inline void GPUReconstructionCPU::runKernelBackendInternal(con } } -template -void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs& args) -{ -#pragma GCC diagnostic push -#if defined(__clang__) -#pragma GCC diagnostic ignored "-Wunused-lambda-capture" // this is not alway captured below -#endif - std::apply([this, &args](auto&... vals) { runKernelBackendInternal(args.s, vals...); }, args.v); -#pragma GCC diagnostic push -} - template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu) { @@ -137,8 +126,7 @@ GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelPrope return ret; } -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ - template void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs& args); \ +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu); #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index d0d8b05c4af0e..d93d1335d45c5 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -40,7 +40,7 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface template krnlProperties getKernelProperties(int gpu = -1); template - void runKernelBackend(const krnlSetupArgs& args); + void runKernelBackend(const krnlSetupTime& _xyz, const Args&... 
args); virtual int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false); int32_t GPUStuck() { return mGPUStuck; } @@ -59,9 +59,6 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface GPUReconstructionCPU(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing::KernelInterface(cfg) {} - template - void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); - int32_t registerMemoryForGPU_internal(const void* ptr, size_t size) override { return 0; } int32_t unregisterMemoryForGPU_internal(const void* ptr) override { return 0; } diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc b/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc index 49d02515372b8..b303cb7c8d39c 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc @@ -21,6 +21,11 @@ namespace o2::gpu { +#pragma GCC diagnostic push +#if defined(__clang__) +#pragma GCC diagnostic ignored "-Wunused-lambda-capture" // this is not alway captured below +#endif + template void GPUReconstructionProcessing::KernelInterface::runKernelVirtual(const int num, const void* args) { @@ -28,7 +33,7 @@ void GPUReconstructionProcessing::KernelInterface::runKernelVirtual(const #define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, x_num) \ case x_num: { \ const auto& args2 = *(const krnlSetupArgs*)args; \ - ((T*)this)->template runKernelBackend(args2); \ + std::apply([this, &args2](auto&... vals) { ((T*)this)->template runKernelBackend(args2.s, vals...); }, args2.v); \ break; \ } #include "GPUReconstructionKernelList.h" @@ -36,6 +41,8 @@ void GPUReconstructionProcessing::KernelInterface::runKernelVirtual(const } // clang-format on } +#pragma GCC diagnostic push + } // namespace o2::gpu #endif // GPURECONSTRUCTIONPROCESSINGKERNELS_H diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 970b331ea99fb..c40c607396f3f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -24,7 +24,8 @@ #include "GPUParamRTC.h" #include "GPUReconstructionCUDAHelpers.inc" #include "GPUDefParametersLoad.inc" -#include "GPUReconstructionProcessingKernels.inc" +#include "GPUReconstructionKernelIncludes.h" +#include "GPUConstantMem.h" #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 1 #include "utils/qGetLdBinarySymbols.h" diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index ed75100dfe351..36dcdffb1c6d6 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -45,7 +45,9 @@ class GPUReconstructionCUDA : public GPUReconstructionProcessing::KernelInterfac virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override; template - void runKernelBackend(const krnlSetupArgs& args); + void runKernelBackend(const krnlSetupTime& _xyz, const Args&... args); + template + void runKernelBackendTimed(const krnlSetupTime& _xyz, const Args&... 
args); template friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp); @@ -53,9 +55,6 @@ class GPUReconstructionCUDA : public GPUReconstructionProcessing::KernelInterfac protected: GPUReconstructionCUDAInternals* mInternals; - template - void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); - int32_t InitDevice_Runtime() override; int32_t ExitDevice_Runtime() override; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index 11a62bcec2318..3267e1d5c67f6 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -23,13 +23,15 @@ using namespace o2::gpu; #include "GPUReconstructionIncludesDeviceAll.h" #include "GPUReconstructionCUDAKernelsSpecialize.inc" +#include "GPUReconstructionProcessingKernels.inc" +template void GPUReconstructionProcessing::KernelInterface::runKernelVirtual(const int num, const void* args); #if defined(__HIPCC__) && defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM) __global__ void gGPUConstantMemBuffer_dummy(int32_t* p) { *p = *(int32_t*)&gGPUConstantMemBuffer; } #endif template -inline void GPUReconstructionCUDA::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +inline void GPUReconstructionCUDA::runKernelBackendTimed(const krnlSetupTime& _xyz, const Args&... args) { #if !defined(GPUCA_KERNEL_COMPILE_MODE) || GPUCA_KERNEL_COMPILE_MODE != 1 if (!GetProcessingSettings().rtc.enable) { @@ -52,18 +54,18 @@ inline void GPUReconstructionCUDA::runKernelBackendInternal(const krnlSetupTime& } template -void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& args) +inline void GPUReconstructionCUDA::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args) { - auto& x = args.s.x; - auto& z = args.s.z; + auto& x = _xyz.x; + auto& z = _xyz.z; if (z.evList) { for (int32_t k = 0; k < z.nEvents; k++) { GPUChkErr(cudaStreamWaitEvent(mInternals->Streams[x.stream], ((cudaEvent_t*)z.evList)[k], 0)); } } { - GPUDebugTiming timer(GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0, (deviceEvent*)mDebugEvents, mInternals->Streams, args.s, this); - std::apply([this, &args](auto&... vals) { this->runKernelBackendInternal(args.s, vals...); }, args.v); + GPUDebugTiming timer(GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0, (deviceEvent*)mDebugEvents, mInternals->Streams, _xyz, this); + runKernelBackendTimed(_xyz, args...); } GPUChkErr(cudaGetLastError()); if (z.ev) { @@ -74,31 +76,29 @@ void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& #undef GPUCA_KRNL_REG #define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))) -#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 1 // ---------- COMPILE_MODE = perkernel ---------- -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& args); -#else // ---------- COMPILE_MODE = onefile | rdc ---------- -#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2 -#define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc -#endif - -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) 
\ - GPUCA_KRNL_HOST(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) \ - template void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs& args); - -#ifndef __HIPCC__ // CUDA version -#define GPUCA_KRNL_CALL(x_class, ...) \ - GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))<<mInternals->Streams[x.stream]>>>(GPUCA_CONSMEM_CALL y.index, args...); -#else // HIP version -#undef GPUCA_KRNL_CUSTOM -#define GPUCA_KRNL_CUSTOM(args) GPUCA_M_STRIP(args) -#define GPUCA_KRNL_CALL(x_class, ...) \ - hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->Streams[x.stream], GPUCA_CONSMEM_CALL y.index, args...); -#endif // __HIPCC__ - +// clang-format off +#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE != 1 // ---------- COMPILE_MODE = perkernel ---------- + #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2 + #define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc + #endif + + #define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ + GPUCA_KRNL_HOST(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) + + #ifndef __HIPCC__ // CUDA version + #define GPUCA_KRNL_CALL(x_class, ...) \ + GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))<<mInternals->Streams[x.stream]>>>(GPUCA_CONSMEM_CALL y.index, args...); + #else // HIP version + #undef GPUCA_KRNL_CUSTOM + #define GPUCA_KRNL_CUSTOM(args) GPUCA_M_STRIP(args) + #define GPUCA_KRNL_CALL(x_class, ...) \ + hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->Streams[x.stream], GPUCA_CONSMEM_CALL y.index, args...); + #endif // __HIPCC__ + + #include "GPUReconstructionKernelList.h" + #undef GPUCA_KRNL #endif // ---------- COMPILE_MODE = onefile | rdc ---------- - -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL +// clang-format on #ifndef GPUCA_NO_CONSTANT_MEMORY static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() { diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc index 899c2e240cd94..8796f063abdc5 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc @@ -97,7 +97,7 @@ struct GPUTPCGMO2OutputSort_comp { } // namespace o2::gpu::internal template <> -inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax) +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax) { if (cmpMax) { GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMax()); @@ -107,32 +107,32 @@ inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInter } template <> -inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz) { GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), 
GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); } template <> -inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz) { GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); } template <> -inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz) { GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.LooperCandidates(), processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp()); } template <> -inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz) +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz) { GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSortO2(), processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp()); } #endif // GPUCA_SPECIALIZE_THRUST_SORTS template <> -inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) { GPUChkErr(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream])); } diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx index 28c809dd4a09a..949dd6195b262 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx @@ -13,8 +13,8 @@ /// \author David Rohr #include "GPUReconstructionOCLIncludesHost.h" -#include "GPUReconstructionProcessingKernels.inc" #include "GPUDefParametersLoad.inc" +#include "GPUConstantMem.h" #include diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h index 091bc0409630d..958d5186bf41a 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h @@ -34,7 +34,7 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface ~GPUReconstructionOCL() override; template - void runKernelBackend(const krnlSetupArgs& args); + void runKernelBackend(const krnlSetupTime& _xyz, const Args&... args); protected: int32_t InitDevice_Runtime() override; @@ -57,8 +57,6 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface template int32_t AddKernel(); - template - void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... 
args); GPUReconstructionOCLInternals* mInternals; float mOclVersion; diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx index 72c68428149dd..655df5404276b 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx @@ -16,9 +16,11 @@ #include "GPUReconstructionKernelIncludes.h" #include "GPUReconstructionOCLKernelsSpecialize.inc" +#include "GPUReconstructionProcessingKernels.inc" +template void GPUReconstructionProcessing::KernelInterface::runKernelVirtual(const int num, const void* args); template -inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +inline void GPUReconstructionOCL::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args) { cl_kernel k = getKernelObject(); auto& x = _xyz.x; @@ -48,12 +50,6 @@ inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& } } -template -void GPUReconstructionOCL::runKernelBackend(const krnlSetupArgs& args) -{ - std::apply([this, &args](auto&... vals) { runKernelBackendInternal(args.s, vals...); }, args.v); -} - template int32_t GPUReconstructionOCL::AddKernel() { @@ -86,7 +82,3 @@ int32_t GPUReconstructionOCL::AddKernels() #undef GPUCA_KRNL return 0; } - -#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionOCL::runKernelBackend(const krnlSetupArgs& args); -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc index 1b860e47a4243..d5b0338aecbd9 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc @@ -13,7 +13,7 @@ /// \author David Rohr template <> -inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +inline void GPUReconstructionOCL::runKernelBackend(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) { cl_int4 val0 = {0, 0, 0, 0}; GPUChkErr(clEnqueueFillBuffer(mInternals->command_queue[_xyz.x.stream], mInternals->mem_gpu, &val0, sizeof(val0), (char*)ptr - (char*)mDeviceMemoryBase, (size + sizeof(val0) - 1) & ~(sizeof(val0) - 1), _xyz.z.evList == nullptr ? 0 : _xyz.z.nEvents, _xyz.z.evList->getEventList(), _xyz.z.ev->getEventList())); From 4f3e811d2f5e552fdc60e412c974fe2b1e793a6f Mon Sep 17 00:00:00 2001 From: David Rohr Date: Sun, 20 Apr 2025 23:47:46 +0200 Subject: [PATCH 2/2] GPU: Fix onefile compile mode after all the refactoring --- .../Base/GPUReconstructionKernelMacros.h | 14 -------------- .../Base/cuda/GPUReconstructionCUDA.h | 2 ++ .../Base/cuda/GPUReconstructionCUDAKernels.cu | 17 +++++++++++++---- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h b/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h index 2b16dfb32fe14..a03d9de13ef8f 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h +++ b/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h @@ -62,20 +62,6 @@ } #endif -// GPU Host wrappers for kernel -#define GPUCA_KRNL_HOST(x_class, ...) 
\ - GPUCA_KRNLGPU(x_class, __VA_ARGS__) \ - template <> class GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::backendInternal { \ - public: \ - template \ - static inline void runKernelBackendMacro(const krnlSetupTime& _xyz, T* me, const Args&... args) \ - { \ - auto& x = _xyz.x; \ - auto& y = _xyz.y; \ - GPUCA_KRNL_CALL(x_class, __VA_ARGS__) \ - } \ - }; - #endif // GPUCA_GPUCODE #define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class))), GPUCA_M_STRIP(x_attributes)), __VA_ARGS__) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index 36dcdffb1c6d6..6c126d153d8ae 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -48,6 +48,8 @@ class GPUReconstructionCUDA : public GPUReconstructionProcessing::KernelInterfac void runKernelBackend(const krnlSetupTime& _xyz, const Args&... args); template void runKernelBackendTimed(const krnlSetupTime& _xyz, const Args&... args); + template + struct kernelBackendMacro; template friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index 3267e1d5c67f6..e6ed94bba2cec 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -35,7 +35,7 @@ inline void GPUReconstructionCUDA::runKernelBackendTimed(const krnlSetupTime& _x { #if !defined(GPUCA_KERNEL_COMPILE_MODE) || GPUCA_KERNEL_COMPILE_MODE != 1 if (!GetProcessingSettings().rtc.enable) { - backendInternal::runKernelBackendMacro(_xyz, this, args...); + kernelBackendMacro::run(_xyz, this, args...); } else #endif { @@ -82,9 +82,6 @@ inline void GPUReconstructionCUDA::runKernelBackend(const krnlSetupTime& _xyz, c #define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc #endif - #define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ - GPUCA_KRNL_HOST(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) - #ifndef __HIPCC__ // CUDA version #define GPUCA_KRNL_CALL(x_class, ...) \ GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))<<mInternals->Streams[x.stream]>>>(GPUCA_CONSMEM_CALL y.index, args...); @@ -95,6 +92,18 @@ inline void GPUReconstructionCUDA::runKernelBackend(const krnlSetupTime& _xyz, c hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->Streams[x.stream], GPUCA_CONSMEM_CALL y.index, args...); #endif // __HIPCC__ + #define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \ + GPUCA_KRNLGPU(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) \ + template <> struct GPUReconstructionCUDA::kernelBackendMacro { \ + template \ + static inline void run(const GPUReconstructionProcessing::krnlSetupTime& _xyz, auto* me, const Args&... args) \ + { \ + auto& x = _xyz.x; \ + auto& y = _xyz.y; \ + GPUCA_KRNL_CALL(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) \ + } \ + }; + #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL #endif // ---------- COMPILE_MODE = onefile | rdc ----------
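
For context on the calling convention both patches converge on: runKernelVirtual now unpacks the argument tuple stored in krnlSetupArgs exactly once via std::apply and forwards it straight to the variadic runKernelBackend of the concrete backend, so the former runKernelBackendInternal layer and the krnlSetupArgs wrapper overload are no longer needed. The sketch below is a minimal, self-contained approximation of that pattern, not the O2 code: krnlSetupTime is flattened here (the real one wraps the x/y/z launch sub-structures), BackendCPU is a hypothetical stand-in for GPUReconstructionCPU/CUDA/OCL, and the kernel launch itself is replaced by a printout.

// Minimal sketch (not the O2 implementation) of the dispatch pattern used in the
// patch: the argument tuple is expanded once with std::apply and passed directly
// to the variadic runKernelBackend of the concrete backend.
#include <cstdint>
#include <cstdio>
#include <tuple>
#include <utility>

struct krnlSetupTime { // simplified, flattened launch configuration
  int32_t nBlocks = 1;
  int32_t nThreads = 1;
  int32_t stream = 0;
};

template <typename... Args>
struct krnlSetupArgs { // launch configuration plus kernel arguments as a tuple
  krnlSetupTime s;
  std::tuple<Args...> v;
};

struct KernelInterface {
  template <typename T, typename... Args>
  void runKernelVirtual(T* backend, const krnlSetupArgs<Args...>& args)
  {
    // Single unpacking point: expand the tuple and call the backend with plain
    // variadic arguments, as the refactored runKernelVirtual does.
    std::apply([backend, &args](const auto&... vals) { backend->runKernelBackend(args.s, vals...); }, args.v);
  }
};

struct BackendCPU : KernelInterface { // hypothetical stand-in for a concrete backend
  template <typename... Args>
  void runKernelBackend(const krnlSetupTime& x, const Args&... args)
  {
    // A real backend would launch the kernel here (CPU loop, CUDA <<<...>>>, clEnqueue...).
    std::printf("launch: %d blocks x %d threads on stream %d with %zu argument(s)\n",
                x.nBlocks, x.nThreads, x.stream, sizeof...(Args));
    ((void)args, ...); // placeholder body: silence unused-parameter warnings
  }
};

int main()
{
  BackendCPU cpu;
  krnlSetupArgs<void*, uint64_t> clearArgs{{32, 256, 0}, {nullptr, 1024}};
  cpu.runKernelVirtual(&cpu, clearArgs); // dispatch through the shared interface
}

Keeping the std::apply unpacking in runKernelVirtual means each backend only has to provide the plain variadic entry point, which is what makes the per-backend runKernelBackendInternal wrappers removable.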