diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
index d95a57c8f2063..969dd06d6297e 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -56,7 +56,7 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
 }
 
 template
-inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
+inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
 {
   auto& x = _xyz.x;
   auto& y = _xyz.y;
@@ -90,11 +90,10 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
       }
     }
   }
-  return 0;
 }
 
 template <>
-inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
+inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
 {
   int32_t nnThreads = std::max(1, std::min(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));
   if (nnThreads > 1) {
@@ -112,13 +111,12 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal
 
 template
-int32_t GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args)
+void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args)
 {
-  return std::apply([this, &args](auto&... vals) { return runKernelBackendInternal(args.s, vals...); }, args.v);
+  std::apply([this, &args](auto&... vals) { runKernelBackendInternal(args.s, vals...); }, args.v);
 }
 
 template
@@ -127,8 +125,8 @@ krnlProperties GPUReconstructionCPUBackend::getKernelPropertiesBackend()
   return krnlProperties{1, 1};
 }
 
-#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \
-  template int32_t GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args); \
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \
+  template void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs& args); \
   template krnlProperties GPUReconstructionCPUBackend::getKernelPropertiesBackend();
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h
index b6225999c68a0..7901c34866c66 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h
@@ -35,9 +35,9 @@ class GPUReconstructionCPUBackend : public GPUReconstructionProcessing
 protected:
   GPUReconstructionCPUBackend(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing(cfg) {}
   template
-  int32_t runKernelBackend(const gpu_reconstruction_kernels::krnlSetupArgs& args);
+  void runKernelBackend(const gpu_reconstruction_kernels::krnlSetupArgs& args);
   template
-  int32_t runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args);
+  void runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args);
   template
   gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend();
 };
@@ -53,7 +53,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels
-  int32_t runKernel(krnlSetup&& setup, Args&&... args);
+  void runKernel(krnlSetup&& setup, Args&&... args);
   template
   const gpu_reconstruction_kernels::krnlProperties getKernelProperties()
   {
@@ -77,14 +77,14 @@ class GPUReconstructionCPU : public GPUReconstructionKernels
-#define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types) \
-  inline int32_t runKernelImplWrapper(gpu_reconstruction_kernels::classArgument, bool cpuFallback, double& timer, krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \
-  { \
-    if (cpuFallback) { \
-      return GPUReconstructionCPU::runKernelImpl(krnlSetupArgs(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward))); \
-    } else { \
-      return runKernelImpl(krnlSetupArgs(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward))); \
-    } \
+#define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types) \
+  inline void runKernelImplWrapper(gpu_reconstruction_kernels::classArgument, bool cpuFallback, double& timer, krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \
+  { \
+    if (cpuFallback) { \
+      GPUReconstructionCPU::runKernelImpl(krnlSetupArgs(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward))); \
+    } else { \
+      runKernelImpl(krnlSetupArgs(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward))); \
+    } \
   }
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
@@ -131,7 +131,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels
-inline int32_t GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args)
+inline void GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args)
 {
   HighResTimer* t = nullptr;
   GPUCA_RECO_STEP myStep = S::GetRecoStep() == GPUCA_RECO_STEP::NoRecoStep ? setup.x.step : S::GetRecoStep();
@@ -164,7 +164,7 @@ inline int32_t GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args
     GPUInfo("Running kernel %s (Stream %d, Range %d/%d, Grid %d/%d) on %s", GetKernelName(), stream, setup.y.start, setup.y.num, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : cpuFallback ? "CPU (fallback)" : mDeviceName.c_str());
   }
   if (nThreads == 0 || nBlocks == 0) {
-    return 0;
+    return;
   }
   if (mProcessingSettings.debugLevel >= 1) {
     t = &getKernelTimer(myStep, !IsGPU() || cpuFallback ? getHostThreadIndex() : stream);
@@ -173,7 +173,7 @@ inline int32_t GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args
     }
   }
   double deviceTimerTime = 0.;
-  int32_t retVal = runKernelImplWrapper(gpu_reconstruction_kernels::classArgument(), cpuFallback, deviceTimerTime, std::forward(setup), std::forward(args)...);
+  runKernelImplWrapper(gpu_reconstruction_kernels::classArgument(), cpuFallback, deviceTimerTime, std::forward(setup), std::forward(args)...);
   if (GPUDebug(GetKernelName(), stream, mProcessingSettings.serializeGPU & 1)) {
     throw std::runtime_error("kernel failure");
   }
@@ -192,7 +192,6 @@ inline int32_t GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args
       throw std::runtime_error("kernel error code");
     }
   }
-  return retVal;
 }
 
 } // namespace o2::gpu
diff --git a/GPU/GPUTracking/Base/GPUReconstructionKernels.h b/GPU/GPUTracking/Base/GPUReconstructionKernels.h
index e95a59df6cfd5..d541e36a06af9 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionKernels.h
+++ b/GPU/GPUTracking/Base/GPUReconstructionKernels.h
@@ -99,9 +99,9 @@ class GPUReconstructionKernels : public T
   using krnlSetupArgs = gpu_reconstruction_kernels::krnlSetupArgs;
 
 #define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types) \
-  virtual int32_t runKernelImpl(const krnlSetupArgs& args) \
+  virtual void runKernelImpl(const krnlSetupArgs& args) \
   { \
-    return T::template runKernelBackend(args); \
+    T::template runKernelBackend(args); \
   } \
   virtual gpu_reconstruction_kernels::krnlProperties getKernelPropertiesImpl(gpu_reconstruction_kernels::classArgument) \
   { \
diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx
index 18662870ed45e..51da17fe58628 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx
@@ -45,9 +45,14 @@ void GPUReconstructionProcessing::SetNActiveThreads(int32_t n)
 
 void GPUReconstructionProcessing::runParallelOuterLoop(bool doGPU, uint32_t nThreads, std::function lambda)
 {
-  tbb::task_arena(SetAndGetNActiveThreadsOuterLoop(!doGPU, nThreads)).execute([&] {
-    tbb::parallel_for(0, nThreads, lambda, tbb::simple_partitioner());
-  });
+  uint32_t nThreadsAdjusted = SetAndGetNActiveThreadsOuterLoop(!doGPU, nThreads);
+  if (nThreadsAdjusted > 1) {
+    tbb::task_arena(nThreadsAdjusted).execute([&] {
+      tbb::parallel_for(0, nThreads, lambda, tbb::simple_partitioner());
+    });
+  } else {
+    lambda(0);
+  }
 }
 
 namespace o2::gpu
diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt
index 5bc1e6e4e6783..a33234db49a27 100644
--- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt
+++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt
@@ -18,7 +18,7 @@ endif()
 message(STATUS "Building GPUTracking with CUDA support ${TMP_TARGET}")
 
 set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu)
-set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludes.h CUDAThrustHelpers.h)
+set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h CUDAThrustHelpers.h)
 # -------------------------------- Prepare RTC -------------------------------------------------------
 enable_language(ASM)
 if(ALIGPU_BUILD_TYPE STREQUAL "O2")
@@ -67,7 +67,7 @@ set(GPU_RTC_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionCUDArtc)
 # cmake-format: off
 add_custom_command(
     OUTPUT ${GPU_RTC_BIN}.src
-    COMMAND cat ${GPUDIR}/Base/cuda/GPUReconstructionCUDAIncludes.h > ${GPU_RTC_BIN}.src
+    COMMAND cp ${GPUDIR}/Base/cuda/GPUReconstructionCUDAIncludesHost.h ${GPU_RTC_BIN}.src
     COMMAND ${CMAKE_CXX_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_INCLUDES} -std=c++${CMAKE_CUDA_STANDARD} -D__CUDA_ARCH__=${RTC_CUDA_ARCH} -D__CUDACC__ -x c++ -nostdinc -E ${GPU_RTC_SRC} >> ${GPU_RTC_BIN}.src
     MAIN_DEPENDENCY ${GPU_RTC_SRC}
     IMPLICIT_DEPENDS CXX ${GPU_RTC_SRC}
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
index b195b375b4503..3c118f402dc4f 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
+++ 
b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -13,8 +13,7 @@ /// \author David Rohr #define GPUCA_GPUCODE_HOSTONLY -#include "GPUReconstructionCUDADef.h" -#include "GPUReconstructionCUDAIncludes.h" +#include "GPUReconstructionCUDAIncludesHost.h" #include diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index f14696a92a5b0..ee2f069028d74 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -42,7 +42,7 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase void PrintKernelOccupancies() override; template - int32_t runKernelBackend(const krnlSetupArgs& args); + void runKernelBackend(const krnlSetupArgs& args); template void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); template diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDADef.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDADef.h index 845ccc9ec09b1..7f77925ca3aaa 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDADef.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDADef.h @@ -9,7 +9,7 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. -/// \file GPUReconstructionCUDDef.h +/// \file GPUReconstructionCUDADef.h /// \author David Rohr #ifndef O2_GPU_GPURECONSTRUCTIONCUDADEF_H diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu index 3862a3a476324..6bcafe565e930 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu @@ -12,8 +12,7 @@ /// \file GPUReconstructionCUDAExternalProvider.cu /// \author David Rohr -#include "GPUReconstructionCUDADef.h" -#include "GPUReconstructionCUDAIncludes.h" +#include "GPUReconstructionCUDAIncludesHost.h" #include "GPUReconstructionCUDA.h" #include "GPUReconstructionCUDAInternals.h" diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index 3bd3afc0ffc23..1a4721035818e 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -32,7 +32,9 @@ QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_arch); int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) { - std::string rtcparam = std::string(mProcessingSettings.rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") + GPUParamRTC::generateRTCCode(param(), mProcessingSettings.rtc.optConstexpr); + std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") + + std::string(mProcessingSettings.rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) 
__VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") + + GPUParamRTC::generateRTCCode(param(), mProcessingSettings.rtc.optConstexpr); if (filename == "") { filename = "/tmp/o2cagpu_rtc_"; } diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludes.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesHost.h similarity index 94% rename from GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludes.h rename to GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesHost.h index ae79494ded496..e3e26e6482fc4 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludes.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesHost.h @@ -32,4 +32,8 @@ #include #include +#ifndef GPUCA_RTC_CODE +#include "GPUReconstructionCUDADef.h" +#endif + #endif diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index 2e695b49ebb6c..c22aff4aab28c 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -12,8 +12,7 @@ /// \file GPUReconstructionCUDAKernels.cu /// \author David Rohr -#include "GPUReconstructionCUDADef.h" -#include "GPUReconstructionCUDAIncludes.h" +#include "GPUReconstructionCUDAIncludesHost.h" #include "GPUReconstructionCUDA.h" #include "GPUReconstructionCUDAInternals.h" @@ -67,7 +66,7 @@ inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSet } template -int32_t GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs& args) +void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs& args) { auto& x = args.s.x; auto& z = args.s.z; @@ -84,7 +83,6 @@ int32_t GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgsStreams[x.stream])); } - return 0; } #undef GPUCA_KRNL_REG @@ -93,7 +91,7 @@ int32_t GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs(const krnlSetupArgs& args); + template void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs& args); #else #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2 #define GPUCA_KRNL_DEFONLY @@ -102,7 +100,7 @@ int32_t GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs(const krnlSetupArgs& args); + template void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs& args); #ifndef __HIPCC__ // CUDA version #define GPUCA_KRNL_CALL_single(x_class, ...) \ GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))<<mInternals->Streams[x.stream]>>>(GPUCA_CONSMEM_CALL y.start, args...); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu index c22b873961e09..bcf61eb07383f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu @@ -13,8 +13,7 @@ /// \author David Rohr #define GPUCA_GPUCODE_COMPILEKERNELS -#include "GPUReconstructionCUDAIncludes.h" -#include "GPUReconstructionCUDADef.h" +#include "GPUReconstructionCUDAIncludesHost.h" #define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))) #define GPUCA_KRNL(...) GPUCA_KRNL_WRAP(GPUCA_KRNL_LOAD_, __VA_ARGS__) #define GPUCA_KRNL_LOAD_single(...) 
GPUCA_KRNLGPU_SINGLE(__VA_ARGS__); diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index 10fbfa8d21ddf..f6e420d5b9656 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -25,7 +25,7 @@ if(NOT DEFINED GPUCA_HIP_HIPIFY_FROM_CUDA OR "${GPUCA_HIP_HIPIFY_FROM_CUDA}") set(GPUCA_HIP_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/hipify) file(MAKE_DIRECTORY ${GPUCA_HIP_SOURCE_DIR}) set(GPUCA_HIP_FILE_LIST GPUReconstructionCUDA.cu GPUReconstructionCUDAExternalProvider.cu GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAkernel.template.cu CUDAThrustHelpers.h GPUReconstructionCUDADef.h GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu GPUReconstructionCUDArtc.cu) - set(GPUCA_HIP_LOCAL_FILE_LIST GPUReconstructionHIPIncludes.h) + set(GPUCA_HIP_LOCAL_FILE_LIST GPUReconstructionHIPIncludesHost.h) set(HIP_SOURCES "") foreach(file ${GPUCA_HIP_FILE_LIST}) get_filename_component(ABS_CUDA_SORUCE ../cuda/${file} ABSOLUTE) @@ -63,7 +63,7 @@ endif() set(SRCS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.hip ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPKernels.hip) set(SRCS_CXX ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPGenRTC.cxx) -set(HDRS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPInternals.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPDef.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPIncludes.h ${GPUCA_HIP_SOURCE_DIR}/HIPThrustHelpers.h) +set(HDRS ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIP.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPInternals.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPDef.h ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPIncludesHost.h ${GPUCA_HIP_SOURCE_DIR}/HIPThrustHelpers.h) # -------------------------------- Prepare RTC ------------------------------------------------------- enable_language(ASM) @@ -104,7 +104,7 @@ set(GPU_RTC_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionHIPrtc) # cmake-format: off add_custom_command( OUTPUT ${GPU_RTC_BIN}.src - COMMAND cat ${GPUDIR}/Base/hip/GPUReconstructionHIPIncludes.h > ${GPU_RTC_BIN}.src + COMMAND cp ${GPUDIR}/Base/hip/GPUReconstructionHIPIncludesHost.h ${GPU_RTC_BIN}.src COMMAND ${CMAKE_CXX_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_INCLUDES} -std=c++${CMAKE_HIP_STANDARD} -D__HIPCC__ -D__HIP_DEVICE_COMPILE__ -x c++ -nostdinc -E ${GPU_RTC_SRC} >> ${GPU_RTC_BIN}.src MAIN_DEPENDENCY ${GPU_RTC_SRC} IMPLICIT_DEPENDS CXX ${GPU_RTC_SRC} diff --git a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludes.h b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesHost.h similarity index 89% rename from GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludes.h rename to GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesHost.h index 94d3e46b8f462..7117dd0c718c6 100644 --- a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludes.h +++ b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPIncludesHost.h @@ -9,7 +9,7 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. 
-/// \file GPUReconstructionHIPInclude.h +/// \file GPUReconstructionHIPIncludesHost.h /// \author David Rohr #ifndef O2_GPU_RECONSTRUCTIONHIPINCLUDES_H @@ -27,4 +27,8 @@ #include #pragma GCC diagnostic pop +#ifndef GPUCA_RTC_CODE +#include "GPUReconstructionHIPDef.h" +#endif + #endif diff --git a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip index 0ecaf7a83b18c..ddbc9285763a9 100644 --- a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip +++ b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip @@ -13,8 +13,7 @@ /// \author David Rohr #define GPUCA_GPUCODE_COMPILEKERNELS -#include "GPUReconstructionHIPIncludes.h" -#include "GPUReconstructionHIPDef.h" +#include "GPUReconstructionHIPIncludesHost.h" #define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))) #define GPUCA_KRNL(...) GPUCA_KRNL_WRAP(GPUCA_KRNL_LOAD_, __VA_ARGS__) #define GPUCA_KRNL_LOAD_single(...) GPUCA_KRNLGPU_SINGLE(__VA_ARGS__); diff --git a/GPU/GPUTracking/Base/opencl/CMakeLists.txt b/GPU/GPUTracking/Base/opencl/CMakeLists.txt index d6aa945fc77b7..89d2f386f768f 100644 --- a/GPU/GPUTracking/Base/opencl/CMakeLists.txt +++ b/GPU/GPUTracking/Base/opencl/CMakeLists.txt @@ -36,8 +36,8 @@ set(OCL_DEFINECL "-D$ -using namespace o2::gpu; - -#include -#include -#include -#include +static_assert(std::is_convertible::value, "OpenCL event type incompatible to deviceEvent"); #define GPUErrorReturn(...) \ { \ @@ -32,11 +24,6 @@ using namespace o2::gpu; return (1); \ } -#define GPUCA_KRNL(x_class, x_attributes, ...) GPUCA_KRNL_PROP(x_class, x_attributes) -#define GPUCA_KRNL_BACKEND_CLASS GPUReconstructionOCLBackend -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL - #include "utils/qGetLdBinarySymbols.h" QGET_LD_BINARY_SYMBOLS(GPUReconstructionOCLCode_src); #ifdef OPENCL_ENABLED_SPIRV @@ -67,7 +54,7 @@ int32_t GPUReconstructionOCLBackend::GPUFailedMsgAI(const int64_t error, const c if (error == CL_SUCCESS) { return (0); } - GPUError("OCL Error: %ld / %s (%s:%d)", error, opencl_error_string(error), file, line); + GPUError("OCL Error: %ld / %s (%s:%d)", error, convertErrorToString(error), file, line); return 1; } @@ -457,7 +444,11 @@ size_t GPUReconstructionOCLBackend::GPUMemCpy(void* dst, const void* src, size_t if (stream == -1) { SynchronizeGPU(); } - if (toGPU == -2) { + if (size == 0) { + if (ev || nEvents) { // Workaround for OCL runtimes, which can throw an error in case size = 0 + GPUFailedMsg(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream == -1 ? 0 : stream], nEvents, evList->getEventList(), ev->getEventList())); + } + } else if (toGPU == -2) { GPUFailedMsg(clEnqueueCopyBuffer(mInternals->command_queue[stream == -1 ? 0 : stream], mInternals->mem_gpu, mInternals->mem_gpu, (char*)src - (char*)mDeviceMemoryBase, (char*)dst - (char*)mDeviceMemoryBase, size, nEvents, evList->getEventList(), ev->getEventList())); } else if (toGPU) { GPUFailedMsg(clEnqueueWriteBuffer(mInternals->command_queue[stream == -1 ? 0 : stream], mInternals->mem_gpu, stream == -1, (char*)dst - (char*)mDeviceMemoryBase, size, src, nEvents, evList->getEventList(), ev->getEventList())); @@ -554,20 +545,6 @@ int32_t GPUReconstructionOCLBackend::GPUDebug(const char* state, int32_t stream, return (0); } -template -int32_t GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs& args) -{ - cl_kernel k = args.s.y.num > 1 ? 
getKernelObject() : getKernelObject(); - return std::apply([this, &args, &k](auto&... vals) { return runKernelBackendInternal(args.s, k, vals...); }, args.v); -} - -template -S& GPUReconstructionOCLBackend::getKernelObject() -{ - static uint32_t krnl = FindKernel(MULTI ? 2 : 1); - return mInternals->kernels[krnl].first; -} - int32_t GPUReconstructionOCLBackend::GetOCLPrograms() { cl_int ocl_error; @@ -606,20 +583,75 @@ int32_t GPUReconstructionOCLBackend::GetOCLPrograms() return 1; } -#define GPUCA_KRNL(...) \ - GPUCA_KRNL_WRAP(GPUCA_KRNL_LOAD_, __VA_ARGS__) -#define GPUCA_KRNL_LOAD_single(x_class, ...) \ - if (AddKernel(false)) { \ - return 1; \ - } -#define GPUCA_KRNL_LOAD_multi(x_class, ...) \ - if (AddKernel(true)) { \ - return 1; \ - } -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL -#undef GPUCA_KRNL_LOAD_single -#undef GPUCA_KRNL_LOAD_multi + return AddKernels(); +} - return 0; +const char* GPUReconstructionOCLBackend::convertErrorToString(int32_t errorcode) +{ + static const std::map error_map = { + {CL_SUCCESS, "CL_SUCCESS"}, + {CL_DEVICE_NOT_FOUND, "CL_DEVICE_NOT_FOUND"}, + {CL_DEVICE_NOT_AVAILABLE, "CL_DEVICE_NOT_AVAILABLE"}, + {CL_COMPILER_NOT_AVAILABLE, "CL_COMPILER_NOT_AVAILABLE"}, + {CL_MEM_OBJECT_ALLOCATION_FAILURE, "CL_MEM_OBJECT_ALLOCATION_FAILURE"}, + {CL_OUT_OF_RESOURCES, "CL_OUT_OF_RESOURCES"}, + {CL_OUT_OF_HOST_MEMORY, "CL_OUT_OF_HOST_MEMORY"}, + {CL_PROFILING_INFO_NOT_AVAILABLE, "CL_PROFILING_INFO_NOT_AVAILABLE"}, + {CL_MEM_COPY_OVERLAP, "CL_MEM_COPY_OVERLAP"}, + {CL_IMAGE_FORMAT_MISMATCH, "CL_IMAGE_FORMAT_MISMATCH"}, + {CL_IMAGE_FORMAT_NOT_SUPPORTED, "CL_IMAGE_FORMAT_NOT_SUPPORTED"}, + {CL_BUILD_PROGRAM_FAILURE, "CL_BUILD_PROGRAM_FAILURE"}, + {CL_MAP_FAILURE, "CL_MAP_FAILURE"}, + {CL_MISALIGNED_SUB_BUFFER_OFFSET, "CL_MISALIGNED_SUB_BUFFER_OFFSET"}, + {CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST, "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"}, + {CL_COMPILE_PROGRAM_FAILURE, "CL_COMPILE_PROGRAM_FAILURE"}, + {CL_LINKER_NOT_AVAILABLE, "CL_LINKER_NOT_AVAILABLE"}, + {CL_LINK_PROGRAM_FAILURE, "CL_LINK_PROGRAM_FAILURE"}, + {CL_DEVICE_PARTITION_FAILED, "CL_DEVICE_PARTITION_FAILED"}, + {CL_KERNEL_ARG_INFO_NOT_AVAILABLE, "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"}, + {CL_INVALID_VALUE, "CL_INVALID_VALUE"}, + {CL_INVALID_DEVICE_TYPE, "CL_INVALID_DEVICE_TYPE"}, + {CL_INVALID_PLATFORM, "CL_INVALID_PLATFORM"}, + {CL_INVALID_DEVICE, "CL_INVALID_DEVICE"}, + {CL_INVALID_CONTEXT, "CL_INVALID_CONTEXT"}, + {CL_INVALID_QUEUE_PROPERTIES, "CL_INVALID_QUEUE_PROPERTIES"}, + {CL_INVALID_COMMAND_QUEUE, "CL_INVALID_COMMAND_QUEUE"}, + {CL_INVALID_HOST_PTR, "CL_INVALID_HOST_PTR"}, + {CL_INVALID_MEM_OBJECT, "CL_INVALID_MEM_OBJECT"}, + {CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"}, + {CL_INVALID_IMAGE_SIZE, "CL_INVALID_IMAGE_SIZE"}, + {CL_INVALID_SAMPLER, "CL_INVALID_SAMPLER"}, + {CL_INVALID_BINARY, "CL_INVALID_BINARY"}, + {CL_INVALID_BUILD_OPTIONS, "CL_INVALID_BUILD_OPTIONS"}, + {CL_INVALID_PROGRAM, "CL_INVALID_PROGRAM"}, + {CL_INVALID_PROGRAM_EXECUTABLE, "CL_INVALID_PROGRAM_EXECUTABLE"}, + {CL_INVALID_KERNEL_NAME, "CL_INVALID_KERNEL_NAME"}, + {CL_INVALID_KERNEL_DEFINITION, "CL_INVALID_KERNEL_DEFINITION"}, + {CL_INVALID_KERNEL, "CL_INVALID_KERNEL"}, + {CL_INVALID_ARG_INDEX, "CL_INVALID_ARG_INDEX"}, + {CL_INVALID_ARG_VALUE, "CL_INVALID_ARG_VALUE"}, + {CL_INVALID_ARG_SIZE, "CL_INVALID_ARG_SIZE"}, + {CL_INVALID_KERNEL_ARGS, "CL_INVALID_KERNEL_ARGS"}, + {CL_INVALID_WORK_DIMENSION, "CL_INVALID_WORK_DIMENSION"}, + {CL_INVALID_WORK_GROUP_SIZE, 
"CL_INVALID_WORK_GROUP_SIZE"}, + {CL_INVALID_WORK_ITEM_SIZE, "CL_INVALID_WORK_ITEM_SIZE"}, + {CL_INVALID_GLOBAL_OFFSET, "CL_INVALID_GLOBAL_OFFSET"}, + {CL_INVALID_EVENT_WAIT_LIST, "CL_INVALID_EVENT_WAIT_LIST"}, + {CL_INVALID_EVENT, "CL_INVALID_EVENT"}, + {CL_INVALID_OPERATION, "CL_INVALID_OPERATION"}, + {CL_INVALID_GL_OBJECT, "CL_INVALID_GL_OBJECT"}, + {CL_INVALID_BUFFER_SIZE, "CL_INVALID_BUFFER_SIZE"}, + {CL_INVALID_MIP_LEVEL, "CL_INVALID_MIP_LEVEL"}, + {CL_INVALID_GLOBAL_WORK_SIZE, "CL_INVALID_GLOBAL_WORK_SIZE"}, + {CL_INVALID_PROPERTY, "CL_INVALID_PROPERTY"}, + {CL_INVALID_IMAGE_DESCRIPTOR, "CL_INVALID_IMAGE_DESCRIPTOR"}, + {CL_INVALID_COMPILER_OPTIONS, "CL_INVALID_COMPILER_OPTIONS"}, + {CL_INVALID_LINKER_OPTIONS, "CL_INVALID_LINKER_OPTIONS"}, + {CL_INVALID_DEVICE_PARTITION_COUNT, "CL_INVALID_DEVICE_PARTITION_COUNT"}, + {CL_INVALID_PIPE_SIZE, "CL_INVALID_PIPE_SIZE"}, + {CL_INVALID_DEVICE_QUEUE, "CL_INVALID_DEVICE_QUEUE"}, + {CL_INVALID_SPEC_ID, "CL_INVALID_SPEC_ID"}, + {CL_MAX_SIZE_RESTRICTION_EXCEEDED, "CL_MAX_SIZE_RESTRICTION_EXCEEDED"}}; + auto entry = error_map.find(errorcode); + return (entry != error_map.end()) ? entry->second : "Unknown Errorcode"; } diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h index fadb393277758..15015cdcb43c5 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h @@ -59,8 +59,8 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase int32_t AddKernel(bool multi = false); template uint32_t FindKernel(int32_t num); - template - int32_t runKernelBackendInternal(const krnlSetupTime& _xyz, K& k, const Args&... args); + template + void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); template gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend(); @@ -68,11 +68,20 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase float mOclVersion; template - int32_t runKernelBackend(const krnlSetupArgs& args); + void runKernelBackend(const krnlSetupArgs& args); template S& getKernelObject(); int32_t GetOCLPrograms(); + + private: + static const char* convertErrorToString(int32_t errorCode); + template + static inline int64_t OCLsetKernelParameters_helper(cl_kernel& kernel, int32_t i, const T& firstParameter, const Args&... restOfParameters); + template + static int64_t OCLsetKernelParameters(cl_kernel& kernel, const Args&... args); + static int64_t clExecuteKernelA(cl_command_queue queue, cl_kernel krnl, size_t local_size, size_t global_size, cl_event* pEvent = nullptr, cl_event* wait = nullptr, cl_int nWaitEvents = 1); + int32_t AddKernels(); }; using GPUReconstructionOCL = GPUReconstructionKernels; diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h new file mode 100644 index 0000000000000..aec5708a80f3c --- /dev/null +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h @@ -0,0 +1,82 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". 
+// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionOCLIncludesHost.h +/// \author David Rohr + +#ifndef GPURECONSTRUCTIONOCLINCLUDESHOST_H +#define GPURECONSTRUCTIONOCLINCLUDESHOST_H + +#define GPUCA_GPUTYPE_OPENCL +#define __OPENCL_HOST__ + +#define CL_TARGET_OPENCL_VERSION 220 +#include +#include +#include +#include +#include +#include "GPULogging.h" + +#include "GPUReconstructionOCL.h" +#include "GPUReconstructionIncludes.h" + +using namespace o2::gpu; + +#include +#include +#include +#include + +#define GPUFailedMsg(x) GPUFailedMsgA(x, __FILE__, __LINE__) +#define GPUFailedMsgI(x) GPUFailedMsgAI(x, __FILE__, __LINE__) + +namespace o2::gpu +{ +struct GPUReconstructionOCLInternals { + cl_platform_id platform; + cl_device_id device; + cl_context context; + cl_command_queue command_queue[GPUCA_MAX_STREAMS]; + cl_mem mem_gpu; + cl_mem mem_constant; + cl_mem mem_host; + cl_program program; + + std::vector> kernels; +}; +} // namespace o2::gpu + +template +inline int64_t GPUReconstructionOCLBackend::OCLsetKernelParameters_helper(cl_kernel& kernel, int32_t i, const T& firstParameter, const Args&... restOfParameters) +{ + int64_t retVal = clSetKernelArg(kernel, i, sizeof(T), &firstParameter); + if (retVal) { + return retVal; + } + if constexpr (sizeof...(restOfParameters) > 0) { + return OCLsetKernelParameters_helper(kernel, i + 1, restOfParameters...); + } + return 0; +} + +template +inline int64_t GPUReconstructionOCLBackend::OCLsetKernelParameters(cl_kernel& kernel, const Args&... args) +{ + return OCLsetKernelParameters_helper(kernel, 0, args...); +} + +inline int64_t GPUReconstructionOCLBackend::clExecuteKernelA(cl_command_queue queue, cl_kernel krnl, size_t local_size, size_t global_size, cl_event* pEvent, cl_event* wait, cl_int nWaitEvents) +{ + return clEnqueueNDRangeKernel(queue, krnl, 1, nullptr, &global_size, &local_size, wait == nullptr ? 0 : nWaitEvents, wait, pEvent); +} + +#endif // GPURECONSTRUCTIONOCLINCLUDESHOST_H diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLInternals.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLInternals.h deleted file mode 100644 index b47c612b192d7..0000000000000 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLInternals.h +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. 
- -/// \file GPUReconstructionOCLInternals.h -/// \author David Rohr, Sergey Gorbunov - -// All OpenCL-header related stuff goes here, so we can run CING over GPUReconstructionOCL - -#ifndef GPUTPCGPUTRACKEROPENCLINTERNALS_H -#define GPUTPCGPUTRACKEROPENCLINTERNALS_H - -#define CL_TARGET_OPENCL_VERSION 220 -#include -#include -#include -#include -#include -#include "GPULogging.h" - -namespace o2::gpu -{ - -static const char* opencl_error_string(int32_t errorcode) -{ - switch (errorcode) { - case CL_SUCCESS: - return "Success!"; - case CL_DEVICE_NOT_FOUND: - return "Device not found."; - case CL_DEVICE_NOT_AVAILABLE: - return "Device not available"; - case CL_COMPILER_NOT_AVAILABLE: - return "Compiler not available"; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - return "Memory object allocation failure"; - case CL_OUT_OF_RESOURCES: - return "Out of resources"; - case CL_OUT_OF_HOST_MEMORY: - return "Out of host memory"; - case CL_PROFILING_INFO_NOT_AVAILABLE: - return "Profiling information not available"; - case CL_MEM_COPY_OVERLAP: - return "Memory copy overlap"; - case CL_IMAGE_FORMAT_MISMATCH: - return "Image format mismatch"; - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - return "Image format not supported"; - case CL_BUILD_PROGRAM_FAILURE: - return "Program build failure"; - case CL_MAP_FAILURE: - return "Map failure"; - case CL_INVALID_VALUE: - return "Invalid value"; - case CL_INVALID_DEVICE_TYPE: - return "Invalid device type"; - case CL_INVALID_PLATFORM: - return "Invalid platform"; - case CL_INVALID_DEVICE: - return "Invalid device"; - case CL_INVALID_CONTEXT: - return "Invalid context"; - case CL_INVALID_QUEUE_PROPERTIES: - return "Invalid queue properties"; - case CL_INVALID_COMMAND_QUEUE: - return "Invalid command queue"; - case CL_INVALID_HOST_PTR: - return "Invalid host pointer"; - case CL_INVALID_MEM_OBJECT: - return "Invalid memory object"; - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "Invalid image format descriptor"; - case CL_INVALID_IMAGE_SIZE: - return "Invalid image size"; - case CL_INVALID_SAMPLER: - return "Invalid sampler"; - case CL_INVALID_BINARY: - return "Invalid binary"; - case CL_INVALID_BUILD_OPTIONS: - return "Invalid build options"; - case CL_INVALID_PROGRAM: - return "Invalid program"; - case CL_INVALID_PROGRAM_EXECUTABLE: - return "Invalid program executable"; - case CL_INVALID_KERNEL_NAME: - return "Invalid kernel name"; - case CL_INVALID_KERNEL_DEFINITION: - return "Invalid kernel definition"; - case CL_INVALID_KERNEL: - return "Invalid kernel"; - case CL_INVALID_ARG_INDEX: - return "Invalid argument index"; - case CL_INVALID_ARG_VALUE: - return "Invalid argument value"; - case CL_INVALID_ARG_SIZE: - return "Invalid argument size"; - case CL_INVALID_KERNEL_ARGS: - return "Invalid kernel arguments"; - case CL_INVALID_WORK_DIMENSION: - return "Invalid work dimension"; - case CL_INVALID_WORK_GROUP_SIZE: - return "Invalid work group size"; - case CL_INVALID_WORK_ITEM_SIZE: - return "Invalid work item size"; - case CL_INVALID_GLOBAL_OFFSET: - return "Invalid global offset"; - case CL_INVALID_EVENT_WAIT_LIST: - return "Invalid event wait list"; - case CL_INVALID_EVENT: - return "Invalid event"; - case CL_INVALID_OPERATION: - return "Invalid operation"; - case CL_INVALID_GL_OBJECT: - return "Invalid OpenGL object"; - case CL_INVALID_BUFFER_SIZE: - return "Invalid buffer size"; - case CL_INVALID_MIP_LEVEL: - return "Invalid mip-map level"; - default: - return "Unknown Errorcode"; - } -} - -#define GPUFailedMsg(x) GPUFailedMsgA(x, __FILE__, __LINE__) -#define 
GPUFailedMsgI(x) GPUFailedMsgAI(x, __FILE__, __LINE__) - -static inline int64_t OCLsetKernelParameters_helper(cl_kernel& k, int32_t i) -{ - return 0; -} - -template -static inline int64_t OCLsetKernelParameters_helper(cl_kernel& kernel, int32_t i, const T& firstParameter, const Args&... restOfParameters) -{ - int64_t retVal = clSetKernelArg(kernel, i, sizeof(T), &firstParameter); - if (retVal) { - return retVal; - } - return OCLsetKernelParameters_helper(kernel, i + 1, restOfParameters...); -} - -template -static inline int64_t OCLsetKernelParameters(cl_kernel& kernel, const Args&... args) -{ - return OCLsetKernelParameters_helper(kernel, 0, args...); -} - -static inline int64_t clExecuteKernelA(cl_command_queue queue, cl_kernel krnl, size_t local_size, size_t global_size, cl_event* pEvent, cl_event* wait = nullptr, cl_int nWaitEvents = 1) -{ - return clEnqueueNDRangeKernel(queue, krnl, 1, nullptr, &global_size, &local_size, wait == nullptr ? 0 : nWaitEvents, wait, pEvent); -} - -struct GPUReconstructionOCLInternals { - cl_platform_id platform; - cl_device_id device; - cl_context context; - cl_command_queue command_queue[GPUCA_MAX_STREAMS]; - cl_mem mem_gpu; - cl_mem mem_constant; - cl_mem mem_host; - cl_program program; - - std::vector> kernels; -}; - -template -inline int32_t GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, K& k, const Args&... args) -{ - auto& x = _xyz.x; - auto& y = _xyz.y; - auto& z = _xyz.z; - if (y.num <= 1) { - GPUFailedMsg(OCLsetKernelParameters(k, mInternals->mem_gpu, mInternals->mem_constant, y.start, args...)); - } else { - GPUFailedMsg(OCLsetKernelParameters(k, mInternals->mem_gpu, mInternals->mem_constant, y.start, y.num, args...)); - } - - cl_event ev; - cl_event* evr; - bool tmpEvent = false; - if (z.ev == nullptr && mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0) { - evr = &ev; - tmpEvent = true; - } else { - evr = (cl_event*)z.ev; - } - GPUFailedMsg(clExecuteKernelA(mInternals->command_queue[x.stream], k, x.nThreads, x.nThreads * x.nBlocks, evr, (cl_event*)z.evList, z.nEvents)); - if (mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0) { - cl_ulong time_start, time_end; - GPUFailedMsg(clWaitForEvents(1, evr)); - GPUFailedMsg(clGetEventProfilingInfo(*evr, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, nullptr)); - GPUFailedMsg(clGetEventProfilingInfo(*evr, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, nullptr)); - _xyz.t = (time_end - time_start) * 1.e-9f; - if (tmpEvent) { - GPUFailedMsg(clReleaseEvent(ev)); - } - } - return 0; -} - -template -int32_t GPUReconstructionOCLBackend::AddKernel(bool multi) -{ - std::string name(GetKernelName()); - if (multi) { - name += "_multi"; - } - std::string kname("krnl_" + name); - - cl_int ocl_error; - cl_kernel krnl = clCreateKernel(mInternals->program, kname.c_str(), &ocl_error); - if (GPUFailedMsgI(ocl_error)) { - GPUError("Error creating OPENCL Kernel: %s", name.c_str()); - return 1; - } - mInternals->kernels.emplace_back(krnl, name); - return 0; -} - -template -inline uint32_t GPUReconstructionOCLBackend::FindKernel(int32_t num) -{ - std::string name(GetKernelName()); - if (num > 1) { - name += "_multi"; - } - - for (uint32_t k = 0; k < mInternals->kernels.size(); k++) { - if (mInternals->kernels[k].second == name) { - return (k); - } - } - GPUError("Could not find OpenCL kernel %s", name.c_str()); - throw ::std::runtime_error("Requested unsupported OpenCL kernel"); -} - -static_assert(std::is_convertible::value, 
"OpenCL event type incompatible to deviceEvent"); -} // namespace o2::gpu - -#endif diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx new file mode 100644 index 0000000000000..8a1c8a6525c0d --- /dev/null +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx @@ -0,0 +1,133 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionOCLKernels.cxx +/// \author David Rohr + +#include "GPUReconstructionOCLIncludesHost.h" + +template <> +inline void GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) +{ + cl_int4 val0 = {0, 0, 0, 0}; + GPUFailedMsg(clEnqueueFillBuffer(mInternals->command_queue[_xyz.x.stream], mInternals->mem_gpu, &val0, sizeof(val0), (char*)ptr - (char*)mDeviceMemoryBase, (size + sizeof(val0) - 1) & ~(sizeof(val0) - 1), _xyz.z.evList == nullptr ? 0 : _xyz.z.nEvents, _xyz.z.evList->getEventList(), _xyz.z.ev->getEventList())); +} + +template +inline void GPUReconstructionOCLBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args) +{ + cl_kernel k = _xyz.y.num > 1 ? getKernelObject() : getKernelObject(); + auto& x = _xyz.x; + auto& y = _xyz.y; + auto& z = _xyz.z; + if (y.num <= 1) { + GPUFailedMsg(OCLsetKernelParameters(k, mInternals->mem_gpu, mInternals->mem_constant, y.start, args...)); + } else { + GPUFailedMsg(OCLsetKernelParameters(k, mInternals->mem_gpu, mInternals->mem_constant, y.start, y.num, args...)); + } + + cl_event ev; + cl_event* evr; + bool tmpEvent = false; + if (z.ev == nullptr && mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0) { + evr = &ev; + tmpEvent = true; + } else { + evr = (cl_event*)z.ev; + } + GPUFailedMsg(clExecuteKernelA(mInternals->command_queue[x.stream], k, x.nThreads, x.nThreads * x.nBlocks, evr, (cl_event*)z.evList, z.nEvents)); + if (mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0) { + cl_ulong time_start, time_end; + GPUFailedMsg(clWaitForEvents(1, evr)); + GPUFailedMsg(clGetEventProfilingInfo(*evr, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, nullptr)); + GPUFailedMsg(clGetEventProfilingInfo(*evr, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, nullptr)); + _xyz.t = (time_end - time_start) * 1.e-9f; + if (tmpEvent) { + GPUFailedMsg(clReleaseEvent(ev)); + } + } +} + +template +void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs& args) +{ + std::apply([this, &args](auto&... 
vals) { runKernelBackendInternal(args.s, vals...); }, args.v); +} + +template +inline uint32_t GPUReconstructionOCLBackend::FindKernel(int32_t num) +{ + std::string name(GetKernelName()); + if (num > 1) { + name += "_multi"; + } + + for (uint32_t k = 0; k < mInternals->kernels.size(); k++) { + if (mInternals->kernels[k].second == name) { + return (k); + } + } + GPUError("Could not find OpenCL kernel %s", name.c_str()); + throw ::std::runtime_error("Requested unsupported OpenCL kernel"); +} + +template +int32_t GPUReconstructionOCLBackend::AddKernel(bool multi) +{ + std::string name(GetKernelName()); + if (multi) { + name += "_multi"; + } + std::string kname("krnl_" + name); + + cl_int ocl_error; + cl_kernel krnl = clCreateKernel(mInternals->program, kname.c_str(), &ocl_error); + if (GPUFailedMsgI(ocl_error)) { + GPUError("Error creating OPENCL Kernel: %s", name.c_str()); + return 1; + } + mInternals->kernels.emplace_back(krnl, name); + return 0; +} + +template +S& GPUReconstructionOCLBackend::getKernelObject() +{ + static uint32_t krnl = FindKernel(MULTI ? 2 : 1); + return mInternals->kernels[krnl].first; +} + +int32_t GPUReconstructionOCLBackend::AddKernels() +{ +#define GPUCA_KRNL(...) \ + GPUCA_KRNL_WRAP(GPUCA_KRNL_LOAD_, __VA_ARGS__) +#define GPUCA_KRNL_LOAD_single(x_class, ...) \ + if (AddKernel(false)) { \ + return 1; \ + } +#define GPUCA_KRNL_LOAD_multi(x_class, ...) \ + if (AddKernel(true)) { \ + return 1; \ + } +#include "GPUReconstructionKernelList.h" +#undef GPUCA_KRNL +#undef GPUCA_KRNL_LOAD_single +#undef GPUCA_KRNL_LOAD_multi + + return 0; +} + +#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \ + GPUCA_KRNL_PROP(x_class, x_attributes) \ + template void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs& args); +#define GPUCA_KRNL_BACKEND_CLASS GPUReconstructionOCLBackend +#include "GPUReconstructionKernelList.h" +#undef GPUCA_KRNL diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index a7c582b79d964..e017d9b60a269 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -171,7 +171,7 @@ class GPUChain mRec->ReadStructFromFile(file, obj); } template - inline int32_t runKernel(gpu_reconstruction_kernels::krnlSetup&& setup, Args&&... args) + inline void runKernel(gpu_reconstruction_kernels::krnlSetup&& setup, Args&&... args) { return mRec->runKernel(std::forward(setup), std::forward(args)...); }
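
Notes on the main patterns introduced above, with simplified, self-contained C++ sketches; any identifier marked as hypothetical is illustrative only and not repository code.

The patch drops the int32_t return value from the whole runKernel / runKernelImpl / runKernelBackend chain; launch problems now surface exclusively through exceptions, as in the retained throw std::runtime_error("kernel failure") and "kernel error code" paths of GPUReconstructionCPU::runKernel. A minimal sketch of that calling convention, with made-up function names standing in for the real launcher and debug check:

#include <cstdio>
#include <stdexcept>

// Hypothetical stand-in for a device-side error query such as GPUDebug().
static bool launchFailed(const char* /*kernelName*/) { return false; }

// New-style launcher: no error code, failures are reported by throwing.
static void runKernelSketch(const char* name)
{
  if (launchFailed(name)) {
    throw std::runtime_error("kernel failure");
  }
}

int main()
{
  try {
    runKernelSketch("ExampleKernel");
  } catch (const std::runtime_error& e) {
    std::fprintf(stderr, "%s\n", e.what());
    return 1;
  }
  return 0;
}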
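
GPUReconstructionProcessing::runParallelOuterLoop now bypasses TBB entirely when SetAndGetNActiveThreadsOuterLoop reports a single active thread, invoking the loop body inline instead of spinning up a task_arena. A standalone sketch of that pattern, with the thread-count negotiation replaced by a plain parameter:

#include <cstdint>
#include <cstdio>
#include <functional>
#include <tbb/parallel_for.h>
#include <tbb/partitioner.h>
#include <tbb/task_arena.h>

// Run lambda(0..nThreads-1), but skip the TBB machinery when only one
// thread is requested (the pattern adopted in runParallelOuterLoop).
void runOuterLoop(uint32_t nThreads, std::function<void(uint32_t)> lambda)
{
  if (nThreads > 1) {
    tbb::task_arena(nThreads).execute([&] {
      tbb::parallel_for<uint32_t>(0, nThreads, lambda, tbb::simple_partitioner());
    });
  } else {
    lambda(0); // single-threaded fallback, no arena or scheduler involvement
  }
}

int main()
{
  runOuterLoop(4, [](uint32_t i) { std::printf("chunk %u\n", i); });
  runOuterLoop(1, [](uint32_t i) { std::printf("serial chunk %u\n", i); });
}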
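
The CUDA/HIP include headers are renamed to *IncludesHost.h and now pull in the matching *Def.h themselves, but only when GPUCA_RTC_CODE is not defined; genRTC prepends #define GPUCA_RTC_CODE to the generated parameter code, and CMake now copies the header verbatim (cp instead of cat) as the first part of the RTC input. A trimmed sketch of that guard layout, using hypothetical file names in place of GPUReconstructionCUDAIncludesHost.h / GPUReconstructionCUDADef.h:

// ExampleIncludesHost.h -- shared by the normal host build and the RTC input
#ifndef EXAMPLE_INCLUDES_HOST_H
#define EXAMPLE_INCLUDES_HOST_H

#include <cstdint>
#include <vector>

#ifndef GPUCA_RTC_CODE  // the RTC pass defines this first and supplies its own
#include "ExampleDef.h" // definitions, so the host-side Def header is skipped
#endif

#endif // EXAMPLE_INCLUDES_HOST_H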
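
In the OpenCL backend, the opencl_error_string switch (which returned prose such as "Invalid kernel arguments") is replaced by convertErrorToString, a static std::map from error codes to their CL_* macro names with an "Unknown Errorcode" fallback. A reduced version of the lookup, listing only a handful of the codes:

#define CL_TARGET_OPENCL_VERSION 220
#include <CL/cl.h>
#include <cstdint>
#include <map>

// Reduced error-name lookup; the real table enumerates all CL_* error codes.
const char* clErrorName(int32_t errorcode)
{
  static const std::map<int32_t, const char*> names = {
    {CL_SUCCESS, "CL_SUCCESS"},
    {CL_DEVICE_NOT_FOUND, "CL_DEVICE_NOT_FOUND"},
    {CL_OUT_OF_RESOURCES, "CL_OUT_OF_RESOURCES"},
    {CL_INVALID_KERNEL_ARGS, "CL_INVALID_KERNEL_ARGS"}};
  auto entry = names.find(errorcode);
  return entry != names.end() ? entry->second : "Unknown Errorcode";
}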
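
OCLsetKernelParameters_helper moves into GPUReconstructionOCLIncludesHost.h and drops the separate zero-argument base-case overload: the recursion now terminates with if constexpr on the remaining parameter pack. The same idea as a standalone helper, renamed so it is not mistaken for the repository function:

#define CL_TARGET_OPENCL_VERSION 220
#include <CL/cl.h>
#include <cstdint>

// Set kernel argument i to firstParameter, then recurse over the rest of the
// pack; the if constexpr check replaces the old empty base-case overload.
template <typename T, typename... Args>
int64_t setKernelParametersHelper(cl_kernel& kernel, int32_t i, const T& firstParameter, const Args&... restOfParameters)
{
  int64_t retVal = clSetKernelArg(kernel, i, sizeof(T), &firstParameter);
  if (retVal) {
    return retVal;
  }
  if constexpr (sizeof...(restOfParameters) > 0) {
    return setKernelParametersHelper(kernel, i + 1, restOfParameters...);
  }
  return 0;
}

template <typename... Args>
int64_t setKernelParameters(cl_kernel& kernel, const Args&... args)
{
  return setKernelParametersHelper(kernel, 0, args...);
}

// usage (hypothetical buffers): setKernelParameters(kernel, gpuBuffer, constantBuffer, firstItem);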
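
GPUMemCpy now special-cases size == 0: some OpenCL runtimes reject zero-byte copy or write calls, so the transfer is skipped and, if the caller asked for events, a marker is enqueued via clEnqueueMarkerWithWaitList so that downstream event bookkeeping still works. A minimal sketch of that guard, with the buffer and offset handling simplified and the function name hypothetical:

#define CL_TARGET_OPENCL_VERSION 220
#include <CL/cl.h>
#include <cstddef>

// Skip zero-byte copies, but still produce the requested completion event
// through a marker (workaround for runtimes that error out on size == 0).
cl_int copyOrMark(cl_command_queue queue, cl_mem dst, cl_mem src, size_t offset, size_t size,
                  cl_uint nWaitEvents, const cl_event* waitList, cl_event* ev)
{
  if (size == 0) {
    if (ev || nWaitEvents) {
      return clEnqueueMarkerWithWaitList(queue, nWaitEvents, waitList, ev);
    }
    return CL_SUCCESS;
  }
  return clEnqueueCopyBuffer(queue, src, dst, offset, offset, size, nWaitEvents, waitList, ev);
}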