diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index d96d5aad74622..9b6562d8e77ee 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -273,6 +273,7 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() if (mProcessingSettings.createO2Output > 1) { mProcessingSettings.createO2Output = 1; } + mProcessingSettings.rtc.deterministic = 1; } if (mProcessingSettings.deterministicGPUReconstruction && mProcessingSettings.debugLevel >= 6) { mProcessingSettings.nTPCClustererLanes = 1; diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx index 51c48ebbfc0b2..bae95ac8d3f38 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx @@ -57,17 +57,24 @@ void GPUReconstructionProcessing::runParallelOuterLoop(bool doGPU, uint32_t nThr } } -namespace o2::gpu -{ -namespace // anonymous +uint32_t GPUReconstructionProcessing::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max) { -static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation -} // anonymous namespace -} // namespace o2::gpu + if (condition && mProcessingSettings.inKernelParallel != 1) { + mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min(max, mMaxHostThreads) : mMaxHostThreads; + } else { + mNActiveThreadsOuterLoop = 1; + } + if (mProcessingSettings.debugLevel >= 5) { + printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop); + } + return mNActiveThreadsOuterLoop; +} + +std::atomic_flag GPUReconstructionProcessing::mTimerFlag = ATOMIC_FLAG_INIT; GPUReconstructionProcessing::timerMeta* GPUReconstructionProcessing::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step) { - while (timerFlag.test_and_set()) { + while (mTimerFlag.test_and_set()) { } if (mTimers.size() <= id) { mTimers.resize(id + 1); @@ -81,20 +88,20 @@ GPUReconstructionProcessing::timerMeta* GPUReconstructionProcessing::insertTimer mTimers[id]->count++; } timerMeta* retVal = mTimers[id].get(); - timerFlag.clear(); + mTimerFlag.clear(); return retVal; } GPUReconstructionProcessing::timerMeta* GPUReconstructionProcessing::getTimerById(uint32_t id, bool increment) { timerMeta* retVal = nullptr; - while (timerFlag.test_and_set()) { + while (mTimerFlag.test_and_set()) { } if (mTimers.size() > id && mTimers[id]) { retVal = mTimers[id].get(); retVal->count += increment; } - timerFlag.clear(); + mTimerFlag.clear(); return retVal; } @@ -104,19 +111,6 @@ uint32_t GPUReconstructionProcessing::getNextTimerId() return id.fetch_add(1); } -uint32_t GPUReconstructionProcessing::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max) -{ - if (condition && mProcessingSettings.inKernelParallel != 1) { - mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min(max, mMaxHostThreads) : mMaxHostThreads; - } else { - mNActiveThreadsOuterLoop = 1; - } - if (mProcessingSettings.debugLevel >= 5) { - printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop); - } - return mNActiveThreadsOuterLoop; -} - std::unique_ptr GPUReconstructionProcessing::GetThreadContext() { return std::make_unique(); @@ -124,3 +118,39 @@ std::unique_ptr GPUReconstructionProc gpu_reconstruction_kernels::threadContext::threadContext() = default; gpu_reconstruction_kernels::threadContext::~threadContext() = default; + +template +uint32_t GPUReconstructionProcessing::GetKernelNum(int32_t k) +{ + static int32_t num = k; + if (num < 0) { + throw std::runtime_error("Internal Error - Kernel Number not Set"); + } + return num; +} + +namespace o2::gpu::internal +{ +static std::vector initKernelNames() +{ + std::vector retVal; +#define GPUCA_KRNL(x_class, ...) \ + GPUReconstructionProcessing::GetKernelNum(retVal.size()); \ + retVal.emplace_back(GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class))); +#include "GPUReconstructionKernelList.h" +#undef GPUCA_KRNL + return retVal; +} +} // namespace o2::gpu::internal + +const std::vector GPUReconstructionProcessing::mKernelNames = o2::gpu::internal::initKernelNames(); + +#define GPUCA_KRNL(x_class, ...) \ + template uint32_t GPUReconstructionProcessing::GetKernelNum(int32_t); \ + template <> \ + const char* GPUReconstructionProcessing::GetKernelName() \ + { \ + return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)); \ + } +#include "GPUReconstructionKernelList.h" +#undef GPUCA_KRNL diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h index 43560616782db..b0466efceac24 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h @@ -20,6 +20,7 @@ #include "utils/timer.h" #include +#include namespace o2::gpu { @@ -74,7 +75,10 @@ class GPUReconstructionProcessing : public GPUReconstruction // Interface to query name of a kernel template - constexpr static const char* GetKernelName(); + static const char* GetKernelName(); + const std::string& GetKernelName(int32_t i) const { return mKernelNames[i]; } + template + static uint32_t GetKernelNum(int32_t k = -1); // Public queries for timers auto& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)]; } @@ -100,6 +104,8 @@ class GPUReconstructionProcessing : public GPUReconstruction GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg) : GPUReconstruction(cfg) {} using deviceEvent = gpu_reconstruction_kernels::deviceEvent; + static const std::vector mKernelNames; + int32_t mActiveHostKernelThreads = 0; // Number of currently active threads on the host for kernels uint32_t mNActiveThreadsOuterLoop = 1; // Number of threads currently running an outer loop @@ -130,6 +136,8 @@ class GPUReconstructionProcessing : public GPUReconstruction uint32_t getNextTimerId(); timerMeta* getTimerById(uint32_t id, bool increment = true); timerMeta* insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step); + + static std::atomic_flag mTimerFlag; }; template @@ -174,15 +182,6 @@ HighResTimer& GPUReconstructionProcessing::getTimer(const char* name, int32_t nu return timer->timer[num]; } -#define GPUCA_KRNL(x_class, ...) \ - template <> \ - constexpr const char* GPUReconstructionProcessing::GetKernelName() \ - { \ - return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)); \ - } -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL - } // namespace o2::gpu #endif diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 175fd205153ea..f87d5c8189cdc 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -610,7 +610,7 @@ void GPUReconstructionCUDABackend::PrintKernelOccupancies() GPUChkErr(cuOccupancyMaxActiveBlocksPerMultiprocessor(&maxBlocks, *mInternals->kernelFunctions[i], threads, 0)); GPUChkErr(cuFuncGetAttribute(&nRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, *mInternals->kernelFunctions[i])); GPUChkErr(cuFuncGetAttribute(&sMem, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, *mInternals->kernelFunctions[i])); - GPUInfo("Kernel: %50s Block size: %4d, Maximum active blocks: %3d, Suggested blocks: %3d, Regs: %3d, smem: %3d", mInternals->kernelNames[i].c_str(), threads, maxBlocks, suggestedBlocks, nRegs, sMem); + GPUInfo("Kernel: %50s Block size: %4d, Maximum active blocks: %3d, Suggested blocks: %3d, Regs: %3d, smem: %3d", GetKernelName(i).c_str(), threads, maxBlocks, suggestedBlocks, nRegs, sMem); } } @@ -618,9 +618,10 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel) { uint32_t j = 0; #define GPUCA_KRNL(x_class, ...) \ - getRTCkernelNum(mInternals->kernelFunctions.size()); \ + if (GetKernelNum() != j) { \ + GPUFatal("kernel numbers out of sync"); \ + } \ mInternals->kernelFunctions.emplace_back(new CUfunction); \ - mInternals->kernelNames.emplace_back(GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class)))); \ if (mProcessingSettings.debugLevel >= 3) { \ GPUInfo("Loading kernel %s (j = %u)", GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), j); \ } \ @@ -628,7 +629,6 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel) j++; #include "GPUReconstructionKernelList.h" #undef GPUCA_KRNL - if (j != mInternals->kernelModules.size()) { GPUFatal("Did not load all kernels (%u < %u)", j, (uint32_t)mInternals->kernelModules.size()); } diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index 30bbc76d4c415..a98b14a873ca0 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -46,11 +46,6 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); template gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend(); - template - class backendInternal; - - template - static int32_t getRTCkernelNum(int32_t k = -1); void getRTCKernelCalls(std::vector& kernels); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index e789dc9b9ebc3..51d3bd4044e8d 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -31,11 +31,12 @@ QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command); QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_arch); QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_no_fast_math); +#include "GPUNoFastMathKernels.h" + int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) { std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") + std::string(mProcessingSettings.rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") + - std::string(mProcessingSettings.rtc.deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n") + GPUParamRTC::generateRTCCode(param(), mProcessingSettings.rtc.optConstexpr); if (filename == "") { filename = "/tmp/o2cagpu_rtc_"; @@ -54,7 +55,6 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) std::string baseCommand = (mProcessingSettings.RTCprependCommand != "" ? (mProcessingSettings.RTCprependCommand + " ") : ""); baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len)); baseCommand += std::string(" ") + (mProcessingSettings.RTCoverrideArchitecture != "" ? mProcessingSettings.RTCoverrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len)); - baseCommand += mProcessingSettings.rtc.deterministic ? (std::string(" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len)) : std::string(""); char shasource[21], shaparam[21], shacmd[21], shakernels[21]; if (mProcessingSettings.rtc.cacheOutput) { @@ -169,13 +169,20 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) kernel += mProcessingSettings.rtc.compilePerKernel ? kernels[i] : kernelsall; kernel += "}"; - if (fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() || + bool deterministic = mProcessingSettings.rtc.deterministic || o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end(); + const std::string deterministicStr = std::string(deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n"); + + if (fwrite(deterministicStr.c_str(), 1, deterministicStr.size(), fp) != deterministicStr.size() || + fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() || fwrite(_binary_GPUReconstructionCUDArtc_src_start, 1, _binary_GPUReconstructionCUDArtc_src_len, fp) != _binary_GPUReconstructionCUDArtc_src_len || fwrite(kernel.c_str(), 1, kernel.size(), fp) != kernel.size()) { throw std::runtime_error("Error writing file"); } fclose(fp); std::string command = baseCommand; + if (deterministic) { + command += std::string(" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len); + } command += " -c " + filename + "_" + std::to_string(i) + mRtcSrcExtension + " -o " + filename + "_" + std::to_string(i) + mRtcBinExtension; if (mProcessingSettings.debugLevel < 0) { command += " &> /dev/null"; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h index c85d98d85420e..f3fc21243ef0e 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h @@ -30,7 +30,6 @@ namespace o2::gpu struct GPUReconstructionCUDAInternals { std::vector> kernelModules; // module for RTC compilation std::vector> kernelFunctions; // vector of ptrs to RTC kernels - std::vector kernelNames; // names of kernels cudaStream_t Streams[GPUCA_MAX_STREAMS]; // Pointer to array of CUDA Streams static void getArgPtrs(const void** pArgs) {} diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index f1f459fe021bc..4b3f8a767226c 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -55,7 +55,7 @@ inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSet #endif pArgs[arg_offset] = &y.index; GPUReconstructionCUDAInternals::getArgPtrs(&pArgs[arg_offset + 1], args...); - GPUChkErr(cuLaunchKernel(*mInternals->kernelFunctions[getRTCkernelNum()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr)); + GPUChkErr(cuLaunchKernel(*mInternals->kernelFunctions[GetKernelNum()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr)); } } @@ -111,22 +111,6 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs -int32_t GPUReconstructionCUDABackend::getRTCkernelNum(int32_t k) -{ - static int32_t num = k; - if (num < 0) { - throw std::runtime_error("Invalid kernel"); - } - return num; -} - -#define GPUCA_KRNL(x_class, ...) \ - template int32_t GPUReconstructionCUDABackend::getRTCkernelNum(int32_t k); \ - template int32_t GPUReconstructionCUDABackend::getRTCkernelNum(int32_t k); -#include "GPUReconstructionKernelList.h" -#undef GPUCA_KRNL - void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector& kernels) { #define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__))); diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx index e92205b9864e6..e724f0f2cbfcd 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx @@ -399,7 +399,7 @@ int32_t GPUReconstructionOCLBackend::ExitDevice_Runtime() clReleaseMemObject(mInternals->mem_gpu); clReleaseMemObject(mInternals->mem_constant); for (uint32_t i = 0; i < mInternals->kernels.size(); i++) { - clReleaseKernel(mInternals->kernels[i].first); + clReleaseKernel(mInternals->kernels[i]); } mInternals->kernels.clear(); } diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h index 2abae229c74bb..29951cd43f167 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h @@ -56,8 +56,6 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase template int32_t AddKernel(); - template - uint32_t FindKernel(); template void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args); template diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h index 97316cf9aa32e..0bb2f25093789 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h @@ -49,7 +49,7 @@ struct GPUReconstructionOCLInternals { cl_mem mem_host; cl_program program; - std::vector> kernels; + std::vector kernels; }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx index ce6b6553ae1f7..fff69038c056f 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx @@ -58,20 +58,6 @@ void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs(args.s, vals...); }, args.v); } -template -inline uint32_t GPUReconstructionOCLBackend::FindKernel() -{ - std::string name(GetKernelName()); - - for (uint32_t k = 0; k < mInternals->kernels.size(); k++) { - if (mInternals->kernels[k].second == name) { - return (k); - } - } - GPUError("Could not find OpenCL kernel %s", name.c_str()); - throw ::std::runtime_error("Requested unsupported OpenCL kernel"); -} - template int32_t GPUReconstructionOCLBackend::AddKernel() { @@ -84,15 +70,14 @@ int32_t GPUReconstructionOCLBackend::AddKernel() GPUError("Error creating OPENCL Kernel: %s", name.c_str()); return 1; } - mInternals->kernels.emplace_back(krnl, name); + mInternals->kernels.emplace_back(krnl); return 0; } template S& GPUReconstructionOCLBackend::getKernelObject() { - static uint32_t krnl = FindKernel(); - return mInternals->kernels[krnl].first; + return mInternals->kernels[GetKernelNum()]; } int32_t GPUReconstructionOCLBackend::AddKernels() diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index ba2b9d05a3192..631f9f0edff4f 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -242,6 +242,10 @@ file(GENERATE OUTPUT include_gpu_onthefly/GPUReconstructionIncludesDeviceAll.h INPUT Base/GPUReconstructionIncludesDeviceAll.template.h ) +file(GENERATE + OUTPUT include_gpu_onthefly/GPUNoFastMathKernels.h + INPUT cmake/GPUNoFastMathKernels.template.h +) if(NOT ALIGPU_BUILD_TYPE STREQUAL "O2") include_directories(${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly) endif() diff --git a/GPU/GPUTracking/cmake/GPUNoFastMathKernels.template.h b/GPU/GPUTracking/cmake/GPUNoFastMathKernels.template.h new file mode 100644 index 0000000000000..dac93277d5ec9 --- /dev/null +++ b/GPU/GPUTracking/cmake/GPUNoFastMathKernels.template.h @@ -0,0 +1,23 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUNoFastMathKernels.h +/// \author David Rohr + +#include +#include + +namespace o2::gpu::internal +{ +// clang-format off +static const std::unordered_set noFastMathKernels = {$>,APPEND,">,PREPEND,">,$ >}; +// clang-format on +} // namespace o2::gpu::internal diff --git a/GPU/GPUTracking/cmake/kernel_helpers.cmake b/GPU/GPUTracking/cmake/kernel_helpers.cmake index e63b915640e8a..99699cc72e940 100644 --- a/GPU/GPUTracking/cmake/kernel_helpers.cmake +++ b/GPU/GPUTracking/cmake/kernel_helpers.cmake @@ -17,6 +17,7 @@ define_property(TARGET PROPERTY O2_GPU_KERNELS) define_property(TARGET PROPERTY O2_GPU_KERNEL_NAMES) define_property(TARGET PROPERTY O2_GPU_KERNEL_INCLUDES) define_property(TARGET PROPERTY O2_GPU_KERNEL_FILES) +define_property(TARGET PROPERTY O2_GPU_KERNEL_NO_FAST_MATH) set(O2_GPU_KERNEL_WRAPPER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/GPU/include_gpu_onthefly") file(MAKE_DIRECTORY ${O2_GPU_KERNEL_WRAPPER_FOLDER}) set(O2_GPU_BASE_DIR "${CMAKE_CURRENT_LIST_DIR}/../") @@ -144,24 +145,23 @@ function(o2_gpu_kernel_file_list list) endfunction() function(o2_gpu_kernel_set_deterministic) - if(NOT GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_GPU}) - list(LENGTH ARGV n) - math(EXPR n "${n} - 1") - foreach(i RANGE 0 ${n}) - if(CUDA_ENABLED AND (NOT DEFINED GPUCA_CUDA_COMPILE_MODE OR GPUCA_CUDA_COMPILE_MODE STREQUAL "perkernel")) - set_source_files_properties("${O2_GPU_KERNEL_WRAPPER_FOLDER}/krnl_${ARGV${i}}.cu" - TARGET_DIRECTORY O2::GPUTrackingCUDA - PROPERTIES - COMPILE_FLAGS "${GPUCA_CUDA_NO_FAST_MATH_FLAGS}" - COMPILE_DEFINITIONS "GPUCA_DETERMINISTIC_MODE") - endif() - if(HIP_ENABLED AND (NOT DEFINED GPUCA_HIP_COMPILE_MODE OR GPUCA_HIP_COMPILE_MODE STREQUAL "perkernel")) - set_source_files_properties("${O2_GPU_KERNEL_WRAPPER_FOLDER}/krnl_${ARGV${i}}.hip" - TARGET_DIRECTORY O2::GPUTrackingHIP - PROPERTIES - COMPILE_FLAGS "${GPUCA_CXX_NO_FAST_MATH_FLAGS}" - COMPILE_DEFINITIONS "GPUCA_DETERMINISTIC_MODE") - endif() - endforeach() - endif() + list(LENGTH ARGV n) + math(EXPR n "${n} - 1") + foreach(i RANGE 0 ${n}) + set_property(TARGET O2_GPU_KERNELS APPEND PROPERTY O2_GPU_KERNEL_NO_FAST_MATH "${ARGV${i}}") + if(CUDA_ENABLED AND (NOT DEFINED GPUCA_CUDA_COMPILE_MODE OR GPUCA_CUDA_COMPILE_MODE STREQUAL "perkernel")) + set_source_files_properties("${O2_GPU_KERNEL_WRAPPER_FOLDER}/krnl_${ARGV${i}}.cu" + TARGET_DIRECTORY O2::GPUTrackingCUDA + PROPERTIES + COMPILE_FLAGS "${GPUCA_CUDA_NO_FAST_MATH_FLAGS}" + COMPILE_DEFINITIONS "GPUCA_DETERMINISTIC_MODE") + endif() + if(HIP_ENABLED AND (NOT DEFINED GPUCA_HIP_COMPILE_MODE OR GPUCA_HIP_COMPILE_MODE STREQUAL "perkernel")) + set_source_files_properties("${O2_GPU_KERNEL_WRAPPER_FOLDER}/krnl_${ARGV${i}}.hip" + TARGET_DIRECTORY O2::GPUTrackingHIP + PROPERTIES + COMPILE_FLAGS "${GPUCA_CXX_NO_FAST_MATH_FLAGS}" + COMPILE_DEFINITIONS "GPUCA_DETERMINISTIC_MODE") + endif() + endforeach() endfunction()