From 7913bcd5d7c8ea6db837b320500becc8414dc74c Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 27 Mar 2025 09:28:41 +0100 Subject: [PATCH 1/2] GPU CMake: Cleanup and do not use Clang x86 denormals flag for device compilation --- GPU/GPUTracking/Base/opencl/CMakeLists.txt | 4 +- GPU/GPUTracking/CMakeLists.txt | 5 +- dependencies/FindO2GPU.cmake | 55 +++++++++++----------- 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/GPU/GPUTracking/Base/opencl/CMakeLists.txt b/GPU/GPUTracking/Base/opencl/CMakeLists.txt index 48f292a198b9c..381c9c050ca09 100644 --- a/GPU/GPUTracking/Base/opencl/CMakeLists.txt +++ b/GPU/GPUTracking/Base/opencl/CMakeLists.txt @@ -23,9 +23,9 @@ endif() set(CL_SRC ${GPUDIR}/Base/opencl/GPUReconstructionOCL.cl) set(CL_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionOCLCode) -set(OCL_FLAGS -Dcl_clang_storage_class_specifiers -cl-std=CLC++2021) +set(OCL_FLAGS -Dcl_clang_storage_class_specifiers -cl-std=CLC++2021 ${GPUCA_OCL_DENORMALS_FLAGS}) if(NOT GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) - set(OCL_FLAGS ${OCL_FLAGS} -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math) + set(OCL_FLAGS ${OCL_FLAGS} -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math) else() set(OCL_FLAGS ${OCL_FLAGS} ${GPUCA_OCL_NO_FAST_MATH_FLAGS}) endif() diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index ad7dd9c210cd1..39218e9f94527 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -17,11 +17,12 @@ set(MODULE GPUTracking) if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_OPTO2}) - set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O2 ${GPUCA_CLANG_FTZ}") + set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O2") endif() elseif(NOT CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG") - set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O3 -ffast-math ${GPUCA_CLANG_FTZ}") + set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O3 -ffast-math") endif() +set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_DENORMALS_FLAGS}") include(cmake/helpers.cmake) diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index 56b53e1be8879..d796d0b7145a8 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -85,17 +85,20 @@ elseif(NOT GPUCA_DETERMINISTIC_MODE MATCHES "^[0-9]+$") message(STATUS "Set to ${GPUCA_DETERMINISTIC_MODE}") endif() if (CMAKE_SYSTEM_NAME MATCHES Darwin OR NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") - set(GPUCA_CLANG_FTZ "") + set(GPUCA_CXX_DENORMALS_FLAGS "") else() - set(GPUCA_CLANG_FTZ "-mdaz-ftz") + set(GPUCA_CXX_DENORMALS_FLAGS "-mdaz-ftz") endif() -set(GPUCA_CXX_NO_FAST_MATH_FLAGS "-fno-fast-math -ffp-contract=off ${GPUCA_CLANG_FTZ}") -set(GPUCA_CUDA_NO_FAST_MATH_FLAGS "--ftz=true --prec-div=true --prec-sqrt=true --fmad false") -set(GPUCA_OCL_NO_FAST_MATH_FLAGS -cl-fp32-correctly-rounded-divide-sqrt -cl-denorms-are-zero) +set(GPUCA_CUDA_DENORMALS_FLAGS "--ftz=true") +set(GPUCA_OCL_DENORMALS_FLAGS "-cl-denorms-are-zero") +set(GPUCA_HIP_DENORMALS_FLAGS "-fgpu-flush-denormals-to-zero") +set(GPUCA_CXX_NO_FAST_MATH_FLAGS "-fno-fast-math -ffp-contract=off") +set(GPUCA_CUDA_NO_FAST_MATH_FLAGS "--prec-div=true --prec-sqrt=true --fmad false") +set(GPUCA_OCL_NO_FAST_MATH_FLAGS -cl-fp32-correctly-rounded-divide-sqrt ) if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_WHOLEO2}) add_definitions(-DGPUCA_DETERMINISTIC_MODE) - set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") - set(CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") + string(APPEND CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} " ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") + string(APPEND CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER} " ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") endif() @@ -143,7 +146,8 @@ if(ENABLE_CUDA) endif() endif() if(CMAKE_CUDA_COMPILER) - set(CMAKE_CUDA_FLAGS "-Xcompiler \"${O2_GPU_CMAKE_CXX_FLAGS_NOSTD}\" ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda -Xcompiler -Wno-attributes") + set(CMAKE_CUDA_FLAGS "-Xcompiler \"${O2_GPU_CMAKE_CXX_FLAGS_NOSTD}\" ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda -Xcompiler -Wno-attributes ${GPUCA_CUDA_DENORMALS_FLAGS}") + set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "-Xcompiler \"${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}\" ${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}") if(GPUCA_KERNEL_RESOURCE_USAGE_VERBOSE) string(APPEND CMAKE_CUDA_FLAGS " -Xptxas -v") endif() @@ -151,19 +155,18 @@ if(ENABLE_CUDA) if (NOT ENABLE_CUDA STREQUAL "AUTO") string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler") endif() - set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "-Xcompiler \"${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}\" ${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}") if(CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG") - set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -lineinfo -Xptxas -O0 -Xcompiler -O0") + string(APPEND CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} " -lineinfo -Xptxas -O0") else() - set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -Xptxas -O4 -Xcompiler -O4") + string(APPEND CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} " -Xptxas -O4 -Xcompiler -O4") endif() if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) - set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CUDA_NO_FAST_MATH_FLAGS}") + string(APPEND CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} " ${GPUCA_CUDA_NO_FAST_MATH_FLAGS}") elseif(NOT CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG") - set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -use_fast_math --ftz=true")# + string(APPEND CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} " -use_fast_math --ftz=true") endif() if(CMAKE_CXX_FLAGS MATCHES "(^| )-Werror( |$)") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Werror=cross-execution-space-call") + string(APPEND CMAKE_CUDA_FLAGS " -Werror=cross-execution-space-call") endif() if(GPUCA_CUDA_GCCBIN) list(FILTER CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES EXCLUDE REGEX "^/usr/lib.*/gcc/") # Workaround, since CMake adds old GCC lib paths implicitly if we request that gcc for CUDA @@ -285,27 +288,25 @@ if(ENABLE_HIP) set(HIP_ENABLED ON) set_target_properties(roc::rocthrust PROPERTIES IMPORTED_GLOBAL TRUE) message(STATUS "HIP Found (${hip_HIPCC_EXECUTABLE} version ${hip_VERSION})") - set(O2_HIP_CMAKE_CXX_FLAGS "-fgpu-defer-diag -mllvm -amdgpu-enable-lower-module-lds=false -mllvm -amdgpu-function-calls=true -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-invalid-constexpr -Wno-ignored-optimization-argument -Wno-unused-private-field -Wno-pass-failed") + set(CMAKE_HIP_FLAGS "${O2_GPU_CMAKE_CXX_FLAGS_NOSTD} ${CMAKE_HIP_FLAGS} ${GPUCA_HIP_DENORMALS_FLAGS}") + set(CMAKE_HIP_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${CMAKE_HIP_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}") + string(APPEND CMAKE_HIP_FLAGS " -fgpu-defer-diag -mllvm -amdgpu-enable-lower-module-lds=false -mllvm -amdgpu-function-calls=true -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-invalid-constexpr -Wno-ignored-optimization-argument -Wno-unused-private-field -Wno-pass-failed ") if(hip_VERSION VERSION_GREATER_EQUAL "6.0" AND NOT hip_VERSION VERSION_GREATER_EQUAL "6.2") - string(APPEND O2_HIP_CMAKE_CXX_FLAGS " -mllvm -amdgpu-legacy-sgpr-spill-lowering=true") # TODO: Cleanup + string(APPEND CMAKE_HIP_FLAGS " -mllvm -amdgpu-legacy-sgpr-spill-lowering=true") # TODO: Cleanup endif() if(GPUCA_KERNEL_RESOURCE_USAGE_VERBOSE) - string(APPEND O2_HIP_CMAKE_CXX_FLAGS " -Rpass-analysis=kernel-resource-usage") + string(APPEND CMAKE_HIP_FLAGS " -Rpass-analysis=kernel-resource-usage") + endif() + if(NOT GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) + string(APPEND CMAKE_HIP_FLAGS " -ffast-math") + endif() + if(NOT CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG") + string(APPEND CMAKE_HIP_FLAGS_${CMAKE_BUILD_TYPE_UPPER} " -O3") endif() string(REGEX REPLACE "(gfx1[0-9]+;?)" "" CMAKE_HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}") # ROCm currently doesn’t support integrated graphics if(HIP_AMDGPUTARGET) set(CMAKE_HIP_ARCHITECTURES "${HIP_AMDGPUTARGET}") # If GPU build is enforced we override autodetection endif() - if(NOT GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) - string(APPEND O2_HIP_CMAKE_CXX_FLAGS " -fgpu-flush-denormals-to-zero -ffast-math") - endif() - set(CMAKE_HIP_FLAGS "${O2_GPU_CMAKE_CXX_FLAGS_NOSTD} ${CMAKE_HIP_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS}") - set(CMAKE_HIP_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${CMAKE_HIP_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}") - if(CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG") - set(CMAKE_HIP_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_HIP_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O0 -ggdb") - else() - set(CMAKE_HIP_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_HIP_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O3") - endif() else() set(HIP_ENABLED OFF) endif() From 5459352aadd8063bcbbb414ecf9b610202300224 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 27 Mar 2025 09:29:05 +0100 Subject: [PATCH 2/2] GPU HIP CMake: Remove deprecated option --- dependencies/FindO2GPU.cmake | 3 --- 1 file changed, 3 deletions(-) diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index d796d0b7145a8..bbbb420354fae 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -291,9 +291,6 @@ if(ENABLE_HIP) set(CMAKE_HIP_FLAGS "${O2_GPU_CMAKE_CXX_FLAGS_NOSTD} ${CMAKE_HIP_FLAGS} ${GPUCA_HIP_DENORMALS_FLAGS}") set(CMAKE_HIP_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${CMAKE_HIP_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}") string(APPEND CMAKE_HIP_FLAGS " -fgpu-defer-diag -mllvm -amdgpu-enable-lower-module-lds=false -mllvm -amdgpu-function-calls=true -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-invalid-constexpr -Wno-ignored-optimization-argument -Wno-unused-private-field -Wno-pass-failed ") - if(hip_VERSION VERSION_GREATER_EQUAL "6.0" AND NOT hip_VERSION VERSION_GREATER_EQUAL "6.2") - string(APPEND CMAKE_HIP_FLAGS " -mllvm -amdgpu-legacy-sgpr-spill-lowering=true") # TODO: Cleanup - endif() if(GPUCA_KERNEL_RESOURCE_USAGE_VERBOSE) string(APPEND CMAKE_HIP_FLAGS " -Rpass-analysis=kernel-resource-usage") endif()