From 0ab24b0fbba6164847d1395014b7df9364828de5 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Sun, 23 Mar 2025 19:30:12 +0100 Subject: [PATCH 1/2] GPU CMake: Shuffle stuff a bit and move all NO_FAST_MATH stuff to GPU/... and FindO2GPU.cmake --- GPU/CMakeLists.txt | 1 - dependencies/FindO2GPU.cmake | 26 +++++++++++++++----------- dependencies/O2CompileFlags.cmake | 7 ------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt index 3c83c583eebfc..75cd5eddc0b24 100644 --- a/GPU/CMakeLists.txt +++ b/GPU/CMakeLists.txt @@ -16,7 +16,6 @@ if(NOT DEFINED GPUCA_NO_FAST_MATH) set(GPUCA_NO_FAST_MATH 0) endif() -set(GPUCA_CXX_NO_FAST_MATH_FLAGS "-fno-fast-math -ffp-contract=off") if(${GPUCA_NO_FAST_MATH}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") endif() diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index f8d41c032078f..69241ea30a375 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -64,9 +64,20 @@ function(set_target_hip_arch target) endif() endfunction() -# Detect and enable CUDA -STRING(REGEX REPLACE "\-std=[^ ]*" "" O2_GPU_CMAKE_CXX_FLAGS_NOSTD "${CMAKE_CXX_FLAGS}") # Need to strip c++17 imposed by alidist defaults +# Need to strip c++17 imposed by alidist defaults +STRING(REGEX REPLACE "\-std=[^ ]*" "" O2_GPU_CMAKE_CXX_FLAGS_NOSTD "${CMAKE_CXX_FLAGS}") +# ---------------------------------- Fast Math / Deterministic Mode ---------------------------------- +if(GPUCA_NO_FAST_MATH_WHOLEO2) + set(GPUCA_NO_FAST_MATH 1) + add_definitions(-DGPUCA_NO_FAST_MATH) + set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -fno-fast-math -ffp-contract=off") + set(CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -fno-fast-math -ffp-contract=off") +endif() +set(GPUCA_CXX_NO_FAST_MATH_FLAGS "-fno-fast-math -ffp-contract=off") +set(GPUCA_CUDA_NO_FAST_MATH_FLAGS "--ftz=false --prec-div=true --prec-sqrt=true --fmad false") + +# ---------------------------------- CUDA ---------------------------------- if(ENABLE_CUDA) set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD}) set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) @@ -124,7 +135,6 @@ if(ENABLE_CUDA) else() set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -Xptxas -O4 -Xcompiler -O4") endif() - set(GPUCA_CUDA_NO_FAST_MATH_FLAGS "--ftz=false --prec-div=true --prec-sqrt=true --fmad false") if(DEFINED GPUCA_NO_FAST_MATH AND "${GPUCA_NO_FAST_MATH}") set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CUDA_NO_FAST_MATH_FLAGS}") elseif(NOT CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG") @@ -146,7 +156,7 @@ if(ENABLE_CUDA) endif() endif() -# Detect and enable OpenCL 1.2 from AMD +# ---------------------------------- HIP ---------------------------------- if(ENABLE_OPENCL) find_package(OpenCL) if(ENABLE_OPENCL AND NOT ENABLE_OPENCL STREQUAL "AUTO") @@ -154,11 +164,6 @@ if(ENABLE_OPENCL) else() set_package_properties(OpenCL PROPERTIES TYPE OPTIONAL) endif() -endif() - -# Detect and enable OpenCL 2.x -if(ENABLE_OPENCL) - find_package(OpenCL) find_package(LLVM) if(LLVM_FOUND) find_package(Clang) @@ -196,7 +201,7 @@ if(ENABLE_OPENCL) endif() endif() -# Detect and enable HIP +# ---------------------------------- HIP ---------------------------------- if(ENABLE_HIP) if(NOT "$ENV{CMAKE_PREFIX_PATH}" MATCHES "rocm" AND NOT CMAKE_PREFIX_PATH MATCHES "rocm" AND EXISTS "/opt/rocm/lib/cmake/") list(APPEND CMAKE_PREFIX_PATH "/opt/rocm/lib/cmake") @@ -303,7 +308,6 @@ if(ENABLE_HIP) endif() message(FATAL_ERROR "HIP requested but some of the above packages are not found") endif() - endif() # if we end up here without a FATAL, it means we have found the "O2GPU" package diff --git a/dependencies/O2CompileFlags.cmake b/dependencies/O2CompileFlags.cmake index 08dd388cbdf36..eeddc189e8897 100644 --- a/dependencies/O2CompileFlags.cmake +++ b/dependencies/O2CompileFlags.cmake @@ -138,11 +138,4 @@ if(DEFINED ENV{O2_CXXFLAGS_OVERRIDE}) message(STATUS "Setting CXXFLAGS Override $ENV{O2_CXXFLAGS_OVERRIDE}") endif() -if(GPUCA_NO_FAST_MATH_WHOLEO2) - set(GPUCA_NO_FAST_MATH 1) - add_definitions(-DGPUCA_NO_FAST_MATH) - set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -fno-fast-math -ffp-contract=off") - set(CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -fno-fast-math -ffp-contract=off") -endif() - message(STATUS "Using build type: ${CMAKE_BUILD_TYPE} - CXXFLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}") From d8e2e88149468f43044ca66f2abe57900f7287bc Mon Sep 17 00:00:00 2001 From: David Rohr Date: Sun, 23 Mar 2025 20:56:42 +0100 Subject: [PATCH 2/2] GPU: Replace GPUCA_NO_FAST_MATH by more fine-grain GPUCA_DETERMINISTIC_MODE --- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 4 +-- GPU/CMakeLists.txt | 7 ----- GPU/Common/GPUCommonMath.h | 6 ++-- GPU/GPUTracking/Base/GPUReconstruction.cxx | 4 +-- GPU/GPUTracking/Base/cuda/CMakeLists.txt | 2 +- GPU/GPUTracking/Base/hip/CMakeLists.txt | 2 +- GPU/GPUTracking/Base/opencl/CMakeLists.txt | 4 +-- GPU/GPUTracking/CMakeLists.txt | 15 ++++++---- .../Definitions/GPUDefGPUParameters.h | 2 +- GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx | 24 +++++++-------- GPU/GPUTracking/Standalone/CMakeLists.txt | 10 +------ GPU/GPUTracking/Standalone/cmake/config.cmake | 16 +++++----- dependencies/FindO2GPU.cmake | 30 ++++++++++++++----- 13 files changed, 66 insertions(+), 60 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index ce93523319e99..a66eba7c3bacb 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -43,7 +43,7 @@ #define THRUST_NAMESPACE thrust::hip #endif -#ifdef GPUCA_NO_FAST_MATH +#ifdef GPUCA_DETERMINISTIC_MODE #define GPU_BLOCKS 1 #define GPU_THREADS 1 #else @@ -1452,4 +1452,4 @@ template void processNeighboursHandler<7>(const int startLayer, const o2::base::PropagatorF::MatCorrType matCorrType, const int nBlocks, const int nThreads); -} // namespace o2::its \ No newline at end of file +} // namespace o2::its diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt index 75cd5eddc0b24..7e2b797ae714f 100644 --- a/GPU/CMakeLists.txt +++ b/GPU/CMakeLists.txt @@ -13,13 +13,6 @@ # CMake, variables are defined for Sources / Headers first. Then, the actual # CMake build scripts use these variables. -if(NOT DEFINED GPUCA_NO_FAST_MATH) - set(GPUCA_NO_FAST_MATH 0) -endif() -if(${GPUCA_NO_FAST_MATH}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") -endif() - add_subdirectory(Common) add_subdirectory(Utils) add_subdirectory(TPCFastTransformation) diff --git a/GPU/Common/GPUCommonMath.h b/GPU/Common/GPUCommonMath.h index c412662fc0c64..b7a44c1df0f38 100644 --- a/GPU/Common/GPUCommonMath.h +++ b/GPU/Common/GPUCommonMath.h @@ -250,7 +250,7 @@ GPUdi() uint32_t GPUCommonMath::Float2UIntReint(const float& x) #endif } -#ifdef GPUCA_NO_FAST_MATH +#ifdef GPUCA_DETERMINISTIC_MODE GPUdi() constexpr float GPUCommonMath::Round(float x) { return GPUCA_CHOICE(roundf(x), roundf(x), round(x)); } GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(float x) { return (int32_t)Round(x); } GPUhdi() constexpr float GPUCommonMath::Sqrt(float x) { return GPUCA_CHOICE(sqrtf(x), (float)sqrt((double)x), sqrt(x)); } @@ -286,7 +286,7 @@ GPUdi() constexpr bool GPUCommonMath::IsNaN(float x) { return false; } GPUhdi() void GPUCommonMath::SinCos(float x, float& s, float& c) { -#if defined(GPUCA_NO_FAST_MATH) && !defined(__OPENCL__) +#if defined(GPUCA_DETERMINISTIC_MODE) && !defined(__OPENCL__) s = sin((double)x); c = cos((double)x); #elif !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__) @@ -392,7 +392,7 @@ GPUdi() T GPUCommonMath::MaxWithRef(T x, T y, T z, T w, S refX, S refY, S refZ, GPUdi() float GPUCommonMath::InvSqrt(float _x) { -#if defined(GPUCA_NO_FAST_MATH) || defined(__OPENCL__) +#if defined(GPUCA_DETERMINISTIC_MODE) || defined(__OPENCL__) return 1.f / Sqrt(_x); #elif defined(__CUDACC__) || defined(__HIPCC__) return __frsqrt_rn(_x); diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index 8bae1df267412..5582084fd0e17 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -261,8 +261,8 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() mProcessingSettings.deterministicGPUReconstruction = mProcessingSettings.debugLevel >= 6; } if (mProcessingSettings.deterministicGPUReconstruction) { -#ifndef GPUCA_NO_FAST_MATH - GPUError("Warning, deterministicGPUReconstruction needs GPUCA_NO_FAST_MATH for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!"); +#ifndef GPUCA_DETERMINISTIC_MODE + GPUError("Warning, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!"); #endif mProcessingSettings.overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU; param().rec.tpc.nWaysOuter = true; diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index 5b2e53179e50c..f8203c2dc5858 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -172,7 +172,7 @@ elseif(GPUCA_CUDA_COMPILE_MODE STREQUAL "perkernel") TARGET_DIRECTORY ${targetName} PROPERTIES COMPILE_FLAGS "${GPUCA_CUDA_NO_FAST_MATH_FLAGS}" - COMPILE_DEFINITIONS "GPUCA_NO_FAST_MATH") + COMPILE_DEFINITIONS "GPUCA_DETERMINISTIC_MODE") elseif(GPUCA_CUDA_COMPILE_MODE STREQUAL "rdc") message(FATAL_ERROR "CUDA RDC compilation of GPUReconstruction ios not yet working!") target_compile_definitions(${targetName} PRIVATE GPUCA_KERNEL_COMPILE_MODE=2) diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index 21a641c0cc7c0..fee43eb6d8b0d 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -230,7 +230,7 @@ elseif(GPUCA_HIP_COMPILE_MODE STREQUAL "perkernel") TARGET_DIRECTORY ${targetName} PROPERTIES COMPILE_FLAGS "${GPUCA_CXX_NO_FAST_MATH_FLAGS}" - COMPILE_DEFINITIONS "GPUCA_NO_FAST_MATH") + COMPILE_DEFINITIONS "GPUCA_DETERMINISTIC_MODE") elseif(GPUCA_HIP_COMPILE_MODE STREQUAL "rdc") message(FATAL_ERROR "HIP RDC compilation of GPUReconstruction ios not yet working!") target_compile_definitions(${targetName} PRIVATE GPUCA_KERNEL_COMPILE_MODE=2) diff --git a/GPU/GPUTracking/Base/opencl/CMakeLists.txt b/GPU/GPUTracking/Base/opencl/CMakeLists.txt index 89d2f386f768f..3da5b77f80d86 100644 --- a/GPU/GPUTracking/Base/opencl/CMakeLists.txt +++ b/GPU/GPUTracking/Base/opencl/CMakeLists.txt @@ -24,10 +24,10 @@ set(CL_SRC ${GPUDIR}/Base/opencl/GPUReconstructionOCL.cl) set(CL_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionOCLCode) set(OCL_FLAGS -Dcl_clang_storage_class_specifiers -cl-std=CLC++2021) -if(NOT DEFINED GPUCA_NO_FAST_MATH OR NOT ${GPUCA_NO_FAST_MATH}) +if(NOT GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) set(OCL_FLAGS ${OCL_FLAGS} -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math) else() -set(OCL_FLAGS ${OCL_FLAGS} -cl-fp32-correctly-rounded-divide-sqrt) + set(OCL_FLAGS ${OCL_FLAGS} -cl-fp32-correctly-rounded-divide-sqrt) endif() set(OCL_DEFINECL "-D$,$-D>" "-I$,EXCLUDE,^/usr/include/?>,$-I>" diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index e69e11c91d157..dedfcf5953394 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -14,8 +14,13 @@ set(MODULE GPUTracking) # set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O0") # to uncomment if needed, tired of typing this... # set(GPUCA_BUILD_DEBUG 1) -if(NOT "${GPUCA_NO_FAST_MATH}" AND NOT CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") +if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) + set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") + if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_OPTO2}) + set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O2") + endif() +elseif(NOT CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG") + set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O3 -ffast-math") endif() include(cmake/helpers.cmake) @@ -375,7 +380,7 @@ set_source_files_properties(DataCompression/GPUTPCCompressionTrackModel.cxx TARGET_DIRECTORY ${targetName} PROPERTIES COMPILE_FLAGS "${GPUCA_CXX_NO_FAST_MATH_FLAGS}" - COMPILE_DEFINITIONS "GPUCA_NO_FAST_MATH") + COMPILE_DEFINITIONS "GPUCA_DETERMINISTIC_MODE") # GPUReconstructionLibrary needs to know which GPU backends are enabled for proper error messages configure_file(Base/GPUReconstructionAvailableBackends.template.h ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionAvailableBackends.h) @@ -417,6 +422,6 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR ALIGPU_BUILD_TYPE STREQUAL "Standalone") endif() endif() -if(${GPUCA_NO_FAST_MATH}) - target_compile_definitions(${targetName} PUBLIC GPUCA_NO_FAST_MATH) +if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_GPU}) + target_compile_definitions(${targetName} PUBLIC GPUCA_DETERMINISTIC_MODE) endif() diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h index 55f2e76344bd5..6d6645850408f 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -566,7 +566,7 @@ #ifndef GPUCA_MERGER_INTERPOLATION_ERROR_TYPE #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE float #endif -#ifdef GPUCA_NO_FAST_MATH +#ifdef GPUCA_DETERMINISTIC_MODE #undef GPUCA_MERGER_INTERPOLATION_ERROR_TYPE #define GPUCA_MERGER_INTERPOLATION_ERROR_TYPE float #undef GPUCA_DEDX_STORAGE_TYPE diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx index eb1df3f37b6b5..5ede29d7fd851 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx @@ -723,13 +723,13 @@ GPUd() void GPUTPCGMMerger::MergeBorderTracks<1>(int32_t nBlocks, int32_t nThrea if (iThread == 0) { if (iBlock == 0) { -#ifdef GPUCA_NO_FAST_MATH // TODO: Use a better define as swith +#ifdef GPUCA_DETERMINISTIC_MODE GPUCommonAlgorithm::sortDeviceDynamic(range1, range1 + N1, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return (a.fMin != b.fMin) ? (a.fMin < b.fMin) : (a.fId < b.fId); }); #else GPUCommonAlgorithm::sortDeviceDynamic(range1, range1 + N1, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return a.fMin < b.fMin; }); #endif } else if (iBlock == 1) { -#ifdef GPUCA_NO_FAST_MATH // TODO: Use a better define as swith +#ifdef GPUCA_DETERMINISTIC_MODE GPUCommonAlgorithm::sortDeviceDynamic(range2, range2 + N2, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return (a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId); }); #else GPUCommonAlgorithm::sortDeviceDynamic(range2, range2 + N2, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return a.fMax < b.fMax; }); @@ -749,7 +749,7 @@ namespace // anonymous struct MergeBorderTracks_compMax { GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { -#ifdef GPUCA_NO_FAST_MATH // TODO: Use a better define as swith +#ifdef GPUCA_DETERMINISTIC_MODE return (a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId); #else return a.fMax < b.fMax; @@ -759,7 +759,7 @@ struct MergeBorderTracks_compMax { struct MergeBorderTracks_compMin { GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { -#ifdef GPUCA_NO_FAST_MATH // TODO: Use a better define as swith +#ifdef GPUCA_DETERMINISTIC_MODE return (a.fMin != b.fMin) ? (a.fMin < b.fMin) : (a.fId < b.fId); #else return a.fMin < b.fMin; @@ -906,7 +906,7 @@ GPUd() void GPUTPCGMMerger::MergeBorderTracks<2>(int32_t nBlocks, int32_t nThrea mTrackLinks[b1.TrackID()] = iBest2; if (mergeMode > 0) { -#if defined(GPUCA_NO_FAST_MATH) // TODO: Use a better define as swith +#ifdef GPUCA_DETERMINISTIC_MODE CAMath::AtomicMax(&mTrackLinks[iBest2], b1.TrackID()); #else mTrackLinks[iBest2] = b1.TrackID(); @@ -1469,7 +1469,7 @@ struct GPUTPCGMMerger_CompareClusterIdsLooper { if (a1.row != b1.row) { return ((a1.row > b1.row) ^ ((a.leg - leg) & 1) ^ outwards); } -#ifdef GPUCA_NO_FAST_MATH // TODO: Use a better define as swith +#ifdef GPUCA_DETERMINISTIC_MODE if (a1.id != b1.id) { return (a1.id > b1.id); } @@ -1490,7 +1490,7 @@ struct GPUTPCGMMerger_CompareClusterIds { if (a.row != b.row) { return (a.row > b.row); } -#ifdef GPUCA_NO_FAST_MATH // TODO: Use a better define as swith +#ifdef GPUCA_DETERMINISTIC_MODE if (a.id != b.id) { return (a.id > b.id); } @@ -1569,7 +1569,7 @@ GPUd() void GPUTPCGMMerger::CollectMergedTracks(int32_t nBlocks, int32_t nThread // unpack and sort clusters if (nParts > 1 && leg == 0) { GPUCommonAlgorithm::sort(trackParts, trackParts + nParts, [](const GPUTPCGMSectorTrack* a, const GPUTPCGMSectorTrack* b) { -#ifdef GPUCA_NO_FAST_MATH // TODO: Use a better define as swith +#ifdef GPUCA_DETERMINISTIC_MODE if (a->X() != b->X()) { return (a->X() > b->X()); } @@ -1834,7 +1834,7 @@ struct GPUTPCGMMergerSortTracks_comp { if (a.Legs() != b.Legs()) { return a.Legs() > b.Legs(); } -#ifdef GPUCA_NO_FAST_MATH // TODO: Use a better define as swith +#ifdef GPUCA_DETERMINISTIC_MODE if (a.NClusters() != b.NClusters()) { return a.NClusters() > b.NClusters(); } @@ -1858,7 +1858,7 @@ struct GPUTPCGMMergerSortTracksQPt_comp { { const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; -#ifdef GPUCA_NO_FAST_MATH // TODO: Use a better define as swith +#ifdef GPUCA_DETERMINISTIC_MODE if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); } @@ -1907,7 +1907,7 @@ GPUd() void GPUTPCGMMerger::SortTracks(int32_t nBlocks, int32_t nThreads, int32_ if (a.Legs() != b.Legs()) { return a.Legs() > b.Legs(); } -#ifdef GPUCA_NO_FAST_MATH // TODO: Use a better define as swith +#ifdef GPUCA_DETERMINISTIC_MODE if (a.NClusters() != b.NClusters()) { return a.NClusters() > b.NClusters(); } @@ -1937,7 +1937,7 @@ GPUd() void GPUTPCGMMerger::SortTracksQPt(int32_t nBlocks, int32_t nThreads, int auto comp = [cmp = mOutputTracks](const int32_t aa, const int32_t bb) { const GPUTPCGMMergedTrack& GPUrestrict() a = cmp[aa]; const GPUTPCGMMergedTrack& GPUrestrict() b = cmp[bb]; -#ifdef GPUCA_NO_FAST_MATH // TODO: Use a better define as swith +#ifdef GPUCA_DETERMINISTIC_MODE if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); } diff --git a/GPU/GPUTracking/Standalone/CMakeLists.txt b/GPU/GPUTracking/Standalone/CMakeLists.txt index 1f48b4fc5ade1..ed4fc5c9f7e2d 100644 --- a/GPU/GPUTracking/Standalone/CMakeLists.txt +++ b/GPU/GPUTracking/Standalone/CMakeLists.txt @@ -52,15 +52,7 @@ if(GPUCA_BUILD_DEBUG) set(CMAKE_CXX_FLAGS "-O0 -ggdb") set(CMAKE_BUILD_TYPE DEBUG) else() - set(CMAKE_CXX_FLAGS "-O3 -march=native -ggdb -minline-all-stringops -funroll-loops -fno-stack-protector") - if(DEFINED GPUCA_NO_FAST_MATH AND ${GPUCA_NO_FAST_MATH}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-fast-math -ffp-contract=off") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") - endif() - if (NOT CMAKE_CXX_COMPILER STREQUAL "clang++") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftracer -fprefetch-loop-arrays") - endif() + set(CMAKE_CXX_FLAGS "-O3 -march=native -ggdb") set(CMAKE_BUILD_TYPE RELEASE) add_definitions(-DNDEBUG) endif() diff --git a/GPU/GPUTracking/Standalone/cmake/config.cmake b/GPU/GPUTracking/Standalone/cmake/config.cmake index 87716d700abc8..af7c96bb96fbb 100644 --- a/GPU/GPUTracking/Standalone/cmake/config.cmake +++ b/GPU/GPUTracking/Standalone/cmake/config.cmake @@ -27,13 +27,13 @@ set(GPUCA_CONFIG_GL3W 0) set(GPUCA_CONFIG_O2 1) set(GPUCA_BUILD_DEBUG 0) set(GPUCA_BUILD_DEBUG_SANITIZE 0) -set(GPUCA_NO_FAST_MATH 0) -#set(GPUCA_CUDA_GCCBIN c++-13) -#set(GPUCA_OPENCL_CLANGBIN clang-18) -#set(HIP_AMDGPUTARGET "gfx906;gfx908;gfx90a") -set(HIP_AMDGPUTARGET "default") -#set(CUDA_COMPUTETARGET 86 89) -set(CUDA_COMPUTETARGET "default") -#set(GPUCA_CUDA_COMPILE_MODE perkernel) +set(GPUCA_DETERMINISTIC_MODE 0) # OFF / NO_FAST_MATH / OPTO2 / GPU / WHOLEO2 +#set(GPUCA_CUDA_GCCBIN c++-14) +#set(GPUCA_OPENCL_CLANGBIN clang-19) +set(HIP_AMDGPUTARGET "default") # "gfx906;gfx908;gfx90a" +set(CUDA_COMPUTETARGET "default") # 86 89 +#set(GPUCA_CUDA_COMPILE_MODE perkernel) # onefile / perkernel / rtc #set(GPUCA_HIP_COMPILE_MODE perkernel) #set(GPUCA_KERNEL_RESOURCE_USAGE_VERBOSE 1) +#set(GPUCA_CONFIG_COMPILER gcc) # gcc / clang +#add_definitions(-DGPUCA_GPU_DEBUG_PRINT) diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index 69241ea30a375..650a269209d9b 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -68,14 +68,30 @@ endfunction() STRING(REGEX REPLACE "\-std=[^ ]*" "" O2_GPU_CMAKE_CXX_FLAGS_NOSTD "${CMAKE_CXX_FLAGS}") # ---------------------------------- Fast Math / Deterministic Mode ---------------------------------- -if(GPUCA_NO_FAST_MATH_WHOLEO2) - set(GPUCA_NO_FAST_MATH 1) - add_definitions(-DGPUCA_NO_FAST_MATH) - set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -fno-fast-math -ffp-contract=off") - set(CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -fno-fast-math -ffp-contract=off") +# set(GPUCA_DETERMINISTIC_MODE WHOLEO2) # Override +set(GPUCA_DETERMINISTIC_MODE_MAP_OFF 0) +set(GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH 1) # No -ffast-math and similar compile flags for GPU folder +set(GPUCA_DETERMINISTIC_MODE_MAP_OPTO2 2) # In addition, -O2 optimization on host for GPU folder +set(GPUCA_DETERMINISTIC_MODE_MAP_GPU 3) # In addition, GPUCA_DETERMINISTIC_MODE define for GPU folder +set(GPUCA_DETERMINISTIC_MODE_MAP_ON 3) # Synonym for GPU +set(GPUCA_DETERMINISTIC_MODE_MAP_WHOLEO2 4) # As GPU but for whole O2 code +if(NOT DEFINED GPUCA_DETERMINISTIC_MODE) + set(GPUCA_DETERMINISTIC_MODE 0) +elseif(NOT GPUCA_DETERMINISTIC_MODE MATCHES "^[0-9]+$") + if(NOT DEFINED GPUCA_DETERMINISTIC_MODE_MAP_${GPUCA_DETERMINISTIC_MODE}) + message(FATAL_ERROR "Invalid setting ${GPUCA_DETERMINISTIC_MODE} for GPUCA_DETERMINISTIC_MODE") + endif() + set(GPUCA_DETERMINISTIC_MODE ${GPUCA_DETERMINISTIC_MODE_MAP_${GPUCA_DETERMINISTIC_MODE}}) + message(STATUS "Set to ${GPUCA_DETERMINISTIC_MODE}") endif() set(GPUCA_CXX_NO_FAST_MATH_FLAGS "-fno-fast-math -ffp-contract=off") set(GPUCA_CUDA_NO_FAST_MATH_FLAGS "--ftz=false --prec-div=true --prec-sqrt=true --fmad false") +if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_WHOLEO2}) + add_definitions(-DGPUCA_DETERMINISTIC_MODE) + set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") + set(CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") +endif() + # ---------------------------------- CUDA ---------------------------------- if(ENABLE_CUDA) @@ -135,7 +151,7 @@ if(ENABLE_CUDA) else() set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -Xptxas -O4 -Xcompiler -O4") endif() - if(DEFINED GPUCA_NO_FAST_MATH AND "${GPUCA_NO_FAST_MATH}") + if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CUDA_NO_FAST_MATH_FLAGS}") elseif(NOT CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG") set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -use_fast_math --ftz=true")# @@ -274,7 +290,7 @@ if(ENABLE_HIP) if(HIP_AMDGPUTARGET) set(CMAKE_HIP_ARCHITECTURES "${HIP_AMDGPUTARGET}") # If GPU build is enforced we override autodetection endif() - if(NOT DEFINED GPUCA_NO_FAST_MATH OR NOT ${GPUCA_NO_FAST_MATH}) + if(NOT GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) string(APPEND O2_HIP_CMAKE_CXX_FLAGS " -fgpu-flush-denormals-to-zero -ffast-math") endif() set(CMAKE_HIP_FLAGS "${O2_GPU_CMAKE_CXX_FLAGS_NOSTD} ${CMAKE_HIP_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS}")