From 14da7cd0d93fce264e0a1c22c6a3bc0f45bd68dc Mon Sep 17 00:00:00 2001 From: David Rohr Date: Mon, 12 May 2025 11:43:12 +0200 Subject: [PATCH 1/4] GPU: Add GPUCA_RTC_CONSTEXPR macro for constexpr only in RTC --- GPU/Common/GPUCommonDef.h | 4 ++++ GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx | 1 + GPU/GPUTracking/Base/cuda/GPUReconstructionCUDArtc.cu | 1 + 3 files changed, 6 insertions(+) diff --git a/GPU/Common/GPUCommonDef.h b/GPU/Common/GPUCommonDef.h index d7e99f53d4ce8..d9a5bdf92b6ac 100644 --- a/GPU/Common/GPUCommonDef.h +++ b/GPU/Common/GPUCommonDef.h @@ -72,6 +72,10 @@ #define GPUCA_RTC_SPECIAL_CODE(...) #endif +#ifndef GPUCA_RTC_CONSTEXPR + #define GPUCA_RTC_CONSTEXPR +#endif + #ifndef GPUCA_DETERMINISTIC_CODE #ifdef GPUCA_DETERMINISTIC_MODE #define GPUCA_DETERMINISTIC_CODE(det, indet) det // In deterministic mode, take deterministic code path diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index 5706f32e73e96..acc77648d954b 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -38,6 +38,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) { std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") + std::string(GetProcessingSettings().rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") + + std::string(GetProcessingSettings().rtc.optConstexpr ? "#define GPUCA_RTC_CONSTEXPR constexpr\n" : "#define GPUCA_RTC_CONSTEXPR\n") + GPUParamRTC::generateRTCCode(param(), GetProcessingSettings().rtc.optConstexpr); if (filename == "") { filename = "/tmp/o2cagpu_rtc_"; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDArtc.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDArtc.cu index 805397c9b430e..66c02d6ed251c 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDArtc.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDArtc.cu @@ -18,6 +18,7 @@ // Keep some preprocessor calls unprocessed #define GPUCA_RTC_SPECIAL_CODE(...) GPUCA_RTC_SPECIAL_CODE(__VA_ARGS__) #define GPUCA_DETERMINISTIC_CODE(...) GPUCA_DETERMINISTIC_CODE(__VA_ARGS__) +#define GPUCA_RTC_CONSTEXPR GPUCA_RTC_CONSTEXPR // GPUReconstructionCUDAIncludesSystem.h prependended by CMakewithout preprocessor running #include "GPUReconstructionCUDADef.h" From b270fd5c6026808a1afc35a0d10a4a93e6b3c022 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Mon, 12 May 2025 11:43:40 +0200 Subject: [PATCH 2/4] GPU TPC: Compute alternative dEdx only if it has different settings than normal dEdx --- GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx | 4 +- GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx | 17 +++-- GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx | 70 +++++++++++-------- 3 files changed, 55 insertions(+), 36 deletions(-) diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx index 6e7de7ee48ca6..d2aba503be6a6 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx @@ -301,7 +301,9 @@ void* GPUTPCGMMerger::SetPointersOutput(void* mem) computePointerWithAlignment(mem, mOutputTracks, mNMaxTracks); if (mRec->GetParam().dodEdxEnabled) { computePointerWithAlignment(mem, mOutputTracksdEdx, mNMaxTracks); - computePointerWithAlignment(mem, mOutputTracksdEdxAlt, mNMaxTracks); + if (mRec->GetParam().rec.tpc.dEdxClusterRejectionFlagMask != mRec->GetParam().rec.tpc.dEdxClusterRejectionFlagMaskAlt) { + computePointerWithAlignment(mem, mOutputTracksdEdxAlt, mNMaxTracks); + } } computePointerWithAlignment(mem, mClusters, mNMaxOutputTrackClusters); if (mRec->GetParam().par.earlyTpcTransform) { diff --git a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx index 9dc6ddc59c2b4..9ead17ea5c7c0 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx @@ -106,6 +106,7 @@ GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks const uint32_t flagsRequired = getFlagsRequired(merger.Param().rec); TrackTPC* outputTracks = merger.OutputTracksTPCO2(); uint32_t* clusRefs = merger.OutputClusRefsTPCO2(); + const auto& param = merger.Param(); GPUTPCGMMerger::tmpSort* GPUrestrict() trackSort = merger.TrackSortO2(); uint2* GPUrestrict() tmpData = merger.ClusRefTmp(); @@ -130,9 +131,15 @@ GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks oTrack.setChi2(tracks[i].GetParam().GetChi2()); auto& outerPar = tracks[i].OuterParam(); - if (merger.Param().par.dodEdx && merger.Param().dodEdxEnabled) { - oTrack.setdEdx(tracksdEdx[i]); - oTrack.setdEdxAlt(tracksdEdxAlt[i]); + if GPUCA_RTC_CONSTEXPR (param.par.dodEdx) { + if (param.dodEdxEnabled) { + oTrack.setdEdx(tracksdEdx[i]); + if GPUCA_RTC_CONSTEXPR (param.rec.tpc.dEdxClusterRejectionFlagMask != param.rec.tpc.dEdxClusterRejectionFlagMaskAlt) { + oTrack.setdEdxAlt(tracksdEdxAlt[i]); + } else { + oTrack.setdEdxAlt(tracksdEdx[i]); + } + } } auto snpOut = outerPar.P[2]; @@ -148,9 +155,9 @@ GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks outerPar.C[6], outerPar.C[7], outerPar.C[8], outerPar.C[9], outerPar.C[10], outerPar.C[11], outerPar.C[12], outerPar.C[13], outerPar.C[14]})); - if (merger.Param().par.dodEdx && merger.Param().dodEdxEnabled && merger.Param().rec.tpc.enablePID) { + if (param.par.dodEdx && param.dodEdxEnabled && param.rec.tpc.enablePID) { PIDResponse pidResponse{}; - auto pid = pidResponse.getMostProbablePID(oTrack, merger.Param().rec.tpc.PID_EKrangeMin, merger.Param().rec.tpc.PID_EKrangeMax, merger.Param().rec.tpc.PID_EPrangeMin, merger.Param().rec.tpc.PID_EPrangeMax, merger.Param().rec.tpc.PID_EDrangeMin, merger.Param().rec.tpc.PID_EDrangeMax, merger.Param().rec.tpc.PID_ETrangeMin, merger.Param().rec.tpc.PID_ETrangeMax, merger.Param().rec.tpc.PID_useNsigma, merger.Param().rec.tpc.PID_sigma); + auto pid = pidResponse.getMostProbablePID(oTrack, param.rec.tpc.PID_EKrangeMin, param.rec.tpc.PID_EKrangeMax, param.rec.tpc.PID_EPrangeMin, param.rec.tpc.PID_EPrangeMax, param.rec.tpc.PID_EDrangeMin, param.rec.tpc.PID_EDrangeMax, param.rec.tpc.PID_ETrangeMin, merger.Param().rec.tpc.PID_ETrangeMax, merger.Param().rec.tpc.PID_useNsigma, merger.Param().rec.tpc.PID_sigma); auto pidRemap = merger.Param().rec.tpc.PID_remap[pid]; if (pidRemap >= 0) { pid = pidRemap; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx index f5bfbe985fb8c..0d8547263207b 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx @@ -216,11 +216,15 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_ continue; } } else if (allowModification && lastRow != 255 && CAMath::Abs(cluster.row - lastRow) > 1) { - bool dodEdx = param.par.dodEdx && param.dodEdxEnabled && param.rec.tpc.adddEdxSubThresholdClusters && iWay == nWays - 1 && CAMath::Abs(cluster.row - lastRow) == 2 && cluster.leg == clusters[maxN - 1].leg; - dodEdx = AttachClustersPropagate(merger, cluster.sector, lastRow, cluster.row, iTrk, cluster.leg == clusters[maxN - 1].leg, prop, inFlyDirection, GPUCA_MAX_SIN_PHI, dodEdx); - if (dodEdx) { - dEdx.fillSubThreshold(lastRow - wayDirection); - dEdxAlt.fillSubThreshold(lastRow - wayDirection); + if GPUCA_RTC_CONSTEXPR (param.par.dodEdx) { + bool dodEdx = param.dodEdxEnabled && param.rec.tpc.adddEdxSubThresholdClusters && iWay == nWays - 1 && CAMath::Abs(cluster.row - lastRow) == 2 && cluster.leg == clusters[maxN - 1].leg; + dodEdx = AttachClustersPropagate(merger, cluster.sector, lastRow, cluster.row, iTrk, cluster.leg == clusters[maxN - 1].leg, prop, inFlyDirection, GPUCA_MAX_SIN_PHI, dodEdx); + if (dodEdx) { + dEdx.fillSubThreshold(lastRow - wayDirection); + if GPUCA_RTC_CONSTEXPR (param.rec.tpc.dEdxClusterRejectionFlagMask != param.rec.tpc.dEdxClusterRejectionFlagMaskAlt) { + dEdxAlt.fillSubThreshold(lastRow - wayDirection); + } + } } } @@ -367,31 +371,35 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_ CADEBUG(printf("Reinit linearization\n")); prop.SetTrack(this, prop.GetAlpha()); } - if (param.par.dodEdx && param.dodEdxEnabled && iWay == nWays - 1 && cluster.leg == clusters[maxN - 1].leg) { // TODO: Costimize flag to remove, and option to remove double-clusters - bool acc = (clusterState & param.rec.tpc.dEdxClusterRejectionFlagMask) == 0, accAlt = (clusterState & param.rec.tpc.dEdxClusterRejectionFlagMaskAlt) == 0; - if (acc || accAlt) { - float qtot = 0, qmax = 0, pad = 0, relTime = 0; - const int32_t clusterCount = (ihit - ihitMergeFirst) * wayDirection + 1; - for (int32_t iTmp = ihitMergeFirst; iTmp != ihit + wayDirection; iTmp += wayDirection) { - if (merger->GetConstantMem()->ioPtrs.clustersNative == nullptr) { - qtot += clustersXYZ[ihit].amp; - } else { - const ClusterNative& cl = merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num]; - qtot += cl.qTot; - qmax = CAMath::Max(qmax, cl.qMax); - pad += cl.getPad(); - relTime += cl.getTime(); + if GPUCA_RTC_CONSTEXPR (param.par.dodEdx) { + if (param.dodEdxEnabled && iWay == nWays - 1 && cluster.leg == clusters[maxN - 1].leg) { // TODO: Costimize flag to remove, and option to remove double-clusters + bool acc = (clusterState & param.rec.tpc.dEdxClusterRejectionFlagMask) == 0, accAlt = (clusterState & param.rec.tpc.dEdxClusterRejectionFlagMaskAlt) == 0; + if (acc || accAlt) { + float qtot = 0, qmax = 0, pad = 0, relTime = 0; + const int32_t clusterCount = (ihit - ihitMergeFirst) * wayDirection + 1; + for (int32_t iTmp = ihitMergeFirst; iTmp != ihit + wayDirection; iTmp += wayDirection) { + if (merger->GetConstantMem()->ioPtrs.clustersNative == nullptr) { + qtot += clustersXYZ[ihit].amp; + } else { + const ClusterNative& cl = merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num]; + qtot += cl.qTot; + qmax = CAMath::Max(qmax, cl.qMax); + pad += cl.getPad(); + relTime += cl.getTime(); + } + } + qtot /= clusterCount; // TODO: Weighted Average + pad /= clusterCount; + relTime /= clusterCount; + relTime = relTime - CAMath::Round(relTime); + if (acc) { + dEdx.fillCluster(qtot, qmax, cluster.row, cluster.sector, mP[2], mP[3], merger->GetConstantMem()->calibObjects, zz, pad, relTime); + } + if GPUCA_RTC_CONSTEXPR (param.rec.tpc.dEdxClusterRejectionFlagMask != param.rec.tpc.dEdxClusterRejectionFlagMaskAlt) { + if (accAlt) { + dEdxAlt.fillCluster(qtot, qmax, cluster.row, cluster.sector, mP[2], mP[3], merger->GetConstantMem()->calibObjects, zz, pad, relTime); + } } - } - qtot /= clusterCount; // TODO: Weighted Average - pad /= clusterCount; - relTime /= clusterCount; - relTime = relTime - CAMath::Round(relTime); - if (acc) { - dEdx.fillCluster(qtot, qmax, cluster.row, cluster.sector, mP[2], mP[3], merger->GetConstantMem()->calibObjects, zz, pad, relTime); - } - if (accAlt) { - dEdxAlt.fillCluster(qtot, qmax, cluster.row, cluster.sector, mP[2], mP[3], merger->GetConstantMem()->calibObjects, zz, pad, relTime); } } } @@ -428,7 +436,9 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_ if (param.par.dodEdx && param.dodEdxEnabled) { dEdx.computedEdx(merger->OutputTracksdEdx()[iTrk], param); - dEdxAlt.computedEdx(merger->OutputTracksdEdxAlt()[iTrk], param); + if GPUCA_RTC_CONSTEXPR (param.rec.tpc.dEdxClusterRejectionFlagMask != param.rec.tpc.dEdxClusterRejectionFlagMaskAlt) { + dEdxAlt.computedEdx(merger->OutputTracksdEdxAlt()[iTrk], param); + } } Alpha = prop.GetAlpha(); MoveToReference(prop, param, Alpha); From 7b05244438d99b4d20d32905e0a66dbc72c7fe8b Mon Sep 17 00:00:00 2001 From: David Rohr Date: Mon, 12 May 2025 21:19:41 +0200 Subject: [PATCH 3/4] GPU CMake: If deterministic mode is set to MaxOptO2, do not impose -O2 when BUILD_TYPE is DEBUG --- GPU/GPUTracking/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 2e26622d05291..52848692e7516 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -16,11 +16,12 @@ set(MODULE GPUTracking) if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") +elseif(NOT CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG") if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_OPTO2}) set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O2") + else() + set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O3 -ffast-math") endif() -elseif(NOT CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG") - set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O3 -ffast-math") endif() set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_DENORMALS_FLAGS}") From da5edf700afdc76c1c7bd169ec67bcaa4eb7096d Mon Sep 17 00:00:00 2001 From: David Rohr Date: Mon, 12 May 2025 21:22:52 +0200 Subject: [PATCH 4/4] GPU: Workaround for Clang Frontend issue This is fixed with Clang >= 20 and C++23 (P2280R4) --- GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx | 4 ++++ GPU/GPUTracking/Base/hip/CMakeLists.txt | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index acc77648d954b..67ad608c13417 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -38,7 +38,11 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) { std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") + std::string(GetProcessingSettings().rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") + +#ifndef GPUCA_HIP_WORKAROUND_CONSTEXPR // TODO: Fixme, once we have C++ P2280R4 in Clang std::string(GetProcessingSettings().rtc.optConstexpr ? "#define GPUCA_RTC_CONSTEXPR constexpr\n" : "#define GPUCA_RTC_CONSTEXPR\n") + +#else + std::string("#define GPUCA_RTC_CONSTEXPR\n") + +#endif GPUParamRTC::generateRTCCode(param(), GetProcessingSettings().rtc.optConstexpr); if (filename == "") { filename = "/tmp/o2cagpu_rtc_"; diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index 6eded3499e46e..c89ef1769ad81 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -270,3 +270,8 @@ add_dependencies(GPUTrackingHIPExternalProvider O2::GPUTracking) # must not depe if(NOT DEFINED GPUCA_HIP_HIPIFY_FROM_CUDA OR "${GPUCA_HIP_HIPIFY_FROM_CUDA}") add_dependencies(GPUTrackingHIPExternalProvider ${MODULE}_HIPIFIED) endif() + +set_source_files_properties("${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPGenRTC.cxx" +TARGET_DIRECTORY O2::GPUTrackingHIP +PROPERTIES +COMPILE_DEFINITIONS "GPUCA_HIP_WORKAROUND_CONSTEXPR")