diff --git a/GPU/GPUTracking/Base/GPUProcessor.h b/GPU/GPUTracking/Base/GPUProcessor.h index 2e0e0a003f87d..df551c9f0330d 100644 --- a/GPU/GPUTracking/Base/GPUProcessor.h +++ b/GPU/GPUTracking/Base/GPUProcessor.h @@ -63,7 +63,7 @@ class GPUProcessor } template - static inline size_t getAlignmentMod(size_t addr) + static constexpr inline size_t getAlignmentMod(size_t addr) { static_assert((alignment & (alignment - 1)) == 0, "Invalid alignment, not power of 2"); if (alignment <= 1) { @@ -72,7 +72,7 @@ class GPUProcessor return addr & (alignment - 1); } template - static inline size_t getAlignment(size_t addr) + static constexpr inline size_t getAlignment(size_t addr) { size_t mod = getAlignmentMod(addr); if (mod == 0) { @@ -81,10 +81,22 @@ class GPUProcessor return (alignment - mod); } template - static inline size_t nextMultipleOf(size_t size) + static constexpr inline size_t nextMultipleOf(size_t size) { return size + getAlignment(size); } + static constexpr inline size_t nextMultipleOf(size_t size, size_t alignment) + { + if (alignment & (alignment - 1)) { + size_t tmp = size % alignment; + if (tmp) { + size += alignment - tmp; + } + return size; + } else { + return (size + alignment - 1) & ~(alignment - 1); + } + } template static inline void* alignPointer(void* ptr) { diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersConstants.h b/GPU/GPUTracking/Definitions/GPUDefParametersConstants.h index dd4a5dcbe7ba8..78036e47fc49d 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersConstants.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersConstants.h @@ -18,8 +18,6 @@ #define GPUDEFPARAMETERSCONSTANTS_H // clang-format off -#define GPUCA_THREAD_COUNT_SCAN 512 // TODO: WARNING!!! Must not be GPUTYPE-dependent right now! // TODO: Fix! - #if defined(__CUDACC__) || defined(__HIPCC__) #define GPUCA_SPECIALIZE_THRUST_SORTS // Not compiled with RTC, so must be compile-time constant #endif diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h index cdc5efd56ddfd..7879789bf91c8 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h @@ -22,7 +22,6 @@ // GPU Run Configuration #if defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS) // Avoid including for RTC generation besides normal include protection. - #define GPUCA_LB_SCAN 512 // GPU-architecture-dependent default settings #if defined(GPUCA_GPUTYPE_MI2xx) #define GPUCA_WARP_SIZE 64 @@ -499,11 +498,11 @@ #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizerKernels - #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_LB_SCAN - #define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_LB_SCAN - #define GPUCA_LB_GPUTPCCFStreamCompaction_scanTop GPUCA_LB_SCAN - #define GPUCA_LB_GPUTPCCFStreamCompaction_scanDown GPUCA_LB_SCAN - #define GPUCA_LB_GPUTPCCFStreamCompaction_compactDigits GPUCA_LB_SCAN + #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE + #define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE + #define GPUCA_LB_GPUTPCCFStreamCompaction_scanTop GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE + #define GPUCA_LB_GPUTPCCFStreamCompaction_scanDown GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE + #define GPUCA_LB_GPUTPCCFStreamCompaction_compactDigits GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE #define GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered GPUCA_LB_COMPRESSION_GATHER #define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered32 GPUCA_LB_COMPRESSION_GATHER #define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered64 GPUCA_LB_COMPRESSION_GATHER @@ -541,6 +540,9 @@ #ifndef GPUCA_PAR_COMP_GATHER_MODE #define GPUCA_PAR_COMP_GATHER_MODE 2 #endif + #ifndef GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE + #define GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE 512 + #endif #endif // defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS) #ifndef GPUCA_GPUCODE_GENRTC @@ -578,6 +580,9 @@ #ifndef GPUCA_PAR_NO_ATOMIC_PRECHECK #define GPUCA_PAR_NO_ATOMIC_PRECHECK 0 #endif + #ifndef GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE + #define GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE 0 + #endif #ifndef GPUCA_PAR_DEDX_STORAGE_TYPE #define GPUCA_PAR_DEDX_STORAGE_TYPE float #endif diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 630c2200e5900..f188388e76a02 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -23,6 +23,7 @@ #include "CfChargePos.h" #include "CfArray2D.h" #include "GPUGeneralKernels.h" +#include "GPUDefParametersRuntime.h" #include "GPUTPCCFStreamCompaction.h" #include "GPUTPCCFChargeMapFiller.h" #include "GPUTPCCFDecodeZS.h" @@ -402,27 +403,28 @@ void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clust exit(1); } + int32_t scanWorkgroupSize = mRec->getGPUParameters(doGPU).par_CF_SCAN_WORKGROUP_SIZE; size_t tmpCount = count; if (nSteps > 1) { for (uint32_t i = 1; i < nSteps; i++) { counts.push_back(tmpCount); if (i == 1) { - runKernel({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, stage); + runKernel({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, i, stage); } else { - runKernel({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, tmpCount); + runKernel({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, i, tmpCount); } - tmpCount = (tmpCount + clusterer.mScanWorkGroupSize - 1) / clusterer.mScanWorkGroupSize; + tmpCount = (tmpCount + scanWorkgroupSize - 1) / scanWorkgroupSize; } - runKernel({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, nSteps, tmpCount); + runKernel({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, nSteps, tmpCount); for (uint32_t i = nSteps - 1; i > 1; i--) { tmpCount = counts[i - 1]; - runKernel({GetGrid(tmpCount - clusterer.mScanWorkGroupSize, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, clusterer.mScanWorkGroupSize, tmpCount); + runKernel({GetGrid(tmpCount - scanWorkgroupSize, scanWorkgroupSize, lane), {iSector}}, i, scanWorkgroupSize, tmpCount); } } - runKernel({GetGrid(count, clusterer.mScanWorkGroupSize, lane), {iSector}}, 1, stage, in, out); + runKernel({GetGrid(count, scanWorkgroupSize, lane), {iSector}}, 1, stage, in, out); } else { auto& nOut = stage ? clusterer.mPmemory->counters.nClusters : clusterer.mPmemory->counters.nPeaks; auto& nIn = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx index 1da5a1158a8c2..d43e96b19c5d0 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx @@ -24,6 +24,7 @@ using namespace o2::gpu::tpccf; template <> GPUdii() void GPUTPCCFStreamCompaction::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t stage) { +#ifdef GPUCA_GPUCODE int32_t nElems = CompactionElems(clusterer, stage); const auto* predicate = clusterer.mPisPeak; @@ -35,17 +36,19 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread(smem, pred); + int32_t nElemsInBlock = CfUtils::blockPredicateSum(smem, pred); int32_t lastThread = nThreads - 1; if (iThread == lastThread) { scanOffset[iBlock] = nElemsInBlock; } +#endif } template <> GPUdii() void GPUTPCCFStreamCompaction::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t nElems) { +#ifdef GPUCA_GPUCODE auto* scanOffset = clusterer.GetScanBuffer(iBuf - 1); auto* scanOffsetNext = clusterer.GetScanBuffer(iBuf); @@ -59,11 +62,13 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread if (iThread == lastThread) { scanOffsetNext[iBlock] = offsetInBlock; } +#endif } template <> GPUdii() void GPUTPCCFStreamCompaction::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t nElems) { +#ifdef GPUCA_GPUCODE int32_t iThreadGlobal = get_global_id(0); int32_t* scanOffset = clusterer.GetScanBuffer(iBuf - 1); @@ -74,11 +79,13 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread GPUdii() void GPUTPCCFStreamCompaction::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& /*smem*/, processorType& clusterer, int32_t iBuf, uint32_t offset, int32_t nElems) { +#ifdef GPUCA_GPUCODE int32_t iThreadGlobal = get_global_id(0) + offset; int32_t* scanOffsetPrev = clusterer.GetScanBuffer(iBuf - 1); @@ -89,11 +96,13 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread GPUdii() void GPUTPCCFStreamCompaction::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t stage, CfChargePos* in, CfChargePos* out) { +#ifdef GPUCA_GPUCODE uint32_t nElems = CompactionElems(clusterer, stage); SizeT bufferSize = (stage) ? clusterer.mNMaxClusters : clusterer.mNMaxPeaks; @@ -105,7 +114,7 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread= nElems); int32_t pred = (iAmDummy) ? 0 : predicate[iThreadGlobal]; - int32_t offsetInBlock = CfUtils::blockPredicateScan(smem, pred); + int32_t offsetInBlock = CfUtils::blockPredicateScan(smem, pred); SizeT globalOffsetOut = offsetInBlock; if (iBlock > 0) { @@ -129,6 +138,7 @@ GPUdii() void GPUTPCCFStreamCompaction::Threadcounters.nPeaks = nFinal; } } +#endif } GPUdii() int32_t GPUTPCCFStreamCompaction::CompactionElems(processorType& clusterer, int32_t stage) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h index a72907fe55e89..a5ea8b24e9522 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h @@ -35,14 +35,14 @@ class GPUTPCCFStreamCompaction : public GPUKernelTemplate compactDigits = 4, }; - struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64 { - }; #if defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS) - static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanStart)); - static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanUp)); - static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanTop)); - static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanDown)); - static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_compactDigits)); + struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64 { + }; + static_assert(GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanStart)); + static_assert(GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanUp)); + static_assert(GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanTop)); + static_assert(GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanDown)); + static_assert(GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_compactDigits)); #endif typedef GPUTPCClusterFinder processorType; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx index 051391f12cc6d..541edaa689c6c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx @@ -17,6 +17,7 @@ #include "GPUMemorySizeScalers.h" #include "GPUHostDataTypes.h" #include "GPUSettings.h" +#include "GPUDefParametersRuntime.h" #include "DataFormatsTPC/ClusterNative.h" #include "DataFormatsTPC/ZeroSuppression.h" @@ -90,9 +91,10 @@ void* GPUTPCClusterFinder::SetPointersScratch(void* mem) computePointerWithAlignment(mem, mPisPeak, mNMaxDigitsFragment); computePointerWithAlignment(mem, mPchargeMap, TPCMapMemoryLayout::items(mRec->GetProcessingSettings().overrideClusterizerFragmentLen)); computePointerWithAlignment(mem, mPpeakMap, TPCMapMemoryLayout::items(mRec->GetProcessingSettings().overrideClusterizerFragmentLen)); - computePointerWithAlignment(mem, mPbuf, mBufSize * mNBufs); computePointerWithAlignment(mem, mPclusterByRow, GPUCA_ROW_COUNT * mNMaxClusterPerRow); - + if ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding)) { + computePointerWithAlignment(mem, mPscanBuf, mBufSize * mNBufs); + } return mem; } @@ -129,14 +131,15 @@ void GPUTPCClusterFinder::SetMaxData(const GPUTrackingInOutPointers& io) if (mRec->GetProcessingSettings().tpcIncreasedMinClustersPerRow) { mNMaxClusterPerRow = std::max(mNMaxClusterPerRow, mRec->GetProcessingSettings().tpcIncreasedMinClustersPerRow); } - - mBufSize = nextMultipleOf(GPUCA_MEMALIGN, mScanWorkGroupSize)>(mNMaxDigitsFragment); - mNBufs = getNSteps(mBufSize); + if ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding)) { + mBufSize = nextMultipleOf(mNMaxDigitsFragment, std::max(GPUCA_MEMALIGN, mRec->getGPUParameters(mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding).par_CF_SCAN_WORKGROUP_SIZE)); + mNBufs = getNSteps(mBufSize); + } } void GPUTPCClusterFinder::SetNMaxDigits(size_t nDigits, size_t nPages, size_t nDigitsFragment, size_t nDigitsEndpointMax) { - mNMaxDigits = nextMultipleOf(GPUCA_MEMALIGN, mScanWorkGroupSize)>(nDigits); + mNMaxDigits = nextMultipleOf(nDigits, std::max(GPUCA_MEMALIGN, mRec->getGPUParameters(mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding).par_CF_SCAN_WORKGROUP_SIZE)); mNMaxPages = nPages; mNMaxDigitsFragment = nDigitsFragment; mNMaxDigitsEndpoint = nDigitsEndpointMax; @@ -148,9 +151,10 @@ uint32_t GPUTPCClusterFinder::getNSteps(size_t items) const return 0; } uint32_t c = 1; - size_t capacity = mScanWorkGroupSize; + const size_t scanWorkgroupSize = mRec->getGPUParameters(mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding).par_CF_SCAN_WORKGROUP_SIZE; + size_t capacity = scanWorkgroupSize; while (items > capacity) { - capacity *= mScanWorkGroupSize; + capacity *= scanWorkgroupSize; c++; } return c; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 96efe08be6dc6..37399f5e4863f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -111,10 +111,10 @@ class GPUTPCClusterFinder : public GPUProcessor uint32_t* mPclusterInRow = nullptr; tpc::ClusterNative* mPclusterByRow = nullptr; GPUTPCClusterMCInterimArray* mPlabelsByRow = nullptr; - int32_t* mPbuf = nullptr; + int32_t* mPscanBuf = nullptr; Memory* mPmemory = nullptr; - GPUdi() int32_t* GetScanBuffer(int32_t iBuf) const { return mPbuf + iBuf * mBufSize; } + GPUdi() int32_t* GetScanBuffer(int32_t iBuf) const { return mPscanBuf + iBuf * mBufSize; } o2::dataformats::ConstMCTruthContainerView const* mPinputLabels = nullptr; uint32_t* mPlabelsInRow = nullptr; @@ -122,7 +122,6 @@ class GPUTPCClusterFinder : public GPUProcessor uint32_t mPlabelsDataGlobalOffset = 0; int32_t mISector = 0; - constexpr static int32_t mScanWorkGroupSize = GPUCA_THREAD_COUNT_SCAN; uint32_t mNMaxClusterPerRow = 0; uint32_t mNMaxClusters = 0; uint32_t mNMaxPages = 0; diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 937a92fef33df..08d879fbb8e9a 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -146,7 +146,8 @@ o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP NO_ATOMIC_PRECHECK COMP_GATHER_KERNEL COMP_GATHER_MODE - SORT_STARTHITS) + SORT_STARTHITS + CF_SCAN_WORKGROUP_SIZE) o2_gpu_kernel_add_string_parameter(DEDX_STORAGE_TYPE MERGER_INTERPOLATION_ERROR_TYPE)