Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,21 @@
#define GPUCA_M_KRNL_NAME(...) GPUCA_M_KRNL_NAME_A(GPUCA_M_STRIP(__VA_ARGS__))

#if defined(GPUCA_GPUCODE) || defined(GPUCA_GPUCODE_HOSTONLY)

#if defined(__HIPCC__) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS)
static_assert(GPUCA_PAR_AMD_EUS_PER_CU > 0);
#define GPUCA_MIN_WARPS_PER_EU(maxThreadsPerBlock, minBlocksPerCU) GPUCA_CEIL_INT_DIV((minBlocksPerCU) * (maxThreadsPerBlock), (GPUCA_WARP_SIZE * GPUCA_PAR_AMD_EUS_PER_CU))

#define GPUCA_LB_ARGS_1(maxThreadsPerBlock) maxThreadsPerBlock
#define GPUCA_LB_ARGS_2(maxThreadsPerBlock, minBlocksPerCU) maxThreadsPerBlock, GPUCA_MIN_WARPS_PER_EU(maxThreadsPerBlock, minBlocksPerCU)

#define GPUCA_LAUNCH_BOUNDS_SELECT(n, ...) GPUCA_M_CAT(GPUCA_LB_ARGS_, n)(__VA_ARGS__)
#define GPUCA_LAUNCH_BOUNDS_DISP(...) GPUCA_LAUNCH_BOUNDS_SELECT(GPUCA_M_COUNT(__VA_ARGS__), __VA_ARGS__)
#define GPUCA_KRNL_REG_DEFAULT(args) __launch_bounds__(GPUCA_LAUNCH_BOUNDS_DISP(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))))
#elif !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS)
#define GPUCA_KRNL_REG_DEFAULT(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
#endif

#ifndef GPUCA_KRNL_REG
#define GPUCA_KRNL_REG(...)
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
}
fclose(fp);
}
const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true) +
const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true, mParDevice->par_AMD_EUS_PER_CU ? (mParDevice->par_AMD_EUS_PER_CU * mWarpSize) : 0) +
"#define GPUCA_WARP_SIZE " + std::to_string(mWarpSize) + "\n";
if (GetProcessingSettings().rtctech.printLaunchBounds || GetProcessingSettings().debugLevel >= 3) {
GPUInfo("RTC Launch Bounds:\n%s", launchBounds.c_str());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ inline void GPUReconstructionCUDA::runKernelBackend(const krnlSetupTime& _xyz, c
}

#undef GPUCA_KRNL_REG
#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
#define GPUCA_KRNL_REG(...) GPUCA_KRNL_REG_DEFAULT(__VA_ARGS__)

// clang-format off
#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE != 1 // ---------- COMPILE_MODE = perkernel ----------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#define GPUCA_GPUCODE_HOSTONLY
#define GPUCA_GPUCODE_NO_LAUNCH_BOUNDS

#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_STRIP(args))

#include "GPUReconstructionCUDAIncludesSystem.h"
#include "GPUReconstructionCUDADef.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

#define GPUCA_GPUCODE_COMPILEKERNELS
#include "GPUReconstructionCUDAIncludesSystem.h"
#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
#define GPUCA_KRNL_REG(...) GPUCA_KRNL_REG_DEFAULT(__VA_ARGS__)
#define GPUCA_KRNL(...) GPUCA_KRNLGPU(__VA_ARGS__);
#include "GPUReconstructionCUDADef.h"
#include "GPUReconstructionKernelMacros.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

#define GPUCA_GPUCODE_COMPILEKERNELS
#include "GPUReconstructionHIPIncludesSystem.h"
#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
#define GPUCA_KRNL_REG(...) GPUCA_KRNL_REG_DEFAULT(__VA_ARGS__)
#define GPUCA_KRNL(...) GPUCA_KRNLGPU(__VA_ARGS__);
#include "GPUReconstructionHIPDef.h"
#include "GPUReconstructionKernelMacros.h"
Expand Down
2 changes: 2 additions & 0 deletions GPU/GPUTracking/Definitions/GPUDefMacros.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,7 @@
#define GPUCA_UNROLL(...)
#endif

#define GPUCA_CEIL_INT_DIV(a, b) (((a) + (b) - 1) / (b))

#endif
// clang-format on
93 changes: 51 additions & 42 deletions GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,23 @@
// GPU-architecture-dependent default settings
#if defined(GPUCA_GPUTYPE_MI2xx)
#define GPUCA_WARP_SIZE 64
#define GPUCA_PAR_AMD_EUS_PER_CU 4
#define GPUCA_THREAD_COUNT_DEFAULT 256
#define GPUCA_LB_GPUTPCCreateTrackingData 256
#define GPUCA_LB_GPUTPCStartHitsSorter 512, 1
#define GPUCA_LB_GPUTPCStartHitsFinder 1024
#define GPUCA_LB_GPUTPCTrackletConstructor 512, 2
#define GPUCA_LB_GPUTPCTrackletSelector 192, 3
#define GPUCA_LB_GPUTPCNeighboursFinder 1024, 1
#define GPUCA_LB_GPUTPCNeighboursCleaner 896
#define GPUCA_LB_GPUTPCExtrapolationTracking 256
#define GPUCA_LB_GPUTPCCreateTrackingData 256, 7
#define GPUCA_LB_GPUTPCStartHitsSorter 1024, 5
#define GPUCA_LB_GPUTPCStartHitsFinder 1024, 2
#define GPUCA_LB_GPUTPCTrackletConstructor 768, 8
#define GPUCA_LB_GPUTPCTrackletSelector 384, 5
#define GPUCA_LB_GPUTPCNeighboursFinder 192, 8
#define GPUCA_LB_GPUTPCNeighboursCleaner 128, 5
#define GPUCA_LB_GPUTPCExtrapolationTracking 256, 7
#define GPUCA_LB_GPUTPCCFDecodeZS 64, 4
#define GPUCA_LB_GPUTPCCFDecodeZSLink GPUCA_WARP_SIZE
#define GPUCA_LB_GPUTPCCFDecodeZSDenseLink GPUCA_WARP_SIZE
#define GPUCA_LB_GPUTPCCFGather 1024, 1
#define GPUCA_LB_GPUTPCGMMergerTrackFit 128, 1
#define GPUCA_LB_GPUTPCGMMergerFollowLoopers 64, 12
#define GPUCA_LB_GPUTPCGMMergerSectorRefit 256
#define GPUCA_LB_GPUTPCCFDecodeZSDenseLink GPUCA_WARP_SIZE, 4
#define GPUCA_LB_GPUTPCCFGather 1024, 5
#define GPUCA_LB_GPUTPCGMMergerTrackFit 192, 2
#define GPUCA_LB_GPUTPCGMMergerFollowLoopers 256, 5
#define GPUCA_LB_GPUTPCGMMergerSectorRefit 64, 4
#define GPUCA_LB_GPUTPCGMMergerUnpackResetIds 256
#define GPUCA_LB_GPUTPCGMMergerUnpackGlobal 256
#define GPUCA_LB_GPUTPCGMMergerResolve_step0 512
Expand All @@ -55,29 +56,31 @@
#define GPUCA_LB_GPUTPCGMMergerMergeBorders_step2 512
#define GPUCA_LB_GPUTPCGMMergerMergeCE 512
#define GPUCA_LB_GPUTPCGMMergerLinkExtrapolatedTracks 256
#define GPUCA_LB_GPUTPCGMMergerCollect 512
#define GPUCA_LB_GPUTPCGMMergerCollect 768, 1
#define GPUCA_LB_GPUTPCGMMergerSortTracksPrepare 256
#define GPUCA_LB_GPUTPCGMMergerPrepareForFit_step0 256
#define GPUCA_LB_GPUTPCGMMergerPrepareForFit_step1 256
#define GPUCA_LB_GPUTPCGMMergerPrepareForFit_step2 256
#define GPUCA_LB_GPUTPCGMMergerFinalize_0 256
#define GPUCA_LB_GPUTPCGMMergerFinalize_1 256
#define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
#define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2
#define GPUCA_LB_GPUTPCCompressionKernels_step0attached 128, 1
#define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
#define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 128, 2
#define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 64, 2
#define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
#define GPUCA_LB_GPUTPCCFCheckPadBaseline 64, 10
#define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
#define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512
#define GPUCA_LB_GPUTPCCFChargeMapFiller_findFragmentStart 512
#define GPUCA_LB_GPUTPCCFPeakFinder 512
#define GPUCA_LB_GPUTPCCFPeakFinder 512, 9
#define GPUCA_LB_GPUTPCCFNoiseSuppression 512
#define GPUCA_LB_GPUTPCCFDeconvolution 512
#define GPUCA_LB_GPUTPCCFClusterizer 448
#define GPUCA_LB_GPUTPCCFDeconvolution 512, 5
#define GPUCA_LB_GPUTPCCFClusterizer 448, 3
#define GPUCA_LB_COMPRESSION_GATHER 1024
#define GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
#define GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE 20
#define GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP 10
#define PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL 4
#define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED 0
#define GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE 9
#define GPUCA_PAR_ALTERNATE_BORDER_SORT 1
#define GPUCA_PAR_SORT_BEFORE_FIT 1
#define GPUCA_PAR_NO_ATOMIC_PRECHECK 1
Expand All @@ -87,22 +90,23 @@
#define GPUCA_PAR_COMP_GATHER_MODE 3
#elif defined(GPUCA_GPUTYPE_VEGA)
#define GPUCA_WARP_SIZE 64
#define GPUCA_PAR_AMD_EUS_PER_CU 4
#define GPUCA_THREAD_COUNT_DEFAULT 256
#define GPUCA_LB_GPUTPCCreateTrackingData 128
#define GPUCA_LB_GPUTPCStartHitsSorter 1024, 2
#define GPUCA_LB_GPUTPCStartHitsFinder 1024
#define GPUCA_LB_GPUTPCTrackletConstructor 256, 2
#define GPUCA_LB_GPUTPCTrackletSelector 256, 8
#define GPUCA_LB_GPUTPCNeighboursFinder 1024, 1
#define GPUCA_LB_GPUTPCNeighboursCleaner 896
#define GPUCA_LB_GPUTPCExtrapolationTracking 256
#define GPUCA_LB_GPUTPCCFDecodeZS 64, 4
#define GPUCA_LB_GPUTPCCreateTrackingData 192, 2
#define GPUCA_LB_GPUTPCStartHitsSorter 512, 7
#define GPUCA_LB_GPUTPCStartHitsFinder 1024, 7
#define GPUCA_LB_GPUTPCTrackletConstructor 512, 10
#define GPUCA_LB_GPUTPCTrackletSelector 192, 10
#define GPUCA_LB_GPUTPCNeighboursFinder 960, 8
#define GPUCA_LB_GPUTPCNeighboursCleaner 384, 9
#define GPUCA_LB_GPUTPCExtrapolationTracking 256, 2
#define GPUCA_LB_GPUTPCCFDecodeZS 64, 1
#define GPUCA_LB_GPUTPCCFDecodeZSLink GPUCA_WARP_SIZE
#define GPUCA_LB_GPUTPCCFDecodeZSDenseLink GPUCA_WARP_SIZE
#define GPUCA_LB_GPUTPCCFDecodeZSDenseLink GPUCA_WARP_SIZE, 14
#define GPUCA_LB_GPUTPCCFGather 1024, 1
#define GPUCA_LB_GPUTPCGMMergerTrackFit 64, 1
#define GPUCA_LB_GPUTPCGMMergerFollowLoopers 256, 4, 200
#define GPUCA_LB_GPUTPCGMMergerSectorRefit 256
#define GPUCA_LB_GPUTPCGMMergerTrackFit 64, 7
#define GPUCA_LB_GPUTPCGMMergerFollowLoopers 256, 5
#define GPUCA_LB_GPUTPCGMMergerSectorRefit 256, 2
#define GPUCA_LB_GPUTPCGMMergerUnpackResetIds 256
#define GPUCA_LB_GPUTPCGMMergerUnpackGlobal 256
#define GPUCA_LB_GPUTPCGMMergerResolve_step0 256
Expand All @@ -117,29 +121,31 @@
#define GPUCA_LB_GPUTPCGMMergerMergeBorders_step2 256
#define GPUCA_LB_GPUTPCGMMergerMergeCE 256
#define GPUCA_LB_GPUTPCGMMergerLinkExtrapolatedTracks 256
#define GPUCA_LB_GPUTPCGMMergerCollect 512
#define GPUCA_LB_GPUTPCGMMergerCollect 1024, 1
#define GPUCA_LB_GPUTPCGMMergerSortTracksPrepare 256
#define GPUCA_LB_GPUTPCGMMergerPrepareForFit_step0 256
#define GPUCA_LB_GPUTPCGMMergerPrepareForFit_step1 256
#define GPUCA_LB_GPUTPCGMMergerPrepareForFit_step2 256
#define GPUCA_LB_GPUTPCGMMergerFinalize_0 256
#define GPUCA_LB_GPUTPCGMMergerFinalize_1 256
#define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
#define GPUCA_LB_GPUTPCCompressionKernels_step0attached 192, 2
#define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2
#define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
#define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 128, 2
#define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 64, 2
#define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
#define GPUCA_LB_GPUTPCCFCheckPadBaseline 64, 2
#define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
#define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512
#define GPUCA_LB_GPUTPCCFChargeMapFiller_findFragmentStart 512
#define GPUCA_LB_GPUTPCCFPeakFinder 512
#define GPUCA_LB_GPUTPCCFPeakFinder 512, 4
#define GPUCA_LB_GPUTPCCFNoiseSuppression 512
#define GPUCA_LB_GPUTPCCFDeconvolution 512
#define GPUCA_LB_GPUTPCCFClusterizer 512
#define GPUCA_LB_GPUTPCCFDeconvolution 512, 5
#define GPUCA_LB_GPUTPCCFClusterizer 512, 2
#define GPUCA_LB_COMPRESSION_GATHER 1024
#define GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
#define GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE 20
#define GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
#define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL 2
#define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED 0
#define GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE 27
#define GPUCA_PAR_ALTERNATE_BORDER_SORT 1
#define GPUCA_PAR_SORT_BEFORE_FIT 1
#define GPUCA_PAR_NO_ATOMIC_PRECHECK 1
Expand Down Expand Up @@ -272,6 +278,9 @@
#ifndef GPUCA_WARP_SIZE
#define GPUCA_WARP_SIZE 32
#endif
#ifndef GPUCA_PAR_AMD_EUS_PER_CU
#define GPUCA_PAR_AMD_EUS_PER_CU 0
#endif
#ifndef GPUCA_THREAD_COUNT_DEFAULT
#define GPUCA_THREAD_COUNT_DEFAULT 256
#endif
Expand Down
24 changes: 12 additions & 12 deletions GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,23 @@ static GPUDefParameters GPUDefParametersLoad()
};
}

#define GPUCA_EXPORT_KERNEL_LB(name) \
if (par.par_LB_maxThreads[i] > 0) { \
o << "#define GPUCA_LB_" GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \
if (par.par_LB_minBlocks[i] > 0) { \
o << ", " << par.par_LB_minBlocks[i]; \
} \
if (!forRTC && par.par_LB_forceBlocks[i] > 0) { \
o << ", " << par.par_LB_forceBlocks[i]; \
} \
o << "\n"; \
} \
#define GPUCA_EXPORT_KERNEL_LB(name) \
if (par.par_LB_maxThreads[i] > 0) { \
o << "#define GPUCA_LB_" GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \
if (par.par_LB_minBlocks[i] > 0) { \
o << ", " << GPUCA_CEIL_INT_DIV(par.par_LB_maxThreads[i] * par.par_LB_minBlocks[i], (minBlockFactor ? minBlockFactor : par.par_LB_maxThreads[i])); \
} \
if (!forRTC && par.par_LB_forceBlocks[i] > 0) { \
o << ", " << par.par_LB_forceBlocks[i]; \
} \
o << "\n"; \
} \
i++;

#define GPUCA_EXPORT_KERNEL_PARAM(name) \
o << "#define GPUCA_PAR_" GPUCA_M_STR(name) " " << GPUCA_M_CAT(par.par_, name) << "\n";

static std::string GPUDefParametersExport(const GPUDefParameters& par, bool forRTC)
static std::string GPUDefParametersExport(const GPUDefParameters& par, bool forRTC, int32_t minBlockFactor = 0)
{
std::stringstream o; // clang-format off
int32_t i = 0;
Expand Down
3 changes: 2 additions & 1 deletion GPU/GPUTracking/kernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP # Number of neighhbo
COMP_GATHER_KERNEL # Default kernel to use for Compression Gather Operation [0 - 4]
COMP_GATHER_MODE # TPC Compression Gather Mode [0 - 3]
SORT_STARTHITS # Sort start hits to improve cache locality during tracklet construction [0/1]
CF_SCAN_WORKGROUP_SIZE) # Work group size to use in clusterizer scan operation
CF_SCAN_WORKGROUP_SIZE # Work group size to use in clusterizer scan operation
AMD_EUS_PER_CU) # Number of SIMD units per Compute Unit (only for AMD GPUs)

o2_gpu_kernel_add_string_parameter(DEDX_STORAGE_TYPE # Data type to use for intermediate storage of dEdx truncated mean inputs
MERGER_INTERPOLATION_ERROR_TYPE) # Data type for storing intermediate track residuals for interpolation