From eb86cbdbb84a7aa038a14bea38f5613d25bd3434 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Wed, 12 Jun 2024 12:15:42 +0200 Subject: [PATCH 01/53] =?UTF-8?q?Renamed=20BlockSizes=20=E2=86=92=20Thread?= =?UTF-8?q?BlockSizes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gridtools/fn/backend/gpu.hpp | 57 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index 5f1500b70..416621827 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -24,23 +24,23 @@ namespace gridtools::fn::backend { namespace gpu_impl_ { /* - * BlockSizes must be a meta map, mapping dimensions to integral constant block sizes. + * ThreadBlockSizes must be a meta map, mapping dimensions to integral constant block sizes. * * For example, meta::list>, * meta::list>, * meta::list>>; * When using a cartesian grid. */ - template + template struct gpu { - using block_sizes_t = BlockSizes; + using thread_block_sizes_t = ThreadBlockSizes; cudaStream_t stream = 0; }; template using block_size_at_dim = meta::second>>; - template + template GT_FUNCTION_DEVICE auto global_thread_index() { using all_keys_t = get_keys; using ndims_t = meta::length; @@ -48,19 +48,19 @@ namespace gridtools::fn::backend { if constexpr (ndims_t::value == 0) { return hymap::keys<>::values<>(); } else if constexpr (ndims_t::value == 1) { - using block_dim_x = block_size_at_dim; + using block_dim_x = block_size_at_dim; using values_t = typename keys_t::template values; return values_t(blockIdx.x * block_dim_x::value + threadIdx.x); } else if constexpr (ndims_t::value == 2) { - using block_dim_x = block_size_at_dim; - using block_dim_y = block_size_at_dim; + using block_dim_x = block_size_at_dim; + using block_dim_y = block_size_at_dim; using values_t = typename keys_t::template values; return values_t( blockIdx.x * block_dim_x::value + threadIdx.x, blockIdx.y * block_dim_y::value + threadIdx.y); } else { - using block_dim_x = block_size_at_dim; - using block_dim_y = block_size_at_dim; - using block_dim_z = block_size_at_dim; + using block_dim_x = block_size_at_dim; + using block_dim_y = block_size_at_dim; + using block_dim_z = block_size_at_dim; using values_t = typename keys_t::template values; return values_t(blockIdx.x * block_dim_x::value + threadIdx.x, blockIdx.y * block_dim_y::value + threadIdx.y, @@ -87,7 +87,7 @@ namespace gridtools::fn::backend { return tuple_util::device::all_of(std::less(), index, indexed_sizes); } - template , class SizeKeys = get_keys> __global__ void kernel(Sizes sizes, PtrHolder ptr_holder, Strides strides, Fun fun) { - auto thread_idx = global_thread_index(); + auto thread_idx = global_thread_index(); if (!in_domain(thread_idx, sizes)) return; auto ptr = ptr_holder(); @@ -108,22 +108,22 @@ namespace gridtools::fn::backend { } } - template + template std::tuple blocks_and_threads(Sizes const &sizes) { using keys_t = get_keys; using ndims_t = meta::length; dim3 blocks(1, 1, 1); dim3 threads(1, 1, 1); if constexpr (ndims_t::value >= 1) { - threads.x = block_size_at_dim(); + threads.x = block_size_at_dim(); blocks.x = (tuple_util::get<0>(sizes) + threads.x - 1) / threads.x; } if constexpr (ndims_t::value >= 2) { - threads.y = block_size_at_dim(); + threads.y = block_size_at_dim(); blocks.y = (tuple_util::get<1>(sizes) + threads.y - 1) / threads.y; } if constexpr (ndims_t::value >= 3) { - threads.z = 
block_size_at_dim(); + threads.z = block_size_at_dim(); blocks.z = (tuple_util::get<2>(sizes) + threads.z - 1) / threads.z; } return {blocks, threads}; @@ -144,8 +144,8 @@ namespace gridtools::fn::backend { return tuple_util::host::apply([](auto... sizes) { return ((sizes == 0) || ...); }, sizes); } - template - void apply_stencil_stage(gpu const &g, + template + void apply_stencil_stage(gpu const &g, Sizes const &sizes, StencilStage, MakeIterator make_iterator, @@ -158,13 +158,13 @@ namespace gridtools::fn::backend { auto ptr_holder = sid::get_origin(std::forward(composite)); auto strides = sid::get_strides(std::forward(composite)); - auto [blocks, threads] = blocks_and_threads(sizes); + auto [blocks, threads] = blocks_and_threads(sizes); assert(threads.x > 0 && threads.y > 0 && threads.z > 0); cuda_util::launch(blocks, threads, 0, g.stream, - kernel - void apply_column_stage(gpu const &g, + void apply_column_stage(gpu const &g, Sizes const &sizes, ColumnStage, MakeIterator make_iterator, @@ -211,13 +211,13 @@ namespace gridtools::fn::backend { auto h_sizes = hymap::canonicalize_and_remove_key(sizes); int v_size = at_key(sizes); - auto [blocks, threads] = blocks_and_threads(h_sizes); + auto [blocks, threads] = blocks_and_threads(h_sizes); assert(threads.x > 0 && threads.y > 0 && threads.z > 0); cuda_util::launch(blocks, threads, 0, g.stream, - kernel{std::move(make_iterator), std::move(seed), v_size}); } - template - auto tmp_allocator(gpu be) { + template + auto tmp_allocator(gpu be) { return std::make_tuple(be, sid::device::cached_allocator(&cuda_util::cuda_malloc)); } - template - auto allocate_global_tmp(std::tuple, Allocator> &alloc, Sizes const &sizes, data_type) { + template + auto allocate_global_tmp( + std::tuple, Allocator> &alloc, Sizes const &sizes, data_type) { return sid::make_contiguous(std::get<1>(alloc), sizes); } } // namespace gpu_impl_ From f10f9adc13a44663a3c7aabcb904d6b4b8c0e7f4 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Mon, 17 Jun 2024 15:12:43 +0200 Subject: [PATCH 02/53] Add loop blocking to fn GPU backend --- include/gridtools/fn/backend/gpu.hpp | 132 +++++++++++++++------ tests/include/fn_select.hpp | 16 +-- tests/unit_tests/fn/test_fn_backend_gpu.cu | 19 +-- 3 files changed, 118 insertions(+), 49 deletions(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index 416621827..6bbf857f7 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -31,40 +31,89 @@ namespace gridtools::fn::backend { * meta::list>>; * When using a cartesian grid. 
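 * An illustrative spelling of such a map (the dimension tags dim_i/dim_j/dim_k and the sizes
 * 32/8/1 below are placeholders chosen for this illustration, not values required by the backend):
 *   using thread_block_sizes_t = meta::list<meta::list<dim_i, integral_constant<int, 32>>,
 *                                           meta::list<dim_j, integral_constant<int, 8>>,
 *                                           meta::list<dim_k, integral_constant<int, 1>>>;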
*/ - template + template >, + meta::repeat, meta::list>>>>> struct gpu { using thread_block_sizes_t = ThreadBlockSizes; + using loop_block_sizes_t = LoopBlockSizes; cudaStream_t stream = 0; }; template using block_size_at_dim = meta::second>>; - template - GT_FUNCTION_DEVICE auto global_thread_index() { + template + GT_FUNCTION_DEVICE auto global_thread_index(Sizes const &sizes) { using all_keys_t = get_keys; using ndims_t = meta::length; using keys_t = meta::rename>; if constexpr (ndims_t::value == 0) { - return hymap::keys<>::values<>(); + return std::make_tuple(hymap::keys<>::values<>(), hymap::keys<>::values<>()); } else if constexpr (ndims_t::value == 1) { - using block_dim_x = block_size_at_dim; + using thread_block_dim_x = block_size_at_dim; + using loop_block_dim_x = block_size_at_dim; using values_t = typename keys_t::template values; - return values_t(blockIdx.x * block_dim_x::value + threadIdx.x); + int thread_idx_x = blockIdx.x * (thread_block_dim_x::value * loop_block_dim_x::value) + + threadIdx.x * loop_block_dim_x::value; + int block_size_x = tuple_util::get<0>(sizes) - thread_idx_x; + if (block_size_x > loop_block_dim_x::value) + block_size_x = loop_block_dim_x::value; + if (block_size_x < 0) + block_size_x = 0; + return std::make_tuple(values_t(thread_idx_x), values_t(block_size_x)); } else if constexpr (ndims_t::value == 2) { - using block_dim_x = block_size_at_dim; - using block_dim_y = block_size_at_dim; + using thread_block_dim_x = block_size_at_dim; + using thread_block_dim_y = block_size_at_dim; + using loop_block_dim_x = block_size_at_dim; + using loop_block_dim_y = block_size_at_dim; using values_t = typename keys_t::template values; - return values_t( - blockIdx.x * block_dim_x::value + threadIdx.x, blockIdx.y * block_dim_y::value + threadIdx.y); + int thread_idx_x = blockIdx.x * (thread_block_dim_x::value * loop_block_dim_x::value) + + threadIdx.x * loop_block_dim_x::value; + int thread_idx_y = blockIdx.y * (thread_block_dim_y::value * loop_block_dim_y::value) + + threadIdx.y * loop_block_dim_y::value; + int block_size_x = tuple_util::get<0>(sizes) - thread_idx_x; + int block_size_y = tuple_util::get<1>(sizes) - thread_idx_y; + if (block_size_x > loop_block_dim_x::value) + block_size_x = loop_block_dim_x::value; + if (block_size_x < 0) + block_size_x = 0; + if (block_size_y > loop_block_dim_y::value) + block_size_y = loop_block_dim_y::value; + if (block_size_y < 0) + block_size_y = 0; + return std::make_tuple(values_t(thread_idx_x, thread_idx_y), values_t(block_size_x, block_size_y)); } else { - using block_dim_x = block_size_at_dim; - using block_dim_y = block_size_at_dim; - using block_dim_z = block_size_at_dim; + using thread_block_dim_x = block_size_at_dim; + using thread_block_dim_y = block_size_at_dim; + using thread_block_dim_z = block_size_at_dim; + using loop_block_dim_x = block_size_at_dim; + using loop_block_dim_y = block_size_at_dim; + using loop_block_dim_z = block_size_at_dim; using values_t = typename keys_t::template values; - return values_t(blockIdx.x * block_dim_x::value + threadIdx.x, - blockIdx.y * block_dim_y::value + threadIdx.y, - blockIdx.z * block_dim_z::value + threadIdx.z); + int thread_idx_x = blockIdx.x * (thread_block_dim_x::value * loop_block_dim_x::value) + + threadIdx.x * loop_block_dim_x::value; + int thread_idx_y = blockIdx.y * (thread_block_dim_y::value * loop_block_dim_y::value) + + threadIdx.y * loop_block_dim_y::value; + int thread_idx_z = blockIdx.z * (thread_block_dim_z::value * loop_block_dim_z::value) + + threadIdx.z * 
loop_block_dim_z::value; + int block_size_x = tuple_util::get<0>(sizes) - thread_idx_x; + int block_size_y = tuple_util::get<1>(sizes) - thread_idx_y; + int block_size_z = tuple_util::get<2>(sizes) - thread_idx_z; + if (block_size_x > loop_block_dim_x::value) + block_size_x = loop_block_dim_x::value; + if (block_size_x < 0) + block_size_x = 0; + if (block_size_y > loop_block_dim_y::value) + block_size_y = loop_block_dim_y::value; + if (block_size_y < 0) + block_size_y = 0; + if (block_size_z > loop_block_dim_z::value) + block_size_z = loop_block_dim_z::value; + if (block_size_z < 0) + block_size_z = 0; + return std::make_tuple(values_t(thread_idx_x, thread_idx_y, thread_idx_z), + values_t(block_size_x, block_size_y, block_size_z)); } // disable incorrect warning "missing return statement at end of non-void function" GT_NVCC_DIAG_PUSH_SUPPRESS(940) @@ -88,6 +137,7 @@ namespace gridtools::fn::backend { } template , class SizeKeys = get_keys> __global__ void kernel(Sizes sizes, PtrHolder ptr_holder, Strides strides, Fun fun) { - auto thread_idx = global_thread_index(); + auto const [thread_idx, block_size] = global_thread_index(sizes); if (!in_domain(thread_idx, sizes)) return; auto ptr = ptr_holder(); sid::multi_shift(ptr, strides, thread_idx); if constexpr (NDims::value <= 3) { - fun(ptr, strides); + common::make_loops(block_size)(std::move(fun))(ptr, strides); } else { - using loop_dims_t = meta::drop_front_c<3, SizeKeys>; - common::make_loops(sizes)(std::move(fun))(ptr, strides); + auto inner_sizes = tuple_util::device::convert_to::values>( + tuple_util::device::drop_front<3>(tuple_util::device::convert_to(sizes))); + auto loop_sizes = hymap::concat(block_size, inner_sizes); + common::make_loops(loop_sizes)(std::move(fun))(ptr, strides); } } - template + template std::tuple blocks_and_threads(Sizes const &sizes) { using keys_t = get_keys; using ndims_t = meta::length; @@ -116,15 +168,21 @@ namespace gridtools::fn::backend { dim3 threads(1, 1, 1); if constexpr (ndims_t::value >= 1) { threads.x = block_size_at_dim(); - blocks.x = (tuple_util::get<0>(sizes) + threads.x - 1) / threads.x; + constexpr int block_dim_x = block_size_at_dim::value * + block_size_at_dim::value; + blocks.x = (tuple_util::get<0>(sizes) + block_dim_x - 1) / block_dim_x; } if constexpr (ndims_t::value >= 2) { threads.y = block_size_at_dim(); - blocks.y = (tuple_util::get<1>(sizes) + threads.y - 1) / threads.y; + constexpr int block_dim_y = block_size_at_dim::value * + block_size_at_dim::value; + blocks.y = (tuple_util::get<1>(sizes) + block_dim_y - 1) / block_dim_y; } if constexpr (ndims_t::value >= 3) { threads.z = block_size_at_dim(); - blocks.z = (tuple_util::get<2>(sizes) + threads.z - 1) / threads.z; + constexpr int block_dim_z = block_size_at_dim::value * + block_size_at_dim::value; + blocks.z = (tuple_util::get<2>(sizes) + block_dim_z - 1) / block_dim_z; } return {blocks, threads}; } @@ -144,8 +202,13 @@ namespace gridtools::fn::backend { return tuple_util::host::apply([](auto... 
sizes) { return ((sizes == 0) || ...); }, sizes); } - template - void apply_stencil_stage(gpu const &g, + template + void apply_stencil_stage(gpu const &g, Sizes const &sizes, StencilStage, MakeIterator make_iterator, @@ -158,13 +221,14 @@ namespace gridtools::fn::backend { auto ptr_holder = sid::get_origin(std::forward(composite)); auto strides = sid::get_strides(std::forward(composite)); - auto [blocks, threads] = blocks_and_threads(sizes); + auto [blocks, threads] = blocks_and_threads(sizes); assert(threads.x > 0 && threads.y > 0 && threads.z > 0); cuda_util::launch(blocks, threads, 0, g.stream, kernel - void apply_column_stage(gpu const &g, + void apply_column_stage(gpu const &g, Sizes const &sizes, ColumnStage, MakeIterator make_iterator, @@ -211,13 +276,14 @@ namespace gridtools::fn::backend { auto h_sizes = hymap::canonicalize_and_remove_key(sizes); int v_size = at_key(sizes); - auto [blocks, threads] = blocks_and_threads(h_sizes); + auto [blocks, threads] = blocks_and_threads(h_sizes); assert(threads.x > 0 && threads.y > 0 && threads.z > 0); cuda_util::launch(blocks, threads, 0, g.stream, kernel{std::move(make_iterator), std::move(seed), v_size}); } - template - auto tmp_allocator(gpu be) { + template + auto tmp_allocator(gpu be) { return std::make_tuple(be, sid::device::cached_allocator(&cuda_util::cuda_malloc)); } - template + template auto allocate_global_tmp( - std::tuple, Allocator> &alloc, Sizes const &sizes, data_type) { + std::tuple, Allocator> &alloc, Sizes const &sizes, data_type) { return sid::make_contiguous(std::get<1>(alloc), sizes); } } // namespace gpu_impl_ diff --git a/tests/include/fn_select.hpp b/tests/include/fn_select.hpp index 68cc0ed0b..65e9326e1 100644 --- a/tests/include/fn_select.hpp +++ b/tests/include/fn_select.hpp @@ -47,7 +47,7 @@ namespace { gridtools::integral_constant>, gridtools::meta::list...>>; - using fn_backend_t = gridtools::fn::backend::gpu>; + using fn_backend_t = gridtools::fn::backend::gpu, block_sizes_t<2, 2, 2>>; } // namespace #endif @@ -70,14 +70,14 @@ namespace gridtools::fn::backend { } // namespace naive_impl_ namespace gpu_impl_ { - template + template struct gpu; - template - storage::gpu backend_storage_traits(gpu); - template - timer_cuda backend_timer_impl(gpu); - template - inline char const *backend_name(gpu const &) { + template + storage::gpu backend_storage_traits(gpu); + template + timer_cuda backend_timer_impl(gpu); + template + inline char const *backend_name(gpu const &) { return "gpu"; } } // namespace gpu_impl_ diff --git a/tests/unit_tests/fn/test_fn_backend_gpu.cu b/tests/unit_tests/fn/test_fn_backend_gpu.cu index 0f9c45a05..dec9e0e6d 100644 --- a/tests/unit_tests/fn/test_fn_backend_gpu.cu +++ b/tests/unit_tests/fn/test_fn_backend_gpu.cu @@ -63,10 +63,11 @@ namespace gridtools::fn::backend { column_stage, sum_scan, 0, 1> cs; - using block_sizes_t = meta::list, int_t<4>>, meta::list, int_t<2>>>; + using thread_block_sizes_t = meta::list, int_t<4>>, meta::list, int_t<2>>>; + using loop_block_sizes_t = meta::list, int_t<1>>, meta::list, int_t<2>>>; apply_column_stage( - gpu(), sizes, cs, make_iterator_mock(), composite, int_t<1>(), tuple(42, 1)); + gpu(), sizes, cs, make_iterator_mock(), composite, int_t<1>(), tuple(42, 1)); cudaMemcpy(outh, out.get(), 5 * 7 * 3 * sizeof(int), cudaMemcpyDeviceToHost); for (int i = 0; i < 5; ++i) @@ -100,10 +101,11 @@ namespace gridtools::fn::backend { column_stage, sum_scan, 0, 1> cs; - using block_sizes_t = meta::list, int_t<4>>, meta::list, int_t<2>>>; + using thread_block_sizes_t 
= meta::list, int_t<4>>, meta::list, int_t<2>>>; + using loop_block_sizes_t = meta::list, int_t<2>>, meta::list, int_t<2>>>; apply_column_stage( - gpu(), sizes, cs, make_iterator_mock(), composite, int_t<0>(), tuple(42, 1)); + gpu(), sizes, cs, make_iterator_mock(), composite, int_t<0>(), tuple(42, 1)); cudaMemcpy(outh, out.get(), 5 * sizeof(int), cudaMemcpyDeviceToHost); int res = 42; @@ -139,13 +141,14 @@ namespace gridtools::fn::backend { column_stage, sum_scan, 0, 1> cs; - using block_sizes_t = meta::list, int_t<4>>, + using thread_block_sizes_t = meta::list, int_t<4>>, meta::list, int_t<2>>, meta::list, int_t<2>>, meta::list, int_t<1>>>; + using loop_block_sizes_t = meta::list, int_t<2>>, meta::list, int_t<2>>, meta::list, int_t<2>>, meta::list, int_t<2>>>; apply_column_stage( - gpu(), sizes, cs, make_iterator_mock(), composite, int_t<1>(), tuple(42, 1)); + gpu(), sizes, cs, make_iterator_mock(), composite, int_t<1>(), tuple(42, 1)); cudaMemcpy(outh, out.get(), 5 * 7 * 3 * 2 * 3 * sizeof(int), cudaMemcpyDeviceToHost); for (int i = 0; i < 5; ++i) @@ -197,8 +200,8 @@ namespace gridtools::fn::backend { }; TEST(backend_gpu, global_tmp) { - using block_sizes_t = meta::list, int_t<4>>, meta::list, int_t<2>>>; - auto alloc = tmp_allocator(gpu()); + using thread_block_sizes_t = meta::list, int_t<4>>, meta::list, int_t<2>>>; + auto alloc = tmp_allocator(gpu()); auto sizes = hymap::keys, int_t<1>, int_t<2>>::values, int_t<7>, int_t<3>>(); auto tmp = allocate_global_tmp(alloc, sizes, data_type()); static_assert(sid::is_sid()); From c616749792999e0e45463a07851787ec15d4cf44 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Tue, 18 Jun 2024 08:08:33 +0200 Subject: [PATCH 03/53] Cleanup/refactor --- include/gridtools/fn/backend/gpu.hpp | 184 +++++++++++---------------- 1 file changed, 75 insertions(+), 109 deletions(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index 6bbf857f7..7e5576856 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -9,6 +9,7 @@ */ #pragma once +#include #include #include "../../common/cuda_util.hpp" @@ -40,100 +41,72 @@ namespace gridtools::fn::backend { cudaStream_t stream = 0; }; - template - using block_size_at_dim = meta::second>>; + template + struct block_size_at_dim { + template + using apply = std::conditional_t>, + meta::list>, + meta::mp_find>; + }; - template - GT_FUNCTION_DEVICE auto global_thread_index(Sizes const &sizes) { - using all_keys_t = get_keys; - using ndims_t = meta::length; - using keys_t = meta::rename>; - if constexpr (ndims_t::value == 0) { - return std::make_tuple(hymap::keys<>::values<>(), hymap::keys<>::values<>()); - } else if constexpr (ndims_t::value == 1) { - using thread_block_dim_x = block_size_at_dim; - using loop_block_dim_x = block_size_at_dim; - using values_t = typename keys_t::template values; - int thread_idx_x = blockIdx.x * (thread_block_dim_x::value * loop_block_dim_x::value) + - threadIdx.x * loop_block_dim_x::value; - int block_size_x = tuple_util::get<0>(sizes) - thread_idx_x; - if (block_size_x > loop_block_dim_x::value) - block_size_x = loop_block_dim_x::value; - if (block_size_x < 0) - block_size_x = 0; - return std::make_tuple(values_t(thread_idx_x), values_t(block_size_x)); - } else if constexpr (ndims_t::value == 2) { - using thread_block_dim_x = block_size_at_dim; - using thread_block_dim_y = block_size_at_dim; - using loop_block_dim_x = block_size_at_dim; - using loop_block_dim_y = block_size_at_dim; - using values_t = typename 
keys_t::template values; - int thread_idx_x = blockIdx.x * (thread_block_dim_x::value * loop_block_dim_x::value) + - threadIdx.x * loop_block_dim_x::value; - int thread_idx_y = blockIdx.y * (thread_block_dim_y::value * loop_block_dim_y::value) + - threadIdx.y * loop_block_dim_y::value; - int block_size_x = tuple_util::get<0>(sizes) - thread_idx_x; - int block_size_y = tuple_util::get<1>(sizes) - thread_idx_y; - if (block_size_x > loop_block_dim_x::value) - block_size_x = loop_block_dim_x::value; - if (block_size_x < 0) - block_size_x = 0; - if (block_size_y > loop_block_dim_y::value) - block_size_y = loop_block_dim_y::value; - if (block_size_y < 0) - block_size_y = 0; - return std::make_tuple(values_t(thread_idx_x, thread_idx_y), values_t(block_size_x, block_size_y)); - } else { - using thread_block_dim_x = block_size_at_dim; - using thread_block_dim_y = block_size_at_dim; - using thread_block_dim_z = block_size_at_dim; - using loop_block_dim_x = block_size_at_dim; - using loop_block_dim_y = block_size_at_dim; - using loop_block_dim_z = block_size_at_dim; - using values_t = typename keys_t::template values; - int thread_idx_x = blockIdx.x * (thread_block_dim_x::value * loop_block_dim_x::value) + - threadIdx.x * loop_block_dim_x::value; - int thread_idx_y = blockIdx.y * (thread_block_dim_y::value * loop_block_dim_y::value) + - threadIdx.y * loop_block_dim_y::value; - int thread_idx_z = blockIdx.z * (thread_block_dim_z::value * loop_block_dim_z::value) + - threadIdx.z * loop_block_dim_z::value; - int block_size_x = tuple_util::get<0>(sizes) - thread_idx_x; - int block_size_y = tuple_util::get<1>(sizes) - thread_idx_y; - int block_size_z = tuple_util::get<2>(sizes) - thread_idx_z; - if (block_size_x > loop_block_dim_x::value) - block_size_x = loop_block_dim_x::value; - if (block_size_x < 0) - block_size_x = 0; - if (block_size_y > loop_block_dim_y::value) - block_size_y = loop_block_dim_y::value; - if (block_size_y < 0) - block_size_y = 0; - if (block_size_z > loop_block_dim_z::value) - block_size_z = loop_block_dim_z::value; - if (block_size_z < 0) - block_size_z = 0; - return std::make_tuple(values_t(thread_idx_x, thread_idx_y, thread_idx_z), - values_t(block_size_x, block_size_y, block_size_z)); - } - // disable incorrect warning "missing return statement at end of non-void function" - GT_NVCC_DIAG_PUSH_SUPPRESS(940) + template + GT_FUNCTION_DEVICE constexpr auto block_sizes_for_sizes() { + return hymap::from_meta_map::apply, get_keys>>(); } - GT_NVCC_DIAG_POP_SUPPRESS(940) - template - struct at_generator_f { - template - GT_FUNCTION_DEVICE decltype(auto) operator()(Value &&value) const { - return device::at_key(std::forward(value)); + struct extract_dim3_f { + dim3 values; + + template + GT_FUNCTION_DEVICE constexpr void operator()(int &value) const { + if constexpr (I == 0) + value = values.x; + else if constexpr (I == 1) + value = values.y; + else if constexpr (I == 2) + value = values.z; + else + value = 0; } }; - template - GT_FUNCTION_DEVICE bool in_domain(Index const &index, Sizes const &sizes) { - using sizes_t = meta::rename; - using generators_t = meta::transform>; - auto indexed_sizes = tuple_util::device::generate(sizes); - return tuple_util::device::all_of(std::less(), index, indexed_sizes); + struct global_thread_index_f { + template + GT_FUNCTION_DEVICE constexpr int operator()( + int block_index, int thread_index, ThreadBlockSize, LoopBlockSize) const { + return block_index * (ThreadBlockSize::value * LoopBlockSize::value) + + thread_index * LoopBlockSize::value; + } + }; 
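        // Worked example with assumed numbers (not taken from this patch): for a dimension with
        // ThreadBlockSize = 32 and LoopBlockSize = 4, the thread at blockIdx = 2, threadIdx = 5
        // computes 2 * (32 * 4) + 5 * 4 = 276 as its first element and owns the candidate range
        // [276, 280); block_size_f below then clamps that range against the remaining domain size.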
+ + struct block_size_f { + template + GT_FUNCTION_DEVICE constexpr int operator()(int global_thread_index, LoopBlockSize, int size) const { + if constexpr (I < 3) { + return std::clamp(size - global_thread_index, 0, int(LoopBlockSize::value)); + } else { + return size; + } + } + }; + + template + GT_FUNCTION_DEVICE auto global_thread_index(Sizes const &sizes) { + using keys_t = meta::rename>; + + using indices_t = meta::rename, meta::list>>; + indices_t thread_indices, block_indices; + tuple_util::device::for_each_index(extract_dim3_f{threadIdx}, thread_indices); + tuple_util::device::for_each_index(extract_dim3_f{blockIdx}, block_indices); + + constexpr auto thread_block_sizes = block_sizes_for_sizes(); + constexpr auto loop_block_sizes = block_sizes_for_sizes(); + + auto global_thread_indices = tuple_util::device::transform( + global_thread_index_f{}, block_indices, thread_indices, thread_block_sizes, loop_block_sizes); + auto block_sizes = + tuple_util::device::transform_index(block_size_f{}, global_thread_indices, loop_block_sizes, sizes); + return std::make_tuple(std::move(global_thread_indices), std::move(block_sizes)); } template , - class SizeKeys = get_keys> + class Fun> __global__ void kernel(Sizes sizes, PtrHolder ptr_holder, Strides strides, Fun fun) { auto const [thread_idx, block_size] = global_thread_index(sizes); - if (!in_domain(thread_idx, sizes)) + if (!tuple_util::all_of(std::less(), thread_idx, sizes)) return; auto ptr = ptr_holder(); sid::multi_shift(ptr, strides, thread_idx); - if constexpr (NDims::value <= 3) { - common::make_loops(block_size)(std::move(fun))(ptr, strides); - } else { - auto inner_sizes = tuple_util::device::convert_to::values>( - tuple_util::device::drop_front<3>(tuple_util::device::convert_to(sizes))); - auto loop_sizes = hymap::concat(block_size, inner_sizes); - common::make_loops(loop_sizes)(std::move(fun))(ptr, strides); - } + common::make_loops(block_size)(std::move(fun))(ptr, strides); } template std::tuple blocks_and_threads(Sizes const &sizes) { using keys_t = get_keys; using ndims_t = meta::length; + [[maybe_unused]] constexpr auto thread_block_sizes = block_sizes_for_sizes(); + [[maybe_unused]] constexpr auto loop_block_sizes = block_sizes_for_sizes(); dim3 blocks(1, 1, 1); dim3 threads(1, 1, 1); if constexpr (ndims_t::value >= 1) { - threads.x = block_size_at_dim(); - constexpr int block_dim_x = block_size_at_dim::value * - block_size_at_dim::value; + threads.x = tuple_util::get<0>(thread_block_sizes); + constexpr int block_dim_x = + tuple_util::get<0>(thread_block_sizes) * tuple_util::get<0>(loop_block_sizes); blocks.x = (tuple_util::get<0>(sizes) + block_dim_x - 1) / block_dim_x; } if constexpr (ndims_t::value >= 2) { - threads.y = block_size_at_dim(); - constexpr int block_dim_y = block_size_at_dim::value * - block_size_at_dim::value; + threads.y = tuple_util::get<1>(thread_block_sizes); + constexpr int block_dim_y = + tuple_util::get<1>(thread_block_sizes) * tuple_util::get<1>(loop_block_sizes); blocks.y = (tuple_util::get<1>(sizes) + block_dim_y - 1) / block_dim_y; } if constexpr (ndims_t::value >= 3) { - threads.z = block_size_at_dim(); - constexpr int block_dim_z = block_size_at_dim::value * - block_size_at_dim::value; + threads.z = tuple_util::get<2>(thread_block_sizes); + constexpr int block_dim_z = + tuple_util::get<2>(thread_block_sizes) * tuple_util::get<2>(loop_block_sizes); blocks.z = (tuple_util::get<2>(sizes) + block_dim_z - 1) / block_dim_z; } return {blocks, threads}; From 44f13fda55c25f935f9342c6c3afb6e7acc55917 Mon 
Sep 17 00:00:00 2001 From: Felix Thaler Date: Tue, 18 Jun 2024 09:13:30 +0200 Subject: [PATCH 04/53] Treat compile-time block sizes as compile-time --- include/gridtools/fn/backend/gpu.hpp | 35 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index 7e5576856..f6e454a5a 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -57,33 +57,35 @@ namespace gridtools::fn::backend { struct extract_dim3_f { dim3 values; - template - GT_FUNCTION_DEVICE constexpr void operator()(int &value) const { + template + GT_FUNCTION_DEVICE constexpr void operator()(Value &value) const { if constexpr (I == 0) value = values.x; else if constexpr (I == 1) value = values.y; else if constexpr (I == 2) value = values.z; - else - value = 0; } }; struct global_thread_index_f { - template - GT_FUNCTION_DEVICE constexpr int operator()( - int block_index, int thread_index, ThreadBlockSize, LoopBlockSize) const { + template + GT_FUNCTION_DEVICE constexpr auto operator()( + BlockIndex block_index, ThreadIndex thread_index, ThreadBlockSize, LoopBlockSize) const { return block_index * (ThreadBlockSize::value * LoopBlockSize::value) + thread_index * LoopBlockSize::value; } }; struct block_size_f { - template - GT_FUNCTION_DEVICE constexpr int operator()(int global_thread_index, LoopBlockSize, int size) const { + template + GT_FUNCTION_DEVICE constexpr auto operator()( + GlobalThreadIndex global_thread_index, LoopBlockSize, Size size) const { if constexpr (I < 3) { - return std::clamp(size - global_thread_index, 0, int(LoopBlockSize::value)); + if constexpr (LoopBlockSize::value == 1) + return integral_constant(); + else + return std::clamp(size - global_thread_index, 0, int(LoopBlockSize::value)); } else { return size; } @@ -94,7 +96,11 @@ namespace gridtools::fn::backend { GT_FUNCTION_DEVICE auto global_thread_index(Sizes const &sizes) { using keys_t = meta::rename>; - using indices_t = meta::rename, meta::list>>; + constexpr int dynamic_indices = std::min(int(meta::length::value), 3); + constexpr int static_indices = meta::length::value - dynamic_indices; + using indices_t = meta::rename>, + meta::repeat_c>>>>; indices_t thread_indices, block_indices; tuple_util::device::for_each_index(extract_dim3_f{threadIdx}, thread_indices); tuple_util::device::for_each_index(extract_dim3_f{blockIdx}, block_indices); @@ -109,12 +115,7 @@ namespace gridtools::fn::backend { return std::make_tuple(std::move(global_thread_indices), std::move(block_sizes)); } - template + template __global__ void kernel(Sizes sizes, PtrHolder ptr_holder, Strides strides, Fun fun) { auto const [thread_idx, block_size] = global_thread_index(sizes); if (!tuple_util::all_of(std::less(), thread_idx, sizes)) From 9a5edf1017b6da9d899b53ba71c264f86d4d9541 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Tue, 18 Jun 2024 13:24:00 +0200 Subject: [PATCH 05/53] Minor cleanup --- include/gridtools/fn/backend/gpu.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index f6e454a5a..9de8c08b5 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -50,9 +50,8 @@ namespace gridtools::fn::backend { }; template - GT_FUNCTION_DEVICE constexpr auto block_sizes_for_sizes() { - return hymap::from_meta_map::apply, get_keys>>(); - } + using block_sizes_for_sizes = + 
hymap::from_meta_map::apply, get_keys>>; struct extract_dim3_f { dim3 values; From 90e3a50402f01d5d2eb9aa0bf62fea2012168a7c Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Tue, 18 Jun 2024 13:24:30 +0200 Subject: [PATCH 06/53] Use uint3 instead of dim3 --- include/gridtools/fn/backend/gpu.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index 9de8c08b5..d3b206121 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -53,8 +53,8 @@ namespace gridtools::fn::backend { using block_sizes_for_sizes = hymap::from_meta_map::apply, get_keys>>; - struct extract_dim3_f { - dim3 values; + struct extract_uint3_f { + uint3 const &values; template GT_FUNCTION_DEVICE constexpr void operator()(Value &value) const { @@ -92,7 +92,7 @@ namespace gridtools::fn::backend { }; template - GT_FUNCTION_DEVICE auto global_thread_index(Sizes const &sizes) { + GT_FUNCTION_DEVICE constexpr auto global_thread_index(Sizes const &sizes) { using keys_t = meta::rename>; constexpr int dynamic_indices = std::min(int(meta::length::value), 3); @@ -101,8 +101,8 @@ namespace gridtools::fn::backend { meta::concat>, meta::repeat_c>>>>; indices_t thread_indices, block_indices; - tuple_util::device::for_each_index(extract_dim3_f{threadIdx}, thread_indices); - tuple_util::device::for_each_index(extract_dim3_f{blockIdx}, block_indices); + tuple_util::device::for_each_index(extract_uint3_f{threadIdx}, thread_indices); + tuple_util::device::for_each_index(extract_uint3_f{blockIdx}, block_indices); constexpr auto thread_block_sizes = block_sizes_for_sizes(); constexpr auto loop_block_sizes = block_sizes_for_sizes(); From f70b885149bb9c8795b81d9634bbec1d31d1d7f2 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Tue, 18 Jun 2024 13:24:59 +0200 Subject: [PATCH 07/53] Silent warning --- include/gridtools/fn/backend/gpu.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index d3b206121..aef0f4014 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -88,7 +88,10 @@ namespace gridtools::fn::backend { } else { return size; } + // disable incorrect warning "missing return statement at end of non-void function" + GT_NVCC_DIAG_PUSH_SUPPRESS(940) } + GT_NVCC_DIAG_POP_SUPPRESS(940) }; template From 6e77d53e15d98e0f614349753aa645ef1427d0c2 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Tue, 18 Jun 2024 14:59:14 +0200 Subject: [PATCH 08/53] Add __launch_bounds__ --- include/gridtools/fn/backend/gpu.hpp | 16 ++++++++++++++-- tests/include/fn_select.hpp | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index aef0f4014..1f26abdd2 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -117,8 +117,20 @@ namespace gridtools::fn::backend { return std::make_tuple(std::move(global_thread_indices), std::move(block_sizes)); } - template - __global__ void kernel(Sizes sizes, PtrHolder ptr_holder, Strides strides, Fun fun) { + template + constexpr int iseq_product(std::integer_sequence) { + return (1 * ... 
* i); + } + + template >())> + __global__ void __launch_bounds__(NumThreads) + kernel(Sizes sizes, PtrHolder ptr_holder, Strides strides, Fun fun) { auto const [thread_idx, block_size] = global_thread_index(sizes); if (!tuple_util::all_of(std::less(), thread_idx, sizes)) return; diff --git a/tests/include/fn_select.hpp b/tests/include/fn_select.hpp index 65e9326e1..3d5718bea 100644 --- a/tests/include/fn_select.hpp +++ b/tests/include/fn_select.hpp @@ -47,7 +47,7 @@ namespace { gridtools::integral_constant>, gridtools::meta::list...>>; - using fn_backend_t = gridtools::fn::backend::gpu, block_sizes_t<2, 2, 2>>; + using fn_backend_t = gridtools::fn::backend::gpu, block_sizes_t<1, 1, 1>>; } // namespace #endif From 508852d4a0d029bab586e56df57bca57729143fc Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Tue, 18 Jun 2024 15:06:12 +0200 Subject: [PATCH 09/53] Fixed comment --- include/gridtools/fn/backend/gpu.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index 1f26abdd2..b426dbba4 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -25,7 +25,7 @@ namespace gridtools::fn::backend { namespace gpu_impl_ { /* - * ThreadBlockSizes must be a meta map, mapping dimensions to integral constant block sizes. + * ThreadBlockSizes and LoopBlockSizes must be meta maps, mapping dimensions to integral constant block sizes. * * For example, meta::list>, * meta::list>, From bade4550208bf9c5794b4888fdb6495a7c84e6c0 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Tue, 18 Jun 2024 15:06:40 +0200 Subject: [PATCH 10/53] Slightly simpler meta:: calculation --- include/gridtools/fn/backend/gpu.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index b426dbba4..3c3b656e6 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -33,8 +33,8 @@ namespace gridtools::fn::backend { * When using a cartesian grid. 
*/ template >, - meta::repeat, meta::list>>>>> + class LoopBlockSizes = meta::zip, + meta::repeat, meta::list>>>> struct gpu { using thread_block_sizes_t = ThreadBlockSizes; using loop_block_sizes_t = LoopBlockSizes; From 1641040e79ca5bf7e580384fe5eb31b964b33e45 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Tue, 18 Jun 2024 15:09:53 +0200 Subject: [PATCH 11/53] Maybe fix for Clang-CUDA compilation --- include/gridtools/fn/backend/gpu.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index 3c3b656e6..8394d310a 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -51,7 +51,7 @@ namespace gridtools::fn::backend { template using block_sizes_for_sizes = - hymap::from_meta_map::apply, get_keys>>; + hymap::from_meta_map::template apply, get_keys>>; struct extract_uint3_f { uint3 const &values; From 990555e12c9f736573e04b3ab6f32907392fcff8 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Tue, 18 Jun 2024 15:25:47 +0200 Subject: [PATCH 12/53] Added another ::template --- include/gridtools/fn/backend/gpu.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index 8394d310a..d4f2a7863 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -100,7 +100,7 @@ namespace gridtools::fn::backend { constexpr int dynamic_indices = std::min(int(meta::length::value), 3); constexpr int static_indices = meta::length::value - dynamic_indices; - using indices_t = meta::rename>, meta::repeat_c>>>>; indices_t thread_indices, block_indices; From ea9e3c9dcf9bb7a99f0c0fa95fef7893642d7665 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Tue, 18 Jun 2024 16:02:15 +0200 Subject: [PATCH 13/53] Make clang-14-cuda-11 happy --- include/gridtools/fn/backend/gpu.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index d4f2a7863..a421355f4 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -132,7 +132,7 @@ namespace gridtools::fn::backend { __global__ void __launch_bounds__(NumThreads) kernel(Sizes sizes, PtrHolder ptr_holder, Strides strides, Fun fun) { auto const [thread_idx, block_size] = global_thread_index(sizes); - if (!tuple_util::all_of(std::less(), thread_idx, sizes)) + if (!tuple_util::device::all_of(std::less(), thread_idx, sizes)) return; auto ptr = ptr_holder(); sid::multi_shift(ptr, strides, thread_idx); From d3d522894bf2cf405cafad63f0ee164155b91c6c Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Wed, 19 Jun 2024 09:10:33 +0200 Subject: [PATCH 14/53] Fix HIP compilation --- include/gridtools/fn/backend/gpu.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index a421355f4..d98180617 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -53,8 +53,9 @@ namespace gridtools::fn::backend { using block_sizes_for_sizes = hymap::from_meta_map::template apply, get_keys>>; - struct extract_uint3_f { - uint3 const &values; + template + struct extract_index_f { + IndexType const &values; template GT_FUNCTION_DEVICE constexpr void operator()(Value &value) const { @@ -104,8 +105,8 @@ namespace gridtools::fn::backend { meta::concat>, meta::repeat_c>>>>; indices_t 
thread_indices, block_indices; - tuple_util::device::for_each_index(extract_uint3_f{threadIdx}, thread_indices); - tuple_util::device::for_each_index(extract_uint3_f{blockIdx}, block_indices); + tuple_util::device::for_each_index(extract_index_f{threadIdx}, thread_indices); + tuple_util::device::for_each_index(extract_index_f{blockIdx}, block_indices); constexpr auto thread_block_sizes = block_sizes_for_sizes(); constexpr auto loop_block_sizes = block_sizes_for_sizes(); From bba6c78e46b474f82748062afc62026b544634c5 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Wed, 19 Jun 2024 14:27:42 +0200 Subject: [PATCH 15/53] Some cleanup --- include/gridtools/fn/backend/gpu.hpp | 52 ++++++++++++---------------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index d98180617..6798d38c6 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -53,32 +53,33 @@ namespace gridtools::fn::backend { using block_sizes_for_sizes = hymap::from_meta_map::template apply, get_keys>>; - template - struct extract_index_f { - IndexType const &values; - - template - GT_FUNCTION_DEVICE constexpr void operator()(Value &value) const { + struct global_thread_index_f { + template + GT_FUNCTION_DEVICE static constexpr int index_at_dim(Index const &idx) { + static_assert(I < 3); if constexpr (I == 0) - value = values.x; - else if constexpr (I == 1) - value = values.y; - else if constexpr (I == 2) - value = values.z; + return idx.x; + if constexpr (I == 1) + return idx.y; + return idx.z; } - }; - struct global_thread_index_f { - template - GT_FUNCTION_DEVICE constexpr auto operator()( - BlockIndex block_index, ThreadIndex thread_index, ThreadBlockSize, LoopBlockSize) const { - return block_index * (ThreadBlockSize::value * LoopBlockSize::value) + - thread_index * LoopBlockSize::value; + template + GT_FUNCTION_DEVICE constexpr auto operator()(ThreadBlockSize, LoopBlockSize) const { + if constexpr (I < 3) { + return index_at_dim(blockIdx) * (ThreadBlockSize::value * LoopBlockSize::value) + + index_at_dim(threadIdx) * LoopBlockSize::value; + } else { + return integral_constant(); + } + // disable incorrect warning "missing return statement at end of non-void function" + GT_NVCC_DIAG_PUSH_SUPPRESS(940) } + GT_NVCC_DIAG_POP_SUPPRESS(940) }; struct block_size_f { - template + template GT_FUNCTION_DEVICE constexpr auto operator()( GlobalThreadIndex global_thread_index, LoopBlockSize, Size size) const { if constexpr (I < 3) { @@ -99,20 +100,11 @@ namespace gridtools::fn::backend { GT_FUNCTION_DEVICE constexpr auto global_thread_index(Sizes const &sizes) { using keys_t = meta::rename>; - constexpr int dynamic_indices = std::min(int(meta::length::value), 3); - constexpr int static_indices = meta::length::value - dynamic_indices; - using indices_t = meta::rename>, - meta::repeat_c>>>>; - indices_t thread_indices, block_indices; - tuple_util::device::for_each_index(extract_index_f{threadIdx}, thread_indices); - tuple_util::device::for_each_index(extract_index_f{blockIdx}, block_indices); - constexpr auto thread_block_sizes = block_sizes_for_sizes(); constexpr auto loop_block_sizes = block_sizes_for_sizes(); - auto global_thread_indices = tuple_util::device::transform( - global_thread_index_f{}, block_indices, thread_indices, thread_block_sizes, loop_block_sizes); + auto global_thread_indices = + tuple_util::device::transform_index(global_thread_index_f{}, thread_block_sizes, loop_block_sizes); auto 
block_sizes = tuple_util::device::transform_index(block_size_f{}, global_thread_indices, loop_block_sizes, sizes); return std::make_tuple(std::move(global_thread_indices), std::move(block_sizes)); From 1b547d0a43e4af78be398aefcda1e6c6eab4a046 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Wed, 19 Jun 2024 14:30:22 +0200 Subject: [PATCH 16/53] Fix formatting --- tests/unit_tests/fn/test_fn_backend_gpu.cu | 34 +++++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/tests/unit_tests/fn/test_fn_backend_gpu.cu b/tests/unit_tests/fn/test_fn_backend_gpu.cu index dec9e0e6d..feb472ea2 100644 --- a/tests/unit_tests/fn/test_fn_backend_gpu.cu +++ b/tests/unit_tests/fn/test_fn_backend_gpu.cu @@ -66,8 +66,13 @@ namespace gridtools::fn::backend { using thread_block_sizes_t = meta::list, int_t<4>>, meta::list, int_t<2>>>; using loop_block_sizes_t = meta::list, int_t<1>>, meta::list, int_t<2>>>; - apply_column_stage( - gpu(), sizes, cs, make_iterator_mock(), composite, int_t<1>(), tuple(42, 1)); + apply_column_stage(gpu(), + sizes, + cs, + make_iterator_mock(), + composite, + int_t<1>(), + tuple(42, 1)); cudaMemcpy(outh, out.get(), 5 * 7 * 3 * sizeof(int), cudaMemcpyDeviceToHost); for (int i = 0; i < 5; ++i) @@ -104,8 +109,13 @@ namespace gridtools::fn::backend { using thread_block_sizes_t = meta::list, int_t<4>>, meta::list, int_t<2>>>; using loop_block_sizes_t = meta::list, int_t<2>>, meta::list, int_t<2>>>; - apply_column_stage( - gpu(), sizes, cs, make_iterator_mock(), composite, int_t<0>(), tuple(42, 1)); + apply_column_stage(gpu(), + sizes, + cs, + make_iterator_mock(), + composite, + int_t<0>(), + tuple(42, 1)); cudaMemcpy(outh, out.get(), 5 * sizeof(int), cudaMemcpyDeviceToHost); int res = 42; @@ -145,10 +155,18 @@ namespace gridtools::fn::backend { meta::list, int_t<2>>, meta::list, int_t<2>>, meta::list, int_t<1>>>; - using loop_block_sizes_t = meta::list, int_t<2>>, meta::list, int_t<2>>, meta::list, int_t<2>>, meta::list, int_t<2>>>; - - apply_column_stage( - gpu(), sizes, cs, make_iterator_mock(), composite, int_t<1>(), tuple(42, 1)); + using loop_block_sizes_t = meta::list, int_t<2>>, + meta::list, int_t<2>>, + meta::list, int_t<2>>, + meta::list, int_t<2>>>; + + apply_column_stage(gpu(), + sizes, + cs, + make_iterator_mock(), + composite, + int_t<1>(), + tuple(42, 1)); cudaMemcpy(outh, out.get(), 5 * 7 * 3 * 2 * 3 * sizeof(int), cudaMemcpyDeviceToHost); for (int i = 0; i < 5; ++i) From 6bc7bcfc50b5415cacacdb5b5bbcfd67dcc59ec1 Mon Sep 17 00:00:00 2001 From: Felix Thaler Date: Wed, 19 Jun 2024 15:03:02 +0200 Subject: [PATCH 17/53] Check types of ThreadBlockSizes and LoopBlockSizes --- include/gridtools/fn/backend/gpu.hpp | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp index 6798d38c6..cd15a75a3 100644 --- a/include/gridtools/fn/backend/gpu.hpp +++ b/include/gridtools/fn/backend/gpu.hpp @@ -24,6 +24,18 @@ namespace gridtools::fn::backend { namespace gpu_impl_ { + template + struct is_valid_block_size_key_value_pair : std::false_type {}; + + template
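        // Illustrative completion only: the patch text breaks off in the template declaration above,
        // so the specialization below, and the idea of enforcing it in gpu via a static_assert, are
        // assumptions about the intended shape of the check (a pair of a dimension tag and an
        // integral constant), not the committed code of [PATCH 17/53].
        template <class Dim, class T, T V>
        struct is_valid_block_size_key_value_pair<meta::list<Dim, integral_constant<T, V>>> : std::true_type {};
        // A static_assert over all entries of ThreadBlockSizes and LoopBlockSizes using this predicate
        // would then turn a malformed block-size map into a readable compile-time error.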