Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 33 additions & 3 deletions backends/vulkan/runtime/graph/ops/glsl/col2im.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,40 @@ ${layout_declare_tensor(B, "w", "t_output", DTYPE, OUTPUT_STORAGE, is_scalar_arr
${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)}

// Sizes of the convolution output image
${layout_declare_ubo(B, "ivec4", "output_sizes")}
//${layout_declare_ubo(B, "ivec4", "output_sizes")}
// Sizes of the convolution input image
${layout_declare_ubo(B, "ivec4", "input_sizes")}
//${layout_declare_ubo(B, "ivec4", "input_sizes")}
// Sizes of the im2col matrix of the convolution output
${layout_declare_ubo(B, "ivec4", "matrix_sizes")}

${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}
//${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}

// Specialization constants for this shader. Every entry below is declared
// through the layout_declare_spec_const template helper with type "int"; the
// trailing "1" is presumably the default value — TODO confirm against the
// codegen helper.
//
// The conv2d_params_* entries are scalar constants named after individual
// convolution parameters (stride, padding, dilation, kernel size, per-group
// channel counts, K / K4 extents, groups). They sit next to the commented-out
// Conv2DParams UBO declaration above.
// NOTE(review): the declaration order may determine the constant ids assigned
// by the helper — confirm before reordering or inserting entries.
${layout_declare_spec_const(C, "int", "apply_bias", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_stride_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_stride_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_padding_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_padding_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_dilation_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_dilation_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_kernel_size_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_kernel_size_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_in_channels_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_out_channels_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_K4_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_K4", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_K_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_logical_K", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_logical_K_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_groups", "1")}

// Per-component tensor sizes. main() packs output_x..output_w into the ivec4
// `output_sizes` and input_x..input_w into the ivec4 `input_sizes`.
// NOTE(review): helper code elsewhere in this change declares locals named
// `input_x` / `input_y`; if that helper is included by this shader, those
// locals shadow the constants below — confirm this is intended.
${layout_declare_spec_const(C, "int", "output_x", "1")}
${layout_declare_spec_const(C, "int", "output_y", "1")}
${layout_declare_spec_const(C, "int", "output_z", "1")}
${layout_declare_spec_const(C, "int", "output_w", "1")}
${layout_declare_spec_const(C, "int", "input_x", "1")}
${layout_declare_spec_const(C, "int", "input_y", "1")}
${layout_declare_spec_const(C, "int", "input_z", "1")}
${layout_declare_spec_const(C, "int", "input_w", "1")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

Expand Down Expand Up @@ -79,6 +106,9 @@ void main() {
const int n4 = int(gl_GlobalInvocationID.x);
const int m4 = int(gl_GlobalInvocationID.y);

const ivec4 output_sizes = ivec4(int(output_x), int(output_y), int(output_z), int(output_w));
const ivec4 input_sizes = ivec4(int(input_x), int(input_y), int(input_z), int(input_w));

const int n = mul_4(n4);
const int m = mul_4(m4);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ void perform_conv1d(
const WeightRow weight_row) {
for (int out_w = 0; out_w < 4; ++out_w) {
[[unroll]] for (int kx = 0; kx < weight_row.len; ++kx) {
const int in_w = out_w * conv2d_params.stride.x;
const int in_w = out_w * conv2d_params_stride_x;
out_block.data[out_w] = fma(
input_window.data[in_w + kx],
weight_row.data[kx],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ ${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_arra
${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)}

${layout_declare_ubo(B, "ivec4", "output_sizes")}
${layout_declare_ubo(B, "ivec4", "input_sizes")}
${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}
//${layout_declare_ubo(B, "ivec4", "output_sizes")}
//${layout_declare_ubo(B, "ivec4", "input_sizes")}
//${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}

layout(push_constant) uniform restrict Block {
float input_scale;
Expand All @@ -48,11 +48,42 @@ layout(push_constant) uniform restrict Block {
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Specialization constants for this shader. Every entry below is declared
// through the layout_declare_spec_const template helper with type "int"; the
// trailing "1" is presumably the default value — TODO confirm against the
// codegen helper.
//
// The conv2d_params_* entries are scalar constants named after individual
// convolution parameters; main() reads them directly (for example
// conv2d_params_stride_x, conv2d_params_padding_x, conv2d_params_kernel_size_x
// and conv2d_params_dilation_x when computing the input window bounds).
// NOTE(review): the declaration order may determine the constant ids assigned
// by the helper — confirm before reordering or inserting entries.
${layout_declare_spec_const(C, "int", "apply_bias", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_stride_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_stride_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_padding_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_padding_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_dilation_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_dilation_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_kernel_size_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_kernel_size_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_in_channels_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_out_channels_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_K4_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_K4", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_K_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_logical_K", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_logical_K_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_groups", "1")}

// Per-component tensor sizes. main() packs output_x..output_w into the ivec4
// `output_sizes` and input_x..input_w into the ivec4 `input_sizes` before
// passing them to make_block_extents().
${layout_declare_spec_const(C, "int", "output_x", "1")}
${layout_declare_spec_const(C, "int", "output_y", "1")}
${layout_declare_spec_const(C, "int", "output_z", "1")}
${layout_declare_spec_const(C, "int", "output_w", "1")}
${layout_declare_spec_const(C, "int", "input_x", "1")}
${layout_declare_spec_const(C, "int", "input_y", "1")}
${layout_declare_spec_const(C, "int", "input_z", "1")}
${layout_declare_spec_const(C, "int", "input_w", "1")}


#include "conv2d_dw_q8_utils.glslh"

void main() {
const int tid = int(gl_GlobalInvocationID.x);

const ivec4 output_sizes = ivec4(int(output_x), int(output_y), int(output_z), int(output_w));
const ivec4 input_sizes = ivec4(int(input_x), int(input_y), int(input_z), int(input_w));


Conv2dBlockExtents out_block_extents = make_block_extents(output_sizes);

Conv2dBlockIndex out_block_idx = linear_idx_to_block_idx(
Expand All @@ -64,23 +95,23 @@ void main() {

const int out_w = mul_4(out_block_idx.data.x);
const int w_start =
(out_w * conv2d_params.stride.x) - conv2d_params.padding.x;
const int w_end = ((out_w + 3) * conv2d_params.stride.x) -
conv2d_params.padding.x +
(conv2d_params.kernel_size.x - 1) * conv2d_params.dilation.x;
(out_w * conv2d_params_stride_x) - conv2d_params_padding_x;
const int w_end = ((out_w + 3) * conv2d_params_stride_x) -
conv2d_params_padding_x +
(conv2d_params_kernel_size_x - 1) * conv2d_params_dilation_x;

Conv2dBlockExtents in_block_extents = make_block_extents(input_sizes);

const ivec4 input_zps = ivec4(pack_into_int32(ivec4(input_zp)));
const vec4 weight_scales = vec4(t_weight_scales[out_block_idx.data.z]);

const int Kw4 = div_up_4(conv2d_params.kernel_size.x);
const int Kw4 = div_up_4(conv2d_params_kernel_size_x);

FPOutBlock out_block;
for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) {
for (int ky = 0; ky < conv2d_params_kernel_size_y; ky++) {
const int out_h = out_block_idx.data.y;
const int h = out_h * conv2d_params.stride.y - conv2d_params.padding.y +
ky * conv2d_params.dilation.y;
const int h = out_h * conv2d_params_stride_y - conv2d_params_padding_y +
ky * conv2d_params_dilation_y;

InputWindow1D input_window = load_input_window(
w_start,
Expand All @@ -96,7 +127,7 @@ void main() {
out_block_idx.data.z,
ky,
out_block_extents.data.z,
conv2d_params.kernel_size.x,
conv2d_params_kernel_size_x,
Kw4,
weight_scales);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,27 +63,27 @@ void im2col_idx_to_input_tidx(
TensorIndex4D output_tidx;
unwrap_m(output_tidx, im2col_idx.row);

const int in_channels_per_group = conv2d_params.in_channels_per_group;
const int in_channels_per_group = conv2d_params_in_channels_per_group;
// Determine the corresponding position within the convolution window based
// on the col index (more specifically, the col index within the group)
const int channel_within_group =
im2col_idx.col_idx_in_group % in_channels_per_group;
const int kernel_x = (im2col_idx.col_idx_in_group / in_channels_per_group) %
conv2d_params.kernel_size.x;
conv2d_params_kernel_size_x;
const int kernel_y = im2col_idx.col_idx_in_group /
(in_channels_per_group * conv2d_params.kernel_size.x);
(in_channels_per_group * conv2d_params_kernel_size_x);

// Calculate the actual input channel index
const int channel_idx =
im2col_idx.group_idx * conv2d_params.in_channels_per_group +
im2col_idx.group_idx * conv2d_params_in_channels_per_group +
channel_within_group;

// Calculate corresponding input coordinates based on output position
// associated with the row index.
const int input_y = int(output_tidx.data.y * conv2d_params.stride.y) -
int(conv2d_params.padding.y) + int(kernel_y * conv2d_params.dilation.y);
const int input_x = int(output_tidx.data.x * conv2d_params.stride.x) -
int(conv2d_params.padding.x) + int(kernel_x * conv2d_params.dilation.x);
const int input_y = int(output_tidx.data.y * conv2d_params_stride_y) -
int(conv2d_params_padding_y) + int(kernel_y * conv2d_params_dilation_y);
const int input_x = int(output_tidx.data.x * conv2d_params_stride_x) -
int(conv2d_params_padding_x) + int(kernel_x * conv2d_params_dilation_x);

input_tidx.data = ivec4(input_x, input_y, channel_idx, output_tidx.data.w);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ void load_im2col_block_fast(
// Due to the assumption that in_channels_per_group % 4 == 0, it is
// guaranteed that the next 4 columns (including this one) are part of the
// same group.
im2col_idx.group_idx = im2col_idx.col / conv2d_params.K_per_group;
im2col_idx.col_idx_in_group = im2col_idx.col % conv2d_params.K_per_group;
im2col_idx.group_idx = im2col_idx.col / conv2d_params_K_per_group;
im2col_idx.col_idx_in_group = im2col_idx.col % conv2d_params_K_per_group;

[[unroll]] for (int m_off = 0; m_off < 4; ++m_off) {
if (im2col_idx.row >= M) {
Expand Down Expand Up @@ -98,9 +98,9 @@ void load_im2col_block_slow(
im2col_idx_base.col = mul_4(k4);
im2col_idx_base.row = mul_4(m4);

im2col_idx_base.group_idx = im2col_idx_base.col / conv2d_params.K_per_group;
im2col_idx_base.group_idx = im2col_idx_base.col / conv2d_params_K_per_group;
im2col_idx_base.col_idx_in_group =
im2col_idx_base.col % conv2d_params.K_per_group;
im2col_idx_base.col % conv2d_params_K_per_group;

[[unroll]] for (int m_off = 0; m_off < 4; ++m_off) {
[[unroll]] for (int k_off = 0; k_off < 4; ++k_off) {
Expand All @@ -109,7 +109,7 @@ void load_im2col_block_slow(
im2col_idx.col_idx_in_group += k_off;

// bounds checking
if (im2col_idx.col_idx_in_group >= conv2d_params.logical_K_per_group ||
if (im2col_idx.col_idx_in_group >= conv2d_params_logical_K_per_group ||
im2col_idx.row >= M) {
block.data[m_off][k_off] = T(0);
continue;
Expand All @@ -129,7 +129,7 @@ void load_im2col_block(
const int m4,
const int logical_K,
const int M) {
if (mod_4(conv2d_params.in_channels_per_group) == 0) {
if (mod_4(conv2d_params_in_channels_per_group) == 0) {
load_im2col_block_fast(block, k4, m4, logical_K, M);
} else {
load_im2col_block_slow(block, k4, m4, logical_K, M);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ ${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_arra
${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)}

${layout_declare_ubo(B, "ivec4", "output_sizes")}
${layout_declare_ubo(B, "ivec4", "input_sizes")}
${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}
//${layout_declare_ubo(B, "ivec4", "output_sizes")}
//${layout_declare_ubo(B, "ivec4", "input_sizes")}
//${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}

layout(push_constant) uniform restrict Block {
float input_scale;
Expand All @@ -56,6 +56,32 @@ layout(push_constant) uniform restrict Block {
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Specialization constants for this shader. Every entry below is declared
// through the layout_declare_spec_const template helper with type "int"; the
// trailing "1" is presumably the default value — TODO confirm against the
// codegen helper.
//
// The conv2d_params_* entries are scalar constants named after individual
// convolution parameters; main() uses conv2d_params_K4_per_group as the trip
// count of its k4 loop.
// NOTE(review): the declaration order may determine the constant ids assigned
// by the helper — confirm before reordering or inserting entries.
${layout_declare_spec_const(C, "int", "apply_bias", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_stride_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_stride_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_padding_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_padding_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_dilation_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_dilation_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_kernel_size_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_kernel_size_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_in_channels_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_out_channels_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_K4_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_K4", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_K_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_logical_K", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_logical_K_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_groups", "1")}

// Per-component tensor sizes. main() packs output_x..output_w into the ivec4
// `output_sizes` and input_x..input_w into the ivec4 `input_sizes`.
${layout_declare_spec_const(C, "int", "output_x", "1")}
${layout_declare_spec_const(C, "int", "output_y", "1")}
${layout_declare_spec_const(C, "int", "output_z", "1")}
${layout_declare_spec_const(C, "int", "output_w", "1")}
${layout_declare_spec_const(C, "int", "input_x", "1")}
${layout_declare_spec_const(C, "int", "input_y", "1")}
${layout_declare_spec_const(C, "int", "input_z", "1")}
${layout_declare_spec_const(C, "int", "input_w", "1")}


#include "conv2d_int8_input_tile_load.glslh"
#include "linear_int8_weight_tile_load.glslh"
Expand All @@ -72,6 +98,9 @@ void main() {
output_block_idx.data.x = int(gl_GlobalInvocationID.y) * TILE_M4;
output_block_idx.data.y = int(gl_GlobalInvocationID.z);

const ivec4 output_sizes = ivec4(int(output_x), int(output_y), int(output_z), int(output_w));
const ivec4 input_sizes = ivec4(int(input_x), int(input_y), int(input_z), int(input_w));

Conv2dBlockExtents output_block_extents = make_block_extents(output_sizes);
if (block_idx_out_of_bounds(output_block_idx, output_block_extents)) {
return;
Expand All @@ -88,7 +117,7 @@ void main() {
Int8InputTileIndex input_idx = make_initial_int8_input_tile_index(
output_block_idx, input_block_extents);

for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) {
for (int k4 = 0; k4 < conv2d_params_K4_per_group; k4++) {
load_packed_int8_input_tile(int8_input_tile, input_idx);

load_int8_weight_tile(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ void perform_conv1d(
const ivec4 weight_block,
const int kx) {
[[unroll]] for (int out_w = 0; out_w < 4; ++out_w) {
const int window_i = out_w * conv2d_params.stride.x + kx;
const int window_i = out_w * conv2d_params_stride_x + kx;
[[unroll]] for (int out_c = 0; out_c < 4; ++out_c) {
accum.data[out_w][0][out_c] = dotPacked4x8AccSatEXT(
input_window.data[window_i],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,40 @@ ${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, i
${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)}

${layout_declare_ubo(B, "ivec4", "output_sizes")}
${layout_declare_ubo(B, "ivec4", "input_sizes")}
${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}
//${layout_declare_ubo(B, "ivec4", "output_sizes")}
//${layout_declare_ubo(B, "ivec4", "input_sizes")}
//${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Specialization constants for this shader. Every entry below is declared
// through the layout_declare_spec_const template helper with type "int"; the
// trailing "1" is presumably the default value — TODO confirm against the
// codegen helper.
//
// The conv2d_params_* entries are scalar constants named after individual
// convolution parameters; main() reads conv2d_params_out_channels_per_group,
// conv2d_params_K4_per_group and conv2d_params_K4 to derive the group index,
// the input K4 offset and the accumulation loop bounds.
// NOTE(review): the declaration order may determine the constant ids assigned
// by the helper — confirm before reordering or inserting entries.
${layout_declare_spec_const(C, "int", "apply_bias", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_stride_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_stride_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_padding_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_padding_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_dilation_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_dilation_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_kernel_size_x", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_kernel_size_y", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_in_channels_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_out_channels_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_K4_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_K4", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_K_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_logical_K", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_logical_K_per_group", "1")}
${layout_declare_spec_const(C, "int", "conv2d_params_groups", "1")}

// Per-component tensor sizes. main() packs output_x..output_w into the ivec4
// `output_sizes` and input_x..input_w into the ivec4 `input_sizes`.
${layout_declare_spec_const(C, "int", "output_x", "1")}
${layout_declare_spec_const(C, "int", "output_y", "1")}
${layout_declare_spec_const(C, "int", "output_z", "1")}
${layout_declare_spec_const(C, "int", "output_w", "1")}
${layout_declare_spec_const(C, "int", "input_x", "1")}
${layout_declare_spec_const(C, "int", "input_y", "1")}
${layout_declare_spec_const(C, "int", "input_z", "1")}
${layout_declare_spec_const(C, "int", "input_w", "1")}



#include "linear_fp_input_tile_load.glslh"
#include "linear_int8_weight_tile_load.glslh"
Expand All @@ -60,6 +87,10 @@ void main() {
const int out_tile_x = int(gl_GlobalInvocationID.x);
const int out_tile_y = int(gl_GlobalInvocationID.y);

const ivec4 output_sizes = ivec4(int(output_x), int(output_y), int(output_z), int(output_w));
const ivec4 input_sizes = ivec4(int(input_x), int(input_y), int(input_z), int(input_w));


const int n = int(out_tile_x * TILE_N);
const int m = int(out_tile_y * TILE_M);

Expand All @@ -75,10 +106,10 @@ void main() {
return;
}

const int group_idx = n / conv2d_params.out_channels_per_group;
const int input_k4_offset = conv2d_params.K4_per_group * group_idx;
const int group_idx = n / conv2d_params_out_channels_per_group;
const int input_k4_offset = conv2d_params_K4_per_group * group_idx;

const int K4 = conv2d_params.K4;
const int K4 = conv2d_params_K4;
const int N4 = div_up_4(N);

FPOutTile out_tile;
Expand All @@ -90,13 +121,13 @@ void main() {
const bool dont_check_bounds = (M - m) >= TILE_M;

if (dont_check_bounds) {
for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) {
for (int k4 = 0; k4 < conv2d_params_K4_per_group; k4++) {
load_input_tile_no_checks(in_tile, k4 + input_k4_offset, m, K4, M);
load_int8_weight_tile(int8_weight_tile, n4, k4, N4);
fp_accumulate_with_int8_weight(out_tile, in_tile, int8_weight_tile);
}
} else {
for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) {
for (int k4 = 0; k4 < conv2d_params_K4_per_group; k4++) {
load_input_tile_with_checks(in_tile, k4 + input_k4_offset, m, K4, M);
load_int8_weight_tile(int8_weight_tile, n4, k4, N4);
fp_accumulate_with_int8_weight(out_tile, in_tile, int8_weight_tile);
Expand Down
Loading
Loading