Skip to content

Commit df6a560

Browse files
0Marble5ujinKang
andcommitted
CANN: implement SSM_CONV operator
Co-authored-by: Aleksei Lobanov, <zeromarblectm@gmail.com> Co-authored-by: Sujin Kang, <waterjin326@gmail.com>
1 parent b3e3060 commit df6a560

File tree

4 files changed

+137
-0
lines changed

4 files changed

+137
-0
lines changed

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3484,3 +3484,126 @@ void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
34843484
break;
34853485
}
34863486
}
3487+
3488+
void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3489+
ggml_tensor * src0 = dst->src[0]; // conv_x
3490+
ggml_tensor * src1 = dst->src[1]; // conv1d.weight
3491+
3492+
// This op is currently defined only for F32 in ggml_cpu
3493+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
3494+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
3495+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
3496+
3497+
// Shapes follow ggml_compute_forward_ssm_conv_f32
3498+
const int64_t nc = src1->ne[0]; // d_conv
3499+
const int64_t ncs = src0->ne[0]; // d_conv - 1 + n_t
3500+
const int64_t nr = src0->ne[1]; // d_inner
3501+
const int64_t n_s = src0->ne[2]; // n_seqs
3502+
3503+
const int64_t n_t = dst->ne[1]; // tokens per sequence
3504+
3505+
GGML_ASSERT(dst->ne[0] == nr); // dst: {d_inner, n_t, n_s}
3506+
GGML_ASSERT(src1->ne[1] == nr); // weight: {d_conv, d_inner}
3507+
GGML_ASSERT(ncs == nc - 1 + n_t); // conv_x: {d_conv - 1 + n_t, d_inner, n_s}
3508+
GGML_ASSERT(src0->nb[0] == sizeof(float));
3509+
GGML_ASSERT(src1->nb[0] == sizeof(float));
3510+
3511+
// --- Build CANN tensors ---
3512+
3513+
// 1) Input: conv_x as NCL
3514+
//
3515+
// src0->ne = { ncs, nr, n_s, 1 } // {L_in, C, N}
3516+
// Passing ACL_FORMAT_NCL here means:
3517+
// reversed dims -> [N, C, L_in] = [n_s, nr, ncs]
3518+
acl_tensor_ptr acl_x = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
3519+
3520+
// 2) Weights: depthwise conv kernel, view src1 as {K, 1, C}
3521+
//
3522+
// src1 original: ne = { nc, nr, 1, 1 } // [K, C, 1, 1]
3523+
// we want a view: ne_w = { nc, 1, nr } // [K, 1, C]
3524+
// so that reversed dims -> [C, 1, K] which matches
3525+
// [out_channels, in_channels/groups, kernel_size]
3526+
int64_t w_ne[GGML_MAX_DIMS] = { 0 };
3527+
size_t w_nb[GGML_MAX_DIMS] = { 0 };
3528+
3529+
w_ne[0] = nc; // K
3530+
w_ne[1] = 1; // 1 input channel per group
3531+
w_ne[2] = nr; // C groups
3532+
w_ne[3] = 1;
3533+
3534+
// Layout: src1 data is [K, C] with
3535+
// offset(k, c) = k*nb0 + c*nb1
3536+
// We want offset_w(k, 0, c) = k*nb0 + c*nb1,
3537+
// so we can reuse nb0 and nb1, and set nb2 = nb1.
3538+
w_nb[0] = src1->nb[0]; // sizeof(float)
3539+
w_nb[1] = src1->nb[1]; // nc * sizeof(float)
3540+
w_nb[2] = src1->nb[1]; // same stride for each (fake) "channel"
3541+
w_nb[3] = src1->nb[3];
3542+
3543+
acl_tensor_ptr acl_w = ggml_cann_create_tensor(
3544+
src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), w_ne, w_nb, 3, ACL_FORMAT_NCL);
3545+
3546+
// 3) Output: dst is { d_inner, n_t, n_s } (CLN)
3547+
//
3548+
// We need an NCL view of the same buffer:
3549+
// desired NCL logical shape: { L_out = n_t, C = nr, N = n_s }
3550+
//
3551+
// Original CLN layout:
3552+
// dst->ne = { nr, n_t, n_s }
3553+
// dst->nb[0] = sizeof(float)
3554+
// dst->nb[1] = nr * sizeof(float)
3555+
// dst->nb[2] = nr * n_t * sizeof(float)
3556+
//
3557+
// We want offset_new(L, C, N) = offset_orig(C, L, N).
3558+
// Choose:
3559+
// nb_y[0] = nr * sizeof(float); // step in L
3560+
// nb_y[1] = sizeof(float); // step in C
3561+
// nb_y[2] = nr * n_t * sizeof(float); // step in N
3562+
int64_t y_ne[GGML_MAX_DIMS] = { 0 };
3563+
size_t y_nb[GGML_MAX_DIMS] = { 0 };
3564+
3565+
y_ne[0] = n_t; // L_out
3566+
y_ne[1] = nr; // C
3567+
y_ne[2] = n_s; // N
3568+
y_ne[3] = 1;
3569+
3570+
y_nb[0] = dst->ne[0] * sizeof(float); // nr * sizeof(float)
3571+
y_nb[1] = sizeof(float);
3572+
y_nb[2] = dst->ne[0] * dst->ne[1] * sizeof(float); // nr * n_t * sizeof(float)
3573+
y_nb[3] = dst->nb[3];
3574+
3575+
acl_tensor_ptr acl_y = ggml_cann_create_tensor(
3576+
dst->data, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), y_ne, y_nb, 3, ACL_FORMAT_NCL);
3577+
3578+
// --- Conv1d parameters: depthwise, stride 1, no padding ("valid") ---
3579+
int64_t strideVal[1] = { 1 };
3580+
int64_t paddingVal[1] = { 0 };
3581+
int64_t dilationVal[1] = { 1 };
3582+
3583+
acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
3584+
acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
3585+
acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
3586+
3587+
const bool transposed = false;
3588+
const int64_t groups = nr; // depthwise: one group per inner dim
3589+
int8_t cubeMathType = 0;
3590+
3591+
#ifdef ASCEND_310P
3592+
cubeMathType = 1;
3593+
#endif
3594+
3595+
GGML_CANN_CALL_ACLNN_OP(ctx,
3596+
Convolution,
3597+
acl_x.get(), // input: N, C, L_in = ncs
3598+
acl_w.get(), // weight: [C, 1, K] with groups=nr
3599+
nullptr, // bias
3600+
stride.get(),
3601+
padding.get(),
3602+
dilation.get(),
3603+
transposed,
3604+
padding.get(), // output padding (unused for non-transposed)
3605+
groups,
3606+
acl_y.get(),
3607+
cubeMathType);
3608+
}
3609+

ggml/src/ggml-cann/aclnn_ops.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,6 +1032,8 @@ void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTenso
10321032
ggml_backend_cann_context & ctx,
10331033
ggml_tensor * dst);
10341034

1035+
void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst);
1036+
10351037
/**
10361038
* @brief Applies a gated (GLU-style) unary operation using the CANN backend.
10371039
*

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1888,6 +1888,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
18881888
break;
18891889
case GGML_OP_OUT_PROD:
18901890
ggml_cann_out_prod(ctx, dst);
1891+
case GGML_OP_SSM_CONV:
1892+
ggml_cann_ssm_conv(ctx, dst);
18911893
break;
18921894
default:
18931895
return false;
@@ -2625,6 +2627,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
26252627
}
26262628
return true;
26272629
}
2630+
case GGML_OP_SSM_CONV:
2631+
return true;
26282632
default:
26292633
return false;
26302634
}

tests/test-backend-ops.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3379,6 +3379,14 @@ struct test_ssm_conv : public test_case {
33793379
ggml_tensor * out = ggml_ssm_conv(ctx, a, b);
33803380
return out;
33813381
}
3382+
3383+
// for CANN Ascend310P3:
3384+
// this card requires setting cubeMathType=1 (ALLOW_FP32_DOWN_PRECISION)
3385+
// so the inputs are converted from f32
3386+
// and tests fail with NMSE = 0.000000114 > 0.000000100
3387+
double max_nmse_err() override {
3388+
return 1e-6;
3389+
}
33823390
};
33833391

33843392
// GGML_OP_SSM_SCAN

0 commit comments

Comments
 (0)