@@ -3484,3 +3484,142 @@ void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
             break;
     }
 }
+
+void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src0 = dst->src[0];  // conv_x
+    ggml_tensor * src1 = dst->src[1];  // conv1d.weight
+
+    // This op is currently defined only for F32 in ggml_cpu
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    // Shapes follow ggml_compute_forward_ssm_conv_f32
+    const int64_t nc  = src1->ne[0];  // d_conv
+    const int64_t ncs = src0->ne[0];  // d_conv - 1 + n_t
+    const int64_t nr  = src0->ne[1];  // d_inner
+    const int64_t n_s = src0->ne[2];  // n_seqs
+
+    const int64_t n_t = dst->ne[1];  // tokens per sequence
+
+    GGML_ASSERT(dst->ne[0] == nr);     // dst:    {d_inner, n_t, n_s}
+    GGML_ASSERT(src1->ne[1] == nr);    // weight: {d_conv, d_inner}
+    GGML_ASSERT(ncs == nc - 1 + n_t);  // conv_x: {d_conv - 1 + n_t, d_inner, n_s}
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src1->nb[0] == sizeof(float));
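+    // e.g. (assumed sizes) for d_conv = 4 and n_t = 8 tokens,
+    // conv_x carries ncs = 4 - 1 + 8 = 11 columns per channel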
+
+    // --- Build CANN tensors ---
+
+    // 1) Input: conv_x as NCL
+    //
+    // src0->ne = { ncs, nr, n_s, 1 }  // {L_in, C, N}
+    // Passing ACL_FORMAT_NCL here means:
+    //   reversed dims -> [N, C, L_in] = [n_s, nr, ncs]
+    acl_tensor_ptr acl_x = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
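+    // (Illustrative, with assumed sizes: ne = { 11, 1536, 2, 1 } is presented
+    // to ACL as an NCL tensor of shape [2, 1536, 11].)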
+
+    // 2) Weights: depthwise conv kernel, view src1 as {K, 1, C}
+    //
+    // src1 original: ne = { nc, nr, 1, 1 }  // [K, C, 1, 1]
+    // we want a view: ne_w = { nc, 1, nr }  // [K, 1, C]
+    // so that reversed dims -> [C, 1, K], which matches
+    // [out_channels, in_channels/groups, kernel_size]
+    int64_t w_ne[GGML_MAX_DIMS] = { 0 };
+    size_t  w_nb[GGML_MAX_DIMS] = { 0 };
+
+    w_ne[0] = nc;  // K
+    w_ne[1] = 1;   // 1 input channel per group
+    w_ne[2] = nr;  // C groups
+    w_ne[3] = 1;
+
+    // Layout: src1 data is [K, C] with
+    //   offset(k, c) = k*nb0 + c*nb1
+    // We want offset_w(k, 0, c) = k*nb0 + c*nb1,
+    // so we can reuse nb0 and nb1, and set nb2 = nb1.
+    w_nb[0] = src1->nb[0];  // sizeof(float)
+    w_nb[1] = src1->nb[1];  // nc * sizeof(float)
+    w_nb[2] = src1->nb[1];  // same stride for each (fake) "channel"
+    w_nb[3] = src1->nb[3];
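+    // Sanity check (assumed nc = 4, strides in bytes):
+    //   offset_w(2, 0, 5) = 2*4 + 5*16 = 88 = offset(2, 5)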
+
+    acl_tensor_ptr acl_w = ggml_cann_create_tensor(
+        src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), w_ne, w_nb, 3, ACL_FORMAT_NCL);
+
+    // 3) Output: dst is { d_inner, n_t, n_s } (CLN)
+    //
+    // We need an NCL view of the same buffer:
+    //   desired NCL logical shape: { L_out = n_t, C = nr, N = n_s }
+    //
+    // Original CLN layout:
+    //   dst->ne    = { nr, n_t, n_s }
+    //   dst->nb[0] = sizeof(float)
+    //   dst->nb[1] = nr * sizeof(float)
+    //   dst->nb[2] = nr * n_t * sizeof(float)
+    //
+    // We want offset_new(L, C, N) = offset_orig(C, L, N).
+    // Choose:
+    //   nb_y[0] = nr * sizeof(float);        // step in L
+    //   nb_y[1] = sizeof(float);             // step in C
+    //   nb_y[2] = nr * n_t * sizeof(float);  // step in N
+    int64_t y_ne[GGML_MAX_DIMS] = { 0 };
+    size_t  y_nb[GGML_MAX_DIMS] = { 0 };
+
+    y_ne[0] = n_t;  // L_out
+    y_ne[1] = nr;   // C
+    y_ne[2] = n_s;  // N
+    y_ne[3] = 1;
+
+    y_nb[0] = dst->ne[0] * sizeof(float);               // nr * sizeof(float)
+    y_nb[1] = sizeof(float);
+    y_nb[2] = dst->ne[0] * dst->ne[1] * sizeof(float);  // nr * n_t * sizeof(float)
+    y_nb[3] = dst->nb[3];
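+    // Sanity check (assumed nr = 3, n_t = 2, strides in bytes):
+    //   offset_new(1, 2, 0) = 1*12 + 2*4 = 20 = offset_orig(2, 1, 0)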
+
+    acl_tensor_ptr acl_y = ggml_cann_create_tensor(
+        dst->data, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), y_ne, y_nb, 3, ACL_FORMAT_NCL);
+
+    // --- Conv1d parameters: depthwise, stride 1, no padding ("valid") ---
+    int64_t strideVal[1]   = { 1 };
+    int64_t paddingVal[1]  = { 0 };
+    int64_t dilationVal[1] = { 1 };
+
+    acl_int_array_ptr stride   = ggml_cann_create_int_array(strideVal, 1);
+    acl_int_array_ptr padding  = ggml_cann_create_int_array(paddingVal, 1);
+    acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
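+    // With these parameters the usual conv output-length formula gives
+    //   L_out = (L_in + 2*pad - dilation*(nc - 1) - 1) / stride + 1
+    //         = ncs - (nc - 1) = n_t,
+    // matching the expected dst shape.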
+
+    const bool    transposed = false;
+    const int64_t groups     = nr;  // depthwise: one group per inner dim
+    int8_t        cubeMathType = 0;
+
+#ifdef ASCEND_310P
+    cubeMathType = 1;
+#endif
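+
+    // Per ggml_compute_forward_ssm_conv_f32, this computes a per-channel
+    // "valid" cross-correlation over the time axis:
+    //   y[s][t][c] = sum_{k=0}^{nc-1} w[c][k] * x[s][c][t + k]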
+
+    GGML_CANN_CALL_ACLNN_OP(ctx,
+                            Convolution,
+                            acl_x.get(),    // input:  [N, C, L_in = ncs]
+                            acl_w.get(),    // weight: [C, 1, K] with groups = nr
+                            nullptr,        // bias
+                            stride.get(),
+                            padding.get(),
+                            dilation.get(),
+                            transposed,
+                            padding.get(),  // output padding (unused for non-transposed)
+                            groups,
+                            acl_y.get(),
+                            cubeMathType);
+}
+