Skip to content

Commit a5ddf5a

Browse files
author
Mete Durlu
committed
net/smc: support SMC-R V2 for rdma devices with max_recv_sge equals to 1
JIRA: https://issues.redhat.com/browse/RHEL-99989 commit 27ef6a9 Author: Guangguan Wang <guangguan.wang@linux.alibaba.com> Date: Wed Dec 11 10:30:54 2024 +0800 net/smc: support SMC-R V2 for rdma devices with max_recv_sge equals to 1 For SMC-R V2, an llc msg can be larger than SMC_WR_BUF_SIZE, so every recv wr has 2 sges: the first sge, with length SMC_WR_BUF_SIZE, is for V1/V2 compatible llc/cdc msgs, and the second sge, with length SMC_WR_BUF_V2_SIZE-SMC_WR_TX_SIZE, is for V2 specific llc msgs, like SMC_LLC_DELETE_RKEY and SMC_LLC_ADD_LINK for SMC-R V2. The memory buffer in the second sge is shared by all recv wrs in one link and by all links in one lgr in order to save memory. But not all RDMA devices have max_recv_sge greater than 1. Thus SMC-R V2 cannot be supported on such RDMA devices, and an SMC_CLC_DECL_INTERR fallback happens because creating the qp fails. This patch introduces support for SMC-R V2 on RDMA devices with max_recv_sge equal to 1. Every recv wr has only one sge with an individual buffer whose size is SMC_WR_BUF_V2_SIZE when the RDMA device's max_recv_sge equals 1. It may use more memory, but that is better than an SMC_CLC_DECL_INTERR fallback. Co-developed-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: Guangguan Wang <guangguan.wang@linux.alibaba.com> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Mahanta Jambigi <mjambigi@linux.ibm.com> Signed-off-by: Mete Durlu <mdurlu@redhat.com>
1 parent b3ef0f0 commit a5ddf5a

File tree

5 files changed

+52
-30
lines changed

5 files changed

+52
-30
lines changed

net/smc/smc_core.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -795,9 +795,14 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
795795
if (lgr->smc_version == SMC_V2) {
796796
lnk->smcibdev = ini->smcrv2.ib_dev_v2;
797797
lnk->ibport = ini->smcrv2.ib_port_v2;
798+
lnk->wr_rx_sge_cnt = lnk->smcibdev->ibdev->attrs.max_recv_sge < 2 ? 1 : 2;
799+
lnk->wr_rx_buflen = smc_link_shared_v2_rxbuf(lnk) ?
800+
SMC_WR_BUF_SIZE : SMC_WR_BUF_V2_SIZE;
798801
} else {
799802
lnk->smcibdev = ini->ib_dev;
800803
lnk->ibport = ini->ib_port;
804+
lnk->wr_rx_sge_cnt = 1;
805+
lnk->wr_rx_buflen = SMC_WR_BUF_SIZE;
801806
}
802807
get_device(&lnk->smcibdev->ibdev->dev);
803808
atomic_inc(&lnk->smcibdev->lnk_cnt);

net/smc/smc_core.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,14 @@ struct smc_link {
122122
} ____cacheline_aligned_in_smp;
123123
struct completion tx_ref_comp;
124124

125-
struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
125+
u8 *wr_rx_bufs; /* WR recv payload buffers */
126126
struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
127127
struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
128128
/* above three vectors have wr_rx_cnt elements and use the same index */
129+
int wr_rx_sge_cnt; /* rx sge, V1 is 1, V2 is either 2 or 1 */
130+
int wr_rx_buflen; /* buffer len for the first sge, len for the
131+
* second sge is lgr shared if rx sge is 2.
132+
*/
129133
dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
130134
dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
131135
u64 wr_rx_id; /* seq # of last recv WR */
@@ -506,6 +510,11 @@ static inline bool smc_link_active(struct smc_link *lnk)
506510
return lnk->state == SMC_LNK_ACTIVE;
507511
}
508512

513+
static inline bool smc_link_shared_v2_rxbuf(struct smc_link *lnk)
514+
{
515+
return lnk->wr_rx_sge_cnt > 1;
516+
}
517+
509518
static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
510519
{
511520
sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",

net/smc/smc_ib.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk)
662662
/* create a queue pair within the protection domain for a link */
663663
int smc_ib_create_queue_pair(struct smc_link *lnk)
664664
{
665-
int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
666665
struct ib_qp_init_attr qp_attr = {
667666
.event_handler = smc_ib_qp_event_handler,
668667
.qp_context = lnk,
@@ -676,7 +675,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
676675
.max_send_wr = SMC_WR_BUF_CNT * 3,
677676
.max_recv_wr = SMC_WR_BUF_CNT * 3,
678677
.max_send_sge = SMC_IB_MAX_SEND_SGE,
679-
.max_recv_sge = sges_per_buf,
678+
.max_recv_sge = lnk->wr_rx_sge_cnt,
680679
.max_inline_data = 0,
681680
},
682681
.sq_sig_type = IB_SIGNAL_REQ_WR,

net/smc/smc_llc.c

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -997,13 +997,14 @@ static int smc_llc_cli_conf_link(struct smc_link *link,
997997
}
998998

999999
static void smc_llc_save_add_link_rkeys(struct smc_link *link,
1000-
struct smc_link *link_new)
1000+
struct smc_link *link_new,
1001+
u8 *llc_msg)
10011002
{
10021003
struct smc_llc_msg_add_link_v2_ext *ext;
10031004
struct smc_link_group *lgr = link->lgr;
10041005
int max, i;
10051006

1006-
ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 +
1007+
ext = (struct smc_llc_msg_add_link_v2_ext *)(llc_msg +
10071008
SMC_WR_TX_SIZE);
10081009
max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2);
10091010
down_write(&lgr->rmbs_lock);
@@ -1098,7 +1099,9 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
10981099
if (rc)
10991100
goto out_clear_lnk;
11001101
if (lgr->smc_version == SMC_V2) {
1101-
smc_llc_save_add_link_rkeys(link, lnk_new);
1102+
u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ?
1103+
(u8 *)lgr->wr_rx_buf_v2 : (u8 *)llc;
1104+
smc_llc_save_add_link_rkeys(link, lnk_new, llc_msg);
11021105
} else {
11031106
rc = smc_llc_cli_rkey_exchange(link, lnk_new);
11041107
if (rc) {
@@ -1498,7 +1501,9 @@ int smc_llc_srv_add_link(struct smc_link *link,
14981501
if (rc)
14991502
goto out_err;
15001503
if (lgr->smc_version == SMC_V2) {
1501-
smc_llc_save_add_link_rkeys(link, link_new);
1504+
u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ?
1505+
(u8 *)lgr->wr_rx_buf_v2 : (u8 *)add_llc;
1506+
smc_llc_save_add_link_rkeys(link, link_new, llc_msg);
15021507
} else {
15031508
rc = smc_llc_srv_rkey_exchange(link, link_new);
15041509
if (rc)
@@ -1807,8 +1812,12 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr)
18071812
if (lgr->smc_version == SMC_V2) {
18081813
struct smc_llc_msg_delete_rkey_v2 *llcv2;
18091814

1810-
memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc));
1811-
llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2;
1815+
if (smc_link_shared_v2_rxbuf(link)) {
1816+
memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc));
1817+
llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2;
1818+
} else {
1819+
llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)llc;
1820+
}
18121821
llcv2->num_inval_rkeys = 0;
18131822

18141823
max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2);

net/smc/smc_wr.c

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,7 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
439439
return; /* short message */
440440
temp_wr_id = wc->wr_id;
441441
index = do_div(temp_wr_id, link->wr_rx_cnt);
442-
wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
442+
wr_rx = (struct smc_wr_rx_hdr *)(link->wr_rx_bufs + index * link->wr_rx_buflen);
443443
hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
444444
if (handler->type == wr_rx->type)
445445
handler->handler(wc, wr_rx);
@@ -555,7 +555,6 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
555555

556556
static void smc_wr_init_sge(struct smc_link *lnk)
557557
{
558-
int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
559558
bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
560559
u32 i;
561560

@@ -608,13 +607,14 @@ static void smc_wr_init_sge(struct smc_link *lnk)
608607
* the larger spillover buffer, allowing easy data mapping.
609608
*/
610609
for (i = 0; i < lnk->wr_rx_cnt; i++) {
611-
int x = i * sges_per_buf;
610+
int x = i * lnk->wr_rx_sge_cnt;
612611

613612
lnk->wr_rx_sges[x].addr =
614-
lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
615-
lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
613+
lnk->wr_rx_dma_addr + i * lnk->wr_rx_buflen;
614+
lnk->wr_rx_sges[x].length = smc_link_shared_v2_rxbuf(lnk) ?
615+
SMC_WR_TX_SIZE : lnk->wr_rx_buflen;
616616
lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
617-
if (lnk->lgr->smc_version == SMC_V2) {
617+
if (lnk->lgr->smc_version == SMC_V2 && smc_link_shared_v2_rxbuf(lnk)) {
618618
lnk->wr_rx_sges[x + 1].addr =
619619
lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
620620
lnk->wr_rx_sges[x + 1].length =
@@ -624,7 +624,7 @@ static void smc_wr_init_sge(struct smc_link *lnk)
624624
}
625625
lnk->wr_rx_ibs[i].next = NULL;
626626
lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
627-
lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
627+
lnk->wr_rx_ibs[i].num_sge = lnk->wr_rx_sge_cnt;
628628
}
629629
lnk->wr_reg.wr.next = NULL;
630630
lnk->wr_reg.wr.num_sge = 0;
@@ -655,7 +655,7 @@ void smc_wr_free_link(struct smc_link *lnk)
655655

656656
if (lnk->wr_rx_dma_addr) {
657657
ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
658-
SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
658+
lnk->wr_rx_buflen * lnk->wr_rx_cnt,
659659
DMA_FROM_DEVICE);
660660
lnk->wr_rx_dma_addr = 0;
661661
}
@@ -740,13 +740,11 @@ int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
740740

741741
int smc_wr_alloc_link_mem(struct smc_link *link)
742742
{
743-
int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;
744-
745743
/* allocate link related memory */
746744
link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
747745
if (!link->wr_tx_bufs)
748746
goto no_mem;
749-
link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
747+
link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, link->wr_rx_buflen,
750748
GFP_KERNEL);
751749
if (!link->wr_rx_bufs)
752750
goto no_mem_wr_tx_bufs;
@@ -774,7 +772,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
774772
if (!link->wr_tx_sges)
775773
goto no_mem_wr_tx_rdma_sges;
776774
link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
777-
sizeof(link->wr_rx_sges[0]) * sges_per_buf,
775+
sizeof(link->wr_rx_sges[0]) * link->wr_rx_sge_cnt,
778776
GFP_KERNEL);
779777
if (!link->wr_rx_sges)
780778
goto no_mem_wr_tx_sges;
@@ -872,21 +870,23 @@ int smc_wr_create_link(struct smc_link *lnk)
872870
smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
873871
lnk->wr_rx_id = 0;
874872
lnk->wr_rx_dma_addr = ib_dma_map_single(
875-
ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
873+
ibdev, lnk->wr_rx_bufs, lnk->wr_rx_buflen * lnk->wr_rx_cnt,
876874
DMA_FROM_DEVICE);
877875
if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
878876
lnk->wr_rx_dma_addr = 0;
879877
rc = -EIO;
880878
goto out;
881879
}
882880
if (lnk->lgr->smc_version == SMC_V2) {
883-
lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
884-
lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
885-
DMA_FROM_DEVICE);
886-
if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
887-
lnk->wr_rx_v2_dma_addr = 0;
888-
rc = -EIO;
889-
goto dma_unmap;
881+
if (smc_link_shared_v2_rxbuf(lnk)) {
882+
lnk->wr_rx_v2_dma_addr =
883+
ib_dma_map_single(ibdev, lnk->lgr->wr_rx_buf_v2,
884+
SMC_WR_BUF_V2_SIZE, DMA_FROM_DEVICE);
885+
if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
886+
lnk->wr_rx_v2_dma_addr = 0;
887+
rc = -EIO;
888+
goto dma_unmap;
889+
}
890890
}
891891
lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
892892
lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
@@ -935,7 +935,7 @@ int smc_wr_create_link(struct smc_link *lnk)
935935
lnk->wr_tx_v2_dma_addr = 0;
936936
}
937937
ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
938-
SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
938+
lnk->wr_rx_buflen * lnk->wr_rx_cnt,
939939
DMA_FROM_DEVICE);
940940
lnk->wr_rx_dma_addr = 0;
941941
out:

0 commit comments

Comments
 (0)