Skip to content

Commit c642870

Browse files
committed
[ICRDMA] Do not use RoCEv1 gid index if present. Improve rdma diagnostic info. EXT-1667 (#29694)
One port can have multiple indexes for same GID. The app must use correct index (RoCEv1 or RoCEv2). Unfortunately we can't check index type (no support in the ibdrv wrapper) so right now we use some hack to pick RoCEv2 index. This pr also adds more diagnostic info.
1 parent dc64d31 commit c642870

File tree

9 files changed

+58
-13
lines changed

9 files changed

+58
-13
lines changed

ydb/library/actors/interconnect/interconnect_handshake.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1447,6 +1447,9 @@ namespace NActors {
14471447
} else {
14481448
sb << "Unable to complete rdma READ work request due to cq runtime error";
14491449
}
1450+
if (Rdma.Qp) {
1451+
sb << " qp: " << Rdma.Qp;
1452+
}
14501453
LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICRDMA", NLog::PRI_ERROR, sb.c_str());
14511454
rdmaReadAck.SetErr(sb);
14521455
}

ydb/library/actors/interconnect/interconnect_mon.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,22 @@ namespace NInterconnect {
7878
}
7979
}
8080

81+
TString XdcFlagsToString(const ui8 xdcFlags) {
82+
TString res;
83+
if (xdcFlags & TInterconnectProxyTCP::TProxyStats::XDCFlags::MSG_ZERO_COPY_SEND) {
84+
res.append("MSG_ZC_SEND|");
85+
}
86+
if (xdcFlags & TInterconnectProxyTCP::TProxyStats::XDCFlags::RDMA_READ) {
87+
res.append("RDMA_READ|");
88+
}
89+
if (res.empty()) {
90+
res.append("_");
91+
} else {
92+
res.pop_back();
93+
}
94+
return res;
95+
}
96+
8197
TString GenerateHtml() {
8298
TStringStream str;
8399
HTML(str) {
@@ -131,7 +147,7 @@ namespace NInterconnect {
131147
TABLED() { str << kv.second.TotalOutputQueueSize; }
132148
TABLED() { str << (kv.second.Connected ? "yes" : "<strong>no</strong>"); }
133149
TABLED() { str << (kv.second.ExternalDataChannel ? "yes" : "no")
134-
<< " (" << (kv.second.XDCFlags & TInterconnectProxyTCP::TProxyStats::XDCFlags::MSG_ZERO_COPY_SEND ? "MSG_ZC_SEND" : "_") << ")"; }
150+
<< " (" << XdcFlagsToString(kv.second.XDCFlags) << ")"; }
135151
TABLED() { str << kv.second.Host; }
136152
TABLED() { str << kv.second.Port; }
137153
TABLED() {

ydb/library/actors/interconnect/interconnect_tcp_proxy.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ namespace NActors {
5353
enum XDCFlags {
5454
NONE = 0,
5555
MSG_ZERO_COPY_SEND = 1,
56+
RDMA_READ = 1 << 1,
5657
};
5758
ui8 XDCFlags;
5859
};

ydb/library/actors/interconnect/interconnect_tcp_session.cpp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,16 +68,25 @@ namespace NActors {
6868
SendUpdateToWhiteboard();
6969
}
7070

71-
std::optional<ui8> TInterconnectSessionTCP::GetXDCFlags() const {
71+
std::optional<ui8> TInterconnectSessionTCP::GetXDCFlags() const noexcept {
72+
std::optional<ui8> flags;
73+
using NInterconnect::NRdma::TQueuePair;
7274
if (XdcSocket) {
75+
flags = TInterconnectProxyTCP::TProxyStats::NONE;
7376
if (ZcProcessor.ZcStateIsOk()) {
74-
return TInterconnectProxyTCP::TProxyStats::MSG_ZERO_COPY_SEND;
75-
} else {
76-
return TInterconnectProxyTCP::TProxyStats::NONE;
77+
*flags |= TInterconnectProxyTCP::TProxyStats::MSG_ZERO_COPY_SEND;
78+
}
79+
if (RdmaQp) {
80+
TQueuePair::TQpState res = RdmaQp->GetState(false);
81+
TQueuePair::TQpS* qpState = std::get_if<TQueuePair::TQpS>(&res);
82+
if (qpState) {
83+
if (TQueuePair::IsRtsState(*qpState)) {
84+
*flags |= TInterconnectProxyTCP::TProxyStats::RDMA_READ;
85+
}
86+
}
7787
}
78-
} else {
79-
return {};
8088
}
89+
return flags;
8190
}
8291

8392
void TInterconnectSessionTCP::CloseInputSession() {

ydb/library/actors/interconnect/interconnect_tcp_session.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ namespace NActors {
466466
return ReceiveContext->ClockSkew_us;
467467
}
468468

469-
std::optional<ui8> GetXDCFlags() const;
469+
std::optional<ui8> GetXDCFlags() const noexcept;
470470

471471
private:
472472
friend class TInterconnectProxyTCP;

ydb/library/actors/interconnect/rdma/link_manager.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,14 @@
2929

3030
template <>
3131
struct std::less<ibv_gid> {
32-
std::size_t operator()(const ibv_gid& a, const ibv_gid& b) const {
32+
bool operator()(const ibv_gid& a, const ibv_gid& b) const noexcept {
3333
return std::tie(a.global.subnet_prefix, a.global.interface_id) <
3434
std::tie(b.global.subnet_prefix, b.global.interface_id);
3535
}
3636
};
3737
template <>
3838
struct std::equal_to<ibv_gid> {
39-
bool operator()(const ibv_gid& a, const ibv_gid& b) const {
39+
bool operator()(const ibv_gid& a, const ibv_gid& b) const noexcept {
4040
return a.global.interface_id == b.global.interface_id
4141
&& a.global.subnet_prefix == b.global.subnet_prefix;
4242
}
@@ -118,10 +118,19 @@ static class TRdmaLinkManager {
118118
continue;
119119
}
120120

121+
/*
122+
* TODO: Add ibv_query_gid_table support into arcadia ibv wrapper (contrib/libs/ibdrv)
123+
* This extension allows to get type of GID index to skip unsupported RoCEv1
124+
*/
125+
121126
for (uint8_t portNum = 1; portNum <= devAttrs.phys_port_cnt; portNum++) {
122127
ibv_port_attr portAttrs;
123128
err = ibv_query_port(ctx, portNum, &portAttrs);
124129
if (err == 0) {
130+
if (portAttrs.state != IBV_PORT_ACTIVE) {
131+
continue; //Skip non active ports
132+
}
133+
125134
for (int gidIndex = 0; gidIndex < portAttrs.gid_tbl_len; gidIndex++ ) {
126135
auto ctx = TRdmaCtx::Create(deviceCtx, portNum, gidIndex);
127136
if (!ctx) {
@@ -135,6 +144,10 @@ static class TRdmaLinkManager {
135144
}
136145
std::sort(CtxMap.begin(), CtxMap.end(),
137146
[](const auto& a, const auto& b) {
147+
if (std::equal_to<ibv_gid>()(a.first, b.first)) {
148+
// Hack: Most implementations have RoCEv2 after RoCEv1, but we prefer RoCEv2 gid index
149+
return a.second->GetGidIndex() > b.second->GetGidIndex();
150+
}
138151
return std::less<ibv_gid>()(a.first, b.first);
139152
});
140153

ydb/library/actors/interconnect/rdma/rdma.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,9 @@ void TQueuePair::Output(IOutputStream& os) const noexcept {
290290
} else {
291291
os << attr.qp_state;
292292
}
293+
if (Ctx) {
294+
os << ", ctx: " << *Ctx;
295+
}
293296
}
294297

295298
TQueuePair::TQpState TQueuePair::GetState(bool forseUpdate) const noexcept {
@@ -324,7 +327,7 @@ size_t TQueuePair::GetDeviceIndex() const noexcept {
324327
return Ctx->GetDeviceIndex();
325328
}
326329

327-
bool TQueuePair::IsRtsState(TQpS state) {
330+
bool TQueuePair::IsRtsState(TQpS state) noexcept {
328331
enum ibv_qp_state qpState = static_cast<enum ibv_qp_state>(state.State);
329332
return qpState == IBV_QPS_RTS;
330333
}

ydb/library/actors/interconnect/rdma/rdma.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ class TQueuePair: public NNonCopyable::TMoveOnly {
9494
struct TQpS {
9595
int State;
9696
};
97-
static bool IsRtsState(TQpS state);
97+
static bool IsRtsState(TQpS state) noexcept;
9898
struct TQpErr {
9999
int Err;
100100
};

ydb/library/actors/interconnect/ut/interconnect_ut.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ Y_UNIT_TEST_SUITE(Interconnect) {
225225
{
226226
auto s = GetRdmaQpStatus(cluster, 1, 2);
227227
auto tokens = SplitString(s, ",");
228-
UNIT_ASSERT(tokens.size() == 2);
228+
UNIT_ASSERT(tokens.size() > 2);
229229
UNIT_ASSERT(tokens[1] == "QPS_RTS");
230230
}
231231

0 commit comments

Comments
 (0)