/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "mlx5_priv.h"
#include "mlx5_ifc.h"
#include "spdk/log.h"
#include "spdk/util.h"
#include "spdk/barrier.h"
#include "spdk/likely.h"

#include "spdk_internal/rdma_utils.h"
#include "spdk_internal/mlx5.h"

#define MLX5_DMA_Q_TX_CQE_SIZE 64

struct _mlx5_err_cqe {
	uint8_t rsvd0[32];
	uint32_t srqn;
	uint8_t rsvd1[16];
	uint8_t hw_err_synd;
	uint8_t rsvd2[1];
	uint8_t vendor_err_synd;
	uint8_t syndrome;
	uint32_t s_wqe_opcode_qpn;
	uint16_t wqe_counter;
	uint8_t signature;
	uint8_t op_own;
};

struct mlx5_sigerr_cqe {
	uint8_t rsvd0[16];
	uint32_t expected_trans_sig;
	uint32_t actual_trans_sig;
	uint32_t expected_ref_tag;
	uint32_t actual_ref_tag;
	uint16_t syndrome;
	uint8_t sig_type;
	uint8_t domain;
	uint32_t mkey;
	uint64_t sig_err_offset;
	uint8_t rsvd30[14];
	uint8_t signature;
	uint8_t op_own;
};

static const char *
mlx5_cqe_err_opcode(struct _mlx5_err_cqe *ecqe)
{
	uint8_t wqe_err_opcode = be32toh(ecqe->s_wqe_opcode_qpn) >> 24;

	switch (ecqe->op_own >> 4) {
	case MLX5_CQE_REQ_ERR:
		switch (wqe_err_opcode) {
		case MLX5_OPCODE_RDMA_WRITE_IMM:
		case MLX5_OPCODE_RDMA_WRITE:
			return "RDMA_WRITE";
		case MLX5_OPCODE_SEND_IMM:
		case MLX5_OPCODE_SEND:
		case MLX5_OPCODE_SEND_INVAL:
			return "SEND";
		case MLX5_OPCODE_RDMA_READ:
			return "RDMA_READ";
		case MLX5_OPCODE_ATOMIC_CS:
			return "COMPARE_SWAP";
		case MLX5_OPCODE_ATOMIC_FA:
			return "FETCH_ADD";
		case MLX5_OPCODE_ATOMIC_MASKED_CS:
			return "MASKED_COMPARE_SWAP";
		case MLX5_OPCODE_ATOMIC_MASKED_FA:
			return "MASKED_FETCH_ADD";
		case MLX5_OPCODE_MMO:
			return "GGA_DMA";
		default:
			return "";
		}
	case MLX5_CQE_RESP_ERR:
		return "RECV";
	default:
		return "";
	}
}

static int
mlx5_cqe_err(struct mlx5_cqe64 *cqe)
{
	struct _mlx5_err_cqe *ecqe = (struct _mlx5_err_cqe *)cqe;
	uint16_t wqe_counter;
	uint32_t qp_num = 0;
	char info[200] = {0};

	wqe_counter = be16toh(ecqe->wqe_counter);
	qp_num = be32toh(ecqe->s_wqe_opcode_qpn) & ((1 << 24) - 1);

	if (ecqe->syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
		SPDK_DEBUGLOG(mlx5, "QP 0x%x wqe[%d] is flushed\n", qp_num, wqe_counter);
		return ecqe->syndrome;
	}

	switch (ecqe->syndrome) {
	case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR:
		snprintf(info, sizeof(info), "Local length");
		break;
	case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR:
		snprintf(info, sizeof(info), "Local QP operation");
		break;
	case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR:
		snprintf(info, sizeof(info), "Local protection");
		break;
	case MLX5_CQE_SYNDROME_WR_FLUSH_ERR:
		snprintf(info, sizeof(info), "WR flushed because QP in error state");
		break;
	case MLX5_CQE_SYNDROME_MW_BIND_ERR:
		snprintf(info, sizeof(info), "Memory window bind");
		break;
	case MLX5_CQE_SYNDROME_BAD_RESP_ERR:
		snprintf(info, sizeof(info), "Bad response");
		break;
	case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR:
		snprintf(info, sizeof(info), "Local access");
		break;
	case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
		snprintf(info, sizeof(info), "Invalid request");
		break;
	case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR:
		snprintf(info, sizeof(info), "Remote access");
		break;
	case MLX5_CQE_SYNDROME_REMOTE_OP_ERR:
		snprintf(info, sizeof(info), "Remote QP");
		break;
	case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
		snprintf(info, sizeof(info), "Transport retry count exceeded");
		break;
	case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
		snprintf(info, sizeof(info), "Receiver-not-ready retry count exceeded");
		break;
	case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR:
		snprintf(info, sizeof(info), "Remote side aborted");
		break;
	default:
		snprintf(info, sizeof(info), "Generic");
		break;
	}
	SPDK_WARNLOG("Error on QP 0x%x wqe[%03d]: %s (synd 0x%x vend 0x%x hw 0x%x) opcode %s\n",
		     qp_num, wqe_counter, info, ecqe->syndrome, ecqe->vendor_err_synd, ecqe->hw_err_synd,
		     mlx5_cqe_err_opcode(ecqe));

	return ecqe->syndrome;
}

/*
 * DATA WQE LAYOUT:
 * --------------------------------------------------
 * | gen_ctrl |   rseg   |           dseg            |
 * --------------------------------------------------
 *   16 bytes   16 bytes    16 bytes * sge_count
 */
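
/*
 * Illustrative sizing example (not used by the driver), derived from the layout
 * above and from the bb_count computation in mlx5_qp_rdma_op() below: a WQE
 * carrying N SGEs occupies (2 + N) octowords of 16 bytes each, which is also the
 * DS value programmed into the control segment. One send WQE building block
 * (MLX5_SEND_WQE_BB) is 64 bytes, i.e. 4 octowords, so e.g. N = 3 gives
 * DS = 5 octowords = 80 bytes and therefore 2 BBs, matching
 * bb_count = 1 + SPDK_CEIL_DIV(3 - 2, 4) = 2; N <= 2 always fits in a single BB.
 */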

static inline void
mlx5_dma_xfer_full(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count, uint64_t raddr,
		   uint32_t rkey, int op, uint32_t flags, uint64_t wr_id, uint32_t bb_count)
{
	struct mlx5_hw_qp *hw_qp = &qp->hw;
	struct mlx5_wqe_ctrl_seg *ctrl;
	struct mlx5_wqe_raddr_seg *rseg;
	struct mlx5_wqe_data_seg *dseg;
	uint8_t fm_ce_se;
	uint32_t i, pi;

	fm_ce_se = mlx5_qp_fm_ce_se_update(qp, (uint8_t)flags);

	/* sq_pi is the absolute PI; pi is its index within the SQ ring */
	pi = hw_qp->sq_pi & (hw_qp->sq_wqe_cnt - 1);
	SPDK_DEBUGLOG(mlx5, "opc %d, sge_count %u, bb_count %u, orig pi %u, fm_ce_se %x\n", op, sge_count,
		      bb_count, hw_qp->sq_pi, fm_ce_se);

	ctrl = (struct mlx5_wqe_ctrl_seg *) mlx5_qp_get_wqe_bb(hw_qp);
	/* WQE size in octowords (16-byte units). DS (data segment count) covers all 16-byte
	 * segments in the WQE: control, raddr and sge_count data segments, hence 2 + sge_count */
	mlx5_set_ctrl_seg(ctrl, hw_qp->sq_pi, op, 0, hw_qp->qp_num, fm_ce_se, 2 + sge_count, 0, 0);

	rseg = (struct mlx5_wqe_raddr_seg *)(ctrl + 1);
	rseg->raddr = htobe64(raddr);
	rseg->rkey = htobe32(rkey);
	rseg->reserved = 0;

	dseg = (struct mlx5_wqe_data_seg *)(rseg + 1);
	for (i = 0; i < sge_count; i++) {
		mlx5dv_set_data_seg(dseg, sge[i].length, sge[i].lkey, sge[i].addr);
		dseg = dseg + 1;
	}

	mlx5_qp_wqe_submit(qp, ctrl, bb_count, pi);

	mlx5_qp_set_comp(qp, pi, wr_id, fm_ce_se, bb_count);
	assert(qp->tx_available >= bb_count);
	qp->tx_available -= bb_count;
}
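
/*
 * Illustrative example (hypothetical numbers) of when the wrap-around variant below
 * is taken: with sq_wqe_cnt = 64 and pi = 63, only to_end = (64 - 63) * MLX5_SEND_WQE_BB
 * = 64 bytes remain before the end of the ring, so a 2-BB WQE (e.g. 3 SGEs) cannot be
 * written contiguously. The control segment, the raddr segment and the first two data
 * segments fill the last BB and the remaining data segment continues at sq_addr, which
 * is exactly what mlx5_dma_xfer_wrap_around() handles. mlx5_qp_rdma_op() below performs
 * this to_end check and picks the variant.
 */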

static inline void
mlx5_dma_xfer_wrap_around(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count,
			  uint64_t raddr, uint32_t rkey, int op, uint32_t flags, uint64_t wr_id, uint32_t bb_count)
{
	struct mlx5_hw_qp *hw_qp = &qp->hw;
	struct mlx5_wqe_ctrl_seg *ctrl;
	struct mlx5_wqe_raddr_seg *rseg;
	struct mlx5_wqe_data_seg *dseg;
	uint8_t fm_ce_se;
	uint32_t i, to_end, pi;

	fm_ce_se = mlx5_qp_fm_ce_se_update(qp, (uint8_t)flags);

	/* sq_pi is the absolute PI; pi is its index within the SQ ring */
	pi = hw_qp->sq_pi & (hw_qp->sq_wqe_cnt - 1);
	SPDK_DEBUGLOG(mlx5, "opc %d, sge_count %u, bb_count %u, orig pi %u, fm_ce_se %x\n", op, sge_count,
		      bb_count, pi, fm_ce_se);

	to_end = (hw_qp->sq_wqe_cnt - pi) * MLX5_SEND_WQE_BB;
	ctrl = (struct mlx5_wqe_ctrl_seg *) mlx5_qp_get_wqe_bb(hw_qp);
	/* WQE size in octowords (16-byte units). DS (data segment count) covers all 16-byte
	 * segments in the WQE: control, raddr and sge_count data segments, hence 2 + sge_count */
	mlx5_set_ctrl_seg(ctrl, hw_qp->sq_pi, op, 0, hw_qp->qp_num, fm_ce_se, 2 + sge_count, 0, 0);
	to_end -= sizeof(struct mlx5_wqe_ctrl_seg); /* 16 bytes */

	rseg = (struct mlx5_wqe_raddr_seg *)(ctrl + 1);
	rseg->raddr = htobe64(raddr);
	rseg->rkey = htobe32(rkey);
	rseg->reserved = 0;
	to_end -= sizeof(struct mlx5_wqe_raddr_seg); /* 16 bytes */

	dseg = (struct mlx5_wqe_data_seg *)(rseg + 1);
	for (i = 0; i < sge_count; i++) {
		mlx5dv_set_data_seg(dseg, sge[i].length, sge[i].lkey, sge[i].addr);
		to_end -= sizeof(struct mlx5_wqe_data_seg); /* 16 bytes */
		if (to_end != 0) {
			dseg = dseg + 1;
		} else {
			/* Start from the beginning of SQ */
			dseg = (struct mlx5_wqe_data_seg *)(hw_qp->sq_addr);
			to_end = hw_qp->sq_wqe_cnt * MLX5_SEND_WQE_BB;
		}
	}

	mlx5_qp_wqe_submit(qp, ctrl, bb_count, pi);

	mlx5_qp_set_comp(qp, pi, wr_id, fm_ce_se, bb_count);
	assert(qp->tx_available >= bb_count);
	qp->tx_available -= bb_count;
}

static inline int
mlx5_qp_rdma_op(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count, uint64_t dstaddr,
		uint32_t rkey, uint64_t wrid, uint32_t flags, int op)
{
	struct mlx5_hw_qp *hw_qp = &qp->hw;
	uint32_t to_end, pi, bb_count;

	/* One BB (building block) is 64 bytes, i.e. 4 octowords.
	 * It can hold the control segment, the raddr segment and 2 data (sge) segments.
	 * If sge_count is bigger than 2, additional BBs are consumed, 4 data segments per BB */
	bb_count = (sge_count <= 2) ? 1 : 1 + SPDK_CEIL_DIV(sge_count - 2, 4);

	if (spdk_unlikely(bb_count > qp->tx_available)) {
		return -ENOMEM;
	}
	if (spdk_unlikely(sge_count > qp->max_send_sge)) {
		return -E2BIG;
	}
	pi = hw_qp->sq_pi & (hw_qp->sq_wqe_cnt - 1);
	to_end = (hw_qp->sq_wqe_cnt - pi) * MLX5_SEND_WQE_BB;

	if (spdk_likely(to_end >= bb_count * MLX5_SEND_WQE_BB)) {
		mlx5_dma_xfer_full(qp, sge, sge_count, dstaddr, rkey, op, flags, wrid, bb_count);
	} else {
		mlx5_dma_xfer_wrap_around(qp, sge, sge_count, dstaddr, rkey, op, flags, wrid, bb_count);
	}

	return 0;
}

int
spdk_mlx5_qp_rdma_write(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count,
			uint64_t dstaddr, uint32_t rkey, uint64_t wrid, uint32_t flags)
{
	return mlx5_qp_rdma_op(qp, sge, sge_count, dstaddr, rkey, wrid, flags, MLX5_OPCODE_RDMA_WRITE);
}

int
spdk_mlx5_qp_rdma_read(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count,
		       uint64_t dstaddr, uint32_t rkey, uint64_t wrid, uint32_t flags)
{
	return mlx5_qp_rdma_op(qp, sge, sge_count, dstaddr, rkey, wrid, flags, MLX5_OPCODE_RDMA_READ);
}
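
/*
 * Usage sketch (illustrative only, not compiled): post a single-SGE RDMA WRITE and
 * ring the doorbell. Buffer registration and the exchange of the remote address and
 * rkey are assumed to have happened elsewhere; "local_buf", "local_buf_len", "mr",
 * "remote_addr", "remote_rkey" and "my_request" are hypothetical. Passing flags = 0
 * leaves the completion-request decision to the QP's signaling mode (see
 * mlx5_qp_tx_complete() below).
 *
 *	struct ibv_sge sge = {
 *		.addr = (uintptr_t)local_buf,
 *		.length = local_buf_len,
 *		.lkey = mr->lkey,
 *	};
 *	int rc;
 *
 *	rc = spdk_mlx5_qp_rdma_write(qp, &sge, 1, remote_addr, remote_rkey,
 *				     (uint64_t)(uintptr_t)my_request, 0);
 *	if (rc) {
 *		return rc;	// -ENOMEM: SQ is full, poll completions and retry
 *	}
 *	// Flush: ring the SQ doorbell for everything posted since the last flush
 *	spdk_mlx5_qp_complete_send(qp);
 */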

/* polling start */

static inline void
mlx5_qp_update_comp(struct spdk_mlx5_qp *qp)
{
	qp->completions[qp->last_pi].completions = qp->nonsignaled_outstanding;
	qp->nonsignaled_outstanding = 0;
}

static inline void
mlx5_qp_tx_complete(struct spdk_mlx5_qp *qp)
{
	if (qp->sigmode == SPDK_MLX5_QP_SIG_LAST) {
		qp->ctrl->fm_ce_se &= ~SPDK_MLX5_WQE_CTRL_CE_MASK;
		qp->ctrl->fm_ce_se |= SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE;
		mlx5_qp_update_comp(qp);
	}
	mlx5_ring_tx_db(qp, qp->ctrl);
}

static inline struct mlx5_cqe64 *
mlx5_cq_get_cqe(struct mlx5_hw_cq *hw_cq, int cqe_size)
{
	struct mlx5_cqe64 *cqe;

	/* Note that cqe_size is known at compile time. We pass it down here so that
	 * the branch and the multiplication are resolved at compile time during inlining.
	 */
	cqe = (struct mlx5_cqe64 *)(hw_cq->cq_addr + (hw_cq->ci & (hw_cq->cqe_cnt - 1)) * cqe_size);
	return cqe_size == 64 ? cqe : cqe + 1;
}

static inline struct mlx5_cqe64 *
mlx5_cq_poll_one(struct mlx5_hw_cq *hw_cq, int cqe_size)
{
	struct mlx5_cqe64 *cqe;

	cqe = mlx5_cq_get_cqe(hw_cq, cqe_size);

	/* cqe is hw owned */
	if (mlx5dv_get_cqe_owner(cqe) == !(hw_cq->ci & hw_cq->cqe_cnt)) {
		return NULL;
	}

	/* and must have valid opcode */
	if (mlx5dv_get_cqe_opcode(cqe) == MLX5_CQE_INVALID) {
		return NULL;
	}

	hw_cq->ci++;

	SPDK_DEBUGLOG(mlx5,
		      "cq: 0x%x ci: %d CQ opcode %d size %d wqe_counter %d scatter32 %d scatter64 %d\n",
		      hw_cq->cq_num, hw_cq->ci,
		      mlx5dv_get_cqe_opcode(cqe),
		      be32toh(cqe->byte_cnt),
		      be16toh(cqe->wqe_counter),
		      cqe->op_own & MLX5_INLINE_SCATTER_32,
		      cqe->op_own & MLX5_INLINE_SCATTER_64);
	return cqe;
}

static inline uint64_t
mlx5_qp_get_comp_wr_id(struct spdk_mlx5_qp *qp, struct mlx5_cqe64 *cqe)
{
	uint16_t comp_idx;
	uint32_t sq_mask;

	sq_mask = qp->hw.sq_wqe_cnt - 1;
	comp_idx = be16toh(cqe->wqe_counter) & sq_mask;
	SPDK_DEBUGLOG(mlx5, "got cpl, wqe_counter %u, comp_idx %u; wrid %"PRIx64", cpls %u\n",
		      cqe->wqe_counter, comp_idx, qp->completions[comp_idx].wr_id,
		      qp->completions[comp_idx].completions);
	/* If we have several unsignaled WRs, we accumulate them in the completion of the next signaled WR */
	qp->tx_available += qp->completions[comp_idx].completions;

	return qp->completions[comp_idx].wr_id;
}

int
spdk_mlx5_cq_poll_completions(struct spdk_mlx5_cq *cq, struct spdk_mlx5_cq_completion *comp,
			      int max_completions)
{
	struct spdk_mlx5_qp *qp;
	struct mlx5_cqe64 *cqe;
	uint8_t opcode;
	int n = 0;

	do {
		cqe = mlx5_cq_poll_one(&cq->hw, MLX5_DMA_Q_TX_CQE_SIZE);
		if (!cqe) {
			break;
		}

		qp = mlx5_cq_find_qp(cq, be32toh(cqe->sop_drop_qpn) & 0xffffff);
		if (spdk_unlikely(!qp)) {
			return -ENODEV;
		}

		opcode = mlx5dv_get_cqe_opcode(cqe);
		comp[n].wr_id = mlx5_qp_get_comp_wr_id(qp, cqe);
		if (spdk_likely(opcode == MLX5_CQE_REQ)) {
			comp[n].status = IBV_WC_SUCCESS;
		} else {
			comp[n].status = mlx5_cqe_err(cqe);
		}
		n++;
	} while (n < max_completions);

	return n;
}
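
/*
 * Polling sketch (illustrative only, not compiled): drain a batch of completions and
 * map each one back to the request identified by wr_id. "handle_request_done" is a
 * hypothetical callback. A zero status (IBV_WC_SUCCESS) means success; a non-zero
 * status is the CQE syndrome reported by mlx5_cqe_err() above.
 *
 *	struct spdk_mlx5_cq_completion comps[32];
 *	int i, rc;
 *
 *	rc = spdk_mlx5_cq_poll_completions(cq, comps, 32);
 *	if (rc < 0) {
 *		return rc;	// e.g. -ENODEV: CQE belongs to an unknown QP
 *	}
 *	for (i = 0; i < rc; i++) {
 *		handle_request_done((void *)(uintptr_t)comps[i].wr_id,
 *				    comps[i].status == IBV_WC_SUCCESS ? 0 : -EIO);
 *	}
 */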

void
spdk_mlx5_qp_complete_send(struct spdk_mlx5_qp *qp)
{
	mlx5_qp_tx_complete(qp);
}

#ifdef DEBUG
void
mlx5_qp_dump_wqe(struct spdk_mlx5_qp *qp, int n_wqe_bb)
{
	struct mlx5_hw_qp *hw = &qp->hw;
	uint32_t pi;
	uint32_t to_end;
	uint32_t *wqe;
	int i;
	extern struct spdk_log_flag SPDK_LOG_mlx5_sq;

	if (!SPDK_LOG_mlx5_sq.enabled) {
		return;
	}

	pi = hw->sq_pi & (hw->sq_wqe_cnt - 1);
	to_end = (hw->sq_wqe_cnt - pi) * MLX5_SEND_WQE_BB;
	wqe = mlx5_qp_get_wqe_bb(hw);

	SPDK_DEBUGLOG(mlx5_sq, "QP: qpn 0x%" PRIx32 ", wqe_index 0x%" PRIx32 ", addr %p\n",
		      hw->qp_num, pi, wqe);
	for (i = 0; i < n_wqe_bb; i++) {
		fprintf(stderr,
			"%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n"
			"%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n"
			"%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n"
			"%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n",
			be32toh(wqe[0]), be32toh(wqe[1]), be32toh(wqe[2]), be32toh(wqe[3]),
			be32toh(wqe[4]), be32toh(wqe[5]), be32toh(wqe[6]), be32toh(wqe[7]),
			be32toh(wqe[8]), be32toh(wqe[9]), be32toh(wqe[10]), be32toh(wqe[11]),
			be32toh(wqe[12]), be32toh(wqe[13]), be32toh(wqe[14]), be32toh(wqe[15]));
		wqe = mlx5_qp_get_next_wqebb(hw, &to_end, wqe);
	}
}
#endif

SPDK_LOG_REGISTER_COMPONENT(mlx5_sq)