/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "mlx5_priv.h"
#include "mlx5_ifc.h"
#include "spdk/log.h"
#include "spdk/util.h"
#include "spdk/barrier.h"
#include "spdk/likely.h"

#include "spdk_internal/rdma_utils.h"
#include "spdk_internal/mlx5.h"

#define MLX5_DMA_Q_TX_CQE_SIZE  64

struct _mlx5_err_cqe {
	uint8_t		rsvd0[32];
	uint32_t	srqn;
	uint8_t		rsvd1[16];
	uint8_t		hw_err_synd;
	uint8_t		rsvd2[1];
	uint8_t		vendor_err_synd;
	uint8_t		syndrome;
	uint32_t	s_wqe_opcode_qpn;
	uint16_t	wqe_counter;
	uint8_t		signature;
	uint8_t		op_own;
};

struct mlx5_sigerr_cqe {
	uint8_t		rsvd0[16];
	uint32_t	expected_trans_sig;
	uint32_t	actual_trans_sig;
	uint32_t	expected_ref_tag;
	uint32_t	actual_ref_tag;
	uint16_t	syndrome;
	uint8_t		sig_type;
	uint8_t		domain;
	uint32_t	mkey;
	uint64_t	sig_err_offset;
	uint8_t		rsvd30[14];
	uint8_t		signature;
	uint8_t		op_own;
};
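
/* Sanity-check sketch (added here, not in the original): both error CQE
 * layouts above must overlay a standard 64-byte CQE exactly. */
SPDK_STATIC_ASSERT(sizeof(struct _mlx5_err_cqe) == sizeof(struct mlx5_cqe64),
		   "_mlx5_err_cqe must be 64 bytes");
SPDK_STATIC_ASSERT(sizeof(struct mlx5_sigerr_cqe) == sizeof(struct mlx5_cqe64),
		   "mlx5_sigerr_cqe must be 64 bytes");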

static const char *
mlx5_cqe_err_opcode(struct _mlx5_err_cqe *ecqe)
{
	uint8_t wqe_err_opcode = be32toh(ecqe->s_wqe_opcode_qpn) >> 24;

	switch (ecqe->op_own >> 4) {
	case MLX5_CQE_REQ_ERR:
		switch (wqe_err_opcode) {
		case MLX5_OPCODE_RDMA_WRITE_IMM:
		case MLX5_OPCODE_RDMA_WRITE:
			return "RDMA_WRITE";
		case MLX5_OPCODE_SEND_IMM:
		case MLX5_OPCODE_SEND:
		case MLX5_OPCODE_SEND_INVAL:
			return "SEND";
		case MLX5_OPCODE_RDMA_READ:
			return "RDMA_READ";
		case MLX5_OPCODE_ATOMIC_CS:
			return "COMPARE_SWAP";
		case MLX5_OPCODE_ATOMIC_FA:
			return "FETCH_ADD";
		case MLX5_OPCODE_ATOMIC_MASKED_CS:
			return "MASKED_COMPARE_SWAP";
		case MLX5_OPCODE_ATOMIC_MASKED_FA:
			return "MASKED_FETCH_ADD";
		case MLX5_OPCODE_MMO:
			return "GGA_DMA";
		default:
			return "";
		}
	case MLX5_CQE_RESP_ERR:
		return "RECV";
	default:
		return "";
	}
}

static int
mlx5_cqe_err(struct mlx5_cqe64 *cqe)
{
	struct _mlx5_err_cqe *ecqe = (struct _mlx5_err_cqe *)cqe;
	uint16_t wqe_counter;
	uint32_t qp_num = 0;
	char info[200] = {0};

	wqe_counter = be16toh(ecqe->wqe_counter);
	qp_num = be32toh(ecqe->s_wqe_opcode_qpn) & ((1 << 24) - 1);

	if (ecqe->syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
		SPDK_DEBUGLOG(mlx5, "QP 0x%x wqe[%d] is flushed\n", qp_num, wqe_counter);
		return ecqe->syndrome;
	}

	switch (ecqe->syndrome) {
	case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR:
		snprintf(info, sizeof(info), "Local length");
		break;
	case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR:
		snprintf(info, sizeof(info), "Local QP operation");
		break;
	case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR:
		snprintf(info, sizeof(info), "Local protection");
		break;
	case MLX5_CQE_SYNDROME_WR_FLUSH_ERR:
		snprintf(info, sizeof(info), "WR flushed because QP in error state");
		break;
	case MLX5_CQE_SYNDROME_MW_BIND_ERR:
		snprintf(info, sizeof(info), "Memory window bind");
		break;
	case MLX5_CQE_SYNDROME_BAD_RESP_ERR:
		snprintf(info, sizeof(info), "Bad response");
		break;
	case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR:
		snprintf(info, sizeof(info), "Local access");
		break;
	case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
		snprintf(info, sizeof(info), "Invalid request");
		break;
	case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR:
		snprintf(info, sizeof(info), "Remote access");
		break;
	case MLX5_CQE_SYNDROME_REMOTE_OP_ERR:
		snprintf(info, sizeof(info), "Remote QP");
		break;
	case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
		snprintf(info, sizeof(info), "Transport retry count exceeded");
		break;
	case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
		snprintf(info, sizeof(info), "Receiver not ready (RNR) retry count exceeded");
		break;
	case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR:
		snprintf(info, sizeof(info), "Remote side aborted");
		break;
	default:
		snprintf(info, sizeof(info), "Generic");
		break;
	}
	SPDK_WARNLOG("Error on QP 0x%x wqe[%03d]: %s (synd 0x%x vend 0x%x hw 0x%x) opcode %s\n",
		     qp_num, wqe_counter, info, ecqe->syndrome, ecqe->vendor_err_synd, ecqe->hw_err_synd,
		     mlx5_cqe_err_opcode(ecqe));

	return ecqe->syndrome;
}

/*
 * DATA WQE LAYOUT:
 * ----------------------------------
 * | gen_ctrl |   rseg   |   dseg   |
 * ----------------------------------
 *   16bytes    16bytes    16bytes * sge_count
 */
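
/*
 * For example (derived from the layout above): with sge_count == 3 the WQE is
 * 16 + 16 + 3 * 16 = 80 bytes, spanning two 64-byte building blocks (BBs).
 */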

static inline void
mlx5_dma_xfer_full(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count, uint64_t raddr,
		   uint32_t rkey, int op, uint32_t flags, uint64_t wr_id, uint32_t bb_count)
{
	struct mlx5_hw_qp *hw_qp = &qp->hw;
	struct mlx5_wqe_ctrl_seg *ctrl;
	struct mlx5_wqe_raddr_seg *rseg;
	struct mlx5_wqe_data_seg *dseg;
	uint8_t fm_ce_se;
	uint32_t i, pi;

	fm_ce_se = mlx5_qp_fm_ce_se_update(qp, (uint8_t)flags);

	/* sq_pi masked to a slot index within the SQ ring */
	pi = hw_qp->sq_pi & (hw_qp->sq_wqe_cnt - 1);
	SPDK_DEBUGLOG(mlx5, "opc %d, sge_count %u, bb_count %u, orig pi %u, fm_ce_se %x\n", op, sge_count,
		      bb_count, hw_qp->sq_pi, fm_ce_se);

	ctrl = (struct mlx5_wqe_ctrl_seg *) mlx5_qp_get_wqe_bb(hw_qp);
	/* The DS (data segment count) field of the control segment holds the WQE size
	 * in octowords (16-byte units): ctrl + raddr + sge_count data segments, as in
	 * the DATA WQE LAYOUT above, hence 2 + sge_count */
	mlx5_set_ctrl_seg(ctrl, hw_qp->sq_pi, op, 0, hw_qp->qp_num, fm_ce_se, 2 + sge_count, 0, 0);

	rseg = (struct mlx5_wqe_raddr_seg *)(ctrl + 1);
	rseg->raddr = htobe64(raddr);
	rseg->rkey  = htobe32(rkey);
	rseg->reserved = 0;

	dseg = (struct mlx5_wqe_data_seg *)(rseg + 1);
	for (i = 0; i < sge_count; i++) {
		mlx5dv_set_data_seg(dseg, sge[i].length, sge[i].lkey, sge[i].addr);
		dseg = dseg + 1;
	}

	mlx5_qp_wqe_submit(qp, ctrl, bb_count, pi);

	mlx5_qp_set_comp(qp, pi, wr_id, fm_ce_se, bb_count);
	assert(qp->tx_available >= bb_count);
	qp->tx_available -= bb_count;
}

static inline void
mlx5_dma_xfer_wrap_around(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count,
			  uint64_t raddr, uint32_t rkey, int op, uint32_t flags, uint64_t wr_id, uint32_t bb_count)
{
	struct mlx5_hw_qp *hw_qp = &qp->hw;
	struct mlx5_wqe_ctrl_seg *ctrl;
	struct mlx5_wqe_raddr_seg *rseg;
	struct mlx5_wqe_data_seg *dseg;
	uint8_t fm_ce_se;
	uint32_t i, to_end, pi;

	fm_ce_se = mlx5_qp_fm_ce_se_update(qp, (uint8_t)flags);

	/* sq_pi masked to a slot index within the SQ ring */
	pi = hw_qp->sq_pi & (hw_qp->sq_wqe_cnt - 1);
	SPDK_DEBUGLOG(mlx5, "opc %d, sge_count %u, bb_count %u, orig pi %u, fm_ce_se %x\n", op, sge_count,
		      bb_count, pi, fm_ce_se);

	to_end = (hw_qp->sq_wqe_cnt - pi) * MLX5_SEND_WQE_BB;
	ctrl = (struct mlx5_wqe_ctrl_seg *) mlx5_qp_get_wqe_bb(hw_qp);
	/* The DS (data segment count) field of the control segment holds the WQE size
	 * in octowords (16-byte units): ctrl + raddr + sge_count data segments, as in
	 * the DATA WQE LAYOUT above, hence 2 + sge_count */
	mlx5_set_ctrl_seg(ctrl, hw_qp->sq_pi, op, 0, hw_qp->qp_num, fm_ce_se, 2 + sge_count, 0, 0);
	to_end -= sizeof(struct mlx5_wqe_ctrl_seg); /* 16 bytes */

	rseg = (struct mlx5_wqe_raddr_seg *)(ctrl + 1);
	rseg->raddr = htobe64(raddr);
	rseg->rkey  = htobe32(rkey);
	rseg->reserved = 0;
	to_end -= sizeof(struct mlx5_wqe_raddr_seg); /* 16 bytes */

	dseg = (struct mlx5_wqe_data_seg *)(rseg + 1);
	for (i = 0; i < sge_count; i++) {
		mlx5dv_set_data_seg(dseg, sge[i].length, sge[i].lkey, sge[i].addr);
		to_end -= sizeof(struct mlx5_wqe_data_seg); /* 16 bytes */
		if (to_end != 0) {
			dseg = dseg + 1;
		} else {
			/* Start from the beginning of SQ */
			dseg = (struct mlx5_wqe_data_seg *)(hw_qp->sq_addr);
			to_end = hw_qp->sq_wqe_cnt * MLX5_SEND_WQE_BB;
		}
	}

	mlx5_qp_wqe_submit(qp, ctrl, bb_count, pi);

	mlx5_qp_set_comp(qp, pi, wr_id, fm_ce_se, bb_count);
	assert(qp->tx_available >= bb_count);
	qp->tx_available -= bb_count;
}

static inline int
mlx5_qp_rdma_op(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count, uint64_t dstaddr,
		uint32_t rkey, uint64_t wrid, uint32_t flags, int op)
{
	struct mlx5_hw_qp *hw_qp = &qp->hw;
	uint32_t to_end, pi, bb_count;

	/* One BB (building block) is 64 bytes, i.e. 4 octowords.
	 * It can hold the control segment, the raddr segment and 2 data segments.
	 * If sge_count is bigger than 2, we consume additional BBs, each holding up
	 * to 4 more data segments; see the worked example below. */
	bb_count = (sge_count <= 2) ? 1 : 1 + SPDK_CEIL_DIV(sge_count - 2, 4);
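	/* Worked example (illustrative): sge_count == 7 gives 1 + SPDK_CEIL_DIV(5, 4) == 3 BBs:
	 * BB0 holds ctrl + raddr + 2 dsegs, BB1 holds 4 dsegs, BB2 holds the last dseg. */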

	if (spdk_unlikely(bb_count > qp->tx_available)) {
		return -ENOMEM;
	}
	if (spdk_unlikely(sge_count > qp->max_send_sge)) {
		return -E2BIG;
	}
	pi = hw_qp->sq_pi & (hw_qp->sq_wqe_cnt - 1);
	to_end = (hw_qp->sq_wqe_cnt - pi) * MLX5_SEND_WQE_BB;

	if (spdk_likely(to_end >= bb_count * MLX5_SEND_WQE_BB)) {
		mlx5_dma_xfer_full(qp, sge, sge_count, dstaddr, rkey, op, flags, wrid, bb_count);
	} else {
		mlx5_dma_xfer_wrap_around(qp, sge, sge_count, dstaddr, rkey, op, flags, wrid, bb_count);
	}

	return 0;
}

int
spdk_mlx5_qp_rdma_write(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count,
			uint64_t dstaddr, uint32_t rkey, uint64_t wrid, uint32_t flags)
{
	return mlx5_qp_rdma_op(qp, sge, sge_count, dstaddr, rkey, wrid, flags, MLX5_OPCODE_RDMA_WRITE);
}

int
spdk_mlx5_qp_rdma_read(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count,
		       uint64_t dstaddr, uint32_t rkey, uint64_t wrid, uint32_t flags)
{
	return mlx5_qp_rdma_op(qp, sge, sge_count, dstaddr, rkey, wrid, flags, MLX5_OPCODE_RDMA_READ);
}
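
/*
 * Usage sketch (hypothetical caller; qp, buffers, lkey/rkey and addresses are
 * assumed to be set up elsewhere):
 *
 *	struct ibv_sge sge = {
 *		.addr = (uintptr_t)local_buf, .length = len, .lkey = lkey,
 *	};
 *	int rc;
 *
 *	rc = spdk_mlx5_qp_rdma_write(qp, &sge, 1, remote_addr, rkey, wr_id,
 *				     SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE);
 *	if (rc == 0) {
 *		spdk_mlx5_qp_complete_send(qp); // rings the SQ doorbell
 *	}
 */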

/* polling start */

static inline void
mlx5_qp_update_comp(struct spdk_mlx5_qp *qp)
{
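	/* Roll all completions suppressed since the last signaled WQE into the
	 * entry at last_pi, so they are released together when its CQE arrives */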
	qp->completions[qp->last_pi].completions = qp->nonsignaled_outstanding;
	qp->nonsignaled_outstanding = 0;
}

static inline void
mlx5_qp_tx_complete(struct spdk_mlx5_qp *qp)
{
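	/* In SIG_LAST mode only the last WQE posted before the doorbell requests a
	 * CQE: patch its control segment to set CQ_UPDATE before ringing the db */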
	if (qp->sigmode == SPDK_MLX5_QP_SIG_LAST) {
		qp->ctrl->fm_ce_se &= ~SPDK_MLX5_WQE_CTRL_CE_MASK;
		qp->ctrl->fm_ce_se |= SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE;
		mlx5_qp_update_comp(qp);
	}
	mlx5_ring_tx_db(qp, qp->ctrl);
}

static inline struct mlx5_cqe64 *
mlx5_cq_get_cqe(struct mlx5_hw_cq *hw_cq, int cqe_size)
{
	struct mlx5_cqe64 *cqe;

	/* Note that cqe_size is known at compile time. We pass it down here so that
	 * the branch and the multiplication are resolved at compile time during
	 * inlining.
	 */
	cqe = (struct mlx5_cqe64 *)(hw_cq->cq_addr + (hw_cq->ci & (hw_cq->cqe_cnt - 1)) *
				    cqe_size);
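	/* With 128-byte CQEs the reported 64-byte CQE occupies the second half of
	 * each slot, hence the cqe + 1 below */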
	return cqe_size == 64 ? cqe : cqe + 1;
}

static inline struct mlx5_cqe64 *
mlx5_cq_poll_one(struct mlx5_hw_cq *hw_cq, int cqe_size)
{
	struct mlx5_cqe64 *cqe;

	cqe = mlx5_cq_get_cqe(hw_cq, cqe_size);

	/* CQE still owned by HW: the owner bit toggles on each wrap of the CQ, so it
	 * is compared against the inverted wrap phase of the consumer index */
	if (mlx5dv_get_cqe_owner(cqe) == !(hw_cq->ci & hw_cq->cqe_cnt)) {
		return NULL;
	}

	/* and must have valid opcode */
	if (mlx5dv_get_cqe_opcode(cqe) == MLX5_CQE_INVALID) {
		return NULL;
	}

	hw_cq->ci++;

	SPDK_DEBUGLOG(mlx5,
		      "cq: 0x%x ci: %d CQ opcode %d size %d wqe_counter %d scatter32 %d scatter64 %d\n",
		      hw_cq->cq_num, hw_cq->ci,
		      mlx5dv_get_cqe_opcode(cqe),
		      be32toh(cqe->byte_cnt),
		      be16toh(cqe->wqe_counter),
		      cqe->op_own & MLX5_INLINE_SCATTER_32,
		      cqe->op_own & MLX5_INLINE_SCATTER_64);
	return cqe;
}

static inline uint64_t
mlx5_qp_get_comp_wr_id(struct spdk_mlx5_qp *qp, struct mlx5_cqe64 *cqe)
{
	uint16_t comp_idx;
	uint32_t sq_mask;

	sq_mask = qp->hw.sq_wqe_cnt - 1;
	comp_idx = be16toh(cqe->wqe_counter) & sq_mask;
	SPDK_DEBUGLOG(mlx5, "got cpl, wqe_counter %u, comp_idx %u; wrid %"PRIx64", cpls %u\n",
		      be16toh(cqe->wqe_counter), comp_idx, qp->completions[comp_idx].wr_id,
		      qp->completions[comp_idx].completions);
	/* If we have several unsignaled WRs, we accumulate them in the completion of the next signaled WR */
	qp->tx_available += qp->completions[comp_idx].completions;

	return qp->completions[comp_idx].wr_id;
}

int
spdk_mlx5_cq_poll_completions(struct spdk_mlx5_cq *cq, struct spdk_mlx5_cq_completion *comp,
			      int max_completions)
{
	struct spdk_mlx5_qp *qp;
	struct mlx5_cqe64 *cqe;
	uint8_t opcode;
	int n = 0;

	do {
		cqe = mlx5_cq_poll_one(&cq->hw, MLX5_DMA_Q_TX_CQE_SIZE);
		if (!cqe) {
			break;
		}

		qp = mlx5_cq_find_qp(cq, be32toh(cqe->sop_drop_qpn) & 0xffffff);
		if (spdk_unlikely(!qp)) {
			return -ENODEV;
		}

		opcode = mlx5dv_get_cqe_opcode(cqe);
		comp[n].wr_id = mlx5_qp_get_comp_wr_id(qp, cqe);
		if (spdk_likely(opcode == MLX5_CQE_REQ)) {
			comp[n].status = IBV_WC_SUCCESS;
		} else {
			comp[n].status = mlx5_cqe_err(cqe);
		}
		n++;
	} while (n < max_completions);

	return n;
}
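
/*
 * Polling sketch (hypothetical caller; the batch size and handle_error() are
 * placeholders, not part of this API). Note that on error the status carries
 * the CQE syndrome, which is nonzero, so comparing against IBV_WC_SUCCESS works:
 *
 *	struct spdk_mlx5_cq_completion comps[32];
 *	int i, rc;
 *
 *	rc = spdk_mlx5_cq_poll_completions(cq, comps, 32);
 *	for (i = 0; i < rc; i++) {
 *		if (comps[i].status != IBV_WC_SUCCESS) {
 *			handle_error(comps[i].wr_id, comps[i].status);
 *		}
 *	}
 */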

void
spdk_mlx5_qp_complete_send(struct spdk_mlx5_qp *qp)
{
	mlx5_qp_tx_complete(qp);
}

#ifdef DEBUG
void
mlx5_qp_dump_wqe(struct spdk_mlx5_qp *qp, int n_wqe_bb)
{
	struct mlx5_hw_qp *hw = &qp->hw;
	uint32_t pi;
	uint32_t to_end;
	uint32_t *wqe;
	int i;
	extern struct spdk_log_flag SPDK_LOG_mlx5_sq;

	if (!SPDK_LOG_mlx5_sq.enabled) {
		return;
	}

	pi = hw->sq_pi & (hw->sq_wqe_cnt - 1);
	to_end = (hw->sq_wqe_cnt - pi) * MLX5_SEND_WQE_BB;
	wqe = mlx5_qp_get_wqe_bb(hw);

	SPDK_DEBUGLOG(mlx5_sq, "QP: qpn 0x%" PRIx32 ", wqe_index 0x%" PRIx32 ", addr %p\n",
		      hw->qp_num, pi, wqe);
	for (i = 0; i < n_wqe_bb; i++) {
		fprintf(stderr,
			"%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n"
			"%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n"
			"%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n"
			"%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n",
			be32toh(wqe[0]),  be32toh(wqe[1]),  be32toh(wqe[2]),  be32toh(wqe[3]),
			be32toh(wqe[4]),  be32toh(wqe[5]),  be32toh(wqe[6]),  be32toh(wqe[7]),
			be32toh(wqe[8]),  be32toh(wqe[9]),  be32toh(wqe[10]), be32toh(wqe[11]),
			be32toh(wqe[12]), be32toh(wqe[13]), be32toh(wqe[14]), be32toh(wqe[15]));
		wqe = mlx5_qp_get_next_wqebb(hw, &to_end, wqe);
	}
}
#endif

SPDK_LOG_REGISTER_COMPONENT(mlx5_sq)