xref: /dpdk/drivers/net/mlx5/mlx5_rx.c (revision 665b49c51639a10c553433bc2bcd85c7331c631e)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2021 6WIND S.A.
3  * Copyright 2021 Mellanox Technologies, Ltd
4  */
5 
6 #include <stdint.h>
7 #include <string.h>
8 #include <stdlib.h>
9 
10 #include <rte_mbuf.h>
11 #include <rte_mempool.h>
12 #include <rte_prefetch.h>
13 #include <rte_common.h>
14 #include <rte_branch_prediction.h>
15 #include <rte_ether.h>
16 #include <rte_cycles.h>
17 #include <rte_flow.h>
18 
19 #include <mlx5_prm.h>
20 #include <mlx5_common.h>
21 #include <mlx5_common_mr.h>
22 #include <rte_pmd_mlx5.h>
23 
24 #include "mlx5_autoconf.h"
25 #include "mlx5_defs.h"
26 #include "mlx5.h"
27 #include "mlx5_utils.h"
28 #include "mlx5_rxtx.h"
29 #include "mlx5_devx.h"
30 #include "mlx5_rx.h"
31 #ifdef HAVE_MLX5_MSTFLINT
32 #include <mstflint/mtcr.h>
33 #endif
34 
35 
36 static __rte_always_inline uint32_t
37 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
38 		   volatile struct mlx5_mini_cqe8 *mcqe);
39 
40 static __rte_always_inline int
41 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
42 		 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe,
43 		 uint16_t *skip_cnt, bool mprq);
44 
45 static __rte_always_inline uint32_t
46 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
47 
48 static __rte_always_inline void
49 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
50 	       volatile struct mlx5_cqe *cqe,
51 	       volatile struct mlx5_mini_cqe8 *mcqe);
52 
53 static inline void
54 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
55 			volatile struct mlx5_cqe *__rte_restrict cqe,
56 			uint32_t phcsum, uint8_t l4_type);
57 
58 static inline void
59 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
60 		    volatile struct mlx5_cqe *__rte_restrict cqe,
61 		    volatile struct mlx5_mini_cqe8 *mcqe,
62 		    struct mlx5_rxq_data *rxq, uint32_t len);
63 
64 
65 /**
66  * Internal function to compute the number of used descriptors in an RX queue.
67  *
68  * @param rxq
69  *   The Rx queue.
70  *
71  * @return
72  *   The number of used Rx descriptors.
73  */
74 static uint32_t
75 rx_queue_count(struct mlx5_rxq_data *rxq)
76 {
77 	struct rxq_zip *zip = &rxq->zip;
78 	volatile struct mlx5_cqe *cqe;
79 	const unsigned int cqe_n = (1 << rxq->cqe_n);
80 	const unsigned int sges_n = (1 << rxq->sges_n);
81 	const unsigned int elts_n = (1 << rxq->elts_n);
82 	const unsigned int strd_n = RTE_BIT32(rxq->log_strd_num);
83 	const unsigned int cqe_cnt = cqe_n - 1;
84 	unsigned int cq_ci, used;
85 
86 	/* If we are processing a compressed CQE. */
87 	if (zip->ai) {
88 		used = zip->cqe_cnt - zip->ai;
89 		cq_ci = zip->cq_ci;
90 	} else {
91 		used = 0;
92 		cq_ci = rxq->cq_ci;
93 	}
94 	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
95 	while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
96 		int8_t op_own;
97 		unsigned int n;
98 
99 		op_own = cqe->op_own;
100 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
101 			n = rte_be_to_cpu_32(cqe->byte_cnt);
102 		else
103 			n = 1;
104 		cq_ci += n;
105 		used += n;
106 		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
107 	}
108 	used = RTE_MIN(used * sges_n, elts_n * strd_n);
109 	return used;
110 }
111 
112 /**
113  * DPDK callback to check the status of an Rx descriptor.
114  *
115  * @param rx_queue
116  *   The Rx queue.
117  * @param[in] offset
118  *   The index of the descriptor in the ring.
119  *
120  * @return
121  *   The status of the Rx descriptor.
122  */
123 int
124 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
125 {
126 	struct mlx5_rxq_data *rxq = rx_queue;
127 
128 	if (offset >= (1 << rxq->cqe_n)) {
129 		rte_errno = EINVAL;
130 		return -rte_errno;
131 	}
132 	if (offset < rx_queue_count(rxq))
133 		return RTE_ETH_RX_DESC_DONE;
134 	return RTE_ETH_RX_DESC_AVAIL;
135 }
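
/*
 * A minimal application-side usage sketch for the descriptor status callback
 * above, reached through the generic ethdev API; port_id, queue_id and offset
 * are illustrative placeholders and a configured, started port is assumed:
 *
 *	#include <rte_ethdev.h>
 *
 *	static int
 *	rx_desc_ready(uint16_t port_id, uint16_t queue_id, uint16_t offset)
 *	{
 *		int status = rte_eth_rx_descriptor_status(port_id, queue_id,
 *							   offset);
 *
 *		// RTE_ETH_RX_DESC_DONE: a received packet waits at this offset.
 *		// RTE_ETH_RX_DESC_AVAIL: the descriptor is still free for HW.
 *		// Negative value: error, e.g. -EINVAL for an out-of-range offset.
 *		return status == RTE_ETH_RX_DESC_DONE;
 *	}
 */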
136 
137 /* Get the Rx queue LWM percentage according to the LWM number. */
138 static uint8_t
139 mlx5_rxq_lwm_to_percentage(struct mlx5_rxq_priv *rxq)
140 {
141 	struct mlx5_rxq_data *rxq_data = &rxq->ctrl->rxq;
142 	uint32_t wqe_cnt = 1 << (rxq_data->elts_n - rxq_data->sges_n);
143 
144 	return rxq->lwm * 100 / wqe_cnt;
145 }
146 
147 /**
148  * DPDK callback to get the RX queue information.
149  *
150  * @param dev
151  *   Pointer to the device structure.
152  *
153  * @param rx_queue_id
154  *   Rx queue identifier.
155  *
156  * @param qinfo
157  *   Pointer to the RX queue information structure.
158  *
159  * @return
160  *   None.
161  */
162 
163 void
164 mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
165 		  struct rte_eth_rxq_info *qinfo)
166 {
167 	struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_ctrl_get(dev, rx_queue_id);
168 	struct mlx5_rxq_data *rxq = mlx5_rxq_data_get(dev, rx_queue_id);
169 	struct mlx5_rxq_priv *rxq_priv = mlx5_rxq_get(dev, rx_queue_id);
170 
171 	if (!rxq)
172 		return;
173 	qinfo->mp = mlx5_rxq_mprq_enabled(rxq) ?
174 					rxq->mprq_mp : rxq->mp;
175 	qinfo->conf.rx_thresh.pthresh = 0;
176 	qinfo->conf.rx_thresh.hthresh = 0;
177 	qinfo->conf.rx_thresh.wthresh = 0;
178 	qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
179 	qinfo->conf.rx_drop_en = 1;
180 	if (rxq_ctrl == NULL || rxq_ctrl->obj == NULL)
181 		qinfo->conf.rx_deferred_start = 0;
182 	else
183 		qinfo->conf.rx_deferred_start = 1;
184 	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
185 	qinfo->scattered_rx = dev->data->scattered_rx;
186 	qinfo->nb_desc = mlx5_rxq_mprq_enabled(rxq) ?
187 		RTE_BIT32(rxq->elts_n) * RTE_BIT32(rxq->log_strd_num) :
188 		RTE_BIT32(rxq->elts_n);
189 	qinfo->avail_thresh = rxq_priv ?
190 		mlx5_rxq_lwm_to_percentage(rxq_priv) : 0;
191 }
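
/*
 * A minimal application-side usage sketch for the queue info callback above,
 * reached through rte_eth_rx_queue_info_get(); port_id and queue_id are
 * illustrative and a configured port is assumed:
 *
 *	#include <stdio.h>
 *	#include <rte_ethdev.h>
 *
 *	static void
 *	print_rxq_info(uint16_t port_id, uint16_t queue_id)
 *	{
 *		struct rte_eth_rxq_info qinfo;
 *
 *		if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) != 0)
 *			return;
 *		printf("rxq %u: nb_desc=%u scattered=%u avail_thresh=%u%%\n",
 *		       (unsigned int)queue_id, (unsigned int)qinfo.nb_desc,
 *		       (unsigned int)qinfo.scattered_rx,
 *		       (unsigned int)qinfo.avail_thresh);
 *	}
 */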
192 
193 /**
194  * DPDK callback to get the RX packet burst mode information.
195  *
196  * @param dev
197  *   Pointer to the device structure.
198  *
199  * @param rx_queue_id
200  *   Rx queue identifier.
201  *
202  * @param mode
203  *   Pointer to the burst mode information.
204  *
205  * @return
206  *   0 on success, -EINVAL on failure.
207  */
208 int
209 mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
210 		       uint16_t rx_queue_id __rte_unused,
211 		       struct rte_eth_burst_mode *mode)
212 {
213 	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
214 	struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, rx_queue_id);
215 
216 	if (!rxq) {
217 		rte_errno = EINVAL;
218 		return -rte_errno;
219 	}
220 	if (pkt_burst == mlx5_rx_burst) {
221 		snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
222 	} else if (pkt_burst == mlx5_rx_burst_mprq) {
223 		snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ");
224 	} else if (pkt_burst == mlx5_rx_burst_vec) {
225 #if defined RTE_ARCH_X86_64
226 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE");
227 #elif defined RTE_ARCH_ARM64
228 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
229 #elif defined RTE_ARCH_PPC_64
230 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
231 #else
232 		return -EINVAL;
233 #endif
234 	} else if (pkt_burst == mlx5_rx_burst_mprq_vec) {
235 #if defined RTE_ARCH_X86_64
236 		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector SSE");
237 #elif defined RTE_ARCH_ARM64
238 		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon");
239 #elif defined RTE_ARCH_PPC_64
240 		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec");
241 #else
242 		return -EINVAL;
243 #endif
244 	} else {
245 		return -EINVAL;
246 	}
247 	return 0;
248 }
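
/*
 * A minimal application-side usage sketch for the burst mode callback above;
 * the generic API fills mode.info with one of the strings set here ("Scalar",
 * "Multi-Packet RQ", "Vector SSE", ...). port_id and queue_id are illustrative:
 *
 *	#include <stdio.h>
 *	#include <rte_ethdev.h>
 *
 *	static void
 *	print_rx_burst_mode(uint16_t port_id, uint16_t queue_id)
 *	{
 *		struct rte_eth_burst_mode mode;
 *
 *		if (rte_eth_rx_burst_mode_get(port_id, queue_id, &mode) == 0)
 *			printf("port %u rxq %u Rx burst mode: %s\n",
 *			       (unsigned int)port_id, (unsigned int)queue_id,
 *			       mode.info);
 *	}
 */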
249 
250 /**
251  * DPDK callback to get the number of used descriptors in an Rx queue.
252  *
253  * @param rx_queue
254  *   The Rx queue pointer.
255  *
256  * @return
257  *   The number of used Rx descriptors.
258  *   -EINVAL if the queue is invalid.
259  */
260 uint32_t
261 mlx5_rx_queue_count(void *rx_queue)
262 {
263 	struct mlx5_rxq_data *rxq = rx_queue;
264 	struct rte_eth_dev *dev;
265 
266 	if (!rxq) {
267 		rte_errno = EINVAL;
268 		return -rte_errno;
269 	}
270 
271 	dev = &rte_eth_devices[rxq->port_id];
272 
273 	if (dev->rx_pkt_burst == NULL ||
274 	    dev->rx_pkt_burst == rte_eth_pkt_burst_dummy) {
275 		rte_errno = ENOTSUP;
276 		return -rte_errno;
277 	}
278 
279 	return rx_queue_count(rxq);
280 }
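
/*
 * A minimal application-side usage sketch for the queue count callback above;
 * rte_eth_rx_queue_count() returns the number of used descriptors or a
 * negative errno. port_id and queue_id are illustrative:
 *
 *	#include <rte_ethdev.h>
 *
 *	static unsigned int
 *	rxq_fill_level(uint16_t port_id, uint16_t queue_id)
 *	{
 *		int used = rte_eth_rx_queue_count(port_id, queue_id);
 *
 *		return used < 0 ? 0 : (unsigned int)used; // Treat errors as empty.
 *	}
 */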
281 
282 #define CLB_VAL_IDX 0
283 #define CLB_MSK_IDX 1
284 static int
285 mlx5_monitor_callback(const uint64_t value,
286 		const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
287 {
288 	const uint64_t m = opaque[CLB_MSK_IDX];
289 	const uint64_t v = opaque[CLB_VAL_IDX];
290 
291 	return (value & m) == v ? -1 : 0;
292 }
293 
294 int mlx5_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
295 {
296 	struct mlx5_rxq_data *rxq = rx_queue;
297 	const unsigned int cqe_num = 1 << rxq->cqe_n;
298 	const unsigned int cqe_mask = cqe_num - 1;
299 	const uint16_t idx = rxq->cq_ci & cqe_num;
300 	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
301 
302 	if (unlikely(rxq->cqes == NULL)) {
303 		rte_errno = EINVAL;
304 		return -rte_errno;
305 	}
306 	pmc->addr = &cqe->op_own;
307 	pmc->opaque[CLB_VAL_IDX] = !!idx;
308 	pmc->opaque[CLB_MSK_IDX] = MLX5_CQE_OWNER_MASK;
309 	pmc->fn = mlx5_monitor_callback;
310 	pmc->size = sizeof(uint8_t);
311 	return 0;
312 }
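
/*
 * A minimal application-side usage sketch for the monitor address callback
 * above: the returned condition lets a core sleep on the CQE owner byte until
 * HW posts a new completion. It assumes the CPU provides the monitor/wait
 * primitives used by rte_power_monitor() (e.g. UMWAIT or WFE); port_id,
 * queue_id and the ~1 ms timeout are illustrative:
 *
 *	#include <rte_ethdev.h>
 *	#include <rte_cycles.h>
 *	#include <rte_power_intrinsics.h>
 *
 *	static void
 *	wait_for_rx(uint16_t port_id, uint16_t queue_id)
 *	{
 *		struct rte_power_monitor_cond pmc;
 *
 *		if (rte_eth_get_monitor_addr(port_id, queue_id, &pmc) == 0)
 *			rte_power_monitor(&pmc, rte_get_tsc_cycles() +
 *						rte_get_tsc_hz() / 1000);
 *	}
 */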
313 
314 /**
315  * Translate RX completion flags to packet type.
316  *
317  * @param[in] rxq
318  *   Pointer to RX queue structure.
319  * @param[in] cqe
320  *   Pointer to CQE.
321  *
322  * @note: keep mlx5_dev_supported_ptypes_get() in sync with any change here.
323  *
324  * @return
325  *   Packet type for struct rte_mbuf.
326  */
327 static inline uint32_t
328 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
329 				   volatile struct mlx5_mini_cqe8 *mcqe)
330 {
331 	uint8_t idx;
332 	uint8_t ptype;
333 	uint8_t pinfo = (cqe->pkt_info & 0x3) << 6;
334 
335 	/* Get the L3/L4 header type from the mini-CQE in case of L3/L4 format. */
336 	if (mcqe == NULL ||
337 	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
338 		ptype = (cqe->hdr_type_etc & 0xfc00) >> 10;
339 	else
340 		ptype = mcqe->hdr_type >> 2;
341 	/*
342 	 * The index to the array should have:
343 	 * bit[1:0] = l3_hdr_type
344 	 * bit[4:2] = l4_hdr_type
345 	 * bit[5] = ip_frag
346 	 * bit[6] = tunneled
347 	 * bit[7] = outer_l3_type
348 	 */
349 	idx = pinfo | ptype;
350 	return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
351 }
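
/*
 * Illustrative decoding of the index layout above: idx = 0x49 means
 * outer_l3_type = 0, tunneled = 1, ip_frag = 0, l4_hdr_type = 2 and
 * l3_hdr_type = 1. Because bit 6 (tunneled) is set, the branchless expression
 * rxq->tunnel * !!(idx & (1 << 6)) ORs the tunnel packet type into the
 * mlx5_ptype_table entry without a conditional jump.
 */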
352 
353 /**
354  * Initialize Rx WQ and indexes.
355  *
356  * @param[in] rxq
357  *   Pointer to RX queue structure.
358  */
359 void
360 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
361 {
362 	const unsigned int wqe_n = 1 << rxq->elts_n;
363 	unsigned int i;
364 
365 	for (i = 0; (i != wqe_n); ++i) {
366 		volatile struct mlx5_wqe_data_seg *scat;
367 		uintptr_t addr;
368 		uint32_t byte_count;
369 		uint32_t lkey;
370 
371 		if (mlx5_rxq_mprq_enabled(rxq)) {
372 			struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i];
373 
374 			scat = &((volatile struct mlx5_wqe_mprq *)
375 				rxq->wqes)[i].dseg;
376 			addr = (uintptr_t)mlx5_mprq_buf_addr
377 					(buf, RTE_BIT32(rxq->log_strd_num));
378 			byte_count = RTE_BIT32(rxq->log_strd_sz) *
379 				     RTE_BIT32(rxq->log_strd_num);
380 			lkey = mlx5_rx_addr2mr(rxq, addr);
381 		} else {
382 			struct rte_mbuf *buf = (*rxq->elts)[i];
383 
384 			scat = &((volatile struct mlx5_wqe_data_seg *)
385 					rxq->wqes)[i];
386 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
387 			byte_count = DATA_LEN(buf);
388 			lkey = mlx5_rx_mb2mr(rxq, buf);
389 		}
390 		/* scat->addr must be able to store a pointer. */
391 		MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t));
392 		*scat = (struct mlx5_wqe_data_seg){
393 			.addr = rte_cpu_to_be_64(addr),
394 			.byte_count = rte_cpu_to_be_32(byte_count),
395 			.lkey = lkey,
396 		};
397 	}
398 	rxq->consumed_strd = 0;
399 	rxq->decompressed = 0;
400 	rxq->rq_pi = 0;
401 	rxq->zip = (struct rxq_zip){
402 		.ai = 0,
403 	};
404 	rxq->elts_ci = mlx5_rxq_mprq_enabled(rxq) ?
405 		(wqe_n >> rxq->sges_n) * RTE_BIT32(rxq->log_strd_num) : 0;
406 	/* Update doorbell counter. */
407 	rxq->rq_ci = wqe_n >> rxq->sges_n;
408 	rte_io_wmb();
409 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
410 }
411 
412 #define MLX5_ERROR_CQE_MASK 0x40000000
413 /* Must be negative. */
414 #define MLX5_REGULAR_ERROR_CQE_RET (-5)
415 #define MLX5_CRITICAL_ERROR_CQE_RET (-4)
416 /* Must not be negative. */
417 #define MLX5_RECOVERY_ERROR_RET 0
418 #define MLX5_RECOVERY_IGNORE_RET 1
419 #define MLX5_RECOVERY_COMPLETED_RET 2
420 
421 /**
422  * Handle an Rx error.
423  * The function moves the RQ state to RESET when the first error CQE is
424  * seen, then the caller's loop drains the CQ. When the CQ is empty,
425  * it moves the RQ state to READY and reinitializes the RQ.
426  * Identifying the next CQE and counting errors remain the caller's responsibility.
427  *
428  * @param[in] rxq
429  *   Pointer to RX queue structure.
430  * @param[in] vec
431  *   1 when called from a vectorized Rx burst; mbufs need to be prepared for the RQ.
432  *   0 when called from non-vectorized Rx burst.
433  * @param[in] err_n
434  *   Number of CQEs to check for an error.
435  *
436  * @return
437  *   MLX5_RECOVERY_ERROR_RET in case of recovery error,
438  *   MLX5_RECOVERY_IGNORE_RET in case of non-critical error syndrome,
439  *   MLX5_RECOVERY_COMPLETED_RET in case recovery is completed,
440  *   otherwise the CQE status after ignored error syndrome or queue reset.
441  */
442 int
443 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec,
444 		   uint16_t err_n, uint16_t *skip_cnt)
445 {
446 	const uint16_t cqe_n = 1 << rxq->cqe_n;
447 	const uint16_t cqe_mask = cqe_n - 1;
448 	const uint16_t wqe_n = 1 << rxq->elts_n;
449 	const uint16_t strd_n = RTE_BIT32(rxq->log_strd_num);
450 	struct mlx5_rxq_ctrl *rxq_ctrl =
451 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
452 	union {
453 		volatile struct mlx5_cqe *cqe;
454 		volatile struct mlx5_err_cqe *err_cqe;
455 	} u = {
456 		.cqe = &(*rxq->cqes)[(rxq->cq_ci - vec) & cqe_mask],
457 	};
458 	struct mlx5_mp_arg_queue_state_modify sm;
459 	bool critical_syndrome = false;
460 	int ret, i;
461 
462 	switch (rxq->err_state) {
463 	case MLX5_RXQ_ERR_STATE_IGNORE:
464 		ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci - vec);
465 		if (ret != MLX5_CQE_STATUS_ERR) {
466 			rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
467 			return ret;
468 		}
469 		/* Fall-through */
470 	case MLX5_RXQ_ERR_STATE_NO_ERROR:
471 		for (i = 0; i < (int)err_n; i++) {
472 			u.cqe = &(*rxq->cqes)[(rxq->cq_ci - vec - i) & cqe_mask];
473 			if (MLX5_CQE_OPCODE(u.cqe->op_own) == MLX5_CQE_RESP_ERR) {
474 				if (u.err_cqe->syndrome == MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR ||
475 				    u.err_cqe->syndrome == MLX5_CQE_SYNDROME_LOCAL_PROT_ERR ||
476 				    u.err_cqe->syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR)
477 					critical_syndrome = true;
478 				break;
479 			}
480 		}
481 		if (!critical_syndrome) {
482 			if (rxq->err_state == MLX5_RXQ_ERR_STATE_NO_ERROR) {
483 				*skip_cnt = 0;
484 				if (i == err_n)
485 					rxq->err_state = MLX5_RXQ_ERR_STATE_IGNORE;
486 			}
487 			return MLX5_RECOVERY_IGNORE_RET;
488 		}
489 		rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET;
490 		/* Fall-through */
491 	case MLX5_RXQ_ERR_STATE_NEED_RESET:
492 		sm.is_wq = 1;
493 		sm.queue_id = rxq->idx;
494 		sm.state = IBV_WQS_RESET;
495 		if (mlx5_queue_state_modify(RXQ_DEV(rxq_ctrl), &sm))
496 			return MLX5_RECOVERY_ERROR_RET;
497 		if (rxq_ctrl->dump_file_n <
498 		    RXQ_PORT(rxq_ctrl)->config.max_dump_files_num) {
499 			MKSTR(err_str, "Unexpected CQE error syndrome "
500 			      "0x%02x CQN = %u RQN = %u wqe_counter = %u"
501 			      " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome,
502 			      rxq->cqn, rxq_ctrl->wqn,
503 			      rte_be_to_cpu_16(u.err_cqe->wqe_counter),
504 			      rxq->rq_ci << rxq->sges_n, rxq->cq_ci);
505 			MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u",
506 			      rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc());
507 			mlx5_dump_debug_information(name, NULL, err_str, 0);
508 			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
509 						    (const void *)((uintptr_t)
510 								    rxq->cqes),
511 						    sizeof(*u.cqe) * cqe_n);
512 			mlx5_dump_debug_information(name, "MLX5 Error RQ:",
513 						    (const void *)((uintptr_t)
514 								    rxq->wqes),
515 						    16 * wqe_n);
516 			rxq_ctrl->dump_file_n++;
517 		}
518 		rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY;
519 		/* Fall-through */
520 	case MLX5_RXQ_ERR_STATE_NEED_READY:
521 		ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
522 		if (ret == MLX5_CQE_STATUS_HW_OWN) {
523 			rte_io_wmb();
524 			*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
525 			rte_io_wmb();
526 			/*
527 			 * The RQ consumer index must be zeroed while moving
528 			 * from RESET state to RDY state.
529 			 */
530 			*rxq->rq_db = rte_cpu_to_be_32(0);
531 			rte_io_wmb();
532 			sm.is_wq = 1;
533 			sm.queue_id = rxq->idx;
534 			sm.state = IBV_WQS_RDY;
535 			if (mlx5_queue_state_modify(RXQ_DEV(rxq_ctrl), &sm))
536 				return MLX5_RECOVERY_ERROR_RET;
537 			if (vec) {
538 				const uint32_t elts_n =
539 					mlx5_rxq_mprq_enabled(rxq) ?
540 					wqe_n * strd_n : wqe_n;
541 				const uint32_t e_mask = elts_n - 1;
542 				uint32_t elts_ci =
543 					mlx5_rxq_mprq_enabled(rxq) ?
544 					rxq->elts_ci : rxq->rq_ci;
545 				uint32_t elt_idx;
546 				struct rte_mbuf **elt;
547 				unsigned int n = elts_n - (elts_ci -
548 							  rxq->rq_pi);
549 
550 				for (i = 0; i < (int)n; ++i) {
551 					elt_idx = (elts_ci + i) & e_mask;
552 					elt = &(*rxq->elts)[elt_idx];
553 					*elt = rte_mbuf_raw_alloc(rxq->mp);
554 					if (!*elt) {
555 						for (i--; i >= 0; --i) {
556 							elt_idx = (elts_ci +
557 								   i) & elts_n;
558 							elt = &(*rxq->elts)
559 								[elt_idx];
560 							rte_pktmbuf_free_seg
561 								(*elt);
562 						}
563 						return MLX5_RECOVERY_ERROR_RET;
564 					}
565 				}
566 				for (i = 0; i < (int)elts_n; ++i) {
567 					elt = &(*rxq->elts)[i];
568 					DATA_LEN(*elt) =
569 						(uint16_t)((*elt)->buf_len -
570 						rte_pktmbuf_headroom(*elt));
571 				}
572 				/* Padding with a fake mbuf for vec Rx. */
573 				for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
574 					(*rxq->elts)[elts_n + i] =
575 								&rxq->fake_mbuf;
576 			}
577 			mlx5_rxq_initialize(rxq);
578 			rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
579 			return MLX5_RECOVERY_COMPLETED_RET;
580 		}
581 		return ret;
582 	default:
583 		return MLX5_RECOVERY_ERROR_RET;
584 	}
585 }
586 
587 /**
588  * Get the size of the next packet for a given CQE. For compressed CQEs, the
589  * consumer index is updated only once all packets of the current compressed
590  * session have been processed.
591  *
592  * @param rxq
593  *   Pointer to RX queue.
594  * @param cqe
595  *   CQE to process.
596  * @param[out] mcqe
597  *   Store pointer to mini-CQE if compressed. Otherwise, the pointer is not
598  *   written.
599  * @param[out] skip_cnt
600  *   Number of packets skipped due to recoverable errors.
601  * @param mprq
602  *   Indication whether it is called from MPRQ.
603  * @return
604  *   0 in case of an empty CQE, MLX5_REGULAR_ERROR_CQE_RET in case of an error CQE,
605  *   MLX5_CRITICAL_ERROR_CQE_RET in case of an error CQE leading to an Rx queue reset,
606  *   otherwise the packet size in the regular Rx queue case,
607  *   or the striding byte count format in the MPRQ case.
608  */
609 static inline int
610 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
611 		 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe,
612 		 uint16_t *skip_cnt, bool mprq)
613 {
614 	struct rxq_zip *zip = &rxq->zip;
615 	uint16_t cqe_n = cqe_cnt + 1;
616 	int len = 0, ret = 0;
617 	uint16_t idx, end;
618 
619 	do {
620 		len = 0;
621 		/* Process compressed data in the CQE and mini arrays. */
622 		if (zip->ai) {
623 			volatile struct mlx5_mini_cqe8 (*mc)[8] =
624 				(volatile struct mlx5_mini_cqe8 (*)[8])
625 				(uintptr_t)(&(*rxq->cqes)[zip->ca &
626 							  cqe_cnt].pkt_info);
627 			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt &
628 					       rxq->byte_mask);
629 			*mcqe = &(*mc)[zip->ai & 7];
630 			if ((++zip->ai & 7) == 0) {
631 				/* Invalidate consumed CQEs */
632 				idx = zip->ca;
633 				end = zip->na;
634 				while (idx != end) {
635 					(*rxq->cqes)[idx & cqe_cnt].op_own =
636 						MLX5_CQE_INVALIDATE;
637 					++idx;
638 				}
639 				/*
640 				 * Increment consumer index to skip the number
641 				 * of CQEs consumed. Hardware leaves holes in
642 				 * the CQ ring for software use.
643 				 */
644 				zip->ca = zip->na;
645 				zip->na += 8;
646 			}
647 			if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
648 				/* Invalidate the rest */
649 				idx = zip->ca;
650 				end = zip->cq_ci;
651 
652 				while (idx != end) {
653 					(*rxq->cqes)[idx & cqe_cnt].op_own =
654 						MLX5_CQE_INVALIDATE;
655 					++idx;
656 				}
657 				rxq->cq_ci = zip->cq_ci;
658 				zip->ai = 0;
659 			}
660 		/*
661 		 * No compressed data, get next CQE and verify if it is
662 		 * compressed.
663 		 */
664 		} else {
665 			int8_t op_own;
666 			uint32_t cq_ci;
667 
668 			ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
669 			if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
670 				if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
671 					     rxq->err_state)) {
672 					ret = mlx5_rx_err_handle(rxq, 0, 1, skip_cnt);
673 					if (ret == MLX5_CQE_STATUS_HW_OWN)
674 						return MLX5_ERROR_CQE_MASK;
675 					if (ret == MLX5_RECOVERY_ERROR_RET ||
676 						ret == MLX5_RECOVERY_COMPLETED_RET)
677 						return MLX5_CRITICAL_ERROR_CQE_RET;
678 				} else {
679 					return 0;
680 				}
681 			}
682 			/*
683 			 * Introduce the local variable to have queue cq_ci
684 			 * index in queue structure always consistent with
685 			 * actual CQE boundary (not pointing to the middle
686 			 * of compressed CQE session).
687 			 */
688 			cq_ci = rxq->cq_ci + 1;
689 			op_own = cqe->op_own;
690 			if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
691 				volatile struct mlx5_mini_cqe8 (*mc)[8] =
692 					(volatile struct mlx5_mini_cqe8 (*)[8])
693 					(uintptr_t)(&(*rxq->cqes)
694 						[cq_ci & cqe_cnt].pkt_info);
695 
696 				/* Fix endianness. */
697 				zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
698 				/*
699 				 * Current mini array position is the one
700 				 * returned by check_cqe().
701 				 *
702 				 * If completion comprises several mini arrays,
703 				 * as a special case the second one is located
704 				 * 7 CQEs after the initial CQE instead of 8
705 				 * for subsequent ones.
706 				 */
707 				zip->ca = cq_ci;
708 				zip->na = zip->ca + 7;
709 				/* Compute the next non compressed CQE. */
710 				zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
711 				/* Get packet size to return. */
712 				len = rte_be_to_cpu_32((*mc)[0].byte_cnt &
713 						       rxq->byte_mask);
714 				*mcqe = &(*mc)[0];
715 				zip->ai = 1;
716 				/* Prefetch all to be invalidated */
717 				idx = zip->ca;
718 				end = zip->cq_ci;
719 				while (idx != end) {
720 					rte_prefetch0(&(*rxq->cqes)[(idx) &
721 								    cqe_cnt]);
722 					++idx;
723 				}
724 			} else {
725 				rxq->cq_ci = cq_ci;
726 				len = rte_be_to_cpu_32(cqe->byte_cnt);
727 			}
728 		}
729 		if (unlikely(rxq->err_state)) {
730 			if (rxq->err_state == MLX5_RXQ_ERR_STATE_IGNORE &&
731 			    ret == MLX5_CQE_STATUS_SW_OWN) {
732 				rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
733 				return len & MLX5_ERROR_CQE_MASK;
734 			}
735 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
736 			++rxq->stats.idropped;
737 			(*skip_cnt) += mprq ? (len & MLX5_MPRQ_STRIDE_NUM_MASK) >>
738 				MLX5_MPRQ_STRIDE_NUM_SHIFT : 1;
739 		} else {
740 			return len;
741 		}
742 	} while (1);
743 }
744 
745 /**
746  * Translate RX completion flags to offload flags.
747  *
748  * @param[in] cqe
749  *   Pointer to CQE.
750  *
751  * @return
752  *   Offload flags (ol_flags) for struct rte_mbuf.
753  */
754 static inline uint32_t
755 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
756 {
757 	uint32_t ol_flags = 0;
758 	uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
759 
760 	ol_flags =
761 		TRANSPOSE(flags,
762 			  MLX5_CQE_RX_L3_HDR_VALID,
763 			  RTE_MBUF_F_RX_IP_CKSUM_GOOD) |
764 		TRANSPOSE(flags,
765 			  MLX5_CQE_RX_L4_HDR_VALID,
766 			  RTE_MBUF_F_RX_L4_CKSUM_GOOD);
767 	return ol_flags;
768 }
769 
770 /**
771  * Fill in mbuf fields from RX completion flags.
772  * Note that pkt->ol_flags should be initialized outside of this function.
773  *
774  * @param rxq
775  *   Pointer to RX queue.
776  * @param pkt
777  *   mbuf to fill.
778  * @param cqe
779  *   CQE to process.
780  * @param mcqe
781  *   Pointer to the mini-CQE, or NULL if the CQE is not compressed.
782  */
783 static inline void
784 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
785 	       volatile struct mlx5_cqe *cqe,
786 	       volatile struct mlx5_mini_cqe8 *mcqe)
787 {
788 	/* Update packet information. */
789 	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe, mcqe);
790 	pkt->port = unlikely(rxq->shared) ? cqe->user_index_low : rxq->port_id;
791 
792 	if (rxq->rss_hash) {
793 		uint32_t rss_hash_res = 0;
794 
795 		/* If compressed, take hash result from mini-CQE. */
796 		if (mcqe == NULL ||
797 		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)
798 			rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
799 		else
800 			rss_hash_res = rte_be_to_cpu_32(mcqe->rx_hash_result);
801 		if (rss_hash_res) {
802 			pkt->hash.rss = rss_hash_res;
803 			pkt->ol_flags |= RTE_MBUF_F_RX_RSS_HASH;
804 		}
805 	}
806 	if (rxq->mark) {
807 		uint32_t mark = 0;
808 
809 		/* If compressed, take flow tag from mini-CQE. */
810 		if (mcqe == NULL ||
811 		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
812 			mark = cqe->sop_drop_qpn;
813 		else
814 			mark = ((mcqe->byte_cnt_flow & 0xff) << 8) |
815 				(mcqe->flow_tag_high << 16);
816 		if (MLX5_FLOW_MARK_IS_VALID(mark)) {
817 			pkt->ol_flags |= RTE_MBUF_F_RX_FDIR;
818 			if (mark != RTE_BE32(MLX5_FLOW_MARK_DEFAULT)) {
819 				pkt->ol_flags |= RTE_MBUF_F_RX_FDIR_ID;
820 				pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
821 			}
822 		}
823 	}
824 	if (rxq->dynf_meta) {
825 		uint32_t meta = rte_be_to_cpu_32(cqe->flow_table_metadata) &
826 			rxq->flow_meta_port_mask;
827 
828 		if (meta) {
829 			pkt->ol_flags |= rxq->flow_meta_mask;
830 			*RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset,
831 						uint32_t *) = meta;
832 		}
833 	}
834 	if (rxq->csum)
835 		pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
836 	if (rxq->vlan_strip) {
837 		bool vlan_strip;
838 
839 		if (mcqe == NULL ||
840 		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
841 			vlan_strip = cqe->hdr_type_etc &
842 				     RTE_BE16(MLX5_CQE_VLAN_STRIPPED);
843 		else
844 			vlan_strip = mcqe->hdr_type &
845 				     RTE_BE16(MLX5_CQE_VLAN_STRIPPED);
846 		if (vlan_strip) {
847 			pkt->ol_flags |= RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED;
848 			pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
849 		}
850 	}
851 	if (rxq->hw_timestamp) {
852 		uint64_t ts = rte_be_to_cpu_64(cqe->timestamp);
853 
854 		if (rxq->rt_timestamp)
855 			ts = mlx5_txpp_convert_rx_ts(rxq->sh, ts);
856 		mlx5_timestamp_set(pkt, rxq->timestamp_offset, ts);
857 		pkt->ol_flags |= rxq->timestamp_rx_flag;
858 	}
859 }
860 
861 /**
862  * DPDK callback for RX.
863  *
864  * @param dpdk_rxq
865  *   Generic pointer to RX queue structure.
866  * @param[out] pkts
867  *   Array to store received packets.
868  * @param pkts_n
869  *   Maximum number of packets in array.
870  *
871  * @return
872  *   Number of packets successfully received (<= pkts_n).
873  */
874 uint16_t
875 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
876 {
877 	struct mlx5_rxq_data *rxq = dpdk_rxq;
878 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
879 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
880 	const unsigned int sges_n = rxq->sges_n;
881 	struct rte_mbuf *pkt = NULL;
882 	struct rte_mbuf *seg = NULL;
883 	volatile struct mlx5_cqe *cqe =
884 		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
885 	unsigned int i = 0;
886 	unsigned int rq_ci = rxq->rq_ci << sges_n;
887 	int len = 0; /* keep its value across iterations. */
888 
889 	while (pkts_n) {
890 		uint16_t skip_cnt;
891 		unsigned int idx = rq_ci & wqe_cnt;
892 		volatile struct mlx5_wqe_data_seg *wqe =
893 			&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
894 		struct rte_mbuf *rep = (*rxq->elts)[idx];
895 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
896 
897 		if (pkt)
898 			NEXT(seg) = rep;
899 		seg = rep;
900 		rte_prefetch0(seg);
901 		rte_prefetch0(cqe);
902 		rte_prefetch0(wqe);
903 		/* Allocate the buf from the same pool. */
904 		rep = rte_mbuf_raw_alloc(seg->pool);
905 		if (unlikely(rep == NULL)) {
906 			++rxq->stats.rx_nombuf;
907 			if (!pkt) {
908 				/*
909 				 * no buffers before we even started,
910 				 * bail out silently.
911 				 */
912 				break;
913 			}
914 			while (pkt != seg) {
915 				MLX5_ASSERT(pkt != (*rxq->elts)[idx]);
916 				rep = NEXT(pkt);
917 				NEXT(pkt) = NULL;
918 				NB_SEGS(pkt) = 1;
919 				rte_mbuf_raw_free(pkt);
920 				pkt = rep;
921 			}
922 			rq_ci >>= sges_n;
923 			++rq_ci;
924 			rq_ci <<= sges_n;
925 			break;
926 		}
927 		if (!pkt) {
928 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
929 			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe, &skip_cnt, false);
930 			if (unlikely(len & MLX5_ERROR_CQE_MASK)) {
931 				if (len == MLX5_CRITICAL_ERROR_CQE_RET) {
932 					rte_mbuf_raw_free(rep);
933 					rq_ci = rxq->rq_ci << sges_n;
934 					break;
935 				}
936 				rq_ci >>= sges_n;
937 				rq_ci += skip_cnt;
938 				rq_ci <<= sges_n;
939 				idx = rq_ci & wqe_cnt;
940 				wqe = &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
941 				seg = (*rxq->elts)[idx];
942 				cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
943 				len = len & ~MLX5_ERROR_CQE_MASK;
944 			}
945 			if (len == 0) {
946 				rte_mbuf_raw_free(rep);
947 				break;
948 			}
949 			pkt = seg;
950 			MLX5_ASSERT(len >= (rxq->crc_present << 2));
951 			pkt->ol_flags &= RTE_MBUF_F_EXTERNAL;
952 			rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
953 			if (rxq->crc_present)
954 				len -= RTE_ETHER_CRC_LEN;
955 			PKT_LEN(pkt) = len;
956 			if (cqe->lro_num_seg > 1) {
957 				mlx5_lro_update_hdr
958 					(rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
959 					 mcqe, rxq, len);
960 				pkt->ol_flags |= RTE_MBUF_F_RX_LRO;
961 				pkt->tso_segsz = len / cqe->lro_num_seg;
962 			}
963 		}
964 		DATA_LEN(rep) = DATA_LEN(seg);
965 		PKT_LEN(rep) = PKT_LEN(seg);
966 		SET_DATA_OFF(rep, DATA_OFF(seg));
967 		PORT(rep) = PORT(seg);
968 		(*rxq->elts)[idx] = rep;
969 		/*
970 		 * Fill NIC descriptor with the new buffer. The lkey and size
971 		 * of the buffers are already known, only the buffer address
972 		 * changes.
973 		 */
974 		wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
975 		/* If there's only one MR, no need to replace LKey in WQE. */
976 		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
977 			wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
978 		if (len > DATA_LEN(seg)) {
979 			len -= DATA_LEN(seg);
980 			++NB_SEGS(pkt);
981 			++rq_ci;
982 			continue;
983 		}
984 		DATA_LEN(seg) = len;
985 #ifdef MLX5_PMD_SOFT_COUNTERS
986 		/* Increment bytes counter. */
987 		rxq->stats.ibytes += PKT_LEN(pkt);
988 #endif
989 		/* Return packet. */
990 		*(pkts++) = pkt;
991 		pkt = NULL;
992 		--pkts_n;
993 		++i;
994 		/* Align consumer index to the next stride. */
995 		rq_ci >>= sges_n;
996 		++rq_ci;
997 		rq_ci <<= sges_n;
998 	}
999 	if (unlikely(i == 0 && ((rq_ci >> sges_n) == rxq->rq_ci)))
1000 		return 0;
1001 	/* Update the consumer index. */
1002 	rxq->rq_ci = rq_ci >> sges_n;
1003 	rte_io_wmb();
1004 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1005 	rte_io_wmb();
1006 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1007 #ifdef MLX5_PMD_SOFT_COUNTERS
1008 	/* Increment packets counter. */
1009 	rxq->stats.ipackets += i;
1010 #endif
1011 	return i;
1012 }
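
/*
 * A minimal application-side polling loop, assuming an already configured and
 * started port; rte_eth_rx_burst() dispatches to one of the Rx burst routines
 * in this file depending on the negotiated mode (scalar, MPRQ or vectorized).
 * port_id, queue_id and BURST_SZ are illustrative:
 *
 *	#include <rte_ethdev.h>
 *	#include <rte_mbuf.h>
 *
 *	#define BURST_SZ 32
 *
 *	static void
 *	poll_rxq(uint16_t port_id, uint16_t queue_id)
 *	{
 *		struct rte_mbuf *pkts[BURST_SZ];
 *		uint16_t nb, i;
 *
 *		nb = rte_eth_rx_burst(port_id, queue_id, pkts, BURST_SZ);
 *		for (i = 0; i < nb; i++)
 *			rte_pktmbuf_free(pkts[i]); // Real code processes here.
 *	}
 */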
1013 
1014 /**
1015  * Update LRO packet TCP header.
1016  * The HW LRO feature doesn't update the TCP header after coalescing the
1017  * TCP segments but supplies information in the CQE for SW to fill it in.
1018  *
1019  * @param tcp
1020  *   Pointer to the TCP header.
1021  * @param cqe
1022  *   Pointer to the completion entry.
1023  * @param phcsum
1024  *   The L3 pseudo-header checksum.
1025  */
1026 static inline void
1027 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
1028 			volatile struct mlx5_cqe *__rte_restrict cqe,
1029 			uint32_t phcsum, uint8_t l4_type)
1030 {
1031 	/*
1032 	 * The HW calculates only the TCP payload checksum, need to complete
1033 	 * the TCP header checksum and the L3 pseudo-header checksum.
1034 	 */
1035 	uint32_t csum = phcsum + cqe->csum;
1036 
1037 	if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK ||
1038 	    l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) {
1039 		tcp->tcp_flags |= RTE_TCP_ACK_FLAG;
1040 		tcp->recv_ack = cqe->lro_ack_seq_num;
1041 		tcp->rx_win = cqe->lro_tcp_win;
1042 	}
1043 	if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
1044 		tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
1045 	tcp->cksum = 0;
1046 	csum += rte_raw_cksum(tcp, (tcp->data_off >> 4) * 4);
1047 	csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
1048 	csum = (~csum) & 0xffff;
1049 	if (csum == 0)
1050 		csum = 0xffff;
1051 	tcp->cksum = csum;
1052 }
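
/*
 * Illustrative folding arithmetic for the checksum completion above: if the
 * 32-bit sum is 0x1abce, folding the halves gives 0x0001 + 0xabce = 0xabcf,
 * the one's complement is 0x5430, and a computed value of 0 would be replaced
 * by 0xffff as done in the code.
 */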
1053 
1054 /**
1055  * Update LRO packet headers.
1056  * The HW LRO feature doesn't update the L3/TCP headers after coalescing the
1057  * TCP segments but supplies information in the CQE for SW to fill them in.
1058  *
1059  * @param padd
1060  *   The packet address.
1061  * @param cqe
1062  *   Pointer to the completion entry.
1063  * @param len
1064  *   The packet length.
1065  */
1066 static inline void
1067 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
1068 		    volatile struct mlx5_cqe *__rte_restrict cqe,
1069 		    volatile struct mlx5_mini_cqe8 *mcqe,
1070 		    struct mlx5_rxq_data *rxq, uint32_t len)
1071 {
1072 	union {
1073 		struct rte_ether_hdr *eth;
1074 		struct rte_vlan_hdr *vlan;
1075 		struct rte_ipv4_hdr *ipv4;
1076 		struct rte_ipv6_hdr *ipv6;
1077 		struct rte_tcp_hdr *tcp;
1078 		uint8_t *hdr;
1079 	} h = {
1080 		.hdr = padd,
1081 	};
1082 	uint16_t proto = h.eth->ether_type;
1083 	uint32_t phcsum;
1084 	uint8_t l4_type;
1085 
1086 	h.eth++;
1087 	while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
1088 	       proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) {
1089 		proto = h.vlan->eth_proto;
1090 		h.vlan++;
1091 	}
1092 	if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) {
1093 		h.ipv4->time_to_live = cqe->lro_min_ttl;
1094 		h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd));
1095 		h.ipv4->hdr_checksum = 0;
1096 		h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4);
1097 		phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0);
1098 		h.ipv4++;
1099 	} else {
1100 		h.ipv6->hop_limits = cqe->lro_min_ttl;
1101 		h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) -
1102 						       sizeof(*h.ipv6));
1103 		phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
1104 		h.ipv6++;
1105 	}
1106 	if (mcqe == NULL ||
1107 	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
1108 		l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
1109 			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
1110 	else
1111 		l4_type = (rte_be_to_cpu_16(mcqe->hdr_type) &
1112 			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
1113 	mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum, l4_type);
1114 }
1115 
1116 void
1117 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
1118 {
1119 	mlx5_mprq_buf_free_cb(NULL, buf);
1120 }
1121 
1122 /**
1123  * DPDK callback for RX with Multi-Packet RQ support.
1124  *
1125  * @param dpdk_rxq
1126  *   Generic pointer to RX queue structure.
1127  * @param[out] pkts
1128  *   Array to store received packets.
1129  * @param pkts_n
1130  *   Maximum number of packets in array.
1131  *
1132  * @return
1133  *   Number of packets successfully received (<= pkts_n).
1134  */
1135 uint16_t
1136 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1137 {
1138 	struct mlx5_rxq_data *rxq = dpdk_rxq;
1139 	const uint32_t strd_n = RTE_BIT32(rxq->log_strd_num);
1140 	const uint32_t strd_sz = RTE_BIT32(rxq->log_strd_sz);
1141 	const uint32_t cq_mask = (1 << rxq->cqe_n) - 1;
1142 	const uint32_t wq_mask = (1 << rxq->elts_n) - 1;
1143 	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
1144 	unsigned int i = 0;
1145 	uint32_t rq_ci = rxq->rq_ci;
1146 	uint16_t consumed_strd = rxq->consumed_strd;
1147 	struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
1148 
1149 	while (i < pkts_n) {
1150 		struct rte_mbuf *pkt;
1151 		int ret;
1152 		uint32_t len;
1153 		uint16_t strd_cnt;
1154 		uint16_t strd_idx;
1155 		uint32_t byte_cnt;
1156 		uint16_t skip_cnt;
1157 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
1158 		enum mlx5_rqx_code rxq_code;
1159 
1160 		if (consumed_strd == strd_n) {
1161 			/* Replace WQE if the buffer is still in use. */
1162 			mprq_buf_replace(rxq, rq_ci & wq_mask);
1163 			/* Advance to the next WQE. */
1164 			consumed_strd = 0;
1165 			++rq_ci;
1166 			buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
1167 		}
1168 		cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
1169 		ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe, &skip_cnt, true);
1170 		if (unlikely(ret & MLX5_ERROR_CQE_MASK)) {
1171 			if (ret == MLX5_CRITICAL_ERROR_CQE_RET) {
1172 				rq_ci = rxq->rq_ci;
1173 				consumed_strd = rxq->consumed_strd;
1174 				break;
1175 			}
1176 			consumed_strd += skip_cnt;
1177 			while (consumed_strd >= strd_n) {
1178 				/* Replace WQE if the buffer is still in use. */
1179 				mprq_buf_replace(rxq, rq_ci & wq_mask);
1180 				/* Advance to the next WQE. */
1181 				consumed_strd -= strd_n;
1182 				++rq_ci;
1183 				buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
1184 			}
1185 			cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
1186 		}
1187 		if (ret == 0)
1188 			break;
1189 		byte_cnt = ret;
1190 		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
1191 		MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
1192 		if (rxq->crc_present)
1193 			len -= RTE_ETHER_CRC_LEN;
1194 		if (mcqe &&
1195 		    rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
1196 			strd_cnt = (len / strd_sz) + !!(len % strd_sz);
1197 		else
1198 			strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
1199 				   MLX5_MPRQ_STRIDE_NUM_SHIFT;
1200 		MLX5_ASSERT(strd_cnt);
1201 		consumed_strd += strd_cnt;
1202 		if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
1203 			continue;
1204 		strd_idx = rte_be_to_cpu_16(mcqe == NULL ?
1205 					cqe->wqe_counter :
1206 					mcqe->stride_idx);
1207 		MLX5_ASSERT(strd_idx < strd_n);
1208 		MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) &
1209 			    wq_mask));
1210 		pkt = rte_pktmbuf_alloc(rxq->mp);
1211 		if (unlikely(pkt == NULL)) {
1212 			++rxq->stats.rx_nombuf;
1213 			break;
1214 		}
1215 		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
1216 		MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
1217 		if (rxq->crc_present)
1218 			len -= RTE_ETHER_CRC_LEN;
1219 		rxq_code = mprq_buf_to_pkt(rxq, pkt, len, buf,
1220 					   strd_idx, strd_cnt);
1221 		if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) {
1222 			rte_pktmbuf_free_seg(pkt);
1223 			if (rxq_code == MLX5_RXQ_CODE_DROPPED) {
1224 				++rxq->stats.idropped;
1225 				continue;
1226 			}
1227 			if (rxq_code == MLX5_RXQ_CODE_NOMBUF) {
1228 				++rxq->stats.rx_nombuf;
1229 				break;
1230 			}
1231 		}
1232 		rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
1233 		if (cqe->lro_num_seg > 1) {
1234 			mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *),
1235 					    cqe, mcqe, rxq, len);
1236 			pkt->ol_flags |= RTE_MBUF_F_RX_LRO;
1237 			pkt->tso_segsz = len / cqe->lro_num_seg;
1238 		}
1239 		PKT_LEN(pkt) = len;
1240 		PORT(pkt) = rxq->port_id;
1241 #ifdef MLX5_PMD_SOFT_COUNTERS
1242 		/* Increment bytes counter. */
1243 		rxq->stats.ibytes += PKT_LEN(pkt);
1244 #endif
1245 		/* Return packet. */
1246 		*(pkts++) = pkt;
1247 		++i;
1248 	}
1249 	/* Update the consumer indexes. */
1250 	rxq->consumed_strd = consumed_strd;
1251 	rte_io_wmb();
1252 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1253 	if (rq_ci != rxq->rq_ci) {
1254 		rxq->rq_ci = rq_ci;
1255 		rte_io_wmb();
1256 		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1257 	}
1258 #ifdef MLX5_PMD_SOFT_COUNTERS
1259 	/* Increment packets counter. */
1260 	rxq->stats.ipackets += i;
1261 #endif
1262 	return i;
1263 }
1264 
1265 /*
1266  * Vectorized Rx routines are not compiled in when required vector instructions
1267  * are not supported on a target architecture.
1268  * The following null stubs are needed for linkage when the vectorized
1269  * implementations (e.g. mlx5_rxtx_vec_sse.c for x86) are not built.
1270  */
1271 
1272 __rte_weak uint16_t
1273 mlx5_rx_burst_vec(void *dpdk_rxq __rte_unused,
1274 		  struct rte_mbuf **pkts __rte_unused,
1275 		  uint16_t pkts_n __rte_unused)
1276 {
1277 	return 0;
1278 }
1279 
1280 __rte_weak uint16_t
1281 mlx5_rx_burst_mprq_vec(void *dpdk_rxq __rte_unused,
1282 		       struct rte_mbuf **pkts __rte_unused,
1283 		       uint16_t pkts_n __rte_unused)
1284 {
1285 	return 0;
1286 }
1287 
1288 __rte_weak int
1289 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
1290 {
1291 	return -ENOTSUP;
1292 }
1293 
1294 __rte_weak int
1295 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
1296 {
1297 	return -ENOTSUP;
1298 }
1299 
1300 int
1301 mlx5_rx_queue_lwm_query(struct rte_eth_dev *dev,
1302 			uint16_t *queue_id, uint8_t *lwm)
1303 {
1304 	struct mlx5_priv *priv = dev->data->dev_private;
1305 	unsigned int rxq_id, found = 0, n;
1306 	struct mlx5_rxq_priv *rxq;
1307 
1308 	if (!queue_id)
1309 		return -EINVAL;
1310 	/* Query all the Rx queues of the port in a circular way. */
1311 	for (rxq_id = *queue_id, n = 0; n < priv->rxqs_n; n++) {
1312 		rxq = mlx5_rxq_get(dev, rxq_id);
1313 		if (rxq && rxq->lwm_event_pending) {
1314 			pthread_mutex_lock(&priv->sh->lwm_config_lock);
1315 			rxq->lwm_event_pending = 0;
1316 			pthread_mutex_unlock(&priv->sh->lwm_config_lock);
1317 			*queue_id = rxq_id;
1318 			found = 1;
1319 			if (lwm)
1320 				*lwm = mlx5_rxq_lwm_to_percentage(rxq);
1321 			break;
1322 		}
1323 		rxq_id = (rxq_id + 1) % priv->rxqs_n;
1324 	}
1325 	return found;
1326 }
1327 
1328 /**
1329  * RTE interrupt handler for the LWM event.
1330  * It first checks whether the event has arrived and, if so, processes the
1331  * callback for RTE_ETH_EVENT_RX_AVAIL_THRESH.
1332  *
1333  * @param args
1334  *   Generic pointer to mlx5_priv.
1335  */
1336 void
1337 mlx5_dev_interrupt_handler_lwm(void *args)
1338 {
1339 	struct mlx5_priv *priv = args;
1340 	struct mlx5_rxq_priv *rxq;
1341 	struct rte_eth_dev *dev;
1342 	int ret, rxq_idx = 0, port_id = 0;
1343 
1344 	ret = priv->obj_ops.rxq_event_get_lwm(priv, &rxq_idx, &port_id);
1345 	if (unlikely(ret < 0)) {
1346 		DRV_LOG(WARNING, "Cannot get LWM event context.");
1347 		return;
1348 	}
1349 	DRV_LOG(INFO, "%s get LWM event, port_id:%d rxq_id:%d.", __func__,
1350 		port_id, rxq_idx);
1351 	dev = &rte_eth_devices[port_id];
1352 	rxq = mlx5_rxq_get(dev, rxq_idx);
1353 	if (rxq) {
1354 		pthread_mutex_lock(&priv->sh->lwm_config_lock);
1355 		rxq->lwm_event_pending = 1;
1356 		pthread_mutex_unlock(&priv->sh->lwm_config_lock);
1357 	}
1358 	rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_RX_AVAIL_THRESH, NULL);
1359 }
1360 
1361 /**
1362  * DPDK callback to arm an Rx queue LWM (limit watermark) event.
1363  * When the Rx queue fullness reaches the LWM limit, the driver catches
1364  * an HW event and invokes the user event callback.
1365  * After each event is handled, the user needs to call this API again
1366  * to arm an additional event.
1367  *
1368  * @param dev
1369  *   Pointer to the device structure.
1370  * @param[in] rx_queue_id
1371  *   Rx queue identifier.
1372  * @param[in] lwm
1373  *   The LWM value, defined as a percentage of the Rx queue size.
1374  *   [1-99] to set a new LWM (update the old value).
1375  *   0 to unarm the event.
1376  *
1377  * @return
1378  *   0 : operation success.
1379  *   Otherwise:
1380  *   - ENOMEM - not enough memory to create LWM event channel.
1381  *   - EINVAL - the input Rxq is not created by devx.
1382  *   - E2BIG  - lwm is bigger than 99.
1383  */
1384 int
1385 mlx5_rx_queue_lwm_set(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1386 		      uint8_t lwm)
1387 {
1388 	struct mlx5_priv *priv = dev->data->dev_private;
1389 	uint16_t port_id = PORT_ID(priv);
1390 	struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, rx_queue_id);
1391 	uint16_t event_nums[1] = {MLX5_EVENT_TYPE_SRQ_LIMIT_REACHED};
1392 	struct mlx5_rxq_data *rxq_data;
1393 	uint32_t wqe_cnt;
1394 	uint64_t cookie;
1395 	int ret = 0;
1396 
1397 	if (!rxq) {
1398 		rte_errno = EINVAL;
1399 		return -rte_errno;
1400 	}
1401 	rxq_data = &rxq->ctrl->rxq;
1402 	/* Ensure the RQ is created by DevX. */
1403 	if (priv->obj_ops.rxq_obj_new != devx_obj_ops.rxq_obj_new) {
1404 		rte_errno = EINVAL;
1405 		return -rte_errno;
1406 	}
1407 	if (lwm > 99) {
1408 		DRV_LOG(WARNING, "Too big LWM configuration.");
1409 		rte_errno = E2BIG;
1410 		return -rte_errno;
1411 	}
1412 	/* Start config LWM. */
1413 	pthread_mutex_lock(&priv->sh->lwm_config_lock);
1414 	if (rxq->lwm == 0 && lwm == 0) {
1415 		/* Both old/new values are 0, do nothing. */
1416 		ret = 0;
1417 		goto end;
1418 	}
1419 	wqe_cnt = 1 << (rxq_data->elts_n - rxq_data->sges_n);
1420 	if (lwm) {
1421 		if (!priv->sh->devx_channel_lwm) {
1422 			ret = mlx5_lwm_setup(priv);
1423 			if (ret) {
1424 				DRV_LOG(WARNING,
1425 					"Failed to create shared_lwm.");
1426 				rte_errno = ENOMEM;
1427 				ret = -rte_errno;
1428 				goto end;
1429 			}
1430 		}
1431 		if (!rxq->lwm_devx_subscribed) {
1432 			cookie = ((uint32_t)
1433 				  (port_id << LWM_COOKIE_PORTID_OFFSET)) |
1434 				(rx_queue_id << LWM_COOKIE_RXQID_OFFSET);
1435 			ret = mlx5_os_devx_subscribe_devx_event
1436 				(priv->sh->devx_channel_lwm,
1437 				 rxq->devx_rq.rq->obj,
1438 				 sizeof(event_nums),
1439 				 event_nums,
1440 				 cookie);
1441 			if (ret) {
1442 				rte_errno = rte_errno ? rte_errno : EINVAL;
1443 				ret = -rte_errno;
1444 				goto end;
1445 			}
1446 			rxq->lwm_devx_subscribed = 1;
1447 		}
1448 	}
1449 	/* Save LWM to rxq and send modify_rq devx command. */
1450 	rxq->lwm = lwm * wqe_cnt / 100;
1451 	/* Prevent integer division loss when converting the LWM number to a percentage. */
1452 	if (lwm && (lwm * wqe_cnt % 100)) {
1453 		rxq->lwm = ((uint32_t)(rxq->lwm + 1) >= wqe_cnt) ?
1454 			rxq->lwm : (rxq->lwm + 1);
1455 	}
1456 	if (lwm && !rxq->lwm) {
1457 		/* With mprq, wqe_cnt may be < 100. */
1458 		DRV_LOG(WARNING, "Too small LWM configuration.");
1459 		rte_errno = EINVAL;
1460 		ret = -rte_errno;
1461 		goto end;
1462 	}
1463 	ret = mlx5_devx_modify_rq(rxq, MLX5_RXQ_MOD_RDY2RDY);
1464 end:
1465 	pthread_mutex_unlock(&priv->sh->lwm_config_lock);
1466 	return ret;
1467 }
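
/*
 * A minimal application-side sketch tying the LWM pieces above together,
 * assuming a mlx5 port whose Rx queues were created through DevX; port_id,
 * queue 0 and the 70% threshold are illustrative. The callback runs in the
 * interrupt thread and re-arms the threshold after querying which queue fired;
 * real applications may defer that work to a data-path core:
 *
 *	#include <rte_ethdev.h>
 *
 *	static int
 *	avail_thresh_cb(uint16_t port_id, enum rte_eth_event_type event,
 *			void *cb_arg, void *ret_param)
 *	{
 *		uint16_t queue_id = 0;
 *		uint8_t thresh = 0;
 *
 *		(void)event; (void)cb_arg; (void)ret_param;
 *		if (rte_eth_rx_avail_thresh_query(port_id, &queue_id,
 *						  &thresh) > 0)
 *			// Drain the queue, then re-arm the event.
 *			rte_eth_rx_avail_thresh_set(port_id, queue_id, thresh);
 *		return 0;
 *	}
 *
 *	// At init time:
 *	//	rte_eth_dev_callback_register(port_id,
 *	//		RTE_ETH_EVENT_RX_AVAIL_THRESH, avail_thresh_cb, NULL);
 *	//	rte_eth_rx_avail_thresh_set(port_id, 0, 70);
 */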
1468 
1469 /**
1470  * mlx5 access register function to configure the host shaper.
1471  * It calls the API in libmtcr_ul to access QSHR (QoS Shaper Host Register)
1472  * in firmware.
1473  *
1474  * @param dev
1475  *   Pointer to rte_eth_dev.
1476  * @param lwm_triggered
1477  *   Flag to enable/disable lwm_triggered bit in QSHR.
1478  * @param rate
1479  *   Host shaper rate in units of 100 Mbps; 0 disables the shaper.
1480  * @return
1481  *   0 : operation success.
1482  *   Otherwise:
1483  *   - ENOENT - no ibdev interface.
1484  *   - EBUSY  - the register access unit is busy.
1485  *   - EIO    - the register access command encountered an IO error.
1486  */
1487 static int
1488 mlxreg_host_shaper_config(struct rte_eth_dev *dev,
1489 			  bool lwm_triggered, uint8_t rate)
1490 {
1491 #ifdef HAVE_MLX5_MSTFLINT
1492 	struct mlx5_priv *priv = dev->data->dev_private;
1493 	uint32_t data[MLX5_ST_SZ_DW(register_qshr)] = {0};
1494 	int rc, retry_count = 3;
1495 	mfile *mf = NULL;
1496 	int status;
1497 	void *ptr;
1498 
1499 	mf = mopen(priv->sh->ibdev_name);
1500 	if (!mf) {
1501 		DRV_LOG(WARNING, "mopen failed\n");
1502 		rte_errno = ENOENT;
1503 		return -rte_errno;
1504 	}
1505 	MLX5_SET(register_qshr, data, connected_host, 1);
1506 	MLX5_SET(register_qshr, data, fast_response, lwm_triggered ? 1 : 0);
1507 	MLX5_SET(register_qshr, data, local_port, 1);
1508 	ptr = MLX5_ADDR_OF(register_qshr, data, global_config);
1509 	MLX5_SET(ets_global_config_register, ptr, rate_limit_update, 1);
1510 	MLX5_SET(ets_global_config_register, ptr, max_bw_units,
1511 		 rate ? ETS_GLOBAL_CONFIG_BW_UNIT_HUNDREDS_MBPS :
1512 		 ETS_GLOBAL_CONFIG_BW_UNIT_DISABLED);
1513 	MLX5_SET(ets_global_config_register, ptr, max_bw_value, rate);
1514 	do {
1515 		rc = maccess_reg(mf,
1516 				 MLX5_QSHR_REGISTER_ID,
1517 				 MACCESS_REG_METHOD_SET,
1518 				 (u_int32_t *)&data[0],
1519 				 sizeof(data),
1520 				 sizeof(data),
1521 				 sizeof(data),
1522 				 &status);
1523 		if ((rc != ME_ICMD_STATUS_IFC_BUSY &&
1524 		     status != ME_REG_ACCESS_BAD_PARAM) ||
1525 		    !(mf->flags & MDEVS_REM)) {
1526 			break;
1527 		}
1528 		DRV_LOG(WARNING, "%s retry.", __func__);
1529 		usleep(10000);
1530 	} while (retry_count-- > 0);
1531 	mclose(mf);
1532 	rte_errno = (rc == ME_REG_ACCESS_DEV_BUSY) ? EBUSY : EIO;
1533 	return rc ? -rte_errno : 0;
1534 #else
1535 	(void)dev;
1536 	(void)lwm_triggered;
1537 	(void)rate;
1538 	return -1;
1539 #endif
1540 }
1541 
1542 int rte_pmd_mlx5_host_shaper_config(int port_id, uint8_t rate,
1543 				    uint32_t flags)
1544 {
1545 	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
1546 	struct mlx5_priv *priv = dev->data->dev_private;
1547 	bool lwm_triggered =
1548 	     !!(flags & RTE_BIT32(MLX5_HOST_SHAPER_FLAG_AVAIL_THRESH_TRIGGERED));
1549 
1550 	if (!lwm_triggered) {
1551 		priv->sh->host_shaper_rate = rate;
1552 	} else {
1553 		switch (rate) {
1554 		case 0:
1555 		/* Rate 0 means disable lwm_triggered. */
1556 			priv->sh->lwm_triggered = 0;
1557 			break;
1558 		case 1:
1559 		/* Rate 1 means enable lwm_triggered. */
1560 			priv->sh->lwm_triggered = 1;
1561 			break;
1562 		default:
1563 			return -ENOTSUP;
1564 		}
1565 	}
1566 	return mlxreg_host_shaper_config(dev, priv->sh->lwm_triggered,
1567 					 priv->sh->host_shaper_rate);
1568 }
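
/*
 * A minimal usage sketch for the host shaper API above, assuming mstflint
 * support is compiled in (HAVE_MLX5_MSTFLINT); port_id and the rates are
 * illustrative:
 *
 *	#include <rte_bitops.h>
 *	#include <rte_pmd_mlx5.h>
 *
 *	// Arm the "triggered by available-descriptor threshold" mode:
 *	// rate 1 enables the trigger, rate 0 disables it.
 *	rte_pmd_mlx5_host_shaper_config(port_id, 1,
 *		RTE_BIT32(MLX5_HOST_SHAPER_FLAG_AVAIL_THRESH_TRIGGERED));
 *
 *	// Or set an immediate shaper rate of 10 x 100 Mbps = 1 Gbps:
 *	rte_pmd_mlx5_host_shaper_config(port_id, 10, 0);
 */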
1569