xref: /dpdk/drivers/net/mlx5/mlx5_rx.c (revision 8d54b1ec4a8be40975ae6978535bcc1431caad02)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2021 6WIND S.A.
3  * Copyright 2021 Mellanox Technologies, Ltd
4  */
5 
6 #include <stdint.h>
7 #include <string.h>
8 #include <stdlib.h>
9 
10 #include <rte_mbuf.h>
11 #include <rte_mempool.h>
12 #include <rte_prefetch.h>
13 #include <rte_common.h>
14 #include <rte_branch_prediction.h>
15 #include <rte_ether.h>
16 #include <rte_cycles.h>
17 #include <rte_flow.h>
18 
19 #include <mlx5_prm.h>
20 #include <mlx5_common.h>
21 #include <mlx5_common_mr.h>
22 #include <rte_pmd_mlx5.h>
23 
24 #include "mlx5_autoconf.h"
25 #include "mlx5_defs.h"
26 #include "mlx5.h"
27 #include "mlx5_utils.h"
28 #include "mlx5_rxtx.h"
29 #include "mlx5_devx.h"
30 #include "mlx5_rx.h"
31 #ifdef HAVE_MLX5_MSTFLINT
32 #include <mstflint/mtcr.h>
33 #endif
34 
35 
36 static __rte_always_inline uint32_t
37 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
38 		   volatile struct mlx5_mini_cqe8 *mcqe);
39 
40 static __rte_always_inline int
41 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
42 		 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe);
43 
44 static __rte_always_inline uint32_t
45 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
46 
47 static __rte_always_inline void
48 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
49 	       volatile struct mlx5_cqe *cqe,
50 	       volatile struct mlx5_mini_cqe8 *mcqe);
51 
52 static inline void
53 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
54 			volatile struct mlx5_cqe *__rte_restrict cqe,
55 			uint32_t phcsum, uint8_t l4_type);
56 
57 static inline void
58 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
59 		    volatile struct mlx5_cqe *__rte_restrict cqe,
60 		    volatile struct mlx5_mini_cqe8 *mcqe,
61 		    struct mlx5_rxq_data *rxq, uint32_t len);
62 
63 
64 /**
65  * Internal function to compute the number of used descriptors in an RX queue.
66  *
67  * @param rxq
68  *   The Rx queue.
69  *
70  * @return
71  *   The number of used Rx descriptors.
72  */
73 static uint32_t
74 rx_queue_count(struct mlx5_rxq_data *rxq)
75 {
76 	struct rxq_zip *zip = &rxq->zip;
77 	volatile struct mlx5_cqe *cqe;
78 	const unsigned int cqe_n = (1 << rxq->cqe_n);
79 	const unsigned int sges_n = (1 << rxq->sges_n);
80 	const unsigned int elts_n = (1 << rxq->elts_n);
81 	const unsigned int strd_n = RTE_BIT32(rxq->log_strd_num);
82 	const unsigned int cqe_cnt = cqe_n - 1;
83 	unsigned int cq_ci, used;
84 
85 	/* If we are processing a compressed CQE. */
86 	if (zip->ai) {
87 		used = zip->cqe_cnt - zip->ai;
88 		cq_ci = zip->cq_ci;
89 	} else {
90 		used = 0;
91 		cq_ci = rxq->cq_ci;
92 	}
93 	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
94 	while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
95 		int8_t op_own;
96 		unsigned int n;
97 
98 		op_own = cqe->op_own;
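		/*
		 * A compressed CQE aggregates several completions;
		 * its byte_cnt field holds their number.
		 */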
99 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
100 			n = rte_be_to_cpu_32(cqe->byte_cnt);
101 		else
102 			n = 1;
103 		cq_ci += n;
104 		used += n;
105 		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
106 	}
107 	used = RTE_MIN(used * sges_n, elts_n * strd_n);
108 	return used;
109 }
110 
111 /**
112  * DPDK callback to check the status of a Rx descriptor.
113  *
114  * @param rx_queue
115  *   The Rx queue.
116  * @param[in] offset
117  *   The index of the descriptor in the ring.
118  *
119  * @return
120  *   The status of the Rx descriptor.
121  */
122 int
123 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
124 {
125 	struct mlx5_rxq_data *rxq = rx_queue;
126 
127 	if (offset >= (1 << rxq->cqe_n)) {
128 		rte_errno = EINVAL;
129 		return -rte_errno;
130 	}
131 	if (offset < rx_queue_count(rxq))
132 		return RTE_ETH_RX_DESC_DONE;
133 	return RTE_ETH_RX_DESC_AVAIL;
134 }
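
/*
 * Illustrative usage (not part of the driver): applications reach this
 * handler through the generic ethdev API, e.g.:
 *     int st = rte_eth_rx_descriptor_status(port_id, queue_id, offset);
 *     if (st == RTE_ETH_RX_DESC_DONE)
 *             ... a packet is ready at this ring offset ...
 */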
135 
136 /* Get the rxq LWM percentage according to the LWM number. */
137 static uint8_t
138 mlx5_rxq_lwm_to_percentage(struct mlx5_rxq_priv *rxq)
139 {
140 	struct mlx5_rxq_data *rxq_data = &rxq->ctrl->rxq;
141 	uint32_t wqe_cnt = 1 << (rxq_data->elts_n - rxq_data->sges_n);
142 
143 	return rxq->lwm * 100 / wqe_cnt;
144 }
145 
146 /**
147  * DPDK callback to get the RX queue information.
148  *
149  * @param dev
150  *   Pointer to the device structure.
151  *
152  * @param rx_queue_id
153  *   Rx queue identifier.
154  *
155  * @param qinfo
156  *   Pointer to the RX queue information structure.
157  *
158  * @return
159  *   None.
160  */
161 
162 void
163 mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
164 		  struct rte_eth_rxq_info *qinfo)
165 {
166 	struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_ctrl_get(dev, rx_queue_id);
167 	struct mlx5_rxq_data *rxq = mlx5_rxq_data_get(dev, rx_queue_id);
168 	struct mlx5_rxq_priv *rxq_priv = mlx5_rxq_get(dev, rx_queue_id);
169 
170 	if (!rxq)
171 		return;
172 	qinfo->mp = mlx5_rxq_mprq_enabled(rxq) ?
173 					rxq->mprq_mp : rxq->mp;
174 	qinfo->conf.rx_thresh.pthresh = 0;
175 	qinfo->conf.rx_thresh.hthresh = 0;
176 	qinfo->conf.rx_thresh.wthresh = 0;
177 	qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
178 	qinfo->conf.rx_drop_en = 1;
179 	if (rxq_ctrl == NULL || rxq_ctrl->obj == NULL)
180 		qinfo->conf.rx_deferred_start = 0;
181 	else
182 		qinfo->conf.rx_deferred_start = 1;
183 	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
184 	qinfo->scattered_rx = dev->data->scattered_rx;
185 	qinfo->nb_desc = mlx5_rxq_mprq_enabled(rxq) ?
186 		RTE_BIT32(rxq->elts_n) * RTE_BIT32(rxq->log_strd_num) :
187 		RTE_BIT32(rxq->elts_n);
188 	qinfo->avail_thresh = rxq_priv ?
189 		mlx5_rxq_lwm_to_percentage(rxq_priv) : 0;
190 }
191 
192 /**
193  * DPDK callback to get the RX packet burst mode information.
194  *
195  * @param dev
196  *   Pointer to the device structure.
197  *
198  * @param rx_queue_id
199  *   Rx queue identification.
200  *
201  * @param mode
202  *   Pointer to the burst mode information.
203  *
204  * @return
205  *   0 on success, -EINVAL on failure.
206  */
207 int
208 mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
209 		       uint16_t rx_queue_id __rte_unused,
210 		       struct rte_eth_burst_mode *mode)
211 {
212 	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
213 	struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, rx_queue_id);
214 
215 	if (!rxq) {
216 		rte_errno = EINVAL;
217 		return -rte_errno;
218 	}
219 	if (pkt_burst == mlx5_rx_burst) {
220 		snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
221 	} else if (pkt_burst == mlx5_rx_burst_mprq) {
222 		snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ");
223 	} else if (pkt_burst == mlx5_rx_burst_vec) {
224 #if defined RTE_ARCH_X86_64
225 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE");
226 #elif defined RTE_ARCH_ARM64
227 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
228 #elif defined RTE_ARCH_PPC_64
229 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
230 #else
231 		return -EINVAL;
232 #endif
233 	} else if (pkt_burst == mlx5_rx_burst_mprq_vec) {
234 #if defined RTE_ARCH_X86_64
235 		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector SSE");
236 #elif defined RTE_ARCH_ARM64
237 		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon");
238 #elif defined RTE_ARCH_PPC_64
239 		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec");
240 #else
241 		return -EINVAL;
242 #endif
243 	} else {
244 		return -EINVAL;
245 	}
246 	return 0;
247 }
248 
249 /**
250  * DPDK callback to get the number of used descriptors in a RX queue.
251  *
252  * @param rx_queue
253  *   The Rx queue pointer.
254  *
255  * @return
256  *   The number of used Rx descriptors.
257  *   -EINVAL if the queue is invalid.
258  */
259 uint32_t
260 mlx5_rx_queue_count(void *rx_queue)
261 {
262 	struct mlx5_rxq_data *rxq = rx_queue;
263 	struct rte_eth_dev *dev;
264 
265 	if (!rxq) {
266 		rte_errno = EINVAL;
267 		return -rte_errno;
268 	}
269 
270 	dev = &rte_eth_devices[rxq->port_id];
271 
272 	if (dev->rx_pkt_burst == NULL ||
273 	    dev->rx_pkt_burst == rte_eth_pkt_burst_dummy) {
274 		rte_errno = ENOTSUP;
275 		return -rte_errno;
276 	}
277 
278 	return rx_queue_count(rxq);
279 }
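
/*
 * Illustrative usage (not part of the driver): the used descriptor count is
 * normally retrieved through the generic ethdev API, e.g.:
 *     int used = rte_eth_rx_queue_count(port_id, queue_id);
 */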
280 
281 #define CLB_VAL_IDX 0
282 #define CLB_MSK_IDX 1
283 static int
284 mlx5_monitor_callback(const uint64_t value,
285 		const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
286 {
287 	const uint64_t m = opaque[CLB_MSK_IDX];
288 	const uint64_t v = opaque[CLB_VAL_IDX];
289 
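	/*
	 * Abort entering the power-optimized state (-1) when the CQE ownership
	 * bits already match the SW phase, i.e. a new completion has been
	 * written by HW; allow sleeping (0) otherwise.
	 */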
290 	return (value & m) == v ? -1 : 0;
291 }
292 
293 int mlx5_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
294 {
295 	struct mlx5_rxq_data *rxq = rx_queue;
296 	const unsigned int cqe_num = 1 << rxq->cqe_n;
297 	const unsigned int cqe_mask = cqe_num - 1;
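	/* Wrap-around (ownership phase) bit of the current consumer index. */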
298 	const uint16_t idx = rxq->cq_ci & cqe_num;
299 	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
300 
301 	if (unlikely(rxq->cqes == NULL)) {
302 		rte_errno = EINVAL;
303 		return -rte_errno;
304 	}
305 	pmc->addr = &cqe->op_own;
306 	pmc->opaque[CLB_VAL_IDX] = !!idx;
307 	pmc->opaque[CLB_MSK_IDX] = MLX5_CQE_OWNER_MASK;
308 	pmc->fn = mlx5_monitor_callback;
309 	pmc->size = sizeof(uint8_t);
310 	return 0;
311 }
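
/*
 * Illustrative usage (not part of the driver): this handler backs
 * rte_eth_get_monitor_addr(), used for instance by the PMD power management
 * library:
 *     rte_power_ethdev_pmgmt_queue_enable(lcore_id, port_id, queue_id,
 *                                         RTE_POWER_MGMT_TYPE_MONITOR);
 */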
312 
313 /**
314  * Translate RX completion flags to packet type.
315  *
316  * @param[in] rxq
317  *   Pointer to RX queue structure.
318  * @param[in] cqe
319  *   Pointer to CQE.
320  *
321  * @note: fix mlx5_dev_supported_ptypes_get() if any change is made here.
322  *
323  * @return
324  *   Packet type for struct rte_mbuf.
325  */
326 static inline uint32_t
327 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
328 				   volatile struct mlx5_mini_cqe8 *mcqe)
329 {
330 	uint8_t idx;
331 	uint8_t ptype;
332 	uint8_t pinfo = (cqe->pkt_info & 0x3) << 6;
333 
334 	/* Get the L3/L4 header from the mini-CQE in case of L3/L4 format. */
335 	if (mcqe == NULL ||
336 	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
337 		ptype = (cqe->hdr_type_etc & 0xfc00) >> 10;
338 	else
339 		ptype = mcqe->hdr_type >> 2;
340 	/*
341 	 * The index to the array should have:
342 	 * bit[1:0] = l3_hdr_type
343 	 * bit[4:2] = l4_hdr_type
344 	 * bit[5] = ip_frag
345 	 * bit[6] = tunneled
346 	 * bit[7] = outer_l3_type
347 	 */
348 	idx = pinfo | ptype;
349 	return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
350 }
351 
352 /**
353  * Initialize Rx WQ and indexes.
354  *
355  * @param[in] rxq
356  *   Pointer to RX queue structure.
357  */
358 void
359 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
360 {
361 	const unsigned int wqe_n = 1 << rxq->elts_n;
362 	unsigned int i;
363 
364 	for (i = 0; (i != wqe_n); ++i) {
365 		volatile struct mlx5_wqe_data_seg *scat;
366 		uintptr_t addr;
367 		uint32_t byte_count;
368 		uint32_t lkey;
369 
370 		if (mlx5_rxq_mprq_enabled(rxq)) {
371 			struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i];
372 
373 			scat = &((volatile struct mlx5_wqe_mprq *)
374 				rxq->wqes)[i].dseg;
375 			addr = (uintptr_t)mlx5_mprq_buf_addr
376 					(buf, RTE_BIT32(rxq->log_strd_num));
377 			byte_count = RTE_BIT32(rxq->log_strd_sz) *
378 				     RTE_BIT32(rxq->log_strd_num);
379 			lkey = mlx5_rx_addr2mr(rxq, addr);
380 		} else {
381 			struct rte_mbuf *buf = (*rxq->elts)[i];
382 
383 			scat = &((volatile struct mlx5_wqe_data_seg *)
384 					rxq->wqes)[i];
385 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
386 			byte_count = DATA_LEN(buf);
387 			lkey = mlx5_rx_mb2mr(rxq, buf);
388 		}
389 		/* scat->addr must be able to store a pointer. */
390 		MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t));
391 		*scat = (struct mlx5_wqe_data_seg){
392 			.addr = rte_cpu_to_be_64(addr),
393 			.byte_count = rte_cpu_to_be_32(byte_count),
394 			.lkey = lkey,
395 		};
396 	}
397 	rxq->consumed_strd = 0;
398 	rxq->decompressed = 0;
399 	rxq->rq_pi = 0;
400 	rxq->zip = (struct rxq_zip){
401 		.ai = 0,
402 	};
403 	rxq->elts_ci = mlx5_rxq_mprq_enabled(rxq) ?
404 		(wqe_n >> rxq->sges_n) * RTE_BIT32(rxq->log_strd_num) : 0;
405 	/* Update doorbell counter. */
406 	rxq->rq_ci = wqe_n >> rxq->sges_n;
407 	rte_io_wmb();
408 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
409 }
410 
411 /* Must be negative. */
412 #define MLX5_ERROR_CQE_RET (-1)
413 /* Must not be negative. */
414 #define MLX5_RECOVERY_ERROR_RET 0
415 
416 /**
417  * Handle a Rx error.
418  * The function moves the RQ state to reset when the first error CQE is
419  * seen, then the CQ is drained by the caller's burst loop. When the CQ is
420  * empty, the function moves the RQ state to ready and reinitializes the RQ.
421  * Identifying the next CQE and counting errors are the caller's responsibility.
422  *
423  * @param[in] rxq
424  *   Pointer to RX queue structure.
425  * @param[in] vec
426  *   1 when called from a vectorized Rx burst; mbufs need to be prepared for the RQ.
427  *   0 when called from non-vectorized Rx burst.
428  *
429  * @return
430  *   MLX5_RECOVERY_ERROR_RET in case of recovery error, otherwise the CQE status.
431  */
432 int
433 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
434 {
435 	const uint16_t cqe_n = 1 << rxq->cqe_n;
436 	const uint16_t cqe_mask = cqe_n - 1;
437 	const uint16_t wqe_n = 1 << rxq->elts_n;
438 	const uint16_t strd_n = RTE_BIT32(rxq->log_strd_num);
439 	struct mlx5_rxq_ctrl *rxq_ctrl =
440 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
441 	union {
442 		volatile struct mlx5_cqe *cqe;
443 		volatile struct mlx5_err_cqe *err_cqe;
444 	} u = {
445 		.cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask],
446 	};
447 	struct mlx5_mp_arg_queue_state_modify sm;
448 	int ret;
449 
450 	switch (rxq->err_state) {
451 	case MLX5_RXQ_ERR_STATE_NO_ERROR:
452 		rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET;
453 		/* Fall-through */
454 	case MLX5_RXQ_ERR_STATE_NEED_RESET:
455 		sm.is_wq = 1;
456 		sm.queue_id = rxq->idx;
457 		sm.state = IBV_WQS_RESET;
458 		if (mlx5_queue_state_modify(RXQ_DEV(rxq_ctrl), &sm))
459 			return MLX5_RECOVERY_ERROR_RET;
460 		if (rxq_ctrl->dump_file_n <
461 		    RXQ_PORT(rxq_ctrl)->config.max_dump_files_num) {
462 			MKSTR(err_str, "Unexpected CQE error syndrome "
463 			      "0x%02x CQN = %u RQN = %u wqe_counter = %u"
464 			      " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome,
465 			      rxq->cqn, rxq_ctrl->wqn,
466 			      rte_be_to_cpu_16(u.err_cqe->wqe_counter),
467 			      rxq->rq_ci << rxq->sges_n, rxq->cq_ci);
468 			MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u",
469 			      rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc());
470 			mlx5_dump_debug_information(name, NULL, err_str, 0);
471 			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
472 						    (const void *)((uintptr_t)
473 								    rxq->cqes),
474 						    sizeof(*u.cqe) * cqe_n);
475 			mlx5_dump_debug_information(name, "MLX5 Error RQ:",
476 						    (const void *)((uintptr_t)
477 								    rxq->wqes),
478 						    16 * wqe_n);
479 			rxq_ctrl->dump_file_n++;
480 		}
481 		rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY;
482 		/* Fall-through */
483 	case MLX5_RXQ_ERR_STATE_NEED_READY:
484 		ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
485 		if (ret == MLX5_CQE_STATUS_HW_OWN) {
486 			rte_io_wmb();
487 			*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
488 			rte_io_wmb();
489 			/*
490 			 * The RQ consumer index must be zeroed while moving
491 			 * from RESET state to RDY state.
492 			 */
493 			*rxq->rq_db = rte_cpu_to_be_32(0);
494 			rte_io_wmb();
495 			sm.is_wq = 1;
496 			sm.queue_id = rxq->idx;
497 			sm.state = IBV_WQS_RDY;
498 			if (mlx5_queue_state_modify(RXQ_DEV(rxq_ctrl), &sm))
499 				return MLX5_RECOVERY_ERROR_RET;
500 			if (vec) {
501 				const uint32_t elts_n =
502 					mlx5_rxq_mprq_enabled(rxq) ?
503 					wqe_n * strd_n : wqe_n;
504 				const uint32_t e_mask = elts_n - 1;
505 				uint32_t elts_ci =
506 					mlx5_rxq_mprq_enabled(rxq) ?
507 					rxq->elts_ci : rxq->rq_ci;
508 				uint32_t elt_idx;
509 				struct rte_mbuf **elt;
510 				int i;
511 				unsigned int n = elts_n - (elts_ci -
512 							  rxq->rq_pi);
513 
514 				for (i = 0; i < (int)n; ++i) {
515 					elt_idx = (elts_ci + i) & e_mask;
516 					elt = &(*rxq->elts)[elt_idx];
517 					*elt = rte_mbuf_raw_alloc(rxq->mp);
518 					if (!*elt) {
519 						for (i--; i >= 0; --i) {
520 							elt_idx = (elts_ci +
521 								   i) & elts_n;
522 							elt = &(*rxq->elts)
523 								[elt_idx];
524 							rte_pktmbuf_free_seg
525 								(*elt);
526 						}
527 						return MLX5_RECOVERY_ERROR_RET;
528 					}
529 				}
530 				for (i = 0; i < (int)elts_n; ++i) {
531 					elt = &(*rxq->elts)[i];
532 					DATA_LEN(*elt) =
533 						(uint16_t)((*elt)->buf_len -
534 						rte_pktmbuf_headroom(*elt));
535 				}
536 				/* Padding with a fake mbuf for vec Rx. */
537 				for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
538 					(*rxq->elts)[elts_n + i] =
539 								&rxq->fake_mbuf;
540 			}
541 			mlx5_rxq_initialize(rxq);
542 			rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
543 		}
544 		return ret;
545 	default:
546 		return MLX5_RECOVERY_ERROR_RET;
547 	}
548 }
549 
550 /**
551  * Get the size of the next packet for a given CQE. For compressed CQEs, the
552  * consumer index is updated only once all the packets of the current
553  * compressed session have been processed.
554  *
555  * @param rxq
556  *   Pointer to RX queue.
557  * @param cqe
558  *   CQE to process.
559  * @param[out] mcqe
560  *   Store pointer to mini-CQE if compressed. Otherwise, the pointer is not
561  *   written.
562  *
563  * @return
564  *   0 in case of an empty CQE, MLX5_ERROR_CQE_RET in case of an error CQE,
565  *   otherwise the packet size for a regular Rx queue, or the striding byte
566  *   count format in the MPRQ case.
567  */
568 static inline int
569 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
570 		 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe)
571 {
572 	struct rxq_zip *zip = &rxq->zip;
573 	uint16_t cqe_n = cqe_cnt + 1;
574 	int len;
575 	uint16_t idx, end;
576 
577 	do {
578 		len = 0;
579 		/* Process compressed data in the CQE and mini arrays. */
580 		if (zip->ai) {
581 			volatile struct mlx5_mini_cqe8 (*mc)[8] =
582 				(volatile struct mlx5_mini_cqe8 (*)[8])
583 				(uintptr_t)(&(*rxq->cqes)[zip->ca &
584 							  cqe_cnt].pkt_info);
585 			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt &
586 					       rxq->byte_mask);
587 			*mcqe = &(*mc)[zip->ai & 7];
588 			if ((++zip->ai & 7) == 0) {
589 				/* Invalidate consumed CQEs */
590 				idx = zip->ca;
591 				end = zip->na;
592 				while (idx != end) {
593 					(*rxq->cqes)[idx & cqe_cnt].op_own =
594 						MLX5_CQE_INVALIDATE;
595 					++idx;
596 				}
597 				/*
598 				 * Increment consumer index to skip the number
599 				 * of CQEs consumed. Hardware leaves holes in
600 				 * the CQ ring for software use.
601 				 */
602 				zip->ca = zip->na;
603 				zip->na += 8;
604 			}
605 			if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
606 				/* Invalidate the rest */
607 				idx = zip->ca;
608 				end = zip->cq_ci;
609 
610 				while (idx != end) {
611 					(*rxq->cqes)[idx & cqe_cnt].op_own =
612 						MLX5_CQE_INVALIDATE;
613 					++idx;
614 				}
615 				rxq->cq_ci = zip->cq_ci;
616 				zip->ai = 0;
617 			}
618 		/*
619 		 * No compressed data, get next CQE and verify if it is
620 		 * compressed.
621 		 */
622 		} else {
623 			int ret;
624 			int8_t op_own;
625 			uint32_t cq_ci;
626 
627 			ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
628 			if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
629 				if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
630 					     rxq->err_state)) {
631 					ret = mlx5_rx_err_handle(rxq, 0);
632 					if (ret == MLX5_CQE_STATUS_HW_OWN ||
633 					    ret == MLX5_RECOVERY_ERROR_RET)
634 						return MLX5_ERROR_CQE_RET;
635 				} else {
636 					return 0;
637 				}
638 			}
639 			/*
640 			 * Introduce the local variable to have queue cq_ci
641 			 * index in queue structure always consistent with
642 			 * actual CQE boundary (not pointing to the middle
643 			 * of compressed CQE session).
644 			 */
645 			cq_ci = rxq->cq_ci + 1;
646 			op_own = cqe->op_own;
647 			if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
648 				volatile struct mlx5_mini_cqe8 (*mc)[8] =
649 					(volatile struct mlx5_mini_cqe8 (*)[8])
650 					(uintptr_t)(&(*rxq->cqes)
651 						[cq_ci & cqe_cnt].pkt_info);
652 
653 				/* Fix endianness. */
654 				zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
655 				/*
656 				 * Current mini array position is the one
657 				 * returned by check_cqe().
658 				 *
659 				 * If completion comprises several mini arrays,
660 				 * as a special case the second one is located
661 				 * 7 CQEs after the initial CQE instead of 8
662 				 * for subsequent ones.
663 				 */
664 				zip->ca = cq_ci;
665 				zip->na = zip->ca + 7;
666 				/* Compute the next non compressed CQE. */
667 				zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
668 				/* Get packet size to return. */
669 				len = rte_be_to_cpu_32((*mc)[0].byte_cnt &
670 						       rxq->byte_mask);
671 				*mcqe = &(*mc)[0];
672 				zip->ai = 1;
673 				/* Prefetch all to be invalidated */
674 				idx = zip->ca;
675 				end = zip->cq_ci;
676 				while (idx != end) {
677 					rte_prefetch0(&(*rxq->cqes)[(idx) &
678 								    cqe_cnt]);
679 					++idx;
680 				}
681 			} else {
682 				rxq->cq_ci = cq_ci;
683 				len = rte_be_to_cpu_32(cqe->byte_cnt);
684 			}
685 		}
686 		if (unlikely(rxq->err_state)) {
687 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
688 			++rxq->stats.idropped;
689 		} else {
690 			return len;
691 		}
692 	} while (1);
693 }
694 
695 /**
696  * Translate RX completion flags to offload flags.
697  *
698  * @param[in] cqe
699  *   Pointer to CQE.
700  *
701  * @return
702  *   Offload flags (ol_flags) for struct rte_mbuf.
703  */
704 static inline uint32_t
705 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
706 {
707 	uint32_t ol_flags = 0;
708 	uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
709 
710 	ol_flags =
711 		TRANSPOSE(flags,
712 			  MLX5_CQE_RX_L3_HDR_VALID,
713 			  RTE_MBUF_F_RX_IP_CKSUM_GOOD) |
714 		TRANSPOSE(flags,
715 			  MLX5_CQE_RX_L4_HDR_VALID,
716 			  RTE_MBUF_F_RX_L4_CKSUM_GOOD);
717 	return ol_flags;
718 }
719 
720 /**
721  * Fill in mbuf fields from RX completion flags.
722  * Note that pkt->ol_flags should be initialized outside of this function.
723  *
724  * @param rxq
725  *   Pointer to RX queue.
726  * @param pkt
727  *   mbuf to fill.
728  * @param cqe
729  *   CQE to process.
730  * @param rss_hash_res
731  *   Packet RSS Hash result.
732  */
733 static inline void
734 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
735 	       volatile struct mlx5_cqe *cqe,
736 	       volatile struct mlx5_mini_cqe8 *mcqe)
737 {
738 	/* Update packet information. */
739 	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe, mcqe);
740 	pkt->port = unlikely(rxq->shared) ? cqe->user_index_low : rxq->port_id;
741 
742 	if (rxq->rss_hash) {
743 		uint32_t rss_hash_res = 0;
744 
745 		/* If compressed, take hash result from mini-CQE. */
746 		if (mcqe == NULL ||
747 		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)
748 			rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
749 		else
750 			rss_hash_res = rte_be_to_cpu_32(mcqe->rx_hash_result);
751 		if (rss_hash_res) {
752 			pkt->hash.rss = rss_hash_res;
753 			pkt->ol_flags |= RTE_MBUF_F_RX_RSS_HASH;
754 		}
755 	}
756 	if (rxq->mark) {
757 		uint32_t mark = 0;
758 
759 		/* If compressed, take flow tag from mini-CQE. */
760 		if (mcqe == NULL ||
761 		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
762 			mark = cqe->sop_drop_qpn;
763 		else
764 			mark = ((mcqe->byte_cnt_flow & 0xff) << 8) |
765 				(mcqe->flow_tag_high << 16);
766 		if (MLX5_FLOW_MARK_IS_VALID(mark)) {
767 			pkt->ol_flags |= RTE_MBUF_F_RX_FDIR;
768 			if (mark != RTE_BE32(MLX5_FLOW_MARK_DEFAULT)) {
769 				pkt->ol_flags |= RTE_MBUF_F_RX_FDIR_ID;
770 				pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
771 			}
772 		}
773 	}
774 	if (rxq->dynf_meta) {
775 		uint32_t meta = rte_be_to_cpu_32(cqe->flow_table_metadata) &
776 			rxq->flow_meta_port_mask;
777 
778 		if (meta) {
779 			pkt->ol_flags |= rxq->flow_meta_mask;
780 			*RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset,
781 						uint32_t *) = meta;
782 		}
783 	}
784 	if (rxq->csum)
785 		pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
786 	if (rxq->vlan_strip) {
787 		bool vlan_strip;
788 
789 		if (mcqe == NULL ||
790 		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
791 			vlan_strip = cqe->hdr_type_etc &
792 				     RTE_BE16(MLX5_CQE_VLAN_STRIPPED);
793 		else
794 			vlan_strip = mcqe->hdr_type &
795 				     RTE_BE16(MLX5_CQE_VLAN_STRIPPED);
796 		if (vlan_strip) {
797 			pkt->ol_flags |= RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED;
798 			pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
799 		}
800 	}
801 	if (rxq->hw_timestamp) {
802 		uint64_t ts = rte_be_to_cpu_64(cqe->timestamp);
803 
804 		if (rxq->rt_timestamp)
805 			ts = mlx5_txpp_convert_rx_ts(rxq->sh, ts);
806 		mlx5_timestamp_set(pkt, rxq->timestamp_offset, ts);
807 		pkt->ol_flags |= rxq->timestamp_rx_flag;
808 	}
809 }
810 
811 /**
812  * DPDK callback for RX.
813  *
814  * @param dpdk_rxq
815  *   Generic pointer to RX queue structure.
816  * @param[out] pkts
817  *   Array to store received packets.
818  * @param pkts_n
819  *   Maximum number of packets in array.
820  *
821  * @return
822  *   Number of packets successfully received (<= pkts_n).
823  */
824 uint16_t
825 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
826 {
827 	struct mlx5_rxq_data *rxq = dpdk_rxq;
828 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
829 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
830 	const unsigned int sges_n = rxq->sges_n;
831 	struct rte_mbuf *pkt = NULL;
832 	struct rte_mbuf *seg = NULL;
833 	volatile struct mlx5_cqe *cqe =
834 		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
835 	unsigned int i = 0;
836 	unsigned int rq_ci = rxq->rq_ci << sges_n;
837 	int len = 0; /* keep its value across iterations. */
838 
839 	while (pkts_n) {
840 		unsigned int idx = rq_ci & wqe_cnt;
841 		volatile struct mlx5_wqe_data_seg *wqe =
842 			&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
843 		struct rte_mbuf *rep = (*rxq->elts)[idx];
844 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
845 
846 		if (pkt)
847 			NEXT(seg) = rep;
848 		seg = rep;
849 		rte_prefetch0(seg);
850 		rte_prefetch0(cqe);
851 		rte_prefetch0(wqe);
852 		/* Allocate the buf from the same pool. */
853 		rep = rte_mbuf_raw_alloc(seg->pool);
854 		if (unlikely(rep == NULL)) {
855 			++rxq->stats.rx_nombuf;
856 			if (!pkt) {
857 				/*
858 				 * no buffers before we even started,
859 				 * bail out silently.
860 				 */
861 				break;
862 			}
863 			while (pkt != seg) {
864 				MLX5_ASSERT(pkt != (*rxq->elts)[idx]);
865 				rep = NEXT(pkt);
866 				NEXT(pkt) = NULL;
867 				NB_SEGS(pkt) = 1;
868 				rte_mbuf_raw_free(pkt);
869 				pkt = rep;
870 			}
871 			rq_ci >>= sges_n;
872 			++rq_ci;
873 			rq_ci <<= sges_n;
874 			break;
875 		}
876 		if (!pkt) {
877 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
878 			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe);
879 			if (len <= 0) {
880 				rte_mbuf_raw_free(rep);
881 				if (unlikely(len == MLX5_ERROR_CQE_RET))
882 					rq_ci = rxq->rq_ci << sges_n;
883 				break;
884 			}
885 			pkt = seg;
886 			MLX5_ASSERT(len >= (rxq->crc_present << 2));
887 			pkt->ol_flags &= RTE_MBUF_F_EXTERNAL;
888 			rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
889 			if (rxq->crc_present)
890 				len -= RTE_ETHER_CRC_LEN;
891 			PKT_LEN(pkt) = len;
892 			if (cqe->lro_num_seg > 1) {
893 				mlx5_lro_update_hdr
894 					(rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
895 					 mcqe, rxq, len);
896 				pkt->ol_flags |= RTE_MBUF_F_RX_LRO;
897 				pkt->tso_segsz = len / cqe->lro_num_seg;
898 			}
899 		}
900 		DATA_LEN(rep) = DATA_LEN(seg);
901 		PKT_LEN(rep) = PKT_LEN(seg);
902 		SET_DATA_OFF(rep, DATA_OFF(seg));
903 		PORT(rep) = PORT(seg);
904 		(*rxq->elts)[idx] = rep;
905 		/*
906 		 * Fill NIC descriptor with the new buffer. The lkey and size
907 		 * of the buffers are already known, only the buffer address
908 		 * changes.
909 		 */
910 		wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
911 		/* If there's only one MR, no need to replace LKey in WQE. */
912 		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
913 			wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
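		/*
		 * The remaining packet length exceeds this segment:
		 * keep it full and continue with the next SGE.
		 */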
914 		if (len > DATA_LEN(seg)) {
915 			len -= DATA_LEN(seg);
916 			++NB_SEGS(pkt);
917 			++rq_ci;
918 			continue;
919 		}
920 		DATA_LEN(seg) = len;
921 #ifdef MLX5_PMD_SOFT_COUNTERS
922 		/* Increment bytes counter. */
923 		rxq->stats.ibytes += PKT_LEN(pkt);
924 #endif
925 		/* Return packet. */
926 		*(pkts++) = pkt;
927 		pkt = NULL;
928 		--pkts_n;
929 		++i;
930 		/* Align consumer index to the next stride. */
931 		rq_ci >>= sges_n;
932 		++rq_ci;
933 		rq_ci <<= sges_n;
934 	}
935 	if (unlikely(i == 0 && ((rq_ci >> sges_n) == rxq->rq_ci)))
936 		return 0;
937 	/* Update the consumer index. */
938 	rxq->rq_ci = rq_ci >> sges_n;
939 	rte_io_wmb();
940 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
941 	rte_io_wmb();
942 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
943 #ifdef MLX5_PMD_SOFT_COUNTERS
944 	/* Increment packets counter. */
945 	rxq->stats.ipackets += i;
946 #endif
947 	return i;
948 }
949 
950 /**
951  * Update LRO packet TCP header.
952  * The HW LRO feature doesn't update the TCP header after coalescing the
953  * TCP segments but supplies information in the CQE for SW to fill it in.
954  *
955  * @param tcp
956  *   Pointer to the TCP header.
957  * @param cqe
958  *   Pointer to the completion entry.
959  * @param phcsum
960  *   The L3 pseudo-header checksum.
961  */
962 static inline void
963 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
964 			volatile struct mlx5_cqe *__rte_restrict cqe,
965 			uint32_t phcsum, uint8_t l4_type)
966 {
967 	/*
968 	 * The HW calculates only the TCP payload checksum, need to complete
969 	 * the TCP header checksum and the L3 pseudo-header checksum.
970 	 */
971 	uint32_t csum = phcsum + cqe->csum;
972 
973 	if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK ||
974 	    l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) {
975 		tcp->tcp_flags |= RTE_TCP_ACK_FLAG;
976 		tcp->recv_ack = cqe->lro_ack_seq_num;
977 		tcp->rx_win = cqe->lro_tcp_win;
978 	}
979 	if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
980 		tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
981 	tcp->cksum = 0;
982 	csum += rte_raw_cksum(tcp, (tcp->data_off >> 4) * 4);
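	/* Fold the carries and take the one's complement of the 16-bit sum. */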
983 	csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
984 	csum = (~csum) & 0xffff;
985 	if (csum == 0)
986 		csum = 0xffff;
987 	tcp->cksum = csum;
988 }
989 
990 /**
991  * Update LRO packet headers.
992  * The HW LRO feature doesn't update the L3/TCP headers after coalescing the
993  * TCP segments but supplies information in the CQE for SW to fill them in.
994  *
995  * @param padd
996  *   The packet address.
997  * @param cqe
998  *   Pointer to the completion entry.
999  * @param len
1000  *   The packet length.
1001  */
1002 static inline void
1003 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
1004 		    volatile struct mlx5_cqe *__rte_restrict cqe,
1005 		    volatile struct mlx5_mini_cqe8 *mcqe,
1006 		    struct mlx5_rxq_data *rxq, uint32_t len)
1007 {
1008 	union {
1009 		struct rte_ether_hdr *eth;
1010 		struct rte_vlan_hdr *vlan;
1011 		struct rte_ipv4_hdr *ipv4;
1012 		struct rte_ipv6_hdr *ipv6;
1013 		struct rte_tcp_hdr *tcp;
1014 		uint8_t *hdr;
1015 	} h = {
1016 		.hdr = padd,
1017 	};
1018 	uint16_t proto = h.eth->ether_type;
1019 	uint32_t phcsum;
1020 	uint8_t l4_type;
1021 
1022 	h.eth++;
1023 	while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
1024 	       proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) {
1025 		proto = h.vlan->eth_proto;
1026 		h.vlan++;
1027 	}
1028 	if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) {
1029 		h.ipv4->time_to_live = cqe->lro_min_ttl;
1030 		h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd));
1031 		h.ipv4->hdr_checksum = 0;
1032 		h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4);
1033 		phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0);
1034 		h.ipv4++;
1035 	} else {
1036 		h.ipv6->hop_limits = cqe->lro_min_ttl;
1037 		h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) -
1038 						       sizeof(*h.ipv6));
1039 		phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
1040 		h.ipv6++;
1041 	}
1042 	if (mcqe == NULL ||
1043 	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
1044 		l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
1045 			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
1046 	else
1047 		l4_type = (rte_be_to_cpu_16(mcqe->hdr_type) &
1048 			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
1049 	mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum, l4_type);
1050 }
1051 
1052 void
1053 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
1054 {
1055 	mlx5_mprq_buf_free_cb(NULL, buf);
1056 }
1057 
1058 /**
1059  * DPDK callback for RX with Multi-Packet RQ support.
1060  *
1061  * @param dpdk_rxq
1062  *   Generic pointer to RX queue structure.
1063  * @param[out] pkts
1064  *   Array to store received packets.
1065  * @param pkts_n
1066  *   Maximum number of packets in array.
1067  *
1068  * @return
1069  *   Number of packets successfully received (<= pkts_n).
1070  */
1071 uint16_t
1072 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1073 {
1074 	struct mlx5_rxq_data *rxq = dpdk_rxq;
1075 	const uint32_t strd_n = RTE_BIT32(rxq->log_strd_num);
1076 	const uint32_t strd_sz = RTE_BIT32(rxq->log_strd_sz);
1077 	const uint32_t cq_mask = (1 << rxq->cqe_n) - 1;
1078 	const uint32_t wq_mask = (1 << rxq->elts_n) - 1;
1079 	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
1080 	unsigned int i = 0;
1081 	uint32_t rq_ci = rxq->rq_ci;
1082 	uint16_t consumed_strd = rxq->consumed_strd;
1083 	struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
1084 
1085 	while (i < pkts_n) {
1086 		struct rte_mbuf *pkt;
1087 		int ret;
1088 		uint32_t len;
1089 		uint16_t strd_cnt;
1090 		uint16_t strd_idx;
1091 		uint32_t byte_cnt;
1092 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
1093 		enum mlx5_rqx_code rxq_code;
1094 
1095 		if (consumed_strd == strd_n) {
1096 			/* Replace WQE if the buffer is still in use. */
1097 			mprq_buf_replace(rxq, rq_ci & wq_mask);
1098 			/* Advance to the next WQE. */
1099 			consumed_strd = 0;
1100 			++rq_ci;
1101 			buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
1102 		}
1103 		cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
1104 		ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe);
1105 		if (ret == 0)
1106 			break;
1107 		if (unlikely(ret == MLX5_ERROR_CQE_RET)) {
1108 			rq_ci = rxq->rq_ci;
1109 			consumed_strd = rxq->consumed_strd;
1110 			break;
1111 		}
1112 		byte_cnt = ret;
1113 		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
1114 		MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
1115 		if (rxq->crc_present)
1116 			len -= RTE_ETHER_CRC_LEN;
1117 		if (mcqe &&
1118 		    rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
1119 			strd_cnt = (len / strd_sz) + !!(len % strd_sz);
1120 		else
1121 			strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
1122 				   MLX5_MPRQ_STRIDE_NUM_SHIFT;
1123 		MLX5_ASSERT(strd_cnt);
1124 		consumed_strd += strd_cnt;
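		/*
		 * A filler CQE only consumes strides (e.g. the unused tail of
		 * a buffer) and carries no packet data.
		 */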
1125 		if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
1126 			continue;
1127 		strd_idx = rte_be_to_cpu_16(mcqe == NULL ?
1128 					cqe->wqe_counter :
1129 					mcqe->stride_idx);
1130 		MLX5_ASSERT(strd_idx < strd_n);
1131 		MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) &
1132 			    wq_mask));
1133 		pkt = rte_pktmbuf_alloc(rxq->mp);
1134 		if (unlikely(pkt == NULL)) {
1135 			++rxq->stats.rx_nombuf;
1136 			break;
1137 		}
1138 		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
1139 		MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
1140 		if (rxq->crc_present)
1141 			len -= RTE_ETHER_CRC_LEN;
1142 		rxq_code = mprq_buf_to_pkt(rxq, pkt, len, buf,
1143 					   strd_idx, strd_cnt);
1144 		if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) {
1145 			rte_pktmbuf_free_seg(pkt);
1146 			if (rxq_code == MLX5_RXQ_CODE_DROPPED) {
1147 				++rxq->stats.idropped;
1148 				continue;
1149 			}
1150 			if (rxq_code == MLX5_RXQ_CODE_NOMBUF) {
1151 				++rxq->stats.rx_nombuf;
1152 				break;
1153 			}
1154 		}
1155 		rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
1156 		if (cqe->lro_num_seg > 1) {
1157 			mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *),
1158 					    cqe, mcqe, rxq, len);
1159 			pkt->ol_flags |= RTE_MBUF_F_RX_LRO;
1160 			pkt->tso_segsz = len / cqe->lro_num_seg;
1161 		}
1162 		PKT_LEN(pkt) = len;
1163 		PORT(pkt) = rxq->port_id;
1164 #ifdef MLX5_PMD_SOFT_COUNTERS
1165 		/* Increment bytes counter. */
1166 		rxq->stats.ibytes += PKT_LEN(pkt);
1167 #endif
1168 		/* Return packet. */
1169 		*(pkts++) = pkt;
1170 		++i;
1171 	}
1172 	/* Update the consumer indexes. */
1173 	rxq->consumed_strd = consumed_strd;
1174 	rte_io_wmb();
1175 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1176 	if (rq_ci != rxq->rq_ci) {
1177 		rxq->rq_ci = rq_ci;
1178 		rte_io_wmb();
1179 		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1180 	}
1181 #ifdef MLX5_PMD_SOFT_COUNTERS
1182 	/* Increment packets counter. */
1183 	rxq->stats.ipackets += i;
1184 #endif
1185 	return i;
1186 }
1187 
1188 /*
1189  * Vectorized Rx routines are not compiled in when required vector instructions
1190  * are not supported on a target architecture.
1191  * The following null stubs are needed for linkage when the vectorized routines
1192  * are not compiled in from their own files (e.g. mlx5_rxtx_vec_sse.c for x86).
1193  */
1194 
1195 __rte_weak uint16_t
1196 mlx5_rx_burst_vec(void *dpdk_rxq __rte_unused,
1197 		  struct rte_mbuf **pkts __rte_unused,
1198 		  uint16_t pkts_n __rte_unused)
1199 {
1200 	return 0;
1201 }
1202 
1203 __rte_weak uint16_t
1204 mlx5_rx_burst_mprq_vec(void *dpdk_rxq __rte_unused,
1205 		       struct rte_mbuf **pkts __rte_unused,
1206 		       uint16_t pkts_n __rte_unused)
1207 {
1208 	return 0;
1209 }
1210 
1211 __rte_weak int
1212 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
1213 {
1214 	return -ENOTSUP;
1215 }
1216 
1217 __rte_weak int
1218 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
1219 {
1220 	return -ENOTSUP;
1221 }
1222 
1223 int
1224 mlx5_rx_queue_lwm_query(struct rte_eth_dev *dev,
1225 			uint16_t *queue_id, uint8_t *lwm)
1226 {
1227 	struct mlx5_priv *priv = dev->data->dev_private;
1228 	unsigned int rxq_id, found = 0, n;
1229 	struct mlx5_rxq_priv *rxq;
1230 
1231 	if (!queue_id)
1232 		return -EINVAL;
1233 	/* Query all the Rx queues of the port in a circular way. */
1234 	for (rxq_id = *queue_id, n = 0; n < priv->rxqs_n; n++) {
1235 		rxq = mlx5_rxq_get(dev, rxq_id);
1236 		if (rxq && rxq->lwm_event_pending) {
1237 			pthread_mutex_lock(&priv->sh->lwm_config_lock);
1238 			rxq->lwm_event_pending = 0;
1239 			pthread_mutex_unlock(&priv->sh->lwm_config_lock);
1240 			*queue_id = rxq_id;
1241 			found = 1;
1242 			if (lwm)
1243 				*lwm =  mlx5_rxq_lwm_to_percentage(rxq);
1244 			break;
1245 		}
1246 		rxq_id = (rxq_id + 1) % priv->rxqs_n;
1247 	}
1248 	return found;
1249 }
1250 
1251 /**
1252  * RTE interrupt handler for the LWM event.
1253  * It first checks whether the event has arrived and, if so, processes the
1254  * callback for RTE_ETH_EVENT_RX_AVAIL_THRESH.
1255  *
1256  * @param args
1257  *   Generic pointer to mlx5_priv.
1258  */
1259 void
1260 mlx5_dev_interrupt_handler_lwm(void *args)
1261 {
1262 	struct mlx5_priv *priv = args;
1263 	struct mlx5_rxq_priv *rxq;
1264 	struct rte_eth_dev *dev;
1265 	int ret, rxq_idx = 0, port_id = 0;
1266 
1267 	ret = priv->obj_ops.rxq_event_get_lwm(priv, &rxq_idx, &port_id);
1268 	if (unlikely(ret < 0)) {
1269 		DRV_LOG(WARNING, "Cannot get LWM event context.");
1270 		return;
1271 	}
1272 	DRV_LOG(INFO, "%s get LWM event, port_id:%d rxq_id:%d.", __func__,
1273 		port_id, rxq_idx);
1274 	dev = &rte_eth_devices[port_id];
1275 	rxq = mlx5_rxq_get(dev, rxq_idx);
1276 	if (rxq) {
1277 		pthread_mutex_lock(&priv->sh->lwm_config_lock);
1278 		rxq->lwm_event_pending = 1;
1279 		pthread_mutex_unlock(&priv->sh->lwm_config_lock);
1280 	}
1281 	rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_RX_AVAIL_THRESH, NULL);
1282 }
1283 
1284 /**
1285  * DPDK callback to arm an Rx queue LWM (limit watermark) event.
1286  * When the Rx queue fullness reaches the LWM limit, the driver catches
1287  * an HW event and invokes the user event callback.
1288  * After handling the last event, the user needs to call this API again
1289  * to arm an additional event.
1290  *
1291  * @param dev
1292  *   Pointer to the device structure.
1293  * @param[in] rx_queue_id
1294  *   Rx queue identifier.
1295  * @param[in] lwm
1296  *   The LWM value, defined as a percentage of the Rx queue size.
1297  *   [1-99] to set a new LWM (update the old value).
1298  *   0 to unarm the event.
1299  *
1300  * @return
1301  *   0 : operation success.
1302  *   Otherwise:
1303  *   - ENOMEM - not enough memory to create LWM event channel.
1304  *   - EINVAL - the input Rxq is not created by devx.
1305  *   - E2BIG  - lwm is bigger than 99.
1306  */
1307 int
1308 mlx5_rx_queue_lwm_set(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1309 		      uint8_t lwm)
1310 {
1311 	struct mlx5_priv *priv = dev->data->dev_private;
1312 	uint16_t port_id = PORT_ID(priv);
1313 	struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, rx_queue_id);
1314 	uint16_t event_nums[1] = {MLX5_EVENT_TYPE_SRQ_LIMIT_REACHED};
1315 	struct mlx5_rxq_data *rxq_data;
1316 	uint32_t wqe_cnt;
1317 	uint64_t cookie;
1318 	int ret = 0;
1319 
1320 	if (!rxq) {
1321 		rte_errno = EINVAL;
1322 		return -rte_errno;
1323 	}
1324 	rxq_data = &rxq->ctrl->rxq;
1325 	/* Ensure the Rq is created by devx. */
1326 	if (priv->obj_ops.rxq_obj_new != devx_obj_ops.rxq_obj_new) {
1327 		rte_errno = EINVAL;
1328 		return -rte_errno;
1329 	}
1330 	if (lwm > 99) {
1331 		DRV_LOG(WARNING, "Too big LWM configuration.");
1332 		rte_errno = E2BIG;
1333 		return -rte_errno;
1334 	}
1335 	/* Start config LWM. */
1336 	pthread_mutex_lock(&priv->sh->lwm_config_lock);
1337 	if (rxq->lwm == 0 && lwm == 0) {
1338 		/* Both old/new values are 0, do nothing. */
1339 		ret = 0;
1340 		goto end;
1341 	}
1342 	wqe_cnt = 1 << (rxq_data->elts_n - rxq_data->sges_n);
1343 	if (lwm) {
1344 		if (!priv->sh->devx_channel_lwm) {
1345 			ret = mlx5_lwm_setup(priv);
1346 			if (ret) {
1347 				DRV_LOG(WARNING,
1348 					"Failed to create shared_lwm.");
1349 				rte_errno = ENOMEM;
1350 				ret = -rte_errno;
1351 				goto end;
1352 			}
1353 		}
1354 		if (!rxq->lwm_devx_subscribed) {
1355 			cookie = ((uint32_t)
1356 				  (port_id << LWM_COOKIE_PORTID_OFFSET)) |
1357 				(rx_queue_id << LWM_COOKIE_RXQID_OFFSET);
1358 			ret = mlx5_os_devx_subscribe_devx_event
1359 				(priv->sh->devx_channel_lwm,
1360 				 rxq->devx_rq.rq->obj,
1361 				 sizeof(event_nums),
1362 				 event_nums,
1363 				 cookie);
1364 			if (ret) {
1365 				rte_errno = rte_errno ? rte_errno : EINVAL;
1366 				ret = -rte_errno;
1367 				goto end;
1368 			}
1369 			rxq->lwm_devx_subscribed = 1;
1370 		}
1371 	}
1372 	/* Save LWM to rxq and send modify_rq devx command. */
1373 	rxq->lwm = lwm * wqe_cnt / 100;
1374 	/* Prevent integer division loss when switching the LWM number to a percentage. */
1375 	if (lwm && (lwm * wqe_cnt % 100)) {
1376 		rxq->lwm = ((uint32_t)(rxq->lwm + 1) >= wqe_cnt) ?
1377 			rxq->lwm : (rxq->lwm + 1);
1378 	}
1379 	if (lwm && !rxq->lwm) {
1380 		/* With mprq, wqe_cnt may be < 100. */
1381 		DRV_LOG(WARNING, "Too small LWM configuration.");
1382 		rte_errno = EINVAL;
1383 		ret = -rte_errno;
1384 		goto end;
1385 	}
1386 	ret = mlx5_devx_modify_rq(rxq, MLX5_RXQ_MOD_RDY2RDY);
1387 end:
1388 	pthread_mutex_unlock(&priv->sh->lwm_config_lock);
1389 	return ret;
1390 }
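
/*
 * Illustrative usage (not part of the driver): this callback backs the generic
 * availability-threshold API, e.g.:
 *     rte_eth_rx_avail_thresh_set(port_id, queue_id, 70);
 * together with a handler registered for RTE_ETH_EVENT_RX_AVAIL_THRESH and
 * rte_eth_rx_avail_thresh_query() to find the signalling queue.
 */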
1391 
1392 /**
1393  * Mlx5 access register function to configure the host shaper.
1394  * It calls an API in libmtcr_ul to access QSHR (QoS Shaper Host Register)
1395  * in firmware.
1396  *
1397  * @param dev
1398  *   Pointer to rte_eth_dev.
1399  * @param lwm_triggered
1400  *   Flag to enable/disable lwm_triggered bit in QSHR.
1401  * @param rate
1402  *   Host shaper rate in units of 100 Mbps; setting it to 0 disables the shaper.
1403  * @return
1404  *   0 : operation success.
1405  *   Otherwise:
1406  *   - ENOENT - no ibdev interface.
1407  *   - EBUSY  - the register access unit is busy.
1408  *   - EIO    - the register access command encountered an IO error.
1409  */
1410 static int
1411 mlxreg_host_shaper_config(struct rte_eth_dev *dev,
1412 			  bool lwm_triggered, uint8_t rate)
1413 {
1414 #ifdef HAVE_MLX5_MSTFLINT
1415 	struct mlx5_priv *priv = dev->data->dev_private;
1416 	uint32_t data[MLX5_ST_SZ_DW(register_qshr)] = {0};
1417 	int rc, retry_count = 3;
1418 	mfile *mf = NULL;
1419 	int status;
1420 	void *ptr;
1421 
1422 	mf = mopen(priv->sh->ibdev_name);
1423 	if (!mf) {
1424 		DRV_LOG(WARNING, "mopen failed\n");
1425 		rte_errno = ENOENT;
1426 		return -rte_errno;
1427 	}
1428 	MLX5_SET(register_qshr, data, connected_host, 1);
1429 	MLX5_SET(register_qshr, data, fast_response, lwm_triggered ? 1 : 0);
1430 	MLX5_SET(register_qshr, data, local_port, 1);
1431 	ptr = MLX5_ADDR_OF(register_qshr, data, global_config);
1432 	MLX5_SET(ets_global_config_register, ptr, rate_limit_update, 1);
1433 	MLX5_SET(ets_global_config_register, ptr, max_bw_units,
1434 		 rate ? ETS_GLOBAL_CONFIG_BW_UNIT_HUNDREDS_MBPS :
1435 		 ETS_GLOBAL_CONFIG_BW_UNIT_DISABLED);
1436 	MLX5_SET(ets_global_config_register, ptr, max_bw_value, rate);
1437 	do {
1438 		rc = maccess_reg(mf,
1439 				 MLX5_QSHR_REGISTER_ID,
1440 				 MACCESS_REG_METHOD_SET,
1441 				 (u_int32_t *)&data[0],
1442 				 sizeof(data),
1443 				 sizeof(data),
1444 				 sizeof(data),
1445 				 &status);
1446 		if ((rc != ME_ICMD_STATUS_IFC_BUSY &&
1447 		     status != ME_REG_ACCESS_BAD_PARAM) ||
1448 		    !(mf->flags & MDEVS_REM)) {
1449 			break;
1450 		}
1451 		DRV_LOG(WARNING, "%s retry.", __func__);
1452 		usleep(10000);
1453 	} while (retry_count-- > 0);
1454 	mclose(mf);
1455 	rte_errno = (rc == ME_REG_ACCESS_DEV_BUSY) ? EBUSY : EIO;
1456 	return rc ? -rte_errno : 0;
1457 #else
1458 	(void)dev;
1459 	(void)lwm_triggered;
1460 	(void)rate;
1461 	return -1;
1462 #endif
1463 }
1464 
1465 int rte_pmd_mlx5_host_shaper_config(int port_id, uint8_t rate,
1466 				    uint32_t flags)
1467 {
1468 	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
1469 	struct mlx5_priv *priv = dev->data->dev_private;
1470 	bool lwm_triggered =
1471 	     !!(flags & RTE_BIT32(MLX5_HOST_SHAPER_FLAG_AVAIL_THRESH_TRIGGERED));
1472 
1473 	if (!lwm_triggered) {
1474 		priv->sh->host_shaper_rate = rate;
1475 	} else {
1476 		switch (rate) {
1477 		case 0:
1478 		/* Rate 0 means disable lwm_triggered. */
1479 			priv->sh->lwm_triggered = 0;
1480 			break;
1481 		case 1:
1482 		/* Rate 1 means enable lwm_triggered. */
1483 			priv->sh->lwm_triggered = 1;
1484 			break;
1485 		default:
1486 			return -ENOTSUP;
1487 		}
1488 	}
1489 	return mlxreg_host_shaper_config(dev, priv->sh->lwm_triggered,
1490 					 priv->sh->host_shaper_rate);
1491 }
1492
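/*
 * Illustrative usage (not part of the driver): set an immediate rate, or arm
 * the avail-thresh-triggered mode, through the PMD-specific API, e.g.:
 *     rte_pmd_mlx5_host_shaper_config(port_id, rate_in_100mbps_units, 0);
 *     rte_pmd_mlx5_host_shaper_config(port_id, 1,
 *         RTE_BIT32(MLX5_HOST_SHAPER_FLAG_AVAIL_THRESH_TRIGGERED));
 */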