xref: /dpdk/drivers/net/mlx5/mlx5_rxtx.c (revision 4e30ead5e7ca886535e2b30632b2948d2aac1681)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <assert.h>
35 #include <stdint.h>
36 #include <string.h>
37 #include <stdlib.h>
38 
39 /* Verbs header. */
40 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
41 #ifdef PEDANTIC
42 #pragma GCC diagnostic ignored "-Wpedantic"
43 #endif
44 #include <infiniband/verbs.h>
45 #include <infiniband/mlx5_hw.h>
46 #include <infiniband/arch.h>
47 #ifdef PEDANTIC
48 #pragma GCC diagnostic error "-Wpedantic"
49 #endif
50 
51 /* DPDK headers don't like -pedantic. */
52 #ifdef PEDANTIC
53 #pragma GCC diagnostic ignored "-Wpedantic"
54 #endif
55 #include <rte_mbuf.h>
56 #include <rte_mempool.h>
57 #include <rte_prefetch.h>
58 #include <rte_common.h>
59 #include <rte_branch_prediction.h>
60 #include <rte_ether.h>
61 #ifdef PEDANTIC
62 #pragma GCC diagnostic error "-Wpedantic"
63 #endif
64 
65 #include "mlx5.h"
66 #include "mlx5_utils.h"
67 #include "mlx5_rxtx.h"
68 #include "mlx5_autoconf.h"
69 #include "mlx5_defs.h"
70 #include "mlx5_prm.h"
71 
72 static inline int
73 check_cqe(volatile struct mlx5_cqe *cqe,
74 	  unsigned int cqes_n, const uint16_t ci)
75 	  __attribute__((always_inline));
76 
77 static inline void
78 txq_complete(struct txq *txq) __attribute__((always_inline));
79 
80 static inline uint32_t
81 txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
82 	__attribute__((always_inline));
83 
84 static inline void
85 mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
86 	__attribute__((always_inline));
87 
88 static inline uint32_t
89 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
90 	__attribute__((always_inline));
91 
92 static inline int
93 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
94 		 uint16_t cqe_cnt, uint32_t *rss_hash)
95 		 __attribute__((always_inline));
96 
97 static inline uint32_t
98 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
99 		   __attribute__((always_inline));
100 
101 #ifndef NDEBUG
102 
103 /**
104  * Verify or set magic value in CQE.
105  *
106  * @param cqe
107  *   Pointer to CQE.
108  *
109  * @return
110  *   0 the first time.
111  */
112 static inline int
113 check_cqe_seen(volatile struct mlx5_cqe *cqe)
114 {
115 	static const uint8_t magic[] = "seen";
116 	volatile uint8_t (*buf)[sizeof(cqe->rsvd0)] = &cqe->rsvd0;
117 	int ret = 1;
118 	unsigned int i;
119 
120 	for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
121 		if (!ret || (*buf)[i] != magic[i]) {
122 			ret = 0;
123 			(*buf)[i] = magic[i];
124 		}
125 	return ret;
126 }
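
/*
 * The magic string is written into the reserved bytes of the CQE the first
 * time an unexpected condition is reported, so the error messages in
 * check_cqe() and txq_complete() are emitted only once per CQE instead of
 * on every polling pass.
 */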
127 
128 #endif /* NDEBUG */
129 
130 /**
131  * Check whether CQE is valid.
132  *
133  * @param cqe
134  *   Pointer to CQE.
135  * @param cqes_n
136  *   Size of completion queue.
137  * @param ci
138  *   Consumer index.
139  *
140  * @return
141  *   0 on success, 1 on failure.
142  */
143 static inline int
144 check_cqe(volatile struct mlx5_cqe *cqe,
145 	  unsigned int cqes_n, const uint16_t ci)
146 {
147 	uint16_t idx = ci & cqes_n;
148 	uint8_t op_own = cqe->op_own;
149 	uint8_t op_owner = MLX5_CQE_OWNER(op_own);
150 	uint8_t op_code = MLX5_CQE_OPCODE(op_own);
151 
152 	if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
153 		return 1; /* No CQE. */
154 #ifndef NDEBUG
155 	if ((op_code == MLX5_CQE_RESP_ERR) ||
156 	    (op_code == MLX5_CQE_REQ_ERR)) {
157 		volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
158 		uint8_t syndrome = err_cqe->syndrome;
159 
160 		if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
161 		    (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
162 			return 0;
163 		if (!check_cqe_seen(cqe))
164 			ERROR("unexpected CQE error %u (0x%02x)"
165 			      " syndrome 0x%02x",
166 			      op_code, op_code, syndrome);
167 		return 1;
168 	} else if ((op_code != MLX5_CQE_RESP_SEND) &&
169 		   (op_code != MLX5_CQE_REQ)) {
170 		if (!check_cqe_seen(cqe))
171 			ERROR("unexpected CQE opcode %u (0x%02x)",
172 			      op_code, op_code);
173 		return 1;
174 	}
175 #endif /* NDEBUG */
176 	return 0;
177 }
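
/*
 * Ownership check above: cqes_n is a power of two, so (ci & cqes_n) isolates
 * the wrap bit of the consumer index.  The CQE is valid only when the owner
 * bit written by the hardware matches that parity, e.g. with cqes_n == 64
 * and ci == 70 the slot at index 6 must carry an owner bit of 1.
 */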
178 
179 /**
180  * Return the address of the WQE.
181  *
182  * @param txq
183  *   Pointer to TX queue structure.
184  * @param ci
185  *   WQE consumer index.
186  *
187  * @return
188  *   WQE address.
189  */
190 static inline uintptr_t *
191 tx_mlx5_wqe(struct txq *txq, uint16_t ci)
192 {
193 	ci &= ((1 << txq->wqe_n) - 1);
194 	return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
195 }
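
/*
 * txq->wqe_n is the log2 of the number of WQE entries: the index is wrapped
 * with ((1 << wqe_n) - 1) and turned into a byte offset from the ring base
 * by multiplying with MLX5_WQE_SIZE.
 */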
196 
197 /**
198  * Return the size of tailroom of WQ.
199  *
200  * @param txq
201  *   Pointer to TX queue structure.
202  * @param addr
203  *   Pointer to tail of WQ.
204  *
205  * @return
206  *   Size of tailroom.
207  */
208 static inline size_t
209 tx_mlx5_wq_tailroom(struct txq *txq, void *addr)
210 {
211 	size_t tailroom;
212 	tailroom = (uintptr_t)(txq->wqes) +
213 		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
214 		   (uintptr_t)addr;
215 	return tailroom;
216 }
217 
218 /**
219  * Copy data to tailroom of circular queue.
220  *
221  * @param dst
222  *   Pointer to destination.
223  * @param src
224  *   Pointer to source.
225  * @param n
226  *   Number of bytes to copy.
227  * @param base
228  *   Pointer to head of queue.
229  * @param tailroom
230  *   Size of tailroom from dst.
231  *
232  * @return
233  *   Pointer after copied data.
234  */
235 static inline void *
236 mlx5_copy_to_wq(void *dst, const void *src, size_t n,
237 		void *base, size_t tailroom)
238 {
239 	void *ret;
240 
241 	if (n > tailroom) {
242 		rte_memcpy(dst, src, tailroom);
243 		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
244 			   n - tailroom);
245 		ret = (uint8_t *)base + n - tailroom;
246 	} else {
247 		rte_memcpy(dst, src, n);
248 		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
249 	}
250 	return ret;
251 }
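
/*
 * When the copy would run past the end of the ring (n > tailroom), it is
 * split: the first tailroom bytes go to dst and the remainder wraps to base.
 * The returned pointer is the next write position and is already wrapped to
 * base whenever the copy ends exactly on the ring boundary.
 */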
252 
253 /**
254  * Manage TX completions.
255  *
256  * When sending a burst, mlx5_tx_burst() posts several WRs.
257  *
258  * @param txq
259  *   Pointer to TX queue structure.
260  */
261 static inline void
262 txq_complete(struct txq *txq)
263 {
264 	const unsigned int elts_n = 1 << txq->elts_n;
265 	const unsigned int cqe_n = 1 << txq->cqe_n;
266 	const unsigned int cqe_cnt = cqe_n - 1;
267 	uint16_t elts_free = txq->elts_tail;
268 	uint16_t elts_tail;
269 	uint16_t cq_ci = txq->cq_ci;
270 	volatile struct mlx5_cqe *cqe = NULL;
271 	volatile struct mlx5_wqe_ctrl *ctrl;
272 
273 	do {
274 		volatile struct mlx5_cqe *tmp;
275 
276 		tmp = &(*txq->cqes)[cq_ci & cqe_cnt];
277 		if (check_cqe(tmp, cqe_n, cq_ci))
278 			break;
279 		cqe = tmp;
280 #ifndef NDEBUG
281 		if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
282 			if (!check_cqe_seen(cqe))
283 				ERROR("unexpected compressed CQE, TX stopped");
284 			return;
285 		}
286 		if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
287 		    (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
288 			if (!check_cqe_seen(cqe))
289 				ERROR("unexpected error CQE, TX stopped");
290 			return;
291 		}
292 #endif /* NDEBUG */
293 		++cq_ci;
294 	} while (1);
295 	if (unlikely(cqe == NULL))
296 		return;
297 	txq->wqe_pi = ntohs(cqe->wqe_counter);
298 	ctrl = (volatile struct mlx5_wqe_ctrl *)
299 		tx_mlx5_wqe(txq, txq->wqe_pi);
300 	elts_tail = ctrl->ctrl3;
301 	assert(elts_tail < (1 << txq->wqe_n));
302 	/* Free buffers. */
303 	while (elts_free != elts_tail) {
304 		struct rte_mbuf *elt = (*txq->elts)[elts_free];
305 		unsigned int elts_free_next =
306 			(elts_free + 1) & (elts_n - 1);
307 		struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];
308 
309 #ifndef NDEBUG
310 		/* Poisoning. */
311 		memset(&(*txq->elts)[elts_free],
312 		       0x66,
313 		       sizeof((*txq->elts)[elts_free]));
314 #endif
315 		RTE_MBUF_PREFETCH_TO_FREE(elt_next);
316 		/* Only one segment needs to be freed. */
317 		rte_pktmbuf_free_seg(elt);
318 		elts_free = elts_free_next;
319 	}
320 	txq->cq_ci = cq_ci;
321 	txq->elts_tail = elts_tail;
322 	/* Update the consumer index. */
323 	rte_wmb();
324 	*txq->cq_db = htonl(cq_ci);
325 }
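
/*
 * The CQE's wqe_counter points at the last completed WQE, whose ctrl3 field
 * holds the elts_head value saved when the completion was requested; every
 * element between the previous elts_tail and that value can therefore be
 * freed here.
 */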
326 
327 /**
328  * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
329  * the cloned mbuf is allocated is returned instead.
330  *
331  * @param buf
332  *   Pointer to mbuf.
333  *
334  * @return
335  *   Memory pool where data is located for given mbuf.
336  */
337 static struct rte_mempool *
338 txq_mb2mp(struct rte_mbuf *buf)
339 {
340 	if (unlikely(RTE_MBUF_INDIRECT(buf)))
341 		return rte_mbuf_from_indirect(buf)->pool;
342 	return buf->pool;
343 }
344 
345 /**
346  * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
347  * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
348  * remove an entry first.
349  *
350  * @param txq
351  *   Pointer to TX queue structure.
352  * @param[in] mp
353  *   Memory Pool for which a Memory Region lkey must be returned.
354  *
355  * @return
356  *   mr->lkey on success, (uint32_t)-1 on failure.
357  */
358 static inline uint32_t
359 txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
360 {
361 	unsigned int i;
362 	uint32_t lkey = (uint32_t)-1;
363 
364 	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
365 		if (unlikely(txq->mp2mr[i].mp == NULL)) {
366 			/* Unknown MP, add a new MR for it. */
367 			break;
368 		}
369 		if (txq->mp2mr[i].mp == mp) {
370 			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
371 			assert(htonl(txq->mp2mr[i].mr->lkey) ==
372 			       txq->mp2mr[i].lkey);
373 			lkey = txq->mp2mr[i].lkey;
374 			break;
375 		}
376 	}
377 	if (unlikely(lkey == (uint32_t)-1))
378 		lkey = txq_mp2mr_reg(txq, mp, i);
379 	return lkey;
380 }
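
/*
 * txq->mp2mr[] is a small linear cache of mempool-to-Memory-Region
 * translations; a miss (unknown mempool or exhausted table) falls through to
 * txq_mp2mr_reg(), which registers the mempool and returns the new lkey.
 */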
381 
382 /**
383  * Ring TX queue doorbell.
384  *
385  * @param txq
386  *   Pointer to TX queue structure.
387  * @param wqe
388  *   Pointer to the last WQE posted in the NIC.
389  */
390 static inline void
391 mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
392 {
393 	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
394 	volatile uint64_t *src = ((volatile uint64_t *)wqe);
395 
396 	rte_wmb();
397 	*txq->qp_db = htonl(txq->wqe_ci);
398 	/* Ensure ordering between DB record and BF copy. */
399 	rte_wmb();
400 	*dst = *src;
401 }
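
/*
 * Two barriers are required: the first makes the WQE contents visible before
 * the doorbell record is updated, the second orders the doorbell record
 * update against the 64-bit copy to the BlueFlame register (txq->bf_reg)
 * that actually notifies the hardware.
 */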
402 
403 /**
404  * DPDK callback to check the status of a tx descriptor.
405  *
406  * @param tx_queue
407  *   The tx queue.
408  * @param[in] offset
409  *   The index of the descriptor in the ring.
410  *
411  * @return
412  *   The status of the tx descriptor.
413  */
414 int
415 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
416 {
417 	struct txq *txq = tx_queue;
418 	const unsigned int elts_n = 1 << txq->elts_n;
419 	const unsigned int elts_cnt = elts_n - 1;
420 	unsigned int used;
421 
422 	txq_complete(txq);
423 	used = (txq->elts_head - txq->elts_tail) & elts_cnt;
424 	if (offset < used)
425 		return RTE_ETH_TX_DESC_FULL;
426 	return RTE_ETH_TX_DESC_DONE;
427 }
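
/*
 * txq_complete() is called first so that elts_tail is up to date; offsets
 * that still fall between elts_tail and elts_head are reported as FULL,
 * anything already released as DONE.
 */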
428 
429 /**
430  * DPDK callback to check the status of a rx descriptor.
431  *
432  * @param rx_queue
433  *   The rx queue.
434  * @param[in] offset
435  *   The index of the descriptor in the ring.
436  *
437  * @return
438  *   The status of the rx descriptor.
439  */
440 int
441 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
442 {
443 	struct rxq *rxq = rx_queue;
444 	struct rxq_zip *zip = &rxq->zip;
445 	volatile struct mlx5_cqe *cqe;
446 	const unsigned int cqe_n = (1 << rxq->cqe_n);
447 	const unsigned int cqe_cnt = cqe_n - 1;
448 	unsigned int cq_ci;
449 	unsigned int used;
450 
451 	/* If we are processing a compressed CQE. */
452 	if (zip->ai) {
453 		used = zip->cqe_cnt - zip->ca;
454 		cq_ci = zip->cq_ci;
455 	} else {
456 		used = 0;
457 		cq_ci = rxq->cq_ci;
458 	}
459 	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
460 	while (check_cqe(cqe, cqe_n, cq_ci) == 0) {
461 		int8_t op_own;
462 		unsigned int n;
463 
464 		op_own = cqe->op_own;
465 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
466 			n = ntohl(cqe->byte_cnt);
467 		else
468 			n = 1;
469 		cq_ci += n;
470 		used += n;
471 		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
472 	}
473 	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
474 	if (offset < used)
475 		return RTE_ETH_RX_DESC_DONE;
476 	return RTE_ETH_RX_DESC_AVAIL;
477 }
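
/*
 * If a compressed CQE is being processed (zip->ai != 0), counting starts
 * from the decompression state, otherwise from the current consumer index.
 * Each further compressed CQE then accounts for byte_cnt packets, a regular
 * CQE for a single one.
 */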
478 
479 /**
480  * DPDK callback for TX.
481  *
482  * @param dpdk_txq
483  *   Generic pointer to TX queue structure.
484  * @param[in] pkts
485  *   Packets to transmit.
486  * @param pkts_n
487  *   Number of packets in array.
488  *
489  * @return
490  *   Number of packets successfully transmitted (<= pkts_n).
491  */
492 uint16_t
493 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
494 {
495 	struct txq *txq = (struct txq *)dpdk_txq;
496 	uint16_t elts_head = txq->elts_head;
497 	const unsigned int elts_n = 1 << txq->elts_n;
498 	unsigned int i = 0;
499 	unsigned int j = 0;
500 	unsigned int k = 0;
501 	unsigned int max;
502 	unsigned int max_inline = txq->max_inline;
503 	const unsigned int inline_en = !!max_inline && txq->inline_en;
504 	uint16_t max_wqe;
505 	unsigned int comp;
506 	volatile struct mlx5_wqe_v *wqe = NULL;
507 	unsigned int segs_n = 0;
508 	struct rte_mbuf *buf = NULL;
509 	uint8_t *raw;
510 
511 	if (unlikely(!pkts_n))
512 		return 0;
513 	/* Prefetch first packet cacheline. */
514 	rte_prefetch0(*pkts);
515 	/* Start processing. */
516 	txq_complete(txq);
517 	max = (elts_n - (elts_head - txq->elts_tail));
518 	if (max > elts_n)
519 		max -= elts_n;
520 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
521 	if (unlikely(!max_wqe))
522 		return 0;
523 	do {
524 		volatile rte_v128u32_t *dseg = NULL;
525 		uint32_t length;
526 		unsigned int ds = 0;
527 		uintptr_t addr;
528 		uint64_t naddr;
529 		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
530 		uint16_t tso_header_sz = 0;
531 		uint16_t ehdr;
532 		uint8_t cs_flags = 0;
533 		uint64_t tso = 0;
534 #ifdef MLX5_PMD_SOFT_COUNTERS
535 		uint32_t total_length = 0;
536 #endif
537 
538 		/* first_seg */
539 		buf = *(pkts++);
540 		segs_n = buf->nb_segs;
541 		/*
542 		 * Make sure there is enough room to store this packet and
543 		 * that one ring entry remains unused.
544 		 */
545 		assert(segs_n);
546 		if (max < segs_n + 1)
547 			break;
548 		max -= segs_n;
549 		--segs_n;
550 		if (!segs_n)
551 			--pkts_n;
552 		if (unlikely(--max_wqe == 0))
553 			break;
554 		wqe = (volatile struct mlx5_wqe_v *)
555 			tx_mlx5_wqe(txq, txq->wqe_ci);
556 		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
557 		if (pkts_n > 1)
558 			rte_prefetch0(*pkts);
559 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
560 		length = DATA_LEN(buf);
561 		ehdr = (((uint8_t *)addr)[1] << 8) |
562 		       ((uint8_t *)addr)[0];
563 #ifdef MLX5_PMD_SOFT_COUNTERS
564 		total_length = length;
565 #endif
566 		if (length < (MLX5_WQE_DWORD_SIZE + 2))
567 			break;
568 		/* Update element. */
569 		(*txq->elts)[elts_head] = buf;
570 		elts_head = (elts_head + 1) & (elts_n - 1);
571 		/* Prefetch next buffer data. */
572 		if (pkts_n > 1) {
573 			volatile void *pkt_addr;
574 
575 			pkt_addr = rte_pktmbuf_mtod(*pkts, volatile void *);
576 			rte_prefetch0(pkt_addr);
577 		}
578 		/* Should we enable HW CKSUM offload? */
579 		if (buf->ol_flags &
580 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
581 			const uint64_t is_tunneled = buf->ol_flags &
582 						     (PKT_TX_TUNNEL_GRE |
583 						      PKT_TX_TUNNEL_VXLAN);
584 
585 			if (is_tunneled && txq->tunnel_en) {
586 				cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
587 					   MLX5_ETH_WQE_L4_INNER_CSUM;
588 				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
589 					cs_flags |= MLX5_ETH_WQE_L3_CSUM;
590 			} else {
591 				cs_flags = MLX5_ETH_WQE_L3_CSUM |
592 					   MLX5_ETH_WQE_L4_CSUM;
593 			}
594 		}
595 		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
596 		/* Replace the Ethernet type by the VLAN if necessary. */
597 		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
598 			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
599 			unsigned int len = 2 * ETHER_ADDR_LEN - 2;
600 
601 			addr += 2;
602 			length -= 2;
603 			/* Copy Destination and source mac address. */
604 			memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
605 			/* Copy VLAN. */
606 			memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
607 			/* Copy missing two bytes to end the DSeg. */
608 			memcpy((uint8_t *)raw + len + sizeof(vlan),
609 			       ((uint8_t *)addr) + len, 2);
610 			addr += len + 2;
611 			length -= (len + 2);
612 		} else {
613 			memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
614 			       MLX5_WQE_DWORD_SIZE);
615 			length -= pkt_inline_sz;
616 			addr += pkt_inline_sz;
617 		}
618 		if (txq->tso_en) {
619 			tso = buf->ol_flags & PKT_TX_TCP_SEG;
620 			if (tso) {
621 				uintptr_t end = (uintptr_t)
622 						(((uintptr_t)txq->wqes) +
623 						(1 << txq->wqe_n) *
624 						MLX5_WQE_SIZE);
625 				unsigned int copy_b;
626 				uint8_t vlan_sz = (buf->ol_flags &
627 						  PKT_TX_VLAN_PKT) ? 4 : 0;
628 				const uint64_t is_tunneled =
629 							buf->ol_flags &
630 							(PKT_TX_TUNNEL_GRE |
631 							 PKT_TX_TUNNEL_VXLAN);
632 
633 				tso_header_sz = buf->l2_len + vlan_sz +
634 						buf->l3_len + buf->l4_len;
635 
636 				if (is_tunneled	&& txq->tunnel_en) {
637 					tso_header_sz += buf->outer_l2_len +
638 							 buf->outer_l3_len;
639 					cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM;
640 				} else {
641 					cs_flags |= MLX5_ETH_WQE_L4_CSUM;
642 				}
643 				if (unlikely(tso_header_sz >
644 					     MLX5_MAX_TSO_HEADER))
645 					break;
646 				copy_b = tso_header_sz - pkt_inline_sz;
647 				/* First seg must contain all headers. */
648 				assert(copy_b <= length);
649 				raw += MLX5_WQE_DWORD_SIZE;
650 				if (copy_b &&
651 				   ((end - (uintptr_t)raw) > copy_b)) {
652 					uint16_t n = (MLX5_WQE_DS(copy_b) -
653 						      1 + 3) / 4;
654 
655 					if (unlikely(max_wqe < n))
656 						break;
657 					max_wqe -= n;
658 					rte_memcpy((void *)raw,
659 						   (void *)addr, copy_b);
660 					addr += copy_b;
661 					length -= copy_b;
662 					pkt_inline_sz += copy_b;
663 					/*
664 					 * Another DWORD will be added
665 					 * in the inline part.
666 					 */
667 					raw += MLX5_WQE_DS(copy_b) *
668 					       MLX5_WQE_DWORD_SIZE -
669 					       MLX5_WQE_DWORD_SIZE;
670 				} else {
671 					/* NOP WQE. */
672 					wqe->ctrl = (rte_v128u32_t){
673 						     htonl(txq->wqe_ci << 8),
674 						     htonl(txq->qp_num_8s | 1),
675 						     0,
676 						     0,
677 					};
678 					ds = 1;
679 					total_length = 0;
680 					pkts--;
681 					pkts_n++;
682 					elts_head = (elts_head - 1) &
683 						    (elts_n - 1);
684 					k++;
685 					goto next_wqe;
686 				}
687 			}
688 		}
689 		/* Inline if enough room. */
690 		if (inline_en || tso) {
691 			uintptr_t end = (uintptr_t)
692 				(((uintptr_t)txq->wqes) +
693 				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
694 			unsigned int inline_room = max_inline *
695 						   RTE_CACHE_LINE_SIZE -
696 						   (pkt_inline_sz - 2);
697 			uintptr_t addr_end = (addr + inline_room) &
698 					     ~(RTE_CACHE_LINE_SIZE - 1);
699 			unsigned int copy_b = (addr_end > addr) ?
700 				RTE_MIN((addr_end - addr), length) :
701 				0;
702 
703 			raw += MLX5_WQE_DWORD_SIZE;
704 			if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
705 				/*
706 				 * One Dseg remains in the current WQE.  To
707 				 * keep the computation positive, it is
708 				 * removed after the bytes to Dseg conversion.
709 				 */
710 				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
711 
712 				if (unlikely(max_wqe < n))
713 					break;
714 				max_wqe -= n;
715 				if (tso) {
716 					uint32_t inl =
717 						htonl(copy_b | MLX5_INLINE_SEG);
718 
719 					pkt_inline_sz =
720 						MLX5_WQE_DS(tso_header_sz) *
721 						MLX5_WQE_DWORD_SIZE;
722 					rte_memcpy((void *)raw,
723 						   (void *)&inl, sizeof(inl));
724 					raw += sizeof(inl);
725 					pkt_inline_sz += sizeof(inl);
726 				}
727 				rte_memcpy((void *)raw, (void *)addr, copy_b);
728 				addr += copy_b;
729 				length -= copy_b;
730 				pkt_inline_sz += copy_b;
731 			}
732 			/*
733 			 * 2 DWORDs consumed by the WQE header + ETH segment +
734 			 * the size of the inline part of the packet.
735 			 */
736 			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
737 			if (length > 0) {
738 				if (ds % (MLX5_WQE_SIZE /
739 					  MLX5_WQE_DWORD_SIZE) == 0) {
740 					if (unlikely(--max_wqe == 0))
741 						break;
742 					dseg = (volatile rte_v128u32_t *)
743 					       tx_mlx5_wqe(txq, txq->wqe_ci +
744 							   ds / 4);
745 				} else {
746 					dseg = (volatile rte_v128u32_t *)
747 						((uintptr_t)wqe +
748 						 (ds * MLX5_WQE_DWORD_SIZE));
749 				}
750 				goto use_dseg;
751 			} else if (!segs_n) {
752 				goto next_pkt;
753 			} else {
754 				/* dseg will be advanced as part of next_seg. */
755 				dseg = (volatile rte_v128u32_t *)
756 					((uintptr_t)wqe +
757 					 ((ds - 1) * MLX5_WQE_DWORD_SIZE));
758 				goto next_seg;
759 			}
760 		} else {
761 			/*
762 			 * No inline has been done in the packet, only the
763 			 * Ethernet header has been stored.
764 			 */
765 			dseg = (volatile rte_v128u32_t *)
766 				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
767 			ds = 3;
768 use_dseg:
769 			/* Add the remaining packet as a simple ds. */
770 			naddr = htonll(addr);
771 			*dseg = (rte_v128u32_t){
772 				htonl(length),
773 				txq_mp2mr(txq, txq_mb2mp(buf)),
774 				naddr,
775 				naddr >> 32,
776 			};
777 			++ds;
778 			if (!segs_n)
779 				goto next_pkt;
780 		}
781 next_seg:
782 		assert(buf);
783 		assert(ds);
784 		assert(wqe);
785 		/*
786 		 * Spill on next WQE when the current one does not have
787 		 * enough room left. Size of WQE must be a multiple
788 		 * of data segment size.
789 		 */
790 		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
791 		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
792 			if (unlikely(--max_wqe == 0))
793 				break;
794 			dseg = (volatile rte_v128u32_t *)
795 			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
796 			rte_prefetch0(tx_mlx5_wqe(txq,
797 						  txq->wqe_ci + ds / 4 + 1));
798 		} else {
799 			++dseg;
800 		}
801 		++ds;
802 		buf = buf->next;
803 		assert(buf);
804 		length = DATA_LEN(buf);
805 #ifdef MLX5_PMD_SOFT_COUNTERS
806 		total_length += length;
807 #endif
808 		/* Store segment information. */
809 		naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
810 		*dseg = (rte_v128u32_t){
811 			htonl(length),
812 			txq_mp2mr(txq, txq_mb2mp(buf)),
813 			naddr,
814 			naddr >> 32,
815 		};
816 		(*txq->elts)[elts_head] = buf;
817 		elts_head = (elts_head + 1) & (elts_n - 1);
818 		++j;
819 		--segs_n;
820 		if (segs_n)
821 			goto next_seg;
822 		else
823 			--pkts_n;
824 next_pkt:
825 		++i;
826 		/* Initialize known and common part of the WQE structure. */
827 		if (tso) {
828 			wqe->ctrl = (rte_v128u32_t){
829 				htonl((txq->wqe_ci << 8) | MLX5_OPCODE_TSO),
830 				htonl(txq->qp_num_8s | ds),
831 				0,
832 				0,
833 			};
834 			wqe->eseg = (rte_v128u32_t){
835 				0,
836 				cs_flags | (htons(buf->tso_segsz) << 16),
837 				0,
838 				(ehdr << 16) | htons(tso_header_sz),
839 			};
840 		} else {
841 			wqe->ctrl = (rte_v128u32_t){
842 				htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
843 				htonl(txq->qp_num_8s | ds),
844 				0,
845 				0,
846 			};
847 			wqe->eseg = (rte_v128u32_t){
848 				0,
849 				cs_flags,
850 				0,
851 				(ehdr << 16) | htons(pkt_inline_sz),
852 			};
853 		}
854 next_wqe:
855 		txq->wqe_ci += (ds + 3) / 4;
856 #ifdef MLX5_PMD_SOFT_COUNTERS
857 		/* Increment sent bytes counter. */
858 		txq->stats.obytes += total_length;
859 #endif
860 	} while (pkts_n);
861 	/* Take a shortcut if nothing must be sent. */
862 	if (unlikely((i + k) == 0))
863 		return 0;
864 	/* Check whether completion threshold has been reached. */
865 	comp = txq->elts_comp + i + j + k;
866 	if (comp >= MLX5_TX_COMP_THRESH) {
867 		volatile struct mlx5_wqe_ctrl *w =
868 			(volatile struct mlx5_wqe_ctrl *)wqe;
869 
870 		/* Request completion on last WQE. */
871 		w->ctrl2 = htonl(8);
872 		/* Save elts_head in unused "immediate" field of WQE. */
873 		w->ctrl3 = elts_head;
874 		txq->elts_comp = 0;
875 	} else {
876 		txq->elts_comp = comp;
877 	}
878 #ifdef MLX5_PMD_SOFT_COUNTERS
879 	/* Increment sent packets counter. */
880 	txq->stats.opackets += i;
881 #endif
882 	/* Ring QP doorbell. */
883 	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)wqe);
884 	txq->elts_head = elts_head;
885 	return i;
886 }
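
/*
 * Each packet above is encoded as one WQE: a control segment, an Ethernet
 * segment carrying at least the first two bytes of the frame (hence
 * pkt_inline_sz starting at MLX5_WQE_DWORD_SIZE + 2), an optional inlined
 * TSO header or payload, and data segments for whatever is not inlined.
 * "ds" counts 16-byte units, is stored in ctrl[1] next to qp_num_8s, and
 * advances wqe_ci by (ds + 3) / 4 WQE entries.
 */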
887 
888 /**
889  * Open a MPW session.
890  *
891  * @param txq
892  *   Pointer to TX queue structure.
893  * @param mpw
894  *   Pointer to MPW session structure.
895  * @param length
896  *   Packet length.
897  */
898 static inline void
899 mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
900 {
901 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
902 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
903 		(volatile struct mlx5_wqe_data_seg (*)[])
904 		tx_mlx5_wqe(txq, idx + 1);
905 
906 	mpw->state = MLX5_MPW_STATE_OPENED;
907 	mpw->pkts_n = 0;
908 	mpw->len = length;
909 	mpw->total_len = 0;
910 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
911 	mpw->wqe->eseg.mss = htons(length);
912 	mpw->wqe->eseg.inline_hdr_sz = 0;
913 	mpw->wqe->eseg.rsvd0 = 0;
914 	mpw->wqe->eseg.rsvd1 = 0;
915 	mpw->wqe->eseg.rsvd2 = 0;
916 	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
917 				  (txq->wqe_ci << 8) | MLX5_OPCODE_TSO);
918 	mpw->wqe->ctrl[2] = 0;
919 	mpw->wqe->ctrl[3] = 0;
920 	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
921 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
922 	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
923 		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
924 	mpw->data.dseg[2] = &(*dseg)[0];
925 	mpw->data.dseg[3] = &(*dseg)[1];
926 	mpw->data.dseg[4] = &(*dseg)[2];
927 }
928 
929 /**
930  * Close a MPW session.
931  *
932  * @param txq
933  *   Pointer to TX queue structure.
934  * @param mpw
935  *   Pointer to MPW session structure.
936  */
937 static inline void
938 mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
939 {
940 	unsigned int num = mpw->pkts_n;
941 
942 	/*
943 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
944 	 * count as 2.
945 	 */
946 	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
947 	mpw->state = MLX5_MPW_STATE_CLOSED;
948 	if (num < 3)
949 		++txq->wqe_ci;
950 	else
951 		txq->wqe_ci += 2;
952 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
953 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
954 }
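
/*
 * A legacy MPW spans at most two consecutive WQEs: dseg[0] and dseg[1] sit
 * right after the control and Ethernet segments of the first one, while
 * dseg[2] to dseg[4] occupy the following WQE.  This is why mlx5_mpw_close()
 * advances wqe_ci by one when fewer than three packets were queued and by
 * two otherwise.
 */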
955 
956 /**
957  * DPDK callback for TX with MPW support.
958  *
959  * @param dpdk_txq
960  *   Generic pointer to TX queue structure.
961  * @param[in] pkts
962  *   Packets to transmit.
963  * @param pkts_n
964  *   Number of packets in array.
965  *
966  * @return
967  *   Number of packets successfully transmitted (<= pkts_n).
968  */
969 uint16_t
970 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
971 {
972 	struct txq *txq = (struct txq *)dpdk_txq;
973 	uint16_t elts_head = txq->elts_head;
974 	const unsigned int elts_n = 1 << txq->elts_n;
975 	unsigned int i = 0;
976 	unsigned int j = 0;
977 	unsigned int max;
978 	uint16_t max_wqe;
979 	unsigned int comp;
980 	struct mlx5_mpw mpw = {
981 		.state = MLX5_MPW_STATE_CLOSED,
982 	};
983 
984 	if (unlikely(!pkts_n))
985 		return 0;
986 	/* Prefetch first packet cacheline. */
987 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
988 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
989 	/* Start processing. */
990 	txq_complete(txq);
991 	max = (elts_n - (elts_head - txq->elts_tail));
992 	if (max > elts_n)
993 		max -= elts_n;
994 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
995 	if (unlikely(!max_wqe))
996 		return 0;
997 	do {
998 		struct rte_mbuf *buf = *(pkts++);
999 		unsigned int elts_head_next;
1000 		uint32_t length;
1001 		unsigned int segs_n = buf->nb_segs;
1002 		uint32_t cs_flags = 0;
1003 
1004 		/*
1005 		 * Make sure there is enough room to store this packet and
1006 		 * that one ring entry remains unused.
1007 		 */
1008 		assert(segs_n);
1009 		if (max < segs_n + 1)
1010 			break;
1011 		/* Do not bother with large packets MPW cannot handle. */
1012 		if (segs_n > MLX5_MPW_DSEG_MAX)
1013 			break;
1014 		max -= segs_n;
1015 		--pkts_n;
1016 		/* Should we enable HW CKSUM offload? */
1017 		if (buf->ol_flags &
1018 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
1019 			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
1020 		/* Retrieve packet information. */
1021 		length = PKT_LEN(buf);
1022 		assert(length);
1023 		/* Start new session if packet differs. */
1024 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
1025 		    ((mpw.len != length) ||
1026 		     (segs_n != 1) ||
1027 		     (mpw.wqe->eseg.cs_flags != cs_flags)))
1028 			mlx5_mpw_close(txq, &mpw);
1029 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1030 			/*
1031 			 * Multi-Packet WQE consumes at most two WQE.
1032 			 * mlx5_mpw_new() expects to be able to use such
1033 			 * resources.
1034 			 */
1035 			if (unlikely(max_wqe < 2))
1036 				break;
1037 			max_wqe -= 2;
1038 			mlx5_mpw_new(txq, &mpw, length);
1039 			mpw.wqe->eseg.cs_flags = cs_flags;
1040 		}
1041 		/* Multi-segment packets must be alone in their MPW. */
1042 		assert((segs_n == 1) || (mpw.pkts_n == 0));
1043 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1044 		length = 0;
1045 #endif
1046 		do {
1047 			volatile struct mlx5_wqe_data_seg *dseg;
1048 			uintptr_t addr;
1049 
1050 			elts_head_next = (elts_head + 1) & (elts_n - 1);
1051 			assert(buf);
1052 			(*txq->elts)[elts_head] = buf;
1053 			dseg = mpw.data.dseg[mpw.pkts_n];
1054 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1055 			*dseg = (struct mlx5_wqe_data_seg){
1056 				.byte_count = htonl(DATA_LEN(buf)),
1057 				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
1058 				.addr = htonll(addr),
1059 			};
1060 			elts_head = elts_head_next;
1061 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1062 			length += DATA_LEN(buf);
1063 #endif
1064 			buf = buf->next;
1065 			++mpw.pkts_n;
1066 			++j;
1067 		} while (--segs_n);
1068 		assert(length == mpw.len);
1069 		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1070 			mlx5_mpw_close(txq, &mpw);
1071 		elts_head = elts_head_next;
1072 #ifdef MLX5_PMD_SOFT_COUNTERS
1073 		/* Increment sent bytes counter. */
1074 		txq->stats.obytes += length;
1075 #endif
1076 		++i;
1077 	} while (pkts_n);
1078 	/* Take a shortcut if nothing must be sent. */
1079 	if (unlikely(i == 0))
1080 		return 0;
1081 	/* Check whether completion threshold has been reached. */
1082 	/* "j" includes both packets and segments. */
1083 	comp = txq->elts_comp + j;
1084 	if (comp >= MLX5_TX_COMP_THRESH) {
1085 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1086 
1087 		/* Request completion on last WQE. */
1088 		wqe->ctrl[2] = htonl(8);
1089 		/* Save elts_head in unused "immediate" field of WQE. */
1090 		wqe->ctrl[3] = elts_head;
1091 		txq->elts_comp = 0;
1092 	} else {
1093 		txq->elts_comp = comp;
1094 	}
1095 #ifdef MLX5_PMD_SOFT_COUNTERS
1096 	/* Increment sent packets counter. */
1097 	txq->stats.opackets += i;
1098 #endif
1099 	/* Ring QP doorbell. */
1100 	if (mpw.state == MLX5_MPW_STATE_OPENED)
1101 		mlx5_mpw_close(txq, &mpw);
1102 	mlx5_tx_dbrec(txq, mpw.wqe);
1103 	txq->elts_head = elts_head;
1104 	return i;
1105 }
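
/*
 * Packets may share an MPW session only when they are single-segment and
 * have the same length and checksum flags, since the session programs a
 * single mss/cs_flags pair for all of them; any difference closes the
 * current session before a new one is opened.
 */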
1106 
1107 /**
1108  * Open a MPW inline session.
1109  *
1110  * @param txq
1111  *   Pointer to TX queue structure.
1112  * @param mpw
1113  *   Pointer to MPW session structure.
1114  * @param length
1115  *   Packet length.
1116  */
1117 static inline void
1118 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
1119 {
1120 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1121 	struct mlx5_wqe_inl_small *inl;
1122 
1123 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
1124 	mpw->pkts_n = 0;
1125 	mpw->len = length;
1126 	mpw->total_len = 0;
1127 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1128 	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
1129 				  (txq->wqe_ci << 8) |
1130 				  MLX5_OPCODE_TSO);
1131 	mpw->wqe->ctrl[2] = 0;
1132 	mpw->wqe->ctrl[3] = 0;
1133 	mpw->wqe->eseg.mss = htons(length);
1134 	mpw->wqe->eseg.inline_hdr_sz = 0;
1135 	mpw->wqe->eseg.cs_flags = 0;
1136 	mpw->wqe->eseg.rsvd0 = 0;
1137 	mpw->wqe->eseg.rsvd1 = 0;
1138 	mpw->wqe->eseg.rsvd2 = 0;
1139 	inl = (struct mlx5_wqe_inl_small *)
1140 		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
1141 	mpw->data.raw = (uint8_t *)&inl->raw;
1142 }
1143 
1144 /**
1145  * Close a MPW inline session.
1146  *
1147  * @param txq
1148  *   Pointer to TX queue structure.
1149  * @param mpw
1150  *   Pointer to MPW session structure.
1151  */
1152 static inline void
1153 mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
1154 {
1155 	unsigned int size;
1156 	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
1157 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
1158 
1159 	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
1160 	/*
1161 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
1162 	 * count as 2.
1163 	 */
1164 	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
1165 	mpw->state = MLX5_MPW_STATE_CLOSED;
1166 	inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
1167 	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1168 }
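
/*
 * For inline sessions the WQE size is only known at close time: total_len
 * bytes of packet data follow the inline header, so the DS count, the inline
 * byte count and the number of consumed WQEs are all filled in here rather
 * than in mlx5_mpw_inline_new().
 */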
1169 
1170 /**
1171  * DPDK callback for TX with MPW inline support.
1172  *
1173  * @param dpdk_txq
1174  *   Generic pointer to TX queue structure.
1175  * @param[in] pkts
1176  *   Packets to transmit.
1177  * @param pkts_n
1178  *   Number of packets in array.
1179  *
1180  * @return
1181  *   Number of packets successfully transmitted (<= pkts_n).
1182  */
1183 uint16_t
1184 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
1185 			 uint16_t pkts_n)
1186 {
1187 	struct txq *txq = (struct txq *)dpdk_txq;
1188 	uint16_t elts_head = txq->elts_head;
1189 	const unsigned int elts_n = 1 << txq->elts_n;
1190 	unsigned int i = 0;
1191 	unsigned int j = 0;
1192 	unsigned int max;
1193 	uint16_t max_wqe;
1194 	unsigned int comp;
1195 	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
1196 	struct mlx5_mpw mpw = {
1197 		.state = MLX5_MPW_STATE_CLOSED,
1198 	};
1199 	/*
1200 	 * Compute the maximum number of WQEs that can be consumed by inline
1201 	 * code.
1202 	 * - 2 DSEG for:
1203 	 *   - 1 control segment,
1204 	 *   - 1 Ethernet segment,
1205 	 * - N Dseg from the inline request.
1206 	 */
1207 	const unsigned int wqe_inl_n =
1208 		((2 * MLX5_WQE_DWORD_SIZE +
1209 		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
1210 		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
1211 
1212 	if (unlikely(!pkts_n))
1213 		return 0;
1214 	/* Prefetch first packet cacheline. */
1215 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1216 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1217 	/* Start processing. */
1218 	txq_complete(txq);
1219 	max = (elts_n - (elts_head - txq->elts_tail));
1220 	if (max > elts_n)
1221 		max -= elts_n;
1222 	do {
1223 		struct rte_mbuf *buf = *(pkts++);
1224 		unsigned int elts_head_next;
1225 		uintptr_t addr;
1226 		uint32_t length;
1227 		unsigned int segs_n = buf->nb_segs;
1228 		uint32_t cs_flags = 0;
1229 
1230 		/*
1231 		 * Make sure there is enough room to store this packet and
1232 		 * that one ring entry remains unused.
1233 		 */
1234 		assert(segs_n);
1235 		if (max < segs_n + 1)
1236 			break;
1237 		/* Do not bother with large packets MPW cannot handle. */
1238 		if (segs_n > MLX5_MPW_DSEG_MAX)
1239 			break;
1240 		max -= segs_n;
1241 		--pkts_n;
1242 		/*
1243 		 * Compute max_wqe in case fewer WQEs were consumed in the
1244 		 * previous iteration.
1245 		 */
1246 		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1247 		/* Should we enable HW CKSUM offload? */
1248 		if (buf->ol_flags &
1249 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
1250 			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
1251 		/* Retrieve packet information. */
1252 		length = PKT_LEN(buf);
1253 		/* Start new session if packet differs. */
1254 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1255 			if ((mpw.len != length) ||
1256 			    (segs_n != 1) ||
1257 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1258 				mlx5_mpw_close(txq, &mpw);
1259 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
1260 			if ((mpw.len != length) ||
1261 			    (segs_n != 1) ||
1262 			    (length > inline_room) ||
1263 			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
1264 				mlx5_mpw_inline_close(txq, &mpw);
1265 				inline_room =
1266 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1267 			}
1268 		}
1269 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1270 			if ((segs_n != 1) ||
1271 			    (length > inline_room)) {
1272 				/*
1273 				 * Multi-Packet WQE consumes at most two WQE.
1274 				 * mlx5_mpw_new() expects to be able to use
1275 				 * such resources.
1276 				 */
1277 				if (unlikely(max_wqe < 2))
1278 					break;
1279 				max_wqe -= 2;
1280 				mlx5_mpw_new(txq, &mpw, length);
1281 				mpw.wqe->eseg.cs_flags = cs_flags;
1282 			} else {
1283 				if (unlikely(max_wqe < wqe_inl_n))
1284 					break;
1285 				max_wqe -= wqe_inl_n;
1286 				mlx5_mpw_inline_new(txq, &mpw, length);
1287 				mpw.wqe->eseg.cs_flags = cs_flags;
1288 			}
1289 		}
1290 		/* Multi-segment packets must be alone in their MPW. */
1291 		assert((segs_n == 1) || (mpw.pkts_n == 0));
1292 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1293 			assert(inline_room ==
1294 			       txq->max_inline * RTE_CACHE_LINE_SIZE);
1295 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1296 			length = 0;
1297 #endif
1298 			do {
1299 				volatile struct mlx5_wqe_data_seg *dseg;
1300 
1301 				elts_head_next =
1302 					(elts_head + 1) & (elts_n - 1);
1303 				assert(buf);
1304 				(*txq->elts)[elts_head] = buf;
1305 				dseg = mpw.data.dseg[mpw.pkts_n];
1306 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
1307 				*dseg = (struct mlx5_wqe_data_seg){
1308 					.byte_count = htonl(DATA_LEN(buf)),
1309 					.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
1310 					.addr = htonll(addr),
1311 				};
1312 				elts_head = elts_head_next;
1313 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1314 				length += DATA_LEN(buf);
1315 #endif
1316 				buf = buf->next;
1317 				++mpw.pkts_n;
1318 				++j;
1319 			} while (--segs_n);
1320 			assert(length == mpw.len);
1321 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1322 				mlx5_mpw_close(txq, &mpw);
1323 		} else {
1324 			unsigned int max;
1325 
1326 			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
1327 			assert(length <= inline_room);
1328 			assert(length == DATA_LEN(buf));
1329 			elts_head_next = (elts_head + 1) & (elts_n - 1);
1330 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1331 			(*txq->elts)[elts_head] = buf;
1332 			/* Maximum number of bytes before wrapping. */
1333 			max = ((((uintptr_t)(txq->wqes)) +
1334 				(1 << txq->wqe_n) *
1335 				MLX5_WQE_SIZE) -
1336 			       (uintptr_t)mpw.data.raw);
1337 			if (length > max) {
1338 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1339 					   (void *)addr,
1340 					   max);
1341 				mpw.data.raw = (volatile void *)txq->wqes;
1342 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1343 					   (void *)(addr + max),
1344 					   length - max);
1345 				mpw.data.raw += length - max;
1346 			} else {
1347 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1348 					   (void *)addr,
1349 					   length);
1350 
1351 				if (length == max)
1352 					mpw.data.raw =
1353 						(volatile void *)txq->wqes;
1354 				else
1355 					mpw.data.raw += length;
1356 			}
1357 			++mpw.pkts_n;
1358 			mpw.total_len += length;
1359 			++j;
1360 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
1361 				mlx5_mpw_inline_close(txq, &mpw);
1362 				inline_room =
1363 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1364 			} else {
1365 				inline_room -= length;
1366 			}
1367 		}
1368 		elts_head = elts_head_next;
1369 #ifdef MLX5_PMD_SOFT_COUNTERS
1370 		/* Increment sent bytes counter. */
1371 		txq->stats.obytes += length;
1372 #endif
1373 		++i;
1374 	} while (pkts_n);
1375 	/* Take a shortcut if nothing must be sent. */
1376 	if (unlikely(i == 0))
1377 		return 0;
1378 	/* Check whether completion threshold has been reached. */
1379 	/* "j" includes both packets and segments. */
1380 	comp = txq->elts_comp + j;
1381 	if (comp >= MLX5_TX_COMP_THRESH) {
1382 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1383 
1384 		/* Request completion on last WQE. */
1385 		wqe->ctrl[2] = htonl(8);
1386 		/* Save elts_head in unused "immediate" field of WQE. */
1387 		wqe->ctrl[3] = elts_head;
1388 		txq->elts_comp = 0;
1389 	} else {
1390 		txq->elts_comp = comp;
1391 	}
1392 #ifdef MLX5_PMD_SOFT_COUNTERS
1393 	/* Increment sent packets counter. */
1394 	txq->stats.opackets += i;
1395 #endif
1396 	/* Ring QP doorbell. */
1397 	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
1398 		mlx5_mpw_inline_close(txq, &mpw);
1399 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
1400 		mlx5_mpw_close(txq, &mpw);
1401 	mlx5_tx_dbrec(txq, mpw.wqe);
1402 	txq->elts_head = elts_head;
1403 	return i;
1404 }
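
/*
 * In the inline branch above the payload is copied straight into the WQ
 * ring, wrapping back to txq->wqes when the end of the ring is reached; the
 * mbuf is still stored in (*txq->elts) and released later through the usual
 * completion path.
 */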
1405 
1406 /**
1407  * Open an Enhanced MPW session.
1408  *
1409  * @param txq
1410  *   Pointer to TX queue structure.
1411  * @param mpw
1412  *   Pointer to MPW session structure.
1413  * @param padding
1414  *   When not zero, pad the beginning of the WQE with a zero-length inline header.
1415  */
1416 static inline void
1417 mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding)
1418 {
1419 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1420 
1421 	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
1422 	mpw->pkts_n = 0;
1423 	mpw->total_len = sizeof(struct mlx5_wqe);
1424 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1425 	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
1426 				  (txq->wqe_ci << 8) |
1427 				  MLX5_OPCODE_ENHANCED_MPSW);
1428 	mpw->wqe->ctrl[2] = 0;
1429 	mpw->wqe->ctrl[3] = 0;
1430 	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
1431 	if (unlikely(padding)) {
1432 		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
1433 
1434 		/* Pad the first 2 DWORDs with zero-length inline header. */
1435 		*(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG);
1436 		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
1437 			htonl(MLX5_INLINE_SEG);
1438 		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
1439 		/* Start from the next WQEBB. */
1440 		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
1441 	} else {
1442 		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
1443 	}
1444 }
1445 
1446 /**
1447  * Close an Enhanced MPW session.
1448  *
1449  * @param txq
1450  *   Pointer to TX queue structure.
1451  * @param mpw
1452  *   Pointer to MPW session structure.
1453  *
1454  * @return
1455  *   Number of consumed WQEs.
1456  */
1457 static inline uint16_t
1458 mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw)
1459 {
1460 	uint16_t ret;
1461 
1462 	/* Store size in multiple of 16 bytes. Control and Ethernet segments
1463 	 * count as 2.
1464 	 */
1465 	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len));
1466 	mpw->state = MLX5_MPW_STATE_CLOSED;
1467 	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1468 	txq->wqe_ci += ret;
1469 	return ret;
1470 }
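
/*
 * Unlike legacy MPW, an enhanced MPW session has no fixed size: total_len
 * grows as packets are inlined or data segments appended, and the number of
 * WQEs it finally consumed is returned by mlx5_empw_close() so the caller
 * can adjust its max_wqe budget.
 */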
1471 
1472 /**
1473  * DPDK callback for TX with Enhanced MPW support.
1474  *
1475  * @param dpdk_txq
1476  *   Generic pointer to TX queue structure.
1477  * @param[in] pkts
1478  *   Packets to transmit.
1479  * @param pkts_n
1480  *   Number of packets in array.
1481  *
1482  * @return
1483  *   Number of packets successfully transmitted (<= pkts_n).
1484  */
1485 uint16_t
1486 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1487 {
1488 	struct txq *txq = (struct txq *)dpdk_txq;
1489 	uint16_t elts_head = txq->elts_head;
1490 	const unsigned int elts_n = 1 << txq->elts_n;
1491 	unsigned int i = 0;
1492 	unsigned int j = 0;
1493 	unsigned int max_elts;
1494 	uint16_t max_wqe;
1495 	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
1496 	unsigned int mpw_room = 0;
1497 	unsigned int inl_pad = 0;
1498 	uint32_t inl_hdr;
1499 	struct mlx5_mpw mpw = {
1500 		.state = MLX5_MPW_STATE_CLOSED,
1501 	};
1502 
1503 	if (unlikely(!pkts_n))
1504 		return 0;
1505 	/* Start processing. */
1506 	txq_complete(txq);
1507 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1508 	if (max_elts > elts_n)
1509 		max_elts -= elts_n;
1510 	/* A CQE slot must always be available. */
1511 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1512 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1513 	if (unlikely(!max_wqe))
1514 		return 0;
1515 	do {
1516 		struct rte_mbuf *buf = *(pkts++);
1517 		unsigned int elts_head_next;
1518 		uintptr_t addr;
1519 		uint64_t naddr;
1520 		unsigned int n;
1521 		unsigned int do_inline = 0; /* Whether inline is possible. */
1522 		uint32_t length;
1523 		unsigned int segs_n = buf->nb_segs;
1524 		uint32_t cs_flags = 0;
1525 
1526 		/*
1527 		 * Make sure there is enough room to store this packet and
1528 		 * that one ring entry remains unused.
1529 		 */
1530 		assert(segs_n);
1531 		if (max_elts - j < segs_n + 1)
1532 			break;
1533 		/* Do not bother with large packets MPW cannot handle. */
1534 		if (segs_n > MLX5_MPW_DSEG_MAX)
1535 			break;
1536 		/* Should we enable HW CKSUM offload? */
1537 		if (buf->ol_flags &
1538 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
1539 			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
1540 		/* Retrieve packet information. */
1541 		length = PKT_LEN(buf);
1542 		/* Start new session if:
1543 		 * - multi-segment packet
1544 		 * - no space left even for a dseg
1545 		 * - next packet can be inlined with a new WQE
1546 		 * - cs_flags differs
1547 		 * The state cannot be MLX5_MPW_STATE_OPENED here, as a legacy
1548 		 * MPW session always carries a single multi-segmented packet.
1549 		 */
1550 		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
1551 			if ((segs_n != 1) ||
1552 			    (inl_pad + sizeof(struct mlx5_wqe_data_seg) >
1553 			      mpw_room) ||
1554 			    (length <= txq->inline_max_packet_sz &&
1555 			     inl_pad + sizeof(inl_hdr) + length >
1556 			      mpw_room) ||
1557 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1558 				max_wqe -= mlx5_empw_close(txq, &mpw);
1559 		}
1560 		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
1561 			if (unlikely(segs_n != 1)) {
1562 				/* Fall back to legacy MPW.
1563 				 * An MPW session consumes 2 WQEs at most to
1564 				 * include MLX5_MPW_DSEG_MAX pointers.
1565 				 */
1566 				if (unlikely(max_wqe < 2))
1567 					break;
1568 				mlx5_mpw_new(txq, &mpw, length);
1569 			} else {
1570 				/* In Enhanced MPW, inline as much as the budget
1571 				 * allows. The remaining space is to be
1572 				 * filled with dsegs. If the title WQEBB isn't
1573 				 * padded, it will have 2 dsegs there.
1574 				 */
1575 				mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
1576 					    (max_inline ? max_inline :
1577 					     pkts_n * MLX5_WQE_DWORD_SIZE) +
1578 					    MLX5_WQE_SIZE);
1579 				if (unlikely(max_wqe * MLX5_WQE_SIZE <
1580 					      mpw_room))
1581 					break;
1582 				/* Do not pad the title WQEBB, to avoid wasting WQ space. */
1583 				mlx5_empw_new(txq, &mpw, 0);
1584 				mpw_room -= mpw.total_len;
1585 				inl_pad = 0;
1586 				do_inline =
1587 					length <= txq->inline_max_packet_sz &&
1588 					sizeof(inl_hdr) + length <= mpw_room &&
1589 					!txq->mpw_hdr_dseg;
1590 			}
1591 			mpw.wqe->eseg.cs_flags = cs_flags;
1592 		} else {
1593 			/* Evaluate whether the next packet can be inlined.
1594 			 * Inlining is possible when:
1595 			 * - length is less than the configured value
1596 			 * - length fits into the remaining space
1597 			 * - not required to fill the title WQEBB with dsegs
1598 			 */
1599 			do_inline =
1600 				length <= txq->inline_max_packet_sz &&
1601 				inl_pad + sizeof(inl_hdr) + length <=
1602 				 mpw_room &&
1603 				(!txq->mpw_hdr_dseg ||
1604 				 mpw.total_len >= MLX5_WQE_SIZE);
1605 		}
1606 		/* Multi-segment packets must be alone in their MPW. */
1607 		assert((segs_n == 1) || (mpw.pkts_n == 0));
1608 		if (unlikely(mpw.state == MLX5_MPW_STATE_OPENED)) {
1609 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1610 			length = 0;
1611 #endif
1612 			do {
1613 				volatile struct mlx5_wqe_data_seg *dseg;
1614 
1615 				elts_head_next =
1616 					(elts_head + 1) & (elts_n - 1);
1617 				assert(buf);
1618 				(*txq->elts)[elts_head] = buf;
1619 				dseg = mpw.data.dseg[mpw.pkts_n];
1620 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
1621 				*dseg = (struct mlx5_wqe_data_seg){
1622 					.byte_count = htonl(DATA_LEN(buf)),
1623 					.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
1624 					.addr = htonll(addr),
1625 				};
1626 				elts_head = elts_head_next;
1627 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1628 				length += DATA_LEN(buf);
1629 #endif
1630 				buf = buf->next;
1631 				++j;
1632 				++mpw.pkts_n;
1633 			} while (--segs_n);
1634 			/* A multi-segmented packet takes one MPW session.
1635 			 * TODO: Pack more multi-segmented packets if possible.
1636 			 */
1637 			mlx5_mpw_close(txq, &mpw);
1638 			if (mpw.pkts_n < 3)
1639 				max_wqe--;
1640 			else
1641 				max_wqe -= 2;
1642 		} else if (do_inline) {
1643 			/* Inline packet into WQE. */
1644 			unsigned int max;
1645 
1646 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1647 			assert(length == DATA_LEN(buf));
1648 			inl_hdr = htonl(length | MLX5_INLINE_SEG);
1649 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1650 			mpw.data.raw = (volatile void *)
1651 				((uintptr_t)mpw.data.raw + inl_pad);
1652 			max = tx_mlx5_wq_tailroom(txq,
1653 					(void *)(uintptr_t)mpw.data.raw);
1654 			/* Copy inline header. */
1655 			mpw.data.raw = (volatile void *)
1656 				mlx5_copy_to_wq(
1657 					  (void *)(uintptr_t)mpw.data.raw,
1658 					  &inl_hdr,
1659 					  sizeof(inl_hdr),
1660 					  (void *)(uintptr_t)txq->wqes,
1661 					  max);
1662 			max = tx_mlx5_wq_tailroom(txq,
1663 					(void *)(uintptr_t)mpw.data.raw);
1664 			/* Copy packet data. */
1665 			mpw.data.raw = (volatile void *)
1666 				mlx5_copy_to_wq(
1667 					  (void *)(uintptr_t)mpw.data.raw,
1668 					  (void *)addr,
1669 					  length,
1670 					  (void *)(uintptr_t)txq->wqes,
1671 					  max);
1672 			++mpw.pkts_n;
1673 			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
1674 			/* No need to get completion as the entire packet is
1675 			 * copied to WQ. Free the buf right away.
1676 			 */
1677 			elts_head_next = elts_head;
1678 			rte_pktmbuf_free_seg(buf);
1679 			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
1680 			/* Add pad in the next packet if any. */
1681 			inl_pad = (((uintptr_t)mpw.data.raw +
1682 					(MLX5_WQE_DWORD_SIZE - 1)) &
1683 					~(MLX5_WQE_DWORD_SIZE - 1)) -
1684 				  (uintptr_t)mpw.data.raw;
1685 		} else {
1686 			/* No inline. Load a dseg of packet pointer. */
1687 			volatile rte_v128u32_t *dseg;
1688 
1689 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1690 			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
1691 			assert(length == DATA_LEN(buf));
1692 			if (!tx_mlx5_wq_tailroom(txq,
1693 					(void *)((uintptr_t)mpw.data.raw
1694 						+ inl_pad)))
1695 				dseg = (volatile void *)txq->wqes;
1696 			else
1697 				dseg = (volatile void *)
1698 					((uintptr_t)mpw.data.raw +
1699 					 inl_pad);
1700 			elts_head_next = (elts_head + 1) & (elts_n - 1);
1701 			(*txq->elts)[elts_head] = buf;
1702 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1703 			for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++)
1704 				rte_prefetch2((void *)(addr +
1705 						n * RTE_CACHE_LINE_SIZE));
1706 			naddr = htonll(addr);
1707 			*dseg = (rte_v128u32_t) {
1708 				htonl(length),
1709 				txq_mp2mr(txq, txq_mb2mp(buf)),
1710 				naddr,
1711 				naddr >> 32,
1712 			};
1713 			mpw.data.raw = (volatile void *)(dseg + 1);
1714 			mpw.total_len += (inl_pad + sizeof(*dseg));
1715 			++j;
1716 			++mpw.pkts_n;
1717 			mpw_room -= (inl_pad + sizeof(*dseg));
1718 			inl_pad = 0;
1719 		}
1720 		elts_head = elts_head_next;
1721 #ifdef MLX5_PMD_SOFT_COUNTERS
1722 		/* Increment sent bytes counter. */
1723 		txq->stats.obytes += length;
1724 #endif
1725 		++i;
1726 	} while (i < pkts_n);
1727 	/* Take a shortcut if nothing must be sent. */
1728 	if (unlikely(i == 0))
1729 		return 0;
1730 	/* Check whether completion threshold has been reached. */
1731 	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
1732 			(uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
1733 			 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
1734 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1735 
1736 		/* Request completion on last WQE. */
1737 		wqe->ctrl[2] = htonl(8);
1738 		/* Save elts_head in unused "immediate" field of WQE. */
1739 		wqe->ctrl[3] = elts_head;
1740 		txq->elts_comp = 0;
1741 		txq->mpw_comp = txq->wqe_ci;
1742 		txq->cq_pi++;
1743 	} else {
1744 		txq->elts_comp += j;
1745 	}
1746 #ifdef MLX5_PMD_SOFT_COUNTERS
1747 	/* Increment sent packets counter. */
1748 	txq->stats.opackets += i;
1749 #endif
1750 	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
1751 		mlx5_empw_close(txq, &mpw);
1752 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
1753 		mlx5_mpw_close(txq, &mpw);
1754 	/* Ring QP doorbell. */
1755 	mlx5_tx_dbrec(txq, mpw.wqe);
1756 	txq->elts_head = elts_head;
1757 	return i;
1758 }
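
/*
 * Completion requests in the enhanced MPW path are throttled both by the
 * element threshold and by WQ usage (wqe_ci - mpw_comp measured against a
 * fraction of the ring); txq->cq_pi counts the outstanding requests checked
 * against the CQ size at the top of the function.
 */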
1759 
1760 /**
1761  * Translate RX completion flags to packet type.
1762  *
1763  * @param[in] cqe
1764  *   Pointer to CQE.
1765  *
1766  * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
1767  *
1768  * @return
1769  *   Packet type for struct rte_mbuf.
1770  */
1771 static inline uint32_t
1772 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
1773 {
1774 	uint32_t pkt_type;
1775 	uint16_t flags = ntohs(cqe->hdr_type_etc);
1776 
1777 	if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) {
1778 		pkt_type =
1779 			TRANSPOSE(flags,
1780 				  MLX5_CQE_RX_IPV4_PACKET,
1781 				  RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) |
1782 			TRANSPOSE(flags,
1783 				  MLX5_CQE_RX_IPV6_PACKET,
1784 				  RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN);
1785 		pkt_type |= ((cqe->pkt_info & MLX5_CQE_RX_OUTER_PACKET) ?
1786 			     RTE_PTYPE_L3_IPV6_EXT_UNKNOWN :
1787 			     RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
1788 	} else {
1789 		pkt_type =
1790 			TRANSPOSE(flags,
1791 				  MLX5_CQE_L3_HDR_TYPE_IPV6,
1792 				  RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) |
1793 			TRANSPOSE(flags,
1794 				  MLX5_CQE_L3_HDR_TYPE_IPV4,
1795 				  RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
1796 	}
1797 	return pkt_type;
1798 }
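
/*
 * TRANSPOSE() moves a flag bit from its position in the CQE flags to the
 * corresponding RTE_PTYPE_* bit.  For tunnelled packets the inner L3 type
 * comes from those flags while the outer type is derived from
 * MLX5_CQE_RX_OUTER_PACKET.
 */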
1799 
1800 /**
1801  * Get size of the next packet for a given CQE. For compressed CQEs, the
1802  * consumer index is updated only once all packets of the current one have
1803  * been processed.
1804  *
1805  * @param rxq
1806  *   Pointer to RX queue.
1807  * @param cqe
1808  *   CQE to process.
1809  * @param[out] rss_hash
1810  *   Packet RSS Hash result.
1811  *
1812  * @return
1813  *   Packet size in bytes (0 if there is none), -1 in case of completion
1814  *   with error.
1815  */
1816 static inline int
1817 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
1818 		 uint16_t cqe_cnt, uint32_t *rss_hash)
1819 {
1820 	struct rxq_zip *zip = &rxq->zip;
1821 	uint16_t cqe_n = cqe_cnt + 1;
1822 	int len = 0;
1823 	uint16_t idx, end;
1824 
1825 	/* Process compressed data in the CQE and mini arrays. */
1826 	if (zip->ai) {
1827 		volatile struct mlx5_mini_cqe8 (*mc)[8] =
1828 			(volatile struct mlx5_mini_cqe8 (*)[8])
1829 			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt]);
1830 
1831 		len = ntohl((*mc)[zip->ai & 7].byte_cnt);
1832 		*rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result);
1833 		if ((++zip->ai & 7) == 0) {
1834 			/* Invalidate consumed CQEs. */
1835 			idx = zip->ca;
1836 			end = zip->na;
1837 			while (idx != end) {
1838 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1839 					MLX5_CQE_INVALIDATE;
1840 				++idx;
1841 			}
1842 			/*
1843 			 * Increment consumer index to skip the number of
1844 			 * CQEs consumed. Hardware leaves holes in the CQ
1845 			 * ring for software use.
1846 			 */
1847 			zip->ca = zip->na;
1848 			zip->na += 8;
1849 		}
1850 		if (unlikely(zip->ai == zip->cqe_cnt)) {
1851 			/* Invalidate the rest. */
1852 			idx = zip->ca;
1853 			end = zip->cq_ci;
1854 
1855 			while (idx != end) {
1856 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1857 					MLX5_CQE_INVALIDATE;
1858 				++idx;
1859 			}
1860 			rxq->cq_ci = zip->cq_ci;
1861 			zip->ai = 0;
1862 		}
1863 	/* No compressed data, get next CQE and verify if it is compressed. */
1864 	} else {
1865 		int ret;
1866 		uint8_t op_own;
1867 
1868 		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
1869 		if (unlikely(ret == 1))
1870 			return 0;
1871 		++rxq->cq_ci;
1872 		op_own = cqe->op_own;
1873 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
1874 			volatile struct mlx5_mini_cqe8 (*mc)[8] =
1875 				(volatile struct mlx5_mini_cqe8 (*)[8])
1876 				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
1877 							  cqe_cnt]);
1878 
1879 			/* Mini-CQE count of the session (fix endianness). */
1880 			zip->cqe_cnt = ntohl(cqe->byte_cnt);
1881 			/*
1882 			 * Current mini array position is the one returned by
1883 			 * check_cqe().
1884 			 *
1885 			 * If the completion comprises several mini arrays, as a
1886 			 * special case the second one is located 7 CQEs after
1887 			 * the initial CQE, instead of 8 for subsequent ones.
1888 			 */
1889 			zip->ca = rxq->cq_ci;
1890 			zip->na = zip->ca + 7;
1891 			/* Compute the next non-compressed CQE. */
1892 			--rxq->cq_ci;
1893 			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
1894 			/* Get packet size to return. */
1895 			len = ntohl((*mc)[0].byte_cnt);
1896 			*rss_hash = ntohl((*mc)[0].rx_hash_result);
1897 			zip->ai = 1;
1898 			/* Prefetch all the entries to be invalidated. */
1899 			idx = zip->ca;
1900 			end = zip->cq_ci;
1901 			while (idx != end) {
1902 				rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]);
1903 				++idx;
1904 			}
1905 		} else {
1906 			len = ntohl(cqe->byte_cnt);
1907 			*rss_hash = ntohl(cqe->rx_hash_res);
1908 		}
1909 		/* Error while receiving packet. */
1910 		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
1911 			return -1;
1912 	}
1913 	return len;
1914 }
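
/*
 * Worked example (for illustration only): assume a compressed session
 * of 20 mini-CQEs whose title CQE sits at CQ index N.  With the logic
 * above, mini-CQEs 0-7 are read from the array stored in CQE N + 1,
 * mini-CQEs 8-15 from CQE N + 8 (7 entries after the first array, the
 * special case mentioned above) and mini-CQEs 16-19 from CQE N + 16.
 * CQEs N + 1 .. N + 7 are invalidated once entry 8 is consumed,
 * N + 8 .. N + 15 once entry 16 is consumed, and N + 16 .. N + 19 once
 * the last entry is consumed, at which point rxq->cq_ci jumps from N
 * to N + 20 and zip->ai returns to 0.
 */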
1915 
1916 /**
1917  * Translate RX completion flags to offload flags.
1918  *
1919  * @param[in] rxq
1920  *   Pointer to RX queue structure.
1921  * @param[in] cqe
1922  *   Pointer to CQE.
1923  *
1924  * @return
1925  *   Offload flags (ol_flags) for struct rte_mbuf.
1926  */
1927 static inline uint32_t
1928 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
1929 {
1930 	uint32_t ol_flags = 0;
1931 	uint16_t flags = ntohs(cqe->hdr_type_etc);
1932 
1933 	ol_flags =
1934 		TRANSPOSE(flags,
1935 			  MLX5_CQE_RX_L3_HDR_VALID,
1936 			  PKT_RX_IP_CKSUM_GOOD) |
1937 		TRANSPOSE(flags,
1938 			  MLX5_CQE_RX_L4_HDR_VALID,
1939 			  PKT_RX_L4_CKSUM_GOOD);
1940 	if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
1941 		ol_flags |=
1942 			TRANSPOSE(flags,
1943 				  MLX5_CQE_RX_L3_HDR_VALID,
1944 				  PKT_RX_IP_CKSUM_GOOD) |
1945 			TRANSPOSE(flags,
1946 				  MLX5_CQE_RX_L4_HDR_VALID,
1947 				  PKT_RX_L4_CKSUM_GOOD);
1948 	return ol_flags;
1949 }
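
/*
 * Illustrative sketch (not compiled): applications consuming mbufs from
 * this PMD can use the flags set above to skip software checksum
 * validation.  The helper name below is hypothetical; the flags are the
 * standard rte_mbuf ones.
 *
 *	static inline int
 *	rx_csum_ok(const struct rte_mbuf *m)
 *	{
 *		return (m->ol_flags & PKT_RX_IP_CKSUM_GOOD) &&
 *		       (m->ol_flags & PKT_RX_L4_CKSUM_GOOD);
 *	}
 */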
1950 
1951 /**
1952  * DPDK callback for RX.
1953  *
1954  * @param dpdk_rxq
1955  *   Generic pointer to RX queue structure.
1956  * @param[out] pkts
1957  *   Array to store received packets.
1958  * @param pkts_n
1959  *   Maximum number of packets in array.
1960  *
1961  * @return
1962  *   Number of packets successfully received (<= pkts_n).
1963  */
1964 uint16_t
1965 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1966 {
1967 	struct rxq *rxq = dpdk_rxq;
1968 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
1969 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
1970 	const unsigned int sges_n = rxq->sges_n;
1971 	struct rte_mbuf *pkt = NULL;
1972 	struct rte_mbuf *seg = NULL;
1973 	volatile struct mlx5_cqe *cqe =
1974 		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1975 	unsigned int i = 0;
1976 	unsigned int rq_ci = rxq->rq_ci << sges_n;
1977 	int len = 0; /* keep its value across iterations. */
1978 
1979 	while (pkts_n) {
1980 		unsigned int idx = rq_ci & wqe_cnt;
1981 		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
1982 		struct rte_mbuf *rep = (*rxq->elts)[idx];
1983 		uint32_t rss_hash_res = 0;
1984 
1985 		if (pkt)
1986 			NEXT(seg) = rep;
1987 		seg = rep;
1988 		rte_prefetch0(seg);
1989 		rte_prefetch0(cqe);
1990 		rte_prefetch0(wqe);
1991 		rep = rte_mbuf_raw_alloc(rxq->mp);
1992 		if (unlikely(rep == NULL)) {
1993 			++rxq->stats.rx_nombuf;
1994 			if (!pkt) {
1995 				/*
1996 				 * no buffers before we even started,
1997 				 * bail out silently.
1998 				 */
1999 				break;
2000 			}
2001 			while (pkt != seg) {
2002 				assert(pkt != (*rxq->elts)[idx]);
2003 				rep = NEXT(pkt);
2004 				NEXT(pkt) = NULL;
2005 				NB_SEGS(pkt) = 1;
2006 				rte_mbuf_raw_free(pkt);
2007 				pkt = rep;
2008 			}
2009 			break;
2010 		}
2011 		if (!pkt) {
2012 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
2013 			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
2014 					       &rss_hash_res);
2015 			if (!len) {
2016 				rte_mbuf_raw_free(rep);
2017 				break;
2018 			}
2019 			if (unlikely(len == -1)) {
2020 				/* RX error, packet is likely too large. */
2021 				rte_mbuf_raw_free(rep);
2022 				++rxq->stats.idropped;
2023 				goto skip;
2024 			}
2025 			pkt = seg;
2026 			assert(len >= (rxq->crc_present << 2));
2027 			/* Update packet information. */
2028 			pkt->packet_type = 0;
2029 			pkt->ol_flags = 0;
2030 			if (rss_hash_res && rxq->rss_hash) {
2031 				pkt->hash.rss = rss_hash_res;
2032 				pkt->ol_flags = PKT_RX_RSS_HASH;
2033 			}
2034 			if (rxq->mark &&
2035 			    MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
2036 				pkt->ol_flags |= PKT_RX_FDIR;
2037 				if (cqe->sop_drop_qpn !=
2038 				    htonl(MLX5_FLOW_MARK_DEFAULT)) {
2039 					uint32_t mark = cqe->sop_drop_qpn;
2040 
2041 					pkt->ol_flags |= PKT_RX_FDIR_ID;
2042 					pkt->hash.fdir.hi =
2043 						mlx5_flow_mark_get(mark);
2044 				}
2045 			}
2046 			if (rxq->csum | rxq->csum_l2tun) {
2047 				pkt->packet_type = rxq_cq_to_pkt_type(cqe);
2048 				pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe);
2049 			}
2050 			if (rxq->vlan_strip &&
2051 			    (cqe->hdr_type_etc &
2052 			     htons(MLX5_CQE_VLAN_STRIPPED))) {
2053 				pkt->ol_flags |= PKT_RX_VLAN_PKT |
2054 					PKT_RX_VLAN_STRIPPED;
2055 				pkt->vlan_tci = ntohs(cqe->vlan_info);
2056 			}
2057 			if (rxq->crc_present)
2058 				len -= ETHER_CRC_LEN;
2059 			PKT_LEN(pkt) = len;
2060 		}
2061 		DATA_LEN(rep) = DATA_LEN(seg);
2062 		PKT_LEN(rep) = PKT_LEN(seg);
2063 		SET_DATA_OFF(rep, DATA_OFF(seg));
2064 		NB_SEGS(rep) = NB_SEGS(seg);
2065 		PORT(rep) = PORT(seg);
2066 		NEXT(rep) = NULL;
2067 		(*rxq->elts)[idx] = rep;
2068 		/*
2069 		 * Fill NIC descriptor with the new buffer.  The lkey and size
2070 		 * of the buffers are already known, only the buffer address
2071 		 * changes.
2072 		 */
2073 		wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
2074 		if (len > DATA_LEN(seg)) {
2075 			len -= DATA_LEN(seg);
2076 			++NB_SEGS(pkt);
2077 			++rq_ci;
2078 			continue;
2079 		}
2080 		DATA_LEN(seg) = len;
2081 #ifdef MLX5_PMD_SOFT_COUNTERS
2082 		/* Increment bytes counter. */
2083 		rxq->stats.ibytes += PKT_LEN(pkt);
2084 #endif
2085 		/* Return packet. */
2086 		*(pkts++) = pkt;
2087 		pkt = NULL;
2088 		--pkts_n;
2089 		++i;
2090 skip:
2091 		/* Align consumer index to the next stride. */
2092 		rq_ci >>= sges_n;
2093 		++rq_ci;
2094 		rq_ci <<= sges_n;
2095 	}
2096 	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
2097 		return 0;
2098 	/* Update the consumer index. */
2099 	rxq->rq_ci = rq_ci >> sges_n;
2100 	rte_wmb();
2101 	*rxq->cq_db = htonl(rxq->cq_ci);
2102 	rte_wmb();
2103 	*rxq->rq_db = htonl(rxq->rq_ci);
2104 #ifdef MLX5_PMD_SOFT_COUNTERS
2105 	/* Increment packets counter. */
2106 	rxq->stats.ipackets += i;
2107 #endif
2108 	return i;
2109 }
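
/*
 * Illustrative sketch (not compiled): this routine is reached through
 * the generic burst API once installed as the device RX callback.  A
 * typical polling loop, with hypothetical port_id/queue_id values and
 * process() standing for whatever the application does with a packet:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n;
 *	uint16_t k;
 *
 *	n = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
 *	for (k = 0; k != n; ++k) {
 *		process(pkts[k]);
 *		rte_pktmbuf_free(pkts[k]);
 *	}
 *
 * As a side note on the stride alignment above: with sges_n == 2
 * (four descriptors per stride, assumed here for illustration), a
 * packet leaving rq_ci at 13 makes the "skip" step advance it to
 * ((13 >> 2) + 1) << 2 == 16, i.e. the first descriptor of the next
 * stride, so every packet starts on a stride boundary.
 */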
2110 
2111 /**
2112  * Dummy DPDK callback for TX.
2113  *
2114  * This function is used to temporarily replace the real callback during
2115  * unsafe control operations on the queue, or in case of error.
2116  *
2117  * @param dpdk_txq
2118  *   Generic pointer to TX queue structure.
2119  * @param[in] pkts
2120  *   Packets to transmit.
2121  * @param pkts_n
2122  *   Number of packets in array.
2123  *
2124  * @return
2125  *   Number of packets successfully transmitted (<= pkts_n).
2126  */
2127 uint16_t
2128 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
2129 {
2130 	(void)dpdk_txq;
2131 	(void)pkts;
2132 	(void)pkts_n;
2133 	return 0;
2134 }
2135 
2136 /**
2137  * Dummy DPDK callback for RX.
2138  *
2139  * This function is used to temporarily replace the real callback during
2140  * unsafe control operations on the queue, or in case of error.
2141  *
2142  * @param dpdk_rxq
2143  *   Generic pointer to RX queue structure.
2144  * @param[out] pkts
2145  *   Array to store received packets.
2146  * @param pkts_n
2147  *   Maximum number of packets in array.
2148  *
2149  * @return
2150  *   Number of packets successfully received (<= pkts_n).
2151  */
2152 uint16_t
2153 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
2154 {
2155 	(void)dpdk_rxq;
2156 	(void)pkts;
2157 	(void)pkts_n;
2158 	return 0;
2159 }
2160 
2161 /**
2162  * DPDK callback for rx queue interrupt enable.
2163  *
2164  * @param dev
2165  *   Pointer to Ethernet device structure.
2166  * @param rx_queue_id
2167  *   RX queue number.
2168  *
2169  * @return
2170  *   0 on success, negative on failure.
2171  */
2172 int
2173 mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
2174 {
2175 #ifdef HAVE_UPDATE_CQ_CI
2176 	struct priv *priv = mlx5_get_priv(dev);
2177 	struct rxq *rxq = (*priv->rxqs)[rx_queue_id];
2178 	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
2179 	struct ibv_cq *cq = rxq_ctrl->cq;
2180 	uint16_t ci = rxq->cq_ci;
2181 	int ret = 0;
2182 
2183 	ibv_mlx5_exp_update_cq_ci(cq, ci);
2184 	ret = ibv_req_notify_cq(cq, 0);
2185 #else
2186 	int ret = -1;
2187 	(void)dev;
2188 	(void)rx_queue_id;
2189 #endif
2190 	if (ret)
2191 		WARN("unable to arm interrupt on rx queue %d", rx_queue_id);
2192 	return ret;
2193 }
2194 
2195 /**
2196  * DPDK callback for rx queue interrupt disable.
2197  *
2198  * @param dev
2199  *   Pointer to Ethernet device structure.
2200  * @param rx_queue_id
2201  *   RX queue number.
2202  *
2203  * @return
2204  *   0 on success, negative on failure.
2205  */
2206 int
2207 mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
2208 {
2209 #ifdef HAVE_UPDATE_CQ_CI
2210 	struct priv *priv = mlx5_get_priv(dev);
2211 	struct rxq *rxq = (*priv->rxqs)[rx_queue_id];
2212 	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
2213 	struct ibv_cq *cq = rxq_ctrl->cq;
2214 	struct ibv_cq *ev_cq;
2215 	void *ev_ctx;
2216 	int ret = 0;
2217 
2218 	ret = ibv_get_cq_event(cq->channel, &ev_cq, &ev_ctx);
2219 	if (ret || ev_cq != cq)
2220 		ret = -1;
2221 	else
2222 		ibv_ack_cq_events(cq, 1);
2223 #else
2224 	int ret = -1;
2225 	(void)dev;
2226 	(void)rx_queue_id;
2227 #endif
2228 	if (ret)
2229 		WARN("unable to disable interrupt on rx queue %d",
2230 		     rx_queue_id);
2231 	return ret;
2232 }
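
/*
 * Illustrative sketch (not compiled): the two handlers above back the
 * generic rte_eth_dev_rx_intr_enable()/rte_eth_dev_rx_intr_disable()
 * calls.  Assuming the queue event fd has already been registered with
 * rte_eth_dev_rx_intr_ctl_q() (RTE_INTR_EVENT_ADD), and with
 * hypothetical port_id/queue_id values and error handling omitted, an
 * application can sleep until traffic arrives as follows:
 *
 *	struct rte_epoll_event event;
 *
 *	if (rte_eth_rx_burst(port_id, queue_id, pkts, 32) == 0) {
 *		rte_eth_dev_rx_intr_enable(port_id, queue_id);
 *		rte_epoll_wait(RTE_EPOLL_PER_THREAD, &event, 1, -1);
 *		rte_eth_dev_rx_intr_disable(port_id, queue_id);
 *	}
 */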
2233