1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <assert.h>
35 #include <stdint.h>
36 #include <string.h>
37 #include <stdlib.h>
38 
39 /* Verbs header. */
40 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
41 #ifdef PEDANTIC
42 #pragma GCC diagnostic ignored "-Wpedantic"
43 #endif
44 #include <infiniband/verbs.h>
45 #include <infiniband/mlx5_hw.h>
46 #include <infiniband/arch.h>
47 #ifdef PEDANTIC
48 #pragma GCC diagnostic error "-Wpedantic"
49 #endif
50 
51 /* DPDK headers don't like -pedantic. */
52 #ifdef PEDANTIC
53 #pragma GCC diagnostic ignored "-Wpedantic"
54 #endif
55 #include <rte_mbuf.h>
56 #include <rte_mempool.h>
57 #include <rte_prefetch.h>
58 #include <rte_common.h>
59 #include <rte_branch_prediction.h>
60 #include <rte_ether.h>
61 #ifdef PEDANTIC
62 #pragma GCC diagnostic error "-Wpedantic"
63 #endif
64 
65 #include "mlx5.h"
66 #include "mlx5_utils.h"
67 #include "mlx5_rxtx.h"
68 #include "mlx5_autoconf.h"
69 #include "mlx5_defs.h"
70 #include "mlx5_prm.h"
71 
72 static __rte_always_inline uint32_t
73 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe);
74 
75 static __rte_always_inline int
76 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
77 		 uint16_t cqe_cnt, uint32_t *rss_hash);
78 
79 static __rte_always_inline uint32_t
80 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe);
81 
82 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
83 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
84 };
85 
86 /**
87  * Build a table to translate Rx completion flags to packet type.
88  *
89  * @note: update mlx5_dev_supported_ptypes_get() if anything changes here.
90  */
91 void
92 mlx5_set_ptype_table(void)
93 {
94 	unsigned int i;
95 	uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;
96 
97 	/* Last entry must not be overwritten, reserved for errored packet. */
98 	for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
99 		(*p)[i] = RTE_PTYPE_UNKNOWN;
100 	/*
101 	 * The index to the array should have:
102 	 * bit[1:0] = l3_hdr_type
103 	 * bit[4:2] = l4_hdr_type
104 	 * bit[5] = ip_frag
105 	 * bit[6] = tunneled
106 	 * bit[7] = outer_l3_type
107 	 */
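	/*
	 * For example, index 0x06 (l3_hdr_type = 2, l4_hdr_type = 1, no
	 * fragment, no tunnel) is filled below with RTE_PTYPE_L2_ETHER |
	 * RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP.
	 */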
108 	/* L3 */
109 	(*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
110 		     RTE_PTYPE_L4_NONFRAG;
111 	(*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
112 		     RTE_PTYPE_L4_NONFRAG;
113 	/* Fragmented */
114 	(*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
115 		     RTE_PTYPE_L4_FRAG;
116 	(*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
117 		     RTE_PTYPE_L4_FRAG;
118 	/* TCP */
119 	(*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
120 		     RTE_PTYPE_L4_TCP;
121 	(*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
122 		     RTE_PTYPE_L4_TCP;
123 	/* UDP */
124 	(*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
125 		     RTE_PTYPE_L4_UDP;
126 	(*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
127 		     RTE_PTYPE_L4_UDP;
128 	/* Repeat with outer_l3_type being set. Just in case. */
129 	(*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
130 		     RTE_PTYPE_L4_NONFRAG;
131 	(*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
132 		     RTE_PTYPE_L4_NONFRAG;
133 	(*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
134 		     RTE_PTYPE_L4_FRAG;
135 	(*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
136 		     RTE_PTYPE_L4_FRAG;
137 	(*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
138 		     RTE_PTYPE_L4_TCP;
139 	(*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
140 		     RTE_PTYPE_L4_TCP;
141 	(*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
142 		     RTE_PTYPE_L4_UDP;
143 	(*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
144 		     RTE_PTYPE_L4_UDP;
145 	/* Tunneled - L3 */
146 	(*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
147 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
148 		     RTE_PTYPE_INNER_L4_NONFRAG;
149 	(*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
150 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
151 		     RTE_PTYPE_INNER_L4_NONFRAG;
152 	(*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
153 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
154 		     RTE_PTYPE_INNER_L4_NONFRAG;
155 	(*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
156 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
157 		     RTE_PTYPE_INNER_L4_NONFRAG;
158 	/* Tunneled - Fragmented */
159 	(*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
160 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
161 		     RTE_PTYPE_INNER_L4_FRAG;
162 	(*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
163 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
164 		     RTE_PTYPE_INNER_L4_FRAG;
165 	(*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
166 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
167 		     RTE_PTYPE_INNER_L4_FRAG;
168 	(*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
169 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
170 		     RTE_PTYPE_INNER_L4_FRAG;
171 	/* Tunneled - TCP */
172 	(*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
173 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
174 		     RTE_PTYPE_L4_TCP;
175 	(*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
176 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
177 		     RTE_PTYPE_L4_TCP;
178 	(*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
179 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
180 		     RTE_PTYPE_L4_TCP;
181 	(*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
182 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
183 		     RTE_PTYPE_L4_TCP;
184 	/* Tunneled - UDP */
185 	(*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
186 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
187 		     RTE_PTYPE_L4_UDP;
188 	(*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
189 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
190 		     RTE_PTYPE_L4_UDP;
191 	(*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
192 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
193 		     RTE_PTYPE_L4_UDP;
194 	(*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
195 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
196 		     RTE_PTYPE_L4_UDP;
197 }
198 
199 /**
200  * Return the size of tailroom of WQ.
201  *
202  * @param txq
203  *   Pointer to TX queue structure.
204  * @param addr
205  *   Pointer to tail of WQ.
206  *
207  * @return
208  *   Size of tailroom.
209  */
210 static inline size_t
211 tx_mlx5_wq_tailroom(struct txq *txq, void *addr)
212 {
213 	size_t tailroom;
214 	tailroom = (uintptr_t)(txq->wqes) +
215 		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
216 		   (uintptr_t)addr;
217 	return tailroom;
218 }
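/*
 * A sketch of the math, assuming the usual 64-byte MLX5_WQE_SIZE: with
 * wqe_n = 8 the WQ spans 256 * 64 = 16384 bytes starting at txq->wqes, so
 * an addr located 128 bytes before that end yields a tailroom of 128.
 */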
219 
220 /**
221  * Copy data to tailroom of circular queue.
222  *
223  * @param dst
224  *   Pointer to destination.
225  * @param src
226  *   Pointer to source.
227  * @param n
228  *   Number of bytes to copy.
229  * @param base
230  *   Pointer to head of queue.
231  * @param tailroom
232  *   Size of tailroom from dst.
233  *
234  * @return
235  *   Pointer after copied data.
236  */
237 static inline void *
238 mlx5_copy_to_wq(void *dst, const void *src, size_t n,
239 		void *base, size_t tailroom)
240 {
241 	void *ret;
242 
243 	if (n > tailroom) {
244 		rte_memcpy(dst, src, tailroom);
245 		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
246 			   n - tailroom);
247 		ret = (uint8_t *)base + n - tailroom;
248 	} else {
249 		rte_memcpy(dst, src, n);
250 		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
251 	}
252 	return ret;
253 }
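/*
 * Example: with tailroom = 16 and n = 24, the first 16 bytes are copied to
 * dst, the remaining 8 wrap around to base, and the returned pointer is
 * base + 8; with n == tailroom the return value is base itself.
 */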
254 
255 /**
256  * DPDK callback to check the status of a tx descriptor.
257  *
258  * @param tx_queue
259  *   The tx queue.
260  * @param[in] offset
261  *   The index of the descriptor in the ring.
262  *
263  * @return
264  *   The status of the tx descriptor.
265  */
266 int
267 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
268 {
269 	struct txq *txq = tx_queue;
270 	uint16_t used;
271 
272 	mlx5_tx_complete(txq);
273 	used = txq->elts_head - txq->elts_tail;
274 	if (offset < used)
275 		return RTE_ETH_TX_DESC_FULL;
276 	return RTE_ETH_TX_DESC_DONE;
277 }
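/*
 * Note: the 16-bit subtraction into "used" relies on unsigned wrap-around,
 * so the count of in-flight descriptors stays correct after the elts
 * indexes overflow.
 */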
278 
279 /**
280  * DPDK callback to check the status of an rx descriptor.
281  *
282  * @param rx_queue
283  *   The rx queue.
284  * @param[in] offset
285  *   The index of the descriptor in the ring.
286  *
287  * @return
288  *   The status of the tx descriptor.
289  *   The status of the rx descriptor.
290 int
291 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
292 {
293 	struct rxq *rxq = rx_queue;
294 	struct rxq_zip *zip = &rxq->zip;
295 	volatile struct mlx5_cqe *cqe;
296 	const unsigned int cqe_n = (1 << rxq->cqe_n);
297 	const unsigned int cqe_cnt = cqe_n - 1;
298 	unsigned int cq_ci;
299 	unsigned int used;
300 
301 	/* If we are processing a compressed CQE. */
302 	if (zip->ai) {
303 		used = zip->cqe_cnt - zip->ca;
304 		cq_ci = zip->cq_ci;
305 	} else {
306 		used = 0;
307 		cq_ci = rxq->cq_ci;
308 	}
309 	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
310 	while (check_cqe(cqe, cqe_n, cq_ci) == 0) {
311 		int8_t op_own;
312 		unsigned int n;
313 
314 		op_own = cqe->op_own;
315 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
316 			n = ntohl(cqe->byte_cnt);
317 		else
318 			n = 1;
319 		cq_ci += n;
320 		used += n;
321 		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
322 	}
323 	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
324 	if (offset < used)
325 		return RTE_ETH_RX_DESC_DONE;
326 	return RTE_ETH_RX_DESC_AVAIL;
327 }
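/*
 * Note: for a compressed CQE, byte_cnt holds the number of mini-CQEs (i.e.
 * packets) it stands for, which is why it is added directly to the count of
 * used descriptors above.
 */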
328 
329 /**
330  * DPDK callback for TX.
331  *
332  * @param dpdk_txq
333  *   Generic pointer to TX queue structure.
334  * @param[in] pkts
335  *   Packets to transmit.
336  * @param pkts_n
337  *   Number of packets in array.
338  *
339  * @return
340  *   Number of packets successfully transmitted (<= pkts_n).
341  */
342 uint16_t
343 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
344 {
345 	struct txq *txq = (struct txq *)dpdk_txq;
346 	uint16_t elts_head = txq->elts_head;
347 	const uint16_t elts_n = 1 << txq->elts_n;
348 	const uint16_t elts_m = elts_n - 1;
349 	unsigned int i = 0;
350 	unsigned int j = 0;
351 	unsigned int k = 0;
352 	uint16_t max_elts;
353 	unsigned int max_inline = txq->max_inline;
354 	const unsigned int inline_en = !!max_inline && txq->inline_en;
355 	uint16_t max_wqe;
356 	unsigned int comp;
357 	volatile struct mlx5_wqe_v *wqe = NULL;
358 	volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
359 	unsigned int segs_n = 0;
360 	struct rte_mbuf *buf = NULL;
361 	uint8_t *raw;
362 
363 	if (unlikely(!pkts_n))
364 		return 0;
365 	/* Prefetch first packet cacheline. */
366 	rte_prefetch0(*pkts);
367 	/* Start processing. */
368 	mlx5_tx_complete(txq);
369 	max_elts = (elts_n - (elts_head - txq->elts_tail));
370 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
371 	if (unlikely(!max_wqe))
372 		return 0;
373 	do {
374 		volatile rte_v128u32_t *dseg = NULL;
375 		uint32_t length;
376 		unsigned int ds = 0;
377 		unsigned int sg = 0; /* counter of additional segs attached. */
378 		uintptr_t addr;
379 		uint64_t naddr;
380 		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
381 		uint16_t tso_header_sz = 0;
382 		uint16_t ehdr;
383 		uint8_t cs_flags = 0;
384 		uint64_t tso = 0;
385 		uint16_t tso_segsz = 0;
386 #ifdef MLX5_PMD_SOFT_COUNTERS
387 		uint32_t total_length = 0;
388 #endif
389 
390 		/* first_seg */
391 		buf = *pkts;
392 		segs_n = buf->nb_segs;
393 		/*
394 		 * Make sure there is enough room to store this packet and
395 		 * that one ring entry remains unused.
396 		 */
397 		assert(segs_n);
398 		if (max_elts < segs_n)
399 			break;
400 		max_elts -= segs_n;
401 		--segs_n;
402 		if (unlikely(--max_wqe == 0))
403 			break;
404 		wqe = (volatile struct mlx5_wqe_v *)
405 			tx_mlx5_wqe(txq, txq->wqe_ci);
406 		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
407 		if (pkts_n - i > 1)
408 			rte_prefetch0(*(pkts + 1));
409 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
410 		length = DATA_LEN(buf);
411 		ehdr = (((uint8_t *)addr)[1] << 8) |
412 		       ((uint8_t *)addr)[0];
413 #ifdef MLX5_PMD_SOFT_COUNTERS
414 		total_length = length;
415 #endif
416 		if (length < (MLX5_WQE_DWORD_SIZE + 2))
417 			break;
418 		/* Update element. */
419 		(*txq->elts)[elts_head & elts_m] = buf;
420 		/* Prefetch next buffer data. */
421 		if (pkts_n - i > 1)
422 			rte_prefetch0(
423 			    rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
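		/*
		 * cs_flags selects the checksums computed by the NIC: for
		 * tunneled packets (with tunnel_en) the inner L3/L4 flags are
		 * used and the outer L3 checksum is only added when
		 * PKT_TX_OUTER_IP_CKSUM is set; otherwise the regular L3/L4
		 * flags are requested.
		 */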
424 		/* Should we enable HW CKSUM offload? */
425 		if (buf->ol_flags &
426 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
427 			const uint64_t is_tunneled = buf->ol_flags &
428 						     (PKT_TX_TUNNEL_GRE |
429 						      PKT_TX_TUNNEL_VXLAN);
430 
431 			if (is_tunneled && txq->tunnel_en) {
432 				cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
433 					   MLX5_ETH_WQE_L4_INNER_CSUM;
434 				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
435 					cs_flags |= MLX5_ETH_WQE_L3_CSUM;
436 			} else {
437 				cs_flags = MLX5_ETH_WQE_L3_CSUM |
438 					   MLX5_ETH_WQE_L4_CSUM;
439 			}
440 		}
441 		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
442 		/* Replace the Ethernet type by the VLAN if necessary. */
443 		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
444 			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
445 			unsigned int len = 2 * ETHER_ADDR_LEN - 2;
446 
447 			addr += 2;
448 			length -= 2;
449 			/* Copy destination and source MAC addresses. */
450 			memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
451 			/* Copy VLAN. */
452 			memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
453 			/* Copy missing two bytes to end the DSeg. */
454 			memcpy((uint8_t *)raw + len + sizeof(vlan),
455 			       ((uint8_t *)addr) + len, 2);
456 			addr += len + 2;
457 			length -= (len + 2);
458 		} else {
459 			memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
460 			       MLX5_WQE_DWORD_SIZE);
461 			length -= pkt_inline_sz;
462 			addr += pkt_inline_sz;
463 		}
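		/*
		 * At this point the start of the frame (ehdr plus the bytes
		 * copied above, with the VLAN tag inserted when requested) is
		 * stored directly in the WQE; addr and length now describe
		 * the remainder of the packet.
		 */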
464 		if (txq->tso_en) {
465 			tso = buf->ol_flags & PKT_TX_TCP_SEG;
466 			if (tso) {
467 				uintptr_t end = (uintptr_t)
468 						(((uintptr_t)txq->wqes) +
469 						(1 << txq->wqe_n) *
470 						MLX5_WQE_SIZE);
471 				unsigned int copy_b;
472 				uint8_t vlan_sz = (buf->ol_flags &
473 						  PKT_TX_VLAN_PKT) ? 4 : 0;
474 				const uint64_t is_tunneled =
475 							buf->ol_flags &
476 							(PKT_TX_TUNNEL_GRE |
477 							 PKT_TX_TUNNEL_VXLAN);
478 
479 				tso_header_sz = buf->l2_len + vlan_sz +
480 						buf->l3_len + buf->l4_len;
481 				tso_segsz = buf->tso_segsz;
482 
483 				if (is_tunneled	&& txq->tunnel_en) {
484 					tso_header_sz += buf->outer_l2_len +
485 							 buf->outer_l3_len;
486 					cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM;
487 				} else {
488 					cs_flags |= MLX5_ETH_WQE_L4_CSUM;
489 				}
490 				if (unlikely(tso_header_sz >
491 					     MLX5_MAX_TSO_HEADER))
492 					break;
493 				copy_b = tso_header_sz - pkt_inline_sz;
494 				/* First seg must contain all headers. */
495 				assert(copy_b <= length);
496 				raw += MLX5_WQE_DWORD_SIZE;
497 				if (copy_b &&
498 				   ((end - (uintptr_t)raw) > copy_b)) {
499 					uint16_t n = (MLX5_WQE_DS(copy_b) -
500 						      1 + 3) / 4;
501 
502 					if (unlikely(max_wqe < n))
503 						break;
504 					max_wqe -= n;
505 					rte_memcpy((void *)raw,
506 						   (void *)addr, copy_b);
507 					addr += copy_b;
508 					length -= copy_b;
509 					pkt_inline_sz += copy_b;
510 					/*
511 					 * Another DWORD will be added
512 					 * in the inline part.
513 					 */
514 					raw += MLX5_WQE_DS(copy_b) *
515 					       MLX5_WQE_DWORD_SIZE -
516 					       MLX5_WQE_DWORD_SIZE;
517 				} else {
518 					/* NOP WQE. */
519 					wqe->ctrl = (rte_v128u32_t){
520 						     htonl(txq->wqe_ci << 8),
521 						     htonl(txq->qp_num_8s | 1),
522 						     0,
523 						     0,
524 					};
525 					ds = 1;
526 					total_length = 0;
527 					k++;
528 					goto next_wqe;
529 				}
530 			}
531 		}
532 		/* Inline if enough room. */
533 		if (inline_en || tso) {
534 			uintptr_t end = (uintptr_t)
535 				(((uintptr_t)txq->wqes) +
536 				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
537 			unsigned int inline_room = max_inline *
538 						   RTE_CACHE_LINE_SIZE -
539 						   (pkt_inline_sz - 2);
540 			uintptr_t addr_end = (addr + inline_room) &
541 					     ~(RTE_CACHE_LINE_SIZE - 1);
542 			unsigned int copy_b = (addr_end > addr) ?
543 				RTE_MIN((addr_end - addr), length) :
544 				0;
545 
546 			raw += MLX5_WQE_DWORD_SIZE;
547 			if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
548 				/*
549 				 * One Dseg remains in the current WQE.  To
550 				 * keep the computation positive, it is
551 				 * removed after the bytes to Dseg conversion.
552 				 */
553 				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
554 
555 				if (unlikely(max_wqe < n))
556 					break;
557 				max_wqe -= n;
558 				if (tso) {
559 					uint32_t inl =
560 						htonl(copy_b | MLX5_INLINE_SEG);
561 
562 					pkt_inline_sz =
563 						MLX5_WQE_DS(tso_header_sz) *
564 						MLX5_WQE_DWORD_SIZE;
565 					rte_memcpy((void *)raw,
566 						   (void *)&inl, sizeof(inl));
567 					raw += sizeof(inl);
568 					pkt_inline_sz += sizeof(inl);
569 				}
570 				rte_memcpy((void *)raw, (void *)addr, copy_b);
571 				addr += copy_b;
572 				length -= copy_b;
573 				pkt_inline_sz += copy_b;
574 			}
575 			/*
576 			 * 2 DWORDs consumed by the WQE header + ETH segment +
577 			 * the size of the inline part of the packet.
578 			 */
579 			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
580 			if (length > 0) {
581 				if (ds % (MLX5_WQE_SIZE /
582 					  MLX5_WQE_DWORD_SIZE) == 0) {
583 					if (unlikely(--max_wqe == 0))
584 						break;
585 					dseg = (volatile rte_v128u32_t *)
586 					       tx_mlx5_wqe(txq, txq->wqe_ci +
587 							   ds / 4);
588 				} else {
589 					dseg = (volatile rte_v128u32_t *)
590 						((uintptr_t)wqe +
591 						 (ds * MLX5_WQE_DWORD_SIZE));
592 				}
593 				goto use_dseg;
594 			} else if (!segs_n) {
595 				goto next_pkt;
596 			} else {
597 				/* dseg will be advanced as part of next_seg. */
598 				dseg = (volatile rte_v128u32_t *)
599 					((uintptr_t)wqe +
600 					 ((ds - 1) * MLX5_WQE_DWORD_SIZE));
601 				goto next_seg;
602 			}
603 		} else {
604 			/*
605 			 * No inline has been done in the packet, only the
606 			 * Ethernet header has been stored.
607 			 */
608 			dseg = (volatile rte_v128u32_t *)
609 				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
610 			ds = 3;
611 use_dseg:
612 			/* Add the remaining packet as a simple ds. */
613 			naddr = htonll(addr);
614 			*dseg = (rte_v128u32_t){
615 				htonl(length),
616 				mlx5_tx_mb2mr(txq, buf),
617 				naddr,
618 				naddr >> 32,
619 			};
620 			++ds;
621 			if (!segs_n)
622 				goto next_pkt;
623 		}
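		/*
		 * next_seg: attach the remaining mbuf segments, one data
		 * segment each, spilling onto a new WQEBB whenever the
		 * current one runs out of room.
		 */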
624 next_seg:
625 		assert(buf);
626 		assert(ds);
627 		assert(wqe);
628 		/*
629 		 * Spill on next WQE when the current one does not have
630 		 * enough room left. Size of WQE must a be a multiple
631 		 * enough room left. Size of WQE must be a multiple
632 		 */
633 		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
634 		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
635 			if (unlikely(--max_wqe == 0))
636 				break;
637 			dseg = (volatile rte_v128u32_t *)
638 			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
639 			rte_prefetch0(tx_mlx5_wqe(txq,
640 						  txq->wqe_ci + ds / 4 + 1));
641 		} else {
642 			++dseg;
643 		}
644 		++ds;
645 		buf = buf->next;
646 		assert(buf);
647 		length = DATA_LEN(buf);
648 #ifdef MLX5_PMD_SOFT_COUNTERS
649 		total_length += length;
650 #endif
651 		/* Store segment information. */
652 		naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
653 		*dseg = (rte_v128u32_t){
654 			htonl(length),
655 			mlx5_tx_mb2mr(txq, buf),
656 			naddr,
657 			naddr >> 32,
658 		};
659 		(*txq->elts)[++elts_head & elts_m] = buf;
660 		++sg;
661 		/* Advance counter only if all segs are successfully posted. */
662 		if (sg < segs_n)
663 			goto next_seg;
664 		else
665 			j += sg;
666 next_pkt:
667 		++elts_head;
668 		++pkts;
669 		++i;
670 		/* Initialize known and common part of the WQE structure. */
671 		if (tso) {
672 			wqe->ctrl = (rte_v128u32_t){
673 				htonl((txq->wqe_ci << 8) | MLX5_OPCODE_TSO),
674 				htonl(txq->qp_num_8s | ds),
675 				0,
676 				0,
677 			};
678 			wqe->eseg = (rte_v128u32_t){
679 				0,
680 				cs_flags | (htons(tso_segsz) << 16),
681 				0,
682 				(ehdr << 16) | htons(tso_header_sz),
683 			};
684 		} else {
685 			wqe->ctrl = (rte_v128u32_t){
686 				htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
687 				htonl(txq->qp_num_8s | ds),
688 				0,
689 				0,
690 			};
691 			wqe->eseg = (rte_v128u32_t){
692 				0,
693 				cs_flags,
694 				0,
695 				(ehdr << 16) | htons(pkt_inline_sz),
696 			};
697 		}
698 next_wqe:
699 		txq->wqe_ci += (ds + 3) / 4;
700 		/* Save the last successful WQE for completion request */
701 		last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
702 #ifdef MLX5_PMD_SOFT_COUNTERS
703 		/* Increment sent bytes counter. */
704 		txq->stats.obytes += total_length;
705 #endif
706 	} while (i < pkts_n);
707 	/* Take a shortcut if nothing must be sent. */
708 	if (unlikely((i + k) == 0))
709 		return 0;
710 	txq->elts_head += (i + j);
711 	/* Check whether completion threshold has been reached. */
712 	comp = txq->elts_comp + i + j + k;
713 	if (comp >= MLX5_TX_COMP_THRESH) {
714 		/* Request completion on last WQE. */
715 		last_wqe->ctrl2 = htonl(8);
716 		/* Save elts_head in unused "immediate" field of WQE. */
717 		last_wqe->ctrl3 = txq->elts_head;
718 		txq->elts_comp = 0;
719 	} else {
720 		txq->elts_comp = comp;
721 	}
722 #ifdef MLX5_PMD_SOFT_COUNTERS
723 	/* Increment sent packets counter. */
724 	txq->stats.opackets += i;
725 #endif
726 	/* Ring QP doorbell. */
727 	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
728 	return i;
729 }
730 
731 /**
732  * Open an MPW session.
733  *
734  * @param txq
735  *   Pointer to TX queue structure.
736  * @param mpw
737  *   Pointer to MPW session structure.
738  * @param length
739  *   Packet length.
740  */
741 static inline void
742 mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
743 {
744 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
745 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
746 		(volatile struct mlx5_wqe_data_seg (*)[])
747 		tx_mlx5_wqe(txq, idx + 1);
748 
749 	mpw->state = MLX5_MPW_STATE_OPENED;
750 	mpw->pkts_n = 0;
751 	mpw->len = length;
752 	mpw->total_len = 0;
753 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
754 	mpw->wqe->eseg.mss = htons(length);
755 	mpw->wqe->eseg.inline_hdr_sz = 0;
756 	mpw->wqe->eseg.rsvd0 = 0;
757 	mpw->wqe->eseg.rsvd1 = 0;
758 	mpw->wqe->eseg.rsvd2 = 0;
759 	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
760 				  (txq->wqe_ci << 8) | MLX5_OPCODE_TSO);
761 	mpw->wqe->ctrl[2] = 0;
762 	mpw->wqe->ctrl[3] = 0;
763 	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
764 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
765 	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
766 		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
767 	mpw->data.dseg[2] = &(*dseg)[0];
768 	mpw->data.dseg[3] = &(*dseg)[1];
769 	mpw->data.dseg[4] = &(*dseg)[2];
770 }
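/*
 * Note: the first WQEBB of a legacy MPW session holds the control and
 * Ethernet segments plus two data segments; the remaining slots
 * (dseg[2..4]) live in the following WQEBB, which is why a session can
 * consume up to two WQEBBs.
 */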
771 
772 /**
773  * Close an MPW session.
774  *
775  * @param txq
776  *   Pointer to TX queue structure.
777  * @param mpw
778  *   Pointer to MPW session structure.
779  */
780 static inline void
781 mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
782 {
783 	unsigned int num = mpw->pkts_n;
784 
785 	/*
786 	 * Store size in multiples of 16 bytes. Control and Ethernet segments
787 	 * count as 2.
788 	 */
789 	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
790 	mpw->state = MLX5_MPW_STATE_CLOSED;
791 	if (num < 3)
792 		++txq->wqe_ci;
793 	else
794 		txq->wqe_ci += 2;
795 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
796 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
797 }
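/*
 * Note: the DS field set above counts 16-byte units, 2 for the control and
 * Ethernet segments plus one per packed data segment. With fewer than 3
 * packets the session fits in a single WQEBB, hence wqe_ci advances by 1,
 * otherwise by 2.
 */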
798 
799 /**
800  * DPDK callback for TX with MPW support.
801  *
802  * @param dpdk_txq
803  *   Generic pointer to TX queue structure.
804  * @param[in] pkts
805  *   Packets to transmit.
806  * @param pkts_n
807  *   Number of packets in array.
808  *
809  * @return
810  *   Number of packets successfully transmitted (<= pkts_n).
811  */
812 uint16_t
813 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
814 {
815 	struct txq *txq = (struct txq *)dpdk_txq;
816 	uint16_t elts_head = txq->elts_head;
817 	const uint16_t elts_n = 1 << txq->elts_n;
818 	const uint16_t elts_m = elts_n - 1;
819 	unsigned int i = 0;
820 	unsigned int j = 0;
821 	uint16_t max_elts;
822 	uint16_t max_wqe;
823 	unsigned int comp;
824 	struct mlx5_mpw mpw = {
825 		.state = MLX5_MPW_STATE_CLOSED,
826 	};
827 
828 	if (unlikely(!pkts_n))
829 		return 0;
830 	/* Prefetch first packet cacheline. */
831 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
832 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
833 	/* Start processing. */
834 	mlx5_tx_complete(txq);
835 	max_elts = (elts_n - (elts_head - txq->elts_tail));
836 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
837 	if (unlikely(!max_wqe))
838 		return 0;
839 	do {
840 		struct rte_mbuf *buf = *(pkts++);
841 		uint32_t length;
842 		unsigned int segs_n = buf->nb_segs;
843 		uint32_t cs_flags = 0;
844 
845 		/*
846 		 * Make sure there is enough room to store this packet and
847 		 * that one ring entry remains unused.
848 		 */
849 		assert(segs_n);
850 		if (max_elts < segs_n)
851 			break;
852 		/* Do not bother with large packets MPW cannot handle. */
853 		if (segs_n > MLX5_MPW_DSEG_MAX)
854 			break;
855 		max_elts -= segs_n;
856 		--pkts_n;
857 		/* Should we enable HW CKSUM offload? */
858 		if (buf->ol_flags &
859 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
860 			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
861 		/* Retrieve packet information. */
862 		length = PKT_LEN(buf);
863 		assert(length);
864 		/* Start new session if packet differs. */
865 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
866 		    ((mpw.len != length) ||
867 		     (segs_n != 1) ||
868 		     (mpw.wqe->eseg.cs_flags != cs_flags)))
869 			mlx5_mpw_close(txq, &mpw);
870 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
871 			/*
872 			 * Multi-Packet WQE consumes at most two WQEs.
873 			 * mlx5_mpw_new() expects to be able to use such
874 			 * resources.
875 			 */
876 			if (unlikely(max_wqe < 2))
877 				break;
878 			max_wqe -= 2;
879 			mlx5_mpw_new(txq, &mpw, length);
880 			mpw.wqe->eseg.cs_flags = cs_flags;
881 		}
882 		/* Multi-segment packets must be alone in their MPW. */
883 		assert((segs_n == 1) || (mpw.pkts_n == 0));
884 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
885 		length = 0;
886 #endif
887 		do {
888 			volatile struct mlx5_wqe_data_seg *dseg;
889 			uintptr_t addr;
890 
891 			assert(buf);
892 			(*txq->elts)[elts_head++ & elts_m] = buf;
893 			dseg = mpw.data.dseg[mpw.pkts_n];
894 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
895 			*dseg = (struct mlx5_wqe_data_seg){
896 				.byte_count = htonl(DATA_LEN(buf)),
897 				.lkey = mlx5_tx_mb2mr(txq, buf),
898 				.addr = htonll(addr),
899 			};
900 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
901 			length += DATA_LEN(buf);
902 #endif
903 			buf = buf->next;
904 			++mpw.pkts_n;
905 			++j;
906 		} while (--segs_n);
907 		assert(length == mpw.len);
908 		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
909 			mlx5_mpw_close(txq, &mpw);
910 #ifdef MLX5_PMD_SOFT_COUNTERS
911 		/* Increment sent bytes counter. */
912 		txq->stats.obytes += length;
913 #endif
914 		++i;
915 	} while (pkts_n);
916 	/* Take a shortcut if nothing must be sent. */
917 	if (unlikely(i == 0))
918 		return 0;
919 	/* Check whether completion threshold has been reached. */
920 	/* "j" includes both packets and segments. */
921 	comp = txq->elts_comp + j;
922 	if (comp >= MLX5_TX_COMP_THRESH) {
923 		volatile struct mlx5_wqe *wqe = mpw.wqe;
924 
925 		/* Request completion on last WQE. */
926 		wqe->ctrl[2] = htonl(8);
927 		/* Save elts_head in unused "immediate" field of WQE. */
928 		wqe->ctrl[3] = elts_head;
929 		txq->elts_comp = 0;
930 	} else {
931 		txq->elts_comp = comp;
932 	}
933 #ifdef MLX5_PMD_SOFT_COUNTERS
934 	/* Increment sent packets counter. */
935 	txq->stats.opackets += i;
936 #endif
937 	/* Ring QP doorbell. */
938 	if (mpw.state == MLX5_MPW_STATE_OPENED)
939 		mlx5_mpw_close(txq, &mpw);
940 	mlx5_tx_dbrec(txq, mpw.wqe);
941 	txq->elts_head = elts_head;
942 	return i;
943 }
944 
945 /**
946  * Open an MPW inline session.
947  *
948  * @param txq
949  *   Pointer to TX queue structure.
950  * @param mpw
951  *   Pointer to MPW session structure.
952  * @param length
953  *   Packet length.
954  */
955 static inline void
956 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
957 {
958 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
959 	struct mlx5_wqe_inl_small *inl;
960 
961 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
962 	mpw->pkts_n = 0;
963 	mpw->len = length;
964 	mpw->total_len = 0;
965 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
966 	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
967 				  (txq->wqe_ci << 8) |
968 				  MLX5_OPCODE_TSO);
969 	mpw->wqe->ctrl[2] = 0;
970 	mpw->wqe->ctrl[3] = 0;
971 	mpw->wqe->eseg.mss = htons(length);
972 	mpw->wqe->eseg.inline_hdr_sz = 0;
973 	mpw->wqe->eseg.cs_flags = 0;
974 	mpw->wqe->eseg.rsvd0 = 0;
975 	mpw->wqe->eseg.rsvd1 = 0;
976 	mpw->wqe->eseg.rsvd2 = 0;
977 	inl = (struct mlx5_wqe_inl_small *)
978 		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
979 	mpw->data.raw = (uint8_t *)&inl->raw;
980 }
981 
982 /**
983  * Close an MPW inline session.
984  *
985  * @param txq
986  *   Pointer to TX queue structure.
987  * @param mpw
988  *   Pointer to MPW session structure.
989  */
990 static inline void
991 mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
992 {
993 	unsigned int size;
994 	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
995 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
996 
997 	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
998 	/*
999 	 * Store size in multiples of 16 bytes. Control and Ethernet segments
1000 	 * count as 2.
1001 	 */
1002 	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
1003 	mpw->state = MLX5_MPW_STATE_CLOSED;
1004 	inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
1005 	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1006 }
1007 
1008 /**
1009  * DPDK callback for TX with MPW inline support.
1010  *
1011  * @param dpdk_txq
1012  *   Generic pointer to TX queue structure.
1013  * @param[in] pkts
1014  *   Packets to transmit.
1015  * @param pkts_n
1016  *   Number of packets in array.
1017  *
1018  * @return
1019  *   Number of packets successfully transmitted (<= pkts_n).
1020  */
1021 uint16_t
1022 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
1023 			 uint16_t pkts_n)
1024 {
1025 	struct txq *txq = (struct txq *)dpdk_txq;
1026 	uint16_t elts_head = txq->elts_head;
1027 	const uint16_t elts_n = 1 << txq->elts_n;
1028 	const uint16_t elts_m = elts_n - 1;
1029 	unsigned int i = 0;
1030 	unsigned int j = 0;
1031 	uint16_t max_elts;
1032 	uint16_t max_wqe;
1033 	unsigned int comp;
1034 	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
1035 	struct mlx5_mpw mpw = {
1036 		.state = MLX5_MPW_STATE_CLOSED,
1037 	};
1038 	/*
1039 	 * Compute the maximum number of WQEs which can be consumed by inline
1040 	 * code.
1041 	 * - 2 DSEG for:
1042 	 *   - 1 control segment,
1043 	 *   - 1 Ethernet segment,
1044 	 * - N Dseg from the inline request.
1045 	 */
1046 	const unsigned int wqe_inl_n =
1047 		((2 * MLX5_WQE_DWORD_SIZE +
1048 		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
1049 		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
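	/*
	 * A sketch of the arithmetic, assuming 64-byte cache lines and a
	 * 16-byte MLX5_WQE_DWORD_SIZE: with max_inline = 4,
	 * wqe_inl_n = (2 * 16 + 4 * 64 + 63) / 64 = 5 WQEBBs per inline
	 * session.
	 */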
1050 
1051 	if (unlikely(!pkts_n))
1052 		return 0;
1053 	/* Prefetch first packet cacheline. */
1054 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1055 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1056 	/* Start processing. */
1057 	mlx5_tx_complete(txq);
1058 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1059 	do {
1060 		struct rte_mbuf *buf = *(pkts++);
1061 		uintptr_t addr;
1062 		uint32_t length;
1063 		unsigned int segs_n = buf->nb_segs;
1064 		uint32_t cs_flags = 0;
1065 
1066 		/*
1067 		 * Make sure there is enough room to store this packet and
1068 		 * that one ring entry remains unused.
1069 		 */
1070 		assert(segs_n);
1071 		if (max_elts < segs_n)
1072 			break;
1073 		/* Do not bother with large packets MPW cannot handle. */
1074 		if (segs_n > MLX5_MPW_DSEG_MAX)
1075 			break;
1076 		max_elts -= segs_n;
1077 		--pkts_n;
1078 		/*
1079 		 * Compute max_wqe in case fewer WQEs were consumed in the
1080 		 * previous iteration.
1081 		 */
1082 		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1083 		/* Should we enable HW CKSUM offload? */
1084 		if (buf->ol_flags &
1085 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
1086 			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
1087 		/* Retrieve packet information. */
1088 		length = PKT_LEN(buf);
1089 		/* Start new session if packet differs. */
1090 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1091 			if ((mpw.len != length) ||
1092 			    (segs_n != 1) ||
1093 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1094 				mlx5_mpw_close(txq, &mpw);
1095 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
1096 			if ((mpw.len != length) ||
1097 			    (segs_n != 1) ||
1098 			    (length > inline_room) ||
1099 			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
1100 				mlx5_mpw_inline_close(txq, &mpw);
1101 				inline_room =
1102 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1103 			}
1104 		}
1105 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1106 			if ((segs_n != 1) ||
1107 			    (length > inline_room)) {
1108 				/*
1109 				 * Multi-Packet WQE consumes at most two WQEs.
1110 				 * mlx5_mpw_new() expects to be able to use
1111 				 * such resources.
1112 				 */
1113 				if (unlikely(max_wqe < 2))
1114 					break;
1115 				max_wqe -= 2;
1116 				mlx5_mpw_new(txq, &mpw, length);
1117 				mpw.wqe->eseg.cs_flags = cs_flags;
1118 			} else {
1119 				if (unlikely(max_wqe < wqe_inl_n))
1120 					break;
1121 				max_wqe -= wqe_inl_n;
1122 				mlx5_mpw_inline_new(txq, &mpw, length);
1123 				mpw.wqe->eseg.cs_flags = cs_flags;
1124 			}
1125 		}
1126 		/* Multi-segment packets must be alone in their MPW. */
1127 		assert((segs_n == 1) || (mpw.pkts_n == 0));
1128 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1129 			assert(inline_room ==
1130 			       txq->max_inline * RTE_CACHE_LINE_SIZE);
1131 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1132 			length = 0;
1133 #endif
1134 			do {
1135 				volatile struct mlx5_wqe_data_seg *dseg;
1136 
1137 				assert(buf);
1138 				(*txq->elts)[elts_head++ & elts_m] = buf;
1139 				dseg = mpw.data.dseg[mpw.pkts_n];
1140 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
1141 				*dseg = (struct mlx5_wqe_data_seg){
1142 					.byte_count = htonl(DATA_LEN(buf)),
1143 					.lkey = mlx5_tx_mb2mr(txq, buf),
1144 					.addr = htonll(addr),
1145 				};
1146 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1147 				length += DATA_LEN(buf);
1148 #endif
1149 				buf = buf->next;
1150 				++mpw.pkts_n;
1151 				++j;
1152 			} while (--segs_n);
1153 			assert(length == mpw.len);
1154 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1155 				mlx5_mpw_close(txq, &mpw);
1156 		} else {
1157 			unsigned int max;
1158 
1159 			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
1160 			assert(length <= inline_room);
1161 			assert(length == DATA_LEN(buf));
1162 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1163 			(*txq->elts)[elts_head++ & elts_m] = buf;
1164 			/* Maximum number of bytes before wrapping. */
1165 			max = ((((uintptr_t)(txq->wqes)) +
1166 				(1 << txq->wqe_n) *
1167 				MLX5_WQE_SIZE) -
1168 			       (uintptr_t)mpw.data.raw);
1169 			if (length > max) {
1170 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1171 					   (void *)addr,
1172 					   max);
1173 				mpw.data.raw = (volatile void *)txq->wqes;
1174 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1175 					   (void *)(addr + max),
1176 					   length - max);
1177 				mpw.data.raw += length - max;
1178 			} else {
1179 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1180 					   (void *)addr,
1181 					   length);
1182 
1183 				if (length == max)
1184 					mpw.data.raw =
1185 						(volatile void *)txq->wqes;
1186 				else
1187 					mpw.data.raw += length;
1188 			}
1189 			++mpw.pkts_n;
1190 			mpw.total_len += length;
1191 			++j;
1192 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
1193 				mlx5_mpw_inline_close(txq, &mpw);
1194 				inline_room =
1195 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1196 			} else {
1197 				inline_room -= length;
1198 			}
1199 		}
1200 #ifdef MLX5_PMD_SOFT_COUNTERS
1201 		/* Increment sent bytes counter. */
1202 		txq->stats.obytes += length;
1203 #endif
1204 		++i;
1205 	} while (pkts_n);
1206 	/* Take a shortcut if nothing must be sent. */
1207 	if (unlikely(i == 0))
1208 		return 0;
1209 	/* Check whether completion threshold has been reached. */
1210 	/* "j" includes both packets and segments. */
1211 	comp = txq->elts_comp + j;
1212 	if (comp >= MLX5_TX_COMP_THRESH) {
1213 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1214 
1215 		/* Request completion on last WQE. */
1216 		wqe->ctrl[2] = htonl(8);
1217 		/* Save elts_head in unused "immediate" field of WQE. */
1218 		wqe->ctrl[3] = elts_head;
1219 		txq->elts_comp = 0;
1220 	} else {
1221 		txq->elts_comp = comp;
1222 	}
1223 #ifdef MLX5_PMD_SOFT_COUNTERS
1224 	/* Increment sent packets counter. */
1225 	txq->stats.opackets += i;
1226 #endif
1227 	/* Ring QP doorbell. */
1228 	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
1229 		mlx5_mpw_inline_close(txq, &mpw);
1230 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
1231 		mlx5_mpw_close(txq, &mpw);
1232 	mlx5_tx_dbrec(txq, mpw.wqe);
1233 	txq->elts_head = elts_head;
1234 	return i;
1235 }
1236 
1237 /**
1238  * Open an Enhanced MPW session.
1239  *
1240  * @param txq
1241  *   Pointer to TX queue structure.
1242  * @param mpw
1243  *   Pointer to MPW session structure.
1244  * @param length
1245  *   Packet length.
1246  */
1247 static inline void
1248 mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding)
1249 {
1250 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1251 
1252 	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
1253 	mpw->pkts_n = 0;
1254 	mpw->total_len = sizeof(struct mlx5_wqe);
1255 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1256 	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
1257 				  (txq->wqe_ci << 8) |
1258 				  MLX5_OPCODE_ENHANCED_MPSW);
1259 	mpw->wqe->ctrl[2] = 0;
1260 	mpw->wqe->ctrl[3] = 0;
1261 	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
1262 	if (unlikely(padding)) {
1263 		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
1264 
1265 		/* Pad the first 2 DWORDs with zero-length inline header. */
1266 		*(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG);
1267 		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
1268 			htonl(MLX5_INLINE_SEG);
1269 		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
1270 		/* Start from the next WQEBB. */
1271 		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
1272 	} else {
1273 		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
1274 	}
1275 }
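/*
 * Note: when padding is requested, the two data-segment slots of the title
 * WQEBB are filled with zero-length inline headers so that packet data
 * starts on the next WQEBB; otherwise data is packed right after the
 * Ethernet segment.
 */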
1276 
1277 /**
1278  * Close an Enhanced MPW session.
1279  *
1280  * @param txq
1281  *   Pointer to TX queue structure.
1282  * @param mpw
1283  *   Pointer to MPW session structure.
1284  *
1285  * @return
1286  *   Number of consumed WQEs.
1287  */
1288 static inline uint16_t
1289 mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw)
1290 {
1291 	uint16_t ret;
1292 
1293 	/* Store size in multiples of 16 bytes. Control and Ethernet segments
1294 	 * count as 2.
1295 	 */
1296 	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len));
1297 	mpw->state = MLX5_MPW_STATE_CLOSED;
1298 	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1299 	txq->wqe_ci += ret;
1300 	return ret;
1301 }
1302 
1303 /**
1304  * DPDK callback for TX with Enhanced MPW support.
1305  *
1306  * @param dpdk_txq
1307  *   Generic pointer to TX queue structure.
1308  * @param[in] pkts
1309  *   Packets to transmit.
1310  * @param pkts_n
1311  *   Number of packets in array.
1312  *
1313  * @return
1314  *   Number of packets successfully transmitted (<= pkts_n).
1315  */
1316 uint16_t
1317 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1318 {
1319 	struct txq *txq = (struct txq *)dpdk_txq;
1320 	uint16_t elts_head = txq->elts_head;
1321 	const uint16_t elts_n = 1 << txq->elts_n;
1322 	const uint16_t elts_m = elts_n - 1;
1323 	unsigned int i = 0;
1324 	unsigned int j = 0;
1325 	uint16_t max_elts;
1326 	uint16_t max_wqe;
1327 	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
1328 	unsigned int mpw_room = 0;
1329 	unsigned int inl_pad = 0;
1330 	uint32_t inl_hdr;
1331 	struct mlx5_mpw mpw = {
1332 		.state = MLX5_MPW_STATE_CLOSED,
1333 	};
1334 
1335 	if (unlikely(!pkts_n))
1336 		return 0;
1337 	/* Start processing. */
1338 	mlx5_tx_complete(txq);
1339 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1340 	/* A CQE slot must always be available. */
1341 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1342 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1343 	if (unlikely(!max_wqe))
1344 		return 0;
1345 	do {
1346 		struct rte_mbuf *buf = *(pkts++);
1347 		uintptr_t addr;
1348 		uint64_t naddr;
1349 		unsigned int n;
1350 		unsigned int do_inline = 0; /* Whether inline is possible. */
1351 		uint32_t length;
1352 		unsigned int segs_n = buf->nb_segs;
1353 		uint32_t cs_flags = 0;
1354 
1355 		/*
1356 		 * Make sure there is enough room to store this packet and
1357 		 * that one ring entry remains unused.
1358 		 */
1359 		assert(segs_n);
1360 		if (max_elts - j < segs_n)
1361 			break;
1362 		/* Do not bother with large packets MPW cannot handle. */
1363 		if (segs_n > MLX5_MPW_DSEG_MAX)
1364 			break;
1365 		/* Should we enable HW CKSUM offload? */
1366 		if (buf->ol_flags &
1367 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
1368 			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
1369 		/* Retrieve packet information. */
1370 		length = PKT_LEN(buf);
1371 		/* Start new session if:
1372 		 * - multi-segment packet
1373 		 * - no space left even for a dseg
1374 		 * - the next packet could be inlined but no longer fits,
1375 		 * - cs_flags differs.
1376 		 * The state can't be MLX5_MPW_STATE_OPENED here, as a legacy
1377 		 * MPW session always holds a single multi-segmented packet.
1378 		 */
1379 		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
1380 			if ((segs_n != 1) ||
1381 			    (inl_pad + sizeof(struct mlx5_wqe_data_seg) >
1382 			      mpw_room) ||
1383 			    (length <= txq->inline_max_packet_sz &&
1384 			     inl_pad + sizeof(inl_hdr) + length >
1385 			      mpw_room) ||
1386 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1387 				max_wqe -= mlx5_empw_close(txq, &mpw);
1388 		}
1389 		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
1390 			if (unlikely(segs_n != 1)) {
1391 				/* Fall back to legacy MPW.
1392 				 * An MPW session consumes 2 WQEs at most to
1393 				 * include MLX5_MPW_DSEG_MAX pointers.
1394 				 */
1395 				if (unlikely(max_wqe < 2))
1396 					break;
1397 				mlx5_mpw_new(txq, &mpw, length);
1398 			} else {
1399 				/* In Enhanced MPW, inline as much as the budget
1400 				 * allows. The remaining space is to be
1401 				 * filled with dsegs. If the title WQEBB isn't
1402 				 * padded, it will have 2 dsegs there.
1403 				 */
1404 				mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
1405 					    (max_inline ? max_inline :
1406 					     pkts_n * MLX5_WQE_DWORD_SIZE) +
1407 					    MLX5_WQE_SIZE);
1408 				if (unlikely(max_wqe * MLX5_WQE_SIZE <
1409 					      mpw_room))
1410 					break;
1411 				/* Don't pad the title WQEBB, to avoid wasting WQ space. */
1412 				mlx5_empw_new(txq, &mpw, 0);
1413 				mpw_room -= mpw.total_len;
1414 				inl_pad = 0;
1415 				do_inline =
1416 					length <= txq->inline_max_packet_sz &&
1417 					sizeof(inl_hdr) + length <= mpw_room &&
1418 					!txq->mpw_hdr_dseg;
1419 			}
1420 			mpw.wqe->eseg.cs_flags = cs_flags;
1421 		} else {
1422 			/* Evaluate whether the next packet can be inlined.
1423 			 * Inlining is possible when:
1424 			 * - the length is less than the configured value,
1425 			 * - the length fits in the remaining space,
1426 			 * - it is not required to fill the title WQEBB with dsegs.
1427 			 */
1428 			do_inline =
1429 				length <= txq->inline_max_packet_sz &&
1430 				inl_pad + sizeof(inl_hdr) + length <=
1431 				 mpw_room &&
1432 				(!txq->mpw_hdr_dseg ||
1433 				 mpw.total_len >= MLX5_WQE_SIZE);
1434 		}
1435 		/* Multi-segment packets must be alone in their MPW. */
1436 		assert((segs_n == 1) || (mpw.pkts_n == 0));
1437 		if (unlikely(mpw.state == MLX5_MPW_STATE_OPENED)) {
1438 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1439 			length = 0;
1440 #endif
1441 			do {
1442 				volatile struct mlx5_wqe_data_seg *dseg;
1443 
1444 				assert(buf);
1445 				(*txq->elts)[elts_head++ & elts_m] = buf;
1446 				dseg = mpw.data.dseg[mpw.pkts_n];
1447 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
1448 				*dseg = (struct mlx5_wqe_data_seg){
1449 					.byte_count = htonl(DATA_LEN(buf)),
1450 					.lkey = mlx5_tx_mb2mr(txq, buf),
1451 					.addr = htonll(addr),
1452 				};
1453 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1454 				length += DATA_LEN(buf);
1455 #endif
1456 				buf = buf->next;
1457 				++j;
1458 				++mpw.pkts_n;
1459 			} while (--segs_n);
1460 			/* A multi-segmented packet takes one MPW session.
1461 			 * TODO: Pack more multi-segmented packets if possible.
1462 			 */
1463 			mlx5_mpw_close(txq, &mpw);
1464 			if (mpw.pkts_n < 3)
1465 				max_wqe--;
1466 			else
1467 				max_wqe -= 2;
1468 		} else if (do_inline) {
1469 			/* Inline packet into WQE. */
1470 			unsigned int max;
1471 
1472 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1473 			assert(length == DATA_LEN(buf));
1474 			inl_hdr = htonl(length | MLX5_INLINE_SEG);
1475 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1476 			mpw.data.raw = (volatile void *)
1477 				((uintptr_t)mpw.data.raw + inl_pad);
1478 			max = tx_mlx5_wq_tailroom(txq,
1479 					(void *)(uintptr_t)mpw.data.raw);
1480 			/* Copy inline header. */
1481 			mpw.data.raw = (volatile void *)
1482 				mlx5_copy_to_wq(
1483 					  (void *)(uintptr_t)mpw.data.raw,
1484 					  &inl_hdr,
1485 					  sizeof(inl_hdr),
1486 					  (void *)(uintptr_t)txq->wqes,
1487 					  max);
1488 			max = tx_mlx5_wq_tailroom(txq,
1489 					(void *)(uintptr_t)mpw.data.raw);
1490 			/* Copy packet data. */
1491 			mpw.data.raw = (volatile void *)
1492 				mlx5_copy_to_wq(
1493 					  (void *)(uintptr_t)mpw.data.raw,
1494 					  (void *)addr,
1495 					  length,
1496 					  (void *)(uintptr_t)txq->wqes,
1497 					  max);
1498 			++mpw.pkts_n;
1499 			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
1500 			/* No need to get completion as the entire packet is
1501 			 * copied to WQ. Free the buf right away.
1502 			 */
1503 			rte_pktmbuf_free_seg(buf);
1504 			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
1505 			/* Add pad in the next packet if any. */
1506 			inl_pad = (((uintptr_t)mpw.data.raw +
1507 					(MLX5_WQE_DWORD_SIZE - 1)) &
1508 					~(MLX5_WQE_DWORD_SIZE - 1)) -
1509 				  (uintptr_t)mpw.data.raw;
1510 		} else {
1511 			/* No inline. Load a dseg of packet pointer. */
1512 			volatile rte_v128u32_t *dseg;
1513 
1514 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1515 			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
1516 			assert(length == DATA_LEN(buf));
1517 			if (!tx_mlx5_wq_tailroom(txq,
1518 					(void *)((uintptr_t)mpw.data.raw
1519 						+ inl_pad)))
1520 				dseg = (volatile void *)txq->wqes;
1521 			else
1522 				dseg = (volatile void *)
1523 					((uintptr_t)mpw.data.raw +
1524 					 inl_pad);
1525 			(*txq->elts)[elts_head++ & elts_m] = buf;
1526 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1527 			for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++)
1528 				rte_prefetch2((void *)(addr +
1529 						n * RTE_CACHE_LINE_SIZE));
1530 			naddr = htonll(addr);
1531 			*dseg = (rte_v128u32_t) {
1532 				htonl(length),
1533 				mlx5_tx_mb2mr(txq, buf),
1534 				naddr,
1535 				naddr >> 32,
1536 			};
1537 			mpw.data.raw = (volatile void *)(dseg + 1);
1538 			mpw.total_len += (inl_pad + sizeof(*dseg));
1539 			++j;
1540 			++mpw.pkts_n;
1541 			mpw_room -= (inl_pad + sizeof(*dseg));
1542 			inl_pad = 0;
1543 		}
1544 #ifdef MLX5_PMD_SOFT_COUNTERS
1545 		/* Increment sent bytes counter. */
1546 		txq->stats.obytes += length;
1547 #endif
1548 		++i;
1549 	} while (i < pkts_n);
1550 	/* Take a shortcut if nothing must be sent. */
1551 	if (unlikely(i == 0))
1552 		return 0;
1553 	/* Check whether completion threshold has been reached. */
1554 	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
1555 			(uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
1556 			 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
1557 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1558 
1559 		/* Request completion on last WQE. */
1560 		wqe->ctrl[2] = htonl(8);
1561 		/* Save elts_head in unused "immediate" field of WQE. */
1562 		wqe->ctrl[3] = elts_head;
1563 		txq->elts_comp = 0;
1564 		txq->mpw_comp = txq->wqe_ci;
1565 		txq->cq_pi++;
1566 	} else {
1567 		txq->elts_comp += j;
1568 	}
1569 #ifdef MLX5_PMD_SOFT_COUNTERS
1570 	/* Increment sent packets counter. */
1571 	txq->stats.opackets += i;
1572 #endif
1573 	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
1574 		mlx5_empw_close(txq, &mpw);
1575 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
1576 		mlx5_mpw_close(txq, &mpw);
1577 	/* Ring QP doorbell. */
1578 	mlx5_tx_dbrec(txq, mpw.wqe);
1579 	txq->elts_head = elts_head;
1580 	return i;
1581 }
1582 
1583 /**
1584  * Translate RX completion flags to packet type.
1585  *
1586  * @param[in] cqe
1587  *   Pointer to CQE.
1588  *
1589  * @note: update mlx5_dev_supported_ptypes_get() if anything changes here.
1590  *
1591  * @return
1592  *   Packet type for struct rte_mbuf.
1593  */
1594 static inline uint32_t
1595 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
1596 {
1597 	uint8_t idx;
1598 	uint8_t pinfo = cqe->pkt_info;
1599 	uint16_t ptype = cqe->hdr_type_etc;
1600 
1601 	/*
1602 	 * The index to the array should have:
1603 	 * bit[1:0] = l3_hdr_type
1604 	 * bit[4:2] = l4_hdr_type
1605 	 * bit[5] = ip_frag
1606 	 * bit[6] = tunneled
1607 	 * bit[7] = outer_l3_type
1608 	 */
1609 	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
1610 	return mlx5_ptype_table[idx];
1611 }
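/*
 * Note: the index packs the tunnel/outer-L3 bits from pkt_info together
 * with the fragment, L4 and L3 type bits from hdr_type_etc, and is used to
 * look up mlx5_ptype_table[], which is filled by mlx5_set_ptype_table()
 * above.
 */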
1612 
1613 /**
1614  * Get size of the next packet for a given CQE. For compressed CQEs, the
1615  * consumer index is updated only once all packets of the current one have
1616  * been processed.
1617  *
1618  * @param rxq
1619  *   Pointer to RX queue.
1620  * @param cqe
1621  *   CQE to process.
1622  * @param[out] rss_hash
1623  *   Packet RSS Hash result.
1624  *
1625  * @return
1626  *   Packet size in bytes (0 if there is none), -1 in case of completion
1627  *   with error.
1628  */
1629 static inline int
1630 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
1631 		 uint16_t cqe_cnt, uint32_t *rss_hash)
1632 {
1633 	struct rxq_zip *zip = &rxq->zip;
1634 	uint16_t cqe_n = cqe_cnt + 1;
1635 	int len = 0;
1636 	uint16_t idx, end;
1637 
1638 	/* Process compressed data in the CQE and mini arrays. */
1639 	if (zip->ai) {
1640 		volatile struct mlx5_mini_cqe8 (*mc)[8] =
1641 			(volatile struct mlx5_mini_cqe8 (*)[8])
1642 			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info);
1643 
1644 		len = ntohl((*mc)[zip->ai & 7].byte_cnt);
1645 		*rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result);
1646 		if ((++zip->ai & 7) == 0) {
1647 			/* Invalidate consumed CQEs */
1648 			idx = zip->ca;
1649 			end = zip->na;
1650 			while (idx != end) {
1651 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1652 					MLX5_CQE_INVALIDATE;
1653 				++idx;
1654 			}
1655 			/*
1656 			 * Increment consumer index to skip the number of
1657 			 * CQEs consumed. Hardware leaves holes in the CQ
1658 			 * ring for software use.
1659 			 */
1660 			zip->ca = zip->na;
1661 			zip->na += 8;
1662 		}
1663 		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
1664 			/* Invalidate the rest */
1665 			idx = zip->ca;
1666 			end = zip->cq_ci;
1667 
1668 			while (idx != end) {
1669 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1670 					MLX5_CQE_INVALIDATE;
1671 				++idx;
1672 			}
1673 			rxq->cq_ci = zip->cq_ci;
1674 			zip->ai = 0;
1675 		}
1676 	/* No compressed data, get next CQE and verify if it is compressed. */
1677 	} else {
1678 		int ret;
1679 		int8_t op_own;
1680 
1681 		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
1682 		if (unlikely(ret == 1))
1683 			return 0;
1684 		++rxq->cq_ci;
1685 		op_own = cqe->op_own;
1686 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
1687 			volatile struct mlx5_mini_cqe8 (*mc)[8] =
1688 				(volatile struct mlx5_mini_cqe8 (*)[8])
1689 				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
1690 							  cqe_cnt].pkt_info);
1691 
1692 			/* Fix endianness. */
1693 			zip->cqe_cnt = ntohl(cqe->byte_cnt);
1694 			/*
1695 			 * Current mini array position is the one returned by
1696 			 * check_cqe().
1697 			 *
1698 			 * If completion comprises several mini arrays, as a
1699 			 * special case the second one is located 7 CQEs after
1700 			 * the initial CQE instead of 8 for subsequent ones.
1701 			 */
1702 			zip->ca = rxq->cq_ci;
1703 			zip->na = zip->ca + 7;
1704 			/* Compute the next non compressed CQE. */
1705 			--rxq->cq_ci;
1706 			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
1707 			/* Get packet size to return. */
1708 			len = ntohl((*mc)[0].byte_cnt);
1709 			*rss_hash = ntohl((*mc)[0].rx_hash_result);
1710 			zip->ai = 1;
1711 			/* Prefetch all the entries to be invalidated */
1712 			idx = zip->ca;
1713 			end = zip->cq_ci;
1714 			while (idx != end) {
1715 				rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]);
1716 				++idx;
1717 			}
1718 		} else {
1719 			len = ntohl(cqe->byte_cnt);
1720 			*rss_hash = ntohl(cqe->rx_hash_res);
1721 		}
1722 		/* Error while receiving packet. */
1723 		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
1724 			return -1;
1725 	}
1726 	return len;
1727 }
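/*
 * Note: the rxq_zip fields track an ongoing compressed session: ai is the
 * position inside the current mini-CQE array, ca and na the current and
 * next array locations in the CQ ring, cqe_cnt the number of packets the
 * compressed CQE stands for, and cq_ci the consumer index to expose once
 * the whole session has been drained.
 */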
1728 
1729 /**
1730  * Translate RX completion flags to offload flags.
1731  *
1732  * @param[in] rxq
1733  *   Pointer to RX queue structure.
1734  * @param[in] cqe
1735  *   Pointer to CQE.
1736  *
1737  * @return
1738  *   Offload flags (ol_flags) for struct rte_mbuf.
1739  */
1740 static inline uint32_t
1741 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
1742 {
1743 	uint32_t ol_flags = 0;
1744 	uint16_t flags = ntohs(cqe->hdr_type_etc);
1745 
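	/*
	 * TRANSPOSE() moves each validity bit from its position in the CQE
	 * hdr_type_etc field to the corresponding mbuf ol_flags position.
	 */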
1746 	ol_flags =
1747 		TRANSPOSE(flags,
1748 			  MLX5_CQE_RX_L3_HDR_VALID,
1749 			  PKT_RX_IP_CKSUM_GOOD) |
1750 		TRANSPOSE(flags,
1751 			  MLX5_CQE_RX_L4_HDR_VALID,
1752 			  PKT_RX_L4_CKSUM_GOOD);
1753 	if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
1754 		ol_flags |=
1755 			TRANSPOSE(flags,
1756 				  MLX5_CQE_RX_L3_HDR_VALID,
1757 				  PKT_RX_IP_CKSUM_GOOD) |
1758 			TRANSPOSE(flags,
1759 				  MLX5_CQE_RX_L4_HDR_VALID,
1760 				  PKT_RX_L4_CKSUM_GOOD);
1761 	return ol_flags;
1762 }
1763 
1764 /**
1765  * DPDK callback for RX.
1766  *
1767  * @param dpdk_rxq
1768  *   Generic pointer to RX queue structure.
1769  * @param[out] pkts
1770  *   Array to store received packets.
1771  * @param pkts_n
1772  *   Maximum number of packets in array.
1773  *
1774  * @return
1775  *   Number of packets successfully received (<= pkts_n).
1776  */
1777 uint16_t
1778 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1779 {
1780 	struct rxq *rxq = dpdk_rxq;
1781 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
1782 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
1783 	const unsigned int sges_n = rxq->sges_n;
1784 	struct rte_mbuf *pkt = NULL;
1785 	struct rte_mbuf *seg = NULL;
1786 	volatile struct mlx5_cqe *cqe =
1787 		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1788 	unsigned int i = 0;
1789 	unsigned int rq_ci = rxq->rq_ci << sges_n;
1790 	int len = 0; /* keep its value across iterations. */
1791 
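	/*
	 * rq_ci is kept in units of single data segments: each packet slot
	 * spans 2^sges_n segments, so scattered packets can advance one
	 * segment at a time within a stride.
	 */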
1792 	while (pkts_n) {
1793 		unsigned int idx = rq_ci & wqe_cnt;
1794 		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
1795 		struct rte_mbuf *rep = (*rxq->elts)[idx];
1796 		uint32_t rss_hash_res = 0;
1797 
1798 		if (pkt)
1799 			NEXT(seg) = rep;
1800 		seg = rep;
1801 		rte_prefetch0(seg);
1802 		rte_prefetch0(cqe);
1803 		rte_prefetch0(wqe);
1804 		rep = rte_mbuf_raw_alloc(rxq->mp);
1805 		if (unlikely(rep == NULL)) {
1806 			++rxq->stats.rx_nombuf;
1807 			if (!pkt) {
1808 				/*
1809 				 * no buffers before we even started,
1810 				 * bail out silently.
1811 				 */
1812 				break;
1813 			}
1814 			while (pkt != seg) {
1815 				assert(pkt != (*rxq->elts)[idx]);
1816 				rep = NEXT(pkt);
1817 				NEXT(pkt) = NULL;
1818 				NB_SEGS(pkt) = 1;
1819 				rte_mbuf_raw_free(pkt);
1820 				pkt = rep;
1821 			}
1822 			break;
1823 		}
1824 		if (!pkt) {
1825 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1826 			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
1827 					       &rss_hash_res);
1828 			if (!len) {
1829 				rte_mbuf_raw_free(rep);
1830 				break;
1831 			}
1832 			if (unlikely(len == -1)) {
1833 				/* RX error, packet is likely too large. */
1834 				rte_mbuf_raw_free(rep);
1835 				++rxq->stats.idropped;
1836 				goto skip;
1837 			}
1838 			pkt = seg;
1839 			assert(len >= (rxq->crc_present << 2));
1840 			/* Update packet information. */
1841 			pkt->packet_type = rxq_cq_to_pkt_type(cqe);
1842 			pkt->ol_flags = 0;
1843 			if (rss_hash_res && rxq->rss_hash) {
1844 				pkt->hash.rss = rss_hash_res;
1845 				pkt->ol_flags = PKT_RX_RSS_HASH;
1846 			}
1847 			if (rxq->mark &&
1848 			    MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
1849 				pkt->ol_flags |= PKT_RX_FDIR;
1850 				if (cqe->sop_drop_qpn !=
1851 				    htonl(MLX5_FLOW_MARK_DEFAULT)) {
1852 					uint32_t mark = cqe->sop_drop_qpn;
1853 
1854 					pkt->ol_flags |= PKT_RX_FDIR_ID;
1855 					pkt->hash.fdir.hi =
1856 						mlx5_flow_mark_get(mark);
1857 				}
1858 			}
1859 			if (rxq->csum | rxq->csum_l2tun)
1860 				pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe);
1861 			if (rxq->vlan_strip &&
1862 			    (cqe->hdr_type_etc &
1863 			     htons(MLX5_CQE_VLAN_STRIPPED))) {
1864 				pkt->ol_flags |= PKT_RX_VLAN_PKT |
1865 					PKT_RX_VLAN_STRIPPED;
1866 				pkt->vlan_tci = ntohs(cqe->vlan_info);
1867 			}
1868 			if (rxq->crc_present)
1869 				len -= ETHER_CRC_LEN;
1870 			PKT_LEN(pkt) = len;
1871 		}
1872 		DATA_LEN(rep) = DATA_LEN(seg);
1873 		PKT_LEN(rep) = PKT_LEN(seg);
1874 		SET_DATA_OFF(rep, DATA_OFF(seg));
1875 		PORT(rep) = PORT(seg);
1876 		(*rxq->elts)[idx] = rep;
1877 		/*
1878 		 * Fill NIC descriptor with the new buffer.  The lkey and size
1879 		 * of the buffers are already known, only the buffer address
1880 		 * changes.
1881 		 */
1882 		wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
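		/*
		 * More data than the current segment can hold: account for
		 * it, chain another segment on the next iteration and keep
		 * consuming descriptors for this packet.
		 */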
1883 		if (len > DATA_LEN(seg)) {
1884 			len -= DATA_LEN(seg);
1885 			++NB_SEGS(pkt);
1886 			++rq_ci;
1887 			continue;
1888 		}
1889 		DATA_LEN(seg) = len;
1890 #ifdef MLX5_PMD_SOFT_COUNTERS
1891 		/* Increment bytes counter. */
1892 		rxq->stats.ibytes += PKT_LEN(pkt);
1893 #endif
1894 		/* Return packet. */
1895 		*(pkts++) = pkt;
1896 		pkt = NULL;
1897 		--pkts_n;
1898 		++i;
1899 skip:
1900 		/* Align consumer index to the next stride. */
1901 		rq_ci >>= sges_n;
1902 		++rq_ci;
1903 		rq_ci <<= sges_n;
1904 	}
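	/* Nothing received and no descriptor consumed: skip the doorbells. */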
1905 	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
1906 		return 0;
1907 	/* Update the consumer index. */
1908 	rxq->rq_ci = rq_ci >> sges_n;
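	/*
	 * Write barriers make the CQE and WQE updates above visible before
	 * each doorbell record is updated: first report the consumed CQEs,
	 * then post the refilled buffers.
	 */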
1909 	rte_wmb();
1910 	*rxq->cq_db = htonl(rxq->cq_ci);
1911 	rte_wmb();
1912 	*rxq->rq_db = htonl(rxq->rq_ci);
1913 #ifdef MLX5_PMD_SOFT_COUNTERS
1914 	/* Increment packets counter. */
1915 	rxq->stats.ipackets += i;
1916 #endif
1917 	return i;
1918 }
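
/*
 * Illustrative sketch (not part of the PMD): applications reach this burst
 * routine indirectly through rte_eth_rx_burst() once the queue is configured
 * and started, e.g. (port_id and queue_id are assumed to be valid):
 *
 *	struct rte_mbuf *bufs[32];
 *	uint16_t nb = rte_eth_rx_burst(port_id, queue_id, bufs, 32);
 *	uint16_t j;
 *
 *	for (j = 0; j != nb; ++j)
 *		rte_pktmbuf_free(bufs[j]);
 */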
1919 
1920 /**
1921  * Dummy DPDK callback for TX.
1922  *
1923  * This function is used to temporarily replace the real callback during
1924  * unsafe control operations on the queue, or in case of error.
1925  *
1926  * @param dpdk_txq
1927  *   Generic pointer to TX queue structure.
1928  * @param[in] pkts
1929  *   Packets to transmit.
1930  * @param pkts_n
1931  *   Number of packets in array.
1932  *
1933  * @return
1934  *   Number of packets successfully transmitted (<= pkts_n).
1935  */
1936 uint16_t
1937 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1938 {
1939 	(void)dpdk_txq;
1940 	(void)pkts;
1941 	(void)pkts_n;
1942 	return 0;
1943 }
1944 
1945 /**
1946  * Dummy DPDK callback for RX.
1947  *
1948  * This function is used to temporarily replace the real callback during
1949  * unsafe control operations on the queue, or in case of error.
1950  *
1951  * @param dpdk_rxq
1952  *   Generic pointer to RX queue structure.
1953  * @param[out] pkts
1954  *   Array to store received packets.
1955  * @param pkts_n
1956  *   Maximum number of packets in array.
1957  *
1958  * @return
1959  *   Number of packets successfully received (<= pkts_n).
1960  */
1961 uint16_t
1962 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1963 {
1964 	(void)dpdk_rxq;
1965 	(void)pkts;
1966 	(void)pkts_n;
1967 	return 0;
1968 }
1969 
1970 /*
1971  * Vectorized Rx/Tx routines are not compiled in when the required vector
1972  * instructions are not supported on the target architecture. The following
1973  * null stubs are needed for linkage when the vectorized implementations
1974  * (e.g. mlx5_rxtx_vec_sse.c for x86) are not built.
1975  */
1976 
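/*
 * When a vectorized datapath is compiled in, its strong definitions override
 * these weak symbols at link time; otherwise the check functions below return
 * -ENOTSUP and the scalar burst routines above remain in use.
 */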
1977 uint16_t __attribute__((weak))
1978 mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1979 {
1980 	(void)dpdk_txq;
1981 	(void)pkts;
1982 	(void)pkts_n;
1983 	return 0;
1984 }
1985 
1986 uint16_t __attribute__((weak))
1987 mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1988 {
1989 	(void)dpdk_txq;
1990 	(void)pkts;
1991 	(void)pkts_n;
1992 	return 0;
1993 }
1994 
1995 uint16_t __attribute__((weak))
1996 mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1997 {
1998 	(void)dpdk_rxq;
1999 	(void)pkts;
2000 	(void)pkts_n;
2001 	return 0;
2002 }
2003 
2004 int __attribute__((weak))
2005 priv_check_raw_vec_tx_support(struct priv *priv)
2006 {
2007 	(void)priv;
2008 	return -ENOTSUP;
2009 }
2010 
2011 int __attribute__((weak))
2012 priv_check_vec_tx_support(struct priv *priv)
2013 {
2014 	(void)priv;
2015 	return -ENOTSUP;
2016 }
2017 
2018 int __attribute__((weak))
2019 rxq_check_vec_support(struct rxq *rxq)
2020 {
2021 	(void)rxq;
2022 	return -ENOTSUP;
2023 }
2024 
2025 int __attribute__((weak))
2026 priv_check_vec_rx_support(struct priv *priv)
2027 {
2028 	(void)priv;
2029 	return -ENOTSUP;
2030 }
2031 
2032 void __attribute__((weak))
2033 priv_prep_vec_rx_function(struct priv *priv)
2034 {
2035 	(void)priv;
2036 }
2037