xref: /dpdk/drivers/net/mlx5/mlx5_rxtx.c (revision 5dba3b9c4c131b88a78bcecfef39db23ebc47873)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <assert.h>
35 #include <stdint.h>
36 #include <string.h>
37 #include <stdlib.h>
38 
39 /* Verbs header. */
40 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
41 #ifdef PEDANTIC
42 #pragma GCC diagnostic ignored "-Wpedantic"
43 #endif
44 #include <infiniband/verbs.h>
45 #include <infiniband/mlx5dv.h>
46 #ifdef PEDANTIC
47 #pragma GCC diagnostic error "-Wpedantic"
48 #endif
49 
50 #include <rte_mbuf.h>
51 #include <rte_mempool.h>
52 #include <rte_prefetch.h>
53 #include <rte_common.h>
54 #include <rte_branch_prediction.h>
55 #include <rte_ether.h>
56 
57 #include "mlx5.h"
58 #include "mlx5_utils.h"
59 #include "mlx5_rxtx.h"
60 #include "mlx5_autoconf.h"
61 #include "mlx5_defs.h"
62 #include "mlx5_prm.h"
63 
64 static __rte_always_inline uint32_t
65 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe);
66 
67 static __rte_always_inline int
68 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
69 		 uint16_t cqe_cnt, uint32_t *rss_hash);
70 
71 static __rte_always_inline uint32_t
72 rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
73 
74 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
75 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
76 };
77 
78 /**
79  * Build a table to translate Rx completion flags to packet type.
80  *
81  * @note: fix mlx5_dev_supported_ptypes_get() if anything changes here.
82  */
83 void
84 mlx5_set_ptype_table(void)
85 {
86 	unsigned int i;
87 	uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;
88 
89 	/* Last entry must not be overwritten, reserved for errored packet. */
90 	for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
91 		(*p)[i] = RTE_PTYPE_UNKNOWN;
92 	/*
93 	 * The index to the array should have:
94 	 * bit[1:0] = l3_hdr_type
95 	 * bit[4:2] = l4_hdr_type
96 	 * bit[5] = ip_frag
97 	 * bit[6] = tunneled
98 	 * bit[7] = outer_l3_type
99 	 */
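	/*
	 * Worked example, derived from the assignments below: a plain
	 * IPv4/TCP packet has l3_hdr_type = 2 and l4_hdr_type = 1, giving
	 * index 0x06, which maps to RTE_PTYPE_L2_ETHER |
	 * RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP.
	 */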
100 	/* L2 */
101 	(*p)[0x00] = RTE_PTYPE_L2_ETHER;
102 	/* L3 */
103 	(*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
104 		     RTE_PTYPE_L4_NONFRAG;
105 	(*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
106 		     RTE_PTYPE_L4_NONFRAG;
107 	/* Fragmented */
108 	(*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
109 		     RTE_PTYPE_L4_FRAG;
110 	(*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
111 		     RTE_PTYPE_L4_FRAG;
112 	/* TCP */
113 	(*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
114 		     RTE_PTYPE_L4_TCP;
115 	(*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
116 		     RTE_PTYPE_L4_TCP;
117 	/* UDP */
118 	(*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
119 		     RTE_PTYPE_L4_UDP;
120 	(*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
121 		     RTE_PTYPE_L4_UDP;
122 	/* Repeat with outer_l3_type being set. Just in case. */
123 	(*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
124 		     RTE_PTYPE_L4_NONFRAG;
125 	(*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
126 		     RTE_PTYPE_L4_NONFRAG;
127 	(*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
128 		     RTE_PTYPE_L4_FRAG;
129 	(*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
130 		     RTE_PTYPE_L4_FRAG;
131 	(*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
132 		     RTE_PTYPE_L4_TCP;
133 	(*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
134 		     RTE_PTYPE_L4_TCP;
135 	(*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
136 		     RTE_PTYPE_L4_UDP;
137 	(*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
138 		     RTE_PTYPE_L4_UDP;
139 	/* Tunneled - L3 */
140 	(*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
141 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
142 		     RTE_PTYPE_INNER_L4_NONFRAG;
143 	(*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
144 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
145 		     RTE_PTYPE_INNER_L4_NONFRAG;
146 	(*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
147 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
148 		     RTE_PTYPE_INNER_L4_NONFRAG;
149 	(*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
150 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
151 		     RTE_PTYPE_INNER_L4_NONFRAG;
152 	/* Tunneled - Fragmented */
153 	(*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
154 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
155 		     RTE_PTYPE_INNER_L4_FRAG;
156 	(*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
157 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
158 		     RTE_PTYPE_INNER_L4_FRAG;
159 	(*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
160 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
161 		     RTE_PTYPE_INNER_L4_FRAG;
162 	(*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
163 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
164 		     RTE_PTYPE_INNER_L4_FRAG;
165 	/* Tunneled - TCP */
166 	(*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
167 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
168 		     RTE_PTYPE_INNER_L4_TCP;
169 	(*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
170 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
171 		     RTE_PTYPE_INNER_L4_TCP;
172 	(*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
173 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
174 		     RTE_PTYPE_INNER_L4_TCP;
175 	(*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
176 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
177 		     RTE_PTYPE_INNER_L4_TCP;
178 	/* Tunneled - UDP */
179 	(*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
180 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
181 		     RTE_PTYPE_INNER_L4_UDP;
182 	(*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
183 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
184 		     RTE_PTYPE_INNER_L4_UDP;
185 	(*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
186 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
187 		     RTE_PTYPE_INNER_L4_UDP;
188 	(*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
189 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
190 		     RTE_PTYPE_INNER_L4_UDP;
191 }
192 
193 /**
194  * Return the size of the tailroom of the WQ.
195  *
196  * @param txq
197  *   Pointer to TX queue structure.
198  * @param addr
199  *   Pointer to tail of WQ.
200  *
201  * @return
202  *   Size of tailroom.
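 *
 *   For example (illustrative numbers), with wqe_n == 6, i.e. 64 WQEs of
 *   MLX5_WQE_SIZE bytes each, and addr pointing at the start of WQE index
 *   60, the tailroom is 4 * MLX5_WQE_SIZE bytes.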
203  */
204 static inline size_t
205 tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
206 {
207 	size_t tailroom;
208 	tailroom = (uintptr_t)(txq->wqes) +
209 		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
210 		   (uintptr_t)addr;
211 	return tailroom;
212 }
213 
214 /**
215  * Copy data to the tailroom of a circular queue.
216  *
217  * @param dst
218  *   Pointer to destination.
219  * @param src
220  *   Pointer to source.
221  * @param n
222  *   Number of bytes to copy.
223  * @param base
224  *   Pointer to head of queue.
225  * @param tailroom
226  *   Size of tailroom from dst.
227  *
228  * @return
229  *   Pointer after copied data.
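 *
 *   For example (illustrative values), with tailroom == 16 and n == 24,
 *   the first 16 bytes are copied to dst, the remaining 8 wrap around to
 *   base, and the returned pointer is base + 8.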
230  */
231 static inline void *
232 mlx5_copy_to_wq(void *dst, const void *src, size_t n,
233 		void *base, size_t tailroom)
234 {
235 	void *ret;
236 
237 	if (n > tailroom) {
238 		rte_memcpy(dst, src, tailroom);
239 		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
240 			   n - tailroom);
241 		ret = (uint8_t *)base + n - tailroom;
242 	} else {
243 		rte_memcpy(dst, src, n);
244 		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
245 	}
246 	return ret;
247 }
248 
249 /**
250  * DPDK callback to check the status of a tx descriptor.
251  *
252  * @param tx_queue
253  *   The tx queue.
254  * @param[in] offset
255  *   The index of the descriptor in the ring.
256  *
257  * @return
258  *   The status of the tx descriptor.
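 *
 * Applications normally reach this callback through
 * rte_eth_tx_descriptor_status(); a hypothetical usage sketch:
 *
 *	if (rte_eth_tx_descriptor_status(port_id, queue_id, burst_size) ==
 *	    RTE_ETH_TX_DESC_FULL)
 *		rte_pause();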
259  */
260 int
261 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
262 {
263 	struct mlx5_txq_data *txq = tx_queue;
264 	uint16_t used;
265 
266 	mlx5_tx_complete(txq);
267 	used = txq->elts_head - txq->elts_tail;
268 	if (offset < used)
269 		return RTE_ETH_TX_DESC_FULL;
270 	return RTE_ETH_TX_DESC_DONE;
271 }
272 
273 /**
274  * DPDK callback to check the status of a rx descriptor.
275  *
276  * @param rx_queue
277  *   The rx queue.
278  * @param[in] offset
279  *   The index of the descriptor in the ring.
280  *
281  * @return
282  *   The status of the rx descriptor.
283  */
284 int
285 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
286 {
287 	struct mlx5_rxq_data *rxq = rx_queue;
288 	struct rxq_zip *zip = &rxq->zip;
289 	volatile struct mlx5_cqe *cqe;
290 	const unsigned int cqe_n = (1 << rxq->cqe_n);
291 	const unsigned int cqe_cnt = cqe_n - 1;
292 	unsigned int cq_ci;
293 	unsigned int used;
294 
295 	/* If we are processing a compressed CQE. */
296 	if (zip->ai) {
297 		used = zip->cqe_cnt - zip->ca;
298 		cq_ci = zip->cq_ci;
299 	} else {
300 		used = 0;
301 		cq_ci = rxq->cq_ci;
302 	}
303 	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
304 	while (check_cqe(cqe, cqe_n, cq_ci) == 0) {
305 		int8_t op_own;
306 		unsigned int n;
307 
308 		op_own = cqe->op_own;
309 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
310 			n = rte_be_to_cpu_32(cqe->byte_cnt);
311 		else
312 			n = 1;
313 		cq_ci += n;
314 		used += n;
315 		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
316 	}
317 	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
318 	if (offset < used)
319 		return RTE_ETH_RX_DESC_DONE;
320 	return RTE_ETH_RX_DESC_AVAIL;
321 }
322 
323 /**
324  * DPDK callback for TX.
325  *
326  * @param dpdk_txq
327  *   Generic pointer to TX queue structure.
328  * @param[in] pkts
329  *   Packets to transmit.
330  * @param pkts_n
331  *   Number of packets in array.
332  *
333  * @return
334  *   Number of packets successfully transmitted (<= pkts_n).
335  */
336 uint16_t
337 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
338 {
339 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
340 	uint16_t elts_head = txq->elts_head;
341 	const uint16_t elts_n = 1 << txq->elts_n;
342 	const uint16_t elts_m = elts_n - 1;
343 	unsigned int i = 0;
344 	unsigned int j = 0;
345 	unsigned int k = 0;
346 	uint16_t max_elts;
347 	uint16_t max_wqe;
348 	unsigned int comp;
349 	volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
350 	unsigned int segs_n = 0;
351 	const unsigned int max_inline = txq->max_inline;
352 
353 	if (unlikely(!pkts_n))
354 		return 0;
355 	/* Prefetch first packet cacheline. */
356 	rte_prefetch0(*pkts);
357 	/* Start processing. */
358 	mlx5_tx_complete(txq);
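	/*
	 * Ring indexes are free-running counters; their wrap-around
	 * difference gives the number of entries currently in use.
	 */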
359 	max_elts = (elts_n - (elts_head - txq->elts_tail));
360 	/* A CQE slot must always be available. */
361 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
362 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
363 	if (unlikely(!max_wqe))
364 		return 0;
365 	do {
366 		struct rte_mbuf *buf = NULL;
367 		uint8_t *raw;
368 		volatile struct mlx5_wqe_v *wqe = NULL;
369 		volatile rte_v128u32_t *dseg = NULL;
370 		uint32_t length;
371 		unsigned int ds = 0;
372 		unsigned int sg = 0; /* counter of additional segs attached. */
373 		uintptr_t addr;
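		/*
		 * The Ethernet segment always inlines at least two bytes of
		 * packet headers (kept in ehdr) plus one DWORD copied right
		 * after it, hence the initial value below.
		 */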
374 		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
375 		uint16_t tso_header_sz = 0;
376 		uint16_t ehdr;
377 		uint8_t cs_flags;
378 		uint64_t tso = 0;
379 		uint16_t tso_segsz = 0;
380 #ifdef MLX5_PMD_SOFT_COUNTERS
381 		uint32_t total_length = 0;
382 #endif
383 
384 		/* first_seg */
385 		buf = *pkts;
386 		segs_n = buf->nb_segs;
387 		/*
388 		 * Make sure there is enough room to store this packet and
389 		 * that one ring entry remains unused.
390 		 */
391 		assert(segs_n);
392 		if (max_elts < segs_n)
393 			break;
394 		max_elts -= segs_n;
395 		sg = --segs_n;
396 		if (unlikely(--max_wqe == 0))
397 			break;
398 		wqe = (volatile struct mlx5_wqe_v *)
399 			tx_mlx5_wqe(txq, txq->wqe_ci);
400 		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
401 		if (pkts_n - i > 1)
402 			rte_prefetch0(*(pkts + 1));
403 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
404 		length = DATA_LEN(buf);
405 		ehdr = (((uint8_t *)addr)[1] << 8) |
406 		       ((uint8_t *)addr)[0];
407 #ifdef MLX5_PMD_SOFT_COUNTERS
408 		total_length = length;
409 #endif
410 		if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
411 			txq->stats.oerrors++;
412 			break;
413 		}
414 		/* Update element. */
415 		(*txq->elts)[elts_head & elts_m] = buf;
416 		/* Prefetch next buffer data. */
417 		if (pkts_n - i > 1)
418 			rte_prefetch0(
419 			    rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
420 		cs_flags = txq_ol_cksum_to_cs(txq, buf);
421 		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
422 		/* Insert the VLAN tag before the Ethernet type if requested. */
423 		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
424 			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
425 							 buf->vlan_tci);
426 			unsigned int len = 2 * ETHER_ADDR_LEN - 2;
427 
428 			addr += 2;
429 			length -= 2;
430 			/* Copy destination and source MAC addresses. */
431 			memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
432 			/* Copy VLAN. */
433 			memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
434 			/* Copy missing two bytes to end the DSeg. */
435 			memcpy((uint8_t *)raw + len + sizeof(vlan),
436 			       ((uint8_t *)addr) + len, 2);
437 			addr += len + 2;
438 			length -= (len + 2);
439 		} else {
440 			memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
441 			       MLX5_WQE_DWORD_SIZE);
442 			length -= pkt_inline_sz;
443 			addr += pkt_inline_sz;
444 		}
445 		raw += MLX5_WQE_DWORD_SIZE;
446 		tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
447 		if (tso) {
448 			uintptr_t end =
449 				(uintptr_t)(((uintptr_t)txq->wqes) +
450 					    (1 << txq->wqe_n) * MLX5_WQE_SIZE);
451 			unsigned int copy_b;
452 			uint8_t vlan_sz =
453 				(buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
454 			const uint64_t is_tunneled =
455 				buf->ol_flags & (PKT_TX_TUNNEL_GRE |
456 						 PKT_TX_TUNNEL_VXLAN);
457 
458 			tso_header_sz = buf->l2_len + vlan_sz +
459 					buf->l3_len + buf->l4_len;
460 			tso_segsz = buf->tso_segsz;
461 			if (unlikely(tso_segsz == 0)) {
462 				txq->stats.oerrors++;
463 				break;
464 			}
465 			if (is_tunneled	&& txq->tunnel_en) {
466 				tso_header_sz += buf->outer_l2_len +
467 						 buf->outer_l3_len;
468 				cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM;
469 			} else {
470 				cs_flags |= MLX5_ETH_WQE_L4_CSUM;
471 			}
472 			if (unlikely(tso_header_sz > MLX5_MAX_TSO_HEADER)) {
473 				txq->stats.oerrors++;
474 				break;
475 			}
476 			copy_b = tso_header_sz - pkt_inline_sz;
477 			/* First seg must contain all headers. */
478 			assert(copy_b <= length);
479 			if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
480 				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
481 
482 				if (unlikely(max_wqe < n))
483 					break;
484 				max_wqe -= n;
485 				rte_memcpy((void *)raw, (void *)addr, copy_b);
486 				addr += copy_b;
487 				length -= copy_b;
488 				/* Include padding for TSO header. */
489 				copy_b = MLX5_WQE_DS(copy_b) *
490 					 MLX5_WQE_DWORD_SIZE;
491 				pkt_inline_sz += copy_b;
492 				raw += copy_b;
493 			} else {
494 				/* NOP WQE. */
495 				wqe->ctrl = (rte_v128u32_t){
496 					rte_cpu_to_be_32(txq->wqe_ci << 8),
497 					rte_cpu_to_be_32(txq->qp_num_8s | 1),
498 					0,
499 					0,
500 				};
501 				ds = 1;
502 #ifdef MLX5_PMD_SOFT_COUNTERS
503 				total_length = 0;
504 #endif
505 				k++;
506 				goto next_wqe;
507 			}
508 		}
509 		/* Inline if enough room. */
510 		if (max_inline || tso) {
511 			uint32_t inl = 0;
512 			uintptr_t end = (uintptr_t)
513 				(((uintptr_t)txq->wqes) +
514 				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
515 			unsigned int inline_room = max_inline *
516 						   RTE_CACHE_LINE_SIZE -
517 						   (pkt_inline_sz - 2) -
518 						   !!tso * sizeof(inl);
519 			uintptr_t addr_end;
520 			unsigned int copy_b;
521 
522 pkt_inline:
523 			addr_end = RTE_ALIGN_FLOOR(addr + inline_room,
524 						   RTE_CACHE_LINE_SIZE);
525 			copy_b = (addr_end > addr) ?
526 				 RTE_MIN((addr_end - addr), length) : 0;
527 			if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
528 				/*
529 				 * One Dseg remains in the current WQE.  To
530 				 * keep the computation positive, it is
531 				 * removed after the bytes to Dseg conversion.
532 				 */
533 				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
534 
535 				if (unlikely(max_wqe < n))
536 					break;
537 				max_wqe -= n;
538 				if (tso && !inl) {
539 					inl = rte_cpu_to_be_32(copy_b |
540 							       MLX5_INLINE_SEG);
541 					rte_memcpy((void *)raw,
542 						   (void *)&inl, sizeof(inl));
543 					raw += sizeof(inl);
544 					pkt_inline_sz += sizeof(inl);
545 				}
546 				rte_memcpy((void *)raw, (void *)addr, copy_b);
547 				addr += copy_b;
548 				length -= copy_b;
549 				pkt_inline_sz += copy_b;
550 			}
551 			/*
552 			 * 2 DWORDs consumed by the WQE header + ETH segment +
553 			 * the size of the inline part of the packet.
554 			 */
555 			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
556 			if (length > 0) {
557 				if (ds % (MLX5_WQE_SIZE /
558 					  MLX5_WQE_DWORD_SIZE) == 0) {
559 					if (unlikely(--max_wqe == 0))
560 						break;
561 					dseg = (volatile rte_v128u32_t *)
562 					       tx_mlx5_wqe(txq, txq->wqe_ci +
563 							   ds / 4);
564 				} else {
565 					dseg = (volatile rte_v128u32_t *)
566 						((uintptr_t)wqe +
567 						 (ds * MLX5_WQE_DWORD_SIZE));
568 				}
569 				goto use_dseg;
570 			} else if (!segs_n) {
571 				goto next_pkt;
572 			} else {
573 				raw += copy_b;
574 				inline_room -= copy_b;
575 				--segs_n;
576 				buf = buf->next;
577 				assert(buf);
578 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
579 				length = DATA_LEN(buf);
580 #ifdef MLX5_PMD_SOFT_COUNTERS
581 				total_length += length;
582 #endif
583 				(*txq->elts)[++elts_head & elts_m] = buf;
584 				goto pkt_inline;
585 			}
586 		} else {
587 			/*
588 			 * No inline has been done in the packet, only the
589 			 * Ethernet header has been stored.
590 			 */
591 			dseg = (volatile rte_v128u32_t *)
592 				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
593 			ds = 3;
594 use_dseg:
595 			/* Add the remaining packet as a simple ds. */
596 			addr = rte_cpu_to_be_64(addr);
597 			*dseg = (rte_v128u32_t){
598 				rte_cpu_to_be_32(length),
599 				mlx5_tx_mb2mr(txq, buf),
600 				addr,
601 				addr >> 32,
602 			};
603 			++ds;
604 			if (!segs_n)
605 				goto next_pkt;
606 		}
607 next_seg:
608 		assert(buf);
609 		assert(ds);
610 		assert(wqe);
611 		/*
612 		 * Spill onto the next WQE when the current one does not have
613 		 * enough room left. The WQE size must be a multiple of the
614 		 * data segment size.
615 		 */
616 		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
617 		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
618 			if (unlikely(--max_wqe == 0))
619 				break;
620 			dseg = (volatile rte_v128u32_t *)
621 			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
622 			rte_prefetch0(tx_mlx5_wqe(txq,
623 						  txq->wqe_ci + ds / 4 + 1));
624 		} else {
625 			++dseg;
626 		}
627 		++ds;
628 		buf = buf->next;
629 		assert(buf);
630 		length = DATA_LEN(buf);
631 #ifdef MLX5_PMD_SOFT_COUNTERS
632 		total_length += length;
633 #endif
634 		/* Store segment information. */
635 		addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
636 		*dseg = (rte_v128u32_t){
637 			rte_cpu_to_be_32(length),
638 			mlx5_tx_mb2mr(txq, buf),
639 			addr,
640 			addr >> 32,
641 		};
642 		(*txq->elts)[++elts_head & elts_m] = buf;
643 		if (--segs_n)
644 			goto next_seg;
645 next_pkt:
646 		if (ds > MLX5_DSEG_MAX) {
647 			txq->stats.oerrors++;
648 			break;
649 		}
650 		++elts_head;
651 		++pkts;
652 		++i;
653 		j += sg;
654 		/* Initialize known and common part of the WQE structure. */
655 		if (tso) {
656 			wqe->ctrl = (rte_v128u32_t){
657 				rte_cpu_to_be_32((txq->wqe_ci << 8) |
658 						 MLX5_OPCODE_TSO),
659 				rte_cpu_to_be_32(txq->qp_num_8s | ds),
660 				0,
661 				0,
662 			};
663 			wqe->eseg = (rte_v128u32_t){
664 				0,
665 				cs_flags | (rte_cpu_to_be_16(tso_segsz) << 16),
666 				0,
667 				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
668 			};
669 		} else {
670 			wqe->ctrl = (rte_v128u32_t){
671 				rte_cpu_to_be_32((txq->wqe_ci << 8) |
672 						 MLX5_OPCODE_SEND),
673 				rte_cpu_to_be_32(txq->qp_num_8s | ds),
674 				0,
675 				0,
676 			};
677 			wqe->eseg = (rte_v128u32_t){
678 				0,
679 				cs_flags,
680 				0,
681 				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
682 			};
683 		}
684 next_wqe:
685 		txq->wqe_ci += (ds + 3) / 4;
686 		/* Save the last successful WQE for completion request */
687 		last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
688 #ifdef MLX5_PMD_SOFT_COUNTERS
689 		/* Increment sent bytes counter. */
690 		txq->stats.obytes += total_length;
691 #endif
692 	} while (i < pkts_n);
693 	/* Take a shortcut if nothing must be sent. */
694 	if (unlikely((i + k) == 0))
695 		return 0;
696 	txq->elts_head += (i + j);
697 	/* Check whether completion threshold has been reached. */
698 	comp = txq->elts_comp + i + j + k;
699 	if (comp >= MLX5_TX_COMP_THRESH) {
700 		/* Request completion on last WQE. */
701 		last_wqe->ctrl2 = rte_cpu_to_be_32(8);
702 		/* Save elts_head in unused "immediate" field of WQE. */
703 		last_wqe->ctrl3 = txq->elts_head;
704 		txq->elts_comp = 0;
705 #ifndef NDEBUG
706 		++txq->cq_pi;
707 #endif
708 	} else {
709 		txq->elts_comp = comp;
710 	}
711 #ifdef MLX5_PMD_SOFT_COUNTERS
712 	/* Increment sent packets counter. */
713 	txq->stats.opackets += i;
714 #endif
715 	/* Ring QP doorbell. */
716 	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
717 	return i;
718 }
719 
720 /**
721  * Open an MPW session.
722  *
723  * @param txq
724  *   Pointer to TX queue structure.
725  * @param mpw
726  *   Pointer to MPW session structure.
727  * @param length
728  *   Packet length.
729  */
730 static inline void
731 mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
732 {
733 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
734 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
735 		(volatile struct mlx5_wqe_data_seg (*)[])
736 		tx_mlx5_wqe(txq, idx + 1);
737 
738 	mpw->state = MLX5_MPW_STATE_OPENED;
739 	mpw->pkts_n = 0;
740 	mpw->len = length;
741 	mpw->total_len = 0;
742 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
743 	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
744 	mpw->wqe->eseg.inline_hdr_sz = 0;
745 	mpw->wqe->eseg.rsvd0 = 0;
746 	mpw->wqe->eseg.rsvd1 = 0;
747 	mpw->wqe->eseg.rsvd2 = 0;
748 	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
749 					     (txq->wqe_ci << 8) |
750 					     MLX5_OPCODE_TSO);
751 	mpw->wqe->ctrl[2] = 0;
752 	mpw->wqe->ctrl[3] = 0;
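	/*
	 * The first two data segments live in the title WQEBB, right after
	 * the control and Ethernet segments; the remaining three spill into
	 * the next WQEBB, which is why mlx5_mpw_close() advances wqe_ci by
	 * one or two depending on how many packets were added.
	 */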
753 	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
754 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
755 	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
756 		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
757 	mpw->data.dseg[2] = &(*dseg)[0];
758 	mpw->data.dseg[3] = &(*dseg)[1];
759 	mpw->data.dseg[4] = &(*dseg)[2];
760 }
761 
762 /**
763  * Close an MPW session.
764  *
765  * @param txq
766  *   Pointer to TX queue structure.
767  * @param mpw
768  *   Pointer to MPW session structure.
769  */
770 static inline void
771 mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
772 {
773 	unsigned int num = mpw->pkts_n;
774 
775 	/*
776 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
777 	 * count as 2.
778 	 */
779 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
780 	mpw->state = MLX5_MPW_STATE_CLOSED;
781 	if (num < 3)
782 		++txq->wqe_ci;
783 	else
784 		txq->wqe_ci += 2;
785 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
786 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
787 }
788 
789 /**
790  * DPDK callback for TX with MPW support.
791  *
792  * @param dpdk_txq
793  *   Generic pointer to TX queue structure.
794  * @param[in] pkts
795  *   Packets to transmit.
796  * @param pkts_n
797  *   Number of packets in array.
798  *
799  * @return
800  *   Number of packets successfully transmitted (<= pkts_n).
801  */
802 uint16_t
803 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
804 {
805 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
806 	uint16_t elts_head = txq->elts_head;
807 	const uint16_t elts_n = 1 << txq->elts_n;
808 	const uint16_t elts_m = elts_n - 1;
809 	unsigned int i = 0;
810 	unsigned int j = 0;
811 	uint16_t max_elts;
812 	uint16_t max_wqe;
813 	unsigned int comp;
814 	struct mlx5_mpw mpw = {
815 		.state = MLX5_MPW_STATE_CLOSED,
816 	};
817 
818 	if (unlikely(!pkts_n))
819 		return 0;
820 	/* Prefetch first packet cacheline. */
821 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
822 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
823 	/* Start processing. */
824 	mlx5_tx_complete(txq);
825 	max_elts = (elts_n - (elts_head - txq->elts_tail));
826 	/* A CQE slot must always be available. */
827 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
828 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
829 	if (unlikely(!max_wqe))
830 		return 0;
831 	do {
832 		struct rte_mbuf *buf = *(pkts++);
833 		uint32_t length;
834 		unsigned int segs_n = buf->nb_segs;
835 		uint32_t cs_flags;
836 
837 		/*
838 		 * Make sure there is enough room to store this packet and
839 		 * that one ring entry remains unused.
840 		 */
841 		assert(segs_n);
842 		if (max_elts < segs_n)
843 			break;
844 		/* Do not bother with large packets that MPW cannot handle. */
845 		if (segs_n > MLX5_MPW_DSEG_MAX) {
846 			txq->stats.oerrors++;
847 			break;
848 		}
849 		max_elts -= segs_n;
850 		--pkts_n;
851 		cs_flags = txq_ol_cksum_to_cs(txq, buf);
852 		/* Retrieve packet information. */
853 		length = PKT_LEN(buf);
854 		assert(length);
855 		/* Start new session if packet differs. */
856 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
857 		    ((mpw.len != length) ||
858 		     (segs_n != 1) ||
859 		     (mpw.wqe->eseg.cs_flags != cs_flags)))
860 			mlx5_mpw_close(txq, &mpw);
861 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
862 			/*
863 			 * A Multi-Packet WQE consumes at most two WQEs.
864 			 * mlx5_mpw_new() expects to be able to use such
865 			 * resources.
866 			 */
867 			if (unlikely(max_wqe < 2))
868 				break;
869 			max_wqe -= 2;
870 			mlx5_mpw_new(txq, &mpw, length);
871 			mpw.wqe->eseg.cs_flags = cs_flags;
872 		}
873 		/* Multi-segment packets must be alone in their MPW. */
874 		assert((segs_n == 1) || (mpw.pkts_n == 0));
875 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
876 		length = 0;
877 #endif
878 		do {
879 			volatile struct mlx5_wqe_data_seg *dseg;
880 			uintptr_t addr;
881 
882 			assert(buf);
883 			(*txq->elts)[elts_head++ & elts_m] = buf;
884 			dseg = mpw.data.dseg[mpw.pkts_n];
885 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
886 			*dseg = (struct mlx5_wqe_data_seg){
887 				.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
888 				.lkey = mlx5_tx_mb2mr(txq, buf),
889 				.addr = rte_cpu_to_be_64(addr),
890 			};
891 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
892 			length += DATA_LEN(buf);
893 #endif
894 			buf = buf->next;
895 			++mpw.pkts_n;
896 			++j;
897 		} while (--segs_n);
898 		assert(length == mpw.len);
899 		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
900 			mlx5_mpw_close(txq, &mpw);
901 #ifdef MLX5_PMD_SOFT_COUNTERS
902 		/* Increment sent bytes counter. */
903 		txq->stats.obytes += length;
904 #endif
905 		++i;
906 	} while (pkts_n);
907 	/* Take a shortcut if nothing must be sent. */
908 	if (unlikely(i == 0))
909 		return 0;
910 	/* Check whether completion threshold has been reached. */
911 	/* "j" includes both packets and segments. */
912 	comp = txq->elts_comp + j;
913 	if (comp >= MLX5_TX_COMP_THRESH) {
914 		volatile struct mlx5_wqe *wqe = mpw.wqe;
915 
916 		/* Request completion on last WQE. */
917 		wqe->ctrl[2] = rte_cpu_to_be_32(8);
918 		/* Save elts_head in unused "immediate" field of WQE. */
919 		wqe->ctrl[3] = elts_head;
920 		txq->elts_comp = 0;
921 #ifndef NDEBUG
922 		++txq->cq_pi;
923 #endif
924 	} else {
925 		txq->elts_comp = comp;
926 	}
927 #ifdef MLX5_PMD_SOFT_COUNTERS
928 	/* Increment sent packets counter. */
929 	txq->stats.opackets += i;
930 #endif
931 	/* Ring QP doorbell. */
932 	if (mpw.state == MLX5_MPW_STATE_OPENED)
933 		mlx5_mpw_close(txq, &mpw);
934 	mlx5_tx_dbrec(txq, mpw.wqe);
935 	txq->elts_head = elts_head;
936 	return i;
937 }
938 
939 /**
940  * Open an MPW inline session.
941  *
942  * @param txq
943  *   Pointer to TX queue structure.
944  * @param mpw
945  *   Pointer to MPW session structure.
946  * @param length
947  *   Packet length.
948  */
949 static inline void
950 mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
951 		    uint32_t length)
952 {
953 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
954 	struct mlx5_wqe_inl_small *inl;
955 
956 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
957 	mpw->pkts_n = 0;
958 	mpw->len = length;
959 	mpw->total_len = 0;
960 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
961 	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
962 					     (txq->wqe_ci << 8) |
963 					     MLX5_OPCODE_TSO);
964 	mpw->wqe->ctrl[2] = 0;
965 	mpw->wqe->ctrl[3] = 0;
966 	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
967 	mpw->wqe->eseg.inline_hdr_sz = 0;
968 	mpw->wqe->eseg.cs_flags = 0;
969 	mpw->wqe->eseg.rsvd0 = 0;
970 	mpw->wqe->eseg.rsvd1 = 0;
971 	mpw->wqe->eseg.rsvd2 = 0;
972 	inl = (struct mlx5_wqe_inl_small *)
973 		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
974 	mpw->data.raw = (uint8_t *)&inl->raw;
975 }
976 
977 /**
978  * Close a MPW inline session.
979  *
980  * @param txq
981  *   Pointer to TX queue structure.
982  * @param mpw
983  *   Pointer to MPW session structure.
984  */
985 static inline void
986 mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
987 {
988 	unsigned int size;
989 	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
990 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
991 
992 	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
993 	/*
994 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
995 	 * count as 2.
996 	 */
997 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
998 					     MLX5_WQE_DS(size));
999 	mpw->state = MLX5_MPW_STATE_CLOSED;
1000 	inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG);
1001 	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1002 }
1003 
1004 /**
1005  * DPDK callback for TX with MPW inline support.
1006  *
1007  * @param dpdk_txq
1008  *   Generic pointer to TX queue structure.
1009  * @param[in] pkts
1010  *   Packets to transmit.
1011  * @param pkts_n
1012  *   Number of packets in array.
1013  *
1014  * @return
1015  *   Number of packets successfully transmitted (<= pkts_n).
1016  */
1017 uint16_t
1018 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
1019 			 uint16_t pkts_n)
1020 {
1021 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1022 	uint16_t elts_head = txq->elts_head;
1023 	const uint16_t elts_n = 1 << txq->elts_n;
1024 	const uint16_t elts_m = elts_n - 1;
1025 	unsigned int i = 0;
1026 	unsigned int j = 0;
1027 	uint16_t max_elts;
1028 	uint16_t max_wqe;
1029 	unsigned int comp;
1030 	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
1031 	struct mlx5_mpw mpw = {
1032 		.state = MLX5_MPW_STATE_CLOSED,
1033 	};
1034 	/*
1035 	 * Compute the maximum number of WQEs which can be consumed by the
1036 	 * inline code:
1037 	 * - 2 DSEGs for:
1038 	 *   - 1 control segment,
1039 	 *   - 1 Ethernet segment,
1040 	 * - N DSEGs from the inline request.
1041 	 */
1042 	const unsigned int wqe_inl_n =
1043 		((2 * MLX5_WQE_DWORD_SIZE +
1044 		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
1045 		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
1046 
1047 	if (unlikely(!pkts_n))
1048 		return 0;
1049 	/* Prefetch first packet cacheline. */
1050 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1051 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1052 	/* Start processing. */
1053 	mlx5_tx_complete(txq);
1054 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1055 	/* A CQE slot must always be available. */
1056 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1057 	do {
1058 		struct rte_mbuf *buf = *(pkts++);
1059 		uintptr_t addr;
1060 		uint32_t length;
1061 		unsigned int segs_n = buf->nb_segs;
1062 		uint8_t cs_flags;
1063 
1064 		/*
1065 		 * Make sure there is enough room to store this packet and
1066 		 * that one ring entry remains unused.
1067 		 */
1068 		assert(segs_n);
1069 		if (max_elts < segs_n)
1070 			break;
1071 		/* Do not bother with large packets that MPW cannot handle. */
1072 		if (segs_n > MLX5_MPW_DSEG_MAX) {
1073 			txq->stats.oerrors++;
1074 			break;
1075 		}
1076 		max_elts -= segs_n;
1077 		--pkts_n;
1078 		/*
1079 		 * Compute max_wqe in case less WQE were consumed in previous
1080 		 * iteration.
1081 		 */
1082 		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1083 		cs_flags = txq_ol_cksum_to_cs(txq, buf);
1084 		/* Retrieve packet information. */
1085 		length = PKT_LEN(buf);
1086 		/* Start new session if packet differs. */
1087 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1088 			if ((mpw.len != length) ||
1089 			    (segs_n != 1) ||
1090 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1091 				mlx5_mpw_close(txq, &mpw);
1092 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
1093 			if ((mpw.len != length) ||
1094 			    (segs_n != 1) ||
1095 			    (length > inline_room) ||
1096 			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
1097 				mlx5_mpw_inline_close(txq, &mpw);
1098 				inline_room =
1099 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1100 			}
1101 		}
1102 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1103 			if ((segs_n != 1) ||
1104 			    (length > inline_room)) {
1105 				/*
1106 				 * A Multi-Packet WQE consumes at most two WQEs.
1107 				 * mlx5_mpw_new() expects to be able to use
1108 				 * such resources.
1109 				 */
1110 				if (unlikely(max_wqe < 2))
1111 					break;
1112 				max_wqe -= 2;
1113 				mlx5_mpw_new(txq, &mpw, length);
1114 				mpw.wqe->eseg.cs_flags = cs_flags;
1115 			} else {
1116 				if (unlikely(max_wqe < wqe_inl_n))
1117 					break;
1118 				max_wqe -= wqe_inl_n;
1119 				mlx5_mpw_inline_new(txq, &mpw, length);
1120 				mpw.wqe->eseg.cs_flags = cs_flags;
1121 			}
1122 		}
1123 		/* Multi-segment packets must be alone in their MPW. */
1124 		assert((segs_n == 1) || (mpw.pkts_n == 0));
1125 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1126 			assert(inline_room ==
1127 			       txq->max_inline * RTE_CACHE_LINE_SIZE);
1128 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1129 			length = 0;
1130 #endif
1131 			do {
1132 				volatile struct mlx5_wqe_data_seg *dseg;
1133 
1134 				assert(buf);
1135 				(*txq->elts)[elts_head++ & elts_m] = buf;
1136 				dseg = mpw.data.dseg[mpw.pkts_n];
1137 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
1138 				*dseg = (struct mlx5_wqe_data_seg){
1139 					.byte_count =
1140 					       rte_cpu_to_be_32(DATA_LEN(buf)),
1141 					.lkey = mlx5_tx_mb2mr(txq, buf),
1142 					.addr = rte_cpu_to_be_64(addr),
1143 				};
1144 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1145 				length += DATA_LEN(buf);
1146 #endif
1147 				buf = buf->next;
1148 				++mpw.pkts_n;
1149 				++j;
1150 			} while (--segs_n);
1151 			assert(length == mpw.len);
1152 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1153 				mlx5_mpw_close(txq, &mpw);
1154 		} else {
1155 			unsigned int max;
1156 
1157 			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
1158 			assert(length <= inline_room);
1159 			assert(length == DATA_LEN(buf));
1160 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1161 			(*txq->elts)[elts_head++ & elts_m] = buf;
1162 			/* Maximum number of bytes before wrapping. */
1163 			max = ((((uintptr_t)(txq->wqes)) +
1164 				(1 << txq->wqe_n) *
1165 				MLX5_WQE_SIZE) -
1166 			       (uintptr_t)mpw.data.raw);
1167 			if (length > max) {
1168 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1169 					   (void *)addr,
1170 					   max);
1171 				mpw.data.raw = (volatile void *)txq->wqes;
1172 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1173 					   (void *)(addr + max),
1174 					   length - max);
1175 				mpw.data.raw += length - max;
1176 			} else {
1177 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1178 					   (void *)addr,
1179 					   length);
1180 
1181 				if (length == max)
1182 					mpw.data.raw =
1183 						(volatile void *)txq->wqes;
1184 				else
1185 					mpw.data.raw += length;
1186 			}
1187 			++mpw.pkts_n;
1188 			mpw.total_len += length;
1189 			++j;
1190 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
1191 				mlx5_mpw_inline_close(txq, &mpw);
1192 				inline_room =
1193 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1194 			} else {
1195 				inline_room -= length;
1196 			}
1197 		}
1198 #ifdef MLX5_PMD_SOFT_COUNTERS
1199 		/* Increment sent bytes counter. */
1200 		txq->stats.obytes += length;
1201 #endif
1202 		++i;
1203 	} while (pkts_n);
1204 	/* Take a shortcut if nothing must be sent. */
1205 	if (unlikely(i == 0))
1206 		return 0;
1207 	/* Check whether completion threshold has been reached. */
1208 	/* "j" includes both packets and segments. */
1209 	comp = txq->elts_comp + j;
1210 	if (comp >= MLX5_TX_COMP_THRESH) {
1211 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1212 
1213 		/* Request completion on last WQE. */
1214 		wqe->ctrl[2] = rte_cpu_to_be_32(8);
1215 		/* Save elts_head in unused "immediate" field of WQE. */
1216 		wqe->ctrl[3] = elts_head;
1217 		txq->elts_comp = 0;
1218 #ifndef NDEBUG
1219 		++txq->cq_pi;
1220 #endif
1221 	} else {
1222 		txq->elts_comp = comp;
1223 	}
1224 #ifdef MLX5_PMD_SOFT_COUNTERS
1225 	/* Increment sent packets counter. */
1226 	txq->stats.opackets += i;
1227 #endif
1228 	/* Ring QP doorbell. */
1229 	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
1230 		mlx5_mpw_inline_close(txq, &mpw);
1231 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
1232 		mlx5_mpw_close(txq, &mpw);
1233 	mlx5_tx_dbrec(txq, mpw.wqe);
1234 	txq->elts_head = elts_head;
1235 	return i;
1236 }
1237 
1238 /**
1239  * Open an Enhanced MPW session.
1240  *
1241  * @param txq
1242  *   Pointer to TX queue structure.
1243  * @param mpw
1244  *   Pointer to MPW session structure.
1245  * @param padding
1246  *   Non-zero to pad the session start with a zero-length inline header.
1247  */
1248 static inline void
1249 mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
1250 {
1251 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1252 
1253 	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
1254 	mpw->pkts_n = 0;
1255 	mpw->total_len = sizeof(struct mlx5_wqe);
1256 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1257 	mpw->wqe->ctrl[0] =
1258 		rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
1259 				 (txq->wqe_ci << 8) |
1260 				 MLX5_OPCODE_ENHANCED_MPSW);
1261 	mpw->wqe->ctrl[2] = 0;
1262 	mpw->wqe->ctrl[3] = 0;
1263 	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
1264 	if (unlikely(padding)) {
1265 		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
1266 
1267 		/* Pad the first 2 DWORDs with zero-length inline header. */
1268 		*(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
1269 		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
1270 			rte_cpu_to_be_32(MLX5_INLINE_SEG);
1271 		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
1272 		/* Start from the next WQEBB. */
1273 		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
1274 	} else {
1275 		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
1276 	}
1277 }
1278 
1279 /**
1280  * Close an Enhanced MPW session.
1281  *
1282  * @param txq
1283  *   Pointer to TX queue structure.
1284  * @param mpw
1285  *   Pointer to MPW session structure.
1286  *
1287  * @return
1288  *   Number of consumed WQEs.
1289  */
1290 static inline uint16_t
1291 mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
1292 {
1293 	uint16_t ret;
1294 
1295 	/* Store size in multiple of 16 bytes. Control and Ethernet segments
1296 	 * count as 2.
1297 	 */
1298 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
1299 					     MLX5_WQE_DS(mpw->total_len));
1300 	mpw->state = MLX5_MPW_STATE_CLOSED;
1301 	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1302 	txq->wqe_ci += ret;
1303 	return ret;
1304 }
1305 
1306 /**
1307  * TX with Enhanced MPW support.
1308  *
1309  * @param txq
1310  *   Pointer to TX queue structure.
1311  * @param[in] pkts
1312  *   Packets to transmit.
1313  * @param pkts_n
1314  *   Number of packets in array.
1315  *
1316  * @return
1317  *   Number of packets successfully transmitted (<= pkts_n).
1318  */
1319 static inline uint16_t
1320 txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
1321 	       uint16_t pkts_n)
1322 {
1323 	uint16_t elts_head = txq->elts_head;
1324 	const uint16_t elts_n = 1 << txq->elts_n;
1325 	const uint16_t elts_m = elts_n - 1;
1326 	unsigned int i = 0;
1327 	unsigned int j = 0;
1328 	uint16_t max_elts;
1329 	uint16_t max_wqe;
1330 	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
1331 	unsigned int mpw_room = 0;
1332 	unsigned int inl_pad = 0;
1333 	uint32_t inl_hdr;
1334 	struct mlx5_mpw mpw = {
1335 		.state = MLX5_MPW_STATE_CLOSED,
1336 	};
1337 
1338 	if (unlikely(!pkts_n))
1339 		return 0;
1340 	/* Start processing. */
1341 	mlx5_tx_complete(txq);
1342 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1343 	/* A CQE slot must always be available. */
1344 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1345 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1346 	if (unlikely(!max_wqe))
1347 		return 0;
1348 	do {
1349 		struct rte_mbuf *buf = *(pkts++);
1350 		uintptr_t addr;
1351 		unsigned int n;
1352 		unsigned int do_inline = 0; /* Whether inline is possible. */
1353 		uint32_t length;
1354 		uint8_t cs_flags;
1355 
1356 		/* Multi-segment packets are handled in the slow path outside. */
1357 		assert(NB_SEGS(buf) == 1);
1358 		/* Make sure there is enough room to store this packet. */
1359 		if (max_elts - j == 0)
1360 			break;
1361 		cs_flags = txq_ol_cksum_to_cs(txq, buf);
1362 		/* Retrieve packet information. */
1363 		length = PKT_LEN(buf);
1364 		/* Start new session if:
1365 		 * - multi-segment packet
1366 		 * - no space left even for a dseg
1367 		 * - next packet can be inlined with a new WQE
1368 		 * - cs_flag differs
1369 		 */
1370 		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
1371 			if ((inl_pad + sizeof(struct mlx5_wqe_data_seg) >
1372 			     mpw_room) ||
1373 			    (length <= txq->inline_max_packet_sz &&
1374 			     inl_pad + sizeof(inl_hdr) + length >
1375 			     mpw_room) ||
1376 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1377 				max_wqe -= mlx5_empw_close(txq, &mpw);
1378 		}
1379 		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
1380 			/* In Enhanced MPW, inline as much as the budget
1381 			 * allows. The remaining space is to be filled with
1382 			 * dsegs. If the title WQEBB isn't padded, it will hold
1383 			 * 2 dsegs there.
1384 			 */
1385 			mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
1386 					   (max_inline ? max_inline :
1387 					    pkts_n * MLX5_WQE_DWORD_SIZE) +
1388 					   MLX5_WQE_SIZE);
1389 			if (unlikely(max_wqe * MLX5_WQE_SIZE < mpw_room))
1390 				break;
1391 			/* Don't pad the title WQEBB to avoid wasting WQ space. */
1392 			mlx5_empw_new(txq, &mpw, 0);
1393 			mpw_room -= mpw.total_len;
1394 			inl_pad = 0;
1395 			do_inline = length <= txq->inline_max_packet_sz &&
1396 				    sizeof(inl_hdr) + length <= mpw_room &&
1397 				    !txq->mpw_hdr_dseg;
1398 			mpw.wqe->eseg.cs_flags = cs_flags;
1399 		} else {
1400 			/* Evaluate whether the next packet can be inlined.
1401 			 * Inlining is possible when:
1402 			 * - length is less than the configured value
1403 			 * - length fits in the remaining space
1404 			 * - not required to fill the title WQEBB with dsegs
1405 			 */
1406 			do_inline =
1407 				length <= txq->inline_max_packet_sz &&
1408 				inl_pad + sizeof(inl_hdr) + length <=
1409 				 mpw_room &&
1410 				(!txq->mpw_hdr_dseg ||
1411 				 mpw.total_len >= MLX5_WQE_SIZE);
1412 		}
1413 		if (do_inline) {
1414 			/* Inline packet into WQE. */
1415 			unsigned int max;
1416 
1417 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1418 			assert(length == DATA_LEN(buf));
1419 			inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
1420 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1421 			mpw.data.raw = (volatile void *)
1422 				((uintptr_t)mpw.data.raw + inl_pad);
1423 			max = tx_mlx5_wq_tailroom(txq,
1424 					(void *)(uintptr_t)mpw.data.raw);
1425 			/* Copy inline header. */
1426 			mpw.data.raw = (volatile void *)
1427 				mlx5_copy_to_wq(
1428 					  (void *)(uintptr_t)mpw.data.raw,
1429 					  &inl_hdr,
1430 					  sizeof(inl_hdr),
1431 					  (void *)(uintptr_t)txq->wqes,
1432 					  max);
1433 			max = tx_mlx5_wq_tailroom(txq,
1434 					(void *)(uintptr_t)mpw.data.raw);
1435 			/* Copy packet data. */
1436 			mpw.data.raw = (volatile void *)
1437 				mlx5_copy_to_wq(
1438 					  (void *)(uintptr_t)mpw.data.raw,
1439 					  (void *)addr,
1440 					  length,
1441 					  (void *)(uintptr_t)txq->wqes,
1442 					  max);
1443 			++mpw.pkts_n;
1444 			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
1445 			/* No need to get completion as the entire packet is
1446 			 * copied to WQ. Free the buf right away.
1447 			 */
1448 			rte_pktmbuf_free_seg(buf);
1449 			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
1450 			/* Add pad in the next packet if any. */
1451 			inl_pad = (((uintptr_t)mpw.data.raw +
1452 					(MLX5_WQE_DWORD_SIZE - 1)) &
1453 					~(MLX5_WQE_DWORD_SIZE - 1)) -
1454 				  (uintptr_t)mpw.data.raw;
1455 		} else {
1456 			/* No inline. Store a dseg pointing to the packet buffer. */
1457 			volatile rte_v128u32_t *dseg;
1458 
1459 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1460 			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
1461 			assert(length == DATA_LEN(buf));
1462 			if (!tx_mlx5_wq_tailroom(txq,
1463 					(void *)((uintptr_t)mpw.data.raw
1464 						+ inl_pad)))
1465 				dseg = (volatile void *)txq->wqes;
1466 			else
1467 				dseg = (volatile void *)
1468 					((uintptr_t)mpw.data.raw +
1469 					 inl_pad);
1470 			(*txq->elts)[elts_head++ & elts_m] = buf;
1471 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1472 			for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++)
1473 				rte_prefetch2((void *)(addr +
1474 						n * RTE_CACHE_LINE_SIZE));
1475 			addr = rte_cpu_to_be_64(addr);
1476 			*dseg = (rte_v128u32_t) {
1477 				rte_cpu_to_be_32(length),
1478 				mlx5_tx_mb2mr(txq, buf),
1479 				addr,
1480 				addr >> 32,
1481 			};
1482 			mpw.data.raw = (volatile void *)(dseg + 1);
1483 			mpw.total_len += (inl_pad + sizeof(*dseg));
1484 			++j;
1485 			++mpw.pkts_n;
1486 			mpw_room -= (inl_pad + sizeof(*dseg));
1487 			inl_pad = 0;
1488 		}
1489 #ifdef MLX5_PMD_SOFT_COUNTERS
1490 		/* Increment sent bytes counter. */
1491 		txq->stats.obytes += length;
1492 #endif
1493 		++i;
1494 	} while (i < pkts_n);
1495 	/* Take a shortcut if nothing must be sent. */
1496 	if (unlikely(i == 0))
1497 		return 0;
1498 	/* Check whether completion threshold has been reached. */
1499 	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
1500 			(uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
1501 			 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
1502 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1503 
1504 		/* Request completion on last WQE. */
1505 		wqe->ctrl[2] = rte_cpu_to_be_32(8);
1506 		/* Save elts_head in unused "immediate" field of WQE. */
1507 		wqe->ctrl[3] = elts_head;
1508 		txq->elts_comp = 0;
1509 		txq->mpw_comp = txq->wqe_ci;
1510 #ifndef NDEBUG
1511 		++txq->cq_pi;
1512 #endif
1513 	} else {
1514 		txq->elts_comp += j;
1515 	}
1516 #ifdef MLX5_PMD_SOFT_COUNTERS
1517 	/* Increment sent packets counter. */
1518 	txq->stats.opackets += i;
1519 #endif
1520 	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
1521 		mlx5_empw_close(txq, &mpw);
1522 	/* Ring QP doorbell. */
1523 	mlx5_tx_dbrec(txq, mpw.wqe);
1524 	txq->elts_head = elts_head;
1525 	return i;
1526 }
1527 
1528 /**
1529  * DPDK callback for TX with Enhanced MPW support.
1530  *
1531  * @param dpdk_txq
1532  *   Generic pointer to TX queue structure.
1533  * @param[in] pkts
1534  *   Packets to transmit.
1535  * @param pkts_n
1536  *   Number of packets in array.
1537  *
1538  * @return
1539  *   Number of packets successfully transmitted (<= pkts_n).
1540  */
1541 uint16_t
1542 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1543 {
1544 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1545 	uint16_t nb_tx = 0;
1546 
1547 	while (pkts_n > nb_tx) {
1548 		uint16_t n;
1549 		uint16_t ret;
1550 
1551 		n = txq_count_contig_multi_seg(&pkts[nb_tx], pkts_n - nb_tx);
1552 		if (n) {
1553 			ret = mlx5_tx_burst(dpdk_txq, &pkts[nb_tx], n);
1554 			if (!ret)
1555 				break;
1556 			nb_tx += ret;
1557 		}
1558 		n = txq_count_contig_single_seg(&pkts[nb_tx], pkts_n - nb_tx);
1559 		if (n) {
1560 			ret = txq_burst_empw(txq, &pkts[nb_tx], n);
1561 			if (!ret)
1562 				break;
1563 			nb_tx += ret;
1564 		}
1565 	}
1566 	return nb_tx;
1567 }
1568 
1569 /**
1570  * Translate RX completion flags to packet type.
1571  *
1572  * @param[in] cqe
1573  *   Pointer to CQE.
1574  *
1575  * @note: fix mlx5_dev_supported_ptypes_get() if anything changes here.
1576  *
1577  * @return
1578  *   Packet type for struct rte_mbuf.
1579  */
1580 static inline uint32_t
1581 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
1582 {
1583 	uint8_t idx;
1584 	uint8_t pinfo = cqe->pkt_info;
1585 	uint16_t ptype = cqe->hdr_type_etc;
1586 
1587 	/*
1588 	 * The index to the array should have:
1589 	 * bit[1:0] = l3_hdr_type
1590 	 * bit[4:2] = l4_hdr_type
1591 	 * bit[5] = ip_frag
1592 	 * bit[6] = tunneled
1593 	 * bit[7] = outer_l3_type
1594 	 */
1595 	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
1596 	return mlx5_ptype_table[idx];
1597 }
1598 
1599 /**
1600  * Get size of the next packet for a given CQE. For compressed CQEs, the
1601  * consumer index is updated only once all packets of the current one have
1602  * been processed.
1603  *
1604  * @param rxq
1605  *   Pointer to RX queue.
1606  * @param cqe
1607  *   CQE to process.
1608  * @param[out] rss_hash
1609  *   Packet RSS Hash result.
1610  *
1611  * @return
1612  *   Packet size in bytes (0 if there is none), -1 in case of completion
1613  *   with error.
1614  */
1615 static inline int
1616 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
1617 		 uint16_t cqe_cnt, uint32_t *rss_hash)
1618 {
1619 	struct rxq_zip *zip = &rxq->zip;
1620 	uint16_t cqe_n = cqe_cnt + 1;
1621 	int len = 0;
1622 	uint16_t idx, end;
1623 
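	/*
	 * Mini-CQEs are packed eight per regular CQE slot: zip->ai tracks the
	 * position inside the current mini array, while zip->ca and zip->na
	 * bound the CQE slots to invalidate once they have been consumed.
	 */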
1624 	/* Process compressed data in the CQE and mini arrays. */
1625 	if (zip->ai) {
1626 		volatile struct mlx5_mini_cqe8 (*mc)[8] =
1627 			(volatile struct mlx5_mini_cqe8 (*)[8])
1628 			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info);
1629 
1630 		len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
1631 		*rss_hash = rte_be_to_cpu_32((*mc)[zip->ai & 7].rx_hash_result);
1632 		if ((++zip->ai & 7) == 0) {
1633 			/* Invalidate consumed CQEs */
1634 			idx = zip->ca;
1635 			end = zip->na;
1636 			while (idx != end) {
1637 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1638 					MLX5_CQE_INVALIDATE;
1639 				++idx;
1640 			}
1641 			/*
1642 			 * Increment consumer index to skip the number of
1643 			 * CQEs consumed. Hardware leaves holes in the CQ
1644 			 * ring for software use.
1645 			 */
1646 			zip->ca = zip->na;
1647 			zip->na += 8;
1648 		}
1649 		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
1650 			/* Invalidate the rest */
1651 			idx = zip->ca;
1652 			end = zip->cq_ci;
1653 
1654 			while (idx != end) {
1655 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1656 					MLX5_CQE_INVALIDATE;
1657 				++idx;
1658 			}
1659 			rxq->cq_ci = zip->cq_ci;
1660 			zip->ai = 0;
1661 		}
1662 	/* No compressed data, get next CQE and verify if it is compressed. */
1663 	} else {
1664 		int ret;
1665 		int8_t op_own;
1666 
1667 		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
1668 		if (unlikely(ret == 1))
1669 			return 0;
1670 		++rxq->cq_ci;
1671 		op_own = cqe->op_own;
1672 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
1673 			volatile struct mlx5_mini_cqe8 (*mc)[8] =
1674 				(volatile struct mlx5_mini_cqe8 (*)[8])
1675 				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
1676 							  cqe_cnt].pkt_info);
1677 
1678 			/* Fix endianness. */
1679 			zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
1680 			/*
1681 			 * Current mini array position is the one returned by
1682 			 * check_cqe64().
1683 			 * check_cqe().
1684 			 * If completion comprises several mini arrays, as a
1685 			 * special case the second one is located 7 CQEs after
1686 			 * the initial CQE instead of 8 for subsequent ones.
1687 			 */
1688 			zip->ca = rxq->cq_ci;
1689 			zip->na = zip->ca + 7;
1690 			/* Compute the next non compressed CQE. */
1691 			--rxq->cq_ci;
1692 			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
1693 			/* Get packet size to return. */
1694 			len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
1695 			*rss_hash = rte_be_to_cpu_32((*mc)[0].rx_hash_result);
1696 			zip->ai = 1;
1697 			/* Prefetch all the entries to be invalidated */
1698 			idx = zip->ca;
1699 			end = zip->cq_ci;
1700 			while (idx != end) {
1701 				rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]);
1702 				++idx;
1703 			}
1704 		} else {
1705 			len = rte_be_to_cpu_32(cqe->byte_cnt);
1706 			*rss_hash = rte_be_to_cpu_32(cqe->rx_hash_res);
1707 		}
1708 		/* Error while receiving packet. */
1709 		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
1710 			return -1;
1711 	}
1712 	return len;
1713 }
1714 
1715 /**
1716  * Translate RX completion flags to offload flags.
1717  *
1718  * @param[in] rxq
1719  *   Pointer to RX queue structure.
1720  * @param[in] cqe
1721  *   Pointer to CQE.
1722  *
1723  * @return
1724  *   Offload flags (ol_flags) for struct rte_mbuf.
1725  */
1726 static inline uint32_t
1727 rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
1728 {
1729 	uint32_t ol_flags = 0;
1730 	uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
1731 
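	/*
	 * TRANSPOSE() relocates the tested completion bit to the position of
	 * the corresponding ol_flags bit, avoiding conditional branches.
	 */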
1732 	ol_flags =
1733 		TRANSPOSE(flags,
1734 			  MLX5_CQE_RX_L3_HDR_VALID,
1735 			  PKT_RX_IP_CKSUM_GOOD) |
1736 		TRANSPOSE(flags,
1737 			  MLX5_CQE_RX_L4_HDR_VALID,
1738 			  PKT_RX_L4_CKSUM_GOOD);
1739 	if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
1740 		ol_flags |=
1741 			TRANSPOSE(flags,
1742 				  MLX5_CQE_RX_L3_HDR_VALID,
1743 				  PKT_RX_IP_CKSUM_GOOD) |
1744 			TRANSPOSE(flags,
1745 				  MLX5_CQE_RX_L4_HDR_VALID,
1746 				  PKT_RX_L4_CKSUM_GOOD);
1747 	return ol_flags;
1748 }
1749 
1750 /**
1751  * DPDK callback for RX.
1752  *
1753  * @param dpdk_rxq
1754  *   Generic pointer to RX queue structure.
1755  * @param[out] pkts
1756  *   Array to store received packets.
1757  * @param pkts_n
1758  *   Maximum number of packets in array.
1759  *
1760  * @return
1761  *   Number of packets successfully received (<= pkts_n).
1762  */
1763 uint16_t
1764 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1765 {
1766 	struct mlx5_rxq_data *rxq = dpdk_rxq;
1767 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
1768 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
1769 	const unsigned int sges_n = rxq->sges_n;
1770 	struct rte_mbuf *pkt = NULL;
1771 	struct rte_mbuf *seg = NULL;
1772 	volatile struct mlx5_cqe *cqe =
1773 		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1774 	unsigned int i = 0;
1775 	unsigned int rq_ci = rxq->rq_ci << sges_n;
1776 	int len = 0; /* keep its value across iterations. */
1777 
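	/*
	 * Each iteration handles one Rx ring entry; a scattered packet keeps
	 * pkt set across iterations until its last segment has been filled.
	 */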
1778 	while (pkts_n) {
1779 		unsigned int idx = rq_ci & wqe_cnt;
1780 		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
1781 		struct rte_mbuf *rep = (*rxq->elts)[idx];
1782 		uint32_t rss_hash_res = 0;
1783 
1784 		if (pkt)
1785 			NEXT(seg) = rep;
1786 		seg = rep;
1787 		rte_prefetch0(seg);
1788 		rte_prefetch0(cqe);
1789 		rte_prefetch0(wqe);
1790 		rep = rte_mbuf_raw_alloc(rxq->mp);
1791 		if (unlikely(rep == NULL)) {
1792 			++rxq->stats.rx_nombuf;
1793 			if (!pkt) {
1794 				/*
1795 				 * No buffers before we even started,
1796 				 * bail out silently.
1797 				 */
1798 				break;
1799 			}
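			/*
			 * Free the mbufs already chained to the packet being
			 * assembled; seg still belongs to the Rx ring.
			 */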
1800 			while (pkt != seg) {
1801 				assert(pkt != (*rxq->elts)[idx]);
1802 				rep = NEXT(pkt);
1803 				NEXT(pkt) = NULL;
1804 				NB_SEGS(pkt) = 1;
1805 				rte_mbuf_raw_free(pkt);
1806 				pkt = rep;
1807 			}
1808 			break;
1809 		}
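		/*
		 * Starting a new packet: poll the CQ for its length, RSS
		 * hash and error status.
		 */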
1810 		if (!pkt) {
1811 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1812 			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
1813 					       &rss_hash_res);
1814 			if (!len) {
1815 				rte_mbuf_raw_free(rep);
1816 				break;
1817 			}
1818 			if (unlikely(len == -1)) {
1819 				/* RX error, packet is likely too large. */
1820 				rte_mbuf_raw_free(rep);
1821 				++rxq->stats.idropped;
1822 				goto skip;
1823 			}
1824 			pkt = seg;
1825 			assert(len >= (rxq->crc_present << 2));
1826 			/* Update packet information. */
1827 			pkt->packet_type = rxq_cq_to_pkt_type(cqe);
1828 			pkt->ol_flags = 0;
1829 			if (rss_hash_res && rxq->rss_hash) {
1830 				pkt->hash.rss = rss_hash_res;
1831 				pkt->ol_flags = PKT_RX_RSS_HASH;
1832 			}
1833 			if (rxq->mark &&
1834 			    MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
1835 				pkt->ol_flags |= PKT_RX_FDIR;
1836 				if (cqe->sop_drop_qpn !=
1837 				    rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
1838 					uint32_t mark = cqe->sop_drop_qpn;
1839 
1840 					pkt->ol_flags |= PKT_RX_FDIR_ID;
1841 					pkt->hash.fdir.hi =
1842 						mlx5_flow_mark_get(mark);
1843 				}
1844 			}
1845 			if (rxq->csum | rxq->csum_l2tun)
1846 				pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe);
1847 			if (rxq->vlan_strip &&
1848 			    (cqe->hdr_type_etc &
1849 			     rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
1850 				pkt->ol_flags |= PKT_RX_VLAN |
1851 					PKT_RX_VLAN_STRIPPED;
1852 				pkt->vlan_tci =
1853 					rte_be_to_cpu_16(cqe->vlan_info);
1854 			}
1855 			if (rxq->hw_timestamp) {
1856 				pkt->timestamp =
1857 					rte_be_to_cpu_64(cqe->timestamp);
1858 				pkt->ol_flags |= PKT_RX_TIMESTAMP;
1859 			}
1860 			if (rxq->crc_present)
1861 				len -= ETHER_CRC_LEN;
1862 			PKT_LEN(pkt) = len;
1863 		}
1864 		DATA_LEN(rep) = DATA_LEN(seg);
1865 		PKT_LEN(rep) = PKT_LEN(seg);
1866 		SET_DATA_OFF(rep, DATA_OFF(seg));
1867 		PORT(rep) = PORT(seg);
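		/* The new mbuf replaces the consumed one in the Rx ring. */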
1868 		(*rxq->elts)[idx] = rep;
1869 		/*
1870 		 * Fill NIC descriptor with the new buffer.  The lkey and size
1871 		 * of the buffers are already known, only the buffer address
1872 		 * changes.
1873 		 */
1874 		wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
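		/*
		 * If the packet is larger than this segment, continue
		 * scattering it into the following ring entries.
		 */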
1875 		if (len > DATA_LEN(seg)) {
1876 			len -= DATA_LEN(seg);
1877 			++NB_SEGS(pkt);
1878 			++rq_ci;
1879 			continue;
1880 		}
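		/* Last segment: the remaining bytes fit here. */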
1881 		DATA_LEN(seg) = len;
1882 #ifdef MLX5_PMD_SOFT_COUNTERS
1883 		/* Increment bytes counter. */
1884 		rxq->stats.ibytes += PKT_LEN(pkt);
1885 #endif
1886 		/* Return packet. */
1887 		*(pkts++) = pkt;
1888 		pkt = NULL;
1889 		--pkts_n;
1890 		++i;
1891 skip:
1892 		/* Align consumer index to the next stride. */
1893 		rq_ci >>= sges_n;
1894 		++rq_ci;
1895 		rq_ci <<= sges_n;
1896 	}
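	/*
	 * Nothing was received and no ring entry was consumed;
	 * leave the doorbell records untouched.
	 */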
1897 	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
1898 		return 0;
1899 	/* Update the consumer index. */
1900 	rxq->rq_ci = rq_ci >> sges_n;
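	/*
	 * The barriers keep the doorbell record updates ordered: the CQ
	 * doorbell is written after the CQEs have been processed and before
	 * the RQ doorbell.
	 */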
1901 	rte_io_wmb();
1902 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1903 	rte_io_wmb();
1904 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1905 #ifdef MLX5_PMD_SOFT_COUNTERS
1906 	/* Increment packets counter. */
1907 	rxq->stats.ipackets += i;
1908 #endif
1909 	return i;
1910 }
1911 
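/*
 * Applications reach this routine through rte_eth_rx_burst(). A minimal
 * polling loop (illustrative sketch only, port_id and queue_id are
 * placeholders) might look like:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t i, n;
 *
 *	n = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
 *	for (i = 0; i < n; ++i)
 *		rte_pktmbuf_free(pkts[i]);
 */
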
1912 /**
1913  * Dummy DPDK callback for TX.
1914  *
1915  * This function is used to temporarily replace the real callback during
1916  * unsafe control operations on the queue, or in case of error.
1917  *
1918  * @param dpdk_txq
1919  *   Generic pointer to TX queue structure.
1920  * @param[in] pkts
1921  *   Packets to transmit.
1922  * @param pkts_n
1923  *   Number of packets in array.
1924  *
1925  * @return
1926  *   Number of packets successfully transmitted (<= pkts_n).
1927  */
1928 uint16_t
1929 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1930 {
1931 	(void)dpdk_txq;
1932 	(void)pkts;
1933 	(void)pkts_n;
1934 	return 0;
1935 }
1936 
1937 /**
1938  * Dummy DPDK callback for RX.
1939  *
1940  * This function is used to temporarily replace the real callback during
1941  * unsafe control operations on the queue, or in case of error.
1942  *
1943  * @param dpdk_rxq
1944  *   Generic pointer to RX queue structure.
1945  * @param[out] pkts
1946  *   Array to store received packets.
1947  * @param pkts_n
1948  *   Maximum number of packets in array.
1949  *
1950  * @return
1951  *   Number of packets successfully received (<= pkts_n).
1952  */
1953 uint16_t
1954 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1955 {
1956 	(void)dpdk_rxq;
1957 	(void)pkts;
1958 	(void)pkts_n;
1959 	return 0;
1960 }
1961 
1962 /*
1963  * Vectorized Rx/Tx routines are not compiled in when the required vector
1964  * instructions are not supported on the target architecture. The following
1965  * null stubs are needed for linkage when the real implementations, which live
1966  * outside of this file (e.g. mlx5_rxtx_vec_sse.c for x86), are excluded.
1967  */
1968 
1969 uint16_t __attribute__((weak))
1970 mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1971 {
1972 	(void)dpdk_txq;
1973 	(void)pkts;
1974 	(void)pkts_n;
1975 	return 0;
1976 }
1977 
1978 uint16_t __attribute__((weak))
1979 mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1980 {
1981 	(void)dpdk_txq;
1982 	(void)pkts;
1983 	(void)pkts_n;
1984 	return 0;
1985 }
1986 
1987 uint16_t __attribute__((weak))
1988 mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1989 {
1990 	(void)dpdk_rxq;
1991 	(void)pkts;
1992 	(void)pkts_n;
1993 	return 0;
1994 }
1995 
1996 int __attribute__((weak))
1997 priv_check_raw_vec_tx_support(struct priv *priv)
1998 {
1999 	(void)priv;
2000 	return -ENOTSUP;
2001 }
2002 
2003 int __attribute__((weak))
2004 priv_check_vec_tx_support(struct priv *priv)
2005 {
2006 	(void)priv;
2007 	return -ENOTSUP;
2008 }
2009 
2010 int __attribute__((weak))
2011 rxq_check_vec_support(struct mlx5_rxq_data *rxq)
2012 {
2013 	(void)rxq;
2014 	return -ENOTSUP;
2015 }
2016 
2017 int __attribute__((weak))
2018 priv_check_vec_rx_support(struct priv *priv)
2019 {
2020 	(void)priv;
2021 	return -ENOTSUP;
2022 }
2023