xref: /dpdk/drivers/net/mlx5/mlx5_rxtx.c (revision 358309f36776ba397601ba25710e7d23ee8f55ce)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox.
4  */
5 
6 #include <assert.h>
7 #include <stdint.h>
8 #include <string.h>
9 #include <stdlib.h>
10 
11 /* Verbs header. */
12 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
13 #ifdef PEDANTIC
14 #pragma GCC diagnostic ignored "-Wpedantic"
15 #endif
16 #include <infiniband/verbs.h>
17 #include <infiniband/mlx5dv.h>
18 #ifdef PEDANTIC
19 #pragma GCC diagnostic error "-Wpedantic"
20 #endif
21 
22 #include <rte_mbuf.h>
23 #include <rte_mempool.h>
24 #include <rte_prefetch.h>
25 #include <rte_common.h>
26 #include <rte_branch_prediction.h>
27 #include <rte_ether.h>
28 
29 #include "mlx5.h"
30 #include "mlx5_utils.h"
31 #include "mlx5_rxtx.h"
32 #include "mlx5_autoconf.h"
33 #include "mlx5_defs.h"
34 #include "mlx5_prm.h"
35 
36 static __rte_always_inline uint32_t
37 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe);
38 
39 static __rte_always_inline int
40 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
41 		 uint16_t cqe_cnt, uint32_t *rss_hash);
42 
43 static __rte_always_inline uint32_t
44 rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
45 
46 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
47 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
48 };
49 
50 /**
51  * Build a table to translate Rx completion flags to packet type.
52  *
53  * @note: update mlx5_dev_supported_ptypes_get() if anything changes here.
54  */
55 void
56 mlx5_set_ptype_table(void)
57 {
58 	unsigned int i;
59 	uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;
60 
61 	/* Last entry must not be overwritten, reserved for errored packet. */
62 	for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
63 		(*p)[i] = RTE_PTYPE_UNKNOWN;
64 	/*
65 	 * The index to the array should have:
66 	 * bit[1:0] = l3_hdr_type
67 	 * bit[4:2] = l4_hdr_type
68 	 * bit[5] = ip_frag
69 	 * bit[6] = tunneled
70 	 * bit[7] = outer_l3_type
71 	 */
72 	/* L2 */
73 	(*p)[0x00] = RTE_PTYPE_L2_ETHER;
74 	/* L3 */
75 	(*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
76 		     RTE_PTYPE_L4_NONFRAG;
77 	(*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
78 		     RTE_PTYPE_L4_NONFRAG;
79 	/* Fragmented */
80 	(*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
81 		     RTE_PTYPE_L4_FRAG;
82 	(*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
83 		     RTE_PTYPE_L4_FRAG;
84 	/* TCP */
85 	(*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
86 		     RTE_PTYPE_L4_TCP;
87 	(*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
88 		     RTE_PTYPE_L4_TCP;
89 	/* UDP */
90 	(*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
91 		     RTE_PTYPE_L4_UDP;
92 	(*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
93 		     RTE_PTYPE_L4_UDP;
94 	/* Repeat with outer_l3_type being set. Just in case. */
95 	(*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
96 		     RTE_PTYPE_L4_NONFRAG;
97 	(*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
98 		     RTE_PTYPE_L4_NONFRAG;
99 	(*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
100 		     RTE_PTYPE_L4_FRAG;
101 	(*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
102 		     RTE_PTYPE_L4_FRAG;
103 	(*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
104 		     RTE_PTYPE_L4_TCP;
105 	(*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
106 		     RTE_PTYPE_L4_TCP;
107 	(*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
108 		     RTE_PTYPE_L4_UDP;
109 	(*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
110 		     RTE_PTYPE_L4_UDP;
111 	/* Tunneled - L3 */
112 	(*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
113 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
114 		     RTE_PTYPE_INNER_L4_NONFRAG;
115 	(*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
116 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
117 		     RTE_PTYPE_INNER_L4_NONFRAG;
118 	(*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
119 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
120 		     RTE_PTYPE_INNER_L4_NONFRAG;
121 	(*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
122 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
123 		     RTE_PTYPE_INNER_L4_NONFRAG;
124 	/* Tunneled - Fragmented */
125 	(*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
126 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
127 		     RTE_PTYPE_INNER_L4_FRAG;
128 	(*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
129 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
130 		     RTE_PTYPE_INNER_L4_FRAG;
131 	(*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
132 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
133 		     RTE_PTYPE_INNER_L4_FRAG;
134 	(*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
135 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
136 		     RTE_PTYPE_INNER_L4_FRAG;
137 	/* Tunneled - TCP */
138 	(*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
139 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
140 		     RTE_PTYPE_INNER_L4_TCP;
141 	(*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
142 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
143 		     RTE_PTYPE_INNER_L4_TCP;
144 	(*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
145 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
146 		     RTE_PTYPE_INNER_L4_TCP;
147 	(*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
148 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
149 		     RTE_PTYPE_INNER_L4_TCP;
150 	/* Tunneled - UDP */
151 	(*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
152 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
153 		     RTE_PTYPE_INNER_L4_UDP;
154 	(*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
155 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
156 		     RTE_PTYPE_INNER_L4_UDP;
157 	(*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
158 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
159 		     RTE_PTYPE_INNER_L4_UDP;
160 	(*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
161 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
162 		     RTE_PTYPE_INNER_L4_UDP;
163 }
164 
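/*
 * Worked example of the index layout above (illustrative only): a completion
 * for a tunneled, non-fragmented TCP packet carried over IPv4 in IPv4
 * decomposes as bit[1:0] = 2 (l3_hdr_type), bit[4:2] = 1 (l4_hdr_type),
 * bit[5] = 0 (not a fragment), bit[6] = 1 (tunneled) and bit[7] = 0, i.e.
 * index 0x46, so that
 *
 *	uint32_t ptype = mlx5_ptype_table[0x46];
 *
 * resolves to RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 * RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_TCP.
 */
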
165 /**
166  * Return the size of the WQ tailroom.
167  *
168  * @param txq
169  *   Pointer to TX queue structure.
170  * @param addr
171  *   Pointer to tail of WQ.
172  *
173  * @return
174  *   Size of tailroom.
175  */
176 static inline size_t
177 tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
178 {
179 	size_t tailroom;
180 	tailroom = (uintptr_t)(txq->wqes) +
181 		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
182 		   (uintptr_t)addr;
183 	return tailroom;
184 }
185 
186 /**
187  * Copy data to the tailroom of a circular queue.
188  *
189  * @param dst
190  *   Pointer to destination.
191  * @param src
192  *   Pointer to source.
193  * @param n
194  *   Number of bytes to copy.
195  * @param base
196  *   Pointer to head of queue.
197  * @param tailroom
198  *   Size of tailroom from dst.
199  *
200  * @return
201  *   Pointer after copied data.
202  */
203 static inline void *
204 mlx5_copy_to_wq(void *dst, const void *src, size_t n,
205 		void *base, size_t tailroom)
206 {
207 	void *ret;
208 
209 	if (n > tailroom) {
210 		rte_memcpy(dst, src, tailroom);
211 		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
212 			   n - tailroom);
213 		ret = (uint8_t *)base + n - tailroom;
214 	} else {
215 		rte_memcpy(dst, src, n);
216 		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
217 	}
218 	return ret;
219 }
220 
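/*
 * Minimal usage sketch for the two helpers above (variable values are
 * hypothetical, mirroring the Enhanced MPW path in txq_burst_empw() below):
 * copy "len" bytes into the work queue, wrapping at the end of the ring.
 *
 *	void *dst = (void *)(uintptr_t)mpw.data.raw;
 *	size_t room = tx_mlx5_wq_tailroom(txq, dst);
 *
 *	mpw.data.raw = (volatile void *)
 *		mlx5_copy_to_wq(dst, src, len,
 *				(void *)(uintptr_t)txq->wqes, room);
 *
 * With a ring of (1 << txq->wqe_n) * MLX5_WQE_SIZE = 16384 bytes and dst
 * sitting 128 bytes before its end, the tailroom is 128; a 200-byte copy
 * then places the first 128 bytes at dst and the remaining 72 bytes at the
 * ring base.
 */
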
221 /**
222  * DPDK callback to check the status of a tx descriptor.
223  *
224  * @param tx_queue
225  *   The tx queue.
226  * @param[in] offset
227  *   The index of the descriptor in the ring.
228  *
229  * @return
230  *   The status of the tx descriptor.
231  */
232 int
233 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
234 {
235 	struct mlx5_txq_data *txq = tx_queue;
236 	uint16_t used;
237 
238 	mlx5_tx_complete(txq);
239 	used = txq->elts_head - txq->elts_tail;
240 	if (offset < used)
241 		return RTE_ETH_TX_DESC_FULL;
242 	return RTE_ETH_TX_DESC_DONE;
243 }
244 
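/*
 * Applications reach this handler through the generic ethdev API; a minimal
 * sketch (port, queue and offset values are hypothetical):
 *
 *	int st = rte_eth_tx_descriptor_status(port_id, queue_id, 32);
 *
 *	if (st == RTE_ETH_TX_DESC_FULL)
 *		... descriptor 32 is still held by a pending packet ...
 *	else if (st == RTE_ETH_TX_DESC_DONE)
 *		... descriptor 32 has completed and its slot can be reused ...
 */
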
245 /**
246  * DPDK callback to check the status of a rx descriptor.
247  *
248  * @param rx_queue
249  *   The rx queue.
250  * @param[in] offset
251  *   The index of the descriptor in the ring.
252  *
253  * @return
254  *   The status of the rx descriptor.
255  */
256 int
257 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
258 {
259 	struct mlx5_rxq_data *rxq = rx_queue;
260 	struct rxq_zip *zip = &rxq->zip;
261 	volatile struct mlx5_cqe *cqe;
262 	const unsigned int cqe_n = (1 << rxq->cqe_n);
263 	const unsigned int cqe_cnt = cqe_n - 1;
264 	unsigned int cq_ci;
265 	unsigned int used;
266 
267 	/* If we are processing a compressed CQE. */
268 	if (zip->ai) {
269 		used = zip->cqe_cnt - zip->ca;
270 		cq_ci = zip->cq_ci;
271 	} else {
272 		used = 0;
273 		cq_ci = rxq->cq_ci;
274 	}
275 	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
276 	while (check_cqe(cqe, cqe_n, cq_ci) == 0) {
277 		int8_t op_own;
278 		unsigned int n;
279 
280 		op_own = cqe->op_own;
281 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
282 			n = rte_be_to_cpu_32(cqe->byte_cnt);
283 		else
284 			n = 1;
285 		cq_ci += n;
286 		used += n;
287 		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
288 	}
289 	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
290 	if (offset < used)
291 		return RTE_ETH_RX_DESC_DONE;
292 	return RTE_ETH_RX_DESC_AVAIL;
293 }
294 
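/*
 * The Rx counterpart is reached the same way, e.g. (hypothetical ids):
 *
 *	if (rte_eth_rx_descriptor_status(port_id, queue_id, 0) ==
 *	    RTE_ETH_RX_DESC_DONE)
 *		... at least one received packet is waiting in the queue ...
 */
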
295 /**
296  * DPDK callback for TX.
297  *
298  * @param dpdk_txq
299  *   Generic pointer to TX queue structure.
300  * @param[in] pkts
301  *   Packets to transmit.
302  * @param pkts_n
303  *   Number of packets in array.
304  *
305  * @return
306  *   Number of packets successfully transmitted (<= pkts_n).
307  */
308 uint16_t
309 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
310 {
311 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
312 	uint16_t elts_head = txq->elts_head;
313 	const uint16_t elts_n = 1 << txq->elts_n;
314 	const uint16_t elts_m = elts_n - 1;
315 	unsigned int i = 0;
316 	unsigned int j = 0;
317 	unsigned int k = 0;
318 	uint16_t max_elts;
319 	uint16_t max_wqe;
320 	unsigned int comp;
321 	volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
322 	unsigned int segs_n = 0;
323 	const unsigned int max_inline = txq->max_inline;
324 
325 	if (unlikely(!pkts_n))
326 		return 0;
327 	/* Prefetch first packet cacheline. */
328 	rte_prefetch0(*pkts);
329 	/* Start processing. */
330 	mlx5_tx_complete(txq);
331 	max_elts = (elts_n - (elts_head - txq->elts_tail));
332 	/* A CQE slot must always be available. */
333 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
334 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
335 	if (unlikely(!max_wqe))
336 		return 0;
337 	do {
338 		struct rte_mbuf *buf = NULL;
339 		uint8_t *raw;
340 		volatile struct mlx5_wqe_v *wqe = NULL;
341 		volatile rte_v128u32_t *dseg = NULL;
342 		uint32_t length;
343 		unsigned int ds = 0;
344 		unsigned int sg = 0; /* counter of additional segs attached. */
345 		uintptr_t addr;
346 		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
347 		uint16_t tso_header_sz = 0;
348 		uint16_t ehdr;
349 		uint8_t cs_flags;
350 		uint64_t tso = 0;
351 		uint16_t tso_segsz = 0;
352 #ifdef MLX5_PMD_SOFT_COUNTERS
353 		uint32_t total_length = 0;
354 #endif
355 
356 		/* first_seg */
357 		buf = *pkts;
358 		segs_n = buf->nb_segs;
359 		/*
360 		 * Make sure there is enough room to store this packet and
361 		 * that one ring entry remains unused.
362 		 */
363 		assert(segs_n);
364 		if (max_elts < segs_n)
365 			break;
366 		max_elts -= segs_n;
367 		sg = --segs_n;
368 		if (unlikely(--max_wqe == 0))
369 			break;
370 		wqe = (volatile struct mlx5_wqe_v *)
371 			tx_mlx5_wqe(txq, txq->wqe_ci);
372 		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
373 		if (pkts_n - i > 1)
374 			rte_prefetch0(*(pkts + 1));
375 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
376 		length = DATA_LEN(buf);
377 		ehdr = (((uint8_t *)addr)[1] << 8) |
378 		       ((uint8_t *)addr)[0];
379 #ifdef MLX5_PMD_SOFT_COUNTERS
380 		total_length = length;
381 #endif
382 		if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
383 			txq->stats.oerrors++;
384 			break;
385 		}
386 		/* Update element. */
387 		(*txq->elts)[elts_head & elts_m] = buf;
388 		/* Prefetch next buffer data. */
389 		if (pkts_n - i > 1)
390 			rte_prefetch0(
391 			    rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
392 		cs_flags = txq_ol_cksum_to_cs(txq, buf);
393 		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
394 		/* Replace the Ethernet type by the VLAN if necessary. */
395 		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
396 			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
397 							 buf->vlan_tci);
398 			unsigned int len = 2 * ETHER_ADDR_LEN - 2;
399 
400 			addr += 2;
401 			length -= 2;
402 			/* Copy Destination and source mac address. */
403 			memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
404 			/* Copy VLAN. */
405 			memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
406 			/* Copy missing two bytes to end the DSeg. */
407 			memcpy((uint8_t *)raw + len + sizeof(vlan),
408 			       ((uint8_t *)addr) + len, 2);
409 			addr += len + 2;
410 			length -= (len + 2);
411 		} else {
412 			memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
413 			       MLX5_WQE_DWORD_SIZE);
414 			length -= pkt_inline_sz;
415 			addr += pkt_inline_sz;
416 		}
417 		raw += MLX5_WQE_DWORD_SIZE;
418 		tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
419 		if (tso) {
420 			uintptr_t end =
421 				(uintptr_t)(((uintptr_t)txq->wqes) +
422 					    (1 << txq->wqe_n) * MLX5_WQE_SIZE);
423 			unsigned int copy_b;
424 			uint8_t vlan_sz =
425 				(buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
426 			const uint64_t is_tunneled =
427 				buf->ol_flags & (PKT_TX_TUNNEL_GRE |
428 						 PKT_TX_TUNNEL_VXLAN);
429 
430 			tso_header_sz = buf->l2_len + vlan_sz +
431 					buf->l3_len + buf->l4_len;
432 			tso_segsz = buf->tso_segsz;
433 			if (unlikely(tso_segsz == 0)) {
434 				txq->stats.oerrors++;
435 				break;
436 			}
437 			if (is_tunneled && txq->tunnel_en) {
438 				tso_header_sz += buf->outer_l2_len +
439 						 buf->outer_l3_len;
440 				cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM;
441 			} else {
442 				cs_flags |= MLX5_ETH_WQE_L4_CSUM;
443 			}
444 			if (unlikely(tso_header_sz > MLX5_MAX_TSO_HEADER)) {
445 				txq->stats.oerrors++;
446 				break;
447 			}
448 			copy_b = tso_header_sz - pkt_inline_sz;
449 			/* First seg must contain all headers. */
450 			assert(copy_b <= length);
451 			if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
452 				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
453 
454 				if (unlikely(max_wqe < n))
455 					break;
456 				max_wqe -= n;
457 				rte_memcpy((void *)raw, (void *)addr, copy_b);
458 				addr += copy_b;
459 				length -= copy_b;
460 				/* Include padding for TSO header. */
461 				copy_b = MLX5_WQE_DS(copy_b) *
462 					 MLX5_WQE_DWORD_SIZE;
463 				pkt_inline_sz += copy_b;
464 				raw += copy_b;
465 			} else {
466 				/* NOP WQE. */
467 				wqe->ctrl = (rte_v128u32_t){
468 					rte_cpu_to_be_32(txq->wqe_ci << 8),
469 					rte_cpu_to_be_32(txq->qp_num_8s | 1),
470 					0,
471 					0,
472 				};
473 				ds = 1;
474 #ifdef MLX5_PMD_SOFT_COUNTERS
475 				total_length = 0;
476 #endif
477 				k++;
478 				goto next_wqe;
479 			}
480 		}
481 		/* Inline if enough room. */
482 		if (max_inline || tso) {
483 			uint32_t inl = 0;
484 			uintptr_t end = (uintptr_t)
485 				(((uintptr_t)txq->wqes) +
486 				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
487 			unsigned int inline_room = max_inline *
488 						   RTE_CACHE_LINE_SIZE -
489 						   (pkt_inline_sz - 2) -
490 						   !!tso * sizeof(inl);
491 			uintptr_t addr_end;
492 			unsigned int copy_b;
493 
494 pkt_inline:
495 			addr_end = RTE_ALIGN_FLOOR(addr + inline_room,
496 						   RTE_CACHE_LINE_SIZE);
497 			copy_b = (addr_end > addr) ?
498 				 RTE_MIN((addr_end - addr), length) : 0;
499 			if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
500 				/*
501 				 * One Dseg remains in the current WQE.  To
502 				 * keep the computation positive, it is
503 				 * removed after the bytes to Dseg conversion.
504 				 */
505 				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
506 
507 				if (unlikely(max_wqe < n))
508 					break;
509 				max_wqe -= n;
510 				if (tso && !inl) {
511 					inl = rte_cpu_to_be_32(copy_b |
512 							       MLX5_INLINE_SEG);
513 					rte_memcpy((void *)raw,
514 						   (void *)&inl, sizeof(inl));
515 					raw += sizeof(inl);
516 					pkt_inline_sz += sizeof(inl);
517 				}
518 				rte_memcpy((void *)raw, (void *)addr, copy_b);
519 				addr += copy_b;
520 				length -= copy_b;
521 				pkt_inline_sz += copy_b;
522 			}
523 			/*
524 			 * 2 DWORDs consumed by the WQE header + ETH segment +
525 			 * the size of the inline part of the packet.
526 			 */
527 			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
528 			if (length > 0) {
529 				if (ds % (MLX5_WQE_SIZE /
530 					  MLX5_WQE_DWORD_SIZE) == 0) {
531 					if (unlikely(--max_wqe == 0))
532 						break;
533 					dseg = (volatile rte_v128u32_t *)
534 					       tx_mlx5_wqe(txq, txq->wqe_ci +
535 							   ds / 4);
536 				} else {
537 					dseg = (volatile rte_v128u32_t *)
538 						((uintptr_t)wqe +
539 						 (ds * MLX5_WQE_DWORD_SIZE));
540 				}
541 				goto use_dseg;
542 			} else if (!segs_n) {
543 				goto next_pkt;
544 			} else {
545 				raw += copy_b;
546 				inline_room -= copy_b;
547 				--segs_n;
548 				buf = buf->next;
549 				assert(buf);
550 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
551 				length = DATA_LEN(buf);
552 #ifdef MLX5_PMD_SOFT_COUNTERS
553 				total_length += length;
554 #endif
555 				(*txq->elts)[++elts_head & elts_m] = buf;
556 				goto pkt_inline;
557 			}
558 		} else {
559 			/*
560 			 * No inline has been done in the packet, only the
561 		 * Ethernet header has been stored.
562 			 */
563 			dseg = (volatile rte_v128u32_t *)
564 				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
565 			ds = 3;
566 use_dseg:
567 			/* Add the remaining packet as a simple ds. */
568 			addr = rte_cpu_to_be_64(addr);
569 			*dseg = (rte_v128u32_t){
570 				rte_cpu_to_be_32(length),
571 				mlx5_tx_mb2mr(txq, buf),
572 				addr,
573 				addr >> 32,
574 			};
575 			++ds;
576 			if (!segs_n)
577 				goto next_pkt;
578 		}
579 next_seg:
580 		assert(buf);
581 		assert(ds);
582 		assert(wqe);
583 		/*
584 		 * Spill on next WQE when the current one does not have
585 		 * enough room left. Size of WQE must be a multiple
586 		 * of data segment size.
587 		 */
588 		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
589 		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
590 			if (unlikely(--max_wqe == 0))
591 				break;
592 			dseg = (volatile rte_v128u32_t *)
593 			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
594 			rte_prefetch0(tx_mlx5_wqe(txq,
595 						  txq->wqe_ci + ds / 4 + 1));
596 		} else {
597 			++dseg;
598 		}
599 		++ds;
600 		buf = buf->next;
601 		assert(buf);
602 		length = DATA_LEN(buf);
603 #ifdef MLX5_PMD_SOFT_COUNTERS
604 		total_length += length;
605 #endif
606 		/* Store segment information. */
607 		addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
608 		*dseg = (rte_v128u32_t){
609 			rte_cpu_to_be_32(length),
610 			mlx5_tx_mb2mr(txq, buf),
611 			addr,
612 			addr >> 32,
613 		};
614 		(*txq->elts)[++elts_head & elts_m] = buf;
615 		if (--segs_n)
616 			goto next_seg;
617 next_pkt:
618 		if (ds > MLX5_DSEG_MAX) {
619 			txq->stats.oerrors++;
620 			break;
621 		}
622 		++elts_head;
623 		++pkts;
624 		++i;
625 		j += sg;
626 		/* Initialize known and common part of the WQE structure. */
627 		if (tso) {
628 			wqe->ctrl = (rte_v128u32_t){
629 				rte_cpu_to_be_32((txq->wqe_ci << 8) |
630 						 MLX5_OPCODE_TSO),
631 				rte_cpu_to_be_32(txq->qp_num_8s | ds),
632 				0,
633 				0,
634 			};
635 			wqe->eseg = (rte_v128u32_t){
636 				0,
637 				cs_flags | (rte_cpu_to_be_16(tso_segsz) << 16),
638 				0,
639 				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
640 			};
641 		} else {
642 			wqe->ctrl = (rte_v128u32_t){
643 				rte_cpu_to_be_32((txq->wqe_ci << 8) |
644 						 MLX5_OPCODE_SEND),
645 				rte_cpu_to_be_32(txq->qp_num_8s | ds),
646 				0,
647 				0,
648 			};
649 			wqe->eseg = (rte_v128u32_t){
650 				0,
651 				cs_flags,
652 				0,
653 				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
654 			};
655 		}
656 next_wqe:
657 		txq->wqe_ci += (ds + 3) / 4;
658 		/* Save the last successful WQE for completion request */
659 		last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
660 #ifdef MLX5_PMD_SOFT_COUNTERS
661 		/* Increment sent bytes counter. */
662 		txq->stats.obytes += total_length;
663 #endif
664 	} while (i < pkts_n);
665 	/* Take a shortcut if nothing must be sent. */
666 	if (unlikely((i + k) == 0))
667 		return 0;
668 	txq->elts_head += (i + j);
669 	/* Check whether completion threshold has been reached. */
670 	comp = txq->elts_comp + i + j + k;
671 	if (comp >= MLX5_TX_COMP_THRESH) {
672 		/* Request completion on last WQE. */
673 		last_wqe->ctrl2 = rte_cpu_to_be_32(8);
674 		/* Save elts_head in unused "immediate" field of WQE. */
675 		last_wqe->ctrl3 = txq->elts_head;
676 		txq->elts_comp = 0;
677 #ifndef NDEBUG
678 		++txq->cq_pi;
679 #endif
680 	} else {
681 		txq->elts_comp = comp;
682 	}
683 #ifdef MLX5_PMD_SOFT_COUNTERS
684 	/* Increment sent packets counter. */
685 	txq->stats.opackets += i;
686 #endif
687 	/* Ring QP doorbell. */
688 	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
689 	return i;
690 }
691 
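/*
 * Sizing note for the routine above (a sketch of the common case, not an
 * exhaustive rule): "ds" counts 16-byte units and a WQE spans
 * (ds + 3) / 4 WQEBBs of MLX5_WQE_SIZE bytes. A single-segment packet sent
 * without extra inlining ends up with ds = 4 (control segment, Ethernet
 * segment carrying the first bytes of the frame, one data segment pointing
 * at the rest), hence
 *
 *	txq->wqe_ci += (4 + 3) / 4;	... one WQEBB consumed ...
 *
 * while ds = 5 would consume two WQEBBs.
 */
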
692 /**
693  * Open a MPW session.
694  *
695  * @param txq
696  *   Pointer to TX queue structure.
697  * @param mpw
698  *   Pointer to MPW session structure.
699  * @param length
700  *   Packet length.
701  */
702 static inline void
703 mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
704 {
705 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
706 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
707 		(volatile struct mlx5_wqe_data_seg (*)[])
708 		tx_mlx5_wqe(txq, idx + 1);
709 
710 	mpw->state = MLX5_MPW_STATE_OPENED;
711 	mpw->pkts_n = 0;
712 	mpw->len = length;
713 	mpw->total_len = 0;
714 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
715 	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
716 	mpw->wqe->eseg.inline_hdr_sz = 0;
717 	mpw->wqe->eseg.rsvd0 = 0;
718 	mpw->wqe->eseg.rsvd1 = 0;
719 	mpw->wqe->eseg.rsvd2 = 0;
720 	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
721 					     (txq->wqe_ci << 8) |
722 					     MLX5_OPCODE_TSO);
723 	mpw->wqe->ctrl[2] = 0;
724 	mpw->wqe->ctrl[3] = 0;
725 	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
726 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
727 	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
728 		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
729 	mpw->data.dseg[2] = &(*dseg)[0];
730 	mpw->data.dseg[3] = &(*dseg)[1];
731 	mpw->data.dseg[4] = &(*dseg)[2];
732 }
733 
734 /**
735  * Close a MPW session.
736  *
737  * @param txq
738  *   Pointer to TX queue structure.
739  * @param mpw
740  *   Pointer to MPW session structure.
741  */
742 static inline void
743 mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
744 {
745 	unsigned int num = mpw->pkts_n;
746 
747 	/*
748 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
749 	 * count as 2.
750 	 */
751 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
752 	mpw->state = MLX5_MPW_STATE_CLOSED;
753 	if (num < 3)
754 		++txq->wqe_ci;
755 	else
756 		txq->wqe_ci += 2;
757 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
758 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
759 }
760 
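/*
 * Layout recap for the two helpers above, derived from the code as an
 * illustration: a classic MPW session keeps the control and Ethernet
 * segments (counted as 2 sixteen-byte units) in the title WQEBB together
 * with data segments 0 and 1, while data segments 2 to 4 live in the next
 * WQEBB. Closing a session holding "num" packets therefore writes
 *
 *	ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
 *
 * and advances wqe_ci by one WQEBB when num < 3, by two otherwise.
 */
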
761 /**
762  * DPDK callback for TX with MPW support.
763  *
764  * @param dpdk_txq
765  *   Generic pointer to TX queue structure.
766  * @param[in] pkts
767  *   Packets to transmit.
768  * @param pkts_n
769  *   Number of packets in array.
770  *
771  * @return
772  *   Number of packets successfully transmitted (<= pkts_n).
773  */
774 uint16_t
775 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
776 {
777 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
778 	uint16_t elts_head = txq->elts_head;
779 	const uint16_t elts_n = 1 << txq->elts_n;
780 	const uint16_t elts_m = elts_n - 1;
781 	unsigned int i = 0;
782 	unsigned int j = 0;
783 	uint16_t max_elts;
784 	uint16_t max_wqe;
785 	unsigned int comp;
786 	struct mlx5_mpw mpw = {
787 		.state = MLX5_MPW_STATE_CLOSED,
788 	};
789 
790 	if (unlikely(!pkts_n))
791 		return 0;
792 	/* Prefetch first packet cacheline. */
793 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
794 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
795 	/* Start processing. */
796 	mlx5_tx_complete(txq);
797 	max_elts = (elts_n - (elts_head - txq->elts_tail));
798 	/* A CQE slot must always be available. */
799 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
800 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
801 	if (unlikely(!max_wqe))
802 		return 0;
803 	do {
804 		struct rte_mbuf *buf = *(pkts++);
805 		uint32_t length;
806 		unsigned int segs_n = buf->nb_segs;
807 		uint32_t cs_flags;
808 
809 		/*
810 		 * Make sure there is enough room to store this packet and
811 		 * that one ring entry remains unused.
812 		 */
813 		assert(segs_n);
814 		if (max_elts < segs_n)
815 			break;
816 		/* Do not bother with large packets MPW cannot handle. */
817 		if (segs_n > MLX5_MPW_DSEG_MAX) {
818 			txq->stats.oerrors++;
819 			break;
820 		}
821 		max_elts -= segs_n;
822 		--pkts_n;
823 		cs_flags = txq_ol_cksum_to_cs(txq, buf);
824 		/* Retrieve packet information. */
825 		length = PKT_LEN(buf);
826 		assert(length);
827 		/* Start new session if packet differs. */
828 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
829 		    ((mpw.len != length) ||
830 		     (segs_n != 1) ||
831 		     (mpw.wqe->eseg.cs_flags != cs_flags)))
832 			mlx5_mpw_close(txq, &mpw);
833 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
834 			/*
835 			 * Multi-Packet WQE consumes at most two WQEs.
836 			 * mlx5_mpw_new() expects to be able to use such
837 			 * resources.
838 			 */
839 			if (unlikely(max_wqe < 2))
840 				break;
841 			max_wqe -= 2;
842 			mlx5_mpw_new(txq, &mpw, length);
843 			mpw.wqe->eseg.cs_flags = cs_flags;
844 		}
845 		/* Multi-segment packets must be alone in their MPW. */
846 		assert((segs_n == 1) || (mpw.pkts_n == 0));
847 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
848 		length = 0;
849 #endif
850 		do {
851 			volatile struct mlx5_wqe_data_seg *dseg;
852 			uintptr_t addr;
853 
854 			assert(buf);
855 			(*txq->elts)[elts_head++ & elts_m] = buf;
856 			dseg = mpw.data.dseg[mpw.pkts_n];
857 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
858 			*dseg = (struct mlx5_wqe_data_seg){
859 				.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
860 				.lkey = mlx5_tx_mb2mr(txq, buf),
861 				.addr = rte_cpu_to_be_64(addr),
862 			};
863 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
864 			length += DATA_LEN(buf);
865 #endif
866 			buf = buf->next;
867 			++mpw.pkts_n;
868 			++j;
869 		} while (--segs_n);
870 		assert(length == mpw.len);
871 		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
872 			mlx5_mpw_close(txq, &mpw);
873 #ifdef MLX5_PMD_SOFT_COUNTERS
874 		/* Increment sent bytes counter. */
875 		txq->stats.obytes += length;
876 #endif
877 		++i;
878 	} while (pkts_n);
879 	/* Take a shortcut if nothing must be sent. */
880 	if (unlikely(i == 0))
881 		return 0;
882 	/* Check whether completion threshold has been reached. */
883 	/* "j" includes both packets and segments. */
884 	comp = txq->elts_comp + j;
885 	if (comp >= MLX5_TX_COMP_THRESH) {
886 		volatile struct mlx5_wqe *wqe = mpw.wqe;
887 
888 		/* Request completion on last WQE. */
889 		wqe->ctrl[2] = rte_cpu_to_be_32(8);
890 		/* Save elts_head in unused "immediate" field of WQE. */
891 		wqe->ctrl[3] = elts_head;
892 		txq->elts_comp = 0;
893 #ifndef NDEBUG
894 		++txq->cq_pi;
895 #endif
896 	} else {
897 		txq->elts_comp = comp;
898 	}
899 #ifdef MLX5_PMD_SOFT_COUNTERS
900 	/* Increment sent packets counter. */
901 	txq->stats.opackets += i;
902 #endif
903 	/* Ring QP doorbell. */
904 	if (mpw.state == MLX5_MPW_STATE_OPENED)
905 		mlx5_mpw_close(txq, &mpw);
906 	mlx5_tx_dbrec(txq, mpw.wqe);
907 	txq->elts_head = elts_head;
908 	return i;
909 }
910 
911 /**
912  * Open a MPW inline session.
913  *
914  * @param txq
915  *   Pointer to TX queue structure.
916  * @param mpw
917  *   Pointer to MPW session structure.
918  * @param length
919  *   Packet length.
920  */
921 static inline void
922 mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
923 		    uint32_t length)
924 {
925 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
926 	struct mlx5_wqe_inl_small *inl;
927 
928 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
929 	mpw->pkts_n = 0;
930 	mpw->len = length;
931 	mpw->total_len = 0;
932 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
933 	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
934 					     (txq->wqe_ci << 8) |
935 					     MLX5_OPCODE_TSO);
936 	mpw->wqe->ctrl[2] = 0;
937 	mpw->wqe->ctrl[3] = 0;
938 	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
939 	mpw->wqe->eseg.inline_hdr_sz = 0;
940 	mpw->wqe->eseg.cs_flags = 0;
941 	mpw->wqe->eseg.rsvd0 = 0;
942 	mpw->wqe->eseg.rsvd1 = 0;
943 	mpw->wqe->eseg.rsvd2 = 0;
944 	inl = (struct mlx5_wqe_inl_small *)
945 		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
946 	mpw->data.raw = (uint8_t *)&inl->raw;
947 }
948 
949 /**
950  * Close a MPW inline session.
951  *
952  * @param txq
953  *   Pointer to TX queue structure.
954  * @param mpw
955  *   Pointer to MPW session structure.
956  */
957 static inline void
958 mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
959 {
960 	unsigned int size;
961 	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
962 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
963 
964 	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
965 	/*
966 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
967 	 * count as 2.
968 	 */
969 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
970 					     MLX5_WQE_DS(size));
971 	mpw->state = MLX5_MPW_STATE_CLOSED;
972 	inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG);
973 	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
974 }
975 
976 /**
977  * DPDK callback for TX with MPW inline support.
978  *
979  * @param dpdk_txq
980  *   Generic pointer to TX queue structure.
981  * @param[in] pkts
982  *   Packets to transmit.
983  * @param pkts_n
984  *   Number of packets in array.
985  *
986  * @return
987  *   Number of packets successfully transmitted (<= pkts_n).
988  */
989 uint16_t
990 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
991 			 uint16_t pkts_n)
992 {
993 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
994 	uint16_t elts_head = txq->elts_head;
995 	const uint16_t elts_n = 1 << txq->elts_n;
996 	const uint16_t elts_m = elts_n - 1;
997 	unsigned int i = 0;
998 	unsigned int j = 0;
999 	uint16_t max_elts;
1000 	uint16_t max_wqe;
1001 	unsigned int comp;
1002 	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
1003 	struct mlx5_mpw mpw = {
1004 		.state = MLX5_MPW_STATE_CLOSED,
1005 	};
1006 	/*
1007 	 * Compute the maximum number of WQEs which can be consumed by inline
1008 	 * code.
1009 	 * - 2 DSEG for:
1010 	 *   - 1 control segment,
1011 	 *   - 1 Ethernet segment,
1012 	 * - N Dseg from the inline request.
1013 	 */
1014 	const unsigned int wqe_inl_n =
1015 		((2 * MLX5_WQE_DWORD_SIZE +
1016 		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
1017 		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
1018 
1019 	if (unlikely(!pkts_n))
1020 		return 0;
1021 	/* Prefetch first packet cacheline. */
1022 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1023 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1024 	/* Start processing. */
1025 	mlx5_tx_complete(txq);
1026 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1027 	/* A CQE slot must always be available. */
1028 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1029 	do {
1030 		struct rte_mbuf *buf = *(pkts++);
1031 		uintptr_t addr;
1032 		uint32_t length;
1033 		unsigned int segs_n = buf->nb_segs;
1034 		uint8_t cs_flags;
1035 
1036 		/*
1037 		 * Make sure there is enough room to store this packet and
1038 		 * that one ring entry remains unused.
1039 		 */
1040 		assert(segs_n);
1041 		if (max_elts < segs_n)
1042 			break;
1043 		/* Do not bother with large packets MPW cannot handle. */
1044 		if (segs_n > MLX5_MPW_DSEG_MAX) {
1045 			txq->stats.oerrors++;
1046 			break;
1047 		}
1048 		max_elts -= segs_n;
1049 		--pkts_n;
1050 		/*
1051 		 * Compute max_wqe in case fewer WQEs were consumed in the
1052 		 * previous iteration.
1053 		 */
1054 		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1055 		cs_flags = txq_ol_cksum_to_cs(txq, buf);
1056 		/* Retrieve packet information. */
1057 		length = PKT_LEN(buf);
1058 		/* Start new session if packet differs. */
1059 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1060 			if ((mpw.len != length) ||
1061 			    (segs_n != 1) ||
1062 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1063 				mlx5_mpw_close(txq, &mpw);
1064 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
1065 			if ((mpw.len != length) ||
1066 			    (segs_n != 1) ||
1067 			    (length > inline_room) ||
1068 			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
1069 				mlx5_mpw_inline_close(txq, &mpw);
1070 				inline_room =
1071 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1072 			}
1073 		}
1074 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1075 			if ((segs_n != 1) ||
1076 			    (length > inline_room)) {
1077 				/*
1078 				 * Multi-Packet WQE consumes at most two WQEs.
1079 				 * mlx5_mpw_new() expects to be able to use
1080 				 * such resources.
1081 				 */
1082 				if (unlikely(max_wqe < 2))
1083 					break;
1084 				max_wqe -= 2;
1085 				mlx5_mpw_new(txq, &mpw, length);
1086 				mpw.wqe->eseg.cs_flags = cs_flags;
1087 			} else {
1088 				if (unlikely(max_wqe < wqe_inl_n))
1089 					break;
1090 				max_wqe -= wqe_inl_n;
1091 				mlx5_mpw_inline_new(txq, &mpw, length);
1092 				mpw.wqe->eseg.cs_flags = cs_flags;
1093 			}
1094 		}
1095 		/* Multi-segment packets must be alone in their MPW. */
1096 		assert((segs_n == 1) || (mpw.pkts_n == 0));
1097 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1098 			assert(inline_room ==
1099 			       txq->max_inline * RTE_CACHE_LINE_SIZE);
1100 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1101 			length = 0;
1102 #endif
1103 			do {
1104 				volatile struct mlx5_wqe_data_seg *dseg;
1105 
1106 				assert(buf);
1107 				(*txq->elts)[elts_head++ & elts_m] = buf;
1108 				dseg = mpw.data.dseg[mpw.pkts_n];
1109 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
1110 				*dseg = (struct mlx5_wqe_data_seg){
1111 					.byte_count =
1112 					       rte_cpu_to_be_32(DATA_LEN(buf)),
1113 					.lkey = mlx5_tx_mb2mr(txq, buf),
1114 					.addr = rte_cpu_to_be_64(addr),
1115 				};
1116 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1117 				length += DATA_LEN(buf);
1118 #endif
1119 				buf = buf->next;
1120 				++mpw.pkts_n;
1121 				++j;
1122 			} while (--segs_n);
1123 			assert(length == mpw.len);
1124 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1125 				mlx5_mpw_close(txq, &mpw);
1126 		} else {
1127 			unsigned int max;
1128 
1129 			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
1130 			assert(length <= inline_room);
1131 			assert(length == DATA_LEN(buf));
1132 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1133 			(*txq->elts)[elts_head++ & elts_m] = buf;
1134 			/* Maximum number of bytes before wrapping. */
1135 			max = ((((uintptr_t)(txq->wqes)) +
1136 				(1 << txq->wqe_n) *
1137 				MLX5_WQE_SIZE) -
1138 			       (uintptr_t)mpw.data.raw);
1139 			if (length > max) {
1140 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1141 					   (void *)addr,
1142 					   max);
1143 				mpw.data.raw = (volatile void *)txq->wqes;
1144 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1145 					   (void *)(addr + max),
1146 					   length - max);
1147 				mpw.data.raw += length - max;
1148 			} else {
1149 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1150 					   (void *)addr,
1151 					   length);
1152 
1153 				if (length == max)
1154 					mpw.data.raw =
1155 						(volatile void *)txq->wqes;
1156 				else
1157 					mpw.data.raw += length;
1158 			}
1159 			++mpw.pkts_n;
1160 			mpw.total_len += length;
1161 			++j;
1162 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
1163 				mlx5_mpw_inline_close(txq, &mpw);
1164 				inline_room =
1165 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1166 			} else {
1167 				inline_room -= length;
1168 			}
1169 		}
1170 #ifdef MLX5_PMD_SOFT_COUNTERS
1171 		/* Increment sent bytes counter. */
1172 		txq->stats.obytes += length;
1173 #endif
1174 		++i;
1175 	} while (pkts_n);
1176 	/* Take a shortcut if nothing must be sent. */
1177 	if (unlikely(i == 0))
1178 		return 0;
1179 	/* Check whether completion threshold has been reached. */
1180 	/* "j" includes both packets and segments. */
1181 	comp = txq->elts_comp + j;
1182 	if (comp >= MLX5_TX_COMP_THRESH) {
1183 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1184 
1185 		/* Request completion on last WQE. */
1186 		wqe->ctrl[2] = rte_cpu_to_be_32(8);
1187 		/* Save elts_head in unused "immediate" field of WQE. */
1188 		wqe->ctrl[3] = elts_head;
1189 		txq->elts_comp = 0;
1190 #ifndef NDEBUG
1191 		++txq->cq_pi;
1192 #endif
1193 	} else {
1194 		txq->elts_comp = comp;
1195 	}
1196 #ifdef MLX5_PMD_SOFT_COUNTERS
1197 	/* Increment sent packets counter. */
1198 	txq->stats.opackets += i;
1199 #endif
1200 	/* Ring QP doorbell. */
1201 	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
1202 		mlx5_mpw_inline_close(txq, &mpw);
1203 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
1204 		mlx5_mpw_close(txq, &mpw);
1205 	mlx5_tx_dbrec(txq, mpw.wqe);
1206 	txq->elts_head = elts_head;
1207 	return i;
1208 }
1209 
1210 /**
1211  * Open an Enhanced MPW session.
1212  *
1213  * @param txq
1214  *   Pointer to TX queue structure.
1215  * @param mpw
1216  *   Pointer to MPW session structure.
1217  * @param padding
1218  *   When not zero, pad the first two DWORDs with a zero-length inline header.
1219  */
1220 static inline void
1221 mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
1222 {
1223 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1224 
1225 	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
1226 	mpw->pkts_n = 0;
1227 	mpw->total_len = sizeof(struct mlx5_wqe);
1228 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1229 	mpw->wqe->ctrl[0] =
1230 		rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
1231 				 (txq->wqe_ci << 8) |
1232 				 MLX5_OPCODE_ENHANCED_MPSW);
1233 	mpw->wqe->ctrl[2] = 0;
1234 	mpw->wqe->ctrl[3] = 0;
1235 	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
1236 	if (unlikely(padding)) {
1237 		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
1238 
1239 		/* Pad the first 2 DWORDs with zero-length inline header. */
1240 		*(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
1241 		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
1242 			rte_cpu_to_be_32(MLX5_INLINE_SEG);
1243 		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
1244 		/* Start from the next WQEBB. */
1245 		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
1246 	} else {
1247 		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
1248 	}
1249 }
1250 
1251 /**
1252  * Close an Enhanced MPW session.
1253  *
1254  * @param txq
1255  *   Pointer to TX queue structure.
1256  * @param mpw
1257  *   Pointer to MPW session structure.
1258  *
1259  * @return
1260  *   Number of consumed WQEs.
1261  */
1262 static inline uint16_t
1263 mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
1264 {
1265 	uint16_t ret;
1266 
1267 	/* Store size in multiple of 16 bytes. Control and Ethernet segments
1268 	 * count as 2.
1269 	 */
1270 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
1271 					     MLX5_WQE_DS(mpw->total_len));
1272 	mpw->state = MLX5_MPW_STATE_CLOSED;
1273 	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1274 	txq->wqe_ci += ret;
1275 	return ret;
1276 }
1277 
1278 /**
1279  * TX with Enhanced MPW support.
1280  *
1281  * @param txq
1282  *   Pointer to TX queue structure.
1283  * @param[in] pkts
1284  *   Packets to transmit.
1285  * @param pkts_n
1286  *   Number of packets in array.
1287  *
1288  * @return
1289  *   Number of packets successfully transmitted (<= pkts_n).
1290  */
1291 static inline uint16_t
1292 txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
1293 	       uint16_t pkts_n)
1294 {
1295 	uint16_t elts_head = txq->elts_head;
1296 	const uint16_t elts_n = 1 << txq->elts_n;
1297 	const uint16_t elts_m = elts_n - 1;
1298 	unsigned int i = 0;
1299 	unsigned int j = 0;
1300 	uint16_t max_elts;
1301 	uint16_t max_wqe;
1302 	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
1303 	unsigned int mpw_room = 0;
1304 	unsigned int inl_pad = 0;
1305 	uint32_t inl_hdr;
1306 	struct mlx5_mpw mpw = {
1307 		.state = MLX5_MPW_STATE_CLOSED,
1308 	};
1309 
1310 	if (unlikely(!pkts_n))
1311 		return 0;
1312 	/* Start processing. */
1313 	mlx5_tx_complete(txq);
1314 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1315 	/* A CQE slot must always be available. */
1316 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1317 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1318 	if (unlikely(!max_wqe))
1319 		return 0;
1320 	do {
1321 		struct rte_mbuf *buf = *(pkts++);
1322 		uintptr_t addr;
1323 		unsigned int n;
1324 		unsigned int do_inline = 0; /* Whether inline is possible. */
1325 		uint32_t length;
1326 		uint8_t cs_flags;
1327 
1328 		/* Multi-segmented packet is handled in slow-path outside. */
1329 		assert(NB_SEGS(buf) == 1);
1330 		/* Make sure there is enough room to store this packet. */
1331 		if (max_elts - j == 0)
1332 			break;
1333 		cs_flags = txq_ol_cksum_to_cs(txq, buf);
1334 		/* Retrieve packet information. */
1335 		length = PKT_LEN(buf);
1336 		/* Start new session if:
1337 		 * - multi-segment packet
1338 		 * - no space left even for a dseg
1339 		 * - next packet can be inlined with a new WQE
1340 		 * - cs_flag differs
1341 		 */
1342 		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
1343 			if ((inl_pad + sizeof(struct mlx5_wqe_data_seg) >
1344 			     mpw_room) ||
1345 			    (length <= txq->inline_max_packet_sz &&
1346 			     inl_pad + sizeof(inl_hdr) + length >
1347 			     mpw_room) ||
1348 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1349 				max_wqe -= mlx5_empw_close(txq, &mpw);
1350 		}
1351 		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
1352 			/* In Enhanced MPW, inline as much as the budget
1353 			 * allows. The remaining space is to be filled with
1354 			 * dsegs. If the title WQEBB isn't padded, it will
1355 			 * also carry 2 dsegs.
1356 			 */
1357 			mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
1358 					   (max_inline ? max_inline :
1359 					    pkts_n * MLX5_WQE_DWORD_SIZE) +
1360 					   MLX5_WQE_SIZE);
1361 			if (unlikely(max_wqe * MLX5_WQE_SIZE < mpw_room))
1362 				break;
1363 			/* Do not pad the title WQEBB, to avoid wasting WQ space. */
1364 			mlx5_empw_new(txq, &mpw, 0);
1365 			mpw_room -= mpw.total_len;
1366 			inl_pad = 0;
1367 			do_inline = length <= txq->inline_max_packet_sz &&
1368 				    sizeof(inl_hdr) + length <= mpw_room &&
1369 				    !txq->mpw_hdr_dseg;
1370 			mpw.wqe->eseg.cs_flags = cs_flags;
1371 		} else {
1372 			/* Evaluate whether the next packet can be inlined.
1373 			 * Inlining is possible when:
1374 			 * - length is less than the configured value,
1375 			 * - length fits in the remaining space,
1376 			 * - not required to fill the title WQEBB with dsegs
1377 			 */
1378 			do_inline =
1379 				length <= txq->inline_max_packet_sz &&
1380 				inl_pad + sizeof(inl_hdr) + length <=
1381 				 mpw_room &&
1382 				(!txq->mpw_hdr_dseg ||
1383 				 mpw.total_len >= MLX5_WQE_SIZE);
1384 		}
1385 		if (max_inline && do_inline) {
1386 			/* Inline packet into WQE. */
1387 			unsigned int max;
1388 
1389 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1390 			assert(length == DATA_LEN(buf));
1391 			inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
1392 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1393 			mpw.data.raw = (volatile void *)
1394 				((uintptr_t)mpw.data.raw + inl_pad);
1395 			max = tx_mlx5_wq_tailroom(txq,
1396 					(void *)(uintptr_t)mpw.data.raw);
1397 			/* Copy inline header. */
1398 			mpw.data.raw = (volatile void *)
1399 				mlx5_copy_to_wq(
1400 					  (void *)(uintptr_t)mpw.data.raw,
1401 					  &inl_hdr,
1402 					  sizeof(inl_hdr),
1403 					  (void *)(uintptr_t)txq->wqes,
1404 					  max);
1405 			max = tx_mlx5_wq_tailroom(txq,
1406 					(void *)(uintptr_t)mpw.data.raw);
1407 			/* Copy packet data. */
1408 			mpw.data.raw = (volatile void *)
1409 				mlx5_copy_to_wq(
1410 					  (void *)(uintptr_t)mpw.data.raw,
1411 					  (void *)addr,
1412 					  length,
1413 					  (void *)(uintptr_t)txq->wqes,
1414 					  max);
1415 			++mpw.pkts_n;
1416 			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
1417 			/* No need to get completion as the entire packet is
1418 			 * copied to WQ. Free the buf right away.
1419 			 */
1420 			rte_pktmbuf_free_seg(buf);
1421 			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
1422 			/* Add pad in the next packet if any. */
1423 			inl_pad = (((uintptr_t)mpw.data.raw +
1424 					(MLX5_WQE_DWORD_SIZE - 1)) &
1425 					~(MLX5_WQE_DWORD_SIZE - 1)) -
1426 				  (uintptr_t)mpw.data.raw;
1427 		} else {
1428 			/* No inline. Load a dseg of packet pointer. */
1429 			volatile rte_v128u32_t *dseg;
1430 
1431 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1432 			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
1433 			assert(length == DATA_LEN(buf));
1434 			if (!tx_mlx5_wq_tailroom(txq,
1435 					(void *)((uintptr_t)mpw.data.raw
1436 						+ inl_pad)))
1437 				dseg = (volatile void *)txq->wqes;
1438 			else
1439 				dseg = (volatile void *)
1440 					((uintptr_t)mpw.data.raw +
1441 					 inl_pad);
1442 			(*txq->elts)[elts_head++ & elts_m] = buf;
1443 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1444 			for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++)
1445 				rte_prefetch2((void *)(addr +
1446 						n * RTE_CACHE_LINE_SIZE));
1447 			addr = rte_cpu_to_be_64(addr);
1448 			*dseg = (rte_v128u32_t) {
1449 				rte_cpu_to_be_32(length),
1450 				mlx5_tx_mb2mr(txq, buf),
1451 				addr,
1452 				addr >> 32,
1453 			};
1454 			mpw.data.raw = (volatile void *)(dseg + 1);
1455 			mpw.total_len += (inl_pad + sizeof(*dseg));
1456 			++j;
1457 			++mpw.pkts_n;
1458 			mpw_room -= (inl_pad + sizeof(*dseg));
1459 			inl_pad = 0;
1460 		}
1461 #ifdef MLX5_PMD_SOFT_COUNTERS
1462 		/* Increment sent bytes counter. */
1463 		txq->stats.obytes += length;
1464 #endif
1465 		++i;
1466 	} while (i < pkts_n);
1467 	/* Take a shortcut if nothing must be sent. */
1468 	if (unlikely(i == 0))
1469 		return 0;
1470 	/* Check whether completion threshold has been reached. */
1471 	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
1472 			(uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
1473 			 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
1474 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1475 
1476 		/* Request completion on last WQE. */
1477 		wqe->ctrl[2] = rte_cpu_to_be_32(8);
1478 		/* Save elts_head in unused "immediate" field of WQE. */
1479 		wqe->ctrl[3] = elts_head;
1480 		txq->elts_comp = 0;
1481 		txq->mpw_comp = txq->wqe_ci;
1482 #ifndef NDEBUG
1483 		++txq->cq_pi;
1484 #endif
1485 	} else {
1486 		txq->elts_comp += j;
1487 	}
1488 #ifdef MLX5_PMD_SOFT_COUNTERS
1489 	/* Increment sent packets counter. */
1490 	txq->stats.opackets += i;
1491 #endif
1492 	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
1493 		mlx5_empw_close(txq, &mpw);
1494 	/* Ring QP doorbell. */
1495 	mlx5_tx_dbrec(txq, mpw.wqe);
1496 	txq->elts_head = elts_head;
1497 	return i;
1498 }
1499 
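/*
 * Alignment note for the inline path above (worked example, with "raw"
 * standing for (uintptr_t)mpw.data.raw): after copying an inlined packet,
 * "inl_pad" rounds the write position up to the next MLX5_WQE_DWORD_SIZE
 * boundary so the following inline header or dseg starts aligned:
 *
 *	inl_pad = ((raw + (MLX5_WQE_DWORD_SIZE - 1)) &
 *		   ~(MLX5_WQE_DWORD_SIZE - 1)) - raw;
 *
 * E.g. when raw ends 9 bytes into a 16-byte unit, inl_pad is 7, and those
 * 7 bytes are charged to the next packet's footprint in mpw_room.
 */
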
1500 /**
1501  * DPDK callback for TX with Enhanced MPW support.
1502  *
1503  * @param dpdk_txq
1504  *   Generic pointer to TX queue structure.
1505  * @param[in] pkts
1506  *   Packets to transmit.
1507  * @param pkts_n
1508  *   Number of packets in array.
1509  *
1510  * @return
1511  *   Number of packets successfully transmitted (<= pkts_n).
1512  */
1513 uint16_t
1514 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1515 {
1516 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1517 	uint16_t nb_tx = 0;
1518 
1519 	while (pkts_n > nb_tx) {
1520 		uint16_t n;
1521 		uint16_t ret;
1522 
1523 		n = txq_count_contig_multi_seg(&pkts[nb_tx], pkts_n - nb_tx);
1524 		if (n) {
1525 			ret = mlx5_tx_burst(dpdk_txq, &pkts[nb_tx], n);
1526 			if (!ret)
1527 				break;
1528 			nb_tx += ret;
1529 		}
1530 		n = txq_count_contig_single_seg(&pkts[nb_tx], pkts_n - nb_tx);
1531 		if (n) {
1532 			ret = txq_burst_empw(txq, &pkts[nb_tx], n);
1533 			if (!ret)
1534 				break;
1535 			nb_tx += ret;
1536 		}
1537 	}
1538 	return nb_tx;
1539 }
1540 
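/*
 * Illustration of the dispatch above (assumed from the helper names and the
 * loop structure): the burst is walked as alternating runs, multi-segment
 * packets going through the regular mlx5_tx_burst() path and single-segment
 * packets through the Enhanced MPW path. For a burst laid out as
 *
 *	pkts = { multi, multi, single, single, single, multi }
 *
 * the loop sends 2 packets via mlx5_tx_burst(), then 3 via txq_burst_empw(),
 * then the final one via mlx5_tx_burst() again.
 */
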
1541 /**
1542  * Translate RX completion flags to packet type.
1543  *
1544  * @param[in] cqe
1545  *   Pointer to CQE.
1546  *
1547  * @note: update mlx5_dev_supported_ptypes_get() if anything changes here.
1548  *
1549  * @return
1550  *   Packet type for struct rte_mbuf.
1551  */
1552 static inline uint32_t
1553 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
1554 {
1555 	uint8_t idx;
1556 	uint8_t pinfo = cqe->pkt_info;
1557 	uint16_t ptype = cqe->hdr_type_etc;
1558 
1559 	/*
1560 	 * The index to the array should have:
1561 	 * bit[1:0] = l3_hdr_type
1562 	 * bit[4:2] = l4_hdr_type
1563 	 * bit[5] = ip_frag
1564 	 * bit[6] = tunneled
1565 	 * bit[7] = outer_l3_type
1566 	 */
1567 	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
1568 	return mlx5_ptype_table[idx];
1569 }
1570 
1571 /**
1572  * Get size of the next packet for a given CQE. For compressed CQEs, the
1573  * consumer index is updated only once all packets of the current one have
1574  * been processed.
1575  *
1576  * @param rxq
1577  *   Pointer to RX queue.
1578  * @param cqe
1579  *   CQE to process.
1580  * @param[out] rss_hash
1581  *   Packet RSS Hash result.
1582  *
1583  * @return
1584  *   Packet size in bytes (0 if there is none), -1 in case of completion
1585  *   with error.
1586  */
1587 static inline int
1588 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
1589 		 uint16_t cqe_cnt, uint32_t *rss_hash)
1590 {
1591 	struct rxq_zip *zip = &rxq->zip;
1592 	uint16_t cqe_n = cqe_cnt + 1;
1593 	int len = 0;
1594 	uint16_t idx, end;
1595 
1596 	/* Process compressed data in the CQE and mini arrays. */
1597 	if (zip->ai) {
1598 		volatile struct mlx5_mini_cqe8 (*mc)[8] =
1599 			(volatile struct mlx5_mini_cqe8 (*)[8])
1600 			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info);
1601 
1602 		len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
1603 		*rss_hash = rte_be_to_cpu_32((*mc)[zip->ai & 7].rx_hash_result);
1604 		if ((++zip->ai & 7) == 0) {
1605 			/* Invalidate consumed CQEs */
1606 			idx = zip->ca;
1607 			end = zip->na;
1608 			while (idx != end) {
1609 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1610 					MLX5_CQE_INVALIDATE;
1611 				++idx;
1612 			}
1613 			/*
1614 			 * Increment consumer index to skip the number of
1615 			 * CQEs consumed. Hardware leaves holes in the CQ
1616 			 * ring for software use.
1617 			 */
1618 			zip->ca = zip->na;
1619 			zip->na += 8;
1620 		}
1621 		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
1622 			/* Invalidate the rest */
1623 			idx = zip->ca;
1624 			end = zip->cq_ci;
1625 
1626 			while (idx != end) {
1627 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1628 					MLX5_CQE_INVALIDATE;
1629 				++idx;
1630 			}
1631 			rxq->cq_ci = zip->cq_ci;
1632 			zip->ai = 0;
1633 		}
1634 	/* No compressed data, get next CQE and verify if it is compressed. */
1635 	} else {
1636 		int ret;
1637 		int8_t op_own;
1638 
1639 		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
1640 		if (unlikely(ret == 1))
1641 			return 0;
1642 		++rxq->cq_ci;
1643 		op_own = cqe->op_own;
1644 		rte_cio_rmb();
1645 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
1646 			volatile struct mlx5_mini_cqe8 (*mc)[8] =
1647 				(volatile struct mlx5_mini_cqe8 (*)[8])
1648 				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
1649 							  cqe_cnt].pkt_info);
1650 
1651 			/* Fix endianness. */
1652 			zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
1653 			/*
1654 			 * Current mini array position is the one returned by
1655 			 * check_cqe().
1656 			 *
1657 			 * If completion comprises several mini arrays, as a
1658 			 * special case the second one is located 7 CQEs after
1659 			 * the initial CQE instead of 8 for subsequent ones.
1660 			 */
1661 			zip->ca = rxq->cq_ci;
1662 			zip->na = zip->ca + 7;
1663 			/* Compute the next non compressed CQE. */
1664 			--rxq->cq_ci;
1665 			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
1666 			/* Get packet size to return. */
1667 			len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
1668 			*rss_hash = rte_be_to_cpu_32((*mc)[0].rx_hash_result);
1669 			zip->ai = 1;
1670 			/* Prefetch all the entries to be invalidated */
1671 			idx = zip->ca;
1672 			end = zip->cq_ci;
1673 			while (idx != end) {
1674 				rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]);
1675 				++idx;
1676 			}
1677 		} else {
1678 			len = rte_be_to_cpu_32(cqe->byte_cnt);
1679 			*rss_hash = rte_be_to_cpu_32(cqe->rx_hash_res);
1680 		}
1681 		/* Error while receiving packet. */
1682 		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
1683 			return -1;
1684 	}
1685 	return len;
1686 }
1687 
1688 /**
1689  * Translate RX completion flags to offload flags.
1690  *
1691  * @param[in] rxq
1692  *   Pointer to RX queue structure.
1693  * @param[in] cqe
1694  *   Pointer to CQE.
1695  *
1696  * @return
1697  *   Offload flags (ol_flags) for struct rte_mbuf.
1698  */
1699 static inline uint32_t
1700 rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
1701 {
1702 	uint32_t ol_flags = 0;
1703 	uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
1704 
1705 	ol_flags =
1706 		TRANSPOSE(flags,
1707 			  MLX5_CQE_RX_L3_HDR_VALID,
1708 			  PKT_RX_IP_CKSUM_GOOD) |
1709 		TRANSPOSE(flags,
1710 			  MLX5_CQE_RX_L4_HDR_VALID,
1711 			  PKT_RX_L4_CKSUM_GOOD);
1712 	if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
1713 		ol_flags |=
1714 			TRANSPOSE(flags,
1715 				  MLX5_CQE_RX_L3_HDR_VALID,
1716 				  PKT_RX_IP_CKSUM_GOOD) |
1717 			TRANSPOSE(flags,
1718 				  MLX5_CQE_RX_L4_HDR_VALID,
1719 				  PKT_RX_L4_CKSUM_GOOD);
1720 	return ol_flags;
1721 }
1722 
1723 /**
1724  * DPDK callback for RX.
1725  *
1726  * @param dpdk_rxq
1727  *   Generic pointer to RX queue structure.
1728  * @param[out] pkts
1729  *   Array to store received packets.
1730  * @param pkts_n
1731  *   Maximum number of packets in array.
1732  *
1733  * @return
1734  *   Number of packets successfully received (<= pkts_n).
1735  */
1736 uint16_t
1737 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1738 {
1739 	struct mlx5_rxq_data *rxq = dpdk_rxq;
1740 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
1741 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
1742 	const unsigned int sges_n = rxq->sges_n;
1743 	struct rte_mbuf *pkt = NULL;
1744 	struct rte_mbuf *seg = NULL;
1745 	volatile struct mlx5_cqe *cqe =
1746 		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1747 	unsigned int i = 0;
1748 	unsigned int rq_ci = rxq->rq_ci << sges_n;
1749 	int len = 0; /* keep its value across iterations. */
1750 
1751 	while (pkts_n) {
1752 		unsigned int idx = rq_ci & wqe_cnt;
1753 		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
1754 		struct rte_mbuf *rep = (*rxq->elts)[idx];
1755 		uint32_t rss_hash_res = 0;
1756 
1757 		if (pkt)
1758 			NEXT(seg) = rep;
1759 		seg = rep;
1760 		rte_prefetch0(seg);
1761 		rte_prefetch0(cqe);
1762 		rte_prefetch0(wqe);
1763 		rep = rte_mbuf_raw_alloc(rxq->mp);
1764 		if (unlikely(rep == NULL)) {
1765 			++rxq->stats.rx_nombuf;
1766 			if (!pkt) {
1767 				/*
1768 				 * no buffers before we even started,
1769 				 * bail out silently.
1770 				 */
1771 				break;
1772 			}
1773 			while (pkt != seg) {
1774 				assert(pkt != (*rxq->elts)[idx]);
1775 				rep = NEXT(pkt);
1776 				NEXT(pkt) = NULL;
1777 				NB_SEGS(pkt) = 1;
1778 				rte_mbuf_raw_free(pkt);
1779 				pkt = rep;
1780 			}
1781 			break;
1782 		}
1783 		if (!pkt) {
1784 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1785 			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
1786 					       &rss_hash_res);
1787 			if (!len) {
1788 				rte_mbuf_raw_free(rep);
1789 				break;
1790 			}
1791 			if (unlikely(len == -1)) {
1792 				/* RX error, packet is likely too large. */
1793 				rte_mbuf_raw_free(rep);
1794 				++rxq->stats.idropped;
1795 				goto skip;
1796 			}
1797 			pkt = seg;
1798 			assert(len >= (rxq->crc_present << 2));
1799 			/* Update packet information. */
1800 			pkt->packet_type = rxq_cq_to_pkt_type(cqe);
1801 			pkt->ol_flags = 0;
1802 			if (rss_hash_res && rxq->rss_hash) {
1803 				pkt->hash.rss = rss_hash_res;
1804 				pkt->ol_flags = PKT_RX_RSS_HASH;
1805 			}
1806 			if (rxq->mark &&
1807 			    MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
1808 				pkt->ol_flags |= PKT_RX_FDIR;
1809 				if (cqe->sop_drop_qpn !=
1810 				    rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
1811 					uint32_t mark = cqe->sop_drop_qpn;
1812 
1813 					pkt->ol_flags |= PKT_RX_FDIR_ID;
1814 					pkt->hash.fdir.hi =
1815 						mlx5_flow_mark_get(mark);
1816 				}
1817 			}
1818 			if (rxq->csum | rxq->csum_l2tun)
1819 				pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe);
1820 			if (rxq->vlan_strip &&
1821 			    (cqe->hdr_type_etc &
1822 			     rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
1823 				pkt->ol_flags |= PKT_RX_VLAN |
1824 					PKT_RX_VLAN_STRIPPED;
1825 				pkt->vlan_tci =
1826 					rte_be_to_cpu_16(cqe->vlan_info);
1827 			}
1828 			if (rxq->hw_timestamp) {
1829 				pkt->timestamp =
1830 					rte_be_to_cpu_64(cqe->timestamp);
1831 				pkt->ol_flags |= PKT_RX_TIMESTAMP;
1832 			}
1833 			if (rxq->crc_present)
1834 				len -= ETHER_CRC_LEN;
1835 			PKT_LEN(pkt) = len;
1836 		}
1837 		DATA_LEN(rep) = DATA_LEN(seg);
1838 		PKT_LEN(rep) = PKT_LEN(seg);
1839 		SET_DATA_OFF(rep, DATA_OFF(seg));
1840 		PORT(rep) = PORT(seg);
1841 		(*rxq->elts)[idx] = rep;
1842 		/*
1843 		 * Fill NIC descriptor with the new buffer.  The lkey and size
1844 		 * of the buffers are already known, only the buffer address
1845 		 * changes.
1846 		 */
1847 		wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
1848 		if (len > DATA_LEN(seg)) {
1849 			len -= DATA_LEN(seg);
1850 			++NB_SEGS(pkt);
1851 			++rq_ci;
1852 			continue;
1853 		}
1854 		DATA_LEN(seg) = len;
1855 #ifdef MLX5_PMD_SOFT_COUNTERS
1856 		/* Increment bytes counter. */
1857 		rxq->stats.ibytes += PKT_LEN(pkt);
1858 #endif
1859 		/* Return packet. */
1860 		*(pkts++) = pkt;
1861 		pkt = NULL;
1862 		--pkts_n;
1863 		++i;
1864 skip:
1865 		/* Align consumer index to the next stride. */
1866 		rq_ci >>= sges_n;
1867 		++rq_ci;
1868 		rq_ci <<= sges_n;
1869 	}
1870 	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
1871 		return 0;
1872 	/* Update the consumer index. */
1873 	rxq->rq_ci = rq_ci >> sges_n;
1874 	rte_cio_wmb();
1875 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1876 	rte_cio_wmb();
1877 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1878 #ifdef MLX5_PMD_SOFT_COUNTERS
1879 	/* Increment packets counter. */
1880 	rxq->stats.ipackets += i;
1881 #endif
1882 	return i;
1883 }
1884 
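/*
 * This routine is one of the candidates installed as the device rx_pkt_burst
 * handler; applications call it indirectly, e.g. (hypothetical port id):
 *
 *	struct rte_mbuf *bufs[32];
 *	uint16_t nb = rte_eth_rx_burst(port_id, 0, bufs, 32);
 *	uint16_t k;
 *
 *	for (k = 0; k < nb; ++k)
 *		rte_pktmbuf_free(bufs[k]);
 */
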
1885 /**
1886  * Dummy DPDK callback for TX.
1887  *
1888  * This function is used to temporarily replace the real callback during
1889  * unsafe control operations on the queue, or in case of error.
1890  *
1891  * @param dpdk_txq
1892  *   Generic pointer to TX queue structure.
1893  * @param[in] pkts
1894  *   Packets to transmit.
1895  * @param pkts_n
1896  *   Number of packets in array.
1897  *
1898  * @return
1899  *   Number of packets successfully transmitted (<= pkts_n).
1900  */
1901 uint16_t
1902 removed_tx_burst(void *dpdk_txq __rte_unused,
1903 		 struct rte_mbuf **pkts __rte_unused,
1904 		 uint16_t pkts_n __rte_unused)
1905 {
1906 	return 0;
1907 }
1908 
1909 /**
1910  * Dummy DPDK callback for RX.
1911  *
1912  * This function is used to temporarily replace the real callback during
1913  * unsafe control operations on the queue, or in case of error.
1914  *
1915  * @param dpdk_rxq
1916  *   Generic pointer to RX queue structure.
1917  * @param[out] pkts
1918  *   Array to store received packets.
1919  * @param pkts_n
1920  *   Maximum number of packets in array.
1921  *
1922  * @return
1923  *   Number of packets successfully received (<= pkts_n).
1924  */
1925 uint16_t
1926 removed_rx_burst(void *dpdk_rxq __rte_unused,
1927 		 struct rte_mbuf **pkts __rte_unused,
1928 		 uint16_t pkts_n __rte_unused)
1929 {
1930 	return 0;
1931 }
1932 
1933 /*
1934  * Vectorized Rx/Tx routines are not compiled in when the required vector
1935  * instructions are not supported on a target architecture. The following null
1936  * stubs are needed for linkage when the vector implementations living outside
1937  * of this file (e.g. mlx5_rxtx_vec_sse.c for x86) are not compiled in.
1938  */
1939 
1940 uint16_t __attribute__((weak))
1941 mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused,
1942 		      struct rte_mbuf **pkts __rte_unused,
1943 		      uint16_t pkts_n __rte_unused)
1944 {
1945 	return 0;
1946 }
1947 
1948 uint16_t __attribute__((weak))
1949 mlx5_tx_burst_vec(void *dpdk_txq __rte_unused,
1950 		  struct rte_mbuf **pkts __rte_unused,
1951 		  uint16_t pkts_n __rte_unused)
1952 {
1953 	return 0;
1954 }
1955 
1956 uint16_t __attribute__((weak))
1957 mlx5_rx_burst_vec(void *dpdk_rxq __rte_unused,
1958 		  struct rte_mbuf **pkts __rte_unused,
1959 		  uint16_t pkts_n __rte_unused)
1960 {
1961 	return 0;
1962 }
1963 
1964 int __attribute__((weak))
1965 mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
1966 {
1967 	return -ENOTSUP;
1968 }
1969 
1970 int __attribute__((weak))
1971 mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
1972 {
1973 	return -ENOTSUP;
1974 }
1975 
1976 int __attribute__((weak))
1977 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
1978 {
1979 	return -ENOTSUP;
1980 }
1981 
1982 int __attribute__((weak))
1983 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
1984 {
1985 	return -ENOTSUP;
1986 }
1987