xref: /dpdk/drivers/net/mlx5/mlx5_rxtx.c (revision 2f82d143fb318042f47a50694baa4507b51b7381)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <assert.h>
7 #include <stdint.h>
8 #include <string.h>
9 #include <stdlib.h>
10 
11 /* Verbs header. */
12 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
13 #ifdef PEDANTIC
14 #pragma GCC diagnostic ignored "-Wpedantic"
15 #endif
16 #include <infiniband/verbs.h>
17 #include <infiniband/mlx5dv.h>
18 #ifdef PEDANTIC
19 #pragma GCC diagnostic error "-Wpedantic"
20 #endif
21 
22 #include <rte_mbuf.h>
23 #include <rte_mempool.h>
24 #include <rte_prefetch.h>
25 #include <rte_common.h>
26 #include <rte_branch_prediction.h>
27 #include <rte_ether.h>
28 
29 #include "mlx5.h"
30 #include "mlx5_utils.h"
31 #include "mlx5_rxtx.h"
32 #include "mlx5_autoconf.h"
33 #include "mlx5_defs.h"
34 #include "mlx5_prm.h"
35 
36 static __rte_always_inline uint32_t
37 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
38 
39 static __rte_always_inline int
40 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
41 		 uint16_t cqe_cnt, uint32_t *rss_hash);
42 
43 static __rte_always_inline uint32_t
44 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
45 
46 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
47 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
48 };
49 
50 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned;
51 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
52 
53 /**
54  * Build a table to translate Rx completion flags to packet type.
55  *
56  * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
57  */
58 void
59 mlx5_set_ptype_table(void)
60 {
61 	unsigned int i;
62 	uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;
63 
64 	/* Last entry must not be overwritten, reserved for errored packet. */
65 	for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
66 		(*p)[i] = RTE_PTYPE_UNKNOWN;
67 	/*
68 	 * The index to the array should have:
69 	 * bit[1:0] = l3_hdr_type
70 	 * bit[4:2] = l4_hdr_type
71 	 * bit[5] = ip_frag
72 	 * bit[6] = tunneled
73 	 * bit[7] = outer_l3_type
74 	 */
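	/*
	 * Worked example (illustrative): index 0x06 has l3_hdr_type = 2 and
	 * l4_hdr_type = 1; the table below maps it to RTE_PTYPE_L2_ETHER |
	 * RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP, i.e. a plain
	 * IPv4/TCP packet.
	 */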
75 	/* L2 */
76 	(*p)[0x00] = RTE_PTYPE_L2_ETHER;
77 	/* L3 */
78 	(*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
79 		     RTE_PTYPE_L4_NONFRAG;
80 	(*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
81 		     RTE_PTYPE_L4_NONFRAG;
82 	/* Fragmented */
83 	(*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
84 		     RTE_PTYPE_L4_FRAG;
85 	(*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
86 		     RTE_PTYPE_L4_FRAG;
87 	/* TCP */
88 	(*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
89 		     RTE_PTYPE_L4_TCP;
90 	(*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
91 		     RTE_PTYPE_L4_TCP;
92 	(*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
93 		     RTE_PTYPE_L4_TCP;
94 	(*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
95 		     RTE_PTYPE_L4_TCP;
96 	(*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
97 		     RTE_PTYPE_L4_TCP;
98 	(*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
99 		     RTE_PTYPE_L4_TCP;
100 	/* UDP */
101 	(*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
102 		     RTE_PTYPE_L4_UDP;
103 	(*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
104 		     RTE_PTYPE_L4_UDP;
105 	/* Repeat with outer_l3_type being set. Just in case. */
106 	(*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
107 		     RTE_PTYPE_L4_NONFRAG;
108 	(*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
109 		     RTE_PTYPE_L4_NONFRAG;
110 	(*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
111 		     RTE_PTYPE_L4_FRAG;
112 	(*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
113 		     RTE_PTYPE_L4_FRAG;
114 	(*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
115 		     RTE_PTYPE_L4_TCP;
116 	(*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
117 		     RTE_PTYPE_L4_TCP;
118 	(*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
119 		     RTE_PTYPE_L4_TCP;
120 	(*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
121 		     RTE_PTYPE_L4_TCP;
122 	(*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
123 		     RTE_PTYPE_L4_TCP;
124 	(*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
125 		     RTE_PTYPE_L4_TCP;
126 	(*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
127 		     RTE_PTYPE_L4_UDP;
128 	(*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
129 		     RTE_PTYPE_L4_UDP;
130 	/* Tunneled - L3 */
131 	(*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
132 	(*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
133 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
134 		     RTE_PTYPE_INNER_L4_NONFRAG;
135 	(*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
136 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
137 		     RTE_PTYPE_INNER_L4_NONFRAG;
138 	(*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
139 	(*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
140 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
141 		     RTE_PTYPE_INNER_L4_NONFRAG;
142 	(*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
143 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
144 		     RTE_PTYPE_INNER_L4_NONFRAG;
145 	/* Tunneled - Fragmented */
146 	(*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
147 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
148 		     RTE_PTYPE_INNER_L4_FRAG;
149 	(*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
150 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
151 		     RTE_PTYPE_INNER_L4_FRAG;
152 	(*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
153 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
154 		     RTE_PTYPE_INNER_L4_FRAG;
155 	(*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
156 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
157 		     RTE_PTYPE_INNER_L4_FRAG;
158 	/* Tunneled - TCP */
159 	(*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
160 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
161 		     RTE_PTYPE_INNER_L4_TCP;
162 	(*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
163 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
164 		     RTE_PTYPE_INNER_L4_TCP;
165 	(*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
166 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
167 		     RTE_PTYPE_INNER_L4_TCP;
168 	(*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
169 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
170 		     RTE_PTYPE_INNER_L4_TCP;
171 	(*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
172 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
173 		     RTE_PTYPE_INNER_L4_TCP;
174 	(*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
175 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
176 		     RTE_PTYPE_INNER_L4_TCP;
177 	(*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
178 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
179 		     RTE_PTYPE_INNER_L4_TCP;
180 	(*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
181 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
182 		     RTE_PTYPE_INNER_L4_TCP;
183 	(*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
184 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
185 		     RTE_PTYPE_INNER_L4_TCP;
186 	(*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
187 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
188 		     RTE_PTYPE_INNER_L4_TCP;
189 	(*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
190 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
191 		     RTE_PTYPE_INNER_L4_TCP;
192 	(*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
193 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
194 		     RTE_PTYPE_INNER_L4_TCP;
195 	/* Tunneled - UDP */
196 	(*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
197 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
198 		     RTE_PTYPE_INNER_L4_UDP;
199 	(*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
200 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
201 		     RTE_PTYPE_INNER_L4_UDP;
202 	(*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
203 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
204 		     RTE_PTYPE_INNER_L4_UDP;
205 	(*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
206 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
207 		     RTE_PTYPE_INNER_L4_UDP;
208 }
209 
210 /**
211  * Build a table to translate mbuf checksum requests to Verbs checksum types.
212  */
213 void
214 mlx5_set_cksum_table(void)
215 {
216 	unsigned int i;
217 	uint8_t v;
218 
219 	/*
220 	 * The index should have:
221 	 * bit[0] = PKT_TX_TCP_SEG
222 	 * bit[3:2] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
223 	 * bit[4] = PKT_TX_IP_CKSUM
224 	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
225 	 * bit[9] = tunnel
226 	 */
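	/*
	 * Worked example (illustrative): a non-tunneled packet requesting
	 * IP and TCP checksum offload has bit[4] and one of bit[3:2] set,
	 * so the loop below stores MLX5_ETH_WQE_L3_CSUM |
	 * MLX5_ETH_WQE_L4_CSUM at that index.
	 */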
227 	for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) {
228 		v = 0;
229 		if (i & (1 << 9)) {
230 			/* Tunneled packet. */
231 			if (i & (1 << 8)) /* Outer IP. */
232 				v |= MLX5_ETH_WQE_L3_CSUM;
233 			if (i & (1 << 4)) /* Inner IP. */
234 				v |= MLX5_ETH_WQE_L3_INNER_CSUM;
235 			if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
236 				v |= MLX5_ETH_WQE_L4_INNER_CSUM;
237 		} else {
238 			/* No tunnel. */
239 			if (i & (1 << 4)) /* IP. */
240 				v |= MLX5_ETH_WQE_L3_CSUM;
241 			if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
242 				v |= MLX5_ETH_WQE_L4_CSUM;
243 		}
244 		mlx5_cksum_table[i] = v;
245 	}
246 }
247 
248 /**
249  * Build a table to translate packet type of mbuf to SWP type of Verbs.
250  */
251 void
252 mlx5_set_swp_types_table(void)
253 {
254 	unsigned int i;
255 	uint8_t v;
256 
257 	/*
258 	 * The index should have:
259 	 * bit[1:0] = PKT_TX_L4_MASK
260 	 * bit[4] = PKT_TX_IPV6
261 	 * bit[8] = PKT_TX_OUTER_IPV6
262 	 * bit[9] = PKT_TX_OUTER_UDP
263 	 */
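	/*
	 * Worked example (illustrative): an index with bit[8] set (outer
	 * IPv6) and bit[1:0] equal to PKT_TX_UDP_CKSUM >> 52 (inner UDP)
	 * is translated below to MLX5_ETH_WQE_L3_OUTER_IPV6 |
	 * MLX5_ETH_WQE_L4_INNER_UDP.
	 */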
264 	for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) {
265 		v = 0;
266 		if (i & (1 << 8))
267 			v |= MLX5_ETH_WQE_L3_OUTER_IPV6;
268 		if (i & (1 << 9))
269 			v |= MLX5_ETH_WQE_L4_OUTER_UDP;
270 		if (i & (1 << 4))
271 			v |= MLX5_ETH_WQE_L3_INNER_IPV6;
272 		if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52))
273 			v |= MLX5_ETH_WQE_L4_INNER_UDP;
274 		mlx5_swp_types_table[i] = v;
275 	}
276 }
277 
278 /**
279  * Return the size of tailroom of WQ.
280  *
281  * @param txq
282  *   Pointer to TX queue structure.
283  * @param addr
284  *   Pointer to tail of WQ.
285  *
286  * @return
287  *   Size of tailroom.
288  */
289 static inline size_t
290 tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
291 {
292 	size_t tailroom;
293 	tailroom = (uintptr_t)(txq->wqes) +
294 		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
295 		   (uintptr_t)addr;
296 	return tailroom;
297 }
298 
299 /**
300  * Copy data to tailroom of circular queue.
301  *
302  * @param dst
303  *   Pointer to destination.
304  * @param src
305  *   Pointer to source.
306  * @param n
307  *   Number of bytes to copy.
308  * @param base
309  *   Pointer to head of queue.
310  * @param tailroom
311  *   Size of tailroom from dst.
312  *
313  * @return
314  *   Pointer after copied data.
315  */
316 static inline void *
317 mlx5_copy_to_wq(void *dst, const void *src, size_t n,
318 		void *base, size_t tailroom)
319 {
320 	void *ret;
321 
322 	if (n > tailroom) {
323 		rte_memcpy(dst, src, tailroom);
324 		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
325 			   n - tailroom);
326 		ret = (uint8_t *)base + n - tailroom;
327 	} else {
328 		rte_memcpy(dst, src, n);
329 		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
330 	}
331 	return ret;
332 }
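
/*
 * Worked example (illustrative): with tailroom == 16 and n == 24, the first
 * 16 bytes land at dst, the remaining 8 bytes wrap around to base, and the
 * returned pointer is (uint8_t *)base + 8.
 */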
333 
334 /**
335  * Inline TSO headers into WQE.
336  *
337  * @return
338  *   0 on success, negative errno value on failure.
339  */
340 static int
341 inline_tso(struct mlx5_txq_data *txq, struct rte_mbuf *buf,
342 	   uint32_t *length,
343 	   uintptr_t *addr,
344 	   uint16_t *pkt_inline_sz,
345 	   uint8_t **raw,
346 	   uint16_t *max_wqe,
347 	   uint16_t *tso_segsz,
348 	   uint16_t *tso_header_sz)
349 {
350 	uintptr_t end = (uintptr_t)(((uintptr_t)txq->wqes) +
351 				    (1 << txq->wqe_n) * MLX5_WQE_SIZE);
352 	unsigned int copy_b;
353 	uint8_t vlan_sz = (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
354 	const uint8_t tunneled = txq->tunnel_en && (buf->ol_flags &
355 				 PKT_TX_TUNNEL_MASK);
356 	uint16_t n_wqe;
357 
358 	*tso_segsz = buf->tso_segsz;
359 	*tso_header_sz = buf->l2_len + vlan_sz + buf->l3_len + buf->l4_len;
360 	if (unlikely(*tso_segsz == 0 || *tso_header_sz == 0)) {
361 		txq->stats.oerrors++;
362 		return -EINVAL;
363 	}
364 	if (tunneled)
365 		*tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
366 	/* First seg must contain all TSO headers. */
367 	if (unlikely(*tso_header_sz > MLX5_MAX_TSO_HEADER) ||
368 		     *tso_header_sz > DATA_LEN(buf)) {
369 		txq->stats.oerrors++;
370 		return -EINVAL;
371 	}
372 	copy_b = *tso_header_sz - *pkt_inline_sz;
373 	if (!copy_b || ((end - (uintptr_t)*raw) < copy_b))
374 		return -EAGAIN;
375 	n_wqe = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
376 	if (unlikely(*max_wqe < n_wqe))
377 		return -EINVAL;
378 	*max_wqe -= n_wqe;
379 	rte_memcpy((void *)*raw, (void *)*addr, copy_b);
380 	*length -= copy_b;
381 	*addr += copy_b;
382 	copy_b = MLX5_WQE_DS(copy_b) * MLX5_WQE_DWORD_SIZE;
383 	*pkt_inline_sz += copy_b;
384 	*raw += copy_b;
385 	return 0;
386 }
387 
388 /**
389  * DPDK callback to check the status of a tx descriptor.
390  *
391  * @param tx_queue
392  *   The tx queue.
393  * @param[in] offset
394  *   The index of the descriptor in the ring.
395  *
396  * @return
397  *   The status of the tx descriptor.
398  */
399 int
400 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
401 {
402 	struct mlx5_txq_data *txq = tx_queue;
403 	uint16_t used;
404 
405 	mlx5_tx_complete(txq);
406 	used = txq->elts_head - txq->elts_tail;
407 	if (offset < used)
408 		return RTE_ETH_TX_DESC_FULL;
409 	return RTE_ETH_TX_DESC_DONE;
410 }
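
/*
 * Usage sketch (illustrative, application side): this routine backs the
 * generic descriptor status API and is typically reached as
 *
 *	int st = rte_eth_tx_descriptor_status(port_id, queue_id, offset);
 *
 * where port_id, queue_id and offset are application-chosen values.
 */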
411 
412 /**
413  * DPDK callback to check the status of a rx descriptor.
414  *
415  * @param rx_queue
416  *   The rx queue.
417  * @param[in] offset
418  *   The index of the descriptor in the ring.
419  *
420  * @return
421  *   The status of the rx descriptor.
422  */
423 int
424 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
425 {
426 	struct mlx5_rxq_data *rxq = rx_queue;
427 	struct rxq_zip *zip = &rxq->zip;
428 	volatile struct mlx5_cqe *cqe;
429 	const unsigned int cqe_n = (1 << rxq->cqe_n);
430 	const unsigned int cqe_cnt = cqe_n - 1;
431 	unsigned int cq_ci;
432 	unsigned int used;
433 
434 	/* If we are processing a compressed CQE. */
435 	if (zip->ai) {
436 		used = zip->cqe_cnt - zip->ca;
437 		cq_ci = zip->cq_ci;
438 	} else {
439 		used = 0;
440 		cq_ci = rxq->cq_ci;
441 	}
442 	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
443 	while (check_cqe(cqe, cqe_n, cq_ci) == 0) {
444 		int8_t op_own;
445 		unsigned int n;
446 
447 		op_own = cqe->op_own;
448 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
449 			n = rte_be_to_cpu_32(cqe->byte_cnt);
450 		else
451 			n = 1;
452 		cq_ci += n;
453 		used += n;
454 		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
455 	}
456 	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
457 	if (offset < used)
458 		return RTE_ETH_RX_DESC_DONE;
459 	return RTE_ETH_RX_DESC_AVAIL;
460 }
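
/*
 * Usage sketch (illustrative, application side): typically reached through
 * the generic API, e.g.
 *
 *	int st = rte_eth_rx_descriptor_status(port_id, queue_id, offset);
 *
 * RTE_ETH_RX_DESC_DONE then means a received packet is waiting at that
 * offset; port_id, queue_id and offset are application-chosen values.
 */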
461 
462 /**
463  * DPDK callback for TX.
464  *
465  * @param dpdk_txq
466  *   Generic pointer to TX queue structure.
467  * @param[in] pkts
468  *   Packets to transmit.
469  * @param pkts_n
470  *   Number of packets in array.
471  *
472  * @return
473  *   Number of packets successfully transmitted (<= pkts_n).
474  */
475 uint16_t
476 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
477 {
478 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
479 	uint16_t elts_head = txq->elts_head;
480 	const uint16_t elts_n = 1 << txq->elts_n;
481 	const uint16_t elts_m = elts_n - 1;
482 	unsigned int i = 0;
483 	unsigned int j = 0;
484 	unsigned int k = 0;
485 	uint16_t max_elts;
486 	uint16_t max_wqe;
487 	unsigned int comp;
488 	volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
489 	unsigned int segs_n = 0;
490 	const unsigned int max_inline = txq->max_inline;
491 
492 	if (unlikely(!pkts_n))
493 		return 0;
494 	/* Prefetch first packet cacheline. */
495 	rte_prefetch0(*pkts);
496 	/* Start processing. */
497 	mlx5_tx_complete(txq);
498 	max_elts = (elts_n - (elts_head - txq->elts_tail));
499 	/* A CQE slot must always be available. */
500 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
501 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
502 	if (unlikely(!max_wqe))
503 		return 0;
504 	do {
505 		struct rte_mbuf *buf = *pkts; /* First_seg. */
506 		uint8_t *raw;
507 		volatile struct mlx5_wqe_v *wqe = NULL;
508 		volatile rte_v128u32_t *dseg = NULL;
509 		uint32_t length;
510 		unsigned int ds = 0;
511 		unsigned int sg = 0; /* counter of additional segs attached. */
512 		uintptr_t addr;
513 		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
514 		uint16_t tso_header_sz = 0;
515 		uint16_t ehdr;
516 		uint8_t cs_flags;
517 		uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
518 		uint8_t is_vlan = !!(buf->ol_flags & PKT_TX_VLAN_PKT);
519 		uint32_t swp_offsets = 0;
520 		uint8_t swp_types = 0;
521 		uint16_t tso_segsz = 0;
522 #ifdef MLX5_PMD_SOFT_COUNTERS
523 		uint32_t total_length = 0;
524 #endif
525 		int ret;
526 
527 		segs_n = buf->nb_segs;
528 		/*
529 		 * Make sure there is enough room to store this packet and
530 		 * that one ring entry remains unused.
531 		 */
532 		assert(segs_n);
533 		if (max_elts < segs_n)
534 			break;
535 		max_elts -= segs_n;
536 		sg = --segs_n;
537 		if (unlikely(--max_wqe == 0))
538 			break;
539 		wqe = (volatile struct mlx5_wqe_v *)
540 			tx_mlx5_wqe(txq, txq->wqe_ci);
541 		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
542 		if (pkts_n - i > 1)
543 			rte_prefetch0(*(pkts + 1));
544 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
545 		length = DATA_LEN(buf);
546 		ehdr = (((uint8_t *)addr)[1] << 8) |
547 		       ((uint8_t *)addr)[0];
548 #ifdef MLX5_PMD_SOFT_COUNTERS
549 		total_length = length;
550 #endif
551 		if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
552 			txq->stats.oerrors++;
553 			break;
554 		}
555 		/* Update element. */
556 		(*txq->elts)[elts_head & elts_m] = buf;
557 		/* Prefetch next buffer data. */
558 		if (pkts_n - i > 1)
559 			rte_prefetch0(
560 			    rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
561 		cs_flags = txq_ol_cksum_to_cs(buf);
562 		txq_mbuf_to_swp(txq, buf, tso, is_vlan,
563 				(uint8_t *)&swp_offsets, &swp_types);
564 		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
565 		/* Replace the Ethernet type by the VLAN if necessary. */
566 		if (is_vlan) {
567 			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
568 							 buf->vlan_tci);
569 			unsigned int len = 2 * ETHER_ADDR_LEN - 2;
570 
571 			addr += 2;
572 			length -= 2;
573 			/* Copy Destination and source mac address. */
574 			memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
575 			/* Copy VLAN. */
576 			memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
577 			/* Copy missing two bytes to end the DSeg. */
578 			memcpy((uint8_t *)raw + len + sizeof(vlan),
579 			       ((uint8_t *)addr) + len, 2);
580 			addr += len + 2;
581 			length -= (len + 2);
582 		} else {
583 			memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
584 			       MLX5_WQE_DWORD_SIZE);
585 			length -= pkt_inline_sz;
586 			addr += pkt_inline_sz;
587 		}
588 		raw += MLX5_WQE_DWORD_SIZE;
589 		if (tso) {
590 			ret = inline_tso(txq, buf, &length,
591 					 &addr, &pkt_inline_sz,
592 					 &raw, &max_wqe,
593 					 &tso_segsz, &tso_header_sz);
594 			if (ret == -EINVAL) {
595 				break;
596 			} else if (ret == -EAGAIN) {
597 				/* NOP WQE. */
598 				wqe->ctrl = (rte_v128u32_t){
599 					rte_cpu_to_be_32(txq->wqe_ci << 8),
600 					rte_cpu_to_be_32(txq->qp_num_8s | 1),
601 					0,
602 					0,
603 				};
604 				ds = 1;
605 #ifdef MLX5_PMD_SOFT_COUNTERS
606 				total_length = 0;
607 #endif
608 				k++;
609 				goto next_wqe;
610 			}
611 		}
612 		/* Inline if enough room. */
613 		if (max_inline || tso) {
614 			uint32_t inl = 0;
615 			uintptr_t end = (uintptr_t)
616 				(((uintptr_t)txq->wqes) +
617 				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
618 			unsigned int inline_room = max_inline *
619 						   RTE_CACHE_LINE_SIZE -
620 						   (pkt_inline_sz - 2) -
621 						   !!tso * sizeof(inl);
622 			uintptr_t addr_end;
623 			unsigned int copy_b;
624 
625 pkt_inline:
626 			addr_end = RTE_ALIGN_FLOOR(addr + inline_room,
627 						   RTE_CACHE_LINE_SIZE);
628 			copy_b = (addr_end > addr) ?
629 				 RTE_MIN((addr_end - addr), length) : 0;
630 			if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
631 				/*
632 				 * One Dseg remains in the current WQE.  To
633 				 * keep the computation positive, it is
634 				 * removed after the bytes to Dseg conversion.
635 				 */
636 				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
637 
638 				if (unlikely(max_wqe < n))
639 					break;
640 				max_wqe -= n;
641 				if (tso && !inl) {
642 					inl = rte_cpu_to_be_32(copy_b |
643 							       MLX5_INLINE_SEG);
644 					rte_memcpy((void *)raw,
645 						   (void *)&inl, sizeof(inl));
646 					raw += sizeof(inl);
647 					pkt_inline_sz += sizeof(inl);
648 				}
649 				rte_memcpy((void *)raw, (void *)addr, copy_b);
650 				addr += copy_b;
651 				length -= copy_b;
652 				pkt_inline_sz += copy_b;
653 			}
654 			/*
655 			 * 2 DWORDs consumed by the WQE header + ETH segment +
656 			 * the size of the inline part of the packet.
657 			 */
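			/*
			 * Worked example (illustrative, assuming
			 * MLX5_WQE_DS() counts 16-byte units): when only the
			 * Ethernet header stub is inlined, pkt_inline_sz ==
			 * MLX5_WQE_DWORD_SIZE + 2, so ds == 2 +
			 * MLX5_WQE_DS(16) == 3, matching the non-inline path
			 * below.
			 */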
658 			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
659 			if (length > 0) {
660 				if (ds % (MLX5_WQE_SIZE /
661 					  MLX5_WQE_DWORD_SIZE) == 0) {
662 					if (unlikely(--max_wqe == 0))
663 						break;
664 					dseg = (volatile rte_v128u32_t *)
665 					       tx_mlx5_wqe(txq, txq->wqe_ci +
666 							   ds / 4);
667 				} else {
668 					dseg = (volatile rte_v128u32_t *)
669 						((uintptr_t)wqe +
670 						 (ds * MLX5_WQE_DWORD_SIZE));
671 				}
672 				goto use_dseg;
673 			} else if (!segs_n) {
674 				goto next_pkt;
675 			} else {
676 				raw += copy_b;
677 				inline_room -= copy_b;
678 				--segs_n;
679 				buf = buf->next;
680 				assert(buf);
681 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
682 				length = DATA_LEN(buf);
683 #ifdef MLX5_PMD_SOFT_COUNTERS
684 				total_length += length;
685 #endif
686 				(*txq->elts)[++elts_head & elts_m] = buf;
687 				goto pkt_inline;
688 			}
689 		} else {
690 			/*
691 			 * No inline has been done in the packet, only the
692 			 * Ethernet header has been stored.
693 			 */
694 			dseg = (volatile rte_v128u32_t *)
695 				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
696 			ds = 3;
697 use_dseg:
698 			/* Add the remaining packet as a simple ds. */
699 			addr = rte_cpu_to_be_64(addr);
700 			*dseg = (rte_v128u32_t){
701 				rte_cpu_to_be_32(length),
702 				mlx5_tx_mb2mr(txq, buf),
703 				addr,
704 				addr >> 32,
705 			};
706 			++ds;
707 			if (!segs_n)
708 				goto next_pkt;
709 		}
710 next_seg:
711 		assert(buf);
712 		assert(ds);
713 		assert(wqe);
714 		/*
715 		 * Spill on next WQE when the current one does not have
716 		 * enough room left. Size of WQE must be a multiple
717 		 * of data segment size.
718 		 */
719 		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
720 		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
721 			if (unlikely(--max_wqe == 0))
722 				break;
723 			dseg = (volatile rte_v128u32_t *)
724 			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
725 			rte_prefetch0(tx_mlx5_wqe(txq,
726 						  txq->wqe_ci + ds / 4 + 1));
727 		} else {
728 			++dseg;
729 		}
730 		++ds;
731 		buf = buf->next;
732 		assert(buf);
733 		length = DATA_LEN(buf);
734 #ifdef MLX5_PMD_SOFT_COUNTERS
735 		total_length += length;
736 #endif
737 		/* Store segment information. */
738 		addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
739 		*dseg = (rte_v128u32_t){
740 			rte_cpu_to_be_32(length),
741 			mlx5_tx_mb2mr(txq, buf),
742 			addr,
743 			addr >> 32,
744 		};
745 		(*txq->elts)[++elts_head & elts_m] = buf;
746 		if (--segs_n)
747 			goto next_seg;
748 next_pkt:
749 		if (ds > MLX5_DSEG_MAX) {
750 			txq->stats.oerrors++;
751 			break;
752 		}
753 		++elts_head;
754 		++pkts;
755 		++i;
756 		j += sg;
757 		/* Initialize known and common part of the WQE structure. */
758 		if (tso) {
759 			wqe->ctrl = (rte_v128u32_t){
760 				rte_cpu_to_be_32((txq->wqe_ci << 8) |
761 						 MLX5_OPCODE_TSO),
762 				rte_cpu_to_be_32(txq->qp_num_8s | ds),
763 				0,
764 				0,
765 			};
766 			wqe->eseg = (rte_v128u32_t){
767 				swp_offsets,
768 				cs_flags | (swp_types << 8) |
769 				(rte_cpu_to_be_16(tso_segsz) << 16),
770 				0,
771 				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
772 			};
773 		} else {
774 			wqe->ctrl = (rte_v128u32_t){
775 				rte_cpu_to_be_32((txq->wqe_ci << 8) |
776 						 MLX5_OPCODE_SEND),
777 				rte_cpu_to_be_32(txq->qp_num_8s | ds),
778 				0,
779 				0,
780 			};
781 			wqe->eseg = (rte_v128u32_t){
782 				swp_offsets,
783 				cs_flags | (swp_types << 8),
784 				0,
785 				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
786 			};
787 		}
788 next_wqe:
789 		txq->wqe_ci += (ds + 3) / 4;
790 		/* Save the last successful WQE for completion request */
791 		last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
792 #ifdef MLX5_PMD_SOFT_COUNTERS
793 		/* Increment sent bytes counter. */
794 		txq->stats.obytes += total_length;
795 #endif
796 	} while (i < pkts_n);
797 	/* Take a shortcut if nothing must be sent. */
798 	if (unlikely((i + k) == 0))
799 		return 0;
800 	txq->elts_head += (i + j);
801 	/* Check whether completion threshold has been reached. */
802 	comp = txq->elts_comp + i + j + k;
803 	if (comp >= MLX5_TX_COMP_THRESH) {
804 		/* Request completion on last WQE. */
805 		last_wqe->ctrl2 = rte_cpu_to_be_32(8);
806 		/* Save elts_head in unused "immediate" field of WQE. */
807 		last_wqe->ctrl3 = txq->elts_head;
808 		txq->elts_comp = 0;
809 #ifndef NDEBUG
810 		++txq->cq_pi;
811 #endif
812 	} else {
813 		txq->elts_comp = comp;
814 	}
815 #ifdef MLX5_PMD_SOFT_COUNTERS
816 	/* Increment sent packets counter. */
817 	txq->stats.opackets += i;
818 #endif
819 	/* Ring QP doorbell. */
820 	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
821 	return i;
822 }
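
/*
 * Usage sketch (illustrative, not part of the driver): this function is
 * installed as the device tx_pkt_burst callback, so applications normally
 * reach it through the generic API, e.g.
 *
 *	uint16_t sent = rte_eth_tx_burst(port_id, queue_id, pkts, nb_pkts);
 *
 * port_id, queue_id, pkts and nb_pkts are application-side values.
 */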
823 
824 /**
825  * Open a MPW session.
826  *
827  * @param txq
828  *   Pointer to TX queue structure.
829  * @param mpw
830  *   Pointer to MPW session structure.
831  * @param length
832  *   Packet length.
833  */
834 static inline void
835 mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
836 {
837 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
838 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
839 		(volatile struct mlx5_wqe_data_seg (*)[])
840 		tx_mlx5_wqe(txq, idx + 1);
841 
842 	mpw->state = MLX5_MPW_STATE_OPENED;
843 	mpw->pkts_n = 0;
844 	mpw->len = length;
845 	mpw->total_len = 0;
846 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
847 	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
848 	mpw->wqe->eseg.inline_hdr_sz = 0;
849 	mpw->wqe->eseg.rsvd0 = 0;
850 	mpw->wqe->eseg.rsvd1 = 0;
851 	mpw->wqe->eseg.rsvd2 = 0;
852 	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
853 					     (txq->wqe_ci << 8) |
854 					     MLX5_OPCODE_TSO);
855 	mpw->wqe->ctrl[2] = 0;
856 	mpw->wqe->ctrl[3] = 0;
857 	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
858 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
859 	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
860 		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
861 	mpw->data.dseg[2] = &(*dseg)[0];
862 	mpw->data.dseg[3] = &(*dseg)[1];
863 	mpw->data.dseg[4] = &(*dseg)[2];
864 }
865 
866 /**
867  * Close a MPW session.
868  *
869  * @param txq
870  *   Pointer to TX queue structure.
871  * @param mpw
872  *   Pointer to MPW session structure.
873  */
874 static inline void
875 mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
876 {
877 	unsigned int num = mpw->pkts_n;
878 
879 	/*
880 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
881 	 * count as 2.
882 	 */
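	/*
	 * E.g. (illustrative) an MPW holding 4 single-segment packets has
	 * num == 4 and advertises 2 + 4 == 6 units of 16 bytes here.
	 */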
883 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
884 	mpw->state = MLX5_MPW_STATE_CLOSED;
885 	if (num < 3)
886 		++txq->wqe_ci;
887 	else
888 		txq->wqe_ci += 2;
889 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
890 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
891 }
892 
893 /**
894  * DPDK callback for TX with MPW support.
895  *
896  * @param dpdk_txq
897  *   Generic pointer to TX queue structure.
898  * @param[in] pkts
899  *   Packets to transmit.
900  * @param pkts_n
901  *   Number of packets in array.
902  *
903  * @return
904  *   Number of packets successfully transmitted (<= pkts_n).
905  */
906 uint16_t
907 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
908 {
909 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
910 	uint16_t elts_head = txq->elts_head;
911 	const uint16_t elts_n = 1 << txq->elts_n;
912 	const uint16_t elts_m = elts_n - 1;
913 	unsigned int i = 0;
914 	unsigned int j = 0;
915 	uint16_t max_elts;
916 	uint16_t max_wqe;
917 	unsigned int comp;
918 	struct mlx5_mpw mpw = {
919 		.state = MLX5_MPW_STATE_CLOSED,
920 	};
921 
922 	if (unlikely(!pkts_n))
923 		return 0;
924 	/* Prefetch first packet cacheline. */
925 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
926 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
927 	/* Start processing. */
928 	mlx5_tx_complete(txq);
929 	max_elts = (elts_n - (elts_head - txq->elts_tail));
930 	/* A CQE slot must always be available. */
931 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
932 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
933 	if (unlikely(!max_wqe))
934 		return 0;
935 	do {
936 		struct rte_mbuf *buf = *(pkts++);
937 		uint32_t length;
938 		unsigned int segs_n = buf->nb_segs;
939 		uint32_t cs_flags;
940 
941 		/*
942 		 * Make sure there is enough room to store this packet and
943 		 * that one ring entry remains unused.
944 		 */
945 		assert(segs_n);
946 		if (max_elts < segs_n)
947 			break;
948 		/* Do not bother with large packets that MPW cannot handle. */
949 		if (segs_n > MLX5_MPW_DSEG_MAX) {
950 			txq->stats.oerrors++;
951 			break;
952 		}
953 		max_elts -= segs_n;
954 		--pkts_n;
955 		cs_flags = txq_ol_cksum_to_cs(buf);
956 		/* Retrieve packet information. */
957 		length = PKT_LEN(buf);
958 		assert(length);
959 		/* Start new session if packet differs. */
960 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
961 		    ((mpw.len != length) ||
962 		     (segs_n != 1) ||
963 		     (mpw.wqe->eseg.cs_flags != cs_flags)))
964 			mlx5_mpw_close(txq, &mpw);
965 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
966 			/*
967 			 * A Multi-Packet WQE consumes at most two WQEs.
968 			 * mlx5_mpw_new() expects to be able to use such
969 			 * resources.
970 			 */
971 			if (unlikely(max_wqe < 2))
972 				break;
973 			max_wqe -= 2;
974 			mlx5_mpw_new(txq, &mpw, length);
975 			mpw.wqe->eseg.cs_flags = cs_flags;
976 		}
977 		/* Multi-segment packets must be alone in their MPW. */
978 		assert((segs_n == 1) || (mpw.pkts_n == 0));
979 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
980 		length = 0;
981 #endif
982 		do {
983 			volatile struct mlx5_wqe_data_seg *dseg;
984 			uintptr_t addr;
985 
986 			assert(buf);
987 			(*txq->elts)[elts_head++ & elts_m] = buf;
988 			dseg = mpw.data.dseg[mpw.pkts_n];
989 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
990 			*dseg = (struct mlx5_wqe_data_seg){
991 				.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
992 				.lkey = mlx5_tx_mb2mr(txq, buf),
993 				.addr = rte_cpu_to_be_64(addr),
994 			};
995 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
996 			length += DATA_LEN(buf);
997 #endif
998 			buf = buf->next;
999 			++mpw.pkts_n;
1000 			++j;
1001 		} while (--segs_n);
1002 		assert(length == mpw.len);
1003 		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1004 			mlx5_mpw_close(txq, &mpw);
1005 #ifdef MLX5_PMD_SOFT_COUNTERS
1006 		/* Increment sent bytes counter. */
1007 		txq->stats.obytes += length;
1008 #endif
1009 		++i;
1010 	} while (pkts_n);
1011 	/* Take a shortcut if nothing must be sent. */
1012 	if (unlikely(i == 0))
1013 		return 0;
1014 	/* Check whether completion threshold has been reached. */
1015 	/* "j" includes both packets and segments. */
1016 	comp = txq->elts_comp + j;
1017 	if (comp >= MLX5_TX_COMP_THRESH) {
1018 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1019 
1020 		/* Request completion on last WQE. */
1021 		wqe->ctrl[2] = rte_cpu_to_be_32(8);
1022 		/* Save elts_head in unused "immediate" field of WQE. */
1023 		wqe->ctrl[3] = elts_head;
1024 		txq->elts_comp = 0;
1025 #ifndef NDEBUG
1026 		++txq->cq_pi;
1027 #endif
1028 	} else {
1029 		txq->elts_comp = comp;
1030 	}
1031 #ifdef MLX5_PMD_SOFT_COUNTERS
1032 	/* Increment sent packets counter. */
1033 	txq->stats.opackets += i;
1034 #endif
1035 	/* Ring QP doorbell. */
1036 	if (mpw.state == MLX5_MPW_STATE_OPENED)
1037 		mlx5_mpw_close(txq, &mpw);
1038 	mlx5_tx_dbrec(txq, mpw.wqe);
1039 	txq->elts_head = elts_head;
1040 	return i;
1041 }
1042 
1043 /**
1044  * Open a MPW inline session.
1045  *
1046  * @param txq
1047  *   Pointer to TX queue structure.
1048  * @param mpw
1049  *   Pointer to MPW session structure.
1050  * @param length
1051  *   Packet length.
1052  */
1053 static inline void
1054 mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
1055 		    uint32_t length)
1056 {
1057 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1058 	struct mlx5_wqe_inl_small *inl;
1059 
1060 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
1061 	mpw->pkts_n = 0;
1062 	mpw->len = length;
1063 	mpw->total_len = 0;
1064 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1065 	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
1066 					     (txq->wqe_ci << 8) |
1067 					     MLX5_OPCODE_TSO);
1068 	mpw->wqe->ctrl[2] = 0;
1069 	mpw->wqe->ctrl[3] = 0;
1070 	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
1071 	mpw->wqe->eseg.inline_hdr_sz = 0;
1072 	mpw->wqe->eseg.cs_flags = 0;
1073 	mpw->wqe->eseg.rsvd0 = 0;
1074 	mpw->wqe->eseg.rsvd1 = 0;
1075 	mpw->wqe->eseg.rsvd2 = 0;
1076 	inl = (struct mlx5_wqe_inl_small *)
1077 		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
1078 	mpw->data.raw = (uint8_t *)&inl->raw;
1079 }
1080 
1081 /**
1082  * Close a MPW inline session.
1083  *
1084  * @param txq
1085  *   Pointer to TX queue structure.
1086  * @param mpw
1087  *   Pointer to MPW session structure.
1088  */
1089 static inline void
1090 mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
1091 {
1092 	unsigned int size;
1093 	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
1094 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
1095 
1096 	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
1097 	/*
1098 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
1099 	 * count as 2.
1100 	 */
1101 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
1102 					     MLX5_WQE_DS(size));
1103 	mpw->state = MLX5_MPW_STATE_CLOSED;
1104 	inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG);
1105 	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1106 }
1107 
1108 /**
1109  * DPDK callback for TX with MPW inline support.
1110  *
1111  * @param dpdk_txq
1112  *   Generic pointer to TX queue structure.
1113  * @param[in] pkts
1114  *   Packets to transmit.
1115  * @param pkts_n
1116  *   Number of packets in array.
1117  *
1118  * @return
1119  *   Number of packets successfully transmitted (<= pkts_n).
1120  */
1121 uint16_t
1122 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
1123 			 uint16_t pkts_n)
1124 {
1125 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1126 	uint16_t elts_head = txq->elts_head;
1127 	const uint16_t elts_n = 1 << txq->elts_n;
1128 	const uint16_t elts_m = elts_n - 1;
1129 	unsigned int i = 0;
1130 	unsigned int j = 0;
1131 	uint16_t max_elts;
1132 	uint16_t max_wqe;
1133 	unsigned int comp;
1134 	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
1135 	struct mlx5_mpw mpw = {
1136 		.state = MLX5_MPW_STATE_CLOSED,
1137 	};
1138 	/*
1139 	 * Compute the maximum number of WQEs that can be consumed by the inline
1140 	 * code.
1141 	 * - 2 DSEG for:
1142 	 *   - 1 control segment,
1143 	 *   - 1 Ethernet segment,
1144 	 * - N Dseg from the inline request.
1145 	 */
1146 	const unsigned int wqe_inl_n =
1147 		((2 * MLX5_WQE_DWORD_SIZE +
1148 		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
1149 		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
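	/*
	 * Worked example (illustrative): assuming RTE_CACHE_LINE_SIZE == 64
	 * and txq->max_inline == 4, wqe_inl_n == (32 + 256 + 63) / 64 == 5,
	 * i.e. an inline session may consume up to 5 WQEBBs.
	 */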
1150 
1151 	if (unlikely(!pkts_n))
1152 		return 0;
1153 	/* Prefetch first packet cacheline. */
1154 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1155 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1156 	/* Start processing. */
1157 	mlx5_tx_complete(txq);
1158 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1159 	/* A CQE slot must always be available. */
1160 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1161 	do {
1162 		struct rte_mbuf *buf = *(pkts++);
1163 		uintptr_t addr;
1164 		uint32_t length;
1165 		unsigned int segs_n = buf->nb_segs;
1166 		uint8_t cs_flags;
1167 
1168 		/*
1169 		 * Make sure there is enough room to store this packet and
1170 		 * that one ring entry remains unused.
1171 		 */
1172 		assert(segs_n);
1173 		if (max_elts < segs_n)
1174 			break;
1175 		/* Do not bother with large packets that MPW cannot handle. */
1176 		if (segs_n > MLX5_MPW_DSEG_MAX) {
1177 			txq->stats.oerrors++;
1178 			break;
1179 		}
1180 		max_elts -= segs_n;
1181 		--pkts_n;
1182 		/*
1183 		 * Compute max_wqe in case fewer WQEs were consumed in the previous
1184 		 * iteration.
1185 		 */
1186 		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1187 		cs_flags = txq_ol_cksum_to_cs(buf);
1188 		/* Retrieve packet information. */
1189 		length = PKT_LEN(buf);
1190 		/* Start new session if packet differs. */
1191 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1192 			if ((mpw.len != length) ||
1193 			    (segs_n != 1) ||
1194 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1195 				mlx5_mpw_close(txq, &mpw);
1196 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
1197 			if ((mpw.len != length) ||
1198 			    (segs_n != 1) ||
1199 			    (length > inline_room) ||
1200 			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
1201 				mlx5_mpw_inline_close(txq, &mpw);
1202 				inline_room =
1203 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1204 			}
1205 		}
1206 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1207 			if ((segs_n != 1) ||
1208 			    (length > inline_room)) {
1209 				/*
1210 				 * A Multi-Packet WQE consumes at most two WQEs.
1211 				 * mlx5_mpw_new() expects to be able to use
1212 				 * such resources.
1213 				 */
1214 				if (unlikely(max_wqe < 2))
1215 					break;
1216 				max_wqe -= 2;
1217 				mlx5_mpw_new(txq, &mpw, length);
1218 				mpw.wqe->eseg.cs_flags = cs_flags;
1219 			} else {
1220 				if (unlikely(max_wqe < wqe_inl_n))
1221 					break;
1222 				max_wqe -= wqe_inl_n;
1223 				mlx5_mpw_inline_new(txq, &mpw, length);
1224 				mpw.wqe->eseg.cs_flags = cs_flags;
1225 			}
1226 		}
1227 		/* Multi-segment packets must be alone in their MPW. */
1228 		assert((segs_n == 1) || (mpw.pkts_n == 0));
1229 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1230 			assert(inline_room ==
1231 			       txq->max_inline * RTE_CACHE_LINE_SIZE);
1232 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1233 			length = 0;
1234 #endif
1235 			do {
1236 				volatile struct mlx5_wqe_data_seg *dseg;
1237 
1238 				assert(buf);
1239 				(*txq->elts)[elts_head++ & elts_m] = buf;
1240 				dseg = mpw.data.dseg[mpw.pkts_n];
1241 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
1242 				*dseg = (struct mlx5_wqe_data_seg){
1243 					.byte_count =
1244 					       rte_cpu_to_be_32(DATA_LEN(buf)),
1245 					.lkey = mlx5_tx_mb2mr(txq, buf),
1246 					.addr = rte_cpu_to_be_64(addr),
1247 				};
1248 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1249 				length += DATA_LEN(buf);
1250 #endif
1251 				buf = buf->next;
1252 				++mpw.pkts_n;
1253 				++j;
1254 			} while (--segs_n);
1255 			assert(length == mpw.len);
1256 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1257 				mlx5_mpw_close(txq, &mpw);
1258 		} else {
1259 			unsigned int max;
1260 
1261 			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
1262 			assert(length <= inline_room);
1263 			assert(length == DATA_LEN(buf));
1264 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1265 			(*txq->elts)[elts_head++ & elts_m] = buf;
1266 			/* Maximum number of bytes before wrapping. */
1267 			max = ((((uintptr_t)(txq->wqes)) +
1268 				(1 << txq->wqe_n) *
1269 				MLX5_WQE_SIZE) -
1270 			       (uintptr_t)mpw.data.raw);
1271 			if (length > max) {
1272 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1273 					   (void *)addr,
1274 					   max);
1275 				mpw.data.raw = (volatile void *)txq->wqes;
1276 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1277 					   (void *)(addr + max),
1278 					   length - max);
1279 				mpw.data.raw += length - max;
1280 			} else {
1281 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1282 					   (void *)addr,
1283 					   length);
1284 
1285 				if (length == max)
1286 					mpw.data.raw =
1287 						(volatile void *)txq->wqes;
1288 				else
1289 					mpw.data.raw += length;
1290 			}
1291 			++mpw.pkts_n;
1292 			mpw.total_len += length;
1293 			++j;
1294 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
1295 				mlx5_mpw_inline_close(txq, &mpw);
1296 				inline_room =
1297 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1298 			} else {
1299 				inline_room -= length;
1300 			}
1301 		}
1302 #ifdef MLX5_PMD_SOFT_COUNTERS
1303 		/* Increment sent bytes counter. */
1304 		txq->stats.obytes += length;
1305 #endif
1306 		++i;
1307 	} while (pkts_n);
1308 	/* Take a shortcut if nothing must be sent. */
1309 	if (unlikely(i == 0))
1310 		return 0;
1311 	/* Check whether completion threshold has been reached. */
1312 	/* "j" includes both packets and segments. */
1313 	comp = txq->elts_comp + j;
1314 	if (comp >= MLX5_TX_COMP_THRESH) {
1315 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1316 
1317 		/* Request completion on last WQE. */
1318 		wqe->ctrl[2] = rte_cpu_to_be_32(8);
1319 		/* Save elts_head in unused "immediate" field of WQE. */
1320 		wqe->ctrl[3] = elts_head;
1321 		txq->elts_comp = 0;
1322 #ifndef NDEBUG
1323 		++txq->cq_pi;
1324 #endif
1325 	} else {
1326 		txq->elts_comp = comp;
1327 	}
1328 #ifdef MLX5_PMD_SOFT_COUNTERS
1329 	/* Increment sent packets counter. */
1330 	txq->stats.opackets += i;
1331 #endif
1332 	/* Ring QP doorbell. */
1333 	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
1334 		mlx5_mpw_inline_close(txq, &mpw);
1335 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
1336 		mlx5_mpw_close(txq, &mpw);
1337 	mlx5_tx_dbrec(txq, mpw.wqe);
1338 	txq->elts_head = elts_head;
1339 	return i;
1340 }
1341 
1342 /**
1343  * Open an Enhanced MPW session.
1344  *
1345  * @param txq
1346  *   Pointer to TX queue structure.
1347  * @param mpw
1348  *   Pointer to MPW session structure.
1349  * @param padding
1350  *   Nonzero to pad the session start with a zero-length inline header.
1351  */
1352 static inline void
1353 mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
1354 {
1355 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1356 
1357 	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
1358 	mpw->pkts_n = 0;
1359 	mpw->total_len = sizeof(struct mlx5_wqe);
1360 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1361 	mpw->wqe->ctrl[0] =
1362 		rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
1363 				 (txq->wqe_ci << 8) |
1364 				 MLX5_OPCODE_ENHANCED_MPSW);
1365 	mpw->wqe->ctrl[2] = 0;
1366 	mpw->wqe->ctrl[3] = 0;
1367 	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
1368 	if (unlikely(padding)) {
1369 		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
1370 
1371 		/* Pad the first 2 DWORDs with zero-length inline header. */
1372 		*(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
1373 		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
1374 			rte_cpu_to_be_32(MLX5_INLINE_SEG);
1375 		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
1376 		/* Start from the next WQEBB. */
1377 		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
1378 	} else {
1379 		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
1380 	}
1381 }
1382 
1383 /**
1384  * Close an Enhanced MPW session.
1385  *
1386  * @param txq
1387  *   Pointer to TX queue structure.
1388  * @param mpw
1389  *   Pointer to MPW session structure.
1390  *
1391  * @return
1392  *   Number of consumed WQEs.
1393  */
1394 static inline uint16_t
1395 mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
1396 {
1397 	uint16_t ret;
1398 
1399 	/* Store size in multiple of 16 bytes. Control and Ethernet segments
1400 	 * count as 2.
1401 	 */
1402 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
1403 					     MLX5_WQE_DS(mpw->total_len));
1404 	mpw->state = MLX5_MPW_STATE_CLOSED;
1405 	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1406 	txq->wqe_ci += ret;
1407 	return ret;
1408 }
1409 
1410 /**
1411  * TX with Enhanced MPW support.
1412  *
1413  * @param txq
1414  *   Pointer to TX queue structure.
1415  * @param[in] pkts
1416  *   Packets to transmit.
1417  * @param pkts_n
1418  *   Number of packets in array.
1419  *
1420  * @return
1421  *   Number of packets successfully transmitted (<= pkts_n).
1422  */
1423 static inline uint16_t
1424 txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
1425 	       uint16_t pkts_n)
1426 {
1427 	uint16_t elts_head = txq->elts_head;
1428 	const uint16_t elts_n = 1 << txq->elts_n;
1429 	const uint16_t elts_m = elts_n - 1;
1430 	unsigned int i = 0;
1431 	unsigned int j = 0;
1432 	uint16_t max_elts;
1433 	uint16_t max_wqe;
1434 	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
1435 	unsigned int mpw_room = 0;
1436 	unsigned int inl_pad = 0;
1437 	uint32_t inl_hdr;
1438 	struct mlx5_mpw mpw = {
1439 		.state = MLX5_MPW_STATE_CLOSED,
1440 	};
1441 
1442 	if (unlikely(!pkts_n))
1443 		return 0;
1444 	/* Start processing. */
1445 	mlx5_tx_complete(txq);
1446 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1447 	/* A CQE slot must always be available. */
1448 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1449 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1450 	if (unlikely(!max_wqe))
1451 		return 0;
1452 	do {
1453 		struct rte_mbuf *buf = *(pkts++);
1454 		uintptr_t addr;
1455 		unsigned int do_inline = 0; /* Whether inline is possible. */
1456 		uint32_t length;
1457 		uint8_t cs_flags;
1458 
1459 		/* Multi-segment packets are handled in the slow path outside. */
1460 		assert(NB_SEGS(buf) == 1);
1461 		/* Make sure there is enough room to store this packet. */
1462 		if (max_elts - j == 0)
1463 			break;
1464 		cs_flags = txq_ol_cksum_to_cs(buf);
1465 		/* Retrieve packet information. */
1466 		length = PKT_LEN(buf);
1467 		/* Start new session if:
1468 		 * - multi-segment packet
1469 		 * - no space left even for a dseg
1470 		 * - next packet can be inlined with a new WQE
1471 		 * - cs_flag differs
1472 		 */
1473 		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
1474 			if ((inl_pad + sizeof(struct mlx5_wqe_data_seg) >
1475 			     mpw_room) ||
1476 			    (length <= txq->inline_max_packet_sz &&
1477 			     inl_pad + sizeof(inl_hdr) + length >
1478 			     mpw_room) ||
1479 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1480 				max_wqe -= mlx5_empw_close(txq, &mpw);
1481 		}
1482 		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
1483 			/* In Enhanced MPW, inline as much as the budget
1484 			 * allows. The remaining space is to be filled with
1485 			 * dsegs. If the title WQEBB isn't padded, it will have
1486 			 * 2 dsegs there.
1487 			 */
1488 			mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
1489 					   (max_inline ? max_inline :
1490 					    pkts_n * MLX5_WQE_DWORD_SIZE) +
1491 					   MLX5_WQE_SIZE);
1492 			if (unlikely(max_wqe * MLX5_WQE_SIZE < mpw_room))
1493 				break;
1494 			/* Don't pad the title WQEBB so as not to waste WQ space. */
1495 			mlx5_empw_new(txq, &mpw, 0);
1496 			mpw_room -= mpw.total_len;
1497 			inl_pad = 0;
1498 			do_inline = length <= txq->inline_max_packet_sz &&
1499 				    sizeof(inl_hdr) + length <= mpw_room &&
1500 				    !txq->mpw_hdr_dseg;
1501 			mpw.wqe->eseg.cs_flags = cs_flags;
1502 		} else {
1503 			/* Evaluate whether the next packet can be inlined.
1504 			 * Inlininig is possible when:
1505 			 * Inlining is possible when:
1506 			 * - length is less than the configured value
1507 			 * - length fits in the remaining space
1508 			 */
1509 			do_inline =
1510 				length <= txq->inline_max_packet_sz &&
1511 				inl_pad + sizeof(inl_hdr) + length <=
1512 				 mpw_room &&
1513 				(!txq->mpw_hdr_dseg ||
1514 				 mpw.total_len >= MLX5_WQE_SIZE);
1515 		}
1516 		if (max_inline && do_inline) {
1517 			/* Inline packet into WQE. */
1518 			unsigned int max;
1519 
1520 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1521 			assert(length == DATA_LEN(buf));
1522 			inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
1523 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1524 			mpw.data.raw = (volatile void *)
1525 				((uintptr_t)mpw.data.raw + inl_pad);
1526 			max = tx_mlx5_wq_tailroom(txq,
1527 					(void *)(uintptr_t)mpw.data.raw);
1528 			/* Copy inline header. */
1529 			mpw.data.raw = (volatile void *)
1530 				mlx5_copy_to_wq(
1531 					  (void *)(uintptr_t)mpw.data.raw,
1532 					  &inl_hdr,
1533 					  sizeof(inl_hdr),
1534 					  (void *)(uintptr_t)txq->wqes,
1535 					  max);
1536 			max = tx_mlx5_wq_tailroom(txq,
1537 					(void *)(uintptr_t)mpw.data.raw);
1538 			/* Copy packet data. */
1539 			mpw.data.raw = (volatile void *)
1540 				mlx5_copy_to_wq(
1541 					  (void *)(uintptr_t)mpw.data.raw,
1542 					  (void *)addr,
1543 					  length,
1544 					  (void *)(uintptr_t)txq->wqes,
1545 					  max);
1546 			++mpw.pkts_n;
1547 			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
1548 			/* No need to get completion as the entire packet is
1549 			 * copied to WQ. Free the buf right away.
1550 			 */
1551 			rte_pktmbuf_free_seg(buf);
1552 			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
1553 			/* Add pad in the next packet if any. */
1554 			inl_pad = (((uintptr_t)mpw.data.raw +
1555 					(MLX5_WQE_DWORD_SIZE - 1)) &
1556 					~(MLX5_WQE_DWORD_SIZE - 1)) -
1557 				  (uintptr_t)mpw.data.raw;
1558 		} else {
1559 			/* No inline. Load a dseg of packet pointer. */
1560 			volatile rte_v128u32_t *dseg;
1561 
1562 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1563 			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
1564 			assert(length == DATA_LEN(buf));
1565 			if (!tx_mlx5_wq_tailroom(txq,
1566 					(void *)((uintptr_t)mpw.data.raw
1567 						+ inl_pad)))
1568 				dseg = (volatile void *)txq->wqes;
1569 			else
1570 				dseg = (volatile void *)
1571 					((uintptr_t)mpw.data.raw +
1572 					 inl_pad);
1573 			(*txq->elts)[elts_head++ & elts_m] = buf;
1574 			addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
1575 								 uintptr_t));
1576 			*dseg = (rte_v128u32_t) {
1577 				rte_cpu_to_be_32(length),
1578 				mlx5_tx_mb2mr(txq, buf),
1579 				addr,
1580 				addr >> 32,
1581 			};
1582 			mpw.data.raw = (volatile void *)(dseg + 1);
1583 			mpw.total_len += (inl_pad + sizeof(*dseg));
1584 			++j;
1585 			++mpw.pkts_n;
1586 			mpw_room -= (inl_pad + sizeof(*dseg));
1587 			inl_pad = 0;
1588 		}
1589 #ifdef MLX5_PMD_SOFT_COUNTERS
1590 		/* Increment sent bytes counter. */
1591 		txq->stats.obytes += length;
1592 #endif
1593 		++i;
1594 	} while (i < pkts_n);
1595 	/* Take a shortcut if nothing must be sent. */
1596 	if (unlikely(i == 0))
1597 		return 0;
1598 	/* Check whether completion threshold has been reached. */
1599 	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
1600 			(uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
1601 			 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
1602 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1603 
1604 		/* Request completion on last WQE. */
1605 		wqe->ctrl[2] = rte_cpu_to_be_32(8);
1606 		/* Save elts_head in unused "immediate" field of WQE. */
1607 		wqe->ctrl[3] = elts_head;
1608 		txq->elts_comp = 0;
1609 		txq->mpw_comp = txq->wqe_ci;
1610 #ifndef NDEBUG
1611 		++txq->cq_pi;
1612 #endif
1613 	} else {
1614 		txq->elts_comp += j;
1615 	}
1616 #ifdef MLX5_PMD_SOFT_COUNTERS
1617 	/* Increment sent packets counter. */
1618 	txq->stats.opackets += i;
1619 #endif
1620 	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
1621 		mlx5_empw_close(txq, &mpw);
1622 	/* Ring QP doorbell. */
1623 	mlx5_tx_dbrec(txq, mpw.wqe);
1624 	txq->elts_head = elts_head;
1625 	return i;
1626 }
1627 
1628 /**
1629  * DPDK callback for TX with Enhanced MPW support.
1630  *
1631  * @param dpdk_txq
1632  *   Generic pointer to TX queue structure.
1633  * @param[in] pkts
1634  *   Packets to transmit.
1635  * @param pkts_n
1636  *   Number of packets in array.
1637  *
1638  * @return
1639  *   Number of packets successfully transmitted (<= pkts_n).
1640  */
1641 uint16_t
1642 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1643 {
1644 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1645 	uint16_t nb_tx = 0;
1646 
1647 	while (pkts_n > nb_tx) {
1648 		uint16_t n;
1649 		uint16_t ret;
1650 
1651 		n = txq_count_contig_multi_seg(&pkts[nb_tx], pkts_n - nb_tx);
1652 		if (n) {
1653 			ret = mlx5_tx_burst(dpdk_txq, &pkts[nb_tx], n);
1654 			if (!ret)
1655 				break;
1656 			nb_tx += ret;
1657 		}
1658 		n = txq_count_contig_single_seg(&pkts[nb_tx], pkts_n - nb_tx);
1659 		if (n) {
1660 			ret = txq_burst_empw(txq, &pkts[nb_tx], n);
1661 			if (!ret)
1662 				break;
1663 			nb_tx += ret;
1664 		}
1665 	}
1666 	return nb_tx;
1667 }
1668 
1669 /**
1670  * Translate RX completion flags to packet type.
1671  *
1672  * @param[in] rxq
1673  *   Pointer to RX queue structure.
1674  * @param[in] cqe
1675  *   Pointer to CQE.
1676  *
1677  * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
1678  *
1679  * @return
1680  *   Packet type for struct rte_mbuf.
1681  */
1682 static inline uint32_t
1683 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
1684 {
1685 	uint8_t idx;
1686 	uint8_t pinfo = cqe->pkt_info;
1687 	uint16_t ptype = cqe->hdr_type_etc;
1688 
1689 	/*
1690 	 * The index to the array should have:
1691 	 * bit[1:0] = l3_hdr_type
1692 	 * bit[4:2] = l4_hdr_type
1693 	 * bit[5] = ip_frag
1694 	 * bit[6] = tunneled
1695 	 * bit[7] = outer_l3_type
1696 	 */
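	/*
	 * For example (illustrative), a plain IPv4/TCP completion (no
	 * fragment, no tunnel) yields idx == 0x06, which mlx5_ptype_table[]
	 * translates to RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
	 * RTE_PTYPE_L4_TCP (see mlx5_set_ptype_table()).
	 */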
1697 	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
1698 	return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
1699 }
1700 
1701 /**
1702  * Get size of the next packet for a given CQE. For compressed CQEs, the
1703  * consumer index is updated only once all packets of the current one have
1704  * been processed.
1705  *
1706  * @param rxq
1707  *   Pointer to RX queue.
1708  * @param cqe
1709  *   CQE to process.
1710  * @param[out] rss_hash
1711  *   Packet RSS Hash result.
1712  *
1713  * @return
1714  *   Packet size in bytes (0 if there is none), -1 in case of completion
1715  *   with error.
1716  */
1717 static inline int
1718 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
1719 		 uint16_t cqe_cnt, uint32_t *rss_hash)
1720 {
1721 	struct rxq_zip *zip = &rxq->zip;
1722 	uint16_t cqe_n = cqe_cnt + 1;
1723 	int len = 0;
1724 	uint16_t idx, end;
1725 
1726 	/* Process compressed data in the CQE and mini arrays. */
1727 	if (zip->ai) {
1728 		volatile struct mlx5_mini_cqe8 (*mc)[8] =
1729 			(volatile struct mlx5_mini_cqe8 (*)[8])
1730 			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info);
1731 
1732 		len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
1733 		*rss_hash = rte_be_to_cpu_32((*mc)[zip->ai & 7].rx_hash_result);
1734 		if ((++zip->ai & 7) == 0) {
1735 			/* Invalidate consumed CQEs */
1736 			idx = zip->ca;
1737 			end = zip->na;
1738 			while (idx != end) {
1739 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1740 					MLX5_CQE_INVALIDATE;
1741 				++idx;
1742 			}
1743 			/*
1744 			 * Increment consumer index to skip the number of
1745 			 * CQEs consumed. Hardware leaves holes in the CQ
1746 			 * ring for software use.
1747 			 */
1748 			zip->ca = zip->na;
1749 			zip->na += 8;
1750 		}
1751 		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
1752 			/* Invalidate the rest */
1753 			idx = zip->ca;
1754 			end = zip->cq_ci;
1755 
1756 			while (idx != end) {
1757 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1758 					MLX5_CQE_INVALIDATE;
1759 				++idx;
1760 			}
1761 			rxq->cq_ci = zip->cq_ci;
1762 			zip->ai = 0;
1763 		}
	/* No compressed data, get the next CQE and check whether it is compressed. */
1765 	} else {
1766 		int ret;
1767 		int8_t op_own;
1768 
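		/*
		 * check_cqe() returns 1 when the CQE at the current consumer
		 * index is not yet owned by software, i.e. no new completion
		 * is available.
		 */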
1769 		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
1770 		if (unlikely(ret == 1))
1771 			return 0;
1772 		++rxq->cq_ci;
1773 		op_own = cqe->op_own;
1774 		rte_cio_rmb();
1775 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
1776 			volatile struct mlx5_mini_cqe8 (*mc)[8] =
1777 				(volatile struct mlx5_mini_cqe8 (*)[8])
1778 				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
1779 							  cqe_cnt].pkt_info);
1780 
1781 			/* Fix endianness. */
1782 			zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
			/*
			 * The current mini array position is the one
			 * validated by check_cqe().
			 *
			 * If the completion comprises several mini arrays, as
			 * a special case the second one is located 7 CQEs
			 * after the initial CQE instead of 8 as for the
			 * subsequent ones.
			 */
1791 			zip->ca = rxq->cq_ci;
1792 			zip->na = zip->ca + 7;
			/* Compute the next non-compressed CQE. */
1794 			--rxq->cq_ci;
1795 			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
1796 			/* Get packet size to return. */
1797 			len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
1798 			*rss_hash = rte_be_to_cpu_32((*mc)[0].rx_hash_result);
1799 			zip->ai = 1;
1800 			/* Prefetch all the entries to be invalidated */
1801 			idx = zip->ca;
1802 			end = zip->cq_ci;
1803 			while (idx != end) {
1804 				rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]);
1805 				++idx;
1806 			}
1807 		} else {
1808 			len = rte_be_to_cpu_32(cqe->byte_cnt);
1809 			*rss_hash = rte_be_to_cpu_32(cqe->rx_hash_res);
1810 		}
1811 		/* Error while receiving packet. */
1812 		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
1813 			return -1;
1814 	}
1815 	return len;
1816 }
1817 
1818 /**
1819  * Translate RX completion flags to offload flags.
1820  *
1821  * @param[in] cqe
1822  *   Pointer to CQE.
1823  *
1824  * @return
1825  *   Offload flags (ol_flags) for struct rte_mbuf.
1826  */
1827 static inline uint32_t
1828 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
1829 {
1830 	uint32_t ol_flags = 0;
1831 	uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
1832 
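	/*
	 * TRANSPOSE() relocates a flag from its bit position in the CQE
	 * hdr_type_etc field to the corresponding ol_flags position, turning
	 * the L3/L4 "header valid" indications into the matching
	 * PKT_RX_*_CKSUM_GOOD bits.
	 */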
1833 	ol_flags =
1834 		TRANSPOSE(flags,
1835 			  MLX5_CQE_RX_L3_HDR_VALID,
1836 			  PKT_RX_IP_CKSUM_GOOD) |
1837 		TRANSPOSE(flags,
1838 			  MLX5_CQE_RX_L4_HDR_VALID,
1839 			  PKT_RX_L4_CKSUM_GOOD);
1840 	return ol_flags;
1841 }
1842 
1843 /**
1844  * DPDK callback for RX.
1845  *
1846  * @param dpdk_rxq
1847  *   Generic pointer to RX queue structure.
1848  * @param[out] pkts
1849  *   Array to store received packets.
1850  * @param pkts_n
1851  *   Maximum number of packets in array.
1852  *
1853  * @return
1854  *   Number of packets successfully received (<= pkts_n).
1855  */
1856 uint16_t
1857 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1858 {
1859 	struct mlx5_rxq_data *rxq = dpdk_rxq;
1860 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
1861 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
1862 	const unsigned int sges_n = rxq->sges_n;
1863 	struct rte_mbuf *pkt = NULL;
1864 	struct rte_mbuf *seg = NULL;
1865 	volatile struct mlx5_cqe *cqe =
1866 		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1867 	unsigned int i = 0;
1868 	unsigned int rq_ci = rxq->rq_ci << sges_n;
1869 	int len = 0; /* keep its value across iterations. */
1870 
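	/*
	 * Each iteration consumes one RX descriptor (one segment). A
	 * multi-segment packet therefore spans several iterations: pkt
	 * points to the first mbuf of the packet being built while seg is
	 * the segment currently filled, and rq_ci counts segments rather
	 * than strides (hence the << sges_n above).
	 */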
1871 	while (pkts_n) {
1872 		unsigned int idx = rq_ci & wqe_cnt;
1873 		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
1874 		struct rte_mbuf *rep = (*rxq->elts)[idx];
1875 		uint32_t rss_hash_res = 0;
1876 
1877 		if (pkt)
1878 			NEXT(seg) = rep;
1879 		seg = rep;
1880 		rte_prefetch0(seg);
1881 		rte_prefetch0(cqe);
1882 		rte_prefetch0(wqe);
1883 		rep = rte_mbuf_raw_alloc(rxq->mp);
1884 		if (unlikely(rep == NULL)) {
1885 			++rxq->stats.rx_nombuf;
1886 			if (!pkt) {
1887 				/*
1888 				 * no buffers before we even started,
1889 				 * bail out silently.
1890 				 */
1891 				break;
1892 			}
1893 			while (pkt != seg) {
1894 				assert(pkt != (*rxq->elts)[idx]);
1895 				rep = NEXT(pkt);
1896 				NEXT(pkt) = NULL;
1897 				NB_SEGS(pkt) = 1;
1898 				rte_mbuf_raw_free(pkt);
1899 				pkt = rep;
1900 			}
1901 			break;
1902 		}
1903 		if (!pkt) {
1904 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1905 			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
1906 					       &rss_hash_res);
1907 			if (!len) {
1908 				rte_mbuf_raw_free(rep);
1909 				break;
1910 			}
1911 			if (unlikely(len == -1)) {
1912 				/* RX error, packet is likely too large. */
1913 				rte_mbuf_raw_free(rep);
1914 				++rxq->stats.idropped;
1915 				goto skip;
1916 			}
1917 			pkt = seg;
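			/*
			 * rxq->crc_present << 2 equals ETHER_CRC_LEN (4
			 * bytes) when the FCS is kept, so a valid completion
			 * is never shorter than the CRC it carries.
			 */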
1918 			assert(len >= (rxq->crc_present << 2));
1919 			/* Update packet information. */
1920 			pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe);
1921 			pkt->ol_flags = 0;
1922 			if (rss_hash_res && rxq->rss_hash) {
1923 				pkt->hash.rss = rss_hash_res;
1924 				pkt->ol_flags = PKT_RX_RSS_HASH;
1925 			}
1926 			if (rxq->mark &&
1927 			    MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
1928 				pkt->ol_flags |= PKT_RX_FDIR;
1929 				if (cqe->sop_drop_qpn !=
1930 				    rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
1931 					uint32_t mark = cqe->sop_drop_qpn;
1932 
1933 					pkt->ol_flags |= PKT_RX_FDIR_ID;
1934 					pkt->hash.fdir.hi =
1935 						mlx5_flow_mark_get(mark);
1936 				}
1937 			}
1938 			if (rxq->csum)
1939 				pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
1940 			if (rxq->vlan_strip &&
1941 			    (cqe->hdr_type_etc &
1942 			     rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
1943 				pkt->ol_flags |= PKT_RX_VLAN |
1944 					PKT_RX_VLAN_STRIPPED;
1945 				pkt->vlan_tci =
1946 					rte_be_to_cpu_16(cqe->vlan_info);
1947 			}
1948 			if (rxq->hw_timestamp) {
1949 				pkt->timestamp =
1950 					rte_be_to_cpu_64(cqe->timestamp);
1951 				pkt->ol_flags |= PKT_RX_TIMESTAMP;
1952 			}
1953 			if (rxq->crc_present)
1954 				len -= ETHER_CRC_LEN;
1955 			PKT_LEN(pkt) = len;
1956 		}
1957 		DATA_LEN(rep) = DATA_LEN(seg);
1958 		PKT_LEN(rep) = PKT_LEN(seg);
1959 		SET_DATA_OFF(rep, DATA_OFF(seg));
1960 		PORT(rep) = PORT(seg);
1961 		(*rxq->elts)[idx] = rep;
1962 		/*
1963 		 * Fill NIC descriptor with the new buffer.  The lkey and size
1964 		 * of the buffers are already known, only the buffer address
1965 		 * changes.
1966 		 */
1967 		wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
1968 		if (len > DATA_LEN(seg)) {
1969 			len -= DATA_LEN(seg);
1970 			++NB_SEGS(pkt);
1971 			++rq_ci;
1972 			continue;
1973 		}
1974 		DATA_LEN(seg) = len;
1975 #ifdef MLX5_PMD_SOFT_COUNTERS
1976 		/* Increment bytes counter. */
1977 		rxq->stats.ibytes += PKT_LEN(pkt);
1978 #endif
1979 		/* Return packet. */
1980 		*(pkts++) = pkt;
1981 		pkt = NULL;
1982 		--pkts_n;
1983 		++i;
1984 skip:
1985 		/* Align consumer index to the next stride. */
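		/*
		 * e.g. with sges_n = 2 (four segments per stride), finishing
		 * a packet at ring entry 9 rounds rq_ci up to 12 below,
		 * skipping the unused entries of that stride.
		 */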
1986 		rq_ci >>= sges_n;
1987 		++rq_ci;
1988 		rq_ci <<= sges_n;
1989 	}
1990 	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
1991 		return 0;
1992 	/* Update the consumer index. */
1993 	rxq->rq_ci = rq_ci >> sges_n;
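	/*
	 * The write barriers below keep the doorbell updates ordered:
	 * descriptor and CQE changes become visible before the CQ doorbell,
	 * and the CQ doorbell before the RQ doorbell, so the device observes
	 * a consistent queue state.
	 */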
1994 	rte_cio_wmb();
1995 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1996 	rte_cio_wmb();
1997 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1998 #ifdef MLX5_PMD_SOFT_COUNTERS
1999 	/* Increment packets counter. */
2000 	rxq->stats.ipackets += i;
2001 #endif
2002 	return i;
2003 }
2004 
2005 /**
2006  * Dummy DPDK callback for TX.
2007  *
2008  * This function is used to temporarily replace the real callback during
2009  * unsafe control operations on the queue, or in case of error.
2010  *
2011  * @param dpdk_txq
2012  *   Generic pointer to TX queue structure.
2013  * @param[in] pkts
2014  *   Packets to transmit.
2015  * @param pkts_n
2016  *   Number of packets in array.
2017  *
2018  * @return
2019  *   Number of packets successfully transmitted (<= pkts_n).
2020  */
2021 uint16_t
2022 removed_tx_burst(void *dpdk_txq __rte_unused,
2023 		 struct rte_mbuf **pkts __rte_unused,
2024 		 uint16_t pkts_n __rte_unused)
2025 {
2026 	return 0;
2027 }
2028 
2029 /**
2030  * Dummy DPDK callback for RX.
2031  *
2032  * This function is used to temporarily replace the real callback during
2033  * unsafe control operations on the queue, or in case of error.
2034  *
2035  * @param dpdk_rxq
2036  *   Generic pointer to RX queue structure.
2037  * @param[out] pkts
2038  *   Array to store received packets.
2039  * @param pkts_n
2040  *   Maximum number of packets in array.
2041  *
2042  * @return
2043  *   Number of packets successfully received (<= pkts_n).
2044  */
2045 uint16_t
removed_rx_burst(void *dpdk_rxq __rte_unused,
2047 		 struct rte_mbuf **pkts __rte_unused,
2048 		 uint16_t pkts_n __rte_unused)
2049 {
2050 	return 0;
2051 }
2052 
/*
 * Vectorized Rx/Tx routines are not compiled in when the required vector
 * instructions are not supported on the target architecture. The following
 * null stubs are then needed for linkage: they are weak symbols, so when the
 * vectorized implementations (e.g. mlx5_rxtx_vec_sse.c for x86) are built,
 * their strong definitions take precedence.
 */
2059 
2060 uint16_t __attribute__((weak))
2061 mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused,
2062 		      struct rte_mbuf **pkts __rte_unused,
2063 		      uint16_t pkts_n __rte_unused)
2064 {
2065 	return 0;
2066 }
2067 
2068 uint16_t __attribute__((weak))
2069 mlx5_tx_burst_vec(void *dpdk_txq __rte_unused,
2070 		  struct rte_mbuf **pkts __rte_unused,
2071 		  uint16_t pkts_n __rte_unused)
2072 {
2073 	return 0;
2074 }
2075 
2076 uint16_t __attribute__((weak))
mlx5_rx_burst_vec(void *dpdk_rxq __rte_unused,
2078 		  struct rte_mbuf **pkts __rte_unused,
2079 		  uint16_t pkts_n __rte_unused)
2080 {
2081 	return 0;
2082 }
2083 
2084 int __attribute__((weak))
2085 mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
2086 {
2087 	return -ENOTSUP;
2088 }
2089 
2090 int __attribute__((weak))
2091 mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
2092 {
2093 	return -ENOTSUP;
2094 }
2095 
2096 int __attribute__((weak))
2097 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
2098 {
2099 	return -ENOTSUP;
2100 }
2101 
2102 int __attribute__((weak))
2103 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
2104 {
2105 	return -ENOTSUP;
2106 }
2107