xref: /dpdk/drivers/net/mlx5/mlx5_rxtx.c (revision 9f3b3a96dec2f4c01cc92a132d763b8887d29e6a)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015-2019 Mellanox Technologies, Ltd
4  */
5 
6 #include <stdint.h>
7 #include <string.h>
8 #include <stdlib.h>
9 
10 #include <rte_mbuf.h>
11 #include <rte_mempool.h>
12 #include <rte_prefetch.h>
13 #include <rte_common.h>
14 #include <rte_branch_prediction.h>
15 #include <rte_ether.h>
16 #include <rte_cycles.h>
17 #include <rte_flow.h>
18 
19 #include <mlx5_prm.h>
20 #include <mlx5_common.h>
21 
22 #include "mlx5_autoconf.h"
23 #include "mlx5_defs.h"
24 #include "mlx5.h"
25 #include "mlx5_mr.h"
26 #include "mlx5_utils.h"
27 #include "mlx5_rxtx.h"
28 
29 /* TX burst subroutines return codes. */
30 enum mlx5_txcmp_code {
31 	MLX5_TXCMP_CODE_EXIT = 0,
32 	MLX5_TXCMP_CODE_ERROR,
33 	MLX5_TXCMP_CODE_SINGLE,
34 	MLX5_TXCMP_CODE_MULTI,
35 	MLX5_TXCMP_CODE_TSO,
36 	MLX5_TXCMP_CODE_EMPW,
37 };
38 
39 /*
40  * These defines are used to configure Tx burst routine option set
41  * supported at compile time. The options that are not specified are
42  * optimized out because the if conditions can be evaluated at compile time.
43  * The offloads with the bigger runtime check overhead (more CPU cycles to
44  * skip) should have the bigger index - this is needed to select the best
45  * matching routine if there is no exact match and some offloads are not
46  * actually requested.
47  */
48 #define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets. */
49 #define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported. */
50 #define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads. */
51 #define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Checksums offloaded. */
52 #define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
53 #define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported. */
54 #define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
55 #define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported. */
56 #define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported. */
57 #define MLX5_TXOFF_CONFIG_TXPP (1u << 10) /* Scheduling on timestamp. */
58 
59 /* The most common offload groups. */
60 #define MLX5_TXOFF_CONFIG_NONE 0
61 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \
62 				MLX5_TXOFF_CONFIG_TSO | \
63 				MLX5_TXOFF_CONFIG_SWP | \
64 				MLX5_TXOFF_CONFIG_CSUM | \
65 				MLX5_TXOFF_CONFIG_INLINE | \
66 				MLX5_TXOFF_CONFIG_VLAN | \
67 				MLX5_TXOFF_CONFIG_METADATA)
68 
69 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask)
70 
71 #define MLX5_TXOFF_DECL(func, olx) \
72 static uint16_t mlx5_tx_burst_##func(void *txq, \
73 				     struct rte_mbuf **pkts, \
74 				     uint16_t pkts_n) \
75 { \
76 	return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \
77 		    pkts, pkts_n, (olx)); \
78 }
79 
80 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx},
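/*
 * Illustration (not an exhaustive description of the selection logic): a
 * routine generated e.g. as MLX5_TXOFF_DECL(full, MLX5_TXOFF_CONFIG_FULL)
 * passes a constant olx mask to mlx5_tx_burst_tmpl(), so a check like
 *	if (MLX5_TXOFF_CONFIG(TSO)) { ... }
 * folds to a compile-time constant and the unused branches are removed,
 * while MLX5_TXOFF_INFO() records the {routine, olx} pair for the later
 * selection of the best matching routine.
 */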
81 
82 static __rte_always_inline uint32_t
83 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
84 
85 static __rte_always_inline int
86 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
87 		 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe);
88 
89 static __rte_always_inline uint32_t
90 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
91 
92 static __rte_always_inline void
93 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
94 	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
95 
96 static int
97 mlx5_queue_state_modify(struct rte_eth_dev *dev,
98 			struct mlx5_mp_arg_queue_state_modify *sm);
99 
100 static inline void
101 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
102 			volatile struct mlx5_cqe *__rte_restrict cqe,
103 			uint32_t phcsum);
104 
105 static inline void
106 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
107 		    volatile struct mlx5_cqe *__rte_restrict cqe,
108 		    uint32_t len);
109 
110 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
111 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
112 };
113 
114 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned;
115 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
116 
117 uint64_t rte_net_mlx5_dynf_inline_mask;
118 #define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
119 
120 /**
121  * Build a table to translate Rx completion flags to packet type.
122  *
123  * @note: fix mlx5_dev_supported_ptypes_get() if anything changes here.
124  */
125 void
126 mlx5_set_ptype_table(void)
127 {
128 	unsigned int i;
129 	uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;
130 
131 	/* Last entry must not be overwritten, reserved for errored packet. */
132 	for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
133 		(*p)[i] = RTE_PTYPE_UNKNOWN;
134 	/*
135 	 * The index to the array should have:
136 	 * bit[1:0] = l3_hdr_type
137 	 * bit[4:2] = l4_hdr_type
138 	 * bit[5] = ip_frag
139 	 * bit[6] = tunneled
140 	 * bit[7] = outer_l3_type
141 	 */
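	/*
	 * For example, index 0x46 has the tunneled bit set and is mapped
	 * below to an IPv4 outer header carrying an inner IPv4/TCP packet.
	 */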
142 	/* L2 */
143 	(*p)[0x00] = RTE_PTYPE_L2_ETHER;
144 	/* L3 */
145 	(*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
146 		     RTE_PTYPE_L4_NONFRAG;
147 	(*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
148 		     RTE_PTYPE_L4_NONFRAG;
149 	/* Fragmented */
150 	(*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
151 		     RTE_PTYPE_L4_FRAG;
152 	(*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
153 		     RTE_PTYPE_L4_FRAG;
154 	/* TCP */
155 	(*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
156 		     RTE_PTYPE_L4_TCP;
157 	(*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
158 		     RTE_PTYPE_L4_TCP;
159 	(*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
160 		     RTE_PTYPE_L4_TCP;
161 	(*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
162 		     RTE_PTYPE_L4_TCP;
163 	(*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
164 		     RTE_PTYPE_L4_TCP;
165 	(*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
166 		     RTE_PTYPE_L4_TCP;
167 	/* UDP */
168 	(*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
169 		     RTE_PTYPE_L4_UDP;
170 	(*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
171 		     RTE_PTYPE_L4_UDP;
172 	/* Repeat with outer_l3_type being set. Just in case. */
173 	(*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
174 		     RTE_PTYPE_L4_NONFRAG;
175 	(*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
176 		     RTE_PTYPE_L4_NONFRAG;
177 	(*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
178 		     RTE_PTYPE_L4_FRAG;
179 	(*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
180 		     RTE_PTYPE_L4_FRAG;
181 	(*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
182 		     RTE_PTYPE_L4_TCP;
183 	(*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
184 		     RTE_PTYPE_L4_TCP;
185 	(*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
186 		     RTE_PTYPE_L4_TCP;
187 	(*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
188 		     RTE_PTYPE_L4_TCP;
189 	(*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
190 		     RTE_PTYPE_L4_TCP;
191 	(*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
192 		     RTE_PTYPE_L4_TCP;
193 	(*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
194 		     RTE_PTYPE_L4_UDP;
195 	(*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
196 		     RTE_PTYPE_L4_UDP;
197 	/* Tunneled - L3 */
198 	(*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
199 	(*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
200 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
201 		     RTE_PTYPE_INNER_L4_NONFRAG;
202 	(*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
203 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
204 		     RTE_PTYPE_INNER_L4_NONFRAG;
205 	(*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
206 	(*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
207 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
208 		     RTE_PTYPE_INNER_L4_NONFRAG;
209 	(*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
210 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
211 		     RTE_PTYPE_INNER_L4_NONFRAG;
212 	/* Tunneled - Fragmented */
213 	(*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
214 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
215 		     RTE_PTYPE_INNER_L4_FRAG;
216 	(*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
217 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
218 		     RTE_PTYPE_INNER_L4_FRAG;
219 	(*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
220 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
221 		     RTE_PTYPE_INNER_L4_FRAG;
222 	(*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
223 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
224 		     RTE_PTYPE_INNER_L4_FRAG;
225 	/* Tunneled - TCP */
226 	(*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
227 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
228 		     RTE_PTYPE_INNER_L4_TCP;
229 	(*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
230 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
231 		     RTE_PTYPE_INNER_L4_TCP;
232 	(*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
233 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
234 		     RTE_PTYPE_INNER_L4_TCP;
235 	(*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
236 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
237 		     RTE_PTYPE_INNER_L4_TCP;
238 	(*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
239 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
240 		     RTE_PTYPE_INNER_L4_TCP;
241 	(*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
242 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
243 		     RTE_PTYPE_INNER_L4_TCP;
244 	(*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
245 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
246 		     RTE_PTYPE_INNER_L4_TCP;
247 	(*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
248 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
249 		     RTE_PTYPE_INNER_L4_TCP;
250 	(*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
251 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
252 		     RTE_PTYPE_INNER_L4_TCP;
253 	(*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
254 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
255 		     RTE_PTYPE_INNER_L4_TCP;
256 	(*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
257 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
258 		     RTE_PTYPE_INNER_L4_TCP;
259 	(*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
260 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
261 		     RTE_PTYPE_INNER_L4_TCP;
262 	/* Tunneled - UDP */
263 	(*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
264 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
265 		     RTE_PTYPE_INNER_L4_UDP;
266 	(*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
267 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
268 		     RTE_PTYPE_INNER_L4_UDP;
269 	(*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
270 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
271 		     RTE_PTYPE_INNER_L4_UDP;
272 	(*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
273 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
274 		     RTE_PTYPE_INNER_L4_UDP;
275 }
276 
277 /**
278  * Build a table to translate checksum offload flags to Verbs checksum type.
279  */
280 void
281 mlx5_set_cksum_table(void)
282 {
283 	unsigned int i;
284 	uint8_t v;
285 
286 	/*
287 	 * The index should have:
288 	 * bit[0] = PKT_TX_TCP_SEG
289 	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
290 	 * bit[4] = PKT_TX_IP_CKSUM
291 	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
292 	 * bit[9] = tunnel
293 	 */
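	/*
	 * For example, a plain (non-tunneled) packet requesting
	 * PKT_TX_IP_CKSUM and PKT_TX_TCP_CKSUM produces index
	 * (1 << 4) | (1 << 2) and gets the entry
	 * MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM.
	 */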
294 	for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) {
295 		v = 0;
296 		if (i & (1 << 9)) {
297 			/* Tunneled packet. */
298 			if (i & (1 << 8)) /* Outer IP. */
299 				v |= MLX5_ETH_WQE_L3_CSUM;
300 			if (i & (1 << 4)) /* Inner IP. */
301 				v |= MLX5_ETH_WQE_L3_INNER_CSUM;
302 			if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
303 				v |= MLX5_ETH_WQE_L4_INNER_CSUM;
304 		} else {
305 			/* No tunnel. */
306 			if (i & (1 << 4)) /* IP. */
307 				v |= MLX5_ETH_WQE_L3_CSUM;
308 			if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
309 				v |= MLX5_ETH_WQE_L4_CSUM;
310 		}
311 		mlx5_cksum_table[i] = v;
312 	}
313 }
314 
315 /**
316  * Build a table to translate mbuf Tx offload flags to the Verbs SWP type.
317  */
318 void
319 mlx5_set_swp_types_table(void)
320 {
321 	unsigned int i;
322 	uint8_t v;
323 
324 	/*
325 	 * The index should have:
326 	 * bit[0:1] = PKT_TX_L4_MASK
327 	 * bit[4] = PKT_TX_IPV6
328 	 * bit[8] = PKT_TX_OUTER_IPV6
329 	 * bit[9] = PKT_TX_OUTER_UDP
330 	 */
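	/*
	 * For example, an inner IPv6/UDP packet sent over a UDP tunnel
	 * (PKT_TX_UDP_CKSUM | PKT_TX_IPV6 plus the outer UDP bit) yields
	 * MLX5_ETH_WQE_L3_INNER_IPV6 | MLX5_ETH_WQE_L4_INNER_UDP |
	 * MLX5_ETH_WQE_L4_OUTER_UDP.
	 */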
331 	for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) {
332 		v = 0;
333 		if (i & (1 << 8))
334 			v |= MLX5_ETH_WQE_L3_OUTER_IPV6;
335 		if (i & (1 << 9))
336 			v |= MLX5_ETH_WQE_L4_OUTER_UDP;
337 		if (i & (1 << 4))
338 			v |= MLX5_ETH_WQE_L3_INNER_IPV6;
339 		if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52))
340 			v |= MLX5_ETH_WQE_L4_INNER_UDP;
341 		mlx5_swp_types_table[i] = v;
342 	}
343 }
344 
345 /**
346  * Set Software Parser flags and offsets in Ethernet Segment of WQE.
347  * Flags must be initialized to zero beforehand.
348  *
349  * @param loc
350  *   Pointer to burst routine local context.
351  * @param swp_flags
352  *   Pointer to store Software Parser flags
353  * @param olx
354  *   Configured Tx offloads mask. It is fully defined at
355  *   compile time and may be used for optimization.
356  *
357  * @return
358  *   Software Parser offsets packed in dword.
359  *   Software Parser flags are set by pointer.
360  */
361 static __rte_always_inline uint32_t
362 txq_mbuf_to_swp(struct mlx5_txq_local *__rte_restrict loc,
363 		uint8_t *swp_flags,
364 		unsigned int olx)
365 {
366 	uint64_t ol, tunnel;
367 	unsigned int idx, off;
368 	uint32_t set;
369 
370 	if (!MLX5_TXOFF_CONFIG(SWP))
371 		return 0;
372 	ol = loc->mbuf->ol_flags;
373 	tunnel = ol & PKT_TX_TUNNEL_MASK;
374 	/*
375 	 * Check whether Software Parser is required.
376 	 * Only customized tunnels may ask for.
377 	 */
378 	if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP))
379 		return 0;
380 	/*
381 	 * The index should have:
382 	 * bit[0:1] = PKT_TX_L4_MASK
383 	 * bit[4] = PKT_TX_IPV6
384 	 * bit[8] = PKT_TX_OUTER_IPV6
385 	 * bit[9] = PKT_TX_OUTER_UDP
386 	 */
387 	idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52;
388 	idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0;
389 	*swp_flags = mlx5_swp_types_table[idx];
390 	/*
391 	 * Set offsets for SW parser. Since ConnectX-5, SW parser just
392 	 * complements HW parser. SW parser starts to engage only if HW parser
393 	 * can't reach a header. For the older devices, HW parser will not kick
394 	 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
395 	 * should be set regardless of HW offload.
396 	 */
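	/*
	 * The offsets below are accumulated in 2-byte units and packed into
	 * the returned dword as: byte 0 - outer L4, byte 1 - outer L3,
	 * byte 2 - inner L4, byte 3 - inner L3.
	 */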
397 	off = loc->mbuf->outer_l2_len;
398 	if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT)
399 		off += sizeof(struct rte_vlan_hdr);
400 	set = (off >> 1) << 8; /* Outer L3 offset. */
401 	off += loc->mbuf->outer_l3_len;
402 	if (tunnel == PKT_TX_TUNNEL_UDP)
403 		set |= off >> 1; /* Outer L4 offset. */
404 	if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */
405 		const uint64_t csum = ol & PKT_TX_L4_MASK;
406 		off += loc->mbuf->l2_len;
407 		set |= (off >> 1) << 24; /* Inner L3 offset. */
408 		if (csum == PKT_TX_TCP_CKSUM ||
409 		    csum == PKT_TX_UDP_CKSUM ||
410 		    (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) {
411 			off += loc->mbuf->l3_len;
412 			set |= (off >> 1) << 16; /* Inner L4 offset. */
413 		}
414 	}
415 	set = rte_cpu_to_le_32(set);
416 	return set;
417 }
418 
419 /**
420  * Convert the Checksum offloads to Verbs.
421  *
422  * @param buf
423  *   Pointer to the mbuf.
424  *
425  * @return
426  *   Converted checksum flags.
427  */
428 static __rte_always_inline uint8_t
429 txq_ol_cksum_to_cs(struct rte_mbuf *buf)
430 {
431 	uint32_t idx;
432 	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
433 	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
434 				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
435 
436 	/*
437 	 * The index should have:
438 	 * bit[0] = PKT_TX_TCP_SEG
439 	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
440 	 * bit[4] = PKT_TX_IP_CKSUM
441 	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
442 	 * bit[9] = tunnel
443 	 */
444 	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
445 	return mlx5_cksum_table[idx];
446 }
447 
448 /**
449  * Internal function to compute the number of used descriptors in an Rx queue.
450  *
451  * @param rxq
452  *   The Rx queue.
453  *
454  * @return
455  *   The number of used Rx descriptors.
456  */
457 static uint32_t
458 rx_queue_count(struct mlx5_rxq_data *rxq)
459 {
460 	struct rxq_zip *zip = &rxq->zip;
461 	volatile struct mlx5_cqe *cqe;
462 	unsigned int cq_ci = rxq->cq_ci;
463 	const unsigned int cqe_n = (1 << rxq->cqe_n);
464 	const unsigned int cqe_cnt = cqe_n - 1;
465 	unsigned int used = 0;
466 
467 	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
468 	while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
469 		int8_t op_own;
470 		unsigned int n;
471 
472 		op_own = cqe->op_own;
473 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
474 			if (unlikely(zip->ai))
475 				n = zip->cqe_cnt - zip->ai;
476 			else
477 				n = rte_be_to_cpu_32(cqe->byte_cnt);
478 		else
479 			n = 1;
480 		cq_ci += n;
481 		used += n;
482 		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
483 	}
484 	used = RTE_MIN(used, cqe_n);
485 	return used;
486 }
487 
488 /**
489  * DPDK callback to check the status of an Rx descriptor.
490  *
491  * @param rx_queue
492  *   The Rx queue.
493  * @param[in] offset
494  *   The index of the descriptor in the ring.
495  *
496  * @return
497  *   The status of the Rx descriptor.
498  */
499 int
500 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
501 {
502 	struct mlx5_rxq_data *rxq = rx_queue;
503 	struct mlx5_rxq_ctrl *rxq_ctrl =
504 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
505 	struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);
506 
507 	if (dev->rx_pkt_burst == NULL ||
508 	    dev->rx_pkt_burst == removed_rx_burst) {
509 		rte_errno = ENOTSUP;
510 		return -rte_errno;
511 	}
512 	if (offset >= (1 << rxq->cqe_n)) {
513 		rte_errno = EINVAL;
514 		return -rte_errno;
515 	}
516 	if (offset < rx_queue_count(rxq))
517 		return RTE_ETH_RX_DESC_DONE;
518 	return RTE_ETH_RX_DESC_AVAIL;
519 }
520 
521 /**
522  * DPDK callback to get the RX queue information.
523  *
524  * @param dev
525  *   Pointer to the device structure.
526  *
527  * @param rx_queue_id
528  *   Rx queue identifier.
529  *
530  * @param qinfo
531  *   Pointer to the RX queue information structure.
532  *
533  * @return
534  *   None.
535  */
536 
537 void
538 mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
539 		  struct rte_eth_rxq_info *qinfo)
540 {
541 	struct mlx5_priv *priv = dev->data->dev_private;
542 	struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id];
543 	struct mlx5_rxq_ctrl *rxq_ctrl =
544 		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
545 
546 	if (!rxq)
547 		return;
548 	qinfo->mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
549 					rxq->mprq_mp : rxq->mp;
550 	qinfo->conf.rx_thresh.pthresh = 0;
551 	qinfo->conf.rx_thresh.hthresh = 0;
552 	qinfo->conf.rx_thresh.wthresh = 0;
553 	qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
554 	qinfo->conf.rx_drop_en = 1;
555 	qinfo->conf.rx_deferred_start = rxq_ctrl ? 0 : 1;
556 	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
557 	qinfo->scattered_rx = dev->data->scattered_rx;
558 	qinfo->nb_desc = 1 << rxq->elts_n;
559 }
560 
561 /**
562  * DPDK callback to get the RX packet burst mode information.
563  *
564  * @param dev
565  *   Pointer to the device structure.
566  *
567  * @param rx_queue_id
568  *   Rx queue identifier.
569  *
570  * @param mode
571  *   Pointer to the burst mode information.
572  *
573  * @return
574  *   0 as success, -EINVAL as failure.
575  */
576 
577 int
578 mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
579 		       uint16_t rx_queue_id,
580 		       struct rte_eth_burst_mode *mode)
581 {
582 	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
583 	struct mlx5_priv *priv = dev->data->dev_private;
584 	struct mlx5_rxq_data *rxq;
585 
586 	rxq = (*priv->rxqs)[rx_queue_id];
587 	if (!rxq) {
588 		rte_errno = EINVAL;
589 		return -rte_errno;
590 	}
591 	if (pkt_burst == mlx5_rx_burst) {
592 		snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
593 	} else if (pkt_burst == mlx5_rx_burst_mprq) {
594 		snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ");
595 	} else if (pkt_burst == mlx5_rx_burst_vec) {
596 #if defined RTE_ARCH_X86_64
597 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE");
598 #elif defined RTE_ARCH_ARM64
599 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
600 #elif defined RTE_ARCH_PPC_64
601 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
602 #else
603 		return -EINVAL;
604 #endif
605 	} else if (pkt_burst == mlx5_rx_burst_mprq_vec) {
606 #if defined RTE_ARCH_X86_64
607 		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector SSE");
608 #elif defined RTE_ARCH_ARM64
609 		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon");
610 #elif defined RTE_ARCH_PPC_64
611 		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec");
612 #else
613 		return -EINVAL;
614 #endif
615 	} else {
616 		return -EINVAL;
617 	}
618 	return 0;
619 }
620 
621 /**
622  * DPDK callback to get the number of used descriptors in an Rx queue.
623  *
624  * @param dev
625  *   Pointer to the device structure.
626  *
627  * @param rx_queue_id
628  *   The Rx queue index.
629  *
630  * @return
631  *   The number of used Rx descriptors.
632  *   -EINVAL if the queue is invalid.
633  */
634 uint32_t
635 mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
636 {
637 	struct mlx5_priv *priv = dev->data->dev_private;
638 	struct mlx5_rxq_data *rxq;
639 
640 	if (dev->rx_pkt_burst == NULL ||
641 	    dev->rx_pkt_burst == removed_rx_burst) {
642 		rte_errno = ENOTSUP;
643 		return -rte_errno;
644 	}
645 	rxq = (*priv->rxqs)[rx_queue_id];
646 	if (!rxq) {
647 		rte_errno = EINVAL;
648 		return -rte_errno;
649 	}
650 	return rx_queue_count(rxq);
651 }
652 
653 #define MLX5_SYSTEM_LOG_DIR "/var/log"
654 /**
655  * Dump debug information to log file.
656  *
657  * @param fname
658  *   The file name.
659  * @param hex_title
660  *   If not NULL, this string is printed as a header to the output
661  *   and the output will be a hexadecimal dump.
662  * @param buf
663  *   The buffer address to print out.
664  * @param hex_len
665  *   The number of bytes to dump out.
666  */
667 void
668 mlx5_dump_debug_information(const char *fname, const char *hex_title,
669 			    const void *buf, unsigned int hex_len)
670 {
671 	FILE *fd;
672 
673 	MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname);
674 	fd = fopen(path, "a+");
675 	if (!fd) {
676 		DRV_LOG(WARNING, "cannot open %s for debug dump", path);
677 		MKSTR(path2, "./%s", fname);
678 		fd = fopen(path2, "a+");
679 		if (!fd) {
680 			DRV_LOG(ERR, "cannot open %s for debug dump", path2);
681 			return;
682 		}
683 		DRV_LOG(INFO, "New debug dump in file %s", path2);
684 	} else {
685 		DRV_LOG(INFO, "New debug dump in file %s", path);
686 	}
687 	if (hex_title)
688 		rte_hexdump(fd, hex_title, buf, hex_len);
689 	else
690 		fprintf(fd, "%s", (const char *)buf);
691 	fprintf(fd, "\n\n\n");
692 	fclose(fd);
693 }
694 
695 /**
696  * Move QP from error state to running state and initialize indexes.
697  *
698  * @param txq_ctrl
699  *   Pointer to TX queue control structure.
700  *
701  * @return
702  *   0 on success, else -1.
703  */
704 static int
705 tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl)
706 {
707 	struct mlx5_mp_arg_queue_state_modify sm = {
708 			.is_wq = 0,
709 			.queue_id = txq_ctrl->txq.idx,
710 	};
711 
712 	if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm))
713 		return -1;
714 	txq_ctrl->txq.wqe_ci = 0;
715 	txq_ctrl->txq.wqe_pi = 0;
716 	txq_ctrl->txq.elts_comp = 0;
717 	return 0;
718 }
719 
720 /* Return 1 if the error CQE was already signed, otherwise sign it and return 0. */
721 static int
722 check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
723 {
724 	static const uint8_t magic[] = "seen";
725 	int ret = 1;
726 	unsigned int i;
727 
728 	for (i = 0; i < sizeof(magic); ++i)
729 		if (!ret || err_cqe->rsvd1[i] != magic[i]) {
730 			ret = 0;
731 			err_cqe->rsvd1[i] = magic[i];
732 		}
733 	return ret;
734 }
735 
736 /**
737  * Handle error CQE.
738  *
739  * @param txq
740  *   Pointer to TX queue structure.
741  * @param err_cqe
742  *   Pointer to the error CQE.
743  *
744  * @return
745  *   Negative value if queue recovery failed, otherwise 0 when the error
746  *   completion entry has been handled successfully.
747  */
748 static int
749 mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
750 			 volatile struct mlx5_err_cqe *err_cqe)
751 {
752 	if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
753 		const uint16_t wqe_m = ((1 << txq->wqe_n) - 1);
754 		struct mlx5_txq_ctrl *txq_ctrl =
755 				container_of(txq, struct mlx5_txq_ctrl, txq);
756 		uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter);
757 		int seen = check_err_cqe_seen(err_cqe);
758 
759 		if (!seen && txq_ctrl->dump_file_n <
760 		    txq_ctrl->priv->config.max_dump_files_num) {
761 			MKSTR(err_str, "Unexpected CQE error syndrome "
762 			      "0x%02x CQN = %u SQN = %u wqe_counter = %u "
763 			      "wq_ci = %u cq_ci = %u", err_cqe->syndrome,
764 			      txq->cqe_s, txq->qp_num_8s >> 8,
765 			      rte_be_to_cpu_16(err_cqe->wqe_counter),
766 			      txq->wqe_ci, txq->cq_ci);
767 			MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u",
768 			      PORT_ID(txq_ctrl->priv), txq->idx,
769 			      txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc());
770 			mlx5_dump_debug_information(name, NULL, err_str, 0);
771 			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
772 						    (const void *)((uintptr_t)
773 						    txq->cqes),
774 						    sizeof(*err_cqe) *
775 						    (1 << txq->cqe_n));
776 			mlx5_dump_debug_information(name, "MLX5 Error SQ:",
777 						    (const void *)((uintptr_t)
778 						    txq->wqes),
779 						    MLX5_WQE_SIZE *
780 						    (1 << txq->wqe_n));
781 			txq_ctrl->dump_file_n++;
782 		}
783 		if (!seen)
784 			/*
785 			 * Count errors in WQEs units.
786 			 * Later it can be improved to count error packets,
787 			 * for example, by SQ parsing to find how many packets
788 			 * should be counted for each WQE.
789 			 */
790 			txq->stats.oerrors += ((txq->wqe_ci & wqe_m) -
791 						new_wqe_pi) & wqe_m;
792 		if (tx_recover_qp(txq_ctrl)) {
793 			/* Recovering failed - retry later on the same WQE. */
794 			return -1;
795 		}
796 		/* Release all the remaining buffers. */
797 		txq_free_elts(txq_ctrl);
798 	}
799 	return 0;
800 }
801 
802 /**
803  * Translate RX completion flags to packet type.
804  *
805  * @param[in] rxq
806  *   Pointer to RX queue structure.
807  * @param[in] cqe
808  *   Pointer to CQE.
809  *
810  * @note: fix mlx5_dev_supported_ptypes_get() if anything changes here.
811  *
812  * @return
813  *   Packet type for struct rte_mbuf.
814  */
815 static inline uint32_t
816 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
817 {
818 	uint8_t idx;
819 	uint8_t pinfo = cqe->pkt_info;
820 	uint16_t ptype = cqe->hdr_type_etc;
821 
822 	/*
823 	 * The index to the array should have:
824 	 * bit[1:0] = l3_hdr_type
825 	 * bit[4:2] = l4_hdr_type
826 	 * bit[5] = ip_frag
827 	 * bit[6] = tunneled
828 	 * bit[7] = outer_l3_type
829 	 */
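	/*
	 * cqe->pkt_info[1:0] supplies idx[7:6] (tunneled/outer_l3_type) and
	 * cqe->hdr_type_etc[15:10] supplies idx[5:0] (L3/L4 header type and
	 * ip_frag), see the computation below.
	 */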
830 	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
831 	return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
832 }
833 
834 /**
835  * Initialize Rx WQ and indexes.
836  *
837  * @param[in] rxq
838  *   Pointer to RX queue structure.
839  */
840 void
841 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
842 {
843 	const unsigned int wqe_n = 1 << rxq->elts_n;
844 	unsigned int i;
845 
846 	for (i = 0; (i != wqe_n); ++i) {
847 		volatile struct mlx5_wqe_data_seg *scat;
848 		uintptr_t addr;
849 		uint32_t byte_count;
850 
851 		if (mlx5_rxq_mprq_enabled(rxq)) {
852 			struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i];
853 
854 			scat = &((volatile struct mlx5_wqe_mprq *)
855 				rxq->wqes)[i].dseg;
856 			addr = (uintptr_t)mlx5_mprq_buf_addr(buf,
857 							 1 << rxq->strd_num_n);
858 			byte_count = (1 << rxq->strd_sz_n) *
859 					(1 << rxq->strd_num_n);
860 		} else {
861 			struct rte_mbuf *buf = (*rxq->elts)[i];
862 
863 			scat = &((volatile struct mlx5_wqe_data_seg *)
864 					rxq->wqes)[i];
865 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
866 			byte_count = DATA_LEN(buf);
867 		}
868 		/* scat->addr must be able to store a pointer. */
869 		MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t));
870 		*scat = (struct mlx5_wqe_data_seg){
871 			.addr = rte_cpu_to_be_64(addr),
872 			.byte_count = rte_cpu_to_be_32(byte_count),
873 			.lkey = mlx5_rx_addr2mr(rxq, addr),
874 		};
875 	}
876 	rxq->consumed_strd = 0;
877 	rxq->decompressed = 0;
878 	rxq->rq_pi = 0;
879 	rxq->zip = (struct rxq_zip){
880 		.ai = 0,
881 	};
882 	rxq->elts_ci = mlx5_rxq_mprq_enabled(rxq) ?
883 		(wqe_n >> rxq->sges_n) * (1 << rxq->strd_num_n) : 0;
884 	/* Update doorbell counter. */
885 	rxq->rq_ci = wqe_n >> rxq->sges_n;
886 	rte_io_wmb();
887 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
888 }
889 
890 /**
891  * Modify a Verbs/DevX queue state.
892  * This must be called from the primary process.
893  *
894  * @param dev
895  *   Pointer to Ethernet device.
896  * @param sm
897  *   State modify request parameters.
898  *
899  * @return
900  *   0 in case of success else non-zero value and rte_errno is set.
901  */
902 int
903 mlx5_queue_state_modify_primary(struct rte_eth_dev *dev,
904 			const struct mlx5_mp_arg_queue_state_modify *sm)
905 {
906 	int ret;
907 	struct mlx5_priv *priv = dev->data->dev_private;
908 
909 	if (sm->is_wq) {
910 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id];
911 		struct mlx5_rxq_ctrl *rxq_ctrl =
912 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
913 
914 		ret = priv->obj_ops.rxq_obj_modify(rxq_ctrl->obj, sm->state);
915 		if (ret) {
916 			DRV_LOG(ERR, "Cannot change Rx WQ state to %u  - %s",
917 					sm->state, strerror(errno));
918 			rte_errno = errno;
919 			return ret;
920 		}
921 	} else {
922 		struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id];
923 		struct mlx5_txq_ctrl *txq_ctrl =
924 			container_of(txq, struct mlx5_txq_ctrl, txq);
925 
926 		ret = priv->obj_ops.txq_obj_modify(txq_ctrl->obj,
927 						   MLX5_TXQ_MOD_ERR2RDY,
928 						   (uint8_t)priv->dev_port);
929 		if (ret)
930 			return ret;
931 	}
932 	return 0;
933 }
934 
935 /**
936  * Modify a Verbs queue state.
937  *
938  * @param dev
939  *   Pointer to Ethernet device.
940  * @param sm
941  *   State modify request parameters.
942  *
943  * @return
944  *   0 in case of success else non-zero value.
945  */
946 static int
947 mlx5_queue_state_modify(struct rte_eth_dev *dev,
948 			struct mlx5_mp_arg_queue_state_modify *sm)
949 {
950 	struct mlx5_priv *priv = dev->data->dev_private;
951 	int ret = 0;
952 
953 	switch (rte_eal_process_type()) {
954 	case RTE_PROC_PRIMARY:
955 		ret = mlx5_queue_state_modify_primary(dev, sm);
956 		break;
957 	case RTE_PROC_SECONDARY:
958 		ret = mlx5_mp_req_queue_state_modify(&priv->mp_id, sm);
959 		break;
960 	default:
961 		break;
962 	}
963 	return ret;
964 }
965 
966 /**
967  * Handle an Rx error.
968  * The function moves the RQ to the RESET state when the first error CQE is
969  * seen, then lets the caller's burst loop drain the CQ. Once the CQ is empty,
970  * it moves the RQ back to the READY state and re-initializes it.
971  * Identifying the next CQE and counting errors remain the caller's responsibility.
972  *
973  * @param[in] rxq
974  *   Pointer to RX queue structure.
975  * @param[in] vec
976  *   1 when called from vectorized Rx burst, need to prepare mbufs for the RQ.
977  *   0 when called from non-vectorized Rx burst.
978  *
979  * @return
980  *   -1 in case of recovery error, otherwise the CQE status.
981  */
982 int
983 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
984 {
985 	const uint16_t cqe_n = 1 << rxq->cqe_n;
986 	const uint16_t cqe_mask = cqe_n - 1;
987 	const uint16_t wqe_n = 1 << rxq->elts_n;
988 	const uint16_t strd_n = 1 << rxq->strd_num_n;
989 	struct mlx5_rxq_ctrl *rxq_ctrl =
990 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
991 	union {
992 		volatile struct mlx5_cqe *cqe;
993 		volatile struct mlx5_err_cqe *err_cqe;
994 	} u = {
995 		.cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask],
996 	};
997 	struct mlx5_mp_arg_queue_state_modify sm;
998 	int ret;
999 
1000 	switch (rxq->err_state) {
1001 	case MLX5_RXQ_ERR_STATE_NO_ERROR:
1002 		rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET;
1003 		/* Fall-through */
1004 	case MLX5_RXQ_ERR_STATE_NEED_RESET:
1005 		sm.is_wq = 1;
1006 		sm.queue_id = rxq->idx;
1007 		sm.state = IBV_WQS_RESET;
1008 		if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm))
1009 			return -1;
1010 		if (rxq_ctrl->dump_file_n <
1011 		    rxq_ctrl->priv->config.max_dump_files_num) {
1012 			MKSTR(err_str, "Unexpected CQE error syndrome "
1013 			      "0x%02x CQN = %u RQN = %u wqe_counter = %u"
1014 			      " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome,
1015 			      rxq->cqn, rxq_ctrl->wqn,
1016 			      rte_be_to_cpu_16(u.err_cqe->wqe_counter),
1017 			      rxq->rq_ci << rxq->sges_n, rxq->cq_ci);
1018 			MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u",
1019 			      rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc());
1020 			mlx5_dump_debug_information(name, NULL, err_str, 0);
1021 			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
1022 						    (const void *)((uintptr_t)
1023 								    rxq->cqes),
1024 						    sizeof(*u.cqe) * cqe_n);
1025 			mlx5_dump_debug_information(name, "MLX5 Error RQ:",
1026 						    (const void *)((uintptr_t)
1027 								    rxq->wqes),
1028 						    16 * wqe_n);
1029 			rxq_ctrl->dump_file_n++;
1030 		}
1031 		rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY;
1032 		/* Fall-through */
1033 	case MLX5_RXQ_ERR_STATE_NEED_READY:
1034 		ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
1035 		if (ret == MLX5_CQE_STATUS_HW_OWN) {
1036 			rte_io_wmb();
1037 			*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1038 			rte_io_wmb();
1039 			/*
1040 			 * The RQ consumer index must be zeroed while moving
1041 			 * from RESET state to RDY state.
1042 			 */
1043 			*rxq->rq_db = rte_cpu_to_be_32(0);
1044 			rte_io_wmb();
1045 			sm.is_wq = 1;
1046 			sm.queue_id = rxq->idx;
1047 			sm.state = IBV_WQS_RDY;
1048 			if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv),
1049 						    &sm))
1050 				return -1;
1051 			if (vec) {
1052 				const uint32_t elts_n =
1053 					mlx5_rxq_mprq_enabled(rxq) ?
1054 					wqe_n * strd_n : wqe_n;
1055 				const uint32_t e_mask = elts_n - 1;
1056 				uint32_t elts_ci =
1057 					mlx5_rxq_mprq_enabled(rxq) ?
1058 					rxq->elts_ci : rxq->rq_ci;
1059 				uint32_t elt_idx;
1060 				struct rte_mbuf **elt;
1061 				int i;
1062 				unsigned int n = elts_n - (elts_ci -
1063 							  rxq->rq_pi);
1064 
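				/*
				 * Replenish only the element slots that are
				 * not still outstanding, i.e. elts_n minus
				 * the (elts_ci - rq_pi) entries in flight.
				 */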
1065 				for (i = 0; i < (int)n; ++i) {
1066 					elt_idx = (elts_ci + i) & e_mask;
1067 					elt = &(*rxq->elts)[elt_idx];
1068 					*elt = rte_mbuf_raw_alloc(rxq->mp);
1069 					if (!*elt) {
1070 						for (i--; i >= 0; --i) {
1071 							elt_idx = (elts_ci +
1072 								   i) & e_mask;
1073 							elt = &(*rxq->elts)
1074 								[elt_idx];
1075 							rte_pktmbuf_free_seg
1076 								(*elt);
1077 						}
1078 						return -1;
1079 					}
1080 				}
1081 				for (i = 0; i < (int)elts_n; ++i) {
1082 					elt = &(*rxq->elts)[i];
1083 					DATA_LEN(*elt) =
1084 						(uint16_t)((*elt)->buf_len -
1085 						rte_pktmbuf_headroom(*elt));
1086 				}
1087 				/* Padding with a fake mbuf for vec Rx. */
1088 				for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
1089 					(*rxq->elts)[elts_n + i] =
1090 								&rxq->fake_mbuf;
1091 			}
1092 			mlx5_rxq_initialize(rxq);
1093 			rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
1094 		}
1095 		return ret;
1096 	default:
1097 		return -1;
1098 	}
1099 }
1100 
1101 /**
1102  * Get size of the next packet for a given CQE. For compressed CQEs, the
1103  * consumer index is updated only once all the packets of the current
1104  * compressed session have been processed.
1105  *
1106  * @param rxq
1107  *   Pointer to RX queue.
1108  * @param cqe
1109  *   CQE to process.
1110  * @param[out] mcqe
1111  *   Store pointer to mini-CQE if compressed. Otherwise, the pointer is not
1112  *   written.
1113  *
1114  * @return
1115  *   0 in case of empty CQE, otherwise the packet size in bytes.
1116  */
1117 static inline int
1118 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
1119 		 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe)
1120 {
1121 	struct rxq_zip *zip = &rxq->zip;
1122 	uint16_t cqe_n = cqe_cnt + 1;
1123 	int len;
1124 	uint16_t idx, end;
1125 
1126 	do {
1127 		len = 0;
1128 		/* Process compressed data in the CQE and mini arrays. */
1129 		if (zip->ai) {
1130 			volatile struct mlx5_mini_cqe8 (*mc)[8] =
1131 				(volatile struct mlx5_mini_cqe8 (*)[8])
1132 				(uintptr_t)(&(*rxq->cqes)[zip->ca &
1133 							  cqe_cnt].pkt_info);
1134 
1135 			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
1136 			*mcqe = &(*mc)[zip->ai & 7];
1137 			if ((++zip->ai & 7) == 0) {
1138 				/* Invalidate consumed CQEs */
1139 				idx = zip->ca;
1140 				end = zip->na;
1141 				while (idx != end) {
1142 					(*rxq->cqes)[idx & cqe_cnt].op_own =
1143 						MLX5_CQE_INVALIDATE;
1144 					++idx;
1145 				}
1146 				/*
1147 				 * Increment consumer index to skip the number
1148 				 * of CQEs consumed. Hardware leaves holes in
1149 				 * the CQ ring for software use.
1150 				 */
1151 				zip->ca = zip->na;
1152 				zip->na += 8;
1153 			}
1154 			if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
1155 				/* Invalidate the rest */
1156 				idx = zip->ca;
1157 				end = zip->cq_ci;
1158 
1159 				while (idx != end) {
1160 					(*rxq->cqes)[idx & cqe_cnt].op_own =
1161 						MLX5_CQE_INVALIDATE;
1162 					++idx;
1163 				}
1164 				rxq->cq_ci = zip->cq_ci;
1165 				zip->ai = 0;
1166 			}
1167 		/*
1168 		 * No compressed data, get next CQE and verify if it is
1169 		 * compressed.
1170 		 */
1171 		} else {
1172 			int ret;
1173 			int8_t op_own;
1174 
1175 			ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
1176 			if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
1177 				if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
1178 					     rxq->err_state)) {
1179 					ret = mlx5_rx_err_handle(rxq, 0);
1180 					if (ret == MLX5_CQE_STATUS_HW_OWN ||
1181 					    ret == -1)
1182 						return 0;
1183 				} else {
1184 					return 0;
1185 				}
1186 			}
1187 			++rxq->cq_ci;
1188 			op_own = cqe->op_own;
1189 			if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
1190 				volatile struct mlx5_mini_cqe8 (*mc)[8] =
1191 					(volatile struct mlx5_mini_cqe8 (*)[8])
1192 					(uintptr_t)(&(*rxq->cqes)
1193 						[rxq->cq_ci &
1194 						 cqe_cnt].pkt_info);
1195 
1196 				/* Fix endianness. */
1197 				zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
1198 				/*
1199 				 * Current mini array position is the one
1200 				 * returned by check_cqe().
1201 				 *
1202 				 * If completion comprises several mini arrays,
1203 				 * as a special case the second one is located
1204 				 * 7 CQEs after the initial CQE instead of 8
1205 				 * for subsequent ones.
1206 				 */
1207 				zip->ca = rxq->cq_ci;
1208 				zip->na = zip->ca + 7;
1209 				/* Compute the next non compressed CQE. */
1210 				--rxq->cq_ci;
1211 				zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
1212 				/* Get packet size to return. */
1213 				len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
1214 				*mcqe = &(*mc)[0];
1215 				zip->ai = 1;
1216 				/* Prefetch all to be invalidated */
1217 				idx = zip->ca;
1218 				end = zip->cq_ci;
1219 				while (idx != end) {
1220 					rte_prefetch0(&(*rxq->cqes)[(idx) &
1221 								    cqe_cnt]);
1222 					++idx;
1223 				}
1224 			} else {
1225 				len = rte_be_to_cpu_32(cqe->byte_cnt);
1226 			}
1227 		}
1228 		if (unlikely(rxq->err_state)) {
1229 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1230 			++rxq->stats.idropped;
1231 		} else {
1232 			return len;
1233 		}
1234 	} while (1);
1235 }
1236 
1237 /**
1238  * Translate RX completion flags to offload flags.
1239  *
1240  * @param[in] cqe
1241  *   Pointer to CQE.
1242  *
1243  * @return
1244  *   Offload flags (ol_flags) for struct rte_mbuf.
1245  */
1246 static inline uint32_t
1247 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
1248 {
1249 	uint32_t ol_flags = 0;
1250 	uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
1251 
1252 	ol_flags =
1253 		TRANSPOSE(flags,
1254 			  MLX5_CQE_RX_L3_HDR_VALID,
1255 			  PKT_RX_IP_CKSUM_GOOD) |
1256 		TRANSPOSE(flags,
1257 			  MLX5_CQE_RX_L4_HDR_VALID,
1258 			  PKT_RX_L4_CKSUM_GOOD);
1259 	return ol_flags;
1260 }
1261 
1262 /**
1263  * Fill in mbuf fields from RX completion flags.
1264  * Note that pkt->ol_flags should be initialized outside of this function.
1265  *
1266  * @param rxq
1267  *   Pointer to RX queue.
1268  * @param pkt
1269  *   mbuf to fill.
1270  * @param cqe
1271  *   CQE to process.
1272  * @param rss_hash_res
1273  *   Packet RSS Hash result.
1274  */
1275 static inline void
1276 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
1277 	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res)
1278 {
1279 	/* Update packet information. */
1280 	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe);
1281 	if (rss_hash_res && rxq->rss_hash) {
1282 		pkt->hash.rss = rss_hash_res;
1283 		pkt->ol_flags |= PKT_RX_RSS_HASH;
1284 	}
1285 	if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
1286 		pkt->ol_flags |= PKT_RX_FDIR;
1287 		if (cqe->sop_drop_qpn !=
1288 		    rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
1289 			uint32_t mark = cqe->sop_drop_qpn;
1290 
1291 			pkt->ol_flags |= PKT_RX_FDIR_ID;
1292 			pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
1293 		}
1294 	}
1295 	if (rxq->dynf_meta && cqe->flow_table_metadata) {
1296 		pkt->ol_flags |= rxq->flow_meta_mask;
1297 		*RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset, uint32_t *) =
1298 			cqe->flow_table_metadata;
1299 	}
1300 	if (rxq->csum)
1301 		pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
1302 	if (rxq->vlan_strip &&
1303 	    (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
1304 		pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
1305 		pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
1306 	}
1307 	if (rxq->hw_timestamp) {
1308 		uint64_t ts = rte_be_to_cpu_64(cqe->timestamp);
1309 
1310 		if (rxq->rt_timestamp)
1311 			ts = mlx5_txpp_convert_rx_ts(rxq->sh, ts);
1312 		mlx5_timestamp_set(pkt, rxq->timestamp_offset, ts);
1313 		pkt->ol_flags |= rxq->timestamp_rx_flag;
1314 	}
1315 }
1316 
1317 /**
1318  * DPDK callback for RX.
1319  *
1320  * @param dpdk_rxq
1321  *   Generic pointer to RX queue structure.
1322  * @param[out] pkts
1323  *   Array to store received packets.
1324  * @param pkts_n
1325  *   Maximum number of packets in array.
1326  *
1327  * @return
1328  *   Number of packets successfully received (<= pkts_n).
1329  */
1330 uint16_t
1331 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1332 {
1333 	struct mlx5_rxq_data *rxq = dpdk_rxq;
1334 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
1335 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
1336 	const unsigned int sges_n = rxq->sges_n;
1337 	struct rte_mbuf *pkt = NULL;
1338 	struct rte_mbuf *seg = NULL;
1339 	volatile struct mlx5_cqe *cqe =
1340 		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1341 	unsigned int i = 0;
1342 	unsigned int rq_ci = rxq->rq_ci << sges_n;
1343 	int len = 0; /* keep its value across iterations. */
1344 
1345 	while (pkts_n) {
1346 		unsigned int idx = rq_ci & wqe_cnt;
1347 		volatile struct mlx5_wqe_data_seg *wqe =
1348 			&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
1349 		struct rte_mbuf *rep = (*rxq->elts)[idx];
1350 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
1351 		uint32_t rss_hash_res;
1352 
1353 		if (pkt)
1354 			NEXT(seg) = rep;
1355 		seg = rep;
1356 		rte_prefetch0(seg);
1357 		rte_prefetch0(cqe);
1358 		rte_prefetch0(wqe);
1359 		/* Allocate the buf from the same pool. */
1360 		rep = rte_mbuf_raw_alloc(seg->pool);
1361 		if (unlikely(rep == NULL)) {
1362 			++rxq->stats.rx_nombuf;
1363 			if (!pkt) {
1364 				/*
1365 				 * no buffers before we even started,
1366 				 * bail out silently.
1367 				 */
1368 				break;
1369 			}
1370 			while (pkt != seg) {
1371 				MLX5_ASSERT(pkt != (*rxq->elts)[idx]);
1372 				rep = NEXT(pkt);
1373 				NEXT(pkt) = NULL;
1374 				NB_SEGS(pkt) = 1;
1375 				rte_mbuf_raw_free(pkt);
1376 				pkt = rep;
1377 			}
1378 			break;
1379 		}
1380 		if (!pkt) {
1381 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1382 			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe);
1383 			if (!len) {
1384 				rte_mbuf_raw_free(rep);
1385 				break;
1386 			}
1387 			pkt = seg;
1388 			MLX5_ASSERT(len >= (rxq->crc_present << 2));
1389 			pkt->ol_flags &= EXT_ATTACHED_MBUF;
1390 			/* If compressed, take hash result from mini-CQE. */
1391 			rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
1392 							cqe->rx_hash_res :
1393 							mcqe->rx_hash_result);
1394 			rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
1395 			if (rxq->crc_present)
1396 				len -= RTE_ETHER_CRC_LEN;
1397 			PKT_LEN(pkt) = len;
1398 			if (cqe->lro_num_seg > 1) {
1399 				mlx5_lro_update_hdr
1400 					(rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
1401 					 len);
1402 				pkt->ol_flags |= PKT_RX_LRO;
1403 				pkt->tso_segsz = len / cqe->lro_num_seg;
1404 			}
1405 		}
1406 		DATA_LEN(rep) = DATA_LEN(seg);
1407 		PKT_LEN(rep) = PKT_LEN(seg);
1408 		SET_DATA_OFF(rep, DATA_OFF(seg));
1409 		PORT(rep) = PORT(seg);
1410 		(*rxq->elts)[idx] = rep;
1411 		/*
1412 		 * Fill NIC descriptor with the new buffer.  The lkey and size
1413 		 * of the buffers are already known, only the buffer address
1414 		 * changes.
1415 		 */
1416 		wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
1417 		/* If there's only one MR, no need to replace LKey in WQE. */
1418 		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
1419 			wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
1420 		if (len > DATA_LEN(seg)) {
1421 			len -= DATA_LEN(seg);
1422 			++NB_SEGS(pkt);
1423 			++rq_ci;
1424 			continue;
1425 		}
1426 		DATA_LEN(seg) = len;
1427 #ifdef MLX5_PMD_SOFT_COUNTERS
1428 		/* Increment bytes counter. */
1429 		rxq->stats.ibytes += PKT_LEN(pkt);
1430 #endif
1431 		/* Return packet. */
1432 		*(pkts++) = pkt;
1433 		pkt = NULL;
1434 		--pkts_n;
1435 		++i;
1436 		/* Align consumer index to the next stride. */
1437 		rq_ci >>= sges_n;
1438 		++rq_ci;
1439 		rq_ci <<= sges_n;
1440 	}
1441 	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
1442 		return 0;
1443 	/* Update the consumer index. */
1444 	rxq->rq_ci = rq_ci >> sges_n;
1445 	rte_io_wmb();
1446 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1447 	rte_io_wmb();
1448 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1449 #ifdef MLX5_PMD_SOFT_COUNTERS
1450 	/* Increment packets counter. */
1451 	rxq->stats.ipackets += i;
1452 #endif
1453 	return i;
1454 }
1455 
1456 /**
1457  * Update LRO packet TCP header.
1458  * The HW LRO feature doesn't update the TCP header after coalescing the
1459  * TCP segments but supplies information in the CQE for SW to fill it in.
1460  *
1461  * @param tcp
1462  *   Pointer to the TCP header.
1463  * @param cqe
1464  *   Pointer to the completion entry.
1465  * @param phcsum
1466  *   The L3 pseudo-header checksum.
1467  */
1468 static inline void
1469 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
1470 			volatile struct mlx5_cqe *__rte_restrict cqe,
1471 			uint32_t phcsum)
1472 {
1473 	uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
1474 			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
1475 	/*
1476 	 * The HW calculates only the TCP payload checksum, need to complete
1477 	 * the TCP header checksum and the L3 pseudo-header checksum.
1478 	 */
1479 	uint32_t csum = phcsum + cqe->csum;
1480 
1481 	if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK ||
1482 	    l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) {
1483 		tcp->tcp_flags |= RTE_TCP_ACK_FLAG;
1484 		tcp->recv_ack = cqe->lro_ack_seq_num;
1485 		tcp->rx_win = cqe->lro_tcp_win;
1486 	}
1487 	if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
1488 		tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
1489 	tcp->cksum = 0;
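	/*
	 * Add the TCP header bytes to the running sum, fold it into 16 bits
	 * and take the one's complement; 0x0000 and 0xffff are equivalent in
	 * one's-complement arithmetic, the non-zero form is used on the wire.
	 */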
1490 	csum += rte_raw_cksum(tcp, (tcp->data_off >> 4) * 4);
1491 	csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
1492 	csum = (~csum) & 0xffff;
1493 	if (csum == 0)
1494 		csum = 0xffff;
1495 	tcp->cksum = csum;
1496 }
1497 
1498 /**
1499  * Update LRO packet headers.
1500  * The HW LRO feature doesn't update the L3/TCP headers after coalescing the
1501  * TCP segments but supplies information in the CQE for SW to fill them in.
1502  *
1503  * @param padd
1504  *   The packet address.
1505  * @param cqe
1506  *   Pointer to the completion entry.
1507  * @param len
1508  *   The packet length.
1509  */
1510 static inline void
1511 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
1512 		    volatile struct mlx5_cqe *__rte_restrict cqe,
1513 		    uint32_t len)
1514 {
1515 	union {
1516 		struct rte_ether_hdr *eth;
1517 		struct rte_vlan_hdr *vlan;
1518 		struct rte_ipv4_hdr *ipv4;
1519 		struct rte_ipv6_hdr *ipv6;
1520 		struct rte_tcp_hdr *tcp;
1521 		uint8_t *hdr;
1522 	} h = {
1523 			.hdr = padd,
1524 	};
1525 	uint16_t proto = h.eth->ether_type;
1526 	uint32_t phcsum;
1527 
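	/* Skip the Ethernet header and any stacked VLAN/QinQ tags to reach L3. */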
1528 	h.eth++;
1529 	while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
1530 	       proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) {
1531 		proto = h.vlan->eth_proto;
1532 		h.vlan++;
1533 	}
1534 	if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) {
1535 		h.ipv4->time_to_live = cqe->lro_min_ttl;
1536 		h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd));
1537 		h.ipv4->hdr_checksum = 0;
1538 		h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4);
1539 		phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0);
1540 		h.ipv4++;
1541 	} else {
1542 		h.ipv6->hop_limits = cqe->lro_min_ttl;
1543 		h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) -
1544 						       sizeof(*h.ipv6));
1545 		phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
1546 		h.ipv6++;
1547 	}
1548 	mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum);
1549 }
1550 
1551 void
1552 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
1553 {
1554 	struct mlx5_mprq_buf *buf = opaque;
1555 
1556 	if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) {
1557 		rte_mempool_put(buf->mp, buf);
1558 	} else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1,
1559 					       __ATOMIC_RELAXED) == 0)) {
1560 		__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
1561 		rte_mempool_put(buf->mp, buf);
1562 	}
1563 }
1564 
1565 void
1566 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
1567 {
1568 	mlx5_mprq_buf_free_cb(NULL, buf);
1569 }
1570 
1571 /**
1572  * DPDK callback for RX with Multi-Packet RQ support.
1573  *
1574  * @param dpdk_rxq
1575  *   Generic pointer to RX queue structure.
1576  * @param[out] pkts
1577  *   Array to store received packets.
1578  * @param pkts_n
1579  *   Maximum number of packets in array.
1580  *
1581  * @return
1582  *   Number of packets successfully received (<= pkts_n).
1583  */
1584 uint16_t
1585 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1586 {
1587 	struct mlx5_rxq_data *rxq = dpdk_rxq;
1588 	const uint32_t strd_n = 1 << rxq->strd_num_n;
1589 	const uint32_t cq_mask = (1 << rxq->cqe_n) - 1;
1590 	const uint32_t wq_mask = (1 << rxq->elts_n) - 1;
1591 	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
1592 	unsigned int i = 0;
1593 	uint32_t rq_ci = rxq->rq_ci;
1594 	uint16_t consumed_strd = rxq->consumed_strd;
1595 	struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
1596 
1597 	while (i < pkts_n) {
1598 		struct rte_mbuf *pkt;
1599 		int ret;
1600 		uint32_t len;
1601 		uint16_t strd_cnt;
1602 		uint16_t strd_idx;
1603 		uint32_t byte_cnt;
1604 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
1605 		uint32_t rss_hash_res = 0;
1606 		enum mlx5_rqx_code rxq_code;
1607 
1608 		if (consumed_strd == strd_n) {
1609 			/* Replace WQE if the buffer is still in use. */
1610 			mprq_buf_replace(rxq, rq_ci & wq_mask);
1611 			/* Advance to the next WQE. */
1612 			consumed_strd = 0;
1613 			++rq_ci;
1614 			buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
1615 		}
1616 		cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
1617 		ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe);
1618 		if (!ret)
1619 			break;
1620 		byte_cnt = ret;
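		/*
		 * The CQE byte count packs the number of consumed strides,
		 * the filler indication and the packet length; extract the
		 * stride count first.
		 */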
1621 		strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
1622 			   MLX5_MPRQ_STRIDE_NUM_SHIFT;
1623 		MLX5_ASSERT(strd_cnt);
1624 		consumed_strd += strd_cnt;
1625 		if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
1626 			continue;
1627 		if (mcqe == NULL) {
1628 			rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
1629 			strd_idx = rte_be_to_cpu_16(cqe->wqe_counter);
1630 		} else {
1631 			/* mini-CQE for MPRQ doesn't have hash result. */
1632 			strd_idx = rte_be_to_cpu_16(mcqe->stride_idx);
1633 		}
1634 		MLX5_ASSERT(strd_idx < strd_n);
1635 		MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) &
1636 			    wq_mask));
1637 		pkt = rte_pktmbuf_alloc(rxq->mp);
1638 		if (unlikely(pkt == NULL)) {
1639 			++rxq->stats.rx_nombuf;
1640 			break;
1641 		}
1642 		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
1643 		MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
1644 		if (rxq->crc_present)
1645 			len -= RTE_ETHER_CRC_LEN;
1646 		rxq_code = mprq_buf_to_pkt(rxq, pkt, len, buf,
1647 					   strd_idx, strd_cnt);
1648 		if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) {
1649 			rte_pktmbuf_free_seg(pkt);
1650 			if (rxq_code == MLX5_RXQ_CODE_DROPPED) {
1651 				++rxq->stats.idropped;
1652 				continue;
1653 			}
1654 			if (rxq_code == MLX5_RXQ_CODE_NOMBUF) {
1655 				++rxq->stats.rx_nombuf;
1656 				break;
1657 			}
1658 		}
1659 		rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
1660 		if (cqe->lro_num_seg > 1) {
1661 			mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *),
1662 					    cqe, len);
1663 			pkt->ol_flags |= PKT_RX_LRO;
1664 			pkt->tso_segsz = len / cqe->lro_num_seg;
1665 		}
1666 		PKT_LEN(pkt) = len;
1667 		PORT(pkt) = rxq->port_id;
1668 #ifdef MLX5_PMD_SOFT_COUNTERS
1669 		/* Increment bytes counter. */
1670 		rxq->stats.ibytes += PKT_LEN(pkt);
1671 #endif
1672 		/* Return packet. */
1673 		*(pkts++) = pkt;
1674 		++i;
1675 	}
1676 	/* Update the consumer indexes. */
1677 	rxq->consumed_strd = consumed_strd;
1678 	rte_io_wmb();
1679 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1680 	if (rq_ci != rxq->rq_ci) {
1681 		rxq->rq_ci = rq_ci;
1682 		rte_io_wmb();
1683 		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1684 	}
1685 #ifdef MLX5_PMD_SOFT_COUNTERS
1686 	/* Increment packets counter. */
1687 	rxq->stats.ipackets += i;
1688 #endif
1689 	return i;
1690 }
1691 
1692 /**
1693  * Dummy DPDK callback for TX.
1694  *
1695  * This function is used to temporarily replace the real callback during
1696  * unsafe control operations on the queue, or in case of error.
1697  *
1698  * @param dpdk_txq
1699  *   Generic pointer to TX queue structure.
1700  * @param[in] pkts
1701  *   Packets to transmit.
1702  * @param pkts_n
1703  *   Number of packets in array.
1704  *
1705  * @return
1706  *   Number of packets successfully transmitted (<= pkts_n).
1707  */
1708 uint16_t
1709 removed_tx_burst(void *dpdk_txq __rte_unused,
1710 		 struct rte_mbuf **pkts __rte_unused,
1711 		 uint16_t pkts_n __rte_unused)
1712 {
1713 	rte_mb();
1714 	return 0;
1715 }
1716 
1717 /**
1718  * Dummy DPDK callback for RX.
1719  *
1720  * This function is used to temporarily replace the real callback during
1721  * unsafe control operations on the queue, or in case of error.
1722  *
1723  * @param dpdk_rxq
1724  *   Generic pointer to RX queue structure.
1725  * @param[out] pkts
1726  *   Array to store received packets.
1727  * @param pkts_n
1728  *   Maximum number of packets in array.
1729  *
1730  * @return
1731  *   Number of packets successfully received (<= pkts_n).
1732  */
1733 uint16_t
1734 removed_rx_burst(void *dpdk_rxq __rte_unused,
1735 		 struct rte_mbuf **pkts __rte_unused,
1736 		 uint16_t pkts_n __rte_unused)
1737 {
1738 	rte_mb();
1739 	return 0;
1740 }
1741 
1742 /*
1743  * Vectorized Rx/Tx routines are not compiled in when the required vector
1744  * instructions are not supported on a target architecture. The following null
1745  * stubs are needed for linkage when those routines are not built from the
1746  * architecture-specific sources (e.g. mlx5_rxtx_vec_sse.c for x86).
1747  */
1748 
1749 __rte_weak uint16_t
1750 mlx5_rx_burst_vec(void *dpdk_rxq __rte_unused,
1751 		  struct rte_mbuf **pkts __rte_unused,
1752 		  uint16_t pkts_n __rte_unused)
1753 {
1754 	return 0;
1755 }
1756 
1757 __rte_weak uint16_t
1758 mlx5_rx_burst_mprq_vec(void *dpdk_rxq __rte_unused,
1759 		       struct rte_mbuf **pkts __rte_unused,
1760 		       uint16_t pkts_n __rte_unused)
1761 {
1762 	return 0;
1763 }
1764 
1765 __rte_weak int
1766 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
1767 {
1768 	return -ENOTSUP;
1769 }
1770 
1771 __rte_weak int
1772 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
1773 {
1774 	return -ENOTSUP;
1775 }
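
/*
 * Illustrative sketch (not actual driver code) of how the weak stubs above
 * are overridden: a strong definition compiled from an architecture-specific
 * file simply takes precedence at link time, so no conditional registration
 * logic is needed here.
 *
 *	uint16_t
 *	mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts,
 *			  uint16_t pkts_n)
 *	{
 *		... vectorized receive path goes here ...
 *	}
 */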
1776 
1777 /**
1778  * Free the mbufs from the linear array of pointers.
1779  *
1780  * @param pkts
1781  *   Pointer to the array of packets to be freed.
1782  * @param pkts_n
1783  *   Number of packets to be freed.
1784  * @param olx
1785  *   Configured Tx offloads mask. It is fully defined at
1786  *   compile time and may be used for optimization.
1787  */
1788 static __rte_always_inline void
1789 mlx5_tx_free_mbuf(struct rte_mbuf **__rte_restrict pkts,
1790 		  unsigned int pkts_n,
1791 		  unsigned int olx __rte_unused)
1792 {
1793 	struct rte_mempool *pool = NULL;
1794 	struct rte_mbuf **p_free = NULL;
1795 	struct rte_mbuf *mbuf;
1796 	unsigned int n_free = 0;
1797 
1798 	/*
1799 	 * The implemented algorithm eliminates
1800 	 * copying pointers to a temporary array
1801 	 * for rte_mempool_put_bulk() calls.
1802 	 */
1803 	MLX5_ASSERT(pkts);
1804 	MLX5_ASSERT(pkts_n);
1805 	for (;;) {
1806 		for (;;) {
1807 			/*
1808 			 * Decrement mbuf reference counter, detach
1809 			 * indirect and external buffers if needed.
1810 			 */
1811 			mbuf = rte_pktmbuf_prefree_seg(*pkts);
1812 			if (likely(mbuf != NULL)) {
1813 				MLX5_ASSERT(mbuf == *pkts);
1814 				if (likely(n_free != 0)) {
1815 					if (unlikely(pool != mbuf->pool))
1816 						/* From different pool. */
1817 						break;
1818 				} else {
1819 					/* Start new scan array. */
1820 					pool = mbuf->pool;
1821 					p_free = pkts;
1822 				}
1823 				++n_free;
1824 				++pkts;
1825 				--pkts_n;
1826 				if (unlikely(pkts_n == 0)) {
1827 					mbuf = NULL;
1828 					break;
1829 				}
1830 			} else {
1831 				/*
1832 				 * This happens if mbuf is still referenced.
1833 				 * We can't put it back to the pool, skip.
1834 				 */
1835 				++pkts;
1836 				--pkts_n;
1837 				if (unlikely(n_free != 0))
1838 					/* There is some array to free.*/
1839 				/* There is some array to free. */
1840 				if (unlikely(pkts_n == 0))
1841 					/* Last mbuf, nothing to free. */
1842 					return;
1843 			}
1844 		}
1845 		for (;;) {
1846 			/*
1847 			 * This loop is implemented to avoid multiple
1848 			 * inlining of rte_mempool_put_bulk().
1849 			 */
1850 			MLX5_ASSERT(pool);
1851 			MLX5_ASSERT(p_free);
1852 			MLX5_ASSERT(n_free);
1853 			/*
1854 			 * Free the array of pre-freed mbufs
1855 			 * belonging to the same memory pool.
1856 			 */
1857 			rte_mempool_put_bulk(pool, (void *)p_free, n_free);
1858 			if (unlikely(mbuf != NULL)) {
1859 				/* There is the request to start new scan. */
1860 				/* There is a request to start a new scan. */
1861 				p_free = pkts++;
1862 				n_free = 1;
1863 				--pkts_n;
1864 				if (likely(pkts_n != 0))
1865 					break;
1866 				/*
1867 				 * This is the last mbuf to be freed.
1868 				 * Do one more loop iteration to complete.
1869 				 * This is a rare case of the last unique mbuf.
1870 				 */
1871 				mbuf = NULL;
1872 				continue;
1873 			}
1874 			if (likely(pkts_n == 0))
1875 				return;
1876 			n_free = 0;
1877 			break;
1878 		}
1879 	}
1880 }
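
/*
 * For reference, a simpler but slower equivalent of the routine above would
 * first collect the pre-freed mbufs into a temporary array and then return
 * them with one rte_mempool_put_bulk() call - exactly the pointer copy the
 * implemented algorithm avoids by reusing the pkts[] array in place and by
 * grouping runs of mbufs from the same pool. A minimal sketch, assuming all
 * mbufs come from a single mempool (the name free_mbufs_simple is made up):
 *
 *	static void
 *	free_mbufs_simple(struct rte_mbuf **pkts, unsigned int n)
 *	{
 *		void *tmp[n];
 *		unsigned int i, cnt = 0;
 *
 *		for (i = 0; i < n; i++) {
 *			struct rte_mbuf *m = rte_pktmbuf_prefree_seg(pkts[i]);
 *
 *			if (m != NULL)
 *				tmp[cnt++] = m;
 *		}
 *		if (cnt)
 *			rte_mempool_put_bulk(((struct rte_mbuf *)tmp[0])->pool,
 *					     tmp, cnt);
 *	}
 */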
1881 
1882 /**
1883  * Free the mbufs from the elts ring buffer up to the new tail.
1884  *
1885  * @param txq
1886  *   Pointer to Tx queue structure.
1887  * @param tail
1888  *   Index in elts to free up to, becomes new elts tail.
1889  * @param olx
1890  *   Configured Tx offloads mask. It is fully defined at
1891  *   compile time and may be used for optimization.
1892  */
1893 static __rte_always_inline void
1894 mlx5_tx_free_elts(struct mlx5_txq_data *__rte_restrict txq,
1895 		  uint16_t tail,
1896 		  unsigned int olx __rte_unused)
1897 {
1898 	uint16_t n_elts = tail - txq->elts_tail;
1899 
1900 	MLX5_ASSERT(n_elts);
1901 	MLX5_ASSERT(n_elts <= txq->elts_s);
1902 	/*
1903 	 * Implement a loop to support ring buffer wraparound
1904 	 * with single inlining of mlx5_tx_free_mbuf().
1905 	 */
1906 	do {
1907 		unsigned int part;
1908 
1909 		part = txq->elts_s - (txq->elts_tail & txq->elts_m);
1910 		part = RTE_MIN(part, n_elts);
1911 		MLX5_ASSERT(part);
1912 		MLX5_ASSERT(part <= txq->elts_s);
1913 		mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m],
1914 				  part, olx);
1915 		txq->elts_tail += part;
1916 		n_elts -= part;
1917 	} while (n_elts);
1918 }
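
/*
 * Worked example of the wraparound split above (hypothetical numbers): with
 * elts_s = 256 (so elts_m = 255), elts_tail = 250 and tail = 260, n_elts is
 * 10. The first iteration frees part = 256 - (250 & 255) = 6 mbufs from
 * elts[250..255], the second frees the remaining 4 from elts[0..3], and
 * elts_tail ends up at 260 (masking with elts_m is applied only on array
 * access, the tail index itself keeps growing).
 */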
1919 
1920 /**
1921  * Store the mbufs being sent into the elts ring buffer.
1922  * On Tx completion these mbufs will be freed.
1923  *
1924  * @param txq
1925  *   Pointer to Tx queue structure.
1926  * @param pkts
1927  *   Pointer to array of packets to be stored.
1928  * @param pkts_n
1929  *   Number of packets to be stored.
1930  * @param olx
1931  *   Configured Tx offloads mask. It is fully defined at
1932  *   compile time and may be used for optimization.
1933  */
1934 static __rte_always_inline void
1935 mlx5_tx_copy_elts(struct mlx5_txq_data *__rte_restrict txq,
1936 		  struct rte_mbuf **__rte_restrict pkts,
1937 		  unsigned int pkts_n,
1938 		  unsigned int olx __rte_unused)
1939 {
1940 	unsigned int part;
1941 	struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts;
1942 
1943 	MLX5_ASSERT(pkts);
1944 	MLX5_ASSERT(pkts_n);
1945 	part = txq->elts_s - (txq->elts_head & txq->elts_m);
1946 	MLX5_ASSERT(part);
1947 	MLX5_ASSERT(part <= txq->elts_s);
1948 	/* This code is a good candidate for vectorizing with SIMD. */
1949 	rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)),
1950 		   (void *)pkts,
1951 		   RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *));
1952 	txq->elts_head += pkts_n;
1953 	if (unlikely(part < pkts_n))
1954 		/* The copy is wrapping around the elts array. */
1955 		rte_memcpy((void *)elts, (void *)(pkts + part),
1956 			   (pkts_n - part) * sizeof(struct rte_mbuf *));
1957 }
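
/*
 * Worked example of the two-part copy above (hypothetical numbers): with
 * elts_s = 256, elts_head = 252 and pkts_n = 8, part = 4, so pkts[0..3] land
 * in elts[252..255], pkts[4..7] wrap around to elts[0..3], and elts_head
 * becomes 260.
 */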
1958 
1959 /**
1960  * Update completion queue consuming index via doorbell
1961  * and flush the completed data buffers.
1962  *
1963  * @param txq
1964  *   Pointer to TX queue structure.
1965  * @param last_cqe
1966  *   Pointer to the last valid CQE, if not NULL update txq->wqe_pi and flush the buffers.
1967  * @param olx
1968  *   Configured Tx offloads mask. It is fully defined at
1969  *   compile time and may be used for optimization.
1970  */
1971 static __rte_always_inline void
1972 mlx5_tx_comp_flush(struct mlx5_txq_data *__rte_restrict txq,
1973 		   volatile struct mlx5_cqe *last_cqe,
1974 		   unsigned int olx __rte_unused)
1975 {
1976 	if (likely(last_cqe != NULL)) {
1977 		uint16_t tail;
1978 
1979 		txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter);
1980 		tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m];
1981 		if (likely(tail != txq->elts_tail)) {
1982 			mlx5_tx_free_elts(txq, tail, olx);
1983 			MLX5_ASSERT(tail == txq->elts_tail);
1984 		}
1985 	}
1986 }
1987 
1988 /**
1989  * Manage TX completions. This routine checks the CQ for
1990  * arrived CQEs, deduces the last completed WQE in the SQ,
1991  * updates the SQ producer index and frees all completed mbufs.
1992  *
1993  * @param txq
1994  *   Pointer to TX queue structure.
1995  * @param olx
1996  *   Configured Tx offloads mask. It is fully defined at
1997  *   compile time and may be used for optimization.
1998  *
1999  * NOTE: not inlined intentionally, it makes tx_burst
2000  * routine smaller, simpler and faster - from experiments.
2001  */
2002 static void
2003 mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq,
2004 			  unsigned int olx __rte_unused)
2005 {
2006 	unsigned int count = MLX5_TX_COMP_MAX_CQE;
2007 	volatile struct mlx5_cqe *last_cqe = NULL;
2008 	bool ring_doorbell = false;
2009 	int ret;
2010 
2011 	static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value");
2012 	static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value");
2013 	do {
2014 		volatile struct mlx5_cqe *cqe;
2015 
2016 		cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
2017 		ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
2018 		if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
2019 			if (likely(ret != MLX5_CQE_STATUS_ERR)) {
2020 				/* No new CQEs in completion queue. */
2021 				MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN);
2022 				break;
2023 			}
2024 			/*
2025 			 * Some error occurred, try to restart.
2026 			 * There is no barrier after the WQE related Doorbell
2027 			 * is written, make sure all writes are completed
2028 			 * here before we might perform the SQ reset.
2029 			 */
2030 			rte_wmb();
2031 			ret = mlx5_tx_error_cqe_handle
2032 				(txq, (volatile struct mlx5_err_cqe *)cqe);
2033 			if (unlikely(ret < 0)) {
2034 				/*
2035 				 * Some error occurred during queue error
2036 				 * handling, we do not advance the index
2037 				 * here, allowing a retry on the next call.
2038 				 */
2039 				return;
2040 			}
2041 			/*
2042 			 * We are going to fetch all entries with
2043 			 * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status.
2044 			 * The send queue is supposed to be empty.
2045 			 */
2046 			ring_doorbell = true;
2047 			++txq->cq_ci;
2048 			txq->cq_pi = txq->cq_ci;
2049 			last_cqe = NULL;
2050 			continue;
2051 		}
2052 		/* Normal transmit completion. */
2053 		MLX5_ASSERT(txq->cq_ci != txq->cq_pi);
2054 		MLX5_ASSERT((txq->fcqs[txq->cq_ci & txq->cqe_m] >> 16) ==
2055 			    cqe->wqe_counter);
2056 		ring_doorbell = true;
2057 		++txq->cq_ci;
2058 		last_cqe = cqe;
2059 		/*
2060 		 * We have to restrict the number of CQEs processed
2061 		 * in one tx_burst routine call. The CQ may be large
2062 		 * and many CQEs may be updated by the NIC in one
2063 		 * transaction. Buffer freeing is time consuming,
2064 		 * multiple iterations may introduce significant
2065 		 * latency.
2066 		 */
2067 		if (likely(--count == 0))
2068 			break;
2069 	} while (true);
2070 	if (likely(ring_doorbell)) {
2071 		/* Ring doorbell to notify hardware. */
2072 		rte_compiler_barrier();
2073 		*txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
2074 		mlx5_tx_comp_flush(txq, last_cqe, olx);
2075 	}
2076 }
2077 
2078 /**
2079  * Check if the completion request flag should be set in the last WQE.
2080  * Both pushed mbufs and WQEs are monitored and the completion request
2081  * flag is set if any of the thresholds is reached.
2082  *
2083  * @param txq
2084  *   Pointer to TX queue structure.
2085  * @param loc
2086  *   Pointer to burst routine local context.
2087  * @param olx
2088  *   Configured Tx offloads mask. It is fully defined at
2089  *   compile time and may be used for optimization.
2090  */
2091 static __rte_always_inline void
2092 mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
2093 			   struct mlx5_txq_local *__rte_restrict loc,
2094 			   unsigned int olx)
2095 {
2096 	uint16_t head = txq->elts_head;
2097 	unsigned int part;
2098 
2099 	part = MLX5_TXOFF_CONFIG(INLINE) ?
2100 	       0 : loc->pkts_sent - loc->pkts_copy;
2101 	head += part;
2102 	if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH ||
2103 	     (MLX5_TXOFF_CONFIG(INLINE) &&
2104 	     (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) {
2105 		volatile struct mlx5_wqe *last = loc->wqe_last;
2106 
2107 		MLX5_ASSERT(last);
2108 		txq->elts_comp = head;
2109 		if (MLX5_TXOFF_CONFIG(INLINE))
2110 			txq->wqe_comp = txq->wqe_ci;
2111 		/* Request unconditional completion on last WQE. */
2112 		last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
2113 					    MLX5_COMP_MODE_OFFSET);
2114 		/* Save elts_head in the dedicated free-on-completion queue. */
2115 #ifdef RTE_LIBRTE_MLX5_DEBUG
2116 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
2117 			  (last->cseg.opcode >> 8) << 16;
2118 #else
2119 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head;
2120 #endif
2121 		/* A CQE slot must always be available. */
2122 		MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s);
2123 	}
2124 }
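
/*
 * Worked example of the thresholds above (hypothetical numbers, and assuming
 * MLX5_TX_COMP_THRESH were 32): with txq->elts_comp = 100 and head reaching
 * 132, (uint16_t)(head - elts_comp) == 32 >= 32, so a completion is
 * requested on the last WQE, elts_comp advances to 132 and, in the
 * non-debug build, the bare value 132 is stored in fcqs[] so that
 * mlx5_tx_comp_flush() later frees elts entries up to this index when the
 * matching CQE arrives.
 */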
2125 
2126 /**
2127  * DPDK callback to check the status of a tx descriptor.
2128  *
2129  * @param tx_queue
2130  *   The tx queue.
2131  * @param[in] offset
2132  *   The index of the descriptor in the ring.
2133  *
2134  * @return
2135  *   The status of the tx descriptor.
2136  */
2137 int
2138 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
2139 {
2140 	struct mlx5_txq_data *__rte_restrict txq = tx_queue;
2141 	uint16_t used;
2142 
2143 	mlx5_tx_handle_completion(txq, 0);
2144 	used = txq->elts_head - txq->elts_tail;
2145 	if (offset < used)
2146 		return RTE_ETH_TX_DESC_FULL;
2147 	return RTE_ETH_TX_DESC_DONE;
2148 }
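
/*
 * This callback is reached through the generic ethdev API rather than being
 * called directly. A minimal usage sketch (port_id, queue_id and offset are
 * placeholders):
 *
 *	switch (rte_eth_tx_descriptor_status(port_id, queue_id, offset)) {
 *	case RTE_ETH_TX_DESC_DONE:
 *		... descriptor completed, the slot can be reused ...
 *		break;
 *	case RTE_ETH_TX_DESC_FULL:
 *		... descriptor still held by the queue ...
 *		break;
 *	}
 */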
2149 
2150 /**
2151  * Build the Control Segment with specified opcode:
2152  * - MLX5_OPCODE_SEND
2153  * - MLX5_OPCODE_ENHANCED_MPSW
2154  * - MLX5_OPCODE_TSO
2155  *
2156  * @param txq
2157  *   Pointer to TX queue structure.
2158  * @param loc
2159  *   Pointer to burst routine local context.
2160  * @param wqe
2161  *   Pointer to WQE to fill with built Control Segment.
2162  * @param ds
2163  *   Supposed length of WQE in segments.
2164  * @param opcode
2165  *   SQ WQE opcode to put into Control Segment.
2166  * @param olx
2167  *   Configured Tx offloads mask. It is fully defined at
2168  *   compile time and may be used for optimization.
2169  */
2170 static __rte_always_inline void
2171 mlx5_tx_cseg_init(struct mlx5_txq_data *__rte_restrict txq,
2172 		  struct mlx5_txq_local *__rte_restrict loc __rte_unused,
2173 		  struct mlx5_wqe *__rte_restrict wqe,
2174 		  unsigned int ds,
2175 		  unsigned int opcode,
2176 		  unsigned int olx __rte_unused)
2177 {
2178 	struct mlx5_wqe_cseg *__rte_restrict cs = &wqe->cseg;
2179 
2180 	/* For legacy MPW replace the EMPW by TSO with modifier. */
2181 	if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW)
2182 		opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24;
2183 	cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode);
2184 	cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
2185 	cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR <<
2186 			     MLX5_COMP_MODE_OFFSET);
2187 	cs->misc = RTE_BE32(0);
2188 }
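
/*
 * Worked example of the opcode dword built above (hypothetical values,
 * assuming MLX5_OPCODE_SEND is 0x0a): with txq->wqe_ci = 0x1234 the host
 * value is (0x1234 << 8) | 0x0a = 0x0012340a, which rte_cpu_to_be_32()
 * stores as the byte sequence 00 12 34 0a, i.e. the WQE index in the upper
 * bytes and the opcode in the lowest byte.
 */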
2189 
2190 /**
2191  * Build the Synchronize Queue Segment with specified completion index.
2192  *
2193  * @param txq
2194  *   Pointer to TX queue structure.
2195  * @param loc
2196  *   Pointer to burst routine local context.
2197  * @param wqe
2198  *   Pointer to the WQE to fill with the built Synchronize Queue Segment.
2199  * @param wci
2200  *   Completion index in Clock Queue to wait.
2201  * @param olx
2202  *   Configured Tx offloads mask. It is fully defined at
2203  *   compile time and may be used for optimization.
2204  */
2205 static __rte_always_inline void
2206 mlx5_tx_wseg_init(struct mlx5_txq_data *restrict txq,
2207 		  struct mlx5_txq_local *restrict loc __rte_unused,
2208 		  struct mlx5_wqe *restrict wqe,
2209 		  unsigned int wci,
2210 		  unsigned int olx __rte_unused)
2211 {
2212 	struct mlx5_wqe_qseg *qs;
2213 
2214 	qs = RTE_PTR_ADD(wqe, MLX5_WSEG_SIZE);
2215 	qs->max_index = rte_cpu_to_be_32(wci);
2216 	qs->qpn_cqn = rte_cpu_to_be_32(txq->sh->txpp.clock_queue.cq->id);
2217 	qs->reserved0 = RTE_BE32(0);
2218 	qs->reserved1 = RTE_BE32(0);
2219 }
2220 
2221 /**
2222  * Build the Ethernet Segment without inlined data.
2223  * Supports Software Parser, Checksums and VLAN
2224  * insertion Tx offload features.
2225  *
2226  * @param txq
2227  *   Pointer to TX queue structure.
2228  * @param loc
2229  *   Pointer to burst routine local context.
2230  * @param wqe
2231  *   Pointer to WQE to fill with built Ethernet Segment.
2232  * @param olx
2233  *   Configured Tx offloads mask. It is fully defined at
2234  *   compile time and may be used for optimization.
2235  */
2236 static __rte_always_inline void
2237 mlx5_tx_eseg_none(struct mlx5_txq_data *__rte_restrict txq __rte_unused,
2238 		  struct mlx5_txq_local *__rte_restrict loc,
2239 		  struct mlx5_wqe *__rte_restrict wqe,
2240 		  unsigned int olx)
2241 {
2242 	struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
2243 	uint32_t csum;
2244 
2245 	/*
2246 	 * Calculate and set check sum flags first, dword field
2247 	 * in segment may be shared with Software Parser flags.
2248 	 */
2249 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
2250 	es->flags = rte_cpu_to_le_32(csum);
2251 	/*
2252 	 * Calculate and set Software Parser offsets and flags.
2253 	 * These flags are set for custom UDP and IP tunnel packets.
2254 	 */
2255 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
2256 	/* Fill metadata field if needed. */
2257 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
2258 		       loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
2259 		       *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
2260 	/* Engage VLAN tag insertion feature if requested. */
2261 	if (MLX5_TXOFF_CONFIG(VLAN) &&
2262 	    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
2263 		/*
2264 		 * We should get here only if the device supports
2265 		 * this feature correctly.
2266 		 */
2267 		MLX5_ASSERT(txq->vlan_en);
2268 		es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
2269 						  loc->mbuf->vlan_tci);
2270 	} else {
2271 		es->inline_hdr = RTE_BE32(0);
2272 	}
2273 }
2274 
2275 /**
2276  * Build the Ethernet Segment with minimal inlined data
2277  * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is
2278  * used to fill the gap in single WQEBB WQEs.
2279  * Supports Software Parser, Checksums and VLAN
2280  * insertion Tx offload features.
2281  *
2282  * @param txq
2283  *   Pointer to TX queue structure.
2284  * @param loc
2285  *   Pointer to burst routine local context.
2286  * @param wqe
2287  *   Pointer to WQE to fill with built Ethernet Segment.
2288  * @param vlan
2289  *   Length of VLAN tag insertion if any.
2290  * @param olx
2291  *   Configured Tx offloads mask. It is fully defined at
2292  *   compile time and may be used for optimization.
2293  */
2294 static __rte_always_inline void
2295 mlx5_tx_eseg_dmin(struct mlx5_txq_data *__rte_restrict txq __rte_unused,
2296 		  struct mlx5_txq_local *__rte_restrict loc,
2297 		  struct mlx5_wqe *__rte_restrict wqe,
2298 		  unsigned int vlan,
2299 		  unsigned int olx)
2300 {
2301 	struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
2302 	uint32_t csum;
2303 	uint8_t *psrc, *pdst;
2304 
2305 	/*
2306 	 * Calculate and set check sum flags first, dword field
2307 	 * in segment may be shared with Software Parser flags.
2308 	 */
2309 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
2310 	es->flags = rte_cpu_to_le_32(csum);
2311 	/*
2312 	 * Calculate and set Software Parser offsets and flags.
2313 	 * These flags are set for custom UDP and IP tunnel packets.
2314 	 */
2315 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
2316 	/* Fill metadata field if needed. */
2317 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
2318 		       loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
2319 		       *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
2320 	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
2321 				(sizeof(uint16_t) +
2322 				 sizeof(rte_v128u32_t)),
2323 		      "invalid Ethernet Segment data size");
2324 	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
2325 				(sizeof(uint16_t) +
2326 				 sizeof(struct rte_vlan_hdr) +
2327 				 2 * RTE_ETHER_ADDR_LEN),
2328 		      "invalid Ethernet Segment data size");
2329 	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
2330 	es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
2331 	es->inline_data = *(unaligned_uint16_t *)psrc;
2332 	psrc +=	sizeof(uint16_t);
2333 	pdst = (uint8_t *)(es + 1);
2334 	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
2335 		/* Implement VLAN tag insertion as part of the inline data. */
2336 		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
2337 		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
2338 		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
2339 		/* Insert VLAN ethertype + VLAN tag. */
2340 		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
2341 						((RTE_ETHER_TYPE_VLAN << 16) |
2342 						 loc->mbuf->vlan_tci);
2343 		pdst += sizeof(struct rte_vlan_hdr);
2344 		/* Copy the remaining two bytes from the packet data. */
2345 		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
2346 		*(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
2347 	} else {
2348 		/* Fill the gap in the title WQEBB with inline data. */
2349 		rte_mov16(pdst, psrc);
2350 	}
2351 }
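
/*
 * Byte layout sketch of the minimal inline data built above when VLAN
 * insertion is requested (18 bytes total, matching the static_asserts):
 *
 *	bytes  0..11	destination and source MAC copied from the packet
 *	bytes 12..15	0x8100 ethertype and vlan_tci inserted by the PMD
 *	bytes 16..17	original ethertype taken from the packet
 *
 * The first two bytes land in es->inline_data and the remaining sixteen in
 * the area following the Ethernet Segment, but the resulting wire image is
 * the one shown.
 */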
2352 
2353 /**
2354  * Build the Ethernet Segment with entire packet
2355  * data inlining. Checks the boundary of WQEBB and
2356  * ring buffer wrapping, supports Software Parser,
2357  * Checksums and VLAN insertion Tx offload features.
2358  *
2359  * @param txq
2360  *   Pointer to TX queue structure.
2361  * @param loc
2362  *   Pointer to burst routine local context.
2363  * @param wqe
2364  *   Pointer to WQE to fill with built Ethernet Segment.
2365  * @param vlan
2366  *   Length of VLAN tag insertion if any.
2367  * @param inlen
2368  *   Length of data to inline (VLAN included, if any).
2369  * @param tso
2370  *   TSO flag, set mss field from the packet.
2371  * @param olx
2372  *   Configured Tx offloads mask. It is fully defined at
2373  *   compile time and may be used for optimization.
2374  *
2375  * @return
2376  *   Pointer to the next Data Segment (aligned and wrapped around).
2377  */
2378 static __rte_always_inline struct mlx5_wqe_dseg *
2379 mlx5_tx_eseg_data(struct mlx5_txq_data *__rte_restrict txq,
2380 		  struct mlx5_txq_local *__rte_restrict loc,
2381 		  struct mlx5_wqe *__rte_restrict wqe,
2382 		  unsigned int vlan,
2383 		  unsigned int inlen,
2384 		  unsigned int tso,
2385 		  unsigned int olx)
2386 {
2387 	struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
2388 	uint32_t csum;
2389 	uint8_t *psrc, *pdst;
2390 	unsigned int part;
2391 
2392 	/*
2393 	 * Calculate and set check sum flags first, dword field
2394 	 * in segment may be shared with Software Parser flags.
2395 	 */
2396 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
2397 	if (tso) {
2398 		csum <<= 24;
2399 		csum |= loc->mbuf->tso_segsz;
2400 		es->flags = rte_cpu_to_be_32(csum);
2401 	} else {
2402 		es->flags = rte_cpu_to_le_32(csum);
2403 	}
2404 	/*
2405 	 * Calculate and set Software Parser offsets and flags.
2406 	 * These flags are set for custom UDP and IP tunnel packets.
2407 	 */
2408 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
2409 	/* Fill metadata field if needed. */
2410 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
2411 		       loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
2412 		       *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
2413 	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
2414 				(sizeof(uint16_t) +
2415 				 sizeof(rte_v128u32_t)),
2416 		      "invalid Ethernet Segment data size");
2417 	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
2418 				(sizeof(uint16_t) +
2419 				 sizeof(struct rte_vlan_hdr) +
2420 				 2 * RTE_ETHER_ADDR_LEN),
2421 		      "invalid Ethernet Segment data size");
2422 	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
2423 	es->inline_hdr_sz = rte_cpu_to_be_16(inlen);
2424 	es->inline_data = *(unaligned_uint16_t *)psrc;
2425 	psrc +=	sizeof(uint16_t);
2426 	pdst = (uint8_t *)(es + 1);
2427 	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
2428 		/* Implement VLAN tag insertion as part of the inline data. */
2429 		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
2430 		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
2431 		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
2432 		/* Insert VLAN ethertype + VLAN tag. */
2433 		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
2434 						((RTE_ETHER_TYPE_VLAN << 16) |
2435 						 loc->mbuf->vlan_tci);
2436 		pdst += sizeof(struct rte_vlan_hdr);
2437 		/* Copy the remaining two bytes from the packet data. */
2438 		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
2439 		*(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
2440 		psrc += sizeof(uint16_t);
2441 	} else {
2442 		/* Fill the gap in the title WQEBB with inline data. */
2443 		rte_mov16(pdst, psrc);
2444 		psrc += sizeof(rte_v128u32_t);
2445 	}
2446 	pdst = (uint8_t *)(es + 2);
2447 	MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
2448 	MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
2449 	inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
2450 	if (!inlen) {
2451 		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
2452 		return (struct mlx5_wqe_dseg *)pdst;
2453 	}
2454 	/*
2455 	 * The WQEBB space availability is checked by caller.
2456 	 * Here we should be aware of WQE ring buffer wraparound only.
2457 	 */
2458 	part = (uint8_t *)txq->wqes_end - pdst;
2459 	part = RTE_MIN(part, inlen);
2460 	do {
2461 		rte_memcpy(pdst, psrc, part);
2462 		inlen -= part;
2463 		if (likely(!inlen)) {
2464 			/*
2465 			 * If return value is not used by the caller
2466 			 * the code below will be optimized out.
2467 			 */
2468 			pdst += part;
2469 			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
2470 			if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
2471 				pdst = (uint8_t *)txq->wqes;
2472 			return (struct mlx5_wqe_dseg *)pdst;
2473 		}
2474 		pdst = (uint8_t *)txq->wqes;
2475 		psrc += part;
2476 		part = inlen;
2477 	} while (true);
2478 }
2479 
2480 /**
2481  * Copy data from a chain of mbufs to the specified linear buffer.
2482  * If the data from some mbuf is copied completely, that mbuf is
2483  * freed. The local structure is used to keep the byte stream
2484  * state.
2485  *
2486  * @param pdst
2487  *   Pointer to the destination linear buffer.
2488  * @param loc
2489  *   Pointer to burst routine local context.
2490  * @param len
2491  *   Length of data to be copied.
2492  * @param must
2493  *   Length of data to be copied ignoring the no-inline hint.
2494  * @param olx
2495  *   Configured Tx offloads mask. It is fully defined at
2496  *   compile time and may be used for optimization.
2497  *
2498  * @return
2499  *   Number of actually copied data bytes. This is always greater than or
2500  *   equal to the must parameter and might be less than len if the no-inline
2501  *   hint flag is encountered.
2502  */
2503 static __rte_always_inline unsigned int
2504 mlx5_tx_mseg_memcpy(uint8_t *pdst,
2505 		    struct mlx5_txq_local *__rte_restrict loc,
2506 		    unsigned int len,
2507 		    unsigned int must,
2508 		    unsigned int olx __rte_unused)
2509 {
2510 	struct rte_mbuf *mbuf;
2511 	unsigned int part, dlen, copy = 0;
2512 	uint8_t *psrc;
2513 
2514 	MLX5_ASSERT(len);
2515 	MLX5_ASSERT(must <= len);
2516 	do {
2517 		/* Allow zero length packets, must check first. */
2518 		dlen = rte_pktmbuf_data_len(loc->mbuf);
2519 		if (dlen <= loc->mbuf_off) {
2520 			/* Exhausted packet, just free. */
2521 			mbuf = loc->mbuf;
2522 			loc->mbuf = mbuf->next;
2523 			rte_pktmbuf_free_seg(mbuf);
2524 			loc->mbuf_off = 0;
2525 			MLX5_ASSERT(loc->mbuf_nseg > 1);
2526 			MLX5_ASSERT(loc->mbuf);
2527 			--loc->mbuf_nseg;
2528 			if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) {
2529 				unsigned int diff;
2530 
2531 				if (copy >= must) {
2532 					/*
2533 					 * We already copied the minimal
2534 					 * requested amount of data.
2535 					 */
2536 					return copy;
2537 				}
2538 				diff = must - copy;
2539 				if (diff <= rte_pktmbuf_data_len(loc->mbuf)) {
2540 					/*
2541 					 * Copy only the minimal required
2542 					 * part of the data buffer.
2543 					 */
2544 					len = diff;
2545 				}
2546 			}
2547 			continue;
2548 		}
2549 		dlen -= loc->mbuf_off;
2550 		psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
2551 					       loc->mbuf_off);
2552 		part = RTE_MIN(len, dlen);
2553 		rte_memcpy(pdst, psrc, part);
2554 		copy += part;
2555 		loc->mbuf_off += part;
2556 		len -= part;
2557 		if (!len) {
2558 			if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) {
2559 				loc->mbuf_off = 0;
2560 				/* Exhausted packet, just free. */
2561 				mbuf = loc->mbuf;
2562 				loc->mbuf = mbuf->next;
2563 				rte_pktmbuf_free_seg(mbuf);
2564 				loc->mbuf_off = 0;
2565 				MLX5_ASSERT(loc->mbuf_nseg >= 1);
2566 				--loc->mbuf_nseg;
2567 			}
2568 			return copy;
2569 		}
2570 		pdst += part;
2571 	} while (true);
2572 }
2573 
2574 /**
2575  * Build the Ethernet Segment with inlined data from
2576  * multi-segment packet. Checks the boundary of WQEBB
2577  * and ring buffer wrapping, supports Software Parser,
2578  * Checksums and VLAN insertion Tx offload features.
2579  *
2580  * @param txq
2581  *   Pointer to TX queue structure.
2582  * @param loc
2583  *   Pointer to burst routine local context.
2584  * @param wqe
2585  *   Pointer to WQE to fill with built Ethernet Segment.
2586  * @param vlan
2587  *   Length of VLAN tag insertion if any.
2588  * @param inlen
2589  *   Length of data to inline (VLAN included, if any).
2590  * @param tso
2591  *   TSO flag, set mss field from the packet.
2592  * @param olx
2593  *   Configured Tx offloads mask. It is fully defined at
2594  *   compile time and may be used for optimization.
2595  *
2596  * @return
2597  *   Pointer to the next Data Segment (aligned and
2598  *   possible NOT wrapped around - caller should do
2599  *   possibly NOT wrapped around - the caller should do
2600  */
2601 static __rte_always_inline struct mlx5_wqe_dseg *
2602 mlx5_tx_eseg_mdat(struct mlx5_txq_data *__rte_restrict txq,
2603 		  struct mlx5_txq_local *__rte_restrict loc,
2604 		  struct mlx5_wqe *__rte_restrict wqe,
2605 		  unsigned int vlan,
2606 		  unsigned int inlen,
2607 		  unsigned int tso,
2608 		  unsigned int olx)
2609 {
2610 	struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
2611 	uint32_t csum;
2612 	uint8_t *pdst;
2613 	unsigned int part, tlen = 0;
2614 
2615 	/*
2616 	 * Calculate and set check sum flags first, uint32_t field
2617 	 * in segment may be shared with Software Parser flags.
2618 	 */
2619 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
2620 	if (tso) {
2621 		csum <<= 24;
2622 		csum |= loc->mbuf->tso_segsz;
2623 		es->flags = rte_cpu_to_be_32(csum);
2624 	} else {
2625 		es->flags = rte_cpu_to_le_32(csum);
2626 	}
2627 	/*
2628 	 * Calculate and set Software Parser offsets and flags.
2629 	 * These flags are set for custom UDP and IP tunnel packets.
2630 	 */
2631 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
2632 	/* Fill metadata field if needed. */
2633 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
2634 		       loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
2635 		       *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
2636 	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
2637 				(sizeof(uint16_t) +
2638 				 sizeof(rte_v128u32_t)),
2639 		      "invalid Ethernet Segment data size");
2640 	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
2641 				(sizeof(uint16_t) +
2642 				 sizeof(struct rte_vlan_hdr) +
2643 				 2 * RTE_ETHER_ADDR_LEN),
2644 		      "invalid Ethernet Segment data size");
2645 	MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
2646 	pdst = (uint8_t *)&es->inline_data;
2647 	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
2648 		/* Implement VLAN tag insertion as part of the inline data. */
2649 		mlx5_tx_mseg_memcpy(pdst, loc,
2650 				    2 * RTE_ETHER_ADDR_LEN,
2651 				    2 * RTE_ETHER_ADDR_LEN, olx);
2652 		pdst += 2 * RTE_ETHER_ADDR_LEN;
2653 		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
2654 						((RTE_ETHER_TYPE_VLAN << 16) |
2655 						 loc->mbuf->vlan_tci);
2656 		pdst += sizeof(struct rte_vlan_hdr);
2657 		tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr);
2658 	}
2659 	MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
2660 	/*
2661 	 * The WQEBB space availability is checked by caller.
2662 	 * Here we should be aware of WQE ring buffer wraparound only.
2663 	 */
2664 	part = (uint8_t *)txq->wqes_end - pdst;
2665 	part = RTE_MIN(part, inlen - tlen);
2666 	MLX5_ASSERT(part);
2667 	do {
2668 		unsigned int copy;
2669 
2670 		/*
2671 		 * Copying may be interrupted inside the routine
2672 		 * if the no-inline hint flag is encountered.
2673 		 */
2674 		copy = tlen >= txq->inlen_mode ? 0 : (txq->inlen_mode - tlen);
2675 		copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx);
2676 		tlen += copy;
2677 		if (likely(inlen <= tlen) || copy < part) {
2678 			es->inline_hdr_sz = rte_cpu_to_be_16(tlen);
2679 			pdst += copy;
2680 			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
2681 			return (struct mlx5_wqe_dseg *)pdst;
2682 		}
2683 		pdst = (uint8_t *)txq->wqes;
2684 		part = inlen - tlen;
2685 	} while (true);
2686 }
2687 
2688 /**
2689  * Build the Data Segment of pointer type.
2690  *
2691  * @param txq
2692  *   Pointer to TX queue structure.
2693  * @param loc
2694  *   Pointer to burst routine local context.
2695  * @param dseg
2696  *   Pointer to WQE to fill with built Data Segment.
2697  * @param buf
2698  *   Data buffer to point.
2699  * @param len
2700  *   Data buffer length.
2701  * @param olx
2702  *   Configured Tx offloads mask. It is fully defined at
2703  *   compile time and may be used for optimization.
2704  */
2705 static __rte_always_inline void
2706 mlx5_tx_dseg_ptr(struct mlx5_txq_data *__rte_restrict txq,
2707 		 struct mlx5_txq_local *__rte_restrict loc,
2708 		 struct mlx5_wqe_dseg *__rte_restrict dseg,
2709 		 uint8_t *buf,
2710 		 unsigned int len,
2711 		 unsigned int olx __rte_unused)
2712 
2713 {
2714 	MLX5_ASSERT(len);
2715 	dseg->bcount = rte_cpu_to_be_32(len);
2716 	dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
2717 	dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
2718 }
2719 
2720 /**
2721  * Build the Data Segment of pointer type, or of inline
2722  * type if the data length is less than the minimal
2723  * Data Segment size.
2724  *
2725  * @param txq
2726  *   Pointer to TX queue structure.
2727  * @param loc
2728  *   Pointer to burst routine local context.
2729  * @param dseg
2730  *   Pointer to WQE to fill with built Data Segment.
2731  * @param buf
2732  *   Data buffer to point.
2733  * @param len
2734  *   Data buffer length.
2735  * @param olx
2736  *   Configured Tx offloads mask. It is fully defined at
2737  *   compile time and may be used for optimization.
2738  */
2739 static __rte_always_inline void
2740 mlx5_tx_dseg_iptr(struct mlx5_txq_data *__rte_restrict txq,
2741 		  struct mlx5_txq_local *__rte_restrict loc,
2742 		  struct mlx5_wqe_dseg *__rte_restrict dseg,
2743 		  uint8_t *buf,
2744 		  unsigned int len,
2745 		  unsigned int olx __rte_unused)
2746 
2747 {
2748 	uintptr_t dst, src;
2749 
2750 	MLX5_ASSERT(len);
2751 	if (len > MLX5_DSEG_MIN_INLINE_SIZE) {
2752 		dseg->bcount = rte_cpu_to_be_32(len);
2753 		dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
2754 		dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
2755 
2756 		return;
2757 	}
2758 	dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
2759 	/* Unrolled implementation of generic rte_memcpy. */
2760 	dst = (uintptr_t)&dseg->inline_data[0];
2761 	src = (uintptr_t)buf;
2762 	if (len & 0x08) {
2763 #ifdef RTE_ARCH_STRICT_ALIGN
2764 		MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t)));
2765 		*(uint32_t *)dst = *(unaligned_uint32_t *)src;
2766 		dst += sizeof(uint32_t);
2767 		src += sizeof(uint32_t);
2768 		*(uint32_t *)dst = *(unaligned_uint32_t *)src;
2769 		dst += sizeof(uint32_t);
2770 		src += sizeof(uint32_t);
2771 #else
2772 		*(uint64_t *)dst = *(unaligned_uint64_t *)src;
2773 		dst += sizeof(uint64_t);
2774 		src += sizeof(uint64_t);
2775 #endif
2776 	}
2777 	if (len & 0x04) {
2778 		*(uint32_t *)dst = *(unaligned_uint32_t *)src;
2779 		dst += sizeof(uint32_t);
2780 		src += sizeof(uint32_t);
2781 	}
2782 	if (len & 0x02) {
2783 		*(uint16_t *)dst = *(unaligned_uint16_t *)src;
2784 		dst += sizeof(uint16_t);
2785 		src += sizeof(uint16_t);
2786 	}
2787 	if (len & 0x01)
2788 		*(uint8_t *)dst = *(uint8_t *)src;
2789 }
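
/*
 * Worked example of the unrolled inline copy above (hypothetical length):
 * for len = 11 (binary 1011) the 0x08 branch copies 8 bytes, the 0x02 branch
 * two more and the 0x01 branch the last byte, while the 0x04 branch is
 * skipped. Any length up to MLX5_DSEG_MIN_INLINE_SIZE is thus handled with a
 * few fixed-size copies instead of a generic memcpy call.
 */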
2790 
2791 /**
2792  * Build the Data Segment of inlined data from single
2793  * segment packet, no VLAN insertion.
2794  *
2795  * @param txq
2796  *   Pointer to TX queue structure.
2797  * @param loc
2798  *   Pointer to burst routine local context.
2799  * @param dseg
2800  *   Pointer to WQE to fill with built Data Segment.
2801  * @param buf
2802  *   Data buffer to point.
2803  * @param len
2804  *   Data buffer length.
2805  * @param olx
2806  *   Configured Tx offloads mask. It is fully defined at
2807  *   compile time and may be used for optimization.
2808  *
2809  * @return
2810  *   Pointer to the next Data Segment after inlined data.
2811  *   Ring buffer wraparound check is needed. We do not
2812  *   do it here because it may not be needed for the
2813  *   last packet in the eMPW session.
2814  */
2815 static __rte_always_inline struct mlx5_wqe_dseg *
2816 mlx5_tx_dseg_empw(struct mlx5_txq_data *__rte_restrict txq,
2817 		  struct mlx5_txq_local *__rte_restrict loc __rte_unused,
2818 		  struct mlx5_wqe_dseg *__rte_restrict dseg,
2819 		  uint8_t *buf,
2820 		  unsigned int len,
2821 		  unsigned int olx __rte_unused)
2822 {
2823 	unsigned int part;
2824 	uint8_t *pdst;
2825 
2826 	if (!MLX5_TXOFF_CONFIG(MPW)) {
2827 		/* Store the descriptor byte counter for eMPW sessions. */
2828 		dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
2829 		pdst = &dseg->inline_data[0];
2830 	} else {
2831 		/* The entire legacy MPW session counter is stored on close. */
2832 		pdst = (uint8_t *)dseg;
2833 	}
2834 	/*
2835 	 * The WQEBB space availability is checked by caller.
2836 	 * Here we should be aware of WQE ring buffer wraparound only.
2837 	 */
2838 	part = (uint8_t *)txq->wqes_end - pdst;
2839 	part = RTE_MIN(part, len);
2840 	do {
2841 		rte_memcpy(pdst, buf, part);
2842 		len -= part;
2843 		if (likely(!len)) {
2844 			pdst += part;
2845 			if (!MLX5_TXOFF_CONFIG(MPW))
2846 				pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
2847 			/* Note: no final wraparound check here. */
2848 			return (struct mlx5_wqe_dseg *)pdst;
2849 		}
2850 		pdst = (uint8_t *)txq->wqes;
2851 		buf += part;
2852 		part = len;
2853 	} while (true);
2854 }
2855 
2856 /**
2857  * Build the Data Segment of inlined data from single
2858  * segment packet with VLAN insertion.
2859  *
2860  * @param txq
2861  *   Pointer to TX queue structure.
2862  * @param loc
2863  *   Pointer to burst routine local context.
2864  * @param dseg
2865  *   Pointer to the dseg to fill with the built Data Segment.
2866  * @param buf
2867  *   Data buffer to point.
2868  * @param len
2869  *   Data buffer length.
2870  * @param olx
2871  *   Configured Tx offloads mask. It is fully defined at
2872  *   compile time and may be used for optimization.
2873  *
2874  * @return
2875  *   Pointer to the next Data Segment after inlined data.
2876  *   Ring buffer wraparound check is needed.
2877  */
2878 static __rte_always_inline struct mlx5_wqe_dseg *
2879 mlx5_tx_dseg_vlan(struct mlx5_txq_data *__rte_restrict txq,
2880 		  struct mlx5_txq_local *__rte_restrict loc __rte_unused,
2881 		  struct mlx5_wqe_dseg *__rte_restrict dseg,
2882 		  uint8_t *buf,
2883 		  unsigned int len,
2884 		  unsigned int olx __rte_unused)
2885 
2886 {
2887 	unsigned int part;
2888 	uint8_t *pdst;
2889 
2890 	MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE);
2891 	static_assert(MLX5_DSEG_MIN_INLINE_SIZE ==
2892 				 (2 * RTE_ETHER_ADDR_LEN),
2893 		      "invalid Data Segment data size");
2894 	if (!MLX5_TXOFF_CONFIG(MPW)) {
2895 		/* Store the descriptor byte counter for eMPW sessions. */
2896 		dseg->bcount = rte_cpu_to_be_32
2897 				((len + sizeof(struct rte_vlan_hdr)) |
2898 				 MLX5_ETH_WQE_DATA_INLINE);
2899 		pdst = &dseg->inline_data[0];
2900 	} else {
2901 		/* The entire legacy MPW session counter is stored on close. */
2902 		pdst = (uint8_t *)dseg;
2903 	}
2904 	memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE);
2905 	buf += MLX5_DSEG_MIN_INLINE_SIZE;
2906 	pdst += MLX5_DSEG_MIN_INLINE_SIZE;
2907 	len -= MLX5_DSEG_MIN_INLINE_SIZE;
2908 	/* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */
2909 	MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
2910 	if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
2911 		pdst = (uint8_t *)txq->wqes;
2912 	*(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) |
2913 					      loc->mbuf->vlan_tci);
2914 	pdst += sizeof(struct rte_vlan_hdr);
2915 	/*
2916 	 * The WQEBB space availability is checked by caller.
2917 	 * Here we should be aware of WQE ring buffer wraparound only.
2918 	 */
2919 	part = (uint8_t *)txq->wqes_end - pdst;
2920 	part = RTE_MIN(part, len);
2921 	do {
2922 		rte_memcpy(pdst, buf, part);
2923 		len -= part;
2924 		if (likely(!len)) {
2925 			pdst += part;
2926 			if (!MLX5_TXOFF_CONFIG(MPW))
2927 				pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
2928 			/* Note: no final wraparound check here. */
2929 			return (struct mlx5_wqe_dseg *)pdst;
2930 		}
2931 		pdst = (uint8_t *)txq->wqes;
2932 		buf += part;
2933 		part = len;
2934 	} while (true);
2935 }
2936 
2937 /**
2938  * Build the Ethernet Segment with optionally inlined data with
2939  * VLAN insertion and following Data Segments (if any) from
2940  * multi-segment packet. Used by ordinary send and TSO.
2941  *
2942  * @param txq
2943  *   Pointer to TX queue structure.
2944  * @param loc
2945  *   Pointer to burst routine local context.
2946  * @param wqe
2947  *   Pointer to WQE to fill with built Ethernet/Data Segments.
2948  * @param vlan
2949  *   Length of VLAN header to insert, 0 means no VLAN insertion.
2950  * @param inlen
2951  *   Data length to inline. For TSO this parameter specifies the
2952  *   exact value, for the ordinary send routine it can be aligned by
2953  *   the caller to provide better WQE space saving and data buffer
2954  *   start address alignment. This length includes the VLAN header
2955  *   being inserted.
2956  * @param tso
2957  *   Zero means ordinary send, inlined data can be extended,
2958  *   otherwise this is TSO, inlined data length is fixed.
2959  * @param olx
2960  *   Configured Tx offloads mask. It is fully defined at
2961  *   compile time and may be used for optimization.
2962  *
2963  * @return
2964  *   Actual size of built WQE in segments.
2965  */
2966 static __rte_always_inline unsigned int
2967 mlx5_tx_mseg_build(struct mlx5_txq_data *__rte_restrict txq,
2968 		   struct mlx5_txq_local *__rte_restrict loc,
2969 		   struct mlx5_wqe *__rte_restrict wqe,
2970 		   unsigned int vlan,
2971 		   unsigned int inlen,
2972 		   unsigned int tso,
2973 		   unsigned int olx __rte_unused)
2974 {
2975 	struct mlx5_wqe_dseg *__rte_restrict dseg;
2976 	unsigned int ds;
2977 
2978 	MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen);
2979 	loc->mbuf_nseg = NB_SEGS(loc->mbuf);
2980 	loc->mbuf_off = 0;
2981 
2982 	dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx);
2983 	if (!loc->mbuf_nseg)
2984 		goto dseg_done;
2985 	/*
2986 	 * There are still some mbufs remaining, not inlined.
2987 	 * The first mbuf may be partially inlined and we
2988 	 * must process the possible non-zero data offset.
2989 	 */
2990 	if (loc->mbuf_off) {
2991 		unsigned int dlen;
2992 		uint8_t *dptr;
2993 
2994 		/*
2995 		 * Exhausted packets must have been dropped before.
2996 		 * A non-zero offset means there is some data
2997 		 * remaining in the packet.
2998 		 */
2999 		MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf));
3000 		MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf));
3001 		dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
3002 					       loc->mbuf_off);
3003 		dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off;
3004 		/*
3005 		 * Build the pointer/minimal data Data Segment.
3006 		 * Do ring buffer wrapping check in advance.
3007 		 */
3008 		if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
3009 			dseg = (struct mlx5_wqe_dseg *)txq->wqes;
3010 		mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx);
3011 		/* Store the mbuf to be freed on completion. */
3012 		MLX5_ASSERT(loc->elts_free);
3013 		txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
3014 		--loc->elts_free;
3015 		++dseg;
3016 		if (--loc->mbuf_nseg == 0)
3017 			goto dseg_done;
3018 		loc->mbuf = loc->mbuf->next;
3019 		loc->mbuf_off = 0;
3020 	}
3021 	do {
3022 		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
3023 			struct rte_mbuf *mbuf;
3024 
3025 			/* Zero length segment found, just skip. */
3026 			mbuf = loc->mbuf;
3027 			loc->mbuf = loc->mbuf->next;
3028 			rte_pktmbuf_free_seg(mbuf);
3029 			if (--loc->mbuf_nseg == 0)
3030 				break;
3031 		} else {
3032 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
3033 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
3034 			mlx5_tx_dseg_iptr
3035 				(txq, loc, dseg,
3036 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
3037 				 rte_pktmbuf_data_len(loc->mbuf), olx);
3038 			MLX5_ASSERT(loc->elts_free);
3039 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
3040 			--loc->elts_free;
3041 			++dseg;
3042 			if (--loc->mbuf_nseg == 0)
3043 				break;
3044 			loc->mbuf = loc->mbuf->next;
3045 		}
3046 	} while (true);
3047 
3048 dseg_done:
3049 	/* Calculate actual segments used from the dseg pointer. */
3050 	if ((uintptr_t)wqe < (uintptr_t)dseg)
3051 		ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE;
3052 	else
3053 		ds = (((uintptr_t)dseg - (uintptr_t)wqe) +
3054 		      txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE;
3055 	return ds;
3056 }
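
/*
 * Worked example of the DS accounting above (hypothetical pointers): with
 * 16-byte WQE segments (MLX5_WSEG_SIZE), if dseg ended up 9 segments past
 * wqe the first branch yields ds = 9 * 16 / 16 = 9. If dseg wrapped to the
 * start of the ring, the unsigned difference would be bogus, so the second
 * branch adds the ring size in bytes (txq->wqe_s * MLX5_WQE_SIZE) back
 * before dividing, which restores the same logical distance in segments.
 */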
3057 
3058 /**
3059  * The routine checks the timestamp flag in the current packet
3060  * and pushes a WAIT WQE into the queue if scheduling is required.
3061  *
3062  * @param txq
3063  *   Pointer to TX queue structure.
3064  * @param loc
3065  *   Pointer to burst routine local context.
3066  * @param olx
3067  *   Configured Tx offloads mask. It is fully defined at
3068  *   compile time and may be used for optimization.
3069  *
3070  * @return
3071  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3072  *   MLX5_TXCMP_CODE_SINGLE - continue processing with the packet.
3073  *   MLX5_TXCMP_CODE_MULTI - the WAIT inserted, continue processing.
3074  * Local context variables partially updated.
3075  */
3076 static __rte_always_inline enum mlx5_txcmp_code
3077 mlx5_tx_schedule_send(struct mlx5_txq_data *restrict txq,
3078 		      struct mlx5_txq_local *restrict loc,
3079 		      unsigned int olx)
3080 {
3081 	if (MLX5_TXOFF_CONFIG(TXPP) &&
3082 	    loc->mbuf->ol_flags & txq->ts_mask) {
3083 		struct mlx5_wqe *wqe;
3084 		uint64_t ts;
3085 		int32_t wci;
3086 
3087 		/*
3088 		 * Estimate the required space quickly and roughly.
3089 		 * We would like to ensure the packet can be pushed
3090 		 * to the queue and we won't get the orphan WAIT WQE.
3091 		 */
3092 		if (loc->wqe_free <= MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE ||
3093 		    loc->elts_free < NB_SEGS(loc->mbuf))
3094 			return MLX5_TXCMP_CODE_EXIT;
3095 		/* Convert the timestamp into completion to wait. */
3096 		ts = *RTE_MBUF_DYNFIELD(loc->mbuf, txq->ts_offset, uint64_t *);
3097 		wci = mlx5_txpp_convert_tx_ts(txq->sh, ts);
3098 		if (unlikely(wci < 0))
3099 			return MLX5_TXCMP_CODE_SINGLE;
3100 		/* Build the WAIT WQE with specified completion. */
3101 		wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3102 		mlx5_tx_cseg_init(txq, loc, wqe, 2, MLX5_OPCODE_WAIT, olx);
3103 		mlx5_tx_wseg_init(txq, loc, wqe, wci, olx);
3104 		++txq->wqe_ci;
3105 		--loc->wqe_free;
3106 		return MLX5_TXCMP_CODE_MULTI;
3107 	}
3108 	return MLX5_TXCMP_CODE_SINGLE;
3109 }
3110 
3111 /**
3112  * Tx one packet function for multi-segment TSO. Supports all
3113  * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs,
3114  * sends one packet per WQE.
3115  *
3116  * This routine is responsible for storing the processed mbuf
3117  * into the elts ring buffer and updating elts_head.
3118  *
3119  * @param txq
3120  *   Pointer to TX queue structure.
3121  * @param loc
3122  *   Pointer to burst routine local context.
3123  * @param olx
3124  *   Configured Tx offloads mask. It is fully defined at
3125  *   compile time and may be used for optimization.
3126  *
3127  * @return
3128  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3129  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
3130  * Local context variables partially updated.
3131  */
3132 static __rte_always_inline enum mlx5_txcmp_code
3133 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *__rte_restrict txq,
3134 			struct mlx5_txq_local *__rte_restrict loc,
3135 			unsigned int olx)
3136 {
3137 	struct mlx5_wqe *__rte_restrict wqe;
3138 	unsigned int ds, dlen, inlen, ntcp, vlan = 0;
3139 
3140 	if (MLX5_TXOFF_CONFIG(TXPP)) {
3141 		enum mlx5_txcmp_code wret;
3142 
3143 		/* Generate WAIT for scheduling if requested. */
3144 		wret = mlx5_tx_schedule_send(txq, loc, olx);
3145 		if (wret == MLX5_TXCMP_CODE_EXIT)
3146 			return MLX5_TXCMP_CODE_EXIT;
3147 		if (wret == MLX5_TXCMP_CODE_ERROR)
3148 			return MLX5_TXCMP_CODE_ERROR;
3149 	}
3150 	/*
3151 	 * Calculate data length to be inlined to estimate
3152 	 * the required space in WQE ring buffer.
3153 	 */
3154 	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
3155 	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
3156 		vlan = sizeof(struct rte_vlan_hdr);
3157 	inlen = loc->mbuf->l2_len + vlan +
3158 		loc->mbuf->l3_len + loc->mbuf->l4_len;
3159 	if (unlikely((!inlen || !loc->mbuf->tso_segsz)))
3160 		return MLX5_TXCMP_CODE_ERROR;
3161 	if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK)
3162 		inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len;
3163 	/* Packet must contain all TSO headers. */
3164 	if (unlikely(inlen > MLX5_MAX_TSO_HEADER ||
3165 		     inlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
3166 		     inlen > (dlen + vlan)))
3167 		return MLX5_TXCMP_CODE_ERROR;
3168 	MLX5_ASSERT(inlen >= txq->inlen_mode);
3169 	/*
3170 	 * Check whether there are enough free WQEBBs:
3171 	 * - Control Segment
3172 	 * - Ethernet Segment
3173 	 * - First Segment of inlined Ethernet data
3174 	 * - ... data continued ...
3175 	 * - Data Segments of pointer/min inline type
3176 	 */
3177 	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
3178 				       MLX5_ESEG_MIN_INLINE_SIZE +
3179 				       MLX5_WSEG_SIZE +
3180 				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3181 	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
3182 		return MLX5_TXCMP_CODE_EXIT;
3183 	/* Check for maximal WQE size. */
3184 	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
3185 		return MLX5_TXCMP_CODE_ERROR;
3186 #ifdef MLX5_PMD_SOFT_COUNTERS
3187 	/* Update sent data bytes/packets counters. */
3188 	ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) /
3189 		loc->mbuf->tso_segsz;
3190 	/*
3191 	 * One will be added for mbuf itself
3192 	 * at the end of the mlx5_tx_burst from
3193 	 * loc->pkts_sent field.
3194 	 */
3195 	--ntcp;
3196 	txq->stats.opackets += ntcp;
3197 	txq->stats.obytes += dlen + vlan + ntcp * inlen;
3198 #endif
3199 	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3200 	loc->wqe_last = wqe;
3201 	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx);
3202 	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx);
3203 	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
3204 	txq->wqe_ci += (ds + 3) / 4;
3205 	loc->wqe_free -= (ds + 3) / 4;
3206 	return MLX5_TXCMP_CODE_MULTI;
3207 }
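
/*
 * Worked example of the estimations above (hypothetical mbuf): a 3-segment
 * TCP packet with l2_len = 14, l3_len = 20, l4_len = 20, no VLAN and no
 * tunnel gives inlen = 54 header bytes to inline. The WQE size estimate is
 * ds = 3 + 2 + (54 - 18 + 16 + 15) / 16 = 9 segments, i.e. (9 + 3) / 4 = 3
 * WQEBBs. With dlen = 3000 and tso_segsz = 1460 the soft counters account
 * (3000 - 54 + 1459) / 1460 = 3 TCP segments, one of which is added later
 * from loc->pkts_sent, hence ntcp is decremented by one here.
 */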
3208 
3209 /**
3210  * Tx one packet function for multi-segment SEND. Supports all
3211  * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
3212  * sends one packet per WQE, without any data inlining in
3213  * Ethernet Segment.
3214  *
3215  * This routine is responsible for storing the processed mbuf
3216  * into the elts ring buffer and updating elts_head.
3217  *
3218  * @param txq
3219  *   Pointer to TX queue structure.
3220  * @param loc
3221  *   Pointer to burst routine local context.
3222  * @param olx
3223  *   Configured Tx offloads mask. It is fully defined at
3224  *   compile time and may be used for optimization.
3225  *
3226  * @return
3227  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3228  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
3229  * Local context variables partially updated.
3230  */
3231 static __rte_always_inline enum mlx5_txcmp_code
3232 mlx5_tx_packet_multi_send(struct mlx5_txq_data *__rte_restrict txq,
3233 			  struct mlx5_txq_local *__rte_restrict loc,
3234 			  unsigned int olx)
3235 {
3236 	struct mlx5_wqe_dseg *__rte_restrict dseg;
3237 	struct mlx5_wqe *__rte_restrict wqe;
3238 	unsigned int ds, nseg;
3239 
3240 	MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
3241 	if (MLX5_TXOFF_CONFIG(TXPP)) {
3242 		enum mlx5_txcmp_code wret;
3243 
3244 		/* Generate WAIT for scheduling if requested. */
3245 		wret = mlx5_tx_schedule_send(txq, loc, olx);
3246 		if (wret == MLX5_TXCMP_CODE_EXIT)
3247 			return MLX5_TXCMP_CODE_EXIT;
3248 		if (wret == MLX5_TXCMP_CODE_ERROR)
3249 			return MLX5_TXCMP_CODE_ERROR;
3250 	}
3251 	/*
3252 	 * No inlining at all, it means that saving CPU cycles
3253 	 * is prioritized by the configuration, we should not
3254 	 * copy any packet data to the WQE.
3255 	 */
3256 	nseg = NB_SEGS(loc->mbuf);
3257 	ds = 2 + nseg;
3258 	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
3259 		return MLX5_TXCMP_CODE_EXIT;
3260 	/* Check for maximal WQE size. */
3261 	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
3262 		return MLX5_TXCMP_CODE_ERROR;
3263 	/*
3264 	 * Some Tx offloads may cause an error if
3265 	 * packet is not long enough, check against
3266 	 * assumed minimal length.
3267 	 */
3268 	if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE)
3269 		return MLX5_TXCMP_CODE_ERROR;
3270 #ifdef MLX5_PMD_SOFT_COUNTERS
3271 	/* Update sent data bytes counter. */
3272 	txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf);
3273 	if (MLX5_TXOFF_CONFIG(VLAN) &&
3274 	    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
3275 		txq->stats.obytes += sizeof(struct rte_vlan_hdr);
3276 #endif
3277 	/*
3278 	 * SEND WQE, one WQEBB:
3279 	 * - Control Segment, SEND opcode
3280 	 * - Ethernet Segment, optional VLAN, no inline
3281 	 * - Data Segments, pointer only type
3282 	 */
3283 	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3284 	loc->wqe_last = wqe;
3285 	mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx);
3286 	mlx5_tx_eseg_none(txq, loc, wqe, olx);
3287 	dseg = &wqe->dseg[0];
3288 	do {
3289 		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
3290 			struct rte_mbuf *mbuf;
3291 
3292 			/*
3293 			 * Zero length segment found, we have to
3294 			 * correct the total size of the WQE in segments.
3295 			 * It is supposed to be a rare occasion, so
3296 			 * in the normal case (no zero length segments)
3297 			 * we avoid extra writes to the Control
3298 			 * Segment.
3299 			 */
3300 			--ds;
3301 			wqe->cseg.sq_ds -= RTE_BE32(1);
3302 			mbuf = loc->mbuf;
3303 			loc->mbuf = mbuf->next;
3304 			rte_pktmbuf_free_seg(mbuf);
3305 			if (--nseg == 0)
3306 				break;
3307 		} else {
3308 			mlx5_tx_dseg_ptr
3309 				(txq, loc, dseg,
3310 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
3311 				 rte_pktmbuf_data_len(loc->mbuf), olx);
3312 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
3313 			--loc->elts_free;
3314 			if (--nseg == 0)
3315 				break;
3316 			++dseg;
3317 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
3318 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
3319 			loc->mbuf = loc->mbuf->next;
3320 		}
3321 	} while (true);
3322 	txq->wqe_ci += (ds + 3) / 4;
3323 	loc->wqe_free -= (ds + 3) / 4;
3324 	return MLX5_TXCMP_CODE_MULTI;
3325 }
3326 
3327 /**
3328  * Tx one packet function for multi-segment SEND. Supports all
3329  * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
3330  * sends one packet per WQE, with data inlining in
3331  * Ethernet Segment and minimal Data Segments.
3332  *
3333  * This routine is responsible for storing the processed mbuf
3334  * into the elts ring buffer and updating elts_head.
3335  *
3336  * @param txq
3337  *   Pointer to TX queue structure.
3338  * @param loc
3339  *   Pointer to burst routine local context.
3340  * @param olx
3341  *   Configured Tx offloads mask. It is fully defined at
3342  *   compile time and may be used for optimization.
3343  *
3344  * @return
3345  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3346  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
3347  * Local context variables partially updated.
3348  */
3349 static __rte_always_inline enum mlx5_txcmp_code
3350 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *__rte_restrict txq,
3351 			    struct mlx5_txq_local *__rte_restrict loc,
3352 			    unsigned int olx)
3353 {
3354 	struct mlx5_wqe *__rte_restrict wqe;
3355 	unsigned int ds, inlen, dlen, vlan = 0;
3356 
3357 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
3358 	MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
3359 	if (MLX5_TXOFF_CONFIG(TXPP)) {
3360 		enum mlx5_txcmp_code wret;
3361 
3362 		/* Generate WAIT for scheduling if requested. */
3363 		wret = mlx5_tx_schedule_send(txq, loc, olx);
3364 		if (wret == MLX5_TXCMP_CODE_EXIT)
3365 			return MLX5_TXCMP_CODE_EXIT;
3366 		if (wret == MLX5_TXCMP_CODE_ERROR)
3367 			return MLX5_TXCMP_CODE_ERROR;
3368 	}
3369 	/*
3370 	 * First calculate data length to be inlined
3371 	 * to estimate the required space for WQE.
3372 	 */
3373 	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
3374 	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
3375 		vlan = sizeof(struct rte_vlan_hdr);
3376 	inlen = dlen + vlan;
3377 	/* Check against minimal length. */
3378 	if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
3379 		return MLX5_TXCMP_CODE_ERROR;
3380 	MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
3381 	if (inlen > txq->inlen_send ||
3382 	    loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) {
3383 		struct rte_mbuf *mbuf;
3384 		unsigned int nxlen;
3385 		uintptr_t start;
3386 
3387 		/*
3388 		 * Packet length exceeds the allowed inline
3389 		 * data length, check whether the minimal
3390 		 * inlining is required.
3391 		 */
3392 		if (txq->inlen_mode) {
3393 			MLX5_ASSERT(txq->inlen_mode >=
3394 				    MLX5_ESEG_MIN_INLINE_SIZE);
3395 			MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send);
3396 			inlen = txq->inlen_mode;
3397 		} else {
3398 			if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE ||
3399 			    !vlan || txq->vlan_en) {
3400 				/*
3401 				 * VLAN insertion will be done inside by HW.
3402 				 * It is not the most efficient way - the VLAN flag is
3403 				 * checked twice, but we should compute the
3404 				 * inlining length correctly and take into
3405 				 * account the VLAN header being inserted.
3406 				 */
3407 				return mlx5_tx_packet_multi_send
3408 							(txq, loc, olx);
3409 			}
3410 			inlen = MLX5_ESEG_MIN_INLINE_SIZE;
3411 		}
3412 		/*
3413 		 * Now we know the minimal amount of data requested
3414 		 * to be inlined. Check whether we should inline the buffers
3415 		 * from the beginning of the chain to eliminate some mbufs.
3416 		 */
3417 		mbuf = loc->mbuf;
3418 		nxlen = rte_pktmbuf_data_len(mbuf);
3419 		if (unlikely(nxlen <= txq->inlen_send)) {
3420 			/* We can inline first mbuf at least. */
3421 			if (nxlen < inlen) {
3422 				unsigned int smlen;
3423 
3424 				/* Scan mbufs till inlen filled. */
3425 				do {
3426 					smlen = nxlen;
3427 					mbuf = NEXT(mbuf);
3428 					MLX5_ASSERT(mbuf);
3429 					nxlen = rte_pktmbuf_data_len(mbuf);
3430 					nxlen += smlen;
3431 				} while (unlikely(nxlen < inlen));
3432 				if (unlikely(nxlen > txq->inlen_send)) {
3433 					/* We cannot inline entire mbuf. */
3434 					smlen = inlen - smlen;
3435 					start = rte_pktmbuf_mtod_offset
3436 						    (mbuf, uintptr_t, smlen);
3437 					goto do_align;
3438 				}
3439 			}
3440 			do {
3441 				inlen = nxlen;
3442 				mbuf = NEXT(mbuf);
3443 				/* The end of the packet chain must not be reached here. */
3444 				MLX5_ASSERT(mbuf);
3445 				nxlen = inlen + rte_pktmbuf_data_len(mbuf);
3446 			} while (unlikely(nxlen < txq->inlen_send));
3447 		}
3448 		start = rte_pktmbuf_mtod(mbuf, uintptr_t);
3449 		/*
3450 		 * Check whether we can extend the inlined data to align the
3451 		 * start address of the data buffer to a cacheline.
3452 		 */
3453 do_align:
3454 		start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1);
3455 		if (unlikely(start)) {
3456 			start += inlen;
3457 			if (start <= txq->inlen_send)
3458 				inlen = start;
3459 		}
3460 	}
3461 	/*
3462 	 * Check whether there are enough free WQEBBs:
3463 	 * - Control Segment
3464 	 * - Ethernet Segment
3465 	 * - First Segment of inlined Ethernet data
3466 	 * - ... data continued ...
3467 	 * - Data Segments of pointer/min inline type
3468 	 *
3469 	 * Estimate the number of Data Segments conservatively,
3470 	 * supposing that no mbufs are freed during inlining.
3471 	 */
3472 	MLX5_ASSERT(inlen <= txq->inlen_send);
3473 	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
3474 				       MLX5_ESEG_MIN_INLINE_SIZE +
3475 				       MLX5_WSEG_SIZE +
3476 				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3477 	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
3478 		return MLX5_TXCMP_CODE_EXIT;
3479 	/* Check for maximal WQE size. */
3480 	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
3481 		return MLX5_TXCMP_CODE_ERROR;
3482 #ifdef MLX5_PMD_SOFT_COUNTERS
3483 	/* Update sent data bytes/packets counters. */
3484 	txq->stats.obytes += dlen + vlan;
3485 #endif
3486 	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3487 	loc->wqe_last = wqe;
3488 	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx);
3489 	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx);
3490 	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
3491 	txq->wqe_ci += (ds + 3) / 4;
3492 	loc->wqe_free -= (ds + 3) / 4;
3493 	return MLX5_TXCMP_CODE_MULTI;
3494 }
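
/*
 * Editorial sketch, not part of the driver: the conservative Data
 * Segment estimate used above, assuming MLX5_ESEG_MIN_INLINE_SIZE is
 * 18 bytes (Ethernet plus VLAN header, see the static_assert in
 * mlx5_tx_burst_single_send() below). For a two-segment packet with
 * inlen = 66 bytes to be inlined:
 *
 *	ds = NB_SEGS + 2 + (66 - 18 + 16 + 15) / 16
 *	   = 2 + 2 + 4 = 8 segments,
 *
 * which occupies (8 + 3) / 4 = 2 WQEBBs. The estimate keeps one
 * pointer Data Segment per mbuf even if some mbufs end up fully
 * inlined and freed during the WQE build.
 */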
3495 
3496 /**
3497  * Tx burst function for multi-segment packets. Supports all
3498  * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs,
3499  * sends one packet per WQE. Function stops sending if it
3500  * encounters the single-segment packet.
3501  *
3502  * This routine is responsible for storing processed mbuf
3503  * into the elts ring buffer and updating elts_head.
3504  *
3505  * @param txq
3506  *   Pointer to TX queue structure.
3507  * @param[in] pkts
3508  *   Packets to transmit.
3509  * @param pkts_n
3510  *   Number of packets in array.
3511  * @param loc
3512  *   Pointer to burst routine local context.
3513  * @param olx
3514  *   Configured Tx offloads mask. It is fully defined at
3515  *   compile time and may be used for optimization.
3516  *
3517  * @return
3518  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3519  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
3520  *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
3521  *   MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered.
3522  * Local context variables updated.
3523  */
3524 static __rte_always_inline enum mlx5_txcmp_code
3525 mlx5_tx_burst_mseg(struct mlx5_txq_data *__rte_restrict txq,
3526 		   struct rte_mbuf **__rte_restrict pkts,
3527 		   unsigned int pkts_n,
3528 		   struct mlx5_txq_local *__rte_restrict loc,
3529 		   unsigned int olx)
3530 {
3531 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3532 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
3533 	pkts += loc->pkts_sent + 1;
3534 	pkts_n -= loc->pkts_sent;
3535 	for (;;) {
3536 		enum mlx5_txcmp_code ret;
3537 
3538 		MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
3539 		/*
3540 		 * Estimate the number of free elts quickly but
3541 		 * conservatively. Some segments may be fully inlined
3542 		 * and freed, ignore this here - precise estimation
3543 		 * is costly.
3544 		 */
3545 		if (loc->elts_free < NB_SEGS(loc->mbuf))
3546 			return MLX5_TXCMP_CODE_EXIT;
3547 		if (MLX5_TXOFF_CONFIG(TSO) &&
3548 		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) {
3549 			/* Proceed with multi-segment TSO. */
3550 			ret = mlx5_tx_packet_multi_tso(txq, loc, olx);
3551 		} else if (MLX5_TXOFF_CONFIG(INLINE)) {
3552 			/* Proceed with multi-segment SEND with inlining. */
3553 			ret = mlx5_tx_packet_multi_inline(txq, loc, olx);
3554 		} else {
3555 			/* Proceed with multi-segment SEND w/o inlining. */
3556 			ret = mlx5_tx_packet_multi_send(txq, loc, olx);
3557 		}
3558 		if (ret == MLX5_TXCMP_CODE_EXIT)
3559 			return MLX5_TXCMP_CODE_EXIT;
3560 		if (ret == MLX5_TXCMP_CODE_ERROR)
3561 			return MLX5_TXCMP_CODE_ERROR;
3562 		/* WQE is built, go to the next packet. */
3563 		++loc->pkts_sent;
3564 		--pkts_n;
3565 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
3566 			return MLX5_TXCMP_CODE_EXIT;
3567 		loc->mbuf = *pkts++;
3568 		if (pkts_n > 1)
3569 			rte_prefetch0(*pkts);
3570 		if (likely(NB_SEGS(loc->mbuf) > 1))
3571 			continue;
3572 		/* Here ends the series of multi-segment packets. */
3573 		if (MLX5_TXOFF_CONFIG(TSO) &&
3574 		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
3575 			return MLX5_TXCMP_CODE_TSO;
3576 		return MLX5_TXCMP_CODE_SINGLE;
3577 	}
3578 	MLX5_ASSERT(false);
3579 }
3580 
3581 /**
3582  * Tx burst function for single-segment packets with TSO.
3583  * Supports all types of Tx offloads, except multi-packets.
3584  * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE.
3585  * Function stops sending if it encounters the multi-segment
3586  * packet or packet without TSO requested.
3587  *
3588  * The routine is responsible for storing processed mbuf
3589  * into the elts ring buffer and updating elts_head if inline
3590  * offload is requested, due to the possible early freeing
3591  * of the inlined mbufs (the pkts array can not be stored in elts
3592  * as a batch).
3593  *
3594  * @param txq
3595  *   Pointer to TX queue structure.
3596  * @param[in] pkts
3597  *   Packets to transmit.
3598  * @param pkts_n
3599  *   Number of packets in array.
3600  * @param loc
3601  *   Pointer to burst routine local context.
3602  * @param olx
3603  *   Configured Tx offloads mask. It is fully defined at
3604  *   compile time and may be used for optimization.
3605  *
3606  * @return
3607  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3608  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
3609  *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
3610  *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
3611  * Local context variables updated.
3612  */
3613 static __rte_always_inline enum mlx5_txcmp_code
3614 mlx5_tx_burst_tso(struct mlx5_txq_data *__rte_restrict txq,
3615 		  struct rte_mbuf **__rte_restrict pkts,
3616 		  unsigned int pkts_n,
3617 		  struct mlx5_txq_local *__rte_restrict loc,
3618 		  unsigned int olx)
3619 {
3620 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3621 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
3622 	pkts += loc->pkts_sent + 1;
3623 	pkts_n -= loc->pkts_sent;
3624 	for (;;) {
3625 		struct mlx5_wqe_dseg *__rte_restrict dseg;
3626 		struct mlx5_wqe *__rte_restrict wqe;
3627 		unsigned int ds, dlen, hlen, ntcp, vlan = 0;
3628 		uint8_t *dptr;
3629 
3630 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
3631 		if (MLX5_TXOFF_CONFIG(TXPP)) {
3632 			enum mlx5_txcmp_code wret;
3633 
3634 			/* Generate WAIT for scheduling if requested. */
3635 			wret = mlx5_tx_schedule_send(txq, loc, olx);
3636 			if (wret == MLX5_TXCMP_CODE_EXIT)
3637 				return MLX5_TXCMP_CODE_EXIT;
3638 			if (wret == MLX5_TXCMP_CODE_ERROR)
3639 				return MLX5_TXCMP_CODE_ERROR;
3640 		}
3641 		dlen = rte_pktmbuf_data_len(loc->mbuf);
3642 		if (MLX5_TXOFF_CONFIG(VLAN) &&
3643 		    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
3644 			vlan = sizeof(struct rte_vlan_hdr);
3645 		}
3646 		/*
3647 		 * First calculate the WQE size to check
3648 		 * whether we have enough space in ring buffer.
3649 		 */
3650 		hlen = loc->mbuf->l2_len + vlan +
3651 		       loc->mbuf->l3_len + loc->mbuf->l4_len;
3652 		if (unlikely((!hlen || !loc->mbuf->tso_segsz)))
3653 			return MLX5_TXCMP_CODE_ERROR;
3654 		if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK)
3655 			hlen += loc->mbuf->outer_l2_len +
3656 				loc->mbuf->outer_l3_len;
3657 		/* Segment must contain all TSO headers. */
3658 		if (unlikely(hlen > MLX5_MAX_TSO_HEADER ||
3659 			     hlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
3660 			     hlen > (dlen + vlan)))
3661 			return MLX5_TXCMP_CODE_ERROR;
3662 		/*
3663 		 * Check whether there are enough free WQEBBs:
3664 		 * - Control Segment
3665 		 * - Ethernet Segment
3666 		 * - First Segment of inlined Ethernet data
3667 		 * - ... data continued ...
3668 		 * - Finishing Data Segment of pointer type
3669 		 */
3670 		ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE +
3671 			  MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3672 		if (loc->wqe_free < ((ds + 3) / 4))
3673 			return MLX5_TXCMP_CODE_EXIT;
3674 #ifdef MLX5_PMD_SOFT_COUNTERS
3675 		/* Update sent data bytes/packets counters. */
3676 		ntcp = (dlen + vlan - hlen +
3677 			loc->mbuf->tso_segsz - 1) /
3678 			loc->mbuf->tso_segsz;
3679 		/*
3680 		 * One will be added for mbuf itself at the end
3681 		 * of the mlx5_tx_burst from loc->pkts_sent field.
3682 		 */
3683 		--ntcp;
3684 		txq->stats.opackets += ntcp;
3685 		txq->stats.obytes += dlen + vlan + ntcp * hlen;
3686 #endif
3687 		/*
3688 		 * Build the TSO WQE:
3689 		 * - Control Segment
3690 		 * - Ethernet Segment with hlen bytes inlined
3691 		 * - Data Segment of pointer type
3692 		 */
3693 		wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3694 		loc->wqe_last = wqe;
3695 		mlx5_tx_cseg_init(txq, loc, wqe, ds,
3696 				  MLX5_OPCODE_TSO, olx);
3697 		dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx);
3698 		dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan;
3699 		dlen -= hlen - vlan;
3700 		mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
3701 		/*
3702 		 * WQE is built, update the loop parameters
3703 		 * and go to the next packet.
3704 		 */
3705 		txq->wqe_ci += (ds + 3) / 4;
3706 		loc->wqe_free -= (ds + 3) / 4;
3707 		if (MLX5_TXOFF_CONFIG(INLINE))
3708 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
3709 		--loc->elts_free;
3710 		++loc->pkts_sent;
3711 		--pkts_n;
3712 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
3713 			return MLX5_TXCMP_CODE_EXIT;
3714 		loc->mbuf = *pkts++;
3715 		if (pkts_n > 1)
3716 			rte_prefetch0(*pkts);
3717 		if (MLX5_TXOFF_CONFIG(MULTI) &&
3718 		    unlikely(NB_SEGS(loc->mbuf) > 1))
3719 			return MLX5_TXCMP_CODE_MULTI;
3720 		if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)))
3721 			return MLX5_TXCMP_CODE_SINGLE;
3722 		/* Continue with the next TSO packet. */
3723 	}
3724 	MLX5_ASSERT(false);
3725 }
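
/*
 * Editorial sketch, not part of the driver: the TSO statistics math
 * used above, for an assumed packet with hlen = 54 (14B Ethernet +
 * 20B IPv4 + 20B TCP), no VLAN, 2920 bytes of TCP payload
 * (dlen = 2974) and tso_segsz = 1460:
 *
 *	ntcp = (2974 + 0 - 54 + 1460 - 1) / 1460 = 2 wire segments
 *
 * One segment is counted later via loc->pkts_sent, so after --ntcp the
 * counters get opackets += 1 and obytes += 2974 + 1 * 54 = 3028, which
 * equals the two 1514-byte frames actually emitted by the hardware.
 */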
3726 
3727 /**
3728  * Analyze the packet and select the best method to send.
3729  *
3730  * @param txq
3731  *   Pointer to TX queue structure.
3732  * @param loc
3733  *   Pointer to burst routine local context.
3734  * @param olx
3735  *   Configured Tx offloads mask. It is fully defined at
3736  *   compile time and may be used for optimization.
3737  * @param newp
3738  *   The predefined flag whether to do the complete check for
3739  *   multi-segment packets and TSO.
3740  *
3741  * @return
3742  *  MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
3743  *  MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO.
3744  *  MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND.
3745  *  MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
3746  */
3747 static __rte_always_inline enum mlx5_txcmp_code
3748 mlx5_tx_able_to_empw(struct mlx5_txq_data *__rte_restrict txq,
3749 		     struct mlx5_txq_local *__rte_restrict loc,
3750 		     unsigned int olx,
3751 		     bool newp)
3752 {
3753 	/* Check for multi-segment packet. */
3754 	if (newp &&
3755 	    MLX5_TXOFF_CONFIG(MULTI) &&
3756 	    unlikely(NB_SEGS(loc->mbuf) > 1))
3757 		return MLX5_TXCMP_CODE_MULTI;
3758 	/* Check for TSO packet. */
3759 	if (newp &&
3760 	    MLX5_TXOFF_CONFIG(TSO) &&
3761 	    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
3762 		return MLX5_TXCMP_CODE_TSO;
3763 	/* Check if eMPW is enabled at all. */
3764 	if (!MLX5_TXOFF_CONFIG(EMPW))
3765 		return MLX5_TXCMP_CODE_SINGLE;
3766 	/* Check if eMPW can be engaged. */
3767 	if (MLX5_TXOFF_CONFIG(VLAN) &&
3768 	    unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) &&
3769 		(!MLX5_TXOFF_CONFIG(INLINE) ||
3770 		 unlikely((rte_pktmbuf_data_len(loc->mbuf) +
3771 			   sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) {
3772 		/*
3773 		 * eMPW does not support VLAN insertion offload,
3774 		 * we would have to inline the entire packet, but the
3775 		 * packet is too long for inlining.
3776 		 */
3777 		return MLX5_TXCMP_CODE_SINGLE;
3778 	}
3779 	return MLX5_TXCMP_CODE_EMPW;
3780 }
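
/*
 * Editorial note: the classification above is ordered as multi-segment,
 * then TSO, then eMPW availability, then the non-inlinable VLAN case,
 * so only packets passing all checks are batched with eMPW. A minimal
 * usage sketch mirroring the callers below:
 *
 *	ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
 *	if (ret != MLX5_TXCMP_CODE_EMPW)
 *		return ret;
 */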
3781 
3782 /**
3783  * Check the next packet attributes to match with the eMPW batch ones.
3784  * In addition, for legacy MPW the packet length is checked as well.
3785  *
3786  * @param txq
3787  *   Pointer to TX queue structure.
3788  * @param es
3789  *   Pointer to Ethernet Segment of eMPW batch.
3790  * @param loc
3791  *   Pointer to burst routine local context.
3792  * @param dlen
3793  *   Length of previous packet in MPW descriptor.
3794  * @param olx
3795  *   Configured Tx offloads mask. It is fully defined at
3796  *   compile time and may be used for optimization.
3797  *
3798  * @return
3799  *  true - packet match with eMPW batch attributes.
3800  *  false - no match, eMPW should be restarted.
3801  */
3802 static __rte_always_inline bool
3803 mlx5_tx_match_empw(struct mlx5_txq_data *__rte_restrict txq,
3804 		   struct mlx5_wqe_eseg *__rte_restrict es,
3805 		   struct mlx5_txq_local *__rte_restrict loc,
3806 		   uint32_t dlen,
3807 		   unsigned int olx)
3808 {
3809 	uint8_t swp_flags = 0;
3810 
3811 	/* Compare the checksum flags, if any. */
3812 	if (MLX5_TXOFF_CONFIG(CSUM) &&
3813 	    txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
3814 		return false;
3815 	/* Compare the Software Parser offsets and flags. */
3816 	if (MLX5_TXOFF_CONFIG(SWP) &&
3817 	    (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
3818 	     es->swp_flags != swp_flags))
3819 		return false;
3820 	/* Fill metadata field if needed. */
3821 	if (MLX5_TXOFF_CONFIG(METADATA) &&
3822 		es->metadata != (loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
3823 				 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0))
3824 		return false;
3825 	/* Legacy MPW can send packets with the same length only. */
3826 	if (MLX5_TXOFF_CONFIG(MPW) &&
3827 	    dlen != rte_pktmbuf_data_len(loc->mbuf))
3828 		return false;
3829 	/* There must be no VLAN packets in eMPW loop. */
3830 	if (MLX5_TXOFF_CONFIG(VLAN))
3831 		MLX5_ASSERT(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
3832 	/* Check if the scheduling is requested. */
3833 	if (MLX5_TXOFF_CONFIG(TXPP) &&
3834 	    loc->mbuf->ol_flags & txq->ts_mask)
3835 		return false;
3836 	return true;
3837 }
3838 
3839 /**
3840  * Update send loop variables and WQE for eMPW loop
3841  * without data inlining. Number of Data Segments is
3842  * equal to the number of sent packets.
3843  *
3844  * @param txq
3845  *   Pointer to TX queue structure.
3846  * @param loc
3847  *   Pointer to burst routine local context.
3848  * @param ds
3849  *   Number of packets/Data Segments.
3850  * @param slen
3851  *   Accumulated statistics, bytes sent
3852  * @param olx
3853  *   Configured Tx offloads mask. It is fully defined at
3854  *   compile time and may be used for optimization.
3855  *
3856  * The routine does not return a value, the WQE Control Segment
3857  * and the send loop local variables are updated in place.
3858  *
3859  */
3860 static __rte_always_inline void
3861 mlx5_tx_sdone_empw(struct mlx5_txq_data *__rte_restrict txq,
3862 		   struct mlx5_txq_local *__rte_restrict loc,
3863 		   unsigned int ds,
3864 		   unsigned int slen,
3865 		   unsigned int olx __rte_unused)
3866 {
3867 	MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
3868 #ifdef MLX5_PMD_SOFT_COUNTERS
3869 	/* Update sent data bytes counter. */
3870 	 txq->stats.obytes += slen;
3871 #else
3872 	(void)slen;
3873 #endif
3874 	loc->elts_free -= ds;
3875 	loc->pkts_sent += ds;
3876 	ds += 2;
3877 	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
3878 	txq->wqe_ci += (ds + 3) / 4;
3879 	loc->wqe_free -= (ds + 3) / 4;
3880 }
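
/*
 * Editorial sketch, not part of the driver: the sq_ds word written
 * above combines the SQ number (txq->qp_num_8s, already shifted as the
 * field name suggests) with the total count of 16-byte segments:
 *
 *	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
 *
 * where ds is the number of packets (one pointer Data Segment each)
 * plus 2 for the title Control and Ethernet Segments.
 */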
3881 
3882 /**
3883  * Update send loop variables and WQE for eMPW loop
3884  * with data inlining. Receives the total size of the pushed
3885  * descriptors and data as the len parameter.
3886  *
3887  * @param txq
3888  *   Pointer to TX queue structure.
3889  * @param loc
3890  *   Pointer to burst routine local context.
3891  * @param len
3892  *   Total size of descriptor/data in bytes.
3893  * @param slen
3894  *   Accumulated statistics, data bytes sent.
3895  * @param wqem
3896  *   The base WQE for the eMPW/MPW descriptor.
3897  * @param olx
3898  *   Configured Tx offloads mask. It is fully defined at
3899  *   compile time and may be used for optimization.
3900  *
3901  * The routine does not return a value, the WQE size fields and
3902  * the send loop local variables are updated in place.
3903  *
3904  */
3905 static __rte_always_inline void
3906 mlx5_tx_idone_empw(struct mlx5_txq_data *__rte_restrict txq,
3907 		   struct mlx5_txq_local *__rte_restrict loc,
3908 		   unsigned int len,
3909 		   unsigned int slen,
3910 		   struct mlx5_wqe *__rte_restrict wqem,
3911 		   unsigned int olx __rte_unused)
3912 {
3913 	struct mlx5_wqe_dseg *dseg = &wqem->dseg[0];
3914 
3915 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
3916 #ifdef MLX5_PMD_SOFT_COUNTERS
3917 	/* Update sent data bytes counter. */
3918 	 txq->stats.obytes += slen;
3919 #else
3920 	(void)slen;
3921 #endif
3922 	if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
3923 		/*
3924 		 * If the legacy MPW session contains the inline packets
3925 		 * we should set the only inline data segment length
3926 		 * and align the total length to the segment size.
3927 		 */
3928 		MLX5_ASSERT(len > sizeof(dseg->bcount));
3929 		dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
3930 						MLX5_ETH_WQE_DATA_INLINE);
3931 		len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
3932 	} else {
3933 		/*
3934 		 * The session is not legacy MPW or contains the
3935 		 * data buffer pointer segments.
3936 		 */
3937 		MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
3938 		len = len / MLX5_WSEG_SIZE + 2;
3939 	}
3940 	wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
3941 	txq->wqe_ci += (len + 3) / 4;
3942 	loc->wqe_free -= (len + 3) / 4;
3943 	loc->wqe_last = wqem;
3944 }
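
/*
 * Editorial sketch, not part of the driver: the legacy MPW inline
 * branch above. Assuming two 60-byte packets inlined in one legacy MPW
 * session, len covers the 4-byte bcount header plus the data:
 *
 *	len = 4 + 2 * 60 = 124 bytes
 *	ds  = (124 + 16 - 1) / 16 + 2 = 10 segments
 *
 * so the session occupies (10 + 3) / 4 = 3 WQEBBs and bcount is set to
 * 120 | MLX5_ETH_WQE_DATA_INLINE. The non-MPW branch only converts the
 * already 16-byte aligned len into segments and adds the same 2 for the
 * Control and Ethernet Segments.
 */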
3945 
3946 /**
3947  * The set of Tx burst functions for single-segment packets
3948  * without TSO and with Multi-Packet Writing feature support.
3949  * Supports all types of Tx offloads, except multi-packets
3950  * and TSO.
3951  *
3952  * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
3953  * as many packets per WQE as it can. If eMPW is not configured
3954  * or the packet can not be sent with eMPW (VLAN insertion) the
3955  * ordinary SEND opcode is used and only one packet is placed
3956  * in the WQE.
3957  *
3958  * The functions stop sending if they encounter a multi-segment
3959  * packet or a packet with TSO requested.
3960  *
3961  * The routines are responsible for storing processed mbuf
3962  * into the elts ring buffer and updating elts_head if inlining
3963  * offload is requested. Otherwise the copying of mbufs to elts
3964  * can be postponed and completed at the end of the burst routine.
3965  *
3966  * @param txq
3967  *   Pointer to TX queue structure.
3968  * @param[in] pkts
3969  *   Packets to transmit.
3970  * @param pkts_n
3971  *   Number of packets in array.
3972  * @param loc
3973  *   Pointer to burst routine local context.
3974  * @param olx
3975  *   Configured Tx offloads mask. It is fully defined at
3976  *   compile time and may be used for optimization.
3977  *
3978  * @return
3979  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3980  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
3981  *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
3982  *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
3983  *   MLX5_TXCMP_CODE_SINGLE - used inside functions set.
3984  *   MLX5_TXCMP_CODE_EMPW - used inside functions set.
3985  *
3986  * Local context variables updated.
3987  *
3988  *
3989  * The routine sends packets with MLX5_OPCODE_EMPW
3990  * without inlining, this is dedicated optimized branch.
3991  * No VLAN insertion is supported.
3992  */
3993 static __rte_always_inline enum mlx5_txcmp_code
3994 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *__rte_restrict txq,
3995 			  struct rte_mbuf **__rte_restrict pkts,
3996 			  unsigned int pkts_n,
3997 			  struct mlx5_txq_local *__rte_restrict loc,
3998 			  unsigned int olx)
3999 {
4000 	/*
4001 	 * Subroutine is the part of mlx5_tx_burst_single()
4002 	 * and sends single-segment packet with eMPW opcode
4003 	 * without data inlining.
4004 	 */
4005 	MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
4006 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
4007 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
4008 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
4009 	static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
4010 	pkts += loc->pkts_sent + 1;
4011 	pkts_n -= loc->pkts_sent;
4012 	for (;;) {
4013 		struct mlx5_wqe_dseg *__rte_restrict dseg;
4014 		struct mlx5_wqe_eseg *__rte_restrict eseg;
4015 		enum mlx5_txcmp_code ret;
4016 		unsigned int part, loop;
4017 		unsigned int slen = 0;
4018 
4019 next_empw:
4020 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
4021 		if (MLX5_TXOFF_CONFIG(TXPP)) {
4022 			enum mlx5_txcmp_code wret;
4023 
4024 			/* Generate WAIT for scheduling if requested. */
4025 			wret = mlx5_tx_schedule_send(txq, loc, olx);
4026 			if (wret == MLX5_TXCMP_CODE_EXIT)
4027 				return MLX5_TXCMP_CODE_EXIT;
4028 			if (wret == MLX5_TXCMP_CODE_ERROR)
4029 				return MLX5_TXCMP_CODE_ERROR;
4030 		}
4031 		part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
4032 				       MLX5_MPW_MAX_PACKETS :
4033 				       MLX5_EMPW_MAX_PACKETS);
4034 		if (unlikely(loc->elts_free < part)) {
4035 			/* We do not have enough elts to store all mbufs. */
4036 			if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
4037 				return MLX5_TXCMP_CODE_EXIT;
4038 			/* But we are still able to send at least a minimal eMPW. */
4039 			part = loc->elts_free;
4040 		}
4041 		/* Check whether we have enough WQEs */
4042 		if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) {
4043 			if (unlikely(loc->wqe_free <
4044 				((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
4045 				return MLX5_TXCMP_CODE_EXIT;
4046 			part = (loc->wqe_free * 4) - 2;
4047 		}
4048 		if (likely(part > 1))
4049 			rte_prefetch0(*pkts);
4050 		loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4051 		/*
4052 		 * Build eMPW title WQEBB:
4053 		 * - Control Segment, eMPW opcode
4054 		 * - Ethernet Segment, no inline
4055 		 */
4056 		mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2,
4057 				  MLX5_OPCODE_ENHANCED_MPSW, olx);
4058 		mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
4059 				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
4060 		eseg = &loc->wqe_last->eseg;
4061 		dseg = &loc->wqe_last->dseg[0];
4062 		loop = part;
4063 		/* Store the packet length for legacy MPW. */
4064 		if (MLX5_TXOFF_CONFIG(MPW))
4065 			eseg->mss = rte_cpu_to_be_16
4066 					(rte_pktmbuf_data_len(loc->mbuf));
4067 		for (;;) {
4068 			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
4069 #ifdef MLX5_PMD_SOFT_COUNTERS
4070 			/* Update sent data bytes counter. */
4071 			slen += dlen;
4072 #endif
4073 			mlx5_tx_dseg_ptr
4074 				(txq, loc, dseg,
4075 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
4076 				 dlen, olx);
4077 			if (unlikely(--loop == 0))
4078 				break;
4079 			loc->mbuf = *pkts++;
4080 			if (likely(loop > 1))
4081 				rte_prefetch0(*pkts);
4082 			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
4083 			/*
4084 			 * Unroll the completion code to avoid
4085 			 * returning a variable value - it results in
4086 			 * unoptimized sequential checking in the caller.
4087 			 */
4088 			if (ret == MLX5_TXCMP_CODE_MULTI) {
4089 				part -= loop;
4090 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
4091 				if (unlikely(!loc->elts_free ||
4092 					     !loc->wqe_free))
4093 					return MLX5_TXCMP_CODE_EXIT;
4094 				return MLX5_TXCMP_CODE_MULTI;
4095 			}
4096 			MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
4097 			if (ret == MLX5_TXCMP_CODE_TSO) {
4098 				part -= loop;
4099 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
4100 				if (unlikely(!loc->elts_free ||
4101 					     !loc->wqe_free))
4102 					return MLX5_TXCMP_CODE_EXIT;
4103 				return MLX5_TXCMP_CODE_TSO;
4104 			}
4105 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
4106 				part -= loop;
4107 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
4108 				if (unlikely(!loc->elts_free ||
4109 					     !loc->wqe_free))
4110 					return MLX5_TXCMP_CODE_EXIT;
4111 				return MLX5_TXCMP_CODE_SINGLE;
4112 			}
4113 			if (ret != MLX5_TXCMP_CODE_EMPW) {
4114 				MLX5_ASSERT(false);
4115 				part -= loop;
4116 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
4117 				return MLX5_TXCMP_CODE_ERROR;
4118 			}
4119 			/*
4120 			 * Check whether packet parameters coincide
4121 			 * within assumed eMPW batch:
4122 			 * - check sum settings
4123 			 * - metadata value
4124 			 * - software parser settings
4125 			 * - packets length (legacy MPW only)
4126 			 * - scheduling is not required
4127 			 */
4128 			if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) {
4129 				MLX5_ASSERT(loop);
4130 				part -= loop;
4131 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
4132 				if (unlikely(!loc->elts_free ||
4133 					     !loc->wqe_free))
4134 					return MLX5_TXCMP_CODE_EXIT;
4135 				pkts_n -= part;
4136 				goto next_empw;
4137 			}
4138 			/* Packet attributes match, continue the same eMPW. */
4139 			++dseg;
4140 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
4141 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
4142 		}
4143 		/* eMPW is built successfully, update loop parameters. */
4144 		MLX5_ASSERT(!loop);
4145 		MLX5_ASSERT(pkts_n >= part);
4146 #ifdef MLX5_PMD_SOFT_COUNTERS
4147 		/* Update sent data bytes counter. */
4148 		txq->stats.obytes += slen;
4149 #endif
4150 		loc->elts_free -= part;
4151 		loc->pkts_sent += part;
4152 		txq->wqe_ci += (2 + part + 3) / 4;
4153 		loc->wqe_free -= (2 + part + 3) / 4;
4154 		pkts_n -= part;
4155 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
4156 			return MLX5_TXCMP_CODE_EXIT;
4157 		loc->mbuf = *pkts++;
4158 		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
4159 		if (unlikely(ret != MLX5_TXCMP_CODE_EMPW))
4160 			return ret;
4161 		/* Continue sending eMPW batches. */
4162 	}
4163 	MLX5_ASSERT(false);
4164 }
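
/*
 * Editorial note: in the routine above "part" is the planned number of
 * packets for the current eMPW and "loop" counts down the remaining
 * ones, so every early-exit path closes the session with the Data
 * Segments actually built:
 *
 *	part -= loop;
 *	mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
 *
 * mlx5_tx_burst_empw_inline() below follows the same pattern, but there
 * "part" and "room" are measured in WQE bytes rather than in packets.
 */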
4165 
4166 /**
4167  * The routine sends packets with MLX5_OPCODE_EMPW
4168  * with inlining, optionally supports VLAN insertion.
4169  */
4170 static __rte_always_inline enum mlx5_txcmp_code
4171 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *__rte_restrict txq,
4172 			  struct rte_mbuf **__rte_restrict pkts,
4173 			  unsigned int pkts_n,
4174 			  struct mlx5_txq_local *__rte_restrict loc,
4175 			  unsigned int olx)
4176 {
4177 	/*
4178 	 * Subroutine is the part of mlx5_tx_burst_single()
4179 	 * and sends single-segment packet with eMPW opcode
4180 	 * with data inlining.
4181 	 */
4182 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
4183 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
4184 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
4185 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
4186 	static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
4187 	pkts += loc->pkts_sent + 1;
4188 	pkts_n -= loc->pkts_sent;
4189 	for (;;) {
4190 		struct mlx5_wqe_dseg *__rte_restrict dseg;
4191 		struct mlx5_wqe *__rte_restrict wqem;
4192 		enum mlx5_txcmp_code ret;
4193 		unsigned int room, part, nlim;
4194 		unsigned int slen = 0;
4195 
4196 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
4197 		if (MLX5_TXOFF_CONFIG(TXPP)) {
4198 			enum mlx5_txcmp_code wret;
4199 
4200 			/* Generate WAIT for scheduling if requested. */
4201 			wret = mlx5_tx_schedule_send(txq, loc, olx);
4202 			if (wret == MLX5_TXCMP_CODE_EXIT)
4203 				return MLX5_TXCMP_CODE_EXIT;
4204 			if (wret == MLX5_TXCMP_CODE_ERROR)
4205 				return MLX5_TXCMP_CODE_ERROR;
4206 		}
4207 		/*
4208 		 * Limit the number of packets in one WQE
4209 		 * to improve the latency of CQE generation.
4210 		 */
4211 		nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
4212 				       MLX5_MPW_INLINE_MAX_PACKETS :
4213 				       MLX5_EMPW_MAX_PACKETS);
4214 		/* Check whether we have the minimal amount of WQEs. */
4215 		if (unlikely(loc->wqe_free <
4216 			    ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
4217 			return MLX5_TXCMP_CODE_EXIT;
4218 		if (likely(pkts_n > 1))
4219 			rte_prefetch0(*pkts);
4220 		wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4221 		/*
4222 		 * Build eMPW title WQEBB:
4223 		 * - Control Segment, eMPW opcode, zero DS
4224 		 * - Ethernet Segment, no inline
4225 		 */
4226 		mlx5_tx_cseg_init(txq, loc, wqem, 0,
4227 				  MLX5_OPCODE_ENHANCED_MPSW, olx);
4228 		mlx5_tx_eseg_none(txq, loc, wqem,
4229 				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
4230 		dseg = &wqem->dseg[0];
4231 		/* Store the packet length for legacy MPW. */
4232 		if (MLX5_TXOFF_CONFIG(MPW))
4233 			wqem->eseg.mss = rte_cpu_to_be_16
4234 					 (rte_pktmbuf_data_len(loc->mbuf));
4235 		room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE,
4236 			       loc->wqe_free) * MLX5_WQE_SIZE -
4237 					MLX5_WQE_CSEG_SIZE -
4238 					MLX5_WQE_ESEG_SIZE;
4239 		/* Limit the room for legacy MPW sessions for performance. */
4240 		if (MLX5_TXOFF_CONFIG(MPW))
4241 			room = RTE_MIN(room,
4242 				       RTE_MAX(txq->inlen_empw +
4243 					       sizeof(dseg->bcount) +
4244 					       (MLX5_TXOFF_CONFIG(VLAN) ?
4245 					       sizeof(struct rte_vlan_hdr) : 0),
4246 					       MLX5_MPW_INLINE_MAX_PACKETS *
4247 					       MLX5_WQE_DSEG_SIZE));
4248 		/* Build WQE till we have space, packets and resources. */
4249 		part = room;
4250 		for (;;) {
4251 			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
4252 			uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
4253 			unsigned int tlen;
4254 
4255 			MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
4256 			MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0);
4257 			MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end);
4258 			/*
4259 			 * Some Tx offloads may cause an error if
4260 			 * packet is not long enough, check against
4261 			 * assumed minimal length.
4262 			 */
4263 			if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
4264 				part -= room;
4265 				if (unlikely(!part))
4266 					return MLX5_TXCMP_CODE_ERROR;
4267 				/*
4268 				 * We have some successfully built
4269 				 * packet Data Segments to send.
4270 				 */
4271 				mlx5_tx_idone_empw(txq, loc, part,
4272 						   slen, wqem, olx);
4273 				return MLX5_TXCMP_CODE_ERROR;
4274 			}
4275 			/* Inline or not inline - that's the Question. */
4276 			if (dlen > txq->inlen_empw ||
4277 			    loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE)
4278 				goto pointer_empw;
4279 			if (MLX5_TXOFF_CONFIG(MPW)) {
4280 				if (dlen > txq->inlen_send)
4281 					goto pointer_empw;
4282 				tlen = dlen;
4283 				if (part == room) {
4284 					/* Open new inline MPW session. */
4285 					tlen += sizeof(dseg->bcount);
4286 					dseg->bcount = RTE_BE32(0);
4287 					dseg = RTE_PTR_ADD
4288 						(dseg, sizeof(dseg->bcount));
4289 				} else {
4290 					/*
4291 					 * No pointer and inline descriptor
4292 					 * intermix for legacy MPW sessions.
4293 					 */
4294 					if (wqem->dseg[0].bcount)
4295 						break;
4296 				}
4297 			} else {
4298 				tlen = sizeof(dseg->bcount) + dlen;
4299 			}
4300 			/* Inline entire packet, optional VLAN insertion. */
4301 			if (MLX5_TXOFF_CONFIG(VLAN) &&
4302 			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
4303 				/*
4304 				 * The packet length must be checked in
4305 				 * mlx5_tx_able_to_empw() and the packet is
4306 				 * guaranteed to fit into the inline length.
4307 				 */
4308 				MLX5_ASSERT((dlen +
4309 					     sizeof(struct rte_vlan_hdr)) <=
4310 					    txq->inlen_empw);
4311 				tlen += sizeof(struct rte_vlan_hdr);
4312 				if (room < tlen)
4313 					break;
4314 				dseg = mlx5_tx_dseg_vlan(txq, loc, dseg,
4315 							 dptr, dlen, olx);
4316 #ifdef MLX5_PMD_SOFT_COUNTERS
4317 				/* Update sent data bytes counter. */
4318 				slen +=	sizeof(struct rte_vlan_hdr);
4319 #endif
4320 			} else {
4321 				if (room < tlen)
4322 					break;
4323 				dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
4324 							 dptr, dlen, olx);
4325 			}
4326 			if (!MLX5_TXOFF_CONFIG(MPW))
4327 				tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
4328 			MLX5_ASSERT(room >= tlen);
4329 			room -= tlen;
4330 			/*
4331 			 * Packet data are completely inlined,
4332 			 * free the packet immediately.
4333 			 */
4334 			rte_pktmbuf_free_seg(loc->mbuf);
4335 			goto next_mbuf;
4336 pointer_empw:
4337 			/*
4338 			 * No pointer and inline descriptor
4339 			 * intermix for legacy MPW sessions.
4340 			 */
4341 			if (MLX5_TXOFF_CONFIG(MPW) &&
4342 			    part != room &&
4343 			    wqem->dseg[0].bcount == RTE_BE32(0))
4344 				break;
4345 			/*
4346 			 * Not inlinable VLAN packets are
4347 			 * processed outside of this routine.
4348 			 */
4349 			MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
4350 			if (MLX5_TXOFF_CONFIG(VLAN))
4351 				MLX5_ASSERT(!(loc->mbuf->ol_flags &
4352 					    PKT_TX_VLAN_PKT));
4353 			mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
4354 			/* We have to store mbuf in elts.*/
4355 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
4356 			room -= MLX5_WQE_DSEG_SIZE;
4357 			/* Ring buffer wraparound is checked at the loop end.*/
4358 			++dseg;
4359 next_mbuf:
4360 #ifdef MLX5_PMD_SOFT_COUNTERS
4361 			/* Update sent data bytes counter. */
4362 			slen += dlen;
4363 #endif
4364 			loc->pkts_sent++;
4365 			loc->elts_free--;
4366 			pkts_n--;
4367 			if (unlikely(!pkts_n || !loc->elts_free)) {
4368 				/*
4369 				 * We have no resources/packets to
4370 				 * continue building descriptors.
4371 				 */
4372 				part -= room;
4373 				mlx5_tx_idone_empw(txq, loc, part,
4374 						   slen, wqem, olx);
4375 				return MLX5_TXCMP_CODE_EXIT;
4376 			}
4377 			loc->mbuf = *pkts++;
4378 			if (likely(pkts_n > 1))
4379 				rte_prefetch0(*pkts);
4380 			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
4381 			/*
4382 			 * Unroll the completion code to avoid
4383 			 * returning a variable value - it results in
4384 			 * unoptimized sequential checking in the caller.
4385 			 */
4386 			if (ret == MLX5_TXCMP_CODE_MULTI) {
4387 				part -= room;
4388 				mlx5_tx_idone_empw(txq, loc, part,
4389 						   slen, wqem, olx);
4390 				if (unlikely(!loc->elts_free ||
4391 					     !loc->wqe_free))
4392 					return MLX5_TXCMP_CODE_EXIT;
4393 				return MLX5_TXCMP_CODE_MULTI;
4394 			}
4395 			MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
4396 			if (ret == MLX5_TXCMP_CODE_TSO) {
4397 				part -= room;
4398 				mlx5_tx_idone_empw(txq, loc, part,
4399 						   slen, wqem, olx);
4400 				if (unlikely(!loc->elts_free ||
4401 					     !loc->wqe_free))
4402 					return MLX5_TXCMP_CODE_EXIT;
4403 				return MLX5_TXCMP_CODE_TSO;
4404 			}
4405 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
4406 				part -= room;
4407 				mlx5_tx_idone_empw(txq, loc, part,
4408 						   slen, wqem, olx);
4409 				if (unlikely(!loc->elts_free ||
4410 					     !loc->wqe_free))
4411 					return MLX5_TXCMP_CODE_EXIT;
4412 				return MLX5_TXCMP_CODE_SINGLE;
4413 			}
4414 			if (ret != MLX5_TXCMP_CODE_EMPW) {
4415 				MLX5_ASSERT(false);
4416 				part -= room;
4417 				mlx5_tx_idone_empw(txq, loc, part,
4418 						   slen, wqem, olx);
4419 				return MLX5_TXCMP_CODE_ERROR;
4420 			}
4421 			/* Check if we have minimal room left. */
4422 			nlim--;
4423 			if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE))
4424 				break;
4425 			/*
4426 			 * Check whether packet parameters coincide
4427 			 * within assumed eMPW batch:
4428 			 * - check sum settings
4429 			 * - metadata value
4430 			 * - software parser settings
4431 			 * - packets length (legacy MPW only)
4432 			 * - scheduling is not required
4433 			 */
4434 			if (!mlx5_tx_match_empw(txq, &wqem->eseg,
4435 						loc, dlen, olx))
4436 				break;
4437 			/* Packet attributes match, continue the same eMPW. */
4438 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
4439 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
4440 		}
4441 		/*
4442 		 * We get here to close an existing eMPW
4443 		 * session and start the new one.
4444 		 */
4445 		MLX5_ASSERT(pkts_n);
4446 		part -= room;
4447 		if (unlikely(!part))
4448 			return MLX5_TXCMP_CODE_EXIT;
4449 		mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx);
4450 		if (unlikely(!loc->elts_free ||
4451 			     !loc->wqe_free))
4452 			return MLX5_TXCMP_CODE_EXIT;
4453 		/* Continue the loop with new eMPW session. */
4454 	}
4455 	MLX5_ASSERT(false);
4456 }
4457 
4458 /**
4459  * The routine sends packets with ordinary MLX5_OPCODE_SEND.
4460  * Data inlining and VLAN insertion are supported.
4461  */
4462 static __rte_always_inline enum mlx5_txcmp_code
4463 mlx5_tx_burst_single_send(struct mlx5_txq_data *__rte_restrict txq,
4464 			  struct rte_mbuf **__rte_restrict pkts,
4465 			  unsigned int pkts_n,
4466 			  struct mlx5_txq_local *__rte_restrict loc,
4467 			  unsigned int olx)
4468 {
4469 	/*
4470 	 * Subroutine is the part of mlx5_tx_burst_single()
4471 	 * and sends single-segment packet with SEND opcode.
4472 	 */
4473 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
4474 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
4475 	pkts += loc->pkts_sent + 1;
4476 	pkts_n -= loc->pkts_sent;
4477 	for (;;) {
4478 		struct mlx5_wqe *__rte_restrict wqe;
4479 		enum mlx5_txcmp_code ret;
4480 
4481 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
4482 		if (MLX5_TXOFF_CONFIG(TXPP)) {
4483 			enum mlx5_txcmp_code wret;
4484 
4485 			/* Generate WAIT for scheduling if requested. */
4486 			wret = mlx5_tx_schedule_send(txq, loc, olx);
4487 			if (wret == MLX5_TXCMP_CODE_EXIT)
4488 				return MLX5_TXCMP_CODE_EXIT;
4489 			if (wret == MLX5_TXCMP_CODE_ERROR)
4490 				return MLX5_TXCMP_CODE_ERROR;
4491 		}
4492 		if (MLX5_TXOFF_CONFIG(INLINE)) {
4493 			unsigned int inlen, vlan = 0;
4494 
4495 			inlen = rte_pktmbuf_data_len(loc->mbuf);
4496 			if (MLX5_TXOFF_CONFIG(VLAN) &&
4497 			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
4498 				vlan = sizeof(struct rte_vlan_hdr);
4499 				inlen += vlan;
4500 				static_assert((sizeof(struct rte_vlan_hdr) +
4501 					       sizeof(struct rte_ether_hdr)) ==
4502 					       MLX5_ESEG_MIN_INLINE_SIZE,
4503 					       "invalid min inline data size");
4504 			}
4505 			/*
4506 			 * If inlining is enabled at configuration time
4507 			 * the limit must not be less than the minimal size.
4508 			 * Otherwise we would need an extra check for the data
4509 			 * size to avoid crashes due to length overflow.
4510 			 */
4511 			MLX5_ASSERT(txq->inlen_send >=
4512 				    MLX5_ESEG_MIN_INLINE_SIZE);
4513 			if (inlen <= txq->inlen_send) {
4514 				unsigned int seg_n, wqe_n;
4515 
4516 				rte_prefetch0(rte_pktmbuf_mtod
4517 						(loc->mbuf, uint8_t *));
4518 				/* Check against minimal length. */
4519 				if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
4520 					return MLX5_TXCMP_CODE_ERROR;
4521 				if (loc->mbuf->ol_flags &
4522 				    PKT_TX_DYNF_NOINLINE) {
4523 					/*
4524 					 * The hint flag not to inline packet
4525 					 * data is set. Check whether we can
4526 					 * follow the hint.
4527 					 */
4528 					if ((!MLX5_TXOFF_CONFIG(EMPW) &&
4529 					      txq->inlen_mode) ||
4530 					    (MLX5_TXOFF_CONFIG(MPW) &&
4531 					     txq->inlen_mode)) {
4532 						if (inlen <= txq->inlen_send)
4533 							goto single_inline;
4534 						/*
4535 						 * The hardware requires the
4536 						 * minimal inline data header.
4537 						 */
4538 						goto single_min_inline;
4539 					}
4540 					if (MLX5_TXOFF_CONFIG(VLAN) &&
4541 					    vlan && !txq->vlan_en) {
4542 						/*
4543 						 * We must insert VLAN tag
4544 						 * by software means.
4545 						 */
4546 						goto single_part_inline;
4547 					}
4548 					goto single_no_inline;
4549 				}
4550 single_inline:
4551 				/*
4552 				 * Completely inlined packet data WQE:
4553 				 * - Control Segment, SEND opcode
4554 				 * - Ethernet Segment, no VLAN insertion
4555 				 * - Data inlined, VLAN optionally inserted
4556 				 * - Alignment to MLX5_WSEG_SIZE
4557 				 * Have to estimate amount of WQEBBs
4558 				 */
4559 				seg_n = (inlen + 3 * MLX5_WSEG_SIZE -
4560 					 MLX5_ESEG_MIN_INLINE_SIZE +
4561 					 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
4562 				/* Check if there are enough WQEBBs. */
4563 				wqe_n = (seg_n + 3) / 4;
4564 				if (wqe_n > loc->wqe_free)
4565 					return MLX5_TXCMP_CODE_EXIT;
4566 				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4567 				loc->wqe_last = wqe;
4568 				mlx5_tx_cseg_init(txq, loc, wqe, seg_n,
4569 						  MLX5_OPCODE_SEND, olx);
4570 				mlx5_tx_eseg_data(txq, loc, wqe,
4571 						  vlan, inlen, 0, olx);
4572 				txq->wqe_ci += wqe_n;
4573 				loc->wqe_free -= wqe_n;
4574 				/*
4575 				 * Packet data are completely inlined,
4576 				 * free the packet immediately.
4577 				 */
4578 				rte_pktmbuf_free_seg(loc->mbuf);
4579 			} else if ((!MLX5_TXOFF_CONFIG(EMPW) ||
4580 				     MLX5_TXOFF_CONFIG(MPW)) &&
4581 					txq->inlen_mode) {
4582 				/*
4583 				 * If minimal inlining is requested the eMPW
4584 				 * feature should be disabled because the data is
4585 				 * inlined into the Ethernet Segment, which can
4586 				 * not carry inlined data for eMPW since the
4587 				 * segment is shared by all packets.
4588 				 */
4589 				struct mlx5_wqe_dseg *__rte_restrict dseg;
4590 				unsigned int ds;
4591 				uint8_t *dptr;
4592 
4593 				/*
4594 				 * The inline-mode settings require inlining
4595 				 * the specified amount of data bytes into
4596 				 * the Ethernet Segment. We should check the
4597 				 * free space in the WQE ring buffer to
4598 				 * inline the data partially.
4599 				 */
4600 single_min_inline:
4601 				MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode);
4602 				MLX5_ASSERT(inlen > txq->inlen_mode);
4603 				MLX5_ASSERT(txq->inlen_mode >=
4604 					    MLX5_ESEG_MIN_INLINE_SIZE);
4605 				/*
4606 				 * Check whether there are enough free WQEBBs:
4607 				 * - Control Segment
4608 				 * - Ethernet Segment
4609 				 * - First Segment of inlined Ethernet data
4610 				 * - ... data continued ...
4611 				 * - Finishing Data Segment of pointer type
4612 				 */
4613 				ds = (MLX5_WQE_CSEG_SIZE +
4614 				      MLX5_WQE_ESEG_SIZE +
4615 				      MLX5_WQE_DSEG_SIZE +
4616 				      txq->inlen_mode -
4617 				      MLX5_ESEG_MIN_INLINE_SIZE +
4618 				      MLX5_WQE_DSEG_SIZE +
4619 				      MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
4620 				if (loc->wqe_free < ((ds + 3) / 4))
4621 					return MLX5_TXCMP_CODE_EXIT;
4622 				/*
4623 				 * Build the ordinary SEND WQE:
4624 				 * - Control Segment
4625 				 * - Ethernet Segment, inline inlen_mode bytes
4626 				 * - Data Segment of pointer type
4627 				 */
4628 				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4629 				loc->wqe_last = wqe;
4630 				mlx5_tx_cseg_init(txq, loc, wqe, ds,
4631 						  MLX5_OPCODE_SEND, olx);
4632 				dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan,
4633 							 txq->inlen_mode,
4634 							 0, olx);
4635 				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
4636 				       txq->inlen_mode - vlan;
4637 				inlen -= txq->inlen_mode;
4638 				mlx5_tx_dseg_ptr(txq, loc, dseg,
4639 						 dptr, inlen, olx);
4640 				/*
4641 				 * WQE is built, update the loop parameters
4642 				 * and go to the next packet.
4643 				 */
4644 				txq->wqe_ci += (ds + 3) / 4;
4645 				loc->wqe_free -= (ds + 3) / 4;
4646 				/* We have to store mbuf in elts.*/
4647 				MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
4648 				txq->elts[txq->elts_head++ & txq->elts_m] =
4649 						loc->mbuf;
4650 				--loc->elts_free;
4651 			} else {
4652 				uint8_t *dptr;
4653 				unsigned int dlen;
4654 
4655 				/*
4656 				 * some space in the title WQEBB, so we can fill it
4657 				 * with some packet data. It takes one WQEBB, which
4658 				 * is available, so no extra space check is needed:
4659 				 * it is available, no extra space check:
4660 				 * - Control Segment, SEND opcode
4661 				 * - Ethernet Segment, no VLAN insertion
4662 				 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data
4663 				 * - Data Segment, pointer type
4664 				 *
4665 				 * We also get here if VLAN insertion is not
4666 				 * supported by HW and inlining is enabled.
4667 				 */
4668 single_part_inline:
4669 				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4670 				loc->wqe_last = wqe;
4671 				mlx5_tx_cseg_init(txq, loc, wqe, 4,
4672 						  MLX5_OPCODE_SEND, olx);
4673 				mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx);
4674 				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
4675 				       MLX5_ESEG_MIN_INLINE_SIZE - vlan;
4676 				/*
4677 				 * The length check is performed above, by
4678 				 * comparing with txq->inlen_send. We should
4679 				 * not get overflow here.
4680 				 */
4681 				MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
4682 				dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE;
4683 				mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1],
4684 						 dptr, dlen, olx);
4685 				++txq->wqe_ci;
4686 				--loc->wqe_free;
4687 				/* We have to store mbuf in elts.*/
4688 				MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
4689 				txq->elts[txq->elts_head++ & txq->elts_m] =
4690 						loc->mbuf;
4691 				--loc->elts_free;
4692 			}
4693 #ifdef MLX5_PMD_SOFT_COUNTERS
4694 			/* Update sent data bytes counter. */
4695 			txq->stats.obytes += vlan +
4696 					rte_pktmbuf_data_len(loc->mbuf);
4697 #endif
4698 		} else {
4699 			/*
4700 			 * No inlining at all, which means that saving CPU cycles
4701 			 * is prioritized at configuration time and we should not
4702 			 * copy any packet data to the WQE.
4703 			 *
4704 			 * SEND WQE, one WQEBB:
4705 			 * - Control Segment, SEND opcode
4706 			 * - Ethernet Segment, optional VLAN, no inline
4707 			 * - Data Segment, pointer type
4708 			 */
4709 single_no_inline:
4710 			wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4711 			loc->wqe_last = wqe;
4712 			mlx5_tx_cseg_init(txq, loc, wqe, 3,
4713 					  MLX5_OPCODE_SEND, olx);
4714 			mlx5_tx_eseg_none(txq, loc, wqe, olx);
4715 			mlx5_tx_dseg_ptr
4716 				(txq, loc, &wqe->dseg[0],
4717 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
4718 				 rte_pktmbuf_data_len(loc->mbuf), olx);
4719 			++txq->wqe_ci;
4720 			--loc->wqe_free;
4721 			/*
4722 			 * We should not store mbuf pointer in elts
4723 			 * if no inlining is configured, this is done
4724 			 * by the calling routine in a batch copy.
4725 			 */
4726 			MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
4727 			--loc->elts_free;
4728 #ifdef MLX5_PMD_SOFT_COUNTERS
4729 			/* Update sent data bytes counter. */
4730 			txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf);
4731 			if (MLX5_TXOFF_CONFIG(VLAN) &&
4732 			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
4733 				txq->stats.obytes +=
4734 					sizeof(struct rte_vlan_hdr);
4735 #endif
4736 		}
4737 		++loc->pkts_sent;
4738 		--pkts_n;
4739 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
4740 			return MLX5_TXCMP_CODE_EXIT;
4741 		loc->mbuf = *pkts++;
4742 		if (pkts_n > 1)
4743 			rte_prefetch0(*pkts);
4744 		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
4745 		if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
4746 			return ret;
4747 	}
4748 	MLX5_ASSERT(false);
4749 }
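
/*
 * Editorial sketch, not part of the driver: the inlining decision made
 * above for a single-segment packet, simplified (the PKT_TX_DYNF_NOINLINE
 * hint handling and the eMPW/MPW qualifiers are omitted):
 *
 *	if (!MLX5_TXOFF_CONFIG(INLINE))
 *		goto single_no_inline;
 *	if (inlen <= txq->inlen_send)
 *		goto single_inline;	(fully inlined, mbuf freed)
 *	if (txq->inlen_mode)
 *		goto single_min_inline;	(inlen_mode bytes + pointer)
 *	goto single_part_inline;	(18-byte minimal inline + pointer)
 *
 * When inlining is configured and any part of the packet is referenced
 * by pointer the mbuf is stored in elts; without inlining the mbufs are
 * copied to elts in a batch by the calling routine.
 */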
4750 
4751 static __rte_always_inline enum mlx5_txcmp_code
4752 mlx5_tx_burst_single(struct mlx5_txq_data *__rte_restrict txq,
4753 		     struct rte_mbuf **__rte_restrict pkts,
4754 		     unsigned int pkts_n,
4755 		     struct mlx5_txq_local *__rte_restrict loc,
4756 		     unsigned int olx)
4757 {
4758 	enum mlx5_txcmp_code ret;
4759 
4760 	ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
4761 	if (ret == MLX5_TXCMP_CODE_SINGLE)
4762 		goto ordinary_send;
4763 	MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW);
4764 	for (;;) {
4765 		/* Optimize for inline/no inline eMPW send. */
4766 		ret = (MLX5_TXOFF_CONFIG(INLINE)) ?
4767 			mlx5_tx_burst_empw_inline
4768 				(txq, pkts, pkts_n, loc, olx) :
4769 			mlx5_tx_burst_empw_simple
4770 				(txq, pkts, pkts_n, loc, olx);
4771 		if (ret != MLX5_TXCMP_CODE_SINGLE)
4772 			return ret;
4773 		/* The resources to send one packet should remain. */
4774 		MLX5_ASSERT(loc->elts_free && loc->wqe_free);
4775 ordinary_send:
4776 		ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
4777 		MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE);
4778 		if (ret != MLX5_TXCMP_CODE_EMPW)
4779 			return ret;
4780 		/* The resources to send one packet should remain. */
4781 		MLX5_ASSERT(loc->elts_free && loc->wqe_free);
4782 	}
4783 }
4784 
4785 /**
4786  * DPDK Tx callback template. This is the configured template
4787  * used to generate routines optimized for the specified offload setup.
4788  * One of these generated functions is chosen at SQ configuration
4789  * time.
4790  *
4791  * @param txq
4792  *   Generic pointer to TX queue structure.
4793  * @param[in] pkts
4794  *   Packets to transmit.
4795  * @param pkts_n
4796  *   Number of packets in array.
4797  * @param olx
4798  *   Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
4799  *   values. Should be static to take compile time static configuration
4800  *   advantages.
4801  *
4802  * @return
4803  *   Number of packets successfully transmitted (<= pkts_n).
4804  */
4805 static __rte_always_inline uint16_t
4806 mlx5_tx_burst_tmpl(struct mlx5_txq_data *__rte_restrict txq,
4807 		   struct rte_mbuf **__rte_restrict pkts,
4808 		   uint16_t pkts_n,
4809 		   unsigned int olx)
4810 {
4811 	struct mlx5_txq_local loc;
4812 	enum mlx5_txcmp_code ret;
4813 	unsigned int part;
4814 
4815 	MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
4816 	MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
4817 	if (unlikely(!pkts_n))
4818 		return 0;
4819 	loc.pkts_sent = 0;
4820 	loc.pkts_copy = 0;
4821 	loc.wqe_last = NULL;
4822 
4823 send_loop:
4824 	loc.pkts_loop = loc.pkts_sent;
4825 	/*
4826 	 * Check if there are some CQEs, if any:
4827 	 * - process the encountered errors
4828 	 * - process the completed WQEs
4829 	 * - free related mbufs
4830 	 * - doorbell the NIC about processed CQEs
4831 	 */
4832 	rte_prefetch0(*(pkts + loc.pkts_sent));
4833 	mlx5_tx_handle_completion(txq, olx);
4834 	/*
4835 	 * Calculate the number of available resources - elts and WQEs.
4836 	 * There are two possible different scenarios:
4837 	 * - no data inlining into WQEs, one WQEBB may contain up to
4838 	 *   four packets, in this case elts become the scarce resource
4839 	 * - data inlining into WQEs, one packet may require multiple
4840 	 *   WQEBBs, the WQEs become the limiting factor.
4841 	 */
4842 	MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
4843 	loc.elts_free = txq->elts_s -
4844 				(uint16_t)(txq->elts_head - txq->elts_tail);
4845 	MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
4846 	loc.wqe_free = txq->wqe_s -
4847 				(uint16_t)(txq->wqe_ci - txq->wqe_pi);
4848 	if (unlikely(!loc.elts_free || !loc.wqe_free))
4849 		goto burst_exit;
4850 	for (;;) {
4851 		/*
4852 		 * Fetch the packet from array. Usually this is
4853 		 * the first packet in series of multi/single
4854 		 * segment packets.
4855 		 */
4856 		loc.mbuf = *(pkts + loc.pkts_sent);
4857 		/* Dedicated branch for multi-segment packets. */
4858 		if (MLX5_TXOFF_CONFIG(MULTI) &&
4859 		    unlikely(NB_SEGS(loc.mbuf) > 1)) {
4860 			/*
4861 			 * Multi-segment packet encountered.
4862 			 * Hardware is able to process it only
4863 			 * with SEND/TSO opcodes, one packet
4864 			 * per WQE, do it in dedicated routine.
4865 			 */
4866 enter_send_multi:
4867 			MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy);
4868 			part = loc.pkts_sent - loc.pkts_copy;
4869 			if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
4870 				/*
4871 				 * There are some single-segment mbufs not
4872 				 * stored in elts. The mbufs must be in the
4873 				 * same order as WQEs, so we must copy the
4874 				 * mbufs to elts here, before the mbufs of the
4875 				 * coming multi-segment packet are appended.
4876 				 */
4877 				mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy,
4878 						  part, olx);
4879 				loc.pkts_copy = loc.pkts_sent;
4880 			}
4881 			MLX5_ASSERT(pkts_n > loc.pkts_sent);
4882 			ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
4883 			if (!MLX5_TXOFF_CONFIG(INLINE))
4884 				loc.pkts_copy = loc.pkts_sent;
4885 			/*
4886 			 * These returned code checks are supposed
4887 			 * to be optimized out due to routine inlining.
4888 			 */
4889 			if (ret == MLX5_TXCMP_CODE_EXIT) {
4890 				/*
4891 				 * The routine returns this code when
4892 				 * all packets are sent or there are not
4893 				 * enough resources to complete the request.
4894 				 */
4895 				break;
4896 			}
4897 			if (ret == MLX5_TXCMP_CODE_ERROR) {
4898 				/*
4899 				 * The routine returns this code when
4900 				 * some error occurred in the format of
4901 				 * the incoming packets.
4902 				 */
4903 				txq->stats.oerrors++;
4904 				break;
4905 			}
4906 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
4907 				/*
4908 				 * The single-segment packet was encountered
4909 				 * in the array, try to send it with the
4910 				 * best optimized way, possible engaging eMPW.
4911 				 */
4912 				goto enter_send_single;
4913 			}
4914 			if (MLX5_TXOFF_CONFIG(TSO) &&
4915 			    ret == MLX5_TXCMP_CODE_TSO) {
4916 				/*
4917 				 * The single-segment TSO packet was
4918 				 * encountered in the array.
4919 				 */
4920 				goto enter_send_tso;
4921 			}
4922 			/* We must not get here. Something went wrong. */
4923 			MLX5_ASSERT(false);
4924 			txq->stats.oerrors++;
4925 			break;
4926 		}
4927 		/* Dedicated branch for single-segment TSO packets. */
4928 		if (MLX5_TXOFF_CONFIG(TSO) &&
4929 		    unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) {
4930 			/*
4931 			 * TSO might require a special way of inlining
4932 			 * (dedicated parameters) and is sent with the
4933 			 * MLX5_OPCODE_TSO opcode only, so handle it
4934 			 * in a dedicated branch.
4935 			 */
4936 enter_send_tso:
4937 			MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1);
4938 			MLX5_ASSERT(pkts_n > loc.pkts_sent);
4939 			ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
4940 			/*
4941 			 * These return code checks are supposed
4942 			 * to be optimized out due to routine inlining.
4943 			 */
4944 			if (ret == MLX5_TXCMP_CODE_EXIT)
4945 				break;
4946 			if (ret == MLX5_TXCMP_CODE_ERROR) {
4947 				txq->stats.oerrors++;
4948 				break;
4949 			}
4950 			if (ret == MLX5_TXCMP_CODE_SINGLE)
4951 				goto enter_send_single;
4952 			if (MLX5_TXOFF_CONFIG(MULTI) &&
4953 			    ret == MLX5_TXCMP_CODE_MULTI) {
4954 				/*
4955 				 * The multi-segment packet was
4956 				 * encountered in the array.
4957 				 */
4958 				goto enter_send_multi;
4959 			}
4960 			/* We must not get here. Something went wrong. */
4961 			MLX5_ASSERT(false);
4962 			txq->stats.oerrors++;
4963 			break;
4964 		}
4965 		/*
4966 		 * The dedicated branch for single-segment packets
4967 		 * without TSO. Often these can be sent using
4968 		 * MLX5_OPCODE_EMPW with multiple packets in one WQE.
4969 		 * The routine builds the WQEs until it encounters
4970 		 * a TSO or multi-segment packet (if these offloads
4971 		 * are requested at SQ configuration time).
4972 		 */
4973 enter_send_single:
4974 		MLX5_ASSERT(pkts_n > loc.pkts_sent);
4975 		ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
4976 		/*
4977 		 * These return code checks are supposed
4978 		 * to be optimized out due to routine inlining.
4979 		 */
4980 		if (ret == MLX5_TXCMP_CODE_EXIT)
4981 			break;
4982 		if (ret == MLX5_TXCMP_CODE_ERROR) {
4983 			txq->stats.oerrors++;
4984 			break;
4985 		}
4986 		if (MLX5_TXOFF_CONFIG(MULTI) &&
4987 		    ret == MLX5_TXCMP_CODE_MULTI) {
4988 			/*
4989 			 * The multi-segment packet was
4990 			 * encountered in the array.
4991 			 */
4992 			goto enter_send_multi;
4993 		}
4994 		if (MLX5_TXOFF_CONFIG(TSO) &&
4995 		    ret == MLX5_TXCMP_CODE_TSO) {
4996 			/*
4997 			 * The single-segment TSO packet was
4998 			 * encountered in the array.
4999 			 */
5000 			goto enter_send_tso;
5001 		}
5002 		/* We must not get here. Something went wrong. */
5003 		MLX5_ASSERT(false);
5004 		txq->stats.oerrors++;
5005 		break;
5006 	}
5007 	/*
5008 	 * The main Tx loop is completed, do the rest:
5009 	 * - set the completion request if thresholds are reached
5010 	 * - ring the hardware doorbell
5011 	 * - copy the rest of the mbufs to elts (if any)
5012 	 */
5013 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) ||
5014 		    loc.pkts_sent >= loc.pkts_copy);
5015 	/* Take a shortcut if nothing was sent. */
5016 	if (unlikely(loc.pkts_sent == loc.pkts_loop))
5017 		goto burst_exit;
5018 	/* Request CQE generation if limits are reached. */
5019 	mlx5_tx_request_completion(txq, &loc, olx);
5020 	/*
5021 	 * Ring the QP doorbell immediately after WQE building completion
5022 	 * to improve latencies. The pure software-related data treatment
5023 	 * can be completed after the doorbell. Tx CQEs for this SQ are
5024 	 * processed in this thread only by polling.
5025 	 *
5026 	 * The rdma core library can map the doorbell register in two ways,
5027 	 * depending on the environment variable "MLX5_SHUT_UP_BF":
5028 	 *
5029 	 * - as regular cached memory, when the variable is either missing
5030 	 *   or set to zero. This type of mapping may cause significant
5031 	 *   doorbell register writing latency and requires an explicit
5032 	 *   memory write barrier to mitigate this issue and prevent
5033 	 *   write combining.
5034 	 *
5035 	 * - as non-cached memory, when the variable is present and set to
5036 	 *   a non-zero value. This type of mapping may cause a performance
5037 	 *   impact under heavy load conditions, but the explicit write
5038 	 *   memory barrier is not required and it may improve core
5039 	 *   performance.
5040 	 *
5041 	 * - the legacy behaviour (prior to the 19.08 release) was to use
5042 	 *   heuristics to decide whether the write memory barrier should
5043 	 *   be performed. This behaviour is selected by specifying
5044 	 *   tx_db_nc=2; the write barrier is skipped if the application
5045 	 *   provides the full recommended burst of packets, assuming
5046 	 *   the next packets are coming and the write barrier will be
5047 	 *   issued on the next burst (after descriptor writing, at
5048 	 *   least).
5049 	 */
5050 	mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc &&
5051 			(!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST));
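	/*
	 * Descriptive note on the condition above (assuming txq->db_nc and
	 * txq->db_heu carry the decoded tx_db_nc setting): the write
	 * barrier is requested only for the cached doorbell mapping
	 * (!txq->db_nc). In the heuristic mode (txq->db_heu) it is issued
	 * only if the application did not provide a full recommended burst,
	 * i.e. pkts_n is not a multiple of MLX5_TX_DEFAULT_BURST; otherwise
	 * the barrier is deferred to the next burst as described above.
	 */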
5052 	/* Not all of the mbufs may be stored into elts yet. */
5053 	part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy;
5054 	if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
5055 		/*
5056 		 * There are some single-segment mbufs not stored in elts.
5057 		 * This can happen only if the last packet was single-segment.
5058 		 * The copying is gathered into one place because it is
5059 		 * a good opportunity to optimize it with SIMD.
5060 		 * Unfortunately, if inlining is enabled, gaps in the
5061 		 * pointer array may appear due to early freeing of the
5062 		 * inlined mbufs.
5063 		 */
5064 		mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
5065 		loc.pkts_copy = loc.pkts_sent;
5066 	}
5067 	MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
5068 	MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
5069 	if (pkts_n > loc.pkts_sent) {
5070 		/*
5071 		 * If the burst size is large there might be not enough CQEs
5072 		 * fetched from the completion queue and not enough resources
5073 		 * freed to send all the packets.
5074 		 */
5075 		goto send_loop;
5076 	}
5077 burst_exit:
5078 #ifdef MLX5_PMD_SOFT_COUNTERS
5079 	/* Increment sent packets counter. */
5080 	txq->stats.opackets += loc.pkts_sent;
5081 #endif
5082 	return loc.pkts_sent;
5083 }
5084 
5085 /* Generate routines with Enhanced Multi-Packet Write support. */
5086 MLX5_TXOFF_DECL(full_empw,
5087 		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW)
5088 
5089 MLX5_TXOFF_DECL(none_empw,
5090 		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW)
5091 
5092 MLX5_TXOFF_DECL(md_empw,
5093 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5094 
5095 MLX5_TXOFF_DECL(mt_empw,
5096 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5097 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5098 
5099 MLX5_TXOFF_DECL(mtsc_empw,
5100 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5101 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5102 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5103 
5104 MLX5_TXOFF_DECL(mti_empw,
5105 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5106 		MLX5_TXOFF_CONFIG_INLINE |
5107 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5108 
5109 MLX5_TXOFF_DECL(mtv_empw,
5110 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5111 		MLX5_TXOFF_CONFIG_VLAN |
5112 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5113 
5114 MLX5_TXOFF_DECL(mtiv_empw,
5115 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5116 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5117 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5118 
5119 MLX5_TXOFF_DECL(sc_empw,
5120 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5121 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5122 
5123 MLX5_TXOFF_DECL(sci_empw,
5124 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5125 		MLX5_TXOFF_CONFIG_INLINE |
5126 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5127 
5128 MLX5_TXOFF_DECL(scv_empw,
5129 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5130 		MLX5_TXOFF_CONFIG_VLAN |
5131 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5132 
5133 MLX5_TXOFF_DECL(sciv_empw,
5134 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5135 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5136 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5137 
5138 MLX5_TXOFF_DECL(i_empw,
5139 		MLX5_TXOFF_CONFIG_INLINE |
5140 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5141 
5142 MLX5_TXOFF_DECL(v_empw,
5143 		MLX5_TXOFF_CONFIG_VLAN |
5144 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5145 
5146 MLX5_TXOFF_DECL(iv_empw,
5147 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5148 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5149 
5150 /* Generate routines without Enhanced Multi-Packet Write support. */
5151 MLX5_TXOFF_DECL(full,
5152 		MLX5_TXOFF_CONFIG_FULL)
5153 
5154 MLX5_TXOFF_DECL(none,
5155 		MLX5_TXOFF_CONFIG_NONE)
5156 
5157 MLX5_TXOFF_DECL(md,
5158 		MLX5_TXOFF_CONFIG_METADATA)
5159 
5160 MLX5_TXOFF_DECL(mt,
5161 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5162 		MLX5_TXOFF_CONFIG_METADATA)
5163 
5164 MLX5_TXOFF_DECL(mtsc,
5165 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5166 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5167 		MLX5_TXOFF_CONFIG_METADATA)
5168 
5169 MLX5_TXOFF_DECL(mti,
5170 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5171 		MLX5_TXOFF_CONFIG_INLINE |
5172 		MLX5_TXOFF_CONFIG_METADATA)
5173 
5174 
5175 MLX5_TXOFF_DECL(mtv,
5176 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5177 		MLX5_TXOFF_CONFIG_VLAN |
5178 		MLX5_TXOFF_CONFIG_METADATA)
5179 
5180 
5181 MLX5_TXOFF_DECL(mtiv,
5182 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5183 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5184 		MLX5_TXOFF_CONFIG_METADATA)
5185 
5186 MLX5_TXOFF_DECL(sc,
5187 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5188 		MLX5_TXOFF_CONFIG_METADATA)
5189 
5190 MLX5_TXOFF_DECL(sci,
5191 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5192 		MLX5_TXOFF_CONFIG_INLINE |
5193 		MLX5_TXOFF_CONFIG_METADATA)
5194 
5195 
5196 MLX5_TXOFF_DECL(scv,
5197 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5198 		MLX5_TXOFF_CONFIG_VLAN |
5199 		MLX5_TXOFF_CONFIG_METADATA)
5200 
5201 
5202 MLX5_TXOFF_DECL(sciv,
5203 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5204 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5205 		MLX5_TXOFF_CONFIG_METADATA)
5206 
5207 MLX5_TXOFF_DECL(i,
5208 		MLX5_TXOFF_CONFIG_INLINE |
5209 		MLX5_TXOFF_CONFIG_METADATA)
5210 
5211 MLX5_TXOFF_DECL(v,
5212 		MLX5_TXOFF_CONFIG_VLAN |
5213 		MLX5_TXOFF_CONFIG_METADATA)
5214 
5215 MLX5_TXOFF_DECL(iv,
5216 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5217 		MLX5_TXOFF_CONFIG_METADATA)
5218 
5219 /* Generate routines with timestamp scheduling. */
5220 MLX5_TXOFF_DECL(full_ts_nompw,
5221 		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP)
5222 
5223 MLX5_TXOFF_DECL(full_ts_nompwi,
5224 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5225 		MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
5226 		MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA |
5227 		MLX5_TXOFF_CONFIG_TXPP)
5228 
5229 MLX5_TXOFF_DECL(full_ts,
5230 		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP |
5231 		MLX5_TXOFF_CONFIG_EMPW)
5232 
5233 MLX5_TXOFF_DECL(full_ts_noi,
5234 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5235 		MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
5236 		MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA |
5237 		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
5238 
5239 MLX5_TXOFF_DECL(none_ts,
5240 		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_TXPP |
5241 		MLX5_TXOFF_CONFIG_EMPW)
5242 
5243 MLX5_TXOFF_DECL(mdi_ts,
5244 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
5245 		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
5246 
5247 MLX5_TXOFF_DECL(mti_ts,
5248 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5249 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
5250 		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
5251 
5252 MLX5_TXOFF_DECL(mtiv_ts,
5253 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5254 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5255 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_TXPP |
5256 		MLX5_TXOFF_CONFIG_EMPW)
5257 
5258 /*
5259  * Generate routines with Legacy Multi-Packet Write support.
5260  * This mode is supported by ConnectX-4 Lx only and imposes
5261  * offload limitations; the following are not supported:
5262  *   - ACL/Flows (metadata become meaningless)
5263  *   - WQE Inline headers
5264  *   - SRIOV (E-Switch offloads)
5265  *   - VLAN insertion
5266  *   - tunnel encapsulation/decapsulation
5267  *   - TSO
5268  */
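/*
 * Descriptive note: consistently with the limitations listed above, none of
 * the *_mpw variants declared below (and registered in txoff_func[]) carry
 * the TSO, SWP, VLAN or METADATA configuration bits.
 */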
5269 MLX5_TXOFF_DECL(none_mpw,
5270 		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW |
5271 		MLX5_TXOFF_CONFIG_MPW)
5272 
5273 MLX5_TXOFF_DECL(mci_mpw,
5274 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM |
5275 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW |
5276 		MLX5_TXOFF_CONFIG_MPW)
5277 
5278 MLX5_TXOFF_DECL(mc_mpw,
5279 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM |
5280 		MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW)
5281 
5282 MLX5_TXOFF_DECL(i_mpw,
5283 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW |
5284 		MLX5_TXOFF_CONFIG_MPW)
5285 
5286 /*
5287  * Array of declared and compiled Tx burst functions and corresponding
5288  * supported offload sets. The array is used to select the Tx burst
5289  * function for the specified offload set at Tx queue configuration time.
5290  */
5291 const struct {
5292 	eth_tx_burst_t func;
5293 	unsigned int olx;
5294 } txoff_func[] = {
5295 MLX5_TXOFF_INFO(full_empw,
5296 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5297 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5298 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5299 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5300 
5301 MLX5_TXOFF_INFO(none_empw,
5302 		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW)
5303 
5304 MLX5_TXOFF_INFO(md_empw,
5305 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5306 
5307 MLX5_TXOFF_INFO(mt_empw,
5308 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5309 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5310 
5311 MLX5_TXOFF_INFO(mtsc_empw,
5312 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5313 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5314 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5315 
5316 MLX5_TXOFF_INFO(mti_empw,
5317 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5318 		MLX5_TXOFF_CONFIG_INLINE |
5319 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5320 
5321 MLX5_TXOFF_INFO(mtv_empw,
5322 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5323 		MLX5_TXOFF_CONFIG_VLAN |
5324 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5325 
5326 MLX5_TXOFF_INFO(mtiv_empw,
5327 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5328 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5329 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5330 
5331 MLX5_TXOFF_INFO(sc_empw,
5332 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5333 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5334 
5335 MLX5_TXOFF_INFO(sci_empw,
5336 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5337 		MLX5_TXOFF_CONFIG_INLINE |
5338 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5339 
5340 MLX5_TXOFF_INFO(scv_empw,
5341 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5342 		MLX5_TXOFF_CONFIG_VLAN |
5343 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5344 
5345 MLX5_TXOFF_INFO(sciv_empw,
5346 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5347 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5348 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5349 
5350 MLX5_TXOFF_INFO(i_empw,
5351 		MLX5_TXOFF_CONFIG_INLINE |
5352 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5353 
5354 MLX5_TXOFF_INFO(v_empw,
5355 		MLX5_TXOFF_CONFIG_VLAN |
5356 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5357 
5358 MLX5_TXOFF_INFO(iv_empw,
5359 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5360 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
5361 
5362 MLX5_TXOFF_INFO(full_ts_nompw,
5363 		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP)
5364 
5365 MLX5_TXOFF_INFO(full_ts_nompwi,
5366 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5367 		MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
5368 		MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA |
5369 		MLX5_TXOFF_CONFIG_TXPP)
5370 
5371 MLX5_TXOFF_INFO(full_ts,
5372 		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP |
5373 		MLX5_TXOFF_CONFIG_EMPW)
5374 
5375 MLX5_TXOFF_INFO(full_ts_noi,
5376 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5377 		MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
5378 		MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA |
5379 		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
5380 
5381 MLX5_TXOFF_INFO(none_ts,
5382 		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_TXPP |
5383 		MLX5_TXOFF_CONFIG_EMPW)
5384 
5385 MLX5_TXOFF_INFO(mdi_ts,
5386 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
5387 		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
5388 
5389 MLX5_TXOFF_INFO(mti_ts,
5390 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5391 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
5392 		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
5393 
5394 MLX5_TXOFF_INFO(mtiv_ts,
5395 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5396 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5397 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_TXPP |
5398 		MLX5_TXOFF_CONFIG_EMPW)
5399 
5400 MLX5_TXOFF_INFO(full,
5401 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5402 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5403 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5404 		MLX5_TXOFF_CONFIG_METADATA)
5405 
5406 MLX5_TXOFF_INFO(none,
5407 		MLX5_TXOFF_CONFIG_NONE)
5408 
5409 MLX5_TXOFF_INFO(md,
5410 		MLX5_TXOFF_CONFIG_METADATA)
5411 
5412 MLX5_TXOFF_INFO(mt,
5413 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5414 		MLX5_TXOFF_CONFIG_METADATA)
5415 
5416 MLX5_TXOFF_INFO(mtsc,
5417 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5418 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5419 		MLX5_TXOFF_CONFIG_METADATA)
5420 
5421 MLX5_TXOFF_INFO(mti,
5422 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5423 		MLX5_TXOFF_CONFIG_INLINE |
5424 		MLX5_TXOFF_CONFIG_METADATA)
5425 
5426 MLX5_TXOFF_INFO(mtv,
5427 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5428 		MLX5_TXOFF_CONFIG_VLAN |
5429 		MLX5_TXOFF_CONFIG_METADATA)
5430 
5431 MLX5_TXOFF_INFO(mtiv,
5432 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
5433 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5434 		MLX5_TXOFF_CONFIG_METADATA)
5435 
5436 MLX5_TXOFF_INFO(sc,
5437 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5438 		MLX5_TXOFF_CONFIG_METADATA)
5439 
5440 MLX5_TXOFF_INFO(sci,
5441 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5442 		MLX5_TXOFF_CONFIG_INLINE |
5443 		MLX5_TXOFF_CONFIG_METADATA)
5444 
5445 MLX5_TXOFF_INFO(scv,
5446 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5447 		MLX5_TXOFF_CONFIG_VLAN |
5448 		MLX5_TXOFF_CONFIG_METADATA)
5449 
5450 MLX5_TXOFF_INFO(sciv,
5451 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
5452 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5453 		MLX5_TXOFF_CONFIG_METADATA)
5454 
5455 MLX5_TXOFF_INFO(i,
5456 		MLX5_TXOFF_CONFIG_INLINE |
5457 		MLX5_TXOFF_CONFIG_METADATA)
5458 
5459 MLX5_TXOFF_INFO(v,
5460 		MLX5_TXOFF_CONFIG_VLAN |
5461 		MLX5_TXOFF_CONFIG_METADATA)
5462 
5463 MLX5_TXOFF_INFO(iv,
5464 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
5465 		MLX5_TXOFF_CONFIG_METADATA)
5466 
5467 MLX5_TXOFF_INFO(none_mpw,
5468 		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW |
5469 		MLX5_TXOFF_CONFIG_MPW)
5470 
5471 MLX5_TXOFF_INFO(mci_mpw,
5472 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM |
5473 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW |
5474 		MLX5_TXOFF_CONFIG_MPW)
5475 
5476 MLX5_TXOFF_INFO(mc_mpw,
5477 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM |
5478 		MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW)
5479 
5480 MLX5_TXOFF_INFO(i_mpw,
5481 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW |
5482 		MLX5_TXOFF_CONFIG_MPW)
5483 };
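
/*
 * Illustrative sketch only - a hypothetical helper that is not used by the
 * PMD. It shows the simplest possible lookup over the table above: an exact
 * match of the requested offload set. The real selection logic in
 * mlx5_select_tx_function() below is more elaborate - it also accepts
 * supersets of the requested offloads and prefers the entry carrying the
 * smallest amount of extra ones.
 */
static __rte_unused eth_tx_burst_t
mlx5_tx_burst_lookup_exact(unsigned int olx)
{
	unsigned int i;

	for (i = 0; i < RTE_DIM(txoff_func); i++)
		if (txoff_func[i].olx == olx)
			return txoff_func[i].func;
	return NULL;
}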
5484 
5485 /**
5486  * Configure the Tx function to use. The routine checks the configured
5487  * Tx offloads for the device and selects the appropriate Tx burst
5488  * routine. There are multiple Tx burst routines compiled from
5489  * the same template, each optimized in the best way for a dedicated
5490  * Tx offload set.
5491  *
5492  * @param dev
5493  *   Pointer to the Ethernet device structure.
5494  *
5495  * @return
5496  *   Pointer to selected Tx burst function.
5497  */
5498 eth_tx_burst_t
5499 mlx5_select_tx_function(struct rte_eth_dev *dev)
5500 {
5501 	struct mlx5_priv *priv = dev->data->dev_private;
5502 	struct mlx5_dev_config *config = &priv->config;
5503 	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
5504 	unsigned int diff = 0, olx = 0, i, m;
5505 
5506 	static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <=
5507 		      MLX5_DSEG_MAX, "invalid WQE max size");
5508 	static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE,
5509 		      "invalid WQE Control Segment size");
5510 	static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE,
5511 		      "invalid WQE Ethernet Segment size");
5512 	static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE,
5513 		      "invalid WQE Data Segment size");
5514 	static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE,
5515 		      "invalid WQE size");
5516 	MLX5_ASSERT(priv);
5517 	if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) {
5518 		/* We should support Multi-Segment Packets. */
5519 		olx |= MLX5_TXOFF_CONFIG_MULTI;
5520 	}
5521 	if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
5522 			   DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
5523 			   DEV_TX_OFFLOAD_GRE_TNL_TSO |
5524 			   DEV_TX_OFFLOAD_IP_TNL_TSO |
5525 			   DEV_TX_OFFLOAD_UDP_TNL_TSO)) {
5526 		/* We should support TCP Send Offload. */
5527 		olx |= MLX5_TXOFF_CONFIG_TSO;
5528 	}
5529 	if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
5530 			   DEV_TX_OFFLOAD_UDP_TNL_TSO |
5531 			   DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
5532 		/* We should support Software Parser for Tunnels. */
5533 		olx |= MLX5_TXOFF_CONFIG_SWP;
5534 	}
5535 	if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
5536 			   DEV_TX_OFFLOAD_UDP_CKSUM |
5537 			   DEV_TX_OFFLOAD_TCP_CKSUM |
5538 			   DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
5539 		/* We should support IP/TCP/UDP Checksums. */
5540 		olx |= MLX5_TXOFF_CONFIG_CSUM;
5541 	}
5542 	if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) {
5543 		/* We should support VLAN insertion. */
5544 		olx |= MLX5_TXOFF_CONFIG_VLAN;
5545 	}
5546 	if (tx_offloads & DEV_TX_OFFLOAD_SEND_ON_TIMESTAMP &&
5547 	    rte_mbuf_dynflag_lookup
5548 			(RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME, NULL) >= 0 &&
5549 	    rte_mbuf_dynfield_lookup
5550 			(RTE_MBUF_DYNFIELD_TIMESTAMP_NAME, NULL) >= 0) {
5551 		/* Offload configured, dynamic entities registered. */
5552 		olx |= MLX5_TXOFF_CONFIG_TXPP;
5553 	}
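	/*
	 * Descriptive note (derived from the checks above): the scheduling
	 * path is taken only if the application has already registered the
	 * dynamic Tx timestamp mbuf field and flag, so that registration
	 * must happen before the Tx burst function is selected; otherwise
	 * the lookups above fail and TXPP stays disabled even though the
	 * offload is requested.
	 */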
5554 	if (priv->txqs_n && (*priv->txqs)[0]) {
5555 		struct mlx5_txq_data *txd = (*priv->txqs)[0];
5556 
5557 		if (txd->inlen_send) {
5558 			/*
5559 			 * Check the data inline requirements. Data inlining
5560 			 * is enabled on a per-device basis, so checking
5561 			 * the first Tx queue only is sufficient.
5562 			 *
5563 			 * If the device does not support VLAN insertion in
5564 			 * the WQE and some queues are requested to perform
5565 			 * VLAN insertion offload, then inlining must be enabled.
5566 			 */
5567 			olx |= MLX5_TXOFF_CONFIG_INLINE;
5568 		}
5569 	}
5570 	if (config->mps == MLX5_MPW_ENHANCED &&
5571 	    config->txq_inline_min <= 0) {
5572 		/*
5573 		 * The NIC supports Enhanced Multi-Packet Write
5574 		 * and does not require minimal inline data.
5575 		 */
5576 		olx |= MLX5_TXOFF_CONFIG_EMPW;
5577 	}
5578 	if (rte_flow_dynf_metadata_avail()) {
5579 		/* We should support Flow metadata. */
5580 		olx |= MLX5_TXOFF_CONFIG_METADATA;
5581 	}
5582 	if (config->mps == MLX5_MPW) {
5583 		/*
5584 		 * The NIC supports Legacy Multi-Packet Write.
5585 		 * The MLX5_TXOFF_CONFIG_MPW controls the
5586 		 * descriptor building method in combination
5587 		 * with MLX5_TXOFF_CONFIG_EMPW.
5588 		 */
5589 		if (!(olx & (MLX5_TXOFF_CONFIG_TSO |
5590 			     MLX5_TXOFF_CONFIG_SWP |
5591 			     MLX5_TXOFF_CONFIG_VLAN |
5592 			     MLX5_TXOFF_CONFIG_METADATA)))
5593 			olx |= MLX5_TXOFF_CONFIG_EMPW |
5594 			       MLX5_TXOFF_CONFIG_MPW;
5595 	}
5596 	/*
5597 	 * Scan the routine table to find the routine that satisfies
5598 	 * the requested offloads with the minimal set of extra ones.
5599 	 */
5600 	m = RTE_DIM(txoff_func);
5601 	for (i = 0; i < RTE_DIM(txoff_func); i++) {
5602 		unsigned int tmp;
5603 
5604 		tmp = txoff_func[i].olx;
5605 		if (tmp == olx) {
5606 			/* Meets the requested offloads exactly. */
5607 			m = i;
5608 			break;
5609 		}
5610 		if ((tmp & olx) != olx) {
5611 			/* Does not meet requested offloads at all. */
5612 			continue;
5613 		}
5614 		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_MPW)
5615 			/* Do not enable legacy MPW if not configured. */
5616 			continue;
5617 		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW)
5618 			/* Do not enable eMPW if not configured. */
5619 			continue;
5620 		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
5621 			/* Do not enable inlining if not configured. */
5622 			continue;
5623 		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_TXPP)
5624 			/* Do not enable scheduling if not configured. */
5625 			continue;
5626 		/*
5627 		 * This routine meets the requirements.
5628 		 * Check whether it has the minimal amount
5629 		 * of not-requested offloads.
5630 		 */
5631 		tmp = __builtin_popcountl(tmp & ~olx);
5632 		if (m >= RTE_DIM(txoff_func) || tmp < diff) {
5633 			/* First or better match, save and continue. */
5634 			m = i;
5635 			diff = tmp;
5636 			continue;
5637 		}
5638 		if (tmp == diff) {
5639 			tmp = txoff_func[i].olx ^ txoff_func[m].olx;
5640 			if (__builtin_ffsl(txoff_func[i].olx & ~tmp) <
5641 			    __builtin_ffsl(txoff_func[m].olx & ~tmp)) {
5642 				/* Prefer the lighter not-requested offload. */
5643 				m = i;
5644 			}
5645 		}
5646 	}
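	/*
	 * Worked example (illustrative only): if olx ends up as
	 * CSUM | METADATA | EMPW, no table entry matches exactly, entries
	 * with INLINE/MPW/TXPP mismatches are skipped, and among the
	 * remaining supersets sc_empw (SWP | CSUM | METADATA | EMPW) wins
	 * over mtsc_empw and scv_empw because it carries the fewest
	 * not-requested offload bits (only SWP).
	 */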
5647 	if (m >= RTE_DIM(txoff_func)) {
5648 		DRV_LOG(DEBUG, "port %u has no selected Tx function"
5649 			       " for requested offloads %04X",
5650 				dev->data->port_id, olx);
5651 		return NULL;
5652 	}
5653 	DRV_LOG(DEBUG, "port %u has selected Tx function"
5654 		       " supporting offloads %04X/%04X",
5655 			dev->data->port_id, olx, txoff_func[m].olx);
5656 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI)
5657 		DRV_LOG(DEBUG, "\tMULTI (multi segment)");
5658 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO)
5659 		DRV_LOG(DEBUG, "\tTSO   (TCP send offload)");
5660 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP)
5661 		DRV_LOG(DEBUG, "\tSWP   (software parser)");
5662 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM)
5663 		DRV_LOG(DEBUG, "\tCSUM  (checksum offload)");
5664 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE)
5665 		DRV_LOG(DEBUG, "\tINLIN (inline data)");
5666 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN)
5667 		DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
5668 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
5669 		DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
5670 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TXPP)
5671 		DRV_LOG(DEBUG, "\tTSTMP (tx Scheduling)");
5672 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) {
5673 		if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW)
5674 			DRV_LOG(DEBUG, "\tMPW   (Legacy MPW)");
5675 		else
5676 			DRV_LOG(DEBUG, "\tEMPW  (Enhanced MPW)");
5677 	}
5678 	return txoff_func[m].func;
5679 }
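
/*
 * Minimal usage sketch (assumed caller context, for illustration only - the
 * actual call site lives in the device start path of the PMD, outside this
 * file):
 *
 *	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
 *	if (dev->tx_pkt_burst == NULL)
 *		DRV_LOG(ERR, "port %u has no suitable Tx function",
 *			dev->data->port_id);
 */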
5680 
5681 /**
5682  * DPDK callback to get the TX queue information
5683  *
5684  * @param dev
5685  *   Pointer to the device structure.
5686  *
5687  * @param tx_queue_id
5688  *   Tx queue identifier.
5689  *
5690  * @param qinfo
5691  *   Pointer to the TX queue information structure.
5692  *
5693  * @return
5694  *   None.
5695  */
5696 
5697 void
5698 mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
5699 		  struct rte_eth_txq_info *qinfo)
5700 {
5701 	struct mlx5_priv *priv = dev->data->dev_private;
5702 	struct mlx5_txq_data *txq = (*priv->txqs)[tx_queue_id];
5703 	struct mlx5_txq_ctrl *txq_ctrl =
5704 			container_of(txq, struct mlx5_txq_ctrl, txq);
5705 
5706 	if (!txq)
5707 		return;
5708 	qinfo->nb_desc = txq->elts_s;
5709 	qinfo->conf.tx_thresh.pthresh = 0;
5710 	qinfo->conf.tx_thresh.hthresh = 0;
5711 	qinfo->conf.tx_thresh.wthresh = 0;
5712 	qinfo->conf.tx_rs_thresh = 0;
5713 	qinfo->conf.tx_free_thresh = 0;
5714 	qinfo->conf.tx_deferred_start = txq_ctrl ? 0 : 1;
5715 	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
5716 }
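
/*
 * Application-side usage sketch (illustrative only; the callback above is
 * reached through the generic ethdev API, assuming port_id and queue_id are
 * valid and the port is driven by this PMD):
 *
 *	struct rte_eth_txq_info qinfo;
 *
 *	if (rte_eth_tx_queue_info_get(port_id, queue_id, &qinfo) == 0)
 *		printf("Tx queue %u: %u descriptors\n",
 *		       queue_id, qinfo.nb_desc);
 */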
5717 
5718 /**
5719  * DPDK callback to get the TX packet burst mode information
5720  *
5721  * @param dev
5722  *   Pointer to the device structure.
5723  *
5724  * @param tx_queue_id
5725  *   Tx queue identifier.
5726  *
5727  * @param mode
5728  *   Pointer to the burst mode information.
5729  *
5730  * @return
5731  *   0 on success, -EINVAL on failure.
5732  */
5733 
5734 int
5735 mlx5_tx_burst_mode_get(struct rte_eth_dev *dev,
5736 		       uint16_t tx_queue_id __rte_unused,
5737 		       struct rte_eth_burst_mode *mode)
5738 {
5739 	eth_tx_burst_t pkt_burst = dev->tx_pkt_burst;
5740 	unsigned int i, olx;
5741 
5742 	for (i = 0; i < RTE_DIM(txoff_func); i++) {
5743 		if (pkt_burst == txoff_func[i].func) {
5744 			olx = txoff_func[i].olx;
5745 			snprintf(mode->info, sizeof(mode->info),
5746 				 "%s%s%s%s%s%s%s%s%s",
5747 				 (olx & MLX5_TXOFF_CONFIG_EMPW) ?
5748 				 ((olx & MLX5_TXOFF_CONFIG_MPW) ?
5749 				 "Legacy MPW" : "Enhanced MPW") : "No MPW",
5750 				 (olx & MLX5_TXOFF_CONFIG_MULTI) ?
5751 				 " + MULTI" : "",
5752 				 (olx & MLX5_TXOFF_CONFIG_TSO) ?
5753 				 " + TSO" : "",
5754 				 (olx & MLX5_TXOFF_CONFIG_SWP) ?
5755 				 " + SWP" : "",
5756 				 (olx & MLX5_TXOFF_CONFIG_CSUM) ?
5757 				 " + CSUM" : "",
5758 				 (olx & MLX5_TXOFF_CONFIG_INLINE) ?
5759 				 " + INLINE" : "",
5760 				 (olx & MLX5_TXOFF_CONFIG_VLAN) ?
5761 				 " + VLAN" : "",
5762 				 (olx & MLX5_TXOFF_CONFIG_METADATA) ?
5763 				 " + METADATA" : "",
5764 				 (olx & MLX5_TXOFF_CONFIG_TXPP) ?
5765 				 " + TXPP" : "");
5766 			return 0;
5767 		}
5768 	}
5769 	return -EINVAL;
5770 }
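
/*
 * Application-side usage sketch (illustrative only): the burst mode string
 * composed above can be retrieved through the generic ethdev API, e.g. to
 * log which Tx offload template a queue is running with (port_id and
 * queue_id are assumed to be valid):
 *
 *	struct rte_eth_burst_mode mode;
 *
 *	if (rte_eth_tx_burst_mode_get(port_id, queue_id, &mode) == 0)
 *		printf("port %u Tx queue %u burst mode: %s\n",
 *		       port_id, queue_id, mode.info);
 */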
5771