xref: /dpdk/drivers/net/mlx5/mlx5_rxtx.c (revision 54ad947eda42042d2bdae69b57d0c7c8e291d9ec)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015-2019 Mellanox Technologies, Ltd
4  */
5 
6 #include <assert.h>
7 #include <stdint.h>
8 #include <string.h>
9 #include <stdlib.h>
10 
11 /* Verbs header. */
12 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
13 #ifdef PEDANTIC
14 #pragma GCC diagnostic ignored "-Wpedantic"
15 #endif
16 #include <infiniband/verbs.h>
17 #include <infiniband/mlx5dv.h>
18 #ifdef PEDANTIC
19 #pragma GCC diagnostic error "-Wpedantic"
20 #endif
21 
22 #include <rte_mbuf.h>
23 #include <rte_mempool.h>
24 #include <rte_prefetch.h>
25 #include <rte_common.h>
26 #include <rte_branch_prediction.h>
27 #include <rte_ether.h>
28 #include <rte_cycles.h>
29 
30 #include "mlx5.h"
31 #include "mlx5_utils.h"
32 #include "mlx5_rxtx.h"
33 #include "mlx5_autoconf.h"
34 #include "mlx5_defs.h"
35 #include "mlx5_prm.h"
36 
37 /* TX burst subroutines return codes. */
38 enum mlx5_txcmp_code {
39 	MLX5_TXCMP_CODE_EXIT = 0,
40 	MLX5_TXCMP_CODE_ERROR,
41 	MLX5_TXCMP_CODE_SINGLE,
42 	MLX5_TXCMP_CODE_MULTI,
43 	MLX5_TXCMP_CODE_TSO,
44 	MLX5_TXCMP_CODE_EMPW,
45 };
46 
47 /*
48  * These defines are used to configure Tx burst routine option set
49  * supported at compile time. The not specified options are optimized out
50  * out due to if conditions can be explicitly calculated at compile time.
51  * The offloads with bigger runtime check (require more CPU cycles to
52  * skip) overhead should have the bigger index - this is needed to
53  * select the better matching routine function if no exact match and
54  * some offloads are not actually requested.
55  */
56 #define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/
57 #define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/
58 #define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/
59 #define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
60 #define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
61 #define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/
62 #define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
63 #define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/
64 
65 /* The most common offloads groups. */
66 #define MLX5_TXOFF_CONFIG_NONE 0
67 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \
68 				MLX5_TXOFF_CONFIG_TSO | \
69 				MLX5_TXOFF_CONFIG_SWP | \
70 				MLX5_TXOFF_CONFIG_CSUM | \
71 				MLX5_TXOFF_CONFIG_INLINE | \
72 				MLX5_TXOFF_CONFIG_VLAN | \
73 				MLX5_TXOFF_CONFIG_METADATA)
74 
75 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask)
76 
77 #define MLX5_TXOFF_DECL(func, olx) \
78 static uint16_t mlx5_tx_burst_##func(void *txq, \
79 				     struct rte_mbuf **pkts, \
80 				    uint16_t pkts_n) \
81 { \
82 	return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \
83 		    pkts, pkts_n, (olx)); \
84 }
85 
86 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx},
87 
88 static __rte_always_inline uint32_t
89 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
90 
91 static __rte_always_inline int
92 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
93 		 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe);
94 
95 static __rte_always_inline uint32_t
96 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
97 
98 static __rte_always_inline void
99 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
100 	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
101 
102 static __rte_always_inline void
103 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
104 		 const unsigned int strd_n);
105 
106 static int
107 mlx5_queue_state_modify(struct rte_eth_dev *dev,
108 			struct mlx5_mp_arg_queue_state_modify *sm);
109 
110 static inline void
111 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
112 			volatile struct mlx5_cqe *restrict cqe,
113 			uint32_t phcsum);
114 
115 static inline void
116 mlx5_lro_update_hdr(uint8_t *restrict padd,
117 		    volatile struct mlx5_cqe *restrict cqe,
118 		    uint32_t len);
119 
120 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
121 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
122 };
123 
124 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned;
125 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
126 
127 /**
128  * Build a table to translate Rx completion flags to packet type.
129  *
130  * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
131  */
132 void
133 mlx5_set_ptype_table(void)
134 {
135 	unsigned int i;
136 	uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;
137 
138 	/* Last entry must not be overwritten, reserved for errored packet. */
139 	for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
140 		(*p)[i] = RTE_PTYPE_UNKNOWN;
141 	/*
142 	 * The index to the array should have:
143 	 * bit[1:0] = l3_hdr_type
144 	 * bit[4:2] = l4_hdr_type
145 	 * bit[5] = ip_frag
146 	 * bit[6] = tunneled
147 	 * bit[7] = outer_l3_type
148 	 */
149 	/* L2 */
150 	(*p)[0x00] = RTE_PTYPE_L2_ETHER;
151 	/* L3 */
152 	(*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
153 		     RTE_PTYPE_L4_NONFRAG;
154 	(*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
155 		     RTE_PTYPE_L4_NONFRAG;
156 	/* Fragmented */
157 	(*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
158 		     RTE_PTYPE_L4_FRAG;
159 	(*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
160 		     RTE_PTYPE_L4_FRAG;
161 	/* TCP */
162 	(*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
163 		     RTE_PTYPE_L4_TCP;
164 	(*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
165 		     RTE_PTYPE_L4_TCP;
166 	(*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
167 		     RTE_PTYPE_L4_TCP;
168 	(*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
169 		     RTE_PTYPE_L4_TCP;
170 	(*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
171 		     RTE_PTYPE_L4_TCP;
172 	(*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
173 		     RTE_PTYPE_L4_TCP;
174 	/* UDP */
175 	(*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
176 		     RTE_PTYPE_L4_UDP;
177 	(*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
178 		     RTE_PTYPE_L4_UDP;
179 	/* Repeat with outer_l3_type being set. Just in case. */
180 	(*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
181 		     RTE_PTYPE_L4_NONFRAG;
182 	(*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
183 		     RTE_PTYPE_L4_NONFRAG;
184 	(*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
185 		     RTE_PTYPE_L4_FRAG;
186 	(*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
187 		     RTE_PTYPE_L4_FRAG;
188 	(*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
189 		     RTE_PTYPE_L4_TCP;
190 	(*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
191 		     RTE_PTYPE_L4_TCP;
192 	(*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
193 		     RTE_PTYPE_L4_TCP;
194 	(*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
195 		     RTE_PTYPE_L4_TCP;
196 	(*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
197 		     RTE_PTYPE_L4_TCP;
198 	(*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
199 		     RTE_PTYPE_L4_TCP;
200 	(*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
201 		     RTE_PTYPE_L4_UDP;
202 	(*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
203 		     RTE_PTYPE_L4_UDP;
204 	/* Tunneled - L3 */
205 	(*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
206 	(*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
207 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
208 		     RTE_PTYPE_INNER_L4_NONFRAG;
209 	(*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
210 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
211 		     RTE_PTYPE_INNER_L4_NONFRAG;
212 	(*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
213 	(*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
214 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
215 		     RTE_PTYPE_INNER_L4_NONFRAG;
216 	(*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
217 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
218 		     RTE_PTYPE_INNER_L4_NONFRAG;
219 	/* Tunneled - Fragmented */
220 	(*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
221 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
222 		     RTE_PTYPE_INNER_L4_FRAG;
223 	(*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
224 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
225 		     RTE_PTYPE_INNER_L4_FRAG;
226 	(*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
227 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
228 		     RTE_PTYPE_INNER_L4_FRAG;
229 	(*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
230 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
231 		     RTE_PTYPE_INNER_L4_FRAG;
232 	/* Tunneled - TCP */
233 	(*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
234 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
235 		     RTE_PTYPE_INNER_L4_TCP;
236 	(*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
237 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
238 		     RTE_PTYPE_INNER_L4_TCP;
239 	(*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
240 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
241 		     RTE_PTYPE_INNER_L4_TCP;
242 	(*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
243 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
244 		     RTE_PTYPE_INNER_L4_TCP;
245 	(*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
246 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
247 		     RTE_PTYPE_INNER_L4_TCP;
248 	(*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
249 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
250 		     RTE_PTYPE_INNER_L4_TCP;
251 	(*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
252 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
253 		     RTE_PTYPE_INNER_L4_TCP;
254 	(*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
255 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
256 		     RTE_PTYPE_INNER_L4_TCP;
257 	(*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
258 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
259 		     RTE_PTYPE_INNER_L4_TCP;
260 	(*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
261 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
262 		     RTE_PTYPE_INNER_L4_TCP;
263 	(*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
264 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
265 		     RTE_PTYPE_INNER_L4_TCP;
266 	(*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
267 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
268 		     RTE_PTYPE_INNER_L4_TCP;
269 	/* Tunneled - UDP */
270 	(*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
271 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
272 		     RTE_PTYPE_INNER_L4_UDP;
273 	(*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
274 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
275 		     RTE_PTYPE_INNER_L4_UDP;
276 	(*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
277 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
278 		     RTE_PTYPE_INNER_L4_UDP;
279 	(*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
280 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
281 		     RTE_PTYPE_INNER_L4_UDP;
282 }
283 
284 /**
285  * Build a table to translate packet to checksum type of Verbs.
286  */
287 void
288 mlx5_set_cksum_table(void)
289 {
290 	unsigned int i;
291 	uint8_t v;
292 
293 	/*
294 	 * The index should have:
295 	 * bit[0] = PKT_TX_TCP_SEG
296 	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
297 	 * bit[4] = PKT_TX_IP_CKSUM
298 	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
299 	 * bit[9] = tunnel
300 	 */
301 	for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) {
302 		v = 0;
303 		if (i & (1 << 9)) {
304 			/* Tunneled packet. */
305 			if (i & (1 << 8)) /* Outer IP. */
306 				v |= MLX5_ETH_WQE_L3_CSUM;
307 			if (i & (1 << 4)) /* Inner IP. */
308 				v |= MLX5_ETH_WQE_L3_INNER_CSUM;
309 			if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
310 				v |= MLX5_ETH_WQE_L4_INNER_CSUM;
311 		} else {
312 			/* No tunnel. */
313 			if (i & (1 << 4)) /* IP. */
314 				v |= MLX5_ETH_WQE_L3_CSUM;
315 			if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
316 				v |= MLX5_ETH_WQE_L4_CSUM;
317 		}
318 		mlx5_cksum_table[i] = v;
319 	}
320 }
321 
322 /**
323  * Build a table to translate packet type of mbuf to SWP type of Verbs.
324  */
325 void
326 mlx5_set_swp_types_table(void)
327 {
328 	unsigned int i;
329 	uint8_t v;
330 
331 	/*
332 	 * The index should have:
333 	 * bit[0:1] = PKT_TX_L4_MASK
334 	 * bit[4] = PKT_TX_IPV6
335 	 * bit[8] = PKT_TX_OUTER_IPV6
336 	 * bit[9] = PKT_TX_OUTER_UDP
337 	 */
338 	for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) {
339 		v = 0;
340 		if (i & (1 << 8))
341 			v |= MLX5_ETH_WQE_L3_OUTER_IPV6;
342 		if (i & (1 << 9))
343 			v |= MLX5_ETH_WQE_L4_OUTER_UDP;
344 		if (i & (1 << 4))
345 			v |= MLX5_ETH_WQE_L3_INNER_IPV6;
346 		if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52))
347 			v |= MLX5_ETH_WQE_L4_INNER_UDP;
348 		mlx5_swp_types_table[i] = v;
349 	}
350 }
351 
352 /**
353  * Set Software Parser flags and offsets in Ethernet Segment of WQE.
354  * Flags must be preliminary initialized to zero.
355  *
356  * @param loc
357  *   Pointer to burst routine local context.
358  * @param swp_flags
359  *   Pointer to store Software Parser flags
360  * @param olx
361  *   Configured Tx offloads mask. It is fully defined at
362  *   compile time and may be used for optimization.
363  *
364  * @return
365  *   Software Parser offsets packed in dword.
366  *   Software Parser flags are set by pointer.
367  */
368 static __rte_always_inline uint32_t
369 txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc,
370 		uint8_t *swp_flags,
371 		unsigned int olx)
372 {
373 	uint64_t ol, tunnel;
374 	unsigned int idx, off;
375 	uint32_t set;
376 
377 	if (!MLX5_TXOFF_CONFIG(SWP))
378 		return 0;
379 	ol = loc->mbuf->ol_flags;
380 	tunnel = ol & PKT_TX_TUNNEL_MASK;
381 	/*
382 	 * Check whether Software Parser is required.
383 	 * Only customized tunnels may ask for.
384 	 */
385 	if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP))
386 		return 0;
387 	/*
388 	 * The index should have:
389 	 * bit[0:1] = PKT_TX_L4_MASK
390 	 * bit[4] = PKT_TX_IPV6
391 	 * bit[8] = PKT_TX_OUTER_IPV6
392 	 * bit[9] = PKT_TX_OUTER_UDP
393 	 */
394 	idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52;
395 	idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0;
396 	*swp_flags = mlx5_swp_types_table[idx];
397 	/*
398 	 * Set offsets for SW parser. Since ConnectX-5, SW parser just
399 	 * complements HW parser. SW parser starts to engage only if HW parser
400 	 * can't reach a header. For the older devices, HW parser will not kick
401 	 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
402 	 * should be set regardless of HW offload.
403 	 */
404 	off = loc->mbuf->outer_l2_len;
405 	if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT)
406 		off += sizeof(struct rte_vlan_hdr);
407 	set = (off >> 1) << 8; /* Outer L3 offset. */
408 	off += loc->mbuf->outer_l3_len;
409 	if (tunnel == PKT_TX_TUNNEL_UDP)
410 		set |= off >> 1; /* Outer L4 offset. */
411 	if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */
412 		const uint64_t csum = ol & PKT_TX_L4_MASK;
413 			off += loc->mbuf->l2_len;
414 		set |= (off >> 1) << 24; /* Inner L3 offset. */
415 		if (csum == PKT_TX_TCP_CKSUM ||
416 		    csum == PKT_TX_UDP_CKSUM ||
417 		    (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) {
418 			off += loc->mbuf->l3_len;
419 			set |= (off >> 1) << 16; /* Inner L4 offset. */
420 		}
421 	}
422 	set = rte_cpu_to_le_32(set);
423 	return set;
424 }
425 
426 /**
427  * Convert the Checksum offloads to Verbs.
428  *
429  * @param buf
430  *   Pointer to the mbuf.
431  *
432  * @return
433  *   Converted checksum flags.
434  */
435 static __rte_always_inline uint8_t
436 txq_ol_cksum_to_cs(struct rte_mbuf *buf)
437 {
438 	uint32_t idx;
439 	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
440 	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
441 				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
442 
443 	/*
444 	 * The index should have:
445 	 * bit[0] = PKT_TX_TCP_SEG
446 	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
447 	 * bit[4] = PKT_TX_IP_CKSUM
448 	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
449 	 * bit[9] = tunnel
450 	 */
451 	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
452 	return mlx5_cksum_table[idx];
453 }
454 
455 /**
456  * Internal function to compute the number of used descriptors in an RX queue
457  *
458  * @param rxq
459  *   The Rx queue.
460  *
461  * @return
462  *   The number of used rx descriptor.
463  */
464 static uint32_t
465 rx_queue_count(struct mlx5_rxq_data *rxq)
466 {
467 	struct rxq_zip *zip = &rxq->zip;
468 	volatile struct mlx5_cqe *cqe;
469 	const unsigned int cqe_n = (1 << rxq->cqe_n);
470 	const unsigned int cqe_cnt = cqe_n - 1;
471 	unsigned int cq_ci;
472 	unsigned int used;
473 
474 	/* if we are processing a compressed cqe */
475 	if (zip->ai) {
476 		used = zip->cqe_cnt - zip->ca;
477 		cq_ci = zip->cq_ci;
478 	} else {
479 		used = 0;
480 		cq_ci = rxq->cq_ci;
481 	}
482 	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
483 	while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
484 		int8_t op_own;
485 		unsigned int n;
486 
487 		op_own = cqe->op_own;
488 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
489 			n = rte_be_to_cpu_32(cqe->byte_cnt);
490 		else
491 			n = 1;
492 		cq_ci += n;
493 		used += n;
494 		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
495 	}
496 	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
497 	return used;
498 }
499 
500 /**
501  * DPDK callback to check the status of a rx descriptor.
502  *
503  * @param rx_queue
504  *   The Rx queue.
505  * @param[in] offset
506  *   The index of the descriptor in the ring.
507  *
508  * @return
509  *   The status of the tx descriptor.
510  */
511 int
512 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
513 {
514 	struct mlx5_rxq_data *rxq = rx_queue;
515 	struct mlx5_rxq_ctrl *rxq_ctrl =
516 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
517 	struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);
518 
519 	if (dev->rx_pkt_burst != mlx5_rx_burst) {
520 		rte_errno = ENOTSUP;
521 		return -rte_errno;
522 	}
523 	if (offset >= (1 << rxq->elts_n)) {
524 		rte_errno = EINVAL;
525 		return -rte_errno;
526 	}
527 	if (offset < rx_queue_count(rxq))
528 		return RTE_ETH_RX_DESC_DONE;
529 	return RTE_ETH_RX_DESC_AVAIL;
530 }
531 
532 /**
533  * DPDK callback to get the number of used descriptors in a RX queue
534  *
535  * @param dev
536  *   Pointer to the device structure.
537  *
538  * @param rx_queue_id
539  *   The Rx queue.
540  *
541  * @return
542  *   The number of used rx descriptor.
543  *   -EINVAL if the queue is invalid
544  */
545 uint32_t
546 mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
547 {
548 	struct mlx5_priv *priv = dev->data->dev_private;
549 	struct mlx5_rxq_data *rxq;
550 
551 	if (dev->rx_pkt_burst != mlx5_rx_burst) {
552 		rte_errno = ENOTSUP;
553 		return -rte_errno;
554 	}
555 	rxq = (*priv->rxqs)[rx_queue_id];
556 	if (!rxq) {
557 		rte_errno = EINVAL;
558 		return -rte_errno;
559 	}
560 	return rx_queue_count(rxq);
561 }
562 
563 #define MLX5_SYSTEM_LOG_DIR "/var/log"
564 /**
565  * Dump debug information to log file.
566  *
567  * @param fname
568  *   The file name.
569  * @param hex_title
570  *   If not NULL this string is printed as a header to the output
571  *   and the output will be in hexadecimal view.
572  * @param buf
573  *   This is the buffer address to print out.
574  * @param len
575  *   The number of bytes to dump out.
576  */
577 void
578 mlx5_dump_debug_information(const char *fname, const char *hex_title,
579 			    const void *buf, unsigned int hex_len)
580 {
581 	FILE *fd;
582 
583 	MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname);
584 	fd = fopen(path, "a+");
585 	if (!fd) {
586 		DRV_LOG(WARNING, "cannot open %s for debug dump\n",
587 			path);
588 		MKSTR(path2, "./%s", fname);
589 		fd = fopen(path2, "a+");
590 		if (!fd) {
591 			DRV_LOG(ERR, "cannot open %s for debug dump\n",
592 				path2);
593 			return;
594 		}
595 		DRV_LOG(INFO, "New debug dump in file %s\n", path2);
596 	} else {
597 		DRV_LOG(INFO, "New debug dump in file %s\n", path);
598 	}
599 	if (hex_title)
600 		rte_hexdump(fd, hex_title, buf, hex_len);
601 	else
602 		fprintf(fd, "%s", (const char *)buf);
603 	fprintf(fd, "\n\n\n");
604 	fclose(fd);
605 }
606 
607 /**
608  * Move QP from error state to running state and initialize indexes.
609  *
610  * @param txq_ctrl
611  *   Pointer to TX queue control structure.
612  *
613  * @return
614  *   0 on success, else -1.
615  */
616 static int
617 tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl)
618 {
619 	struct mlx5_mp_arg_queue_state_modify sm = {
620 			.is_wq = 0,
621 			.queue_id = txq_ctrl->txq.idx,
622 	};
623 
624 	if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm))
625 		return -1;
626 	txq_ctrl->txq.wqe_ci = 0;
627 	txq_ctrl->txq.wqe_pi = 0;
628 	txq_ctrl->txq.elts_comp = 0;
629 	return 0;
630 }
631 
632 /* Return 1 if the error CQE is signed otherwise, sign it and return 0. */
633 static int
634 check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
635 {
636 	static const uint8_t magic[] = "seen";
637 	int ret = 1;
638 	unsigned int i;
639 
640 	for (i = 0; i < sizeof(magic); ++i)
641 		if (!ret || err_cqe->rsvd1[i] != magic[i]) {
642 			ret = 0;
643 			err_cqe->rsvd1[i] = magic[i];
644 		}
645 	return ret;
646 }
647 
648 /**
649  * Handle error CQE.
650  *
651  * @param txq
652  *   Pointer to TX queue structure.
653  * @param error_cqe
654  *   Pointer to the error CQE.
655  *
656  * @return
657  *   The last Tx buffer element to free.
658  */
659 uint16_t
660 mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq,
661 			 volatile struct mlx5_err_cqe *err_cqe)
662 {
663 	if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
664 		const uint16_t wqe_m = ((1 << txq->wqe_n) - 1);
665 		struct mlx5_txq_ctrl *txq_ctrl =
666 				container_of(txq, struct mlx5_txq_ctrl, txq);
667 		uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter);
668 		int seen = check_err_cqe_seen(err_cqe);
669 
670 		if (!seen && txq_ctrl->dump_file_n <
671 		    txq_ctrl->priv->config.max_dump_files_num) {
672 			MKSTR(err_str, "Unexpected CQE error syndrome "
673 			      "0x%02x CQN = %u SQN = %u wqe_counter = %u "
674 			      "wq_ci = %u cq_ci = %u", err_cqe->syndrome,
675 			      txq->cqe_s, txq->qp_num_8s >> 8,
676 			      rte_be_to_cpu_16(err_cqe->wqe_counter),
677 			      txq->wqe_ci, txq->cq_ci);
678 			MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u",
679 			      PORT_ID(txq_ctrl->priv), txq->idx,
680 			      txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc());
681 			mlx5_dump_debug_information(name, NULL, err_str, 0);
682 			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
683 						    (const void *)((uintptr_t)
684 						    txq->cqes),
685 						    sizeof(*err_cqe) *
686 						    (1 << txq->cqe_n));
687 			mlx5_dump_debug_information(name, "MLX5 Error SQ:",
688 						    (const void *)((uintptr_t)
689 						    txq->wqes),
690 						    MLX5_WQE_SIZE *
691 						    (1 << txq->wqe_n));
692 			txq_ctrl->dump_file_n++;
693 		}
694 		if (!seen)
695 			/*
696 			 * Count errors in WQEs units.
697 			 * Later it can be improved to count error packets,
698 			 * for example, by SQ parsing to find how much packets
699 			 * should be counted for each WQE.
700 			 */
701 			txq->stats.oerrors += ((txq->wqe_ci & wqe_m) -
702 						new_wqe_pi) & wqe_m;
703 		if (tx_recover_qp(txq_ctrl) == 0) {
704 			txq->cq_ci++;
705 			/* Release all the remaining buffers. */
706 			return txq->elts_head;
707 		}
708 		/* Recovering failed - try again later on the same WQE. */
709 	} else {
710 		txq->cq_ci++;
711 	}
712 	/* Do not release buffers. */
713 	return txq->elts_tail;
714 }
715 
716 /**
717  * Translate RX completion flags to packet type.
718  *
719  * @param[in] rxq
720  *   Pointer to RX queue structure.
721  * @param[in] cqe
722  *   Pointer to CQE.
723  *
724  * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
725  *
726  * @return
727  *   Packet type for struct rte_mbuf.
728  */
729 static inline uint32_t
730 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
731 {
732 	uint8_t idx;
733 	uint8_t pinfo = cqe->pkt_info;
734 	uint16_t ptype = cqe->hdr_type_etc;
735 
736 	/*
737 	 * The index to the array should have:
738 	 * bit[1:0] = l3_hdr_type
739 	 * bit[4:2] = l4_hdr_type
740 	 * bit[5] = ip_frag
741 	 * bit[6] = tunneled
742 	 * bit[7] = outer_l3_type
743 	 */
744 	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
745 	return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
746 }
747 
748 /**
749  * Initialize Rx WQ and indexes.
750  *
751  * @param[in] rxq
752  *   Pointer to RX queue structure.
753  */
754 void
755 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
756 {
757 	const unsigned int wqe_n = 1 << rxq->elts_n;
758 	unsigned int i;
759 
760 	for (i = 0; (i != wqe_n); ++i) {
761 		volatile struct mlx5_wqe_data_seg *scat;
762 		uintptr_t addr;
763 		uint32_t byte_count;
764 
765 		if (mlx5_rxq_mprq_enabled(rxq)) {
766 			struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i];
767 
768 			scat = &((volatile struct mlx5_wqe_mprq *)
769 				rxq->wqes)[i].dseg;
770 			addr = (uintptr_t)mlx5_mprq_buf_addr(buf,
771 							 1 << rxq->strd_num_n);
772 			byte_count = (1 << rxq->strd_sz_n) *
773 					(1 << rxq->strd_num_n);
774 		} else {
775 			struct rte_mbuf *buf = (*rxq->elts)[i];
776 
777 			scat = &((volatile struct mlx5_wqe_data_seg *)
778 					rxq->wqes)[i];
779 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
780 			byte_count = DATA_LEN(buf);
781 		}
782 		/* scat->addr must be able to store a pointer. */
783 		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
784 		*scat = (struct mlx5_wqe_data_seg){
785 			.addr = rte_cpu_to_be_64(addr),
786 			.byte_count = rte_cpu_to_be_32(byte_count),
787 			.lkey = mlx5_rx_addr2mr(rxq, addr),
788 		};
789 	}
790 	rxq->consumed_strd = 0;
791 	rxq->decompressed = 0;
792 	rxq->rq_pi = 0;
793 	rxq->zip = (struct rxq_zip){
794 		.ai = 0,
795 	};
796 	/* Update doorbell counter. */
797 	rxq->rq_ci = wqe_n >> rxq->sges_n;
798 	rte_cio_wmb();
799 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
800 }
801 
802 /**
803  * Modify a Verbs/DevX queue state.
804  * This must be called from the primary process.
805  *
806  * @param dev
807  *   Pointer to Ethernet device.
808  * @param sm
809  *   State modify request parameters.
810  *
811  * @return
812  *   0 in case of success else non-zero value and rte_errno is set.
813  */
814 int
815 mlx5_queue_state_modify_primary(struct rte_eth_dev *dev,
816 			const struct mlx5_mp_arg_queue_state_modify *sm)
817 {
818 	int ret;
819 	struct mlx5_priv *priv = dev->data->dev_private;
820 
821 	if (sm->is_wq) {
822 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id];
823 		struct mlx5_rxq_ctrl *rxq_ctrl =
824 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
825 
826 		if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) {
827 			struct ibv_wq_attr mod = {
828 				.attr_mask = IBV_WQ_ATTR_STATE,
829 				.wq_state = sm->state,
830 			};
831 
832 			ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod);
833 		} else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. */
834 			struct mlx5_devx_modify_rq_attr rq_attr;
835 
836 			memset(&rq_attr, 0, sizeof(rq_attr));
837 			if (sm->state == IBV_WQS_RESET) {
838 				rq_attr.rq_state = MLX5_RQC_STATE_ERR;
839 				rq_attr.state = MLX5_RQC_STATE_RST;
840 			} else if (sm->state == IBV_WQS_RDY) {
841 				rq_attr.rq_state = MLX5_RQC_STATE_RST;
842 				rq_attr.state = MLX5_RQC_STATE_RDY;
843 			} else if (sm->state == IBV_WQS_ERR) {
844 				rq_attr.rq_state = MLX5_RQC_STATE_RDY;
845 				rq_attr.state = MLX5_RQC_STATE_ERR;
846 			}
847 			ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq,
848 						      &rq_attr);
849 		}
850 		if (ret) {
851 			DRV_LOG(ERR, "Cannot change Rx WQ state to %u  - %s\n",
852 					sm->state, strerror(errno));
853 			rte_errno = errno;
854 			return ret;
855 		}
856 	} else {
857 		struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id];
858 		struct mlx5_txq_ctrl *txq_ctrl =
859 			container_of(txq, struct mlx5_txq_ctrl, txq);
860 		struct ibv_qp_attr mod = {
861 			.qp_state = IBV_QPS_RESET,
862 			.port_num = (uint8_t)priv->ibv_port,
863 		};
864 		struct ibv_qp *qp = txq_ctrl->ibv->qp;
865 
866 		ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
867 		if (ret) {
868 			DRV_LOG(ERR, "Cannot change the Tx QP state to RESET "
869 				"%s\n", strerror(errno));
870 			rte_errno = errno;
871 			return ret;
872 		}
873 		mod.qp_state = IBV_QPS_INIT;
874 		ret = mlx5_glue->modify_qp(qp, &mod,
875 					   (IBV_QP_STATE | IBV_QP_PORT));
876 		if (ret) {
877 			DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s\n",
878 				strerror(errno));
879 			rte_errno = errno;
880 			return ret;
881 		}
882 		mod.qp_state = IBV_QPS_RTR;
883 		ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
884 		if (ret) {
885 			DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s\n",
886 				strerror(errno));
887 			rte_errno = errno;
888 			return ret;
889 		}
890 		mod.qp_state = IBV_QPS_RTS;
891 		ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
892 		if (ret) {
893 			DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s\n",
894 				strerror(errno));
895 			rte_errno = errno;
896 			return ret;
897 		}
898 	}
899 	return 0;
900 }
901 
902 /**
903  * Modify a Verbs queue state.
904  *
905  * @param dev
906  *   Pointer to Ethernet device.
907  * @param sm
908  *   State modify request parameters.
909  *
910  * @return
911  *   0 in case of success else non-zero value.
912  */
913 static int
914 mlx5_queue_state_modify(struct rte_eth_dev *dev,
915 			struct mlx5_mp_arg_queue_state_modify *sm)
916 {
917 	int ret = 0;
918 
919 	switch (rte_eal_process_type()) {
920 	case RTE_PROC_PRIMARY:
921 		ret = mlx5_queue_state_modify_primary(dev, sm);
922 		break;
923 	case RTE_PROC_SECONDARY:
924 		ret = mlx5_mp_req_queue_state_modify(dev, sm);
925 		break;
926 	default:
927 		break;
928 	}
929 	return ret;
930 }
931 
932 /**
933  * Handle a Rx error.
934  * The function inserts the RQ state to reset when the first error CQE is
935  * shown, then drains the CQ by the caller function loop. When the CQ is empty,
936  * it moves the RQ state to ready and initializes the RQ.
937  * Next CQE identification and error counting are in the caller responsibility.
938  *
939  * @param[in] rxq
940  *   Pointer to RX queue structure.
941  * @param[in] mbuf_prepare
942  *   Whether to prepare mbufs for the RQ.
943  *
944  * @return
945  *   -1 in case of recovery error, otherwise the CQE status.
946  */
947 int
948 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t mbuf_prepare)
949 {
950 	const uint16_t cqe_n = 1 << rxq->cqe_n;
951 	const uint16_t cqe_mask = cqe_n - 1;
952 	const unsigned int wqe_n = 1 << rxq->elts_n;
953 	struct mlx5_rxq_ctrl *rxq_ctrl =
954 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
955 	union {
956 		volatile struct mlx5_cqe *cqe;
957 		volatile struct mlx5_err_cqe *err_cqe;
958 	} u = {
959 		.cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask],
960 	};
961 	struct mlx5_mp_arg_queue_state_modify sm;
962 	int ret;
963 
964 	switch (rxq->err_state) {
965 	case MLX5_RXQ_ERR_STATE_NO_ERROR:
966 		rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET;
967 		/* Fall-through */
968 	case MLX5_RXQ_ERR_STATE_NEED_RESET:
969 		sm.is_wq = 1;
970 		sm.queue_id = rxq->idx;
971 		sm.state = IBV_WQS_RESET;
972 		if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm))
973 			return -1;
974 		if (rxq_ctrl->dump_file_n <
975 		    rxq_ctrl->priv->config.max_dump_files_num) {
976 			MKSTR(err_str, "Unexpected CQE error syndrome "
977 			      "0x%02x CQN = %u RQN = %u wqe_counter = %u"
978 			      " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome,
979 			      rxq->cqn, rxq_ctrl->wqn,
980 			      rte_be_to_cpu_16(u.err_cqe->wqe_counter),
981 			      rxq->rq_ci << rxq->sges_n, rxq->cq_ci);
982 			MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u",
983 			      rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc());
984 			mlx5_dump_debug_information(name, NULL, err_str, 0);
985 			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
986 						    (const void *)((uintptr_t)
987 								    rxq->cqes),
988 						    sizeof(*u.cqe) * cqe_n);
989 			mlx5_dump_debug_information(name, "MLX5 Error RQ:",
990 						    (const void *)((uintptr_t)
991 								    rxq->wqes),
992 						    16 * wqe_n);
993 			rxq_ctrl->dump_file_n++;
994 		}
995 		rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY;
996 		/* Fall-through */
997 	case MLX5_RXQ_ERR_STATE_NEED_READY:
998 		ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
999 		if (ret == MLX5_CQE_STATUS_HW_OWN) {
1000 			rte_cio_wmb();
1001 			*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1002 			rte_cio_wmb();
1003 			/*
1004 			 * The RQ consumer index must be zeroed while moving
1005 			 * from RESET state to RDY state.
1006 			 */
1007 			*rxq->rq_db = rte_cpu_to_be_32(0);
1008 			rte_cio_wmb();
1009 			sm.is_wq = 1;
1010 			sm.queue_id = rxq->idx;
1011 			sm.state = IBV_WQS_RDY;
1012 			if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv),
1013 						    &sm))
1014 				return -1;
1015 			if (mbuf_prepare) {
1016 				const uint16_t q_mask = wqe_n - 1;
1017 				uint16_t elt_idx;
1018 				struct rte_mbuf **elt;
1019 				int i;
1020 				unsigned int n = wqe_n - (rxq->rq_ci -
1021 							  rxq->rq_pi);
1022 
1023 				for (i = 0; i < (int)n; ++i) {
1024 					elt_idx = (rxq->rq_ci + i) & q_mask;
1025 					elt = &(*rxq->elts)[elt_idx];
1026 					*elt = rte_mbuf_raw_alloc(rxq->mp);
1027 					if (!*elt) {
1028 						for (i--; i >= 0; --i) {
1029 							elt_idx = (rxq->rq_ci +
1030 								   i) & q_mask;
1031 							elt = &(*rxq->elts)
1032 								[elt_idx];
1033 							rte_pktmbuf_free_seg
1034 								(*elt);
1035 						}
1036 						return -1;
1037 					}
1038 				}
1039 			}
1040 			mlx5_rxq_initialize(rxq);
1041 			rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
1042 		}
1043 		return ret;
1044 	default:
1045 		return -1;
1046 	}
1047 }
1048 
1049 /**
1050  * Get size of the next packet for a given CQE. For compressed CQEs, the
1051  * consumer index is updated only once all packets of the current one have
1052  * been processed.
1053  *
1054  * @param rxq
1055  *   Pointer to RX queue.
1056  * @param cqe
1057  *   CQE to process.
1058  * @param[out] mcqe
1059  *   Store pointer to mini-CQE if compressed. Otherwise, the pointer is not
1060  *   written.
1061  *
1062  * @return
1063  *   0 in case of empty CQE, otherwise the packet size in bytes.
1064  */
1065 static inline int
1066 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
1067 		 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe)
1068 {
1069 	struct rxq_zip *zip = &rxq->zip;
1070 	uint16_t cqe_n = cqe_cnt + 1;
1071 	int len;
1072 	uint16_t idx, end;
1073 
1074 	do {
1075 		len = 0;
1076 		/* Process compressed data in the CQE and mini arrays. */
1077 		if (zip->ai) {
1078 			volatile struct mlx5_mini_cqe8 (*mc)[8] =
1079 				(volatile struct mlx5_mini_cqe8 (*)[8])
1080 				(uintptr_t)(&(*rxq->cqes)[zip->ca &
1081 							  cqe_cnt].pkt_info);
1082 
1083 			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
1084 			*mcqe = &(*mc)[zip->ai & 7];
1085 			if ((++zip->ai & 7) == 0) {
1086 				/* Invalidate consumed CQEs */
1087 				idx = zip->ca;
1088 				end = zip->na;
1089 				while (idx != end) {
1090 					(*rxq->cqes)[idx & cqe_cnt].op_own =
1091 						MLX5_CQE_INVALIDATE;
1092 					++idx;
1093 				}
1094 				/*
1095 				 * Increment consumer index to skip the number
1096 				 * of CQEs consumed. Hardware leaves holes in
1097 				 * the CQ ring for software use.
1098 				 */
1099 				zip->ca = zip->na;
1100 				zip->na += 8;
1101 			}
1102 			if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
1103 				/* Invalidate the rest */
1104 				idx = zip->ca;
1105 				end = zip->cq_ci;
1106 
1107 				while (idx != end) {
1108 					(*rxq->cqes)[idx & cqe_cnt].op_own =
1109 						MLX5_CQE_INVALIDATE;
1110 					++idx;
1111 				}
1112 				rxq->cq_ci = zip->cq_ci;
1113 				zip->ai = 0;
1114 			}
1115 		/*
1116 		 * No compressed data, get next CQE and verify if it is
1117 		 * compressed.
1118 		 */
1119 		} else {
1120 			int ret;
1121 			int8_t op_own;
1122 
1123 			ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
1124 			if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
1125 				if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
1126 					     rxq->err_state)) {
1127 					ret = mlx5_rx_err_handle(rxq, 0);
1128 					if (ret == MLX5_CQE_STATUS_HW_OWN ||
1129 					    ret == -1)
1130 						return 0;
1131 				} else {
1132 					return 0;
1133 				}
1134 			}
1135 			++rxq->cq_ci;
1136 			op_own = cqe->op_own;
1137 			if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
1138 				volatile struct mlx5_mini_cqe8 (*mc)[8] =
1139 					(volatile struct mlx5_mini_cqe8 (*)[8])
1140 					(uintptr_t)(&(*rxq->cqes)
1141 						[rxq->cq_ci &
1142 						 cqe_cnt].pkt_info);
1143 
1144 				/* Fix endianness. */
1145 				zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
1146 				/*
1147 				 * Current mini array position is the one
1148 				 * returned by check_cqe64().
1149 				 *
1150 				 * If completion comprises several mini arrays,
1151 				 * as a special case the second one is located
1152 				 * 7 CQEs after the initial CQE instead of 8
1153 				 * for subsequent ones.
1154 				 */
1155 				zip->ca = rxq->cq_ci;
1156 				zip->na = zip->ca + 7;
1157 				/* Compute the next non compressed CQE. */
1158 				--rxq->cq_ci;
1159 				zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
1160 				/* Get packet size to return. */
1161 				len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
1162 				*mcqe = &(*mc)[0];
1163 				zip->ai = 1;
1164 				/* Prefetch all to be invalidated */
1165 				idx = zip->ca;
1166 				end = zip->cq_ci;
1167 				while (idx != end) {
1168 					rte_prefetch0(&(*rxq->cqes)[(idx) &
1169 								    cqe_cnt]);
1170 					++idx;
1171 				}
1172 			} else {
1173 				len = rte_be_to_cpu_32(cqe->byte_cnt);
1174 			}
1175 		}
1176 		if (unlikely(rxq->err_state)) {
1177 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1178 			++rxq->stats.idropped;
1179 		} else {
1180 			return len;
1181 		}
1182 	} while (1);
1183 }
1184 
1185 /**
1186  * Translate RX completion flags to offload flags.
1187  *
1188  * @param[in] cqe
1189  *   Pointer to CQE.
1190  *
1191  * @return
1192  *   Offload flags (ol_flags) for struct rte_mbuf.
1193  */
1194 static inline uint32_t
1195 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
1196 {
1197 	uint32_t ol_flags = 0;
1198 	uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
1199 
1200 	ol_flags =
1201 		TRANSPOSE(flags,
1202 			  MLX5_CQE_RX_L3_HDR_VALID,
1203 			  PKT_RX_IP_CKSUM_GOOD) |
1204 		TRANSPOSE(flags,
1205 			  MLX5_CQE_RX_L4_HDR_VALID,
1206 			  PKT_RX_L4_CKSUM_GOOD);
1207 	return ol_flags;
1208 }
1209 
1210 /**
1211  * Fill in mbuf fields from RX completion flags.
1212  * Note that pkt->ol_flags should be initialized outside of this function.
1213  *
1214  * @param rxq
1215  *   Pointer to RX queue.
1216  * @param pkt
1217  *   mbuf to fill.
1218  * @param cqe
1219  *   CQE to process.
1220  * @param rss_hash_res
1221  *   Packet RSS Hash result.
1222  */
1223 static inline void
1224 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
1225 	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res)
1226 {
1227 	/* Update packet information. */
1228 	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe);
1229 	if (rss_hash_res && rxq->rss_hash) {
1230 		pkt->hash.rss = rss_hash_res;
1231 		pkt->ol_flags |= PKT_RX_RSS_HASH;
1232 	}
1233 	if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
1234 		pkt->ol_flags |= PKT_RX_FDIR;
1235 		if (cqe->sop_drop_qpn !=
1236 		    rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
1237 			uint32_t mark = cqe->sop_drop_qpn;
1238 
1239 			pkt->ol_flags |= PKT_RX_FDIR_ID;
1240 			pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
1241 		}
1242 	}
1243 	if (rxq->csum)
1244 		pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
1245 	if (rxq->vlan_strip &&
1246 	    (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
1247 		pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
1248 		pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
1249 	}
1250 	if (rxq->hw_timestamp) {
1251 		pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp);
1252 		pkt->ol_flags |= PKT_RX_TIMESTAMP;
1253 	}
1254 }
1255 
1256 /**
1257  * DPDK callback for RX.
1258  *
1259  * @param dpdk_rxq
1260  *   Generic pointer to RX queue structure.
1261  * @param[out] pkts
1262  *   Array to store received packets.
1263  * @param pkts_n
1264  *   Maximum number of packets in array.
1265  *
1266  * @return
1267  *   Number of packets successfully received (<= pkts_n).
1268  */
1269 uint16_t
1270 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1271 {
1272 	struct mlx5_rxq_data *rxq = dpdk_rxq;
1273 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
1274 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
1275 	const unsigned int sges_n = rxq->sges_n;
1276 	struct rte_mbuf *pkt = NULL;
1277 	struct rte_mbuf *seg = NULL;
1278 	volatile struct mlx5_cqe *cqe =
1279 		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1280 	unsigned int i = 0;
1281 	unsigned int rq_ci = rxq->rq_ci << sges_n;
1282 	int len = 0; /* keep its value across iterations. */
1283 
1284 	while (pkts_n) {
1285 		unsigned int idx = rq_ci & wqe_cnt;
1286 		volatile struct mlx5_wqe_data_seg *wqe =
1287 			&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
1288 		struct rte_mbuf *rep = (*rxq->elts)[idx];
1289 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
1290 		uint32_t rss_hash_res;
1291 
1292 		if (pkt)
1293 			NEXT(seg) = rep;
1294 		seg = rep;
1295 		rte_prefetch0(seg);
1296 		rte_prefetch0(cqe);
1297 		rte_prefetch0(wqe);
1298 		rep = rte_mbuf_raw_alloc(rxq->mp);
1299 		if (unlikely(rep == NULL)) {
1300 			++rxq->stats.rx_nombuf;
1301 			if (!pkt) {
1302 				/*
1303 				 * no buffers before we even started,
1304 				 * bail out silently.
1305 				 */
1306 				break;
1307 			}
1308 			while (pkt != seg) {
1309 				assert(pkt != (*rxq->elts)[idx]);
1310 				rep = NEXT(pkt);
1311 				NEXT(pkt) = NULL;
1312 				NB_SEGS(pkt) = 1;
1313 				rte_mbuf_raw_free(pkt);
1314 				pkt = rep;
1315 			}
1316 			break;
1317 		}
1318 		if (!pkt) {
1319 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1320 			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe);
1321 			if (!len) {
1322 				rte_mbuf_raw_free(rep);
1323 				break;
1324 			}
1325 			pkt = seg;
1326 			assert(len >= (rxq->crc_present << 2));
1327 			pkt->ol_flags = 0;
1328 			/* If compressed, take hash result from mini-CQE. */
1329 			rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
1330 							cqe->rx_hash_res :
1331 							mcqe->rx_hash_result);
1332 			rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
1333 			if (rxq->crc_present)
1334 				len -= RTE_ETHER_CRC_LEN;
1335 			PKT_LEN(pkt) = len;
1336 			if (cqe->lro_num_seg > 1) {
1337 				mlx5_lro_update_hdr
1338 					(rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
1339 					 len);
1340 				pkt->ol_flags |= PKT_RX_LRO;
1341 				pkt->tso_segsz = len / cqe->lro_num_seg;
1342 			}
1343 		}
1344 		DATA_LEN(rep) = DATA_LEN(seg);
1345 		PKT_LEN(rep) = PKT_LEN(seg);
1346 		SET_DATA_OFF(rep, DATA_OFF(seg));
1347 		PORT(rep) = PORT(seg);
1348 		(*rxq->elts)[idx] = rep;
1349 		/*
1350 		 * Fill NIC descriptor with the new buffer.  The lkey and size
1351 		 * of the buffers are already known, only the buffer address
1352 		 * changes.
1353 		 */
1354 		wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
1355 		/* If there's only one MR, no need to replace LKey in WQE. */
1356 		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
1357 			wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
1358 		if (len > DATA_LEN(seg)) {
1359 			len -= DATA_LEN(seg);
1360 			++NB_SEGS(pkt);
1361 			++rq_ci;
1362 			continue;
1363 		}
1364 		DATA_LEN(seg) = len;
1365 #ifdef MLX5_PMD_SOFT_COUNTERS
1366 		/* Increment bytes counter. */
1367 		rxq->stats.ibytes += PKT_LEN(pkt);
1368 #endif
1369 		/* Return packet. */
1370 		*(pkts++) = pkt;
1371 		pkt = NULL;
1372 		--pkts_n;
1373 		++i;
1374 		/* Align consumer index to the next stride. */
1375 		rq_ci >>= sges_n;
1376 		++rq_ci;
1377 		rq_ci <<= sges_n;
1378 	}
1379 	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
1380 		return 0;
1381 	/* Update the consumer index. */
1382 	rxq->rq_ci = rq_ci >> sges_n;
1383 	rte_cio_wmb();
1384 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1385 	rte_cio_wmb();
1386 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1387 #ifdef MLX5_PMD_SOFT_COUNTERS
1388 	/* Increment packets counter. */
1389 	rxq->stats.ipackets += i;
1390 #endif
1391 	return i;
1392 }
1393 
1394 /**
1395  * Update LRO packet TCP header.
1396  * The HW LRO feature doesn't update the TCP header after coalescing the
1397  * TCP segments but supplies information in CQE to fill it by SW.
1398  *
1399  * @param tcp
1400  *   Pointer to the TCP header.
1401  * @param cqe
1402  *   Pointer to the completion entry..
1403  * @param phcsum
1404  *   The L3 pseudo-header checksum.
1405  */
1406 static inline void
1407 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
1408 			volatile struct mlx5_cqe *restrict cqe,
1409 			uint32_t phcsum)
1410 {
1411 	uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
1412 			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
1413 	/*
1414 	 * The HW calculates only the TCP payload checksum, need to complete
1415 	 * the TCP header checksum and the L3 pseudo-header checksum.
1416 	 */
1417 	uint32_t csum = phcsum + cqe->csum;
1418 
1419 	if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK ||
1420 	    l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) {
1421 		tcp->tcp_flags |= RTE_TCP_ACK_FLAG;
1422 		tcp->recv_ack = cqe->lro_ack_seq_num;
1423 		tcp->rx_win = cqe->lro_tcp_win;
1424 	}
1425 	if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
1426 		tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
1427 	tcp->cksum = 0;
1428 	csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4);
1429 	csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
1430 	csum = (~csum) & 0xffff;
1431 	if (csum == 0)
1432 		csum = 0xffff;
1433 	tcp->cksum = csum;
1434 }
1435 
1436 /**
1437  * Update LRO packet headers.
1438  * The HW LRO feature doesn't update the L3/TCP headers after coalescing the
1439  * TCP segments but supply information in CQE to fill it by SW.
1440  *
1441  * @param padd
1442  *   The packet address.
1443  * @param cqe
1444  *   Pointer to the completion entry..
1445  * @param len
1446  *   The packet length.
1447  */
1448 static inline void
1449 mlx5_lro_update_hdr(uint8_t *restrict padd,
1450 		    volatile struct mlx5_cqe *restrict cqe,
1451 		    uint32_t len)
1452 {
1453 	union {
1454 		struct rte_ether_hdr *eth;
1455 		struct rte_vlan_hdr *vlan;
1456 		struct rte_ipv4_hdr *ipv4;
1457 		struct rte_ipv6_hdr *ipv6;
1458 		struct rte_tcp_hdr *tcp;
1459 		uint8_t *hdr;
1460 	} h = {
1461 			.hdr = padd,
1462 	};
1463 	uint16_t proto = h.eth->ether_type;
1464 	uint32_t phcsum;
1465 
1466 	h.eth++;
1467 	while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
1468 	       proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) {
1469 		proto = h.vlan->eth_proto;
1470 		h.vlan++;
1471 	}
1472 	if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) {
1473 		h.ipv4->time_to_live = cqe->lro_min_ttl;
1474 		h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd));
1475 		h.ipv4->hdr_checksum = 0;
1476 		h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4);
1477 		phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0);
1478 		h.ipv4++;
1479 	} else {
1480 		h.ipv6->hop_limits = cqe->lro_min_ttl;
1481 		h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) -
1482 						       sizeof(*h.ipv6));
1483 		phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
1484 		h.ipv6++;
1485 	}
1486 	mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum);
1487 }
1488 
1489 void
1490 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
1491 {
1492 	struct mlx5_mprq_buf *buf = opaque;
1493 
1494 	if (rte_atomic16_read(&buf->refcnt) == 1) {
1495 		rte_mempool_put(buf->mp, buf);
1496 	} else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
1497 		rte_atomic16_set(&buf->refcnt, 1);
1498 		rte_mempool_put(buf->mp, buf);
1499 	}
1500 }
1501 
1502 void
1503 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
1504 {
1505 	mlx5_mprq_buf_free_cb(NULL, buf);
1506 }
1507 
1508 static inline void
1509 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
1510 		 const unsigned int strd_n)
1511 {
1512 	struct mlx5_mprq_buf *rep = rxq->mprq_repl;
1513 	volatile struct mlx5_wqe_data_seg *wqe =
1514 		&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
1515 	void *addr;
1516 
1517 	assert(rep != NULL);
1518 	/* Replace MPRQ buf. */
1519 	(*rxq->mprq_bufs)[rq_idx] = rep;
1520 	/* Replace WQE. */
1521 	addr = mlx5_mprq_buf_addr(rep, strd_n);
1522 	wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
1523 	/* If there's only one MR, no need to replace LKey in WQE. */
1524 	if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
1525 		wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
1526 	/* Stash a mbuf for next replacement. */
1527 	if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
1528 		rxq->mprq_repl = rep;
1529 	else
1530 		rxq->mprq_repl = NULL;
1531 }
1532 
1533 /**
1534  * DPDK callback for RX with Multi-Packet RQ support.
1535  *
1536  * @param dpdk_rxq
1537  *   Generic pointer to RX queue structure.
1538  * @param[out] pkts
1539  *   Array to store received packets.
1540  * @param pkts_n
1541  *   Maximum number of packets in array.
1542  *
1543  * @return
1544  *   Number of packets successfully received (<= pkts_n).
1545  */
1546 uint16_t
1547 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1548 {
1549 	struct mlx5_rxq_data *rxq = dpdk_rxq;
1550 	const unsigned int strd_n = 1 << rxq->strd_num_n;
1551 	const unsigned int strd_sz = 1 << rxq->strd_sz_n;
1552 	const unsigned int strd_shift =
1553 		MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
1554 	const unsigned int cq_mask = (1 << rxq->cqe_n) - 1;
1555 	const unsigned int wq_mask = (1 << rxq->elts_n) - 1;
1556 	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
1557 	unsigned int i = 0;
1558 	uint32_t rq_ci = rxq->rq_ci;
1559 	uint16_t consumed_strd = rxq->consumed_strd;
1560 	uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM;
1561 	struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
1562 
1563 	while (i < pkts_n) {
1564 		struct rte_mbuf *pkt;
1565 		void *addr;
1566 		int ret;
1567 		unsigned int len;
1568 		uint16_t strd_cnt;
1569 		uint16_t strd_idx;
1570 		uint32_t offset;
1571 		uint32_t byte_cnt;
1572 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
1573 		uint32_t rss_hash_res = 0;
1574 		uint8_t lro_num_seg;
1575 
1576 		if (consumed_strd == strd_n) {
1577 			/* Replace WQE only if the buffer is still in use. */
1578 			if (rte_atomic16_read(&buf->refcnt) > 1) {
1579 				mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n);
1580 				/* Release the old buffer. */
1581 				mlx5_mprq_buf_free(buf);
1582 			} else if (unlikely(rxq->mprq_repl == NULL)) {
1583 				struct mlx5_mprq_buf *rep;
1584 
1585 				/*
1586 				 * Currently, the MPRQ mempool is out of buffer
1587 				 * and doing memcpy regardless of the size of Rx
1588 				 * packet. Retry allocation to get back to
1589 				 * normal.
1590 				 */
1591 				if (!rte_mempool_get(rxq->mprq_mp,
1592 						     (void **)&rep))
1593 					rxq->mprq_repl = rep;
1594 			}
1595 			/* Advance to the next WQE. */
1596 			consumed_strd = 0;
1597 			++rq_ci;
1598 			buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
1599 		}
1600 		cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
1601 		ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe);
1602 		if (!ret)
1603 			break;
1604 		byte_cnt = ret;
1605 		strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
1606 			   MLX5_MPRQ_STRIDE_NUM_SHIFT;
1607 		assert(strd_cnt);
1608 		consumed_strd += strd_cnt;
1609 		if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
1610 			continue;
1611 		if (mcqe == NULL) {
1612 			rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
1613 			strd_idx = rte_be_to_cpu_16(cqe->wqe_counter);
1614 		} else {
1615 			/* mini-CQE for MPRQ doesn't have hash result. */
1616 			strd_idx = rte_be_to_cpu_16(mcqe->stride_idx);
1617 		}
1618 		assert(strd_idx < strd_n);
1619 		assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask));
1620 		lro_num_seg = cqe->lro_num_seg;
1621 		/*
1622 		 * Currently configured to receive a packet per a stride. But if
1623 		 * MTU is adjusted through kernel interface, device could
1624 		 * consume multiple strides without raising an error. In this
1625 		 * case, the packet should be dropped because it is bigger than
1626 		 * the max_rx_pkt_len.
1627 		 */
1628 		if (unlikely(!lro_num_seg && strd_cnt > 1)) {
1629 			++rxq->stats.idropped;
1630 			continue;
1631 		}
1632 		pkt = rte_pktmbuf_alloc(rxq->mp);
1633 		if (unlikely(pkt == NULL)) {
1634 			++rxq->stats.rx_nombuf;
1635 			break;
1636 		}
1637 		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
1638 		assert((int)len >= (rxq->crc_present << 2));
1639 		if (rxq->crc_present)
1640 			len -= RTE_ETHER_CRC_LEN;
1641 		offset = strd_idx * strd_sz + strd_shift;
1642 		addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
1643 		/*
1644 		 * Memcpy packets to the target mbuf if:
1645 		 * - The size of packet is smaller than mprq_max_memcpy_len.
1646 		 * - Out of buffer in the Mempool for Multi-Packet RQ.
1647 		 */
1648 		if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) {
1649 			/*
1650 			 * When memcpy'ing packet due to out-of-buffer, the
1651 			 * packet must be smaller than the target mbuf.
1652 			 */
1653 			if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
1654 				rte_pktmbuf_free_seg(pkt);
1655 				++rxq->stats.idropped;
1656 				continue;
1657 			}
1658 			rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len);
1659 			DATA_LEN(pkt) = len;
1660 		} else {
1661 			rte_iova_t buf_iova;
1662 			struct rte_mbuf_ext_shared_info *shinfo;
1663 			uint16_t buf_len = strd_cnt * strd_sz;
1664 			void *buf_addr;
1665 
1666 			/* Increment the refcnt of the whole chunk. */
1667 			rte_atomic16_add_return(&buf->refcnt, 1);
1668 			assert((uint16_t)rte_atomic16_read(&buf->refcnt) <=
1669 			       strd_n + 1);
1670 			buf_addr = RTE_PTR_SUB(addr, headroom_sz);
1671 			/*
1672 			 * MLX5 device doesn't use iova but it is necessary in a
1673 			 * case where the Rx packet is transmitted via a
1674 			 * different PMD.
1675 			 */
1676 			buf_iova = rte_mempool_virt2iova(buf) +
1677 				   RTE_PTR_DIFF(buf_addr, buf);
1678 			shinfo = &buf->shinfos[strd_idx];
1679 			rte_mbuf_ext_refcnt_set(shinfo, 1);
1680 			/*
1681 			 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
1682 			 * attaching the stride to mbuf and more offload flags
1683 			 * will be added below by calling rxq_cq_to_mbuf().
1684 			 * Other fields will be overwritten.
1685 			 */
1686 			rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova,
1687 						  buf_len, shinfo);
1688 			/* Set mbuf head-room. */
1689 			pkt->data_off = headroom_sz;
1690 			assert(pkt->ol_flags == EXT_ATTACHED_MBUF);
1691 			/*
1692 			 * Prevent potential overflow due to MTU change through
1693 			 * kernel interface.
1694 			 */
1695 			if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
1696 				rte_pktmbuf_free_seg(pkt);
1697 				++rxq->stats.idropped;
1698 				continue;
1699 			}
1700 			DATA_LEN(pkt) = len;
1701 			/*
1702 			 * LRO packet may consume all the stride memory, in this
1703 			 * case packet head-room space is not guaranteed so must
1704 			 * to add an empty mbuf for the head-room.
1705 			 */
1706 			if (!rxq->strd_headroom_en) {
1707 				struct rte_mbuf *headroom_mbuf =
1708 						rte_pktmbuf_alloc(rxq->mp);
1709 
1710 				if (unlikely(headroom_mbuf == NULL)) {
1711 					rte_pktmbuf_free_seg(pkt);
1712 					++rxq->stats.rx_nombuf;
1713 					break;
1714 				}
1715 				PORT(pkt) = rxq->port_id;
1716 				NEXT(headroom_mbuf) = pkt;
1717 				pkt = headroom_mbuf;
1718 				NB_SEGS(pkt) = 2;
1719 			}
1720 		}
1721 		rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
1722 		if (lro_num_seg > 1) {
1723 			mlx5_lro_update_hdr(addr, cqe, len);
1724 			pkt->ol_flags |= PKT_RX_LRO;
1725 			pkt->tso_segsz = strd_sz;
1726 		}
1727 		PKT_LEN(pkt) = len;
1728 		PORT(pkt) = rxq->port_id;
1729 #ifdef MLX5_PMD_SOFT_COUNTERS
1730 		/* Increment bytes counter. */
1731 		rxq->stats.ibytes += PKT_LEN(pkt);
1732 #endif
1733 		/* Return packet. */
1734 		*(pkts++) = pkt;
1735 		++i;
1736 	}
1737 	/* Update the consumer indexes. */
1738 	rxq->consumed_strd = consumed_strd;
1739 	rte_cio_wmb();
1740 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1741 	if (rq_ci != rxq->rq_ci) {
1742 		rxq->rq_ci = rq_ci;
1743 		rte_cio_wmb();
1744 		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1745 	}
1746 #ifdef MLX5_PMD_SOFT_COUNTERS
1747 	/* Increment packets counter. */
1748 	rxq->stats.ipackets += i;
1749 #endif
1750 	return i;
1751 }
1752 
1753 /**
1754  * Dummy DPDK callback for TX.
1755  *
1756  * This function is used to temporarily replace the real callback during
1757  * unsafe control operations on the queue, or in case of error.
1758  *
1759  * @param dpdk_txq
1760  *   Generic pointer to TX queue structure.
1761  * @param[in] pkts
1762  *   Packets to transmit.
1763  * @param pkts_n
1764  *   Number of packets in array.
1765  *
1766  * @return
1767  *   Number of packets successfully transmitted (<= pkts_n).
1768  */
1769 uint16_t
1770 removed_tx_burst(void *dpdk_txq __rte_unused,
1771 		 struct rte_mbuf **pkts __rte_unused,
1772 		 uint16_t pkts_n __rte_unused)
1773 {
1774 	rte_mb();
1775 	return 0;
1776 }
1777 
1778 /**
1779  * Dummy DPDK callback for RX.
1780  *
1781  * This function is used to temporarily replace the real callback during
1782  * unsafe control operations on the queue, or in case of error.
1783  *
1784  * @param dpdk_rxq
1785  *   Generic pointer to RX queue structure.
1786  * @param[out] pkts
1787  *   Array to store received packets.
1788  * @param pkts_n
1789  *   Maximum number of packets in array.
1790  *
1791  * @return
1792  *   Number of packets successfully received (<= pkts_n).
1793  */
1794 uint16_t
1795 removed_rx_burst(void *dpdk_txq __rte_unused,
1796 		 struct rte_mbuf **pkts __rte_unused,
1797 		 uint16_t pkts_n __rte_unused)
1798 {
1799 	rte_mb();
1800 	return 0;
1801 }
1802 
1803 /*
1804  * Vectorized Rx/Tx routines are not compiled in when required vector
1805  * instructions are not supported on a target architecture. The following null
1806  * stubs are needed for linkage when those are not included outside of this file
1807  * (e.g.  mlx5_rxtx_vec_sse.c for x86).
1808  */
1809 
1810 __rte_weak uint16_t
1811 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,
1812 		  struct rte_mbuf **pkts __rte_unused,
1813 		  uint16_t pkts_n __rte_unused)
1814 {
1815 	return 0;
1816 }
1817 
1818 __rte_weak int
1819 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
1820 {
1821 	return -ENOTSUP;
1822 }
1823 
1824 __rte_weak int
1825 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
1826 {
1827 	return -ENOTSUP;
1828 }
1829 
1830 /**
1831  * Free the mbufs from the linear array of pointers.
1832  *
1833  * @param pkts
1834  *   Pointer to array of packets to be free.
1835  * @param pkts_n
1836  *   Number of packets to be freed.
1837  * @param olx
1838  *   Configured Tx offloads mask. It is fully defined at
1839  *   compile time and may be used for optimization.
1840  */
1841 static __rte_always_inline void
1842 mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts,
1843 		  unsigned int pkts_n,
1844 		  unsigned int olx __rte_unused)
1845 {
1846 	struct rte_mempool *pool = NULL;
1847 	struct rte_mbuf **p_free = NULL;
1848 	struct rte_mbuf *mbuf;
1849 	unsigned int n_free = 0;
1850 
1851 	/*
1852 	 * The implemented algorithm eliminates
1853 	 * copying pointers to temporary array
1854 	 * for rte_mempool_put_bulk() calls.
1855 	 */
1856 	assert(pkts);
1857 	assert(pkts_n);
1858 	for (;;) {
1859 		for (;;) {
1860 			/*
1861 			 * Decrement mbuf reference counter, detach
1862 			 * indirect and external buffers if needed.
1863 			 */
1864 			mbuf = rte_pktmbuf_prefree_seg(*pkts);
1865 			if (likely(mbuf != NULL)) {
1866 				assert(mbuf == *pkts);
1867 				if (likely(n_free != 0)) {
1868 					if (unlikely(pool != mbuf->pool))
1869 						/* From different pool. */
1870 						break;
1871 				} else {
1872 					/* Start new scan array. */
1873 					pool = mbuf->pool;
1874 					p_free = pkts;
1875 				}
1876 				++n_free;
1877 				++pkts;
1878 				--pkts_n;
1879 				if (unlikely(pkts_n == 0)) {
1880 					mbuf = NULL;
1881 					break;
1882 				}
1883 			} else {
1884 				/*
1885 				 * This happens if mbuf is still referenced.
1886 				 * We can't put it back to the pool, skip.
1887 				 */
1888 				++pkts;
1889 				--pkts_n;
1890 				if (unlikely(n_free != 0))
1891 					/* There is some array to free.*/
1892 					break;
1893 				if (unlikely(pkts_n == 0))
1894 					/* Last mbuf, nothing to free. */
1895 					return;
1896 			}
1897 		}
1898 		for (;;) {
1899 			/*
1900 			 * This loop is implemented to avoid multiple
1901 			 * inlining of rte_mempool_put_bulk().
1902 			 */
1903 			assert(pool);
1904 			assert(p_free);
1905 			assert(n_free);
1906 			/*
1907 			 * Free the array of pre-freed mbufs
1908 			 * belonging to the same memory pool.
1909 			 */
1910 			rte_mempool_put_bulk(pool, (void *)p_free, n_free);
1911 			if (unlikely(mbuf != NULL)) {
1912 				/* There is the request to start new scan. */
1913 				pool = mbuf->pool;
1914 				p_free = pkts++;
1915 				n_free = 1;
1916 				--pkts_n;
1917 				if (likely(pkts_n != 0))
1918 					break;
1919 				/*
1920 				 * This is the last mbuf to be freed.
1921 				 * Do one more loop iteration to complete.
1922 				 * This is rare case of the last unique mbuf.
1923 				 */
1924 				mbuf = NULL;
1925 				continue;
1926 			}
1927 			if (likely(pkts_n == 0))
1928 				return;
1929 			n_free = 0;
1930 			break;
1931 		}
1932 	}
1933 }
1934 
1935 /**
1936  * Free the mbuf from the elts ring buffer till new tail.
1937  *
1938  * @param txq
1939  *   Pointer to Tx queue structure.
1940  * @param tail
1941  *   Index in elts to free up to, becomes new elts tail.
1942  * @param olx
1943  *   Configured Tx offloads mask. It is fully defined at
1944  *   compile time and may be used for optimization.
1945  */
1946 static __rte_always_inline void
1947 mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq,
1948 		  uint16_t tail,
1949 		  unsigned int olx __rte_unused)
1950 {
1951 	uint16_t n_elts = tail - txq->elts_tail;
1952 
1953 	assert(n_elts);
1954 	assert(n_elts <= txq->elts_s);
1955 	/*
1956 	 * Implement a loop to support ring buffer wraparound
1957 	 * with single inlining of mlx5_tx_free_mbuf().
1958 	 */
1959 	do {
1960 		unsigned int part;
1961 
1962 		part = txq->elts_s - (txq->elts_tail & txq->elts_m);
1963 		part = RTE_MIN(part, n_elts);
1964 		assert(part);
1965 		assert(part <= txq->elts_s);
1966 		mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m],
1967 				  part, olx);
1968 		txq->elts_tail += part;
1969 		n_elts -= part;
1970 	} while (n_elts);
1971 }
1972 
1973 /**
1974  * Store the mbuf being sent into elts ring buffer.
1975  * On Tx completion these mbufs will be freed.
1976  *
1977  * @param txq
1978  *   Pointer to Tx queue structure.
1979  * @param pkts
1980  *   Pointer to array of packets to be stored.
1981  * @param pkts_n
1982  *   Number of packets to be stored.
1983  * @param olx
1984  *   Configured Tx offloads mask. It is fully defined at
1985  *   compile time and may be used for optimization.
1986  */
1987 static __rte_always_inline void
1988 mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq,
1989 		  struct rte_mbuf **restrict pkts,
1990 		  unsigned int pkts_n,
1991 		  unsigned int olx __rte_unused)
1992 {
1993 	unsigned int part;
1994 	struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts;
1995 
1996 	assert(pkts);
1997 	assert(pkts_n);
1998 	part = txq->elts_s - (txq->elts_head & txq->elts_m);
1999 	assert(part);
2000 	assert(part <= txq->elts_s);
2001 	/* This code is a good candidate for vectorizing with SIMD. */
2002 	rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)),
2003 		   (void *)pkts,
2004 		   RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *));
2005 	txq->elts_head += pkts_n;
2006 	if (unlikely(part < pkts_n))
2007 		/* The copy is wrapping around the elts array. */
2008 		rte_memcpy((void *)elts, (void *)(pkts + part),
2009 			   (pkts_n - part) * sizeof(struct rte_mbuf *));
2010 }
2011 
2012 /**
2013  * Manage TX completions. This routine checks the CQ for
2014  * arrived CQEs, deduces the last accomplished WQE in SQ,
2015  * updates SQ producing index and frees all completed mbufs.
2016  *
2017  * @param txq
2018  *   Pointer to TX queue structure.
2019  * @param olx
2020  *   Configured Tx offloads mask. It is fully defined at
2021  *   compile time and may be used for optimization.
2022  *
2023  * NOTE: not inlined intentionally, it makes tx_burst
2024  * routine smaller, simple and faster - from experiments.
2025  */
2026 static void
2027 mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
2028 			  unsigned int olx __rte_unused)
2029 {
2030 	unsigned int count = MLX5_TX_COMP_MAX_CQE;
2031 	bool update = false;
2032 	uint16_t tail = txq->elts_tail;
2033 	int ret;
2034 
2035 	do {
2036 		volatile struct mlx5_cqe *cqe;
2037 
2038 		cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
2039 		ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
2040 		if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
2041 			if (likely(ret != MLX5_CQE_STATUS_ERR)) {
2042 				/* No new CQEs in completion queue. */
2043 				assert(ret == MLX5_CQE_STATUS_HW_OWN);
2044 				break;
2045 			}
2046 			/* Some error occurred, try to restart. */
2047 			rte_wmb();
2048 			tail = mlx5_tx_error_cqe_handle
2049 				(txq, (volatile struct mlx5_err_cqe *)cqe);
2050 			if (likely(tail != txq->elts_tail)) {
2051 				mlx5_tx_free_elts(txq, tail, olx);
2052 				assert(tail == txq->elts_tail);
2053 			}
2054 			/* Allow flushing all CQEs from the queue. */
2055 			count = txq->cqe_s;
2056 		} else {
2057 			volatile struct mlx5_wqe_cseg *cseg;
2058 
2059 			/* Normal transmit completion. */
2060 			++txq->cq_ci;
2061 			rte_cio_rmb();
2062 			txq->wqe_pi = rte_be_to_cpu_16(cqe->wqe_counter);
2063 			cseg = (volatile struct mlx5_wqe_cseg *)
2064 				(txq->wqes + (txq->wqe_pi & txq->wqe_m));
2065 			tail = cseg->misc;
2066 		}
2067 #ifndef NDEBUG
2068 		if (txq->cq_pi)
2069 			--txq->cq_pi;
2070 #endif
2071 		update = true;
2072 	/*
2073 	 * We have to restrict the amount of processed CQEs
2074 	 * in one tx_burst routine call. The CQ may be large
2075 	 * and many CQEs may be updated by the NIC in one
2076 	 * transaction. Buffers freeing is time consuming,
2077 	 * multiple iterations may introduce significant
2078 	 * latency.
2079 	 */
2080 	} while (--count);
2081 	if (likely(tail != txq->elts_tail)) {
2082 		/* Free data buffers from elts. */
2083 		mlx5_tx_free_elts(txq, tail, olx);
2084 		assert(tail == txq->elts_tail);
2085 	}
2086 	if (likely(update)) {
2087 		/* Update the consumer index. */
2088 		rte_compiler_barrier();
2089 		*txq->cq_db =
2090 		rte_cpu_to_be_32(txq->cq_ci);
2091 	}
2092 }
2093 
2094 /**
2095  * Check if the completion request flag should be set in the last WQE.
2096  * Both pushed mbufs and WQEs are monitored and the completion request
2097  * flag is set if any of thresholds is reached.
2098  *
2099  * @param txq
2100  *   Pointer to TX queue structure.
2101  * @param loc
2102  *   Pointer to burst routine local context.
2103  * @param olx
2104  *   Configured Tx offloads mask. It is fully defined at
2105  *   compile time and may be used for optimization.
2106  */
2107 static __rte_always_inline void
2108 mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq,
2109 			   struct mlx5_txq_local *restrict loc,
2110 			   unsigned int olx)
2111 {
2112 	uint16_t head = txq->elts_head;
2113 	unsigned int part;
2114 
2115 	part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc->pkts_sent -
2116 		(MLX5_TXOFF_CONFIG(MULTI) ? loc->pkts_copy : 0);
2117 	head += part;
2118 	if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH ||
2119 	     (MLX5_TXOFF_CONFIG(INLINE) &&
2120 	     (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) {
2121 		volatile struct mlx5_wqe *last = loc->wqe_last;
2122 
2123 		txq->elts_comp = head;
2124 		if (MLX5_TXOFF_CONFIG(INLINE))
2125 			txq->wqe_comp = txq->wqe_ci;
2126 		/* Request unconditional completion on last WQE. */
2127 		last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
2128 					    MLX5_COMP_MODE_OFFSET);
2129 		/* Save elts_head in unused "immediate" field of WQE. */
2130 		last->cseg.misc = head;
2131 		/*
2132 		 * A CQE slot must always be available. Count the
2133 		 * issued CEQ "always" request instead of production
2134 		 * index due to here can be CQE with errors and
2135 		 * difference with ci may become inconsistent.
2136 		 */
2137 		assert(txq->cqe_s > ++txq->cq_pi);
2138 	}
2139 }
2140 
2141 /**
2142  * DPDK callback to check the status of a tx descriptor.
2143  *
2144  * @param tx_queue
2145  *   The tx queue.
2146  * @param[in] offset
2147  *   The index of the descriptor in the ring.
2148  *
2149  * @return
2150  *   The status of the tx descriptor.
2151  */
2152 int
2153 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
2154 {
2155 	struct mlx5_txq_data *restrict txq = tx_queue;
2156 	uint16_t used;
2157 
2158 	mlx5_tx_handle_completion(txq, 0);
2159 	used = txq->elts_head - txq->elts_tail;
2160 	if (offset < used)
2161 		return RTE_ETH_TX_DESC_FULL;
2162 	return RTE_ETH_TX_DESC_DONE;
2163 }
2164 
2165 /**
2166  * Build the Control Segment with specified opcode:
2167  * - MLX5_OPCODE_SEND
2168  * - MLX5_OPCODE_ENHANCED_MPSW
2169  * - MLX5_OPCODE_TSO
2170  *
2171  * @param txq
2172  *   Pointer to TX queue structure.
2173  * @param loc
2174  *   Pointer to burst routine local context.
2175  * @param wqe
2176  *   Pointer to WQE to fill with built Control Segment.
2177  * @param ds
2178  *   Supposed length of WQE in segments.
2179  * @param opcode
2180  *   SQ WQE opcode to put into Control Segment.
2181  * @param olx
2182  *   Configured Tx offloads mask. It is fully defined at
2183  *   compile time and may be used for optimization.
2184  */
2185 static __rte_always_inline void
2186 mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq,
2187 		  struct mlx5_txq_local *restrict loc __rte_unused,
2188 		  struct mlx5_wqe *restrict wqe,
2189 		  unsigned int ds,
2190 		  unsigned int opcode,
2191 		  unsigned int olx __rte_unused)
2192 {
2193 	struct mlx5_wqe_cseg *restrict cs = &wqe->cseg;
2194 
2195 	cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode);
2196 	cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
2197 	cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR <<
2198 			     MLX5_COMP_MODE_OFFSET);
2199 	cs->misc = RTE_BE32(0);
2200 }
2201 
2202 /**
2203  * Build the Ethernet Segment without inlined data.
2204  * Supports Software Parser, Checksums and VLAN
2205  * insertion Tx offload features.
2206  *
2207  * @param txq
2208  *   Pointer to TX queue structure.
2209  * @param loc
2210  *   Pointer to burst routine local context.
2211  * @param wqe
2212  *   Pointer to WQE to fill with built Ethernet Segment.
2213  * @param olx
2214  *   Configured Tx offloads mask. It is fully defined at
2215  *   compile time and may be used for optimization.
2216  */
2217 static __rte_always_inline void
2218 mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused,
2219 		  struct mlx5_txq_local *restrict loc,
2220 		  struct mlx5_wqe *restrict wqe,
2221 		  unsigned int olx)
2222 {
2223 	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
2224 	uint32_t csum;
2225 
2226 	/*
2227 	 * Calculate and set check sum flags first, dword field
2228 	 * in segment may be shared with Software Parser flags.
2229 	 */
2230 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
2231 	es->flags = rte_cpu_to_le_32(csum);
2232 	/*
2233 	 * Calculate and set Software Parser offsets and flags.
2234 	 * These flags a set for custom UDP and IP tunnel packets.
2235 	 */
2236 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
2237 	/* Fill metadata field if needed. */
2238 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
2239 		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
2240 		       loc->mbuf->tx_metadata : 0 : 0;
2241 	/* Engage VLAN tag insertion feature if requested. */
2242 	if (MLX5_TXOFF_CONFIG(VLAN) &&
2243 	    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
2244 		/*
2245 		 * We should get here only if device support
2246 		 * this feature correctly.
2247 		 */
2248 		assert(txq->vlan_en);
2249 		es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
2250 						  loc->mbuf->vlan_tci);
2251 	} else {
2252 		es->inline_hdr = RTE_BE32(0);
2253 	}
2254 }
2255 
2256 /**
2257  * Build the Ethernet Segment with minimal inlined data
2258  * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is
2259  * used to fill the gap in single WQEBB WQEs.
2260  * Supports Software Parser, Checksums and VLAN
2261  * insertion Tx offload features.
2262  *
2263  * @param txq
2264  *   Pointer to TX queue structure.
2265  * @param loc
2266  *   Pointer to burst routine local context.
2267  * @param wqe
2268  *   Pointer to WQE to fill with built Ethernet Segment.
2269  * @param vlan
2270  *   Length of VLAN tag insertion if any.
2271  * @param olx
2272  *   Configured Tx offloads mask. It is fully defined at
2273  *   compile time and may be used for optimization.
2274  */
2275 static __rte_always_inline void
2276 mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused,
2277 		  struct mlx5_txq_local *restrict loc,
2278 		  struct mlx5_wqe *restrict wqe,
2279 		  unsigned int vlan,
2280 		  unsigned int olx)
2281 {
2282 	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
2283 	uint32_t csum;
2284 	uint8_t *psrc, *pdst;
2285 
2286 	/*
2287 	 * Calculate and set check sum flags first, dword field
2288 	 * in segment may be shared with Software Parser flags.
2289 	 */
2290 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
2291 	es->flags = rte_cpu_to_le_32(csum);
2292 	/*
2293 	 * Calculate and set Software Parser offsets and flags.
2294 	 * These flags a set for custom UDP and IP tunnel packets.
2295 	 */
2296 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
2297 	/* Fill metadata field if needed. */
2298 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
2299 		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
2300 		       loc->mbuf->tx_metadata : 0 : 0;
2301 	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
2302 				(sizeof(uint16_t) +
2303 				 sizeof(rte_v128u32_t)),
2304 		      "invalid Ethernet Segment data size");
2305 	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
2306 				(sizeof(uint16_t) +
2307 				 sizeof(struct rte_vlan_hdr) +
2308 				 2 * RTE_ETHER_ADDR_LEN),
2309 		      "invalid Ethernet Segment data size");
2310 	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
2311 	es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
2312 	es->inline_data = *(unaligned_uint16_t *)psrc;
2313 	psrc +=	sizeof(uint16_t);
2314 	pdst = (uint8_t *)(es + 1);
2315 	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
2316 		/* Implement VLAN tag insertion as part inline data. */
2317 		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
2318 		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
2319 		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
2320 		/* Insert VLAN ethertype + VLAN tag. */
2321 		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
2322 						((RTE_ETHER_TYPE_VLAN << 16) |
2323 						 loc->mbuf->vlan_tci);
2324 		pdst += sizeof(struct rte_vlan_hdr);
2325 		/* Copy the rest two bytes from packet data. */
2326 		assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
2327 		*(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
2328 	} else {
2329 		/* Fill the gap in the title WQEBB with inline data. */
2330 		rte_mov16(pdst, psrc);
2331 	}
2332 }
2333 
2334 /**
2335  * Build the Ethernet Segment with entire packet
2336  * data inlining. Checks the boundary of WQEBB and
2337  * ring buffer wrapping, supports Software Parser,
2338  * Checksums and VLAN insertion Tx offload features.
2339  *
2340  * @param txq
2341  *   Pointer to TX queue structure.
2342  * @param loc
2343  *   Pointer to burst routine local context.
2344  * @param wqe
2345  *   Pointer to WQE to fill with built Ethernet Segment.
2346  * @param vlan
2347  *   Length of VLAN tag insertion if any.
2348  * @param inlen
2349  *   Length of data to inline (VLAN included, if any).
2350  * @param tso
2351  *   TSO flag, set mss field from the packet.
2352  * @param olx
2353  *   Configured Tx offloads mask. It is fully defined at
2354  *   compile time and may be used for optimization.
2355  *
2356  * @return
2357  *   Pointer to the next Data Segment (aligned and wrapped around).
2358  */
2359 static __rte_always_inline struct mlx5_wqe_dseg *
2360 mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq,
2361 		  struct mlx5_txq_local *restrict loc,
2362 		  struct mlx5_wqe *restrict wqe,
2363 		  unsigned int vlan,
2364 		  unsigned int inlen,
2365 		  unsigned int tso,
2366 		  unsigned int olx)
2367 {
2368 	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
2369 	uint32_t csum;
2370 	uint8_t *psrc, *pdst;
2371 	unsigned int part;
2372 
2373 	/*
2374 	 * Calculate and set check sum flags first, dword field
2375 	 * in segment may be shared with Software Parser flags.
2376 	 */
2377 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
2378 	if (tso) {
2379 		csum <<= 24;
2380 		csum |= loc->mbuf->tso_segsz;
2381 		es->flags = rte_cpu_to_be_32(csum);
2382 	} else {
2383 		es->flags = rte_cpu_to_le_32(csum);
2384 	}
2385 	/*
2386 	 * Calculate and set Software Parser offsets and flags.
2387 	 * These flags a set for custom UDP and IP tunnel packets.
2388 	 */
2389 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
2390 	/* Fill metadata field if needed. */
2391 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
2392 		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
2393 		       loc->mbuf->tx_metadata : 0 : 0;
2394 	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
2395 				(sizeof(uint16_t) +
2396 				 sizeof(rte_v128u32_t)),
2397 		      "invalid Ethernet Segment data size");
2398 	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
2399 				(sizeof(uint16_t) +
2400 				 sizeof(struct rte_vlan_hdr) +
2401 				 2 * RTE_ETHER_ADDR_LEN),
2402 		      "invalid Ethernet Segment data size");
2403 	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
2404 	es->inline_hdr_sz = rte_cpu_to_be_16(inlen);
2405 	es->inline_data = *(unaligned_uint16_t *)psrc;
2406 	psrc +=	sizeof(uint16_t);
2407 	pdst = (uint8_t *)(es + 1);
2408 	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
2409 		/* Implement VLAN tag insertion as part inline data. */
2410 		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
2411 		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
2412 		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
2413 		/* Insert VLAN ethertype + VLAN tag. */
2414 		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
2415 						((RTE_ETHER_TYPE_VLAN << 16) |
2416 						 loc->mbuf->vlan_tci);
2417 		pdst += sizeof(struct rte_vlan_hdr);
2418 		/* Copy the rest two bytes from packet data. */
2419 		assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
2420 		*(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
2421 		psrc += sizeof(uint16_t);
2422 	} else {
2423 		/* Fill the gap in the title WQEBB with inline data. */
2424 		rte_mov16(pdst, psrc);
2425 		psrc += sizeof(rte_v128u32_t);
2426 	}
2427 	pdst = (uint8_t *)(es + 2);
2428 	assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
2429 	assert(pdst < (uint8_t *)txq->wqes_end);
2430 	inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
2431 	if (!inlen) {
2432 		assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
2433 		return (struct mlx5_wqe_dseg *)pdst;
2434 	}
2435 	/*
2436 	 * The WQEBB space availability is checked by caller.
2437 	 * Here we should be aware of WQE ring buffer wraparound only.
2438 	 */
2439 	part = (uint8_t *)txq->wqes_end - pdst;
2440 	part = RTE_MIN(part, inlen);
2441 	do {
2442 		rte_memcpy(pdst, psrc, part);
2443 		inlen -= part;
2444 		if (likely(!inlen)) {
2445 			/*
2446 			 * If return value is not used by the caller
2447 			 * the code below will be optimized out.
2448 			 */
2449 			pdst += part;
2450 			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
2451 			if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
2452 				pdst = (uint8_t *)txq->wqes;
2453 			return (struct mlx5_wqe_dseg *)pdst;
2454 		}
2455 		pdst = (uint8_t *)txq->wqes;
2456 		psrc += part;
2457 		part = inlen;
2458 	} while (true);
2459 }
2460 
2461 /**
2462  * Copy data from chain of mbuf to the specified linear buffer.
2463  * Checksums and VLAN insertion Tx offload features. If data
2464  * from some mbuf copied completely this mbuf is freed. Local
2465  * structure is used to keep the byte stream state.
2466  *
2467  * @param pdst
2468  *   Pointer to the destination linear buffer.
2469  * @param loc
2470  *   Pointer to burst routine local context.
2471  * @param len
2472  *   Length of data to be copied.
2473  * @param olx
2474  *   Configured Tx offloads mask. It is fully defined at
2475  *   compile time and may be used for optimization.
2476  */
2477 static __rte_always_inline void
2478 mlx5_tx_mseg_memcpy(uint8_t *pdst,
2479 		    struct mlx5_txq_local *restrict loc,
2480 		    unsigned int len,
2481 		    unsigned int olx __rte_unused)
2482 {
2483 	struct rte_mbuf *mbuf;
2484 	unsigned int part, dlen;
2485 	uint8_t *psrc;
2486 
2487 	assert(len);
2488 	do {
2489 		/* Allow zero length packets, must check first. */
2490 		dlen = rte_pktmbuf_data_len(loc->mbuf);
2491 		if (dlen <= loc->mbuf_off) {
2492 			/* Exhausted packet, just free. */
2493 			mbuf = loc->mbuf;
2494 			loc->mbuf = mbuf->next;
2495 			rte_pktmbuf_free_seg(mbuf);
2496 			loc->mbuf_off = 0;
2497 			assert(loc->mbuf_nseg > 1);
2498 			assert(loc->mbuf);
2499 			--loc->mbuf_nseg;
2500 			continue;
2501 		}
2502 		dlen -= loc->mbuf_off;
2503 		psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
2504 					       loc->mbuf_off);
2505 		part = RTE_MIN(len, dlen);
2506 		rte_memcpy(pdst, psrc, part);
2507 		loc->mbuf_off += part;
2508 		len -= part;
2509 		if (!len) {
2510 			if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) {
2511 				loc->mbuf_off = 0;
2512 				/* Exhausted packet, just free. */
2513 				mbuf = loc->mbuf;
2514 				loc->mbuf = mbuf->next;
2515 				rte_pktmbuf_free_seg(mbuf);
2516 				loc->mbuf_off = 0;
2517 				assert(loc->mbuf_nseg >= 1);
2518 				--loc->mbuf_nseg;
2519 			}
2520 			return;
2521 		}
2522 		pdst += part;
2523 	} while (true);
2524 }
2525 
2526 /**
2527  * Build the Ethernet Segment with inlined data from
2528  * multi-segment packet. Checks the boundary of WQEBB
2529  * and ring buffer wrapping, supports Software Parser,
2530  * Checksums and VLAN insertion Tx offload features.
2531  *
2532  * @param txq
2533  *   Pointer to TX queue structure.
2534  * @param loc
2535  *   Pointer to burst routine local context.
2536  * @param wqe
2537  *   Pointer to WQE to fill with built Ethernet Segment.
2538  * @param vlan
2539  *   Length of VLAN tag insertion if any.
2540  * @param inlen
2541  *   Length of data to inline (VLAN included, if any).
2542  * @param tso
2543  *   TSO flag, set mss field from the packet.
2544  * @param olx
2545  *   Configured Tx offloads mask. It is fully defined at
2546  *   compile time and may be used for optimization.
2547  *
2548  * @return
2549  *   Pointer to the next Data Segment (aligned and
2550  *   possible NOT wrapped around - caller should do
2551  *   wrapping check on its own).
2552  */
2553 static __rte_always_inline struct mlx5_wqe_dseg *
2554 mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq,
2555 		  struct mlx5_txq_local *restrict loc,
2556 		  struct mlx5_wqe *restrict wqe,
2557 		  unsigned int vlan,
2558 		  unsigned int inlen,
2559 		  unsigned int tso,
2560 		  unsigned int olx)
2561 {
2562 	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
2563 	uint32_t csum;
2564 	uint8_t *pdst;
2565 	unsigned int part;
2566 
2567 	/*
2568 	 * Calculate and set check sum flags first, uint32_t field
2569 	 * in segment may be shared with Software Parser flags.
2570 	 */
2571 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
2572 	if (tso) {
2573 		csum <<= 24;
2574 		csum |= loc->mbuf->tso_segsz;
2575 		es->flags = rte_cpu_to_be_32(csum);
2576 	} else {
2577 		es->flags = rte_cpu_to_le_32(csum);
2578 	}
2579 	/*
2580 	 * Calculate and set Software Parser offsets and flags.
2581 	 * These flags a set for custom UDP and IP tunnel packets.
2582 	 */
2583 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
2584 	/* Fill metadata field if needed. */
2585 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
2586 		       loc->mbuf->ol_flags & PKT_TX_METADATA ?
2587 		       loc->mbuf->tx_metadata : 0 : 0;
2588 	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
2589 				(sizeof(uint16_t) +
2590 				 sizeof(rte_v128u32_t)),
2591 		      "invalid Ethernet Segment data size");
2592 	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
2593 				(sizeof(uint16_t) +
2594 				 sizeof(struct rte_vlan_hdr) +
2595 				 2 * RTE_ETHER_ADDR_LEN),
2596 		      "invalid Ethernet Segment data size");
2597 	assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
2598 	es->inline_hdr_sz = rte_cpu_to_be_16(inlen);
2599 	pdst = (uint8_t *)&es->inline_data;
2600 	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
2601 		/* Implement VLAN tag insertion as part inline data. */
2602 		mlx5_tx_mseg_memcpy(pdst, loc, 2 * RTE_ETHER_ADDR_LEN, olx);
2603 		pdst += 2 * RTE_ETHER_ADDR_LEN;
2604 		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
2605 						((RTE_ETHER_TYPE_VLAN << 16) |
2606 						 loc->mbuf->vlan_tci);
2607 		pdst += sizeof(struct rte_vlan_hdr);
2608 		inlen -= 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr);
2609 	}
2610 	assert(pdst < (uint8_t *)txq->wqes_end);
2611 	/*
2612 	 * The WQEBB space availability is checked by caller.
2613 	 * Here we should be aware of WQE ring buffer wraparound only.
2614 	 */
2615 	part = (uint8_t *)txq->wqes_end - pdst;
2616 	part = RTE_MIN(part, inlen);
2617 	assert(part);
2618 	do {
2619 		mlx5_tx_mseg_memcpy(pdst, loc, part, olx);
2620 		inlen -= part;
2621 		if (likely(!inlen)) {
2622 			pdst += part;
2623 			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
2624 			return (struct mlx5_wqe_dseg *)pdst;
2625 		}
2626 		pdst = (uint8_t *)txq->wqes;
2627 		part = inlen;
2628 	} while (true);
2629 }
2630 
2631 /**
2632  * Build the Data Segment of pointer type.
2633  *
2634  * @param txq
2635  *   Pointer to TX queue structure.
2636  * @param loc
2637  *   Pointer to burst routine local context.
2638  * @param dseg
2639  *   Pointer to WQE to fill with built Data Segment.
2640  * @param buf
2641  *   Data buffer to point.
2642  * @param len
2643  *   Data buffer length.
2644  * @param olx
2645  *   Configured Tx offloads mask. It is fully defined at
2646  *   compile time and may be used for optimization.
2647  */
2648 static __rte_always_inline void
2649 mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq,
2650 		 struct mlx5_txq_local *restrict loc,
2651 		 struct mlx5_wqe_dseg *restrict dseg,
2652 		 uint8_t *buf,
2653 		 unsigned int len,
2654 		 unsigned int olx __rte_unused)
2655 
2656 {
2657 	assert(len);
2658 	dseg->bcount = rte_cpu_to_be_32(len);
2659 	dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
2660 	dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
2661 }
2662 
2663 /**
2664  * Build the Data Segment of pointer type or inline
2665  * if data length is less than buffer in minimal
2666  * Data Segment size.
2667  *
2668  * @param txq
2669  *   Pointer to TX queue structure.
2670  * @param loc
2671  *   Pointer to burst routine local context.
2672  * @param dseg
2673  *   Pointer to WQE to fill with built Data Segment.
2674  * @param buf
2675  *   Data buffer to point.
2676  * @param len
2677  *   Data buffer length.
2678  * @param olx
2679  *   Configured Tx offloads mask. It is fully defined at
2680  *   compile time and may be used for optimization.
2681  */
2682 static __rte_always_inline void
2683 mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq,
2684 		  struct mlx5_txq_local *restrict loc,
2685 		  struct mlx5_wqe_dseg *restrict dseg,
2686 		  uint8_t *buf,
2687 		  unsigned int len,
2688 		  unsigned int olx __rte_unused)
2689 
2690 {
2691 	uintptr_t dst, src;
2692 
2693 	assert(len);
2694 	if (len > MLX5_DSEG_MIN_INLINE_SIZE) {
2695 		dseg->bcount = rte_cpu_to_be_32(len);
2696 		dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
2697 		dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
2698 
2699 		return;
2700 	}
2701 	dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
2702 	/* Unrolled implementation of generic rte_memcpy. */
2703 	dst = (uintptr_t)&dseg->inline_data[0];
2704 	src = (uintptr_t)buf;
2705 #ifdef RTE_ARCH_STRICT_ALIGN
2706 	memcpy(dst, src, len);
2707 #else
2708 	if (len & 0x08) {
2709 		*(uint64_t *)dst = *(uint64_t *)src;
2710 		dst += sizeof(uint64_t);
2711 		src += sizeof(uint64_t);
2712 	}
2713 	if (len & 0x04) {
2714 		*(uint32_t *)dst = *(uint32_t *)src;
2715 		dst += sizeof(uint32_t);
2716 		src += sizeof(uint32_t);
2717 	}
2718 	if (len & 0x02) {
2719 		*(uint16_t *)dst = *(uint16_t *)src;
2720 		dst += sizeof(uint16_t);
2721 		src += sizeof(uint16_t);
2722 	}
2723 	if (len & 0x01)
2724 		*(uint8_t *)dst = *(uint8_t *)src;
2725 #endif
2726 }
2727 
2728 /**
2729  * Build the Data Segment of inlined data from single
2730  * segment packet, no VLAN insertion.
2731  *
2732  * @param txq
2733  *   Pointer to TX queue structure.
2734  * @param loc
2735  *   Pointer to burst routine local context.
2736  * @param dseg
2737  *   Pointer to WQE to fill with built Data Segment.
2738  * @param buf
2739  *   Data buffer to point.
2740  * @param len
2741  *   Data buffer length.
2742  * @param olx
2743  *   Configured Tx offloads mask. It is fully defined at
2744  *   compile time and may be used for optimization.
2745  *
2746  * @return
2747  *   Pointer to the next Data Segment after inlined data.
2748  *   Ring buffer wraparound check is needed. We do not
2749  *   do it here because it may not be needed for the
2750  *   last packet in the eMPW session.
2751  */
2752 static __rte_always_inline struct mlx5_wqe_dseg *
2753 mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq,
2754 		  struct mlx5_txq_local *restrict loc __rte_unused,
2755 		  struct mlx5_wqe_dseg *restrict dseg,
2756 		  uint8_t *buf,
2757 		  unsigned int len,
2758 		  unsigned int olx __rte_unused)
2759 {
2760 	unsigned int part;
2761 	uint8_t *pdst;
2762 
2763 	dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
2764 	pdst = &dseg->inline_data[0];
2765 	/*
2766 	 * The WQEBB space availability is checked by caller.
2767 	 * Here we should be aware of WQE ring buffer wraparound only.
2768 	 */
2769 	part = (uint8_t *)txq->wqes_end - pdst;
2770 	part = RTE_MIN(part, len);
2771 	do {
2772 		rte_memcpy(pdst, buf, part);
2773 		len -= part;
2774 		if (likely(!len)) {
2775 			pdst += part;
2776 			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
2777 			/* Note: no final wraparound check here. */
2778 			return (struct mlx5_wqe_dseg *)pdst;
2779 		}
2780 		pdst = (uint8_t *)txq->wqes;
2781 		buf += part;
2782 		part = len;
2783 	} while (true);
2784 }
2785 
2786 /**
2787  * Build the Data Segment of inlined data from single
2788  * segment packet with VLAN insertion.
2789  *
2790  * @param txq
2791  *   Pointer to TX queue structure.
2792  * @param loc
2793  *   Pointer to burst routine local context.
2794  * @param dseg
2795  *   Pointer to the dseg fill with built Data Segment.
2796  * @param buf
2797  *   Data buffer to point.
2798  * @param len
2799  *   Data buffer length.
2800  * @param olx
2801  *   Configured Tx offloads mask. It is fully defined at
2802  *   compile time and may be used for optimization.
2803  *
2804  * @return
2805  *   Pointer to the next Data Segment after inlined data.
2806  *   Ring buffer wraparound check is needed.
2807  */
2808 static __rte_always_inline struct mlx5_wqe_dseg *
2809 mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq,
2810 		  struct mlx5_txq_local *restrict loc __rte_unused,
2811 		  struct mlx5_wqe_dseg *restrict dseg,
2812 		  uint8_t *buf,
2813 		  unsigned int len,
2814 		  unsigned int olx __rte_unused)
2815 
2816 {
2817 	unsigned int part;
2818 	uint8_t *pdst;
2819 
2820 	assert(len > MLX5_ESEG_MIN_INLINE_SIZE);
2821 	static_assert(MLX5_DSEG_MIN_INLINE_SIZE ==
2822 				 (2 * RTE_ETHER_ADDR_LEN),
2823 		      "invalid Data Segment data size");
2824 	dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) |
2825 					MLX5_ETH_WQE_DATA_INLINE);
2826 	pdst = &dseg->inline_data[0];
2827 	memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE);
2828 	buf += MLX5_DSEG_MIN_INLINE_SIZE;
2829 	pdst += MLX5_DSEG_MIN_INLINE_SIZE;
2830 	/* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */
2831 	assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
2832 	*(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) |
2833 					      loc->mbuf->vlan_tci);
2834 	pdst += sizeof(struct rte_vlan_hdr);
2835 	if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
2836 		pdst = (uint8_t *)txq->wqes;
2837 	/*
2838 	 * The WQEBB space availability is checked by caller.
2839 	 * Here we should be aware of WQE ring buffer wraparound only.
2840 	 */
2841 	part = (uint8_t *)txq->wqes_end - pdst;
2842 	part = RTE_MIN(part, len);
2843 	do {
2844 		rte_memcpy(pdst, buf, part);
2845 		len -= part;
2846 		if (likely(!len)) {
2847 			pdst += part;
2848 			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
2849 			/* Note: no final wraparound check here. */
2850 			return (struct mlx5_wqe_dseg *)pdst;
2851 		}
2852 		pdst = (uint8_t *)txq->wqes;
2853 		buf += part;
2854 		part = len;
2855 	} while (true);
2856 }
2857 
2858 /**
2859  * Build the Ethernet Segment with optionally inlined data with
2860  * VLAN insertion and following Data Segments (if any) from
2861  * multi-segment packet. Used by ordinary send and TSO.
2862  *
2863  * @param txq
2864  *   Pointer to TX queue structure.
2865  * @param loc
2866  *   Pointer to burst routine local context.
2867  * @param wqe
2868  *   Pointer to WQE to fill with built Ethernet/Data Segments.
2869  * @param vlan
2870  *   Length of VLAN header to insert, 0 means no VLAN insertion.
2871  * @param inlen
2872  *   Data length to inline. For TSO this parameter specifies
2873  *   exact value, for ordinary send routine can be aligned by
2874  *   caller to provide better WQE space saving and data buffer
2875  *   start address alignment. This length includes VLAN header
2876  *   being inserted.
2877  * @param tso
2878  *   Zero means ordinary send, inlined data can be extended,
2879  *   otherwise this is TSO, inlined data length is fixed.
2880  * @param olx
2881  *   Configured Tx offloads mask. It is fully defined at
2882  *   compile time and may be used for optimization.
2883  *
2884  * @return
2885  *   Actual size of built WQE in segments.
2886  */
2887 static __rte_always_inline unsigned int
2888 mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq,
2889 		   struct mlx5_txq_local *restrict loc,
2890 		   struct mlx5_wqe *restrict wqe,
2891 		   unsigned int vlan,
2892 		   unsigned int inlen,
2893 		   unsigned int tso,
2894 		   unsigned int olx __rte_unused)
2895 {
2896 	struct mlx5_wqe_dseg *restrict dseg;
2897 	unsigned int ds;
2898 
2899 	assert((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen);
2900 	loc->mbuf_nseg = NB_SEGS(loc->mbuf);
2901 	loc->mbuf_off = 0;
2902 
2903 	dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx);
2904 	if (!loc->mbuf_nseg)
2905 		goto dseg_done;
2906 	/*
2907 	 * There are still some mbuf remaining, not inlined.
2908 	 * The first mbuf may be partially inlined and we
2909 	 * must process the possible non-zero data offset.
2910 	 */
2911 	if (loc->mbuf_off) {
2912 		unsigned int dlen;
2913 		uint8_t *dptr;
2914 
2915 		/*
2916 		 * Exhausted packets must be dropped before.
2917 		 * Non-zero offset means there are some data
2918 		 * remained in the packet.
2919 		 */
2920 		assert(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf));
2921 		assert(rte_pktmbuf_data_len(loc->mbuf));
2922 		dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
2923 					       loc->mbuf_off);
2924 		dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off;
2925 		/*
2926 		 * Build the pointer/minimal data Data Segment.
2927 		 * Do ring buffer wrapping check in advance.
2928 		 */
2929 		if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
2930 			dseg = (struct mlx5_wqe_dseg *)txq->wqes;
2931 		mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx);
2932 		/* Store the mbuf to be freed on completion. */
2933 		assert(loc->elts_free);
2934 		txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
2935 		--loc->elts_free;
2936 		++dseg;
2937 		if (--loc->mbuf_nseg == 0)
2938 			goto dseg_done;
2939 		loc->mbuf = loc->mbuf->next;
2940 		loc->mbuf_off = 0;
2941 	}
2942 	do {
2943 		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
2944 			struct rte_mbuf *mbuf;
2945 
2946 			/* Zero length segment found, just skip. */
2947 			mbuf = loc->mbuf;
2948 			loc->mbuf = loc->mbuf->next;
2949 			rte_pktmbuf_free_seg(mbuf);
2950 			if (--loc->mbuf_nseg == 0)
2951 				break;
2952 		} else {
2953 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
2954 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
2955 			mlx5_tx_dseg_iptr
2956 				(txq, loc, dseg,
2957 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
2958 				 rte_pktmbuf_data_len(loc->mbuf), olx);
2959 			assert(loc->elts_free);
2960 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
2961 			--loc->elts_free;
2962 			++dseg;
2963 			if (--loc->mbuf_nseg == 0)
2964 				break;
2965 			loc->mbuf = loc->mbuf->next;
2966 		}
2967 	} while (true);
2968 
2969 dseg_done:
2970 	/* Calculate actual segments used from the dseg pointer. */
2971 	if ((uintptr_t)wqe < (uintptr_t)dseg)
2972 		ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE;
2973 	else
2974 		ds = (((uintptr_t)dseg - (uintptr_t)wqe) +
2975 		      txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE;
2976 	return ds;
2977 }
2978 
2979 /**
2980  * Tx one packet function for multi-segment TSO. Supports all
2981  * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs,
2982  * sends one packet per WQE.
2983  *
2984  * This routine is responsible for storing processed mbuf
2985  * into elts ring buffer and update elts_head.
2986  *
2987  * @param txq
2988  *   Pointer to TX queue structure.
2989  * @param loc
2990  *   Pointer to burst routine local context.
2991  * @param olx
2992  *   Configured Tx offloads mask. It is fully defined at
2993  *   compile time and may be used for optimization.
2994  *
2995  * @return
2996  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2997  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2998  * Local context variables partially updated.
2999  */
3000 static __rte_always_inline enum mlx5_txcmp_code
3001 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq,
3002 			struct mlx5_txq_local *restrict loc,
3003 			unsigned int olx)
3004 {
3005 	struct mlx5_wqe *restrict wqe;
3006 	unsigned int ds, dlen, inlen, ntcp, vlan = 0;
3007 
3008 	/*
3009 	 * Calculate data length to be inlined to estimate
3010 	 * the required space in WQE ring buffer.
3011 	 */
3012 	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
3013 	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
3014 		vlan = sizeof(struct rte_vlan_hdr);
3015 	inlen = loc->mbuf->l2_len + vlan +
3016 		loc->mbuf->l3_len + loc->mbuf->l4_len;
3017 	if (unlikely((!inlen || !loc->mbuf->tso_segsz)))
3018 		return MLX5_TXCMP_CODE_ERROR;
3019 	if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK)
3020 		inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len;
3021 	/* Packet must contain all TSO headers. */
3022 	if (unlikely(inlen > MLX5_MAX_TSO_HEADER ||
3023 		     inlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
3024 		     inlen > (dlen + vlan)))
3025 		return MLX5_TXCMP_CODE_ERROR;
3026 	assert(inlen >= txq->inlen_mode);
3027 	/*
3028 	 * Check whether there are enough free WQEBBs:
3029 	 * - Control Segment
3030 	 * - Ethernet Segment
3031 	 * - First Segment of inlined Ethernet data
3032 	 * - ... data continued ...
3033 	 * - Data Segments of pointer/min inline type
3034 	 */
3035 	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
3036 				       MLX5_ESEG_MIN_INLINE_SIZE +
3037 				       MLX5_WSEG_SIZE +
3038 				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3039 	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
3040 		return MLX5_TXCMP_CODE_EXIT;
3041 	/* Check for maximal WQE size. */
3042 	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
3043 		return MLX5_TXCMP_CODE_ERROR;
3044 #ifdef MLX5_PMD_SOFT_COUNTERS
3045 	/* Update sent data bytes/packets counters. */
3046 	ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) /
3047 		loc->mbuf->tso_segsz;
3048 	/*
3049 	 * One will be added for mbuf itself
3050 	 * at the end of the mlx5_tx_burst from
3051 	 * loc->pkts_sent field.
3052 	 */
3053 	--ntcp;
3054 	txq->stats.opackets += ntcp;
3055 	txq->stats.obytes += dlen + vlan + ntcp * inlen;
3056 #endif
3057 	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3058 	loc->wqe_last = wqe;
3059 	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx);
3060 	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx);
3061 	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
3062 	txq->wqe_ci += (ds + 3) / 4;
3063 	loc->wqe_free -= (ds + 3) / 4;
3064 	/* Request CQE generation if limits are reached. */
3065 	mlx5_tx_request_completion(txq, loc, olx);
3066 	return MLX5_TXCMP_CODE_MULTI;
3067 }
3068 
3069 /**
3070  * Tx one packet function for multi-segment SEND. Supports all
3071  * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
3072  * sends one packet per WQE, without any data inlining in
3073  * Ethernet Segment.
3074  *
3075  * This routine is responsible for storing processed mbuf
3076  * into elts ring buffer and update elts_head.
3077  *
3078  * @param txq
3079  *   Pointer to TX queue structure.
3080  * @param loc
3081  *   Pointer to burst routine local context.
3082  * @param olx
3083  *   Configured Tx offloads mask. It is fully defined at
3084  *   compile time and may be used for optimization.
3085  *
3086  * @return
3087  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3088  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
3089  * Local context variables partially updated.
3090  */
3091 static __rte_always_inline enum mlx5_txcmp_code
3092 mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq,
3093 			  struct mlx5_txq_local *restrict loc,
3094 			  unsigned int olx)
3095 {
3096 	struct mlx5_wqe_dseg *restrict dseg;
3097 	struct mlx5_wqe *restrict wqe;
3098 	unsigned int ds, nseg;
3099 
3100 	assert(NB_SEGS(loc->mbuf) > 1);
3101 	/*
3102 	 * No inline at all, it means the CPU cycles saving
3103 	 * is prioritized at configuration, we should not
3104 	 * copy any packet data to WQE.
3105 	 */
3106 	nseg = NB_SEGS(loc->mbuf);
3107 	ds = 2 + nseg;
3108 	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
3109 		return MLX5_TXCMP_CODE_EXIT;
3110 	/* Check for maximal WQE size. */
3111 	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
3112 		return MLX5_TXCMP_CODE_ERROR;
3113 	/*
3114 	 * Some Tx offloads may cause an error if
3115 	 * packet is not long enough, check against
3116 	 * assumed minimal length.
3117 	 */
3118 	if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE)
3119 		return MLX5_TXCMP_CODE_ERROR;
3120 #ifdef MLX5_PMD_SOFT_COUNTERS
3121 	/* Update sent data bytes counter. */
3122 	txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf);
3123 	if (MLX5_TXOFF_CONFIG(VLAN) &&
3124 	    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
3125 		txq->stats.obytes += sizeof(struct rte_vlan_hdr);
3126 #endif
3127 	/*
3128 	 * SEND WQE, one WQEBB:
3129 	 * - Control Segment, SEND opcode
3130 	 * - Ethernet Segment, optional VLAN, no inline
3131 	 * - Data Segments, pointer only type
3132 	 */
3133 	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3134 	loc->wqe_last = wqe;
3135 	mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx);
3136 	mlx5_tx_eseg_none(txq, loc, wqe, olx);
3137 	dseg = &wqe->dseg[0];
3138 	do {
3139 		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
3140 			struct rte_mbuf *mbuf;
3141 
3142 			/*
3143 			 * Zero length segment found, have to
3144 			 * correct total size of WQE in segments.
3145 			 * It is supposed to be rare occasion, so
3146 			 * in normal case (no zero length segments)
3147 			 * we avoid extra writing to the Control
3148 			 * Segment.
3149 			 */
3150 			--ds;
3151 			wqe->cseg.sq_ds -= RTE_BE32(1);
3152 			mbuf = loc->mbuf;
3153 			loc->mbuf = mbuf->next;
3154 			rte_pktmbuf_free_seg(mbuf);
3155 			if (--nseg == 0)
3156 				break;
3157 		} else {
3158 			mlx5_tx_dseg_ptr
3159 				(txq, loc, dseg,
3160 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
3161 				 rte_pktmbuf_data_len(loc->mbuf), olx);
3162 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
3163 			--loc->elts_free;
3164 			if (--nseg == 0)
3165 				break;
3166 			++dseg;
3167 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
3168 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
3169 			loc->mbuf = loc->mbuf->next;
3170 		}
3171 	} while (true);
3172 	txq->wqe_ci += (ds + 3) / 4;
3173 	loc->wqe_free -= (ds + 3) / 4;
3174 	/* Request CQE generation if limits are reached. */
3175 	mlx5_tx_request_completion(txq, loc, olx);
3176 	return MLX5_TXCMP_CODE_MULTI;
3177 }
3178 
3179 /**
3180  * Tx one packet function for multi-segment SEND. Supports all
3181  * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
3182  * sends one packet per WQE, with data inlining in
3183  * Ethernet Segment and minimal Data Segments.
3184  *
3185  * This routine is responsible for storing processed mbuf
3186  * into elts ring buffer and update elts_head.
3187  *
3188  * @param txq
3189  *   Pointer to TX queue structure.
3190  * @param loc
3191  *   Pointer to burst routine local context.
3192  * @param olx
3193  *   Configured Tx offloads mask. It is fully defined at
3194  *   compile time and may be used for optimization.
3195  *
3196  * @return
3197  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3198  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
3199  * Local context variables partially updated.
3200  */
3201 static __rte_always_inline enum mlx5_txcmp_code
3202 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
3203 			    struct mlx5_txq_local *restrict loc,
3204 			    unsigned int olx)
3205 {
3206 	struct mlx5_wqe *restrict wqe;
3207 	unsigned int ds, inlen, dlen, vlan = 0;
3208 
3209 	assert(MLX5_TXOFF_CONFIG(INLINE));
3210 	assert(NB_SEGS(loc->mbuf) > 1);
3211 	/*
3212 	 * First calculate data length to be inlined
3213 	 * to estimate the required space for WQE.
3214 	 */
3215 	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
3216 	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
3217 		vlan = sizeof(struct rte_vlan_hdr);
3218 	inlen = dlen + vlan;
3219 	/* Check against minimal length. */
3220 	if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
3221 		return MLX5_TXCMP_CODE_ERROR;
3222 	assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
3223 	if (inlen > txq->inlen_send) {
3224 		struct rte_mbuf *mbuf;
3225 		unsigned int nxlen;
3226 		uintptr_t start;
3227 
3228 		/*
3229 		 * Packet length exceeds the allowed inline
3230 		 * data length, check whether the minimal
3231 		 * inlining is required.
3232 		 */
3233 		if (txq->inlen_mode) {
3234 			assert(txq->inlen_mode >= MLX5_ESEG_MIN_INLINE_SIZE);
3235 			assert(txq->inlen_mode <= txq->inlen_send);
3236 			inlen = txq->inlen_mode;
3237 		} else {
3238 			if (!vlan || txq->vlan_en) {
3239 				/*
3240 				 * VLAN insertion will be done inside by HW.
3241 				 * It is not utmost effective - VLAN flag is
3242 				 * checked twice, but we should proceed the
3243 				 * inlining length correctly and take into
3244 				 * account the VLAN header being inserted.
3245 				 */
3246 				return mlx5_tx_packet_multi_send
3247 							(txq, loc, olx);
3248 			}
3249 			inlen = MLX5_ESEG_MIN_INLINE_SIZE;
3250 		}
3251 		/*
3252 		 * Now we know the minimal amount of data is requested
3253 		 * to inline. Check whether we should inline the buffers
3254 		 * from the chain beginning to eliminate some mbufs.
3255 		 */
3256 		mbuf = loc->mbuf;
3257 		nxlen = rte_pktmbuf_data_len(mbuf);
3258 		if (unlikely(nxlen <= txq->inlen_send)) {
3259 			/* We can inline first mbuf at least. */
3260 			if (nxlen < inlen) {
3261 				unsigned int smlen;
3262 
3263 				/* Scan mbufs till inlen filled. */
3264 				do {
3265 					smlen = nxlen;
3266 					mbuf = NEXT(mbuf);
3267 					assert(mbuf);
3268 					nxlen = rte_pktmbuf_data_len(mbuf);
3269 					nxlen += smlen;
3270 				} while (unlikely(nxlen < inlen));
3271 				if (unlikely(nxlen > txq->inlen_send)) {
3272 					/* We cannot inline entire mbuf. */
3273 					smlen = inlen - smlen;
3274 					start = rte_pktmbuf_mtod_offset
3275 						    (mbuf, uintptr_t, smlen);
3276 					goto do_align;
3277 				}
3278 			}
3279 			do {
3280 				inlen = nxlen;
3281 				mbuf = NEXT(mbuf);
3282 				/* There should be not end of packet. */
3283 				assert(mbuf);
3284 				nxlen = inlen + rte_pktmbuf_data_len(mbuf);
3285 			} while (unlikely(nxlen < txq->inlen_send));
3286 		}
3287 		start = rte_pktmbuf_mtod(mbuf, uintptr_t);
3288 		/*
3289 		 * Check whether we can do inline to align start
3290 		 * address of data buffer to cacheline.
3291 		 */
3292 do_align:
3293 		start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1);
3294 		if (unlikely(start)) {
3295 			start += inlen;
3296 			if (start <= txq->inlen_send)
3297 				inlen = start;
3298 		}
3299 	}
3300 	/*
3301 	 * Check whether there are enough free WQEBBs:
3302 	 * - Control Segment
3303 	 * - Ethernet Segment
3304 	 * - First Segment of inlined Ethernet data
3305 	 * - ... data continued ...
3306 	 * - Data Segments of pointer/min inline type
3307 	 *
3308 	 * Estimate the number of Data Segments conservatively,
3309 	 * supposing no any mbufs is being freed during inlining.
3310 	 */
3311 	assert(inlen <= txq->inlen_send);
3312 	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
3313 				       MLX5_ESEG_MIN_INLINE_SIZE +
3314 				       MLX5_WSEG_SIZE +
3315 				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3316 	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
3317 		return MLX5_TXCMP_CODE_EXIT;
3318 	/* Check for maximal WQE size. */
3319 	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
3320 		return MLX5_TXCMP_CODE_ERROR;
3321 #ifdef MLX5_PMD_SOFT_COUNTERS
3322 	/* Update sent data bytes/packets counters. */
3323 	txq->stats.obytes += dlen + vlan;
3324 #endif
3325 	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3326 	loc->wqe_last = wqe;
3327 	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx);
3328 	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx);
3329 	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
3330 	txq->wqe_ci += (ds + 3) / 4;
3331 	loc->wqe_free -= (ds + 3) / 4;
3332 	/* Request CQE generation if limits are reached. */
3333 	mlx5_tx_request_completion(txq, loc, olx);
3334 	return MLX5_TXCMP_CODE_MULTI;
3335 }
3336 
3337 /**
3338  * Tx burst function for multi-segment packets. Supports all
3339  * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs,
3340  * sends one packet per WQE. Function stops sending if it
3341  * encounters the single-segment packet.
3342  *
3343  * This routine is responsible for storing processed mbuf
3344  * into elts ring buffer and update elts_head.
3345  *
3346  * @param txq
3347  *   Pointer to TX queue structure.
3348  * @param[in] pkts
3349  *   Packets to transmit.
3350  * @param pkts_n
3351  *   Number of packets in array.
3352  * @param loc
3353  *   Pointer to burst routine local context.
3354  * @param olx
3355  *   Configured Tx offloads mask. It is fully defined at
3356  *   compile time and may be used for optimization.
3357  *
3358  * @return
3359  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3360  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
3361  *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
3362  *   MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered.
3363  * Local context variables updated.
3364  */
3365 static __rte_always_inline enum mlx5_txcmp_code
3366 mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
3367 		   struct rte_mbuf **restrict pkts,
3368 		   unsigned int pkts_n,
3369 		   struct mlx5_txq_local *restrict loc,
3370 		   unsigned int olx)
3371 {
3372 	assert(loc->elts_free && loc->wqe_free);
3373 	assert(pkts_n > loc->pkts_sent);
3374 	pkts += loc->pkts_sent + 1;
3375 	pkts_n -= loc->pkts_sent;
3376 	for (;;) {
3377 		enum mlx5_txcmp_code ret;
3378 
3379 		assert(NB_SEGS(loc->mbuf) > 1);
3380 		/*
3381 		 * Estimate the number of free elts quickly but
3382 		 * conservatively. Some segment may be fully inlined
3383 		 * and freed, ignore this here - precise estimation
3384 		 * is costly.
3385 		 */
3386 		if (loc->elts_free < NB_SEGS(loc->mbuf))
3387 			return MLX5_TXCMP_CODE_EXIT;
3388 		if (MLX5_TXOFF_CONFIG(TSO) &&
3389 		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) {
3390 			/* Proceed with multi-segment TSO. */
3391 			ret = mlx5_tx_packet_multi_tso(txq, loc, olx);
3392 		} else if (MLX5_TXOFF_CONFIG(INLINE)) {
3393 			/* Proceed with multi-segment SEND with inlining. */
3394 			ret = mlx5_tx_packet_multi_inline(txq, loc, olx);
3395 		} else {
3396 			/* Proceed with multi-segment SEND w/o inlining. */
3397 			ret = mlx5_tx_packet_multi_send(txq, loc, olx);
3398 		}
3399 		if (ret == MLX5_TXCMP_CODE_EXIT)
3400 			return MLX5_TXCMP_CODE_EXIT;
3401 		if (ret == MLX5_TXCMP_CODE_ERROR)
3402 			return MLX5_TXCMP_CODE_ERROR;
3403 		/* WQE is built, go to the next packet. */
3404 		++loc->pkts_sent;
3405 		--pkts_n;
3406 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
3407 			return MLX5_TXCMP_CODE_EXIT;
3408 		loc->mbuf = *pkts++;
3409 		if (pkts_n > 1)
3410 			rte_prefetch0(*pkts);
3411 		if (likely(NB_SEGS(loc->mbuf) > 1))
3412 			continue;
3413 		/* Here ends the series of multi-segment packets. */
3414 		if (MLX5_TXOFF_CONFIG(TSO) &&
3415 		    unlikely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)))
3416 			return MLX5_TXCMP_CODE_TSO;
3417 		return MLX5_TXCMP_CODE_SINGLE;
3418 	}
3419 	assert(false);
3420 }
3421 
3422 /**
3423  * Tx burst function for single-segment packets with TSO.
3424  * Supports all types of Tx offloads, except multi-packets.
3425  * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE.
3426  * Function stops sending if it encounters the multi-segment
3427  * packet or packet without TSO requested.
3428  *
3429  * The routine is responsible for storing processed mbuf
3430  * into elts ring buffer and update elts_head if inline
3431  * offloads is requested due to possible early freeing
3432  * of the inlined mbufs (can not store pkts array in elts
3433  * as a batch).
3434  *
3435  * @param txq
3436  *   Pointer to TX queue structure.
3437  * @param[in] pkts
3438  *   Packets to transmit.
3439  * @param pkts_n
3440  *   Number of packets in array.
3441  * @param loc
3442  *   Pointer to burst routine local context.
3443  * @param olx
3444  *   Configured Tx offloads mask. It is fully defined at
3445  *   compile time and may be used for optimization.
3446  *
3447  * @return
3448  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3449  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
3450  *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
3451  *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
3452  * Local context variables updated.
3453  */
3454 static __rte_always_inline enum mlx5_txcmp_code
3455 mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq,
3456 		  struct rte_mbuf **restrict pkts,
3457 		  unsigned int pkts_n,
3458 		  struct mlx5_txq_local *restrict loc,
3459 		  unsigned int olx)
3460 {
3461 	assert(loc->elts_free && loc->wqe_free);
3462 	assert(pkts_n > loc->pkts_sent);
3463 	pkts += loc->pkts_sent + 1;
3464 	pkts_n -= loc->pkts_sent;
3465 	for (;;) {
3466 		struct mlx5_wqe_dseg *restrict dseg;
3467 		struct mlx5_wqe *restrict wqe;
3468 		unsigned int ds, dlen, hlen, ntcp, vlan = 0;
3469 		uint8_t *dptr;
3470 
3471 		assert(NB_SEGS(loc->mbuf) == 1);
3472 		dlen = rte_pktmbuf_data_len(loc->mbuf);
3473 		if (MLX5_TXOFF_CONFIG(VLAN) &&
3474 		    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
3475 			vlan = sizeof(struct rte_vlan_hdr);
3476 		}
3477 		/*
3478 		 * First calculate the WQE size to check
3479 		 * whether we have enough space in ring buffer.
3480 		 */
3481 		hlen = loc->mbuf->l2_len + vlan +
3482 		       loc->mbuf->l3_len + loc->mbuf->l4_len;
3483 		if (unlikely((!hlen || !loc->mbuf->tso_segsz)))
3484 			return MLX5_TXCMP_CODE_ERROR;
3485 		if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK)
3486 			hlen += loc->mbuf->outer_l2_len +
3487 				loc->mbuf->outer_l3_len;
3488 		/* Segment must contain all TSO headers. */
3489 		if (unlikely(hlen > MLX5_MAX_TSO_HEADER ||
3490 			     hlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
3491 			     hlen > (dlen + vlan)))
3492 			return MLX5_TXCMP_CODE_ERROR;
3493 		/*
3494 		 * Check whether there are enough free WQEBBs:
3495 		 * - Control Segment
3496 		 * - Ethernet Segment
3497 		 * - First Segment of inlined Ethernet data
3498 		 * - ... data continued ...
3499 		 * - Finishing Data Segment of pointer type
3500 		 */
3501 		ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE +
3502 			  MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3503 		if (loc->wqe_free < ((ds + 3) / 4))
3504 			return MLX5_TXCMP_CODE_EXIT;
3505 #ifdef MLX5_PMD_SOFT_COUNTERS
3506 		/* Update sent data bytes/packets counters. */
3507 		ntcp = (dlen + vlan - hlen +
3508 			loc->mbuf->tso_segsz - 1) /
3509 			loc->mbuf->tso_segsz;
3510 		/*
3511 		 * One will be added for mbuf itself at the end
3512 		 * of the mlx5_tx_burst from loc->pkts_sent field.
3513 		 */
3514 		--ntcp;
3515 		txq->stats.opackets += ntcp;
3516 		txq->stats.obytes += dlen + vlan + ntcp * hlen;
3517 #endif
3518 		/*
3519 		 * Build the TSO WQE:
3520 		 * - Control Segment
3521 		 * - Ethernet Segment with hlen bytes inlined
3522 		 * - Data Segment of pointer type
3523 		 */
3524 		wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3525 		loc->wqe_last = wqe;
3526 		mlx5_tx_cseg_init(txq, loc, wqe, ds,
3527 				  MLX5_OPCODE_TSO, olx);
3528 		dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx);
3529 		dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan;
3530 		dlen -= hlen - vlan;
3531 		mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
3532 		/*
3533 		 * WQE is built, update the loop parameters
3534 		 * and go to the next packet.
3535 		 */
3536 		txq->wqe_ci += (ds + 3) / 4;
3537 		loc->wqe_free -= (ds + 3) / 4;
3538 		if (MLX5_TXOFF_CONFIG(INLINE))
3539 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
3540 		--loc->elts_free;
3541 		++loc->pkts_sent;
3542 		--pkts_n;
3543 		/* Request CQE generation if limits are reached. */
3544 		mlx5_tx_request_completion(txq, loc, olx);
3545 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
3546 			return MLX5_TXCMP_CODE_EXIT;
3547 		loc->mbuf = *pkts++;
3548 		if (pkts_n > 1)
3549 			rte_prefetch0(*pkts);
3550 		if (MLX5_TXOFF_CONFIG(MULTI) &&
3551 		    unlikely(NB_SEGS(loc->mbuf) > 1))
3552 			return MLX5_TXCMP_CODE_MULTI;
3553 		if (unlikely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)))
3554 			return MLX5_TXCMP_CODE_SINGLE;
3555 		/* Continue with the next TSO packet. */
3556 	}
3557 	assert(false);
3558 }
3559 
3560 /**
3561  * Analyze the packet and select the best method to send.
3562  *
3563  * @param txq
3564  *   Pointer to TX queue structure.
3565  * @param loc
3566  *   Pointer to burst routine local context.
3567  * @param olx
3568  *   Configured Tx offloads mask. It is fully defined at
3569  *   compile time and may be used for optimization.
3570  * @param newp
3571  *   The predefined flag whether do complete check for
3572  *   multi-segment packets and TSO.
3573  *
3574  * @return
3575  *  MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
3576  *  MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO.
3577  *  MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND.
3578  *  MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
3579  */
3580 static __rte_always_inline enum mlx5_txcmp_code
3581 mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,
3582 		     struct mlx5_txq_local *restrict loc,
3583 		     unsigned int olx,
3584 		     bool newp)
3585 {
3586 	/* Check for multi-segment packet. */
3587 	if (newp &&
3588 	    MLX5_TXOFF_CONFIG(MULTI) &&
3589 	    unlikely(NB_SEGS(loc->mbuf) > 1))
3590 		return MLX5_TXCMP_CODE_MULTI;
3591 	/* Check for TSO packet. */
3592 	if (newp &&
3593 	    MLX5_TXOFF_CONFIG(TSO) &&
3594 	    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
3595 		return MLX5_TXCMP_CODE_TSO;
3596 	/* Check if eMPW is enabled at all. */
3597 	if (!MLX5_TXOFF_CONFIG(EMPW))
3598 		return MLX5_TXCMP_CODE_SINGLE;
3599 	/* Check if eMPW can be engaged. */
3600 	if (MLX5_TXOFF_CONFIG(VLAN) &&
3601 	    unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) &&
3602 		(!MLX5_TXOFF_CONFIG(INLINE) ||
3603 		 unlikely((rte_pktmbuf_data_len(loc->mbuf) +
3604 			   sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) {
3605 		/*
3606 		 * eMPW does not support VLAN insertion offload,
3607 		 * we have to inline the entire packet but
3608 		 * packet is too long for inlining.
3609 		 */
3610 		return MLX5_TXCMP_CODE_SINGLE;
3611 	}
3612 	return MLX5_TXCMP_CODE_EMPW;
3613 }
3614 
3615 /**
3616  * Check the next packet attributes to match with the eMPW batch ones.
3617  *
3618  * @param txq
3619  *   Pointer to TX queue structure.
3620  * @param es
3621  *   Pointer to Ethernet Segment of eMPW batch.
3622  * @param loc
3623  *   Pointer to burst routine local context.
3624  * @param olx
3625  *   Configured Tx offloads mask. It is fully defined at
3626  *   compile time and may be used for optimization.
3627  *
3628  * @return
3629  *  true - packet match with eMPW batch attributes.
3630  *  false - no match, eMPW should be restarted.
3631  */
3632 static __rte_always_inline bool
3633 mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
3634 		   struct mlx5_wqe_eseg *restrict es,
3635 		   struct mlx5_txq_local *restrict loc,
3636 		   unsigned int olx)
3637 {
3638 	uint8_t swp_flags = 0;
3639 
3640 	/* Compare the checksum flags, if any. */
3641 	if (MLX5_TXOFF_CONFIG(CSUM) &&
3642 	    txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
3643 		return false;
3644 	/* Compare the Software Parser offsets and flags. */
3645 	if (MLX5_TXOFF_CONFIG(SWP) &&
3646 	    (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
3647 	     es->swp_flags != swp_flags))
3648 		return false;
3649 	/* Fill metadata field if needed. */
3650 	if (MLX5_TXOFF_CONFIG(METADATA) &&
3651 		es->metadata != (loc->mbuf->ol_flags & PKT_TX_METADATA ?
3652 				 loc->mbuf->tx_metadata : 0))
3653 		return false;
3654 	/* There must be no VLAN packets in eMPW loop. */
3655 	if (MLX5_TXOFF_CONFIG(VLAN))
3656 		assert(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
3657 	return true;
3658 }
3659 
3660 /*
3661  * Update send loop variables and WQE for eMPW loop
3662  * without data inlining. Number of Data Segments is
3663  * equal to the number of sent packets.
3664  *
3665  * @param txq
3666  *   Pointer to TX queue structure.
3667  * @param loc
3668  *   Pointer to burst routine local context.
3669  * @param ds
3670  *   Number of packets/Data Segments/Packets.
3671  * @param slen
3672  *   Accumulated statistics, bytes sent
3673  * @param olx
3674  *   Configured Tx offloads mask. It is fully defined at
3675  *   compile time and may be used for optimization.
3676  *
3677  * @return
3678  *  true - packet match with eMPW batch attributes.
3679  *  false - no match, eMPW should be restarted.
3680  */
3681 static __rte_always_inline void
3682 mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
3683 		   struct mlx5_txq_local *restrict loc,
3684 		   unsigned int ds,
3685 		   unsigned int slen,
3686 		   unsigned int olx)
3687 {
3688 	assert(!MLX5_TXOFF_CONFIG(INLINE));
3689 #ifdef MLX5_PMD_SOFT_COUNTERS
3690 	/* Update sent data bytes counter. */
3691 	 txq->stats.obytes += slen;
3692 #else
3693 	(void)slen;
3694 #endif
3695 	loc->elts_free -= ds;
3696 	loc->pkts_sent += ds;
3697 	ds += 2;
3698 	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
3699 	txq->wqe_ci += (ds + 3) / 4;
3700 	loc->wqe_free -= (ds + 3) / 4;
3701 	/* Request CQE generation if limits are reached. */
3702 	mlx5_tx_request_completion(txq, loc, olx);
3703 }
3704 
3705 /*
3706  * Update send loop variables and WQE for eMPW loop
3707  * with data inlining. Gets the size of pushed descriptors
3708  * and data to the WQE.
3709  *
3710  * @param txq
3711  *   Pointer to TX queue structure.
3712  * @param loc
3713  *   Pointer to burst routine local context.
3714  * @param len
3715  *   Total size of descriptor/data in bytes.
3716  * @param slen
3717  *   Accumulated statistics, data bytes sent.
3718  * @param olx
3719  *   Configured Tx offloads mask. It is fully defined at
3720  *   compile time and may be used for optimization.
3721  *
3722  * @return
3723  *  true - packet match with eMPW batch attributes.
3724  *  false - no match, eMPW should be restarted.
3725  */
3726 static __rte_always_inline void
3727 mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
3728 		   struct mlx5_txq_local *restrict loc,
3729 		   unsigned int len,
3730 		   unsigned int slen,
3731 		   unsigned int olx __rte_unused)
3732 {
3733 	assert(MLX5_TXOFF_CONFIG(INLINE));
3734 	assert((len % MLX5_WSEG_SIZE) == 0);
3735 #ifdef MLX5_PMD_SOFT_COUNTERS
3736 	/* Update sent data bytes counter. */
3737 	 txq->stats.obytes += slen;
3738 #else
3739 	(void)slen;
3740 #endif
3741 	len = len / MLX5_WSEG_SIZE + 2;
3742 	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
3743 	txq->wqe_ci += (len + 3) / 4;
3744 	loc->wqe_free -= (len + 3) / 4;
3745 	/* Request CQE generation if limits are reached. */
3746 	mlx5_tx_request_completion(txq, loc, olx);
3747 }
3748 
3749 /**
3750  * The set of Tx burst functions for single-segment packets
3751  * without TSO and with Multi-Packet Writing feature support.
3752  * Supports all types of Tx offloads, except multi-packets
3753  * and TSO.
3754  *
3755  * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
3756  * as many packet per WQE as it can. If eMPW is not configured
3757  * or packet can not be sent with eMPW (VLAN insertion) the
3758  * ordinary SEND opcode is used and only one packet placed
3759  * in WQE.
3760  *
3761  * Functions stop sending if it encounters the multi-segment
3762  * packet or packet with TSO requested.
3763  *
3764  * The routines are responsible for storing processed mbuf
3765  * into elts ring buffer and update elts_head if inlining
3766  * offload is requested. Otherwise the copying mbufs to elts
3767  * can be postponed and completed at the end of burst routine.
3768  *
3769  * @param txq
3770  *   Pointer to TX queue structure.
3771  * @param[in] pkts
3772  *   Packets to transmit.
3773  * @param pkts_n
3774  *   Number of packets in array.
3775  * @param loc
3776  *   Pointer to burst routine local context.
3777  * @param olx
3778  *   Configured Tx offloads mask. It is fully defined at
3779  *   compile time and may be used for optimization.
3780  *
3781  * @return
3782  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3783  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
3784  *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
3785  *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
3786  *   MLX5_TXCMP_CODE_SINGLE - used inside functions set.
3787  *   MLX5_TXCMP_CODE_EMPW - used inside functions set.
3788  *
3789  * Local context variables updated.
3790  *
3791  *
3792  * The routine sends packets with MLX5_OPCODE_EMPW
3793  * without inlining, this is dedicated optimized branch.
3794  * No VLAN insertion is supported.
3795  */
3796 static __rte_always_inline enum mlx5_txcmp_code
3797 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq,
3798 			  struct rte_mbuf **restrict pkts,
3799 			  unsigned int pkts_n,
3800 			  struct mlx5_txq_local *restrict loc,
3801 			  unsigned int olx)
3802 {
3803 	/*
3804 	 * Subroutine is the part of mlx5_tx_burst_single()
3805 	 * and sends single-segment packet with eMPW opcode
3806 	 * without data inlining.
3807 	 */
3808 	assert(!MLX5_TXOFF_CONFIG(INLINE));
3809 	assert(MLX5_TXOFF_CONFIG(EMPW));
3810 	assert(loc->elts_free && loc->wqe_free);
3811 	assert(pkts_n > loc->pkts_sent);
3812 	static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
3813 	pkts += loc->pkts_sent + 1;
3814 	pkts_n -= loc->pkts_sent;
3815 	for (;;) {
3816 		struct mlx5_wqe_dseg *restrict dseg;
3817 		struct mlx5_wqe_eseg *restrict eseg;
3818 		enum mlx5_txcmp_code ret;
3819 		unsigned int part, loop;
3820 		unsigned int slen = 0;
3821 
3822 next_empw:
3823 		part = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS);
3824 		if (unlikely(loc->elts_free < part)) {
3825 			/* We have no enough elts to save all mbufs. */
3826 			if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
3827 				return MLX5_TXCMP_CODE_EXIT;
3828 			/* But we still able to send at least minimal eMPW. */
3829 			part = loc->elts_free;
3830 		}
3831 		/* Check whether we have enough WQEs */
3832 		if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) {
3833 			if (unlikely(loc->wqe_free <
3834 				((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
3835 				return MLX5_TXCMP_CODE_EXIT;
3836 			part = (loc->wqe_free * 4) - 2;
3837 		}
3838 		if (likely(part > 1))
3839 			rte_prefetch0(*pkts);
3840 		loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3841 		/*
3842 		 * Build eMPW title WQEBB:
3843 		 * - Control Segment, eMPW opcode
3844 		 * - Ethernet Segment, no inline
3845 		 */
3846 		mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2,
3847 				  MLX5_OPCODE_ENHANCED_MPSW, olx);
3848 		mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
3849 				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
3850 		eseg = &loc->wqe_last->eseg;
3851 		dseg = &loc->wqe_last->dseg[0];
3852 		loop = part;
3853 		for (;;) {
3854 			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
3855 #ifdef MLX5_PMD_SOFT_COUNTERS
3856 			/* Update sent data bytes counter. */
3857 			slen += dlen;
3858 #endif
3859 			mlx5_tx_dseg_ptr
3860 				(txq, loc, dseg,
3861 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
3862 				 dlen, olx);
3863 			if (unlikely(--loop == 0))
3864 				break;
3865 			loc->mbuf = *pkts++;
3866 			if (likely(loop > 1))
3867 				rte_prefetch0(*pkts);
3868 			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
3869 			/*
3870 			 * Unroll the completion code to avoid
3871 			 * returning variable value - it results in
3872 			 * unoptimized sequent checking in caller.
3873 			 */
3874 			if (ret == MLX5_TXCMP_CODE_MULTI) {
3875 				part -= loop;
3876 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
3877 				if (unlikely(!loc->elts_free ||
3878 					     !loc->wqe_free))
3879 					return MLX5_TXCMP_CODE_EXIT;
3880 				return MLX5_TXCMP_CODE_MULTI;
3881 			}
3882 			if (ret == MLX5_TXCMP_CODE_TSO) {
3883 				part -= loop;
3884 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
3885 				if (unlikely(!loc->elts_free ||
3886 					     !loc->wqe_free))
3887 					return MLX5_TXCMP_CODE_EXIT;
3888 				return MLX5_TXCMP_CODE_TSO;
3889 			}
3890 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
3891 				part -= loop;
3892 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
3893 				if (unlikely(!loc->elts_free ||
3894 					     !loc->wqe_free))
3895 					return MLX5_TXCMP_CODE_EXIT;
3896 				return MLX5_TXCMP_CODE_SINGLE;
3897 			}
3898 			if (ret != MLX5_TXCMP_CODE_EMPW) {
3899 				assert(false);
3900 				part -= loop;
3901 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
3902 				return MLX5_TXCMP_CODE_ERROR;
3903 			}
3904 			/*
3905 			 * Check whether packet parameters coincide
3906 			 * within assumed eMPW batch:
3907 			 * - check sum settings
3908 			 * - metadata value
3909 			 * - software parser settings
3910 			 */
3911 			if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) {
3912 				assert(loop);
3913 				part -= loop;
3914 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
3915 				if (unlikely(!loc->elts_free ||
3916 					     !loc->wqe_free))
3917 					return MLX5_TXCMP_CODE_EXIT;
3918 				pkts_n -= part;
3919 				goto next_empw;
3920 			}
3921 			/* Packet attributes match, continue the same eMPW. */
3922 			++dseg;
3923 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
3924 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
3925 		}
3926 		/* eMPW is built successfully, update loop parameters. */
3927 		assert(!loop);
3928 		assert(pkts_n >= part);
3929 #ifdef MLX5_PMD_SOFT_COUNTERS
3930 		/* Update sent data bytes counter. */
3931 		txq->stats.obytes += slen;
3932 #endif
3933 		loc->elts_free -= part;
3934 		loc->pkts_sent += part;
3935 		txq->wqe_ci += (2 + part + 3) / 4;
3936 		loc->wqe_free -= (2 + part + 3) / 4;
3937 		pkts_n -= part;
3938 		/* Request CQE generation if limits are reached. */
3939 		mlx5_tx_request_completion(txq, loc, olx);
3940 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
3941 			return MLX5_TXCMP_CODE_EXIT;
3942 		loc->mbuf = *pkts++;
3943 		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
3944 		if (unlikely(ret != MLX5_TXCMP_CODE_EMPW))
3945 			return ret;
3946 		/* Continue sending eMPW batches. */
3947 	}
3948 	assert(false);
3949 }
3950 
3951 /**
3952  * The routine sends packets with MLX5_OPCODE_EMPW
3953  * with inlining, optionally supports VLAN insertion.
3954  */
3955 static __rte_always_inline enum mlx5_txcmp_code
3956 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
3957 			  struct rte_mbuf **restrict pkts,
3958 			  unsigned int pkts_n,
3959 			  struct mlx5_txq_local *restrict loc,
3960 			  unsigned int olx)
3961 {
3962 	/*
3963 	 * Subroutine is the part of mlx5_tx_burst_single()
3964 	 * and sends single-segment packet with eMPW opcode
3965 	 * with data inlining.
3966 	 */
3967 	assert(MLX5_TXOFF_CONFIG(INLINE));
3968 	assert(MLX5_TXOFF_CONFIG(EMPW));
3969 	assert(loc->elts_free && loc->wqe_free);
3970 	assert(pkts_n > loc->pkts_sent);
3971 	static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
3972 	pkts += loc->pkts_sent + 1;
3973 	pkts_n -= loc->pkts_sent;
3974 	for (;;) {
3975 		struct mlx5_wqe_dseg *restrict dseg;
3976 		struct mlx5_wqe_eseg *restrict eseg;
3977 		enum mlx5_txcmp_code ret;
3978 		unsigned int room, part, nlim;
3979 		unsigned int slen = 0;
3980 
3981 		/*
3982 		 * Limits the amount of packets in one WQE
3983 		 * to improve CQE latency generation.
3984 		 */
3985 		nlim = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS);
3986 		/* Check whether we have minimal amount WQEs */
3987 		if (unlikely(loc->wqe_free <
3988 			    ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
3989 			return MLX5_TXCMP_CODE_EXIT;
3990 		if (likely(pkts_n > 1))
3991 			rte_prefetch0(*pkts);
3992 		loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3993 		/*
3994 		 * Build eMPW title WQEBB:
3995 		 * - Control Segment, eMPW opcode, zero DS
3996 		 * - Ethernet Segment, no inline
3997 		 */
3998 		mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0,
3999 				  MLX5_OPCODE_ENHANCED_MPSW, olx);
4000 		mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
4001 				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
4002 		eseg = &loc->wqe_last->eseg;
4003 		dseg = &loc->wqe_last->dseg[0];
4004 		room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE,
4005 			       loc->wqe_free) * MLX5_WQE_SIZE -
4006 					MLX5_WQE_CSEG_SIZE -
4007 					MLX5_WQE_ESEG_SIZE;
4008 		/* Build WQE till we have space, packets and resources. */
4009 		part = room;
4010 		for (;;) {
4011 			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
4012 			uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
4013 			unsigned int tlen;
4014 
4015 			assert(room >= MLX5_WQE_DSEG_SIZE);
4016 			assert((room % MLX5_WQE_DSEG_SIZE) == 0);
4017 			assert((uintptr_t)dseg < (uintptr_t)txq->wqes_end);
4018 			/*
4019 			 * Some Tx offloads may cause an error if
4020 			 * packet is not long enough, check against
4021 			 * assumed minimal length.
4022 			 */
4023 			if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
4024 				part -= room;
4025 				if (unlikely(!part))
4026 					return MLX5_TXCMP_CODE_ERROR;
4027 				/*
4028 				 * We have some successfully built
4029 				 * packet Data Segments to send.
4030 				 */
4031 				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
4032 				return MLX5_TXCMP_CODE_ERROR;
4033 			}
4034 			/* Inline or not inline - that's the Question. */
4035 			if (dlen > txq->inlen_empw)
4036 				goto pointer_empw;
4037 			/* Inline entire packet, optional VLAN insertion. */
4038 			tlen = sizeof(dseg->bcount) + dlen;
4039 			if (MLX5_TXOFF_CONFIG(VLAN) &&
4040 			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
4041 				/*
4042 				 * The packet length must be checked in
4043 				 * mlx5_tx_able_to_empw() and packet
4044 				 * fits into inline length guaranteed.
4045 				 */
4046 				assert((dlen + sizeof(struct rte_vlan_hdr)) <=
4047 					txq->inlen_empw);
4048 				tlen += sizeof(struct rte_vlan_hdr);
4049 				if (room < tlen)
4050 					break;
4051 				dseg = mlx5_tx_dseg_vlan(txq, loc, dseg,
4052 							 dptr, dlen, olx);
4053 #ifdef MLX5_PMD_SOFT_COUNTERS
4054 				/* Update sent data bytes counter. */
4055 				slen +=	sizeof(struct rte_vlan_hdr);
4056 #endif
4057 			} else {
4058 				if (room < tlen)
4059 					break;
4060 				dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
4061 							 dptr, dlen, olx);
4062 			}
4063 			tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
4064 			assert(room >= tlen);
4065 			room -= tlen;
4066 			/*
4067 			 * Packet data are completely inlined,
4068 			 * free the packet immediately.
4069 			 */
4070 			rte_pktmbuf_free_seg(loc->mbuf);
4071 			goto next_mbuf;
4072 pointer_empw:
4073 			/*
4074 			 * Not inlinable VLAN packets are
4075 			 * proceeded outside of this routine.
4076 			 */
4077 			assert(room >= MLX5_WQE_DSEG_SIZE);
4078 			if (MLX5_TXOFF_CONFIG(VLAN))
4079 				assert(!(loc->mbuf->ol_flags &
4080 					 PKT_TX_VLAN_PKT));
4081 			mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
4082 			/* We have to store mbuf in elts.*/
4083 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
4084 			room -= MLX5_WQE_DSEG_SIZE;
4085 			/* Ring buffer wraparound is checked at the loop end.*/
4086 			++dseg;
4087 next_mbuf:
4088 #ifdef MLX5_PMD_SOFT_COUNTERS
4089 			/* Update sent data bytes counter. */
4090 			slen += dlen;
4091 #endif
4092 			loc->pkts_sent++;
4093 			loc->elts_free--;
4094 			pkts_n--;
4095 			if (unlikely(!pkts_n || !loc->elts_free)) {
4096 				/*
4097 				 * We have no resources/packets to
4098 				 * continue build descriptors.
4099 				 */
4100 				part -= room;
4101 				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
4102 				return MLX5_TXCMP_CODE_EXIT;
4103 			}
4104 			loc->mbuf = *pkts++;
4105 			if (likely(pkts_n > 1))
4106 				rte_prefetch0(*pkts);
4107 			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
4108 			/*
4109 			 * Unroll the completion code to avoid
4110 			 * returning variable value - it results in
4111 			 * unoptimized sequent checking in caller.
4112 			 */
4113 			if (ret == MLX5_TXCMP_CODE_MULTI) {
4114 				part -= room;
4115 				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
4116 				if (unlikely(!loc->elts_free ||
4117 					     !loc->wqe_free))
4118 					return MLX5_TXCMP_CODE_EXIT;
4119 				return MLX5_TXCMP_CODE_MULTI;
4120 			}
4121 			if (ret == MLX5_TXCMP_CODE_TSO) {
4122 				part -= room;
4123 				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
4124 				if (unlikely(!loc->elts_free ||
4125 					     !loc->wqe_free))
4126 					return MLX5_TXCMP_CODE_EXIT;
4127 				return MLX5_TXCMP_CODE_TSO;
4128 			}
4129 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
4130 				part -= room;
4131 				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
4132 				if (unlikely(!loc->elts_free ||
4133 					     !loc->wqe_free))
4134 					return MLX5_TXCMP_CODE_EXIT;
4135 				return MLX5_TXCMP_CODE_SINGLE;
4136 			}
4137 			if (ret != MLX5_TXCMP_CODE_EMPW) {
4138 				assert(false);
4139 				part -= room;
4140 				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
4141 				return MLX5_TXCMP_CODE_ERROR;
4142 			}
4143 			/* Check if we have minimal room left. */
4144 			nlim--;
4145 			if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE))
4146 				break;
4147 			/*
4148 			 * Check whether packet parameters coincide
4149 			 * within assumed eMPW batch:
4150 			 * - check sum settings
4151 			 * - metadata value
4152 			 * - software parser settings
4153 			 */
4154 			if (!mlx5_tx_match_empw(txq, eseg, loc, olx))
4155 				break;
4156 			/* Packet attributes match, continue the same eMPW. */
4157 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
4158 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
4159 		}
4160 		/*
4161 		 * We get here to close an existing eMPW
4162 		 * session and start the new one.
4163 		 */
4164 		assert(pkts_n);
4165 		part -= room;
4166 		if (unlikely(!part))
4167 			return MLX5_TXCMP_CODE_EXIT;
4168 		mlx5_tx_idone_empw(txq, loc, part, slen, olx);
4169 		if (unlikely(!loc->elts_free ||
4170 			     !loc->wqe_free))
4171 			return MLX5_TXCMP_CODE_EXIT;
4172 		/* Continue the loop with new eMPW session. */
4173 	}
4174 	assert(false);
4175 }
4176 
4177 /**
4178  * The routine sends packets with ordinary MLX5_OPCODE_SEND.
4179  * Data inlining and VLAN insertion are supported.
4180  */
4181 static __rte_always_inline enum mlx5_txcmp_code
4182 mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
4183 			  struct rte_mbuf **restrict pkts,
4184 			  unsigned int pkts_n,
4185 			  struct mlx5_txq_local *restrict loc,
4186 			  unsigned int olx)
4187 {
4188 	/*
4189 	 * Subroutine is the part of mlx5_tx_burst_single()
4190 	 * and sends single-segment packet with SEND opcode.
4191 	 */
4192 	assert(loc->elts_free && loc->wqe_free);
4193 	assert(pkts_n > loc->pkts_sent);
4194 	pkts += loc->pkts_sent + 1;
4195 	pkts_n -= loc->pkts_sent;
4196 	for (;;) {
4197 		struct mlx5_wqe *restrict wqe;
4198 		enum mlx5_txcmp_code ret;
4199 
4200 		assert(NB_SEGS(loc->mbuf) == 1);
4201 		if (MLX5_TXOFF_CONFIG(INLINE)) {
4202 			unsigned int inlen, vlan = 0;
4203 
4204 			inlen = rte_pktmbuf_data_len(loc->mbuf);
4205 			if (MLX5_TXOFF_CONFIG(VLAN) &&
4206 			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
4207 				vlan = sizeof(struct rte_vlan_hdr);
4208 				inlen += vlan;
4209 				static_assert((sizeof(struct rte_vlan_hdr) +
4210 					       sizeof(struct rte_ether_hdr)) ==
4211 					       MLX5_ESEG_MIN_INLINE_SIZE,
4212 					       "invalid min inline data size");
4213 			}
4214 			/*
4215 			 * If inlining is enabled at configuration time
4216 			 * the limit must be not less than minimal size.
4217 			 * Otherwise we would do extra check for data
4218 			 * size to avoid crashes due to length overflow.
4219 			 */
4220 			assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
4221 			if (inlen <= txq->inlen_send) {
4222 				unsigned int seg_n, wqe_n;
4223 
4224 				rte_prefetch0(rte_pktmbuf_mtod
4225 						(loc->mbuf, uint8_t *));
4226 				/* Check against minimal length. */
4227 				if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
4228 					return MLX5_TXCMP_CODE_ERROR;
4229 				/*
4230 				 * Completely inlined packet data WQE:
4231 				 * - Control Segment, SEND opcode
4232 				 * - Ethernet Segment, no VLAN insertion
4233 				 * - Data inlined, VLAN optionally inserted
4234 				 * - Alignment to MLX5_WSEG_SIZE
4235 				 * Have to estimate amount of WQEBBs
4236 				 */
4237 				seg_n = (inlen + 3 * MLX5_WSEG_SIZE -
4238 					 MLX5_ESEG_MIN_INLINE_SIZE +
4239 					 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
4240 				/* Check if there are enough WQEBBs. */
4241 				wqe_n = (seg_n + 3) / 4;
4242 				if (wqe_n > loc->wqe_free)
4243 					return MLX5_TXCMP_CODE_EXIT;
4244 				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4245 				loc->wqe_last = wqe;
4246 				mlx5_tx_cseg_init(txq, loc, wqe, seg_n,
4247 						  MLX5_OPCODE_SEND, olx);
4248 				mlx5_tx_eseg_data(txq, loc, wqe,
4249 						  vlan, inlen, 0, olx);
4250 				txq->wqe_ci += wqe_n;
4251 				loc->wqe_free -= wqe_n;
4252 				/*
4253 				 * Packet data are completely inlined,
4254 				 * free the packet immediately.
4255 				 */
4256 				rte_pktmbuf_free_seg(loc->mbuf);
4257 			} else if (!MLX5_TXOFF_CONFIG(EMPW) &&
4258 				   txq->inlen_mode) {
4259 				/*
4260 				 * If minimal inlining is requested the eMPW
4261 				 * feature should be disabled due to data is
4262 				 * inlined into Ethernet Segment, which can
4263 				 * not contain inlined data for eMPW due to
4264 				 * segment shared for all packets.
4265 				 */
4266 				struct mlx5_wqe_dseg *restrict dseg;
4267 				unsigned int ds;
4268 				uint8_t *dptr;
4269 
4270 				/*
4271 				 * The inline-mode settings require
4272 				 * to inline the specified amount of
4273 				 * data bytes to the Ethernet Segment.
4274 				 * We should check the free space in
4275 				 * WQE ring buffer to inline partially.
4276 				 */
4277 				assert(txq->inlen_send >= txq->inlen_mode);
4278 				assert(inlen > txq->inlen_mode);
4279 				assert(txq->inlen_mode >=
4280 						MLX5_ESEG_MIN_INLINE_SIZE);
4281 				/*
4282 				 * Check whether there are enough free WQEBBs:
4283 				 * - Control Segment
4284 				 * - Ethernet Segment
4285 				 * - First Segment of inlined Ethernet data
4286 				 * - ... data continued ...
4287 				 * - Finishing Data Segment of pointer type
4288 				 */
4289 				ds = (MLX5_WQE_CSEG_SIZE +
4290 				      MLX5_WQE_ESEG_SIZE +
4291 				      MLX5_WQE_DSEG_SIZE +
4292 				      txq->inlen_mode -
4293 				      MLX5_ESEG_MIN_INLINE_SIZE +
4294 				      MLX5_WQE_DSEG_SIZE +
4295 				      MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
4296 				if (loc->wqe_free < ((ds + 3) / 4))
4297 					return MLX5_TXCMP_CODE_EXIT;
4298 				/*
4299 				 * Build the ordinary SEND WQE:
4300 				 * - Control Segment
4301 				 * - Ethernet Segment, inline inlen_mode bytes
4302 				 * - Data Segment of pointer type
4303 				 */
4304 				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4305 				loc->wqe_last = wqe;
4306 				mlx5_tx_cseg_init(txq, loc, wqe, ds,
4307 						  MLX5_OPCODE_SEND, olx);
4308 				dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan,
4309 							 txq->inlen_mode,
4310 							 0, olx);
4311 				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
4312 				       txq->inlen_mode - vlan;
4313 				inlen -= txq->inlen_mode;
4314 				mlx5_tx_dseg_ptr(txq, loc, dseg,
4315 						 dptr, inlen, olx);
4316 				/*
4317 				 * WQE is built, update the loop parameters
4318 				 * and got to the next packet.
4319 				 */
4320 				txq->wqe_ci += (ds + 3) / 4;
4321 				loc->wqe_free -= (ds + 3) / 4;
4322 				/* We have to store mbuf in elts.*/
4323 				assert(MLX5_TXOFF_CONFIG(INLINE));
4324 				txq->elts[txq->elts_head++ & txq->elts_m] =
4325 						loc->mbuf;
4326 				--loc->elts_free;
4327 			} else {
4328 				uint8_t *dptr;
4329 				unsigned int dlen;
4330 
4331 				/*
4332 				 * Partially inlined packet data WQE, we have
4333 				 * some space in title WQEBB, we can fill it
4334 				 * with some packet data. It takes one WQEBB,
4335 				 * it is available, no extra space check:
4336 				 * - Control Segment, SEND opcode
4337 				 * - Ethernet Segment, no VLAN insertion
4338 				 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data
4339 				 * - Data Segment, pointer type
4340 				 *
4341 				 * We also get here if VLAN insertion is not
4342 				 * supported by HW, the inline is enabled.
4343 				 */
4344 				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4345 				loc->wqe_last = wqe;
4346 				mlx5_tx_cseg_init(txq, loc, wqe, 4,
4347 						  MLX5_OPCODE_SEND, olx);
4348 				mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx);
4349 				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
4350 				       MLX5_ESEG_MIN_INLINE_SIZE - vlan;
4351 				/*
4352 				 * The length check is performed above, by
4353 				 * comparing with txq->inlen_send. We should
4354 				 * not get overflow here.
4355 				 */
4356 				assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
4357 				dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE;
4358 				mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1],
4359 						 dptr, dlen, olx);
4360 				++txq->wqe_ci;
4361 				--loc->wqe_free;
4362 				/* We have to store mbuf in elts.*/
4363 				assert(MLX5_TXOFF_CONFIG(INLINE));
4364 				txq->elts[txq->elts_head++ & txq->elts_m] =
4365 						loc->mbuf;
4366 				--loc->elts_free;
4367 			}
4368 #ifdef MLX5_PMD_SOFT_COUNTERS
4369 			/* Update sent data bytes counter. */
4370 			txq->stats.obytes += vlan +
4371 					rte_pktmbuf_data_len(loc->mbuf);
4372 #endif
4373 		} else {
4374 			/*
4375 			 * No inline at all, it means the CPU cycles saving
4376 			 * is prioritized at configuration, we should not
4377 			 * copy any packet data to WQE.
4378 			 *
4379 			 * SEND WQE, one WQEBB:
4380 			 * - Control Segment, SEND opcode
4381 			 * - Ethernet Segment, optional VLAN, no inline
4382 			 * - Data Segment, pointer type
4383 			 */
4384 			wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4385 			loc->wqe_last = wqe;
4386 			mlx5_tx_cseg_init(txq, loc, wqe, 3,
4387 					  MLX5_OPCODE_SEND, olx);
4388 			mlx5_tx_eseg_none(txq, loc, wqe, olx);
4389 			mlx5_tx_dseg_ptr
4390 				(txq, loc, &wqe->dseg[0],
4391 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
4392 				 rte_pktmbuf_data_len(loc->mbuf), olx);
4393 			++txq->wqe_ci;
4394 			--loc->wqe_free;
4395 			/*
4396 			 * We should not store mbuf pointer in elts
4397 			 * if no inlining is configured, this is done
4398 			 * by calling routine in a batch copy.
4399 			 */
4400 			assert(!MLX5_TXOFF_CONFIG(INLINE));
4401 			--loc->elts_free;
4402 #ifdef MLX5_PMD_SOFT_COUNTERS
4403 			/* Update sent data bytes counter. */
4404 			txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf);
4405 			if (MLX5_TXOFF_CONFIG(VLAN) &&
4406 			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
4407 				txq->stats.obytes +=
4408 					sizeof(struct rte_vlan_hdr);
4409 #endif
4410 		}
4411 		++loc->pkts_sent;
4412 		--pkts_n;
4413 		/* Request CQE generation if limits are reached. */
4414 		mlx5_tx_request_completion(txq, loc, olx);
4415 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
4416 			return MLX5_TXCMP_CODE_EXIT;
4417 		loc->mbuf = *pkts++;
4418 		if (pkts_n > 1)
4419 			rte_prefetch0(*pkts);
4420 		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
4421 		if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
4422 			return ret;
4423 	}
4424 	assert(false);
4425 }
4426 
4427 static __rte_always_inline enum mlx5_txcmp_code
4428 mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq,
4429 		     struct rte_mbuf **restrict pkts,
4430 		     unsigned int pkts_n,
4431 		     struct mlx5_txq_local *restrict loc,
4432 		     unsigned int olx)
4433 {
4434 	enum mlx5_txcmp_code ret;
4435 
4436 	ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
4437 	if (ret == MLX5_TXCMP_CODE_SINGLE)
4438 		goto ordinary_send;
4439 	assert(ret == MLX5_TXCMP_CODE_EMPW);
4440 	for (;;) {
4441 		/* Optimize for inline/no inline eMPW send. */
4442 		ret = (MLX5_TXOFF_CONFIG(INLINE)) ?
4443 			mlx5_tx_burst_empw_inline
4444 				(txq, pkts, pkts_n, loc, olx) :
4445 			mlx5_tx_burst_empw_simple
4446 				(txq, pkts, pkts_n, loc, olx);
4447 		if (ret != MLX5_TXCMP_CODE_SINGLE)
4448 			return ret;
4449 		/* The resources to send one packet should remain. */
4450 		assert(loc->elts_free && loc->wqe_free);
4451 ordinary_send:
4452 		ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
4453 		assert(ret != MLX5_TXCMP_CODE_SINGLE);
4454 		if (ret != MLX5_TXCMP_CODE_EMPW)
4455 			return ret;
4456 		/* The resources to send one packet should remain. */
4457 		assert(loc->elts_free && loc->wqe_free);
4458 	}
4459 }
4460 
4461 /**
4462  * DPDK Tx callback template. This is configured template
4463  * used to generate routines optimized for specified offload setup.
4464  * One of this generated functions is chosen at SQ configuration
4465  * time.
4466  *
4467  * @param txq
4468  *   Generic pointer to TX queue structure.
4469  * @param[in] pkts
4470  *   Packets to transmit.
4471  * @param pkts_n
4472  *   Number of packets in array.
4473  * @param olx
4474  *   Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
4475  *   values. Should be static to take compile time static configuration
4476  *   advantages.
4477  *
4478  * @return
4479  *   Number of packets successfully transmitted (<= pkts_n).
4480  */
4481 static __rte_always_inline uint16_t
4482 mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
4483 		   struct rte_mbuf **restrict pkts,
4484 		   uint16_t pkts_n,
4485 		   unsigned int olx)
4486 {
4487 	struct mlx5_txq_local loc;
4488 	enum mlx5_txcmp_code ret;
4489 	unsigned int part;
4490 
4491 	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
4492 	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
4493 	/*
4494 	 * Check if there are some CQEs, if any:
4495 	 * - process an encountered errors
4496 	 * - process the completed WQEs
4497 	 * - free related mbufs
4498 	 * - doorbell the NIC about processed CQEs
4499 	 */
4500 	if (unlikely(!pkts_n))
4501 		return 0;
4502 	rte_prefetch0(*pkts);
4503 	mlx5_tx_handle_completion(txq, olx);
4504 	/*
4505 	 * Calculate the number of available resources - elts and WQEs.
4506 	 * There are two possible different scenarios:
4507 	 * - no data inlining into WQEs, one WQEBB may contains upto
4508 	 *   four packets, in this case elts become scarce resource
4509 	 * - data inlining into WQEs, one packet may require multiple
4510 	 *   WQEBBs, the WQEs become the limiting factor.
4511 	 */
4512 	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
4513 	loc.elts_free = txq->elts_s -
4514 				(uint16_t)(txq->elts_head - txq->elts_tail);
4515 	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
4516 	loc.wqe_free = txq->wqe_s -
4517 				(uint16_t)(txq->wqe_ci - txq->wqe_pi);
4518 	if (unlikely(!loc.elts_free || !loc.wqe_free))
4519 		return 0;
4520 	loc.pkts_sent = 0;
4521 	loc.pkts_copy = 0;
4522 	loc.wqe_last = NULL;
4523 	for (;;) {
4524 		/*
4525 		 * Fetch the packet from array. Usually this is
4526 		 * the first packet in series of multi/single
4527 		 * segment packets.
4528 		 */
4529 		loc.mbuf = *(pkts + loc.pkts_sent);
4530 		/* Dedicated branch for multi-segment packets. */
4531 		if (MLX5_TXOFF_CONFIG(MULTI) &&
4532 		    unlikely(NB_SEGS(loc.mbuf) > 1)) {
4533 			/*
4534 			 * Multi-segment packet encountered.
4535 			 * Hardware is able to process it only
4536 			 * with SEND/TSO opcodes, one packet
4537 			 * per WQE, do it in dedicated routine.
4538 			 */
4539 enter_send_multi:
4540 			assert(loc.pkts_sent >= loc.pkts_copy);
4541 			part = loc.pkts_sent - loc.pkts_copy;
4542 			if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
4543 				/*
4544 				 * There are some single-segment mbufs not
4545 				 * stored in elts. The mbufs must be in the
4546 				 * same order as WQEs, so we must copy the
4547 				 * mbufs to elts here, before the coming
4548 				 * multi-segment packet mbufs is appended.
4549 				 */
4550 				mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy,
4551 						  part, olx);
4552 				loc.pkts_copy = loc.pkts_sent;
4553 			}
4554 			assert(pkts_n > loc.pkts_sent);
4555 			ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
4556 			if (!MLX5_TXOFF_CONFIG(INLINE))
4557 				loc.pkts_copy = loc.pkts_sent;
4558 			/*
4559 			 * These returned code checks are supposed
4560 			 * to be optimized out due to routine inlining.
4561 			 */
4562 			if (ret == MLX5_TXCMP_CODE_EXIT) {
4563 				/*
4564 				 * The routine returns this code when
4565 				 * all packets are sent or there is no
4566 				 * enough resources to complete request.
4567 				 */
4568 				break;
4569 			}
4570 			if (ret == MLX5_TXCMP_CODE_ERROR) {
4571 				/*
4572 				 * The routine returns this code when
4573 				 * some error in the incoming packets
4574 				 * format occurred.
4575 				 */
4576 				txq->stats.oerrors++;
4577 				break;
4578 			}
4579 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
4580 				/*
4581 				 * The single-segment packet was encountered
4582 				 * in the array, try to send it with the
4583 				 * best optimized way, possible engaging eMPW.
4584 				 */
4585 				goto enter_send_single;
4586 			}
4587 			if (MLX5_TXOFF_CONFIG(TSO) &&
4588 			    ret == MLX5_TXCMP_CODE_TSO) {
4589 				/*
4590 				 * The single-segment TSO packet was
4591 				 * encountered in the array.
4592 				 */
4593 				goto enter_send_tso;
4594 			}
4595 			/* We must not get here. Something is going wrong. */
4596 			assert(false);
4597 			txq->stats.oerrors++;
4598 			break;
4599 		}
4600 		/* Dedicated branch for single-segment TSO packets. */
4601 		if (MLX5_TXOFF_CONFIG(TSO) &&
4602 		    unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) {
4603 			/*
4604 			 * TSO might require special way for inlining
4605 			 * (dedicated parameters) and is sent with
4606 			 * MLX5_OPCODE_TSO opcode only, provide this
4607 			 * in dedicated branch.
4608 			 */
4609 enter_send_tso:
4610 			assert(NB_SEGS(loc.mbuf) == 1);
4611 			assert(pkts_n > loc.pkts_sent);
4612 			ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
4613 			/*
4614 			 * These returned code checks are supposed
4615 			 * to be optimized out due to routine inlining.
4616 			 */
4617 			if (ret == MLX5_TXCMP_CODE_EXIT)
4618 				break;
4619 			if (ret == MLX5_TXCMP_CODE_ERROR) {
4620 				txq->stats.oerrors++;
4621 				break;
4622 			}
4623 			if (ret == MLX5_TXCMP_CODE_SINGLE)
4624 				goto enter_send_single;
4625 			if (MLX5_TXOFF_CONFIG(MULTI) &&
4626 			    ret == MLX5_TXCMP_CODE_MULTI) {
4627 				/*
4628 				 * The multi-segment packet was
4629 				 * encountered in the array.
4630 				 */
4631 				goto enter_send_multi;
4632 			}
4633 			/* We must not get here. Something is going wrong. */
4634 			assert(false);
4635 			txq->stats.oerrors++;
4636 			break;
4637 		}
4638 		/*
4639 		 * The dedicated branch for the single-segment packets
4640 		 * without TSO. Often these ones can be sent using
4641 		 * MLX5_OPCODE_EMPW with multiple packets in one WQE.
4642 		 * The routine builds the WQEs till it encounters
4643 		 * the TSO or multi-segment packet (in case if these
4644 		 * offloads are requested at SQ configuration time).
4645 		 */
4646 enter_send_single:
4647 		assert(pkts_n > loc.pkts_sent);
4648 		ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
4649 		/*
4650 		 * These returned code checks are supposed
4651 		 * to be optimized out due to routine inlining.
4652 		 */
4653 		if (ret == MLX5_TXCMP_CODE_EXIT)
4654 			break;
4655 		if (ret == MLX5_TXCMP_CODE_ERROR) {
4656 			txq->stats.oerrors++;
4657 			break;
4658 		}
4659 		if (MLX5_TXOFF_CONFIG(MULTI) &&
4660 		    ret == MLX5_TXCMP_CODE_MULTI) {
4661 			/*
4662 			 * The multi-segment packet was
4663 			 * encountered in the array.
4664 			 */
4665 			goto enter_send_multi;
4666 		}
4667 		if (MLX5_TXOFF_CONFIG(TSO) &&
4668 		    ret == MLX5_TXCMP_CODE_TSO) {
4669 			/*
4670 			 * The single-segment TSO packet was
4671 			 * encountered in the array.
4672 			 */
4673 			goto enter_send_tso;
4674 		}
4675 		/* We must not get here. Something is going wrong. */
4676 		assert(false);
4677 		txq->stats.oerrors++;
4678 		break;
4679 	}
4680 	/*
4681 	 * Main Tx loop is completed, do the rest:
4682 	 * - set completion request if thresholds are reached
4683 	 * - doorbell the hardware
4684 	 * - copy the rest of mbufs to elts (if any)
4685 	 */
4686 	assert(MLX5_TXOFF_CONFIG(INLINE) || loc.pkts_sent >= loc.pkts_copy);
4687 	/* Take a shortcut if nothing is sent. */
4688 	if (unlikely(loc.pkts_sent == 0))
4689 		return 0;
4690 	/*
4691 	 * Ring QP doorbell immediately after WQE building completion
4692 	 * to improve latencies. The pure software related data treatment
4693 	 * can be completed after doorbell. Tx CQEs for this SQ are
4694 	 * processed in this thread only by the polling.
4695 	 */
4696 	mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, 0);
4697 	/* Not all of the mbufs may be stored into elts yet. */
4698 	part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent -
4699 		(MLX5_TXOFF_CONFIG(MULTI) ? loc.pkts_copy : 0);
4700 	if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
4701 		/*
4702 		 * There are some single-segment mbufs not stored in elts.
4703 		 * It can be only if the last packet was single-segment.
4704 		 * The copying is gathered into one place due to it is
4705 		 * a good opportunity to optimize that with SIMD.
4706 		 * Unfortunately if inlining is enabled the gaps in
4707 		 * pointer array may happen due to early freeing of the
4708 		 * inlined mbufs.
4709 		 */
4710 		mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
4711 	}
4712 #ifdef MLX5_PMD_SOFT_COUNTERS
4713 	/* Increment sent packets counter. */
4714 	txq->stats.opackets += loc.pkts_sent;
4715 #endif
4716 	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
4717 	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
4718 	return loc.pkts_sent;
4719 }
4720 
4721 /* Generate routines with Enhanced Multi-Packet Write support. */
4722 MLX5_TXOFF_DECL(full_empw,
4723 		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW)
4724 
4725 MLX5_TXOFF_DECL(none_empw,
4726 		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW)
4727 
4728 MLX5_TXOFF_DECL(md_empw,
4729 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4730 
4731 MLX5_TXOFF_DECL(mt_empw,
4732 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4733 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4734 
4735 MLX5_TXOFF_DECL(mtsc_empw,
4736 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4737 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4738 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4739 
4740 MLX5_TXOFF_DECL(mti_empw,
4741 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4742 		MLX5_TXOFF_CONFIG_INLINE |
4743 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4744 
4745 MLX5_TXOFF_DECL(mtv_empw,
4746 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4747 		MLX5_TXOFF_CONFIG_VLAN |
4748 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4749 
4750 MLX5_TXOFF_DECL(mtiv_empw,
4751 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4752 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4753 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4754 
4755 MLX5_TXOFF_DECL(sc_empw,
4756 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4757 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4758 
4759 MLX5_TXOFF_DECL(sci_empw,
4760 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4761 		MLX5_TXOFF_CONFIG_INLINE |
4762 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4763 
4764 MLX5_TXOFF_DECL(scv_empw,
4765 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4766 		MLX5_TXOFF_CONFIG_VLAN |
4767 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4768 
4769 MLX5_TXOFF_DECL(sciv_empw,
4770 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4771 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4772 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4773 
4774 MLX5_TXOFF_DECL(i_empw,
4775 		MLX5_TXOFF_CONFIG_INLINE |
4776 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4777 
4778 MLX5_TXOFF_DECL(v_empw,
4779 		MLX5_TXOFF_CONFIG_VLAN |
4780 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4781 
4782 MLX5_TXOFF_DECL(iv_empw,
4783 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4784 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4785 
4786 /* Generate routines without Enhanced Multi-Packet Write support. */
4787 MLX5_TXOFF_DECL(full,
4788 		MLX5_TXOFF_CONFIG_FULL)
4789 
4790 MLX5_TXOFF_DECL(none,
4791 		MLX5_TXOFF_CONFIG_NONE)
4792 
4793 MLX5_TXOFF_DECL(md,
4794 		MLX5_TXOFF_CONFIG_METADATA)
4795 
4796 MLX5_TXOFF_DECL(mt,
4797 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4798 		MLX5_TXOFF_CONFIG_METADATA)
4799 
4800 MLX5_TXOFF_DECL(mtsc,
4801 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4802 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4803 		MLX5_TXOFF_CONFIG_METADATA)
4804 
4805 MLX5_TXOFF_DECL(mti,
4806 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4807 		MLX5_TXOFF_CONFIG_INLINE |
4808 		MLX5_TXOFF_CONFIG_METADATA)
4809 
4810 
4811 MLX5_TXOFF_DECL(mtv,
4812 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4813 		MLX5_TXOFF_CONFIG_VLAN |
4814 		MLX5_TXOFF_CONFIG_METADATA)
4815 
4816 
4817 MLX5_TXOFF_DECL(mtiv,
4818 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4819 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4820 		MLX5_TXOFF_CONFIG_METADATA)
4821 
4822 MLX5_TXOFF_DECL(sc,
4823 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4824 		MLX5_TXOFF_CONFIG_METADATA)
4825 
4826 MLX5_TXOFF_DECL(sci,
4827 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4828 		MLX5_TXOFF_CONFIG_INLINE |
4829 		MLX5_TXOFF_CONFIG_METADATA)
4830 
4831 
4832 MLX5_TXOFF_DECL(scv,
4833 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4834 		MLX5_TXOFF_CONFIG_VLAN |
4835 		MLX5_TXOFF_CONFIG_METADATA)
4836 
4837 
4838 MLX5_TXOFF_DECL(sciv,
4839 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4840 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4841 		MLX5_TXOFF_CONFIG_METADATA)
4842 
4843 MLX5_TXOFF_DECL(i,
4844 		MLX5_TXOFF_CONFIG_INLINE |
4845 		MLX5_TXOFF_CONFIG_METADATA)
4846 
4847 MLX5_TXOFF_DECL(v,
4848 		MLX5_TXOFF_CONFIG_VLAN |
4849 		MLX5_TXOFF_CONFIG_METADATA)
4850 
4851 MLX5_TXOFF_DECL(iv,
4852 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4853 		MLX5_TXOFF_CONFIG_METADATA)
4854 
4855 /*
4856  * Array of declared and compiled Tx burst function and corresponding
4857  * supported offloads set. The array is used to select the Tx burst
4858  * function for specified offloads set at Tx queue configuration time.
4859  */
4860 const struct {
4861 	eth_tx_burst_t func;
4862 	unsigned int olx;
4863 } txoff_func[] = {
4864 MLX5_TXOFF_INFO(full_empw,
4865 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4866 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4867 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4868 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4869 
4870 MLX5_TXOFF_INFO(none_empw,
4871 		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW)
4872 
4873 MLX5_TXOFF_INFO(md_empw,
4874 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4875 
4876 MLX5_TXOFF_INFO(mt_empw,
4877 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4878 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4879 
4880 MLX5_TXOFF_INFO(mtsc_empw,
4881 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4882 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4883 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4884 
4885 MLX5_TXOFF_INFO(mti_empw,
4886 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4887 		MLX5_TXOFF_CONFIG_INLINE |
4888 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4889 
4890 MLX5_TXOFF_INFO(mtv_empw,
4891 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4892 		MLX5_TXOFF_CONFIG_VLAN |
4893 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4894 
4895 MLX5_TXOFF_INFO(mtiv_empw,
4896 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4897 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4898 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4899 
4900 MLX5_TXOFF_INFO(sc_empw,
4901 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4902 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4903 
4904 MLX5_TXOFF_INFO(sci_empw,
4905 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4906 		MLX5_TXOFF_CONFIG_INLINE |
4907 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4908 
4909 MLX5_TXOFF_INFO(scv_empw,
4910 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4911 		MLX5_TXOFF_CONFIG_VLAN |
4912 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4913 
4914 MLX5_TXOFF_INFO(sciv_empw,
4915 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4916 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4917 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4918 
4919 MLX5_TXOFF_INFO(i_empw,
4920 		MLX5_TXOFF_CONFIG_INLINE |
4921 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4922 
4923 MLX5_TXOFF_INFO(v_empw,
4924 		MLX5_TXOFF_CONFIG_VLAN |
4925 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4926 
4927 MLX5_TXOFF_INFO(iv_empw,
4928 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4929 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
4930 
4931 MLX5_TXOFF_INFO(full,
4932 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4933 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4934 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4935 		MLX5_TXOFF_CONFIG_METADATA)
4936 
4937 MLX5_TXOFF_INFO(none,
4938 		MLX5_TXOFF_CONFIG_NONE)
4939 
4940 MLX5_TXOFF_INFO(md,
4941 		MLX5_TXOFF_CONFIG_METADATA)
4942 
4943 MLX5_TXOFF_INFO(mt,
4944 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4945 		MLX5_TXOFF_CONFIG_METADATA)
4946 
4947 MLX5_TXOFF_INFO(mtsc,
4948 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4949 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4950 		MLX5_TXOFF_CONFIG_METADATA)
4951 
4952 MLX5_TXOFF_INFO(mti,
4953 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4954 		MLX5_TXOFF_CONFIG_INLINE |
4955 		MLX5_TXOFF_CONFIG_METADATA)
4956 
4957 
4958 MLX5_TXOFF_INFO(mtv,
4959 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4960 		MLX5_TXOFF_CONFIG_VLAN |
4961 		MLX5_TXOFF_CONFIG_METADATA)
4962 
4963 MLX5_TXOFF_INFO(mtiv,
4964 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
4965 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4966 		MLX5_TXOFF_CONFIG_METADATA)
4967 
4968 MLX5_TXOFF_INFO(sc,
4969 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4970 		MLX5_TXOFF_CONFIG_METADATA)
4971 
4972 MLX5_TXOFF_INFO(sci,
4973 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4974 		MLX5_TXOFF_CONFIG_INLINE |
4975 		MLX5_TXOFF_CONFIG_METADATA)
4976 
4977 MLX5_TXOFF_INFO(scv,
4978 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4979 		MLX5_TXOFF_CONFIG_VLAN |
4980 		MLX5_TXOFF_CONFIG_METADATA)
4981 
4982 MLX5_TXOFF_INFO(sciv,
4983 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
4984 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4985 		MLX5_TXOFF_CONFIG_METADATA)
4986 
4987 MLX5_TXOFF_INFO(i,
4988 		MLX5_TXOFF_CONFIG_INLINE |
4989 		MLX5_TXOFF_CONFIG_METADATA)
4990 
4991 MLX5_TXOFF_INFO(v,
4992 		MLX5_TXOFF_CONFIG_VLAN |
4993 		MLX5_TXOFF_CONFIG_METADATA)
4994 
4995 MLX5_TXOFF_INFO(iv,
4996 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
4997 		MLX5_TXOFF_CONFIG_METADATA)
4998 };
4999 
5000 /**
5001  * Configure the Tx function to use. The routine checks configured
5002  * Tx offloads for the device and selects appropriate Tx burst
5003  * routine. There are multiple Tx burst routines compiled from
5004  * the same template in the most optimal way for the dedicated
5005  * Tx offloads set.
5006  *
5007  * @param dev
5008  *   Pointer to private data structure.
5009  *
5010  * @return
5011  *   Pointer to selected Tx burst function.
5012  */
5013 eth_tx_burst_t
5014 mlx5_select_tx_function(struct rte_eth_dev *dev)
5015 {
5016 	struct mlx5_priv *priv = dev->data->dev_private;
5017 	struct mlx5_dev_config *config = &priv->config;
5018 	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
5019 	unsigned int diff = 0, olx = 0, i, m;
5020 
5021 	static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <=
5022 		      MLX5_DSEG_MAX, "invalid WQE max size");
5023 	static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE,
5024 		      "invalid WQE Control Segment size");
5025 	static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE,
5026 		      "invalid WQE Ethernet Segment size");
5027 	static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE,
5028 		      "invalid WQE Data Segment size");
5029 	static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE,
5030 		      "invalid WQE size");
5031 	assert(priv);
5032 	if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) {
5033 		/* We should support Multi-Segment Packets. */
5034 		olx |= MLX5_TXOFF_CONFIG_MULTI;
5035 	}
5036 	if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
5037 			   DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
5038 			   DEV_TX_OFFLOAD_GRE_TNL_TSO |
5039 			   DEV_TX_OFFLOAD_IP_TNL_TSO |
5040 			   DEV_TX_OFFLOAD_UDP_TNL_TSO)) {
5041 		/* We should support TCP Send Offload. */
5042 		olx |= MLX5_TXOFF_CONFIG_TSO;
5043 	}
5044 	if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
5045 			   DEV_TX_OFFLOAD_UDP_TNL_TSO |
5046 			   DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
5047 		/* We should support Software Parser for Tunnels. */
5048 		olx |= MLX5_TXOFF_CONFIG_SWP;
5049 	}
5050 	if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
5051 			   DEV_TX_OFFLOAD_UDP_CKSUM |
5052 			   DEV_TX_OFFLOAD_TCP_CKSUM |
5053 			   DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
5054 		/* We should support IP/TCP/UDP Checksums. */
5055 		olx |= MLX5_TXOFF_CONFIG_CSUM;
5056 	}
5057 	if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) {
5058 		/* We should support VLAN insertion. */
5059 		olx |= MLX5_TXOFF_CONFIG_VLAN;
5060 	}
5061 	if (priv->txqs_n && (*priv->txqs)[0]) {
5062 		struct mlx5_txq_data *txd = (*priv->txqs)[0];
5063 
5064 		if (txd->inlen_send) {
5065 			/*
5066 			 * Check the data inline requirements. Data inline
5067 			 * is enabled on per device basis, we can check
5068 			 * the first Tx queue only.
5069 			 *
5070 			 * If device does not support VLAN insertion in WQE
5071 			 * and some queues are requested to perform VLAN
5072 			 * insertion offload than inline must be enabled.
5073 			 */
5074 			olx |= MLX5_TXOFF_CONFIG_INLINE;
5075 		}
5076 	}
5077 	if (config->mps == MLX5_MPW_ENHANCED &&
5078 	    config->txq_inline_min <= 0) {
5079 		/*
5080 		 * The NIC supports Enhanced Multi-Packet Write.
5081 		 * We do not support legacy MPW due to its
5082 		 * hardware related problems, so we just ignore
5083 		 * legacy MLX5_MPW settings. There should be no
5084 		 * minimal required inline data.
5085 		 */
5086 		olx |= MLX5_TXOFF_CONFIG_EMPW;
5087 	}
5088 	if (tx_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) {
5089 		/* We should support Flow metadata. */
5090 		olx |= MLX5_TXOFF_CONFIG_METADATA;
5091 	}
5092 	/*
5093 	 * Scan the routines table to find the minimal
5094 	 * satisfying routine with requested offloads.
5095 	 */
5096 	m = RTE_DIM(txoff_func);
5097 	for (i = 0; i < RTE_DIM(txoff_func); i++) {
5098 		unsigned int tmp;
5099 
5100 		tmp = txoff_func[i].olx;
5101 		if (tmp == olx) {
5102 			/* Meets requested offloads exactly.*/
5103 			m = i;
5104 			break;
5105 		}
5106 		if ((tmp & olx) != olx) {
5107 			/* Does not meet requested offloads at all. */
5108 			continue;
5109 		}
5110 		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW)
5111 			/* Do not enable eMPW if not configured. */
5112 			continue;
5113 		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
5114 			/* Do not enable inlining if not configured. */
5115 			continue;
5116 		/*
5117 		 * Some routine meets the requirements.
5118 		 * Check whether it has minimal amount
5119 		 * of not requested offloads.
5120 		 */
5121 		tmp = __builtin_popcountl(tmp & ~olx);
5122 		if (m >= RTE_DIM(txoff_func) || tmp < diff) {
5123 			/* First or better match, save and continue. */
5124 			m = i;
5125 			diff = tmp;
5126 			continue;
5127 		}
5128 		if (tmp == diff) {
5129 			tmp = txoff_func[i].olx ^ txoff_func[m].olx;
5130 			if (__builtin_ffsl(txoff_func[i].olx & ~tmp) <
5131 			    __builtin_ffsl(txoff_func[m].olx & ~tmp)) {
5132 				/* Lighter not requested offload. */
5133 				m = i;
5134 			}
5135 		}
5136 	}
5137 	if (m >= RTE_DIM(txoff_func)) {
5138 		DRV_LOG(DEBUG, "port %u has no selected Tx function"
5139 			       " for requested offloads %04X",
5140 				dev->data->port_id, olx);
5141 		return NULL;
5142 	}
5143 	DRV_LOG(DEBUG, "port %u has selected Tx function"
5144 		       " supporting offloads %04X/%04X",
5145 			dev->data->port_id, olx, txoff_func[m].olx);
5146 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI)
5147 		DRV_LOG(DEBUG, "\tMULTI (multi segment)");
5148 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO)
5149 		DRV_LOG(DEBUG, "\tTSO   (TCP send offload)");
5150 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP)
5151 		DRV_LOG(DEBUG, "\tSWP   (software parser)");
5152 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM)
5153 		DRV_LOG(DEBUG, "\tCSUM  (checksum offload)");
5154 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE)
5155 		DRV_LOG(DEBUG, "\tINLIN (inline data)");
5156 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN)
5157 		DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
5158 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
5159 		DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
5160 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW)
5161 		DRV_LOG(DEBUG, "\tEMPW  (Enhanced MPW)");
5162 	return txoff_func[m].func;
5163 }
5164 
5165 
5166