xref: /dpdk/drivers/net/mlx5/mlx5_tx.h (revision 7917b0d38e92e8b9ec5a870415b791420e10f11a)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2021 6WIND S.A.
3  * Copyright 2021 Mellanox Technologies, Ltd
4  */
5 
6 #ifndef RTE_PMD_MLX5_TX_H_
7 #define RTE_PMD_MLX5_TX_H_
8 
9 #include <stdint.h>
10 #include <sys/queue.h>
11 
12 #include <rte_mbuf.h>
13 #include <rte_mempool.h>
14 #include <rte_common.h>
15 #include <rte_spinlock.h>
16 #include <rte_trace_point.h>
17 
18 #include <mlx5_common.h>
19 #include <mlx5_common_mr.h>
20 
21 #include "mlx5.h"
22 #include "mlx5_autoconf.h"
23 #include "mlx5_rxtx.h"
24 #include "mlx5_trace.h"
25 
26 /* TX burst subroutines return codes. */
27 enum mlx5_txcmp_code {
28 	MLX5_TXCMP_CODE_EXIT = 0,
29 	MLX5_TXCMP_CODE_ERROR,
30 	MLX5_TXCMP_CODE_SINGLE,
31 	MLX5_TXCMP_CODE_MULTI,
32 	MLX5_TXCMP_CODE_TSO,
33 	MLX5_TXCMP_CODE_EMPW,
34 };
35 
36 /*
37  * These defines are used to configure the Tx burst routine option set
38  * supported at compile time. Options that are not specified are optimized
39  * out, because the corresponding if conditions can be evaluated at compile time.
40  * Offloads with a bigger runtime check overhead (more CPU cycles to skip)
41  * should have a bigger index - this is needed to select the best matching
42  * routine if there is no exact match and some offloads are not actually
43  * requested.
44  */
45 #define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets. */
46 #define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported. */
47 #define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads. */
48 #define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
49 #define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
50 #define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported. */
51 #define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
52 #define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported. */
53 #define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported. */
54 #define MLX5_TXOFF_CONFIG_TXPP (1u << 10) /* Scheduling on timestamp. */
55 
56 /* The most common offload groups. */
57 #define MLX5_TXOFF_CONFIG_NONE 0
58 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \
59 				MLX5_TXOFF_CONFIG_TSO | \
60 				MLX5_TXOFF_CONFIG_SWP | \
61 				MLX5_TXOFF_CONFIG_CSUM | \
62 				MLX5_TXOFF_CONFIG_INLINE | \
63 				MLX5_TXOFF_CONFIG_VLAN | \
64 				MLX5_TXOFF_CONFIG_METADATA)
65 
66 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask)
67 
68 #define MLX5_TXOFF_PRE_DECL(func) \
69 uint16_t mlx5_tx_burst_##func(void *txq, \
70 			      struct rte_mbuf **pkts, \
71 			      uint16_t pkts_n)
72 
73 #define MLX5_TXOFF_DECL(func, olx) \
74 uint16_t mlx5_tx_burst_##func(void *txq, \
75 			      struct rte_mbuf **pkts, \
76 			      uint16_t pkts_n) \
77 { \
78 	return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \
79 		    pkts, pkts_n, (olx)); \
80 }
81 
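/*
 * Illustrative sketch (editor's addition, not part of the original header):
 * the two macros above are meant to be used together. MLX5_TXOFF_PRE_DECL()
 * declares a burst routine prototype in this header, MLX5_TXOFF_DECL()
 * instantiates it in a .c unit with a compile-time constant offload mask,
 * e.g. with a hypothetical "example" variant:
 *
 *	MLX5_TXOFF_PRE_DECL(example);
 *	MLX5_TXOFF_DECL(example,
 *			MLX5_TXOFF_CONFIG_MULTI |
 *			MLX5_TXOFF_CONFIG_CSUM |
 *			MLX5_TXOFF_CONFIG_EMPW)
 *
 * Because olx is constant, the MLX5_TXOFF_CONFIG(mask) checks inside
 * mlx5_tx_burst_tmpl() fold at compile time and the unused branches are
 * optimized out.
 */
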
82 /* Mbuf dynamic flag mask for the no-inline hint. */
83 extern uint64_t rte_net_mlx5_dynf_inline_mask;
84 #define RTE_MBUF_F_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
85 
86 extern alignas(RTE_CACHE_LINE_SIZE) uint32_t mlx5_ptype_table[];
87 extern alignas(RTE_CACHE_LINE_SIZE) uint8_t mlx5_cksum_table[1 << 10];
88 extern alignas(RTE_CACHE_LINE_SIZE) uint8_t mlx5_swp_types_table[1 << 10];
89 
90 struct mlx5_txq_stats {
91 #ifdef MLX5_PMD_SOFT_COUNTERS
92 	uint64_t opackets; /**< Total of successfully sent packets. */
93 	uint64_t obytes; /**< Total of successfully sent bytes. */
94 #endif
95 	uint64_t oerrors; /**< Total number of failed transmitted packets. */
96 };
97 
98 /* TX queue send local data. */
99 __extension__
100 struct mlx5_txq_local {
101 	struct mlx5_wqe *wqe_last; /* last sent WQE pointer. */
102 	struct rte_mbuf *mbuf; /* first mbuf to process. */
103 	uint16_t pkts_copy; /* packets copied to elts. */
104 	uint16_t pkts_sent; /* packets sent. */
105 	uint16_t pkts_loop; /* packets sent on loop entry. */
106 	uint16_t elts_free; /* available elts remain. */
107 	uint16_t wqe_free; /* available wqe remain. */
108 	uint16_t mbuf_off; /* data offset in current mbuf. */
109 	uint16_t mbuf_nseg; /* number of remaining mbufs. */
110 	uint16_t mbuf_free; /* number of inline mbufs to free. */
111 };
112 
113 /* TX queue descriptor. */
114 __extension__
115 struct __rte_cache_aligned mlx5_txq_data {
116 	uint16_t elts_head; /* Current counter in (*elts)[]. */
117 	uint16_t elts_tail; /* Counter of first element awaiting completion. */
118 	uint16_t elts_comp; /* elts index since last completion request. */
119 	uint16_t elts_s; /* Number of mbuf elements. */
120 	uint16_t elts_m; /* Mask for mbuf elements indices. */
121 	/* Fields related to elts mbuf storage. */
122 	uint16_t wqe_ci; /* Consumer index for work queue. */
123 	uint16_t wqe_pi; /* Producer index for work queue. */
124 	uint16_t wqe_s; /* Number of WQ elements. */
125 	uint16_t wqe_m; /* Mask for WQ elements indices. */
126 	uint16_t wqe_comp; /* WQE index since last completion request. */
127 	uint16_t wqe_thres; /* WQE threshold to request completion in CQ. */
128 	/* WQ related fields. */
129 	uint16_t cq_ci; /* Consumer index for completion queue. */
130 	uint16_t cq_pi; /* Producer index for completion queue. */
131 	uint16_t cqe_s; /* Number of CQ elements. */
132 	uint16_t cqe_m; /* Mask for CQ indices. */
133 	/* CQ related fields. */
134 	uint16_t elts_n:4; /* elts[] length (in log2). */
135 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
136 	uint16_t wqe_n:4; /* Number of WQ elements (in log2). */
137 	uint16_t tso_en:1; /* When set hardware TSO is enabled. */
138 	uint16_t tunnel_en:1;
139 	/* When set, Tx offloads for tunneled packets are supported. */
140 	uint16_t swp_en:1; /* Whether SW parser is enabled. */
141 	uint16_t vlan_en:1; /* VLAN insertion in WQE is supported. */
142 	uint16_t db_nc:1; /* Doorbell mapped to non-cached region. */
143 	uint16_t db_heu:1; /* Doorbell heuristic write barrier. */
144 	uint16_t rt_timestamp:1; /* Realtime timestamp format. */
145 	uint16_t wait_on_time:1; /* WQE with timestamp is supported. */
146 	uint16_t fast_free:1; /* mbuf fast free on Tx is enabled. */
147 	uint16_t inlen_send; /* Ordinary send data inline size. */
148 	uint16_t inlen_empw; /* eMPW max packet size to inline. */
149 	uint16_t inlen_mode; /* Minimal data length to inline. */
150 	uint8_t tx_aggr_affinity; /* TxQ affinity configuration. */
151 	uint32_t qp_num_8s; /* QP number shifted by 8. */
152 	uint64_t offloads; /* Offloads for Tx Queue. */
153 	struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
154 	struct mlx5_wqe *wqes; /* Work queue. */
155 	struct mlx5_wqe *wqes_end; /* Work queue array limit. */
156 #ifdef RTE_LIBRTE_MLX5_DEBUG
157 	uint32_t *fcqs; /* Free completion queue (debug extended). */
158 #else
159 	uint16_t *fcqs; /* Free completion queue. */
160 #endif
161 	volatile struct mlx5_cqe *cqes; /* Completion queue. */
162 	volatile uint32_t *qp_db; /* Work queue doorbell. */
163 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
164 	uint16_t port_id; /* Port ID of device. */
165 	uint16_t idx; /* Queue index. */
166 	uint64_t rt_timemask; /* Scheduling timestamp mask. */
167 	uint64_t ts_mask; /* Timestamp flag dynamic mask. */
168 	uint64_t ts_last; /* Last scheduled timestamp. */
169 	int32_t ts_offset; /* Timestamp field dynamic offset. */
170 	struct mlx5_dev_ctx_shared *sh; /* Shared context. */
171 	struct mlx5_txq_stats stats; /* TX queue counters. */
172 	struct mlx5_txq_stats stats_reset; /* stats on last reset. */
173 	struct mlx5_uar_data uar_data;
174 	struct rte_mbuf *elts[];
175 	/* Storage for queued packets, must be the last field. */
176 };
177 
178 /* TX queue control descriptor. */
179 __extension__
180 struct mlx5_txq_ctrl {
181 	LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */
182 	RTE_ATOMIC(uint32_t) refcnt; /* Reference counter. */
183 	unsigned int socket; /* CPU socket ID for allocations. */
184 	bool is_hairpin; /* Whether TxQ type is Hairpin. */
185 	unsigned int max_inline_data; /* Max inline data. */
186 	unsigned int max_tso_header; /* Max TSO header size. */
187 	struct mlx5_txq_obj *obj; /* Verbs/DevX queue object. */
188 	struct mlx5_priv *priv; /* Back pointer to private data. */
189 	off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
190 	uint16_t dump_file_n; /* Number of dump files. */
191 	struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */
192 	uint32_t hairpin_status; /* Hairpin binding status. */
193 	struct mlx5_txq_data txq; /* Data path structure. */
194 	/* Must be the last field in the structure, contains elts[]. */
195 };
196 
197 /* mlx5_txq.c */
198 
199 int mlx5_tx_queue_start(struct rte_eth_dev *dev, uint16_t queue_id);
200 int mlx5_tx_queue_stop(struct rte_eth_dev *dev, uint16_t queue_id);
201 int mlx5_tx_queue_start_primary(struct rte_eth_dev *dev, uint16_t queue_id);
202 int mlx5_tx_queue_stop_primary(struct rte_eth_dev *dev, uint16_t queue_id);
203 int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
204 			unsigned int socket, const struct rte_eth_txconf *conf);
205 int mlx5_tx_hairpin_queue_setup
206 	(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
207 	 const struct rte_eth_hairpin_conf *hairpin_conf);
208 void mlx5_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid);
209 int mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd);
210 void mlx5_tx_uar_uninit_secondary(struct rte_eth_dev *dev);
211 int mlx5_txq_obj_verify(struct rte_eth_dev *dev);
212 struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
213 				   uint16_t desc, unsigned int socket,
214 				   const struct rte_eth_txconf *conf);
215 struct mlx5_txq_ctrl *mlx5_txq_hairpin_new
216 	(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
217 	 const struct rte_eth_hairpin_conf *hairpin_conf);
218 struct mlx5_txq_ctrl *mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx);
219 int mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx);
220 int mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx);
221 int mlx5_txq_verify(struct rte_eth_dev *dev);
222 int mlx5_txq_get_sqn(struct mlx5_txq_ctrl *txq);
223 void txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl);
224 void txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl);
225 uint64_t mlx5_get_tx_port_offloads(struct rte_eth_dev *dev);
226 void mlx5_txq_dynf_timestamp_set(struct rte_eth_dev *dev);
227 int mlx5_count_aggr_ports(struct rte_eth_dev *dev);
228 int mlx5_map_aggr_tx_affinity(struct rte_eth_dev *dev, uint16_t tx_queue_id,
229 			      uint8_t affinity);
230 int mlx5_ext_txq_verify(struct rte_eth_dev *dev);
231 struct mlx5_external_q *mlx5_ext_txq_get(struct rte_eth_dev *dev, uint16_t idx);
232 
233 /* mlx5_tx.c */
234 
235 void mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq,
236 			       unsigned int olx __rte_unused);
237 int mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset);
238 void mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
239 		       struct rte_eth_txq_info *qinfo);
240 int mlx5_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
241 			   struct rte_eth_burst_mode *mode);
242 
243 /* mlx5_tx_empw.c */
244 
245 MLX5_TXOFF_PRE_DECL(full_empw);
246 MLX5_TXOFF_PRE_DECL(none_empw);
247 MLX5_TXOFF_PRE_DECL(md_empw);
248 MLX5_TXOFF_PRE_DECL(mt_empw);
249 MLX5_TXOFF_PRE_DECL(mtsc_empw);
250 MLX5_TXOFF_PRE_DECL(mti_empw);
251 MLX5_TXOFF_PRE_DECL(mtv_empw);
252 MLX5_TXOFF_PRE_DECL(mtiv_empw);
253 MLX5_TXOFF_PRE_DECL(sc_empw);
254 MLX5_TXOFF_PRE_DECL(sci_empw);
255 MLX5_TXOFF_PRE_DECL(scv_empw);
256 MLX5_TXOFF_PRE_DECL(sciv_empw);
257 MLX5_TXOFF_PRE_DECL(i_empw);
258 MLX5_TXOFF_PRE_DECL(v_empw);
259 MLX5_TXOFF_PRE_DECL(iv_empw);
260 
261 /* mlx5_tx_nompw.c */
262 
263 MLX5_TXOFF_PRE_DECL(full);
264 MLX5_TXOFF_PRE_DECL(none);
265 MLX5_TXOFF_PRE_DECL(md);
266 MLX5_TXOFF_PRE_DECL(mt);
267 MLX5_TXOFF_PRE_DECL(mtsc);
268 MLX5_TXOFF_PRE_DECL(mti);
269 MLX5_TXOFF_PRE_DECL(mtv);
270 MLX5_TXOFF_PRE_DECL(mtiv);
271 MLX5_TXOFF_PRE_DECL(sc);
272 MLX5_TXOFF_PRE_DECL(sci);
273 MLX5_TXOFF_PRE_DECL(scv);
274 MLX5_TXOFF_PRE_DECL(sciv);
275 MLX5_TXOFF_PRE_DECL(i);
276 MLX5_TXOFF_PRE_DECL(v);
277 MLX5_TXOFF_PRE_DECL(iv);
278 
279 /* mlx5_tx_txpp.c */
280 
281 MLX5_TXOFF_PRE_DECL(full_ts_nompw);
282 MLX5_TXOFF_PRE_DECL(full_ts_nompwi);
283 MLX5_TXOFF_PRE_DECL(full_ts);
284 MLX5_TXOFF_PRE_DECL(full_ts_noi);
285 MLX5_TXOFF_PRE_DECL(none_ts);
286 MLX5_TXOFF_PRE_DECL(mdi_ts);
287 MLX5_TXOFF_PRE_DECL(mti_ts);
288 MLX5_TXOFF_PRE_DECL(mtiv_ts);
289 
290 /* mlx5_tx_mpw.c */
291 
292 MLX5_TXOFF_PRE_DECL(none_mpw);
293 MLX5_TXOFF_PRE_DECL(mci_mpw);
294 MLX5_TXOFF_PRE_DECL(mc_mpw);
295 MLX5_TXOFF_PRE_DECL(i_mpw);
296 
297 static __rte_always_inline struct mlx5_uar_data *
298 mlx5_tx_bfreg(struct mlx5_txq_data *txq)
299 {
300 	return &MLX5_PROC_PRIV(txq->port_id)->uar_table[txq->idx];
301 }
302 
303 /**
304  * Ring TX queue doorbell and flush the update by write memory barrier.
305  *
306  * @param txq
307  *   Pointer to TX queue structure.
308  * @param wqe
309  *   Pointer to the last WQE posted in the NIC.
310  */
311 static __rte_always_inline void
312 mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
313 {
314 	mlx5_doorbell_ring(mlx5_tx_bfreg(txq), *(volatile uint64_t *)wqe,
315 			   txq->wqe_ci, txq->qp_db, 1);
316 }
317 
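/*
 * Illustrative sketch (editor's addition, not part of the original header):
 * a burst routine is expected to build its WQEs, advance txq->wqe_ci by the
 * number of consumed WQEBBs and then ring the doorbell once for the batch,
 * roughly (loc and olx come from the burst routine context):
 *
 *	struct mlx5_wqe *wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
 *
 *	mlx5_tx_cseg_init(txq, &loc, wqe, 3, MLX5_OPCODE_SEND, olx);
 *	// ... fill the Ethernet and Data Segments ...
 *	txq->wqe_ci += 1; // one WQEBB consumed in this sketch
 *	mlx5_tx_dbrec(txq, wqe);
 *
 * mlx5_tx_dbrec() writes txq->wqe_ci to the qp_db doorbell record and the
 * first 8 bytes of the last WQE to the UAR register.
 */
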
318 /**
319  * Convert timestamp from mbuf format to linear counter
320  * of Clock Queue completions (24 bits).
321  *
322  * @param sh
323  *   Pointer to the device shared context to fetch Tx
324  *   packet pacing timestamp and parameters.
325  * @param ts
326  *   Timestamp from mbuf to convert.
327  * @return
328  *   positive or zero value - completion ID to wait for.
329  *   negative value - conversion error.
330  */
331 static __rte_always_inline int32_t
332 mlx5_txpp_convert_tx_ts(struct mlx5_dev_ctx_shared *sh, uint64_t mts)
333 {
334 	uint64_t ts, ci;
335 	uint32_t tick;
336 
337 	do {
338 		/*
339 		 * Atomically read the two uint64_t fields and compare the LSBs.
340 		 * If there is no match, the timestamp was updated by
341 		 * the service thread and the data should be re-read.
342 		 */
343 		rte_compiler_barrier();
344 		ci = rte_atomic_load_explicit(&sh->txpp.ts.ci_ts, rte_memory_order_relaxed);
345 		ts = rte_atomic_load_explicit(&sh->txpp.ts.ts, rte_memory_order_relaxed);
346 		rte_compiler_barrier();
347 		if (!((ts ^ ci) << (64 - MLX5_CQ_INDEX_WIDTH)))
348 			break;
349 	} while (true);
350 	/* Perform the skew correction, positive value to send earlier. */
351 	mts -= sh->txpp.skew;
352 	mts -= ts;
353 	if (unlikely(mts >= UINT64_MAX / 2)) {
354 		/* The difference is negative, mts is in the past. */
355 		rte_atomic_fetch_add_explicit(&sh->txpp.err_ts_past,
356 				   1, rte_memory_order_relaxed);
357 		return -1;
358 	}
359 	tick = sh->txpp.tick;
360 	MLX5_ASSERT(tick);
361 	/* Convert delta to completions, round up. */
362 	mts = (mts + tick - 1) / tick;
363 	if (unlikely(mts >= (1 << MLX5_CQ_INDEX_WIDTH) / 2 - 1)) {
364 		/* mts is too far in the future. */
365 		rte_atomic_fetch_add_explicit(&sh->txpp.err_ts_future,
366 				   1, rte_memory_order_relaxed);
367 		return -1;
368 	}
369 	mts <<= 64 - MLX5_CQ_INDEX_WIDTH;
370 	ci += mts;
371 	ci >>= 64 - MLX5_CQ_INDEX_WIDTH;
372 	return ci;
373 }
374 
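/*
 * Worked example (editor's illustration with assumed numbers): suppose the
 * last Clock Queue timestamp ts is 1,000,000, txq->sh->txpp.tick is 1,000
 * and skew is 0. For a requested mbuf timestamp mts = 1,002,500 the delta
 * is 2,500, which rounds up to ceil(2500 / 1000) = 3 completions. The
 * routine then advances the current Clock Queue completion index by 3
 * (modulo 2^MLX5_CQ_INDEX_WIDTH) and returns it as the completion ID the
 * wait segment should refer to.
 */
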
375 /**
376  * Set Software Parser flags and offsets in Ethernet Segment of WQE.
377  * Flags must be preliminary initialized to zero.
378  *
379  * @param loc
380  *   Pointer to burst routine local context.
381  * @param swp_flags
382  *   Pointer to store Software Parser flags.
383  * @param olx
384  *   Configured Tx offloads mask. It is fully defined at
385  *   compile time and may be used for optimization.
386  *
387  * @return
388  *   Software Parser offsets packed in dword.
389  *   Software Parser flags are set by pointer.
390  */
391 static __rte_always_inline uint32_t
392 txq_mbuf_to_swp(struct mlx5_txq_local *__rte_restrict loc,
393 		uint8_t *swp_flags,
394 		unsigned int olx)
395 {
396 	uint64_t ol, tunnel;
397 	unsigned int idx, off;
398 	uint32_t set;
399 
400 	if (!MLX5_TXOFF_CONFIG(SWP))
401 		return 0;
402 	ol = loc->mbuf->ol_flags;
403 	tunnel = ol & RTE_MBUF_F_TX_TUNNEL_MASK;
404 	/*
405 	 * Check whether Software Parser is required.
406 	 * Only customized tunnels may ask for.
407 	 */
408 	if (likely(tunnel != RTE_MBUF_F_TX_TUNNEL_UDP && tunnel != RTE_MBUF_F_TX_TUNNEL_IP))
409 		return 0;
410 	/*
411 	 * The index should have:
412 	 * bit[0:1] = RTE_MBUF_F_TX_L4_MASK
413 	 * bit[4] = RTE_MBUF_F_TX_IPV6
414 	 * bit[8] = RTE_MBUF_F_TX_OUTER_IPV6
415 	 * bit[9] = outer UDP tunnel (RTE_MBUF_F_TX_TUNNEL_UDP)
416 	 */
417 	idx = (ol & (RTE_MBUF_F_TX_L4_MASK | RTE_MBUF_F_TX_IPV6 | RTE_MBUF_F_TX_OUTER_IPV6)) >> 52;
418 	idx |= (tunnel == RTE_MBUF_F_TX_TUNNEL_UDP) ? (1 << 9) : 0;
419 	*swp_flags = mlx5_swp_types_table[idx];
420 	/*
421 	 * Set offsets for SW parser. Since ConnectX-5, SW parser just
422 	 * complements HW parser. SW parser starts to engage only if HW parser
423 	 * can't reach a header. For the older devices, HW parser will not kick
424 	 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
425 	 * should be set regardless of HW offload.
426 	 */
427 	off = loc->mbuf->outer_l2_len;
428 	if (MLX5_TXOFF_CONFIG(VLAN) && ol & RTE_MBUF_F_TX_VLAN)
429 		off += sizeof(struct rte_vlan_hdr);
430 	set = (off >> 1) << 8; /* Outer L3 offset. */
431 	off += loc->mbuf->outer_l3_len;
432 	if (tunnel == RTE_MBUF_F_TX_TUNNEL_UDP)
433 		set |= off >> 1; /* Outer L4 offset. */
434 	if (ol & (RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IPV6)) { /* Inner IP. */
435 		const uint64_t csum = ol & RTE_MBUF_F_TX_L4_MASK;
436 		off += loc->mbuf->l2_len;
437 		set |= (off >> 1) << 24; /* Inner L3 offset. */
438 		if (csum == RTE_MBUF_F_TX_TCP_CKSUM ||
439 		    csum == RTE_MBUF_F_TX_UDP_CKSUM ||
440 		    (MLX5_TXOFF_CONFIG(TSO) && ol & RTE_MBUF_F_TX_TCP_SEG)) {
441 			off += loc->mbuf->l3_len;
442 			set |= (off >> 1) << 16; /* Inner L4 offset. */
443 		}
444 	}
445 	set = rte_cpu_to_le_32(set);
446 	return set;
447 }
448 
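/*
 * Worked example (editor's illustration with assumed header lengths): for a
 * custom UDP tunnel (RTE_MBUF_F_TX_TUNNEL_UDP) packet without VLAN, with
 * outer_l2_len = 14, outer_l3_len = 20, l2_len = 16, l3_len = 20 and inner
 * TCP checksum requested, the packed offsets (in 2-byte units) become:
 *   outer L3 = 14/2 = 7  -> bits  8..15
 *   outer L4 = 34/2 = 17 -> bits  0..7
 *   inner L3 = 50/2 = 25 -> bits 24..31
 *   inner L4 = 70/2 = 35 -> bits 16..23
 * while the SWP flags are taken from mlx5_swp_types_table[] by the index
 * built from the L4/IPv6/outer-IPv6/UDP-tunnel bits.
 */
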
449 /**
450  * Convert the Checksum offloads to Verbs.
451  *
452  * @param buf
453  *   Pointer to the mbuf.
454  *
455  * @return
456  *   Converted checksum flags.
457  */
458 static __rte_always_inline uint8_t
459 txq_ol_cksum_to_cs(struct rte_mbuf *buf)
460 {
461 	uint32_t idx;
462 	uint8_t is_tunnel = !!(buf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK);
463 	const uint64_t ol_flags_mask = RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_L4_MASK |
464 				       RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_OUTER_IP_CKSUM;
465 
466 	/*
467 	 * The index should have:
468 	 * bit[0] = RTE_MBUF_F_TX_TCP_SEG
469 	 * bit[2:3] = RTE_MBUF_F_TX_UDP_CKSUM, RTE_MBUF_F_TX_TCP_CKSUM
470 	 * bit[4] = RTE_MBUF_F_TX_IP_CKSUM
471 	 * bit[8] = RTE_MBUF_F_TX_OUTER_IP_CKSUM
472 	 * bit[9] = tunnel
473 	 */
474 	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
475 	return mlx5_cksum_table[idx];
476 }
477 
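/*
 * Worked example (editor's illustration): a plain, non-tunneled packet with
 * RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_TCP_CKSUM requested yields
 * idx = (1 << 4) | (1 << 2) = 0x14 after the shift by 50 (see the bit map
 * above), so the returned checksum flags are mlx5_cksum_table[0x14].
 */
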
478 /**
479  * Free the mbufs from the linear array of pointers.
480  *
481  * @param txq
482  *   Pointer to Tx queue structure.
483  * @param pkts
484  *   Pointer to array of packets to be freed.
485  * @param pkts_n
486  *   Number of packets to be freed.
487  * @param olx
488  *   Configured Tx offloads mask. It is fully defined at
489  *   compile time and may be used for optimization.
490  */
491 static __rte_always_inline void
492 mlx5_tx_free_mbuf(struct mlx5_txq_data *__rte_restrict txq,
493 		  struct rte_mbuf **__rte_restrict pkts,
494 		  unsigned int pkts_n,
495 		  unsigned int olx __rte_unused)
496 {
497 	struct rte_mempool *pool = NULL;
498 	struct rte_mbuf **p_free = NULL;
499 	struct rte_mbuf *mbuf;
500 	unsigned int n_free = 0;
501 
502 	/*
503 	 * The implemented algorithm eliminates
504 	 * copying pointers to temporary array
505 	 * for rte_mempool_put_bulk() calls.
506 	 */
507 	MLX5_ASSERT(pkts);
508 	MLX5_ASSERT(pkts_n);
509 	/*
510 	 * Free mbufs directly to the pool in bulk
511 	 * if fast free offload is engaged
512 	 */
513 	if (!MLX5_TXOFF_CONFIG(MULTI) && txq->fast_free) {
514 		mbuf = *pkts;
515 		pool = mbuf->pool;
516 		rte_mempool_put_bulk(pool, (void *)pkts, pkts_n);
517 		return;
518 	}
519 	for (;;) {
520 		for (;;) {
521 			/*
522 			 * Decrement mbuf reference counter, detach
523 			 * indirect and external buffers if needed.
524 			 */
525 			mbuf = rte_pktmbuf_prefree_seg(*pkts);
526 			if (likely(mbuf != NULL)) {
527 				MLX5_ASSERT(mbuf == *pkts);
528 				if (likely(n_free != 0)) {
529 					if (unlikely(pool != mbuf->pool))
530 						/* From different pool. */
531 						break;
532 				} else {
533 					/* Start new scan array. */
534 					pool = mbuf->pool;
535 					p_free = pkts;
536 				}
537 				++n_free;
538 				++pkts;
539 				--pkts_n;
540 				if (unlikely(pkts_n == 0)) {
541 					mbuf = NULL;
542 					break;
543 				}
544 			} else {
545 				/*
546 				 * This happens if mbuf is still referenced.
547 				 * We can't put it back to the pool, skip.
548 				 */
549 				++pkts;
550 				--pkts_n;
551 				if (unlikely(n_free != 0))
552 				/* There is some array to free. */
553 					break;
554 				if (unlikely(pkts_n == 0))
555 					/* Last mbuf, nothing to free. */
556 					return;
557 			}
558 		}
559 		for (;;) {
560 			/*
561 			 * This loop is implemented to avoid multiple
562 			 * inlining of rte_mempool_put_bulk().
563 			 */
564 			MLX5_ASSERT(pool);
565 			MLX5_ASSERT(p_free);
566 			MLX5_ASSERT(n_free);
567 			/*
568 			 * Free the array of pre-freed mbufs
569 			 * belonging to the same memory pool.
570 			 */
571 			rte_mempool_put_bulk(pool, (void *)p_free, n_free);
572 			if (unlikely(mbuf != NULL)) {
573 				/* There is a request to start a new scan. */
574 				pool = mbuf->pool;
575 				p_free = pkts++;
576 				n_free = 1;
577 				--pkts_n;
578 				if (likely(pkts_n != 0))
579 					break;
580 				/*
581 				 * This is the last mbuf to be freed.
582 				 * Do one more loop iteration to complete.
583 				 * This is the rare case of the last unique mbuf.
584 				 */
585 				mbuf = NULL;
586 				continue;
587 			}
588 			if (likely(pkts_n == 0))
589 				return;
590 			n_free = 0;
591 			break;
592 		}
593 	}
594 }
595 
596 /**
597  * Non-inlined version of the mbuf free routine, intended for the
598  * call on tx_burst completion.
599  */
600 static __rte_noinline void
601 __mlx5_tx_free_mbuf(struct mlx5_txq_data *__rte_restrict txq,
602 		    struct rte_mbuf **__rte_restrict pkts,
603 		    unsigned int pkts_n,
604 		    unsigned int olx __rte_unused)
605 {
606 	mlx5_tx_free_mbuf(txq, pkts, pkts_n, olx);
607 }
608 
609 /**
610  * Free the mbufs from the elts ring buffer up to the new tail.
611  *
612  * @param txq
613  *   Pointer to Tx queue structure.
614  * @param tail
615  *   Index in elts to free up to, becomes new elts tail.
616  * @param olx
617  *   Configured Tx offloads mask. It is fully defined at
618  *   compile time and may be used for optimization.
619  */
620 static __rte_always_inline void
621 mlx5_tx_free_elts(struct mlx5_txq_data *__rte_restrict txq,
622 		  uint16_t tail,
623 		  unsigned int olx __rte_unused)
624 {
625 	uint16_t n_elts = tail - txq->elts_tail;
626 
627 	MLX5_ASSERT(n_elts);
628 	MLX5_ASSERT(n_elts <= txq->elts_s);
629 	/*
630 	 * Implement a loop to support ring buffer wraparound
631 	 * with single inlining of mlx5_tx_free_mbuf().
632 	 */
633 	do {
634 		unsigned int part;
635 
636 		part = txq->elts_s - (txq->elts_tail & txq->elts_m);
637 		part = RTE_MIN(part, n_elts);
638 		MLX5_ASSERT(part);
639 		MLX5_ASSERT(part <= txq->elts_s);
640 		mlx5_tx_free_mbuf(txq,
641 				  &txq->elts[txq->elts_tail & txq->elts_m],
642 				  part, olx);
643 		txq->elts_tail += part;
644 		n_elts -= part;
645 	} while (n_elts);
646 }
647 
648 /**
649  * Store the mbufs being sent into the elts ring buffer.
650  * On Tx completion these mbufs will be freed.
651  *
652  * @param txq
653  *   Pointer to Tx queue structure.
654  * @param pkts
655  *   Pointer to array of packets to be stored.
656  * @param pkts_n
657  *   Number of packets to be stored.
658  * @param olx
659  *   Configured Tx offloads mask. It is fully defined at
660  *   compile time and may be used for optimization.
661  */
662 static __rte_always_inline void
663 mlx5_tx_copy_elts(struct mlx5_txq_data *__rte_restrict txq,
664 		  struct rte_mbuf **__rte_restrict pkts,
665 		  unsigned int pkts_n,
666 		  unsigned int olx __rte_unused)
667 {
668 	unsigned int part;
669 	struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts;
670 
671 	MLX5_ASSERT(pkts);
672 	MLX5_ASSERT(pkts_n);
673 	part = txq->elts_s - (txq->elts_head & txq->elts_m);
674 	MLX5_ASSERT(part);
675 	MLX5_ASSERT(part <= txq->elts_s);
676 	/* This code is a good candidate for vectorizing with SIMD. */
677 	rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)),
678 		   (void *)pkts,
679 		   RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *));
680 	txq->elts_head += pkts_n;
681 	if (unlikely(part < pkts_n))
682 		/* The copy is wrapping around the elts array. */
683 		rte_memcpy((void *)elts, (void *)(pkts + part),
684 			   (pkts_n - part) * sizeof(struct rte_mbuf *));
685 }
686 
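/*
 * Worked example (editor's illustration): with elts_s = 256 (elts_m = 255),
 * elts_head = 250 and pkts_n = 10, the first part = 6 pointers are copied to
 * elts[250..255] and the remaining 4 wrap around to elts[0..3]; elts_head
 * becomes 260, so the next free slot is elts[260 & 255] = elts[4].
 */
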
687 /**
688  * Check if the completion request flag should be set in the last WQE.
689  * Both pushed mbufs and WQEs are monitored and the completion request
690  * flag is set if any of the thresholds is reached.
691  *
692  * @param txq
693  *   Pointer to TX queue structure.
694  * @param loc
695  *   Pointer to burst routine local context.
696  * @param olx
697  *   Configured Tx offloads mask. It is fully defined at
698  *   compile time and may be used for optimization.
699  */
700 static __rte_always_inline void
701 mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
702 			   struct mlx5_txq_local *__rte_restrict loc,
703 			   unsigned int olx)
704 {
705 	uint16_t head = txq->elts_head;
706 	unsigned int part;
707 
708 	part = MLX5_TXOFF_CONFIG(INLINE) ?
709 	       0 : loc->pkts_sent - loc->pkts_copy;
710 	head += part;
711 	if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH ||
712 	     (MLX5_TXOFF_CONFIG(INLINE) &&
713 	     (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) {
714 		volatile struct mlx5_wqe *last = loc->wqe_last;
715 
716 		MLX5_ASSERT(last);
717 		txq->elts_comp = head;
718 		if (MLX5_TXOFF_CONFIG(INLINE))
719 			txq->wqe_comp = txq->wqe_ci;
720 		/* Request unconditional completion on last WQE. */
721 		last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
722 					    MLX5_COMP_MODE_OFFSET);
723 		/* Save elts_head in dedicated free on completion queue. */
724 #ifdef RTE_LIBRTE_MLX5_DEBUG
725 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
726 			  (last->cseg.opcode >> 8) << 16;
727 #else
728 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head;
729 #endif
730 		/* A CQE slot must always be available. */
731 		MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s);
732 	}
733 }
734 
735 /**
736  * Set completion request flag for all issued WQEs.
737  * This routine is intended to be used with fast path tracing enabled
738  * and send scheduling on time, to provide a detailed trace report
739  * for send completions on every WQE.
740  *
741  * @param txq
742  *   Pointer to TX queue structure.
743  * @param loc
744  *   Pointer to burst routine local context.
745  * @param olx
746  *   Configured Tx offloads mask. It is fully defined at
747  *   compile time and may be used for optimization.
748  */
749 static __rte_always_inline void
750 mlx5_tx_request_completion_trace(struct mlx5_txq_data *__rte_restrict txq,
751 				 struct mlx5_txq_local *__rte_restrict loc,
752 				 unsigned int olx)
753 {
754 	uint16_t head = txq->elts_comp;
755 
756 	while (txq->wqe_comp != txq->wqe_ci) {
757 		volatile struct mlx5_wqe *wqe;
758 		uint32_t wqe_n;
759 
760 		MLX5_ASSERT(loc->wqe_last);
761 		wqe = txq->wqes + (txq->wqe_comp & txq->wqe_m);
762 		if (wqe == loc->wqe_last) {
763 			head = txq->elts_head;
764 			head +=	MLX5_TXOFF_CONFIG(INLINE) ?
765 				0 : loc->pkts_sent - loc->pkts_copy;
766 			txq->elts_comp = head;
767 		}
768 		/* Completion request flag was set when constructing the cseg. */
769 #ifdef RTE_LIBRTE_MLX5_DEBUG
770 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
771 			  (wqe->cseg.opcode >> 8) << 16;
772 #else
773 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head;
774 #endif
775 		/* A CQE slot must always be available. */
776 		MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s);
777 		/* Advance to the next WQE in the queue. */
778 		wqe_n = rte_be_to_cpu_32(wqe->cseg.sq_ds) & 0x3F;
779 		txq->wqe_comp += RTE_ALIGN(wqe_n, 4) / 4;
780 	}
781 }
782 
783 /**
784  * Build the Control Segment with specified opcode:
785  * - MLX5_OPCODE_SEND
786  * - MLX5_OPCODE_ENHANCED_MPSW
787  * - MLX5_OPCODE_TSO
788  *
789  * @param txq
790  *   Pointer to TX queue structure.
791  * @param loc
792  *   Pointer to burst routine local context.
793  * @param wqe
794  *   Pointer to WQE to fill with built Control Segment.
795  * @param ds
796  *   Supposed length of WQE in segments.
797  * @param opcode
798  *   SQ WQE opcode to put into Control Segment.
799  * @param olx
800  *   Configured Tx offloads mask. It is fully defined at
801  *   compile time and may be used for optimization.
802  */
803 static __rte_always_inline void
804 mlx5_tx_cseg_init(struct mlx5_txq_data *__rte_restrict txq,
805 		  struct mlx5_txq_local *__rte_restrict loc __rte_unused,
806 		  struct mlx5_wqe *__rte_restrict wqe,
807 		  unsigned int ds,
808 		  unsigned int opcode,
809 		  unsigned int olx)
810 {
811 	struct mlx5_wqe_cseg *__rte_restrict cs = &wqe->cseg;
812 
813 	/* For legacy MPW replace the EMPW by TSO with modifier. */
814 	if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW)
815 		opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24;
816 	cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode);
817 	cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
818 	if (MLX5_TXOFF_CONFIG(TXPP) && __rte_trace_point_fp_is_enabled())
819 		cs->flags = RTE_BE32(MLX5_COMP_ALWAYS <<
820 				     MLX5_COMP_MODE_OFFSET);
821 	else
822 		cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR <<
823 				     MLX5_COMP_MODE_OFFSET);
824 	cs->misc = RTE_BE32(0);
825 	if (__rte_trace_point_fp_is_enabled() && !loc->pkts_sent)
826 		rte_pmd_mlx5_trace_tx_entry(txq->port_id, txq->idx);
827 	rte_pmd_mlx5_trace_tx_wqe((txq->wqe_ci << 8) | opcode);
828 }
829 
830 /**
831  * Build the Synchronize Queue Segment with specified completion index.
832  *
833  * @param txq
834  *   Pointer to TX queue structure.
835  * @param loc
836  *   Pointer to burst routine local context.
837  * @param wqe
838  *   Pointer to WQE to fill with built Control Segment.
839  * @param wci
840  *   Completion index in Clock Queue to wait for.
841  * @param olx
842  *   Configured Tx offloads mask. It is fully defined at
843  *   compile time and may be used for optimization.
844  */
845 static __rte_always_inline void
846 mlx5_tx_qseg_init(struct mlx5_txq_data *restrict txq,
847 		  struct mlx5_txq_local *restrict loc __rte_unused,
848 		  struct mlx5_wqe *restrict wqe,
849 		  unsigned int wci,
850 		  unsigned int olx __rte_unused)
851 {
852 	struct mlx5_wqe_qseg *qs;
853 
854 	qs = RTE_PTR_ADD(wqe, MLX5_WSEG_SIZE);
855 	qs->max_index = rte_cpu_to_be_32(wci);
856 	qs->qpn_cqn = rte_cpu_to_be_32(txq->sh->txpp.clock_queue.cq_obj.cq->id);
857 	qs->reserved0 = RTE_BE32(0);
858 	qs->reserved1 = RTE_BE32(0);
859 }
860 
861 /**
862  * Build the Wait on Time Segment with specified timestamp value.
863  *
864  * @param txq
865  *   Pointer to TX queue structure.
866  * @param loc
867  *   Pointer to burst routine local context.
868  * @param wqe
869  *   Pointer to WQE to fill with built Control Segment.
870  * @param ts
871  *   Timestamp value to wait for.
872  * @param olx
873  *   Configured Tx offloads mask. It is fully defined at
874  *   compile time and may be used for optimization.
875  */
876 static __rte_always_inline void
877 mlx5_tx_wseg_init(struct mlx5_txq_data *restrict txq,
878 		  struct mlx5_txq_local *restrict loc __rte_unused,
879 		  struct mlx5_wqe *restrict wqe,
880 		  uint64_t ts,
881 		  unsigned int olx __rte_unused)
882 {
883 	struct mlx5_wqe_wseg *ws;
884 
885 	ws = RTE_PTR_ADD(wqe, MLX5_WSEG_SIZE);
886 	ws->operation = rte_cpu_to_be_32(MLX5_WAIT_COND_CYCLIC_SMALLER);
887 	ws->lkey = RTE_BE32(0);
888 	ws->va_high = RTE_BE32(0);
889 	ws->va_low = RTE_BE32(0);
890 	if (txq->rt_timestamp) {
891 		ts = ts % (uint64_t)NS_PER_S
892 		   | (ts / (uint64_t)NS_PER_S) << 32;
893 	}
894 	ws->value = rte_cpu_to_be_64(ts);
895 	ws->mask = txq->rt_timemask;
896 }
897 
898 /**
899  * Build the Ethernet Segment without inlined data.
900  * Supports Software Parser, Checksums and VLAN insertion Tx offload features.
901  *
902  * @param txq
903  *   Pointer to TX queue structure.
904  * @param loc
905  *   Pointer to burst routine local context.
906  * @param wqe
907  *   Pointer to WQE to fill with built Ethernet Segment.
908  * @param olx
909  *   Configured Tx offloads mask. It is fully defined at
910  *   compile time and may be used for optimization.
911  */
912 static __rte_always_inline void
913 mlx5_tx_eseg_none(struct mlx5_txq_data *__rte_restrict txq __rte_unused,
914 		  struct mlx5_txq_local *__rte_restrict loc,
915 		  struct mlx5_wqe *__rte_restrict wqe,
916 		  unsigned int olx)
917 {
918 	struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
919 	uint32_t csum;
920 
921 	/*
922 	 * Calculate and set check sum flags first, dword field
923 	 * in segment may be shared with Software Parser flags.
924 	 */
925 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
926 	es->flags = rte_cpu_to_le_32(csum);
927 	/*
928 	 * Calculate and set Software Parser offsets and flags.
929 	 * These flags are set for custom UDP and IP tunnel packets.
930 	 */
931 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
932 	/* Fill metadata field if needed. */
933 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
934 		       loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
935 		       rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
936 		       0 : 0;
937 	/* Engage VLAN tag insertion feature if requested. */
938 	if (MLX5_TXOFF_CONFIG(VLAN) &&
939 	    loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
940 		/*
941 		 * We should get here only if the device supports
942 		 * this feature correctly.
943 		 */
944 		MLX5_ASSERT(txq->vlan_en);
945 		es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
946 						  loc->mbuf->vlan_tci);
947 	} else {
948 		es->inline_hdr = RTE_BE32(0);
949 	}
950 }
951 
952 /**
953  * Build the Ethernet Segment with minimal inlined data
954  * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is
955  * used to fill the gap in single WQEBB WQEs.
956  * Supports Software Parser, Checksums and VLAN
957  * insertion Tx offload features.
958  *
959  * @param txq
960  *   Pointer to TX queue structure.
961  * @param loc
962  *   Pointer to burst routine local context.
963  * @param wqe
964  *   Pointer to WQE to fill with built Ethernet Segment.
965  * @param vlan
966  *   Length of VLAN tag insertion if any.
967  * @param olx
968  *   Configured Tx offloads mask. It is fully defined at
969  *   compile time and may be used for optimization.
970  */
971 static __rte_always_inline void
972 mlx5_tx_eseg_dmin(struct mlx5_txq_data *__rte_restrict txq __rte_unused,
973 		  struct mlx5_txq_local *__rte_restrict loc,
974 		  struct mlx5_wqe *__rte_restrict wqe,
975 		  unsigned int vlan,
976 		  unsigned int olx)
977 {
978 	struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
979 	uint32_t csum;
980 	uint8_t *psrc, *pdst;
981 
982 	/*
983 	 * Calculate and set check sum flags first, dword field
984 	 * in segment may be shared with Software Parser flags.
985 	 */
986 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
987 	es->flags = rte_cpu_to_le_32(csum);
988 	/*
989 	 * Calculate and set Software Parser offsets and flags.
990 	 * These flags are set for custom UDP and IP tunnel packets.
991 	 */
992 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
993 	/* Fill metadata field if needed. */
994 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
995 		       loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
996 		       rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
997 		       0 : 0;
998 	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
999 	es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
1000 	es->inline_data = *(unaligned_uint16_t *)psrc;
1001 	psrc +=	sizeof(uint16_t);
1002 	pdst = (uint8_t *)(es + 1);
1003 	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
1004 		/* Implement VLAN tag insertion as part of inline data. */
1005 		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
1006 		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
1007 		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
1008 		/* Insert VLAN ethertype + VLAN tag. */
1009 		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
1010 						((RTE_ETHER_TYPE_VLAN << 16) |
1011 						 loc->mbuf->vlan_tci);
1012 		pdst += sizeof(struct rte_vlan_hdr);
1013 		/* Copy the remaining two bytes from the packet data. */
1014 		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
1015 		*(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
1016 	} else {
1017 		/* Fill the gap in the title WQEBB with inline data. */
1018 		rte_mov16(pdst, psrc);
1019 	}
1020 }
1021 
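/*
 * Layout example (editor's illustration, assuming MLX5_ESEG_MIN_INLINE_SIZE
 * is 18 bytes): without VLAN insertion the first 2 bytes go into
 * es->inline_data and the following 16 bytes are copied verbatim
 * (rte_mov16). With VLAN insertion the 18 inlined bytes are built as
 * 2 + 10 bytes of the original MAC addresses, a 4-byte 802.1Q header
 * (RTE_ETHER_TYPE_VLAN + vlan_tci) and the 2-byte original EtherType, so
 * the inserted tag consumes part of the inlined area.
 */
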
1022 /**
1023  * Build the Ethernet Segment with entire packet data inlining. Checks the
1024  * boundary of WQEBB and ring buffer wrapping, supports Software Parser,
1025  * Checksums and VLAN insertion Tx offload features.
1026  *
1027  * @param txq
1028  *   Pointer to TX queue structure.
1029  * @param loc
1030  *   Pointer to burst routine local context.
1031  * @param wqe
1032  *   Pointer to WQE to fill with built Ethernet Segment.
1033  * @param vlan
1034  *   Length of VLAN tag insertion if any.
1035  * @param inlen
1036  *   Length of data to inline (VLAN included, if any).
1037  * @param tso
1038  *   TSO flag, set mss field from the packet.
1039  * @param olx
1040  *   Configured Tx offloads mask. It is fully defined at
1041  *   compile time and may be used for optimization.
1042  *
1043  * @return
1044  *   Pointer to the next Data Segment (aligned and wrapped around).
1045  */
1046 static __rte_always_inline struct mlx5_wqe_dseg *
1047 mlx5_tx_eseg_data(struct mlx5_txq_data *__rte_restrict txq,
1048 		  struct mlx5_txq_local *__rte_restrict loc,
1049 		  struct mlx5_wqe *__rte_restrict wqe,
1050 		  unsigned int vlan,
1051 		  unsigned int inlen,
1052 		  unsigned int tso,
1053 		  unsigned int olx)
1054 {
1055 	struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
1056 	uint32_t csum;
1057 	uint8_t *psrc, *pdst;
1058 	unsigned int part;
1059 
1060 	/*
1061 	 * Calculate and set check sum flags first, dword field
1062 	 * in segment may be shared with Software Parser flags.
1063 	 */
1064 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
1065 	if (tso) {
1066 		csum <<= 24;
1067 		csum |= loc->mbuf->tso_segsz;
1068 		es->flags = rte_cpu_to_be_32(csum);
1069 	} else {
1070 		es->flags = rte_cpu_to_le_32(csum);
1071 	}
1072 	/*
1073 	 * Calculate and set Software Parser offsets and flags.
1074 	 * These flags are set for custom UDP and IP tunnel packets.
1075 	 */
1076 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
1077 	/* Fill metadata field if needed. */
1078 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
1079 		       loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
1080 		       rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
1081 		       0 : 0;
1082 	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
1083 	es->inline_hdr_sz = rte_cpu_to_be_16(inlen);
1084 	es->inline_data = *(unaligned_uint16_t *)psrc;
1085 	psrc +=	sizeof(uint16_t);
1086 	pdst = (uint8_t *)(es + 1);
1087 	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
1088 		/* Implement VLAN tag insertion as part of inline data. */
1089 		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
1090 		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
1091 		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
1092 		/* Insert VLAN ethertype + VLAN tag. */
1093 		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
1094 						((RTE_ETHER_TYPE_VLAN << 16) |
1095 						 loc->mbuf->vlan_tci);
1096 		pdst += sizeof(struct rte_vlan_hdr);
1097 		/* Copy the remaining two bytes from the packet data. */
1098 		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
1099 		*(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
1100 		psrc += sizeof(uint16_t);
1101 	} else {
1102 		/* Fill the gap in the title WQEBB with inline data. */
1103 		rte_mov16(pdst, psrc);
1104 		psrc += sizeof(rte_v128u32_t);
1105 	}
1106 	pdst = (uint8_t *)(es + 2);
1107 	MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
1108 	MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
1109 	inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
1110 	if (!inlen) {
1111 		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
1112 		return (struct mlx5_wqe_dseg *)pdst;
1113 	}
1114 	/*
1115 	 * The WQEBB space availability is checked by caller.
1116 	 * Here we should be aware of WQE ring buffer wraparound only.
1117 	 */
1118 	part = (uint8_t *)txq->wqes_end - pdst;
1119 	part = RTE_MIN(part, inlen);
1120 	do {
1121 		rte_memcpy(pdst, psrc, part);
1122 		inlen -= part;
1123 		if (likely(!inlen)) {
1124 			/*
1125 			 * If return value is not used by the caller
1126 			 * the code below will be optimized out.
1127 			 */
1128 			pdst += part;
1129 			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1130 			if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
1131 				pdst = (uint8_t *)txq->wqes;
1132 			return (struct mlx5_wqe_dseg *)pdst;
1133 		}
1134 		pdst = (uint8_t *)txq->wqes;
1135 		psrc += part;
1136 		part = inlen;
1137 	} while (true);
1138 }
1139 
1140 /**
1141  * Copy data from a chain of mbufs to the specified linear buffer.
1142  * Used by the Ethernet Segment data inlining routines. If the data
1143  * from some mbuf is copied completely, this mbuf is freed. The local
1144  * structure is used to keep the byte stream state.
1145  *
1146  * @param pdst
1147  *   Pointer to the destination linear buffer.
1148  * @param loc
1149  *   Pointer to burst routine local context.
1150  * @param len
1151  *   Length of data to be copied.
1152  * @param must
1153  *   Length of data to be copied ignoring no inline hint.
1154  * @param olx
1155  *   Configured Tx offloads mask. It is fully defined at
1156  *   compile time and may be used for optimization.
1157  *
1158  * @return
1159  *   Number of actual copied data bytes. This is always greater than or
1160  *   equal to the must parameter and might be less than len if the no
1161  *   inline hint flag is encountered.
1162  */
1163 static __rte_always_inline unsigned int
1164 mlx5_tx_mseg_memcpy(uint8_t *pdst,
1165 		    struct mlx5_txq_local *__rte_restrict loc,
1166 		    unsigned int len,
1167 		    unsigned int must,
1168 		    unsigned int olx __rte_unused)
1169 {
1170 	struct rte_mbuf *mbuf;
1171 	unsigned int part, dlen, copy = 0;
1172 	uint8_t *psrc;
1173 
1174 	MLX5_ASSERT(len);
1175 	do {
1176 		/* Allow zero length packets, must check first. */
1177 		dlen = rte_pktmbuf_data_len(loc->mbuf);
1178 		if (dlen <= loc->mbuf_off) {
1179 			/* Exhausted packet, just free. */
1180 			mbuf = loc->mbuf;
1181 			loc->mbuf = mbuf->next;
1182 			rte_pktmbuf_free_seg(mbuf);
1183 			loc->mbuf_off = 0;
1184 			MLX5_ASSERT(loc->mbuf_nseg > 1);
1185 			MLX5_ASSERT(loc->mbuf);
1186 			--loc->mbuf_nseg;
1187 			if (loc->mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE) {
1188 				unsigned int diff;
1189 
1190 				if (copy >= must) {
1191 					/*
1192 					 * We already copied the minimal
1193 					 * requested amount of data.
1194 					 */
1195 					return copy;
1196 				}
1197 				diff = must - copy;
1198 				if (diff <= rte_pktmbuf_data_len(loc->mbuf)) {
1199 					/*
1200 					 * Copy only the minimal required
1201 					 * part of the data buffer. Limit amount
1202 					 * of data to be copied to the length of
1203 					 * available space.
1204 					 */
1205 					len = RTE_MIN(len, diff);
1206 				}
1207 			}
1208 			continue;
1209 		}
1210 		dlen -= loc->mbuf_off;
1211 		psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
1212 					       loc->mbuf_off);
1213 		part = RTE_MIN(len, dlen);
1214 		rte_memcpy(pdst, psrc, part);
1215 		copy += part;
1216 		loc->mbuf_off += part;
1217 		len -= part;
1218 		if (!len) {
1219 			if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) {
1220 				loc->mbuf_off = 0;
1221 				/* Exhausted packet, just free. */
1222 				mbuf = loc->mbuf;
1223 				loc->mbuf = mbuf->next;
1224 				rte_pktmbuf_free_seg(mbuf);
1225 				loc->mbuf_off = 0;
1226 				MLX5_ASSERT(loc->mbuf_nseg >= 1);
1227 				--loc->mbuf_nseg;
1228 			}
1229 			return copy;
1230 		}
1231 		pdst += part;
1232 	} while (true);
1233 }
1234 
1235 /**
1236  * Build the Ethernet Segment with inlined data from multi-segment packet.
1237  * Checks the boundary of WQEBB and ring buffer wrapping, supports Software
1238  * Parser, Checksums and VLAN insertion Tx offload features.
1239  *
1240  * @param txq
1241  *   Pointer to TX queue structure.
1242  * @param loc
1243  *   Pointer to burst routine local context.
1244  * @param wqe
1245  *   Pointer to WQE to fill with built Ethernet Segment.
1246  * @param vlan
1247  *   Length of VLAN tag insertion if any.
1248  * @param inlen
1249  *   Length of data to inline (VLAN included, if any).
1250  * @param tso
1251  *   TSO flag, set mss field from the packet.
1252  * @param olx
1253  *   Configured Tx offloads mask. It is fully defined at
1254  *   compile time and may be used for optimization.
1255  *
1256  * @return
1257  *   Pointer to the next Data Segment (aligned and possible NOT wrapped
1258  *   around - caller should do wrapping check on its own).
1259  */
1260 static __rte_always_inline struct mlx5_wqe_dseg *
1261 mlx5_tx_eseg_mdat(struct mlx5_txq_data *__rte_restrict txq,
1262 		  struct mlx5_txq_local *__rte_restrict loc,
1263 		  struct mlx5_wqe *__rte_restrict wqe,
1264 		  unsigned int vlan,
1265 		  unsigned int inlen,
1266 		  unsigned int tso,
1267 		  unsigned int olx)
1268 {
1269 	struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
1270 	uint32_t csum;
1271 	uint8_t *pdst;
1272 	unsigned int part, tlen = 0;
1273 
1274 	/*
1275 	 * Calculate and set check sum flags first, uint32_t field
1276 	 * in segment may be shared with Software Parser flags.
1277 	 */
1278 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
1279 	if (tso) {
1280 		csum <<= 24;
1281 		csum |= loc->mbuf->tso_segsz;
1282 		es->flags = rte_cpu_to_be_32(csum);
1283 	} else {
1284 		es->flags = rte_cpu_to_le_32(csum);
1285 	}
1286 	/*
1287 	 * Calculate and set Software Parser offsets and flags.
1288 	 * These flags are set for custom UDP and IP tunnel packets.
1289 	 */
1290 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
1291 	/* Fill metadata field if needed. */
1292 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
1293 		       loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
1294 		       rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
1295 		       0 : 0;
1296 	MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
1297 	pdst = (uint8_t *)&es->inline_data;
1298 	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
1299 		/* Implement VLAN tag insertion as part of inline data. */
1300 		mlx5_tx_mseg_memcpy(pdst, loc,
1301 				    2 * RTE_ETHER_ADDR_LEN,
1302 				    2 * RTE_ETHER_ADDR_LEN, olx);
1303 		pdst += 2 * RTE_ETHER_ADDR_LEN;
1304 		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
1305 						((RTE_ETHER_TYPE_VLAN << 16) |
1306 						 loc->mbuf->vlan_tci);
1307 		pdst += sizeof(struct rte_vlan_hdr);
1308 		tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr);
1309 	}
1310 	MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
1311 	/*
1312 	 * The WQEBB space availability is checked by caller.
1313 	 * Here we should be aware of WQE ring buffer wraparound only.
1314 	 */
1315 	part = (uint8_t *)txq->wqes_end - pdst;
1316 	part = RTE_MIN(part, inlen - tlen);
1317 	MLX5_ASSERT(part);
1318 	do {
1319 		unsigned int copy;
1320 
1321 		/*
1322 		 * Copying may be interrupted inside the routine
1323 		 * if it runs into the no-inline hint flag.
1324 		 */
1325 		copy = tso ? inlen : txq->inlen_mode;
1326 		copy = tlen >= copy ? 0 : (copy - tlen);
1327 		copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx);
1328 		tlen += copy;
1329 		if (likely(inlen <= tlen) || copy < part) {
1330 			es->inline_hdr_sz = rte_cpu_to_be_16(tlen);
1331 			pdst += copy;
1332 			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1333 			return (struct mlx5_wqe_dseg *)pdst;
1334 		}
1335 		pdst = (uint8_t *)txq->wqes;
1336 		part = inlen - tlen;
1337 	} while (true);
1338 }
1339 
1340 /**
1341  * Build the Data Segment of pointer type.
1342  *
1343  * @param txq
1344  *   Pointer to TX queue structure.
1345  * @param loc
1346  *   Pointer to burst routine local context.
1347  * @param dseg
1348  *   Pointer to WQE to fill with built Data Segment.
1349  * @param buf
1350  *   Data buffer to point.
1351  * @param len
1352  *   Data buffer length.
1353  * @param olx
1354  *   Configured Tx offloads mask. It is fully defined at
1355  *   compile time and may be used for optimization.
1356  */
1357 static __rte_always_inline void
1358 mlx5_tx_dseg_ptr(struct mlx5_txq_data *__rte_restrict txq,
1359 		 struct mlx5_txq_local *__rte_restrict loc,
1360 		 struct mlx5_wqe_dseg *__rte_restrict dseg,
1361 		 uint8_t *buf,
1362 		 unsigned int len,
1363 		 unsigned int olx __rte_unused)
1364 
1365 {
1366 	MLX5_ASSERT(len);
1367 	dseg->bcount = rte_cpu_to_be_32(len);
1368 	dseg->lkey = mlx5_mr_mb2mr(&txq->mr_ctrl, loc->mbuf);
1369 	dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
1370 }
1371 
1372 /**
1373  * Build the Data Segment of pointer type, or inline the data if its length
1374  * does not exceed the minimal Data Segment inline size.
1375  *
1376  * @param txq
1377  *   Pointer to TX queue structure.
1378  * @param loc
1379  *   Pointer to burst routine local context.
1380  * @param dseg
1381  *   Pointer to WQE to fill with built Data Segment.
1382  * @param buf
1383  *   Data buffer to point.
1384  * @param len
1385  *   Data buffer length.
1386  * @param olx
1387  *   Configured Tx offloads mask. It is fully defined at
1388  *   compile time and may be used for optimization.
1389  */
1390 static __rte_always_inline void
1391 mlx5_tx_dseg_iptr(struct mlx5_txq_data *__rte_restrict txq,
1392 		  struct mlx5_txq_local *__rte_restrict loc,
1393 		  struct mlx5_wqe_dseg *__rte_restrict dseg,
1394 		  uint8_t *buf,
1395 		  unsigned int len,
1396 		  unsigned int olx __rte_unused)
1397 
1398 {
1399 	uintptr_t dst, src;
1400 
1401 	MLX5_ASSERT(len);
1402 	if (len > MLX5_DSEG_MIN_INLINE_SIZE) {
1403 		dseg->bcount = rte_cpu_to_be_32(len);
1404 		dseg->lkey = mlx5_mr_mb2mr(&txq->mr_ctrl, loc->mbuf);
1405 		dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
1406 
1407 		return;
1408 	}
1409 	dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
1410 	/* Unrolled implementation of generic rte_memcpy. */
1411 	dst = (uintptr_t)&dseg->inline_data[0];
1412 	src = (uintptr_t)buf;
1413 	if (len & 0x08) {
1414 #ifdef RTE_ARCH_STRICT_ALIGN
1415 		MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t)));
1416 		*(uint32_t *)dst = *(unaligned_uint32_t *)src;
1417 		dst += sizeof(uint32_t);
1418 		src += sizeof(uint32_t);
1419 		*(uint32_t *)dst = *(unaligned_uint32_t *)src;
1420 		dst += sizeof(uint32_t);
1421 		src += sizeof(uint32_t);
1422 #else
1423 		*(uint64_t *)dst = *(unaligned_uint64_t *)src;
1424 		dst += sizeof(uint64_t);
1425 		src += sizeof(uint64_t);
1426 #endif
1427 	}
1428 	if (len & 0x04) {
1429 		*(uint32_t *)dst = *(unaligned_uint32_t *)src;
1430 		dst += sizeof(uint32_t);
1431 		src += sizeof(uint32_t);
1432 	}
1433 	if (len & 0x02) {
1434 		*(uint16_t *)dst = *(unaligned_uint16_t *)src;
1435 		dst += sizeof(uint16_t);
1436 		src += sizeof(uint16_t);
1437 	}
1438 	if (len & 0x01)
1439 		*(uint8_t *)dst = *(uint8_t *)src;
1440 }
1441 
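/*
 * Worked example (editor's illustration): with len = 11 the inline branch
 * copies 8 bytes (len & 0x08), then 2 bytes (len & 0x02), then 1 byte
 * (len & 0x01) into dseg->inline_data, and bcount carries the length ORed
 * with MLX5_ETH_WQE_DATA_INLINE. Lengths above MLX5_DSEG_MIN_INLINE_SIZE
 * take the pointer/lkey form instead.
 */
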
1442 /**
1443  * Build the Data Segment of inlined data from single
1444  * segment packet, no VLAN insertion.
1445  *
1446  * @param txq
1447  *   Pointer to TX queue structure.
1448  * @param loc
1449  *   Pointer to burst routine local context.
1450  * @param dseg
1451  *   Pointer to WQE to fill with built Data Segment.
1452  * @param buf
1453  *   Data buffer to point.
1454  * @param len
1455  *   Data buffer length.
1456  * @param olx
1457  *   Configured Tx offloads mask. It is fully defined at
1458  *   compile time and may be used for optimization.
1459  *
1460  * @return
1461  *   Pointer to the next Data Segment after inlined data.
1462  *   Ring buffer wraparound check is needed. We do not do it here because it
1463  *   may not be needed for the last packet in the eMPW session.
1464  */
1465 static __rte_always_inline struct mlx5_wqe_dseg *
1466 mlx5_tx_dseg_empw(struct mlx5_txq_data *__rte_restrict txq,
1467 		  struct mlx5_txq_local *__rte_restrict loc __rte_unused,
1468 		  struct mlx5_wqe_dseg *__rte_restrict dseg,
1469 		  uint8_t *buf,
1470 		  unsigned int len,
1471 		  unsigned int olx __rte_unused)
1472 {
1473 	unsigned int part;
1474 	uint8_t *pdst;
1475 
1476 	if (!MLX5_TXOFF_CONFIG(MPW)) {
1477 		/* Store the descriptor byte counter for eMPW sessions. */
1478 		dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
1479 		pdst = &dseg->inline_data[0];
1480 	} else {
1481 		/* The entire legacy MPW session counter is stored on close. */
1482 		pdst = (uint8_t *)dseg;
1483 	}
1484 	/*
1485 	 * The WQEBB space availability is checked by caller.
1486 	 * Here we should be aware of WQE ring buffer wraparound only.
1487 	 */
1488 	part = (uint8_t *)txq->wqes_end - pdst;
1489 	part = RTE_MIN(part, len);
1490 	do {
1491 		rte_memcpy(pdst, buf, part);
1492 		len -= part;
1493 		if (likely(!len)) {
1494 			pdst += part;
1495 			if (!MLX5_TXOFF_CONFIG(MPW))
1496 				pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1497 			/* Note: no final wraparound check here. */
1498 			return (struct mlx5_wqe_dseg *)pdst;
1499 		}
1500 		pdst = (uint8_t *)txq->wqes;
1501 		buf += part;
1502 		part = len;
1503 	} while (true);
1504 }
1505 
1506 /**
1507  * Build the Data Segment of inlined data from single
1508  * segment packet with VLAN insertion.
1509  *
1510  * @param txq
1511  *   Pointer to TX queue structure.
1512  * @param loc
1513  *   Pointer to burst routine local context.
1514  * @param dseg
1515  *   Pointer to the dseg to fill with the built Data Segment.
1516  * @param buf
1517  *   Data buffer to point.
1518  * @param len
1519  *   Data buffer length.
1520  * @param olx
1521  *   Configured Tx offloads mask. It is fully defined at
1522  *   compile time and may be used for optimization.
1523  *
1524  * @return
1525  *   Pointer to the next Data Segment after inlined data.
1526  *   Ring buffer wraparound check is needed.
1527  */
1528 static __rte_always_inline struct mlx5_wqe_dseg *
1529 mlx5_tx_dseg_vlan(struct mlx5_txq_data *__rte_restrict txq,
1530 		  struct mlx5_txq_local *__rte_restrict loc __rte_unused,
1531 		  struct mlx5_wqe_dseg *__rte_restrict dseg,
1532 		  uint8_t *buf,
1533 		  unsigned int len,
1534 		  unsigned int olx __rte_unused)
1535 
1536 {
1537 	unsigned int part;
1538 	uint8_t *pdst;
1539 
1540 	MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE);
1541 	if (!MLX5_TXOFF_CONFIG(MPW)) {
1542 		/* Store the descriptor byte counter for eMPW sessions. */
1543 		dseg->bcount = rte_cpu_to_be_32
1544 				((len + sizeof(struct rte_vlan_hdr)) |
1545 				 MLX5_ETH_WQE_DATA_INLINE);
1546 		pdst = &dseg->inline_data[0];
1547 	} else {
1548 		/* The entire legacy MPW session counter is stored on close. */
1549 		pdst = (uint8_t *)dseg;
1550 	}
1551 	memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE);
1552 	buf += MLX5_DSEG_MIN_INLINE_SIZE;
1553 	pdst += MLX5_DSEG_MIN_INLINE_SIZE;
1554 	len -= MLX5_DSEG_MIN_INLINE_SIZE;
1555 	/* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */
1556 	MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
1557 	if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
1558 		pdst = (uint8_t *)txq->wqes;
1559 	*(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) |
1560 					      loc->mbuf->vlan_tci);
1561 	pdst += sizeof(struct rte_vlan_hdr);
1562 	/*
1563 	 * The WQEBB space availability is checked by caller.
1564 	 * Here we should be aware of WQE ring buffer wraparound only.
1565 	 */
1566 	part = (uint8_t *)txq->wqes_end - pdst;
1567 	part = RTE_MIN(part, len);
1568 	do {
1569 		rte_memcpy(pdst, buf, part);
1570 		len -= part;
1571 		if (likely(!len)) {
1572 			pdst += part;
1573 			if (!MLX5_TXOFF_CONFIG(MPW))
1574 				pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1575 			/* Note: no final wraparound check here. */
1576 			return (struct mlx5_wqe_dseg *)pdst;
1577 		}
1578 		pdst = (uint8_t *)txq->wqes;
1579 		buf += part;
1580 		part = len;
1581 	} while (true);
1582 }
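/*
 * Layout sketch of the data inlined by mlx5_tx_dseg_vlan() above,
 * assuming MLX5_DSEG_MIN_INLINE_SIZE covers the destination and
 * source MAC addresses:
 *
 *   | bcount (eMPW only) | DMAC + SMAC | VLAN ethertype + TCI | rest of frame |
 *
 * The 4-byte 802.1Q tag is composed from RTE_ETHER_TYPE_VLAN and
 * loc->mbuf->vlan_tci and is written right after the copied
 * MLX5_DSEG_MIN_INLINE_SIZE bytes.
 */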
1583 
1584 /**
1585  * Build the Ethernet Segment with optionally inlined data with
1586  * VLAN insertion and following Data Segments (if any) from
1587  * multi-segment packet. Used by ordinary send and TSO.
1588  *
1589  * @param txq
1590  *   Pointer to TX queue structure.
1591  * @param loc
1592  *   Pointer to burst routine local context.
1593  * @param wqe
1594  *   Pointer to WQE to fill with built Ethernet/Data Segments.
1595  * @param vlan
1596  *   Length of VLAN header to insert, 0 means no VLAN insertion.
1597  * @param inlen
1598  *   Data length to inline. For TSO this parameter specifies the exact value;
1599  *   for the ordinary send routine it can be aligned by the caller to provide
1600  *   better WQE space saving and data buffer start address alignment.
1601  *   This length includes the VLAN header being inserted.
1602  * @param tso
1603  *   Zero means ordinary send and the inlined data may be extended;
1604  *   otherwise this is TSO and the inlined data length is fixed.
1605  * @param olx
1606  *   Configured Tx offloads mask. It is fully defined at
1607  *   compile time and may be used for optimization.
1608  *
1609  * @return
1610  *   Actual size of built WQE in segments.
1611  */
1612 static __rte_always_inline unsigned int
1613 mlx5_tx_mseg_build(struct mlx5_txq_data *__rte_restrict txq,
1614 		   struct mlx5_txq_local *__rte_restrict loc,
1615 		   struct mlx5_wqe *__rte_restrict wqe,
1616 		   unsigned int vlan,
1617 		   unsigned int inlen,
1618 		   unsigned int tso,
1619 		   unsigned int olx __rte_unused)
1620 {
1621 	struct mlx5_wqe_dseg *__rte_restrict dseg;
1622 	unsigned int ds;
1623 
1624 	MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen);
1625 	loc->mbuf_nseg = NB_SEGS(loc->mbuf);
1626 	loc->mbuf_off = 0;
1627 
1628 	dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx);
1629 	if (!loc->mbuf_nseg)
1630 		goto dseg_done;
1631 	/*
1632 	 * There are still some mbufs remaining, not inlined.
1633 	 * The first mbuf may be partially inlined and we
1634 	 * must process the possible non-zero data offset.
1635 	 */
1636 	if (loc->mbuf_off) {
1637 		unsigned int dlen;
1638 		uint8_t *dptr;
1639 
1640 		/*
1641 		 * Exhausted packets must have been dropped earlier.
1642 		 * A non-zero offset means there is some data
1643 		 * remaining in the packet.
1644 		 */
1645 		MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf));
1646 		MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf));
1647 		dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
1648 					       loc->mbuf_off);
1649 		dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off;
1650 		/*
1651 		 * Build the pointer/minimal Data Segment.
1652 		 * Do ring buffer wrapping check in advance.
1653 		 */
1654 		if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
1655 			dseg = (struct mlx5_wqe_dseg *)txq->wqes;
1656 		mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx);
1657 		/* Store the mbuf to be freed on completion. */
1658 		MLX5_ASSERT(loc->elts_free);
1659 		txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
1660 		--loc->elts_free;
1661 		++dseg;
1662 		if (--loc->mbuf_nseg == 0)
1663 			goto dseg_done;
1664 		loc->mbuf = loc->mbuf->next;
1665 		loc->mbuf_off = 0;
1666 	}
1667 	do {
1668 		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
1669 			struct rte_mbuf *mbuf;
1670 
1671 			/* Zero length segment found, just skip. */
1672 			mbuf = loc->mbuf;
1673 			loc->mbuf = loc->mbuf->next;
1674 			rte_pktmbuf_free_seg(mbuf);
1675 			if (--loc->mbuf_nseg == 0)
1676 				break;
1677 		} else {
1678 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
1679 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
1680 			mlx5_tx_dseg_iptr
1681 				(txq, loc, dseg,
1682 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
1683 				 rte_pktmbuf_data_len(loc->mbuf), olx);
1684 			MLX5_ASSERT(loc->elts_free);
1685 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
1686 			--loc->elts_free;
1687 			++dseg;
1688 			if (--loc->mbuf_nseg == 0)
1689 				break;
1690 			loc->mbuf = loc->mbuf->next;
1691 		}
1692 	} while (true);
1693 
1694 dseg_done:
1695 	/* Calculate actual segments used from the dseg pointer. */
1696 	if ((uintptr_t)wqe < (uintptr_t)dseg)
1697 		ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE;
1698 	else
1699 		ds = (((uintptr_t)dseg - (uintptr_t)wqe) +
1700 		      txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE;
1701 	return ds;
1702 }
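/*
 * Example of the wraparound branch of the segment count calculation in
 * mlx5_tx_mseg_build() above, assuming the usual 16-byte MLX5_WSEG_SIZE:
 * if the WQE starts 48 bytes before txq->wqes_end and the next free dseg
 * position wrapped to 32 bytes past txq->wqes, the raw pointer difference
 * is 80 bytes minus the ring size; adding txq->wqe_s * MLX5_WQE_SIZE back
 * restores the real distance of 80 bytes, i.e. ds = 5 segments.
 */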
1703 
1704 /**
1705  * The routine checks the timestamp flag in the current packet
1706  * and pushes a WAIT WQE into the queue if scheduling is required.
1707  *
1708  * @param txq
1709  *   Pointer to TX queue structure.
1710  * @param loc
1711  *   Pointer to burst routine local context.
1712  * @param elts
1713  *   Number of free elements in the elts buffer to be checked; for a zero
1714  *   value the check is optimized out by the compiler.
1715  * @param olx
1716  *   Configured Tx offloads mask. It is fully defined at
1717  *   compile time and may be used for optimization.
1718  *
1719  * @return
1720  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1721  *   MLX5_TXCMP_CODE_SINGLE - continue processing with the packet.
1722  *   MLX5_TXCMP_CODE_MULTI - the WAIT WQE was inserted, continue processing.
1723  * Local context variables partially updated.
1724  */
1725 static __rte_always_inline enum mlx5_txcmp_code
1726 mlx5_tx_schedule_send(struct mlx5_txq_data *restrict txq,
1727 		      struct mlx5_txq_local *restrict loc,
1728 		      uint16_t elts,
1729 		      unsigned int olx)
1730 {
1731 	if (MLX5_TXOFF_CONFIG(TXPP) &&
1732 	    loc->mbuf->ol_flags & txq->ts_mask) {
1733 		struct mlx5_dev_ctx_shared *sh;
1734 		struct mlx5_wqe *wqe;
1735 		uint64_t ts;
1736 
1737 		/*
1738 		 * Estimate the required space quickly and roughly.
1739 		 * We would like to ensure the packet can be pushed
1740 		 * to the queue and we won't get an orphan WAIT WQE.
1741 		 */
1742 		if (loc->wqe_free <= MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE ||
1743 		    loc->elts_free < elts)
1744 			return MLX5_TXCMP_CODE_EXIT;
1745 		/* Convert the timestamp into a completion to wait for. */
1746 		ts = *RTE_MBUF_DYNFIELD(loc->mbuf, txq->ts_offset, uint64_t *);
1747 		if (txq->ts_last && ts < txq->ts_last)
1748 			rte_atomic_fetch_add_explicit(&txq->sh->txpp.err_ts_order,
1749 					   1, rte_memory_order_relaxed);
1750 		txq->ts_last = ts;
1751 		wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
1752 		sh = txq->sh;
1753 		if (txq->wait_on_time) {
1754 			/* The wait on time capability should be used. */
1755 			ts -= sh->txpp.skew;
1756 			rte_pmd_mlx5_trace_tx_wait(ts);
1757 			mlx5_tx_cseg_init(txq, loc, wqe,
1758 					  1 + sizeof(struct mlx5_wqe_wseg) /
1759 					      MLX5_WSEG_SIZE,
1760 					  MLX5_OPCODE_WAIT |
1761 					  MLX5_OPC_MOD_WAIT_TIME << 24, olx);
1762 			mlx5_tx_wseg_init(txq, loc, wqe, ts, olx);
1763 		} else {
1764 			/* Legacy cross-channel operation should be used. */
1765 			int32_t wci;
1766 
1767 			wci = mlx5_txpp_convert_tx_ts(sh, ts);
1768 			if (unlikely(wci < 0))
1769 				return MLX5_TXCMP_CODE_SINGLE;
1770 			/* Build the WAIT WQE with specified completion. */
1771 			rte_pmd_mlx5_trace_tx_wait(ts - sh->txpp.skew);
1772 			mlx5_tx_cseg_init(txq, loc, wqe,
1773 					  1 + sizeof(struct mlx5_wqe_qseg) /
1774 					      MLX5_WSEG_SIZE,
1775 					  MLX5_OPCODE_WAIT |
1776 					  MLX5_OPC_MOD_WAIT_CQ_PI << 24, olx);
1777 			mlx5_tx_qseg_init(txq, loc, wqe, wci, olx);
1778 		}
1779 		++txq->wqe_ci;
1780 		--loc->wqe_free;
1781 		return MLX5_TXCMP_CODE_MULTI;
1782 	}
1783 	return MLX5_TXCMP_CODE_SINGLE;
1784 }
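/*
 * Typical caller pattern for mlx5_tx_schedule_send(), as used by the
 * packet building routines below - the whole block is optimized out
 * at compile time when the TXPP offload is not configured:
 *
 *	if (MLX5_TXOFF_CONFIG(TXPP)) {
 *		enum mlx5_txcmp_code wret;
 *
 *		wret = mlx5_tx_schedule_send(txq, loc, 0, olx);
 *		if (wret == MLX5_TXCMP_CODE_EXIT)
 *			return MLX5_TXCMP_CODE_EXIT;
 *		if (wret == MLX5_TXCMP_CODE_ERROR)
 *			return MLX5_TXCMP_CODE_ERROR;
 *	}
 */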
1785 
1786 /**
1787  * Tx one packet function for multi-segment TSO. Supports all
1788  * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs,
1789  * sends one packet per WQE.
1790  *
1791  * This routine is responsible for storing the processed mbuf
1792  * into the elts ring buffer and updating elts_head.
1793  *
1794  * @param txq
1795  *   Pointer to TX queue structure.
1796  * @param loc
1797  *   Pointer to burst routine local context.
1798  * @param olx
1799  *   Configured Tx offloads mask. It is fully defined at
1800  *   compile time and may be used for optimization.
1801  *
1802  * @return
1803  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1804  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
1805  * Local context variables partially updated.
1806  */
1807 static __rte_always_inline enum mlx5_txcmp_code
1808 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *__rte_restrict txq,
1809 			struct mlx5_txq_local *__rte_restrict loc,
1810 			unsigned int olx)
1811 {
1812 	struct mlx5_wqe *__rte_restrict wqe;
1813 	unsigned int ds, dlen, inlen, ntcp, vlan = 0;
1814 
1815 	MLX5_ASSERT(loc->elts_free >= NB_SEGS(loc->mbuf));
1816 	if (MLX5_TXOFF_CONFIG(TXPP)) {
1817 		enum mlx5_txcmp_code wret;
1818 
1819 		/* Generate WAIT for scheduling if requested. */
1820 		wret = mlx5_tx_schedule_send(txq, loc, 0, olx);
1821 		if (wret == MLX5_TXCMP_CODE_EXIT)
1822 			return MLX5_TXCMP_CODE_EXIT;
1823 		if (wret == MLX5_TXCMP_CODE_ERROR)
1824 			return MLX5_TXCMP_CODE_ERROR;
1825 	}
1826 	/*
1827 	 * Calculate data length to be inlined to estimate
1828 	 * the required space in WQE ring buffer.
1829 	 */
1830 	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
1831 	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
1832 		vlan = sizeof(struct rte_vlan_hdr);
1833 	inlen = loc->mbuf->l2_len + vlan +
1834 		loc->mbuf->l3_len + loc->mbuf->l4_len;
1835 	if (unlikely((!inlen || !loc->mbuf->tso_segsz)))
1836 		return MLX5_TXCMP_CODE_ERROR;
1837 	if (loc->mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)
1838 		inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len;
1839 	/* Packet must contain all TSO headers. */
1840 	if (unlikely(inlen > MLX5_MAX_TSO_HEADER ||
1841 		     inlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
1842 		     inlen > (dlen + vlan)))
1843 		return MLX5_TXCMP_CODE_ERROR;
1844 	/*
1845 	 * Check whether there are enough free WQEBBs:
1846 	 * - Control Segment
1847 	 * - Ethernet Segment
1848 	 * - First Segment of inlined Ethernet data
1849 	 * - ... data continued ...
1850 	 * - Data Segments of pointer/min inline type
1851 	 */
1852 	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
1853 				       MLX5_ESEG_MIN_INLINE_SIZE +
1854 				       MLX5_WSEG_SIZE +
1855 				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
1856 	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
1857 		return MLX5_TXCMP_CODE_EXIT;
1858 	/* Check for maximal WQE size. */
1859 	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds))
1860 		return MLX5_TXCMP_CODE_ERROR;
1861 #ifdef MLX5_PMD_SOFT_COUNTERS
1862 	/* Update sent data bytes/packets counters. */
1863 	ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) /
1864 		loc->mbuf->tso_segsz;
1865 	/*
1866 	 * One packet is accounted for the mbuf itself at the end of
1867 	 * mlx5_tx_burst() via the loc->pkts_sent field.
1868 	 */
1869 	--ntcp;
1870 	txq->stats.opackets += ntcp;
1871 	txq->stats.obytes += dlen + vlan + ntcp * inlen;
1872 #endif
1873 	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
1874 	loc->wqe_last = wqe;
1875 	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx);
1876 	rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
1877 	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx);
1878 	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
1879 	txq->wqe_ci += (ds + 3) / 4;
1880 	loc->wqe_free -= (ds + 3) / 4;
1881 	return MLX5_TXCMP_CODE_MULTI;
1882 }
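/*
 * Sizing example for mlx5_tx_packet_multi_tso() above, assuming
 * MLX5_ESEG_MIN_INLINE_SIZE is 18 and MLX5_WSEG_SIZE is 16: a 3-segment
 * packet with inlen = 82 bytes of headers to inline gives
 * ds = 3 + 2 + (82 - 18 + 16 + 15) / 16 = 10 segments, so the WQE
 * occupies (10 + 3) / 4 = 3 WQEBBs of the ring.
 */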
1883 
1884 /**
1885  * Tx one packet function for multi-segment SEND. Supports all types of Tx
1886  * offloads, uses MLX5_OPCODE_SEND to build WQEs, sends one packet per WQE,
1887  * without any data inlining in Ethernet Segment.
1888  *
1889  * This routine is responsible for storing the processed mbuf
1890  * into the elts ring buffer and updating elts_head.
1891  *
1892  * @param txq
1893  *   Pointer to TX queue structure.
1894  * @param loc
1895  *   Pointer to burst routine local context.
1896  * @param olx
1897  *   Configured Tx offloads mask. It is fully defined at
1898  *   compile time and may be used for optimization.
1899  *
1900  * @return
1901  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1902  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
1903  * Local context variables partially updated.
1904  */
1905 static __rte_always_inline enum mlx5_txcmp_code
1906 mlx5_tx_packet_multi_send(struct mlx5_txq_data *__rte_restrict txq,
1907 			  struct mlx5_txq_local *__rte_restrict loc,
1908 			  unsigned int olx)
1909 {
1910 	struct mlx5_wqe_dseg *__rte_restrict dseg;
1911 	struct mlx5_wqe *__rte_restrict wqe;
1912 	unsigned int ds, nseg;
1913 
1914 	MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
1915 	MLX5_ASSERT(loc->elts_free >= NB_SEGS(loc->mbuf));
1916 	if (MLX5_TXOFF_CONFIG(TXPP)) {
1917 		enum mlx5_txcmp_code wret;
1918 
1919 		/* Generate WAIT for scheduling if requested. */
1920 		wret = mlx5_tx_schedule_send(txq, loc, 0, olx);
1921 		if (wret == MLX5_TXCMP_CODE_EXIT)
1922 			return MLX5_TXCMP_CODE_EXIT;
1923 		if (wret == MLX5_TXCMP_CODE_ERROR)
1924 			return MLX5_TXCMP_CODE_ERROR;
1925 	}
1926 	/*
1927 	 * No inlining at all: saving CPU cycles was prioritized at
1928 	 * configuration time, so we should not copy any packet data to the WQE.
1929 	 */
1930 	nseg = NB_SEGS(loc->mbuf);
1931 	ds = 2 + nseg;
1932 	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
1933 		return MLX5_TXCMP_CODE_EXIT;
1934 	/* Check for maximal WQE size. */
1935 	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds))
1936 		return MLX5_TXCMP_CODE_ERROR;
1937 	/*
1938 	 * Some Tx offloads may cause an error if the packet is not long enough,
1939 	 * check against the assumed minimal length.
1940 	 */
1941 	if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE)
1942 		return MLX5_TXCMP_CODE_ERROR;
1943 #ifdef MLX5_PMD_SOFT_COUNTERS
1944 	/* Update sent data bytes counter. */
1945 	txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf);
1946 	if (MLX5_TXOFF_CONFIG(VLAN) &&
1947 	    loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
1948 		txq->stats.obytes += sizeof(struct rte_vlan_hdr);
1949 #endif
1950 	/*
1951 	 * SEND WQE, one WQEBB:
1952 	 * - Control Segment, SEND opcode
1953 	 * - Ethernet Segment, optional VLAN, no inline
1954 	 * - Data Segments, pointer only type
1955 	 */
1956 	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
1957 	loc->wqe_last = wqe;
1958 	mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx);
1959 	rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
1960 	mlx5_tx_eseg_none(txq, loc, wqe, olx);
1961 	dseg = &wqe->dseg[0];
1962 	do {
1963 		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
1964 			struct rte_mbuf *mbuf;
1965 
1966 			/*
1967 			 * Zero length segment found, the total size of the
1968 			 * WQE in segments has to be corrected.
1969 			 * This is supposed to be a rare occasion, so in the
1970 			 * normal case (no zero length segments) we avoid an
1971 			 * extra write to the Control Segment.
1972 			 */
1973 			--ds;
1974 			wqe->cseg.sq_ds -= RTE_BE32(1);
1975 			mbuf = loc->mbuf;
1976 			loc->mbuf = mbuf->next;
1977 			rte_pktmbuf_free_seg(mbuf);
1978 			if (--nseg == 0)
1979 				break;
1980 		} else {
1981 			mlx5_tx_dseg_ptr
1982 				(txq, loc, dseg,
1983 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
1984 				 rte_pktmbuf_data_len(loc->mbuf), olx);
1985 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
1986 			--loc->elts_free;
1987 			if (--nseg == 0)
1988 				break;
1989 			++dseg;
1990 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
1991 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
1992 			loc->mbuf = loc->mbuf->next;
1993 		}
1994 	} while (true);
1995 	txq->wqe_ci += (ds + 3) / 4;
1996 	loc->wqe_free -= (ds + 3) / 4;
1997 	return MLX5_TXCMP_CODE_MULTI;
1998 }
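/*
 * Sizing example for mlx5_tx_packet_multi_send() above: with no inlining
 * each mbuf segment takes exactly one pointer Data Segment, so a 4-segment
 * packet needs ds = 2 + 4 = 6 segments (Control + Ethernet + 4 pointers),
 * i.e. (6 + 3) / 4 = 2 WQEBBs.
 */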
1999 
2000 /**
2001  * Tx one packet function for multi-segment SEND. Supports all
2002  * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
2003  * sends one packet per WQE, with data inlining in
2004  * Ethernet Segment and minimal Data Segments.
2005  *
2006  * This routine is responsible for storing the processed mbuf
2007  * into the elts ring buffer and updating elts_head.
2008  *
2009  * @param txq
2010  *   Pointer to TX queue structure.
2011  * @param loc
2012  *   Pointer to burst routine local context.
2013  * @param olx
2014  *   Configured Tx offloads mask. It is fully defined at
2015  *   compile time and may be used for optimization.
2016  *
2017  * @return
2018  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2019  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2020  * Local context variables partially updated.
2021  */
2022 static __rte_always_inline enum mlx5_txcmp_code
2023 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *__rte_restrict txq,
2024 			    struct mlx5_txq_local *__rte_restrict loc,
2025 			    unsigned int olx)
2026 {
2027 	struct mlx5_wqe *__rte_restrict wqe;
2028 	unsigned int ds, inlen, dlen, vlan = 0;
2029 
2030 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
2031 	MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
2032 	MLX5_ASSERT(loc->elts_free >= NB_SEGS(loc->mbuf));
2033 	/*
2034 	 * First calculate data length to be inlined
2035 	 * to estimate the required space for WQE.
2036 	 */
2037 	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
2038 	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
2039 		vlan = sizeof(struct rte_vlan_hdr);
2040 	inlen = dlen + vlan;
2041 	/* Check against minimal length. */
2042 	if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
2043 		return MLX5_TXCMP_CODE_ERROR;
2044 	MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
2045 	if (inlen > txq->inlen_send ||
2046 	    loc->mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE) {
2047 		struct rte_mbuf *mbuf;
2048 		unsigned int nxlen;
2049 		uintptr_t start;
2050 
2051 		mbuf = loc->mbuf;
2052 		nxlen = rte_pktmbuf_data_len(mbuf) + vlan;
2053 		/*
2054 		 * Packet length exceeds the allowed inline data length,
2055 		 * check whether the minimal inlining is required.
2056 		 */
2057 		if (txq->inlen_mode) {
2058 			MLX5_ASSERT(txq->inlen_mode >=
2059 				    MLX5_ESEG_MIN_INLINE_SIZE);
2060 			MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send);
2061 			inlen = RTE_MIN(txq->inlen_mode, inlen);
2062 		} else if (vlan && !txq->vlan_en) {
2063 			/*
2064 			 * VLAN insertion is requested and the hardware does not
2065 			 * support the offload, so it will be done with software inlining.
2066 			 */
2067 			inlen = MLX5_ESEG_MIN_INLINE_SIZE;
2068 		} else if (mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE ||
2069 			   nxlen > txq->inlen_send) {
2070 			return mlx5_tx_packet_multi_send(txq, loc, olx);
2071 		} else if (nxlen <= MLX5_ESEG_MIN_INLINE_SIZE) {
2072 			inlen = MLX5_ESEG_MIN_INLINE_SIZE;
2073 		} else {
2074 			goto do_first;
2075 		}
2076 		if (mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE)
2077 			goto do_build;
2078 		/*
2079 		 * Now we know the minimal amount of data requested to be
2080 		 * inlined. Check whether we should inline the buffers from
2081 		 * the beginning of the chain to eliminate some mbufs.
2082 		 */
2083 		if (unlikely(nxlen <= txq->inlen_send)) {
2084 			/* We can inline first mbuf at least. */
2085 			if (nxlen < inlen) {
2086 				unsigned int smlen;
2087 
2088 				/* Scan mbufs until inlen is filled. */
2089 				do {
2090 					smlen = nxlen;
2091 					mbuf = NEXT(mbuf);
2092 					MLX5_ASSERT(mbuf);
2093 					nxlen = rte_pktmbuf_data_len(mbuf);
2094 					nxlen += smlen;
2095 				} while (unlikely(nxlen < inlen));
2096 				if (unlikely(nxlen > txq->inlen_send)) {
2097 					/* We cannot inline entire mbuf. */
2098 					smlen = inlen - smlen;
2099 					start = rte_pktmbuf_mtod_offset
2100 						    (mbuf, uintptr_t, smlen);
2101 					goto do_align;
2102 				}
2103 			}
2104 do_first:
2105 			do {
2106 				inlen = nxlen;
2107 				mbuf = NEXT(mbuf);
2108 				/* This must not be the end of the packet. */
2109 				MLX5_ASSERT(mbuf);
2110 				if (mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE)
2111 					break;
2112 				nxlen = inlen + rte_pktmbuf_data_len(mbuf);
2113 			} while (unlikely(nxlen < txq->inlen_send));
2114 		}
2115 		start = rte_pktmbuf_mtod(mbuf, uintptr_t);
2116 		/*
2117 		 * Check whether we can extend the inline length to align
2118 		 * the start address of the data buffer to a cache line.
2119 		 */
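		/*
		 * For illustration, on a platform with 64-byte cache lines:
		 * if the address where inlining would end has its low bits
		 * equal to 0x38, then (~start + 1) & 63 == 8, so extending
		 * the inline part by 8 more bytes moves the start of the
		 * remaining (pointed-to) data to the next cache line boundary.
		 */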
2120 do_align:
2121 		start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1);
2122 		if (unlikely(start)) {
2123 			start += inlen;
2124 			if (start <= txq->inlen_send)
2125 				inlen = start;
2126 		}
2127 	}
2128 	/*
2129 	 * Check whether there are enough free WQEBBs:
2130 	 * - Control Segment
2131 	 * - Ethernet Segment
2132 	 * - First Segment of inlined Ethernet data
2133 	 * - ... data continued ...
2134 	 * - Data Segments of pointer/min inline type
2135 	 *
2136 	 * Estimate the number of Data Segments conservatively,
2137 	 * supposing no mbufs are being freed during inlining.
2138 	 */
2139 do_build:
2140 	if (MLX5_TXOFF_CONFIG(TXPP)) {
2141 		enum mlx5_txcmp_code wret;
2142 
2143 		/* Generate WAIT for scheduling if requested. */
2144 		wret = mlx5_tx_schedule_send(txq, loc, 0, olx);
2145 		if (wret == MLX5_TXCMP_CODE_EXIT)
2146 			return MLX5_TXCMP_CODE_EXIT;
2147 		if (wret == MLX5_TXCMP_CODE_ERROR)
2148 			return MLX5_TXCMP_CODE_ERROR;
2149 	}
2150 	MLX5_ASSERT(inlen <= txq->inlen_send);
2151 	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
2152 				       MLX5_ESEG_MIN_INLINE_SIZE +
2153 				       MLX5_WSEG_SIZE +
2154 				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
2155 	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
2156 		return MLX5_TXCMP_CODE_EXIT;
2157 	/* Check for maximal WQE size. */
2158 	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds)) {
2159 		/* Check if we can adjust the inline length. */
2160 		if (unlikely(txq->inlen_mode)) {
2161 			ds = NB_SEGS(loc->mbuf) + 2 +
2162 				(txq->inlen_mode -
2163 				MLX5_ESEG_MIN_INLINE_SIZE +
2164 				MLX5_WSEG_SIZE +
2165 				MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
2166 			if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds))
2167 				return MLX5_TXCMP_CODE_ERROR;
2168 		}
2169 		/* We have an opportunity to adjust the inline length. */
2170 		inlen = RTE_MIN(inlen, MLX5_WQE_SIZE_MAX -
2171 				       MLX5_WSEG_SIZE * 2 -
2172 				       MLX5_WSEG_SIZE * NB_SEGS(loc->mbuf) -
2173 				       MLX5_WSEG_SIZE +
2174 				       MLX5_ESEG_MIN_INLINE_SIZE);
2175 	}
2176 #ifdef MLX5_PMD_SOFT_COUNTERS
2177 	/* Update sent data bytes/packets counters. */
2178 	txq->stats.obytes += dlen + vlan;
2179 #endif
2180 	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2181 	loc->wqe_last = wqe;
2182 	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx);
2183 	rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
2184 	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx);
2185 	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
2186 	txq->wqe_ci += (ds + 3) / 4;
2187 	loc->wqe_free -= (ds + 3) / 4;
2188 	return MLX5_TXCMP_CODE_MULTI;
2189 }
2190 
2191 /**
2192  * Tx burst function for multi-segment packets. Supports all
2193  * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs,
2194  * sends one packet per WQE. Function stops sending if it
2195  * encounters a single-segment packet.
2196  *
2197  * This routine is responsible for storing the processed mbuf
2198  * into the elts ring buffer and updating elts_head.
2199  *
2200  * @param txq
2201  *   Pointer to TX queue structure.
2202  * @param[in] pkts
2203  *   Packets to transmit.
2204  * @param pkts_n
2205  *   Number of packets in array.
2206  * @param loc
2207  *   Pointer to burst routine local context.
2208  * @param olx
2209  *   Configured Tx offloads mask. It is fully defined at
2210  *   compile time and may be used for optimization.
2211  *
2212  * @return
2213  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2214  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2215  *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
2216  *   MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered.
2217  * Local context variables updated.
2218  */
2219 static __rte_always_inline enum mlx5_txcmp_code
2220 mlx5_tx_burst_mseg(struct mlx5_txq_data *__rte_restrict txq,
2221 		   struct rte_mbuf **__rte_restrict pkts,
2222 		   unsigned int pkts_n,
2223 		   struct mlx5_txq_local *__rte_restrict loc,
2224 		   unsigned int olx)
2225 {
2226 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2227 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
2228 	pkts += loc->pkts_sent + 1;
2229 	pkts_n -= loc->pkts_sent;
2230 	for (;;) {
2231 		enum mlx5_txcmp_code ret;
2232 
2233 		MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
2234 		/*
2235 		 * Estimate the number of free elts quickly but conservatively.
2236 		 * Some segments may be fully inlined and freed,
2237 		 * ignore this here - precise estimation is costly.
2238 		 */
2239 		if (loc->elts_free < NB_SEGS(loc->mbuf))
2240 			return MLX5_TXCMP_CODE_EXIT;
2241 		if (MLX5_TXOFF_CONFIG(TSO) &&
2242 		    unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
2243 			/* Proceed with multi-segment TSO. */
2244 			ret = mlx5_tx_packet_multi_tso(txq, loc, olx);
2245 		} else if (MLX5_TXOFF_CONFIG(INLINE)) {
2246 			/* Proceed with multi-segment SEND with inlining. */
2247 			ret = mlx5_tx_packet_multi_inline(txq, loc, olx);
2248 		} else {
2249 			/* Proceed with multi-segment SEND w/o inlining. */
2250 			ret = mlx5_tx_packet_multi_send(txq, loc, olx);
2251 		}
2252 		if (ret == MLX5_TXCMP_CODE_EXIT)
2253 			return MLX5_TXCMP_CODE_EXIT;
2254 		if (ret == MLX5_TXCMP_CODE_ERROR)
2255 			return MLX5_TXCMP_CODE_ERROR;
2256 		/* WQE is built, go to the next packet. */
2257 		++loc->pkts_sent;
2258 		--pkts_n;
2259 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
2260 			return MLX5_TXCMP_CODE_EXIT;
2261 		loc->mbuf = *pkts++;
2262 		if (pkts_n > 1)
2263 			rte_prefetch0(*pkts);
2264 		if (likely(NB_SEGS(loc->mbuf) > 1))
2265 			continue;
2266 		/* Here ends the series of multi-segment packets. */
2267 		if (MLX5_TXOFF_CONFIG(TSO) &&
2268 		    unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG))
2269 			return MLX5_TXCMP_CODE_TSO;
2270 		return MLX5_TXCMP_CODE_SINGLE;
2271 	}
2272 	MLX5_ASSERT(false);
2273 }
2274 
2275 /**
2276  * Tx burst function for single-segment packets with TSO.
2277  * Supports all types of Tx offloads, except multi-packets.
2278  * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE.
2279  * Function stops sending if it encounters a multi-segment
2280  * packet or a packet without TSO requested.
2281  *
2282  * The routine is responsible for storing the processed mbuf into the elts ring
2283  * buffer and updating elts_head if the inline offload is requested, due to possible
2284  * early freeing of the inlined mbufs (the pkts array cannot be stored in elts as a batch).
2285  *
2286  * @param txq
2287  *   Pointer to TX queue structure.
2288  * @param[in] pkts
2289  *   Packets to transmit.
2290  * @param pkts_n
2291  *   Number of packets in array.
2292  * @param loc
2293  *   Pointer to burst routine local context.
2294  * @param olx
2295  *   Configured Tx offloads mask. It is fully defined at
2296  *   compile time and may be used for optimization.
2297  *
2298  * @return
2299  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2300  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2301  *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
2302  *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
2303  * Local context variables updated.
2304  */
2305 static __rte_always_inline enum mlx5_txcmp_code
2306 mlx5_tx_burst_tso(struct mlx5_txq_data *__rte_restrict txq,
2307 		  struct rte_mbuf **__rte_restrict pkts,
2308 		  unsigned int pkts_n,
2309 		  struct mlx5_txq_local *__rte_restrict loc,
2310 		  unsigned int olx)
2311 {
2312 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2313 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
2314 	pkts += loc->pkts_sent + 1;
2315 	pkts_n -= loc->pkts_sent;
2316 	for (;;) {
2317 		struct mlx5_wqe_dseg *__rte_restrict dseg;
2318 		struct mlx5_wqe *__rte_restrict wqe;
2319 		unsigned int ds, dlen, hlen, ntcp, vlan = 0;
2320 		uint8_t *dptr;
2321 
2322 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2323 		if (MLX5_TXOFF_CONFIG(TXPP)) {
2324 			enum mlx5_txcmp_code wret;
2325 
2326 			/* Generate WAIT for scheduling if requested. */
2327 			wret = mlx5_tx_schedule_send(txq, loc, 1, olx);
2328 			if (wret == MLX5_TXCMP_CODE_EXIT)
2329 				return MLX5_TXCMP_CODE_EXIT;
2330 			if (wret == MLX5_TXCMP_CODE_ERROR)
2331 				return MLX5_TXCMP_CODE_ERROR;
2332 		}
2333 		dlen = rte_pktmbuf_data_len(loc->mbuf);
2334 		if (MLX5_TXOFF_CONFIG(VLAN) &&
2335 		    loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
2336 			vlan = sizeof(struct rte_vlan_hdr);
2337 		}
2338 		/*
2339 		 * First calculate the WQE size to check
2340 		 * whether we have enough space in ring buffer.
2341 		 */
2342 		hlen = loc->mbuf->l2_len + vlan +
2343 		       loc->mbuf->l3_len + loc->mbuf->l4_len;
2344 		if (unlikely((!hlen || !loc->mbuf->tso_segsz)))
2345 			return MLX5_TXCMP_CODE_ERROR;
2346 		if (loc->mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)
2347 			hlen += loc->mbuf->outer_l2_len +
2348 				loc->mbuf->outer_l3_len;
2349 		/* Segment must contain all TSO headers. */
2350 		if (unlikely(hlen > MLX5_MAX_TSO_HEADER ||
2351 			     hlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
2352 			     hlen > (dlen + vlan)))
2353 			return MLX5_TXCMP_CODE_ERROR;
2354 		/*
2355 		 * Check whether there are enough free WQEBBs:
2356 		 * - Control Segment
2357 		 * - Ethernet Segment
2358 		 * - First Segment of inlined Ethernet data
2359 		 * - ... data continued ...
2360 		 * - Finishing Data Segment of pointer type
2361 		 */
2362 		ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE +
2363 			  MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
2364 		if (loc->wqe_free < ((ds + 3) / 4))
2365 			return MLX5_TXCMP_CODE_EXIT;
2366 #ifdef MLX5_PMD_SOFT_COUNTERS
2367 		/* Update sent data bytes/packets counters. */
2368 		ntcp = (dlen + vlan - hlen +
2369 			loc->mbuf->tso_segsz - 1) /
2370 			loc->mbuf->tso_segsz;
2371 		/*
2372 		 * One packet is accounted for the mbuf itself at the end
2373 		 * of mlx5_tx_burst() via the loc->pkts_sent field.
2374 		 */
2375 		--ntcp;
2376 		txq->stats.opackets += ntcp;
2377 		txq->stats.obytes += dlen + vlan + ntcp * hlen;
2378 #endif
2379 		/*
2380 		 * Build the TSO WQE:
2381 		 * - Control Segment
2382 		 * - Ethernet Segment with hlen bytes inlined
2383 		 * - Data Segment of pointer type
2384 		 */
2385 		wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2386 		loc->wqe_last = wqe;
2387 		mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_TSO, olx);
2388 		rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
2389 		dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx);
2390 		dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan;
2391 		dlen -= hlen - vlan;
2392 		mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
2393 		/*
2394 		 * WQE is built, update the loop parameters
2395 		 * and go to the next packet.
2396 		 */
2397 		txq->wqe_ci += (ds + 3) / 4;
2398 		loc->wqe_free -= (ds + 3) / 4;
2399 		if (MLX5_TXOFF_CONFIG(INLINE))
2400 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
2401 		--loc->elts_free;
2402 		++loc->pkts_sent;
2403 		--pkts_n;
2404 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
2405 			return MLX5_TXCMP_CODE_EXIT;
2406 		loc->mbuf = *pkts++;
2407 		if (pkts_n > 1)
2408 			rte_prefetch0(*pkts);
2409 		if (MLX5_TXOFF_CONFIG(MULTI) &&
2410 		    unlikely(NB_SEGS(loc->mbuf) > 1))
2411 			return MLX5_TXCMP_CODE_MULTI;
2412 		if (likely(!(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)))
2413 			return MLX5_TXCMP_CODE_SINGLE;
2414 		/* Continue with the next TSO packet. */
2415 	}
2416 	MLX5_ASSERT(false);
2417 }
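/*
 * Per-packet sizing example for mlx5_tx_burst_tso() above, assuming
 * MLX5_ESEG_MIN_INLINE_SIZE is 18 and MLX5_WSEG_SIZE is 16: a plain
 * IPv4/TCP packet with hlen = 14 + 20 + 20 = 54 bytes of headers gives
 * ds = 4 + (54 - 18 + 15) / 16 = 7 segments, i.e. (7 + 3) / 4 = 2 WQEBBs
 * per packet.
 */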
2418 
2419 /**
2420  * Analyze the packet and select the best method to send.
2421  *
2422  * @param txq
2423  *   Pointer to TX queue structure.
2424  * @param loc
2425  *   Pointer to burst routine local context.
2426  * @param olx
2427  *   Configured Tx offloads mask. It is fully defined at
2428  *   compile time and may be used for optimization.
2429  * @param newp
2430  *   The predefined flag indicating whether to do the complete check for
2431  *   multi-segment packets and TSO.
2432  *
2433  * @return
2434  *  MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
2435  *  MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO.
2436  *  MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND.
2437  *  MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
2438  */
2439 static __rte_always_inline enum mlx5_txcmp_code
2440 mlx5_tx_able_to_empw(struct mlx5_txq_data *__rte_restrict txq,
2441 		     struct mlx5_txq_local *__rte_restrict loc,
2442 		     unsigned int olx,
2443 		     bool newp)
2444 {
2445 	/* Check for multi-segment packet. */
2446 	if (newp &&
2447 	    MLX5_TXOFF_CONFIG(MULTI) &&
2448 	    unlikely(NB_SEGS(loc->mbuf) > 1))
2449 		return MLX5_TXCMP_CODE_MULTI;
2450 	/* Check for TSO packet. */
2451 	if (newp &&
2452 	    MLX5_TXOFF_CONFIG(TSO) &&
2453 	    unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG))
2454 		return MLX5_TXCMP_CODE_TSO;
2455 	/* Check if eMPW is enabled at all. */
2456 	if (!MLX5_TXOFF_CONFIG(EMPW))
2457 		return MLX5_TXCMP_CODE_SINGLE;
2458 	/* Check if eMPW can be engaged. */
2459 	if (MLX5_TXOFF_CONFIG(VLAN) &&
2460 	    unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) &&
2461 		(!MLX5_TXOFF_CONFIG(INLINE) ||
2462 		 unlikely((rte_pktmbuf_data_len(loc->mbuf) +
2463 			   sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) {
2464 		/*
2465 		 * eMPW does not support the VLAN insertion offload, we would have
2466 		 * to inline the entire packet, but it is too long for inlining.
2467 		 */
2468 		return MLX5_TXCMP_CODE_SINGLE;
2469 	}
2470 	return MLX5_TXCMP_CODE_EMPW;
2471 }
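/*
 * For example, a single-segment packet without TSO and without VLAN
 * insertion requested is classified as MLX5_TXCMP_CODE_EMPW here
 * (provided eMPW is configured), while a VLAN-tagged packet that would
 * not fit into txq->inlen_empw falls back to MLX5_TXCMP_CODE_SINGLE.
 */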
2472 
2473 /**
2474  * Check whether the next packet attributes match the eMPW batch ones.
2475  * In addition, for legacy MPW the packet length is checked as well.
2476  *
2477  * @param txq
2478  *   Pointer to TX queue structure.
2479  * @param es
2480  *   Pointer to Ethernet Segment of eMPW batch.
2481  * @param loc
2482  *   Pointer to burst routine local context.
2483  * @param dlen
2484  *   Length of the previous packet in the MPW descriptor.
2485  * @param olx
2486  *   Configured Tx offloads mask. It is fully defined at
2487  *   compile time and may be used for optimization.
2488  *
2489  * @return
2490  *  true - the packet matches the eMPW batch attributes.
2491  *  false - no match, eMPW should be restarted.
2492  */
2493 static __rte_always_inline bool
2494 mlx5_tx_match_empw(struct mlx5_txq_data *__rte_restrict txq,
2495 		   struct mlx5_wqe_eseg *__rte_restrict es,
2496 		   struct mlx5_txq_local *__rte_restrict loc,
2497 		   uint32_t dlen,
2498 		   unsigned int olx)
2499 {
2500 	uint8_t swp_flags = 0;
2501 
2502 	/* Compare the checksum flags, if any. */
2503 	if (MLX5_TXOFF_CONFIG(CSUM) &&
2504 	    txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
2505 		return false;
2506 	/* Compare the Software Parser offsets and flags. */
2507 	if (MLX5_TXOFF_CONFIG(SWP) &&
2508 	    (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
2509 	     es->swp_flags != swp_flags))
2510 		return false;
2511 	/* Fill metadata field if needed. */
2512 	if (MLX5_TXOFF_CONFIG(METADATA) &&
2513 		es->metadata != (loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
2514 				 rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) : 0))
2515 		return false;
2516 	/* Legacy MPW can send packets with the same length only. */
2517 	if (MLX5_TXOFF_CONFIG(MPW) &&
2518 	    dlen != rte_pktmbuf_data_len(loc->mbuf))
2519 		return false;
2520 	/* There must be no VLAN packets in eMPW loop. */
2521 	if (MLX5_TXOFF_CONFIG(VLAN))
2522 		MLX5_ASSERT(!(loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN));
2523 	/* Check if the scheduling is requested. */
2524 	if (MLX5_TXOFF_CONFIG(TXPP) &&
2525 	    loc->mbuf->ol_flags & txq->ts_mask)
2526 		return false;
2527 	return true;
2528 }
2529 
2530 /**
2531  * Update send loop variables and WQE for eMPW loop without data inlining.
2532  * Number of Data Segments is equal to the number of sent packets.
2533  *
2534  * @param txq
2535  *   Pointer to TX queue structure.
2536  * @param loc
2537  *   Pointer to burst routine local context.
2538  * @param ds
2539  *   Number of packets (equal to the number of Data Segments).
2540  * @param slen
2541  *   Accumulated statistics, bytes sent.
2542  * @param olx
2543  *   Configured Tx offloads mask. It is fully defined at
2544  *   compile time and may be used for optimization.
2549  */
2550 static __rte_always_inline void
2551 mlx5_tx_sdone_empw(struct mlx5_txq_data *__rte_restrict txq,
2552 		   struct mlx5_txq_local *__rte_restrict loc,
2553 		   unsigned int ds,
2554 		   unsigned int slen,
2555 		   unsigned int olx __rte_unused)
2556 {
2557 	MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
2558 #ifdef MLX5_PMD_SOFT_COUNTERS
2559 	/* Update sent data bytes counter. */
2560 	txq->stats.obytes += slen;
2561 #else
2562 	(void)slen;
2563 #endif
2564 	loc->elts_free -= ds;
2565 	loc->pkts_sent += ds;
2566 	ds += 2;
2567 	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
2568 	txq->wqe_ci += (ds + 3) / 4;
2569 	loc->wqe_free -= (ds + 3) / 4;
2570 }
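/*
 * Accounting example for mlx5_tx_sdone_empw() above: an eMPW batch of
 * 5 packets without inlining uses one pointer Data Segment per packet,
 * so the final descriptor size is ds = 5 + 2 = 7 (Control + Ethernet +
 * 5 pointers) and the session consumes (7 + 3) / 4 = 2 WQEBBs.
 */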
2571 
2572 /**
2573  * Update send loop variables and WQE for eMPW loop with data inlining.
2574  * Takes the size of the descriptors and data pushed to the WQE.
2575  *
2576  * @param txq
2577  *   Pointer to TX queue structure.
2578  * @param loc
2579  *   Pointer to burst routine local context.
2580  * @param len
2581  *   Total size of descriptor/data in bytes.
2582  * @param slen
2583  *   Accumulated statistics, data bytes sent.
2584  * @param wqem
2585  *   The base WQE for the eMPW/MPW descriptor.
2586  * @param olx
2587  *   Configured Tx offloads mask. It is fully defined at
2588  *   compile time and may be used for optimization.
2593  */
2594 static __rte_always_inline void
2595 mlx5_tx_idone_empw(struct mlx5_txq_data *__rte_restrict txq,
2596 		   struct mlx5_txq_local *__rte_restrict loc,
2597 		   unsigned int len,
2598 		   unsigned int slen,
2599 		   struct mlx5_wqe *__rte_restrict wqem,
2600 		   unsigned int olx __rte_unused)
2601 {
2602 	struct mlx5_wqe_dseg *dseg = &wqem->dseg[0];
2603 
2604 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
2605 #ifdef MLX5_PMD_SOFT_COUNTERS
2606 	/* Update sent data bytes counter. */
2607 	txq->stats.obytes += slen;
2608 #else
2609 	(void)slen;
2610 #endif
2611 	if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
2612 		/*
2613 		 * If the legacy MPW session contains inline packets
2614 		 * we should set the length of the only inline data segment
2615 		 * and align the total length to the segment size.
2616 		 */
2617 		MLX5_ASSERT(len > sizeof(dseg->bcount));
2618 		dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
2619 						MLX5_ETH_WQE_DATA_INLINE);
2620 		len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
2621 	} else {
2622 		/*
2623 		 * The session is not legacy MPW or contains
2624 		 * data buffer pointer segments.
2625 		 */
2626 		MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
2627 		len = len / MLX5_WSEG_SIZE + 2;
2628 	}
2629 	wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
2630 	txq->wqe_ci += (len + 3) / 4;
2631 	loc->wqe_free -= (len + 3) / 4;
2632 	loc->wqe_last = wqem;
2633 }
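/*
 * Accounting example for mlx5_tx_idone_empw() above with a legacy MPW
 * inline session, assuming the usual 16-byte MLX5_WSEG_SIZE: for
 * len = 132 bytes of descriptor data (the 4-byte bcount included) the
 * size is rounded up to (132 + 15) / 16 + 2 = 11 segments, so the
 * session consumes (11 + 3) / 4 = 3 WQEBBs.
 */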
2634 
2635 /**
2636  * The set of Tx burst functions for single-segment packets without TSO
2637  * and with Multi-Packet Writing feature support.
2638  * Supports all types of Tx offloads, except multi-packets and TSO.
2639  *
2640  * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends as many packet
2641  * per WQE as it can. If eMPW is not configured or the packet cannot be sent with
2642  * eMPW (VLAN insertion) the ordinary SEND opcode is used and only one packet
2643  * is placed in the WQE.
2644  *
2645  * The functions stop sending if they encounter a multi-segment packet or a packet
2646  * with TSO requested.
2647  *
2648  * The routines are responsible for storing the processed mbuf into the elts ring
2649  * buffer and updating elts_head if the inlining offload is requested. Otherwise
2650  * copying mbufs to elts can be postponed and completed at the end of the burst routine.
2651  *
2652  * @param txq
2653  *   Pointer to TX queue structure.
2654  * @param[in] pkts
2655  *   Packets to transmit.
2656  * @param pkts_n
2657  *   Number of packets in array.
2658  * @param loc
2659  *   Pointer to burst routine local context.
2660  * @param olx
2661  *   Configured Tx offloads mask. It is fully defined at
2662  *   compile time and may be used for optimization.
2663  *
2664  * @return
2665  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2666  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2667  *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
2668  *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
2669  *   MLX5_TXCMP_CODE_SINGLE - used inside functions set.
2670  *   MLX5_TXCMP_CODE_EMPW - used inside functions set.
2671  *
2672  * Local context variables updated.
2673  *
2674  *
2675  * The routine sends packets with MLX5_OPCODE_EMPW
2676  * without inlining, this is a dedicated optimized branch.
2677  * No VLAN insertion is supported.
2678  */
2679 static __rte_always_inline enum mlx5_txcmp_code
2680 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *__rte_restrict txq,
2681 			  struct rte_mbuf **__rte_restrict pkts,
2682 			  unsigned int pkts_n,
2683 			  struct mlx5_txq_local *__rte_restrict loc,
2684 			  unsigned int olx)
2685 {
2686 	/*
2687 	 * Subroutine is the part of mlx5_tx_burst_single() and sends
2688 	 * single-segment packet with eMPW opcode without data inlining.
2689 	 */
2690 	MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
2691 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
2692 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2693 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
2694 	pkts += loc->pkts_sent + 1;
2695 	pkts_n -= loc->pkts_sent;
2696 	for (;;) {
2697 		struct mlx5_wqe_dseg *__rte_restrict dseg;
2698 		struct mlx5_wqe_eseg *__rte_restrict eseg;
2699 		enum mlx5_txcmp_code ret;
2700 		unsigned int part, loop;
2701 		unsigned int slen = 0;
2702 
2703 next_empw:
2704 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2705 		part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
2706 				       MLX5_MPW_MAX_PACKETS :
2707 				       MLX5_EMPW_MAX_PACKETS);
2708 		if (unlikely(loc->elts_free < part)) {
2709 			/* We do not have enough elts to save all mbufs. */
2710 			if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
2711 				return MLX5_TXCMP_CODE_EXIT;
2712 			/* But we are still able to send at least a minimal eMPW. */
2713 			part = loc->elts_free;
2714 		}
2715 		if (MLX5_TXOFF_CONFIG(TXPP)) {
2716 			enum mlx5_txcmp_code wret;
2717 
2718 			/* Generate WAIT for scheduling if requested. */
2719 			wret = mlx5_tx_schedule_send(txq, loc, 0, olx);
2720 			if (wret == MLX5_TXCMP_CODE_EXIT)
2721 				return MLX5_TXCMP_CODE_EXIT;
2722 			if (wret == MLX5_TXCMP_CODE_ERROR)
2723 				return MLX5_TXCMP_CODE_ERROR;
2724 		}
2725 		/* Check whether we have enough WQEs. */
2726 		if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) {
2727 			if (unlikely(loc->wqe_free <
2728 				((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
2729 				return MLX5_TXCMP_CODE_EXIT;
2730 			part = (loc->wqe_free * 4) - 2;
2731 		}
2732 		if (likely(part > 1))
2733 			rte_prefetch0(*pkts);
2734 		loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2735 		/*
2736 		 * Build eMPW title WQEBB:
2737 		 * - Control Segment, eMPW opcode
2738 		 * - Ethernet Segment, no inline
2739 		 */
2740 		mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2,
2741 				  MLX5_OPCODE_ENHANCED_MPSW, olx);
2742 		mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
2743 				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
2744 		eseg = &loc->wqe_last->eseg;
2745 		dseg = &loc->wqe_last->dseg[0];
2746 		loop = part;
2747 		/* Store the packet length for legacy MPW. */
2748 		if (MLX5_TXOFF_CONFIG(MPW))
2749 			eseg->mss = rte_cpu_to_be_16
2750 					(rte_pktmbuf_data_len(loc->mbuf));
2751 		for (;;) {
2752 			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
2753 #ifdef MLX5_PMD_SOFT_COUNTERS
2754 			/* Update sent data bytes counter. */
2755 			slen += dlen;
2756 #endif
2757 			rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
2758 			mlx5_tx_dseg_ptr
2759 				(txq, loc, dseg,
2760 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
2761 				 dlen, olx);
2762 			if (unlikely(--loop == 0))
2763 				break;
2764 			loc->mbuf = *pkts++;
2765 			if (likely(loop > 1))
2766 				rte_prefetch0(*pkts);
2767 			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
2768 			/*
2769 			 * Unroll the completion code to avoid
2770 			 * returning a variable value - it results in
2771 			 * unoptimized sequential checking in the caller.
2772 			 */
2773 			if (ret == MLX5_TXCMP_CODE_MULTI) {
2774 				part -= loop;
2775 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2776 				if (unlikely(!loc->elts_free ||
2777 					     !loc->wqe_free))
2778 					return MLX5_TXCMP_CODE_EXIT;
2779 				return MLX5_TXCMP_CODE_MULTI;
2780 			}
2781 			MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2782 			if (ret == MLX5_TXCMP_CODE_TSO) {
2783 				part -= loop;
2784 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2785 				if (unlikely(!loc->elts_free ||
2786 					     !loc->wqe_free))
2787 					return MLX5_TXCMP_CODE_EXIT;
2788 				return MLX5_TXCMP_CODE_TSO;
2789 			}
2790 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
2791 				part -= loop;
2792 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2793 				if (unlikely(!loc->elts_free ||
2794 					     !loc->wqe_free))
2795 					return MLX5_TXCMP_CODE_EXIT;
2796 				return MLX5_TXCMP_CODE_SINGLE;
2797 			}
2798 			if (ret != MLX5_TXCMP_CODE_EMPW) {
2799 				MLX5_ASSERT(false);
2800 				part -= loop;
2801 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2802 				return MLX5_TXCMP_CODE_ERROR;
2803 			}
2804 			/*
2805 			 * Check whether packet parameters coincide
2806 			 * within assumed eMPW batch:
2807 			 * - checksum settings
2808 			 * - metadata value
2809 			 * - software parser settings
2810 			 * - packets length (legacy MPW only)
2811 			 * - scheduling is not required
2812 			 */
2813 			if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) {
2814 				MLX5_ASSERT(loop);
2815 				part -= loop;
2816 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2817 				if (unlikely(!loc->elts_free ||
2818 					     !loc->wqe_free))
2819 					return MLX5_TXCMP_CODE_EXIT;
2820 				pkts_n -= part;
2821 				goto next_empw;
2822 			}
2823 			/* Packet attributes match, continue the same eMPW. */
2824 			++dseg;
2825 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
2826 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
2827 		}
2828 		/* eMPW is built successfully, update loop parameters. */
2829 		MLX5_ASSERT(!loop);
2830 		MLX5_ASSERT(pkts_n >= part);
2831 #ifdef MLX5_PMD_SOFT_COUNTERS
2832 		/* Update sent data bytes counter. */
2833 		txq->stats.obytes += slen;
2834 #endif
2835 		loc->elts_free -= part;
2836 		loc->pkts_sent += part;
2837 		txq->wqe_ci += (2 + part + 3) / 4;
2838 		loc->wqe_free -= (2 + part + 3) / 4;
2839 		pkts_n -= part;
2840 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
2841 			return MLX5_TXCMP_CODE_EXIT;
2842 		loc->mbuf = *pkts++;
2843 		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
2844 		if (unlikely(ret != MLX5_TXCMP_CODE_EMPW))
2845 			return ret;
2846 		/* Continue sending eMPW batches. */
2847 	}
2848 	MLX5_ASSERT(false);
2849 }
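/*
 * Ring accounting example for the non-inline eMPW loop above: a batch of
 * part = 8 packets produces 8 pointer Data Segments plus the title
 * Control and Ethernet Segments, so the ring advances by
 * (2 + 8 + 3) / 4 = 3 WQEBBs, matching the wqe_free reservation checked
 * before the batch is opened.
 */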
2850 
2851 /**
2852  * The routine sends packets with MLX5_OPCODE_EMPW
2853  * with inlining, optionally supports VLAN insertion.
2854  */
2855 static __rte_always_inline enum mlx5_txcmp_code
2856 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *__rte_restrict txq,
2857 			  struct rte_mbuf **__rte_restrict pkts,
2858 			  unsigned int pkts_n,
2859 			  struct mlx5_txq_local *__rte_restrict loc,
2860 			  unsigned int olx)
2861 {
2862 	/*
2863 	 * Subroutine is the part of mlx5_tx_burst_single() and sends
2864 	 * single-segment packet with eMPW opcode with data inlining.
2865 	 */
2866 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
2867 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
2868 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2869 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
2870 	pkts += loc->pkts_sent + 1;
2871 	pkts_n -= loc->pkts_sent;
2872 	for (;;) {
2873 		struct mlx5_wqe_dseg *__rte_restrict dseg;
2874 		struct mlx5_wqe *__rte_restrict wqem;
2875 		enum mlx5_txcmp_code ret;
2876 		unsigned int room, part, nlim;
2877 		unsigned int slen = 0;
2878 
2879 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2880 		/*
2881 		 * Limit the number of packets in one WQE
2882 		 * to improve CQE generation latency.
2883 		 */
2884 		nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
2885 				       MLX5_MPW_INLINE_MAX_PACKETS :
2886 				       MLX5_EMPW_MAX_PACKETS);
2887 		if (MLX5_TXOFF_CONFIG(TXPP)) {
2888 			enum mlx5_txcmp_code wret;
2889 
2890 			/* Generate WAIT for scheduling if requested. */
2891 			wret = mlx5_tx_schedule_send(txq, loc, nlim, olx);
2892 			if (wret == MLX5_TXCMP_CODE_EXIT)
2893 				return MLX5_TXCMP_CODE_EXIT;
2894 			if (wret == MLX5_TXCMP_CODE_ERROR)
2895 				return MLX5_TXCMP_CODE_ERROR;
2896 		}
2897 		/* Check whether we have the minimal amount of WQEs. */
2898 		if (unlikely(loc->wqe_free <
2899 			    ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
2900 			return MLX5_TXCMP_CODE_EXIT;
2901 		if (likely(pkts_n > 1))
2902 			rte_prefetch0(*pkts);
2903 		wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2904 		/*
2905 		 * Build eMPW title WQEBB:
2906 		 * - Control Segment, eMPW opcode, zero DS
2907 		 * - Ethernet Segment, no inline
2908 		 */
2909 		mlx5_tx_cseg_init(txq, loc, wqem, 0,
2910 				  MLX5_OPCODE_ENHANCED_MPSW, olx);
2911 		mlx5_tx_eseg_none(txq, loc, wqem,
2912 				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
2913 		dseg = &wqem->dseg[0];
2914 		/* Store the packet length for legacy MPW. */
2915 		if (MLX5_TXOFF_CONFIG(MPW))
2916 			wqem->eseg.mss = rte_cpu_to_be_16
2917 					 (rte_pktmbuf_data_len(loc->mbuf));
2918 		room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE,
2919 			       loc->wqe_free) * MLX5_WQE_SIZE -
2920 					MLX5_WQE_CSEG_SIZE -
2921 					MLX5_WQE_ESEG_SIZE;
2922 		/* Limit the room for legacy MPW sessions for performance. */
2923 		if (MLX5_TXOFF_CONFIG(MPW))
2924 			room = RTE_MIN(room,
2925 				       RTE_MAX(txq->inlen_empw +
2926 					       sizeof(dseg->bcount) +
2927 					       (MLX5_TXOFF_CONFIG(VLAN) ?
2928 					       sizeof(struct rte_vlan_hdr) : 0),
2929 					       MLX5_MPW_INLINE_MAX_PACKETS *
2930 					       MLX5_WQE_DSEG_SIZE));
2931 		/* Build the WQE while we have space, packets and resources. */
2932 		part = room;
2933 		for (;;) {
2934 			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
2935 			uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
2936 			unsigned int tlen;
2937 
2938 			MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
2939 			MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0);
2940 			MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end);
2941 			/*
2942 			 * Some Tx offloads may cause an error if the packet is not
2943 			 * long enough, check against the assumed minimal length.
2944 			 */
2945 			if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
2946 				part -= room;
2947 				if (unlikely(!part))
2948 					return MLX5_TXCMP_CODE_ERROR;
2949 				/*
2950 				 * We have some successfully built
2951 				 * packet Data Segments to send.
2952 				 */
2953 				mlx5_tx_idone_empw(txq, loc, part,
2954 						   slen, wqem, olx);
2955 				return MLX5_TXCMP_CODE_ERROR;
2956 			}
2957 			/* Inline or not inline - that's the Question. */
2958 			if (dlen > txq->inlen_empw ||
2959 			    loc->mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE)
2960 				goto pointer_empw;
2961 			if (MLX5_TXOFF_CONFIG(MPW)) {
2962 				if (dlen > txq->inlen_send)
2963 					goto pointer_empw;
2964 				tlen = dlen;
2965 				if (part == room) {
2966 					/* Open new inline MPW session. */
2967 					tlen += sizeof(dseg->bcount);
2968 					dseg->bcount = RTE_BE32(0);
2969 					dseg = RTE_PTR_ADD
2970 						(dseg, sizeof(dseg->bcount));
2971 				} else {
2972 					/*
2973 					 * Pointer and inline descriptors must not
2974 					 * be intermixed in legacy MPW sessions.
2975 					 */
2976 					if (wqem->dseg[0].bcount)
2977 						break;
2978 				}
2979 			} else {
2980 				tlen = sizeof(dseg->bcount) + dlen;
2981 			}
2982 			/* Inline entire packet, optional VLAN insertion. */
2983 			if (MLX5_TXOFF_CONFIG(VLAN) &&
2984 			    loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
2985 				/*
2986 				 * The packet length has been checked in
2987 				 * mlx5_tx_able_to_empw() and the packet is
2988 				 * guaranteed to fit into the inline length.
2989 				 */
2990 				MLX5_ASSERT((dlen +
2991 					     sizeof(struct rte_vlan_hdr)) <=
2992 					    txq->inlen_empw);
2993 				tlen += sizeof(struct rte_vlan_hdr);
2994 				if (room < tlen)
2995 					break;
2996 				rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
2997 				dseg = mlx5_tx_dseg_vlan(txq, loc, dseg,
2998 							 dptr, dlen, olx);
2999 #ifdef MLX5_PMD_SOFT_COUNTERS
3000 				/* Update sent data bytes counter. */
3001 				slen +=	sizeof(struct rte_vlan_hdr);
3002 #endif
3003 			} else {
3004 				if (room < tlen)
3005 					break;
3006 				rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3007 				dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
3008 							 dptr, dlen, olx);
3009 			}
3010 			if (!MLX5_TXOFF_CONFIG(MPW))
3011 				tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
3012 			MLX5_ASSERT(room >= tlen);
3013 			room -= tlen;
3014 			/*
3015 			 * Packet data are completely inlined,
3016 			 * we can try to free the packet.
3017 			 */
3018 			if (likely(loc->pkts_sent == loc->mbuf_free)) {
3019 				/*
3020 				 * All the packets from the beginning of the burst
3021 				 * are inlined, we can free the mbufs directly
3022 				 * from the original array on tx_burst() exit.
3023 				 */
3024 				loc->mbuf_free++;
3025 				goto next_mbuf;
3026 			}
3027 			/*
3028 			 * In order not to call rte_pktmbuf_free_seg() here,
3029 			 * in the innermost loop (that might be very
3030 			 * expensive) we just save the mbuf in elts.
3031 			 */
3032 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
3033 			loc->elts_free--;
3034 			goto next_mbuf;
3035 pointer_empw:
3036 			/*
3037 			 * Pointer and inline descriptors must not
3038 			 * be intermixed in legacy MPW sessions.
3039 			 */
3040 			if (MLX5_TXOFF_CONFIG(MPW) &&
3041 			    part != room &&
3042 			    wqem->dseg[0].bcount == RTE_BE32(0))
3043 				break;
3044 			/*
3045 			 * Non-inlinable VLAN packets are
3046 			 * processed outside of this routine.
3047 			 */
3048 			MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
3049 			if (MLX5_TXOFF_CONFIG(VLAN))
3050 				MLX5_ASSERT(!(loc->mbuf->ol_flags &
3051 					    RTE_MBUF_F_TX_VLAN));
3052 			rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3053 			mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
3054 			/* We have to store the mbuf in elts. */
3055 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
3056 			loc->elts_free--;
3057 			room -= MLX5_WQE_DSEG_SIZE;
3058 			/* Ring buffer wraparound is checked at the loop end. */
3059 			++dseg;
3060 next_mbuf:
3061 #ifdef MLX5_PMD_SOFT_COUNTERS
3062 			/* Update sent data bytes counter. */
3063 			slen += dlen;
3064 #endif
3065 			loc->pkts_sent++;
3066 			pkts_n--;
3067 			if (unlikely(!pkts_n || !loc->elts_free)) {
3068 				/*
3069 				 * We have no resources/packets to
3070 				 * continue building descriptors.
3071 				 */
3072 				part -= room;
3073 				mlx5_tx_idone_empw(txq, loc, part,
3074 						   slen, wqem, olx);
3075 				return MLX5_TXCMP_CODE_EXIT;
3076 			}
3077 			loc->mbuf = *pkts++;
3078 			if (likely(pkts_n > 1))
3079 				rte_prefetch0(*pkts);
3080 			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
3081 			/*
3082 			 * Unroll the completion code to avoid
3083 			 * returning a variable value - it results in
3084 			 * unoptimized sequential checking in the caller.
3085 			 */
3086 			if (ret == MLX5_TXCMP_CODE_MULTI) {
3087 				part -= room;
3088 				mlx5_tx_idone_empw(txq, loc, part,
3089 						   slen, wqem, olx);
3090 				if (unlikely(!loc->elts_free ||
3091 					     !loc->wqe_free))
3092 					return MLX5_TXCMP_CODE_EXIT;
3093 				return MLX5_TXCMP_CODE_MULTI;
3094 			}
3095 			MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
3096 			if (ret == MLX5_TXCMP_CODE_TSO) {
3097 				part -= room;
3098 				mlx5_tx_idone_empw(txq, loc, part,
3099 						   slen, wqem, olx);
3100 				if (unlikely(!loc->elts_free ||
3101 					     !loc->wqe_free))
3102 					return MLX5_TXCMP_CODE_EXIT;
3103 				return MLX5_TXCMP_CODE_TSO;
3104 			}
3105 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
3106 				part -= room;
3107 				mlx5_tx_idone_empw(txq, loc, part,
3108 						   slen, wqem, olx);
3109 				if (unlikely(!loc->elts_free ||
3110 					     !loc->wqe_free))
3111 					return MLX5_TXCMP_CODE_EXIT;
3112 				return MLX5_TXCMP_CODE_SINGLE;
3113 			}
3114 			if (ret != MLX5_TXCMP_CODE_EMPW) {
3115 				MLX5_ASSERT(false);
3116 				part -= room;
3117 				mlx5_tx_idone_empw(txq, loc, part,
3118 						   slen, wqem, olx);
3119 				return MLX5_TXCMP_CODE_ERROR;
3120 			}
3121 			/* Check if we have minimal room left. */
3122 			nlim--;
3123 			if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE))
3124 				break;
3125 			/*
3126 			 * Check whether packet parameters coincide
3127 			 * within the assumed eMPW batch:
3128 			 * - checksum settings
3129 			 * - metadata value
3130 			 * - software parser settings
3131 			 * - packet length (legacy MPW only)
3132 			 * - scheduling is not required
3133 			 */
3134 			if (!mlx5_tx_match_empw(txq, &wqem->eseg,
3135 						loc, dlen, olx))
3136 				break;
3137 			/* Packet attributes match, continue the same eMPW. */
3138 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
3139 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
3140 		}
3141 		/*
3142 		 * We get here to close an existing eMPW
3143 		 * session and start a new one.
3144 		 */
3145 		MLX5_ASSERT(pkts_n);
3146 		part -= room;
3147 		if (unlikely(!part))
3148 			return MLX5_TXCMP_CODE_EXIT;
3149 		mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx);
3150 		if (unlikely(!loc->elts_free ||
3151 			     !loc->wqe_free))
3152 			return MLX5_TXCMP_CODE_EXIT;
3153 		/* Continue the loop with new eMPW session. */
3154 	}
3155 	MLX5_ASSERT(false);
3156 }
3157 
3158 /**
3159  * The routine sends packets with ordinary MLX5_OPCODE_SEND.
3160  * Data inlining and VLAN insertion are supported.
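 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 * @param loc
 *   Pointer to burst routine local context.
 * @param olx
 *   Configured Tx offloads mask.
 *
 * @return
 *   MLX5_TXCMP_CODE_* status code.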
3161  */
3162 static __rte_always_inline enum mlx5_txcmp_code
3163 mlx5_tx_burst_single_send(struct mlx5_txq_data *__rte_restrict txq,
3164 			  struct rte_mbuf **__rte_restrict pkts,
3165 			  unsigned int pkts_n,
3166 			  struct mlx5_txq_local *__rte_restrict loc,
3167 			  unsigned int olx)
3168 {
3169 	/*
3170 	 * This subroutine is a part of mlx5_tx_burst_single()
3171 	 * and sends single-segment packets with the SEND opcode.
3172 	 */
3173 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3174 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
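	/* Advance past the current packet, it is already loaded into loc->mbuf. */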
3175 	pkts += loc->pkts_sent + 1;
3176 	pkts_n -= loc->pkts_sent;
3177 	for (;;) {
3178 		struct mlx5_wqe *__rte_restrict wqe;
3179 		enum mlx5_txcmp_code ret;
3180 
3181 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
3182 		MLX5_ASSERT(loc->elts_free);
3183 		if (MLX5_TXOFF_CONFIG(TXPP)) {
3184 			enum mlx5_txcmp_code wret;
3185 
3186 			/* Generate WAIT for scheduling if requested. */
3187 			wret = mlx5_tx_schedule_send(txq, loc, 0, olx);
3188 			if (wret == MLX5_TXCMP_CODE_EXIT)
3189 				return MLX5_TXCMP_CODE_EXIT;
3190 			if (wret == MLX5_TXCMP_CODE_ERROR)
3191 				return MLX5_TXCMP_CODE_ERROR;
3192 		}
3193 		if (MLX5_TXOFF_CONFIG(INLINE)) {
3194 			unsigned int inlen, vlan = 0;
3195 
3196 			inlen = rte_pktmbuf_data_len(loc->mbuf);
3197 			if (MLX5_TXOFF_CONFIG(VLAN) &&
3198 			    loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
3199 				vlan = sizeof(struct rte_vlan_hdr);
3200 				inlen += vlan;
3201 			}
3202 			/*
3203 			 * If inlining is enabled at configuration time
3204 			 * the limit must be not less than the minimal size.
3205 			 * Otherwise we would need an extra check for data
3206 			 * size to avoid crashes due to length overflow.
3207 			 */
3208 			MLX5_ASSERT(txq->inlen_send >=
3209 				    MLX5_ESEG_MIN_INLINE_SIZE);
3210 			if (inlen <= txq->inlen_send) {
3211 				unsigned int seg_n, wqe_n;
3212 
3213 				rte_prefetch0(rte_pktmbuf_mtod
3214 						(loc->mbuf, uint8_t *));
3215 				/* Check against minimal length. */
3216 				if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
3217 					return MLX5_TXCMP_CODE_ERROR;
3218 				if (loc->mbuf->ol_flags &
3219 				    RTE_MBUF_F_TX_DYNF_NOINLINE) {
3220 					/*
3221 					 * The hint flag not to inline packet
3222 					 * data is set. Check whether we can
3223 					 * follow the hint.
3224 					 */
3225 					if ((!MLX5_TXOFF_CONFIG(EMPW) &&
3226 					      txq->inlen_mode) ||
3227 					    (MLX5_TXOFF_CONFIG(MPW) &&
3228 					     txq->inlen_mode)) {
3229 						if (inlen <= txq->inlen_send)
3230 							goto single_inline;
3231 						/*
3232 						 * The hardware requires the
3233 						 * minimal inline data header.
3234 						 */
3235 						goto single_min_inline;
3236 					}
3237 					if (MLX5_TXOFF_CONFIG(VLAN) &&
3238 					    vlan && !txq->vlan_en) {
3239 						/*
3240 						 * We must insert VLAN tag
3241 						 * by software means.
3242 						 */
3243 						goto single_part_inline;
3244 					}
3245 					goto single_no_inline;
3246 				}
3247 single_inline:
3248 				/*
3249 				 * Completely inlined packet data WQE:
3250 				 * - Control Segment, SEND opcode
3251 				 * - Ethernet Segment, no VLAN insertion
3252 				 * - Data inlined, VLAN optionally inserted
3253 				 * - Alignment to MLX5_WSEG_SIZE
3254 				 * Have to estimate the amount of WQEBBs.
3255 				 */
3256 				seg_n = (inlen + 3 * MLX5_WSEG_SIZE -
3257 					 MLX5_ESEG_MIN_INLINE_SIZE +
3258 					 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3259 				/* Check if there are enough WQEBBs. */
3260 				wqe_n = (seg_n + 3) / 4;
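				/* The (seg_n + 3) / 4 conversion reflects four WSEGs per WQEBB. */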
3261 				if (wqe_n > loc->wqe_free)
3262 					return MLX5_TXCMP_CODE_EXIT;
3263 				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3264 				loc->wqe_last = wqe;
3265 				mlx5_tx_cseg_init(txq, loc, wqe, seg_n,
3266 						  MLX5_OPCODE_SEND, olx);
3267 				rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3268 				mlx5_tx_eseg_data(txq, loc, wqe,
3269 						  vlan, inlen, 0, olx);
3270 				txq->wqe_ci += wqe_n;
3271 				loc->wqe_free -= wqe_n;
3272 				/*
3273 				 * Packet data are completely inlined,
3274 				 * free the packet immediately.
3275 				 */
3276 				rte_pktmbuf_free_seg(loc->mbuf);
3277 			} else if ((!MLX5_TXOFF_CONFIG(EMPW) ||
3278 				     MLX5_TXOFF_CONFIG(MPW)) &&
3279 					txq->inlen_mode) {
3280 				/*
3281 				 * If minimal inlining is requested the eMPW
3282 				 * feature should be disabled because the data
3283 				 * is inlined into the Ethernet Segment, which
3284 				 * cannot contain inlined data for eMPW since
3285 				 * the segment is shared by all packets.
3286 				 */
3287 				struct mlx5_wqe_dseg *__rte_restrict dseg;
3288 				unsigned int ds;
3289 				uint8_t *dptr;
3290 
3291 				/*
3292 				 * The inline-mode settings require
3293 				 * inlining the specified amount of
3294 				 * data bytes into the Ethernet Segment.
3295 				 * We should check the free space in the
3296 				 * WQE ring buffer for this partial inline.
3297 				 */
3298 single_min_inline:
3299 				MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode);
3300 				MLX5_ASSERT(inlen > txq->inlen_mode);
3301 				MLX5_ASSERT(txq->inlen_mode >=
3302 					    MLX5_ESEG_MIN_INLINE_SIZE);
3303 				/*
3304 				 * Check whether there are enough free WQEBBs:
3305 				 * - Control Segment
3306 				 * - Ethernet Segment
3307 				 * - First Segment of inlined Ethernet data
3308 				 * - ... data continued ...
3309 				 * - Finishing Data Segment of pointer type
3310 				 */
3311 				ds = (MLX5_WQE_CSEG_SIZE +
3312 				      MLX5_WQE_ESEG_SIZE +
3313 				      MLX5_WQE_DSEG_SIZE +
3314 				      txq->inlen_mode -
3315 				      MLX5_ESEG_MIN_INLINE_SIZE +
3316 				      MLX5_WQE_DSEG_SIZE +
3317 				      MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3318 				if (loc->wqe_free < ((ds + 3) / 4))
3319 					return MLX5_TXCMP_CODE_EXIT;
3320 				/*
3321 				 * Build the ordinary SEND WQE:
3322 				 * - Control Segment
3323 				 * - Ethernet Segment, inline inlen_mode bytes
3324 				 * - Data Segment of pointer type
3325 				 */
3326 				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3327 				loc->wqe_last = wqe;
3328 				mlx5_tx_cseg_init(txq, loc, wqe, ds,
3329 						  MLX5_OPCODE_SEND, olx);
3330 				rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3331 				dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan,
3332 							 txq->inlen_mode,
3333 							 0, olx);
3334 				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
3335 				       txq->inlen_mode - vlan;
3336 				inlen -= txq->inlen_mode;
3337 				mlx5_tx_dseg_ptr(txq, loc, dseg,
3338 						 dptr, inlen, olx);
3339 				/*
3340 				 * WQE is built, update the loop parameters
3341 				 * and go to the next packet.
3342 				 */
3343 				txq->wqe_ci += (ds + 3) / 4;
3344 				loc->wqe_free -= (ds + 3) / 4;
3345 				/* We have to store mbuf in elts.*/
3346 				MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
3347 				txq->elts[txq->elts_head++ & txq->elts_m] =
3348 						loc->mbuf;
3349 				--loc->elts_free;
3350 			} else {
3351 				uint8_t *dptr;
3352 				unsigned int dlen;
3353 
3354 				/*
3355 				 * Partially inlined packet data WQE, we have
3356 				 * some space in the title WQEBB, we can fill it
3357 				 * with some packet data. It takes one WQEBB,
3358 				 * which is available, no extra space check:
3359 				 * - Control Segment, SEND opcode
3360 				 * - Ethernet Segment, no VLAN insertion
3361 				 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data
3362 				 * - Data Segment, pointer type
3363 				 *
3364 				 * We also get here if VLAN insertion is not
3365 				 * supported by HW but inlining is enabled.
3366 				 */
3367 single_part_inline:
3368 				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3369 				loc->wqe_last = wqe;
3370 				mlx5_tx_cseg_init(txq, loc, wqe, 4,
3371 						  MLX5_OPCODE_SEND, olx);
3372 				rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3373 				mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx);
3374 				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
3375 				       MLX5_ESEG_MIN_INLINE_SIZE - vlan;
3376 				/*
3377 				 * The length check is performed above, by
3378 				 * comparing with txq->inlen_send. We should
3379 				 * not get overflow here.
3380 				 */
3381 				MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
3382 				dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE;
3383 				mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1],
3384 						 dptr, dlen, olx);
3385 				++txq->wqe_ci;
3386 				--loc->wqe_free;
3387 				/* We have to store mbuf in elts.*/
3388 				MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
3389 				txq->elts[txq->elts_head++ & txq->elts_m] =
3390 						loc->mbuf;
3391 				--loc->elts_free;
3392 			}
3393 #ifdef MLX5_PMD_SOFT_COUNTERS
3394 			/* Update sent data bytes counter. */
3395 			txq->stats.obytes += vlan +
3396 					rte_pktmbuf_data_len(loc->mbuf);
3397 #endif
3398 		} else {
3399 			/*
3400 			 * No inlining at all, which means that saving CPU
3401 			 * cycles was prioritized at configuration time, so we
3402 			 * should not copy any packet data into the WQE.
3403 			 *
3404 			 * SEND WQE, one WQEBB:
3405 			 * - Control Segment, SEND opcode
3406 			 * - Ethernet Segment, optional VLAN, no inline
3407 			 * - Data Segment, pointer type
3408 			 */
3409 single_no_inline:
3410 			wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3411 			loc->wqe_last = wqe;
3412 			mlx5_tx_cseg_init(txq, loc, wqe, 3,
3413 					  MLX5_OPCODE_SEND, olx);
3414 			rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3415 			mlx5_tx_eseg_none(txq, loc, wqe, olx);
3416 			mlx5_tx_dseg_ptr
3417 				(txq, loc, &wqe->dseg[0],
3418 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
3419 				 rte_pktmbuf_data_len(loc->mbuf), olx);
3420 			++txq->wqe_ci;
3421 			--loc->wqe_free;
3422 			/*
3423 			 * We should not store the mbuf pointer in elts
3424 			 * if no inlining is configured, this is done
3425 			 * by the calling routine in a batch copy.
3426 			 */
3427 			if (MLX5_TXOFF_CONFIG(INLINE))
3428 				txq->elts[txq->elts_head++ & txq->elts_m] =
3429 							loc->mbuf;
3430 			--loc->elts_free;
3431 #ifdef MLX5_PMD_SOFT_COUNTERS
3432 			/* Update sent data bytes counter. */
3433 			txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf);
3434 			if (MLX5_TXOFF_CONFIG(VLAN) &&
3435 			    loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
3436 				txq->stats.obytes +=
3437 					sizeof(struct rte_vlan_hdr);
3438 #endif
3439 		}
3440 		++loc->pkts_sent;
3441 		--pkts_n;
3442 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
3443 			return MLX5_TXCMP_CODE_EXIT;
3444 		loc->mbuf = *pkts++;
3445 		if (pkts_n > 1)
3446 			rte_prefetch0(*pkts);
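		/* Decide how the next packet should be sent; leave the SEND loop if it is not a plain single send. */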
3447 		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
3448 		if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
3449 			return ret;
3450 	}
3451 	MLX5_ASSERT(false);
3452 }
3453 
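/**
 * Dispatch single-segment packets without TSO: send them via the eMPW
 * datapath when the packets are eligible, falling back to the ordinary
 * SEND datapath otherwise.
 */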
3454 static __rte_always_inline enum mlx5_txcmp_code
3455 mlx5_tx_burst_single(struct mlx5_txq_data *__rte_restrict txq,
3456 		     struct rte_mbuf **__rte_restrict pkts,
3457 		     unsigned int pkts_n,
3458 		     struct mlx5_txq_local *__rte_restrict loc,
3459 		     unsigned int olx)
3460 {
3461 	enum mlx5_txcmp_code ret;
3462 
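	/* Decide whether the first packet goes to the eMPW or the ordinary SEND path. */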
3463 	ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
3464 	if (ret == MLX5_TXCMP_CODE_SINGLE)
3465 		goto ordinary_send;
3466 	MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW);
3467 	for (;;) {
3468 		/* Optimize for inline/no inline eMPW send. */
3469 		ret = (MLX5_TXOFF_CONFIG(INLINE)) ?
3470 			mlx5_tx_burst_empw_inline
3471 				(txq, pkts, pkts_n, loc, olx) :
3472 			mlx5_tx_burst_empw_simple
3473 				(txq, pkts, pkts_n, loc, olx);
3474 		if (ret != MLX5_TXCMP_CODE_SINGLE)
3475 			return ret;
3476 		/* The resources to send one packet should remain. */
3477 		MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3478 ordinary_send:
3479 		ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
3480 		MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE);
3481 		if (ret != MLX5_TXCMP_CODE_EMPW)
3482 			return ret;
3483 		/* The resources to send one packet should remain. */
3484 		MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3485 	}
3486 }
3487 
3488 /**
3489  * DPDK Tx callback template. This is a configured template used to generate
3490  * routines optimized for the specified offload setup.
3491  * One of these generated functions is chosen at SQ configuration time.
3492  *
3493  * @param txq
3494  *   Generic pointer to TX queue structure.
3495  * @param[in] pkts
3496  *   Packets to transmit.
3497  * @param pkts_n
3498  *   Number of packets in array.
3499  * @param olx
3500  *   Configured offloads mask, representing the bits of MLX5_TXOFF_CONFIG_xxx
3501  *   values. Should be static to take advantage of the compile-time static
3502  *   configuration.
3503  *
3504  * @return
3505  *   Number of packets successfully transmitted (<= pkts_n).
3506  */
3507 static __rte_always_inline uint16_t
3508 mlx5_tx_burst_tmpl(struct mlx5_txq_data *__rte_restrict txq,
3509 		   struct rte_mbuf **__rte_restrict pkts,
3510 		   uint16_t pkts_n,
3511 		   unsigned int olx)
3512 {
3513 	struct mlx5_txq_local loc;
3514 	enum mlx5_txcmp_code ret;
3515 	unsigned int part;
3516 
3517 	MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
3518 	MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
3519 	if (unlikely(!pkts_n))
3520 		return 0;
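	/* Count leading fully inlined packets whose mbufs can be freed directly from the pkts array on exit. */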
3521 	if (MLX5_TXOFF_CONFIG(INLINE))
3522 		loc.mbuf_free = 0;
3523 	loc.pkts_sent = 0;
3524 	loc.pkts_copy = 0;
3525 	loc.wqe_last = NULL;
3526 
3527 send_loop:
3528 	loc.pkts_loop = loc.pkts_sent;
3529 	/*
3530 	 * Check if there are some CQEs, if any:
3531 	 * - process any encountered errors
3532 	 * - process the completed WQEs
3533 	 * - free related mbufs
3534 	 * - doorbell the NIC about processed CQEs
3535 	 */
3536 	rte_prefetch0(*(pkts + loc.pkts_sent));
3537 	mlx5_tx_handle_completion(txq, olx);
3538 	/*
3539 	 * Calculate the number of available resources - elts and WQEs.
3540 	 * There are two different possible scenarios:
3541 	 * - no data inlining into WQEs, one WQEBB may contain up to
3542 	 *   four packets, in this case elts become the scarce resource
3543 	 * - data inlining into WQEs, one packet may require multiple
3544 	 *   WQEBBs, the WQEs become the limiting factor.
3545 	 */
3546 	MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
3547 	loc.elts_free = txq->elts_s -
3548 				(uint16_t)(txq->elts_head - txq->elts_tail);
3549 	MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
3550 	loc.wqe_free = txq->wqe_s -
3551 				(uint16_t)(txq->wqe_ci - txq->wqe_pi);
3552 	if (unlikely(!loc.elts_free || !loc.wqe_free))
3553 		goto burst_exit;
3554 	for (;;) {
3555 		/*
3556 		 * Fetch the packet from the array. Usually this is the first
3557 		 * packet in a series of multi/single segment packets.
3558 		 */
3559 		loc.mbuf = *(pkts + loc.pkts_sent);
3560 		/* Dedicated branch for multi-segment packets. */
3561 		if (MLX5_TXOFF_CONFIG(MULTI) &&
3562 		    unlikely(NB_SEGS(loc.mbuf) > 1)) {
3563 			/*
3564 			 * Multi-segment packet encountered.
3565 			 * Hardware is able to process it only
3566 			 * with SEND/TSO opcodes, one packet
3567 			 * per WQE, do it in a dedicated routine.
3568 			 */
3569 enter_send_multi:
3570 			MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy);
3571 			part = loc.pkts_sent - loc.pkts_copy;
3572 			if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
3573 				/*
3574 				 * There are some single-segment mbufs not
3575 				 * stored in elts. The mbufs must be in the
3576 				 * same order as WQEs, so we must copy the
3577 				 * mbufs to elts here, before the mbufs of the
3578 				 * coming multi-segment packet are appended.
3579 				 */
3580 				mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy,
3581 						  part, olx);
3582 				loc.pkts_copy = loc.pkts_sent;
3583 			}
3584 			MLX5_ASSERT(pkts_n > loc.pkts_sent);
3585 			ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
3586 			if (!MLX5_TXOFF_CONFIG(INLINE))
3587 				loc.pkts_copy = loc.pkts_sent;
3588 			/*
3589 			 * These return code checks are supposed
3590 			 * to be optimized out due to routine inlining.
3591 			 */
3592 			if (ret == MLX5_TXCMP_CODE_EXIT) {
3593 				/*
3594 				 * The routine returns this code when
3595 				 * all packets are sent or there are not
3596 				 * enough resources to complete the request.
3597 				 */
3598 				break;
3599 			}
3600 			if (ret == MLX5_TXCMP_CODE_ERROR) {
3601 				/*
3602 				 * The routine returns this code when some error
3603 				 * in the incoming packet format occurred.
3604 				 */
3605 				txq->stats.oerrors++;
3606 				break;
3607 			}
3608 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
3609 				/*
3610 				 * The single-segment packet was encountered
3611 				 * in the array, try to send it in the
3612 				 * best optimized way, possibly engaging eMPW.
3613 				 */
3614 				goto enter_send_single;
3615 			}
3616 			if (MLX5_TXOFF_CONFIG(TSO) &&
3617 			    ret == MLX5_TXCMP_CODE_TSO) {
3618 				/*
3619 				 * The single-segment TSO packet was
3620 				 * encountered in the array.
3621 				 */
3622 				goto enter_send_tso;
3623 			}
3624 			/* We must not get here. Something is going wrong. */
3625 			MLX5_ASSERT(false);
3626 			txq->stats.oerrors++;
3627 			break;
3628 		}
3629 		/* Dedicated branch for single-segment TSO packets. */
3630 		if (MLX5_TXOFF_CONFIG(TSO) &&
3631 		    unlikely(loc.mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
3632 			/*
3633 			 * TSO might require a special way of inlining
3634 			 * (dedicated parameters) and is sent with the
3635 			 * MLX5_OPCODE_TSO opcode only, so handle this
3636 			 * in a dedicated branch.
3637 			 */
3638 enter_send_tso:
3639 			MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1);
3640 			MLX5_ASSERT(pkts_n > loc.pkts_sent);
3641 			ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
3642 			/*
3643 			 * These return code checks are supposed
3644 			 * to be optimized out due to routine inlining.
3645 			 */
3646 			if (ret == MLX5_TXCMP_CODE_EXIT)
3647 				break;
3648 			if (ret == MLX5_TXCMP_CODE_ERROR) {
3649 				txq->stats.oerrors++;
3650 				break;
3651 			}
3652 			if (ret == MLX5_TXCMP_CODE_SINGLE)
3653 				goto enter_send_single;
3654 			if (MLX5_TXOFF_CONFIG(MULTI) &&
3655 			    ret == MLX5_TXCMP_CODE_MULTI) {
3656 				/*
3657 				 * The multi-segment packet was
3658 				 * encountered in the array.
3659 				 */
3660 				goto enter_send_multi;
3661 			}
3662 			/* We must not get here. Something is going wrong. */
3663 			MLX5_ASSERT(false);
3664 			txq->stats.oerrors++;
3665 			break;
3666 		}
3667 		/*
3668 		 * The dedicated branch for the single-segment packets
3669 		 * without TSO. Often these can be sent using
3670 		 * MLX5_OPCODE_EMPW with multiple packets in one WQE.
3671 		 * The routine builds the WQEs until it encounters
3672 		 * a TSO or multi-segment packet (if these
3673 		 * offloads are requested at SQ configuration time).
3674 		 */
3675 enter_send_single:
3676 		MLX5_ASSERT(pkts_n > loc.pkts_sent);
3677 		ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
3678 		/*
3679 		 * These return code checks are supposed
3680 		 * to be optimized out due to routine inlining.
3681 		 */
3682 		if (ret == MLX5_TXCMP_CODE_EXIT)
3683 			break;
3684 		if (ret == MLX5_TXCMP_CODE_ERROR) {
3685 			txq->stats.oerrors++;
3686 			break;
3687 		}
3688 		if (MLX5_TXOFF_CONFIG(MULTI) &&
3689 		    ret == MLX5_TXCMP_CODE_MULTI) {
3690 			/*
3691 			 * The multi-segment packet was
3692 			 * encountered in the array.
3693 			 */
3694 			goto enter_send_multi;
3695 		}
3696 		if (MLX5_TXOFF_CONFIG(TSO) &&
3697 		    ret == MLX5_TXCMP_CODE_TSO) {
3698 			/*
3699 			 * The single-segment TSO packet was
3700 			 * encountered in the array.
3701 			 */
3702 			goto enter_send_tso;
3703 		}
3704 		/* We must not get here. Something is going wrong. */
3705 		MLX5_ASSERT(false);
3706 		txq->stats.oerrors++;
3707 		break;
3708 	}
3709 	/*
3710 	 * Main Tx loop is completed, do the rest:
3711 	 * - set completion request if thresholds are reached
3712 	 * - doorbell the hardware
3713 	 * - copy the rest of mbufs to elts (if any)
3714 	 */
3715 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) ||
3716 		    loc.pkts_sent >= loc.pkts_copy);
3717 	/* Take a shortcut if nothing is sent. */
3718 	if (unlikely(loc.pkts_sent == loc.pkts_loop))
3719 		goto burst_exit;
3720 	/* Request CQE generation if limits are reached. */
3721 	if (MLX5_TXOFF_CONFIG(TXPP) && __rte_trace_point_fp_is_enabled())
3722 		mlx5_tx_request_completion_trace(txq, &loc, olx);
3723 	else
3724 		mlx5_tx_request_completion(txq, &loc, olx);
3725 	/*
3726 	 * Ring QP doorbell immediately after WQE building completion
3727 	 * to improve latencies. The purely software-related data treatment
3728 	 * can be completed after the doorbell. Tx CQEs for this SQ are
3729 	 * processed in this thread only by polling.
3730 	 *
3731 	 * The rdma core library can map doorbell register in two ways,
3732 	 * depending on the environment variable "MLX5_SHUT_UP_BF":
3733 	 *
3734 	 * - as regular cached memory, the variable is either missing or
3735 	 *   set to zero. This type of mapping may cause significant
3736 	 *   doorbell register write latency and requires an explicit memory
3737 	 *   write barrier to mitigate this issue and prevent write combining.
3738 	 *
3739 	 * - as non-cached memory, the variable is present and set to a non-zero
3740 	 *   value. This type of mapping may cause a performance impact under
3741 	 *   heavy load conditions but the explicit write memory barrier is
3742 	 *   not required and it may improve core performance.
3743 	 *
3744 	 * - the legacy behaviour (prior to the 19.08 release) was to use some
3745 	 *   heuristics to decide whether the write memory barrier should
3746 	 *   be performed. This behavior is selected by specifying
3747 	 *   tx_db_nc=2; the write barrier is skipped if the application provides
3748 	 *   the full recommended burst of packets, assuming the next
3749 	 *   packets are coming and the write barrier will be issued on
3750 	 *   the next burst (after descriptor writing, at least).
3751 	 */
3752 	mlx5_doorbell_ring(mlx5_tx_bfreg(txq),
3753 			   *(volatile uint64_t *)loc.wqe_last, txq->wqe_ci,
3754 			   txq->qp_db, !txq->db_nc &&
3755 			   (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST));
3756 	/* Not all of the mbufs may be stored into elts yet. */
3757 	part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy;
3758 	if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
3759 		/*
3760 		 * There are some single-segment mbufs not stored in elts.
3761 		 * This can happen only if the last packet was single-segment.
3762 		 * The copying is gathered into one place because it is
3763 		 * a good opportunity to optimize it with SIMD.
3764 		 * Unfortunately, if inlining is enabled, gaps in the pointer
3765 		 * array may appear due to early freeing of the inlined mbufs.
3766 		 */
3767 		mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
3768 		loc.pkts_copy = loc.pkts_sent;
3769 	}
3770 	MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
3771 	MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
3772 	if (pkts_n > loc.pkts_sent) {
3773 		/*
3774 		 * If the burst size is large there might be not enough CQEs
3775 		 * fetched from the completion queue and not enough resources
3776 		 * freed to send all the packets.
3777 		 */
3778 		goto send_loop;
3779 	}
3780 burst_exit:
3781 #ifdef MLX5_PMD_SOFT_COUNTERS
3782 	/* Increment sent packets counter. */
3783 	txq->stats.opackets += loc.pkts_sent;
3784 #endif
3785 	if (MLX5_TXOFF_CONFIG(INLINE) && loc.mbuf_free)
3786 		__mlx5_tx_free_mbuf(txq, pkts, loc.mbuf_free, olx);
3787 	/* Trace productive bursts only. */
3788 	if (__rte_trace_point_fp_is_enabled() && loc.pkts_sent)
3789 		rte_pmd_mlx5_trace_tx_exit(loc.pkts_sent, pkts_n);
3790 	return loc.pkts_sent;
3791 }
3792 
3793 /**
3794  * Check whether given TxQ is external.
3795  *
3796  * @param dev
3797  *   Pointer to Ethernet device.
3798  * @param queue_idx
3799  *   Tx queue index.
3800  *
3801  * @return
3802  *   True if it is an external TxQ, otherwise false.
3803  */
3804 static __rte_always_inline bool
3805 mlx5_is_external_txq(struct rte_eth_dev *dev, uint16_t queue_idx)
3806 {
3807 	struct mlx5_priv *priv = dev->data->dev_private;
3808 	struct mlx5_external_q *txq;
3809 
3810 	if (!priv->ext_txqs || queue_idx < MLX5_EXTERNAL_TX_QUEUE_ID_MIN)
3811 		return false;
3812 	txq = &priv->ext_txqs[queue_idx - MLX5_EXTERNAL_TX_QUEUE_ID_MIN];
3813 	return !!rte_atomic_load_explicit(&txq->refcnt, rte_memory_order_relaxed);
3814 }
3815 
3816 #endif /* RTE_PMD_MLX5_TX_H_ */
3817