xref: /dpdk/drivers/net/mlx5/mlx5_tx.h (revision 02932480ae82d7ed3c207f02cc40b508cdda6ded)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2021 6WIND S.A.
3  * Copyright 2021 Mellanox Technologies, Ltd
4  */
5 
6 #ifndef RTE_PMD_MLX5_TX_H_
7 #define RTE_PMD_MLX5_TX_H_
8 
9 #include <stdint.h>
10 #include <sys/queue.h>
11 
12 #include <rte_mbuf.h>
13 #include <rte_mempool.h>
14 #include <rte_common.h>
15 #include <rte_spinlock.h>
16 #include <rte_trace_point.h>
17 
18 #include <mlx5_common.h>
19 #include <mlx5_common_mr.h>
20 
21 #include "mlx5.h"
22 #include "mlx5_autoconf.h"
23 #include "mlx5_rxtx.h"
24 #include "mlx5_trace.h"
25 
26 /* TX burst subroutines return codes. */
27 enum mlx5_txcmp_code {
28 	MLX5_TXCMP_CODE_EXIT = 0,
29 	MLX5_TXCMP_CODE_ERROR,
30 	MLX5_TXCMP_CODE_SINGLE,
31 	MLX5_TXCMP_CODE_MULTI,
32 	MLX5_TXCMP_CODE_TSO,
33 	MLX5_TXCMP_CODE_EMPW,
34 };
35 
36 /*
37  * These defines are used to configure Tx burst routine option set supported
38  * at compile time. The not specified options are optimized out due to if
39  * conditions can be explicitly calculated at compile time.
40  * The offloads with bigger runtime check (require more CPU cycles toskip)
41  * overhead should have the bigger index - this is needed to select the better
42  * matching routine function if no exact match and some offloads are not
43  * actually requested.
44  */
45 #define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/
46 #define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/
47 #define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/
48 #define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
49 #define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
50 #define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/
51 #define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
52 #define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/
53 #define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported.*/
54 #define MLX5_TXOFF_CONFIG_TXPP (1u << 10) /* Scheduling on timestamp.*/
55 
56 /* The most common offloads groups. */
57 #define MLX5_TXOFF_CONFIG_NONE 0
58 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \
59 				MLX5_TXOFF_CONFIG_TSO | \
60 				MLX5_TXOFF_CONFIG_SWP | \
61 				MLX5_TXOFF_CONFIG_CSUM | \
62 				MLX5_TXOFF_CONFIG_INLINE | \
63 				MLX5_TXOFF_CONFIG_VLAN | \
64 				MLX5_TXOFF_CONFIG_METADATA)
65 
66 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask)
67 
68 #define MLX5_TXOFF_PRE_DECL(func) \
69 uint16_t mlx5_tx_burst_##func(void *txq, \
70 			      struct rte_mbuf **pkts, \
71 			      uint16_t pkts_n)
72 
73 #define MLX5_TXOFF_DECL(func, olx) \
74 uint16_t mlx5_tx_burst_##func(void *txq, \
75 			      struct rte_mbuf **pkts, \
76 			      uint16_t pkts_n) \
77 { \
78 	return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \
79 		    pkts, pkts_n, (olx)); \
80 }
81 
82 /* Mbuf dynamic flag offset for inline. */
83 extern uint64_t rte_net_mlx5_dynf_inline_mask;
84 #define RTE_MBUF_F_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
85 
86 extern alignas(RTE_CACHE_LINE_SIZE) uint32_t mlx5_ptype_table[];
87 extern alignas(RTE_CACHE_LINE_SIZE) uint8_t mlx5_cksum_table[1 << 10];
88 extern alignas(RTE_CACHE_LINE_SIZE) uint8_t mlx5_swp_types_table[1 << 10];
89 
90 struct mlx5_txq_stats {
91 #ifdef MLX5_PMD_SOFT_COUNTERS
92 	uint64_t opackets; /**< Total of successfully sent packets. */
93 	uint64_t obytes; /**< Total of successfully sent bytes. */
94 #endif
95 	uint64_t oerrors; /**< Total number of failed transmitted packets. */
96 };
97 
98 /* TX queue send local data. */
99 __extension__
100 struct mlx5_txq_local {
101 	struct mlx5_wqe *wqe_last; /* last sent WQE pointer. */
102 	struct rte_mbuf *mbuf; /* first mbuf to process. */
103 	uint16_t pkts_copy; /* packets copied to elts. */
104 	uint16_t pkts_sent; /* packets sent. */
105 	uint16_t pkts_loop; /* packets sent on loop entry. */
106 	uint16_t elts_free; /* available elts remain. */
107 	uint16_t wqe_free; /* available wqe remain. */
108 	uint16_t mbuf_off; /* data offset in current mbuf. */
109 	uint16_t mbuf_nseg; /* number of remaining mbuf. */
110 	uint16_t mbuf_free; /* number of inline mbufs to free. */
111 };
112 
113 /* TX queue descriptor. */
114 __extension__
115 struct __rte_cache_aligned mlx5_txq_data {
116 	uint16_t elts_head; /* Current counter in (*elts)[]. */
117 	uint16_t elts_tail; /* Counter of first element awaiting completion. */
118 	uint16_t elts_comp; /* elts index since last completion request. */
119 	uint16_t elts_s; /* Number of mbuf elements. */
120 	uint16_t elts_m; /* Mask for mbuf elements indices. */
121 	/* Fields related to elts mbuf storage. */
122 	uint16_t wqe_ci; /* Consumer index for work queue. */
123 	uint16_t wqe_pi; /* Producer index for work queue. */
124 	uint16_t wqe_s; /* Number of WQ elements. */
125 	uint16_t wqe_m; /* Mask Number for WQ elements. */
126 	uint16_t wqe_comp; /* WQE index since last completion request. */
127 	uint16_t wqe_thres; /* WQE threshold to request completion in CQ. */
128 	/* WQ related fields. */
129 	uint16_t cq_ci; /* Consumer index for completion queue. */
130 	uint16_t cq_pi; /* Production index for completion queue. */
131 	uint16_t cqe_s; /* Number of CQ elements. */
132 	uint16_t cqe_m; /* Mask for CQ indices. */
133 	/* CQ related fields. */
134 	uint16_t elts_n:4; /* elts[] length (in log2). */
135 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
136 	uint16_t wqe_n:4; /* Number of WQ elements (in log2). */
137 	uint16_t tso_en:1; /* When set hardware TSO is enabled. */
138 	uint16_t tunnel_en:1;
139 	/* When set TX offload for tunneled packets are supported. */
140 	uint16_t swp_en:1; /* Whether SW parser is enabled. */
141 	uint16_t vlan_en:1; /* VLAN insertion in WQE is supported. */
142 	uint16_t db_nc:1; /* Doorbell mapped to non-cached region. */
143 	uint16_t db_heu:1; /* Doorbell heuristic write barrier. */
144 	uint16_t rt_timestamp:1; /* Realtime timestamp format. */
145 	uint16_t wait_on_time:1; /* WQE with timestamp is supported. */
146 	uint16_t fast_free:1; /* mbuf fast free on Tx is enabled. */
147 	uint16_t inlen_send; /* Ordinary send data inline size. */
148 	uint16_t inlen_empw; /* eMPW max packet size to inline. */
149 	uint16_t inlen_mode; /* Minimal data length to inline. */
150 	uint8_t tx_aggr_affinity; /* TxQ affinity configuration. */
151 	uint32_t qp_num_8s; /* QP number shifted by 8. */
152 	uint64_t offloads; /* Offloads for Tx Queue. */
153 	struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
154 	struct mlx5_wqe *wqes; /* Work queue. */
155 	struct mlx5_wqe *wqes_end; /* Work queue array limit. */
156 #ifdef RTE_LIBRTE_MLX5_DEBUG
157 	uint32_t *fcqs; /* Free completion queue (debug extended). */
158 #else
159 	uint16_t *fcqs; /* Free completion queue. */
160 #endif
161 	volatile struct mlx5_cqe *cqes; /* Completion queue. */
162 	volatile uint32_t *qp_db; /* Work queue doorbell. */
163 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
164 	uint16_t port_id; /* Port ID of device. */
165 	uint16_t idx; /* Queue index. */
166 	uint64_t rt_timemask; /* Scheduling timestamp mask. */
167 	uint64_t ts_mask; /* Timestamp flag dynamic mask. */
168 	uint64_t ts_last; /* Last scheduled timestamp. */
169 	int32_t ts_offset; /* Timestamp field dynamic offset. */
170 	struct mlx5_dev_ctx_shared *sh; /* Shared context. */
171 	struct mlx5_txq_stats stats; /* TX queue counters. */
172 	struct mlx5_txq_stats stats_reset; /* stats on last reset. */
173 	struct mlx5_uar_data uar_data;
174 	struct rte_mbuf *elts[];
175 	/* Storage for queued packets, must be the last field. */
176 };
177 
178 /* TX queue control descriptor. */
179 __extension__
180 struct mlx5_txq_ctrl {
181 	LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */
182 	RTE_ATOMIC(uint32_t) refcnt; /* Reference counter. */
183 	unsigned int socket; /* CPU socket ID for allocations. */
184 	bool is_hairpin; /* Whether TxQ type is Hairpin. */
185 	unsigned int max_inline_data; /* Max inline data. */
186 	unsigned int max_tso_header; /* Max TSO header size. */
187 	struct mlx5_txq_obj *obj; /* Verbs/DevX queue object. */
188 	struct mlx5_priv *priv; /* Back pointer to private data. */
189 	off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
190 	uint16_t dump_file_n; /* Number of dump files. */
191 	struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */
192 	uint32_t hairpin_status; /* Hairpin binding status. */
193 	struct mlx5_txq_data txq; /* Data path structure. */
194 	/* Must be the last field in the structure, contains elts[]. */
195 };
196 
197 /* mlx5_txq.c */
198 
199 int mlx5_tx_queue_start(struct rte_eth_dev *dev, uint16_t queue_id);
200 int mlx5_tx_queue_stop(struct rte_eth_dev *dev, uint16_t queue_id);
201 int mlx5_tx_queue_start_primary(struct rte_eth_dev *dev, uint16_t queue_id);
202 int mlx5_tx_queue_stop_primary(struct rte_eth_dev *dev, uint16_t queue_id);
203 int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
204 			unsigned int socket, const struct rte_eth_txconf *conf);
205 int mlx5_tx_hairpin_queue_setup
206 	(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
207 	 const struct rte_eth_hairpin_conf *hairpin_conf);
208 void mlx5_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid);
209 int mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd);
210 void mlx5_tx_uar_uninit_secondary(struct rte_eth_dev *dev);
211 int mlx5_txq_obj_verify(struct rte_eth_dev *dev);
212 struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
213 				   uint16_t desc, unsigned int socket,
214 				   const struct rte_eth_txconf *conf);
215 struct mlx5_txq_ctrl *mlx5_txq_hairpin_new
216 	(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
217 	 const struct rte_eth_hairpin_conf *hairpin_conf);
218 struct mlx5_txq_ctrl *mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx);
219 int mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx);
220 int mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx);
221 int mlx5_txq_verify(struct rte_eth_dev *dev);
222 int mlx5_txq_get_sqn(struct mlx5_txq_ctrl *txq);
223 void txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl);
224 void txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl);
225 uint64_t mlx5_get_tx_port_offloads(struct rte_eth_dev *dev);
226 void mlx5_txq_dynf_timestamp_set(struct rte_eth_dev *dev);
227 int mlx5_count_aggr_ports(struct rte_eth_dev *dev);
228 int mlx5_map_aggr_tx_affinity(struct rte_eth_dev *dev, uint16_t tx_queue_id,
229 			      uint8_t affinity);
230 int mlx5_ext_txq_verify(struct rte_eth_dev *dev);
231 struct mlx5_external_q *mlx5_ext_txq_get(struct rte_eth_dev *dev, uint16_t idx);
232 
233 /* mlx5_tx.c */
234 
235 void mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq,
236 			       unsigned int olx __rte_unused);
237 int mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset);
238 void mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
239 		       struct rte_eth_txq_info *qinfo);
240 int mlx5_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
241 			   struct rte_eth_burst_mode *mode);
242 
243 /* mlx5_tx_empw.c */
244 
245 MLX5_TXOFF_PRE_DECL(full_empw);
246 MLX5_TXOFF_PRE_DECL(none_empw);
247 MLX5_TXOFF_PRE_DECL(md_empw);
248 MLX5_TXOFF_PRE_DECL(mt_empw);
249 MLX5_TXOFF_PRE_DECL(mtsc_empw);
250 MLX5_TXOFF_PRE_DECL(mti_empw);
251 MLX5_TXOFF_PRE_DECL(mtv_empw);
252 MLX5_TXOFF_PRE_DECL(mtiv_empw);
253 MLX5_TXOFF_PRE_DECL(sc_empw);
254 MLX5_TXOFF_PRE_DECL(sci_empw);
255 MLX5_TXOFF_PRE_DECL(scv_empw);
256 MLX5_TXOFF_PRE_DECL(sciv_empw);
257 MLX5_TXOFF_PRE_DECL(i_empw);
258 MLX5_TXOFF_PRE_DECL(v_empw);
259 MLX5_TXOFF_PRE_DECL(iv_empw);
260 
261 /* mlx5_tx_nompw.c */
262 
263 MLX5_TXOFF_PRE_DECL(full);
264 MLX5_TXOFF_PRE_DECL(none);
265 MLX5_TXOFF_PRE_DECL(md);
266 MLX5_TXOFF_PRE_DECL(mt);
267 MLX5_TXOFF_PRE_DECL(mtsc);
268 MLX5_TXOFF_PRE_DECL(mti);
269 MLX5_TXOFF_PRE_DECL(mtv);
270 MLX5_TXOFF_PRE_DECL(mtiv);
271 MLX5_TXOFF_PRE_DECL(sc);
272 MLX5_TXOFF_PRE_DECL(sci);
273 MLX5_TXOFF_PRE_DECL(scv);
274 MLX5_TXOFF_PRE_DECL(sciv);
275 MLX5_TXOFF_PRE_DECL(i);
276 MLX5_TXOFF_PRE_DECL(v);
277 MLX5_TXOFF_PRE_DECL(iv);
278 
279 /* mlx5_tx_txpp.c */
280 
281 MLX5_TXOFF_PRE_DECL(full_ts_nompw);
282 MLX5_TXOFF_PRE_DECL(full_ts_nompwi);
283 MLX5_TXOFF_PRE_DECL(full_ts);
284 MLX5_TXOFF_PRE_DECL(full_ts_noi);
285 MLX5_TXOFF_PRE_DECL(none_ts);
286 MLX5_TXOFF_PRE_DECL(mdi_ts);
287 MLX5_TXOFF_PRE_DECL(mti_ts);
288 MLX5_TXOFF_PRE_DECL(mtiv_ts);
289 
290 /* mlx5_tx_mpw.c */
291 
292 MLX5_TXOFF_PRE_DECL(none_mpw);
293 MLX5_TXOFF_PRE_DECL(mci_mpw);
294 MLX5_TXOFF_PRE_DECL(mc_mpw);
295 MLX5_TXOFF_PRE_DECL(i_mpw);
296 
297 static __rte_always_inline struct mlx5_uar_data *
298 mlx5_tx_bfreg(struct mlx5_txq_data *txq)
299 {
300 	return &MLX5_PROC_PRIV(txq->port_id)->uar_table[txq->idx];
301 }
302 
303 /**
304  * Ring TX queue doorbell and flush the update by write memory barrier.
305  *
306  * @param txq
307  *   Pointer to TX queue structure.
308  * @param wqe
309  *   Pointer to the last WQE posted in the NIC.
310  */
311 static __rte_always_inline void
312 mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
313 {
314 	mlx5_doorbell_ring(mlx5_tx_bfreg(txq), *(volatile uint64_t *)wqe,
315 			   txq->wqe_ci, txq->qp_db, 1);
316 }
317 
318 /**
319  * Convert timestamp from mbuf format to linear counter
320  * of Clock Queue completions (24 bits).
321  *
322  * @param sh
323  *   Pointer to the device shared context to fetch Tx
324  *   packet pacing timestamp and parameters.
325  * @param ts
326  *   Timestamp from mbuf to convert.
327  * @return
328  *   positive or zero value - completion ID to wait.
329  *   negative value - conversion error.
330  */
331 static __rte_always_inline int32_t
332 mlx5_txpp_convert_tx_ts(struct mlx5_dev_ctx_shared *sh, uint64_t mts)
333 {
334 	uint64_t ts, ci;
335 	uint32_t tick;
336 
337 	do {
338 		/*
339 		 * Read atomically two uint64_t fields and compare lsb bits.
340 		 * It there is no match - the timestamp was updated in
341 		 * the service thread, data should be re-read.
342 		 */
343 		rte_compiler_barrier();
344 		ci = rte_atomic_load_explicit(&sh->txpp.ts.ci_ts, rte_memory_order_relaxed);
345 		ts = rte_atomic_load_explicit(&sh->txpp.ts.ts, rte_memory_order_relaxed);
346 		rte_compiler_barrier();
347 		if (!((ts ^ ci) << (64 - MLX5_CQ_INDEX_WIDTH)))
348 			break;
349 	} while (true);
350 	/* Perform the skew correction, positive value to send earlier. */
351 	mts -= sh->txpp.skew;
352 	mts -= ts;
353 	if (unlikely(mts >= UINT64_MAX / 2)) {
354 		/* We have negative integer, mts is in the past. */
355 		rte_atomic_fetch_add_explicit(&sh->txpp.err_ts_past,
356 				   1, rte_memory_order_relaxed);
357 		return -1;
358 	}
359 	tick = sh->txpp.tick;
360 	MLX5_ASSERT(tick);
361 	/* Convert delta to completions, round up. */
362 	mts = (mts + tick - 1) / tick;
363 	if (unlikely(mts >= (1 << MLX5_CQ_INDEX_WIDTH) / 2 - 1)) {
364 		/* We have mts is too distant future. */
365 		rte_atomic_fetch_add_explicit(&sh->txpp.err_ts_future,
366 				   1, rte_memory_order_relaxed);
367 		return -1;
368 	}
369 	mts <<= 64 - MLX5_CQ_INDEX_WIDTH;
370 	ci += mts;
371 	ci >>= 64 - MLX5_CQ_INDEX_WIDTH;
372 	return ci;
373 }
374 
375 /**
376  * Read real time clock counter directly from the device PCI BAR area.
377  * The PCI BAR must be mapped to the process memory space at initialization.
378  *
379  * @param dev
380  *   Device to read clock counter from
381  *
382  * @return
383  *   0 - if HCA BAR is not supported or not mapped.
384  *   !=0 - read 64-bit value of real-time in UTC formatv (nanoseconds)
385  */
386 static __rte_always_inline uint64_t mlx5_read_pcibar_clock(struct rte_eth_dev *dev)
387 {
388 	struct mlx5_proc_priv *ppriv = dev->process_private;
389 
390 	if (ppriv && ppriv->hca_bar) {
391 		struct mlx5_priv *priv = dev->data->dev_private;
392 		struct mlx5_dev_ctx_shared *sh = priv->sh;
393 		uint64_t *hca_ptr = (uint64_t *)(ppriv->hca_bar) +
394 				  __mlx5_64_off(initial_seg, real_time);
395 		uint64_t __rte_atomic *ts_addr;
396 		uint64_t ts;
397 
398 		ts_addr = (uint64_t __rte_atomic *)hca_ptr;
399 		ts = rte_atomic_load_explicit(ts_addr, rte_memory_order_seq_cst);
400 		ts = rte_be_to_cpu_64(ts);
401 		ts = mlx5_txpp_convert_rx_ts(sh, ts);
402 		return ts;
403 	}
404 	return 0;
405 }
406 
407 static __rte_always_inline uint64_t mlx5_read_pcibar_clock_from_txq(struct mlx5_txq_data *txq)
408 {
409 	struct mlx5_txq_ctrl *txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
410 	struct rte_eth_dev *dev = ETH_DEV(txq_ctrl->priv);
411 
412 	return mlx5_read_pcibar_clock(dev);
413 }
414 
415 /**
416  * Set Software Parser flags and offsets in Ethernet Segment of WQE.
417  * Flags must be preliminary initialized to zero.
418  *
419  * @param loc
420  *   Pointer to burst routine local context.
421  * @param swp_flags
422  *   Pointer to store Software Parser flags.
423  * @param olx
424  *   Configured Tx offloads mask. It is fully defined at
425  *   compile time and may be used for optimization.
426  *
427  * @return
428  *   Software Parser offsets packed in dword.
429  *   Software Parser flags are set by pointer.
430  */
431 static __rte_always_inline uint32_t
432 txq_mbuf_to_swp(struct mlx5_txq_local *__rte_restrict loc,
433 		uint8_t *swp_flags,
434 		unsigned int olx)
435 {
436 	uint64_t ol, tunnel;
437 	unsigned int idx, off;
438 	uint32_t set;
439 
440 	if (!MLX5_TXOFF_CONFIG(SWP))
441 		return 0;
442 	ol = loc->mbuf->ol_flags;
443 	tunnel = ol & RTE_MBUF_F_TX_TUNNEL_MASK;
444 	/*
445 	 * Check whether Software Parser is required.
446 	 * Only customized tunnels may ask for.
447 	 */
448 	if (likely(tunnel != RTE_MBUF_F_TX_TUNNEL_UDP && tunnel != RTE_MBUF_F_TX_TUNNEL_IP))
449 		return 0;
450 	/*
451 	 * The index should have:
452 	 * bit[0:1] = RTE_MBUF_F_TX_L4_MASK
453 	 * bit[4] = RTE_MBUF_F_TX_IPV6
454 	 * bit[8] = RTE_MBUF_F_TX_OUTER_IPV6
455 	 * bit[9] = RTE_MBUF_F_TX_OUTER_UDP
456 	 */
457 	idx = (ol & (RTE_MBUF_F_TX_L4_MASK | RTE_MBUF_F_TX_IPV6 | RTE_MBUF_F_TX_OUTER_IPV6)) >> 52;
458 	idx |= (tunnel == RTE_MBUF_F_TX_TUNNEL_UDP) ? (1 << 9) : 0;
459 	*swp_flags = mlx5_swp_types_table[idx];
460 	/*
461 	 * Set offsets for SW parser. Since ConnectX-5, SW parser just
462 	 * complements HW parser. SW parser starts to engage only if HW parser
463 	 * can't reach a header. For the older devices, HW parser will not kick
464 	 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
465 	 * should be set regardless of HW offload.
466 	 */
467 	off = loc->mbuf->outer_l2_len;
468 	if (MLX5_TXOFF_CONFIG(VLAN) && ol & RTE_MBUF_F_TX_VLAN)
469 		off += sizeof(struct rte_vlan_hdr);
470 	set = (off >> 1) << 8; /* Outer L3 offset. */
471 	off += loc->mbuf->outer_l3_len;
472 	if (tunnel == RTE_MBUF_F_TX_TUNNEL_UDP)
473 		set |= off >> 1; /* Outer L4 offset. */
474 	if (ol & (RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IPV6)) { /* Inner IP. */
475 		const uint64_t csum = ol & RTE_MBUF_F_TX_L4_MASK;
476 			off += loc->mbuf->l2_len;
477 		set |= (off >> 1) << 24; /* Inner L3 offset. */
478 		if (csum == RTE_MBUF_F_TX_TCP_CKSUM ||
479 		    csum == RTE_MBUF_F_TX_UDP_CKSUM ||
480 		    (MLX5_TXOFF_CONFIG(TSO) && ol & RTE_MBUF_F_TX_TCP_SEG)) {
481 			off += loc->mbuf->l3_len;
482 			set |= (off >> 1) << 16; /* Inner L4 offset. */
483 		}
484 	}
485 	set = rte_cpu_to_le_32(set);
486 	return set;
487 }
488 
489 /**
490  * Convert the Checksum offloads to Verbs.
491  *
492  * @param buf
493  *   Pointer to the mbuf.
494  *
495  * @return
496  *   Converted checksum flags.
497  */
498 static __rte_always_inline uint8_t
499 txq_ol_cksum_to_cs(struct rte_mbuf *buf)
500 {
501 	uint32_t idx;
502 	uint8_t is_tunnel = !!(buf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK);
503 	const uint64_t ol_flags_mask = RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_L4_MASK |
504 				       RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_OUTER_IP_CKSUM;
505 
506 	/*
507 	 * The index should have:
508 	 * bit[0] = RTE_MBUF_F_TX_TCP_SEG
509 	 * bit[2:3] = RTE_MBUF_F_TX_UDP_CKSUM, RTE_MBUF_F_TX_TCP_CKSUM
510 	 * bit[4] = RTE_MBUF_F_TX_IP_CKSUM
511 	 * bit[8] = RTE_MBUF_F_TX_OUTER_IP_CKSUM
512 	 * bit[9] = tunnel
513 	 */
514 	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
515 	return mlx5_cksum_table[idx];
516 }
517 
518 /**
519  * Free the mbufs from the linear array of pointers.
520  *
521  * @param txq
522  *   Pointer to Tx queue structure.
523  * @param pkts
524  *   Pointer to array of packets to be free.
525  * @param pkts_n
526  *   Number of packets to be freed.
527  * @param olx
528  *   Configured Tx offloads mask. It is fully defined at
529  *   compile time and may be used for optimization.
530  */
531 static __rte_always_inline void
532 mlx5_tx_free_mbuf(struct mlx5_txq_data *__rte_restrict txq,
533 		  struct rte_mbuf **__rte_restrict pkts,
534 		  unsigned int pkts_n,
535 		  unsigned int olx __rte_unused)
536 {
537 	struct rte_mempool *pool = NULL;
538 	struct rte_mbuf **p_free = NULL;
539 	struct rte_mbuf *mbuf;
540 	unsigned int n_free = 0;
541 
542 	/*
543 	 * The implemented algorithm eliminates
544 	 * copying pointers to temporary array
545 	 * for rte_mempool_put_bulk() calls.
546 	 */
547 	MLX5_ASSERT(pkts);
548 	MLX5_ASSERT(pkts_n);
549 	/*
550 	 * Free mbufs directly to the pool in bulk
551 	 * if fast free offload is engaged
552 	 */
553 	if (!MLX5_TXOFF_CONFIG(MULTI) && txq->fast_free) {
554 		mbuf = *pkts;
555 		pool = mbuf->pool;
556 		rte_mempool_put_bulk(pool, (void *)pkts, pkts_n);
557 		return;
558 	}
559 	for (;;) {
560 		for (;;) {
561 			/*
562 			 * Decrement mbuf reference counter, detach
563 			 * indirect and external buffers if needed.
564 			 */
565 			mbuf = rte_pktmbuf_prefree_seg(*pkts);
566 			if (likely(mbuf != NULL)) {
567 				MLX5_ASSERT(mbuf == *pkts);
568 				if (likely(n_free != 0)) {
569 					if (unlikely(pool != mbuf->pool))
570 						/* From different pool. */
571 						break;
572 				} else {
573 					/* Start new scan array. */
574 					pool = mbuf->pool;
575 					p_free = pkts;
576 				}
577 				++n_free;
578 				++pkts;
579 				--pkts_n;
580 				if (unlikely(pkts_n == 0)) {
581 					mbuf = NULL;
582 					break;
583 				}
584 			} else {
585 				/*
586 				 * This happens if mbuf is still referenced.
587 				 * We can't put it back to the pool, skip.
588 				 */
589 				++pkts;
590 				--pkts_n;
591 				if (unlikely(n_free != 0))
592 					/* There is some array to free.*/
593 					break;
594 				if (unlikely(pkts_n == 0))
595 					/* Last mbuf, nothing to free. */
596 					return;
597 			}
598 		}
599 		for (;;) {
600 			/*
601 			 * This loop is implemented to avoid multiple
602 			 * inlining of rte_mempool_put_bulk().
603 			 */
604 			MLX5_ASSERT(pool);
605 			MLX5_ASSERT(p_free);
606 			MLX5_ASSERT(n_free);
607 			/*
608 			 * Free the array of pre-freed mbufs
609 			 * belonging to the same memory pool.
610 			 */
611 			rte_mempool_put_bulk(pool, (void *)p_free, n_free);
612 			if (unlikely(mbuf != NULL)) {
613 				/* There is the request to start new scan. */
614 				pool = mbuf->pool;
615 				p_free = pkts++;
616 				n_free = 1;
617 				--pkts_n;
618 				if (likely(pkts_n != 0))
619 					break;
620 				/*
621 				 * This is the last mbuf to be freed.
622 				 * Do one more loop iteration to complete.
623 				 * This is rare case of the last unique mbuf.
624 				 */
625 				mbuf = NULL;
626 				continue;
627 			}
628 			if (likely(pkts_n == 0))
629 				return;
630 			n_free = 0;
631 			break;
632 		}
633 	}
634 }
635 
636 /**
637  * No inline version to free buffers for optimal call
638  * on the tx_burst completion.
639  */
640 static __rte_noinline void
641 __mlx5_tx_free_mbuf(struct mlx5_txq_data *__rte_restrict txq,
642 		    struct rte_mbuf **__rte_restrict pkts,
643 		    unsigned int pkts_n,
644 		    unsigned int olx __rte_unused)
645 {
646 	mlx5_tx_free_mbuf(txq, pkts, pkts_n, olx);
647 }
648 
649 /**
650  * Free the mbuf from the elts ring buffer till new tail.
651  *
652  * @param txq
653  *   Pointer to Tx queue structure.
654  * @param tail
655  *   Index in elts to free up to, becomes new elts tail.
656  * @param olx
657  *   Configured Tx offloads mask. It is fully defined at
658  *   compile time and may be used for optimization.
659  */
660 static __rte_always_inline void
661 mlx5_tx_free_elts(struct mlx5_txq_data *__rte_restrict txq,
662 		  uint16_t tail,
663 		  unsigned int olx __rte_unused)
664 {
665 	uint16_t n_elts = tail - txq->elts_tail;
666 
667 	MLX5_ASSERT(n_elts);
668 	MLX5_ASSERT(n_elts <= txq->elts_s);
669 	/*
670 	 * Implement a loop to support ring buffer wraparound
671 	 * with single inlining of mlx5_tx_free_mbuf().
672 	 */
673 	do {
674 		unsigned int part;
675 
676 		part = txq->elts_s - (txq->elts_tail & txq->elts_m);
677 		part = RTE_MIN(part, n_elts);
678 		MLX5_ASSERT(part);
679 		MLX5_ASSERT(part <= txq->elts_s);
680 		mlx5_tx_free_mbuf(txq,
681 				  &txq->elts[txq->elts_tail & txq->elts_m],
682 				  part, olx);
683 		txq->elts_tail += part;
684 		n_elts -= part;
685 	} while (n_elts);
686 }
687 
688 /**
689  * Store the mbuf being sent into elts ring buffer.
690  * On Tx completion these mbufs will be freed.
691  *
692  * @param txq
693  *   Pointer to Tx queue structure.
694  * @param pkts
695  *   Pointer to array of packets to be stored.
696  * @param pkts_n
697  *   Number of packets to be stored.
698  * @param olx
699  *   Configured Tx offloads mask. It is fully defined at
700  *   compile time and may be used for optimization.
701  */
702 static __rte_always_inline void
703 mlx5_tx_copy_elts(struct mlx5_txq_data *__rte_restrict txq,
704 		  struct rte_mbuf **__rte_restrict pkts,
705 		  unsigned int pkts_n,
706 		  unsigned int olx __rte_unused)
707 {
708 	unsigned int part;
709 	struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts;
710 
711 	MLX5_ASSERT(pkts);
712 	MLX5_ASSERT(pkts_n);
713 	part = txq->elts_s - (txq->elts_head & txq->elts_m);
714 	MLX5_ASSERT(part);
715 	MLX5_ASSERT(part <= txq->elts_s);
716 	/* This code is a good candidate for vectorizing with SIMD. */
717 	rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)),
718 		   (void *)pkts,
719 		   RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *));
720 	txq->elts_head += pkts_n;
721 	if (unlikely(part < pkts_n))
722 		/* The copy is wrapping around the elts array. */
723 		rte_memcpy((void *)elts, (void *)(pkts + part),
724 			   (pkts_n - part) * sizeof(struct rte_mbuf *));
725 }
726 
727 /**
728  * Check if the completion request flag should be set in the last WQE.
729  * Both pushed mbufs and WQEs are monitored and the completion request
730  * flag is set if any of thresholds is reached.
731  *
732  * @param txq
733  *   Pointer to TX queue structure.
734  * @param loc
735  *   Pointer to burst routine local context.
736  * @param olx
737  *   Configured Tx offloads mask. It is fully defined at
738  *   compile time and may be used for optimization.
739  */
740 static __rte_always_inline void
741 mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
742 			   struct mlx5_txq_local *__rte_restrict loc,
743 			   unsigned int olx)
744 {
745 	uint16_t head = txq->elts_head;
746 	unsigned int part;
747 
748 	part = MLX5_TXOFF_CONFIG(INLINE) ?
749 	       0 : loc->pkts_sent - loc->pkts_copy;
750 	head += part;
751 	if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH ||
752 	     (MLX5_TXOFF_CONFIG(INLINE) &&
753 	     (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) {
754 		volatile struct mlx5_wqe *last = loc->wqe_last;
755 
756 		MLX5_ASSERT(last);
757 		txq->elts_comp = head;
758 		if (MLX5_TXOFF_CONFIG(INLINE))
759 			txq->wqe_comp = txq->wqe_ci;
760 		/* Request unconditional completion on last WQE. */
761 		last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
762 					    MLX5_COMP_MODE_OFFSET);
763 		/* Save elts_head in dedicated free on completion queue. */
764 #ifdef RTE_LIBRTE_MLX5_DEBUG
765 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
766 			  (last->cseg.opcode >> 8) << 16;
767 #else
768 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head;
769 #endif
770 		/* A CQE slot must always be available. */
771 		MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s);
772 	}
773 }
774 
775 /**
776  * Set completion request flag for all issued WQEs.
777  * This routine is intended to be used with enabled fast path tracing
778  * and send scheduling on time to provide the detailed report in trace
779  * for send completions on every WQE.
780  *
781  * @param txq
782  *   Pointer to TX queue structure.
783  * @param loc
784  *   Pointer to burst routine local context.
785  * @param olx
786  *   Configured Tx offloads mask. It is fully defined at
787  *   compile time and may be used for optimization.
788  */
789 static __rte_always_inline void
790 mlx5_tx_request_completion_trace(struct mlx5_txq_data *__rte_restrict txq,
791 				 struct mlx5_txq_local *__rte_restrict loc,
792 				 unsigned int olx)
793 {
794 	uint16_t head = txq->elts_comp;
795 
796 	while (txq->wqe_comp != txq->wqe_ci) {
797 		volatile struct mlx5_wqe *wqe;
798 		uint32_t wqe_n;
799 
800 		MLX5_ASSERT(loc->wqe_last);
801 		wqe = txq->wqes + (txq->wqe_comp & txq->wqe_m);
802 		if (wqe == loc->wqe_last) {
803 			head = txq->elts_head;
804 			head +=	MLX5_TXOFF_CONFIG(INLINE) ?
805 				0 : loc->pkts_sent - loc->pkts_copy;
806 			txq->elts_comp = head;
807 		}
808 		/* Completion request flag was set on cseg constructing. */
809 #ifdef RTE_LIBRTE_MLX5_DEBUG
810 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
811 			  (wqe->cseg.opcode >> 8) << 16;
812 #else
813 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head;
814 #endif
815 		/* A CQE slot must always be available. */
816 		MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s);
817 		/* Advance to the next WQE in the queue. */
818 		wqe_n = rte_be_to_cpu_32(wqe->cseg.sq_ds) & 0x3F;
819 		txq->wqe_comp += RTE_ALIGN(wqe_n, 4) / 4;
820 	}
821 }
822 
823 /**
824  * Build the Control Segment with specified opcode:
825  * - MLX5_OPCODE_SEND
826  * - MLX5_OPCODE_ENHANCED_MPSW
827  * - MLX5_OPCODE_TSO
828  *
829  * @param txq
830  *   Pointer to TX queue structure.
831  * @param loc
832  *   Pointer to burst routine local context.
833  * @param wqe
834  *   Pointer to WQE to fill with built Control Segment.
835  * @param ds
836  *   Supposed length of WQE in segments.
837  * @param opcode
838  *   SQ WQE opcode to put into Control Segment.
839  * @param olx
840  *   Configured Tx offloads mask. It is fully defined at
841  *   compile time and may be used for optimization.
842  */
843 static __rte_always_inline void
844 mlx5_tx_cseg_init(struct mlx5_txq_data *__rte_restrict txq,
845 		  struct mlx5_txq_local *__rte_restrict loc __rte_unused,
846 		  struct mlx5_wqe *__rte_restrict wqe,
847 		  unsigned int ds,
848 		  unsigned int opcode,
849 		  unsigned int olx)
850 {
851 	struct mlx5_wqe_cseg *__rte_restrict cs = &wqe->cseg;
852 	uint64_t real_time;
853 
854 	/* For legacy MPW replace the EMPW by TSO with modifier. */
855 	if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW)
856 		opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24;
857 	cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode);
858 	cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
859 	if (MLX5_TXOFF_CONFIG(TXPP) && __rte_trace_point_fp_is_enabled())
860 		cs->flags = RTE_BE32(MLX5_COMP_ALWAYS <<
861 				     MLX5_COMP_MODE_OFFSET);
862 	else
863 		cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR <<
864 				     MLX5_COMP_MODE_OFFSET);
865 	cs->misc = RTE_BE32(0);
866 	if (__rte_trace_point_fp_is_enabled()) {
867 		real_time = mlx5_read_pcibar_clock_from_txq(txq);
868 		if (!loc->pkts_sent)
869 			rte_pmd_mlx5_trace_tx_entry(real_time, txq->port_id, txq->idx);
870 		rte_pmd_mlx5_trace_tx_wqe(real_time, (txq->wqe_ci << 8) | opcode);
871 	}
872 }
873 
874 /**
875  * Build the Synchronize Queue Segment with specified completion index.
876  *
877  * @param txq
878  *   Pointer to TX queue structure.
879  * @param loc
880  *   Pointer to burst routine local context.
881  * @param wqe
882  *   Pointer to WQE to fill with built Control Segment.
883  * @param wci
884  *   Completion index in Clock Queue to wait.
885  * @param olx
886  *   Configured Tx offloads mask. It is fully defined at
887  *   compile time and may be used for optimization.
888  */
889 static __rte_always_inline void
890 mlx5_tx_qseg_init(struct mlx5_txq_data *restrict txq,
891 		  struct mlx5_txq_local *restrict loc __rte_unused,
892 		  struct mlx5_wqe *restrict wqe,
893 		  unsigned int wci,
894 		  unsigned int olx __rte_unused)
895 {
896 	struct mlx5_wqe_qseg *qs;
897 
898 	qs = RTE_PTR_ADD(wqe, MLX5_WSEG_SIZE);
899 	qs->max_index = rte_cpu_to_be_32(wci);
900 	qs->qpn_cqn = rte_cpu_to_be_32(txq->sh->txpp.clock_queue.cq_obj.cq->id);
901 	qs->reserved0 = RTE_BE32(0);
902 	qs->reserved1 = RTE_BE32(0);
903 }
904 
905 /**
906  * Build the Wait on Time Segment with specified timestamp value.
907  *
908  * @param txq
909  *   Pointer to TX queue structure.
910  * @param loc
911  *   Pointer to burst routine local context.
912  * @param wqe
913  *   Pointer to WQE to fill with built Control Segment.
914  * @param ts
915  *   Timesatmp value to wait.
916  * @param olx
917  *   Configured Tx offloads mask. It is fully defined at
918  *   compile time and may be used for optimization.
919  */
920 static __rte_always_inline void
921 mlx5_tx_wseg_init(struct mlx5_txq_data *restrict txq,
922 		  struct mlx5_txq_local *restrict loc __rte_unused,
923 		  struct mlx5_wqe *restrict wqe,
924 		  uint64_t ts,
925 		  unsigned int olx __rte_unused)
926 {
927 	struct mlx5_wqe_wseg *ws;
928 
929 	ws = RTE_PTR_ADD(wqe, MLX5_WSEG_SIZE);
930 	ws->operation = rte_cpu_to_be_32(MLX5_WAIT_COND_CYCLIC_SMALLER);
931 	ws->lkey = RTE_BE32(0);
932 	ws->va_high = RTE_BE32(0);
933 	ws->va_low = RTE_BE32(0);
934 	if (txq->rt_timestamp) {
935 		ts = ts % (uint64_t)NS_PER_S
936 		   | (ts / (uint64_t)NS_PER_S) << 32;
937 	}
938 	ws->value = rte_cpu_to_be_64(ts);
939 	ws->mask = txq->rt_timemask;
940 }
941 
942 /**
943  * Build the Ethernet Segment without inlined data.
944  * Supports Software Parser, Checksums and VLAN insertion Tx offload features.
945  *
946  * @param txq
947  *   Pointer to TX queue structure.
948  * @param loc
949  *   Pointer to burst routine local context.
950  * @param wqe
951  *   Pointer to WQE to fill with built Ethernet Segment.
952  * @param olx
953  *   Configured Tx offloads mask. It is fully defined at
954  *   compile time and may be used for optimization.
955  */
956 static __rte_always_inline void
957 mlx5_tx_eseg_none(struct mlx5_txq_data *__rte_restrict txq __rte_unused,
958 		  struct mlx5_txq_local *__rte_restrict loc,
959 		  struct mlx5_wqe *__rte_restrict wqe,
960 		  unsigned int olx)
961 {
962 	struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
963 	uint32_t csum;
964 
965 	/*
966 	 * Calculate and set check sum flags first, dword field
967 	 * in segment may be shared with Software Parser flags.
968 	 */
969 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
970 	es->flags = rte_cpu_to_le_32(csum);
971 	/*
972 	 * Calculate and set Software Parser offsets and flags.
973 	 * These flags a set for custom UDP and IP tunnel packets.
974 	 */
975 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
976 	/* Fill metadata field if needed. */
977 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
978 		       loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
979 		       rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
980 		       0 : 0;
981 	/* Engage VLAN tag insertion feature if requested. */
982 	if (MLX5_TXOFF_CONFIG(VLAN) &&
983 	    loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
984 		/*
985 		 * We should get here only if device support
986 		 * this feature correctly.
987 		 */
988 		MLX5_ASSERT(txq->vlan_en);
989 		es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
990 						  loc->mbuf->vlan_tci);
991 	} else {
992 		es->inline_hdr = RTE_BE32(0);
993 	}
994 }
995 
996 /**
997  * Build the Ethernet Segment with minimal inlined data
998  * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is
999  * used to fill the gap in single WQEBB WQEs.
1000  * Supports Software Parser, Checksums and VLAN
1001  * insertion Tx offload features.
1002  *
1003  * @param txq
1004  *   Pointer to TX queue structure.
1005  * @param loc
1006  *   Pointer to burst routine local context.
1007  * @param wqe
1008  *   Pointer to WQE to fill with built Ethernet Segment.
1009  * @param vlan
1010  *   Length of VLAN tag insertion if any.
1011  * @param olx
1012  *   Configured Tx offloads mask. It is fully defined at
1013  *   compile time and may be used for optimization.
1014  */
1015 static __rte_always_inline void
1016 mlx5_tx_eseg_dmin(struct mlx5_txq_data *__rte_restrict txq __rte_unused,
1017 		  struct mlx5_txq_local *__rte_restrict loc,
1018 		  struct mlx5_wqe *__rte_restrict wqe,
1019 		  unsigned int vlan,
1020 		  unsigned int olx)
1021 {
1022 	struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
1023 	uint32_t csum;
1024 	uint8_t *psrc, *pdst;
1025 
1026 	/*
1027 	 * Calculate and set check sum flags first, dword field
1028 	 * in segment may be shared with Software Parser flags.
1029 	 */
1030 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
1031 	es->flags = rte_cpu_to_le_32(csum);
1032 	/*
1033 	 * Calculate and set Software Parser offsets and flags.
1034 	 * These flags a set for custom UDP and IP tunnel packets.
1035 	 */
1036 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
1037 	/* Fill metadata field if needed. */
1038 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
1039 		       loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
1040 		       rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
1041 		       0 : 0;
1042 	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
1043 	es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
1044 	es->inline_data = *(unaligned_uint16_t *)psrc;
1045 	psrc +=	sizeof(uint16_t);
1046 	pdst = (uint8_t *)(es + 1);
1047 	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
1048 		/* Implement VLAN tag insertion as part inline data. */
1049 		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
1050 		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
1051 		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
1052 		/* Insert VLAN ethertype + VLAN tag. */
1053 		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
1054 						((RTE_ETHER_TYPE_VLAN << 16) |
1055 						 loc->mbuf->vlan_tci);
1056 		pdst += sizeof(struct rte_vlan_hdr);
1057 		/* Copy the rest two bytes from packet data. */
1058 		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
1059 		*(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
1060 	} else {
1061 		/* Fill the gap in the title WQEBB with inline data. */
1062 		rte_mov16(pdst, psrc);
1063 	}
1064 }
1065 
1066 /**
1067  * Build the Ethernet Segment with entire packet data inlining. Checks the
1068  * boundary of WQEBB and ring buffer wrapping, supports Software Parser,
1069  * Checksums and VLAN insertion Tx offload features.
1070  *
1071  * @param txq
1072  *   Pointer to TX queue structure.
1073  * @param loc
1074  *   Pointer to burst routine local context.
1075  * @param wqe
1076  *   Pointer to WQE to fill with built Ethernet Segment.
1077  * @param vlan
1078  *   Length of VLAN tag insertion if any.
1079  * @param inlen
1080  *   Length of data to inline (VLAN included, if any).
1081  * @param tso
1082  *   TSO flag, set mss field from the packet.
1083  * @param olx
1084  *   Configured Tx offloads mask. It is fully defined at
1085  *   compile time and may be used for optimization.
1086  *
1087  * @return
1088  *   Pointer to the next Data Segment (aligned and wrapped around).
1089  */
1090 static __rte_always_inline struct mlx5_wqe_dseg *
1091 mlx5_tx_eseg_data(struct mlx5_txq_data *__rte_restrict txq,
1092 		  struct mlx5_txq_local *__rte_restrict loc,
1093 		  struct mlx5_wqe *__rte_restrict wqe,
1094 		  unsigned int vlan,
1095 		  unsigned int inlen,
1096 		  unsigned int tso,
1097 		  unsigned int olx)
1098 {
1099 	struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
1100 	uint32_t csum;
1101 	uint8_t *psrc, *pdst;
1102 	unsigned int part;
1103 
1104 	/*
1105 	 * Calculate and set check sum flags first, dword field
1106 	 * in segment may be shared with Software Parser flags.
1107 	 */
1108 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
1109 	if (tso) {
1110 		csum <<= 24;
1111 		csum |= loc->mbuf->tso_segsz;
1112 		es->flags = rte_cpu_to_be_32(csum);
1113 	} else {
1114 		es->flags = rte_cpu_to_le_32(csum);
1115 	}
1116 	/*
1117 	 * Calculate and set Software Parser offsets and flags.
1118 	 * These flags a set for custom UDP and IP tunnel packets.
1119 	 */
1120 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
1121 	/* Fill metadata field if needed. */
1122 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
1123 		       loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
1124 		       rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
1125 		       0 : 0;
1126 	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
1127 	es->inline_hdr_sz = rte_cpu_to_be_16(inlen);
1128 	es->inline_data = *(unaligned_uint16_t *)psrc;
1129 	psrc +=	sizeof(uint16_t);
1130 	pdst = (uint8_t *)(es + 1);
1131 	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
1132 		/* Implement VLAN tag insertion as part inline data. */
1133 		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
1134 		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
1135 		psrc +=	2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
1136 		/* Insert VLAN ethertype + VLAN tag. */
1137 		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
1138 						((RTE_ETHER_TYPE_VLAN << 16) |
1139 						 loc->mbuf->vlan_tci);
1140 		pdst += sizeof(struct rte_vlan_hdr);
1141 		/* Copy the rest two bytes from packet data. */
1142 		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
1143 		*(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
1144 		psrc += sizeof(uint16_t);
1145 	} else {
1146 		/* Fill the gap in the title WQEBB with inline data. */
1147 		rte_mov16(pdst, psrc);
1148 		psrc += sizeof(rte_v128u32_t);
1149 	}
1150 	pdst = (uint8_t *)(es + 2);
1151 	MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
1152 	MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
1153 	inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
1154 	if (!inlen) {
1155 		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
1156 		return (struct mlx5_wqe_dseg *)pdst;
1157 	}
1158 	/*
1159 	 * The WQEBB space availability is checked by caller.
1160 	 * Here we should be aware of WQE ring buffer wraparound only.
1161 	 */
1162 	part = (uint8_t *)txq->wqes_end - pdst;
1163 	part = RTE_MIN(part, inlen);
1164 	do {
1165 		rte_memcpy(pdst, psrc, part);
1166 		inlen -= part;
1167 		if (likely(!inlen)) {
1168 			/*
1169 			 * If return value is not used by the caller
1170 			 * the code below will be optimized out.
1171 			 */
1172 			pdst += part;
1173 			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1174 			if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
1175 				pdst = (uint8_t *)txq->wqes;
1176 			return (struct mlx5_wqe_dseg *)pdst;
1177 		}
1178 		pdst = (uint8_t *)txq->wqes;
1179 		psrc += part;
1180 		part = inlen;
1181 	} while (true);
1182 }
1183 
1184 /**
1185  * Copy data from chain of mbuf to the specified linear buffer.
1186  * Checksums and VLAN insertion Tx offload features. If data
1187  * from some mbuf copied completely this mbuf is freed. Local
1188  * structure is used to keep the byte stream state.
1189  *
1190  * @param pdst
1191  *   Pointer to the destination linear buffer.
1192  * @param loc
1193  *   Pointer to burst routine local context.
1194  * @param len
1195  *   Length of data to be copied.
1196  * @param must
1197  *   Length of data to be copied ignoring no inline hint.
1198  * @param olx
1199  *   Configured Tx offloads mask. It is fully defined at
1200  *   compile time and may be used for optimization.
1201  *
1202  * @return
1203  *   Number of actual copied data bytes. This is always greater than or
1204  *   equal to must parameter and might be lesser than len in no inline
1205  *   hint flag is encountered.
1206  */
1207 static __rte_always_inline unsigned int
1208 mlx5_tx_mseg_memcpy(uint8_t *pdst,
1209 		    struct mlx5_txq_local *__rte_restrict loc,
1210 		    unsigned int len,
1211 		    unsigned int must,
1212 		    unsigned int olx __rte_unused)
1213 {
1214 	struct rte_mbuf *mbuf;
1215 	unsigned int part, dlen, copy = 0;
1216 	uint8_t *psrc;
1217 
1218 	MLX5_ASSERT(len);
1219 	do {
1220 		/* Allow zero length packets, must check first. */
1221 		dlen = rte_pktmbuf_data_len(loc->mbuf);
1222 		if (dlen <= loc->mbuf_off) {
1223 			/* Exhausted packet, just free. */
1224 			mbuf = loc->mbuf;
1225 			loc->mbuf = mbuf->next;
1226 			rte_pktmbuf_free_seg(mbuf);
1227 			loc->mbuf_off = 0;
1228 			MLX5_ASSERT(loc->mbuf_nseg > 1);
1229 			MLX5_ASSERT(loc->mbuf);
1230 			--loc->mbuf_nseg;
1231 			if (loc->mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE) {
1232 				unsigned int diff;
1233 
1234 				if (copy >= must) {
1235 					/*
1236 					 * We already copied the minimal
1237 					 * requested amount of data.
1238 					 */
1239 					return copy;
1240 				}
1241 				diff = must - copy;
1242 				if (diff <= rte_pktmbuf_data_len(loc->mbuf)) {
1243 					/*
1244 					 * Copy only the minimal required
1245 					 * part of the data buffer. Limit amount
1246 					 * of data to be copied to the length of
1247 					 * available space.
1248 					 */
1249 					len = RTE_MIN(len, diff);
1250 				}
1251 			}
1252 			continue;
1253 		}
1254 		dlen -= loc->mbuf_off;
1255 		psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
1256 					       loc->mbuf_off);
1257 		part = RTE_MIN(len, dlen);
1258 		rte_memcpy(pdst, psrc, part);
1259 		copy += part;
1260 		loc->mbuf_off += part;
1261 		len -= part;
1262 		if (!len) {
1263 			if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) {
1264 				loc->mbuf_off = 0;
1265 				/* Exhausted packet, just free. */
1266 				mbuf = loc->mbuf;
1267 				loc->mbuf = mbuf->next;
1268 				rte_pktmbuf_free_seg(mbuf);
1269 				loc->mbuf_off = 0;
1270 				MLX5_ASSERT(loc->mbuf_nseg >= 1);
1271 				--loc->mbuf_nseg;
1272 			}
1273 			return copy;
1274 		}
1275 		pdst += part;
1276 	} while (true);
1277 }
1278 
1279 /**
1280  * Build the Ethernet Segment with inlined data from multi-segment packet.
1281  * Checks the boundary of WQEBB and ring buffer wrapping, supports Software
1282  * Parser, Checksums and VLAN insertion Tx offload features.
1283  *
1284  * @param txq
1285  *   Pointer to TX queue structure.
1286  * @param loc
1287  *   Pointer to burst routine local context.
1288  * @param wqe
1289  *   Pointer to WQE to fill with built Ethernet Segment.
1290  * @param vlan
1291  *   Length of VLAN tag insertion if any.
1292  * @param inlen
1293  *   Length of data to inline (VLAN included, if any).
1294  * @param tso
1295  *   TSO flag, set mss field from the packet.
1296  * @param olx
1297  *   Configured Tx offloads mask. It is fully defined at
1298  *   compile time and may be used for optimization.
1299  *
1300  * @return
1301  *   Pointer to the next Data Segment (aligned and possible NOT wrapped
1302  *   around - caller should do wrapping check on its own).
1303  */
1304 static __rte_always_inline struct mlx5_wqe_dseg *
1305 mlx5_tx_eseg_mdat(struct mlx5_txq_data *__rte_restrict txq,
1306 		  struct mlx5_txq_local *__rte_restrict loc,
1307 		  struct mlx5_wqe *__rte_restrict wqe,
1308 		  unsigned int vlan,
1309 		  unsigned int inlen,
1310 		  unsigned int tso,
1311 		  unsigned int olx)
1312 {
1313 	struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
1314 	uint32_t csum;
1315 	uint8_t *pdst;
1316 	unsigned int part, tlen = 0;
1317 
1318 	/*
1319 	 * Calculate and set check sum flags first, uint32_t field
1320 	 * in segment may be shared with Software Parser flags.
1321 	 */
1322 	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
1323 	if (tso) {
1324 		csum <<= 24;
1325 		csum |= loc->mbuf->tso_segsz;
1326 		es->flags = rte_cpu_to_be_32(csum);
1327 	} else {
1328 		es->flags = rte_cpu_to_le_32(csum);
1329 	}
1330 	/*
1331 	 * Calculate and set Software Parser offsets and flags.
1332 	 * These flags a set for custom UDP and IP tunnel packets.
1333 	 */
1334 	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
1335 	/* Fill metadata field if needed. */
1336 	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
1337 		       loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
1338 		       rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
1339 		       0 : 0;
1340 	MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
1341 	pdst = (uint8_t *)&es->inline_data;
1342 	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
1343 		/* Implement VLAN tag insertion as part inline data. */
1344 		mlx5_tx_mseg_memcpy(pdst, loc,
1345 				    2 * RTE_ETHER_ADDR_LEN,
1346 				    2 * RTE_ETHER_ADDR_LEN, olx);
1347 		pdst += 2 * RTE_ETHER_ADDR_LEN;
1348 		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
1349 						((RTE_ETHER_TYPE_VLAN << 16) |
1350 						 loc->mbuf->vlan_tci);
1351 		pdst += sizeof(struct rte_vlan_hdr);
1352 		tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr);
1353 	}
1354 	MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
1355 	/*
1356 	 * The WQEBB space availability is checked by caller.
1357 	 * Here we should be aware of WQE ring buffer wraparound only.
1358 	 */
1359 	part = (uint8_t *)txq->wqes_end - pdst;
1360 	part = RTE_MIN(part, inlen - tlen);
1361 	MLX5_ASSERT(part);
1362 	do {
1363 		unsigned int copy;
1364 
1365 		/*
1366 		 * Copying may be interrupted inside the routine
1367 		 * if run into no inline hint flag.
1368 		 */
1369 		copy = tso ? inlen : txq->inlen_mode;
1370 		copy = tlen >= copy ? 0 : (copy - tlen);
1371 		copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx);
1372 		tlen += copy;
1373 		if (likely(inlen <= tlen) || copy < part) {
1374 			es->inline_hdr_sz = rte_cpu_to_be_16(tlen);
1375 			pdst += copy;
1376 			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1377 			return (struct mlx5_wqe_dseg *)pdst;
1378 		}
1379 		pdst = (uint8_t *)txq->wqes;
1380 		part = inlen - tlen;
1381 	} while (true);
1382 }
1383 
1384 /**
1385  * Build the Data Segment of pointer type.
1386  *
1387  * @param txq
1388  *   Pointer to TX queue structure.
1389  * @param loc
1390  *   Pointer to burst routine local context.
1391  * @param dseg
1392  *   Pointer to WQE to fill with built Data Segment.
1393  * @param buf
1394  *   Data buffer to point.
1395  * @param len
1396  *   Data buffer length.
1397  * @param olx
1398  *   Configured Tx offloads mask. It is fully defined at
1399  *   compile time and may be used for optimization.
1400  */
1401 static __rte_always_inline void
1402 mlx5_tx_dseg_ptr(struct mlx5_txq_data *__rte_restrict txq,
1403 		 struct mlx5_txq_local *__rte_restrict loc,
1404 		 struct mlx5_wqe_dseg *__rte_restrict dseg,
1405 		 uint8_t *buf,
1406 		 unsigned int len,
1407 		 unsigned int olx __rte_unused)
1408 
1409 {
1410 	MLX5_ASSERT(len);
1411 	dseg->bcount = rte_cpu_to_be_32(len);
1412 	dseg->lkey = mlx5_mr_mb2mr(&txq->mr_ctrl, loc->mbuf);
1413 	dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
1414 }
1415 
1416 /**
1417  * Build the Data Segment of pointer type or inline if data length is less than
1418  * buffer in minimal Data Segment size.
1419  *
1420  * @param txq
1421  *   Pointer to TX queue structure.
1422  * @param loc
1423  *   Pointer to burst routine local context.
1424  * @param dseg
1425  *   Pointer to WQE to fill with built Data Segment.
1426  * @param buf
1427  *   Data buffer to point.
1428  * @param len
1429  *   Data buffer length.
1430  * @param olx
1431  *   Configured Tx offloads mask. It is fully defined at
1432  *   compile time and may be used for optimization.
1433  */
1434 static __rte_always_inline void
1435 mlx5_tx_dseg_iptr(struct mlx5_txq_data *__rte_restrict txq,
1436 		  struct mlx5_txq_local *__rte_restrict loc,
1437 		  struct mlx5_wqe_dseg *__rte_restrict dseg,
1438 		  uint8_t *buf,
1439 		  unsigned int len,
1440 		  unsigned int olx __rte_unused)
1441 
1442 {
1443 	uintptr_t dst, src;
1444 
1445 	MLX5_ASSERT(len);
1446 	if (len > MLX5_DSEG_MIN_INLINE_SIZE) {
1447 		dseg->bcount = rte_cpu_to_be_32(len);
1448 		dseg->lkey = mlx5_mr_mb2mr(&txq->mr_ctrl, loc->mbuf);
1449 		dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
1450 
1451 		return;
1452 	}
1453 	dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
1454 	/* Unrolled implementation of generic rte_memcpy. */
1455 	dst = (uintptr_t)&dseg->inline_data[0];
1456 	src = (uintptr_t)buf;
1457 	if (len & 0x08) {
1458 #ifdef RTE_ARCH_STRICT_ALIGN
1459 		MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t)));
1460 		*(uint32_t *)dst = *(unaligned_uint32_t *)src;
1461 		dst += sizeof(uint32_t);
1462 		src += sizeof(uint32_t);
1463 		*(uint32_t *)dst = *(unaligned_uint32_t *)src;
1464 		dst += sizeof(uint32_t);
1465 		src += sizeof(uint32_t);
1466 #else
1467 		*(uint64_t *)dst = *(unaligned_uint64_t *)src;
1468 		dst += sizeof(uint64_t);
1469 		src += sizeof(uint64_t);
1470 #endif
1471 	}
1472 	if (len & 0x04) {
1473 		*(uint32_t *)dst = *(unaligned_uint32_t *)src;
1474 		dst += sizeof(uint32_t);
1475 		src += sizeof(uint32_t);
1476 	}
1477 	if (len & 0x02) {
1478 		*(uint16_t *)dst = *(unaligned_uint16_t *)src;
1479 		dst += sizeof(uint16_t);
1480 		src += sizeof(uint16_t);
1481 	}
1482 	if (len & 0x01)
1483 		*(uint8_t *)dst = *(uint8_t *)src;
1484 }
1485 
1486 /**
1487  * Build the Data Segment of inlined data from single
1488  * segment packet, no VLAN insertion.
1489  *
1490  * @param txq
1491  *   Pointer to TX queue structure.
1492  * @param loc
1493  *   Pointer to burst routine local context.
1494  * @param dseg
1495  *   Pointer to WQE to fill with built Data Segment.
1496  * @param buf
1497  *   Data buffer to point.
1498  * @param len
1499  *   Data buffer length.
1500  * @param olx
1501  *   Configured Tx offloads mask. It is fully defined at
1502  *   compile time and may be used for optimization.
1503  *
1504  * @return
1505  *   Pointer to the next Data Segment after inlined data.
1506  *   Ring buffer wraparound check is needed. We do not do it here because it
1507  *   may not be needed for the last packet in the eMPW session.
1508  */
1509 static __rte_always_inline struct mlx5_wqe_dseg *
1510 mlx5_tx_dseg_empw(struct mlx5_txq_data *__rte_restrict txq,
1511 		  struct mlx5_txq_local *__rte_restrict loc __rte_unused,
1512 		  struct mlx5_wqe_dseg *__rte_restrict dseg,
1513 		  uint8_t *buf,
1514 		  unsigned int len,
1515 		  unsigned int olx __rte_unused)
1516 {
1517 	unsigned int part;
1518 	uint8_t *pdst;
1519 
1520 	if (!MLX5_TXOFF_CONFIG(MPW)) {
1521 		/* Store the descriptor byte counter for eMPW sessions. */
1522 		dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
1523 		pdst = &dseg->inline_data[0];
1524 	} else {
1525 		/* The entire legacy MPW session counter is stored on close. */
1526 		pdst = (uint8_t *)dseg;
1527 	}
1528 	/*
1529 	 * The WQEBB space availability is checked by caller.
1530 	 * Here we should be aware of WQE ring buffer wraparound only.
1531 	 */
1532 	part = (uint8_t *)txq->wqes_end - pdst;
1533 	part = RTE_MIN(part, len);
1534 	do {
1535 		rte_memcpy(pdst, buf, part);
1536 		len -= part;
1537 		if (likely(!len)) {
1538 			pdst += part;
1539 			if (!MLX5_TXOFF_CONFIG(MPW))
1540 				pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1541 			/* Note: no final wraparound check here. */
1542 			return (struct mlx5_wqe_dseg *)pdst;
1543 		}
1544 		pdst = (uint8_t *)txq->wqes;
1545 		buf += part;
1546 		part = len;
1547 	} while (true);
1548 }
1549 
1550 /**
1551  * Build the Data Segment of inlined data from single
1552  * segment packet with VLAN insertion.
1553  *
1554  * @param txq
1555  *   Pointer to TX queue structure.
1556  * @param loc
1557  *   Pointer to burst routine local context.
1558  * @param dseg
1559  *   Pointer to the dseg fill with built Data Segment.
1560  * @param buf
1561  *   Data buffer to point.
1562  * @param len
1563  *   Data buffer length.
1564  * @param olx
1565  *   Configured Tx offloads mask. It is fully defined at
1566  *   compile time and may be used for optimization.
1567  *
1568  * @return
1569  *   Pointer to the next Data Segment after inlined data.
1570  *   Ring buffer wraparound check is needed.
1571  */
1572 static __rte_always_inline struct mlx5_wqe_dseg *
1573 mlx5_tx_dseg_vlan(struct mlx5_txq_data *__rte_restrict txq,
1574 		  struct mlx5_txq_local *__rte_restrict loc __rte_unused,
1575 		  struct mlx5_wqe_dseg *__rte_restrict dseg,
1576 		  uint8_t *buf,
1577 		  unsigned int len,
1578 		  unsigned int olx __rte_unused)
1579 
1580 {
1581 	unsigned int part;
1582 	uint8_t *pdst;
1583 
1584 	MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE);
1585 	if (!MLX5_TXOFF_CONFIG(MPW)) {
1586 		/* Store the descriptor byte counter for eMPW sessions. */
1587 		dseg->bcount = rte_cpu_to_be_32
1588 				((len + sizeof(struct rte_vlan_hdr)) |
1589 				 MLX5_ETH_WQE_DATA_INLINE);
1590 		pdst = &dseg->inline_data[0];
1591 	} else {
1592 		/* The entire legacy MPW session counter is stored on close. */
1593 		pdst = (uint8_t *)dseg;
1594 	}
1595 	memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE);
1596 	buf += MLX5_DSEG_MIN_INLINE_SIZE;
1597 	pdst += MLX5_DSEG_MIN_INLINE_SIZE;
1598 	len -= MLX5_DSEG_MIN_INLINE_SIZE;
1599 	/* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */
1600 	MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
1601 	if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
1602 		pdst = (uint8_t *)txq->wqes;
1603 	*(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) |
1604 					      loc->mbuf->vlan_tci);
1605 	pdst += sizeof(struct rte_vlan_hdr);
1606 	/*
1607 	 * The WQEBB space availability is checked by caller.
1608 	 * Here we should be aware of WQE ring buffer wraparound only.
1609 	 */
1610 	part = (uint8_t *)txq->wqes_end - pdst;
1611 	part = RTE_MIN(part, len);
1612 	do {
1613 		rte_memcpy(pdst, buf, part);
1614 		len -= part;
1615 		if (likely(!len)) {
1616 			pdst += part;
1617 			if (!MLX5_TXOFF_CONFIG(MPW))
1618 				pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1619 			/* Note: no final wraparound check here. */
1620 			return (struct mlx5_wqe_dseg *)pdst;
1621 		}
1622 		pdst = (uint8_t *)txq->wqes;
1623 		buf += part;
1624 		part = len;
1625 	} while (true);
1626 }
1627 
1628 /**
1629  * Build the Ethernet Segment with optionally inlined data with
1630  * VLAN insertion and following Data Segments (if any) from
1631  * multi-segment packet. Used by ordinary send and TSO.
1632  *
1633  * @param txq
1634  *   Pointer to TX queue structure.
1635  * @param loc
1636  *   Pointer to burst routine local context.
1637  * @param wqe
1638  *   Pointer to WQE to fill with built Ethernet/Data Segments.
1639  * @param vlan
1640  *   Length of VLAN header to insert, 0 means no VLAN insertion.
1641  * @param inlen
1642  *   Data length to inline. For TSO this parameter specifies exact value,
1643  *   for ordinary send routine can be aligned by caller to provide better WQE
1644  *   space saving and data buffer start address alignment.
1645  *   This length includes VLAN header being inserted.
1646  * @param tso
1647  *   Zero means ordinary send, inlined data can be extended,
1648  *   otherwise this is TSO, inlined data length is fixed.
1649  * @param olx
1650  *   Configured Tx offloads mask. It is fully defined at
1651  *   compile time and may be used for optimization.
1652  *
1653  * @return
1654  *   Actual size of built WQE in segments.
1655  */
1656 static __rte_always_inline unsigned int
1657 mlx5_tx_mseg_build(struct mlx5_txq_data *__rte_restrict txq,
1658 		   struct mlx5_txq_local *__rte_restrict loc,
1659 		   struct mlx5_wqe *__rte_restrict wqe,
1660 		   unsigned int vlan,
1661 		   unsigned int inlen,
1662 		   unsigned int tso,
1663 		   unsigned int olx __rte_unused)
1664 {
1665 	struct mlx5_wqe_dseg *__rte_restrict dseg;
1666 	unsigned int ds;
1667 
1668 	MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen);
1669 	loc->mbuf_nseg = NB_SEGS(loc->mbuf);
1670 	loc->mbuf_off = 0;
1671 
1672 	dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx);
1673 	if (!loc->mbuf_nseg)
1674 		goto dseg_done;
1675 	/*
1676 	 * There are still some mbuf remaining, not inlined.
1677 	 * The first mbuf may be partially inlined and we
1678 	 * must process the possible non-zero data offset.
1679 	 */
1680 	if (loc->mbuf_off) {
1681 		unsigned int dlen;
1682 		uint8_t *dptr;
1683 
1684 		/*
1685 		 * Exhausted packets must be dropped before.
1686 		 * Non-zero offset means there are some data
1687 		 * remained in the packet.
1688 		 */
1689 		MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf));
1690 		MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf));
1691 		dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
1692 					       loc->mbuf_off);
1693 		dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off;
1694 		/*
1695 		 * Build the pointer/minimal Data Segment.
1696 		 * Do ring buffer wrapping check in advance.
1697 		 */
1698 		if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
1699 			dseg = (struct mlx5_wqe_dseg *)txq->wqes;
1700 		mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx);
1701 		/* Store the mbuf to be freed on completion. */
1702 		MLX5_ASSERT(loc->elts_free);
1703 		txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
1704 		--loc->elts_free;
1705 		++dseg;
1706 		if (--loc->mbuf_nseg == 0)
1707 			goto dseg_done;
1708 		loc->mbuf = loc->mbuf->next;
1709 		loc->mbuf_off = 0;
1710 	}
1711 	do {
1712 		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
1713 			struct rte_mbuf *mbuf;
1714 
1715 			/* Zero length segment found, just skip. */
1716 			mbuf = loc->mbuf;
1717 			loc->mbuf = loc->mbuf->next;
1718 			rte_pktmbuf_free_seg(mbuf);
1719 			if (--loc->mbuf_nseg == 0)
1720 				break;
1721 		} else {
1722 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
1723 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
1724 			mlx5_tx_dseg_iptr
1725 				(txq, loc, dseg,
1726 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
1727 				 rte_pktmbuf_data_len(loc->mbuf), olx);
1728 			MLX5_ASSERT(loc->elts_free);
1729 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
1730 			--loc->elts_free;
1731 			++dseg;
1732 			if (--loc->mbuf_nseg == 0)
1733 				break;
1734 			loc->mbuf = loc->mbuf->next;
1735 		}
1736 	} while (true);
1737 
1738 dseg_done:
1739 	/* Calculate actual segments used from the dseg pointer. */
1740 	if ((uintptr_t)wqe < (uintptr_t)dseg)
1741 		ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE;
1742 	else
1743 		ds = (((uintptr_t)dseg - (uintptr_t)wqe) +
1744 		      txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE;
1745 	return ds;
1746 }
1747 
1748 /**
1749  * The routine checks timestamp flag in the current packet,
1750  * and push WAIT WQE into the queue if scheduling is required.
1751  *
1752  * @param txq
1753  *   Pointer to TX queue structure.
1754  * @param loc
1755  *   Pointer to burst routine local context.
1756  * @param elts
1757  *   Number of free elements in elts buffer to be checked, for zero
1758  *   value the check is optimized out by compiler.
1759  * @param olx
1760  *   Configured Tx offloads mask. It is fully defined at
1761  *   compile time and may be used for optimization.
1762  *
1763  * @return
1764  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1765  *   MLX5_TXCMP_CODE_SINGLE - continue processing with the packet.
1766  *   MLX5_TXCMP_CODE_MULTI - the WAIT inserted, continue processing.
1767  * Local context variables partially updated.
1768  */
1769 static __rte_always_inline enum mlx5_txcmp_code
1770 mlx5_tx_schedule_send(struct mlx5_txq_data *restrict txq,
1771 		      struct mlx5_txq_local *restrict loc,
1772 		      uint16_t elts,
1773 		      unsigned int olx)
1774 {
1775 	if (MLX5_TXOFF_CONFIG(TXPP) &&
1776 	    loc->mbuf->ol_flags & txq->ts_mask) {
1777 		struct mlx5_dev_ctx_shared *sh;
1778 		struct mlx5_wqe *wqe;
1779 		uint64_t ts;
1780 
1781 		/*
1782 		 * Estimate the required space quickly and roughly.
1783 		 * We would like to ensure the packet can be pushed
1784 		 * to the queue and we won't get the orphan WAIT WQE.
1785 		 */
1786 		if (loc->wqe_free <= MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE ||
1787 		    loc->elts_free < elts)
1788 			return MLX5_TXCMP_CODE_EXIT;
1789 		/* Convert the timestamp into completion to wait. */
1790 		ts = *RTE_MBUF_DYNFIELD(loc->mbuf, txq->ts_offset, uint64_t *);
1791 		if (txq->ts_last && ts < txq->ts_last)
1792 			rte_atomic_fetch_add_explicit(&txq->sh->txpp.err_ts_order,
1793 					   1, rte_memory_order_relaxed);
1794 		txq->ts_last = ts;
1795 		wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
1796 		sh = txq->sh;
1797 		if (txq->wait_on_time) {
1798 			/* The wait on time capability should be used. */
1799 			ts -= sh->txpp.skew;
1800 			rte_pmd_mlx5_trace_tx_wait(ts);
1801 			mlx5_tx_cseg_init(txq, loc, wqe,
1802 					  1 + sizeof(struct mlx5_wqe_wseg) /
1803 					      MLX5_WSEG_SIZE,
1804 					  MLX5_OPCODE_WAIT |
1805 					  MLX5_OPC_MOD_WAIT_TIME << 24, olx);
1806 			mlx5_tx_wseg_init(txq, loc, wqe, ts, olx);
1807 		} else {
1808 			/* Legacy cross-channel operation should be used. */
1809 			int32_t wci;
1810 
1811 			wci = mlx5_txpp_convert_tx_ts(sh, ts);
1812 			if (unlikely(wci < 0))
1813 				return MLX5_TXCMP_CODE_SINGLE;
1814 			/* Build the WAIT WQE with specified completion. */
1815 			rte_pmd_mlx5_trace_tx_wait(ts - sh->txpp.skew);
1816 			mlx5_tx_cseg_init(txq, loc, wqe,
1817 					  1 + sizeof(struct mlx5_wqe_qseg) /
1818 					      MLX5_WSEG_SIZE,
1819 					  MLX5_OPCODE_WAIT |
1820 					  MLX5_OPC_MOD_WAIT_CQ_PI << 24, olx);
1821 			mlx5_tx_qseg_init(txq, loc, wqe, wci, olx);
1822 		}
1823 		++txq->wqe_ci;
1824 		--loc->wqe_free;
1825 		return MLX5_TXCMP_CODE_MULTI;
1826 	}
1827 	return MLX5_TXCMP_CODE_SINGLE;
1828 }
1829 
1830 /**
1831  * Tx one packet function for multi-segment TSO. Supports all
1832  * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs,
1833  * sends one packet per WQE.
1834  *
1835  * This routine is responsible for storing processed mbuf
1836  * into elts ring buffer and update elts_head.
1837  *
1838  * @param txq
1839  *   Pointer to TX queue structure.
1840  * @param loc
1841  *   Pointer to burst routine local context.
1842  * @param olx
1843  *   Configured Tx offloads mask. It is fully defined at
1844  *   compile time and may be used for optimization.
1845  *
1846  * @return
1847  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1848  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
1849  * Local context variables partially updated.
1850  */
1851 static __rte_always_inline enum mlx5_txcmp_code
1852 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *__rte_restrict txq,
1853 			struct mlx5_txq_local *__rte_restrict loc,
1854 			unsigned int olx)
1855 {
1856 	struct mlx5_wqe *__rte_restrict wqe;
1857 	unsigned int ds, dlen, inlen, ntcp, vlan = 0;
1858 
1859 	MLX5_ASSERT(loc->elts_free >= NB_SEGS(loc->mbuf));
1860 	if (MLX5_TXOFF_CONFIG(TXPP)) {
1861 		enum mlx5_txcmp_code wret;
1862 
1863 		/* Generate WAIT for scheduling if requested. */
1864 		wret = mlx5_tx_schedule_send(txq, loc, 0, olx);
1865 		if (wret == MLX5_TXCMP_CODE_EXIT)
1866 			return MLX5_TXCMP_CODE_EXIT;
1867 		if (wret == MLX5_TXCMP_CODE_ERROR)
1868 			return MLX5_TXCMP_CODE_ERROR;
1869 	}
1870 	/*
1871 	 * Calculate data length to be inlined to estimate
1872 	 * the required space in WQE ring buffer.
1873 	 */
1874 	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
1875 	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
1876 		vlan = sizeof(struct rte_vlan_hdr);
1877 	inlen = loc->mbuf->l2_len + vlan +
1878 		loc->mbuf->l3_len + loc->mbuf->l4_len;
1879 	if (unlikely((!inlen || !loc->mbuf->tso_segsz)))
1880 		return MLX5_TXCMP_CODE_ERROR;
1881 	if (loc->mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)
1882 		inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len;
1883 	/* Packet must contain all TSO headers. */
1884 	if (unlikely(inlen > MLX5_MAX_TSO_HEADER ||
1885 		     inlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
1886 		     inlen > (dlen + vlan)))
1887 		return MLX5_TXCMP_CODE_ERROR;
1888 	/*
1889 	 * Check whether there are enough free WQEBBs:
1890 	 * - Control Segment
1891 	 * - Ethernet Segment
1892 	 * - First Segment of inlined Ethernet data
1893 	 * - ... data continued ...
1894 	 * - Data Segments of pointer/min inline type
1895 	 */
1896 	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
1897 				       MLX5_ESEG_MIN_INLINE_SIZE +
1898 				       MLX5_WSEG_SIZE +
1899 				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
1900 	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
1901 		return MLX5_TXCMP_CODE_EXIT;
1902 	/* Check for maximal WQE size. */
1903 	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds))
1904 		return MLX5_TXCMP_CODE_ERROR;
1905 #ifdef MLX5_PMD_SOFT_COUNTERS
1906 	/* Update sent data bytes/packets counters. */
1907 	ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) /
1908 		loc->mbuf->tso_segsz;
1909 	/*
1910 	 * One will be added for mbuf itself at the end of the mlx5_tx_burst
1911 	 * from loc->pkts_sent field.
1912 	 */
1913 	--ntcp;
1914 	txq->stats.opackets += ntcp;
1915 	txq->stats.obytes += dlen + vlan + ntcp * inlen;
1916 #endif
1917 	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
1918 	loc->wqe_last = wqe;
1919 	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx);
1920 	rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
1921 	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx);
1922 	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
1923 	txq->wqe_ci += (ds + 3) / 4;
1924 	loc->wqe_free -= (ds + 3) / 4;
1925 	return MLX5_TXCMP_CODE_MULTI;
1926 }
1927 
1928 /**
1929  * Tx one packet function for multi-segment SEND. Supports all types of Tx
1930  * offloads, uses MLX5_OPCODE_SEND to build WQEs, sends one packet per WQE,
1931  * without any data inlining in Ethernet Segment.
1932  *
1933  * This routine is responsible for storing processed mbuf
1934  * into elts ring buffer and update elts_head.
1935  *
1936  * @param txq
1937  *   Pointer to TX queue structure.
1938  * @param loc
1939  *   Pointer to burst routine local context.
1940  * @param olx
1941  *   Configured Tx offloads mask. It is fully defined at
1942  *   compile time and may be used for optimization.
1943  *
1944  * @return
1945  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1946  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
1947  * Local context variables partially updated.
1948  */
1949 static __rte_always_inline enum mlx5_txcmp_code
1950 mlx5_tx_packet_multi_send(struct mlx5_txq_data *__rte_restrict txq,
1951 			  struct mlx5_txq_local *__rte_restrict loc,
1952 			  unsigned int olx)
1953 {
1954 	struct mlx5_wqe_dseg *__rte_restrict dseg;
1955 	struct mlx5_wqe *__rte_restrict wqe;
1956 	unsigned int ds, nseg;
1957 
1958 	MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
1959 	MLX5_ASSERT(loc->elts_free >= NB_SEGS(loc->mbuf));
1960 	if (MLX5_TXOFF_CONFIG(TXPP)) {
1961 		enum mlx5_txcmp_code wret;
1962 
1963 		/* Generate WAIT for scheduling if requested. */
1964 		wret = mlx5_tx_schedule_send(txq, loc, 0, olx);
1965 		if (wret == MLX5_TXCMP_CODE_EXIT)
1966 			return MLX5_TXCMP_CODE_EXIT;
1967 		if (wret == MLX5_TXCMP_CODE_ERROR)
1968 			return MLX5_TXCMP_CODE_ERROR;
1969 	}
1970 	/*
1971 	 * No inline at all, it means the CPU cycles saving is prioritized at
1972 	 * configuration, we should not copy any packet data to WQE.
1973 	 */
1974 	nseg = NB_SEGS(loc->mbuf);
1975 	ds = 2 + nseg;
1976 	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
1977 		return MLX5_TXCMP_CODE_EXIT;
1978 	/* Check for maximal WQE size. */
1979 	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds))
1980 		return MLX5_TXCMP_CODE_ERROR;
1981 	/*
1982 	 * Some Tx offloads may cause an error if packet is not long enough,
1983 	 * check against assumed minimal length.
1984 	 */
1985 	if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE)
1986 		return MLX5_TXCMP_CODE_ERROR;
1987 #ifdef MLX5_PMD_SOFT_COUNTERS
1988 	/* Update sent data bytes counter. */
1989 	txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf);
1990 	if (MLX5_TXOFF_CONFIG(VLAN) &&
1991 	    loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
1992 		txq->stats.obytes += sizeof(struct rte_vlan_hdr);
1993 #endif
1994 	/*
1995 	 * SEND WQE, one WQEBB:
1996 	 * - Control Segment, SEND opcode
1997 	 * - Ethernet Segment, optional VLAN, no inline
1998 	 * - Data Segments, pointer only type
1999 	 */
2000 	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2001 	loc->wqe_last = wqe;
2002 	mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx);
2003 	rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
2004 	mlx5_tx_eseg_none(txq, loc, wqe, olx);
2005 	dseg = &wqe->dseg[0];
2006 	do {
2007 		if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
2008 			struct rte_mbuf *mbuf;
2009 
2010 			/*
2011 			 * Zero length segment found, have to correct total
2012 			 * size of WQE in segments.
2013 			 * It is supposed to be rare occasion, so in normal
2014 			 * case (no zero length segments) we avoid extra
2015 			 * writing to the Control Segment.
2016 			 */
2017 			--ds;
2018 			wqe->cseg.sq_ds -= RTE_BE32(1);
2019 			mbuf = loc->mbuf;
2020 			loc->mbuf = mbuf->next;
2021 			rte_pktmbuf_free_seg(mbuf);
2022 			if (--nseg == 0)
2023 				break;
2024 		} else {
2025 			mlx5_tx_dseg_ptr
2026 				(txq, loc, dseg,
2027 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
2028 				 rte_pktmbuf_data_len(loc->mbuf), olx);
2029 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
2030 			--loc->elts_free;
2031 			if (--nseg == 0)
2032 				break;
2033 			++dseg;
2034 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
2035 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
2036 			loc->mbuf = loc->mbuf->next;
2037 		}
2038 	} while (true);
2039 	txq->wqe_ci += (ds + 3) / 4;
2040 	loc->wqe_free -= (ds + 3) / 4;
2041 	return MLX5_TXCMP_CODE_MULTI;
2042 }
2043 
2044 /**
2045  * Tx one packet function for multi-segment SEND. Supports all
2046  * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
2047  * sends one packet per WQE, with data inlining in
2048  * Ethernet Segment and minimal Data Segments.
2049  *
2050  * This routine is responsible for storing processed mbuf
2051  * into elts ring buffer and update elts_head.
2052  *
2053  * @param txq
2054  *   Pointer to TX queue structure.
2055  * @param loc
2056  *   Pointer to burst routine local context.
2057  * @param olx
2058  *   Configured Tx offloads mask. It is fully defined at
2059  *   compile time and may be used for optimization.
2060  *
2061  * @return
2062  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2063  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2064  * Local context variables partially updated.
2065  */
2066 static __rte_always_inline enum mlx5_txcmp_code
2067 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *__rte_restrict txq,
2068 			    struct mlx5_txq_local *__rte_restrict loc,
2069 			    unsigned int olx)
2070 {
2071 	struct mlx5_wqe *__rte_restrict wqe;
2072 	unsigned int ds, inlen, dlen, vlan = 0;
2073 
2074 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
2075 	MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
2076 	MLX5_ASSERT(loc->elts_free >= NB_SEGS(loc->mbuf));
2077 	/*
2078 	 * First calculate data length to be inlined
2079 	 * to estimate the required space for WQE.
2080 	 */
2081 	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
2082 	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
2083 		vlan = sizeof(struct rte_vlan_hdr);
2084 	inlen = dlen + vlan;
2085 	/* Check against minimal length. */
2086 	if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
2087 		return MLX5_TXCMP_CODE_ERROR;
2088 	MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
2089 	if (inlen > txq->inlen_send ||
2090 	    loc->mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE) {
2091 		struct rte_mbuf *mbuf;
2092 		unsigned int nxlen;
2093 		uintptr_t start;
2094 
2095 		mbuf = loc->mbuf;
2096 		nxlen = rte_pktmbuf_data_len(mbuf) + vlan;
2097 		/*
2098 		 * Packet length exceeds the allowed inline data length,
2099 		 * check whether the minimal inlining is required.
2100 		 */
2101 		if (txq->inlen_mode) {
2102 			MLX5_ASSERT(txq->inlen_mode >=
2103 				    MLX5_ESEG_MIN_INLINE_SIZE);
2104 			MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send);
2105 			inlen = RTE_MIN(txq->inlen_mode, inlen);
2106 		} else if (vlan && !txq->vlan_en) {
2107 			/*
2108 			 * VLAN insertion is requested and hardware does not
2109 			 * support the offload, will do with software inline.
2110 			 */
2111 			inlen = MLX5_ESEG_MIN_INLINE_SIZE;
2112 		} else if (mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE ||
2113 			   nxlen > txq->inlen_send) {
2114 			return mlx5_tx_packet_multi_send(txq, loc, olx);
2115 		} else if (nxlen <= MLX5_ESEG_MIN_INLINE_SIZE) {
2116 			inlen = MLX5_ESEG_MIN_INLINE_SIZE;
2117 		} else {
2118 			goto do_first;
2119 		}
2120 		if (mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE)
2121 			goto do_build;
2122 		/*
2123 		 * Now we know the minimal amount of data is requested
2124 		 * to inline. Check whether we should inline the buffers
2125 		 * from the chain beginning to eliminate some mbufs.
2126 		 */
2127 		if (unlikely(nxlen <= txq->inlen_send)) {
2128 			/* We can inline first mbuf at least. */
2129 			if (nxlen < inlen) {
2130 				unsigned int smlen;
2131 
2132 				/* Scan mbufs till inlen filled. */
2133 				do {
2134 					smlen = nxlen;
2135 					mbuf = NEXT(mbuf);
2136 					MLX5_ASSERT(mbuf);
2137 					nxlen = rte_pktmbuf_data_len(mbuf);
2138 					nxlen += smlen;
2139 				} while (unlikely(nxlen < inlen));
2140 				if (unlikely(nxlen > txq->inlen_send)) {
2141 					/* We cannot inline entire mbuf. */
2142 					smlen = inlen - smlen;
2143 					start = rte_pktmbuf_mtod_offset
2144 						    (mbuf, uintptr_t, smlen);
2145 					goto do_align;
2146 				}
2147 			}
2148 do_first:
2149 			do {
2150 				inlen = nxlen;
2151 				mbuf = NEXT(mbuf);
2152 				/* There should be not end of packet. */
2153 				MLX5_ASSERT(mbuf);
2154 				if (mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE)
2155 					break;
2156 				nxlen = inlen + rte_pktmbuf_data_len(mbuf);
2157 			} while (unlikely(nxlen < txq->inlen_send));
2158 		}
2159 		start = rte_pktmbuf_mtod(mbuf, uintptr_t);
2160 		/*
2161 		 * Check whether we can do inline to align start
2162 		 * address of data buffer to cacheline.
2163 		 */
2164 do_align:
2165 		start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1);
2166 		if (unlikely(start)) {
2167 			start += inlen;
2168 			if (start <= txq->inlen_send)
2169 				inlen = start;
2170 		}
2171 	}
2172 	/*
2173 	 * Check whether there are enough free WQEBBs:
2174 	 * - Control Segment
2175 	 * - Ethernet Segment
2176 	 * - First Segment of inlined Ethernet data
2177 	 * - ... data continued ...
2178 	 * - Data Segments of pointer/min inline type
2179 	 *
2180 	 * Estimate the number of Data Segments conservatively,
2181 	 * supposing no any mbufs is being freed during inlining.
2182 	 */
2183 do_build:
2184 	if (MLX5_TXOFF_CONFIG(TXPP)) {
2185 		enum mlx5_txcmp_code wret;
2186 
2187 		/* Generate WAIT for scheduling if requested. */
2188 		wret = mlx5_tx_schedule_send(txq, loc, 0, olx);
2189 		if (wret == MLX5_TXCMP_CODE_EXIT)
2190 			return MLX5_TXCMP_CODE_EXIT;
2191 		if (wret == MLX5_TXCMP_CODE_ERROR)
2192 			return MLX5_TXCMP_CODE_ERROR;
2193 	}
2194 	MLX5_ASSERT(inlen <= txq->inlen_send);
2195 	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
2196 				       MLX5_ESEG_MIN_INLINE_SIZE +
2197 				       MLX5_WSEG_SIZE +
2198 				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
2199 	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
2200 		return MLX5_TXCMP_CODE_EXIT;
2201 	/* Check for maximal WQE size. */
2202 	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds)) {
2203 		/*  Check if we can adjust the inline length. */
2204 		if (unlikely(txq->inlen_mode)) {
2205 			ds = NB_SEGS(loc->mbuf) + 2 +
2206 				(txq->inlen_mode -
2207 				MLX5_ESEG_MIN_INLINE_SIZE +
2208 				MLX5_WSEG_SIZE +
2209 				MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
2210 			if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ds))
2211 				return MLX5_TXCMP_CODE_ERROR;
2212 		}
2213 		/* We have lucky opportunity to adjust. */
2214 		inlen = RTE_MIN(inlen, MLX5_WQE_SIZE_MAX -
2215 				       MLX5_WSEG_SIZE * 2 -
2216 				       MLX5_WSEG_SIZE * NB_SEGS(loc->mbuf) -
2217 				       MLX5_WSEG_SIZE +
2218 				       MLX5_ESEG_MIN_INLINE_SIZE);
2219 	}
2220 #ifdef MLX5_PMD_SOFT_COUNTERS
2221 	/* Update sent data bytes/packets counters. */
2222 	txq->stats.obytes += dlen + vlan;
2223 #endif
2224 	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2225 	loc->wqe_last = wqe;
2226 	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx);
2227 	rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
2228 	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx);
2229 	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
2230 	txq->wqe_ci += (ds + 3) / 4;
2231 	loc->wqe_free -= (ds + 3) / 4;
2232 	return MLX5_TXCMP_CODE_MULTI;
2233 }
2234 
2235 /**
2236  * Tx burst function for multi-segment packets. Supports all
2237  * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs,
2238  * sends one packet per WQE. Function stops sending if it
2239  * encounters the single-segment packet.
2240  *
2241  * This routine is responsible for storing processed mbuf
2242  * into elts ring buffer and update elts_head.
2243  *
2244  * @param txq
2245  *   Pointer to TX queue structure.
2246  * @param[in] pkts
2247  *   Packets to transmit.
2248  * @param pkts_n
2249  *   Number of packets in array.
2250  * @param loc
2251  *   Pointer to burst routine local context.
2252  * @param olx
2253  *   Configured Tx offloads mask. It is fully defined at
2254  *   compile time and may be used for optimization.
2255  *
2256  * @return
2257  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2258  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2259  *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
2260  *   MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered.
2261  * Local context variables updated.
2262  */
2263 static __rte_always_inline enum mlx5_txcmp_code
2264 mlx5_tx_burst_mseg(struct mlx5_txq_data *__rte_restrict txq,
2265 		   struct rte_mbuf **__rte_restrict pkts,
2266 		   unsigned int pkts_n,
2267 		   struct mlx5_txq_local *__rte_restrict loc,
2268 		   unsigned int olx)
2269 {
2270 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2271 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
2272 	pkts += loc->pkts_sent + 1;
2273 	pkts_n -= loc->pkts_sent;
2274 	for (;;) {
2275 		enum mlx5_txcmp_code ret;
2276 
2277 		MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
2278 		/*
2279 		 * Estimate the number of free elts quickly but conservatively.
2280 		 * Some segment may be fully inlined and freed,
2281 		 * ignore this here - precise estimation is costly.
2282 		 */
2283 		if (loc->elts_free < NB_SEGS(loc->mbuf))
2284 			return MLX5_TXCMP_CODE_EXIT;
2285 		if (MLX5_TXOFF_CONFIG(TSO) &&
2286 		    unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
2287 			/* Proceed with multi-segment TSO. */
2288 			ret = mlx5_tx_packet_multi_tso(txq, loc, olx);
2289 		} else if (MLX5_TXOFF_CONFIG(INLINE)) {
2290 			/* Proceed with multi-segment SEND with inlining. */
2291 			ret = mlx5_tx_packet_multi_inline(txq, loc, olx);
2292 		} else {
2293 			/* Proceed with multi-segment SEND w/o inlining. */
2294 			ret = mlx5_tx_packet_multi_send(txq, loc, olx);
2295 		}
2296 		if (ret == MLX5_TXCMP_CODE_EXIT)
2297 			return MLX5_TXCMP_CODE_EXIT;
2298 		if (ret == MLX5_TXCMP_CODE_ERROR)
2299 			return MLX5_TXCMP_CODE_ERROR;
2300 		/* WQE is built, go to the next packet. */
2301 		++loc->pkts_sent;
2302 		--pkts_n;
2303 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
2304 			return MLX5_TXCMP_CODE_EXIT;
2305 		loc->mbuf = *pkts++;
2306 		if (pkts_n > 1)
2307 			rte_prefetch0(*pkts);
2308 		if (likely(NB_SEGS(loc->mbuf) > 1))
2309 			continue;
2310 		/* Here ends the series of multi-segment packets. */
2311 		if (MLX5_TXOFF_CONFIG(TSO) &&
2312 		    unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG))
2313 			return MLX5_TXCMP_CODE_TSO;
2314 		return MLX5_TXCMP_CODE_SINGLE;
2315 	}
2316 	MLX5_ASSERT(false);
2317 }
2318 
2319 /**
2320  * Tx burst function for single-segment packets with TSO.
2321  * Supports all types of Tx offloads, except multi-packets.
2322  * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE.
2323  * Function stops sending if it encounters the multi-segment
2324  * packet or packet without TSO requested.
2325  *
2326  * The routine is responsible for storing processed mbuf into elts ring buffer
2327  * and update elts_head if inline offloads is requested due to possible early
2328  * freeing of the inlined mbufs (can not store pkts array in elts as a batch).
2329  *
2330  * @param txq
2331  *   Pointer to TX queue structure.
2332  * @param[in] pkts
2333  *   Packets to transmit.
2334  * @param pkts_n
2335  *   Number of packets in array.
2336  * @param loc
2337  *   Pointer to burst routine local context.
2338  * @param olx
2339  *   Configured Tx offloads mask. It is fully defined at
2340  *   compile time and may be used for optimization.
2341  *
2342  * @return
2343  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2344  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2345  *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
2346  *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
2347  * Local context variables updated.
2348  */
2349 static __rte_always_inline enum mlx5_txcmp_code
2350 mlx5_tx_burst_tso(struct mlx5_txq_data *__rte_restrict txq,
2351 		  struct rte_mbuf **__rte_restrict pkts,
2352 		  unsigned int pkts_n,
2353 		  struct mlx5_txq_local *__rte_restrict loc,
2354 		  unsigned int olx)
2355 {
2356 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2357 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
2358 	pkts += loc->pkts_sent + 1;
2359 	pkts_n -= loc->pkts_sent;
2360 	for (;;) {
2361 		struct mlx5_wqe_dseg *__rte_restrict dseg;
2362 		struct mlx5_wqe *__rte_restrict wqe;
2363 		unsigned int ds, dlen, hlen, ntcp, vlan = 0;
2364 		uint8_t *dptr;
2365 
2366 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2367 		if (MLX5_TXOFF_CONFIG(TXPP)) {
2368 			enum mlx5_txcmp_code wret;
2369 
2370 			/* Generate WAIT for scheduling if requested. */
2371 			wret = mlx5_tx_schedule_send(txq, loc, 1, olx);
2372 			if (wret == MLX5_TXCMP_CODE_EXIT)
2373 				return MLX5_TXCMP_CODE_EXIT;
2374 			if (wret == MLX5_TXCMP_CODE_ERROR)
2375 				return MLX5_TXCMP_CODE_ERROR;
2376 		}
2377 		dlen = rte_pktmbuf_data_len(loc->mbuf);
2378 		if (MLX5_TXOFF_CONFIG(VLAN) &&
2379 		    loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
2380 			vlan = sizeof(struct rte_vlan_hdr);
2381 		}
2382 		/*
2383 		 * First calculate the WQE size to check
2384 		 * whether we have enough space in ring buffer.
2385 		 */
2386 		hlen = loc->mbuf->l2_len + vlan +
2387 		       loc->mbuf->l3_len + loc->mbuf->l4_len;
2388 		if (unlikely((!hlen || !loc->mbuf->tso_segsz)))
2389 			return MLX5_TXCMP_CODE_ERROR;
2390 		if (loc->mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)
2391 			hlen += loc->mbuf->outer_l2_len +
2392 				loc->mbuf->outer_l3_len;
2393 		/* Segment must contain all TSO headers. */
2394 		if (unlikely(hlen > MLX5_MAX_TSO_HEADER ||
2395 			     hlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
2396 			     hlen > (dlen + vlan)))
2397 			return MLX5_TXCMP_CODE_ERROR;
2398 		/*
2399 		 * Check whether there are enough free WQEBBs:
2400 		 * - Control Segment
2401 		 * - Ethernet Segment
2402 		 * - First Segment of inlined Ethernet data
2403 		 * - ... data continued ...
2404 		 * - Finishing Data Segment of pointer type
2405 		 */
2406 		ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE +
2407 			  MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
2408 		if (loc->wqe_free < ((ds + 3) / 4))
2409 			return MLX5_TXCMP_CODE_EXIT;
2410 #ifdef MLX5_PMD_SOFT_COUNTERS
2411 		/* Update sent data bytes/packets counters. */
2412 		ntcp = (dlen + vlan - hlen +
2413 			loc->mbuf->tso_segsz - 1) /
2414 			loc->mbuf->tso_segsz;
2415 		/*
2416 		 * One will be added for mbuf itself at the end
2417 		 * of the mlx5_tx_burst from loc->pkts_sent field.
2418 		 */
2419 		--ntcp;
2420 		txq->stats.opackets += ntcp;
2421 		txq->stats.obytes += dlen + vlan + ntcp * hlen;
2422 #endif
2423 		/*
2424 		 * Build the TSO WQE:
2425 		 * - Control Segment
2426 		 * - Ethernet Segment with hlen bytes inlined
2427 		 * - Data Segment of pointer type
2428 		 */
2429 		wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2430 		loc->wqe_last = wqe;
2431 		mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_TSO, olx);
2432 		rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
2433 		dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx);
2434 		dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan;
2435 		dlen -= hlen - vlan;
2436 		mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
2437 		/*
2438 		 * WQE is built, update the loop parameters
2439 		 * and go to the next packet.
2440 		 */
2441 		txq->wqe_ci += (ds + 3) / 4;
2442 		loc->wqe_free -= (ds + 3) / 4;
2443 		if (MLX5_TXOFF_CONFIG(INLINE))
2444 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
2445 		--loc->elts_free;
2446 		++loc->pkts_sent;
2447 		--pkts_n;
2448 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
2449 			return MLX5_TXCMP_CODE_EXIT;
2450 		loc->mbuf = *pkts++;
2451 		if (pkts_n > 1)
2452 			rte_prefetch0(*pkts);
2453 		if (MLX5_TXOFF_CONFIG(MULTI) &&
2454 		    unlikely(NB_SEGS(loc->mbuf) > 1))
2455 			return MLX5_TXCMP_CODE_MULTI;
2456 		if (likely(!(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)))
2457 			return MLX5_TXCMP_CODE_SINGLE;
2458 		/* Continue with the next TSO packet. */
2459 	}
2460 	MLX5_ASSERT(false);
2461 }
2462 
2463 /**
2464  * Analyze the packet and select the best method to send.
2465  *
2466  * @param txq
2467  *   Pointer to TX queue structure.
2468  * @param loc
2469  *   Pointer to burst routine local context.
2470  * @param olx
2471  *   Configured Tx offloads mask. It is fully defined at
2472  *   compile time and may be used for optimization.
2473  * @param newp
2474  *   The predefined flag whether do complete check for
2475  *   multi-segment packets and TSO.
2476  *
2477  * @return
2478  *  MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
2479  *  MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO.
2480  *  MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND.
2481  *  MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
2482  */
2483 static __rte_always_inline enum mlx5_txcmp_code
2484 mlx5_tx_able_to_empw(struct mlx5_txq_data *__rte_restrict txq,
2485 		     struct mlx5_txq_local *__rte_restrict loc,
2486 		     unsigned int olx,
2487 		     bool newp)
2488 {
2489 	/* Check for multi-segment packet. */
2490 	if (newp &&
2491 	    MLX5_TXOFF_CONFIG(MULTI) &&
2492 	    unlikely(NB_SEGS(loc->mbuf) > 1))
2493 		return MLX5_TXCMP_CODE_MULTI;
2494 	/* Check for TSO packet. */
2495 	if (newp &&
2496 	    MLX5_TXOFF_CONFIG(TSO) &&
2497 	    unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG))
2498 		return MLX5_TXCMP_CODE_TSO;
2499 	/* Check if eMPW is enabled at all. */
2500 	if (!MLX5_TXOFF_CONFIG(EMPW))
2501 		return MLX5_TXCMP_CODE_SINGLE;
2502 	/* Check if eMPW can be engaged. */
2503 	if (MLX5_TXOFF_CONFIG(VLAN) &&
2504 	    unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) &&
2505 		(!MLX5_TXOFF_CONFIG(INLINE) ||
2506 		 unlikely((rte_pktmbuf_data_len(loc->mbuf) +
2507 			   sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) {
2508 		/*
2509 		 * eMPW does not support VLAN insertion offload, we have to
2510 		 * inline the entire packet but packet is too long for inlining.
2511 		 */
2512 		return MLX5_TXCMP_CODE_SINGLE;
2513 	}
2514 	return MLX5_TXCMP_CODE_EMPW;
2515 }
2516 
2517 /**
2518  * Check the next packet attributes to match with the eMPW batch ones.
2519  * In addition, for legacy MPW the packet length is checked either.
2520  *
2521  * @param txq
2522  *   Pointer to TX queue structure.
2523  * @param es
2524  *   Pointer to Ethernet Segment of eMPW batch.
2525  * @param loc
2526  *   Pointer to burst routine local context.
2527  * @param dlen
2528  *   Length of previous packet in MPW descriptor.
2529  * @param olx
2530  *   Configured Tx offloads mask. It is fully defined at
2531  *   compile time and may be used for optimization.
2532  *
2533  * @return
2534  *  true - packet match with eMPW batch attributes.
2535  *  false - no match, eMPW should be restarted.
2536  */
2537 static __rte_always_inline bool
2538 mlx5_tx_match_empw(struct mlx5_txq_data *__rte_restrict txq,
2539 		   struct mlx5_wqe_eseg *__rte_restrict es,
2540 		   struct mlx5_txq_local *__rte_restrict loc,
2541 		   uint32_t dlen,
2542 		   unsigned int olx)
2543 {
2544 	uint8_t swp_flags = 0;
2545 
2546 	/* Compare the checksum flags, if any. */
2547 	if (MLX5_TXOFF_CONFIG(CSUM) &&
2548 	    txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
2549 		return false;
2550 	/* Compare the Software Parser offsets and flags. */
2551 	if (MLX5_TXOFF_CONFIG(SWP) &&
2552 	    (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
2553 	     es->swp_flags != swp_flags))
2554 		return false;
2555 	/* Fill metadata field if needed. */
2556 	if (MLX5_TXOFF_CONFIG(METADATA) &&
2557 		es->metadata != (loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
2558 				 rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) : 0))
2559 		return false;
2560 	/* Legacy MPW can send packets with the same length only. */
2561 	if (MLX5_TXOFF_CONFIG(MPW) &&
2562 	    dlen != rte_pktmbuf_data_len(loc->mbuf))
2563 		return false;
2564 	/* There must be no VLAN packets in eMPW loop. */
2565 	if (MLX5_TXOFF_CONFIG(VLAN))
2566 		MLX5_ASSERT(!(loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN));
2567 	/* Check if the scheduling is requested. */
2568 	if (MLX5_TXOFF_CONFIG(TXPP) &&
2569 	    loc->mbuf->ol_flags & txq->ts_mask)
2570 		return false;
2571 	return true;
2572 }
2573 
2574 /**
2575  * Update send loop variables and WQE for eMPW loop without data inlining.
2576  * Number of Data Segments is equal to the number of sent packets.
2577  *
2578  * @param txq
2579  *   Pointer to TX queue structure.
2580  * @param loc
2581  *   Pointer to burst routine local context.
2582  * @param ds
2583  *   Number of packets/Data Segments/Packets.
2584  * @param slen
2585  *   Accumulated statistics, bytes sent.
2586  * @param olx
2587  *   Configured Tx offloads mask. It is fully defined at
2588  *   compile time and may be used for optimization.
2589  *
2590  * @return
2591  *  true - packet match with eMPW batch attributes.
2592  *  false - no match, eMPW should be restarted.
2593  */
2594 static __rte_always_inline void
2595 mlx5_tx_sdone_empw(struct mlx5_txq_data *__rte_restrict txq,
2596 		   struct mlx5_txq_local *__rte_restrict loc,
2597 		   unsigned int ds,
2598 		   unsigned int slen,
2599 		   unsigned int olx __rte_unused)
2600 {
2601 	MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
2602 #ifdef MLX5_PMD_SOFT_COUNTERS
2603 	/* Update sent data bytes counter. */
2604 	 txq->stats.obytes += slen;
2605 #else
2606 	(void)slen;
2607 #endif
2608 	loc->elts_free -= ds;
2609 	loc->pkts_sent += ds;
2610 	ds += 2;
2611 	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
2612 	txq->wqe_ci += (ds + 3) / 4;
2613 	loc->wqe_free -= (ds + 3) / 4;
2614 }
2615 
2616 /**
2617  * Update send loop variables and WQE for eMPW loop with data inlining.
2618  * Gets the size of pushed descriptors and data to the WQE.
2619  *
2620  * @param txq
2621  *   Pointer to TX queue structure.
2622  * @param loc
2623  *   Pointer to burst routine local context.
2624  * @param len
2625  *   Total size of descriptor/data in bytes.
2626  * @param slen
2627  *   Accumulated statistics, data bytes sent.
2628  * @param wqem
2629  *   The base WQE for the eMPW/MPW descriptor.
2630  * @param olx
2631  *   Configured Tx offloads mask. It is fully defined at
2632  *   compile time and may be used for optimization.
2633  *
2634  * @return
2635  *  true - packet match with eMPW batch attributes.
2636  *  false - no match, eMPW should be restarted.
2637  */
2638 static __rte_always_inline void
2639 mlx5_tx_idone_empw(struct mlx5_txq_data *__rte_restrict txq,
2640 		   struct mlx5_txq_local *__rte_restrict loc,
2641 		   unsigned int len,
2642 		   unsigned int slen,
2643 		   struct mlx5_wqe *__rte_restrict wqem,
2644 		   unsigned int olx __rte_unused)
2645 {
2646 	struct mlx5_wqe_dseg *dseg = &wqem->dseg[0];
2647 
2648 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
2649 #ifdef MLX5_PMD_SOFT_COUNTERS
2650 	/* Update sent data bytes counter. */
2651 	 txq->stats.obytes += slen;
2652 #else
2653 	(void)slen;
2654 #endif
2655 	if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
2656 		/*
2657 		 * If the legacy MPW session contains the inline packets
2658 		 * we should set the only inline data segment length
2659 		 * and align the total length to the segment size.
2660 		 */
2661 		MLX5_ASSERT(len > sizeof(dseg->bcount));
2662 		dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
2663 						MLX5_ETH_WQE_DATA_INLINE);
2664 		len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
2665 	} else {
2666 		/*
2667 		 * The session is not legacy MPW or contains the
2668 		 * data buffer pointer segments.
2669 		 */
2670 		MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
2671 		len = len / MLX5_WSEG_SIZE + 2;
2672 	}
2673 	wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
2674 	txq->wqe_ci += (len + 3) / 4;
2675 	loc->wqe_free -= (len + 3) / 4;
2676 	loc->wqe_last = wqem;
2677 }
2678 
2679 /**
2680  * The set of Tx burst functions for single-segment packets without TSO
2681  * and with Multi-Packet Writing feature support.
2682  * Supports all types of Tx offloads, except multi-packets and TSO.
2683  *
2684  * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends as many packet
2685  * per WQE as it can. If eMPW is not configured or packet can not be sent with
2686  * eMPW (VLAN insertion) the ordinary SEND opcode is used and only one packet
2687  * placed in WQE.
2688  *
2689  * Functions stop sending if it encounters the multi-segment packet or packet
2690  * with TSO requested.
2691  *
2692  * The routines are responsible for storing processed mbuf into elts ring buffer
2693  * and update elts_head if inlining offload is requested. Otherwise the copying
2694  * mbufs to elts can be postponed and completed at the end of burst routine.
2695  *
2696  * @param txq
2697  *   Pointer to TX queue structure.
2698  * @param[in] pkts
2699  *   Packets to transmit.
2700  * @param pkts_n
2701  *   Number of packets in array.
2702  * @param loc
2703  *   Pointer to burst routine local context.
2704  * @param olx
2705  *   Configured Tx offloads mask. It is fully defined at
2706  *   compile time and may be used for optimization.
2707  *
2708  * @return
2709  *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2710  *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2711  *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
2712  *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
2713  *   MLX5_TXCMP_CODE_SINGLE - used inside functions set.
2714  *   MLX5_TXCMP_CODE_EMPW - used inside functions set.
2715  *
2716  * Local context variables updated.
2717  *
2718  *
2719  * The routine sends packets with MLX5_OPCODE_EMPW
2720  * without inlining, this is dedicated optimized branch.
2721  * No VLAN insertion is supported.
2722  */
2723 static __rte_always_inline enum mlx5_txcmp_code
2724 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *__rte_restrict txq,
2725 			  struct rte_mbuf **__rte_restrict pkts,
2726 			  unsigned int pkts_n,
2727 			  struct mlx5_txq_local *__rte_restrict loc,
2728 			  unsigned int olx)
2729 {
2730 	/*
2731 	 * Subroutine is the part of mlx5_tx_burst_single() and sends
2732 	 * single-segment packet with eMPW opcode without data inlining.
2733 	 */
2734 	MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
2735 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
2736 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2737 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
2738 	pkts += loc->pkts_sent + 1;
2739 	pkts_n -= loc->pkts_sent;
2740 	for (;;) {
2741 		struct mlx5_wqe_dseg *__rte_restrict dseg;
2742 		struct mlx5_wqe_eseg *__rte_restrict eseg;
2743 		enum mlx5_txcmp_code ret;
2744 		unsigned int part, loop;
2745 		unsigned int slen = 0;
2746 
2747 next_empw:
2748 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2749 		part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
2750 				       MLX5_MPW_MAX_PACKETS :
2751 				       MLX5_EMPW_MAX_PACKETS);
2752 		if (unlikely(loc->elts_free < part)) {
2753 			/* We have no enough elts to save all mbufs. */
2754 			if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
2755 				return MLX5_TXCMP_CODE_EXIT;
2756 			/* But we still able to send at least minimal eMPW. */
2757 			part = loc->elts_free;
2758 		}
2759 		if (MLX5_TXOFF_CONFIG(TXPP)) {
2760 			enum mlx5_txcmp_code wret;
2761 
2762 			/* Generate WAIT for scheduling if requested. */
2763 			wret = mlx5_tx_schedule_send(txq, loc, 0, olx);
2764 			if (wret == MLX5_TXCMP_CODE_EXIT)
2765 				return MLX5_TXCMP_CODE_EXIT;
2766 			if (wret == MLX5_TXCMP_CODE_ERROR)
2767 				return MLX5_TXCMP_CODE_ERROR;
2768 		}
2769 		/* Check whether we have enough WQEs */
2770 		if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) {
2771 			if (unlikely(loc->wqe_free <
2772 				((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
2773 				return MLX5_TXCMP_CODE_EXIT;
2774 			part = (loc->wqe_free * 4) - 2;
2775 		}
2776 		if (likely(part > 1))
2777 			rte_prefetch0(*pkts);
2778 		loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2779 		/*
2780 		 * Build eMPW title WQEBB:
2781 		 * - Control Segment, eMPW opcode
2782 		 * - Ethernet Segment, no inline
2783 		 */
2784 		mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2,
2785 				  MLX5_OPCODE_ENHANCED_MPSW, olx);
2786 		mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
2787 				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
2788 		eseg = &loc->wqe_last->eseg;
2789 		dseg = &loc->wqe_last->dseg[0];
2790 		loop = part;
2791 		/* Store the packet length for legacy MPW. */
2792 		if (MLX5_TXOFF_CONFIG(MPW))
2793 			eseg->mss = rte_cpu_to_be_16
2794 					(rte_pktmbuf_data_len(loc->mbuf));
2795 		for (;;) {
2796 			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
2797 #ifdef MLX5_PMD_SOFT_COUNTERS
2798 			/* Update sent data bytes counter. */
2799 			slen += dlen;
2800 #endif
2801 			rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
2802 			mlx5_tx_dseg_ptr
2803 				(txq, loc, dseg,
2804 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
2805 				 dlen, olx);
2806 			if (unlikely(--loop == 0))
2807 				break;
2808 			loc->mbuf = *pkts++;
2809 			if (likely(loop > 1))
2810 				rte_prefetch0(*pkts);
2811 			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
2812 			/*
2813 			 * Unroll the completion code to avoid
2814 			 * returning variable value - it results in
2815 			 * unoptimized sequent checking in caller.
2816 			 */
2817 			if (ret == MLX5_TXCMP_CODE_MULTI) {
2818 				part -= loop;
2819 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2820 				if (unlikely(!loc->elts_free ||
2821 					     !loc->wqe_free))
2822 					return MLX5_TXCMP_CODE_EXIT;
2823 				return MLX5_TXCMP_CODE_MULTI;
2824 			}
2825 			MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2826 			if (ret == MLX5_TXCMP_CODE_TSO) {
2827 				part -= loop;
2828 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2829 				if (unlikely(!loc->elts_free ||
2830 					     !loc->wqe_free))
2831 					return MLX5_TXCMP_CODE_EXIT;
2832 				return MLX5_TXCMP_CODE_TSO;
2833 			}
2834 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
2835 				part -= loop;
2836 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2837 				if (unlikely(!loc->elts_free ||
2838 					     !loc->wqe_free))
2839 					return MLX5_TXCMP_CODE_EXIT;
2840 				return MLX5_TXCMP_CODE_SINGLE;
2841 			}
2842 			if (ret != MLX5_TXCMP_CODE_EMPW) {
2843 				MLX5_ASSERT(false);
2844 				part -= loop;
2845 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2846 				return MLX5_TXCMP_CODE_ERROR;
2847 			}
2848 			/*
2849 			 * Check whether packet parameters coincide
2850 			 * within assumed eMPW batch:
2851 			 * - check sum settings
2852 			 * - metadata value
2853 			 * - software parser settings
2854 			 * - packets length (legacy MPW only)
2855 			 * - scheduling is not required
2856 			 */
2857 			if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) {
2858 				MLX5_ASSERT(loop);
2859 				part -= loop;
2860 				mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2861 				if (unlikely(!loc->elts_free ||
2862 					     !loc->wqe_free))
2863 					return MLX5_TXCMP_CODE_EXIT;
2864 				pkts_n -= part;
2865 				goto next_empw;
2866 			}
2867 			/* Packet attributes match, continue the same eMPW. */
2868 			++dseg;
2869 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
2870 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
2871 		}
2872 		/* eMPW is built successfully, update loop parameters. */
2873 		MLX5_ASSERT(!loop);
2874 		MLX5_ASSERT(pkts_n >= part);
2875 #ifdef MLX5_PMD_SOFT_COUNTERS
2876 		/* Update sent data bytes counter. */
2877 		txq->stats.obytes += slen;
2878 #endif
2879 		loc->elts_free -= part;
2880 		loc->pkts_sent += part;
2881 		txq->wqe_ci += (2 + part + 3) / 4;
2882 		loc->wqe_free -= (2 + part + 3) / 4;
2883 		pkts_n -= part;
2884 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
2885 			return MLX5_TXCMP_CODE_EXIT;
2886 		loc->mbuf = *pkts++;
2887 		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
2888 		if (unlikely(ret != MLX5_TXCMP_CODE_EMPW))
2889 			return ret;
2890 		/* Continue sending eMPW batches. */
2891 	}
2892 	MLX5_ASSERT(false);
2893 }
2894 
2895 /**
2896  * The routine sends packets with MLX5_OPCODE_EMPW
2897  * with inlining, optionally supports VLAN insertion.
2898  */
2899 static __rte_always_inline enum mlx5_txcmp_code
2900 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *__rte_restrict txq,
2901 			  struct rte_mbuf **__rte_restrict pkts,
2902 			  unsigned int pkts_n,
2903 			  struct mlx5_txq_local *__rte_restrict loc,
2904 			  unsigned int olx)
2905 {
2906 	/*
2907 	 * Subroutine is the part of mlx5_tx_burst_single() and sends
2908 	 * single-segment packet with eMPW opcode with data inlining.
2909 	 */
2910 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
2911 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
2912 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2913 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
2914 	pkts += loc->pkts_sent + 1;
2915 	pkts_n -= loc->pkts_sent;
2916 	for (;;) {
2917 		struct mlx5_wqe_dseg *__rte_restrict dseg;
2918 		struct mlx5_wqe *__rte_restrict wqem;
2919 		enum mlx5_txcmp_code ret;
2920 		unsigned int room, part, nlim;
2921 		unsigned int slen = 0;
2922 
2923 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2924 		/*
2925 		 * Limits the amount of packets in one WQE
2926 		 * to improve CQE latency generation.
2927 		 */
2928 		nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
2929 				       MLX5_MPW_INLINE_MAX_PACKETS :
2930 				       MLX5_EMPW_MAX_PACKETS);
2931 		if (MLX5_TXOFF_CONFIG(TXPP)) {
2932 			enum mlx5_txcmp_code wret;
2933 
2934 			/* Generate WAIT for scheduling if requested. */
2935 			wret = mlx5_tx_schedule_send(txq, loc, nlim, olx);
2936 			if (wret == MLX5_TXCMP_CODE_EXIT)
2937 				return MLX5_TXCMP_CODE_EXIT;
2938 			if (wret == MLX5_TXCMP_CODE_ERROR)
2939 				return MLX5_TXCMP_CODE_ERROR;
2940 		}
2941 		/* Check whether we have minimal amount WQEs */
2942 		if (unlikely(loc->wqe_free <
2943 			    ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
2944 			return MLX5_TXCMP_CODE_EXIT;
2945 		if (likely(pkts_n > 1))
2946 			rte_prefetch0(*pkts);
2947 		wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2948 		/*
2949 		 * Build eMPW title WQEBB:
2950 		 * - Control Segment, eMPW opcode, zero DS
2951 		 * - Ethernet Segment, no inline
2952 		 */
2953 		mlx5_tx_cseg_init(txq, loc, wqem, 0,
2954 				  MLX5_OPCODE_ENHANCED_MPSW, olx);
2955 		mlx5_tx_eseg_none(txq, loc, wqem,
2956 				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
2957 		dseg = &wqem->dseg[0];
2958 		/* Store the packet length for legacy MPW. */
2959 		if (MLX5_TXOFF_CONFIG(MPW))
2960 			wqem->eseg.mss = rte_cpu_to_be_16
2961 					 (rte_pktmbuf_data_len(loc->mbuf));
2962 		room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE,
2963 			       loc->wqe_free) * MLX5_WQE_SIZE -
2964 					MLX5_WQE_CSEG_SIZE -
2965 					MLX5_WQE_ESEG_SIZE;
2966 		/* Limit the room for legacy MPW sessions for performance. */
2967 		if (MLX5_TXOFF_CONFIG(MPW))
2968 			room = RTE_MIN(room,
2969 				       RTE_MAX(txq->inlen_empw +
2970 					       sizeof(dseg->bcount) +
2971 					       (MLX5_TXOFF_CONFIG(VLAN) ?
2972 					       sizeof(struct rte_vlan_hdr) : 0),
2973 					       MLX5_MPW_INLINE_MAX_PACKETS *
2974 					       MLX5_WQE_DSEG_SIZE));
2975 		/* Build WQE till we have space, packets and resources. */
2976 		part = room;
2977 		for (;;) {
2978 			uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
2979 			uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
2980 			unsigned int tlen;
2981 
2982 			MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
2983 			MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0);
2984 			MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end);
2985 			/*
2986 			 * Some Tx offloads may cause an error if packet is not
2987 			 * long enough, check against assumed minimal length.
2988 			 */
2989 			if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
2990 				part -= room;
2991 				if (unlikely(!part))
2992 					return MLX5_TXCMP_CODE_ERROR;
2993 				/*
2994 				 * We have some successfully built
2995 				 * packet Data Segments to send.
2996 				 */
2997 				mlx5_tx_idone_empw(txq, loc, part,
2998 						   slen, wqem, olx);
2999 				return MLX5_TXCMP_CODE_ERROR;
3000 			}
3001 			/* Inline or not inline - that's the Question. */
3002 			if (dlen > txq->inlen_empw ||
3003 			    loc->mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE)
3004 				goto pointer_empw;
3005 			if (MLX5_TXOFF_CONFIG(MPW)) {
3006 				if (dlen > txq->inlen_send)
3007 					goto pointer_empw;
3008 				tlen = dlen;
3009 				if (part == room) {
3010 					/* Open new inline MPW session. */
3011 					tlen += sizeof(dseg->bcount);
3012 					dseg->bcount = RTE_BE32(0);
3013 					dseg = RTE_PTR_ADD
3014 						(dseg, sizeof(dseg->bcount));
3015 				} else {
3016 					/*
3017 					 * No pointer and inline descriptor
3018 					 * intermix for legacy MPW sessions.
3019 					 */
3020 					if (wqem->dseg[0].bcount)
3021 						break;
3022 				}
3023 			} else {
3024 				tlen = sizeof(dseg->bcount) + dlen;
3025 			}
3026 			/* Inline entire packet, optional VLAN insertion. */
3027 			if (MLX5_TXOFF_CONFIG(VLAN) &&
3028 			    loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
3029 				/*
3030 				 * The packet length must be checked in
3031 				 * mlx5_tx_able_to_empw() and packet
3032 				 * fits into inline length guaranteed.
3033 				 */
3034 				MLX5_ASSERT((dlen +
3035 					     sizeof(struct rte_vlan_hdr)) <=
3036 					    txq->inlen_empw);
3037 				tlen += sizeof(struct rte_vlan_hdr);
3038 				if (room < tlen)
3039 					break;
3040 				rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3041 				dseg = mlx5_tx_dseg_vlan(txq, loc, dseg,
3042 							 dptr, dlen, olx);
3043 #ifdef MLX5_PMD_SOFT_COUNTERS
3044 				/* Update sent data bytes counter. */
3045 				slen +=	sizeof(struct rte_vlan_hdr);
3046 #endif
3047 			} else {
3048 				if (room < tlen)
3049 					break;
3050 				rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3051 				dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
3052 							 dptr, dlen, olx);
3053 			}
3054 			if (!MLX5_TXOFF_CONFIG(MPW))
3055 				tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
3056 			MLX5_ASSERT(room >= tlen);
3057 			room -= tlen;
3058 			/*
3059 			 * Packet data are completely inline,
3060 			 * we can try to free the packet.
3061 			 */
3062 			if (likely(loc->pkts_sent == loc->mbuf_free)) {
3063 				/*
3064 				 * All the packets from the burst beginning
3065 				 * are inline, we can free mbufs directly
3066 				 * from the origin array on tx_burst exit().
3067 				 */
3068 				loc->mbuf_free++;
3069 				goto next_mbuf;
3070 			}
3071 			/*
3072 			 * In order no to call rte_pktmbuf_free_seg() here,
3073 			 * in the most inner loop (that might be very
3074 			 * expensive) we just save the mbuf in elts.
3075 			 */
3076 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
3077 			loc->elts_free--;
3078 			goto next_mbuf;
3079 pointer_empw:
3080 			/*
3081 			 * No pointer and inline descriptor
3082 			 * intermix for legacy MPW sessions.
3083 			 */
3084 			if (MLX5_TXOFF_CONFIG(MPW) &&
3085 			    part != room &&
3086 			    wqem->dseg[0].bcount == RTE_BE32(0))
3087 				break;
3088 			/*
3089 			 * Not inlinable VLAN packets are
3090 			 * proceeded outside of this routine.
3091 			 */
3092 			MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
3093 			if (MLX5_TXOFF_CONFIG(VLAN))
3094 				MLX5_ASSERT(!(loc->mbuf->ol_flags &
3095 					    RTE_MBUF_F_TX_VLAN));
3096 			rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3097 			mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
3098 			/* We have to store mbuf in elts.*/
3099 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
3100 			loc->elts_free--;
3101 			room -= MLX5_WQE_DSEG_SIZE;
3102 			/* Ring buffer wraparound is checked at the loop end.*/
3103 			++dseg;
3104 next_mbuf:
3105 #ifdef MLX5_PMD_SOFT_COUNTERS
3106 			/* Update sent data bytes counter. */
3107 			slen += dlen;
3108 #endif
3109 			loc->pkts_sent++;
3110 			pkts_n--;
3111 			if (unlikely(!pkts_n || !loc->elts_free)) {
3112 				/*
3113 				 * We have no resources/packets to
3114 				 * continue build descriptors.
3115 				 */
3116 				part -= room;
3117 				mlx5_tx_idone_empw(txq, loc, part,
3118 						   slen, wqem, olx);
3119 				return MLX5_TXCMP_CODE_EXIT;
3120 			}
3121 			loc->mbuf = *pkts++;
3122 			if (likely(pkts_n > 1))
3123 				rte_prefetch0(*pkts);
3124 			ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
3125 			/*
3126 			 * Unroll the completion code to avoid
3127 			 * returning variable value - it results in
3128 			 * unoptimized sequent checking in caller.
3129 			 */
3130 			if (ret == MLX5_TXCMP_CODE_MULTI) {
3131 				part -= room;
3132 				mlx5_tx_idone_empw(txq, loc, part,
3133 						   slen, wqem, olx);
3134 				if (unlikely(!loc->elts_free ||
3135 					     !loc->wqe_free))
3136 					return MLX5_TXCMP_CODE_EXIT;
3137 				return MLX5_TXCMP_CODE_MULTI;
3138 			}
3139 			MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
3140 			if (ret == MLX5_TXCMP_CODE_TSO) {
3141 				part -= room;
3142 				mlx5_tx_idone_empw(txq, loc, part,
3143 						   slen, wqem, olx);
3144 				if (unlikely(!loc->elts_free ||
3145 					     !loc->wqe_free))
3146 					return MLX5_TXCMP_CODE_EXIT;
3147 				return MLX5_TXCMP_CODE_TSO;
3148 			}
3149 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
3150 				part -= room;
3151 				mlx5_tx_idone_empw(txq, loc, part,
3152 						   slen, wqem, olx);
3153 				if (unlikely(!loc->elts_free ||
3154 					     !loc->wqe_free))
3155 					return MLX5_TXCMP_CODE_EXIT;
3156 				return MLX5_TXCMP_CODE_SINGLE;
3157 			}
3158 			if (ret != MLX5_TXCMP_CODE_EMPW) {
3159 				MLX5_ASSERT(false);
3160 				part -= room;
3161 				mlx5_tx_idone_empw(txq, loc, part,
3162 						   slen, wqem, olx);
3163 				return MLX5_TXCMP_CODE_ERROR;
3164 			}
3165 			/* Check if we have minimal room left. */
3166 			nlim--;
3167 			if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE))
3168 				break;
3169 			/*
3170 			 * Check whether packet parameters coincide
3171 			 * within assumed eMPW batch:
3172 			 * - check sum settings
3173 			 * - metadata value
3174 			 * - software parser settings
3175 			 * - packets length (legacy MPW only)
3176 			 * - scheduling is not required
3177 			 */
3178 			if (!mlx5_tx_match_empw(txq, &wqem->eseg,
3179 						loc, dlen, olx))
3180 				break;
3181 			/* Packet attributes match, continue the same eMPW. */
3182 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
3183 				dseg = (struct mlx5_wqe_dseg *)txq->wqes;
3184 		}
3185 		/*
3186 		 * We get here to close an existing eMPW
3187 		 * session and start the new one.
3188 		 */
3189 		MLX5_ASSERT(pkts_n);
3190 		part -= room;
3191 		if (unlikely(!part))
3192 			return MLX5_TXCMP_CODE_EXIT;
3193 		mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx);
3194 		if (unlikely(!loc->elts_free ||
3195 			     !loc->wqe_free))
3196 			return MLX5_TXCMP_CODE_EXIT;
3197 		/* Continue the loop with new eMPW session. */
3198 	}
3199 	MLX5_ASSERT(false);
3200 }
3201 
3202 /**
3203  * The routine sends packets with ordinary MLX5_OPCODE_SEND.
3204  * Data inlining and VLAN insertion are supported.
3205  */
3206 static __rte_always_inline enum mlx5_txcmp_code
3207 mlx5_tx_burst_single_send(struct mlx5_txq_data *__rte_restrict txq,
3208 			  struct rte_mbuf **__rte_restrict pkts,
3209 			  unsigned int pkts_n,
3210 			  struct mlx5_txq_local *__rte_restrict loc,
3211 			  unsigned int olx)
3212 {
3213 	/*
3214 	 * Subroutine is the part of mlx5_tx_burst_single()
3215 	 * and sends single-segment packet with SEND opcode.
3216 	 */
3217 	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3218 	MLX5_ASSERT(pkts_n > loc->pkts_sent);
3219 	pkts += loc->pkts_sent + 1;
3220 	pkts_n -= loc->pkts_sent;
3221 	for (;;) {
3222 		struct mlx5_wqe *__rte_restrict wqe;
3223 		enum mlx5_txcmp_code ret;
3224 
3225 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
3226 		MLX5_ASSERT(loc->elts_free);
3227 		if (MLX5_TXOFF_CONFIG(TXPP)) {
3228 			enum mlx5_txcmp_code wret;
3229 
3230 			/* Generate WAIT for scheduling if requested. */
3231 			wret = mlx5_tx_schedule_send(txq, loc, 0, olx);
3232 			if (wret == MLX5_TXCMP_CODE_EXIT)
3233 				return MLX5_TXCMP_CODE_EXIT;
3234 			if (wret == MLX5_TXCMP_CODE_ERROR)
3235 				return MLX5_TXCMP_CODE_ERROR;
3236 		}
3237 		if (MLX5_TXOFF_CONFIG(INLINE)) {
3238 			unsigned int inlen, vlan = 0;
3239 
3240 			inlen = rte_pktmbuf_data_len(loc->mbuf);
3241 			if (MLX5_TXOFF_CONFIG(VLAN) &&
3242 			    loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
3243 				vlan = sizeof(struct rte_vlan_hdr);
3244 				inlen += vlan;
3245 			}
3246 			/*
3247 			 * If inlining is enabled at configuration time
3248 			 * the limit must be not less than minimal size.
3249 			 * Otherwise we would do extra check for data
3250 			 * size to avoid crashes due to length overflow.
3251 			 */
3252 			MLX5_ASSERT(txq->inlen_send >=
3253 				    MLX5_ESEG_MIN_INLINE_SIZE);
3254 			if (inlen <= txq->inlen_send) {
3255 				unsigned int seg_n, wqe_n;
3256 
3257 				rte_prefetch0(rte_pktmbuf_mtod
3258 						(loc->mbuf, uint8_t *));
3259 				/* Check against minimal length. */
3260 				if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
3261 					return MLX5_TXCMP_CODE_ERROR;
3262 				if (loc->mbuf->ol_flags &
3263 				    RTE_MBUF_F_TX_DYNF_NOINLINE) {
3264 					/*
3265 					 * The hint flag not to inline packet
3266 					 * data is set. Check whether we can
3267 					 * follow the hint.
3268 					 */
3269 					if ((!MLX5_TXOFF_CONFIG(EMPW) &&
3270 					      txq->inlen_mode) ||
3271 					    (MLX5_TXOFF_CONFIG(MPW) &&
3272 					     txq->inlen_mode)) {
3273 						if (inlen <= txq->inlen_send)
3274 							goto single_inline;
3275 						/*
3276 						 * The hardware requires the
3277 						 * minimal inline data header.
3278 						 */
3279 						goto single_min_inline;
3280 					}
3281 					if (MLX5_TXOFF_CONFIG(VLAN) &&
3282 					    vlan && !txq->vlan_en) {
3283 						/*
3284 						 * We must insert VLAN tag
3285 						 * by software means.
3286 						 */
3287 						goto single_part_inline;
3288 					}
3289 					goto single_no_inline;
3290 				}
3291 single_inline:
3292 				/*
3293 				 * Completely inlined packet data WQE:
3294 				 * - Control Segment, SEND opcode
3295 				 * - Ethernet Segment, no VLAN insertion
3296 				 * - Data inlined, VLAN optionally inserted
3297 				 * - Alignment to MLX5_WSEG_SIZE
3298 				 * Have to estimate amount of WQEBBs
3299 				 */
3300 				seg_n = (inlen + 3 * MLX5_WSEG_SIZE -
3301 					 MLX5_ESEG_MIN_INLINE_SIZE +
3302 					 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3303 				/* Check if there are enough WQEBBs. */
3304 				wqe_n = (seg_n + 3) / 4;
3305 				if (wqe_n > loc->wqe_free)
3306 					return MLX5_TXCMP_CODE_EXIT;
3307 				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3308 				loc->wqe_last = wqe;
3309 				mlx5_tx_cseg_init(txq, loc, wqe, seg_n,
3310 						  MLX5_OPCODE_SEND, olx);
3311 				rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3312 				mlx5_tx_eseg_data(txq, loc, wqe,
3313 						  vlan, inlen, 0, olx);
3314 				txq->wqe_ci += wqe_n;
3315 				loc->wqe_free -= wqe_n;
3316 				/*
3317 				 * Packet data are completely inlined,
3318 				 * free the packet immediately.
3319 				 */
3320 				rte_pktmbuf_free_seg(loc->mbuf);
3321 			} else if ((!MLX5_TXOFF_CONFIG(EMPW) ||
3322 				     MLX5_TXOFF_CONFIG(MPW)) &&
3323 					txq->inlen_mode) {
3324 				/*
3325 				 * If minimal inlining is requested the eMPW
3326 				 * feature should be disabled due to data is
3327 				 * inlined into Ethernet Segment, which can
3328 				 * not contain inlined data for eMPW due to
3329 				 * segment shared for all packets.
3330 				 */
3331 				struct mlx5_wqe_dseg *__rte_restrict dseg;
3332 				unsigned int ds;
3333 				uint8_t *dptr;
3334 
3335 				/*
3336 				 * The inline-mode settings require
3337 				 * to inline the specified amount of
3338 				 * data bytes to the Ethernet Segment.
3339 				 * We should check the free space in
3340 				 * WQE ring buffer to inline partially.
3341 				 */
3342 single_min_inline:
3343 				MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode);
3344 				MLX5_ASSERT(inlen > txq->inlen_mode);
3345 				MLX5_ASSERT(txq->inlen_mode >=
3346 					    MLX5_ESEG_MIN_INLINE_SIZE);
3347 				/*
3348 				 * Check whether there are enough free WQEBBs:
3349 				 * - Control Segment
3350 				 * - Ethernet Segment
3351 				 * - First Segment of inlined Ethernet data
3352 				 * - ... data continued ...
3353 				 * - Finishing Data Segment of pointer type
3354 				 */
3355 				ds = (MLX5_WQE_CSEG_SIZE +
3356 				      MLX5_WQE_ESEG_SIZE +
3357 				      MLX5_WQE_DSEG_SIZE +
3358 				      txq->inlen_mode -
3359 				      MLX5_ESEG_MIN_INLINE_SIZE +
3360 				      MLX5_WQE_DSEG_SIZE +
3361 				      MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3362 				if (loc->wqe_free < ((ds + 3) / 4))
3363 					return MLX5_TXCMP_CODE_EXIT;
3364 				/*
3365 				 * Build the ordinary SEND WQE:
3366 				 * - Control Segment
3367 				 * - Ethernet Segment, inline inlen_mode bytes
3368 				 * - Data Segment of pointer type
3369 				 */
3370 				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3371 				loc->wqe_last = wqe;
3372 				mlx5_tx_cseg_init(txq, loc, wqe, ds,
3373 						  MLX5_OPCODE_SEND, olx);
3374 				rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3375 				dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan,
3376 							 txq->inlen_mode,
3377 							 0, olx);
3378 				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
3379 				       txq->inlen_mode - vlan;
3380 				inlen -= txq->inlen_mode;
3381 				mlx5_tx_dseg_ptr(txq, loc, dseg,
3382 						 dptr, inlen, olx);
3383 				/*
3384 				 * WQE is built, update the loop parameters
3385 				 * and got to the next packet.
3386 				 */
3387 				txq->wqe_ci += (ds + 3) / 4;
3388 				loc->wqe_free -= (ds + 3) / 4;
3389 				/* We have to store mbuf in elts.*/
3390 				MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
3391 				txq->elts[txq->elts_head++ & txq->elts_m] =
3392 						loc->mbuf;
3393 				--loc->elts_free;
3394 			} else {
3395 				uint8_t *dptr;
3396 				unsigned int dlen;
3397 
3398 				/*
3399 				 * Partially inlined packet data WQE, we have
3400 				 * some space in title WQEBB, we can fill it
3401 				 * with some packet data. It takes one WQEBB,
3402 				 * it is available, no extra space check:
3403 				 * - Control Segment, SEND opcode
3404 				 * - Ethernet Segment, no VLAN insertion
3405 				 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data
3406 				 * - Data Segment, pointer type
3407 				 *
3408 				 * We also get here if VLAN insertion is not
3409 				 * supported by HW, the inline is enabled.
3410 				 */
3411 single_part_inline:
3412 				wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3413 				loc->wqe_last = wqe;
3414 				mlx5_tx_cseg_init(txq, loc, wqe, 4,
3415 						  MLX5_OPCODE_SEND, olx);
3416 				rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3417 				mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx);
3418 				dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
3419 				       MLX5_ESEG_MIN_INLINE_SIZE - vlan;
3420 				/*
3421 				 * The length check is performed above, by
3422 				 * comparing with txq->inlen_send. We should
3423 				 * not get overflow here.
3424 				 */
3425 				MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
3426 				dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE;
3427 				mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1],
3428 						 dptr, dlen, olx);
3429 				++txq->wqe_ci;
3430 				--loc->wqe_free;
3431 				/* We have to store mbuf in elts.*/
3432 				MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
3433 				txq->elts[txq->elts_head++ & txq->elts_m] =
3434 						loc->mbuf;
3435 				--loc->elts_free;
3436 			}
3437 #ifdef MLX5_PMD_SOFT_COUNTERS
3438 			/* Update sent data bytes counter. */
3439 			txq->stats.obytes += vlan +
3440 					rte_pktmbuf_data_len(loc->mbuf);
3441 #endif
3442 		} else {
3443 			/*
3444 			 * No inline at all, it means the CPU cycles saving
3445 			 * is prioritized at configuration, we should not
3446 			 * copy any packet data to WQE.
3447 			 *
3448 			 * SEND WQE, one WQEBB:
3449 			 * - Control Segment, SEND opcode
3450 			 * - Ethernet Segment, optional VLAN, no inline
3451 			 * - Data Segment, pointer type
3452 			 */
3453 single_no_inline:
3454 			wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3455 			loc->wqe_last = wqe;
3456 			mlx5_tx_cseg_init(txq, loc, wqe, 3,
3457 					  MLX5_OPCODE_SEND, olx);
3458 			rte_pmd_mlx5_trace_tx_push(loc->mbuf, txq->wqe_ci);
3459 			mlx5_tx_eseg_none(txq, loc, wqe, olx);
3460 			mlx5_tx_dseg_ptr
3461 				(txq, loc, &wqe->dseg[0],
3462 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
3463 				 rte_pktmbuf_data_len(loc->mbuf), olx);
3464 			++txq->wqe_ci;
3465 			--loc->wqe_free;
3466 			/*
3467 			 * We should not store mbuf pointer in elts
3468 			 * if no inlining is configured, this is done
3469 			 * by calling routine in a batch copy.
3470 			 */
3471 			if (MLX5_TXOFF_CONFIG(INLINE))
3472 				txq->elts[txq->elts_head++ & txq->elts_m] =
3473 							loc->mbuf;
3474 			--loc->elts_free;
3475 #ifdef MLX5_PMD_SOFT_COUNTERS
3476 			/* Update sent data bytes counter. */
3477 			txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf);
3478 			if (MLX5_TXOFF_CONFIG(VLAN) &&
3479 			    loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
3480 				txq->stats.obytes +=
3481 					sizeof(struct rte_vlan_hdr);
3482 #endif
3483 		}
3484 		++loc->pkts_sent;
3485 		--pkts_n;
3486 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
3487 			return MLX5_TXCMP_CODE_EXIT;
3488 		loc->mbuf = *pkts++;
3489 		if (pkts_n > 1)
3490 			rte_prefetch0(*pkts);
3491 		ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
3492 		if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
3493 			return ret;
3494 	}
3495 	MLX5_ASSERT(false);
3496 }
3497 
3498 static __rte_always_inline enum mlx5_txcmp_code
3499 mlx5_tx_burst_single(struct mlx5_txq_data *__rte_restrict txq,
3500 		     struct rte_mbuf **__rte_restrict pkts,
3501 		     unsigned int pkts_n,
3502 		     struct mlx5_txq_local *__rte_restrict loc,
3503 		     unsigned int olx)
3504 {
3505 	enum mlx5_txcmp_code ret;
3506 
3507 	ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
3508 	if (ret == MLX5_TXCMP_CODE_SINGLE)
3509 		goto ordinary_send;
3510 	MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW);
3511 	for (;;) {
3512 		/* Optimize for inline/no inline eMPW send. */
3513 		ret = (MLX5_TXOFF_CONFIG(INLINE)) ?
3514 			mlx5_tx_burst_empw_inline
3515 				(txq, pkts, pkts_n, loc, olx) :
3516 			mlx5_tx_burst_empw_simple
3517 				(txq, pkts, pkts_n, loc, olx);
3518 		if (ret != MLX5_TXCMP_CODE_SINGLE)
3519 			return ret;
3520 		/* The resources to send one packet should remain. */
3521 		MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3522 ordinary_send:
3523 		ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
3524 		MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE);
3525 		if (ret != MLX5_TXCMP_CODE_EMPW)
3526 			return ret;
3527 		/* The resources to send one packet should remain. */
3528 		MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3529 	}
3530 }
3531 
3532 /**
3533  * DPDK Tx callback template. This is configured template used to generate
3534  * routines optimized for specified offload setup.
3535  * One of this generated functions is chosen at SQ configuration time.
3536  *
3537  * @param txq
3538  *   Generic pointer to TX queue structure.
3539  * @param[in] pkts
3540  *   Packets to transmit.
3541  * @param pkts_n
3542  *   Number of packets in array.
3543  * @param olx
3544  *   Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
3545  *   values. Should be static to take compile time static configuration
3546  *   advantages.
3547  *
3548  * @return
3549  *   Number of packets successfully transmitted (<= pkts_n).
3550  */
3551 static __rte_always_inline uint16_t
3552 mlx5_tx_burst_tmpl(struct mlx5_txq_data *__rte_restrict txq,
3553 		   struct rte_mbuf **__rte_restrict pkts,
3554 		   uint16_t pkts_n,
3555 		   unsigned int olx)
3556 {
3557 	struct mlx5_txq_local loc;
3558 	enum mlx5_txcmp_code ret;
3559 	unsigned int part;
3560 
3561 	MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
3562 	MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
3563 	if (unlikely(!pkts_n))
3564 		return 0;
3565 	if (MLX5_TXOFF_CONFIG(INLINE))
3566 		loc.mbuf_free = 0;
3567 	loc.pkts_sent = 0;
3568 	loc.pkts_copy = 0;
3569 	loc.wqe_last = NULL;
3570 
3571 send_loop:
3572 	loc.pkts_loop = loc.pkts_sent;
3573 	/*
3574 	 * Check if there are some CQEs, if any:
3575 	 * - process an encountered errors
3576 	 * - process the completed WQEs
3577 	 * - free related mbufs
3578 	 * - doorbell the NIC about processed CQEs
3579 	 */
3580 	rte_prefetch0(*(pkts + loc.pkts_sent));
3581 	mlx5_tx_handle_completion(txq, olx);
3582 	/*
3583 	 * Calculate the number of available resources - elts and WQEs.
3584 	 * There are two possible different scenarios:
3585 	 * - no data inlining into WQEs, one WQEBB may contains up to
3586 	 *   four packets, in this case elts become scarce resource
3587 	 * - data inlining into WQEs, one packet may require multiple
3588 	 *   WQEBBs, the WQEs become the limiting factor.
3589 	 */
3590 	MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
3591 	loc.elts_free = txq->elts_s -
3592 				(uint16_t)(txq->elts_head - txq->elts_tail);
3593 	MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
3594 	loc.wqe_free = txq->wqe_s -
3595 				(uint16_t)(txq->wqe_ci - txq->wqe_pi);
3596 	if (unlikely(!loc.elts_free || !loc.wqe_free))
3597 		goto burst_exit;
3598 	for (;;) {
3599 		/*
3600 		 * Fetch the packet from array. Usually this is the first
3601 		 * packet in series of multi/single segment packets.
3602 		 */
3603 		loc.mbuf = *(pkts + loc.pkts_sent);
3604 		/* Dedicated branch for multi-segment packets. */
3605 		if (MLX5_TXOFF_CONFIG(MULTI) &&
3606 		    unlikely(NB_SEGS(loc.mbuf) > 1)) {
3607 			/*
3608 			 * Multi-segment packet encountered.
3609 			 * Hardware is able to process it only
3610 			 * with SEND/TSO opcodes, one packet
3611 			 * per WQE, do it in dedicated routine.
3612 			 */
3613 enter_send_multi:
3614 			MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy);
3615 			part = loc.pkts_sent - loc.pkts_copy;
3616 			if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
3617 				/*
3618 				 * There are some single-segment mbufs not
3619 				 * stored in elts. The mbufs must be in the
3620 				 * same order as WQEs, so we must copy the
3621 				 * mbufs to elts here, before the coming
3622 				 * multi-segment packet mbufs is appended.
3623 				 */
3624 				mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy,
3625 						  part, olx);
3626 				loc.pkts_copy = loc.pkts_sent;
3627 			}
3628 			MLX5_ASSERT(pkts_n > loc.pkts_sent);
3629 			ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
3630 			if (!MLX5_TXOFF_CONFIG(INLINE))
3631 				loc.pkts_copy = loc.pkts_sent;
3632 			/*
3633 			 * These returned code checks are supposed
3634 			 * to be optimized out due to routine inlining.
3635 			 */
3636 			if (ret == MLX5_TXCMP_CODE_EXIT) {
3637 				/*
3638 				 * The routine returns this code when
3639 				 * all packets are sent or there is no
3640 				 * enough resources to complete request.
3641 				 */
3642 				break;
3643 			}
3644 			if (ret == MLX5_TXCMP_CODE_ERROR) {
3645 				/*
3646 				 * The routine returns this code when some error
3647 				 * in the incoming packets format occurred.
3648 				 */
3649 				txq->stats.oerrors++;
3650 				break;
3651 			}
3652 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
3653 				/*
3654 				 * The single-segment packet was encountered
3655 				 * in the array, try to send it with the
3656 				 * best optimized way, possible engaging eMPW.
3657 				 */
3658 				goto enter_send_single;
3659 			}
3660 			if (MLX5_TXOFF_CONFIG(TSO) &&
3661 			    ret == MLX5_TXCMP_CODE_TSO) {
3662 				/*
3663 				 * The single-segment TSO packet was
3664 				 * encountered in the array.
3665 				 */
3666 				goto enter_send_tso;
3667 			}
3668 			/* We must not get here. Something is going wrong. */
3669 			MLX5_ASSERT(false);
3670 			txq->stats.oerrors++;
3671 			break;
3672 		}
3673 		/* Dedicated branch for single-segment TSO packets. */
3674 		if (MLX5_TXOFF_CONFIG(TSO) &&
3675 		    unlikely(loc.mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
3676 			/*
3677 			 * TSO might require special way for inlining
3678 			 * (dedicated parameters) and is sent with
3679 			 * MLX5_OPCODE_TSO opcode only, provide this
3680 			 * in dedicated branch.
3681 			 */
3682 enter_send_tso:
3683 			MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1);
3684 			MLX5_ASSERT(pkts_n > loc.pkts_sent);
3685 			ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
3686 			/*
3687 			 * These returned code checks are supposed
3688 			 * to be optimized out due to routine inlining.
3689 			 */
3690 			if (ret == MLX5_TXCMP_CODE_EXIT)
3691 				break;
3692 			if (ret == MLX5_TXCMP_CODE_ERROR) {
3693 				txq->stats.oerrors++;
3694 				break;
3695 			}
3696 			if (ret == MLX5_TXCMP_CODE_SINGLE)
3697 				goto enter_send_single;
3698 			if (MLX5_TXOFF_CONFIG(MULTI) &&
3699 			    ret == MLX5_TXCMP_CODE_MULTI) {
3700 				/*
3701 				 * The multi-segment packet was
3702 				 * encountered in the array.
3703 				 */
3704 				goto enter_send_multi;
3705 			}
3706 			/* We must not get here. Something is going wrong. */
3707 			MLX5_ASSERT(false);
3708 			txq->stats.oerrors++;
3709 			break;
3710 		}
3711 		/*
3712 		 * The dedicated branch for the single-segment packets
3713 		 * without TSO. Often these ones can be sent using
3714 		 * MLX5_OPCODE_EMPW with multiple packets in one WQE.
3715 		 * The routine builds the WQEs till it encounters
3716 		 * the TSO or multi-segment packet (in case if these
3717 		 * offloads are requested at SQ configuration time).
3718 		 */
3719 enter_send_single:
3720 		MLX5_ASSERT(pkts_n > loc.pkts_sent);
3721 		ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
3722 		/*
3723 		 * These returned code checks are supposed
3724 		 * to be optimized out due to routine inlining.
3725 		 */
3726 		if (ret == MLX5_TXCMP_CODE_EXIT)
3727 			break;
3728 		if (ret == MLX5_TXCMP_CODE_ERROR) {
3729 			txq->stats.oerrors++;
3730 			break;
3731 		}
3732 		if (MLX5_TXOFF_CONFIG(MULTI) &&
3733 		    ret == MLX5_TXCMP_CODE_MULTI) {
3734 			/*
3735 			 * The multi-segment packet was
3736 			 * encountered in the array.
3737 			 */
3738 			goto enter_send_multi;
3739 		}
3740 		if (MLX5_TXOFF_CONFIG(TSO) &&
3741 		    ret == MLX5_TXCMP_CODE_TSO) {
3742 			/*
3743 			 * The single-segment TSO packet was
3744 			 * encountered in the array.
3745 			 */
3746 			goto enter_send_tso;
3747 		}
3748 		/* We must not get here. Something is going wrong. */
3749 		MLX5_ASSERT(false);
3750 		txq->stats.oerrors++;
3751 		break;
3752 	}
3753 	/*
3754 	 * Main Tx loop is completed, do the rest:
3755 	 * - set completion request if thresholds are reached
3756 	 * - doorbell the hardware
3757 	 * - copy the rest of mbufs to elts (if any)
3758 	 */
3759 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) ||
3760 		    loc.pkts_sent >= loc.pkts_copy);
3761 	/* Take a shortcut if nothing is sent. */
3762 	if (unlikely(loc.pkts_sent == loc.pkts_loop))
3763 		goto burst_exit;
3764 	/* Request CQE generation if limits are reached. */
3765 	if (MLX5_TXOFF_CONFIG(TXPP) && __rte_trace_point_fp_is_enabled())
3766 		mlx5_tx_request_completion_trace(txq, &loc, olx);
3767 	else
3768 		mlx5_tx_request_completion(txq, &loc, olx);
3769 	/*
3770 	 * Ring QP doorbell immediately after WQE building completion
3771 	 * to improve latencies. The pure software related data treatment
3772 	 * can be completed after doorbell. Tx CQEs for this SQ are
3773 	 * processed in this thread only by the polling.
3774 	 *
3775 	 * The rdma core library can map doorbell register in two ways,
3776 	 * depending on the environment variable "MLX5_SHUT_UP_BF":
3777 	 *
3778 	 * - as regular cached memory, the variable is either missing or
3779 	 *   set to zero. This type of mapping may cause the significant
3780 	 *   doorbell register writing latency and requires explicit memory
3781 	 *   write barrier to mitigate this issue and prevent write combining.
3782 	 *
3783 	 * - as non-cached memory, the variable is present and set to not "0"
3784 	 *   value. This type of mapping may cause performance impact under
3785 	 *   heavy loading conditions but the explicit write memory barrier is
3786 	 *   not required and it may improve core performance.
3787 	 *
3788 	 * - the legacy behaviour (prior 19.08 release) was to use some
3789 	 *   heuristics to decide whether write memory barrier should
3790 	 *   be performed. This behavior is supported with specifying
3791 	 *   tx_db_nc=2, write barrier is skipped if application provides
3792 	 *   the full recommended burst of packets, it supposes the next
3793 	 *   packets are coming and the write barrier will be issued on
3794 	 *   the next burst (after descriptor writing, at least).
3795 	 */
3796 	mlx5_doorbell_ring(mlx5_tx_bfreg(txq),
3797 			   *(volatile uint64_t *)loc.wqe_last, txq->wqe_ci,
3798 			   txq->qp_db, !txq->db_nc &&
3799 			   (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST));
3800 	/* Not all of the mbufs may be stored into elts yet. */
3801 	part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy;
3802 	if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
3803 		/*
3804 		 * There are some single-segment mbufs not stored in elts.
3805 		 * It can be only if the last packet was single-segment.
3806 		 * The copying is gathered into one place due to it is
3807 		 * a good opportunity to optimize that with SIMD.
3808 		 * Unfortunately if inlining is enabled the gaps in pointer
3809 		 * array may happen due to early freeing of the inlined mbufs.
3810 		 */
3811 		mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
3812 		loc.pkts_copy = loc.pkts_sent;
3813 	}
3814 	MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
3815 	MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
3816 	if (pkts_n > loc.pkts_sent) {
3817 		/*
3818 		 * If burst size is large there might be no enough CQE
3819 		 * fetched from completion queue and no enough resources
3820 		 * freed to send all the packets.
3821 		 */
3822 		goto send_loop;
3823 	}
3824 burst_exit:
3825 #ifdef MLX5_PMD_SOFT_COUNTERS
3826 	/* Increment sent packets counter. */
3827 	txq->stats.opackets += loc.pkts_sent;
3828 #endif
3829 	if (MLX5_TXOFF_CONFIG(INLINE) && loc.mbuf_free)
3830 		__mlx5_tx_free_mbuf(txq, pkts, loc.mbuf_free, olx);
3831 	/* Trace productive bursts only. */
3832 	if (__rte_trace_point_fp_is_enabled() && loc.pkts_sent)
3833 		rte_pmd_mlx5_trace_tx_exit(mlx5_read_pcibar_clock_from_txq(txq),
3834 					   loc.pkts_sent, pkts_n);
3835 	return loc.pkts_sent;
3836 }
3837 
3838 /**
3839  * Check whether given TxQ is external.
3840  *
3841  * @param dev
3842  *   Pointer to Ethernet device.
3843  * @param queue_idx
3844  *   Tx queue index.
3845  *
3846  * @return
3847  *   True if is external TxQ, otherwise false.
3848  */
3849 static __rte_always_inline bool
3850 mlx5_is_external_txq(struct rte_eth_dev *dev, uint16_t queue_idx)
3851 {
3852 	struct mlx5_priv *priv = dev->data->dev_private;
3853 	struct mlx5_external_q *txq;
3854 
3855 	if (!priv->ext_txqs || queue_idx < MLX5_EXTERNAL_TX_QUEUE_ID_MIN)
3856 		return false;
3857 	txq = &priv->ext_txqs[queue_idx - MLX5_EXTERNAL_TX_QUEUE_ID_MIN];
3858 	return !!rte_atomic_load_explicit(&txq->refcnt, rte_memory_order_relaxed);
3859 }
3860 
3861 #endif /* RTE_PMD_MLX5_TX_H_ */
3862