xref: /dpdk/drivers/net/mlx5/mlx5_rxtx.c (revision fd5baf09cdf9170e0f92a112fd0ef19c29649330)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <assert.h>
7 #include <stdint.h>
8 #include <string.h>
9 #include <stdlib.h>
10 
11 /* Verbs header. */
12 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
13 #ifdef PEDANTIC
14 #pragma GCC diagnostic ignored "-Wpedantic"
15 #endif
16 #include <infiniband/verbs.h>
17 #include <infiniband/mlx5dv.h>
18 #ifdef PEDANTIC
19 #pragma GCC diagnostic error "-Wpedantic"
20 #endif
21 
22 #include <rte_mbuf.h>
23 #include <rte_mempool.h>
24 #include <rte_prefetch.h>
25 #include <rte_common.h>
26 #include <rte_branch_prediction.h>
27 #include <rte_ether.h>
28 
29 #include "mlx5.h"
30 #include "mlx5_utils.h"
31 #include "mlx5_rxtx.h"
32 #include "mlx5_autoconf.h"
33 #include "mlx5_defs.h"
34 #include "mlx5_prm.h"
35 
36 static __rte_always_inline uint32_t
37 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe);
38 
39 static __rte_always_inline int
40 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
41 		 uint16_t cqe_cnt, uint32_t *rss_hash);
42 
43 static __rte_always_inline uint32_t
44 rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
45 
46 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
47 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
48 };
49 
50 /**
51  * Build a table to translate Rx completion flags to packet type.
52  *
53  * @note: fix mlx5_dev_supported_ptypes_get() if any change is made here.
54  */
55 void
56 mlx5_set_ptype_table(void)
57 {
58 	unsigned int i;
59 	uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;
60 
61 	/* Last entry must not be overwritten, reserved for errored packet. */
62 	for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
63 		(*p)[i] = RTE_PTYPE_UNKNOWN;
64 	/*
65 	 * The index to the array should have:
66 	 * bit[1:0] = l3_hdr_type
67 	 * bit[4:2] = l4_hdr_type
68 	 * bit[5] = ip_frag
69 	 * bit[6] = tunneled
70 	 * bit[7] = outer_l3_type
71 	 */
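	/*
	 * Worked example of the layout above: index 0x46 = 0b01000110 has
	 * tunneled = 1, l4_hdr_type = 1 (TCP) and l3_hdr_type = 2 (IPv4),
	 * which is why entry (*p)[0x46] below resolves to an outer IPv4
	 * tunnel carrying an inner IPv4/TCP packet.
	 */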
72 	/* L2 */
73 	(*p)[0x00] = RTE_PTYPE_L2_ETHER;
74 	/* L3 */
75 	(*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
76 		     RTE_PTYPE_L4_NONFRAG;
77 	(*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
78 		     RTE_PTYPE_L4_NONFRAG;
79 	/* Fragmented */
80 	(*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
81 		     RTE_PTYPE_L4_FRAG;
82 	(*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
83 		     RTE_PTYPE_L4_FRAG;
84 	/* TCP */
85 	(*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
86 		     RTE_PTYPE_L4_TCP;
87 	(*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
88 		     RTE_PTYPE_L4_TCP;
89 	(*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
90 		     RTE_PTYPE_L4_TCP;
91 	(*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
92 		     RTE_PTYPE_L4_TCP;
93 	(*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
94 		     RTE_PTYPE_L4_TCP;
95 	(*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
96 		     RTE_PTYPE_L4_TCP;
97 	/* UDP */
98 	(*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
99 		     RTE_PTYPE_L4_UDP;
100 	(*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
101 		     RTE_PTYPE_L4_UDP;
102 	/* Repeat with outer_l3_type being set. Just in case. */
103 	(*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
104 		     RTE_PTYPE_L4_NONFRAG;
105 	(*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
106 		     RTE_PTYPE_L4_NONFRAG;
107 	(*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
108 		     RTE_PTYPE_L4_FRAG;
109 	(*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
110 		     RTE_PTYPE_L4_FRAG;
111 	(*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
112 		     RTE_PTYPE_L4_TCP;
113 	(*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
114 		     RTE_PTYPE_L4_TCP;
115 	(*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
116 		     RTE_PTYPE_L4_TCP;
117 	(*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
118 		     RTE_PTYPE_L4_TCP;
119 	(*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
120 		     RTE_PTYPE_L4_TCP;
121 	(*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
122 		     RTE_PTYPE_L4_TCP;
123 	(*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
124 		     RTE_PTYPE_L4_UDP;
125 	(*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
126 		     RTE_PTYPE_L4_UDP;
127 	/* Tunneled - L3 */
128 	(*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
129 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
130 		     RTE_PTYPE_INNER_L4_NONFRAG;
131 	(*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
132 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
133 		     RTE_PTYPE_INNER_L4_NONFRAG;
134 	(*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
135 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
136 		     RTE_PTYPE_INNER_L4_NONFRAG;
137 	(*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
138 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
139 		     RTE_PTYPE_INNER_L4_NONFRAG;
140 	/* Tunneled - Fragmented */
141 	(*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
142 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
143 		     RTE_PTYPE_INNER_L4_FRAG;
144 	(*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
145 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
146 		     RTE_PTYPE_INNER_L4_FRAG;
147 	(*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
148 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
149 		     RTE_PTYPE_INNER_L4_FRAG;
150 	(*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
151 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
152 		     RTE_PTYPE_INNER_L4_FRAG;
153 	/* Tunneled - TCP */
154 	(*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
155 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
156 		     RTE_PTYPE_INNER_L4_TCP;
157 	(*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
158 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
159 		     RTE_PTYPE_INNER_L4_TCP;
160 	(*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
161 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
162 		     RTE_PTYPE_INNER_L4_TCP;
163 	(*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
164 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
165 		     RTE_PTYPE_INNER_L4_TCP;
166 	(*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
167 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
168 		     RTE_PTYPE_INNER_L4_TCP;
169 	(*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
170 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
171 		     RTE_PTYPE_INNER_L4_TCP;
172 	(*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
173 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
174 		     RTE_PTYPE_INNER_L4_TCP;
175 	(*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
176 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
177 		     RTE_PTYPE_INNER_L4_TCP;
178 	(*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
179 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
180 		     RTE_PTYPE_INNER_L4_TCP;
181 	(*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
182 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
183 		     RTE_PTYPE_INNER_L4_TCP;
184 	(*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
185 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
186 		     RTE_PTYPE_INNER_L4_TCP;
187 	(*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
188 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
189 		     RTE_PTYPE_INNER_L4_TCP;
190 	/* Tunneled - UDP */
191 	(*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
192 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
193 		     RTE_PTYPE_INNER_L4_UDP;
194 	(*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
195 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
196 		     RTE_PTYPE_INNER_L4_UDP;
197 	(*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
198 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
199 		     RTE_PTYPE_INNER_L4_UDP;
200 	(*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
201 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
202 		     RTE_PTYPE_INNER_L4_UDP;
203 }
204 
205 /**
206  * Return the size of the WQ tailroom.
207  *
208  * @param txq
209  *   Pointer to TX queue structure.
210  * @param addr
211  *   Pointer to tail of WQ.
212  *
213  * @return
214  *   Size of tailroom.
215  */
216 static inline size_t
217 tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
218 {
219 	size_t tailroom;
220 	tailroom = (uintptr_t)(txq->wqes) +
221 		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
222 		   (uintptr_t)addr;
223 	return tailroom;
224 }
225 
226 /**
227  * Copy data to the tailroom of a circular queue.
228  *
229  * @param dst
230  *   Pointer to destination.
231  * @param src
232  *   Pointer to source.
233  * @param n
234  *   Number of bytes to copy.
235  * @param base
236  *   Pointer to head of queue.
237  * @param tailroom
238  *   Size of tailroom from dst.
239  *
240  * @return
241  *   Pointer after copied data.
242  */
243 static inline void *
244 mlx5_copy_to_wq(void *dst, const void *src, size_t n,
245 		void *base, size_t tailroom)
246 {
247 	void *ret;
248 
249 	if (n > tailroom) {
250 		rte_memcpy(dst, src, tailroom);
251 		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
252 			   n - tailroom);
253 		ret = (uint8_t *)base + n - tailroom;
254 	} else {
255 		rte_memcpy(dst, src, n);
256 		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
257 	}
258 	return ret;
259 }
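
/*
 * Rough usage sketch (byte counts and variable names are hypothetical):
 * with 20 bytes of tailroom left, copying 32 bytes writes 20 bytes at
 * "dst", wraps to the queue base and writes the remaining 12 bytes there,
 * e.g.:
 *
 *	max = tx_mlx5_wq_tailroom(txq, (void *)(uintptr_t)raw);
 *	raw = mlx5_copy_to_wq((void *)(uintptr_t)raw, data, 32,
 *			      (void *)(uintptr_t)txq->wqes, max);
 *
 * which is how txq_burst_empw() inlines packet data below.
 */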
260 
261 /**
262  * DPDK callback to check the status of a tx descriptor.
263  *
264  * @param tx_queue
265  *   The tx queue.
266  * @param[in] offset
267  *   The index of the descriptor in the ring.
268  *
269  * @return
270  *   The status of the tx descriptor.
271  */
272 int
273 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
274 {
275 	struct mlx5_txq_data *txq = tx_queue;
276 	uint16_t used;
277 
278 	mlx5_tx_complete(txq);
279 	used = txq->elts_head - txq->elts_tail;
280 	if (offset < used)
281 		return RTE_ETH_TX_DESC_FULL;
282 	return RTE_ETH_TX_DESC_DONE;
283 }
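
/*
 * Applications normally reach this callback through the ethdev API, e.g.
 * rte_eth_tx_descriptor_status(port_id, queue_id, offset) with
 * illustrative port/queue/offset values: RTE_ETH_TX_DESC_DONE means the
 * slot at "offset" has already completed and its mbuf was released, while
 * RTE_ETH_TX_DESC_FULL means it is still held by an in-flight packet.
 */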
284 
285 /**
286  * DPDK callback to check the status of a rx descriptor.
287  *
288  * @param rx_queue
289  *   The rx queue.
290  * @param[in] offset
291  *   The index of the descriptor in the ring.
292  *
293  * @return
294  *   The status of the rx descriptor.
295  */
296 int
297 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
298 {
299 	struct mlx5_rxq_data *rxq = rx_queue;
300 	struct rxq_zip *zip = &rxq->zip;
301 	volatile struct mlx5_cqe *cqe;
302 	const unsigned int cqe_n = (1 << rxq->cqe_n);
303 	const unsigned int cqe_cnt = cqe_n - 1;
304 	unsigned int cq_ci;
305 	unsigned int used;
306 
307 	/* If we are processing a compressed CQE. */
308 	if (zip->ai) {
309 		used = zip->cqe_cnt - zip->ca;
310 		cq_ci = zip->cq_ci;
311 	} else {
312 		used = 0;
313 		cq_ci = rxq->cq_ci;
314 	}
315 	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
316 	while (check_cqe(cqe, cqe_n, cq_ci) == 0) {
317 		int8_t op_own;
318 		unsigned int n;
319 
320 		op_own = cqe->op_own;
321 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
322 			n = rte_be_to_cpu_32(cqe->byte_cnt);
323 		else
324 			n = 1;
325 		cq_ci += n;
326 		used += n;
327 		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
328 	}
329 	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
330 	if (offset < used)
331 		return RTE_ETH_RX_DESC_DONE;
332 	return RTE_ETH_RX_DESC_AVAIL;
333 }
334 
335 /**
336  * DPDK callback for TX.
337  *
338  * @param dpdk_txq
339  *   Generic pointer to TX queue structure.
340  * @param[in] pkts
341  *   Packets to transmit.
342  * @param pkts_n
343  *   Number of packets in array.
344  *
345  * @return
346  *   Number of packets successfully transmitted (<= pkts_n).
347  */
348 uint16_t
349 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
350 {
351 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
352 	uint16_t elts_head = txq->elts_head;
353 	const uint16_t elts_n = 1 << txq->elts_n;
354 	const uint16_t elts_m = elts_n - 1;
355 	unsigned int i = 0;
356 	unsigned int j = 0;
357 	unsigned int k = 0;
358 	uint16_t max_elts;
359 	uint16_t max_wqe;
360 	unsigned int comp;
361 	volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
362 	unsigned int segs_n = 0;
363 	const unsigned int max_inline = txq->max_inline;
364 
365 	if (unlikely(!pkts_n))
366 		return 0;
367 	/* Prefetch first packet cacheline. */
368 	rte_prefetch0(*pkts);
369 	/* Start processing. */
370 	mlx5_tx_complete(txq);
371 	max_elts = (elts_n - (elts_head - txq->elts_tail));
372 	/* A CQE slot must always be available. */
373 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
374 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
375 	if (unlikely(!max_wqe))
376 		return 0;
377 	do {
378 		struct rte_mbuf *buf = NULL;
379 		uint8_t *raw;
380 		volatile struct mlx5_wqe_v *wqe = NULL;
381 		volatile rte_v128u32_t *dseg = NULL;
382 		uint32_t length;
383 		unsigned int ds = 0;
384 		unsigned int sg = 0; /* counter of additional segs attached. */
385 		uintptr_t addr;
386 		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
387 		uint16_t tso_header_sz = 0;
388 		uint16_t ehdr;
389 		uint8_t cs_flags;
390 		uint64_t tso = 0;
391 		uint16_t tso_segsz = 0;
392 #ifdef MLX5_PMD_SOFT_COUNTERS
393 		uint32_t total_length = 0;
394 #endif
395 
396 		/* first_seg */
397 		buf = *pkts;
398 		segs_n = buf->nb_segs;
399 		/*
400 		 * Make sure there is enough room to store this packet and
401 		 * that one ring entry remains unused.
402 		 */
403 		assert(segs_n);
404 		if (max_elts < segs_n)
405 			break;
406 		max_elts -= segs_n;
407 		sg = --segs_n;
408 		if (unlikely(--max_wqe == 0))
409 			break;
410 		wqe = (volatile struct mlx5_wqe_v *)
411 			tx_mlx5_wqe(txq, txq->wqe_ci);
412 		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
413 		if (pkts_n - i > 1)
414 			rte_prefetch0(*(pkts + 1));
415 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
416 		length = DATA_LEN(buf);
417 		ehdr = (((uint8_t *)addr)[1] << 8) |
418 		       ((uint8_t *)addr)[0];
419 #ifdef MLX5_PMD_SOFT_COUNTERS
420 		total_length = length;
421 #endif
422 		if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
423 			txq->stats.oerrors++;
424 			break;
425 		}
426 		/* Update element. */
427 		(*txq->elts)[elts_head & elts_m] = buf;
428 		/* Prefetch next buffer data. */
429 		if (pkts_n - i > 1)
430 			rte_prefetch0(
431 			    rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
432 		cs_flags = txq_ol_cksum_to_cs(txq, buf);
433 		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
434 		/* Replace the Ethernet type by the VLAN if necessary. */
435 		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
436 			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
437 							 buf->vlan_tci);
438 			unsigned int len = 2 * ETHER_ADDR_LEN - 2;
439 
440 			addr += 2;
441 			length -= 2;
442 			/* Copy Destination and source mac address. */
443 			memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
444 			/* Copy VLAN. */
445 			memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
446 			/* Copy missing two bytes to end the DSeg. */
447 			memcpy((uint8_t *)raw + len + sizeof(vlan),
448 			       ((uint8_t *)addr) + len, 2);
449 			addr += len + 2;
450 			length -= (len + 2);
451 		} else {
452 			memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
453 			       MLX5_WQE_DWORD_SIZE);
454 			length -= pkt_inline_sz;
455 			addr += pkt_inline_sz;
456 		}
457 		raw += MLX5_WQE_DWORD_SIZE;
458 		tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
459 		if (tso) {
460 			uintptr_t end =
461 				(uintptr_t)(((uintptr_t)txq->wqes) +
462 					    (1 << txq->wqe_n) * MLX5_WQE_SIZE);
463 			unsigned int copy_b;
464 			uint8_t vlan_sz =
465 				(buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
466 			const uint64_t is_tunneled =
467 				buf->ol_flags & (PKT_TX_TUNNEL_GRE |
468 						 PKT_TX_TUNNEL_VXLAN);
469 
470 			tso_header_sz = buf->l2_len + vlan_sz +
471 					buf->l3_len + buf->l4_len;
472 			tso_segsz = buf->tso_segsz;
473 			if (unlikely(tso_segsz == 0)) {
474 				txq->stats.oerrors++;
475 				break;
476 			}
477 			if (is_tunneled	&& txq->tunnel_en) {
478 				tso_header_sz += buf->outer_l2_len +
479 						 buf->outer_l3_len;
480 				cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM;
481 			} else {
482 				cs_flags |= MLX5_ETH_WQE_L4_CSUM;
483 			}
484 			if (unlikely(tso_header_sz > MLX5_MAX_TSO_HEADER)) {
485 				txq->stats.oerrors++;
486 				break;
487 			}
488 			copy_b = tso_header_sz - pkt_inline_sz;
489 			/* First seg must contain all headers. */
490 			assert(copy_b <= length);
491 			if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
492 				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
493 
494 				if (unlikely(max_wqe < n))
495 					break;
496 				max_wqe -= n;
497 				rte_memcpy((void *)raw, (void *)addr, copy_b);
498 				addr += copy_b;
499 				length -= copy_b;
500 				/* Include padding for TSO header. */
501 				copy_b = MLX5_WQE_DS(copy_b) *
502 					 MLX5_WQE_DWORD_SIZE;
503 				pkt_inline_sz += copy_b;
504 				raw += copy_b;
505 			} else {
506 				/* NOP WQE. */
507 				wqe->ctrl = (rte_v128u32_t){
508 					rte_cpu_to_be_32(txq->wqe_ci << 8),
509 					rte_cpu_to_be_32(txq->qp_num_8s | 1),
510 					0,
511 					0,
512 				};
513 				ds = 1;
514 #ifdef MLX5_PMD_SOFT_COUNTERS
515 				total_length = 0;
516 #endif
517 				k++;
518 				goto next_wqe;
519 			}
520 		}
521 		/* Inline if enough room. */
522 		if (max_inline || tso) {
523 			uint32_t inl = 0;
524 			uintptr_t end = (uintptr_t)
525 				(((uintptr_t)txq->wqes) +
526 				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
527 			unsigned int inline_room = max_inline *
528 						   RTE_CACHE_LINE_SIZE -
529 						   (pkt_inline_sz - 2) -
530 						   !!tso * sizeof(inl);
531 			uintptr_t addr_end;
532 			unsigned int copy_b;
533 
534 pkt_inline:
535 			addr_end = RTE_ALIGN_FLOOR(addr + inline_room,
536 						   RTE_CACHE_LINE_SIZE);
537 			copy_b = (addr_end > addr) ?
538 				 RTE_MIN((addr_end - addr), length) : 0;
539 			if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
540 				/*
541 				 * One Dseg remains in the current WQE.  To
542 				 * keep the computation positive, it is
543 				 * removed after the bytes to Dseg conversion.
544 				 */
545 				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
546 
547 				if (unlikely(max_wqe < n))
548 					break;
549 				max_wqe -= n;
550 				if (tso && !inl) {
551 					inl = rte_cpu_to_be_32(copy_b |
552 							       MLX5_INLINE_SEG);
553 					rte_memcpy((void *)raw,
554 						   (void *)&inl, sizeof(inl));
555 					raw += sizeof(inl);
556 					pkt_inline_sz += sizeof(inl);
557 				}
558 				rte_memcpy((void *)raw, (void *)addr, copy_b);
559 				addr += copy_b;
560 				length -= copy_b;
561 				pkt_inline_sz += copy_b;
562 			}
563 			/*
564 			 * 2 DWORDs consumed by the WQE header + ETH segment +
565 			 * the size of the inline part of the packet.
566 			 */
567 			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
568 			if (length > 0) {
569 				if (ds % (MLX5_WQE_SIZE /
570 					  MLX5_WQE_DWORD_SIZE) == 0) {
571 					if (unlikely(--max_wqe == 0))
572 						break;
573 					dseg = (volatile rte_v128u32_t *)
574 					       tx_mlx5_wqe(txq, txq->wqe_ci +
575 							   ds / 4);
576 				} else {
577 					dseg = (volatile rte_v128u32_t *)
578 						((uintptr_t)wqe +
579 						 (ds * MLX5_WQE_DWORD_SIZE));
580 				}
581 				goto use_dseg;
582 			} else if (!segs_n) {
583 				goto next_pkt;
584 			} else {
585 				raw += copy_b;
586 				inline_room -= copy_b;
587 				--segs_n;
588 				buf = buf->next;
589 				assert(buf);
590 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
591 				length = DATA_LEN(buf);
592 #ifdef MLX5_PMD_SOFT_COUNTERS
593 				total_length += length;
594 #endif
595 				(*txq->elts)[++elts_head & elts_m] = buf;
596 				goto pkt_inline;
597 			}
598 		} else {
599 			/*
600 			 * No inline has been done in the packet; only the
601 			 * Ethernet header has been stored.
602 			 */
603 			dseg = (volatile rte_v128u32_t *)
604 				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
605 			ds = 3;
606 use_dseg:
607 			/* Add the remaining packet as a simple ds. */
608 			addr = rte_cpu_to_be_64(addr);
609 			*dseg = (rte_v128u32_t){
610 				rte_cpu_to_be_32(length),
611 				mlx5_tx_mb2mr(txq, buf),
612 				addr,
613 				addr >> 32,
614 			};
615 			++ds;
616 			if (!segs_n)
617 				goto next_pkt;
618 		}
619 next_seg:
620 		assert(buf);
621 		assert(ds);
622 		assert(wqe);
623 		/*
624 		 * Spill on next WQE when the current one does not have
625 		 * enough room left. Size of WQE must be a multiple
626 		 * of data segment size.
627 		 */
628 		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
629 		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
630 			if (unlikely(--max_wqe == 0))
631 				break;
632 			dseg = (volatile rte_v128u32_t *)
633 			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
634 			rte_prefetch0(tx_mlx5_wqe(txq,
635 						  txq->wqe_ci + ds / 4 + 1));
636 		} else {
637 			++dseg;
638 		}
639 		++ds;
640 		buf = buf->next;
641 		assert(buf);
642 		length = DATA_LEN(buf);
643 #ifdef MLX5_PMD_SOFT_COUNTERS
644 		total_length += length;
645 #endif
646 		/* Store segment information. */
647 		addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
648 		*dseg = (rte_v128u32_t){
649 			rte_cpu_to_be_32(length),
650 			mlx5_tx_mb2mr(txq, buf),
651 			addr,
652 			addr >> 32,
653 		};
654 		(*txq->elts)[++elts_head & elts_m] = buf;
655 		if (--segs_n)
656 			goto next_seg;
657 next_pkt:
658 		if (ds > MLX5_DSEG_MAX) {
659 			txq->stats.oerrors++;
660 			break;
661 		}
662 		++elts_head;
663 		++pkts;
664 		++i;
665 		j += sg;
666 		/* Initialize known and common part of the WQE structure. */
667 		if (tso) {
668 			wqe->ctrl = (rte_v128u32_t){
669 				rte_cpu_to_be_32((txq->wqe_ci << 8) |
670 						 MLX5_OPCODE_TSO),
671 				rte_cpu_to_be_32(txq->qp_num_8s | ds),
672 				0,
673 				0,
674 			};
675 			wqe->eseg = (rte_v128u32_t){
676 				0,
677 				cs_flags | (rte_cpu_to_be_16(tso_segsz) << 16),
678 				0,
679 				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
680 			};
681 		} else {
682 			wqe->ctrl = (rte_v128u32_t){
683 				rte_cpu_to_be_32((txq->wqe_ci << 8) |
684 						 MLX5_OPCODE_SEND),
685 				rte_cpu_to_be_32(txq->qp_num_8s | ds),
686 				0,
687 				0,
688 			};
689 			wqe->eseg = (rte_v128u32_t){
690 				0,
691 				cs_flags,
692 				0,
693 				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
694 			};
695 		}
696 next_wqe:
697 		txq->wqe_ci += (ds + 3) / 4;
698 		/* Save the last successful WQE for completion request */
699 		last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
700 #ifdef MLX5_PMD_SOFT_COUNTERS
701 		/* Increment sent bytes counter. */
702 		txq->stats.obytes += total_length;
703 #endif
704 	} while (i < pkts_n);
705 	/* Take a shortcut if nothing must be sent. */
706 	if (unlikely((i + k) == 0))
707 		return 0;
708 	txq->elts_head += (i + j);
709 	/* Check whether completion threshold has been reached. */
710 	comp = txq->elts_comp + i + j + k;
711 	if (comp >= MLX5_TX_COMP_THRESH) {
712 		/* Request completion on last WQE. */
713 		last_wqe->ctrl2 = rte_cpu_to_be_32(8);
714 		/* Save elts_head in unused "immediate" field of WQE. */
715 		last_wqe->ctrl3 = txq->elts_head;
716 		txq->elts_comp = 0;
717 #ifndef NDEBUG
718 		++txq->cq_pi;
719 #endif
720 	} else {
721 		txq->elts_comp = comp;
722 	}
723 #ifdef MLX5_PMD_SOFT_COUNTERS
724 	/* Increment sent packets counter. */
725 	txq->stats.opackets += i;
726 #endif
727 	/* Ring QP doorbell. */
728 	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
729 	return i;
730 }
731 
732 /**
733  * Open an MPW session.
734  *
735  * @param txq
736  *   Pointer to TX queue structure.
737  * @param mpw
738  *   Pointer to MPW session structure.
739  * @param length
740  *   Packet length.
741  */
742 static inline void
743 mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
744 {
745 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
746 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
747 		(volatile struct mlx5_wqe_data_seg (*)[])
748 		tx_mlx5_wqe(txq, idx + 1);
749 
750 	mpw->state = MLX5_MPW_STATE_OPENED;
751 	mpw->pkts_n = 0;
752 	mpw->len = length;
753 	mpw->total_len = 0;
754 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
755 	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
756 	mpw->wqe->eseg.inline_hdr_sz = 0;
757 	mpw->wqe->eseg.rsvd0 = 0;
758 	mpw->wqe->eseg.rsvd1 = 0;
759 	mpw->wqe->eseg.rsvd2 = 0;
760 	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
761 					     (txq->wqe_ci << 8) |
762 					     MLX5_OPCODE_TSO);
763 	mpw->wqe->ctrl[2] = 0;
764 	mpw->wqe->ctrl[3] = 0;
765 	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
766 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
767 	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
768 		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
769 	mpw->data.dseg[2] = &(*dseg)[0];
770 	mpw->data.dseg[3] = &(*dseg)[1];
771 	mpw->data.dseg[4] = &(*dseg)[2];
772 }
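
/*
 * Note on the layout set up above: dseg[0] and dseg[1] share the title
 * WQEBB with the control and Ethernet segments, while dseg[2..4] live in
 * the following WQEBB. This is why an open MPW session may consume up to
 * two WQEBBs (see mlx5_mpw_close()).
 */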
773 
774 /**
775  * Close an MPW session.
776  *
777  * @param txq
778  *   Pointer to TX queue structure.
779  * @param mpw
780  *   Pointer to MPW session structure.
781  */
782 static inline void
783 mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
784 {
785 	unsigned int num = mpw->pkts_n;
786 
787 	/*
788 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
789 	 * Store the size in units of 16 bytes. Control and Ethernet segments
790 	 */
791 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
792 	mpw->state = MLX5_MPW_STATE_CLOSED;
793 	if (num < 3)
794 		++txq->wqe_ci;
795 	else
796 		txq->wqe_ci += 2;
797 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
798 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
799 }
800 
801 /**
802  * DPDK callback for TX with MPW support.
803  *
804  * @param dpdk_txq
805  *   Generic pointer to TX queue structure.
806  * @param[in] pkts
807  *   Packets to transmit.
808  * @param pkts_n
809  *   Number of packets in array.
810  *
811  * @return
812  *   Number of packets successfully transmitted (<= pkts_n).
813  */
814 uint16_t
815 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
816 {
817 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
818 	uint16_t elts_head = txq->elts_head;
819 	const uint16_t elts_n = 1 << txq->elts_n;
820 	const uint16_t elts_m = elts_n - 1;
821 	unsigned int i = 0;
822 	unsigned int j = 0;
823 	uint16_t max_elts;
824 	uint16_t max_wqe;
825 	unsigned int comp;
826 	struct mlx5_mpw mpw = {
827 		.state = MLX5_MPW_STATE_CLOSED,
828 	};
829 
830 	if (unlikely(!pkts_n))
831 		return 0;
832 	/* Prefetch first packet cacheline. */
833 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
834 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
835 	/* Start processing. */
836 	mlx5_tx_complete(txq);
837 	max_elts = (elts_n - (elts_head - txq->elts_tail));
838 	/* A CQE slot must always be available. */
839 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
840 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
841 	if (unlikely(!max_wqe))
842 		return 0;
843 	do {
844 		struct rte_mbuf *buf = *(pkts++);
845 		uint32_t length;
846 		unsigned int segs_n = buf->nb_segs;
847 		uint32_t cs_flags;
848 
849 		/*
850 		 * Make sure there is enough room to store this packet and
851 		 * that one ring entry remains unused.
852 		 */
853 		assert(segs_n);
854 		if (max_elts < segs_n)
855 			break;
856 		/* Do not bother with large packets that MPW cannot handle. */
857 		if (segs_n > MLX5_MPW_DSEG_MAX) {
858 			txq->stats.oerrors++;
859 			break;
860 		}
861 		max_elts -= segs_n;
862 		--pkts_n;
863 		cs_flags = txq_ol_cksum_to_cs(txq, buf);
864 		/* Retrieve packet information. */
865 		length = PKT_LEN(buf);
866 		assert(length);
867 		/* Start new session if packet differs. */
868 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
869 		    ((mpw.len != length) ||
870 		     (segs_n != 1) ||
871 		     (mpw.wqe->eseg.cs_flags != cs_flags)))
872 			mlx5_mpw_close(txq, &mpw);
873 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
874 			/*
875 			 * A Multi-Packet WQE consumes at most two WQEs.
876 			 * mlx5_mpw_new() expects to be able to use such
877 			 * resources.
878 			 */
879 			if (unlikely(max_wqe < 2))
880 				break;
881 			max_wqe -= 2;
882 			mlx5_mpw_new(txq, &mpw, length);
883 			mpw.wqe->eseg.cs_flags = cs_flags;
884 		}
885 		/* Multi-segment packets must be alone in their MPW. */
886 		assert((segs_n == 1) || (mpw.pkts_n == 0));
887 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
888 		length = 0;
889 #endif
890 		do {
891 			volatile struct mlx5_wqe_data_seg *dseg;
892 			uintptr_t addr;
893 
894 			assert(buf);
895 			(*txq->elts)[elts_head++ & elts_m] = buf;
896 			dseg = mpw.data.dseg[mpw.pkts_n];
897 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
898 			*dseg = (struct mlx5_wqe_data_seg){
899 				.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
900 				.lkey = mlx5_tx_mb2mr(txq, buf),
901 				.addr = rte_cpu_to_be_64(addr),
902 			};
903 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
904 			length += DATA_LEN(buf);
905 #endif
906 			buf = buf->next;
907 			++mpw.pkts_n;
908 			++j;
909 		} while (--segs_n);
910 		assert(length == mpw.len);
911 		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
912 			mlx5_mpw_close(txq, &mpw);
913 #ifdef MLX5_PMD_SOFT_COUNTERS
914 		/* Increment sent bytes counter. */
915 		txq->stats.obytes += length;
916 #endif
917 		++i;
918 	} while (pkts_n);
919 	/* Take a shortcut if nothing must be sent. */
920 	if (unlikely(i == 0))
921 		return 0;
922 	/* Check whether completion threshold has been reached. */
923 	/* "j" includes both packets and segments. */
924 	comp = txq->elts_comp + j;
925 	if (comp >= MLX5_TX_COMP_THRESH) {
926 		volatile struct mlx5_wqe *wqe = mpw.wqe;
927 
928 		/* Request completion on last WQE. */
929 		wqe->ctrl[2] = rte_cpu_to_be_32(8);
930 		/* Save elts_head in unused "immediate" field of WQE. */
931 		wqe->ctrl[3] = elts_head;
932 		txq->elts_comp = 0;
933 #ifndef NDEBUG
934 		++txq->cq_pi;
935 #endif
936 	} else {
937 		txq->elts_comp = comp;
938 	}
939 #ifdef MLX5_PMD_SOFT_COUNTERS
940 	/* Increment sent packets counter. */
941 	txq->stats.opackets += i;
942 #endif
943 	/* Ring QP doorbell. */
944 	if (mpw.state == MLX5_MPW_STATE_OPENED)
945 		mlx5_mpw_close(txq, &mpw);
946 	mlx5_tx_dbrec(txq, mpw.wqe);
947 	txq->elts_head = elts_head;
948 	return i;
949 }
950 
951 /**
952  * Open an MPW inline session.
953  *
954  * @param txq
955  *   Pointer to TX queue structure.
956  * @param mpw
957  *   Pointer to MPW session structure.
958  * @param length
959  *   Packet length.
960  */
961 static inline void
962 mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
963 		    uint32_t length)
964 {
965 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
966 	struct mlx5_wqe_inl_small *inl;
967 
968 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
969 	mpw->pkts_n = 0;
970 	mpw->len = length;
971 	mpw->total_len = 0;
972 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
973 	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
974 					     (txq->wqe_ci << 8) |
975 					     MLX5_OPCODE_TSO);
976 	mpw->wqe->ctrl[2] = 0;
977 	mpw->wqe->ctrl[3] = 0;
978 	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
979 	mpw->wqe->eseg.inline_hdr_sz = 0;
980 	mpw->wqe->eseg.cs_flags = 0;
981 	mpw->wqe->eseg.rsvd0 = 0;
982 	mpw->wqe->eseg.rsvd1 = 0;
983 	mpw->wqe->eseg.rsvd2 = 0;
984 	inl = (struct mlx5_wqe_inl_small *)
985 		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
986 	mpw->data.raw = (uint8_t *)&inl->raw;
987 }
988 
989 /**
990  * Close an MPW inline session.
991  *
992  * @param txq
993  *   Pointer to TX queue structure.
994  * @param mpw
995  *   Pointer to MPW session structure.
996  */
997 static inline void
998 mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
999 {
1000 	unsigned int size;
1001 	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
1002 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
1003 
1004 	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
1005 	/*
1006 	 * Store the size in units of 16 bytes. Control and Ethernet segments
1007 	 * count as 2.
1008 	 */
1009 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
1010 					     MLX5_WQE_DS(size));
1011 	mpw->state = MLX5_MPW_STATE_CLOSED;
1012 	inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG);
1013 	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1014 }
1015 
1016 /**
1017  * DPDK callback for TX with MPW inline support.
1018  *
1019  * @param dpdk_txq
1020  *   Generic pointer to TX queue structure.
1021  * @param[in] pkts
1022  *   Packets to transmit.
1023  * @param pkts_n
1024  *   Number of packets in array.
1025  *
1026  * @return
1027  *   Number of packets successfully transmitted (<= pkts_n).
1028  */
1029 uint16_t
1030 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
1031 			 uint16_t pkts_n)
1032 {
1033 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1034 	uint16_t elts_head = txq->elts_head;
1035 	const uint16_t elts_n = 1 << txq->elts_n;
1036 	const uint16_t elts_m = elts_n - 1;
1037 	unsigned int i = 0;
1038 	unsigned int j = 0;
1039 	uint16_t max_elts;
1040 	uint16_t max_wqe;
1041 	unsigned int comp;
1042 	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
1043 	struct mlx5_mpw mpw = {
1044 		.state = MLX5_MPW_STATE_CLOSED,
1045 	};
1046 	/*
1047 	 * Compute the maximum number of WQEs which can be consumed by the
1048 	 * inline code.
1049 	 * - 2 DSEG for:
1050 	 *   - 1 control segment,
1051 	 *   - 1 Ethernet segment,
1052 	 * - N Dseg from the inline request.
1053 	 */
1054 	const unsigned int wqe_inl_n =
1055 		((2 * MLX5_WQE_DWORD_SIZE +
1056 		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
1057 		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
1058 
1059 	if (unlikely(!pkts_n))
1060 		return 0;
1061 	/* Prefetch first packet cacheline. */
1062 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1063 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1064 	/* Start processing. */
1065 	mlx5_tx_complete(txq);
1066 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1067 	/* A CQE slot must always be available. */
1068 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1069 	do {
1070 		struct rte_mbuf *buf = *(pkts++);
1071 		uintptr_t addr;
1072 		uint32_t length;
1073 		unsigned int segs_n = buf->nb_segs;
1074 		uint8_t cs_flags;
1075 
1076 		/*
1077 		 * Make sure there is enough room to store this packet and
1078 		 * that one ring entry remains unused.
1079 		 */
1080 		assert(segs_n);
1081 		if (max_elts < segs_n)
1082 			break;
1083 		/* Do not bother with large packets that MPW cannot handle. */
1084 		if (segs_n > MLX5_MPW_DSEG_MAX) {
1085 			txq->stats.oerrors++;
1086 			break;
1087 		}
1088 		max_elts -= segs_n;
1089 		--pkts_n;
1090 		/*
1091 		 * Compute max_wqe in case fewer WQEs were consumed in the
1092 		 * previous iteration.
1093 		 */
1094 		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1095 		cs_flags = txq_ol_cksum_to_cs(txq, buf);
1096 		/* Retrieve packet information. */
1097 		length = PKT_LEN(buf);
1098 		/* Start new session if packet differs. */
1099 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1100 			if ((mpw.len != length) ||
1101 			    (segs_n != 1) ||
1102 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1103 				mlx5_mpw_close(txq, &mpw);
1104 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
1105 			if ((mpw.len != length) ||
1106 			    (segs_n != 1) ||
1107 			    (length > inline_room) ||
1108 			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
1109 				mlx5_mpw_inline_close(txq, &mpw);
1110 				inline_room =
1111 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1112 			}
1113 		}
1114 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1115 			if ((segs_n != 1) ||
1116 			    (length > inline_room)) {
1117 				/*
1118 				 * A Multi-Packet WQE consumes at most two WQEs.
1119 				 * mlx5_mpw_new() expects to be able to use
1120 				 * such resources.
1121 				 */
1122 				if (unlikely(max_wqe < 2))
1123 					break;
1124 				max_wqe -= 2;
1125 				mlx5_mpw_new(txq, &mpw, length);
1126 				mpw.wqe->eseg.cs_flags = cs_flags;
1127 			} else {
1128 				if (unlikely(max_wqe < wqe_inl_n))
1129 					break;
1130 				max_wqe -= wqe_inl_n;
1131 				mlx5_mpw_inline_new(txq, &mpw, length);
1132 				mpw.wqe->eseg.cs_flags = cs_flags;
1133 			}
1134 		}
1135 		/* Multi-segment packets must be alone in their MPW. */
1136 		assert((segs_n == 1) || (mpw.pkts_n == 0));
1137 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1138 			assert(inline_room ==
1139 			       txq->max_inline * RTE_CACHE_LINE_SIZE);
1140 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1141 			length = 0;
1142 #endif
1143 			do {
1144 				volatile struct mlx5_wqe_data_seg *dseg;
1145 
1146 				assert(buf);
1147 				(*txq->elts)[elts_head++ & elts_m] = buf;
1148 				dseg = mpw.data.dseg[mpw.pkts_n];
1149 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
1150 				*dseg = (struct mlx5_wqe_data_seg){
1151 					.byte_count =
1152 					       rte_cpu_to_be_32(DATA_LEN(buf)),
1153 					.lkey = mlx5_tx_mb2mr(txq, buf),
1154 					.addr = rte_cpu_to_be_64(addr),
1155 				};
1156 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1157 				length += DATA_LEN(buf);
1158 #endif
1159 				buf = buf->next;
1160 				++mpw.pkts_n;
1161 				++j;
1162 			} while (--segs_n);
1163 			assert(length == mpw.len);
1164 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1165 				mlx5_mpw_close(txq, &mpw);
1166 		} else {
1167 			unsigned int max;
1168 
1169 			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
1170 			assert(length <= inline_room);
1171 			assert(length == DATA_LEN(buf));
1172 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1173 			(*txq->elts)[elts_head++ & elts_m] = buf;
1174 			/* Maximum number of bytes before wrapping. */
1175 			max = ((((uintptr_t)(txq->wqes)) +
1176 				(1 << txq->wqe_n) *
1177 				MLX5_WQE_SIZE) -
1178 			       (uintptr_t)mpw.data.raw);
1179 			if (length > max) {
1180 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1181 					   (void *)addr,
1182 					   max);
1183 				mpw.data.raw = (volatile void *)txq->wqes;
1184 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1185 					   (void *)(addr + max),
1186 					   length - max);
1187 				mpw.data.raw += length - max;
1188 			} else {
1189 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1190 					   (void *)addr,
1191 					   length);
1192 
1193 				if (length == max)
1194 					mpw.data.raw =
1195 						(volatile void *)txq->wqes;
1196 				else
1197 					mpw.data.raw += length;
1198 			}
1199 			++mpw.pkts_n;
1200 			mpw.total_len += length;
1201 			++j;
1202 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
1203 				mlx5_mpw_inline_close(txq, &mpw);
1204 				inline_room =
1205 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1206 			} else {
1207 				inline_room -= length;
1208 			}
1209 		}
1210 #ifdef MLX5_PMD_SOFT_COUNTERS
1211 		/* Increment sent bytes counter. */
1212 		txq->stats.obytes += length;
1213 #endif
1214 		++i;
1215 	} while (pkts_n);
1216 	/* Take a shortcut if nothing must be sent. */
1217 	if (unlikely(i == 0))
1218 		return 0;
1219 	/* Check whether completion threshold has been reached. */
1220 	/* "j" includes both packets and segments. */
1221 	comp = txq->elts_comp + j;
1222 	if (comp >= MLX5_TX_COMP_THRESH) {
1223 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1224 
1225 		/* Request completion on last WQE. */
1226 		wqe->ctrl[2] = rte_cpu_to_be_32(8);
1227 		/* Save elts_head in unused "immediate" field of WQE. */
1228 		wqe->ctrl[3] = elts_head;
1229 		txq->elts_comp = 0;
1230 #ifndef NDEBUG
1231 		++txq->cq_pi;
1232 #endif
1233 	} else {
1234 		txq->elts_comp = comp;
1235 	}
1236 #ifdef MLX5_PMD_SOFT_COUNTERS
1237 	/* Increment sent packets counter. */
1238 	txq->stats.opackets += i;
1239 #endif
1240 	/* Ring QP doorbell. */
1241 	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
1242 		mlx5_mpw_inline_close(txq, &mpw);
1243 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
1244 		mlx5_mpw_close(txq, &mpw);
1245 	mlx5_tx_dbrec(txq, mpw.wqe);
1246 	txq->elts_head = elts_head;
1247 	return i;
1248 }
1249 
1250 /**
1251  * Open an Enhanced MPW session.
1252  *
1253  * @param txq
1254  *   Pointer to TX queue structure.
1255  * @param mpw
1256  *   Pointer to MPW session structure.
1257  * @param length
1258  *   Packet length.
1259  */
1260 static inline void
1261 mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
1262 {
1263 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1264 
1265 	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
1266 	mpw->pkts_n = 0;
1267 	mpw->total_len = sizeof(struct mlx5_wqe);
1268 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1269 	mpw->wqe->ctrl[0] =
1270 		rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
1271 				 (txq->wqe_ci << 8) |
1272 				 MLX5_OPCODE_ENHANCED_MPSW);
1273 	mpw->wqe->ctrl[2] = 0;
1274 	mpw->wqe->ctrl[3] = 0;
1275 	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
1276 	if (unlikely(padding)) {
1277 		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
1278 
1279 		/* Pad the first 2 DWORDs with zero-length inline header. */
1280 		*(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
1281 		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
1282 			rte_cpu_to_be_32(MLX5_INLINE_SEG);
1283 		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
1284 		/* Start from the next WQEBB. */
1285 		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
1286 	} else {
1287 		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
1288 	}
1289 }
1290 
1291 /**
1292  * Close an Enhanced MPW session.
1293  *
1294  * @param txq
1295  *   Pointer to TX queue structure.
1296  * @param mpw
1297  *   Pointer to MPW session structure.
1298  *
1299  * @return
1300  *   Number of consumed WQEs.
1301  */
1302 static inline uint16_t
1303 mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
1304 {
1305 	uint16_t ret;
1306 
1307 	/* Store the size in units of 16 bytes. Control and Ethernet segments
1308 	 * count as 2.
1309 	 */
1310 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
1311 					     MLX5_WQE_DS(mpw->total_len));
1312 	mpw->state = MLX5_MPW_STATE_CLOSED;
1313 	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1314 	txq->wqe_ci += ret;
1315 	return ret;
1316 }
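
/*
 * Illustrative arithmetic for the close above (sizes are hypothetical,
 * assuming MLX5_WQE_DS() rounds up to 16-byte units): a session with
 * total_len = 80 bytes is advertised as MLX5_WQE_DS(80) = 5 units in
 * ctrl[1] and advances wqe_ci by (80 + 63) / 64 = 2 WQEBBs.
 */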
1317 
1318 /**
1319  * TX with Enhanced MPW support.
1320  *
1321  * @param txq
1322  *   Pointer to TX queue structure.
1323  * @param[in] pkts
1324  *   Packets to transmit.
1325  * @param pkts_n
1326  *   Number of packets in array.
1327  *
1328  * @return
1329  *   Number of packets successfully transmitted (<= pkts_n).
1330  */
1331 static inline uint16_t
1332 txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
1333 	       uint16_t pkts_n)
1334 {
1335 	uint16_t elts_head = txq->elts_head;
1336 	const uint16_t elts_n = 1 << txq->elts_n;
1337 	const uint16_t elts_m = elts_n - 1;
1338 	unsigned int i = 0;
1339 	unsigned int j = 0;
1340 	uint16_t max_elts;
1341 	uint16_t max_wqe;
1342 	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
1343 	unsigned int mpw_room = 0;
1344 	unsigned int inl_pad = 0;
1345 	uint32_t inl_hdr;
1346 	struct mlx5_mpw mpw = {
1347 		.state = MLX5_MPW_STATE_CLOSED,
1348 	};
1349 
1350 	if (unlikely(!pkts_n))
1351 		return 0;
1352 	/* Start processing. */
1353 	mlx5_tx_complete(txq);
1354 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1355 	/* A CQE slot must always be available. */
1356 	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1357 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1358 	if (unlikely(!max_wqe))
1359 		return 0;
1360 	do {
1361 		struct rte_mbuf *buf = *(pkts++);
1362 		uintptr_t addr;
1363 		unsigned int do_inline = 0; /* Whether inline is possible. */
1364 		uint32_t length;
1365 		uint8_t cs_flags;
1366 
1367 		/* Multi-segment packets are handled in the slow path outside. */
1368 		assert(NB_SEGS(buf) == 1);
1369 		/* Make sure there is enough room to store this packet. */
1370 		if (max_elts - j == 0)
1371 			break;
1372 		cs_flags = txq_ol_cksum_to_cs(txq, buf);
1373 		/* Retrieve packet information. */
1374 		length = PKT_LEN(buf);
1375 		/* Start new session if:
1376 		 * - multi-segment packet
1377 		 * - no space left even for a dseg
1378 		 * - next packet can be inlined with a new WQE
1379 		 * - cs_flags differs
1380 		 */
1381 		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
1382 			if ((inl_pad + sizeof(struct mlx5_wqe_data_seg) >
1383 			     mpw_room) ||
1384 			    (length <= txq->inline_max_packet_sz &&
1385 			     inl_pad + sizeof(inl_hdr) + length >
1386 			     mpw_room) ||
1387 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1388 				max_wqe -= mlx5_empw_close(txq, &mpw);
1389 		}
1390 		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
1391 			/* In Enhanced MPW, inline as much as the budget
1392 			 * allows. The remaining space is filled with dsegs.
1393 			 * If the title WQEBB isn't padded, it will hold
1394 			 * 2 dsegs there.
1395 			 */
1396 			mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
1397 					   (max_inline ? max_inline :
1398 					    pkts_n * MLX5_WQE_DWORD_SIZE) +
1399 					   MLX5_WQE_SIZE);
1400 			if (unlikely(max_wqe * MLX5_WQE_SIZE < mpw_room))
1401 				break;
1402 			/* Do not pad the title WQEBB so as not to waste WQ space. */
1403 			mlx5_empw_new(txq, &mpw, 0);
1404 			mpw_room -= mpw.total_len;
1405 			inl_pad = 0;
1406 			do_inline = length <= txq->inline_max_packet_sz &&
1407 				    sizeof(inl_hdr) + length <= mpw_room &&
1408 				    !txq->mpw_hdr_dseg;
1409 			mpw.wqe->eseg.cs_flags = cs_flags;
1410 		} else {
1411 			/* Evaluate whether the next packet can be inlined.
1412 			 * Inlining is possible when:
1413 			 * - length is less than the configured value
1414 			 * - length fits in the remaining space
1415 			 * - not required to fill the title WQEBB with dsegs
1416 			 */
1417 			do_inline =
1418 				length <= txq->inline_max_packet_sz &&
1419 				inl_pad + sizeof(inl_hdr) + length <=
1420 				 mpw_room &&
1421 				(!txq->mpw_hdr_dseg ||
1422 				 mpw.total_len >= MLX5_WQE_SIZE);
1423 		}
1424 		if (max_inline && do_inline) {
1425 			/* Inline packet into WQE. */
1426 			unsigned int max;
1427 
1428 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1429 			assert(length == DATA_LEN(buf));
1430 			inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
1431 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1432 			mpw.data.raw = (volatile void *)
1433 				((uintptr_t)mpw.data.raw + inl_pad);
1434 			max = tx_mlx5_wq_tailroom(txq,
1435 					(void *)(uintptr_t)mpw.data.raw);
1436 			/* Copy inline header. */
1437 			mpw.data.raw = (volatile void *)
1438 				mlx5_copy_to_wq(
1439 					  (void *)(uintptr_t)mpw.data.raw,
1440 					  &inl_hdr,
1441 					  sizeof(inl_hdr),
1442 					  (void *)(uintptr_t)txq->wqes,
1443 					  max);
1444 			max = tx_mlx5_wq_tailroom(txq,
1445 					(void *)(uintptr_t)mpw.data.raw);
1446 			/* Copy packet data. */
1447 			mpw.data.raw = (volatile void *)
1448 				mlx5_copy_to_wq(
1449 					  (void *)(uintptr_t)mpw.data.raw,
1450 					  (void *)addr,
1451 					  length,
1452 					  (void *)(uintptr_t)txq->wqes,
1453 					  max);
1454 			++mpw.pkts_n;
1455 			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
1456 			/* No need to get completion as the entire packet is
1457 			 * copied to WQ. Free the buf right away.
1458 			 */
1459 			rte_pktmbuf_free_seg(buf);
1460 			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
1461 			/* Add pad in the next packet if any. */
1462 			inl_pad = (((uintptr_t)mpw.data.raw +
1463 					(MLX5_WQE_DWORD_SIZE - 1)) &
1464 					~(MLX5_WQE_DWORD_SIZE - 1)) -
1465 				  (uintptr_t)mpw.data.raw;
1466 		} else {
1467 			/* No inline. Load a dseg of packet pointer. */
1468 			volatile rte_v128u32_t *dseg;
1469 
1470 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1471 			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
1472 			assert(length == DATA_LEN(buf));
1473 			if (!tx_mlx5_wq_tailroom(txq,
1474 					(void *)((uintptr_t)mpw.data.raw
1475 						+ inl_pad)))
1476 				dseg = (volatile void *)txq->wqes;
1477 			else
1478 				dseg = (volatile void *)
1479 					((uintptr_t)mpw.data.raw +
1480 					 inl_pad);
1481 			(*txq->elts)[elts_head++ & elts_m] = buf;
1482 			addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
1483 								 uintptr_t));
1484 			*dseg = (rte_v128u32_t) {
1485 				rte_cpu_to_be_32(length),
1486 				mlx5_tx_mb2mr(txq, buf),
1487 				addr,
1488 				addr >> 32,
1489 			};
1490 			mpw.data.raw = (volatile void *)(dseg + 1);
1491 			mpw.total_len += (inl_pad + sizeof(*dseg));
1492 			++j;
1493 			++mpw.pkts_n;
1494 			mpw_room -= (inl_pad + sizeof(*dseg));
1495 			inl_pad = 0;
1496 		}
1497 #ifdef MLX5_PMD_SOFT_COUNTERS
1498 		/* Increment sent bytes counter. */
1499 		txq->stats.obytes += length;
1500 #endif
1501 		++i;
1502 	} while (i < pkts_n);
1503 	/* Take a shortcut if nothing must be sent. */
1504 	if (unlikely(i == 0))
1505 		return 0;
1506 	/* Check whether completion threshold has been reached. */
1507 	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
1508 			(uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
1509 			 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
1510 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1511 
1512 		/* Request completion on last WQE. */
1513 		wqe->ctrl[2] = rte_cpu_to_be_32(8);
1514 		/* Save elts_head in unused "immediate" field of WQE. */
1515 		wqe->ctrl[3] = elts_head;
1516 		txq->elts_comp = 0;
1517 		txq->mpw_comp = txq->wqe_ci;
1518 #ifndef NDEBUG
1519 		++txq->cq_pi;
1520 #endif
1521 	} else {
1522 		txq->elts_comp += j;
1523 	}
1524 #ifdef MLX5_PMD_SOFT_COUNTERS
1525 	/* Increment sent packets counter. */
1526 	txq->stats.opackets += i;
1527 #endif
1528 	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
1529 		mlx5_empw_close(txq, &mpw);
1530 	/* Ring QP doorbell. */
1531 	mlx5_tx_dbrec(txq, mpw.wqe);
1532 	txq->elts_head = elts_head;
1533 	return i;
1534 }
1535 
1536 /**
1537  * DPDK callback for TX with Enhanced MPW support.
1538  *
1539  * @param dpdk_txq
1540  *   Generic pointer to TX queue structure.
1541  * @param[in] pkts
1542  *   Packets to transmit.
1543  * @param pkts_n
1544  *   Number of packets in array.
1545  *
1546  * @return
1547  *   Number of packets successfully transmitted (<= pkts_n).
1548  */
1549 uint16_t
1550 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1551 {
1552 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1553 	uint16_t nb_tx = 0;
1554 
1555 	while (pkts_n > nb_tx) {
1556 		uint16_t n;
1557 		uint16_t ret;
1558 
1559 		n = txq_count_contig_multi_seg(&pkts[nb_tx], pkts_n - nb_tx);
1560 		if (n) {
1561 			ret = mlx5_tx_burst(dpdk_txq, &pkts[nb_tx], n);
1562 			if (!ret)
1563 				break;
1564 			nb_tx += ret;
1565 		}
1566 		n = txq_count_contig_single_seg(&pkts[nb_tx], pkts_n - nb_tx);
1567 		if (n) {
1568 			ret = txq_burst_empw(txq, &pkts[nb_tx], n);
1569 			if (!ret)
1570 				break;
1571 			nb_tx += ret;
1572 		}
1573 	}
1574 	return nb_tx;
1575 }
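
/*
 * Illustration of the split above, assuming txq_count_contig_multi_seg()
 * and txq_count_contig_single_seg() count the leading run of multi- and
 * single-segment packets respectively (segment counts are hypothetical):
 * a burst of {1, 1, 3, 1}-segment packets is sent as txq_burst_empw() for
 * the first two packets, mlx5_tx_burst() for the 3-segment packet, then
 * txq_burst_empw() for the last one.
 */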
1576 
1577 /**
1578  * Translate RX completion flags to packet type.
1579  *
1580  * @param[in] cqe
1581  *   Pointer to CQE.
1582  *
1583  * @note: fix mlx5_dev_supported_ptypes_get() if any change is made here.
1584  *
1585  * @return
1586  *   Packet type for struct rte_mbuf.
1587  */
1588 static inline uint32_t
1589 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
1590 {
1591 	uint8_t idx;
1592 	uint8_t pinfo = cqe->pkt_info;
1593 	uint16_t ptype = cqe->hdr_type_etc;
1594 
1595 	/*
1596 	 * The index to the array should have:
1597 	 * bit[1:0] = l3_hdr_type
1598 	 * bit[4:2] = l4_hdr_type
1599 	 * bit[5] = ip_frag
1600 	 * bit[6] = tunneled
1601 	 * bit[7] = outer_l3_type
1602 	 */
1603 	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
1604 	return mlx5_ptype_table[idx];
1605 }
1606 
1607 /**
1608  * Get size of the next packet for a given CQE. For compressed CQEs, the
1609  * consumer index is updated only once all packets of the current
1610  * compressed CQE have been processed.
1611  *
1612  * @param rxq
1613  *   Pointer to RX queue.
1614  * @param cqe
1615  *   CQE to process.
1616  * @param[out] rss_hash
1617  *   Packet RSS Hash result.
1618  *
1619  * @return
1620  *   Packet size in bytes (0 if there is none), -1 in case of completion
1621  *   with error.
1622  */
1623 static inline int
1624 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
1625 		 uint16_t cqe_cnt, uint32_t *rss_hash)
1626 {
1627 	struct rxq_zip *zip = &rxq->zip;
1628 	uint16_t cqe_n = cqe_cnt + 1;
1629 	int len = 0;
1630 	uint16_t idx, end;
1631 
1632 	/* Process compressed data in the CQE and mini arrays. */
1633 	if (zip->ai) {
1634 		volatile struct mlx5_mini_cqe8 (*mc)[8] =
1635 			(volatile struct mlx5_mini_cqe8 (*)[8])
1636 			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info);
1637 
1638 		len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
1639 		*rss_hash = rte_be_to_cpu_32((*mc)[zip->ai & 7].rx_hash_result);
1640 		if ((++zip->ai & 7) == 0) {
1641 			/* Invalidate consumed CQEs */
1642 			idx = zip->ca;
1643 			end = zip->na;
1644 			while (idx != end) {
1645 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1646 					MLX5_CQE_INVALIDATE;
1647 				++idx;
1648 			}
1649 			/*
1650 			 * Increment consumer index to skip the number of
1651 			 * CQEs consumed. Hardware leaves holes in the CQ
1652 			 * ring for software use.
1653 			 */
1654 			zip->ca = zip->na;
1655 			zip->na += 8;
1656 		}
1657 		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
1658 			/* Invalidate the rest */
1659 			idx = zip->ca;
1660 			end = zip->cq_ci;
1661 
1662 			while (idx != end) {
1663 				(*rxq->cqes)[idx & cqe_cnt].op_own =
1664 					MLX5_CQE_INVALIDATE;
1665 				++idx;
1666 			}
1667 			rxq->cq_ci = zip->cq_ci;
1668 			zip->ai = 0;
1669 		}
1670 	/* No compressed data, get next CQE and verify if it is compressed. */
1671 	} else {
1672 		int ret;
1673 		int8_t op_own;
1674 
1675 		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
1676 		if (unlikely(ret == 1))
1677 			return 0;
1678 		++rxq->cq_ci;
1679 		op_own = cqe->op_own;
1680 		rte_cio_rmb();
1681 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
1682 			volatile struct mlx5_mini_cqe8 (*mc)[8] =
1683 				(volatile struct mlx5_mini_cqe8 (*)[8])
1684 				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
1685 							  cqe_cnt].pkt_info);
1686 
1687 			/* Fix endianness. */
1688 			zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
1689 			/*
1690 			 * Current mini array position is the one returned by
1691 			 * check_cqe().
1692 			 *
1693 			 * If completion comprises several mini arrays, as a
1694 			 * special case the second one is located 7 CQEs after
1695 			 * the initial CQE instead of 8 for subsequent ones.
1696 			 */
1697 			zip->ca = rxq->cq_ci;
1698 			zip->na = zip->ca + 7;
1699 			/* Compute the next non-compressed CQE. */
1700 			--rxq->cq_ci;
1701 			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
1702 			/* Get packet size to return. */
1703 			len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
1704 			*rss_hash = rte_be_to_cpu_32((*mc)[0].rx_hash_result);
1705 			zip->ai = 1;
1706 			/* Prefetch all the entries to be invalidated */
1707 			idx = zip->ca;
1708 			end = zip->cq_ci;
1709 			while (idx != end) {
1710 				rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]);
1711 				++idx;
1712 			}
1713 		} else {
1714 			len = rte_be_to_cpu_32(cqe->byte_cnt);
1715 			*rss_hash = rte_be_to_cpu_32(cqe->rx_hash_res);
1716 		}
1717 		/* Error while receiving packet. */
1718 		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
1719 			return -1;
1720 	}
1721 	return len;
1722 }
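
/*
 * Summary of the compressed path above: the first CQE of a compressed
 * session carries the session length in byte_cnt; packet sizes and RSS
 * hashes are then read from the 8-entry mini-CQE arrays, the regular CQE
 * slots they replace are invalidated, and cq_ci jumps over the whole
 * session once it has been drained.
 */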
1723 
1724 /**
1725  * Translate RX completion flags to offload flags.
1726  *
1727  * @param[in] rxq
1728  *   Pointer to RX queue structure.
1729  * @param[in] cqe
1730  *   Pointer to CQE.
1731  *
1732  * @return
1733  *   Offload flags (ol_flags) for struct rte_mbuf.
1734  */
1735 static inline uint32_t
1736 rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
1737 {
1738 	uint32_t ol_flags = 0;
1739 	uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
1740 
1741 	ol_flags =
1742 		TRANSPOSE(flags,
1743 			  MLX5_CQE_RX_L3_HDR_VALID,
1744 			  PKT_RX_IP_CKSUM_GOOD) |
1745 		TRANSPOSE(flags,
1746 			  MLX5_CQE_RX_L4_HDR_VALID,
1747 			  PKT_RX_L4_CKSUM_GOOD);
1748 	if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
1749 		ol_flags |=
1750 			TRANSPOSE(flags,
1751 				  MLX5_CQE_RX_L3_HDR_VALID,
1752 				  PKT_RX_IP_CKSUM_GOOD) |
1753 			TRANSPOSE(flags,
1754 				  MLX5_CQE_RX_L4_HDR_VALID,
1755 				  PKT_RX_L4_CKSUM_GOOD);
1756 	return ol_flags;
1757 }
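
/*
 * Illustrative sketch only (not part of the driver): for the single-bit
 * masks used above, the TRANSPOSE() macro amounts to moving a flag bit
 * from its CQE position to its ol_flags position. A minimal equivalent,
 * assuming both masks have exactly one bit set, could look as follows;
 * bit_transpose() is a hypothetical name.
 */
#if 0
static inline uint32_t
bit_transpose(uint32_t val, uint32_t from, uint32_t to)
{
	/* E.g. bit_transpose(flags, MLX5_CQE_RX_L4_HDR_VALID,
	 * PKT_RX_L4_CKSUM_GOOD). */
	return (val & from) ? to : 0;
}
#endif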
1758 
1759 /**
1760  * DPDK callback for RX.
1761  *
1762  * @param dpdk_rxq
1763  *   Generic pointer to RX queue structure.
1764  * @param[out] pkts
1765  *   Array to store received packets.
1766  * @param pkts_n
1767  *   Maximum number of packets in array.
1768  *
1769  * @return
1770  *   Number of packets successfully received (<= pkts_n).
1771  */
1772 uint16_t
1773 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1774 {
1775 	struct mlx5_rxq_data *rxq = dpdk_rxq;
1776 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
1777 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
1778 	const unsigned int sges_n = rxq->sges_n;
1779 	struct rte_mbuf *pkt = NULL;
1780 	struct rte_mbuf *seg = NULL;
1781 	volatile struct mlx5_cqe *cqe =
1782 		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1783 	unsigned int i = 0;
1784 	unsigned int rq_ci = rxq->rq_ci << sges_n;
1785 	int len = 0; /* keep its value across iterations. */
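	/*
	 * A scattered packet is assembled across several loop iterations,
	 * so len, pkt and seg must survive from one iteration to the next
	 * until the last segment has been filled.
	 */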
1786 
1787 	while (pkts_n) {
1788 		unsigned int idx = rq_ci & wqe_cnt;
1789 		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
1790 		struct rte_mbuf *rep = (*rxq->elts)[idx];
1791 		uint32_t rss_hash_res = 0;
1792 
1793 		if (pkt)
1794 			NEXT(seg) = rep;
1795 		seg = rep;
1796 		rte_prefetch0(seg);
1797 		rte_prefetch0(cqe);
1798 		rte_prefetch0(wqe);
1799 		rep = rte_mbuf_raw_alloc(rxq->mp);
1800 		if (unlikely(rep == NULL)) {
1801 			++rxq->stats.rx_nombuf;
1802 			if (!pkt) {
1803 				/*
1804 				 * no buffers before we even started,
1805 				 * bail out silently.
1806 				 */
1807 				break;
1808 			}
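			/*
			 * An mbuf could not be allocated while a
			 * multi-segment packet was being assembled: unlink
			 * and free the segments gathered so far before
			 * bailing out.
			 */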
1809 			while (pkt != seg) {
1810 				assert(pkt != (*rxq->elts)[idx]);
1811 				rep = NEXT(pkt);
1812 				NEXT(pkt) = NULL;
1813 				NB_SEGS(pkt) = 1;
1814 				rte_mbuf_raw_free(pkt);
1815 				pkt = rep;
1816 			}
1817 			break;
1818 		}
1819 		if (!pkt) {
1820 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1821 			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
1822 					       &rss_hash_res);
1823 			if (!len) {
1824 				rte_mbuf_raw_free(rep);
1825 				break;
1826 			}
1827 			if (unlikely(len == -1)) {
1828 				/* RX error, packet is likely too large. */
1829 				rte_mbuf_raw_free(rep);
1830 				++rxq->stats.idropped;
1831 				goto skip;
1832 			}
1833 			pkt = seg;
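			/*
			 * When the FCS is kept, the reported length must at
			 * least cover the 4-byte CRC (crc_present << 2 is
			 * ETHER_CRC_LEN when the flag is set).
			 */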
1834 			assert(len >= (rxq->crc_present << 2));
1835 			/* Update packet information. */
1836 			pkt->packet_type = rxq_cq_to_pkt_type(cqe);
1837 			pkt->ol_flags = 0;
1838 			if (rss_hash_res && rxq->rss_hash) {
1839 				pkt->hash.rss = rss_hash_res;
1840 				pkt->ol_flags = PKT_RX_RSS_HASH;
1841 			}
1842 			if (rxq->mark &&
1843 			    MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
1844 				pkt->ol_flags |= PKT_RX_FDIR;
1845 				if (cqe->sop_drop_qpn !=
1846 				    rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
1847 					uint32_t mark = cqe->sop_drop_qpn;
1848 
1849 					pkt->ol_flags |= PKT_RX_FDIR_ID;
1850 					pkt->hash.fdir.hi =
1851 						mlx5_flow_mark_get(mark);
1852 				}
1853 			}
1854 			if (rxq->csum | rxq->csum_l2tun)
1855 				pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe);
1856 			if (rxq->vlan_strip &&
1857 			    (cqe->hdr_type_etc &
1858 			     rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
1859 				pkt->ol_flags |= PKT_RX_VLAN |
1860 					PKT_RX_VLAN_STRIPPED;
1861 				pkt->vlan_tci =
1862 					rte_be_to_cpu_16(cqe->vlan_info);
1863 			}
1864 			if (rxq->hw_timestamp) {
1865 				pkt->timestamp =
1866 					rte_be_to_cpu_64(cqe->timestamp);
1867 				pkt->ol_flags |= PKT_RX_TIMESTAMP;
1868 			}
1869 			if (rxq->crc_present)
1870 				len -= ETHER_CRC_LEN;
1871 			PKT_LEN(pkt) = len;
1872 		}
1873 		DATA_LEN(rep) = DATA_LEN(seg);
1874 		PKT_LEN(rep) = PKT_LEN(seg);
1875 		SET_DATA_OFF(rep, DATA_OFF(seg));
1876 		PORT(rep) = PORT(seg);
1877 		(*rxq->elts)[idx] = rep;
1878 		/*
1879 		 * Fill NIC descriptor with the new buffer.  The lkey and size
1880 		 * of the buffers are already known, only the buffer address
1881 		 * changes.
1882 		 */
1883 		wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
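		/*
		 * More data than this segment can hold: keep the chain open
		 * and let the next iteration append another segment taken
		 * from the following SGE.
		 */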
1884 		if (len > DATA_LEN(seg)) {
1885 			len -= DATA_LEN(seg);
1886 			++NB_SEGS(pkt);
1887 			++rq_ci;
1888 			continue;
1889 		}
1890 		DATA_LEN(seg) = len;
1891 #ifdef MLX5_PMD_SOFT_COUNTERS
1892 		/* Increment bytes counter. */
1893 		rxq->stats.ibytes += PKT_LEN(pkt);
1894 #endif
1895 		/* Return packet. */
1896 		*(pkts++) = pkt;
1897 		pkt = NULL;
1898 		--pkts_n;
1899 		++i;
1900 skip:
1901 		/* Align consumer index to the next stride. */
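		/*
		 * E.g. with sges_n == 2 (4 SGEs per stride), rq_ci == 5 is
		 * rounded up to 8, the first SGE of the next stride.
		 */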
1902 		rq_ci >>= sges_n;
1903 		++rq_ci;
1904 		rq_ci <<= sges_n;
1905 	}
1906 	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
1907 		return 0;
1908 	/* Update the consumer index. */
1909 	rxq->rq_ci = rq_ci >> sges_n;
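	/*
	 * Write barriers: buffer refills and CQE invalidations must be
	 * visible before the CQ doorbell record is updated, and the CQ
	 * doorbell must be updated before the RQ doorbell so that hardware
	 * is never handed new receive entries ahead of the released
	 * completions.
	 */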
1910 	rte_cio_wmb();
1911 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1912 	rte_cio_wmb();
1913 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1914 #ifdef MLX5_PMD_SOFT_COUNTERS
1915 	/* Increment packets counter. */
1916 	rxq->stats.ipackets += i;
1917 #endif
1918 	return i;
1919 }
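
/*
 * Illustrative sketch only (not part of the driver): applications reach
 * mlx5_rx_burst() through the generic rte_eth_rx_burst() API once the port
 * is configured and started. The loop below is a minimal example assuming
 * rte_ethdev.h is included and port_id/queue_id identify a started Rx
 * queue served by this PMD.
 */
#if 0
static void
rx_poll_loop_example(uint16_t port_id, uint16_t queue_id)
{
	struct rte_mbuf *bufs[32];

	for (;;) {
		uint16_t nb = rte_eth_rx_burst(port_id, queue_id, bufs,
					       RTE_DIM(bufs));
		uint16_t j;

		for (j = 0; j != nb; ++j)
			/* Process bufs[j], then return it to its pool. */
			rte_pktmbuf_free(bufs[j]);
	}
}
#endif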
1920 
1921 /**
1922  * Dummy DPDK callback for TX.
1923  *
1924  * This function is used to temporarily replace the real callback during
1925  * unsafe control operations on the queue, or in case of error.
1926  *
1927  * @param dpdk_txq
1928  *   Generic pointer to TX queue structure.
1929  * @param[in] pkts
1930  *   Packets to transmit.
1931  * @param pkts_n
1932  *   Number of packets in array.
1933  *
1934  * @return
1935  *   Number of packets successfully transmitted (<= pkts_n).
1936  */
1937 uint16_t
1938 removed_tx_burst(void *dpdk_txq __rte_unused,
1939 		 struct rte_mbuf **pkts __rte_unused,
1940 		 uint16_t pkts_n __rte_unused)
1941 {
1942 	return 0;
1943 }
1944 
1945 /**
1946  * Dummy DPDK callback for RX.
1947  *
1948  * This function is used to temporarily replace the real callback during
1949  * unsafe control operations on the queue, or in case of error.
1950  *
1951  * @param dpdk_rxq
1952  *   Generic pointer to RX queue structure.
1953  * @param[out] pkts
1954  *   Array to store received packets.
1955  * @param pkts_n
1956  *   Maximum number of packets in array.
1957  *
1958  * @return
1959  *   Number of packets successfully received (<= pkts_n).
1960  */
1961 uint16_t
removed_rx_burst(void *dpdk_rxq __rte_unused,
1963 		 struct rte_mbuf **pkts __rte_unused,
1964 		 uint16_t pkts_n __rte_unused)
1965 {
1966 	return 0;
1967 }
1968 
/*
 * Vectorized Rx/Tx routines are not compiled in when the required vector
 * instructions are not supported on a target architecture. The following null
 * stubs are needed for linkage when those routines, which live outside of
 * this file (e.g. in mlx5_rxtx_vec_sse.c for x86), are not compiled in.
 */
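
/*
 * When a vectorized implementation is built, its regular (strong)
 * definitions take precedence over the weak stubs below at link time;
 * otherwise the stubs keep the symbols resolvable and simply report that
 * no vectorized path is available.
 */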
1975 
1976 uint16_t __attribute__((weak))
1977 mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused,
1978 		      struct rte_mbuf **pkts __rte_unused,
1979 		      uint16_t pkts_n __rte_unused)
1980 {
1981 	return 0;
1982 }
1983 
1984 uint16_t __attribute__((weak))
1985 mlx5_tx_burst_vec(void *dpdk_txq __rte_unused,
1986 		  struct rte_mbuf **pkts __rte_unused,
1987 		  uint16_t pkts_n __rte_unused)
1988 {
1989 	return 0;
1990 }
1991 
1992 uint16_t __attribute__((weak))
mlx5_rx_burst_vec(void *dpdk_rxq __rte_unused,
1994 		  struct rte_mbuf **pkts __rte_unused,
1995 		  uint16_t pkts_n __rte_unused)
1996 {
1997 	return 0;
1998 }
1999 
2000 int __attribute__((weak))
2001 mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
2002 {
2003 	return -ENOTSUP;
2004 }
2005 
2006 int __attribute__((weak))
2007 mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
2008 {
2009 	return -ENOTSUP;
2010 }
2011 
2012 int __attribute__((weak))
2013 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
2014 {
2015 	return -ENOTSUP;
2016 }
2017 
2018 int __attribute__((weak))
2019 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
2020 {
2021 	return -ENOTSUP;
2022 }
2023