1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <assert.h>
7 #include <stdint.h>
8 #include <string.h>
9 #include <stdlib.h>
10 
11 /* Verbs header. */
12 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
13 #ifdef PEDANTIC
14 #pragma GCC diagnostic ignored "-Wpedantic"
15 #endif
16 #include <infiniband/verbs.h>
17 #include <infiniband/mlx5dv.h>
18 #ifdef PEDANTIC
19 #pragma GCC diagnostic error "-Wpedantic"
20 #endif
21 
22 #include <rte_mbuf.h>
23 #include <rte_mempool.h>
24 #include <rte_prefetch.h>
25 #include <rte_common.h>
26 #include <rte_branch_prediction.h>
27 #include <rte_ether.h>
28 #include <rte_cycles.h>
29 
30 #include "mlx5.h"
31 #include "mlx5_utils.h"
32 #include "mlx5_rxtx.h"
33 #include "mlx5_autoconf.h"
34 #include "mlx5_defs.h"
35 #include "mlx5_prm.h"
36 
37 static __rte_always_inline uint32_t
38 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
39 
40 static __rte_always_inline int
41 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
42 		 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe);
43 
44 static __rte_always_inline uint32_t
45 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
46 
47 static __rte_always_inline void
48 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
49 	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
50 
51 static __rte_always_inline void
52 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx);
53 
54 static int
55 mlx5_queue_state_modify(struct rte_eth_dev *dev,
56 			struct mlx5_mp_arg_queue_state_modify *sm);
57 
58 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
59 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
60 };
61 
62 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned;
63 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
64 
65 /**
66  * Build a table to translate Rx completion flags to packet type.
67  *
68  * @note: keep mlx5_dev_supported_ptypes_get() in sync with any change here.
69  */
70 void
71 mlx5_set_ptype_table(void)
72 {
73 	unsigned int i;
74 	uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;
75 
76 	/* Last entry must not be overwritten, reserved for errored packet. */
77 	for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
78 		(*p)[i] = RTE_PTYPE_UNKNOWN;
79 	/*
80 	 * The index to the array should have:
81 	 * bit[1:0] = l3_hdr_type
82 	 * bit[4:2] = l4_hdr_type
83 	 * bit[5] = ip_frag
84 	 * bit[6] = tunneled
85 	 * bit[7] = outer_l3_type
86 	 */
87 	/* L2 */
88 	(*p)[0x00] = RTE_PTYPE_L2_ETHER;
89 	/* L3 */
90 	(*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
91 		     RTE_PTYPE_L4_NONFRAG;
92 	(*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
93 		     RTE_PTYPE_L4_NONFRAG;
94 	/* Fragmented */
95 	(*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
96 		     RTE_PTYPE_L4_FRAG;
97 	(*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
98 		     RTE_PTYPE_L4_FRAG;
99 	/* TCP */
100 	(*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
101 		     RTE_PTYPE_L4_TCP;
102 	(*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
103 		     RTE_PTYPE_L4_TCP;
104 	(*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
105 		     RTE_PTYPE_L4_TCP;
106 	(*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
107 		     RTE_PTYPE_L4_TCP;
108 	(*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
109 		     RTE_PTYPE_L4_TCP;
110 	(*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
111 		     RTE_PTYPE_L4_TCP;
112 	/* UDP */
113 	(*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
114 		     RTE_PTYPE_L4_UDP;
115 	(*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
116 		     RTE_PTYPE_L4_UDP;
117 	/* Repeat with outer_l3_type being set. Just in case. */
118 	(*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
119 		     RTE_PTYPE_L4_NONFRAG;
120 	(*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
121 		     RTE_PTYPE_L4_NONFRAG;
122 	(*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
123 		     RTE_PTYPE_L4_FRAG;
124 	(*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
125 		     RTE_PTYPE_L4_FRAG;
126 	(*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
127 		     RTE_PTYPE_L4_TCP;
128 	(*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
129 		     RTE_PTYPE_L4_TCP;
130 	(*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
131 		     RTE_PTYPE_L4_TCP;
132 	(*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
133 		     RTE_PTYPE_L4_TCP;
134 	(*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
135 		     RTE_PTYPE_L4_TCP;
136 	(*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
137 		     RTE_PTYPE_L4_TCP;
138 	(*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
139 		     RTE_PTYPE_L4_UDP;
140 	(*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
141 		     RTE_PTYPE_L4_UDP;
142 	/* Tunneled - L3 */
143 	(*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
144 	(*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
145 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
146 		     RTE_PTYPE_INNER_L4_NONFRAG;
147 	(*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
148 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
149 		     RTE_PTYPE_INNER_L4_NONFRAG;
150 	(*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
151 	(*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
152 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
153 		     RTE_PTYPE_INNER_L4_NONFRAG;
154 	(*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
155 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
156 		     RTE_PTYPE_INNER_L4_NONFRAG;
157 	/* Tunneled - Fragmented */
158 	(*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
159 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
160 		     RTE_PTYPE_INNER_L4_FRAG;
161 	(*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
162 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
163 		     RTE_PTYPE_INNER_L4_FRAG;
164 	(*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
165 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
166 		     RTE_PTYPE_INNER_L4_FRAG;
167 	(*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
168 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
169 		     RTE_PTYPE_INNER_L4_FRAG;
170 	/* Tunneled - TCP */
171 	(*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
172 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
173 		     RTE_PTYPE_INNER_L4_TCP;
174 	(*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
175 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
176 		     RTE_PTYPE_INNER_L4_TCP;
177 	(*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
178 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
179 		     RTE_PTYPE_INNER_L4_TCP;
180 	(*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
181 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
182 		     RTE_PTYPE_INNER_L4_TCP;
183 	(*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
184 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
185 		     RTE_PTYPE_INNER_L4_TCP;
186 	(*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
187 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
188 		     RTE_PTYPE_INNER_L4_TCP;
189 	(*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
190 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
191 		     RTE_PTYPE_INNER_L4_TCP;
192 	(*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
193 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
194 		     RTE_PTYPE_INNER_L4_TCP;
195 	(*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
196 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
197 		     RTE_PTYPE_INNER_L4_TCP;
198 	(*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
199 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
200 		     RTE_PTYPE_INNER_L4_TCP;
201 	(*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
202 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
203 		     RTE_PTYPE_INNER_L4_TCP;
204 	(*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
205 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
206 		     RTE_PTYPE_INNER_L4_TCP;
207 	/* Tunneled - UDP */
208 	(*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
209 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
210 		     RTE_PTYPE_INNER_L4_UDP;
211 	(*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
212 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
213 		     RTE_PTYPE_INNER_L4_UDP;
214 	(*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
215 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
216 		     RTE_PTYPE_INNER_L4_UDP;
217 	(*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
218 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
219 		     RTE_PTYPE_INNER_L4_UDP;
220 }
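
/*
 * Worked example (illustration only, not used by the driver): a CQE
 * reporting l3_hdr_type = 2, l4_hdr_type = 1, ip_frag = 0, tunneled = 1
 * and outer_l3_type = 0 yields the index 0x46, so mlx5_ptype_table[0x46]
 * resolves to RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 * RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_TCP,
 * i.e. a TCP over IPv4 packet carried inside an IPv4 tunnel, matching
 * the entry initialized above.
 */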
221 
222 /**
223  * Build a table to translate packet to checksum type of Verbs.
224  */
225 void
226 mlx5_set_cksum_table(void)
227 {
228 	unsigned int i;
229 	uint8_t v;
230 
231 	/*
232 	 * The index should have:
233 	 * bit[0] = PKT_TX_TCP_SEG
234 	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
235 	 * bit[4] = PKT_TX_IP_CKSUM
236 	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
237 	 * bit[9] = tunnel
238 	 */
239 	for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) {
240 		v = 0;
241 		if (i & (1 << 9)) {
242 			/* Tunneled packet. */
243 			if (i & (1 << 8)) /* Outer IP. */
244 				v |= MLX5_ETH_WQE_L3_CSUM;
245 			if (i & (1 << 4)) /* Inner IP. */
246 				v |= MLX5_ETH_WQE_L3_INNER_CSUM;
247 			if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
248 				v |= MLX5_ETH_WQE_L4_INNER_CSUM;
249 		} else {
250 			/* No tunnel. */
251 			if (i & (1 << 4)) /* IP. */
252 				v |= MLX5_ETH_WQE_L3_CSUM;
253 			if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
254 				v |= MLX5_ETH_WQE_L4_CSUM;
255 		}
256 		mlx5_cksum_table[i] = v;
257 	}
258 }
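
/*
 * Worked example (illustration only): for a tunneled packet whose index
 * has bit[9] (tunnel), bit[8] (PKT_TX_OUTER_IP_CKSUM), bit[4]
 * (PKT_TX_IP_CKSUM) and any L4/TSO bit set, the loop above stores
 * MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L3_INNER_CSUM |
 * MLX5_ETH_WQE_L4_INNER_CSUM in mlx5_cksum_table[i].
 */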
259 
260 /**
261  * Build a table to translate packet type of mbuf to SWP type of Verbs.
262  */
263 void
264 mlx5_set_swp_types_table(void)
265 {
266 	unsigned int i;
267 	uint8_t v;
268 
269 	/*
270 	 * The index should have:
271 	 * bit[0:1] = PKT_TX_L4_MASK
272 	 * bit[4] = PKT_TX_IPV6
273 	 * bit[8] = PKT_TX_OUTER_IPV6
274 	 * bit[9] = PKT_TX_OUTER_UDP
275 	 */
276 	for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) {
277 		v = 0;
278 		if (i & (1 << 8))
279 			v |= MLX5_ETH_WQE_L3_OUTER_IPV6;
280 		if (i & (1 << 9))
281 			v |= MLX5_ETH_WQE_L4_OUTER_UDP;
282 		if (i & (1 << 4))
283 			v |= MLX5_ETH_WQE_L3_INNER_IPV6;
284 		if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52))
285 			v |= MLX5_ETH_WQE_L4_INNER_UDP;
286 		mlx5_swp_types_table[i] = v;
287 	}
288 }
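
/*
 * Worked example (illustration only): an index with bit[8]
 * (PKT_TX_OUTER_IPV6) and bit[9] (PKT_TX_OUTER_UDP) set and bit[1:0]
 * equal to PKT_TX_UDP_CKSUM >> 52 maps to MLX5_ETH_WQE_L3_OUTER_IPV6 |
 * MLX5_ETH_WQE_L4_OUTER_UDP | MLX5_ETH_WQE_L4_INNER_UDP, i.e. a UDP
 * datagram carried in a UDP/IPv6 outer header.
 */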
289 
290 /**
291  * Return the size of the tailroom of the WQ.
292  *
293  * @param txq
294  *   Pointer to TX queue structure.
295  * @param addr
296  *   Pointer to tail of WQ.
297  *
298  * @return
299  *   Size of tailroom.
300  */
301 static inline size_t
302 tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
303 {
304 	size_t tailroom;
305 	tailroom = (uintptr_t)(txq->wqes) +
306 		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
307 		   (uintptr_t)addr;
308 	return tailroom;
309 }
310 
311 /**
312  * Copy data to tailroom of circular queue.
313  *
314  * @param dst
315  *   Pointer to destination.
316  * @param src
317  *   Pointer to source.
318  * @param n
319  *   Number of bytes to copy.
320  * @param base
321  *   Pointer to head of queue.
322  * @param tailroom
323  *   Size of tailroom from dst.
324  *
325  * @return
326  *   Pointer after copied data.
327  */
328 static inline void *
329 mlx5_copy_to_wq(void *dst, const void *src, size_t n,
330 		void *base, size_t tailroom)
331 {
332 	void *ret;
333 
334 	if (n > tailroom) {
335 		rte_memcpy(dst, src, tailroom);
336 		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
337 			   n - tailroom);
338 		ret = (uint8_t *)base + n - tailroom;
339 	} else {
340 		rte_memcpy(dst, src, n);
341 		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
342 	}
343 	return ret;
344 }
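
/*
 * Worked example (illustration only): with tailroom = 16 and n = 24 the
 * first 16 bytes are copied to dst at the end of the ring, the remaining
 * 8 bytes wrap around to base, and the returned pointer is base + 8.
 * When n == tailroom the copy exactly fills the ring end and base itself
 * is returned as the next write position.
 */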
345 
346 /**
347  * Inline TSO headers into WQE.
348  *
349  * @return
350  *   0 on success, negative errno value on failure.
351  */
352 static int
353 inline_tso(struct mlx5_txq_data *txq, struct rte_mbuf *buf,
354 	   uint32_t *length,
355 	   uintptr_t *addr,
356 	   uint16_t *pkt_inline_sz,
357 	   uint8_t **raw,
358 	   uint16_t *max_wqe,
359 	   uint16_t *tso_segsz,
360 	   uint16_t *tso_header_sz)
361 {
362 	uintptr_t end = (uintptr_t)(((uintptr_t)txq->wqes) +
363 				    (1 << txq->wqe_n) * MLX5_WQE_SIZE);
364 	unsigned int copy_b;
365 	uint8_t vlan_sz = (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
366 	const uint8_t tunneled = txq->tunnel_en && (buf->ol_flags &
367 				 PKT_TX_TUNNEL_MASK);
368 	uint16_t n_wqe;
369 
370 	*tso_segsz = buf->tso_segsz;
371 	*tso_header_sz = buf->l2_len + vlan_sz + buf->l3_len + buf->l4_len;
372 	if (unlikely(*tso_segsz == 0 || *tso_header_sz == 0)) {
373 		txq->stats.oerrors++;
374 		return -EINVAL;
375 	}
376 	if (tunneled)
377 		*tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
378 	/* First seg must contain all TSO headers. */
379 	if (unlikely(*tso_header_sz > MLX5_MAX_TSO_HEADER) ||
380 		     *tso_header_sz > DATA_LEN(buf)) {
381 		txq->stats.oerrors++;
382 		return -EINVAL;
383 	}
384 	copy_b = *tso_header_sz - *pkt_inline_sz;
385 	if (!copy_b || ((end - (uintptr_t)*raw) < copy_b))
386 		return -EAGAIN;
387 	n_wqe = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
388 	if (unlikely(*max_wqe < n_wqe))
389 		return -EINVAL;
390 	*max_wqe -= n_wqe;
391 	rte_memcpy((void *)*raw, (void *)*addr, copy_b);
392 	*length -= copy_b;
393 	*addr += copy_b;
394 	copy_b = MLX5_WQE_DS(copy_b) * MLX5_WQE_DWORD_SIZE;
395 	*pkt_inline_sz += copy_b;
396 	*raw += copy_b;
397 	return 0;
398 }
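
/*
 * Worked example (illustration only, assuming MLX5_WQE_DS() counts
 * 16-byte units): with tso_header_sz = 66 and pkt_inline_sz = 18,
 * copy_b = 48 bytes of headers are inlined, n_wqe = (3 - 1 + 3) / 4 = 1
 * WQEBB is reserved from max_wqe, and raw/pkt_inline_sz advance by the
 * padded size MLX5_WQE_DS(48) * MLX5_WQE_DWORD_SIZE = 48 bytes.
 */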
399 
400 /**
401  * DPDK callback to check the status of a tx descriptor.
402  *
403  * @param tx_queue
404  *   The tx queue.
405  * @param[in] offset
406  *   The index of the descriptor in the ring.
407  *
408  * @return
409  *   The status of the tx descriptor.
410  */
411 int
412 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
413 {
414 	struct mlx5_txq_data *txq = tx_queue;
415 	uint16_t used;
416 
417 	mlx5_tx_complete(txq);
418 	used = txq->elts_head - txq->elts_tail;
419 	if (offset < used)
420 		return RTE_ETH_TX_DESC_FULL;
421 	return RTE_ETH_TX_DESC_DONE;
422 }
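
/*
 * Usage sketch (application side, illustration only): this callback is
 * normally reached through the generic ethdev API, e.g.
 *
 *	int status = rte_eth_tx_descriptor_status(port_id, queue_id,
 *						   offset);
 *
 * which dispatches here and returns RTE_ETH_TX_DESC_FULL while the
 * descriptor is still held by the ring, or RTE_ETH_TX_DESC_DONE once it
 * has completed. port_id, queue_id and offset are application-chosen
 * values.
 */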
423 
424 /**
425  * Internal function to compute the number of used descriptors in an Rx queue.
426  *
427  * @param rxq
428  *   The Rx queue.
429  *
430  * @return
431  *   The number of used Rx descriptors.
432  */
433 static uint32_t
434 rx_queue_count(struct mlx5_rxq_data *rxq)
435 {
436 	struct rxq_zip *zip = &rxq->zip;
437 	volatile struct mlx5_cqe *cqe;
438 	const unsigned int cqe_n = (1 << rxq->cqe_n);
439 	const unsigned int cqe_cnt = cqe_n - 1;
440 	unsigned int cq_ci;
441 	unsigned int used;
442 
443 	/* If we are processing a compressed CQE. */
444 	if (zip->ai) {
445 		used = zip->cqe_cnt - zip->ca;
446 		cq_ci = zip->cq_ci;
447 	} else {
448 		used = 0;
449 		cq_ci = rxq->cq_ci;
450 	}
451 	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
452 	while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
453 		int8_t op_own;
454 		unsigned int n;
455 
456 		op_own = cqe->op_own;
457 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
458 			n = rte_be_to_cpu_32(cqe->byte_cnt);
459 		else
460 			n = 1;
461 		cq_ci += n;
462 		used += n;
463 		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
464 	}
465 	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
466 	return used;
467 }
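
/*
 * Note for illustration: when a compressed CQE is met, its byte_cnt
 * field holds the number of mini-CQEs it stands for, so the loop above
 * advances cq_ci and the used counter by that amount instead of one,
 * and the result is finally clamped to the ring size minus one.
 */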
468 
469 /**
470  * DPDK callback to check the status of a rx descriptor.
471  *
472  * @param rx_queue
473  *   The Rx queue.
474  * @param[in] offset
475  *   The index of the descriptor in the ring.
476  *
477  * @return
478  *   The status of the Rx descriptor.
479  */
480 int
481 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
482 {
483 	struct mlx5_rxq_data *rxq = rx_queue;
484 	struct mlx5_rxq_ctrl *rxq_ctrl =
485 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
486 	struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);
487 
488 	if (dev->rx_pkt_burst != mlx5_rx_burst) {
489 		rte_errno = ENOTSUP;
490 		return -rte_errno;
491 	}
492 	if (offset >= (1 << rxq->elts_n)) {
493 		rte_errno = EINVAL;
494 		return -rte_errno;
495 	}
496 	if (offset < rx_queue_count(rxq))
497 		return RTE_ETH_RX_DESC_DONE;
498 	return RTE_ETH_RX_DESC_AVAIL;
499 }
500 
501 /**
502  * DPDK callback to get the number of used descriptors in an Rx queue.
503  *
504  * @param dev
505  *   Pointer to the device structure.
506  *
507  * @param rx_queue_id
508  *   The Rx queue.
509  *
510  * @return
511  *   The number of used Rx descriptors.
512  *   -EINVAL if the queue is invalid.
513  */
514 uint32_t
515 mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
516 {
517 	struct mlx5_priv *priv = dev->data->dev_private;
518 	struct mlx5_rxq_data *rxq;
519 
520 	if (dev->rx_pkt_burst != mlx5_rx_burst) {
521 		rte_errno = ENOTSUP;
522 		return -rte_errno;
523 	}
524 	rxq = (*priv->rxqs)[rx_queue_id];
525 	if (!rxq) {
526 		rte_errno = EINVAL;
527 		return -rte_errno;
528 	}
529 	return rx_queue_count(rxq);
530 }
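
/*
 * Usage sketch (application side, illustration only):
 *
 *	int used = rte_eth_rx_queue_count(port_id, rx_queue_id);
 *
 * dispatches to this callback; port_id and rx_queue_id are
 * application-chosen values.
 */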
531 
532 #define MLX5_SYSTEM_LOG_DIR "/var/log"
533 /**
534  * Dump debug information to log file.
535  *
536  * @param fname
537  *   The file name.
538  * @param hex_title
539  *   If not NULL, this string is printed as a header to the output
540  *   and the buffer is dumped in hexadecimal view.
541  * @param buf
542  *   The buffer address to dump out.
543  * @param hex_len
544  *   The number of bytes to dump out.
545  */
546 void
547 mlx5_dump_debug_information(const char *fname, const char *hex_title,
548 			    const void *buf, unsigned int hex_len)
549 {
550 	FILE *fd;
551 
552 	MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname);
553 	fd = fopen(path, "a+");
554 	if (!fd) {
555 		DRV_LOG(WARNING, "cannot open %s for debug dump\n",
556 			path);
557 		MKSTR(path2, "./%s", fname);
558 		fd = fopen(path2, "a+");
559 		if (!fd) {
560 			DRV_LOG(ERR, "cannot open %s for debug dump\n",
561 				path2);
562 			return;
563 		}
564 		DRV_LOG(INFO, "New debug dump in file %s\n", path2);
565 	} else {
566 		DRV_LOG(INFO, "New debug dump in file %s\n", path);
567 	}
568 	if (hex_title)
569 		rte_hexdump(fd, hex_title, buf, hex_len);
570 	else
571 		fprintf(fd, "%s", (const char *)buf);
572 	fprintf(fd, "\n\n\n");
573 	fclose(fd);
574 }
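
/*
 * Usage sketch (illustration only): a caller such as the Tx error
 * handler below invokes
 *
 *	mlx5_dump_debug_information("dump_name", "Title:", buf, len);
 *
 * which appends either a hex dump (non-NULL title) or the raw text in
 * buf to /var/log/dump_name, falling back to ./dump_name when the file
 * cannot be opened. "dump_name", buf and len are caller-chosen.
 */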
575 
576 /**
577  * Move QP from error state to running state and initialize indexes.
578  *
579  * @param txq_ctrl
580  *   Pointer to TX queue control structure.
581  *
582  * @return
583  *   0 on success, else -1.
584  */
585 static int
586 tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl)
587 {
588 	struct mlx5_mp_arg_queue_state_modify sm = {
589 			.is_wq = 0,
590 			.queue_id = txq_ctrl->txq.idx,
591 	};
592 
593 	if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm))
594 		return -1;
595 	txq_ctrl->txq.wqe_ci = 0;
596 	txq_ctrl->txq.wqe_pi = 0;
597 	txq_ctrl->txq.elts_comp = 0;
598 	return 0;
599 }
600 
601 /* Return 1 if the error CQE is already signed; else sign it and return 0. */
602 static int
603 check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
604 {
605 	static const uint8_t magic[] = "seen";
606 	int ret = 1;
607 	unsigned int i;
608 
609 	for (i = 0; i < sizeof(magic); ++i)
610 		if (!ret || err_cqe->rsvd1[i] != magic[i]) {
611 			ret = 0;
612 			err_cqe->rsvd1[i] = magic[i];
613 		}
614 	return ret;
615 }
616 
617 /**
618  * Handle error CQE.
619  *
620  * @param txq
621  *   Pointer to TX queue structure.
622  * @param err_cqe
623  *   Pointer to the error CQE.
624  *
625  * @return
626  *   The last Tx buffer element to free.
627  */
628 uint16_t
629 mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
630 			 volatile struct mlx5_err_cqe *err_cqe)
631 {
632 	if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
633 		const uint16_t wqe_m = ((1 << txq->wqe_n) - 1);
634 		struct mlx5_txq_ctrl *txq_ctrl =
635 				container_of(txq, struct mlx5_txq_ctrl, txq);
636 		uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter);
637 		int seen = check_err_cqe_seen(err_cqe);
638 
639 		if (!seen && txq_ctrl->dump_file_n <
640 		    txq_ctrl->priv->config.max_dump_files_num) {
641 			MKSTR(err_str, "Unexpected CQE error syndrome "
642 			      "0x%02x CQN = %u SQN = %u wqe_counter = %u "
643 			      "wq_ci = %u cq_ci = %u", err_cqe->syndrome,
644 			      txq_ctrl->cqn, txq->qp_num_8s >> 8,
645 			      rte_be_to_cpu_16(err_cqe->wqe_counter),
646 			      txq->wqe_ci, txq->cq_ci);
647 			MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u",
648 			      PORT_ID(txq_ctrl->priv), txq->idx,
649 			      txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc());
650 			mlx5_dump_debug_information(name, NULL, err_str, 0);
651 			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
652 						    (const void *)((uintptr_t)
653 						    &(*txq->cqes)[0]),
654 						    sizeof(*err_cqe) *
655 						    (1 << txq->cqe_n));
656 			mlx5_dump_debug_information(name, "MLX5 Error SQ:",
657 						    (const void *)((uintptr_t)
658 						    tx_mlx5_wqe(txq, 0)),
659 						    MLX5_WQE_SIZE *
660 						    (1 << txq->wqe_n));
661 			txq_ctrl->dump_file_n++;
662 		}
663 		if (!seen)
664 			/*
665 			 * Count errors in WQE units.
666 			 * Later this can be improved to count error packets,
667 			 * for example, by parsing the SQ to find how many
668 			 * packets should be counted for each WQE.
669 			 */
670 			txq->stats.oerrors += ((txq->wqe_ci & wqe_m) -
671 						new_wqe_pi) & wqe_m;
672 		if (tx_recover_qp(txq_ctrl) == 0) {
673 			txq->cq_ci++;
674 			/* Release all the remaining buffers. */
675 			return txq->elts_head;
676 		}
677 		/* Recovering failed - try again later on the same WQE. */
678 	} else {
679 		txq->cq_ci++;
680 	}
681 	/* Do not release buffers. */
682 	return txq->elts_tail;
683 }
684 
685 /**
686  * DPDK callback for TX.
687  *
688  * @param dpdk_txq
689  *   Generic pointer to TX queue structure.
690  * @param[in] pkts
691  *   Packets to transmit.
692  * @param pkts_n
693  *   Number of packets in array.
694  *
695  * @return
696  *   Number of packets successfully transmitted (<= pkts_n).
697  */
698 uint16_t
699 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
700 {
701 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
702 	uint16_t elts_head = txq->elts_head;
703 	const uint16_t elts_n = 1 << txq->elts_n;
704 	const uint16_t elts_m = elts_n - 1;
705 	unsigned int i = 0;
706 	unsigned int j = 0;
707 	unsigned int k = 0;
708 	uint16_t max_elts;
709 	uint16_t max_wqe;
710 	unsigned int comp;
711 	volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
712 	unsigned int segs_n = 0;
713 	const unsigned int max_inline = txq->max_inline;
714 	uint64_t addr_64;
715 
716 	if (unlikely(!pkts_n))
717 		return 0;
718 	/* Prefetch first packet cacheline. */
719 	rte_prefetch0(*pkts);
720 	/* Start processing. */
721 	mlx5_tx_complete(txq);
722 	max_elts = (elts_n - (elts_head - txq->elts_tail));
723 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
724 	if (unlikely(!max_wqe))
725 		return 0;
726 	do {
727 		struct rte_mbuf *buf = *pkts; /* First_seg. */
728 		uint8_t *raw;
729 		volatile struct mlx5_wqe_v *wqe = NULL;
730 		volatile rte_v128u32_t *dseg = NULL;
731 		uint32_t length;
732 		unsigned int ds = 0;
733 		unsigned int sg = 0; /* counter of additional segs attached. */
734 		uintptr_t addr;
735 		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
736 		uint16_t tso_header_sz = 0;
737 		uint16_t ehdr;
738 		uint8_t cs_flags;
739 		uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
740 		uint32_t swp_offsets = 0;
741 		uint8_t swp_types = 0;
742 		rte_be32_t metadata;
743 		uint16_t tso_segsz = 0;
744 #ifdef MLX5_PMD_SOFT_COUNTERS
745 		uint32_t total_length = 0;
746 #endif
747 		int ret;
748 
749 		segs_n = buf->nb_segs;
750 		/*
751 		 * Make sure there is enough room to store this packet and
752 		 * that one ring entry remains unused.
753 		 */
754 		assert(segs_n);
755 		if (max_elts < segs_n)
756 			break;
757 		max_elts -= segs_n;
758 		sg = --segs_n;
759 		if (unlikely(--max_wqe == 0))
760 			break;
761 		wqe = (volatile struct mlx5_wqe_v *)
762 			tx_mlx5_wqe(txq, txq->wqe_ci);
763 		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
764 		if (pkts_n - i > 1)
765 			rte_prefetch0(*(pkts + 1));
766 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
767 		length = DATA_LEN(buf);
768 		ehdr = (((uint8_t *)addr)[1] << 8) |
769 		       ((uint8_t *)addr)[0];
770 #ifdef MLX5_PMD_SOFT_COUNTERS
771 		total_length = length;
772 #endif
773 		if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
774 			txq->stats.oerrors++;
775 			break;
776 		}
777 		/* Update element. */
778 		(*txq->elts)[elts_head & elts_m] = buf;
779 		/* Prefetch next buffer data. */
780 		if (pkts_n - i > 1)
781 			rte_prefetch0(
782 			    rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
783 		cs_flags = txq_ol_cksum_to_cs(buf);
784 		txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types);
785 		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
786 		/* Copy metadata from mbuf if valid */
787 		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
788 							     0;
789 		/* Replace the Ethernet type by the VLAN if necessary. */
790 		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
791 			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
792 							 buf->vlan_tci);
793 			unsigned int len = 2 * RTE_ETHER_ADDR_LEN - 2;
794 
795 			addr += 2;
796 			length -= 2;
797 			/* Copy Destination and source mac address. */
798 			memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
799 			/* Copy VLAN. */
800 			memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
801 			/* Copy missing two bytes to end the DSeg. */
802 			memcpy((uint8_t *)raw + len + sizeof(vlan),
803 			       ((uint8_t *)addr) + len, 2);
804 			addr += len + 2;
805 			length -= (len + 2);
806 		} else {
807 			memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
808 			       MLX5_WQE_DWORD_SIZE);
809 			length -= pkt_inline_sz;
810 			addr += pkt_inline_sz;
811 		}
812 		raw += MLX5_WQE_DWORD_SIZE;
813 		if (tso) {
814 			ret = inline_tso(txq, buf, &length,
815 					 &addr, &pkt_inline_sz,
816 					 &raw, &max_wqe,
817 					 &tso_segsz, &tso_header_sz);
818 			if (ret == -EINVAL) {
819 				break;
820 			} else if (ret == -EAGAIN) {
821 				/* NOP WQE. */
822 				wqe->ctrl = (rte_v128u32_t){
823 					rte_cpu_to_be_32(txq->wqe_ci << 8),
824 					rte_cpu_to_be_32(txq->qp_num_8s | 1),
825 					rte_cpu_to_be_32
826 						(MLX5_COMP_ONLY_FIRST_ERR <<
827 						 MLX5_COMP_MODE_OFFSET),
828 					0,
829 				};
830 				ds = 1;
831 #ifdef MLX5_PMD_SOFT_COUNTERS
832 				total_length = 0;
833 #endif
834 				k++;
835 				goto next_wqe;
836 			}
837 		}
838 		/* Inline if enough room. */
839 		if (max_inline || tso) {
840 			uint32_t inl = 0;
841 			uintptr_t end = (uintptr_t)
842 				(((uintptr_t)txq->wqes) +
843 				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
844 			unsigned int inline_room = max_inline *
845 						   RTE_CACHE_LINE_SIZE -
846 						   (pkt_inline_sz - 2) -
847 						   !!tso * sizeof(inl);
848 			uintptr_t addr_end;
849 			unsigned int copy_b;
850 
851 pkt_inline:
852 			addr_end = RTE_ALIGN_FLOOR(addr + inline_room,
853 						   RTE_CACHE_LINE_SIZE);
854 			copy_b = (addr_end > addr) ?
855 				 RTE_MIN((addr_end - addr), length) : 0;
856 			if (copy_b && ((end - (uintptr_t)raw) >
857 				       (copy_b + sizeof(inl)))) {
858 				/*
859 				 * One Dseg remains in the current WQE.  To
860 				 * keep the computation positive, it is
861 				 * removed after the bytes to Dseg conversion.
862 				 */
863 				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
864 
865 				if (unlikely(max_wqe < n))
866 					break;
867 				max_wqe -= n;
868 				if (tso) {
869 					assert(inl == 0);
870 					inl = rte_cpu_to_be_32(copy_b |
871 							       MLX5_INLINE_SEG);
872 					rte_memcpy((void *)raw,
873 						   (void *)&inl, sizeof(inl));
874 					raw += sizeof(inl);
875 					pkt_inline_sz += sizeof(inl);
876 				}
877 				rte_memcpy((void *)raw, (void *)addr, copy_b);
878 				addr += copy_b;
879 				length -= copy_b;
880 				pkt_inline_sz += copy_b;
881 			}
882 			/*
883 			 * 2 DWORDs consumed by the WQE header + ETH segment +
884 			 * the size of the inline part of the packet.
885 			 */
886 			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
887 			if (length > 0) {
888 				if (ds % (MLX5_WQE_SIZE /
889 					  MLX5_WQE_DWORD_SIZE) == 0) {
890 					if (unlikely(--max_wqe == 0))
891 						break;
892 					dseg = (volatile rte_v128u32_t *)
893 					       tx_mlx5_wqe(txq, txq->wqe_ci +
894 							   ds / 4);
895 				} else {
896 					dseg = (volatile rte_v128u32_t *)
897 						((uintptr_t)wqe +
898 						 (ds * MLX5_WQE_DWORD_SIZE));
899 				}
900 				goto use_dseg;
901 			} else if (!segs_n) {
902 				goto next_pkt;
903 			} else {
904 				/*
905 				 * Further inline the next segment only for
906 				 * non-TSO packets.
907 				 */
908 				if (!tso) {
909 					raw += copy_b;
910 					inline_room -= copy_b;
911 				} else {
912 					inline_room = 0;
913 				}
914 				/* Move to the next segment. */
915 				--segs_n;
916 				buf = buf->next;
917 				assert(buf);
918 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
919 				length = DATA_LEN(buf);
920 #ifdef MLX5_PMD_SOFT_COUNTERS
921 				total_length += length;
922 #endif
923 				(*txq->elts)[++elts_head & elts_m] = buf;
924 				goto pkt_inline;
925 			}
926 		} else {
927 			/*
928 			 * No inline has been done in the packet; only the
929 			 * Ethernet header has been stored.
930 			 */
931 			dseg = (volatile rte_v128u32_t *)
932 				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
933 			ds = 3;
934 use_dseg:
935 			/* Add the remaining packet as a simple ds. */
936 			addr_64 = rte_cpu_to_be_64(addr);
937 			*dseg = (rte_v128u32_t){
938 				rte_cpu_to_be_32(length),
939 				mlx5_tx_mb2mr(txq, buf),
940 				addr_64,
941 				addr_64 >> 32,
942 			};
943 			++ds;
944 			if (!segs_n)
945 				goto next_pkt;
946 		}
947 next_seg:
948 		assert(buf);
949 		assert(ds);
950 		assert(wqe);
951 		/*
952 		 * Spill over to the next WQE when the current one does not
953 		 * have enough room left. The WQE size must be a multiple
954 		 * of the data segment size.
955 		 */
956 		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
957 		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
958 			if (unlikely(--max_wqe == 0))
959 				break;
960 			dseg = (volatile rte_v128u32_t *)
961 			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
962 			rte_prefetch0(tx_mlx5_wqe(txq,
963 						  txq->wqe_ci + ds / 4 + 1));
964 		} else {
965 			++dseg;
966 		}
967 		++ds;
968 		buf = buf->next;
969 		assert(buf);
970 		length = DATA_LEN(buf);
971 #ifdef MLX5_PMD_SOFT_COUNTERS
972 		total_length += length;
973 #endif
974 		/* Store segment information. */
975 		addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
976 		*dseg = (rte_v128u32_t){
977 			rte_cpu_to_be_32(length),
978 			mlx5_tx_mb2mr(txq, buf),
979 			addr_64,
980 			addr_64 >> 32,
981 		};
982 		(*txq->elts)[++elts_head & elts_m] = buf;
983 		if (--segs_n)
984 			goto next_seg;
985 next_pkt:
986 		if (ds > MLX5_DSEG_MAX) {
987 			txq->stats.oerrors++;
988 			break;
989 		}
990 		++elts_head;
991 		++pkts;
992 		++i;
993 		j += sg;
994 		/* Initialize known and common part of the WQE structure. */
995 		if (tso) {
996 			wqe->ctrl = (rte_v128u32_t){
997 				rte_cpu_to_be_32((txq->wqe_ci << 8) |
998 						 MLX5_OPCODE_TSO),
999 				rte_cpu_to_be_32(txq->qp_num_8s | ds),
1000 				rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
1001 						 MLX5_COMP_MODE_OFFSET),
1002 				0,
1003 			};
1004 			wqe->eseg = (rte_v128u32_t){
1005 				swp_offsets,
1006 				cs_flags | (swp_types << 8) |
1007 				(rte_cpu_to_be_16(tso_segsz) << 16),
1008 				metadata,
1009 				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
1010 			};
1011 		} else {
1012 			wqe->ctrl = (rte_v128u32_t){
1013 				rte_cpu_to_be_32((txq->wqe_ci << 8) |
1014 						 MLX5_OPCODE_SEND),
1015 				rte_cpu_to_be_32(txq->qp_num_8s | ds),
1016 				rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
1017 						 MLX5_COMP_MODE_OFFSET),
1018 				0,
1019 			};
1020 			wqe->eseg = (rte_v128u32_t){
1021 				swp_offsets,
1022 				cs_flags | (swp_types << 8),
1023 				metadata,
1024 				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
1025 			};
1026 		}
1027 next_wqe:
1028 		txq->wqe_ci += (ds + 3) / 4;
1029 		/* Save the last successful WQE for completion request */
1030 		last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
1031 #ifdef MLX5_PMD_SOFT_COUNTERS
1032 		/* Increment sent bytes counter. */
1033 		txq->stats.obytes += total_length;
1034 #endif
1035 	} while (i < pkts_n);
1036 	/* Take a shortcut if nothing must be sent. */
1037 	if (unlikely((i + k) == 0))
1038 		return 0;
1039 	txq->elts_head += (i + j);
1040 	/* Check whether completion threshold has been reached. */
1041 	comp = txq->elts_comp + i + j + k;
1042 	if (comp >= MLX5_TX_COMP_THRESH) {
1043 		/* A CQE slot must always be available. */
1044 		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
1045 		/* Request completion on last WQE. */
1046 		last_wqe->ctrl2 = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
1047 						   MLX5_COMP_MODE_OFFSET);
1048 		/* Save elts_head in unused "immediate" field of WQE. */
1049 		last_wqe->ctrl3 = txq->elts_head;
1050 		txq->elts_comp = 0;
1051 	} else {
1052 		txq->elts_comp = comp;
1053 	}
1054 #ifdef MLX5_PMD_SOFT_COUNTERS
1055 	/* Increment sent packets counter. */
1056 	txq->stats.opackets += i;
1057 #endif
1058 	/* Ring QP doorbell. */
1059 	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
1060 	return i;
1061 }
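
/*
 * Worked example for the WQE accounting above (illustration only,
 * assuming 16-byte data segments and 64-byte WQEBBs): a single-segment
 * packet with no extra inlining keeps pkt_inline_sz = 18, so the WQE
 * holds the control and Ethernet segments plus the inlined Ethernet
 * header (ds = 3), the pointer DSEG raises ds to 4, and txq->wqe_ci
 * advances by (4 + 3) / 4 = 1 WQEBB.
 */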
1062 
1063 /**
1064  * Open an MPW session.
1065  *
1066  * @param txq
1067  *   Pointer to TX queue structure.
1068  * @param mpw
1069  *   Pointer to MPW session structure.
1070  * @param length
1071  *   Packet length.
1072  */
1073 static inline void
1074 mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
1075 {
1076 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1077 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
1078 		(volatile struct mlx5_wqe_data_seg (*)[])
1079 		tx_mlx5_wqe(txq, idx + 1);
1080 
1081 	mpw->state = MLX5_MPW_STATE_OPENED;
1082 	mpw->pkts_n = 0;
1083 	mpw->len = length;
1084 	mpw->total_len = 0;
1085 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1086 	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
1087 	mpw->wqe->eseg.inline_hdr_sz = 0;
1088 	mpw->wqe->eseg.rsvd0 = 0;
1089 	mpw->wqe->eseg.rsvd1 = 0;
1090 	mpw->wqe->eseg.flow_table_metadata = 0;
1091 	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
1092 					     (txq->wqe_ci << 8) |
1093 					     MLX5_OPCODE_TSO);
1094 	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
1095 					     MLX5_COMP_MODE_OFFSET);
1096 	mpw->wqe->ctrl[3] = 0;
1097 	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
1098 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
1099 	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
1100 		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
1101 	mpw->data.dseg[2] = &(*dseg)[0];
1102 	mpw->data.dseg[3] = &(*dseg)[1];
1103 	mpw->data.dseg[4] = &(*dseg)[2];
1104 }
1105 
1106 /**
1107  * Close an MPW session.
1108  *
1109  * @param txq
1110  *   Pointer to TX queue structure.
1111  * @param mpw
1112  *   Pointer to MPW session structure.
1113  */
1114 static inline void
1115 mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
1116 {
1117 	unsigned int num = mpw->pkts_n;
1118 
1119 	/*
1120 	 * Store size in multiples of 16 bytes. Control and Ethernet segments
1121 	 * count as 2.
1122 	 */
1123 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
1124 	mpw->state = MLX5_MPW_STATE_CLOSED;
1125 	if (num < 3)
1126 		++txq->wqe_ci;
1127 	else
1128 		txq->wqe_ci += 2;
1129 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1130 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1131 }
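
/*
 * Worked example (illustration only): the MPW title WQEBB holds the
 * control and Ethernet segments (counted as 2) plus two data segments,
 * so with num < 3 packets everything fits in one WQEBB and wqe_ci
 * advances by one; with 3 to 5 packets the remaining data segments
 * spill into the next WQEBB and wqe_ci advances by two, as done above.
 */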
1132 
1133 /**
1134  * DPDK callback for TX with MPW support.
1135  *
1136  * @param dpdk_txq
1137  *   Generic pointer to TX queue structure.
1138  * @param[in] pkts
1139  *   Packets to transmit.
1140  * @param pkts_n
1141  *   Number of packets in array.
1142  *
1143  * @return
1144  *   Number of packets successfully transmitted (<= pkts_n).
1145  */
1146 uint16_t
1147 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1148 {
1149 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1150 	uint16_t elts_head = txq->elts_head;
1151 	const uint16_t elts_n = 1 << txq->elts_n;
1152 	const uint16_t elts_m = elts_n - 1;
1153 	unsigned int i = 0;
1154 	unsigned int j = 0;
1155 	uint16_t max_elts;
1156 	uint16_t max_wqe;
1157 	unsigned int comp;
1158 	struct mlx5_mpw mpw = {
1159 		.state = MLX5_MPW_STATE_CLOSED,
1160 	};
1161 
1162 	if (unlikely(!pkts_n))
1163 		return 0;
1164 	/* Prefetch first packet cacheline. */
1165 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1166 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1167 	/* Start processing. */
1168 	mlx5_tx_complete(txq);
1169 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1170 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1171 	if (unlikely(!max_wqe))
1172 		return 0;
1173 	do {
1174 		struct rte_mbuf *buf = *(pkts++);
1175 		uint32_t length;
1176 		unsigned int segs_n = buf->nb_segs;
1177 		uint32_t cs_flags;
1178 		rte_be32_t metadata;
1179 
1180 		/*
1181 		 * Make sure there is enough room to store this packet and
1182 		 * that one ring entry remains unused.
1183 		 */
1184 		assert(segs_n);
1185 		if (max_elts < segs_n)
1186 			break;
1187 		/* Do not bother with large packets MPW cannot handle. */
1188 		if (segs_n > MLX5_MPW_DSEG_MAX) {
1189 			txq->stats.oerrors++;
1190 			break;
1191 		}
1192 		max_elts -= segs_n;
1193 		--pkts_n;
1194 		cs_flags = txq_ol_cksum_to_cs(buf);
1195 		/* Copy metadata from mbuf if valid */
1196 		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
1197 							     0;
1198 		/* Retrieve packet information. */
1199 		length = PKT_LEN(buf);
1200 		assert(length);
1201 		/* Start new session if packet differs. */
1202 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
1203 		    ((mpw.len != length) ||
1204 		     (segs_n != 1) ||
1205 		     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
1206 		     (mpw.wqe->eseg.cs_flags != cs_flags)))
1207 			mlx5_mpw_close(txq, &mpw);
1208 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1209 			/*
1210 			 * A Multi-Packet WQE consumes at most two WQEs.
1211 			 * mlx5_mpw_new() expects to be able to use such
1212 			 * resources.
1213 			 */
1214 			if (unlikely(max_wqe < 2))
1215 				break;
1216 			max_wqe -= 2;
1217 			mlx5_mpw_new(txq, &mpw, length);
1218 			mpw.wqe->eseg.cs_flags = cs_flags;
1219 			mpw.wqe->eseg.flow_table_metadata = metadata;
1220 		}
1221 		/* Multi-segment packets must be alone in their MPW. */
1222 		assert((segs_n == 1) || (mpw.pkts_n == 0));
1223 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1224 		length = 0;
1225 #endif
1226 		do {
1227 			volatile struct mlx5_wqe_data_seg *dseg;
1228 			uintptr_t addr;
1229 
1230 			assert(buf);
1231 			(*txq->elts)[elts_head++ & elts_m] = buf;
1232 			dseg = mpw.data.dseg[mpw.pkts_n];
1233 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1234 			*dseg = (struct mlx5_wqe_data_seg){
1235 				.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
1236 				.lkey = mlx5_tx_mb2mr(txq, buf),
1237 				.addr = rte_cpu_to_be_64(addr),
1238 			};
1239 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1240 			length += DATA_LEN(buf);
1241 #endif
1242 			buf = buf->next;
1243 			++mpw.pkts_n;
1244 			++j;
1245 		} while (--segs_n);
1246 		assert(length == mpw.len);
1247 		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1248 			mlx5_mpw_close(txq, &mpw);
1249 #ifdef MLX5_PMD_SOFT_COUNTERS
1250 		/* Increment sent bytes counter. */
1251 		txq->stats.obytes += length;
1252 #endif
1253 		++i;
1254 	} while (pkts_n);
1255 	/* Take a shortcut if nothing must be sent. */
1256 	if (unlikely(i == 0))
1257 		return 0;
1258 	/* Check whether completion threshold has been reached. */
1259 	/* "j" includes both packets and segments. */
1260 	comp = txq->elts_comp + j;
1261 	if (comp >= MLX5_TX_COMP_THRESH) {
1262 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1263 
1264 		/* A CQE slot must always be available. */
1265 		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
1266 		/* Request completion on last WQE. */
1267 		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
1268 						MLX5_COMP_MODE_OFFSET);
1269 		/* Save elts_head in unused "immediate" field of WQE. */
1270 		wqe->ctrl[3] = elts_head;
1271 		txq->elts_comp = 0;
1272 	} else {
1273 		txq->elts_comp = comp;
1274 	}
1275 #ifdef MLX5_PMD_SOFT_COUNTERS
1276 	/* Increment sent packets counter. */
1277 	txq->stats.opackets += i;
1278 #endif
1279 	/* Ring QP doorbell. */
1280 	if (mpw.state == MLX5_MPW_STATE_OPENED)
1281 		mlx5_mpw_close(txq, &mpw);
1282 	mlx5_tx_dbrec(txq, mpw.wqe);
1283 	txq->elts_head = elts_head;
1284 	return i;
1285 }
1286 
1287 /**
1288  * Open an MPW inline session.
1289  *
1290  * @param txq
1291  *   Pointer to TX queue structure.
1292  * @param mpw
1293  *   Pointer to MPW session structure.
1294  * @param length
1295  *   Packet length.
1296  */
1297 static inline void
1298 mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
1299 		    uint32_t length)
1300 {
1301 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1302 	struct mlx5_wqe_inl_small *inl;
1303 
1304 	mpw->state = MLX5_MPW_INL_STATE_OPENED;
1305 	mpw->pkts_n = 0;
1306 	mpw->len = length;
1307 	mpw->total_len = 0;
1308 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1309 	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
1310 					     (txq->wqe_ci << 8) |
1311 					     MLX5_OPCODE_TSO);
1312 	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
1313 					     MLX5_COMP_MODE_OFFSET);
1314 	mpw->wqe->ctrl[3] = 0;
1315 	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
1316 	mpw->wqe->eseg.inline_hdr_sz = 0;
1317 	mpw->wqe->eseg.cs_flags = 0;
1318 	mpw->wqe->eseg.rsvd0 = 0;
1319 	mpw->wqe->eseg.rsvd1 = 0;
1320 	mpw->wqe->eseg.flow_table_metadata = 0;
1321 	inl = (struct mlx5_wqe_inl_small *)
1322 		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
1323 	mpw->data.raw = (uint8_t *)&inl->raw;
1324 }
1325 
1326 /**
1327  * Close an MPW inline session.
1328  *
1329  * @param txq
1330  *   Pointer to TX queue structure.
1331  * @param mpw
1332  *   Pointer to MPW session structure.
1333  */
1334 static inline void
1335 mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
1336 {
1337 	unsigned int size;
1338 	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
1339 		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
1340 
1341 	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
1342 	/*
1343 	 * Store size in multiples of 16 bytes. Control and Ethernet segments
1344 	 * count as 2.
1345 	 */
1346 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
1347 					     MLX5_WQE_DS(size));
1348 	mpw->state = MLX5_MPW_STATE_CLOSED;
1349 	inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG);
1350 	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1351 }
1352 
1353 /**
1354  * DPDK callback for TX with MPW inline support.
1355  *
1356  * @param dpdk_txq
1357  *   Generic pointer to TX queue structure.
1358  * @param[in] pkts
1359  *   Packets to transmit.
1360  * @param pkts_n
1361  *   Number of packets in array.
1362  *
1363  * @return
1364  *   Number of packets successfully transmitted (<= pkts_n).
1365  */
1366 uint16_t
1367 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
1368 			 uint16_t pkts_n)
1369 {
1370 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1371 	uint16_t elts_head = txq->elts_head;
1372 	const uint16_t elts_n = 1 << txq->elts_n;
1373 	const uint16_t elts_m = elts_n - 1;
1374 	unsigned int i = 0;
1375 	unsigned int j = 0;
1376 	uint16_t max_elts;
1377 	uint16_t max_wqe;
1378 	unsigned int comp;
1379 	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
1380 	struct mlx5_mpw mpw = {
1381 		.state = MLX5_MPW_STATE_CLOSED,
1382 	};
1383 	/*
1384 	 * Compute the maximum number of WQEs which can be consumed by the
1385 	 * inline code.
1386 	 * - 2 DSEGs for:
1387 	 *   - 1 control segment,
1388 	 *   - 1 Ethernet segment,
1389 	 * - N DSEGs from the inline request.
1390 	 */
1391 	const unsigned int wqe_inl_n =
1392 		((2 * MLX5_WQE_DWORD_SIZE +
1393 		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
1394 		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
1395 
1396 	if (unlikely(!pkts_n))
1397 		return 0;
1398 	/* Prefetch first packet cacheline. */
1399 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1400 	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1401 	/* Start processing. */
1402 	mlx5_tx_complete(txq);
1403 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1404 	do {
1405 		struct rte_mbuf *buf = *(pkts++);
1406 		uintptr_t addr;
1407 		uint32_t length;
1408 		unsigned int segs_n = buf->nb_segs;
1409 		uint8_t cs_flags;
1410 		rte_be32_t metadata;
1411 
1412 		/*
1413 		 * Make sure there is enough room to store this packet and
1414 		 * that one ring entry remains unused.
1415 		 */
1416 		assert(segs_n);
1417 		if (max_elts < segs_n)
1418 			break;
1419 		/* Do not bother with large packets MPW cannot handle. */
1420 		if (segs_n > MLX5_MPW_DSEG_MAX) {
1421 			txq->stats.oerrors++;
1422 			break;
1423 		}
1424 		max_elts -= segs_n;
1425 		--pkts_n;
1426 		/*
1427 		 * Compute max_wqe in case fewer WQEs were consumed in the
1428 		 * previous iteration.
1429 		 */
1430 		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1431 		cs_flags = txq_ol_cksum_to_cs(buf);
1432 		/* Copy metadata from mbuf if valid */
1433 		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
1434 							     0;
1435 		/* Retrieve packet information. */
1436 		length = PKT_LEN(buf);
1437 		/* Start new session if packet differs. */
1438 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1439 			if ((mpw.len != length) ||
1440 			    (segs_n != 1) ||
1441 			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
1442 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1443 				mlx5_mpw_close(txq, &mpw);
1444 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
1445 			if ((mpw.len != length) ||
1446 			    (segs_n != 1) ||
1447 			    (length > inline_room) ||
1448 			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
1449 			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
1450 				mlx5_mpw_inline_close(txq, &mpw);
1451 				inline_room =
1452 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1453 			}
1454 		}
1455 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1456 			if ((segs_n != 1) ||
1457 			    (length > inline_room)) {
1458 				/*
1459 				 * A Multi-Packet WQE consumes at most two WQEs.
1460 				 * mlx5_mpw_new() expects to be able to use
1461 				 * such resources.
1462 				 */
1463 				if (unlikely(max_wqe < 2))
1464 					break;
1465 				max_wqe -= 2;
1466 				mlx5_mpw_new(txq, &mpw, length);
1467 				mpw.wqe->eseg.cs_flags = cs_flags;
1468 				mpw.wqe->eseg.flow_table_metadata = metadata;
1469 			} else {
1470 				if (unlikely(max_wqe < wqe_inl_n))
1471 					break;
1472 				max_wqe -= wqe_inl_n;
1473 				mlx5_mpw_inline_new(txq, &mpw, length);
1474 				mpw.wqe->eseg.cs_flags = cs_flags;
1475 				mpw.wqe->eseg.flow_table_metadata = metadata;
1476 			}
1477 		}
1478 		/* Multi-segment packets must be alone in their MPW. */
1479 		assert((segs_n == 1) || (mpw.pkts_n == 0));
1480 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1481 			assert(inline_room ==
1482 			       txq->max_inline * RTE_CACHE_LINE_SIZE);
1483 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1484 			length = 0;
1485 #endif
1486 			do {
1487 				volatile struct mlx5_wqe_data_seg *dseg;
1488 
1489 				assert(buf);
1490 				(*txq->elts)[elts_head++ & elts_m] = buf;
1491 				dseg = mpw.data.dseg[mpw.pkts_n];
1492 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
1493 				*dseg = (struct mlx5_wqe_data_seg){
1494 					.byte_count =
1495 					       rte_cpu_to_be_32(DATA_LEN(buf)),
1496 					.lkey = mlx5_tx_mb2mr(txq, buf),
1497 					.addr = rte_cpu_to_be_64(addr),
1498 				};
1499 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1500 				length += DATA_LEN(buf);
1501 #endif
1502 				buf = buf->next;
1503 				++mpw.pkts_n;
1504 				++j;
1505 			} while (--segs_n);
1506 			assert(length == mpw.len);
1507 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1508 				mlx5_mpw_close(txq, &mpw);
1509 		} else {
1510 			unsigned int max;
1511 
1512 			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
1513 			assert(length <= inline_room);
1514 			assert(length == DATA_LEN(buf));
1515 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1516 			(*txq->elts)[elts_head++ & elts_m] = buf;
1517 			/* Maximum number of bytes before wrapping. */
1518 			max = ((((uintptr_t)(txq->wqes)) +
1519 				(1 << txq->wqe_n) *
1520 				MLX5_WQE_SIZE) -
1521 			       (uintptr_t)mpw.data.raw);
1522 			if (length > max) {
1523 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1524 					   (void *)addr,
1525 					   max);
1526 				mpw.data.raw = (volatile void *)txq->wqes;
1527 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1528 					   (void *)(addr + max),
1529 					   length - max);
1530 				mpw.data.raw += length - max;
1531 			} else {
1532 				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1533 					   (void *)addr,
1534 					   length);
1535 
1536 				if (length == max)
1537 					mpw.data.raw =
1538 						(volatile void *)txq->wqes;
1539 				else
1540 					mpw.data.raw += length;
1541 			}
1542 			++mpw.pkts_n;
1543 			mpw.total_len += length;
1544 			++j;
1545 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
1546 				mlx5_mpw_inline_close(txq, &mpw);
1547 				inline_room =
1548 					txq->max_inline * RTE_CACHE_LINE_SIZE;
1549 			} else {
1550 				inline_room -= length;
1551 			}
1552 		}
1553 #ifdef MLX5_PMD_SOFT_COUNTERS
1554 		/* Increment sent bytes counter. */
1555 		txq->stats.obytes += length;
1556 #endif
1557 		++i;
1558 	} while (pkts_n);
1559 	/* Take a shortcut if nothing must be sent. */
1560 	if (unlikely(i == 0))
1561 		return 0;
1562 	/* Check whether completion threshold has been reached. */
1563 	/* "j" includes both packets and segments. */
1564 	comp = txq->elts_comp + j;
1565 	if (comp >= MLX5_TX_COMP_THRESH) {
1566 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1567 
1568 		/* A CQE slot must always be available. */
1569 		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
1570 		/* Request completion on last WQE. */
1571 		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
1572 						MLX5_COMP_MODE_OFFSET);
1573 		/* Save elts_head in unused "immediate" field of WQE. */
1574 		wqe->ctrl[3] = elts_head;
1575 		txq->elts_comp = 0;
1576 	} else {
1577 		txq->elts_comp = comp;
1578 	}
1579 #ifdef MLX5_PMD_SOFT_COUNTERS
1580 	/* Increment sent packets counter. */
1581 	txq->stats.opackets += i;
1582 #endif
1583 	/* Ring QP doorbell. */
1584 	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
1585 		mlx5_mpw_inline_close(txq, &mpw);
1586 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
1587 		mlx5_mpw_close(txq, &mpw);
1588 	mlx5_tx_dbrec(txq, mpw.wqe);
1589 	txq->elts_head = elts_head;
1590 	return i;
1591 }
1592 
1593 /**
1594  * Open an Enhanced MPW session.
1595  *
1596  * @param txq
1597  *   Pointer to TX queue structure.
1598  * @param mpw
1599  *   Pointer to MPW session structure.
1600  * @param length
1601  *   Packet length.
1602  */
1603 static inline void
1604 mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
1605 {
1606 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1607 
1608 	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
1609 	mpw->pkts_n = 0;
1610 	mpw->total_len = sizeof(struct mlx5_wqe);
1611 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1612 	mpw->wqe->ctrl[0] =
1613 		rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
1614 				 (txq->wqe_ci << 8) |
1615 				 MLX5_OPCODE_ENHANCED_MPSW);
1616 	mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
1617 					     MLX5_COMP_MODE_OFFSET);
1618 	mpw->wqe->ctrl[3] = 0;
1619 	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
1620 	if (unlikely(padding)) {
1621 		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
1622 
1623 		/* Pad the first 2 DWORDs with zero-length inline header. */
1624 		*(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
1625 		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
1626 			rte_cpu_to_be_32(MLX5_INLINE_SEG);
1627 		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
1628 		/* Start from the next WQEBB. */
1629 		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
1630 	} else {
1631 		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
1632 	}
1633 }
1634 
1635 /**
1636  * Close an Enhanced MPW session.
1637  *
1638  * @param txq
1639  *   Pointer to TX queue structure.
1640  * @param mpw
1641  *   Pointer to MPW session structure.
1642  *
1643  * @return
1644  *   Number of consumed WQEs.
1645  */
1646 static inline uint16_t
1647 mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
1648 {
1649 	uint16_t ret;
1650 
1651 	/* Store size in multiples of 16 bytes. Control and Ethernet segments
1652 	 * count as 2.
1653 	 */
1654 	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
1655 					     MLX5_WQE_DS(mpw->total_len));
1656 	mpw->state = MLX5_MPW_STATE_CLOSED;
1657 	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1658 	txq->wqe_ci += ret;
1659 	return ret;
1660 }
1661 
1662 /**
1663  * TX with Enhanced MPW support.
1664  *
1665  * @param txq
1666  *   Pointer to TX queue structure.
1667  * @param[in] pkts
1668  *   Packets to transmit.
1669  * @param pkts_n
1670  *   Number of packets in array.
1671  *
1672  * @return
1673  *   Number of packets successfully transmitted (<= pkts_n).
1674  */
1675 static inline uint16_t
1676 txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
1677 	       uint16_t pkts_n)
1678 {
1679 	uint16_t elts_head = txq->elts_head;
1680 	const uint16_t elts_n = 1 << txq->elts_n;
1681 	const uint16_t elts_m = elts_n - 1;
1682 	unsigned int i = 0;
1683 	unsigned int j = 0;
1684 	uint16_t max_elts;
1685 	uint16_t max_wqe;
1686 	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
1687 	unsigned int mpw_room = 0;
1688 	unsigned int inl_pad = 0;
1689 	uint32_t inl_hdr;
1690 	uint64_t addr_64;
1691 	struct mlx5_mpw mpw = {
1692 		.state = MLX5_MPW_STATE_CLOSED,
1693 	};
1694 
1695 	if (unlikely(!pkts_n))
1696 		return 0;
1697 	/* Start processing. */
1698 	mlx5_tx_complete(txq);
1699 	max_elts = (elts_n - (elts_head - txq->elts_tail));
1700 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1701 	if (unlikely(!max_wqe))
1702 		return 0;
1703 	do {
1704 		struct rte_mbuf *buf = *(pkts++);
1705 		uintptr_t addr;
1706 		unsigned int do_inline = 0; /* Whether inline is possible. */
1707 		uint32_t length;
1708 		uint8_t cs_flags;
1709 		rte_be32_t metadata;
1710 
1711 		/* Multi-segmented packet is handled in slow-path outside. */
1712 		assert(NB_SEGS(buf) == 1);
1713 		/* Make sure there is enough room to store this packet. */
1714 		if (max_elts - j == 0)
1715 			break;
1716 		cs_flags = txq_ol_cksum_to_cs(buf);
1717 		/* Copy metadata from mbuf if valid */
1718 		metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
1719 							     0;
1720 		/* Retrieve packet information. */
1721 		length = PKT_LEN(buf);
1722 		/* Start a new session if:
1723 		 * - multi-segment packet
1724 		 * - no space left even for a dseg
1725 		 * - next packet can be inlined with a new WQE
1726 		 * - cs_flags or metadata differs
1727 		 */
1728 		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
1729 			if ((inl_pad + sizeof(struct mlx5_wqe_data_seg) >
1730 			     mpw_room) ||
1731 			    (length <= txq->inline_max_packet_sz &&
1732 			     inl_pad + sizeof(inl_hdr) + length >
1733 			     mpw_room) ||
1734 			     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
1735 			    (mpw.wqe->eseg.cs_flags != cs_flags))
1736 				max_wqe -= mlx5_empw_close(txq, &mpw);
1737 		}
1738 		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
1739 			/* In Enhanced MPW, inline as much as the budget
1740 			 * allows. The remaining space is to be filled with
1741 			 * dsegs. If the title WQEBB isn't padded, it will have
1742 			 * 2 dsegs there.
1743 			 */
1744 			mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
1745 					   (max_inline ? max_inline :
1746 					    pkts_n * MLX5_WQE_DWORD_SIZE) +
1747 					   MLX5_WQE_SIZE);
1748 			if (unlikely(max_wqe * MLX5_WQE_SIZE < mpw_room))
1749 				break;
			/* Do not pad the title WQEBB so as not to waste WQ space. */
1751 			mlx5_empw_new(txq, &mpw, 0);
1752 			mpw_room -= mpw.total_len;
1753 			inl_pad = 0;
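			/* Inline only if the packet is short enough, fits in
			 * the fresh session and the title WQEBB does not have
			 * to be filled with dsegs (!mpw_hdr_dseg).
			 */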
1754 			do_inline = length <= txq->inline_max_packet_sz &&
1755 				    sizeof(inl_hdr) + length <= mpw_room &&
1756 				    !txq->mpw_hdr_dseg;
1757 			mpw.wqe->eseg.cs_flags = cs_flags;
1758 			mpw.wqe->eseg.flow_table_metadata = metadata;
1759 		} else {
			/* Evaluate whether the next packet can be inlined.
			 * Inlining is possible when:
			 * - the length is below the configured maximum
			 * - the length fits in the remaining space
			 * - the title WQEBB does not have to be filled
			 *   with dsegs
			 */
1766 			do_inline =
1767 				length <= txq->inline_max_packet_sz &&
1768 				inl_pad + sizeof(inl_hdr) + length <=
1769 				 mpw_room &&
1770 				(!txq->mpw_hdr_dseg ||
1771 				 mpw.total_len >= MLX5_WQE_SIZE);
1772 		}
1773 		if (max_inline && do_inline) {
1774 			/* Inline packet into WQE. */
1775 			unsigned int max;
1776 
1777 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1778 			assert(length == DATA_LEN(buf));
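			/* Inline header: the byte count tagged with the inline
			 * segment flag, stored big-endian.
			 */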
1779 			inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
1780 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1781 			mpw.data.raw = (volatile void *)
1782 				((uintptr_t)mpw.data.raw + inl_pad);
1783 			max = tx_mlx5_wq_tailroom(txq,
1784 					(void *)(uintptr_t)mpw.data.raw);
1785 			/* Copy inline header. */
1786 			mpw.data.raw = (volatile void *)
1787 				mlx5_copy_to_wq(
1788 					  (void *)(uintptr_t)mpw.data.raw,
1789 					  &inl_hdr,
1790 					  sizeof(inl_hdr),
1791 					  (void *)(uintptr_t)txq->wqes,
1792 					  max);
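			/* The copy may wrap past the end of the WQ; recompute
			 * the tailroom before copying the packet data.
			 */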
1793 			max = tx_mlx5_wq_tailroom(txq,
1794 					(void *)(uintptr_t)mpw.data.raw);
1795 			/* Copy packet data. */
1796 			mpw.data.raw = (volatile void *)
1797 				mlx5_copy_to_wq(
1798 					  (void *)(uintptr_t)mpw.data.raw,
1799 					  (void *)addr,
1800 					  length,
1801 					  (void *)(uintptr_t)txq->wqes,
1802 					  max);
1803 			++mpw.pkts_n;
1804 			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
1805 			/* No need to get completion as the entire packet is
1806 			 * copied to WQ. Free the buf right away.
1807 			 */
1808 			rte_pktmbuf_free_seg(buf);
1809 			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
			/* Compute the padding needed before the next packet, if any. */
1811 			inl_pad = (((uintptr_t)mpw.data.raw +
1812 					(MLX5_WQE_DWORD_SIZE - 1)) &
1813 					~(MLX5_WQE_DWORD_SIZE - 1)) -
1814 				  (uintptr_t)mpw.data.raw;
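			/* For example, assuming MLX5_WQE_DWORD_SIZE is 16: if
			 * the raw pointer is 4 bytes past a 16-byte boundary,
			 * inl_pad becomes 12 so the next inline header or dseg
			 * starts aligned.
			 */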
1815 		} else {
1816 			/* No inline. Load a dseg of packet pointer. */
1817 			volatile rte_v128u32_t *dseg;
1818 
1819 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1820 			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
1821 			assert(length == DATA_LEN(buf));
1822 			if (!tx_mlx5_wq_tailroom(txq,
1823 					(void *)((uintptr_t)mpw.data.raw
1824 						+ inl_pad)))
1825 				dseg = (volatile void *)txq->wqes;
1826 			else
1827 				dseg = (volatile void *)
1828 					((uintptr_t)mpw.data.raw +
1829 					 inl_pad);
1830 			(*txq->elts)[elts_head++ & elts_m] = buf;
1831 			addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
1832 								    uintptr_t));
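			/* Data segment: byte count, memory region lkey and the
			 * 64-bit buffer address split into two 32-bit words.
			 */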
1833 			*dseg = (rte_v128u32_t) {
1834 				rte_cpu_to_be_32(length),
1835 				mlx5_tx_mb2mr(txq, buf),
1836 				addr_64,
1837 				addr_64 >> 32,
1838 			};
1839 			mpw.data.raw = (volatile void *)(dseg + 1);
1840 			mpw.total_len += (inl_pad + sizeof(*dseg));
1841 			++j;
1842 			++mpw.pkts_n;
1843 			mpw_room -= (inl_pad + sizeof(*dseg));
1844 			inl_pad = 0;
1845 		}
1846 #ifdef MLX5_PMD_SOFT_COUNTERS
1847 		/* Increment sent bytes counter. */
1848 		txq->stats.obytes += length;
1849 #endif
1850 		++i;
1851 	} while (i < pkts_n);
1852 	/* Take a shortcut if nothing must be sent. */
1853 	if (unlikely(i == 0))
1854 		return 0;
1855 	/* Check whether completion threshold has been reached. */
1856 	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
1857 			(uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
1858 			 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
1859 		volatile struct mlx5_wqe *wqe = mpw.wqe;
1860 
1861 		/* A CQE slot must always be available. */
1862 		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
1863 		/* Request completion on last WQE. */
1864 		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
1865 						MLX5_COMP_MODE_OFFSET);
1866 		/* Save elts_head in unused "immediate" field of WQE. */
1867 		wqe->ctrl[3] = elts_head;
1868 		txq->elts_comp = 0;
1869 		txq->mpw_comp = txq->wqe_ci;
1870 	} else {
1871 		txq->elts_comp += j;
1872 	}
1873 #ifdef MLX5_PMD_SOFT_COUNTERS
1874 	/* Increment sent packets counter. */
1875 	txq->stats.opackets += i;
1876 #endif
1877 	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
1878 		mlx5_empw_close(txq, &mpw);
1879 	/* Ring QP doorbell. */
1880 	mlx5_tx_dbrec(txq, mpw.wqe);
1881 	txq->elts_head = elts_head;
1882 	return i;
1883 }
1884 
1885 /**
1886  * DPDK callback for TX with Enhanced MPW support.
1887  *
1888  * @param dpdk_txq
1889  *   Generic pointer to TX queue structure.
1890  * @param[in] pkts
1891  *   Packets to transmit.
1892  * @param pkts_n
1893  *   Number of packets in array.
1894  *
1895  * @return
1896  *   Number of packets successfully transmitted (<= pkts_n).
1897  */
1898 uint16_t
1899 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1900 {
1901 	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1902 	uint16_t nb_tx = 0;
1903 
1904 	while (pkts_n > nb_tx) {
1905 		uint16_t n;
1906 		uint16_t ret;
1907 
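		/*
		 * Multi-segment runs go through the legacy datapath
		 * (mlx5_tx_burst()) while single-segment runs take the
		 * Enhanced MPW fast path (txq_burst_empw()).
		 */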
1908 		n = txq_count_contig_multi_seg(&pkts[nb_tx], pkts_n - nb_tx);
1909 		if (n) {
1910 			ret = mlx5_tx_burst(dpdk_txq, &pkts[nb_tx], n);
1911 			if (!ret)
1912 				break;
1913 			nb_tx += ret;
1914 		}
1915 		n = txq_count_contig_single_seg(&pkts[nb_tx], pkts_n - nb_tx);
1916 		if (n) {
1917 			ret = txq_burst_empw(txq, &pkts[nb_tx], n);
1918 			if (!ret)
1919 				break;
1920 			nb_tx += ret;
1921 		}
1922 	}
1923 	return nb_tx;
1924 }
1925 
1926 /**
1927  * Translate RX completion flags to packet type.
1928  *
1929  * @param[in] rxq
1930  *   Pointer to RX queue structure.
1931  * @param[in] cqe
1932  *   Pointer to CQE.
1933  *
1934  * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
1935  *
1936  * @return
1937  *   Packet type for struct rte_mbuf.
1938  */
1939 static inline uint32_t
1940 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
1941 {
1942 	uint8_t idx;
1943 	uint8_t pinfo = cqe->pkt_info;
1944 	uint16_t ptype = cqe->hdr_type_etc;
1945 
1946 	/*
1947 	 * The index to the array should have:
1948 	 * bit[1:0] = l3_hdr_type
1949 	 * bit[4:2] = l4_hdr_type
1950 	 * bit[5] = ip_frag
1951 	 * bit[6] = tunneled
1952 	 * bit[7] = outer_l3_type
1953 	 */
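	/* Bits [1:0] of pkt_info map to idx bits [7:6]; bits [15:10] of
	 * hdr_type_etc map to idx bits [5:0].
	 */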
1954 	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
1955 	return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
1956 }
1957 
1958 /**
1959  * Initialize Rx WQ and indexes.
1960  *
1961  * @param[in] rxq
1962  *   Pointer to RX queue structure.
1963  */
1964 void
1965 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
1966 {
1967 	const unsigned int wqe_n = 1 << rxq->elts_n;
1968 	unsigned int i;
1969 
1970 	for (i = 0; (i != wqe_n); ++i) {
1971 		volatile struct mlx5_wqe_data_seg *scat;
1972 		uintptr_t addr;
1973 		uint32_t byte_count;
1974 
1975 		if (mlx5_rxq_mprq_enabled(rxq)) {
1976 			struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i];
1977 
1978 			scat = &((volatile struct mlx5_wqe_mprq *)
1979 				rxq->wqes)[i].dseg;
1980 			addr = (uintptr_t)mlx5_mprq_buf_addr(buf);
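			/* The byte count covers the whole MPRQ buffer:
			 * stride size times the number of strides.
			 */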
1981 			byte_count = (1 << rxq->strd_sz_n) *
1982 					(1 << rxq->strd_num_n);
1983 		} else {
1984 			struct rte_mbuf *buf = (*rxq->elts)[i];
1985 
1986 			scat = &((volatile struct mlx5_wqe_data_seg *)
1987 					rxq->wqes)[i];
1988 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1989 			byte_count = DATA_LEN(buf);
1990 		}
1991 		/* scat->addr must be able to store a pointer. */
1992 		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
1993 		*scat = (struct mlx5_wqe_data_seg){
1994 			.addr = rte_cpu_to_be_64(addr),
1995 			.byte_count = rte_cpu_to_be_32(byte_count),
1996 			.lkey = mlx5_rx_addr2mr(rxq, addr),
1997 		};
1998 	}
1999 	rxq->consumed_strd = 0;
2000 	rxq->decompressed = 0;
2001 	rxq->rq_pi = 0;
2002 	rxq->zip = (struct rxq_zip){
2003 		.ai = 0,
2004 	};
2005 	/* Update doorbell counter. */
2006 	rxq->rq_ci = wqe_n >> rxq->sges_n;
2007 	rte_cio_wmb();
2008 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
2009 }
2010 
2011 /**
2012  * Modify a Verbs queue state.
2013  * This must be called from the primary process.
2014  *
2015  * @param dev
2016  *   Pointer to Ethernet device.
2017  * @param sm
2018  *   State modify request parameters.
2019  *
2020  * @return
2021  *   0 in case of success else non-zero value and rte_errno is set.
2022  */
2023 int
2024 mlx5_queue_state_modify_primary(struct rte_eth_dev *dev,
2025 			const struct mlx5_mp_arg_queue_state_modify *sm)
2026 {
2027 	int ret;
2028 	struct mlx5_priv *priv = dev->data->dev_private;
2029 
2030 	if (sm->is_wq) {
2031 		struct ibv_wq_attr mod = {
2032 			.attr_mask = IBV_WQ_ATTR_STATE,
2033 			.wq_state = sm->state,
2034 		};
2035 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id];
2036 		struct mlx5_rxq_ctrl *rxq_ctrl =
2037 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
2038 
2039 		ret = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod);
2040 		if (ret) {
			DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s\n",
2042 					sm->state, strerror(errno));
2043 			rte_errno = errno;
2044 			return ret;
2045 		}
2046 	} else {
2047 		struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id];
2048 		struct mlx5_txq_ctrl *txq_ctrl =
2049 			container_of(txq, struct mlx5_txq_ctrl, txq);
2050 		struct ibv_qp_attr mod = {
2051 			.qp_state = IBV_QPS_RESET,
2052 			.port_num = (uint8_t)priv->ibv_port,
2053 		};
2054 		struct ibv_qp *qp = txq_ctrl->ibv->qp;
2055 
2056 		ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
2057 		if (ret) {
2058 			DRV_LOG(ERR, "Cannot change the Tx QP state to RESET "
2059 				"%s\n", strerror(errno));
2060 			rte_errno = errno;
2061 			return ret;
2062 		}
2063 		mod.qp_state = IBV_QPS_INIT;
2064 		ret = mlx5_glue->modify_qp(qp, &mod,
2065 					   (IBV_QP_STATE | IBV_QP_PORT));
2066 		if (ret) {
2067 			DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s\n",
2068 				strerror(errno));
2069 			rte_errno = errno;
2070 			return ret;
2071 		}
2072 		mod.qp_state = IBV_QPS_RTR;
2073 		ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
2074 		if (ret) {
2075 			DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s\n",
2076 				strerror(errno));
2077 			rte_errno = errno;
2078 			return ret;
2079 		}
2080 		mod.qp_state = IBV_QPS_RTS;
2081 		ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
2082 		if (ret) {
2083 			DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s\n",
2084 				strerror(errno));
2085 			rte_errno = errno;
2086 			return ret;
2087 		}
2088 	}
2089 	return 0;
2090 }
2091 
2092 /**
2093  * Modify a Verbs queue state.
2094  *
2095  * @param dev
2096  *   Pointer to Ethernet device.
2097  * @param sm
2098  *   State modify request parameters.
2099  *
2100  * @return
2101  *   0 in case of success else non-zero value.
2102  */
2103 static int
2104 mlx5_queue_state_modify(struct rte_eth_dev *dev,
2105 			struct mlx5_mp_arg_queue_state_modify *sm)
2106 {
2107 	int ret = 0;
2108 
2109 	switch (rte_eal_process_type()) {
2110 	case RTE_PROC_PRIMARY:
2111 		ret = mlx5_queue_state_modify_primary(dev, sm);
2112 		break;
2113 	case RTE_PROC_SECONDARY:
2114 		ret = mlx5_mp_req_queue_state_modify(dev, sm);
2115 		break;
2116 	default:
2117 		break;
2118 	}
2119 	return ret;
2120 }
2121 
2122 /**
2123  * Handle a Rx error.
 * The function moves the RQ state to reset when the first error CQE is
 * seen, then the caller's loop drains the CQ. When the CQ is empty, the
 * function moves the RQ state to ready and reinitializes the RQ.
 * Identifying the next CQE and counting errors remain the caller's
 * responsibility.
2128  *
2129  * @param[in] rxq
2130  *   Pointer to RX queue structure.
2131  * @param[in] mbuf_prepare
2132  *   Whether to prepare mbufs for the RQ.
2133  *
2134  * @return
2135  *   -1 in case of recovery error, otherwise the CQE status.
2136  */
2137 int
2138 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t mbuf_prepare)
2139 {
2140 	const uint16_t cqe_n = 1 << rxq->cqe_n;
2141 	const uint16_t cqe_mask = cqe_n - 1;
2142 	const unsigned int wqe_n = 1 << rxq->elts_n;
2143 	struct mlx5_rxq_ctrl *rxq_ctrl =
2144 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
2145 	union {
2146 		volatile struct mlx5_cqe *cqe;
2147 		volatile struct mlx5_err_cqe *err_cqe;
2148 	} u = {
2149 		.cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask],
2150 	};
2151 	struct mlx5_mp_arg_queue_state_modify sm;
2152 	int ret;
2153 
2154 	switch (rxq->err_state) {
2155 	case MLX5_RXQ_ERR_STATE_NO_ERROR:
2156 		rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET;
2157 		/* Fall-through */
2158 	case MLX5_RXQ_ERR_STATE_NEED_RESET:
2159 		sm.is_wq = 1;
2160 		sm.queue_id = rxq->idx;
2161 		sm.state = IBV_WQS_RESET;
2162 		if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm))
2163 			return -1;
2164 		if (rxq_ctrl->dump_file_n <
2165 		    rxq_ctrl->priv->config.max_dump_files_num) {
2166 			MKSTR(err_str, "Unexpected CQE error syndrome "
2167 			      "0x%02x CQN = %u RQN = %u wqe_counter = %u"
2168 			      " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome,
2169 			      rxq->cqn, rxq_ctrl->wqn,
2170 			      rte_be_to_cpu_16(u.err_cqe->wqe_counter),
2171 			      rxq->rq_ci << rxq->sges_n, rxq->cq_ci);
2172 			MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u",
2173 			      rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc());
2174 			mlx5_dump_debug_information(name, NULL, err_str, 0);
2175 			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
2176 						    (const void *)((uintptr_t)
2177 								    rxq->cqes),
2178 						    sizeof(*u.cqe) * cqe_n);
2179 			mlx5_dump_debug_information(name, "MLX5 Error RQ:",
2180 						    (const void *)((uintptr_t)
2181 								    rxq->wqes),
2182 						    16 * wqe_n);
2183 			rxq_ctrl->dump_file_n++;
2184 		}
2185 		rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY;
2186 		/* Fall-through */
2187 	case MLX5_RXQ_ERR_STATE_NEED_READY:
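		/* The RQ can be moved back to ready only once the CQ has been
		 * fully drained, i.e. the next CQE is still owned by hardware.
		 */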
2188 		ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
2189 		if (ret == MLX5_CQE_STATUS_HW_OWN) {
2190 			rte_cio_wmb();
2191 			*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
2192 			rte_cio_wmb();
2193 			/*
2194 			 * The RQ consumer index must be zeroed while moving
2195 			 * from RESET state to RDY state.
2196 			 */
2197 			*rxq->rq_db = rte_cpu_to_be_32(0);
2198 			rte_cio_wmb();
2199 			sm.is_wq = 1;
2200 			sm.queue_id = rxq->idx;
2201 			sm.state = IBV_WQS_RDY;
2202 			if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv),
2203 						    &sm))
2204 				return -1;
2205 			if (mbuf_prepare) {
2206 				const uint16_t q_mask = wqe_n - 1;
2207 				uint16_t elt_idx;
2208 				struct rte_mbuf **elt;
2209 				int i;
2210 				unsigned int n = wqe_n - (rxq->rq_ci -
2211 							  rxq->rq_pi);
2212 
2213 				for (i = 0; i < (int)n; ++i) {
2214 					elt_idx = (rxq->rq_ci + i) & q_mask;
2215 					elt = &(*rxq->elts)[elt_idx];
2216 					*elt = rte_mbuf_raw_alloc(rxq->mp);
2217 					if (!*elt) {
2218 						for (i--; i >= 0; --i) {
2219 							elt_idx = (rxq->rq_ci +
2220 								   i) & q_mask;
2221 							elt = &(*rxq->elts)
2222 								[elt_idx];
2223 							rte_pktmbuf_free_seg
2224 								(*elt);
2225 						}
2226 						return -1;
2227 					}
2228 				}
2229 			}
2230 			mlx5_rxq_initialize(rxq);
2231 			rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
2232 		}
2233 		return ret;
2234 	default:
2235 		return -1;
2236 	}
2237 }
2238 
2239 /**
 * Get the size of the next packet for a given CQE. For compressed CQEs, the
 * consumer index is updated only once all packets of the current session
 * have been processed.
2243  *
2244  * @param rxq
2245  *   Pointer to RX queue.
2246  * @param cqe
2247  *   CQE to process.
2248  * @param[out] mcqe
2249  *   Store pointer to mini-CQE if compressed. Otherwise, the pointer is not
2250  *   written.
2251  *
2252  * @return
2253  *   0 in case of empty CQE, otherwise the packet size in bytes.
2254  */
2255 static inline int
2256 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
2257 		 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe)
2258 {
2259 	struct rxq_zip *zip = &rxq->zip;
2260 	uint16_t cqe_n = cqe_cnt + 1;
2261 	int len;
2262 	uint16_t idx, end;
2263 
2264 	do {
2265 		len = 0;
2266 		/* Process compressed data in the CQE and mini arrays. */
2267 		if (zip->ai) {
2268 			volatile struct mlx5_mini_cqe8 (*mc)[8] =
2269 				(volatile struct mlx5_mini_cqe8 (*)[8])
2270 				(uintptr_t)(&(*rxq->cqes)[zip->ca &
2271 							  cqe_cnt].pkt_info);
2272 
2273 			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
2274 			*mcqe = &(*mc)[zip->ai & 7];
2275 			if ((++zip->ai & 7) == 0) {
2276 				/* Invalidate consumed CQEs */
2277 				idx = zip->ca;
2278 				end = zip->na;
2279 				while (idx != end) {
2280 					(*rxq->cqes)[idx & cqe_cnt].op_own =
2281 						MLX5_CQE_INVALIDATE;
2282 					++idx;
2283 				}
2284 				/*
2285 				 * Increment consumer index to skip the number
2286 				 * of CQEs consumed. Hardware leaves holes in
2287 				 * the CQ ring for software use.
2288 				 */
2289 				zip->ca = zip->na;
2290 				zip->na += 8;
2291 			}
2292 			if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
2293 				/* Invalidate the rest */
2294 				idx = zip->ca;
2295 				end = zip->cq_ci;
2296 
2297 				while (idx != end) {
2298 					(*rxq->cqes)[idx & cqe_cnt].op_own =
2299 						MLX5_CQE_INVALIDATE;
2300 					++idx;
2301 				}
2302 				rxq->cq_ci = zip->cq_ci;
2303 				zip->ai = 0;
2304 			}
2305 		/*
2306 		 * No compressed data, get next CQE and verify if it is
2307 		 * compressed.
2308 		 */
2309 		} else {
2310 			int ret;
2311 			int8_t op_own;
2312 
2313 			ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
2314 			if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
2315 				if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
2316 					     rxq->err_state)) {
2317 					ret = mlx5_rx_err_handle(rxq, 0);
2318 					if (ret == MLX5_CQE_STATUS_HW_OWN ||
2319 					    ret == -1)
2320 						return 0;
2321 				} else {
2322 					return 0;
2323 				}
2324 			}
2325 			++rxq->cq_ci;
2326 			op_own = cqe->op_own;
2327 			if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
2328 				volatile struct mlx5_mini_cqe8 (*mc)[8] =
2329 					(volatile struct mlx5_mini_cqe8 (*)[8])
2330 					(uintptr_t)(&(*rxq->cqes)
2331 						[rxq->cq_ci &
2332 						 cqe_cnt].pkt_info);
2333 
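				/* For a compressed CQE, byte_cnt holds the
				 * number of mini-CQEs in the session.
				 */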
2334 				/* Fix endianness. */
2335 				zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
2336 				/*
2337 				 * Current mini array position is the one
				 * returned by check_cqe().
2339 				 *
2340 				 * If completion comprises several mini arrays,
2341 				 * as a special case the second one is located
2342 				 * 7 CQEs after the initial CQE instead of 8
2343 				 * for subsequent ones.
2344 				 */
2345 				zip->ca = rxq->cq_ci;
2346 				zip->na = zip->ca + 7;
				/* Compute the next non-compressed CQE. */
2348 				--rxq->cq_ci;
2349 				zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
2350 				/* Get packet size to return. */
2351 				len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
2352 				*mcqe = &(*mc)[0];
2353 				zip->ai = 1;
2354 				/* Prefetch all to be invalidated */
2355 				idx = zip->ca;
2356 				end = zip->cq_ci;
2357 				while (idx != end) {
2358 					rte_prefetch0(&(*rxq->cqes)[(idx) &
2359 								    cqe_cnt]);
2360 					++idx;
2361 				}
2362 			} else {
2363 				len = rte_be_to_cpu_32(cqe->byte_cnt);
2364 			}
2365 		}
2366 		if (unlikely(rxq->err_state)) {
2367 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
2368 			++rxq->stats.idropped;
2369 		} else {
2370 			return len;
2371 		}
2372 	} while (1);
2373 }
2374 
2375 /**
2376  * Translate RX completion flags to offload flags.
2377  *
2378  * @param[in] cqe
2379  *   Pointer to CQE.
2380  *
2381  * @return
2382  *   Offload flags (ol_flags) for struct rte_mbuf.
2383  */
2384 static inline uint32_t
2385 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
2386 {
2387 	uint32_t ol_flags = 0;
2388 	uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
2389 
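	/* TRANSPOSE() moves each checksum-valid CQE bit into the position of
	 * the corresponding mbuf offload flag.
	 */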
2390 	ol_flags =
2391 		TRANSPOSE(flags,
2392 			  MLX5_CQE_RX_L3_HDR_VALID,
2393 			  PKT_RX_IP_CKSUM_GOOD) |
2394 		TRANSPOSE(flags,
2395 			  MLX5_CQE_RX_L4_HDR_VALID,
2396 			  PKT_RX_L4_CKSUM_GOOD);
2397 	return ol_flags;
2398 }
2399 
2400 /**
2401  * Fill in mbuf fields from RX completion flags.
2402  * Note that pkt->ol_flags should be initialized outside of this function.
2403  *
2404  * @param rxq
2405  *   Pointer to RX queue.
2406  * @param pkt
2407  *   mbuf to fill.
2408  * @param cqe
2409  *   CQE to process.
2410  * @param rss_hash_res
2411  *   Packet RSS Hash result.
2412  */
2413 static inline void
2414 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
2415 	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res)
2416 {
2417 	/* Update packet information. */
2418 	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe);
2419 	if (rss_hash_res && rxq->rss_hash) {
2420 		pkt->hash.rss = rss_hash_res;
2421 		pkt->ol_flags |= PKT_RX_RSS_HASH;
2422 	}
2423 	if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
2424 		pkt->ol_flags |= PKT_RX_FDIR;
2425 		if (cqe->sop_drop_qpn !=
2426 		    rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
2427 			uint32_t mark = cqe->sop_drop_qpn;
2428 
2429 			pkt->ol_flags |= PKT_RX_FDIR_ID;
2430 			pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
2431 		}
2432 	}
2433 	if (rxq->csum)
2434 		pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
2435 	if (rxq->vlan_strip &&
2436 	    (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
2437 		pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
2438 		pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
2439 	}
2440 	if (rxq->hw_timestamp) {
2441 		pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp);
2442 		pkt->ol_flags |= PKT_RX_TIMESTAMP;
2443 	}
2444 }
2445 
2446 /**
2447  * DPDK callback for RX.
2448  *
2449  * @param dpdk_rxq
2450  *   Generic pointer to RX queue structure.
2451  * @param[out] pkts
2452  *   Array to store received packets.
2453  * @param pkts_n
2454  *   Maximum number of packets in array.
2455  *
2456  * @return
2457  *   Number of packets successfully received (<= pkts_n).
2458  */
2459 uint16_t
2460 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
2461 {
2462 	struct mlx5_rxq_data *rxq = dpdk_rxq;
2463 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
2464 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
2465 	const unsigned int sges_n = rxq->sges_n;
2466 	struct rte_mbuf *pkt = NULL;
2467 	struct rte_mbuf *seg = NULL;
2468 	volatile struct mlx5_cqe *cqe =
2469 		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
2470 	unsigned int i = 0;
2471 	unsigned int rq_ci = rxq->rq_ci << sges_n;
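	/* rq_ci is pre-scaled by sges_n so that each SGE consumes one index;
	 * it is scaled back before the doorbell update at the end.
	 */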
2472 	int len = 0; /* keep its value across iterations. */
2473 
2474 	while (pkts_n) {
2475 		unsigned int idx = rq_ci & wqe_cnt;
2476 		volatile struct mlx5_wqe_data_seg *wqe =
2477 			&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
2478 		struct rte_mbuf *rep = (*rxq->elts)[idx];
2479 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
2480 		uint32_t rss_hash_res;
2481 
2482 		if (pkt)
2483 			NEXT(seg) = rep;
2484 		seg = rep;
2485 		rte_prefetch0(seg);
2486 		rte_prefetch0(cqe);
2487 		rte_prefetch0(wqe);
2488 		rep = rte_mbuf_raw_alloc(rxq->mp);
2489 		if (unlikely(rep == NULL)) {
2490 			++rxq->stats.rx_nombuf;
2491 			if (!pkt) {
2492 				/*
2493 				 * no buffers before we even started,
2494 				 * bail out silently.
2495 				 */
2496 				break;
2497 			}
2498 			while (pkt != seg) {
2499 				assert(pkt != (*rxq->elts)[idx]);
2500 				rep = NEXT(pkt);
2501 				NEXT(pkt) = NULL;
2502 				NB_SEGS(pkt) = 1;
2503 				rte_mbuf_raw_free(pkt);
2504 				pkt = rep;
2505 			}
2506 			break;
2507 		}
2508 		if (!pkt) {
2509 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
2510 			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe);
2511 			if (!len) {
2512 				rte_mbuf_raw_free(rep);
2513 				break;
2514 			}
2515 			pkt = seg;
2516 			assert(len >= (rxq->crc_present << 2));
2517 			pkt->ol_flags = 0;
2518 			/* If compressed, take hash result from mini-CQE. */
2519 			rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
2520 							cqe->rx_hash_res :
2521 							mcqe->rx_hash_result);
2522 			rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
2523 			if (rxq->crc_present)
2524 				len -= RTE_ETHER_CRC_LEN;
2525 			PKT_LEN(pkt) = len;
2526 		}
2527 		DATA_LEN(rep) = DATA_LEN(seg);
2528 		PKT_LEN(rep) = PKT_LEN(seg);
2529 		SET_DATA_OFF(rep, DATA_OFF(seg));
2530 		PORT(rep) = PORT(seg);
2531 		(*rxq->elts)[idx] = rep;
2532 		/*
2533 		 * Fill NIC descriptor with the new buffer.  The lkey and size
2534 		 * of the buffers are already known, only the buffer address
2535 		 * changes.
2536 		 */
2537 		wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
2538 		/* If there's only one MR, no need to replace LKey in WQE. */
2539 		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
2540 			wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
2541 		if (len > DATA_LEN(seg)) {
2542 			len -= DATA_LEN(seg);
2543 			++NB_SEGS(pkt);
2544 			++rq_ci;
2545 			continue;
2546 		}
2547 		DATA_LEN(seg) = len;
2548 #ifdef MLX5_PMD_SOFT_COUNTERS
2549 		/* Increment bytes counter. */
2550 		rxq->stats.ibytes += PKT_LEN(pkt);
2551 #endif
2552 		/* Return packet. */
2553 		*(pkts++) = pkt;
2554 		pkt = NULL;
2555 		--pkts_n;
2556 		++i;
2557 		/* Align consumer index to the next stride. */
2558 		rq_ci >>= sges_n;
2559 		++rq_ci;
2560 		rq_ci <<= sges_n;
2561 	}
2562 	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
2563 		return 0;
2564 	/* Update the consumer index. */
2565 	rxq->rq_ci = rq_ci >> sges_n;
2566 	rte_cio_wmb();
2567 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
2568 	rte_cio_wmb();
2569 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
2570 #ifdef MLX5_PMD_SOFT_COUNTERS
2571 	/* Increment packets counter. */
2572 	rxq->stats.ipackets += i;
2573 #endif
2574 	return i;
2575 }
2576 
2577 void
2578 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
2579 {
2580 	struct mlx5_mprq_buf *buf = opaque;
2581 
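	/*
	 * Return the buffer to its mempool either when only the driver
	 * reference is left (refcnt == 1) or when the last attached mbuf
	 * drops the count to zero; in the latter case re-arm refcnt to 1
	 * for the next use.
	 */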
2582 	if (rte_atomic16_read(&buf->refcnt) == 1) {
2583 		rte_mempool_put(buf->mp, buf);
2584 	} else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
2585 		rte_atomic16_set(&buf->refcnt, 1);
2586 		rte_mempool_put(buf->mp, buf);
2587 	}
2588 }
2589 
2590 void
2591 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
2592 {
2593 	mlx5_mprq_buf_free_cb(NULL, buf);
2594 }
2595 
2596 static inline void
2597 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx)
2598 {
2599 	struct mlx5_mprq_buf *rep = rxq->mprq_repl;
2600 	volatile struct mlx5_wqe_data_seg *wqe =
2601 		&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
2602 	void *addr;
2603 
2604 	assert(rep != NULL);
2605 	/* Replace MPRQ buf. */
2606 	(*rxq->mprq_bufs)[rq_idx] = rep;
2607 	/* Replace WQE. */
2608 	addr = mlx5_mprq_buf_addr(rep);
2609 	wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
2610 	/* If there's only one MR, no need to replace LKey in WQE. */
2611 	if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
2612 		wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
2613 	/* Stash a mbuf for next replacement. */
2614 	if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
2615 		rxq->mprq_repl = rep;
2616 	else
2617 		rxq->mprq_repl = NULL;
2618 }
2619 
2620 /**
2621  * DPDK callback for RX with Multi-Packet RQ support.
2622  *
2623  * @param dpdk_rxq
2624  *   Generic pointer to RX queue structure.
2625  * @param[out] pkts
2626  *   Array to store received packets.
2627  * @param pkts_n
2628  *   Maximum number of packets in array.
2629  *
2630  * @return
2631  *   Number of packets successfully received (<= pkts_n).
2632  */
2633 uint16_t
2634 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
2635 {
2636 	struct mlx5_rxq_data *rxq = dpdk_rxq;
2637 	const unsigned int strd_n = 1 << rxq->strd_num_n;
2638 	const unsigned int strd_sz = 1 << rxq->strd_sz_n;
2639 	const unsigned int strd_shift =
2640 		MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
2641 	const unsigned int cq_mask = (1 << rxq->cqe_n) - 1;
2642 	const unsigned int wq_mask = (1 << rxq->elts_n) - 1;
2643 	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
2644 	unsigned int i = 0;
2645 	uint32_t rq_ci = rxq->rq_ci;
2646 	uint16_t consumed_strd = rxq->consumed_strd;
2647 	struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
2648 
2649 	while (i < pkts_n) {
2650 		struct rte_mbuf *pkt;
2651 		void *addr;
2652 		int ret;
2653 		unsigned int len;
2654 		uint16_t strd_cnt;
2655 		uint16_t strd_idx;
2656 		uint32_t offset;
2657 		uint32_t byte_cnt;
2658 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
2659 		uint32_t rss_hash_res = 0;
2660 
2661 		if (consumed_strd == strd_n) {
2662 			/* Replace WQE only if the buffer is still in use. */
2663 			if (rte_atomic16_read(&buf->refcnt) > 1) {
2664 				mprq_buf_replace(rxq, rq_ci & wq_mask);
2665 				/* Release the old buffer. */
2666 				mlx5_mprq_buf_free(buf);
2667 			} else if (unlikely(rxq->mprq_repl == NULL)) {
2668 				struct mlx5_mprq_buf *rep;
2669 
2670 				/*
				 * The MPRQ mempool is currently out of
				 * buffers, so packets are copied with memcpy
				 * regardless of their size. Retry the
				 * allocation to get back to normal operation.
2675 				 */
2676 				if (!rte_mempool_get(rxq->mprq_mp,
2677 						     (void **)&rep))
2678 					rxq->mprq_repl = rep;
2679 			}
2680 			/* Advance to the next WQE. */
2681 			consumed_strd = 0;
2682 			++rq_ci;
2683 			buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
2684 		}
2685 		cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
2686 		ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe);
2687 		if (!ret)
2688 			break;
2689 		byte_cnt = ret;
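		/* Besides the length, the CQE byte count encodes the number of
		 * consumed strides and the MPRQ filler flag.
		 */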
2690 		strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
2691 			   MLX5_MPRQ_STRIDE_NUM_SHIFT;
2692 		assert(strd_cnt);
2693 		consumed_strd += strd_cnt;
2694 		if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
2695 			continue;
2696 		if (mcqe == NULL) {
2697 			rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
2698 			strd_idx = rte_be_to_cpu_16(cqe->wqe_counter);
2699 		} else {
2700 			/* mini-CQE for MPRQ doesn't have hash result. */
2701 			strd_idx = rte_be_to_cpu_16(mcqe->stride_idx);
2702 		}
2703 		assert(strd_idx < strd_n);
2704 		assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask));
2705 		/*
		 * Currently configured to receive one packet per stride. But
		 * if the MTU is adjusted through the kernel interface, the
		 * device could consume multiple strides without raising an
		 * error. In that case the packet must be dropped because it
		 * is bigger than max_rx_pkt_len.
2711 		 */
2712 		if (unlikely(strd_cnt > 1)) {
2713 			++rxq->stats.idropped;
2714 			continue;
2715 		}
2716 		pkt = rte_pktmbuf_alloc(rxq->mp);
2717 		if (unlikely(pkt == NULL)) {
2718 			++rxq->stats.rx_nombuf;
2719 			break;
2720 		}
2721 		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
2722 		assert((int)len >= (rxq->crc_present << 2));
2723 		if (rxq->crc_present)
2724 			len -= RTE_ETHER_CRC_LEN;
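		/* Locate the packet start inside the MPRQ buffer. */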
2725 		offset = strd_idx * strd_sz + strd_shift;
2726 		addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf), offset);
2727 		/* Initialize the offload flag. */
2728 		pkt->ol_flags = 0;
2729 		/*
		 * Copy the packet into the target mbuf if:
		 * - the packet size is smaller than mprq_max_memcpy_len, or
		 * - the Multi-Packet RQ mempool is out of buffers.
2733 		 */
2734 		if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) {
2735 			/*
			 * When copying the packet because buffers ran out,
			 * the packet must fit in the target mbuf.
2738 			 */
2739 			if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
2740 				rte_pktmbuf_free_seg(pkt);
2741 				++rxq->stats.idropped;
2742 				continue;
2743 			}
2744 			rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len);
2745 		} else {
2746 			rte_iova_t buf_iova;
2747 			struct rte_mbuf_ext_shared_info *shinfo;
2748 			uint16_t buf_len = strd_cnt * strd_sz;
2749 
2750 			/* Increment the refcnt of the whole chunk. */
2751 			rte_atomic16_add_return(&buf->refcnt, 1);
2752 			assert((uint16_t)rte_atomic16_read(&buf->refcnt) <=
2753 			       strd_n + 1);
2754 			addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
2755 			/*
			 * The mlx5 device does not use the iova itself, but it
			 * is needed when the Rx packet is later transmitted
			 * via a different PMD.
2759 			 */
2760 			buf_iova = rte_mempool_virt2iova(buf) +
2761 				   RTE_PTR_DIFF(addr, buf);
2762 			shinfo = rte_pktmbuf_ext_shinfo_init_helper(addr,
2763 					&buf_len, mlx5_mprq_buf_free_cb, buf);
2764 			/*
			 * EXT_ATTACHED_MBUF is set in pkt->ol_flags when the
			 * stride is attached to the mbuf; more offload flags
			 * are added below by rxq_cq_to_mbuf(). Other fields
			 * are overwritten.
2769 			 */
2770 			rte_pktmbuf_attach_extbuf(pkt, addr, buf_iova, buf_len,
2771 						  shinfo);
2772 			rte_pktmbuf_reset_headroom(pkt);
2773 			assert(pkt->ol_flags == EXT_ATTACHED_MBUF);
2774 			/*
2775 			 * Prevent potential overflow due to MTU change through
2776 			 * kernel interface.
2777 			 */
2778 			if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
2779 				rte_pktmbuf_free_seg(pkt);
2780 				++rxq->stats.idropped;
2781 				continue;
2782 			}
2783 		}
2784 		rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
2785 		PKT_LEN(pkt) = len;
2786 		DATA_LEN(pkt) = len;
2787 		PORT(pkt) = rxq->port_id;
2788 #ifdef MLX5_PMD_SOFT_COUNTERS
2789 		/* Increment bytes counter. */
2790 		rxq->stats.ibytes += PKT_LEN(pkt);
2791 #endif
2792 		/* Return packet. */
2793 		*(pkts++) = pkt;
2794 		++i;
2795 	}
2796 	/* Update the consumer indexes. */
2797 	rxq->consumed_strd = consumed_strd;
2798 	rte_cio_wmb();
2799 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
2800 	if (rq_ci != rxq->rq_ci) {
2801 		rxq->rq_ci = rq_ci;
2802 		rte_cio_wmb();
2803 		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
2804 	}
2805 #ifdef MLX5_PMD_SOFT_COUNTERS
2806 	/* Increment packets counter. */
2807 	rxq->stats.ipackets += i;
2808 #endif
2809 	return i;
2810 }
2811 
2812 /**
2813  * Dummy DPDK callback for TX.
2814  *
2815  * This function is used to temporarily replace the real callback during
2816  * unsafe control operations on the queue, or in case of error.
2817  *
2818  * @param dpdk_txq
2819  *   Generic pointer to TX queue structure.
2820  * @param[in] pkts
2821  *   Packets to transmit.
2822  * @param pkts_n
2823  *   Number of packets in array.
2824  *
2825  * @return
2826  *   Number of packets successfully transmitted (<= pkts_n).
2827  */
2828 uint16_t
2829 removed_tx_burst(void *dpdk_txq __rte_unused,
2830 		 struct rte_mbuf **pkts __rte_unused,
2831 		 uint16_t pkts_n __rte_unused)
2832 {
2833 	rte_mb();
2834 	return 0;
2835 }
2836 
2837 /**
2838  * Dummy DPDK callback for RX.
2839  *
2840  * This function is used to temporarily replace the real callback during
2841  * unsafe control operations on the queue, or in case of error.
2842  *
2843  * @param dpdk_rxq
2844  *   Generic pointer to RX queue structure.
2845  * @param[out] pkts
2846  *   Array to store received packets.
2847  * @param pkts_n
2848  *   Maximum number of packets in array.
2849  *
2850  * @return
2851  *   Number of packets successfully received (<= pkts_n).
2852  */
2853 uint16_t
removed_rx_burst(void *dpdk_rxq __rte_unused,
2855 		 struct rte_mbuf **pkts __rte_unused,
2856 		 uint16_t pkts_n __rte_unused)
2857 {
2858 	rte_mb();
2859 	return 0;
2860 }
2861 
2862 /*
 * Vectorized Rx/Tx routines are not compiled in when the required vector
 * instructions are not supported on the target architecture. The following
 * null stubs are needed for linkage when those routines are not provided by
 * another file (e.g. mlx5_rxtx_vec_sse.c for x86).
2867  */
2868 
2869 __rte_weak uint16_t
2870 mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused,
2871 		      struct rte_mbuf **pkts __rte_unused,
2872 		      uint16_t pkts_n __rte_unused)
2873 {
2874 	return 0;
2875 }
2876 
2877 __rte_weak uint16_t
2878 mlx5_tx_burst_vec(void *dpdk_txq __rte_unused,
2879 		  struct rte_mbuf **pkts __rte_unused,
2880 		  uint16_t pkts_n __rte_unused)
2881 {
2882 	return 0;
2883 }
2884 
2885 __rte_weak uint16_t
mlx5_rx_burst_vec(void *dpdk_rxq __rte_unused,
2887 		  struct rte_mbuf **pkts __rte_unused,
2888 		  uint16_t pkts_n __rte_unused)
2889 {
2890 	return 0;
2891 }
2892 
2893 __rte_weak int
2894 mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
2895 {
2896 	return -ENOTSUP;
2897 }
2898 
2899 __rte_weak int
2900 mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
2901 {
2902 	return -ENOTSUP;
2903 }
2904 
2905 __rte_weak int
2906 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
2907 {
2908 	return -ENOTSUP;
2909 }
2910 
2911 __rte_weak int
2912 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
2913 {
2914 	return -ENOTSUP;
2915 }
2916