/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2014-2021 Netronome Systems, Inc.
 * All rights reserved.
 *
 * Small portions derived from code Copyright(c) 2010-2015 Intel Corporation.
 */

#include "nfp_rxtx.h"

#include <ethdev_pci.h>
#include <rte_security.h>

#include "nfd3/nfp_nfd3.h"
#include "nfdk/nfp_nfdk.h"
#include "flower/nfp_flower.h"

#include "nfp_ipsec.h"
#include "nfp_logs.h"
#include "nfp_net_meta.h"
#include "nfp_rxtx_vec.h"

/*
 * The bit format and map of the nfp packet type for rxd.offload_info in the Rx descriptor.
 *
 * The bit format of the nfp packet type is as follows:
 * ---------------------------------
 *            1                   0
 *  5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |       |ol3|tunnel |  l3 |  l4 |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * The bit map of the nfp packet type is as follows:
 *
 * L4: bit 0~2, used for layer 4 or inner layer 4.
 * 000: NFP_NET_PTYPE_L4_NONE
 * 001: NFP_NET_PTYPE_L4_TCP
 * 010: NFP_NET_PTYPE_L4_UDP
 * 011: NFP_NET_PTYPE_L4_FRAG
 * 100: NFP_NET_PTYPE_L4_NONFRAG
 * 101: NFP_NET_PTYPE_L4_ICMP
 * 110: NFP_NET_PTYPE_L4_SCTP
 * 111: reserved
 *
 * L3: bit 3~5, used for layer 3 or inner layer 3.
 * 000: NFP_NET_PTYPE_L3_NONE
 * 001: NFP_NET_PTYPE_L3_IPV6
 * 010: NFP_NET_PTYPE_L3_IPV4
 * 011: NFP_NET_PTYPE_L3_IPV4_EXT
 * 100: NFP_NET_PTYPE_L3_IPV6_EXT
 * 101: NFP_NET_PTYPE_L3_IPV4_EXT_UNKNOWN
 * 110: NFP_NET_PTYPE_L3_IPV6_EXT_UNKNOWN
 * 111: reserved
 *
 * Tunnel: bit 6~9, used for tunnel.
 * 0000: NFP_NET_PTYPE_TUNNEL_NONE
 * 0001: NFP_NET_PTYPE_TUNNEL_VXLAN
 * 0100: NFP_NET_PTYPE_TUNNEL_NVGRE
 * 0101: NFP_NET_PTYPE_TUNNEL_GENEVE
 * 0010, 0011, 0110~1111: reserved
 *
 * Outer L3: bit 10~11, used for outer layer 3.
 * 00: NFP_NET_PTYPE_OUTER_L3_NONE
 * 01: NFP_NET_PTYPE_OUTER_L3_IPV6
 * 10: NFP_NET_PTYPE_OUTER_L3_IPV4
 * 11: reserved
 *
 * Reserved: bit 12~15, used for extension.
 */

/* Mask and offset of the nfp packet type fields, based on the bit map above. */
#define NFP_NET_PTYPE_L4_MASK                  0x0007
#define NFP_NET_PTYPE_L3_MASK                  0x0038
#define NFP_NET_PTYPE_TUNNEL_MASK              0x03c0
#define NFP_NET_PTYPE_OUTER_L3_MASK            0x0c00

#define NFP_NET_PTYPE_L4_OFFSET                0
#define NFP_NET_PTYPE_L3_OFFSET                3
#define NFP_NET_PTYPE_TUNNEL_OFFSET            6
#define NFP_NET_PTYPE_OUTER_L3_OFFSET          10

/* Values of the nfp packet type fields, based on the bit map above. */
#define NFP_NET_PTYPE_L4_NONE                  0
#define NFP_NET_PTYPE_L4_TCP                   1
#define NFP_NET_PTYPE_L4_UDP                   2
#define NFP_NET_PTYPE_L4_FRAG                  3
#define NFP_NET_PTYPE_L4_NONFRAG               4
#define NFP_NET_PTYPE_L4_ICMP                  5
#define NFP_NET_PTYPE_L4_SCTP                  6

#define NFP_NET_PTYPE_L3_NONE                  0
#define NFP_NET_PTYPE_L3_IPV6                  1
#define NFP_NET_PTYPE_L3_IPV4                  2
#define NFP_NET_PTYPE_L3_IPV4_EXT              3
#define NFP_NET_PTYPE_L3_IPV6_EXT              4
#define NFP_NET_PTYPE_L3_IPV4_EXT_UNKNOWN      5
#define NFP_NET_PTYPE_L3_IPV6_EXT_UNKNOWN      6

#define NFP_NET_PTYPE_TUNNEL_NONE              0
#define NFP_NET_PTYPE_TUNNEL_VXLAN             1
#define NFP_NET_PTYPE_TUNNEL_NVGRE             4
#define NFP_NET_PTYPE_TUNNEL_GENEVE            5

#define NFP_NET_PTYPE_OUTER_L3_NONE            0
#define NFP_NET_PTYPE_OUTER_L3_IPV6            1
#define NFP_NET_PTYPE_OUTER_L3_IPV4            2

#define NFP_PTYPE2RTE(tunnel, type) ((tunnel) ? RTE_PTYPE_INNER_##type : RTE_PTYPE_##type)

/* Record NFP packet type parsed from rxd.offload_info. */
struct nfp_ptype_parsed {
	uint8_t l4_ptype;       /**< Packet type of layer 4, or inner layer 4. */
	uint8_t l3_ptype;       /**< Packet type of layer 3, or inner layer 3. */
	uint8_t tunnel_ptype;   /**< Packet type of tunnel. */
	uint8_t outer_l3_ptype; /**< Packet type of outer layer 3. */
};

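/*
 * Illustrative sketch (not part of the driver): decoding a hypothetical
 * rxd.offload_info value with the masks and offsets above. The value 0x0851
 * is made up purely for demonstration; it encodes outer L3 IPv4, a VXLAN
 * tunnel, inner L3 IPv4 and inner L4 TCP.
 *
 *   uint16_t rxd_ptype = 0x0851;
 *   struct nfp_ptype_parsed p;
 *
 *   p.l4_ptype = (rxd_ptype & NFP_NET_PTYPE_L4_MASK) >> NFP_NET_PTYPE_L4_OFFSET;
 *   p.l3_ptype = (rxd_ptype & NFP_NET_PTYPE_L3_MASK) >> NFP_NET_PTYPE_L3_OFFSET;
 *   p.tunnel_ptype = (rxd_ptype & NFP_NET_PTYPE_TUNNEL_MASK) >> NFP_NET_PTYPE_TUNNEL_OFFSET;
 *   p.outer_l3_ptype = (rxd_ptype & NFP_NET_PTYPE_OUTER_L3_MASK) >> NFP_NET_PTYPE_OUTER_L3_OFFSET;
 *
 * Result: l4_ptype = 1 (TCP), l3_ptype = 2 (IPv4), tunnel_ptype = 1 (VXLAN),
 * outer_l3_ptype = 2 (IPv4). With NFP_PTYPE2RTE(), nfp_net_set_ptype() below
 * then builds RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_VXLAN |
 * RTE_PTYPE_L4_UDP | RTE_PTYPE_INNER_L2_ETHER | RTE_PTYPE_INNER_L3_IPV4 |
 * RTE_PTYPE_INNER_L4_TCP.
 */
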
/* Set mbuf checksum flags based on RX descriptor flags */
void
nfp_net_rx_cksum(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxd,
		struct rte_mbuf *mb)
{
	struct nfp_net_hw *hw = rxq->hw;

	if ((hw->super.ctrl & NFP_NET_CFG_CTRL_RXCSUM) == 0)
		return;

	/* If IPv4 and IP checksum error, fail */
	if (unlikely((rxd->rxd.flags & PCIE_DESC_RX_IP4_CSUM) != 0 &&
			(rxd->rxd.flags & PCIE_DESC_RX_IP4_CSUM_OK) == 0))
		mb->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_BAD;
	else
		mb->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD;

	/* If neither UDP nor TCP, return */
	if ((rxd->rxd.flags & PCIE_DESC_RX_TCP_CSUM) == 0 &&
			(rxd->rxd.flags & PCIE_DESC_RX_UDP_CSUM) == 0)
		return;

	if (likely((rxd->rxd.flags & PCIE_DESC_RX_L4_CSUM_OK) != 0))
		mb->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_GOOD;
	else
		mb->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_BAD;
}
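
/*
 * Illustrative sketch (not part of the driver): how an application might
 * consume the checksum flags set above. The helper name and the drop policy
 * are assumptions made purely for demonstration.
 *
 *   static inline int
 *   example_rx_cksum_is_bad(const struct rte_mbuf *mb)
 *   {
 *       return (mb->ol_flags & RTE_MBUF_F_RX_IP_CKSUM_MASK) == RTE_MBUF_F_RX_IP_CKSUM_BAD ||
 *              (mb->ol_flags & RTE_MBUF_F_RX_L4_CKSUM_MASK) == RTE_MBUF_F_RX_L4_CKSUM_BAD;
 *   }
 */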

static int
nfp_net_rx_fill_freelist(struct nfp_net_rxq *rxq)
{
	uint16_t i;
	uint64_t dma_addr;
	struct nfp_net_dp_buf *rxe = rxq->rxbufs;

	PMD_RX_LOG(DEBUG, "Fill Rx Freelist for %hu descriptors.",
			rxq->rx_count);

	for (i = 0; i < rxq->rx_count; i++) {
		struct nfp_net_rx_desc *rxd;
		struct rte_mbuf *mbuf = rte_pktmbuf_alloc(rxq->mem_pool);

		if (mbuf == NULL) {
			PMD_DRV_LOG(ERR, "RX mbuf alloc failed queue_id=%hu.",
				rxq->qidx);
			return -ENOMEM;
		}

		dma_addr = rte_cpu_to_le_64(rte_mbuf_data_iova_default(mbuf));

		rxd = &rxq->rxds[i];
		rxd->fld.dd = 0;
		rxd->fld.dma_addr_hi = (dma_addr >> 32) & 0xffff;
		rxd->fld.dma_addr_lo = dma_addr & 0xffffffff;

		rxe[i].mbuf = mbuf;
	}

	/* Make sure all writes are flushed before telling the hardware */
	rte_wmb();

	/* Not advertising the whole ring as the firmware gets confused if so */
	PMD_RX_LOG(DEBUG, "Increment FL write pointer by %hu.", rxq->rx_count - 1);

	nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, rxq->rx_count - 1);

	return 0;
}
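
/*
 * Worked example (illustrative only) of the DMA address split used when
 * filling a freelist descriptor: with a hypothetical IOVA of
 * 0x0000001234567890, (dma_addr >> 32) & 0xffff gives 0x0012 for dma_addr_hi
 * and dma_addr & 0xffffffff gives 0x34567890 for dma_addr_lo, i.e. the
 * descriptor carries a 48-bit bus address split into 16/32 bits.
 */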

int
nfp_net_rx_freelist_setup(struct rte_eth_dev *dev)
{
	uint16_t i;

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		if (nfp_net_rx_fill_freelist(dev->data->rx_queues[i]) != 0)
			return -1;
	}

	return 0;
}

uint32_t
nfp_net_rx_queue_count(void *rx_queue)
{
	uint32_t idx;
	uint32_t count = 0;
	struct nfp_net_rxq *rxq;
	struct nfp_net_rx_desc *rxds;

	rxq = rx_queue;
	idx = rxq->rd_p;

	/*
	 * Other PMDs just check the DD bit at intervals of 4 descriptors
	 * and count all four if the first one has the DD bit set. This is
	 * not accurate, of course, but it can be good for performance.
	 * Ideally, that should be done on chunks of descriptors belonging
	 * to the same cache line.
	 */
	while (count < rxq->rx_count) {
		rxds = &rxq->rxds[idx];
		if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
			break;

		count++;
		idx++;

		/* Wrapping */
		if (idx == rxq->rx_count)
			idx = 0;
	}

	return count;
}
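
/*
 * Illustrative sketch (not part of the driver) of the coarser approach
 * described in the comment above: probe the DD bit only every 4 descriptors
 * and credit all 4 when it is set. This is an assumption of how such a check
 * could look, not something this PMD implements.
 *
 *   uint32_t count = 0;
 *   uint32_t idx = rxq->rd_p;
 *
 *   while (count + 4 <= rxq->rx_count) {
 *       if ((rxq->rxds[idx].rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
 *           break;
 *       count += 4;
 *       idx = (idx + 4) % rxq->rx_count;
 *   }
 */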

/**
 * Set packet type to mbuf based on parsed structure.
 *
 * @param nfp_ptype
 *   Packet type structure parsed from the Rx descriptor.
 * @param mb
 *   Mbuf to set the packet type.
 */
static void
nfp_net_set_ptype(const struct nfp_ptype_parsed *nfp_ptype,
		struct rte_mbuf *mb)
{
	uint32_t mbuf_ptype = RTE_PTYPE_L2_ETHER;
	uint8_t nfp_tunnel_ptype = nfp_ptype->tunnel_ptype;

	if (nfp_tunnel_ptype != NFP_NET_PTYPE_TUNNEL_NONE)
		mbuf_ptype |= RTE_PTYPE_INNER_L2_ETHER;

	switch (nfp_ptype->outer_l3_ptype) {
	case NFP_NET_PTYPE_OUTER_L3_NONE:
		break;
	case NFP_NET_PTYPE_OUTER_L3_IPV4:
		mbuf_ptype |= RTE_PTYPE_L3_IPV4;
		break;
	case NFP_NET_PTYPE_OUTER_L3_IPV6:
		mbuf_ptype |= RTE_PTYPE_L3_IPV6;
		break;
	default:
		PMD_RX_LOG(DEBUG, "Unrecognized nfp outer layer 3 packet type: %u.",
				nfp_ptype->outer_l3_ptype);
		break;
	}

	switch (nfp_tunnel_ptype) {
	case NFP_NET_PTYPE_TUNNEL_NONE:
		break;
	case NFP_NET_PTYPE_TUNNEL_VXLAN:
		mbuf_ptype |= RTE_PTYPE_TUNNEL_VXLAN | RTE_PTYPE_L4_UDP;
		break;
	case NFP_NET_PTYPE_TUNNEL_NVGRE:
		mbuf_ptype |= RTE_PTYPE_TUNNEL_NVGRE;
		break;
	case NFP_NET_PTYPE_TUNNEL_GENEVE:
		mbuf_ptype |= RTE_PTYPE_TUNNEL_GENEVE | RTE_PTYPE_L4_UDP;
		break;
	default:
		PMD_RX_LOG(DEBUG, "Unrecognized nfp tunnel packet type: %u.",
				nfp_tunnel_ptype);
		break;
	}

	switch (nfp_ptype->l4_ptype) {
	case NFP_NET_PTYPE_L4_NONE:
		break;
	case NFP_NET_PTYPE_L4_TCP:
		mbuf_ptype |= NFP_PTYPE2RTE(nfp_tunnel_ptype, L4_TCP);
		break;
	case NFP_NET_PTYPE_L4_UDP:
		mbuf_ptype |= NFP_PTYPE2RTE(nfp_tunnel_ptype, L4_UDP);
		break;
	case NFP_NET_PTYPE_L4_FRAG:
		mbuf_ptype |= NFP_PTYPE2RTE(nfp_tunnel_ptype, L4_FRAG);
		break;
	case NFP_NET_PTYPE_L4_NONFRAG:
		mbuf_ptype |= NFP_PTYPE2RTE(nfp_tunnel_ptype, L4_NONFRAG);
		break;
	case NFP_NET_PTYPE_L4_ICMP:
		mbuf_ptype |= NFP_PTYPE2RTE(nfp_tunnel_ptype, L4_ICMP);
		break;
	case NFP_NET_PTYPE_L4_SCTP:
		mbuf_ptype |= NFP_PTYPE2RTE(nfp_tunnel_ptype, L4_SCTP);
		break;
	default:
		PMD_RX_LOG(DEBUG, "Unrecognized nfp layer 4 packet type: %u.",
				nfp_ptype->l4_ptype);
		break;
	}

	switch (nfp_ptype->l3_ptype) {
	case NFP_NET_PTYPE_L3_NONE:
		break;
	case NFP_NET_PTYPE_L3_IPV4:
		mbuf_ptype |= NFP_PTYPE2RTE(nfp_tunnel_ptype, L3_IPV4);
		break;
	case NFP_NET_PTYPE_L3_IPV6:
		mbuf_ptype |= NFP_PTYPE2RTE(nfp_tunnel_ptype, L3_IPV6);
		break;
	case NFP_NET_PTYPE_L3_IPV4_EXT:
		mbuf_ptype |= NFP_PTYPE2RTE(nfp_tunnel_ptype, L3_IPV4_EXT);
		break;
	case NFP_NET_PTYPE_L3_IPV6_EXT:
		mbuf_ptype |= NFP_PTYPE2RTE(nfp_tunnel_ptype, L3_IPV6_EXT);
		break;
	case NFP_NET_PTYPE_L3_IPV4_EXT_UNKNOWN:
		mbuf_ptype |= NFP_PTYPE2RTE(nfp_tunnel_ptype, L3_IPV4_EXT_UNKNOWN);
		break;
	case NFP_NET_PTYPE_L3_IPV6_EXT_UNKNOWN:
		mbuf_ptype |= NFP_PTYPE2RTE(nfp_tunnel_ptype, L3_IPV6_EXT_UNKNOWN);
		break;
	default:
		PMD_RX_LOG(DEBUG, "Unrecognized nfp layer 3 packet type: %u.",
				nfp_ptype->l3_ptype);
		break;
	}

	mb->packet_type = mbuf_ptype;
}

/**
 * Parse the packet type from the Rx descriptor and set it to the mbuf.
 *
 * @param rxq
 *   Rx queue.
 * @param rxds
 *   Rx descriptor including the offload info of the packet type.
 * @param mb
 *   Mbuf to set the packet type.
 */
void
nfp_net_parse_ptype(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf *mb)
{
	struct nfp_net_hw *hw = rxq->hw;
	struct nfp_ptype_parsed nfp_ptype;
	uint16_t rxd_ptype = rxds->rxd.offload_info;

	if ((hw->super.ctrl_ext & NFP_NET_CFG_CTRL_PKT_TYPE) == 0)
		return;

	if (rxd_ptype == 0 || (rxds->rxd.flags & PCIE_DESC_RX_VLAN) != 0)
		return;

	nfp_ptype.l4_ptype = (rxd_ptype & NFP_NET_PTYPE_L4_MASK) >>
			NFP_NET_PTYPE_L4_OFFSET;
	nfp_ptype.l3_ptype = (rxd_ptype & NFP_NET_PTYPE_L3_MASK) >>
			NFP_NET_PTYPE_L3_OFFSET;
	nfp_ptype.tunnel_ptype = (rxd_ptype & NFP_NET_PTYPE_TUNNEL_MASK) >>
			NFP_NET_PTYPE_TUNNEL_OFFSET;
	nfp_ptype.outer_l3_ptype = (rxd_ptype & NFP_NET_PTYPE_OUTER_L3_MASK) >>
			NFP_NET_PTYPE_OUTER_L3_OFFSET;

	nfp_net_set_ptype(&nfp_ptype, mb);
}

/*
 * RX path design:
 *
 * There are some decisions to make:
 * 1) How to check the DD bit of the RX descriptors
 * 2) How and when to allocate new mbufs
 *
 * The current implementation checks just one single DD bit per loop. As each
 * descriptor is 8 bytes, it is likely a good idea to check descriptors in
 * a single cache line instead. Tests with this change have not shown any
 * performance improvement, but it requires further investigation. For example,
 * depending on which descriptor is next, the number of descriptors could be
 * fewer than 8 when checking only those in the same cache line. This implies
 * extra work which could be counterproductive by itself. Indeed, the latest
 * firmware changes do just this: writing several descriptors with the DD bit
 * set to save PCIe bandwidth and DMA operations from the NFP.
 *
 * Mbuf allocation is done when a new packet is received. Then the descriptor
 * is automatically linked with the new mbuf and the old one is given to the
 * user. The main drawback of this design is that mbuf allocation is heavier
 * than the bulk allocations allowed by DPDK with rte_mempool_get_bulk. From
 * the cache point of view, allocating the mbuf early, as we do now, does not
 * seem to have any benefit at all. Again, tests with this change have not
 * shown any improvement. Also, rte_mempool_get_bulk returns all or nothing,
 * so the implications of this type of allocation should be studied more
 * deeply.
 */
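
/*
 * Illustrative sketch (not part of the driver) of the bulk-allocation
 * alternative discussed above, using rte_pktmbuf_alloc_bulk() which, like
 * rte_mempool_get_bulk(), is all or nothing. The burst size of 32 is an
 * assumption for demonstration only.
 *
 *   struct rte_mbuf *mbufs[32];
 *
 *   if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, mbufs, 32) != 0) {
 *       Nothing was allocated; either retry later or stop refilling.
 *   }
 *   ... refill 32 freelist descriptors from mbufs[] ...
 */
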
uint16_t
nfp_net_recv_pkts(void *rx_queue,
		struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	uint64_t dma_addr;
	uint16_t avail = 0;
	struct rte_mbuf *mb;
	uint16_t nb_hold = 0;
	struct nfp_net_hw *hw;
	struct rte_mbuf *new_mb;
	struct nfp_net_rxq *rxq;
	struct nfp_pf_dev *pf_dev;
	struct nfp_net_dp_buf *rxb;
	struct nfp_net_rx_desc *rxds;
	uint16_t avail_multiplexed = 0;

	rxq = rx_queue;
	if (unlikely(rxq == NULL)) {
		/*
		 * DPDK just checks that the queue index is lower than the
		 * number of enabled queues, but the queue still needs to be
		 * configured.
		 */
		PMD_RX_LOG(ERR, "RX Bad queue.");
		return 0;
	}

	hw = rxq->hw;
	pf_dev = rxq->hw_priv->pf_dev;

	while (avail + avail_multiplexed < nb_pkts) {
		rxb = &rxq->rxbufs[rxq->rd_p];
		if (unlikely(rxb == NULL)) {
			PMD_RX_LOG(ERR, "The rxb does not exist!");
			break;
		}

		rxds = &rxq->rxds[rxq->rd_p];
		if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
			break;

		/*
		 * Memory barrier to ensure that we won't do other reads
		 * before reading the DD bit.
		 */
		rte_rmb();

		/*
		 * We got a packet. Let's allocate a new mbuf for refilling the
		 * free descriptor ring as soon as possible.
		 */
		new_mb = rte_pktmbuf_alloc(rxq->mem_pool);
		if (unlikely(new_mb == NULL)) {
			PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu.",
					rxq->port_id, rxq->qidx);
			nfp_net_mbuf_alloc_failed(rxq);
			break;
		}

		/*
		 * Grab the mbuf and refill the descriptor with the
		 * previously allocated mbuf.
		 */
		mb = rxb->mbuf;
		rxb->mbuf = new_mb;

		PMD_RX_LOG(DEBUG, "Packet len: %u, mbuf_size: %u.",
				rxds->rxd.data_len, rxq->mbuf_size);

		/* Size of this segment */
		mb->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
		/* Size of the whole packet. We only support 1 segment */
		mb->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);

		if (unlikely((mb->data_len + hw->rx_offset) > rxq->mbuf_size)) {
			/*
			 * This should not happen and the user is responsible
			 * for avoiding it. But we still have to give some
			 * info about the error.
			 */
			PMD_RX_LOG(ERR, "Mbuf overflow, likely due to the RX offset.");
			rte_pktmbuf_free(mb);
			break;
		}

		/* Filling the received mbuf with packet info */
		if (hw->rx_offset != 0)
			mb->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
		else
			mb->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);

		/* No scatter mode supported */
		mb->nb_segs = 1;
		mb->next = NULL;
		mb->port = rxq->port_id;

		struct nfp_net_meta_parsed meta;
		nfp_net_meta_parse(rxds, rxq, hw, mb, &meta);

		nfp_net_parse_ptype(rxq, rxds, mb);

		/* Checking the checksum flag */
		nfp_net_rx_cksum(rxq, rxds, mb);

		/* Now resetting and updating the descriptor */
		rxds->vals[0] = 0;
		rxds->vals[1] = 0;
		dma_addr = rte_cpu_to_le_64(rte_mbuf_data_iova_default(new_mb));
		rxds->fld.dd = 0;
		rxds->fld.dma_addr_hi = (dma_addr >> 32) & 0xffff;
		rxds->fld.dma_addr_lo = dma_addr & 0xffffffff;
		nb_hold++;

		rxq->rd_p++;
		if (unlikely(rxq->rd_p == rxq->rx_count)) /* Wrapping */
			rxq->rd_p = 0;

		if (pf_dev->recv_pkt_meta_check_t(&meta)) {
			rx_pkts[avail++] = mb;
		} else {
			if (nfp_flower_pf_dispatch_pkts(rxq, mb, meta.port_id)) {
				avail_multiplexed++;
			} else {
				rte_pktmbuf_free(mb);
				break;
			}
		}
	}

	if (nb_hold == 0)
		return nb_hold;

	PMD_RX_LOG(DEBUG, "RX port_id=%hu queue_id=%hu, %hu packets received.",
			rxq->port_id, rxq->qidx, avail);

	nb_hold += rxq->nb_rx_hold;

	/*
	 * FL descriptors need to be written before incrementing the
	 * FL queue WR pointer.
	 */
	rte_wmb();
	if (nb_hold > rxq->rx_free_thresh) {
		PMD_RX_LOG(DEBUG, "The port=%hu queue=%hu nb_hold=%hu avail=%hu.",
				rxq->port_id, rxq->qidx, nb_hold, avail);
		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
		nb_hold = 0;
	}
	rxq->nb_rx_hold = nb_hold;

	return avail;
}
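
/*
 * Worked example (illustrative only) of the freelist pointer batching at the
 * end of nfp_net_recv_pkts(): with a hypothetical rx_free_thresh of 32,
 * recycled buffers are only accumulated in nb_hold until at least 33 are
 * pending; a single nfp_qcp_ptr_add() then advances the freelist write
 * pointer for all of them, trading a little refill latency for fewer PCIe
 * doorbell writes.
 */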

static void
nfp_net_rx_queue_release_mbufs(struct nfp_net_rxq *rxq)
{
	uint16_t i;

	if (rxq->rxbufs == NULL)
		return;

	for (i = 0; i < rxq->rx_count; i++) {
		if (rxq->rxbufs[i].mbuf != NULL) {
			rte_pktmbuf_free_seg(rxq->rxbufs[i].mbuf);
			rxq->rxbufs[i].mbuf = NULL;
		}
	}
}

void
nfp_net_rx_queue_release(struct rte_eth_dev *dev,
		uint16_t queue_idx)
{
	struct nfp_net_rxq *rxq = dev->data->rx_queues[queue_idx];

	if (rxq != NULL) {
		nfp_net_rx_queue_release_mbufs(rxq);
		rte_eth_dma_zone_free(dev, "rx_ring", queue_idx);
		rte_free(rxq->rxbufs);
		rte_free(rxq);
	}
}

void
nfp_net_reset_rx_queue(struct nfp_net_rxq *rxq)
{
	nfp_net_rx_queue_release_mbufs(rxq);
	rxq->rd_p = 0;
	rxq->nb_rx_hold = 0;
}

static void
nfp_rx_queue_setup_flbufsz(struct nfp_net_hw *hw,
		struct nfp_net_rxq *rxq)
{
	if (!hw->flbufsz_set_flag) {
		hw->flbufsz_set_flag = true;
		hw->flbufsz = rxq->mbuf_size;
		return;
	}

	if (hw->flbufsz < rxq->mbuf_size)
		hw->flbufsz = rxq->mbuf_size;
}

int
nfp_net_rx_queue_setup(struct rte_eth_dev *dev,
		uint16_t queue_idx,
		uint16_t nb_desc,
		unsigned int socket_id,
		const struct rte_eth_rxconf *rx_conf,
		struct rte_mempool *mp)
{
	uint32_t rx_desc_sz;
	uint16_t min_rx_desc;
	uint16_t max_rx_desc;
	struct nfp_net_hw *hw;
	struct nfp_net_rxq *rxq;
	const struct rte_memzone *tz;
	struct nfp_net_hw_priv *hw_priv;

	hw = nfp_net_get_hw(dev);
	hw_priv = dev->process_private;

	nfp_net_rx_desc_limits(hw_priv, &min_rx_desc, &max_rx_desc);

	/* Validating number of descriptors */
	rx_desc_sz = nb_desc * sizeof(struct nfp_net_rx_desc);
	if (rx_desc_sz % NFP_ALIGN_RING_DESC != 0 ||
			nb_desc > max_rx_desc || nb_desc < min_rx_desc) {
		PMD_DRV_LOG(ERR, "Wrong nb_desc value.");
		return -EINVAL;
	}

	/*
	 * Free memory prior to re-allocation if needed. This is the case after
	 * calling @nfp_net_stop().
	 */
	if (dev->data->rx_queues[queue_idx] != NULL) {
		nfp_net_rx_queue_release(dev, queue_idx);
		dev->data->rx_queues[queue_idx] = NULL;
	}

	/* Allocating rx queue data structure */
	rxq = rte_zmalloc_socket("ethdev RX queue", sizeof(struct nfp_net_rxq),
			RTE_CACHE_LINE_SIZE, socket_id);
	if (rxq == NULL)
		return -ENOMEM;

	dev->data->rx_queues[queue_idx] = rxq;

	/* Hw queues mapping based on firmware configuration */
	rxq->qidx = queue_idx;
	rxq->fl_qcidx = queue_idx * hw->stride_rx;
	rxq->qcp_fl = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->fl_qcidx);

	/*
	 * Tracking mbuf size for detecting a potential mbuf overflow due to
	 * RX offset.
	 */
	rxq->mem_pool = mp;
	rxq->mbuf_size = rxq->mem_pool->elt_size;
	rxq->mbuf_size -= (sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM);
	nfp_rx_queue_setup_flbufsz(hw, rxq);

	rxq->rx_count = nb_desc;
	rxq->port_id = dev->data->port_id;
	rxq->rx_free_thresh = rx_conf->rx_free_thresh;

	/*
	 * Allocate RX ring hardware descriptors. A memzone large enough to
	 * handle the maximum ring size is allocated in order to allow for
	 * resizing in later calls to the queue setup function.
	 */
	tz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx,
			sizeof(struct nfp_net_rx_desc) * max_rx_desc,
			NFP_MEMZONE_ALIGN, socket_id);
	if (tz == NULL) {
		PMD_DRV_LOG(ERR, "Error allocating rx dma.");
		nfp_net_rx_queue_release(dev, queue_idx);
		dev->data->rx_queues[queue_idx] = NULL;
		return -ENOMEM;
	}

	/* Saving physical and virtual addresses for the RX ring */
	rxq->dma = (uint64_t)tz->iova;
	rxq->rxds = tz->addr;

	/* Mbuf pointers array for referencing mbufs linked to RX descriptors */
	rxq->rxbufs = rte_zmalloc_socket("rxq->rxbufs",
			sizeof(*rxq->rxbufs) * nb_desc, RTE_CACHE_LINE_SIZE,
			socket_id);
	if (rxq->rxbufs == NULL) {
		nfp_net_rx_queue_release(dev, queue_idx);
		dev->data->rx_queues[queue_idx] = NULL;
		return -ENOMEM;
	}

	nfp_net_reset_rx_queue(rxq);

	rxq->hw = hw;
	rxq->hw_priv = dev->process_private;

	/*
	 * Telling the HW about the physical address of the RX ring and number
	 * of descriptors in log2 format.
	 */
	nn_cfg_writeq(&hw->super, NFP_NET_CFG_RXR_ADDR(queue_idx), rxq->dma);
	nn_cfg_writeb(&hw->super, NFP_NET_CFG_RXR_SZ(queue_idx), rte_log2_u32(nb_desc));

	return 0;
}
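
/*
 * Worked example (illustrative only) of the ring size configuration above:
 * with a hypothetical nb_desc of 1024, the descriptor area checked against
 * NFP_ALIGN_RING_DESC is 1024 * sizeof(struct nfp_net_rx_desc) bytes, and
 * rte_log2_u32(1024) = 10 is the value written to NFP_NET_CFG_RXR_SZ, since
 * the hardware expects the ring size in log2 form.
 */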

static inline uint32_t
nfp_net_read_tx_free_qcp(struct nfp_net_txq *txq)
{
	/*
	 * If TX ring pointer write back is not supported, do a PCIe read.
	 * Otherwise, read the qcp value from the write back DMA address.
	 */
	if (txq->txrwb == NULL)
		return nfp_qcp_read(txq->qcp_q, NFP_QCP_READ_PTR);

	/*
	 * In most cases the TX count is a power of two and the costly modulus
	 * operation can be substituted with a subtraction and an AND operation.
	 */
	if (rte_is_power_of_2(txq->tx_count) == 1)
		return (*txq->txrwb) & (txq->tx_count - 1);
	else
		return (*txq->txrwb) % txq->tx_count;
}
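
/*
 * Worked example (illustrative only) of the power-of-two shortcut above:
 * with a hypothetical tx_count of 512 and a write-back value of 1500,
 * 1500 & (512 - 1) = 476, the same result as 1500 % 512, but the AND avoids
 * the integer division.
 */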

/**
 * Check for descriptors with a complete status
 *
 * @param txq
 *   TX queue to work with
 *
 * @return
 *   Number of descriptors freed
 */
uint32_t
nfp_net_tx_free_bufs(struct nfp_net_txq *txq)
{
	uint32_t todo;
	uint32_t qcp_rd_p;

	PMD_TX_LOG(DEBUG, "Queue %hu. Check for descriptor with a complete"
			" status.", txq->qidx);

	/* Work out how many packets have been sent */
	qcp_rd_p = nfp_net_read_tx_free_qcp(txq);

	if (qcp_rd_p == txq->rd_p) {
		PMD_TX_LOG(DEBUG, "Queue %hu: It seems harrier is not sending "
				"packets (%u, %u).", txq->qidx,
				qcp_rd_p, txq->rd_p);
		return 0;
	}

	if (qcp_rd_p > txq->rd_p)
		todo = qcp_rd_p - txq->rd_p;
	else
		todo = qcp_rd_p + txq->tx_count - txq->rd_p;

	PMD_TX_LOG(DEBUG, "The qcp_rd_p: %u, txq->rd_p: %u, todo: %u.",
			qcp_rd_p, txq->rd_p, todo);

	if (todo == 0)
		return todo;

	txq->rd_p += todo;
	if (unlikely(txq->rd_p >= txq->tx_count))
		txq->rd_p -= txq->tx_count;

	return todo;
}
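
/*
 * Worked example (illustrative only) of the wrap-around arithmetic above:
 * with a hypothetical tx_count of 1024, txq->rd_p at 1000 and a hardware
 * read pointer qcp_rd_p of 10, qcp_rd_p < rd_p, so
 * todo = 10 + 1024 - 1000 = 34 descriptors have completed and can be freed.
 */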

static void
nfp_net_tx_queue_release_mbufs(struct nfp_net_txq *txq)
{
	uint32_t i;

	if (txq->txbufs == NULL)
		return;

	for (i = 0; i < txq->tx_count; i++) {
		if (txq->txbufs[i].mbuf != NULL) {
			rte_pktmbuf_free_seg(txq->txbufs[i].mbuf);
			txq->txbufs[i].mbuf = NULL;
		}
	}
}

void
nfp_net_tx_queue_release(struct rte_eth_dev *dev,
		uint16_t queue_idx)
{
	struct nfp_net_hw *net_hw;
	struct nfp_net_txq *txq = dev->data->tx_queues[queue_idx];

	if (txq != NULL) {
		net_hw = nfp_net_get_hw(dev);
		if (net_hw->txrwb_mz != NULL)
			nn_cfg_writeq(&net_hw->super, NFP_NET_CFG_TXR_WB_ADDR(queue_idx), 0);
		nfp_net_tx_queue_release_mbufs(txq);
		rte_eth_dma_zone_free(dev, "tx_ring", queue_idx);
		rte_free(txq->txbufs);
		rte_free(txq);
	}
}

void
nfp_net_reset_tx_queue(struct nfp_net_txq *txq)
{
	nfp_net_tx_queue_release_mbufs(txq);
	txq->wr_p = 0;
	txq->rd_p = 0;
	if (txq->txrwb != NULL)
		*txq->txrwb = 0;
}

int
nfp_net_tx_queue_setup(struct rte_eth_dev *dev,
		uint16_t queue_idx,
		uint16_t nb_desc,
		unsigned int socket_id,
		const struct rte_eth_txconf *tx_conf)
{
	struct nfp_net_hw_priv *hw_priv;

	hw_priv = dev->process_private;

	if (hw_priv->pf_dev->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
		return nfp_net_nfd3_tx_queue_setup(dev, queue_idx,
				nb_desc, socket_id, tx_conf);
	else
		return nfp_net_nfdk_tx_queue_setup(dev, queue_idx,
				nb_desc, socket_id, tx_conf);
}

void
nfp_net_rx_queue_info_get(struct rte_eth_dev *dev,
		uint16_t queue_id,
		struct rte_eth_rxq_info *info)
{
	struct rte_eth_dev_info dev_info;
	struct nfp_net_rxq *rxq = dev->data->rx_queues[queue_id];

	info->mp = rxq->mem_pool;
	info->nb_desc = rxq->rx_count;

	info->conf.rx_free_thresh = rxq->rx_free_thresh;

	nfp_net_infos_get(dev, &dev_info);
	info->conf.offloads = dev_info.rx_offload_capa &
			dev->data->dev_conf.rxmode.offloads;
}

void
nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
		uint16_t queue_id,
		struct rte_eth_txq_info *info)
{
	struct rte_eth_dev_info dev_info;
	struct nfp_net_hw_priv *hw_priv = dev->process_private;
	struct nfp_net_txq *txq = dev->data->tx_queues[queue_id];

	if (hw_priv->pf_dev->ver.extend == NFP_NET_CFG_VERSION_DP_NFD3)
		info->nb_desc = txq->tx_count / NFD3_TX_DESC_PER_PKT;
	else
		info->nb_desc = txq->tx_count / NFDK_TX_DESC_PER_SIMPLE_PKT;

	info->conf.tx_free_thresh = txq->tx_free_thresh;

	nfp_net_infos_get(dev, &dev_info);
	info->conf.offloads = dev_info.tx_offload_capa &
			dev->data->dev_conf.txmode.offloads;
}

void
nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev)
{
	if (nfp_net_get_avx2_supported())
		eth_dev->rx_pkt_burst = nfp_net_vec_avx2_recv_pkts;
	else
		eth_dev->rx_pkt_burst = nfp_net_recv_pkts;
}