xref: /dpdk/drivers/net/nfp/nfp_rxtx.c (revision 4aa10e5dc1b0fd6cc5b1b18770ac603e2c33a66c)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2014-2021 Netronome Systems, Inc.
3  * All rights reserved.
4  *
5  * Small portions derived from code Copyright(c) 2010-2015 Intel Corporation.
6  */
7 
8 /*
9  * vim:shiftwidth=8:noexpandtab
10  *
11  * @file dpdk/pmd/nfp_rxtx.c
12  *
13  * Netronome vNIC DPDK Poll-Mode Driver: Rx/Tx functions
14  */
15 
16 #include <ethdev_driver.h>
17 #include <ethdev_pci.h>
18 
19 #include "nfp_common.h"
20 #include "nfp_ctrl.h"
21 #include "nfp_rxtx.h"
22 #include "nfp_logs.h"
23 #include "nfpcore/nfp_mip.h"
24 #include "nfpcore/nfp_rtsym.h"
25 #include "nfpcore/nfp-common/nfp_platform.h"
26 
27 static int
28 nfp_net_rx_fill_freelist(struct nfp_net_rxq *rxq)
29 {
30 	struct nfp_net_rx_buff *rxe = rxq->rxbufs;
31 	uint64_t dma_addr;
32 	unsigned int i;
33 
34 	PMD_RX_LOG(DEBUG, "Fill Rx Freelist for %u descriptors",
35 		   rxq->rx_count);
36 
37 	for (i = 0; i < rxq->rx_count; i++) {
38 		struct nfp_net_rx_desc *rxd;
39 		struct rte_mbuf *mbuf = rte_pktmbuf_alloc(rxq->mem_pool);
40 
41 		if (mbuf == NULL) {
42 			PMD_DRV_LOG(ERR, "RX mbuf alloc failed queue_id=%u",
43 				(unsigned int)rxq->qidx);
44 			return -ENOMEM;
45 		}
46 
47 		dma_addr = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(mbuf));
48 
49 		rxd = &rxq->rxds[i];
50 		rxd->fld.dd = 0;
51 		rxd->fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
52 		rxd->fld.dma_addr_lo = dma_addr & 0xffffffff;
53 		rxe[i].mbuf = mbuf;
54 		PMD_RX_LOG(DEBUG, "[%d]: %" PRIx64, i, dma_addr);
55 	}
56 
57 	/* Make sure all writes are flushed before telling the hardware */
58 	rte_wmb();
59 
60 	/* Not advertising the whole ring as the firmware gets confused if so */
61 	PMD_RX_LOG(DEBUG, "Increment FL write pointer by %u",
62 		   rxq->rx_count - 1);
63 
64 	nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, rxq->rx_count - 1);
65 
66 	return 0;
67 }
68 
69 int
70 nfp_net_rx_freelist_setup(struct rte_eth_dev *dev)
71 {
72 	int i;
73 
74 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
75 		if (nfp_net_rx_fill_freelist(dev->data->rx_queues[i]) < 0)
76 			return -1;
77 	}
78 	return 0;
79 }
80 
81 uint32_t
82 nfp_net_rx_queue_count(void *rx_queue)
83 {
84 	struct nfp_net_rxq *rxq;
85 	struct nfp_net_rx_desc *rxds;
86 	uint32_t idx;
87 	uint32_t count;
88 
89 	rxq = rx_queue;
90 
91 	idx = rxq->rd_p;
92 
93 	count = 0;
94 
95 	/*
96 	 * Other PMDs check the DD bit only once per group of 4 descriptors
97 	 * and count all four when the first one has the DD bit set. That is
98 	 * not accurate but can be good for performance. Ideally this check
99 	 * should be done on chunks of descriptors belonging to the same
100 	 * cache line.
101 	 */
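	/*
	 * Illustrative sketch (not compiled) of the cache-line-chunk check
	 * suggested above, assuming 8-byte descriptors and 64-byte cache
	 * lines so that 8 descriptors share one line; 'j' and 'chunk' are
	 * names used only for this sketch:
	 *
	 *	uint32_t j, chunk = 8 - (idx % 8);
	 *
	 *	for (j = 0; j < chunk; j++)
	 *		if ((rxq->rxds[(idx + j) % rxq->rx_count].rxd.meta_len_dd &
	 *				PCIE_DESC_RX_DD) == 0)
	 *			break;
	 *
	 * The loop below keeps the simpler one-descriptor-at-a-time check.
	 */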
102 
103 	while (count < rxq->rx_count) {
104 		rxds = &rxq->rxds[idx];
105 		if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
106 			break;
107 
108 		count++;
109 		idx++;
110 
111 		/* Wrapping? */
112 		if (idx == rxq->rx_count)
113 			idx = 0;
114 	}
115 
116 	return count;
117 }
118 
119 /* nfp_net_parse_meta() - Parse the metadata prepended to the packet */
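/*
 * Illustrative layout (an assumed sketch based on the parsing below, not an
 * authoritative format description): the metadata area sits directly before
 * the packet data and starts with one 32-bit big-endian header word holding
 * 4-bit field-type values, followed by one 32-bit data word per field. For
 * RSS hash metadata this gives:
 *
 *	mtod(mbuf) - meta_len:     header word, low nibble = NFP_NET_META_HASH,
 *	                           next nibble = hash type
 *	mtod(mbuf) - meta_len + 4: 32-bit RSS hash value (big endian)
 *	mtod(mbuf):                start of the packet data
 */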
120 static void
121 nfp_net_parse_meta(struct nfp_meta_parsed *meta,
122 		struct nfp_net_rx_desc *rxd,
123 		struct nfp_net_rxq *rxq,
124 		struct rte_mbuf *mbuf)
125 {
126 	uint32_t meta_info;
127 	uint32_t vlan_info;
128 	uint8_t *meta_offset;
129 	struct nfp_net_hw *hw = rxq->hw;
130 
131 	if (unlikely((NFD_CFG_MAJOR_VERSION_of(hw->ver) < 2) ||
132 			NFP_DESC_META_LEN(rxd) == 0))
133 		return;
134 
135 	meta_offset = rte_pktmbuf_mtod(mbuf, uint8_t *);
136 	meta_offset -= NFP_DESC_META_LEN(rxd);
137 	meta_info = rte_be_to_cpu_32(*(rte_be32_t *)meta_offset);
138 	meta_offset += 4;
139 
140 	for (; meta_info != 0; meta_info >>= NFP_NET_META_FIELD_SIZE, meta_offset += 4) {
141 		switch (meta_info & NFP_NET_META_FIELD_MASK) {
142 		case NFP_NET_META_HASH:
143 			/* Next field type is about the hash type */
144 			/* The next 4-bit field carries the hash type */
145 			/* Hash value is in the data field */
146 			meta->hash = rte_be_to_cpu_32(*(rte_be32_t *)meta_offset);
147 			meta->hash_type = meta_info & NFP_NET_META_FIELD_MASK;
148 			break;
149 		case NFP_NET_META_VLAN:
150 			vlan_info = rte_be_to_cpu_32(*(rte_be32_t *)meta_offset);
151 			meta->vlan[meta->vlan_layer].offload =
152 					vlan_info >> NFP_NET_META_VLAN_OFFLOAD;
153 			meta->vlan[meta->vlan_layer].tci =
154 					vlan_info & NFP_NET_META_VLAN_MASK;
155 			meta->vlan[meta->vlan_layer].tpid = NFP_NET_META_TPID(vlan_info);
156 			++meta->vlan_layer;
157 			break;
158 		default:
159 			/* Unsupported metadata can be a performance issue */
160 			return;
161 		}
162 	}
163 }
164 
165 /*
166  * nfp_net_parse_meta_hash() - Set mbuf hash data based on the metadata info
167  *
168  * The RSS hash and hash-type are prepended to the packet data.
169  * Extract and decode it and set the mbuf fields.
170  */
171 static void
172 nfp_net_parse_meta_hash(const struct nfp_meta_parsed *meta,
173 		struct nfp_net_rx_desc *rxd,
174 		struct nfp_net_rxq *rxq,
175 		struct rte_mbuf *mbuf)
176 {
177 	uint32_t hash;
178 	uint32_t hash_type;
179 	struct nfp_net_hw *hw = rxq->hw;
180 
181 	if ((hw->ctrl & NFP_NET_CFG_CTRL_RSS_ANY) == 0)
182 		return;
183 
184 	if (likely((hw->cap & NFP_NET_CFG_CTRL_RSS_ANY) != 0 &&
185 			NFP_DESC_META_LEN(rxd) != 0)) {
186 		hash = meta->hash;
187 		hash_type = meta->hash_type;
188 	} else {
189 		if ((rxd->rxd.flags & PCIE_DESC_RX_RSS) == 0)
190 			return;
191 
192 		hash = rte_be_to_cpu_32(*(uint32_t *)NFP_HASH_OFFSET);
193 		hash_type = rte_be_to_cpu_32(*(uint32_t *)NFP_HASH_TYPE_OFFSET);
194 	}
195 
196 	mbuf->hash.rss = hash;
197 	mbuf->ol_flags |= RTE_MBUF_F_RX_RSS_HASH;
198 
199 	switch (hash_type) {
200 	case NFP_NET_RSS_IPV4:
201 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4;
202 		break;
203 	case NFP_NET_RSS_IPV6:
204 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6;
205 		break;
206 	case NFP_NET_RSS_IPV6_EX:
207 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
208 		break;
209 	case NFP_NET_RSS_IPV4_TCP:
210 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4 | RTE_PTYPE_INNER_L4_TCP;
211 		break;
212 	case NFP_NET_RSS_IPV6_TCP:
213 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_TCP;
214 		break;
215 	case NFP_NET_RSS_IPV4_UDP:
216 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4 | RTE_PTYPE_INNER_L4_UDP;
217 		break;
218 	case NFP_NET_RSS_IPV6_UDP:
219 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_UDP;
220 		break;
221 	default:
222 		mbuf->packet_type |= RTE_PTYPE_INNER_L4_MASK;
223 	}
224 }
225 
226 /*
227  * nfp_net_parse_meta_vlan() - Set mbuf vlan_strip data based on metadata info
228  *
229  * The VLAN info TPID and TCI are prepended to the packet data.
230  * Extract and decode it and set the mbuf fields.
231  */
232 static void
233 nfp_net_parse_meta_vlan(const struct nfp_meta_parsed *meta,
234 		struct nfp_net_rx_desc *rxd,
235 		struct nfp_net_rxq *rxq,
236 		struct rte_mbuf *mb)
237 {
238 	struct nfp_net_hw *hw = rxq->hw;
239 
240 	/* Skip if VLAN stripping is not enabled in the hardware. */
241 	if ((hw->ctrl & (NFP_NET_CFG_CTRL_RXVLAN | NFP_NET_CFG_CTRL_RXVLAN_V2)) == 0)
242 		return;
243 
244 	/*
245 	 * The NIC supports two ways of passing the VLAN info:
246 	 * 1. Through the metadata, when NFP_NET_CFG_CTRL_RXVLAN_V2
247 	 * is set.
248 	 * 2. Through the descriptor, when NFP_NET_CFG_CTRL_RXVLAN
249 	 * is set.
250 	 *
251 	 * If the NIC does not pass the VLAN info, it is not necessary
252 	 * to do anything.
253 	 */
254 	if ((hw->ctrl & NFP_NET_CFG_CTRL_RXVLAN_V2) != 0) {
255 		if (meta->vlan_layer >= 1 && meta->vlan[0].offload != 0) {
256 			mb->vlan_tci = rte_cpu_to_le_32(meta->vlan[0].tci);
257 			mb->ol_flags |= RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED;
258 		}
259 	} else if ((hw->ctrl & NFP_NET_CFG_CTRL_RXVLAN) != 0) {
260 		if ((rxd->rxd.flags & PCIE_DESC_RX_VLAN) != 0) {
261 			mb->vlan_tci = rte_cpu_to_le_32(rxd->rxd.vlan);
262 			mb->ol_flags |= RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED;
263 		}
264 	}
265 }
266 
267 /*
268  * nfp_net_parse_meta_qinq() - Set mbuf qinq_strip data based on metadata info
269  *
270  * The outer VLAN TCI is prepended to the packet data.
271  * Extract and decode it and set the mbuf fields.
272  *
273  * If both RTE_MBUF_F_RX_VLAN and NFP_NET_CFG_CTRL_RXQINQ are set, the 2 VLANs
274  *   have been stripped by the hardware and their TCIs are saved in
275  *   mbuf->vlan_tci (inner) and mbuf->vlan_tci_outer (outer).
276  * If NFP_NET_CFG_CTRL_RXQINQ is set and RTE_MBUF_F_RX_VLAN is unset, only the
277  *   outer VLAN is removed from packet data, but both tci are saved in
278  *   mbuf->vlan_tci (inner) and mbuf->vlan_tci_outer (outer).
279  *
280  * qinq set & vlan set : meta->vlan_layer>=2, meta->vlan[0].offload=1, meta->vlan[1].offload=1
281  * qinq set & vlan not set: meta->vlan_layer>=2, meta->vlan[1].offload=1, meta->vlan[0].offload=0
282  * qinq not set & vlan set: meta->vlan_layer=1, meta->vlan[0].offload=1
283  * qinq not set & vlan not set: meta->vlan_layer=0
284  */
285 static void
286 nfp_net_parse_meta_qinq(const struct nfp_meta_parsed *meta,
287 		struct nfp_net_rxq *rxq,
288 		struct rte_mbuf *mb)
289 {
290 	struct nfp_net_hw *hw = rxq->hw;
291 
292 	if ((hw->ctrl & NFP_NET_CFG_CTRL_RXQINQ) == 0 ||
293 			(hw->cap & NFP_NET_CFG_CTRL_RXQINQ) == 0)
294 		return;
295 
296 	if (meta->vlan_layer < NFP_META_MAX_VLANS)
297 		return;
298 
299 	if (meta->vlan[0].offload == 0)
300 		mb->vlan_tci = rte_cpu_to_le_16(meta->vlan[0].tci);
301 	mb->vlan_tci_outer = rte_cpu_to_le_16(meta->vlan[1].tci);
302 	PMD_RX_LOG(DEBUG, "Received outer vlan is %u inner vlan is %u",
303 			mb->vlan_tci_outer, mb->vlan_tci);
304 	mb->ol_flags |= RTE_MBUF_F_RX_QINQ | RTE_MBUF_F_RX_QINQ_STRIPPED;
305 }
306 
307 /*
308  * RX path design:
309  *
310  * There are some decisions to take:
311  * 1) How to check DD RX descriptors bit
312  * 2) How and when to allocate new mbufs
313  *
314  * Current implementation checks just one single DD bit each loop. As each
315  * descriptor is 8 bytes, it is likely a good idea to check descriptors in
316  * a single cache line instead. Tests with this change have not shown any
317  * performance improvement but it requires further investigation. For example,
318  * depending on which descriptor is next, the number of descriptors could be
319  * less than 8 for just checking those in the same cache line. This implies
320  * extra work which could be counterproductive by itself. Indeed, last firmware
321  * changes are just doing this: writing several descriptors with the DD bit
322  * for saving PCIe bandwidth and DMA operations from the NFP.
323  *
324  * Mbuf allocation is done when a new packet is received. Then the descriptor
325  * is automatically linked with the new mbuf and the old one is given to the
326  * user. The main drawback with this design is that mbuf allocation is heavier
327  * than the bulk allocations allowed by DPDK with rte_mempool_get_bulk. From
328  * the cache point of view, allocating the mbuf early on, as done now, does
329  * not seem to have any benefit at all. Again, tests with this change have not
330  * shown any improvement. Also, rte_mempool_get_bulk returns all or nothing,
331  * so the implications of this type of allocation should be studied more
332  * deeply.
333  */
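/*
 * Illustrative sketch (not part of the driver) of the bulk-allocation
 * alternative discussed above. 'N' (burst size), 'bufs', 'j', 'dma' and
 * 'slot' (the freelist refill position) are names used only for this sketch;
 * rte_pktmbuf_alloc_bulk() returns 0 only when all N mbufs could be
 * allocated (all-or-nothing):
 *
 *	struct rte_mbuf *bufs[N];
 *	uint64_t dma;
 *	int j;
 *
 *	if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, bufs, N) == 0) {
 *		for (j = 0; j < N; j++) {
 *			dma = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(bufs[j]));
 *			rxq->rxds[slot].fld.dd = 0;
 *			rxq->rxds[slot].fld.dma_addr_hi = (dma >> 32) & 0xff;
 *			rxq->rxds[slot].fld.dma_addr_lo = dma & 0xffffffff;
 *			rxq->rxbufs[slot].mbuf = bufs[j];
 *			slot = (slot + 1 == rxq->rx_count) ? 0 : slot + 1;
 *		}
 *	}
 *
 * Whether this helps in practice has not been measured here; see the notes
 * above about rte_mempool_get_bulk being all-or-nothing.
 */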
334 
335 uint16_t
336 nfp_net_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
337 {
338 	struct nfp_net_rxq *rxq;
339 	struct nfp_net_rx_desc *rxds;
340 	struct nfp_net_rx_buff *rxb;
341 	struct nfp_net_hw *hw;
342 	struct rte_mbuf *mb;
343 	struct rte_mbuf *new_mb;
344 	struct nfp_meta_parsed meta;
345 	uint16_t nb_hold;
346 	uint64_t dma_addr;
347 	uint16_t avail;
348 
349 	avail = 0;
350 	rxq = rx_queue;
351 	if (unlikely(rxq == NULL)) {
352 		/*
353 		 * DPDK just checks that the queue index is lower than the
354 		 * number of enabled queues, but the queue also needs to be configured.
355 		 */
356 		RTE_LOG_DP(ERR, PMD, "RX Bad queue\n");
357 		return avail;
358 	}
359 
360 	hw = rxq->hw;
361 	nb_hold = 0;
362 
363 	while (avail < nb_pkts) {
364 		rxb = &rxq->rxbufs[rxq->rd_p];
365 		if (unlikely(rxb == NULL)) {
366 			RTE_LOG_DP(ERR, PMD, "rxb does not exist!\n");
367 			break;
368 		}
369 
370 		rxds = &rxq->rxds[rxq->rd_p];
371 		if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
372 			break;
373 
374 		/*
375 		 * Memory barrier to ensure that we won't do other
376 		 * reads before the DD bit.
377 		 */
378 		rte_rmb();
379 
380 		/*
381 		 * We got a packet. Let's alloc a new mbuf for refilling the
382 		 * free descriptor ring as soon as possible
383 		 */
384 		new_mb = rte_pktmbuf_alloc(rxq->mem_pool);
385 		if (unlikely(new_mb == NULL)) {
386 			RTE_LOG_DP(DEBUG, PMD,
387 			"RX mbuf alloc failed port_id=%u queue_id=%u\n",
388 				rxq->port_id, (unsigned int)rxq->qidx);
389 			nfp_net_mbuf_alloc_failed(rxq);
390 			break;
391 		}
392 
393 		/*
394 		 * Grab the mbuf and refill the descriptor with the
395 		 * previously allocated mbuf
396 		 */
397 		mb = rxb->mbuf;
398 		rxb->mbuf = new_mb;
399 
400 		PMD_RX_LOG(DEBUG, "Packet len: %u, mbuf_size: %u",
401 			   rxds->rxd.data_len, rxq->mbuf_size);
402 
403 		/* Size of this segment */
404 		mb->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
405 		/* Size of the whole packet. We just support 1 segment */
406 		mb->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
407 
408 		if (unlikely((mb->data_len + hw->rx_offset) >
409 			     rxq->mbuf_size)) {
410 			/*
411 			 * This should not happen and the user has the
412 			 * responsibility of avoiding it. But we have
413 			 * to give some info about the error
414 			 */
415 			RTE_LOG_DP(ERR, PMD,
416 				"mbuf overflow likely due to the RX offset.\n"
417 				"\t\tYour mbuf size should have extra space for"
418 				" RX offset=%u bytes.\n"
419 				"\t\tCurrently you just have %u bytes available"
420 				" but the received packet is %u bytes long",
421 				hw->rx_offset,
422 				rxq->mbuf_size - hw->rx_offset,
423 				mb->data_len);
424 			rte_pktmbuf_free(mb);
425 			break;
426 		}
427 
428 		/* Filling the received mbuf with packet info */
429 		if (hw->rx_offset)
430 			mb->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
431 		else
432 			mb->data_off = RTE_PKTMBUF_HEADROOM +
433 				       NFP_DESC_META_LEN(rxds);
434 
435 		/* No scatter mode supported */
436 		mb->nb_segs = 1;
437 		mb->next = NULL;
438 		mb->port = rxq->port_id;
439 
440 		memset(&meta, 0, sizeof(meta));
441 		nfp_net_parse_meta(&meta, rxds, rxq, mb);
442 		nfp_net_parse_meta_hash(&meta, rxds, rxq, mb);
443 		nfp_net_parse_meta_vlan(&meta, rxds, rxq, mb);
444 		nfp_net_parse_meta_qinq(&meta, rxq, mb);
445 
446 		/* Checking the checksum flag */
447 		nfp_net_rx_cksum(rxq, rxds, mb);
448 
449 		/* Adding the mbuf to the mbuf array passed by the app */
450 		rx_pkts[avail++] = mb;
451 
452 		/* Now resetting and updating the descriptor */
453 		rxds->vals[0] = 0;
454 		rxds->vals[1] = 0;
455 		dma_addr = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(new_mb));
456 		rxds->fld.dd = 0;
457 		rxds->fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
458 		rxds->fld.dma_addr_lo = dma_addr & 0xffffffff;
459 		nb_hold++;
460 
461 		rxq->rd_p++;
462 		if (unlikely(rxq->rd_p == rxq->rx_count)) /* Wrapping? */
463 			rxq->rd_p = 0;
464 	}
465 
466 	if (nb_hold == 0)
467 		return nb_hold;
468 
469 	PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received",
470 		   rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
471 
472 	nb_hold += rxq->nb_rx_hold;
473 
474 	/*
475 	 * FL descriptors need to be written before incrementing the
476 	 * FL queue WR pointer
477 	 */
478 	rte_wmb();
479 	if (nb_hold > rxq->rx_free_thresh) {
480 		PMD_RX_LOG(DEBUG, "port=%u queue=%u nb_hold=%u avail=%u",
481 			   rxq->port_id, (unsigned int)rxq->qidx,
482 			   (unsigned int)nb_hold, (unsigned int)avail);
483 		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
484 		nb_hold = 0;
485 	}
486 	rxq->nb_rx_hold = nb_hold;
487 
488 	return avail;
489 }
490 
491 static void
492 nfp_net_rx_queue_release_mbufs(struct nfp_net_rxq *rxq)
493 {
494 	unsigned int i;
495 
496 	if (rxq->rxbufs == NULL)
497 		return;
498 
499 	for (i = 0; i < rxq->rx_count; i++) {
500 		if (rxq->rxbufs[i].mbuf) {
501 			rte_pktmbuf_free_seg(rxq->rxbufs[i].mbuf);
502 			rxq->rxbufs[i].mbuf = NULL;
503 		}
504 	}
505 }
506 
507 void
508 nfp_net_rx_queue_release(struct rte_eth_dev *dev, uint16_t queue_idx)
509 {
510 	struct nfp_net_rxq *rxq = dev->data->rx_queues[queue_idx];
511 
512 	if (rxq) {
513 		nfp_net_rx_queue_release_mbufs(rxq);
514 		rte_eth_dma_zone_free(dev, "rx_ring", queue_idx);
515 		rte_free(rxq->rxbufs);
516 		rte_free(rxq);
517 	}
518 }
519 
520 void
521 nfp_net_reset_rx_queue(struct nfp_net_rxq *rxq)
522 {
523 	nfp_net_rx_queue_release_mbufs(rxq);
524 	rxq->rd_p = 0;
525 	rxq->nb_rx_hold = 0;
526 }
527 
528 int
529 nfp_net_rx_queue_setup(struct rte_eth_dev *dev,
530 		       uint16_t queue_idx, uint16_t nb_desc,
531 		       unsigned int socket_id,
532 		       const struct rte_eth_rxconf *rx_conf,
533 		       struct rte_mempool *mp)
534 {
535 	int ret;
536 	uint16_t min_rx_desc;
537 	uint16_t max_rx_desc;
538 	const struct rte_memzone *tz;
539 	struct nfp_net_rxq *rxq;
540 	struct nfp_net_hw *hw;
541 	uint32_t rx_desc_sz;
542 
543 	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
544 
545 	PMD_INIT_FUNC_TRACE();
546 
547 	ret = nfp_net_rx_desc_limits(hw, &min_rx_desc, &max_rx_desc);
548 	if (ret != 0)
549 		return ret;
550 
551 	/* Validating number of descriptors */
552 	rx_desc_sz = nb_desc * sizeof(struct nfp_net_rx_desc);
553 	if (rx_desc_sz % NFP_ALIGN_RING_DESC != 0 ||
554 	    nb_desc > max_rx_desc || nb_desc < min_rx_desc) {
555 		PMD_DRV_LOG(ERR, "Wrong nb_desc value");
556 		return -EINVAL;
557 	}
558 
559 	/*
560 	 * Free memory prior to re-allocation if needed. This is the case after
561 	 * calling nfp_net_stop
562 	 */
563 	if (dev->data->rx_queues[queue_idx]) {
564 		nfp_net_rx_queue_release(dev, queue_idx);
565 		dev->data->rx_queues[queue_idx] = NULL;
566 	}
567 
568 	/* Allocating rx queue data structure */
569 	rxq = rte_zmalloc_socket("ethdev RX queue", sizeof(struct nfp_net_rxq),
570 				 RTE_CACHE_LINE_SIZE, socket_id);
571 	if (rxq == NULL)
572 		return -ENOMEM;
573 
574 	dev->data->rx_queues[queue_idx] = rxq;
575 
576 	/* Hw queues mapping based on firmware configuration */
577 	rxq->qidx = queue_idx;
578 	rxq->fl_qcidx = queue_idx * hw->stride_rx;
579 	rxq->rx_qcidx = rxq->fl_qcidx + (hw->stride_rx - 1);
580 	rxq->qcp_fl = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->fl_qcidx);
581 	rxq->qcp_rx = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->rx_qcidx);
582 
583 	/*
584 	 * Tracking mbuf size for detecting a potential mbuf overflow due to
585 	 * RX offset
586 	 */
587 	rxq->mem_pool = mp;
588 	rxq->mbuf_size = rxq->mem_pool->elt_size;
589 	rxq->mbuf_size -= (sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM);
590 	hw->flbufsz = rxq->mbuf_size;
591 
592 	rxq->rx_count = nb_desc;
593 	rxq->port_id = dev->data->port_id;
594 	rxq->rx_free_thresh = rx_conf->rx_free_thresh;
595 	rxq->drop_en = rx_conf->rx_drop_en;
596 
597 	/*
598 	 * Allocate RX ring hardware descriptors. A memzone large enough to
599 	 * handle the maximum ring size is allocated in order to allow for
600 	 * resizing in later calls to the queue setup function.
601 	 */
602 	tz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx,
603 				   sizeof(struct nfp_net_rx_desc) *
604 				   max_rx_desc, NFP_MEMZONE_ALIGN,
605 				   socket_id);
606 
607 	if (tz == NULL) {
608 		PMD_DRV_LOG(ERR, "Error allocating rx dma");
609 		nfp_net_rx_queue_release(dev, queue_idx);
610 		dev->data->rx_queues[queue_idx] = NULL;
611 		return -ENOMEM;
612 	}
613 
614 	/* Saving physical and virtual addresses for the RX ring */
615 	rxq->dma = (uint64_t)tz->iova;
616 	rxq->rxds = (struct nfp_net_rx_desc *)tz->addr;
617 
618 	/* mbuf pointers array for referencing mbufs linked to RX descriptors */
619 	rxq->rxbufs = rte_zmalloc_socket("rxq->rxbufs",
620 					 sizeof(*rxq->rxbufs) * nb_desc,
621 					 RTE_CACHE_LINE_SIZE, socket_id);
622 	if (rxq->rxbufs == NULL) {
623 		nfp_net_rx_queue_release(dev, queue_idx);
624 		dev->data->rx_queues[queue_idx] = NULL;
625 		return -ENOMEM;
626 	}
627 
628 	PMD_RX_LOG(DEBUG, "rxbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
629 		   rxq->rxbufs, rxq->rxds, (unsigned long)rxq->dma);
630 
631 	nfp_net_reset_rx_queue(rxq);
632 
633 	rxq->hw = hw;
634 
635 	/*
636 	 * Telling the HW about the physical address of the RX ring and number
637 	 * of descriptors in log2 format
638 	 */
639 	nn_cfg_writeq(hw, NFP_NET_CFG_RXR_ADDR(queue_idx), rxq->dma);
640 	nn_cfg_writeb(hw, NFP_NET_CFG_RXR_SZ(queue_idx), rte_log2_u32(nb_desc));
641 
642 	return 0;
643 }
644 
645 /*
646  * nfp_net_tx_free_bufs - Check for descriptors with a complete
647  * status
648  * @txq: TX queue to work with
649  * Returns number of descriptors freed
650  */
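/*
 * Worked example (illustrative numbers only): with tx_count = 256,
 * txq->rd_p = 250 and a hardware read pointer qcp_rd_p = 10, the ring has
 * wrapped, so todo = 10 + 256 - 250 = 16 descriptors can be freed.
 */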
651 int
652 nfp_net_tx_free_bufs(struct nfp_net_txq *txq)
653 {
654 	uint32_t qcp_rd_p;
655 	int todo;
656 
657 	PMD_TX_LOG(DEBUG, "queue %u. Check for descriptor with a complete"
658 		   " status", txq->qidx);
659 
660 	/* Work out how many packets have been sent */
661 	qcp_rd_p = nfp_qcp_read(txq->qcp_q, NFP_QCP_READ_PTR);
662 
663 	if (qcp_rd_p == txq->rd_p) {
664 		PMD_TX_LOG(DEBUG, "queue %u: It seems the hardware is not sending "
665 			   "packets (%u, %u)", txq->qidx,
666 			   qcp_rd_p, txq->rd_p);
667 		return 0;
668 	}
669 
670 	if (qcp_rd_p > txq->rd_p)
671 		todo = qcp_rd_p - txq->rd_p;
672 	else
673 		todo = qcp_rd_p + txq->tx_count - txq->rd_p;
674 
675 	PMD_TX_LOG(DEBUG, "qcp_rd_p %u, txq->rd_p: %u",
676 		   qcp_rd_p, txq->rd_p);
677 
678 	if (todo == 0)
679 		return todo;
680 
681 	txq->rd_p += todo;
682 	if (unlikely(txq->rd_p >= txq->tx_count))
683 		txq->rd_p -= txq->tx_count;
684 
685 	return todo;
686 }
687 
688 static void
689 nfp_net_tx_queue_release_mbufs(struct nfp_net_txq *txq)
690 {
691 	unsigned int i;
692 
693 	if (txq->txbufs == NULL)
694 		return;
695 
696 	for (i = 0; i < txq->tx_count; i++) {
697 		if (txq->txbufs[i].mbuf) {
698 			rte_pktmbuf_free_seg(txq->txbufs[i].mbuf);
699 			txq->txbufs[i].mbuf = NULL;
700 		}
701 	}
702 }
703 
704 void
705 nfp_net_tx_queue_release(struct rte_eth_dev *dev, uint16_t queue_idx)
706 {
707 	struct nfp_net_txq *txq = dev->data->tx_queues[queue_idx];
708 
709 	if (txq) {
710 		nfp_net_tx_queue_release_mbufs(txq);
711 		rte_eth_dma_zone_free(dev, "tx_ring", queue_idx);
712 		rte_free(txq->txbufs);
713 		rte_free(txq);
714 	}
715 }
716 
717 void
718 nfp_net_reset_tx_queue(struct nfp_net_txq *txq)
719 {
720 	nfp_net_tx_queue_release_mbufs(txq);
721 	txq->wr_p = 0;
722 	txq->rd_p = 0;
723 }
724 
725 static int
726 nfp_net_nfd3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
727 		       uint16_t nb_desc, unsigned int socket_id,
728 		       const struct rte_eth_txconf *tx_conf)
729 {
730 	int ret;
731 	uint16_t min_tx_desc;
732 	uint16_t max_tx_desc;
733 	const struct rte_memzone *tz;
734 	struct nfp_net_txq *txq;
735 	uint16_t tx_free_thresh;
736 	struct nfp_net_hw *hw;
737 	uint32_t tx_desc_sz;
738 
739 	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
740 
741 	PMD_INIT_FUNC_TRACE();
742 
743 	ret = nfp_net_tx_desc_limits(hw, &min_tx_desc, &max_tx_desc);
744 	if (ret != 0)
745 		return ret;
746 
747 	/* Validating number of descriptors */
748 	tx_desc_sz = nb_desc * sizeof(struct nfp_net_nfd3_tx_desc);
749 	if ((NFD3_TX_DESC_PER_SIMPLE_PKT * tx_desc_sz) % NFP_ALIGN_RING_DESC != 0 ||
750 	     nb_desc > max_tx_desc || nb_desc < min_tx_desc) {
751 		PMD_DRV_LOG(ERR, "Wrong nb_desc value");
752 		return -EINVAL;
753 	}
754 
755 	tx_free_thresh = (uint16_t)((tx_conf->tx_free_thresh) ?
756 				    tx_conf->tx_free_thresh :
757 				    DEFAULT_TX_FREE_THRESH);
758 
759 	if (tx_free_thresh > (nb_desc)) {
760 		PMD_DRV_LOG(ERR,
761 			"tx_free_thresh must be less than the number of TX "
762 			"descriptors. (tx_free_thresh=%u port=%d "
763 			"queue=%d)", (unsigned int)tx_free_thresh,
764 			dev->data->port_id, (int)queue_idx);
765 		return -(EINVAL);
766 	}
767 
768 	/*
769 	 * Free memory prior to re-allocation if needed. This is the case after
770 	 * calling nfp_net_stop
771 	 */
772 	if (dev->data->tx_queues[queue_idx]) {
773 		PMD_TX_LOG(DEBUG, "Freeing memory prior to re-allocation %d",
774 			   queue_idx);
775 		nfp_net_tx_queue_release(dev, queue_idx);
776 		dev->data->tx_queues[queue_idx] = NULL;
777 	}
778 
779 	/* Allocating tx queue data structure */
780 	txq = rte_zmalloc_socket("ethdev TX queue", sizeof(struct nfp_net_txq),
781 				 RTE_CACHE_LINE_SIZE, socket_id);
782 	if (txq == NULL) {
783 		PMD_DRV_LOG(ERR, "Error allocating tx dma");
784 		return -ENOMEM;
785 	}
786 
787 	dev->data->tx_queues[queue_idx] = txq;
788 
789 	/*
790 	 * Allocate TX ring hardware descriptors. A memzone large enough to
791 	 * handle the maximum ring size is allocated in order to allow for
792 	 * resizing in later calls to the queue setup function.
793 	 */
794 	tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx,
795 				   sizeof(struct nfp_net_nfd3_tx_desc) *
796 				   NFD3_TX_DESC_PER_SIMPLE_PKT *
797 				   max_tx_desc, NFP_MEMZONE_ALIGN,
798 				   socket_id);
799 	if (tz == NULL) {
800 		PMD_DRV_LOG(ERR, "Error allocating tx dma");
801 		nfp_net_tx_queue_release(dev, queue_idx);
802 		dev->data->tx_queues[queue_idx] = NULL;
803 		return -ENOMEM;
804 	}
805 
806 	txq->tx_count = nb_desc * NFD3_TX_DESC_PER_SIMPLE_PKT;
807 	txq->tx_free_thresh = tx_free_thresh;
808 	txq->tx_pthresh = tx_conf->tx_thresh.pthresh;
809 	txq->tx_hthresh = tx_conf->tx_thresh.hthresh;
810 	txq->tx_wthresh = tx_conf->tx_thresh.wthresh;
811 
812 	/* queue mapping based on firmware configuration */
813 	txq->qidx = queue_idx;
814 	txq->tx_qcidx = queue_idx * hw->stride_tx;
815 	txq->qcp_q = hw->tx_bar + NFP_QCP_QUEUE_OFF(txq->tx_qcidx);
816 
817 	txq->port_id = dev->data->port_id;
818 
819 	/* Saving physical and virtual addresses for the TX ring */
820 	txq->dma = (uint64_t)tz->iova;
821 	txq->txds = (struct nfp_net_nfd3_tx_desc *)tz->addr;
822 
823 	/* mbuf pointers array for referencing mbufs linked to TX descriptors */
824 	txq->txbufs = rte_zmalloc_socket("txq->txbufs",
825 					 sizeof(*txq->txbufs) * txq->tx_count,
826 					 RTE_CACHE_LINE_SIZE, socket_id);
827 	if (txq->txbufs == NULL) {
828 		nfp_net_tx_queue_release(dev, queue_idx);
829 		dev->data->tx_queues[queue_idx] = NULL;
830 		return -ENOMEM;
831 	}
832 	PMD_TX_LOG(DEBUG, "txbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
833 		   txq->txbufs, txq->txds, (unsigned long)txq->dma);
834 
835 	nfp_net_reset_tx_queue(txq);
836 
837 	txq->hw = hw;
838 
839 	/*
840 	 * Telling the HW about the physical address of the TX ring and number
841 	 * of descriptors in log2 format
842 	 */
843 	nn_cfg_writeq(hw, NFP_NET_CFG_TXR_ADDR(queue_idx), txq->dma);
844 	nn_cfg_writeb(hw, NFP_NET_CFG_TXR_SZ(queue_idx), rte_log2_u32(txq->tx_count));
845 
846 	return 0;
847 }
848 
849 /*
850  * nfp_net_nfd3_tx_vlan() - Set vlan info in the nfd3 tx desc
851  *
852  * If NFP_NET_CFG_CTRL_TXVLAN_V2 is enabled
853  *	the VLAN info is stored in the metadata and
854  *	is handled in nfp_net_set_meta_vlan()
855  * else if NFP_NET_CFG_CTRL_TXVLAN is enabled
856  *	the VLAN info is stored in the tx_desc and
857  *	is handled in nfp_net_nfd3_tx_vlan()
858  */
859 static void
860 nfp_net_nfd3_tx_vlan(struct nfp_net_txq *txq,
861 		struct nfp_net_nfd3_tx_desc *txd,
862 		struct rte_mbuf *mb)
863 {
864 	struct nfp_net_hw *hw = txq->hw;
865 
866 	if ((hw->cap & NFP_NET_CFG_CTRL_TXVLAN_V2) != 0 ||
867 		(hw->cap & NFP_NET_CFG_CTRL_TXVLAN) == 0)
868 		return;
869 
870 	if ((mb->ol_flags & RTE_MBUF_F_TX_VLAN) != 0) {
871 		txd->flags |= PCIE_DESC_TX_VLAN;
872 		txd->vlan = mb->vlan_tci;
873 	}
874 }
875 
876 static void
877 nfp_net_set_meta_vlan(struct nfp_net_meta_raw *meta_data,
878 		struct rte_mbuf *pkt,
879 		uint8_t layer)
880 {
881 	uint16_t vlan_tci;
882 	uint16_t tpid;
883 
884 	tpid = RTE_ETHER_TYPE_VLAN;
885 	vlan_tci = pkt->vlan_tci;
886 
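	/*
	 * Worked example (illustrative): vlan_tci 0x0064 (VLAN ID 100) with
	 * TPID 0x8100 packs into the 32-bit big-endian word 0x81000064.
	 */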
887 	meta_data->data[layer] = rte_cpu_to_be_32(tpid << 16 | vlan_tci);
888 }
889 
890 static void
891 nfp_net_nfd3_set_meta_data(struct nfp_net_meta_raw *meta_data,
892 		struct nfp_net_txq *txq,
893 		struct rte_mbuf *pkt)
894 {
895 	uint8_t vlan_layer = 0;
896 	struct nfp_net_hw *hw;
897 	uint32_t meta_info;
898 	uint8_t layer = 0;
899 	char *meta;
900 
901 	hw = txq->hw;
902 
903 	if ((pkt->ol_flags & RTE_MBUF_F_TX_VLAN) != 0 &&
904 			(hw->ctrl & NFP_NET_CFG_CTRL_TXVLAN_V2) != 0) {
905 		if (meta_data->length == 0)
906 			meta_data->length = NFP_NET_META_HEADER_SIZE;
907 		meta_data->length += NFP_NET_META_FIELD_SIZE;
908 		meta_data->header |= NFP_NET_META_VLAN;
909 	}
910 
911 	if (meta_data->length == 0)
912 		return;
913 
914 	meta_info = meta_data->header;
915 	meta_data->header = rte_cpu_to_be_32(meta_data->header);
916 	meta = rte_pktmbuf_prepend(pkt, meta_data->length);
917 	memcpy(meta, &meta_data->header, sizeof(meta_data->header));
918 	meta += NFP_NET_META_HEADER_SIZE;
919 
920 	for (; meta_info != 0; meta_info >>= NFP_NET_META_FIELD_SIZE, layer++,
921 			meta += NFP_NET_META_FIELD_SIZE) {
922 		switch (meta_info & NFP_NET_META_FIELD_MASK) {
923 		case NFP_NET_META_VLAN:
924 			if (vlan_layer > 0) {
925 				PMD_DRV_LOG(ERR, "At most 1 layer of VLAN is supported");
926 				return;
927 			}
928 			nfp_net_set_meta_vlan(meta_data, pkt, layer);
929 			vlan_layer++;
930 			break;
931 		default:
932 			PMD_DRV_LOG(ERR, "The metadata type is not supported");
933 			return;
934 		}
935 
936 		memcpy(meta, &meta_data->data[layer], sizeof(meta_data->data[layer]));
937 	}
938 }
939 
940 uint16_t
941 nfp_net_nfd3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
942 {
943 	struct nfp_net_txq *txq;
944 	struct nfp_net_hw *hw;
945 	struct nfp_net_nfd3_tx_desc *txds, txd;
946 	struct nfp_net_meta_raw meta_data;
947 	struct rte_mbuf *pkt;
948 	uint64_t dma_addr;
949 	int pkt_size, dma_size;
950 	uint16_t free_descs, issued_descs;
951 	struct rte_mbuf **lmbuf;
952 	int i;
953 
954 	txq = tx_queue;
955 	hw = txq->hw;
956 	txds = &txq->txds[txq->wr_p];
957 
958 	PMD_TX_LOG(DEBUG, "working for queue %u at pos %d and %u packets",
959 		   txq->qidx, txq->wr_p, nb_pkts);
960 
961 	if (nfp_net_nfd3_free_tx_desc(txq) < NFD3_TX_DESC_PER_SIMPLE_PKT * nb_pkts ||
962 	    nfp_net_nfd3_txq_full(txq))
963 		nfp_net_tx_free_bufs(txq);
964 
965 	free_descs = (uint16_t)nfp_net_nfd3_free_tx_desc(txq);
966 	if (unlikely(free_descs == 0))
967 		return 0;
968 
969 	pkt = *tx_pkts;
970 
971 	issued_descs = 0;
972 	PMD_TX_LOG(DEBUG, "queue: %u. Sending %u packets",
973 		   txq->qidx, nb_pkts);
974 	/* Sending packets */
975 	for (i = 0; i < nb_pkts && free_descs > 0; i++) {
976 		memset(&meta_data, 0, sizeof(meta_data));
977 		/* Grabbing the mbuf linked to the current descriptor */
978 		lmbuf = &txq->txbufs[txq->wr_p].mbuf;
979 		/* Warming the cache for releasing the mbuf later on */
980 		RTE_MBUF_PREFETCH_TO_FREE(*lmbuf);
981 
982 		pkt = *(tx_pkts + i);
983 
984 		nfp_net_nfd3_set_meta_data(&meta_data, txq, pkt);
985 
986 		if (unlikely(pkt->nb_segs > 1 &&
987 			     !(hw->cap & NFP_NET_CFG_CTRL_GATHER))) {
988 			PMD_INIT_LOG(ERR, "Multisegment packet not supported");
989 			goto xmit_end;
990 		}
991 
992 		/* Checking if we have enough descriptors */
993 		if (unlikely(pkt->nb_segs > free_descs))
994 			goto xmit_end;
995 
996 		/*
997 		 * Checksum and VLAN flags go only in the first descriptor of a
998 		 * multisegment packet, but TSO info needs to be in all of them.
999 		 */
1000 		txd.data_len = pkt->pkt_len;
1001 		nfp_net_nfd3_tx_tso(txq, &txd, pkt);
1002 		nfp_net_nfd3_tx_cksum(txq, &txd, pkt);
1003 		nfp_net_nfd3_tx_vlan(txq, &txd, pkt);
1004 
1005 		/*
1006 		 * mbuf data_len is the length of one segment and pkt_len is the
1007 		 * length of the whole packet. When the packet has just one
1008 		 * segment, data_len equals pkt_len.
1009 		 */
1010 		pkt_size = pkt->pkt_len;
1011 
1012 		while (pkt != NULL && free_descs > 0) {
1013 			/* Copying TSO, VLAN and cksum info */
1014 			*txds = txd;
1015 			/* Releasing mbuf used by this descriptor previously */
1016 			/* Releasing mbuf used by this descriptor previously*/
1017 			if (*lmbuf)
1018 				rte_pktmbuf_free_seg(*lmbuf);
1019 
1020 			/*
1021 			 * Linking mbuf with descriptor for being released
1022 			 * next time descriptor is used
1023 			 */
1024 			*lmbuf = pkt;
1025 
1026 			dma_size = pkt->data_len;
1027 			dma_addr = rte_mbuf_data_iova(pkt);
1028 			PMD_TX_LOG(DEBUG, "Working with mbuf at dma address:"
1029 				   "%" PRIx64 "", dma_addr);
1030 
1031 			/* Filling descriptors fields */
1032 			txds->dma_len = dma_size;
1033 			txds->data_len = txd.data_len;
1034 			txds->dma_addr_hi = (dma_addr >> 32) & 0xff;
1035 			txds->dma_addr_lo = (dma_addr & 0xffffffff);
1036 			free_descs--;
1037 
1038 			txq->wr_p++;
1039 			if (unlikely(txq->wr_p == txq->tx_count)) /* wrapping?*/
1040 				txq->wr_p = 0;
1041 
1042 			pkt_size -= dma_size;
1043 
1044 			/*
1045 			 * Set EOP on the last segment; packets with just one
1046 			 * segment take this path on the first iteration
1047 			 */
1048 			if (likely(pkt_size == 0))
1049 				txds->offset_eop = PCIE_DESC_TX_EOP;
1050 			else
1051 				txds->offset_eop = 0;
1052 
1053 			/* Set the meta_len */
1054 			txds->offset_eop |= meta_data.length;
1055 
1056 			pkt = pkt->next;
1057 			/* Referencing next free TX descriptor */
1058 			txds = &txq->txds[txq->wr_p];
1059 			lmbuf = &txq->txbufs[txq->wr_p].mbuf;
1060 			issued_descs++;
1061 		}
1062 	}
1063 
1064 xmit_end:
1065 	/* Increment write pointers. Force memory write before we let HW know */
1066 	rte_wmb();
1067 	nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, issued_descs);
1068 
1069 	return i;
1070 }
1071 
1072 static void
1073 nfp_net_nfdk_set_meta_data(struct rte_mbuf *pkt,
1074 		struct nfp_net_txq *txq,
1075 		uint64_t *metadata)
1076 {
1077 	char *meta;
1078 	uint8_t layer = 0;
1079 	uint32_t meta_type;
1080 	struct nfp_net_hw *hw;
1081 	uint32_t header_offset;
1082 	uint8_t vlan_layer = 0;
1083 	struct nfp_net_meta_raw meta_data;
1084 
1085 	memset(&meta_data, 0, sizeof(meta_data));
1086 	hw = txq->hw;
1087 
1088 	if ((pkt->ol_flags & RTE_MBUF_F_TX_VLAN) != 0 &&
1089 			(hw->ctrl & NFP_NET_CFG_CTRL_TXVLAN_V2) != 0) {
1090 		if (meta_data.length == 0)
1091 			meta_data.length = NFP_NET_META_HEADER_SIZE;
1092 		meta_data.length += NFP_NET_META_FIELD_SIZE;
1093 		meta_data.header |= NFP_NET_META_VLAN;
1094 	}
1095 
1096 	if (meta_data.length == 0)
1097 		return;
1098 
1099 	meta_type = meta_data.header;
1100 	header_offset = meta_type << NFP_NET_META_NFDK_LENGTH;
1101 	meta_data.header = header_offset | meta_data.length;
1102 	meta_data.header = rte_cpu_to_be_32(meta_data.header);
1103 	meta = rte_pktmbuf_prepend(pkt, meta_data.length);
1104 	memcpy(meta, &meta_data.header, sizeof(meta_data.header));
1105 	meta += NFP_NET_META_HEADER_SIZE;
1106 
1107 	for (; meta_type != 0; meta_type >>= NFP_NET_META_FIELD_SIZE, layer++,
1108 			meta += NFP_NET_META_FIELD_SIZE) {
1109 		switch (meta_type & NFP_NET_META_FIELD_MASK) {
1110 		case NFP_NET_META_VLAN:
1111 			if (vlan_layer > 0) {
1112 				PMD_DRV_LOG(ERR, "At most 1 layer of VLAN is supported");
1113 				return;
1114 			}
1115 			nfp_net_set_meta_vlan(&meta_data, pkt, layer);
1116 			vlan_layer++;
1117 			break;
1118 		default:
1119 			PMD_DRV_LOG(ERR, "The metadata type is not supported");
1120 			return;
1121 		}
1122 
1123 		memcpy(meta, &meta_data.data[layer], sizeof(meta_data.data[layer]));
1124 	}
1125 
1126 	*metadata = NFDK_DESC_TX_CHAIN_META;
1127 }
1128 
1129 static int
1130 nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
1131 		uint16_t queue_idx,
1132 		uint16_t nb_desc,
1133 		unsigned int socket_id,
1134 		const struct rte_eth_txconf *tx_conf)
1135 {
1136 	int ret;
1137 	uint16_t min_tx_desc;
1138 	uint16_t max_tx_desc;
1139 	const struct rte_memzone *tz;
1140 	struct nfp_net_txq *txq;
1141 	uint16_t tx_free_thresh;
1142 	struct nfp_net_hw *hw;
1143 	uint32_t tx_desc_sz;
1144 
1145 	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1146 
1147 	PMD_INIT_FUNC_TRACE();
1148 
1149 	ret = nfp_net_tx_desc_limits(hw, &min_tx_desc, &max_tx_desc);
1150 	if (ret != 0)
1151 		return ret;
1152 
1153 	/* Validating number of descriptors */
1154 	tx_desc_sz = nb_desc * sizeof(struct nfp_net_nfdk_tx_desc);
1155 	if ((NFDK_TX_DESC_PER_SIMPLE_PKT * tx_desc_sz) % NFP_ALIGN_RING_DESC != 0 ||
1156 	    (NFDK_TX_DESC_PER_SIMPLE_PKT * nb_desc) % NFDK_TX_DESC_BLOCK_CNT != 0 ||
1157 	     nb_desc > max_tx_desc || nb_desc < min_tx_desc) {
1158 		PMD_DRV_LOG(ERR, "Wrong nb_desc value");
1159 		return -EINVAL;
1160 	}
1161 
1162 	tx_free_thresh = (uint16_t)((tx_conf->tx_free_thresh) ?
1163 				tx_conf->tx_free_thresh :
1164 				DEFAULT_TX_FREE_THRESH);
1165 
1166 	if (tx_free_thresh > (nb_desc)) {
1167 		PMD_DRV_LOG(ERR,
1168 			"tx_free_thresh must be less than the number of TX "
1169 			"descriptors. (tx_free_thresh=%u port=%d "
1170 			"queue=%d)", (unsigned int)tx_free_thresh,
1171 			dev->data->port_id, (int)queue_idx);
1172 		return -(EINVAL);
1173 	}
1174 
1175 	/*
1176 	 * Free memory prior to re-allocation if needed. This is the case after
1177 	 * calling nfp_net_stop
1178 	 */
1179 	if (dev->data->tx_queues[queue_idx]) {
1180 		PMD_TX_LOG(DEBUG, "Freeing memory prior to re-allocation %d",
1181 				queue_idx);
1182 		nfp_net_tx_queue_release(dev, queue_idx);
1183 		dev->data->tx_queues[queue_idx] = NULL;
1184 	}
1185 
1186 	/* Allocating tx queue data structure */
1187 	txq = rte_zmalloc_socket("ethdev TX queue", sizeof(struct nfp_net_txq),
1188 			RTE_CACHE_LINE_SIZE, socket_id);
1189 	if (txq == NULL) {
1190 		PMD_DRV_LOG(ERR, "Error allocating tx dma");
1191 		return -ENOMEM;
1192 	}
1193 
1194 	/*
1195 	 * Allocate TX ring hardware descriptors. A memzone large enough to
1196 	 * handle the maximum ring size is allocated in order to allow for
1197 	 * resizing in later calls to the queue setup function.
1198 	 */
1199 	tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx,
1200 				sizeof(struct nfp_net_nfdk_tx_desc) *
1201 				NFDK_TX_DESC_PER_SIMPLE_PKT *
1202 				max_tx_desc, NFP_MEMZONE_ALIGN,
1203 				socket_id);
1204 	if (tz == NULL) {
1205 		PMD_DRV_LOG(ERR, "Error allocating tx dma");
1206 		nfp_net_tx_queue_release(dev, queue_idx);
1207 		return -ENOMEM;
1208 	}
1209 
1210 	txq->tx_count = nb_desc * NFDK_TX_DESC_PER_SIMPLE_PKT;
1211 	txq->tx_free_thresh = tx_free_thresh;
1212 	txq->tx_pthresh = tx_conf->tx_thresh.pthresh;
1213 	txq->tx_hthresh = tx_conf->tx_thresh.hthresh;
1214 	txq->tx_wthresh = tx_conf->tx_thresh.wthresh;
1215 
1216 	/* queue mapping based on firmware configuration */
1217 	txq->qidx = queue_idx;
1218 	txq->tx_qcidx = queue_idx * hw->stride_tx;
1219 	txq->qcp_q = hw->tx_bar + NFP_QCP_QUEUE_OFF(txq->tx_qcidx);
1220 
1221 	txq->port_id = dev->data->port_id;
1222 
1223 	/* Saving physical and virtual addresses for the TX ring */
1224 	txq->dma = (uint64_t)tz->iova;
1225 	txq->ktxds = (struct nfp_net_nfdk_tx_desc *)tz->addr;
1226 
1227 	/* mbuf pointers array for referencing mbufs linked to TX descriptors */
1228 	txq->txbufs = rte_zmalloc_socket("txq->txbufs",
1229 				sizeof(*txq->txbufs) * txq->tx_count,
1230 				RTE_CACHE_LINE_SIZE, socket_id);
1231 
1232 	if (txq->txbufs == NULL) {
1233 		nfp_net_tx_queue_release(dev, queue_idx);
1234 		return -ENOMEM;
1235 	}
1236 	PMD_TX_LOG(DEBUG, "txbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
1237 		txq->txbufs, txq->ktxds, (unsigned long)txq->dma);
1238 
1239 	nfp_net_reset_tx_queue(txq);
1240 
1241 	dev->data->tx_queues[queue_idx] = txq;
1242 	txq->hw = hw;
1243 	/*
1244 	 * Telling the HW about the physical address of the TX ring and number
1245 	 * of descriptors in log2 format
1246 	 */
1247 	nn_cfg_writeq(hw, NFP_NET_CFG_TXR_ADDR(queue_idx), txq->dma);
1248 	nn_cfg_writeb(hw, NFP_NET_CFG_TXR_SZ(queue_idx), rte_log2_u32(txq->tx_count));
1249 
1250 	return 0;
1251 }
1252 
1253 int
1254 nfp_net_tx_queue_setup(struct rte_eth_dev *dev,
1255 		uint16_t queue_idx,
1256 		uint16_t nb_desc,
1257 		unsigned int socket_id,
1258 		const struct rte_eth_txconf *tx_conf)
1259 {
1260 	struct nfp_net_hw *hw;
1261 
1262 	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1263 
1264 	switch (NFD_CFG_CLASS_VER_of(hw->ver)) {
1265 	case NFP_NET_CFG_VERSION_DP_NFD3:
1266 		return nfp_net_nfd3_tx_queue_setup(dev, queue_idx,
1267 				nb_desc, socket_id, tx_conf);
1268 	case NFP_NET_CFG_VERSION_DP_NFDK:
1269 		if (NFD_CFG_MAJOR_VERSION_of(hw->ver) < 5) {
1270 			PMD_DRV_LOG(ERR, "NFDK must use ABI 5 or newer, found: %d",
1271 				NFD_CFG_MAJOR_VERSION_of(hw->ver));
1272 			return -EINVAL;
1273 		}
1274 		return nfp_net_nfdk_tx_queue_setup(dev, queue_idx,
1275 				nb_desc, socket_id, tx_conf);
1276 	default:
1277 		PMD_DRV_LOG(ERR, "The firmware version is not supported.");
1278 		return -EINVAL;
1279 	}
1280 }
1281 
1282 static inline uint32_t
1283 nfp_net_nfdk_free_tx_desc(struct nfp_net_txq *txq)
1284 {
1285 	uint32_t free_desc;
1286 
1287 	if (txq->wr_p >= txq->rd_p)
1288 		free_desc = txq->tx_count - (txq->wr_p - txq->rd_p);
1289 	else
1290 		free_desc = txq->rd_p - txq->wr_p;
1291 
1292 	return (free_desc > NFDK_TX_DESC_STOP_CNT) ?
1293 		(free_desc - NFDK_TX_DESC_STOP_CNT) : 0;
1294 }
1295 
1296 static inline uint32_t
1297 nfp_net_nfdk_txq_full(struct nfp_net_txq *txq)
1298 {
1299 	return (nfp_net_nfdk_free_tx_desc(txq) < txq->tx_free_thresh);
1300 }
1301 
1302 static inline int
1303 nfp_net_nfdk_headlen_to_segs(unsigned int headlen)
1304 {
1305 	return DIV_ROUND_UP(headlen +
1306 			NFDK_TX_MAX_DATA_PER_DESC -
1307 			NFDK_TX_MAX_DATA_PER_HEAD,
1308 			NFDK_TX_MAX_DATA_PER_DESC);
1309 }
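/*
 * Worked example (illustrative, assuming the usual 4 KB per-head and 16 KB
 * per-descriptor data limits): a 9000 byte head segment needs
 * DIV_ROUND_UP(9000 + 16384 - 4096, 16384) = 2 descriptors, i.e. one head
 * descriptor plus one gather descriptor for the remainder.
 */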
1310 
1311 static int
1312 nfp_net_nfdk_tx_maybe_close_block(struct nfp_net_txq *txq, struct rte_mbuf *pkt)
1313 {
1314 	unsigned int n_descs, wr_p, i, nop_slots;
1315 	struct rte_mbuf *pkt_temp;
1316 
1317 	pkt_temp = pkt;
1318 	n_descs = nfp_net_nfdk_headlen_to_segs(pkt_temp->data_len);
1319 	while (pkt_temp->next) {
1320 		pkt_temp = pkt_temp->next;
1321 		n_descs += DIV_ROUND_UP(pkt_temp->data_len, NFDK_TX_MAX_DATA_PER_DESC);
1322 	}
1323 
1324 	if (unlikely(n_descs > NFDK_TX_DESC_GATHER_MAX))
1325 		return -EINVAL;
1326 
1327 	/* Under count by 1 (don't count meta) for the round down to work out */
1328 	n_descs += !!(pkt->ol_flags & RTE_MBUF_F_TX_TCP_SEG);
1329 
1330 	if (round_down(txq->wr_p, NFDK_TX_DESC_BLOCK_CNT) !=
1331 			round_down(txq->wr_p + n_descs, NFDK_TX_DESC_BLOCK_CNT))
1332 		goto close_block;
1333 
1334 	if ((uint32_t)txq->data_pending + pkt->pkt_len > NFDK_TX_MAX_DATA_PER_BLOCK)
1335 		goto close_block;
1336 
1337 	return 0;
1338 
1339 close_block:
1340 	wr_p = txq->wr_p;
1341 	nop_slots = D_BLOCK_CPL(wr_p);
1342 
1343 	memset(&txq->ktxds[wr_p], 0, nop_slots * sizeof(struct nfp_net_nfdk_tx_desc));
1344 	for (i = wr_p; i < nop_slots + wr_p; i++) {
1345 		if (txq->txbufs[i].mbuf) {
1346 			rte_pktmbuf_free_seg(txq->txbufs[i].mbuf);
1347 			txq->txbufs[i].mbuf = NULL;
1348 		}
1349 	}
1350 	txq->data_pending = 0;
1351 	txq->wr_p = D_IDX(txq, txq->wr_p + nop_slots);
1352 
1353 	return nop_slots;
1354 }
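/*
 * Worked example (illustrative, assuming a descriptor block size of 32
 * entries, i.e. NFDK_TX_DESC_BLOCK_CNT == 32): with txq->wr_p = 60, a packet
 * needing 8 descriptors would cross the 64-entry boundary, so D_BLOCK_CPL(60)
 * yields the 4 NOP slots needed to reach the boundary; those slots are
 * zeroed, their mbufs freed, and wr_p advances to 64 before the packet is
 * placed at the start of the next block.
 */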
1355 
1356 static inline uint64_t
1357 nfp_net_nfdk_tx_cksum(struct nfp_net_txq *txq, struct rte_mbuf *mb,
1358 		uint64_t flags)
1359 {
1360 	uint64_t ol_flags;
1361 	struct nfp_net_hw *hw = txq->hw;
1362 
1363 	if ((hw->cap & NFP_NET_CFG_CTRL_TXCSUM) == 0)
1364 		return flags;
1365 
1366 	ol_flags = mb->ol_flags;
1367 
1368 	/* IPv6 does not need checksum */
1369 	if (ol_flags & RTE_MBUF_F_TX_IP_CKSUM)
1370 		flags |= NFDK_DESC_TX_L3_CSUM;
1371 
1372 	if (ol_flags & RTE_MBUF_F_TX_L4_MASK)
1373 		flags |= NFDK_DESC_TX_L4_CSUM;
1374 
1375 	return flags;
1376 }
1377 
1378 static inline uint64_t
1379 nfp_net_nfdk_tx_tso(struct nfp_net_txq *txq, struct rte_mbuf *mb)
1380 {
1381 	uint64_t ol_flags;
1382 	struct nfp_net_nfdk_tx_desc txd;
1383 	struct nfp_net_hw *hw = txq->hw;
1384 
1385 	if ((hw->cap & NFP_NET_CFG_CTRL_LSO_ANY) == 0)
1386 		goto clean_txd;
1387 
1388 	ol_flags = mb->ol_flags;
1389 
1390 	if ((ol_flags & RTE_MBUF_F_TX_TCP_SEG) == 0)
1391 		goto clean_txd;
1392 
1393 	txd.l3_offset = mb->l2_len;
1394 	txd.l4_offset = mb->l2_len + mb->l3_len;
1395 	txd.lso_meta_res = 0;
1396 	txd.mss = rte_cpu_to_le_16(mb->tso_segsz);
1397 	txd.lso_hdrlen = mb->l2_len + mb->l3_len + mb->l4_len;
1398 	txd.lso_totsegs = (mb->pkt_len + mb->tso_segsz) / mb->tso_segsz;
1399 
1400 	return txd.raw;
1401 
1402 clean_txd:
1403 	txd.l3_offset = 0;
1404 	txd.l4_offset = 0;
1405 	txd.lso_hdrlen = 0;
1406 	txd.mss = 0;
1407 	txd.lso_totsegs = 0;
1408 	txd.lso_meta_res = 0;
1409 
1410 	return txd.raw;
1411 }
1412 
1413 uint16_t
1414 nfp_net_nfdk_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
1415 {
1416 	uint32_t buf_idx;
1417 	uint64_t dma_addr;
1418 	uint16_t free_descs;
1419 	uint32_t npkts = 0;
1420 	uint64_t metadata = 0;
1421 	uint16_t issued_descs = 0;
1422 	struct nfp_net_txq *txq;
1423 	struct nfp_net_hw *hw;
1424 	struct nfp_net_nfdk_tx_desc *ktxds;
1425 	struct rte_mbuf *pkt, *temp_pkt;
1426 	struct rte_mbuf **lmbuf;
1427 
1428 	txq = tx_queue;
1429 	hw = txq->hw;
1430 
1431 	PMD_TX_LOG(DEBUG, "working for queue %u at pos %d and %u packets",
1432 		txq->qidx, txq->wr_p, nb_pkts);
1433 
1434 	if ((nfp_net_nfdk_free_tx_desc(txq) < NFDK_TX_DESC_PER_SIMPLE_PKT *
1435 			nb_pkts) || (nfp_net_nfdk_txq_full(txq)))
1436 		nfp_net_tx_free_bufs(txq);
1437 
1438 	free_descs = (uint16_t)nfp_net_nfdk_free_tx_desc(txq);
1439 	if (unlikely(free_descs == 0))
1440 		return 0;
1441 
1442 	PMD_TX_LOG(DEBUG, "queue: %u. Sending %u packets", txq->qidx, nb_pkts);
1443 	/* Sending packets */
1444 	while ((npkts < nb_pkts) && free_descs) {
1445 		uint32_t type, dma_len, dlen_type, tmp_dlen;
1446 		int nop_descs, used_descs;
1447 
1448 		pkt = *(tx_pkts + npkts);
1449 		nop_descs = nfp_net_nfdk_tx_maybe_close_block(txq, pkt);
1450 		if (nop_descs < 0)
1451 			goto xmit_end;
1452 
1453 		issued_descs += nop_descs;
1454 		ktxds = &txq->ktxds[txq->wr_p];
1455 		/* Grabbing the mbuf linked to the current descriptor */
1456 		buf_idx = txq->wr_p;
1457 		lmbuf = &txq->txbufs[buf_idx++].mbuf;
1458 		/* Warming the cache for releasing the mbuf later on */
1459 		RTE_MBUF_PREFETCH_TO_FREE(*lmbuf);
1460 
1461 		temp_pkt = pkt;
1462 		nfp_net_nfdk_set_meta_data(pkt, txq, &metadata);
1463 
1464 		if (unlikely(pkt->nb_segs > 1 &&
1465 				!(hw->cap & NFP_NET_CFG_CTRL_GATHER))) {
1466 			PMD_INIT_LOG(ERR, "Multisegment packet not supported");
1467 			goto xmit_end;
1468 		}
1469 
1470 		/*
1471 		 * Checksum and VLAN flags go only in the first descriptor of a
1472 		 * multisegment packet, but TSO info needs to be in all of them.
1473 		 */
1474 
1475 		dma_len = pkt->data_len;
1476 		if ((hw->cap & NFP_NET_CFG_CTRL_LSO_ANY) &&
1477 				(pkt->ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
1478 			type = NFDK_DESC_TX_TYPE_TSO;
1479 		} else if (pkt->next == NULL && dma_len < NFDK_TX_MAX_DATA_PER_HEAD) {
1480 			type = NFDK_DESC_TX_TYPE_SIMPLE;
1481 		} else {
1482 			type = NFDK_DESC_TX_TYPE_GATHER;
1483 		}
1484 
1485 		/* Lengths below are stored as (length - 1), implicitly truncating to the chunk size */
1486 		dma_len -= 1;
1487 
1488 		/*
1489 		 * We will do our best to pass as much data as we can in the
1490 		 * descriptor, and we need to make sure the first descriptor
1491 		 * includes the whole packet head, due to a limitation on the
1492 		 * firmware side. Sometimes the value of
1493 		 * 'dma_len & NFDK_DESC_TX_DMA_LEN_HEAD' will be less than the packet head length.
1494 		 */
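		/*
		 * For example, if the head segment is longer than the
		 * NFDK_DESC_TX_DMA_LEN_HEAD limit, only the first
		 * (NFDK_DESC_TX_DMA_LEN_HEAD + 1) bytes go into this head
		 * descriptor and the remainder is emitted by the gather loop
		 * further below.
		 */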
1495 		dlen_type = (dma_len > NFDK_DESC_TX_DMA_LEN_HEAD ?
1496 				NFDK_DESC_TX_DMA_LEN_HEAD : dma_len) |
1497 			(NFDK_DESC_TX_TYPE_HEAD & (type << 12));
1498 		ktxds->dma_len_type = rte_cpu_to_le_16(dlen_type);
1499 		dma_addr = rte_mbuf_data_iova(pkt);
1500 		PMD_TX_LOG(DEBUG, "Working with mbuf at dma address:"
1501 				"%" PRIx64 "", dma_addr);
1502 		ktxds->dma_addr_hi = rte_cpu_to_le_16(dma_addr >> 32);
1503 		ktxds->dma_addr_lo = rte_cpu_to_le_32(dma_addr & 0xffffffff);
1504 		ktxds++;
1505 
1506 		/*
1507 		 * Preserve the original dlen_type so that the EOP logic
1508 		 * below can still use it.
1509 		 */
1510 		tmp_dlen = dlen_type & NFDK_DESC_TX_DMA_LEN_HEAD;
1511 		dma_len -= tmp_dlen;
1512 		dma_addr += tmp_dlen + 1;
1513 
1514 		/*
1515 		 * The rest of the data (if any) will be in larger DMA descriptors
1516 		 * and is handled with the dma_len loop.
1517 		 */
1518 		while (pkt) {
1519 			if (*lmbuf)
1520 				rte_pktmbuf_free_seg(*lmbuf);
1521 			*lmbuf = pkt;
1522 			while (dma_len > 0) {
1523 				dma_len -= 1;
1524 				dlen_type = NFDK_DESC_TX_DMA_LEN & dma_len;
1525 
1526 				ktxds->dma_len_type = rte_cpu_to_le_16(dlen_type);
1527 				ktxds->dma_addr_hi = rte_cpu_to_le_16(dma_addr >> 32);
1528 				ktxds->dma_addr_lo = rte_cpu_to_le_32(dma_addr & 0xffffffff);
1529 				ktxds++;
1530 
1531 				dma_len -= dlen_type;
1532 				dma_addr += dlen_type + 1;
1533 			}
1534 
1535 			if (pkt->next == NULL)
1536 				break;
1537 
1538 			pkt = pkt->next;
1539 			dma_len = pkt->data_len;
1540 			dma_addr = rte_mbuf_data_iova(pkt);
1541 			PMD_TX_LOG(DEBUG, "Working with mbuf at dma address:"
1542 				"%" PRIx64 "", dma_addr);
1543 
1544 			lmbuf = &txq->txbufs[buf_idx++].mbuf;
1545 		}
1546 
1547 		(ktxds - 1)->dma_len_type = rte_cpu_to_le_16(dlen_type | NFDK_DESC_TX_EOP);
1548 
1549 		ktxds->raw = rte_cpu_to_le_64(nfp_net_nfdk_tx_cksum(txq, temp_pkt, metadata));
1550 		ktxds++;
1551 
1552 		if ((hw->cap & NFP_NET_CFG_CTRL_LSO_ANY) &&
1553 				(temp_pkt->ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
1554 			ktxds->raw = rte_cpu_to_le_64(nfp_net_nfdk_tx_tso(txq, temp_pkt));
1555 			ktxds++;
1556 		}
1557 
1558 		used_descs = ktxds - txq->ktxds - txq->wr_p;
1559 		if (round_down(txq->wr_p, NFDK_TX_DESC_BLOCK_CNT) !=
1560 			round_down(txq->wr_p + used_descs - 1, NFDK_TX_DESC_BLOCK_CNT)) {
1561 			PMD_INIT_LOG(INFO, "Used descs cross block boundary");
1562 			goto xmit_end;
1563 		}
1564 
1565 		txq->wr_p = D_IDX(txq, txq->wr_p + used_descs);
1566 		if (txq->wr_p % NFDK_TX_DESC_BLOCK_CNT)
1567 			txq->data_pending += temp_pkt->pkt_len;
1568 		else
1569 			txq->data_pending = 0;
1570 
1571 		issued_descs += used_descs;
1572 		npkts++;
1573 		free_descs = (uint16_t)nfp_net_nfdk_free_tx_desc(txq);
1574 	}
1575 
1576 xmit_end:
1577 	/* Increment write pointers. Force memory write before we let HW know */
1578 	rte_wmb();
1579 	nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, issued_descs);
1580 
1581 	return npkts;
1582 }
1583