xref: /dpdk/drivers/net/nfp/nfp_rxtx.c (revision 97b914f4e715565d53d38ac6e04815b9be5e58a9)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2014-2021 Netronome Systems, Inc.
3  * All rights reserved.
4  *
5  * Small portions derived from code Copyright(c) 2010-2015 Intel Corporation.
6  */
7 
8 /*
9  * vim:shiftwidth=8:noexpandtab
10  *
11  * @file dpdk/pmd/nfp_rxtx.c
12  *
13  * Netronome vNIC DPDK Poll-Mode Driver: Rx/Tx functions
14  */
15 
16 #include <ethdev_driver.h>
17 #include <ethdev_pci.h>
18 
19 #include "nfp_common.h"
20 #include "nfp_rxtx.h"
21 #include "nfp_logs.h"
22 #include "nfp_ctrl.h"
23 
24 /* Prototypes */
25 static int nfp_net_rx_fill_freelist(struct nfp_net_rxq *rxq);
26 static inline void nfp_net_mbuf_alloc_failed(struct nfp_net_rxq *rxq);
27 static inline void nfp_net_set_hash(struct nfp_net_rxq *rxq,
28 				    struct nfp_net_rx_desc *rxd,
29 				    struct rte_mbuf *mbuf);
30 static inline void nfp_net_rx_cksum(struct nfp_net_rxq *rxq,
31 				    struct nfp_net_rx_desc *rxd,
32 				    struct rte_mbuf *mb);
33 static void nfp_net_rx_queue_release_mbufs(struct nfp_net_rxq *rxq);
34 static int nfp_net_tx_free_bufs(struct nfp_net_txq *txq);
35 static void nfp_net_tx_queue_release_mbufs(struct nfp_net_txq *txq);
36 static inline uint32_t nfp_free_tx_desc(struct nfp_net_txq *txq);
37 static inline uint32_t nfp_net_txq_full(struct nfp_net_txq *txq);
38 static inline void nfp_net_tx_tso(struct nfp_net_txq *txq,
39 				  struct nfp_net_tx_desc *txd,
40 				  struct rte_mbuf *mb);
41 static inline void nfp_net_tx_cksum(struct nfp_net_txq *txq,
42 				    struct nfp_net_tx_desc *txd,
43 				    struct rte_mbuf *mb);
44 
45 static int
46 nfp_net_rx_fill_freelist(struct nfp_net_rxq *rxq)
47 {
48 	struct nfp_net_rx_buff *rxe = rxq->rxbufs;
49 	uint64_t dma_addr;
50 	unsigned int i;
51 
52 	PMD_RX_LOG(DEBUG, "Fill Rx Freelist for %u descriptors",
53 		   rxq->rx_count);
54 
55 	for (i = 0; i < rxq->rx_count; i++) {
56 		struct nfp_net_rx_desc *rxd;
57 		struct rte_mbuf *mbuf = rte_pktmbuf_alloc(rxq->mem_pool);
58 
59 		if (mbuf == NULL) {
60 			PMD_DRV_LOG(ERR, "RX mbuf alloc failed queue_id=%u",
61 				(unsigned int)rxq->qidx);
62 			return -ENOMEM;
63 		}
64 
65 		dma_addr = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(mbuf));
66 
67 		rxd = &rxq->rxds[i];
68 		rxd->fld.dd = 0;
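		/* The DMA address is split into a 32-bit low part and an 8-bit high part */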
69 		rxd->fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
70 		rxd->fld.dma_addr_lo = dma_addr & 0xffffffff;
71 		rxe[i].mbuf = mbuf;
72 		PMD_RX_LOG(DEBUG, "[%d]: %" PRIx64, i, dma_addr);
73 	}
74 
75 	/* Make sure all writes are flushed before telling the hardware */
76 	rte_wmb();
77 
78 	/* Not advertising the whole ring as the firmware gets confused if so */
79 	PMD_RX_LOG(DEBUG, "Increment FL write pointer in %u",
80 		   rxq->rx_count - 1);
81 
82 	nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, rxq->rx_count - 1);
83 
84 	return 0;
85 }
86 
87 int
88 nfp_net_rx_freelist_setup(struct rte_eth_dev *dev)
89 {
90 	int i;
91 
92 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
93 		if (nfp_net_rx_fill_freelist(dev->data->rx_queues[i]) < 0)
94 			return -1;
95 	}
96 	return 0;
97 }
98 
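/*
 * nfp_net_rx_queue_count - Report how many descriptors in the RX queue
 * already have the DD (descriptor done) bit set, i.e. hold received
 * packets not yet retrieved by the application.
 */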
99 uint32_t
100 nfp_net_rx_queue_count(void *rx_queue)
101 {
102 	struct nfp_net_rxq *rxq;
103 	struct nfp_net_rx_desc *rxds;
104 	uint32_t idx;
105 	uint32_t count;
106 
107 	rxq = rx_queue;
108 
109 	idx = rxq->rd_p;
110 
111 	count = 0;
112 
113 	/*
114 	 * Other PMDs just check the DD bit every fourth descriptor and
115 	 * count all four when the first one has the DD bit set. That is
116 	 * not accurate but can be good for performance. Ideally this
117 	 * should be done on chunks of descriptors sharing the same cache
118 	 * line (see the illustrative sketch after this function).
119 	 */
120 
121 	while (count < rxq->rx_count) {
122 		rxds = &rxq->rxds[idx];
123 		if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
124 			break;
125 
126 		count++;
127 		idx++;
128 
129 		/* Wrapping? */
130 		if (idx == rxq->rx_count)
131 			idx = 0;
132 	}
133 
134 	return count;
135 }
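
/*
 * Illustrative only: a minimal sketch of the coarser approach mentioned in
 * the comment inside nfp_net_rx_queue_count(), checking the DD bit of the
 * first descriptor in each group of four and counting the whole group when
 * it is set. The helper name and the group size are assumptions; the driver
 * does not use this function.
 */
static inline uint32_t
nfp_net_rx_queue_count_coarse(struct nfp_net_rxq *rxq)
{
	uint32_t idx = rxq->rd_p;
	uint32_t count = 0;

	while (count < rxq->rx_count) {
		if ((rxq->rxds[idx].rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
			break;

		/* Assume the following three descriptors are done as well */
		count += 4;
		idx = (idx + 4) % rxq->rx_count;
	}

	return RTE_MIN(count, rxq->rx_count);
}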
136 
137 static inline void
138 nfp_net_mbuf_alloc_failed(struct nfp_net_rxq *rxq)
139 {
140 	rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
141 }
142 
143 /*
144  * nfp_net_set_hash - Set mbuf hash data
145  *
146  * The RSS hash and hash-type are prepended to the packet data.
147  * Extract and decode them and set the mbuf fields.
148  */
149 static inline void
150 nfp_net_set_hash(struct nfp_net_rxq *rxq, struct nfp_net_rx_desc *rxd,
151 		 struct rte_mbuf *mbuf)
152 {
153 	struct nfp_net_hw *hw = rxq->hw;
154 	uint8_t *meta_offset;
155 	uint32_t meta_info;
156 	uint32_t hash = 0;
157 	uint32_t hash_type = 0;
158 
159 	if (!(hw->ctrl & NFP_NET_CFG_CTRL_RSS))
160 		return;
161 
162 	/* This is true for new firmware versions */
163 	if (likely(((hw->cap & NFP_NET_CFG_CTRL_RSS2) ||
164 	    (NFD_CFG_MAJOR_VERSION_of(hw->ver) == 4)) &&
165 	     NFP_DESC_META_LEN(rxd))) {
166 		/*
167 		 * New metadata API:
168 		 * <----  32 bit  ----->
169 		 * m    field type word
170 		 * e     data field #2
171 		 * t     data field #1
172 		 * a     data field #0
173 		 * ====================
174 		 *    packet data
175 		 *
176 		 * The field type word contains up to eight 4-bit field types.
177 		 * Each 4-bit field type refers to a data field word, and a
178 		 * data field word may be shared by several field types.
179 		 */
180 		meta_offset = rte_pktmbuf_mtod(mbuf, uint8_t *);
181 		meta_offset -= NFP_DESC_META_LEN(rxd);
182 		meta_info = rte_be_to_cpu_32(*(uint32_t *)meta_offset);
183 		meta_offset += 4;
184 		/* The NFP PMD only supports metadata for hashing */
185 		switch (meta_info & NFP_NET_META_FIELD_MASK) {
186 		case NFP_NET_META_HASH:
187 			/* next field type is about the hash type */
188 			meta_info >>= NFP_NET_META_FIELD_SIZE;
189 			/* hash value is in the data field */
190 			hash = rte_be_to_cpu_32(*(uint32_t *)meta_offset);
191 			hash_type = meta_info & NFP_NET_META_FIELD_MASK;
192 			break;
193 		default:
194 			/* Unsupported metadata can be a performance issue */
195 			return;
196 		}
197 	} else {
198 		if (!(rxd->rxd.flags & PCIE_DESC_RX_RSS))
199 			return;
200 
201 		hash = rte_be_to_cpu_32(*(uint32_t *)NFP_HASH_OFFSET);
202 		hash_type = rte_be_to_cpu_32(*(uint32_t *)NFP_HASH_TYPE_OFFSET);
203 	}
204 
205 	mbuf->hash.rss = hash;
206 	mbuf->ol_flags |= RTE_MBUF_F_RX_RSS_HASH;
207 
208 	switch (hash_type) {
209 	case NFP_NET_RSS_IPV4:
210 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4;
211 		break;
212 	case NFP_NET_RSS_IPV6:
213 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6;
214 		break;
215 	case NFP_NET_RSS_IPV6_EX:
216 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
217 		break;
218 	case NFP_NET_RSS_IPV4_TCP:
219 		mbuf->packet_type |= RTE_PTYPE_INNER_L4_TCP;
220 		break;
221 	case NFP_NET_RSS_IPV6_TCP:
222 		mbuf->packet_type |= RTE_PTYPE_INNER_L4_TCP;
223 		break;
224 	case NFP_NET_RSS_IPV4_UDP:
225 		mbuf->packet_type |= RTE_PTYPE_INNER_L4_UDP;
226 		break;
227 	case NFP_NET_RSS_IPV6_UDP:
228 		mbuf->packet_type |= RTE_PTYPE_INNER_L4_UDP;
229 		break;
230 	default:
231 		mbuf->packet_type |= RTE_PTYPE_INNER_L4_MASK;
232 	}
233 }
234 
235 /* nfp_net_rx_cksum - set mbuf checksum flags based on RX descriptor flags */
236 static inline void
237 nfp_net_rx_cksum(struct nfp_net_rxq *rxq, struct nfp_net_rx_desc *rxd,
238 		 struct rte_mbuf *mb)
239 {
240 	struct nfp_net_hw *hw = rxq->hw;
241 
242 	if (!(hw->ctrl & NFP_NET_CFG_CTRL_RXCSUM))
243 		return;
244 
245 	/* If IPv4 and the IP checksum is bad, set the error flag */
246 	if (unlikely((rxd->rxd.flags & PCIE_DESC_RX_IP4_CSUM) &&
247 	    !(rxd->rxd.flags & PCIE_DESC_RX_IP4_CSUM_OK)))
248 		mb->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_BAD;
249 	else
250 		mb->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD;
251 
252 	/* If neither UDP nor TCP, return */
253 	if (!(rxd->rxd.flags & PCIE_DESC_RX_TCP_CSUM) &&
254 	    !(rxd->rxd.flags & PCIE_DESC_RX_UDP_CSUM))
255 		return;
256 
257 	if (likely(rxd->rxd.flags & PCIE_DESC_RX_L4_CSUM_OK))
258 		mb->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_GOOD;
259 	else
260 		mb->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_BAD;
261 }
262 
263 /*
264  * RX path design:
265  *
266  * There are two decisions to make:
267  * 1) How to check the DD bit of the RX descriptors
268  * 2) How and when to allocate new mbufs
269  *
270  * The current implementation checks a single DD bit per loop iteration. As
271  * each descriptor is 8 bytes, checking all the descriptors sharing a cache
272  * line instead is probably a good idea. Tests with this change have not
273  * shown any performance improvement, but it deserves further investigation.
274  * For example, depending on which descriptor comes next, fewer than 8
275  * descriptors may share the current cache line, which implies extra
276  * bookkeeping that could be counterproductive by itself. Indeed, recent
277  * firmware changes do exactly this: writing several descriptors with the
278  * DD bit set at once to save PCIe bandwidth and DMA operations on the NFP.
279  *
280  * An mbuf is allocated when a new packet is received. The descriptor is then
281  * linked to the new mbuf and the old one is handed to the application. The
282  * main drawback of this design is that per-packet mbuf allocation is heavier
283  * than the bulk allocation DPDK offers through rte_mempool_get_bulk. From a
284  * cache point of view, allocating the mbuf early, as done here, does not
285  * seem to bring any benefit; again, tests with this change have not shown
286  * any improvement. Also, rte_mempool_get_bulk returns all or nothing, so the
287  * implications of that allocation model need to be studied more deeply (see
288  * the illustrative sketch below).
289  */
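
/*
 * Illustrative only: a minimal sketch of the bulk-allocation alternative
 * discussed above. rte_pktmbuf_alloc_bulk() is all or nothing, so a refill
 * either obtains the whole burst or fails without touching the ring. The
 * helper name and the caller-chosen burst size are assumptions; the receive
 * path below keeps allocating one mbuf per received packet.
 */
static inline int
nfp_net_rx_refill_bulk_sketch(struct rte_mempool *pool,
			      struct rte_mbuf **mbufs, uint16_t n)
{
	/* Either all 'n' mbufs are returned or none at all */
	if (rte_pktmbuf_alloc_bulk(pool, mbufs, n) != 0)
		return -ENOMEM;

	return 0;
}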
290 
291 uint16_t
292 nfp_net_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
293 {
294 	struct nfp_net_rxq *rxq;
295 	struct nfp_net_rx_desc *rxds;
296 	struct nfp_net_rx_buff *rxb;
297 	struct nfp_net_hw *hw;
298 	struct rte_mbuf *mb;
299 	struct rte_mbuf *new_mb;
300 	uint16_t nb_hold;
301 	uint64_t dma_addr;
302 	int avail;
303 
304 	rxq = rx_queue;
305 	if (unlikely(rxq == NULL)) {
306 		/*
307 		 * DPDK only checks that the queue index is below the number
308 		 * of enabled queues, but the queue also needs to be configured
309 		 */
310 		RTE_LOG_DP(ERR, PMD, "RX Bad queue\n");
311 		return 0;
312 	}
313 
314 	hw = rxq->hw;
315 	avail = 0;
316 	nb_hold = 0;
317 
318 	while (avail < nb_pkts) {
319 		rxb = &rxq->rxbufs[rxq->rd_p];
320 		if (unlikely(rxb == NULL)) {
321 			RTE_LOG_DP(ERR, PMD, "rxb does not exist!\n");
322 			break;
323 		}
324 
325 		rxds = &rxq->rxds[rxq->rd_p];
326 		if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
327 			break;
328 
329 		/*
330 		 * Memory barrier to ensure that we won't do other
331 		 * reads before the DD bit.
332 		 */
333 		rte_rmb();
334 
335 		/*
336 		 * We got a packet. Let's alloc a new mbuf for refilling the
337 		 * free descriptor ring as soon as possible
338 		 */
339 		new_mb = rte_pktmbuf_alloc(rxq->mem_pool);
340 		if (unlikely(new_mb == NULL)) {
341 			RTE_LOG_DP(DEBUG, PMD,
342 			"RX mbuf alloc failed port_id=%u queue_id=%u\n",
343 				rxq->port_id, (unsigned int)rxq->qidx);
344 			nfp_net_mbuf_alloc_failed(rxq);
345 			break;
346 		}
347 
348 		nb_hold++;
349 
350 		/*
351 		 * Grab the mbuf and refill the descriptor with the
352 		 * previously allocated mbuf
353 		 */
354 		mb = rxb->mbuf;
355 		rxb->mbuf = new_mb;
356 
357 		PMD_RX_LOG(DEBUG, "Packet len: %u, mbuf_size: %u",
358 			   rxds->rxd.data_len, rxq->mbuf_size);
359 
360 		/* Size of this segment */
361 		mb->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
362 		/* Size of the whole packet. We just support 1 segment */
363 		mb->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
364 
365 		if (unlikely((mb->data_len + hw->rx_offset) >
366 			     rxq->mbuf_size)) {
367 			/*
368 			 * This should not happen and the user has the
369 			 * responsibility of avoiding it. But we have
370 			 * to give some info about the error
371 			 */
372 			RTE_LOG_DP(ERR, PMD,
373 				"mbuf overflow likely due to the RX offset.\n"
374 				"\t\tYour mbuf size should have extra space for"
375 				" RX offset=%u bytes.\n"
376 				"\t\tCurrently you just have %u bytes available"
377 				" but the received packet is %u bytes long",
378 				hw->rx_offset,
379 				rxq->mbuf_size - hw->rx_offset,
380 				mb->data_len);
381 			rte_pktmbuf_free(mb);
			break;
382 		}
383 
384 		/* Filling the received mbuf with packet info */
385 		if (hw->rx_offset)
386 			mb->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
387 		else
388 			mb->data_off = RTE_PKTMBUF_HEADROOM +
389 				       NFP_DESC_META_LEN(rxds);
390 
391 		/* No scatter mode supported */
392 		mb->nb_segs = 1;
393 		mb->next = NULL;
394 
395 		mb->port = rxq->port_id;
396 
397 		/* Checking the RSS flag */
398 		nfp_net_set_hash(rxq, rxds, mb);
399 
400 		/* Checking the checksum flag */
401 		nfp_net_rx_cksum(rxq, rxds, mb);
402 
403 		if ((rxds->rxd.flags & PCIE_DESC_RX_VLAN) &&
404 		    (hw->ctrl & NFP_NET_CFG_CTRL_RXVLAN)) {
405 			mb->vlan_tci = rte_cpu_to_le_32(rxds->rxd.vlan);
406 			mb->ol_flags |= RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED;
407 		}
408 
409 		/* Adding the mbuf to the mbuf array passed by the app */
410 		rx_pkts[avail++] = mb;
411 
412 		/* Now resetting and updating the descriptor */
413 		rxds->vals[0] = 0;
414 		rxds->vals[1] = 0;
415 		dma_addr = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(new_mb));
416 		rxds->fld.dd = 0;
417 		rxds->fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
418 		rxds->fld.dma_addr_lo = dma_addr & 0xffffffff;
419 
420 		rxq->rd_p++;
421 		if (unlikely(rxq->rd_p == rxq->rx_count)) /* wrapping? */
422 			rxq->rd_p = 0;
423 	}
424 
425 	if (nb_hold == 0)
426 		return nb_hold;
427 
428 	PMD_RX_LOG(DEBUG, "RX  port_id=%u queue_id=%u, %d packets received",
429 		   rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
430 
431 	nb_hold += rxq->nb_rx_hold;
432 
433 	/*
434 	 * FL descriptors need to be written before incrementing the
435 	 * FL queue WR pointer
436 	 */
437 	rte_wmb();
438 	if (nb_hold > rxq->rx_free_thresh) {
439 		PMD_RX_LOG(DEBUG, "port=%u queue=%u nb_hold=%u avail=%u",
440 			   rxq->port_id, (unsigned int)rxq->qidx,
441 			   (unsigned int)nb_hold, (unsigned int)avail);
442 		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
443 		nb_hold = 0;
444 	}
445 	rxq->nb_rx_hold = nb_hold;
446 
447 	return avail;
448 }
449 
450 static void
451 nfp_net_rx_queue_release_mbufs(struct nfp_net_rxq *rxq)
452 {
453 	unsigned int i;
454 
455 	if (rxq->rxbufs == NULL)
456 		return;
457 
458 	for (i = 0; i < rxq->rx_count; i++) {
459 		if (rxq->rxbufs[i].mbuf) {
460 			rte_pktmbuf_free_seg(rxq->rxbufs[i].mbuf);
461 			rxq->rxbufs[i].mbuf = NULL;
462 		}
463 	}
464 }
465 
466 void
467 nfp_net_rx_queue_release(struct rte_eth_dev *dev, uint16_t queue_idx)
468 {
469 	struct nfp_net_rxq *rxq = dev->data->rx_queues[queue_idx];
470 
471 	if (rxq) {
472 		nfp_net_rx_queue_release_mbufs(rxq);
473 		rte_eth_dma_zone_free(dev, "rx_ring", queue_idx);
474 		rte_free(rxq->rxbufs);
475 		rte_free(rxq);
476 	}
477 }
478 
479 void
480 nfp_net_reset_rx_queue(struct nfp_net_rxq *rxq)
481 {
482 	nfp_net_rx_queue_release_mbufs(rxq);
483 	rxq->rd_p = 0;
484 	rxq->nb_rx_hold = 0;
485 }
486 
487 int
488 nfp_net_rx_queue_setup(struct rte_eth_dev *dev,
489 		       uint16_t queue_idx, uint16_t nb_desc,
490 		       unsigned int socket_id,
491 		       const struct rte_eth_rxconf *rx_conf,
492 		       struct rte_mempool *mp)
493 {
494 	const struct rte_memzone *tz;
495 	struct nfp_net_rxq *rxq;
496 	struct nfp_net_hw *hw;
497 	uint32_t rx_desc_sz;
498 
499 	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
500 
501 	PMD_INIT_FUNC_TRACE();
502 
503 	/* Validating number of descriptors */
504 	rx_desc_sz = nb_desc * sizeof(struct nfp_net_rx_desc);
505 	if (rx_desc_sz % NFP_ALIGN_RING_DESC != 0 ||
506 	    nb_desc > NFP_NET_MAX_RX_DESC ||
507 	    nb_desc < NFP_NET_MIN_RX_DESC) {
508 		PMD_DRV_LOG(ERR, "Wrong nb_desc value");
509 		return -EINVAL;
510 	}
511 
512 	/*
513 	 * Free memory prior to re-allocation if needed. This is the case after
514 	 * calling nfp_net_stop
515 	 */
516 	if (dev->data->rx_queues[queue_idx]) {
517 		nfp_net_rx_queue_release(dev, queue_idx);
518 		dev->data->rx_queues[queue_idx] = NULL;
519 	}
520 
521 	/* Allocating rx queue data structure */
522 	rxq = rte_zmalloc_socket("ethdev RX queue", sizeof(struct nfp_net_rxq),
523 				 RTE_CACHE_LINE_SIZE, socket_id);
524 	if (rxq == NULL)
525 		return -ENOMEM;
526 
527 	dev->data->rx_queues[queue_idx] = rxq;
528 
529 	/* HW queue mapping based on firmware configuration */
530 	rxq->qidx = queue_idx;
531 	rxq->fl_qcidx = queue_idx * hw->stride_rx;
532 	rxq->rx_qcidx = rxq->fl_qcidx + (hw->stride_rx - 1);
533 	rxq->qcp_fl = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->fl_qcidx);
534 	rxq->qcp_rx = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->rx_qcidx);
535 
536 	/*
537 	 * Tracking mbuf size for detecting a potential mbuf overflow due to
538 	 * RX offset
539 	 */
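	/*
	 * Worked example (assuming a default mempool with no mbuf private
	 * area): elt_size is sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM +
	 * RTE_MBUF_DEFAULT_DATAROOM, so mbuf_size below works out to the
	 * default 2048-byte data room available for packet data.
	 */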
540 	rxq->mem_pool = mp;
541 	rxq->mbuf_size = rxq->mem_pool->elt_size;
542 	rxq->mbuf_size -= (sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM);
543 	hw->flbufsz = rxq->mbuf_size;
544 
545 	rxq->rx_count = nb_desc;
546 	rxq->port_id = dev->data->port_id;
547 	rxq->rx_free_thresh = rx_conf->rx_free_thresh;
548 	rxq->drop_en = rx_conf->rx_drop_en;
549 
550 	/*
551 	 * Allocate RX ring hardware descriptors. A memzone large enough to
552 	 * handle the maximum ring size is allocated in order to allow for
553 	 * resizing in later calls to the queue setup function.
554 	 */
555 	tz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx,
556 				   sizeof(struct nfp_net_rx_desc) *
557 				   NFP_NET_MAX_RX_DESC, NFP_MEMZONE_ALIGN,
558 				   socket_id);
559 
560 	if (tz == NULL) {
561 		PMD_DRV_LOG(ERR, "Error allocating rx dma");
562 		nfp_net_rx_queue_release(dev, queue_idx);
563 		dev->data->rx_queues[queue_idx] = NULL;
564 		return -ENOMEM;
565 	}
566 
567 	/* Saving physical and virtual addresses for the RX ring */
568 	rxq->dma = (uint64_t)tz->iova;
569 	rxq->rxds = (struct nfp_net_rx_desc *)tz->addr;
570 
571 	/* mbuf pointers array for referencing mbufs linked to RX descriptors */
572 	rxq->rxbufs = rte_zmalloc_socket("rxq->rxbufs",
573 					 sizeof(*rxq->rxbufs) * nb_desc,
574 					 RTE_CACHE_LINE_SIZE, socket_id);
575 	if (rxq->rxbufs == NULL) {
576 		nfp_net_rx_queue_release(dev, queue_idx);
577 		dev->data->rx_queues[queue_idx] = NULL;
578 		return -ENOMEM;
579 	}
580 
581 	PMD_RX_LOG(DEBUG, "rxbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
582 		   rxq->rxbufs, rxq->rxds, (unsigned long)rxq->dma);
583 
584 	nfp_net_reset_rx_queue(rxq);
585 
586 	rxq->hw = hw;
587 
588 	/*
589 	 * Telling the HW about the physical address of the RX ring and number
590 	 * of descriptors in log2 format
591 	 */
592 	nn_cfg_writeq(hw, NFP_NET_CFG_RXR_ADDR(queue_idx), rxq->dma);
593 	nn_cfg_writeb(hw, NFP_NET_CFG_RXR_SZ(queue_idx), rte_log2_u32(nb_desc));
594 
595 	return 0;
596 }
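
/*
 * Illustrative usage only: applications do not call this function directly,
 * it is reached through the ethdev API. The port id, descriptor count and
 * mempool parameters below are assumptions, not values required by the PMD:
 *
 *	struct rte_mempool *mp = rte_pktmbuf_pool_create("rx_pool", 8192, 256,
 *			0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
 *	ret = rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
 *				     NULL, mp);
 */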
597 
598 /*
599  * nfp_net_tx_free_bufs - Check for descriptors with a complete
600  * status
601  * @txq: TX queue to work with
602  * Returns number of descriptors freed
603  */
604 static int
605 nfp_net_tx_free_bufs(struct nfp_net_txq *txq)
606 {
607 	uint32_t qcp_rd_p;
608 	int todo;
609 
610 	PMD_TX_LOG(DEBUG, "queue %u. Check for descriptor with a complete"
611 		   " status", txq->qidx);
612 
613 	/* Work out how many packets have been sent */
614 	qcp_rd_p = nfp_qcp_read(txq->qcp_q, NFP_QCP_READ_PTR);
615 
616 	if (qcp_rd_p == txq->rd_p) {
617 		PMD_TX_LOG(DEBUG, "queue %u: It seems the hardware is not sending "
618 			   "packets (%u, %u)", txq->qidx,
619 			   qcp_rd_p, txq->rd_p);
620 		return 0;
621 	}
622 
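	/* Distance from rd_p to the hardware read pointer, handling ring wrap */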
623 	if (qcp_rd_p > txq->rd_p)
624 		todo = qcp_rd_p - txq->rd_p;
625 	else
626 		todo = qcp_rd_p + txq->tx_count - txq->rd_p;
627 
628 	PMD_TX_LOG(DEBUG, "qcp_rd_p %u, txq->rd_p: %u",
629 		   qcp_rd_p, txq->rd_p);
630 
631 	if (todo == 0)
632 		return todo;
633 
634 	txq->rd_p += todo;
635 	if (unlikely(txq->rd_p >= txq->tx_count))
636 		txq->rd_p -= txq->tx_count;
637 
638 	return todo;
639 }
640 
641 static void
642 nfp_net_tx_queue_release_mbufs(struct nfp_net_txq *txq)
643 {
644 	unsigned int i;
645 
646 	if (txq->txbufs == NULL)
647 		return;
648 
649 	for (i = 0; i < txq->tx_count; i++) {
650 		if (txq->txbufs[i].mbuf) {
651 			rte_pktmbuf_free_seg(txq->txbufs[i].mbuf);
652 			txq->txbufs[i].mbuf = NULL;
653 		}
654 	}
655 }
656 
657 void
658 nfp_net_tx_queue_release(struct rte_eth_dev *dev, uint16_t queue_idx)
659 {
660 	struct nfp_net_txq *txq = dev->data->tx_queues[queue_idx];
661 
662 	if (txq) {
663 		nfp_net_tx_queue_release_mbufs(txq);
664 		rte_eth_dma_zone_free(dev, "tx_ring", queue_idx);
665 		rte_free(txq->txbufs);
666 		rte_free(txq);
667 	}
668 }
669 
670 void
671 nfp_net_reset_tx_queue(struct nfp_net_txq *txq)
672 {
673 	nfp_net_tx_queue_release_mbufs(txq);
674 	txq->wr_p = 0;
675 	txq->rd_p = 0;
676 }
677 
678 int
679 nfp_net_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
680 		       uint16_t nb_desc, unsigned int socket_id,
681 		       const struct rte_eth_txconf *tx_conf)
682 {
683 	const struct rte_memzone *tz;
684 	struct nfp_net_txq *txq;
685 	uint16_t tx_free_thresh;
686 	struct nfp_net_hw *hw;
687 	uint32_t tx_desc_sz;
688 
689 	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
690 
691 	PMD_INIT_FUNC_TRACE();
692 
693 	/* Validating number of descriptors */
694 	tx_desc_sz = nb_desc * sizeof(struct nfp_net_tx_desc);
695 	if (tx_desc_sz % NFP_ALIGN_RING_DESC != 0 ||
696 	    nb_desc > NFP_NET_MAX_TX_DESC ||
697 	    nb_desc < NFP_NET_MIN_TX_DESC) {
698 		PMD_DRV_LOG(ERR, "Wrong nb_desc value");
699 		return -EINVAL;
700 	}
701 
702 	tx_free_thresh = (uint16_t)((tx_conf->tx_free_thresh) ?
703 				    tx_conf->tx_free_thresh :
704 				    DEFAULT_TX_FREE_THRESH);
705 
706 	if (tx_free_thresh > (nb_desc)) {
707 		PMD_DRV_LOG(ERR,
708 			"tx_free_thresh must be less than the number of TX "
709 			"descriptors. (tx_free_thresh=%u port=%d "
710 			"queue=%d)", (unsigned int)tx_free_thresh,
711 			dev->data->port_id, (int)queue_idx);
712 		return -(EINVAL);
713 	}
714 
715 	/*
716 	 * Free memory prior to re-allocation if needed. This is the case after
717 	 * calling nfp_net_stop
718 	 */
719 	if (dev->data->tx_queues[queue_idx]) {
720 		PMD_TX_LOG(DEBUG, "Freeing memory prior to re-allocation %d",
721 			   queue_idx);
722 		nfp_net_tx_queue_release(dev, queue_idx);
723 		dev->data->tx_queues[queue_idx] = NULL;
724 	}
725 
726 	/* Allocating tx queue data structure */
727 	txq = rte_zmalloc_socket("ethdev TX queue", sizeof(struct nfp_net_txq),
728 				 RTE_CACHE_LINE_SIZE, socket_id);
729 	if (txq == NULL) {
730 		PMD_DRV_LOG(ERR, "Error allocating tx dma");
731 		return -ENOMEM;
732 	}
733 
734 	dev->data->tx_queues[queue_idx] = txq;
735 
736 	/*
737 	 * Allocate TX ring hardware descriptors. A memzone large enough to
738 	 * handle the maximum ring size is allocated in order to allow for
739 	 * resizing in later calls to the queue setup function.
740 	 */
741 	tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx,
742 				   sizeof(struct nfp_net_tx_desc) *
743 				   NFP_NET_MAX_TX_DESC, NFP_MEMZONE_ALIGN,
744 				   socket_id);
745 	if (tz == NULL) {
746 		PMD_DRV_LOG(ERR, "Error allocating tx dma");
747 		nfp_net_tx_queue_release(dev, queue_idx);
748 		dev->data->tx_queues[queue_idx] = NULL;
749 		return -ENOMEM;
750 	}
751 
752 	txq->tx_count = nb_desc;
753 	txq->tx_free_thresh = tx_free_thresh;
754 	txq->tx_pthresh = tx_conf->tx_thresh.pthresh;
755 	txq->tx_hthresh = tx_conf->tx_thresh.hthresh;
756 	txq->tx_wthresh = tx_conf->tx_thresh.wthresh;
757 
758 	/* queue mapping based on firmware configuration */
759 	txq->qidx = queue_idx;
760 	txq->tx_qcidx = queue_idx * hw->stride_tx;
761 	txq->qcp_q = hw->tx_bar + NFP_QCP_QUEUE_OFF(txq->tx_qcidx);
762 
763 	txq->port_id = dev->data->port_id;
764 
765 	/* Saving physical and virtual addresses for the TX ring */
766 	txq->dma = (uint64_t)tz->iova;
767 	txq->txds = (struct nfp_net_tx_desc *)tz->addr;
768 
769 	/* mbuf pointers array for referencing mbufs linked to TX descriptors */
770 	txq->txbufs = rte_zmalloc_socket("txq->txbufs",
771 					 sizeof(*txq->txbufs) * nb_desc,
772 					 RTE_CACHE_LINE_SIZE, socket_id);
773 	if (txq->txbufs == NULL) {
774 		nfp_net_tx_queue_release(dev, queue_idx);
775 		dev->data->tx_queues[queue_idx] = NULL;
776 		return -ENOMEM;
777 	}
778 	PMD_TX_LOG(DEBUG, "txbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
779 		   txq->txbufs, txq->txds, (unsigned long)txq->dma);
780 
781 	nfp_net_reset_tx_queue(txq);
782 
783 	txq->hw = hw;
784 
785 	/*
786 	 * Telling the HW about the physical address of the TX ring and number
787 	 * of descriptors in log2 format
788 	 */
789 	nn_cfg_writeq(hw, NFP_NET_CFG_TXR_ADDR(queue_idx), txq->dma);
790 	nn_cfg_writeb(hw, NFP_NET_CFG_TXR_SZ(queue_idx), rte_log2_u32(nb_desc));
791 
792 	return 0;
793 }
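
/*
 * Illustrative usage only: TX queues are set up through the ethdev API. The
 * port id and descriptor count below are assumptions, not values required
 * by the PMD:
 *
 *	ret = rte_eth_tx_queue_setup(port_id, 0, 512, rte_socket_id(), NULL);
 */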
794 
795 /* Always leave some descriptors free to avoid wrap-around confusion */
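/*
 * Worked example (values assumed for illustration): with tx_count = 256,
 * wr_p = 100 and rd_p = 40, 60 descriptors are in use, so the function
 * returns 256 - 60 - 8 = 188; the 8 descriptors held back keep wr_p from
 * catching up with rd_p when the ring wraps.
 */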
796 static inline
797 uint32_t nfp_free_tx_desc(struct nfp_net_txq *txq)
798 {
799 	if (txq->wr_p >= txq->rd_p)
800 		return txq->tx_count - (txq->wr_p - txq->rd_p) - 8;
801 	else
802 		return txq->rd_p - txq->wr_p - 8;
803 }
804 
805 /*
806  * nfp_net_txq_full - Check whether the number of free TX descriptors
807  * is below tx_free_thresh
808  *
809  * @txq: TX queue to check
810  *
811  * This function uses the host copy of the read/write pointers
812  */
813 static inline
814 uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
815 {
816 	return (nfp_free_tx_desc(txq) < txq->tx_free_thresh);
817 }
818 
819 /* nfp_net_tx_tso - Set TX descriptor for TSO */
820 static inline void
821 nfp_net_tx_tso(struct nfp_net_txq *txq, struct nfp_net_tx_desc *txd,
822 	       struct rte_mbuf *mb)
823 {
824 	uint64_t ol_flags;
825 	struct nfp_net_hw *hw = txq->hw;
826 
827 	if (!(hw->cap & NFP_NET_CFG_CTRL_LSO_ANY))
828 		goto clean_txd;
829 
830 	ol_flags = mb->ol_flags;
831 
832 	if (!(ol_flags & RTE_MBUF_F_TX_TCP_SEG))
833 		goto clean_txd;
834 
835 	txd->l3_offset = mb->l2_len;
836 	txd->l4_offset = mb->l2_len + mb->l3_len;
837 	txd->lso_hdrlen = mb->l2_len + mb->l3_len + mb->l4_len;
838 	txd->mss = rte_cpu_to_le_16(mb->tso_segsz);
839 	txd->flags = PCIE_DESC_TX_LSO;
840 	return;
841 
842 clean_txd:
843 	txd->flags = 0;
844 	txd->l3_offset = 0;
845 	txd->l4_offset = 0;
846 	txd->lso_hdrlen = 0;
847 	txd->mss = 0;
848 }
849 
850 /* nfp_net_tx_cksum - Set TX CSUM offload flags in TX descriptor */
851 static inline void
852 nfp_net_tx_cksum(struct nfp_net_txq *txq, struct nfp_net_tx_desc *txd,
853 		 struct rte_mbuf *mb)
854 {
855 	uint64_t ol_flags;
856 	struct nfp_net_hw *hw = txq->hw;
857 
858 	if (!(hw->cap & NFP_NET_CFG_CTRL_TXCSUM))
859 		return;
860 
861 	ol_flags = mb->ol_flags;
862 
863 	/* Only IPv4 needs header checksum offload; IPv6 has no header checksum */
864 	if (ol_flags & RTE_MBUF_F_TX_IP_CKSUM)
865 		txd->flags |= PCIE_DESC_TX_IP4_CSUM;
866 
867 	switch (ol_flags & RTE_MBUF_F_TX_L4_MASK) {
868 	case RTE_MBUF_F_TX_UDP_CKSUM:
869 		txd->flags |= PCIE_DESC_TX_UDP_CSUM;
870 		break;
871 	case RTE_MBUF_F_TX_TCP_CKSUM:
872 		txd->flags |= PCIE_DESC_TX_TCP_CSUM;
873 		break;
874 	}
875 
876 	if (ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK))
877 		txd->flags |= PCIE_DESC_TX_CSUM;
878 }
879 
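/*
 * nfp_net_xmit_pkts - Transmit a burst of packets on a TX queue
 *
 * One TX descriptor is issued per mbuf segment; checksum, VLAN and TSO
 * information is taken from the first mbuf of each packet. Returns the
 * number of packets actually queued for transmission.
 */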
880 uint16_t
881 nfp_net_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
882 {
883 	struct nfp_net_txq *txq;
884 	struct nfp_net_hw *hw;
885 	struct nfp_net_tx_desc *txds, txd;
886 	struct rte_mbuf *pkt;
887 	uint64_t dma_addr;
888 	int pkt_size, dma_size;
889 	uint16_t free_descs, issued_descs;
890 	struct rte_mbuf **lmbuf;
891 	int i;
892 
893 	txq = tx_queue;
894 	hw = txq->hw;
895 	txds = &txq->txds[txq->wr_p];
896 
897 	PMD_TX_LOG(DEBUG, "working for queue %u at pos %d and %u packets",
898 		   txq->qidx, txq->wr_p, nb_pkts);
899 
900 	if ((nfp_free_tx_desc(txq) < nb_pkts) || (nfp_net_txq_full(txq)))
901 		nfp_net_tx_free_bufs(txq);
902 
903 	free_descs = (uint16_t)nfp_free_tx_desc(txq);
904 	if (unlikely(free_descs == 0))
905 		return 0;
906 
907 	pkt = *tx_pkts;
908 
909 	i = 0;
910 	issued_descs = 0;
911 	PMD_TX_LOG(DEBUG, "queue: %u. Sending %u packets",
912 		   txq->qidx, nb_pkts);
913 	/* Sending packets */
914 	while ((i < nb_pkts) && free_descs) {
915 		/* Grabbing the mbuf linked to the current descriptor */
916 		lmbuf = &txq->txbufs[txq->wr_p].mbuf;
917 		/* Warming the cache for releasing the mbuf later on */
918 		RTE_MBUF_PREFETCH_TO_FREE(*lmbuf);
919 
920 		pkt = *(tx_pkts + i);
921 
922 		if (unlikely(pkt->nb_segs > 1 &&
923 			     !(hw->cap & NFP_NET_CFG_CTRL_GATHER))) {
924 			PMD_INIT_LOG(INFO, "NFP_NET_CFG_CTRL_GATHER not set");
925 			rte_panic("Multisegment packet unsupported\n");
926 		}
927 
928 		/* Checking if we have enough descriptors */
929 		if (unlikely(pkt->nb_segs > free_descs))
930 			goto xmit_end;
931 
932 		/*
933 		 * Checksum and VLAN flags are needed only in the first descriptor
934 		 * of a multisegment packet, but TSO info needs to be in all of them.
935 		 */
936 		txd.data_len = pkt->pkt_len;
937 		nfp_net_tx_tso(txq, &txd, pkt);
938 		nfp_net_tx_cksum(txq, &txd, pkt);
939 
940 		if ((pkt->ol_flags & RTE_MBUF_F_TX_VLAN) &&
941 		    (hw->cap & NFP_NET_CFG_CTRL_TXVLAN)) {
942 			txd.flags |= PCIE_DESC_TX_VLAN;
943 			txd.vlan = pkt->vlan_tci;
944 		}
945 
946 		/*
947 		 * mbuf data_len is the data in one segment and pkt_len data
948 		 * in the whole packet. When the packet is just one segment,
949 		 * then data_len = pkt_len
950 		 */
951 		pkt_size = pkt->pkt_len;
952 
953 		while (pkt) {
954 			/* Copying TSO, VLAN and cksum info */
955 			*txds = txd;
956 
957 			/* Releasing mbuf used by this descriptor previously */
958 			if (*lmbuf)
959 				rte_pktmbuf_free_seg(*lmbuf);
960 
961 			/*
962 			 * Link the mbuf to the descriptor so it can be released
963 			 * the next time this descriptor is used
964 			 */
965 			*lmbuf = pkt;
966 
967 			dma_size = pkt->data_len;
968 			dma_addr = rte_mbuf_data_iova(pkt);
969 			PMD_TX_LOG(DEBUG, "Working with mbuf at dma address: "
970 				   "%" PRIx64, dma_addr);
971 
972 			/* Filling descriptor fields */
973 			txds->dma_len = dma_size;
974 			txds->data_len = txd.data_len;
975 			txds->dma_addr_hi = (dma_addr >> 32) & 0xff;
976 			txds->dma_addr_lo = (dma_addr & 0xffffffff);
977 			ASSERT(free_descs > 0);
978 			free_descs--;
979 
980 			txq->wr_p++;
981 			if (unlikely(txq->wr_p == txq->tx_count)) /* wrapping?*/
982 				txq->wr_p = 0;
983 
984 			pkt_size -= dma_size;
985 
986 			/*
987 			 * Set EOP on the last descriptor; single-segment
988 			 * packets are the common (priority) case
989 			 */
990 			if (likely(!pkt_size))
991 				txds->offset_eop = PCIE_DESC_TX_EOP;
992 			else
993 				txds->offset_eop = 0;
994 
995 			pkt = pkt->next;
996 			/* Referencing next free TX descriptor */
997 			txds = &txq->txds[txq->wr_p];
998 			lmbuf = &txq->txbufs[txq->wr_p].mbuf;
999 			issued_descs++;
1000 		}
1001 		i++;
1002 	}
1003 
1004 xmit_end:
1005 	/* Increment write pointers. Force memory write before we let HW know */
1006 	rte_wmb();
1007 	nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, issued_descs);
1008 
1009 	return i;
1010 }
1011