xref: /dpdk/drivers/net/nfp/nfp_rxtx.c (revision 081e42dab11d1add2d038fdf2bd4c86b20043d08)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2014-2021 Netronome Systems, Inc.
3  * All rights reserved.
4  *
5  * Small portions derived from code Copyright(c) 2010-2015 Intel Corporation.
6  */
7 
8 /*
9  * vim:shiftwidth=8:noexpandtab
10  *
11  * @file dpdk/pmd/nfp_rxtx.c
12  *
13  * Netronome vNIC DPDK Poll-Mode Driver: Rx/Tx functions
14  */
15 
16 #include <ethdev_driver.h>
17 #include <ethdev_pci.h>
18 
19 #include "nfp_common.h"
20 #include "nfp_rxtx.h"
21 #include "nfp_logs.h"
22 #include "nfp_ctrl.h"
23 
24 /* Prototypes */
25 static int nfp_net_rx_fill_freelist(struct nfp_net_rxq *rxq);
26 static inline void nfp_net_mbuf_alloc_failed(struct nfp_net_rxq *rxq);
27 static inline void nfp_net_set_hash(struct nfp_net_rxq *rxq,
28 				    struct nfp_net_rx_desc *rxd,
29 				    struct rte_mbuf *mbuf);
30 static inline void nfp_net_rx_cksum(struct nfp_net_rxq *rxq,
31 				    struct nfp_net_rx_desc *rxd,
32 				    struct rte_mbuf *mb);
33 static void nfp_net_rx_queue_release_mbufs(struct nfp_net_rxq *rxq);
34 static int nfp_net_tx_free_bufs(struct nfp_net_txq *txq);
35 static void nfp_net_tx_queue_release_mbufs(struct nfp_net_txq *txq);
36 static inline uint32_t nfp_free_tx_desc(struct nfp_net_txq *txq);
37 static inline uint32_t nfp_net_txq_full(struct nfp_net_txq *txq);
38 static inline void nfp_net_tx_tso(struct nfp_net_txq *txq,
39 				  struct nfp_net_tx_desc *txd,
40 				  struct rte_mbuf *mb);
41 static inline void nfp_net_tx_cksum(struct nfp_net_txq *txq,
42 				    struct nfp_net_tx_desc *txd,
43 				    struct rte_mbuf *mb);
44 
45 static int
46 nfp_net_rx_fill_freelist(struct nfp_net_rxq *rxq)
47 {
48 	struct nfp_net_rx_buff *rxe = rxq->rxbufs;
49 	uint64_t dma_addr;
50 	unsigned int i;
51 
52 	PMD_RX_LOG(DEBUG, "Fill Rx Freelist for %u descriptors",
53 		   rxq->rx_count);
54 
55 	for (i = 0; i < rxq->rx_count; i++) {
56 		struct nfp_net_rx_desc *rxd;
57 		struct rte_mbuf *mbuf = rte_pktmbuf_alloc(rxq->mem_pool);
58 
59 		if (mbuf == NULL) {
60 			PMD_DRV_LOG(ERR, "RX mbuf alloc failed queue_id=%u",
61 				(unsigned int)rxq->qidx);
62 			return -ENOMEM;
63 		}
64 
65 		dma_addr = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(mbuf));
66 
67 		rxd = &rxq->rxds[i];
68 		rxd->fld.dd = 0;
69 		rxd->fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
70 		rxd->fld.dma_addr_lo = dma_addr & 0xffffffff;
71 		rxe[i].mbuf = mbuf;
72 		PMD_RX_LOG(DEBUG, "[%d]: %" PRIx64, i, dma_addr);
73 	}
74 
75 	/* Make sure all writes are flushed before telling the hardware */
76 	rte_wmb();
77 
78 	/* Not advertising the whole ring as the firmware gets confused if so */
79 	PMD_RX_LOG(DEBUG, "Increment FL write pointer in %u",
80 		   rxq->rx_count - 1);
81 
82 	nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, rxq->rx_count - 1);
83 
84 	return 0;
85 }
86 
87 int
88 nfp_net_rx_freelist_setup(struct rte_eth_dev *dev)
89 {
90 	int i;
91 
92 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
93 		if (nfp_net_rx_fill_freelist(dev->data->rx_queues[i]) < 0)
94 			return -1;
95 	}
96 	return 0;
97 }
98 
99 uint32_t
100 nfp_net_rx_queue_count(void *rx_queue)
101 {
102 	struct nfp_net_rxq *rxq;
103 	struct nfp_net_rx_desc *rxds;
104 	uint32_t idx;
105 	uint32_t count;
106 
107 	rxq = rx_queue;
108 
109 	idx = rxq->rd_p;
110 
111 	count = 0;
112 
113 	/*
114 	 * Other PMDs check the DD bit in chunks of 4 descriptors and, if
115 	 * the first one has it set, count all four. That is not accurate
116 	 * but can be good for performance. Ideally the check should be done
117 	 * on descriptor chunks belonging to the same cache line; see the
118 	 * sketch after this function.
119 	 */
120 
121 	while (count < rxq->rx_count) {
122 		rxds = &rxq->rxds[idx];
123 		if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
124 			break;
125 
126 		count++;
127 		idx++;
128 
129 		/* Wrapping? */
130 		if ((idx) == rxq->rx_count)
131 			idx = 0;
132 	}
133 
134 	return count;
135 }
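
/*
 * A minimal sketch of the chunked DD-bit check mentioned in the comment in
 * nfp_net_rx_queue_count() above, assuming 8-byte descriptors, a cache-line
 * aligned ring base and a ring size that is a multiple of 8. The helper name
 * and the NFP_RXTX_DOC_SKETCHES guard are hypothetical; the guard only keeps
 * the sketch out of the build.
 */
#ifdef NFP_RXTX_DOC_SKETCHES
static uint32_t
nfp_net_rx_queue_count_chunked(struct nfp_net_rxq *rxq)
{
	uint32_t count = 0;
	uint32_t idx = rxq->rd_p;

	while (count < rxq->rx_count) {
		/* Descriptors left in the chunk holding rxds[idx] */
		uint32_t chunk = RTE_MIN(8 - (idx % 8), rxq->rx_count - idx);
		uint32_t i;

		/* Stop at the first descriptor without the DD bit set */
		for (i = 0; i < chunk; i++) {
			if ((rxq->rxds[idx + i].rxd.meta_len_dd &
			     PCIE_DESC_RX_DD) == 0)
				return count + i;
		}

		count += chunk;
		idx += chunk;
		if (idx == rxq->rx_count)
			idx = 0;
	}

	return count;
}
#endif /* NFP_RXTX_DOC_SKETCHES */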
136 
137 static inline void
138 nfp_net_mbuf_alloc_failed(struct nfp_net_rxq *rxq)
139 {
140 	rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
141 }
142 
143 /*
144  * nfp_net_set_hash - Set mbuf hash data
145  *
146  * The RSS hash and hash type are prepended to the packet data.
147  * Extract and decode them and set the mbuf fields.
148  */
149 static inline void
150 nfp_net_set_hash(struct nfp_net_rxq *rxq, struct nfp_net_rx_desc *rxd,
151 		 struct rte_mbuf *mbuf)
152 {
153 	struct nfp_net_hw *hw = rxq->hw;
154 	uint8_t *meta_offset;
155 	uint32_t meta_info;
156 	uint32_t hash = 0;
157 	uint32_t hash_type = 0;
158 
159 	if (!(hw->ctrl & NFP_NET_CFG_CTRL_RSS))
160 		return;
161 
162 	/* This is true for new firmware versions */
163 	if (likely(((hw->cap & NFP_NET_CFG_CTRL_RSS2) ||
164 	    (NFD_CFG_MAJOR_VERSION_of(hw->ver) == 4)) &&
165 	     NFP_DESC_META_LEN(rxd))) {
166 		/*
167 		 * New metadata API:
168 		 * <----  32 bit  ----->
169 		 * m    field type word
170 		 * e     data field #2
171 		 * t     data field #1
172 		 * a     data field #0
173 		 * ====================
174 		 *    packet data
175 		 *
176 		 * The field type word contains up to eight 4-bit field types.
177 		 * A field type refers to a data field word, and a data field word
178 		 * can have several field types. A parsing sketch follows this function.
179 		 */
180 		meta_offset = rte_pktmbuf_mtod(mbuf, uint8_t *);
181 		meta_offset -= NFP_DESC_META_LEN(rxd);
182 		meta_info = rte_be_to_cpu_32(*(uint32_t *)meta_offset);
183 		meta_offset += 4;
184 		/* The NFP PMD only supports hash metadata */
185 		switch (meta_info & NFP_NET_META_FIELD_MASK) {
186 		case NFP_NET_META_HASH:
187 			/* next field type is about the hash type */
188 			meta_info >>= NFP_NET_META_FIELD_SIZE;
189 			/* hash value is in the data field */
190 			hash = rte_be_to_cpu_32(*(uint32_t *)meta_offset);
191 			hash_type = meta_info & NFP_NET_META_FIELD_MASK;
192 			break;
193 		default:
194 			/* Unsupported metadata can be a performance issue */
195 			return;
196 		}
197 	} else {
198 		if (!(rxd->rxd.flags & PCIE_DESC_RX_RSS))
199 			return;
200 
201 		hash = rte_be_to_cpu_32(*(uint32_t *)NFP_HASH_OFFSET);
202 		hash_type = rte_be_to_cpu_32(*(uint32_t *)NFP_HASH_TYPE_OFFSET);
203 	}
204 
205 	mbuf->hash.rss = hash;
206 	mbuf->ol_flags |= PKT_RX_RSS_HASH;
207 
208 	switch (hash_type) {
209 	case NFP_NET_RSS_IPV4:
210 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4;
211 		break;
212 	case NFP_NET_RSS_IPV6:
213 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6;
214 		break;
215 	case NFP_NET_RSS_IPV6_EX:
216 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
217 		break;
218 	case NFP_NET_RSS_IPV4_TCP:
219 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4 | RTE_PTYPE_INNER_L4_TCP;
220 		break;
221 	case NFP_NET_RSS_IPV6_TCP:
222 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_TCP;
223 		break;
224 	case NFP_NET_RSS_IPV4_UDP:
225 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4 | RTE_PTYPE_INNER_L4_UDP;
226 		break;
227 	case NFP_NET_RSS_IPV6_UDP:
228 		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_UDP;
229 		break;
230 	default:
231 		mbuf->packet_type |= RTE_PTYPE_INNER_L4_MASK;
232 	}
233 }
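
/*
 * A sketch of how the metadata layout described above could be walked
 * generically: the field type word holds up to eight 4-bit field types and
 * each handled type consumes one 32-bit big-endian data word. Only the hash
 * field is parsed, mirroring nfp_net_set_hash(); the helper name and the
 * NFP_RXTX_DOC_SKETCHES guard are hypothetical.
 */
#ifdef NFP_RXTX_DOC_SKETCHES
static void
nfp_net_parse_meta_sketch(struct nfp_net_rx_desc *rxd, struct rte_mbuf *mbuf,
			  uint32_t *hash, uint32_t *hash_type)
{
	uint8_t *meta = rte_pktmbuf_mtod(mbuf, uint8_t *);
	uint32_t meta_info;
	int i;

	meta -= NFP_DESC_META_LEN(rxd);
	meta_info = rte_be_to_cpu_32(*(uint32_t *)meta);
	/* The first data field word follows the field type word */
	meta += 4;

	for (i = 0; i < 8 && meta_info != 0; i++) {
		switch (meta_info & NFP_NET_META_FIELD_MASK) {
		case NFP_NET_META_HASH:
			/* The next 4-bit field type carries the hash type */
			meta_info >>= NFP_NET_META_FIELD_SIZE;
			*hash_type = meta_info & NFP_NET_META_FIELD_MASK;
			*hash = rte_be_to_cpu_32(*(uint32_t *)meta);
			break;
		default:
			/* Unknown field: its data length cannot be derived */
			return;
		}
		meta_info >>= NFP_NET_META_FIELD_SIZE;
		meta += 4;
	}
}
#endif /* NFP_RXTX_DOC_SKETCHES */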
234 
235 /* nfp_net_rx_cksum - set mbuf checksum flags based on RX descriptor flags */
236 static inline void
237 nfp_net_rx_cksum(struct nfp_net_rxq *rxq, struct nfp_net_rx_desc *rxd,
238 		 struct rte_mbuf *mb)
239 {
240 	struct nfp_net_hw *hw = rxq->hw;
241 
242 	if (!(hw->ctrl & NFP_NET_CFG_CTRL_RXCSUM))
243 		return;
244 
245 	/* If IPv4 and the IP checksum is wrong, mark it as bad */
246 	if (unlikely((rxd->rxd.flags & PCIE_DESC_RX_IP4_CSUM) &&
247 	    !(rxd->rxd.flags & PCIE_DESC_RX_IP4_CSUM_OK)))
248 		mb->ol_flags |= PKT_RX_IP_CKSUM_BAD;
249 	else
250 		mb->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
251 
252 	/* If neither UDP nor TCP, return */
253 	if (!(rxd->rxd.flags & PCIE_DESC_RX_TCP_CSUM) &&
254 	    !(rxd->rxd.flags & PCIE_DESC_RX_UDP_CSUM))
255 		return;
256 
257 	if (likely(rxd->rxd.flags & PCIE_DESC_RX_L4_CSUM_OK))
258 		mb->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
259 	else
260 		mb->ol_flags |= PKT_RX_L4_CKSUM_BAD;
261 }
262 
263 /*
264  * RX path design:
265  *
266  * There are some decisions to make:
267  * 1) How to check the DD bit of RX descriptors
268  * 2) How and when to allocate new mbufs
269  *
270  * The current implementation checks a single DD bit per loop iteration. As
271  * each descriptor is 8 bytes, it is likely a better idea to check all the
272  * descriptors sharing a cache line instead. Tests with this change have not
273  * shown any performance improvement, but it deserves further investigation.
274  * For example, depending on which descriptor comes next, fewer than 8
275  * descriptors may share its cache line, which implies extra work that could
276  * be counterproductive by itself. Indeed, recent firmware changes already do
277  * this: several descriptors are written with the DD bit set at once, saving
278  * PCIe bandwidth and DMA operations from the NFP.
279  *
280  * An mbuf is allocated whenever a new packet is received: the descriptor is
281  * linked with the new mbuf and the old one is handed to the user. The main
282  * drawback of this design is that per-packet allocation is heavier than the
283  * bulk allocation DPDK offers with rte_mempool_get_bulk. From the cache
284  * point of view, allocating the mbuf early, as done now, does not seem to
285  * have any benefit; tests with this change have not shown any improvement
286  * either. Also, rte_mempool_get_bulk returns all or nothing, so the
287  * implications of that allocation model need further study. A bulk-refill
288  * sketch follows this comment.
289  */
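
/*
 * A sketch of the bulk-refill alternative discussed above, based on
 * rte_pktmbuf_alloc_bulk (all or nothing, like rte_mempool_get_bulk). The
 * caller is assumed to pass the first freelist index to refill and a count
 * no larger than the local array; the helper name and the
 * NFP_RXTX_DOC_SKETCHES guard are hypothetical.
 */
#ifdef NFP_RXTX_DOC_SKETCHES
static int
nfp_net_rx_refill_bulk_sketch(struct nfp_net_rxq *rxq, uint16_t start,
			      uint16_t n)
{
	struct rte_mbuf *mbufs[64];
	uint16_t i;

	if (n > RTE_DIM(mbufs))
		return -EINVAL;

	/* All-or-nothing: either n mbufs are returned or none at all */
	if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, mbufs, n) != 0)
		return -ENOMEM;

	for (i = 0; i < n; i++) {
		uint32_t idx = (start + i) % rxq->rx_count;
		uint64_t dma_addr =
			rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(mbufs[i]));

		rxq->rxbufs[idx].mbuf = mbufs[i];
		rxq->rxds[idx].fld.dd = 0;
		rxq->rxds[idx].fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
		rxq->rxds[idx].fld.dma_addr_lo = dma_addr & 0xffffffff;
	}

	/* Descriptors must be visible before the freelist pointer is bumped */
	rte_wmb();
	nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, n);

	return 0;
}
#endif /* NFP_RXTX_DOC_SKETCHES */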
290 
291 uint16_t
292 nfp_net_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
293 {
294 	struct nfp_net_rxq *rxq;
295 	struct nfp_net_rx_desc *rxds;
296 	struct nfp_net_rx_buff *rxb;
297 	struct nfp_net_hw *hw;
298 	struct rte_mbuf *mb;
299 	struct rte_mbuf *new_mb;
300 	uint16_t nb_hold;
301 	uint64_t dma_addr;
302 	int avail;
303 
304 	rxq = rx_queue;
305 	if (unlikely(rxq == NULL)) {
306 		/*
307 		 * DPDK only checks that the queue index is lower than the
308 		 * number of enabled queues, but the queue still needs to be configured
309 		 */
310 		RTE_LOG_DP(ERR, PMD, "RX Bad queue\n");
311 		return 0;
312 	}
313 
314 	hw = rxq->hw;
315 	avail = 0;
316 	nb_hold = 0;
317 
318 	while (avail < nb_pkts) {
319 		rxb = &rxq->rxbufs[rxq->rd_p];
320 		if (unlikely(rxb == NULL)) {
321 			RTE_LOG_DP(ERR, PMD, "rxb does not exist!\n");
322 			break;
323 		}
324 
325 		rxds = &rxq->rxds[rxq->rd_p];
326 		if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
327 			break;
328 
329 		/*
330 		 * Memory barrier to ensure that the rest of the
331 		 * descriptor is not read before the DD bit.
332 		 */
333 		rte_rmb();
334 
335 		/*
336 		 * We got a packet. Let's alloc a new mbuf for refilling the
337 		 * free descriptor ring as soon as possible
338 		 */
339 		new_mb = rte_pktmbuf_alloc(rxq->mem_pool);
340 		if (unlikely(new_mb == NULL)) {
341 			RTE_LOG_DP(DEBUG, PMD,
342 			"RX mbuf alloc failed port_id=%u queue_id=%u\n",
343 				rxq->port_id, (unsigned int)rxq->qidx);
344 			nfp_net_mbuf_alloc_failed(rxq);
345 			break;
346 		}
347 
348 		nb_hold++;
349 
350 		/*
351 		 * Grab the mbuf and refill the descriptor with the
352 		 * previously allocated mbuf
353 		 */
354 		mb = rxb->mbuf;
355 		rxb->mbuf = new_mb;
356 
357 		PMD_RX_LOG(DEBUG, "Packet len: %u, mbuf_size: %u",
358 			   rxds->rxd.data_len, rxq->mbuf_size);
359 
360 		/* Size of this segment */
361 		mb->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
362 		/* Size of the whole packet. We just support 1 segment */
363 		mb->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
364 
365 		if (unlikely((mb->data_len + hw->rx_offset) >
366 			     rxq->mbuf_size)) {
367 			/*
368 			 * This should not happen and it is the user's
369 			 * responsibility to avoid it, but give some
370 			 * information about the error anyway.
371 			 */
372 			RTE_LOG_DP(ERR, PMD,
373 				"mbuf overflow likely due to the RX offset.\n"
374 				"\t\tYour mbuf size should have extra space for"
375 				" RX offset=%u bytes.\n"
376 				"\t\tCurrently you just have %u bytes available"
377 				" but the received packet is %u bytes long",
378 				hw->rx_offset,
379 				rxq->mbuf_size - hw->rx_offset, mb->data_len);
380 			rte_pktmbuf_free(mb);
381 			break;
382 		}
383 
384 		/* Filling the received mbuf with packet info */
385 		if (hw->rx_offset)
386 			mb->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
387 		else
388 			mb->data_off = RTE_PKTMBUF_HEADROOM +
389 				       NFP_DESC_META_LEN(rxds);
390 
391 		/* No scatter mode supported */
392 		mb->nb_segs = 1;
393 		mb->next = NULL;
394 
395 		mb->port = rxq->port_id;
396 
397 		/* Checking the RSS flag */
398 		nfp_net_set_hash(rxq, rxds, mb);
399 
400 		/* Checking the checksum flag */
401 		nfp_net_rx_cksum(rxq, rxds, mb);
402 
403 		if ((rxds->rxd.flags & PCIE_DESC_RX_VLAN) &&
404 		    (hw->ctrl & NFP_NET_CFG_CTRL_RXVLAN)) {
405 			mb->vlan_tci = rte_cpu_to_le_32(rxds->rxd.vlan);
406 			mb->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
407 		}
408 
409 		/* Adding the mbuf to the mbuf array passed by the app */
410 		rx_pkts[avail++] = mb;
411 
412 		/* Now resetting and updating the descriptor */
413 		rxds->vals[0] = 0;
414 		rxds->vals[1] = 0;
415 		dma_addr = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(new_mb));
416 		rxds->fld.dd = 0;
417 		rxds->fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
418 		rxds->fld.dma_addr_lo = dma_addr & 0xffffffff;
419 
420 		rxq->rd_p++;
421 		if (unlikely(rxq->rd_p == rxq->rx_count)) /* wrapping?*/
422 			rxq->rd_p = 0;
423 	}
424 
425 	if (nb_hold == 0)
426 		return nb_hold;
427 
428 	PMD_RX_LOG(DEBUG, "RX  port_id=%u queue_id=%u, %d packets received",
429 		   rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
430 
431 	nb_hold += rxq->nb_rx_hold;
432 
433 	/*
434 	 * FL descriptors need to be written before incrementing the
435 	 * FL queue WR pointer
436 	 */
437 	rte_wmb();
438 	if (nb_hold > rxq->rx_free_thresh) {
439 		PMD_RX_LOG(DEBUG, "port=%u queue=%u nb_hold=%u avail=%u",
440 			   rxq->port_id, (unsigned int)rxq->qidx,
441 			   (unsigned int)nb_hold, (unsigned int)avail);
442 		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
443 		nb_hold = 0;
444 	}
445 	rxq->nb_rx_hold = nb_hold;
446 
447 	return avail;
448 }
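
/*
 * A sketch of how an application drives the receive path above through the
 * generic ethdev burst API. Port id, queue id and burst size are arbitrary
 * examples; the helper name and the NFP_RXTX_DOC_SKETCHES guard are
 * hypothetical.
 */
#ifdef NFP_RXTX_DOC_SKETCHES
static void
nfp_rx_burst_usage_sketch(uint16_t port_id, uint16_t queue_id)
{
	struct rte_mbuf *pkts[32];
	uint16_t nb_rx, i;

	/* Ends up in nfp_net_recv_pkts() for a port bound to this PMD */
	nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts, RTE_DIM(pkts));

	for (i = 0; i < nb_rx; i++) {
		/* hash.rss and ol_flags were filled in by the RX path above */
		rte_pktmbuf_free(pkts[i]);
	}
}
#endif /* NFP_RXTX_DOC_SKETCHES */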
449 
450 static void
451 nfp_net_rx_queue_release_mbufs(struct nfp_net_rxq *rxq)
452 {
453 	unsigned int i;
454 
455 	if (rxq->rxbufs == NULL)
456 		return;
457 
458 	for (i = 0; i < rxq->rx_count; i++) {
459 		if (rxq->rxbufs[i].mbuf) {
460 			rte_pktmbuf_free_seg(rxq->rxbufs[i].mbuf);
461 			rxq->rxbufs[i].mbuf = NULL;
462 		}
463 	}
464 }
465 
466 void
467 nfp_net_rx_queue_release(struct rte_eth_dev *dev, uint16_t queue_idx)
468 {
469 	struct nfp_net_rxq *rxq = dev->data->rx_queues[queue_idx];
470 
471 	if (rxq) {
472 		nfp_net_rx_queue_release_mbufs(rxq);
473 		rte_free(rxq->rxbufs);
474 		rte_free(rxq);
475 	}
476 }
477 
478 void
479 nfp_net_reset_rx_queue(struct nfp_net_rxq *rxq)
480 {
481 	nfp_net_rx_queue_release_mbufs(rxq);
482 	rxq->rd_p = 0;
483 	rxq->nb_rx_hold = 0;
484 }
485 
486 int
487 nfp_net_rx_queue_setup(struct rte_eth_dev *dev,
488 		       uint16_t queue_idx, uint16_t nb_desc,
489 		       unsigned int socket_id,
490 		       const struct rte_eth_rxconf *rx_conf,
491 		       struct rte_mempool *mp)
492 {
493 	const struct rte_memzone *tz;
494 	struct nfp_net_rxq *rxq;
495 	struct nfp_net_hw *hw;
496 	uint32_t rx_desc_sz;
497 
498 	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
499 
500 	PMD_INIT_FUNC_TRACE();
501 
502 	/* Validating number of descriptors */
503 	rx_desc_sz = nb_desc * sizeof(struct nfp_net_rx_desc);
504 	if (rx_desc_sz % NFP_ALIGN_RING_DESC != 0 ||
505 	    nb_desc > NFP_NET_MAX_RX_DESC ||
506 	    nb_desc < NFP_NET_MIN_RX_DESC) {
507 		PMD_DRV_LOG(ERR, "Wrong nb_desc value");
508 		return -EINVAL;
509 	}
510 
511 	/*
512 	 * Free memory prior to re-allocation if needed. This is the case after
513 	 * calling nfp_net_stop
514 	 */
515 	if (dev->data->rx_queues[queue_idx]) {
516 		nfp_net_rx_queue_release(dev, queue_idx);
517 		dev->data->rx_queues[queue_idx] = NULL;
518 	}
519 
520 	/* Allocating rx queue data structure */
521 	rxq = rte_zmalloc_socket("ethdev RX queue", sizeof(struct nfp_net_rxq),
522 				 RTE_CACHE_LINE_SIZE, socket_id);
523 	if (rxq == NULL)
524 		return -ENOMEM;
525 
526 	dev->data->rx_queues[queue_idx] = rxq;
527 
528 	/* HW queue mapping based on firmware configuration */
529 	rxq->qidx = queue_idx;
530 	rxq->fl_qcidx = queue_idx * hw->stride_rx;
531 	rxq->rx_qcidx = rxq->fl_qcidx + (hw->stride_rx - 1);
532 	rxq->qcp_fl = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->fl_qcidx);
533 	rxq->qcp_rx = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->rx_qcidx);
534 
535 	/*
536 	 * Tracking mbuf size for detecting a potential mbuf overflow due to
537 	 * RX offset
538 	 */
539 	rxq->mem_pool = mp;
540 	rxq->mbuf_size = rxq->mem_pool->elt_size;
541 	rxq->mbuf_size -= (sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM);
542 	hw->flbufsz = rxq->mbuf_size;
543 
544 	rxq->rx_count = nb_desc;
545 	rxq->port_id = dev->data->port_id;
546 	rxq->rx_free_thresh = rx_conf->rx_free_thresh;
547 	rxq->drop_en = rx_conf->rx_drop_en;
548 
549 	/*
550 	 * Allocate RX ring hardware descriptors. A memzone large enough to
551 	 * handle the maximum ring size is allocated in order to allow for
552 	 * resizing in later calls to the queue setup function.
553 	 */
554 	tz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx,
555 				   sizeof(struct nfp_net_rx_desc) *
556 				   NFP_NET_MAX_RX_DESC, NFP_MEMZONE_ALIGN,
557 				   socket_id);
558 
559 	if (tz == NULL) {
560 		PMD_DRV_LOG(ERR, "Error allocating rx dma");
561 		nfp_net_rx_queue_release(dev, queue_idx);
562 		dev->data->rx_queues[queue_idx] = NULL;
563 		return -ENOMEM;
564 	}
565 
566 	/* Saving physical and virtual addresses for the RX ring */
567 	rxq->dma = (uint64_t)tz->iova;
568 	rxq->rxds = (struct nfp_net_rx_desc *)tz->addr;
569 
570 	/* mbuf pointers array for referencing mbufs linked to RX descriptors */
571 	rxq->rxbufs = rte_zmalloc_socket("rxq->rxbufs",
572 					 sizeof(*rxq->rxbufs) * nb_desc,
573 					 RTE_CACHE_LINE_SIZE, socket_id);
574 	if (rxq->rxbufs == NULL) {
575 		nfp_net_rx_queue_release(dev, queue_idx);
576 		dev->data->rx_queues[queue_idx] = NULL;
577 		return -ENOMEM;
578 	}
579 
580 	PMD_RX_LOG(DEBUG, "rxbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
581 		   rxq->rxbufs, rxq->rxds, (unsigned long)rxq->dma);
582 
583 	nfp_net_reset_rx_queue(rxq);
584 
585 	rxq->hw = hw;
586 
587 	/*
588 	 * Telling the HW about the physical address of the RX ring and number
589 	 * of descriptors in log2 format
590 	 */
591 	nn_cfg_writeq(hw, NFP_NET_CFG_RXR_ADDR(queue_idx), rxq->dma);
592 	nn_cfg_writeb(hw, NFP_NET_CFG_RXR_SZ(queue_idx), rte_log2_u32(nb_desc));
593 
594 	return 0;
595 }
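
/*
 * A sketch of a typical application call into the RX queue setup above. The
 * ring size has to respect the NFP_NET_MIN_RX_DESC/NFP_NET_MAX_RX_DESC checks
 * and the mempool element size ends up as rxq->mbuf_size. Pool name, sizes,
 * the helper name and the NFP_RXTX_DOC_SKETCHES guard are hypothetical
 * examples.
 */
#ifdef NFP_RXTX_DOC_SKETCHES
static int
nfp_rx_queue_setup_usage_sketch(uint16_t port_id, uint16_t queue_id,
				unsigned int socket_id)
{
	struct rte_mempool *mp;

	mp = rte_pktmbuf_pool_create("rx_pool_sketch", 4096, 256, 0,
				     RTE_MBUF_DEFAULT_BUF_SIZE, socket_id);
	if (mp == NULL)
		return -ENOMEM;

	/* 1024 descriptors; a NULL rxconf picks the driver defaults */
	return rte_eth_rx_queue_setup(port_id, queue_id, 1024, socket_id,
				      NULL, mp);
}
#endif /* NFP_RXTX_DOC_SKETCHES */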
596 
597 /*
598  * nfp_net_tx_free_bufs - Check for descriptors with a complete status
599  *
600  * @txq: TX queue to work with
601  * Returns number of descriptors freed
602  */
603 static int
604 nfp_net_tx_free_bufs(struct nfp_net_txq *txq)
605 {
606 	uint32_t qcp_rd_p;
607 	int todo;
608 
609 	PMD_TX_LOG(DEBUG, "queue %u. Check for descriptor with a complete"
610 		   " status", txq->qidx);
611 
612 	/* Work out how many packets have been sent */
613 	qcp_rd_p = nfp_qcp_read(txq->qcp_q, NFP_QCP_READ_PTR);
614 
615 	if (qcp_rd_p == txq->rd_p) {
616 		PMD_TX_LOG(DEBUG, "queue %u: It seems harrier is not sending "
617 			   "packets (%u, %u)", txq->qidx,
618 			   qcp_rd_p, txq->rd_p);
619 		return 0;
620 	}
621 
622 	if (qcp_rd_p > txq->rd_p)
623 		todo = qcp_rd_p - txq->rd_p;
624 	else
625 		todo = qcp_rd_p + txq->tx_count - txq->rd_p;
626 
627 	PMD_TX_LOG(DEBUG, "qcp_rd_p %u, txq->rd_p: %u",
628 		   qcp_rd_p, txq->rd_p);
629 
630 	if (todo == 0)
631 		return todo;
632 
633 	txq->rd_p += todo;
634 	if (unlikely(txq->rd_p >= txq->tx_count))
635 		txq->rd_p -= txq->tx_count;
636 
637 	return todo;
638 }
639 
640 static void
641 nfp_net_tx_queue_release_mbufs(struct nfp_net_txq *txq)
642 {
643 	unsigned int i;
644 
645 	if (txq->txbufs == NULL)
646 		return;
647 
648 	for (i = 0; i < txq->tx_count; i++) {
649 		if (txq->txbufs[i].mbuf) {
650 			rte_pktmbuf_free_seg(txq->txbufs[i].mbuf);
651 			txq->txbufs[i].mbuf = NULL;
652 		}
653 	}
654 }
655 
656 void
657 nfp_net_tx_queue_release(struct rte_eth_dev *dev, uint16_t queue_idx)
658 {
659 	struct nfp_net_txq *txq = dev->data->tx_queues[queue_idx];
660 
661 	if (txq) {
662 		nfp_net_tx_queue_release_mbufs(txq);
663 		rte_free(txq->txbufs);
664 		rte_free(txq);
665 	}
666 }
667 
668 void
669 nfp_net_reset_tx_queue(struct nfp_net_txq *txq)
670 {
671 	nfp_net_tx_queue_release_mbufs(txq);
672 	txq->wr_p = 0;
673 	txq->rd_p = 0;
674 }
675 
676 int
677 nfp_net_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
678 		       uint16_t nb_desc, unsigned int socket_id,
679 		       const struct rte_eth_txconf *tx_conf)
680 {
681 	const struct rte_memzone *tz;
682 	struct nfp_net_txq *txq;
683 	uint16_t tx_free_thresh;
684 	struct nfp_net_hw *hw;
685 	uint32_t tx_desc_sz;
686 
687 	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
688 
689 	PMD_INIT_FUNC_TRACE();
690 
691 	/* Validating number of descriptors */
692 	tx_desc_sz = nb_desc * sizeof(struct nfp_net_tx_desc);
693 	if (tx_desc_sz % NFP_ALIGN_RING_DESC != 0 ||
694 	    nb_desc > NFP_NET_MAX_TX_DESC ||
695 	    nb_desc < NFP_NET_MIN_TX_DESC) {
696 		PMD_DRV_LOG(ERR, "Wrong nb_desc value");
697 		return -EINVAL;
698 	}
699 
700 	tx_free_thresh = (uint16_t)((tx_conf->tx_free_thresh) ?
701 				    tx_conf->tx_free_thresh :
702 				    DEFAULT_TX_FREE_THRESH);
703 
704 	if (tx_free_thresh > (nb_desc)) {
705 		PMD_DRV_LOG(ERR,
706 			"tx_free_thresh must be less than the number of TX "
707 			"descriptors. (tx_free_thresh=%u port=%d "
708 			"queue=%d)", (unsigned int)tx_free_thresh,
709 			dev->data->port_id, (int)queue_idx);
710 		return -(EINVAL);
711 	}
712 
713 	/*
714 	 * Free memory prior to re-allocation if needed. This is the case after
715 	 * calling nfp_net_stop
716 	 */
717 	if (dev->data->tx_queues[queue_idx]) {
718 		PMD_TX_LOG(DEBUG, "Freeing memory prior to re-allocation %d",
719 			   queue_idx);
720 		nfp_net_tx_queue_release(dev, queue_idx);
721 		dev->data->tx_queues[queue_idx] = NULL;
722 	}
723 
724 	/* Allocating tx queue data structure */
725 	txq = rte_zmalloc_socket("ethdev TX queue", sizeof(struct nfp_net_txq),
726 				 RTE_CACHE_LINE_SIZE, socket_id);
727 	if (txq == NULL) {
728 		PMD_DRV_LOG(ERR, "Error allocating tx dma");
729 		return -ENOMEM;
730 	}
731 
732 	dev->data->tx_queues[queue_idx] = txq;
733 
734 	/*
735 	 * Allocate TX ring hardware descriptors. A memzone large enough to
736 	 * handle the maximum ring size is allocated in order to allow for
737 	 * resizing in later calls to the queue setup function.
738 	 */
739 	tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx,
740 				   sizeof(struct nfp_net_tx_desc) *
741 				   NFP_NET_MAX_TX_DESC, NFP_MEMZONE_ALIGN,
742 				   socket_id);
743 	if (tz == NULL) {
744 		PMD_DRV_LOG(ERR, "Error allocating tx dma");
745 		nfp_net_tx_queue_release(dev, queue_idx);
746 		dev->data->tx_queues[queue_idx] = NULL;
747 		return -ENOMEM;
748 	}
749 
750 	txq->tx_count = nb_desc;
751 	txq->tx_free_thresh = tx_free_thresh;
752 	txq->tx_pthresh = tx_conf->tx_thresh.pthresh;
753 	txq->tx_hthresh = tx_conf->tx_thresh.hthresh;
754 	txq->tx_wthresh = tx_conf->tx_thresh.wthresh;
755 
756 	/* queue mapping based on firmware configuration */
757 	txq->qidx = queue_idx;
758 	txq->tx_qcidx = queue_idx * hw->stride_tx;
759 	txq->qcp_q = hw->tx_bar + NFP_QCP_QUEUE_OFF(txq->tx_qcidx);
760 
761 	txq->port_id = dev->data->port_id;
762 
763 	/* Saving physical and virtual addresses for the TX ring */
764 	txq->dma = (uint64_t)tz->iova;
765 	txq->txds = (struct nfp_net_tx_desc *)tz->addr;
766 
767 	/* mbuf pointers array for referencing mbufs linked to TX descriptors */
768 	txq->txbufs = rte_zmalloc_socket("txq->txbufs",
769 					 sizeof(*txq->txbufs) * nb_desc,
770 					 RTE_CACHE_LINE_SIZE, socket_id);
771 	if (txq->txbufs == NULL) {
772 		nfp_net_tx_queue_release(dev, queue_idx);
773 		dev->data->tx_queues[queue_idx] = NULL;
774 		return -ENOMEM;
775 	}
776 	PMD_TX_LOG(DEBUG, "txbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
777 		   txq->txbufs, txq->txds, (unsigned long)txq->dma);
778 
779 	nfp_net_reset_tx_queue(txq);
780 
781 	txq->hw = hw;
782 
783 	/*
784 	 * Telling the HW about the physical address of the TX ring and number
785 	 * of descriptors in log2 format
786 	 */
787 	nn_cfg_writeq(hw, NFP_NET_CFG_TXR_ADDR(queue_idx), txq->dma);
788 	nn_cfg_writeb(hw, NFP_NET_CFG_TXR_SZ(queue_idx), rte_log2_u32(nb_desc));
789 
790 	return 0;
791 }
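
/*
 * A sketch of the TX counterpart: tx_free_thresh is validated against nb_desc
 * in nfp_net_tx_queue_setup() and defaults to DEFAULT_TX_FREE_THRESH when
 * left at zero. Values, the helper name and the NFP_RXTX_DOC_SKETCHES guard
 * are hypothetical examples.
 */
#ifdef NFP_RXTX_DOC_SKETCHES
static int
nfp_tx_queue_setup_usage_sketch(uint16_t port_id, uint16_t queue_id,
				unsigned int socket_id)
{
	/* Trigger completion cleanup once free descriptors drop below 32 */
	struct rte_eth_txconf txconf = { .tx_free_thresh = 32 };

	return rte_eth_tx_queue_setup(port_id, queue_id, 1024, socket_id,
				      &txconf);
}
#endif /* NFP_RXTX_DOC_SKETCHES */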
792 
793 /* Always leave some descriptors unused to avoid wrap-around confusion */
794 static inline
795 uint32_t nfp_free_tx_desc(struct nfp_net_txq *txq)
796 {
797 	if (txq->wr_p >= txq->rd_p)
798 		return txq->tx_count - (txq->wr_p - txq->rd_p) - 8;
799 	else
800 		return txq->rd_p - txq->wr_p - 8;
801 }
802 
803 /*
804  * nfp_net_txq_full - Check if the number of free TX descriptors
805  * is below tx_free_threshold
806  *
807  * @txq: TX queue to check
808  *
809  * This function uses the host copy of the read/write pointers
810  */
811 static inline
812 uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
813 {
814 	return (nfp_free_tx_desc(txq) < txq->tx_free_thresh);
815 }
816 
817 /* nfp_net_tx_tso - Set TX descriptor for TSO */
818 static inline void
819 nfp_net_tx_tso(struct nfp_net_txq *txq, struct nfp_net_tx_desc *txd,
820 	       struct rte_mbuf *mb)
821 {
822 	uint64_t ol_flags;
823 	struct nfp_net_hw *hw = txq->hw;
824 
825 	if (!(hw->cap & NFP_NET_CFG_CTRL_LSO_ANY))
826 		goto clean_txd;
827 
828 	ol_flags = mb->ol_flags;
829 
830 	if (!(ol_flags & PKT_TX_TCP_SEG))
831 		goto clean_txd;
832 
833 	txd->l3_offset = mb->l2_len;
834 	txd->l4_offset = mb->l2_len + mb->l3_len;
835 	txd->lso_hdrlen = mb->l2_len + mb->l3_len + mb->l4_len;
836 	txd->mss = rte_cpu_to_le_16(mb->tso_segsz);
837 	txd->flags = PCIE_DESC_TX_LSO;
838 	return;
839 
840 clean_txd:
841 	txd->flags = 0;
842 	txd->l3_offset = 0;
843 	txd->l4_offset = 0;
844 	txd->lso_hdrlen = 0;
845 	txd->mss = 0;
846 }
847 
848 /* nfp_net_tx_cksum - Set TX CSUM offload flags in TX descriptor */
849 static inline void
850 nfp_net_tx_cksum(struct nfp_net_txq *txq, struct nfp_net_tx_desc *txd,
851 		 struct rte_mbuf *mb)
852 {
853 	uint64_t ol_flags;
854 	struct nfp_net_hw *hw = txq->hw;
855 
856 	if (!(hw->cap & NFP_NET_CFG_CTRL_TXCSUM))
857 		return;
858 
859 	ol_flags = mb->ol_flags;
860 
861 	/* IPv4 header checksum offload; IPv6 has no header checksum */
862 	if (ol_flags & PKT_TX_IP_CKSUM)
863 		txd->flags |= PCIE_DESC_TX_IP4_CSUM;
864 
865 	switch (ol_flags & PKT_TX_L4_MASK) {
866 	case PKT_TX_UDP_CKSUM:
867 		txd->flags |= PCIE_DESC_TX_UDP_CSUM;
868 		break;
869 	case PKT_TX_TCP_CKSUM:
870 		txd->flags |= PCIE_DESC_TX_TCP_CSUM;
871 		break;
872 	}
873 
874 	if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK))
875 		txd->flags |= PCIE_DESC_TX_CSUM;
876 }
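
/*
 * A sketch of the mbuf fields an application sets so that nfp_net_tx_cksum()
 * and nfp_net_tx_tso() above can fill the descriptor. Header lengths assume
 * an untagged IPv4/TCP packet without options; the helper name and the
 * NFP_RXTX_DOC_SKETCHES guard are hypothetical.
 */
#ifdef NFP_RXTX_DOC_SKETCHES
static void
nfp_tx_offload_prep_sketch(struct rte_mbuf *m, uint16_t mss)
{
	/* Offsets consumed by nfp_net_tx_tso() for l3/l4/lso_hdrlen */
	m->l2_len = 14;	/* Ethernet header */
	m->l3_len = 20;	/* IPv4 header without options */
	m->l4_len = 20;	/* TCP header without options */

	/* Request IP and TCP checksum offload (PCIE_DESC_TX_*_CSUM flags) */
	m->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM;

	/* Optionally request TSO; nfp_net_tx_tso() also needs the MSS */
	if (mss != 0) {
		m->ol_flags |= PKT_TX_TCP_SEG;
		m->tso_segsz = mss;
	}
}
#endif /* NFP_RXTX_DOC_SKETCHES */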
877 
878 uint16_t
879 nfp_net_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
880 {
881 	struct nfp_net_txq *txq;
882 	struct nfp_net_hw *hw;
883 	struct nfp_net_tx_desc *txds, txd;
884 	struct rte_mbuf *pkt;
885 	uint64_t dma_addr;
886 	int pkt_size, dma_size;
887 	uint16_t free_descs, issued_descs;
888 	struct rte_mbuf **lmbuf;
889 	int i;
890 
891 	txq = tx_queue;
892 	hw = txq->hw;
893 	txds = &txq->txds[txq->wr_p];
894 
895 	PMD_TX_LOG(DEBUG, "working for queue %u at pos %d and %u packets",
896 		   txq->qidx, txq->wr_p, nb_pkts);
897 
898 	if ((nfp_free_tx_desc(txq) < nb_pkts) || (nfp_net_txq_full(txq)))
899 		nfp_net_tx_free_bufs(txq);
900 
901 	free_descs = (uint16_t)nfp_free_tx_desc(txq);
902 	if (unlikely(free_descs == 0))
903 		return 0;
904 
905 	pkt = *tx_pkts;
906 
907 	i = 0;
908 	issued_descs = 0;
909 	PMD_TX_LOG(DEBUG, "queue: %u. Sending %u packets",
910 		   txq->qidx, nb_pkts);
911 	/* Sending packets */
912 	while ((i < nb_pkts) && free_descs) {
913 		/* Grabbing the mbuf linked to the current descriptor */
914 		lmbuf = &txq->txbufs[txq->wr_p].mbuf;
915 		/* Warming the cache for releasing the mbuf later on */
916 		RTE_MBUF_PREFETCH_TO_FREE(*lmbuf);
917 
918 		pkt = *(tx_pkts + i);
919 
920 		if (unlikely(pkt->nb_segs > 1 &&
921 			     !(hw->cap & NFP_NET_CFG_CTRL_GATHER))) {
922 			PMD_INIT_LOG(INFO, "NFP_NET_CFG_CTRL_GATHER not set");
923 			rte_panic("Multisegment packet unsupported\n");
924 		}
925 
926 		/* Checking if we have enough descriptors */
927 		if (unlikely(pkt->nb_segs > free_descs))
928 			goto xmit_end;
929 
930 		/*
931 		 * Checksum and VLAN flags are set only in the first descriptor
932 		 * of a multisegment packet, but TSO info needs to be in all of them.
933 		 */
934 		txd.data_len = pkt->pkt_len;
935 		nfp_net_tx_tso(txq, &txd, pkt);
936 		nfp_net_tx_cksum(txq, &txd, pkt);
937 
938 		if ((pkt->ol_flags & PKT_TX_VLAN_PKT) &&
939 		    (hw->cap & NFP_NET_CFG_CTRL_TXVLAN)) {
940 			txd.flags |= PCIE_DESC_TX_VLAN;
941 			txd.vlan = pkt->vlan_tci;
942 		}
943 
944 		/*
945 		 * mbuf data_len is the data in one segment and pkt_len is the
946 		 * data in the whole packet. For a single-segment packet,
947 		 * data_len == pkt_len.
948 		 */
949 		pkt_size = pkt->pkt_len;
950 
951 		while (pkt) {
952 			/* Copying TSO, VLAN and cksum info */
953 			*txds = txd;
954 
955 			/* Releasing mbuf used by this descriptor previously */
956 			if (*lmbuf)
957 				rte_pktmbuf_free_seg(*lmbuf);
958 
959 			/*
960 			 * Linking mbuf with descriptor for being released
961 			 * next time descriptor is used
962 			 */
963 			*lmbuf = pkt;
964 
965 			dma_size = pkt->data_len;
966 			dma_addr = rte_mbuf_data_iova(pkt);
967 			PMD_TX_LOG(DEBUG, "Working with mbuf at dma address:"
968 				   "%" PRIx64 "", dma_addr);
969 
970 			/* Filling descriptors fields */
971 			txds->dma_len = dma_size;
972 			txds->data_len = txd.data_len;
973 			txds->dma_addr_hi = (dma_addr >> 32) & 0xff;
974 			txds->dma_addr_lo = (dma_addr & 0xffffffff);
975 			ASSERT(free_descs > 0);
976 			free_descs--;
977 
978 			txq->wr_p++;
979 			if (unlikely(txq->wr_p == txq->tx_count)) /* wrapping?*/
980 				txq->wr_p = 0;
981 
982 			pkt_size -= dma_size;
983 
984 			/*
985 			 * Setting the EOP flag, giving priority to packets
986 			 * with just one segment
987 			 */
988 			if (likely(!pkt_size))
989 				txds->offset_eop = PCIE_DESC_TX_EOP;
990 			else
991 				txds->offset_eop = 0;
992 
993 			pkt = pkt->next;
994 			/* Referencing next free TX descriptor */
995 			txds = &txq->txds[txq->wr_p];
996 			lmbuf = &txq->txbufs[txq->wr_p].mbuf;
997 			issued_descs++;
998 		}
999 		i++;
1000 	}
1001 
1002 xmit_end:
1003 	/* Increment write pointers. Force memory write before we let HW know */
1004 	rte_wmb();
1005 	nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, issued_descs);
1006 
1007 	return i;
1008 }
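
/*
 * A sketch of the usual application pattern around the transmit path above:
 * nfp_net_xmit_pkts() may send fewer packets than requested when descriptors
 * run out, so the remainder is retried. The helper name and the
 * NFP_RXTX_DOC_SKETCHES guard are hypothetical.
 */
#ifdef NFP_RXTX_DOC_SKETCHES
static uint16_t
nfp_tx_burst_usage_sketch(uint16_t port_id, uint16_t queue_id,
			  struct rte_mbuf **pkts, uint16_t nb_pkts)
{
	uint16_t sent = 0;

	/* Ends up in nfp_net_xmit_pkts() for a port bound to this PMD */
	while (sent < nb_pkts) {
		uint16_t n = rte_eth_tx_burst(port_id, queue_id,
					      pkts + sent, nb_pkts - sent);
		if (n == 0)
			break;	/* No free descriptors; caller decides what to do */
		sent += n;
	}

	return sent;
}
#endif /* NFP_RXTX_DOC_SKETCHES */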
1009