/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2014-2021 Netronome Systems, Inc.
 * All rights reserved.
 *
 * Small portions derived from code Copyright(c) 2010-2015 Intel Corporation.
 */

/*
 * vim:shiftwidth=8:noexpandtab
 *
 * @file dpdk/pmd/nfp_rxtx.c
 *
 * Netronome vNIC DPDK Poll-Mode Driver: Rx/Tx functions
 */

#include <ethdev_driver.h>
#include <ethdev_pci.h>

#include "nfp_common.h"
#include "nfp_rxtx.h"
#include "nfp_logs.h"
#include "nfp_ctrl.h"

static int
nfp_net_rx_fill_freelist(struct nfp_net_rxq *rxq)
{
	struct nfp_net_rx_buff *rxe = rxq->rxbufs;
	uint64_t dma_addr;
	unsigned int i;

	PMD_RX_LOG(DEBUG, "Fill Rx Freelist for %u descriptors",
		   rxq->rx_count);

	for (i = 0; i < rxq->rx_count; i++) {
		struct nfp_net_rx_desc *rxd;
		struct rte_mbuf *mbuf = rte_pktmbuf_alloc(rxq->mem_pool);

		if (mbuf == NULL) {
			PMD_DRV_LOG(ERR, "RX mbuf alloc failed queue_id=%u",
				(unsigned int)rxq->qidx);
			return -ENOMEM;
		}

		dma_addr = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(mbuf));

		rxd = &rxq->rxds[i];
		rxd->fld.dd = 0;
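		/*
		 * The freelist descriptor carries the buffer address as a
		 * 40-bit value split into a high byte and a low 32-bit word.
		 */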
		rxd->fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
		rxd->fld.dma_addr_lo = dma_addr & 0xffffffff;
		rxe[i].mbuf = mbuf;
		PMD_RX_LOG(DEBUG, "[%u]: %" PRIx64, i, dma_addr);
	}

	/* Make sure all writes are flushed before telling the hardware */
	rte_wmb();

	/* Not advertising the whole ring as the firmware gets confused if so */
	PMD_RX_LOG(DEBUG, "Increment FL write pointer by %u",
		   rxq->rx_count - 1);

	nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, rxq->rx_count - 1);

	return 0;
}

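/*
 * Populate the freelist of every configured Rx queue. This is normally done
 * once, from the device start path, after the Rx queues have been set up.
 */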
int
nfp_net_rx_freelist_setup(struct rte_eth_dev *dev)
{
	int i;

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		if (nfp_net_rx_fill_freelist(dev->data->rx_queues[i]) < 0)
			return -1;
	}
	return 0;
}

uint32_t
nfp_net_rx_queue_count(void *rx_queue)
{
	struct nfp_net_rxq *rxq;
	struct nfp_net_rx_desc *rxds;
	uint32_t idx;
	uint32_t count;

	rxq = rx_queue;

	idx = rxq->rd_p;

	count = 0;

	/*
	 * Other PMDs check the DD bit only every fourth descriptor and count
	 * all four when the first one has DD set. That is not accurate but it
	 * is good for performance. Ideally it would be done on chunks of
	 * descriptors sharing the same cache line.
	 */

	while (count < rxq->rx_count) {
		rxds = &rxq->rxds[idx];
		if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
			break;

		count++;
		idx++;

		/* Wrapping? */
		if (idx == rxq->rx_count)
			idx = 0;
	}

	return count;
}

static inline void
nfp_net_mbuf_alloc_failed(struct nfp_net_rxq *rxq)
{
	rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
}

/*
 * nfp_net_set_hash - Set mbuf hash data
 *
 * The RSS hash and hash-type are prepended to the packet data.
 * Extract and decode them and set the mbuf fields.
 */
static inline void
nfp_net_set_hash(struct nfp_net_rxq *rxq, struct nfp_net_rx_desc *rxd,
		 struct rte_mbuf *mbuf)
{
	struct nfp_net_hw *hw = rxq->hw;
	uint8_t *meta_offset;
	uint32_t meta_info;
	uint32_t hash = 0;
	uint32_t hash_type = 0;

	if (!(hw->ctrl & NFP_NET_CFG_CTRL_RSS_ANY))
		return;

	/* This is true for newer firmware versions */
	if (likely(((hw->cap & NFP_NET_CFG_CTRL_RSS2) ||
	    (NFD_CFG_MAJOR_VERSION_of(hw->ver) == 4)) &&
	     NFP_DESC_META_LEN(rxd))) {
		/*
		 * New metadata API:
		 * <----  32 bit  ----->
		 * m    field type word
		 * e     data field #2
		 * t     data field #1
		 * a     data field #0
		 * ====================
		 *    packet data
		 *
		 * The field type word contains up to 8 4-bit field types.
		 * A 4-bit field type refers to a data field word.
		 * A data field word can hold several 4-bit field types.
		 */
		meta_offset = rte_pktmbuf_mtod(mbuf, uint8_t *);
		meta_offset -= NFP_DESC_META_LEN(rxd);
		meta_info = rte_be_to_cpu_32(*(uint32_t *)meta_offset);
		meta_offset += 4;
		/* The NFP PMD only supports hash metadata */
		switch (meta_info & NFP_NET_META_FIELD_MASK) {
		case NFP_NET_META_HASH:
			/* The next field type describes the hash type */
			meta_info >>= NFP_NET_META_FIELD_SIZE;
			/* The hash value is in the data field */
			hash = rte_be_to_cpu_32(*(uint32_t *)meta_offset);
			hash_type = meta_info & NFP_NET_META_FIELD_MASK;
			break;
		default:
			/* Unsupported metadata can be a performance issue */
			return;
		}
	} else {
		if (!(rxd->rxd.flags & PCIE_DESC_RX_RSS))
			return;

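		/*
		 * Legacy firmware: the hash and hash type are stored at fixed
		 * offsets right in front of the packet data (see the
		 * NFP_HASH_OFFSET and NFP_HASH_TYPE_OFFSET macros).
		 */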
		hash = rte_be_to_cpu_32(*(uint32_t *)NFP_HASH_OFFSET);
		hash_type = rte_be_to_cpu_32(*(uint32_t *)NFP_HASH_TYPE_OFFSET);
	}

	mbuf->hash.rss = hash;
	mbuf->ol_flags |= RTE_MBUF_F_RX_RSS_HASH;

	switch (hash_type) {
	case NFP_NET_RSS_IPV4:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4;
		break;
	case NFP_NET_RSS_IPV6:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6;
		break;
	case NFP_NET_RSS_IPV6_EX:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
		break;
	case NFP_NET_RSS_IPV4_TCP:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4 | RTE_PTYPE_INNER_L4_TCP;
		break;
	case NFP_NET_RSS_IPV6_TCP:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_TCP;
		break;
	case NFP_NET_RSS_IPV4_UDP:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4 | RTE_PTYPE_INNER_L4_UDP;
		break;
	case NFP_NET_RSS_IPV6_UDP:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_UDP;
		break;
	default:
		mbuf->packet_type |= RTE_PTYPE_INNER_L4_MASK;
	}
}

/* nfp_net_rx_cksum - Set mbuf checksum flags based on RX descriptor flags */
static inline void
nfp_net_rx_cksum(struct nfp_net_rxq *rxq, struct nfp_net_rx_desc *rxd,
		 struct rte_mbuf *mb)
{
	struct nfp_net_hw *hw = rxq->hw;

	if (!(hw->ctrl & NFP_NET_CFG_CTRL_RXCSUM))
		return;

	/* If the packet is IPv4 and the IP checksum is wrong, mark it as bad */
	if (unlikely((rxd->rxd.flags & PCIE_DESC_RX_IP4_CSUM) &&
	    !(rxd->rxd.flags & PCIE_DESC_RX_IP4_CSUM_OK)))
		mb->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_BAD;
	else
		mb->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD;

	/* If the packet is neither UDP nor TCP, there is nothing more to do */
	if (!(rxd->rxd.flags & PCIE_DESC_RX_TCP_CSUM) &&
	    !(rxd->rxd.flags & PCIE_DESC_RX_UDP_CSUM))
		return;

	if (likely(rxd->rxd.flags & PCIE_DESC_RX_L4_CSUM_OK))
		mb->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_GOOD;
	else
		mb->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_BAD;
}

/*
 * RX path design:
 *
 * There are some design decisions to make:
 * 1) How to check the DD bit of the RX descriptors
 * 2) How and when to allocate new mbufs
 *
 * The current implementation checks just one DD bit per loop iteration. As
 * each descriptor is 8 bytes, it would likely pay off to check all the
 * descriptors in the same cache line at once. Tests with this change have not
 * shown any performance improvement, but it requires further investigation.
 * For example, depending on which descriptor comes next, fewer than 8
 * descriptors may share its cache line, and handling that implies extra work
 * which could be counterproductive by itself. Indeed, recent firmware changes
 * do exactly this: they write several descriptors with the DD bit set at once
 * to save PCIe bandwidth and DMA operations on the NFP.
 *
 * Mbuf allocation is done when a new packet is received. The descriptor is
 * then relinked with the new mbuf and the old one is handed to the
 * application. The main drawback of this design is that mbuf allocation is
 * more expensive than the bulk allocations offered by DPDK through
 * rte_mempool_get_bulk. From a cache point of view, allocating the mbuf as
 * early as we do now does not seem to have any benefit either; again, tests
 * with this change have not shown any improvement. Also, rte_mempool_get_bulk
 * returns all or nothing, so the implications of that allocation model would
 * need to be studied in depth.
 */

uint16_t
nfp_net_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
	struct nfp_net_rxq *rxq;
	struct nfp_net_rx_desc *rxds;
	struct nfp_net_rx_buff *rxb;
	struct nfp_net_hw *hw;
	struct rte_mbuf *mb;
	struct rte_mbuf *new_mb;
	uint16_t nb_hold;
	uint64_t dma_addr;
	int avail;

	rxq = rx_queue;
	if (unlikely(rxq == NULL)) {
		/*
		 * DPDK only checks that the queue index is lower than the
		 * number of enabled queues, but the queue also needs to have
		 * been configured.
		 */
		RTE_LOG_DP(ERR, PMD, "RX Bad queue\n");
		return 0;
	}

	hw = rxq->hw;
	avail = 0;
	nb_hold = 0;

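	/*
	 * nb_hold counts the freelist descriptors consumed in this call; they
	 * are handed back to the hardware in batches further below.
	 */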
	while (avail < nb_pkts) {
		rxb = &rxq->rxbufs[rxq->rd_p];
		if (unlikely(rxb == NULL)) {
			RTE_LOG_DP(ERR, PMD, "rxb does not exist!\n");
			break;
		}

		rxds = &rxq->rxds[rxq->rd_p];
		if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
			break;

		/*
		 * Memory barrier to ensure that the descriptor contents are
		 * not read before the DD bit has been checked.
		 */
		rte_rmb();

		/*
		 * We got a packet. Allocate a new mbuf for refilling the
		 * free descriptor ring as soon as possible.
		 */
		new_mb = rte_pktmbuf_alloc(rxq->mem_pool);
		if (unlikely(new_mb == NULL)) {
			RTE_LOG_DP(DEBUG, PMD,
			"RX mbuf alloc failed port_id=%u queue_id=%u\n",
				rxq->port_id, (unsigned int)rxq->qidx);
			nfp_net_mbuf_alloc_failed(rxq);
			break;
		}

		nb_hold++;

		/*
		 * Grab the mbuf and refill the descriptor with the
		 * previously allocated mbuf.
		 */
		mb = rxb->mbuf;
		rxb->mbuf = new_mb;

		PMD_RX_LOG(DEBUG, "Packet len: %u, mbuf_size: %u",
			   rxds->rxd.data_len, rxq->mbuf_size);

		/* Size of this segment */
		mb->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
		/* Size of the whole packet. Only one segment is supported */
		mb->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);

		if (unlikely((mb->data_len + hw->rx_offset) >
			     rxq->mbuf_size)) {
			/*
			 * This should not happen and it is the user's
			 * responsibility to avoid it, but give some info
			 * about the error and drop the packet.
			 */
			RTE_LOG_DP(ERR, PMD,
				"mbuf overflow likely due to the RX offset.\n"
				"\t\tYour mbuf size should have extra space for"
				" RX offset=%u bytes.\n"
				"\t\tCurrently you just have %u bytes available"
				" but the received packet is %u bytes long",
				hw->rx_offset,
				rxq->mbuf_size - hw->rx_offset,
				mb->data_len);
			rte_pktmbuf_free(mb);
			break;
		}

		/* Filling the received mbuf with packet info */
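		/*
		 * When the firmware reports a fixed RX offset the packet data
		 * starts at that offset inside the buffer; otherwise (dynamic
		 * offset) it follows the prepended metadata.
		 */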
		if (hw->rx_offset)
			mb->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
		else
			mb->data_off = RTE_PKTMBUF_HEADROOM +
				       NFP_DESC_META_LEN(rxds);

		/* No scatter mode supported */
		mb->nb_segs = 1;
		mb->next = NULL;

		mb->port = rxq->port_id;

		/* Checking the RSS flag */
		nfp_net_set_hash(rxq, rxds, mb);

		/* Checking the checksum flag */
		nfp_net_rx_cksum(rxq, rxds, mb);

		if ((rxds->rxd.flags & PCIE_DESC_RX_VLAN) &&
		    (hw->ctrl & NFP_NET_CFG_CTRL_RXVLAN)) {
			mb->vlan_tci = rte_cpu_to_le_32(rxds->rxd.vlan);
			mb->ol_flags |= RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED;
		}

		/* Adding the mbuf to the mbuf array passed by the app */
		rx_pkts[avail++] = mb;

		/* Now resetting and updating the descriptor */
		rxds->vals[0] = 0;
		rxds->vals[1] = 0;
		dma_addr = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(new_mb));
		rxds->fld.dd = 0;
		rxds->fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
		rxds->fld.dma_addr_lo = dma_addr & 0xffffffff;

		rxq->rd_p++;
		if (unlikely(rxq->rd_p == rxq->rx_count)) /* Wrapping? */
			rxq->rd_p = 0;
	}

	if (nb_hold == 0)
		return nb_hold;

	PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received",
		   rxq->port_id, (unsigned int)rxq->qidx, nb_hold);

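	/*
	 * Freelist credits are batched: the queue controller write pointer is
	 * only advanced once more than rx_free_thresh descriptors have been
	 * refilled, limiting the number of writes to the queue controller.
	 */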
	nb_hold += rxq->nb_rx_hold;

	/*
	 * FL descriptors need to be written before incrementing the
	 * FL queue WR pointer.
	 */
	rte_wmb();
	if (nb_hold > rxq->rx_free_thresh) {
		PMD_RX_LOG(DEBUG, "port=%u queue=%u nb_hold=%u avail=%u",
			   rxq->port_id, (unsigned int)rxq->qidx,
			   (unsigned int)nb_hold, (unsigned int)avail);
		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
		nb_hold = 0;
	}
	rxq->nb_rx_hold = nb_hold;

	return avail;
}

static void
nfp_net_rx_queue_release_mbufs(struct nfp_net_rxq *rxq)
{
	unsigned int i;

	if (rxq->rxbufs == NULL)
		return;

	for (i = 0; i < rxq->rx_count; i++) {
		if (rxq->rxbufs[i].mbuf) {
			rte_pktmbuf_free_seg(rxq->rxbufs[i].mbuf);
			rxq->rxbufs[i].mbuf = NULL;
		}
	}
}

void
nfp_net_rx_queue_release(struct rte_eth_dev *dev, uint16_t queue_idx)
{
	struct nfp_net_rxq *rxq = dev->data->rx_queues[queue_idx];

	if (rxq) {
		nfp_net_rx_queue_release_mbufs(rxq);
		rte_eth_dma_zone_free(dev, "rx_ring", queue_idx);
		rte_free(rxq->rxbufs);
		rte_free(rxq);
	}
}

void
nfp_net_reset_rx_queue(struct nfp_net_rxq *rxq)
{
	nfp_net_rx_queue_release_mbufs(rxq);
	rxq->rd_p = 0;
	rxq->nb_rx_hold = 0;
}

int
nfp_net_rx_queue_setup(struct rte_eth_dev *dev,
		       uint16_t queue_idx, uint16_t nb_desc,
		       unsigned int socket_id,
		       const struct rte_eth_rxconf *rx_conf,
		       struct rte_mempool *mp)
{
	const struct rte_memzone *tz;
	struct nfp_net_rxq *rxq;
	struct nfp_net_hw *hw;
	uint32_t rx_desc_sz;

	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	PMD_INIT_FUNC_TRACE();

	/* Validating number of descriptors */
	rx_desc_sz = nb_desc * sizeof(struct nfp_net_rx_desc);
	if (rx_desc_sz % NFP_ALIGN_RING_DESC != 0 ||
	    nb_desc > NFP_NET_MAX_RX_DESC ||
	    nb_desc < NFP_NET_MIN_RX_DESC) {
		PMD_DRV_LOG(ERR, "Wrong nb_desc value");
		return -EINVAL;
	}

	/*
	 * Free memory prior to re-allocation if needed. This is the case after
	 * calling nfp_net_stop
	 */
	if (dev->data->rx_queues[queue_idx]) {
		nfp_net_rx_queue_release(dev, queue_idx);
		dev->data->rx_queues[queue_idx] = NULL;
	}

	/* Allocating rx queue data structure */
	rxq = rte_zmalloc_socket("ethdev RX queue", sizeof(struct nfp_net_rxq),
				 RTE_CACHE_LINE_SIZE, socket_id);
	if (rxq == NULL)
		return -ENOMEM;

	dev->data->rx_queues[queue_idx] = rxq;

	/* Hw queues mapping based on firmware configuration */
	rxq->qidx = queue_idx;
	rxq->fl_qcidx = queue_idx * hw->stride_rx;
	rxq->rx_qcidx = rxq->fl_qcidx + (hw->stride_rx - 1);
	rxq->qcp_fl = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->fl_qcidx);
	rxq->qcp_rx = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->rx_qcidx);

	/*
	 * Tracking mbuf size for detecting a potential mbuf overflow due to
	 * RX offset
	 */
	rxq->mem_pool = mp;
	rxq->mbuf_size = rxq->mem_pool->elt_size;
	rxq->mbuf_size -= (sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM);
	hw->flbufsz = rxq->mbuf_size;

	rxq->rx_count = nb_desc;
	rxq->port_id = dev->data->port_id;
	rxq->rx_free_thresh = rx_conf->rx_free_thresh;
	rxq->drop_en = rx_conf->rx_drop_en;

	/*
	 * Allocate RX ring hardware descriptors. A memzone large enough to
	 * handle the maximum ring size is allocated in order to allow for
	 * resizing in later calls to the queue setup function.
	 */
	tz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx,
				   sizeof(struct nfp_net_rx_desc) *
				   NFP_NET_MAX_RX_DESC, NFP_MEMZONE_ALIGN,
				   socket_id);

	if (tz == NULL) {
		PMD_DRV_LOG(ERR, "Error allocating rx dma");
		nfp_net_rx_queue_release(dev, queue_idx);
		dev->data->rx_queues[queue_idx] = NULL;
		return -ENOMEM;
	}

	/* Saving physical and virtual addresses for the RX ring */
	rxq->dma = (uint64_t)tz->iova;
	rxq->rxds = (struct nfp_net_rx_desc *)tz->addr;

	/* mbuf pointers array for referencing mbufs linked to RX descriptors */
	rxq->rxbufs = rte_zmalloc_socket("rxq->rxbufs",
					 sizeof(*rxq->rxbufs) * nb_desc,
					 RTE_CACHE_LINE_SIZE, socket_id);
	if (rxq->rxbufs == NULL) {
		nfp_net_rx_queue_release(dev, queue_idx);
		dev->data->rx_queues[queue_idx] = NULL;
		return -ENOMEM;
	}

	PMD_RX_LOG(DEBUG, "rxbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
		   rxq->rxbufs, rxq->rxds, (unsigned long)rxq->dma);

	nfp_net_reset_rx_queue(rxq);

	rxq->hw = hw;

	/*
	 * Telling the HW about the physical address of the RX ring and number
	 * of descriptors in log2 format
	 */
	nn_cfg_writeq(hw, NFP_NET_CFG_RXR_ADDR(queue_idx), rxq->dma);
	nn_cfg_writeb(hw, NFP_NET_CFG_RXR_SZ(queue_idx), rte_log2_u32(nb_desc));

	return 0;
}

/*
 * nfp_net_tx_free_bufs - Check for descriptors with a complete
 * status
 * @txq: TX queue to work with
 * Returns number of descriptors freed
 */
static int
nfp_net_tx_free_bufs(struct nfp_net_txq *txq)
{
	uint32_t qcp_rd_p;
	int todo;

	PMD_TX_LOG(DEBUG, "queue %u. Check for descriptor with a complete"
		   " status", txq->qidx);

	/* Work out how many packets have been sent */
	qcp_rd_p = nfp_qcp_read(txq->qcp_q, NFP_QCP_READ_PTR);

	if (qcp_rd_p == txq->rd_p) {
		PMD_TX_LOG(DEBUG, "queue %u: It seems the hardware has not "
			   "sent any new packets (%u, %u)", txq->qidx,
			   qcp_rd_p, txq->rd_p);
		return 0;
	}

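	/* The hardware read pointer may have wrapped around the end of the ring */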
	if (qcp_rd_p > txq->rd_p)
		todo = qcp_rd_p - txq->rd_p;
	else
		todo = qcp_rd_p + txq->tx_count - txq->rd_p;

	PMD_TX_LOG(DEBUG, "qcp_rd_p %u, txq->rd_p: %u, todo: %d",
		   qcp_rd_p, txq->rd_p, todo);

	if (todo == 0)
		return todo;

	txq->rd_p += todo;
	if (unlikely(txq->rd_p >= txq->tx_count))
		txq->rd_p -= txq->tx_count;

	return todo;
}

static void
nfp_net_tx_queue_release_mbufs(struct nfp_net_txq *txq)
{
	unsigned int i;

	if (txq->txbufs == NULL)
		return;

	for (i = 0; i < txq->tx_count; i++) {
		if (txq->txbufs[i].mbuf) {
			rte_pktmbuf_free_seg(txq->txbufs[i].mbuf);
			txq->txbufs[i].mbuf = NULL;
		}
	}
}

void
nfp_net_tx_queue_release(struct rte_eth_dev *dev, uint16_t queue_idx)
{
	struct nfp_net_txq *txq = dev->data->tx_queues[queue_idx];

	if (txq) {
		nfp_net_tx_queue_release_mbufs(txq);
		rte_eth_dma_zone_free(dev, "tx_ring", queue_idx);
		rte_free(txq->txbufs);
		rte_free(txq);
	}
}

void
nfp_net_reset_tx_queue(struct nfp_net_txq *txq)
{
	nfp_net_tx_queue_release_mbufs(txq);
	txq->wr_p = 0;
	txq->rd_p = 0;
}

int
nfp_net_nfd3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
		       uint16_t nb_desc, unsigned int socket_id,
		       const struct rte_eth_txconf *tx_conf)
{
	const struct rte_memzone *tz;
	struct nfp_net_txq *txq;
	uint16_t tx_free_thresh;
	struct nfp_net_hw *hw;
	uint32_t tx_desc_sz;

	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	PMD_INIT_FUNC_TRACE();

	/* Validating number of descriptors */
	tx_desc_sz = nb_desc * sizeof(struct nfp_net_nfd3_tx_desc);
	if (tx_desc_sz % NFP_ALIGN_RING_DESC != 0 ||
	    nb_desc > NFP_NET_MAX_TX_DESC ||
	    nb_desc < NFP_NET_MIN_TX_DESC) {
		PMD_DRV_LOG(ERR, "Wrong nb_desc value");
		return -EINVAL;
	}

	tx_free_thresh = (uint16_t)((tx_conf->tx_free_thresh) ?
				    tx_conf->tx_free_thresh :
				    DEFAULT_TX_FREE_THRESH);

	if (tx_free_thresh > nb_desc) {
		PMD_DRV_LOG(ERR,
			"tx_free_thresh must not be greater than the number "
			"of TX descriptors. (tx_free_thresh=%u port=%d "
			"queue=%d)", (unsigned int)tx_free_thresh,
			dev->data->port_id, (int)queue_idx);
		return -EINVAL;
	}

	/*
	 * Free memory prior to re-allocation if needed. This is the case after
	 * calling nfp_net_stop
	 */
	if (dev->data->tx_queues[queue_idx]) {
		PMD_TX_LOG(DEBUG, "Freeing memory prior to re-allocation %d",
			   queue_idx);
		nfp_net_tx_queue_release(dev, queue_idx);
		dev->data->tx_queues[queue_idx] = NULL;
	}

	/* Allocating tx queue data structure */
	txq = rte_zmalloc_socket("ethdev TX queue", sizeof(struct nfp_net_txq),
				 RTE_CACHE_LINE_SIZE, socket_id);
	if (txq == NULL) {
		PMD_DRV_LOG(ERR, "Error allocating tx dma");
		return -ENOMEM;
	}

	dev->data->tx_queues[queue_idx] = txq;

	/*
	 * Allocate TX ring hardware descriptors. A memzone large enough to
	 * handle the maximum ring size is allocated in order to allow for
	 * resizing in later calls to the queue setup function.
	 */
	tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx,
				   sizeof(struct nfp_net_nfd3_tx_desc) *
				   NFP_NET_MAX_TX_DESC, NFP_MEMZONE_ALIGN,
				   socket_id);
	if (tz == NULL) {
		PMD_DRV_LOG(ERR, "Error allocating tx dma");
		nfp_net_tx_queue_release(dev, queue_idx);
		dev->data->tx_queues[queue_idx] = NULL;
		return -ENOMEM;
	}

	txq->tx_count = nb_desc;
	txq->tx_free_thresh = tx_free_thresh;
	txq->tx_pthresh = tx_conf->tx_thresh.pthresh;
	txq->tx_hthresh = tx_conf->tx_thresh.hthresh;
	txq->tx_wthresh = tx_conf->tx_thresh.wthresh;

	/* queue mapping based on firmware configuration */
	txq->qidx = queue_idx;
	txq->tx_qcidx = queue_idx * hw->stride_tx;
	txq->qcp_q = hw->tx_bar + NFP_QCP_QUEUE_OFF(txq->tx_qcidx);

	txq->port_id = dev->data->port_id;

	/* Saving physical and virtual addresses for the TX ring */
	txq->dma = (uint64_t)tz->iova;
	txq->txds = (struct nfp_net_nfd3_tx_desc *)tz->addr;

	/* mbuf pointers array for referencing mbufs linked to TX descriptors */
	txq->txbufs = rte_zmalloc_socket("txq->txbufs",
					 sizeof(*txq->txbufs) * nb_desc,
					 RTE_CACHE_LINE_SIZE, socket_id);
	if (txq->txbufs == NULL) {
		nfp_net_tx_queue_release(dev, queue_idx);
		dev->data->tx_queues[queue_idx] = NULL;
		return -ENOMEM;
	}
	PMD_TX_LOG(DEBUG, "txbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
		   txq->txbufs, txq->txds, (unsigned long)txq->dma);

	nfp_net_reset_tx_queue(txq);

	txq->hw = hw;

	/*
	 * Telling the HW about the physical address of the TX ring and number
	 * of descriptors in log2 format
	 */
	nn_cfg_writeq(hw, NFP_NET_CFG_TXR_ADDR(queue_idx), txq->dma);
	nn_cfg_writeb(hw, NFP_NET_CFG_TXR_SZ(queue_idx), rte_log2_u32(nb_desc));

	return 0;
}

/*
 * Always keep a few descriptors (8) unused so a full ring can never look
 * like an empty one (wr_p == rd_p stays unambiguous).
 */
static inline
uint32_t nfp_net_nfd3_free_tx_desc(struct nfp_net_txq *txq)
{
	if (txq->wr_p >= txq->rd_p)
		return txq->tx_count - (txq->wr_p - txq->rd_p) - 8;
	else
		return txq->rd_p - txq->wr_p - 8;
}

/*
 * nfp_net_txq_full - Check if the number of free TX descriptors
 * is below tx_free_threshold
 *
 * @txq: TX queue to check
 *
 * This function uses the host copies of the read/write pointers.
 */
static inline
uint32_t nfp_net_nfd3_txq_full(struct nfp_net_txq *txq)
{
	return (nfp_net_nfd3_free_tx_desc(txq) < txq->tx_free_thresh);
}

/* nfp_net_tx_tso - Set TX descriptor for TSO */
static inline void
nfp_net_nfd3_tx_tso(struct nfp_net_txq *txq, struct nfp_net_nfd3_tx_desc *txd,
	       struct rte_mbuf *mb)
{
	uint64_t ol_flags;
	struct nfp_net_hw *hw = txq->hw;

	if (!(hw->cap & NFP_NET_CFG_CTRL_LSO_ANY))
		goto clean_txd;

	ol_flags = mb->ol_flags;

	if (!(ol_flags & RTE_MBUF_F_TX_TCP_SEG))
		goto clean_txd;

	txd->l3_offset = mb->l2_len;
	txd->l4_offset = mb->l2_len + mb->l3_len;
	txd->lso_hdrlen = mb->l2_len + mb->l3_len + mb->l4_len;
	txd->mss = rte_cpu_to_le_16(mb->tso_segsz);
	txd->flags = PCIE_DESC_TX_LSO;
	return;

clean_txd:
	txd->flags = 0;
	txd->l3_offset = 0;
	txd->l4_offset = 0;
	txd->lso_hdrlen = 0;
	txd->mss = 0;
}

/* nfp_net_tx_cksum - Set TX CSUM offload flags in TX descriptor */
static inline void
nfp_net_nfd3_tx_cksum(struct nfp_net_txq *txq, struct nfp_net_nfd3_tx_desc *txd,
		 struct rte_mbuf *mb)
{
	uint64_t ol_flags;
	struct nfp_net_hw *hw = txq->hw;

	if (!(hw->cap & NFP_NET_CFG_CTRL_TXCSUM))
		return;

	ol_flags = mb->ol_flags;

	/* Set IPv4 checksum offload if requested; IPv6 has no header checksum */
	if (ol_flags & RTE_MBUF_F_TX_IP_CKSUM)
		txd->flags |= PCIE_DESC_TX_IP4_CSUM;

	switch (ol_flags & RTE_MBUF_F_TX_L4_MASK) {
	case RTE_MBUF_F_TX_UDP_CKSUM:
		txd->flags |= PCIE_DESC_TX_UDP_CSUM;
		break;
	case RTE_MBUF_F_TX_TCP_CKSUM:
		txd->flags |= PCIE_DESC_TX_TCP_CSUM;
		break;
	}

	if (ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK))
		txd->flags |= PCIE_DESC_TX_CSUM;
}

uint16_t
nfp_net_nfd3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	struct nfp_net_txq *txq;
	struct nfp_net_hw *hw;
	struct nfp_net_nfd3_tx_desc *txds, txd;
	struct rte_mbuf *pkt;
	uint64_t dma_addr;
	int pkt_size, dma_size;
	uint16_t free_descs, issued_descs;
	struct rte_mbuf **lmbuf;
	int i;

	txq = tx_queue;
	hw = txq->hw;
	txds = &txq->txds[txq->wr_p];

	PMD_TX_LOG(DEBUG, "Working on queue %u at position %d with %u packets",
		   txq->qidx, txq->wr_p, nb_pkts);

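	/*
	 * Completed descriptors are reclaimed only when the ring might run
	 * short, which keeps the queue controller read off the common path.
	 */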
	if ((nfp_net_nfd3_free_tx_desc(txq) < nb_pkts) || (nfp_net_nfd3_txq_full(txq)))
		nfp_net_tx_free_bufs(txq);

	free_descs = (uint16_t)nfp_net_nfd3_free_tx_desc(txq);
	if (unlikely(free_descs == 0))
		return 0;

	pkt = *tx_pkts;

	i = 0;
	issued_descs = 0;
	PMD_TX_LOG(DEBUG, "queue: %u. Sending %u packets",
		   txq->qidx, nb_pkts);
	/* Sending packets */
	while ((i < nb_pkts) && free_descs) {
		/* Grabbing the mbuf linked to the current descriptor */
		lmbuf = &txq->txbufs[txq->wr_p].mbuf;
		/* Warming the cache for releasing the mbuf later on */
		RTE_MBUF_PREFETCH_TO_FREE(*lmbuf);

		pkt = *(tx_pkts + i);

		if (unlikely(pkt->nb_segs > 1 &&
			     !(hw->cap & NFP_NET_CFG_CTRL_GATHER))) {
			PMD_INIT_LOG(INFO, "NFP_NET_CFG_CTRL_GATHER not set");
			rte_panic("Multisegment packet unsupported\n");
		}

		/* Checking if we have enough descriptors */
		if (unlikely(pkt->nb_segs > free_descs))
			goto xmit_end;

		/*
		 * Checksum and VLAN flags are only needed in the first
		 * descriptor of a multisegment packet, but the TSO info has
		 * to be present in all of them.
		 */
		txd.data_len = pkt->pkt_len;
		nfp_net_nfd3_tx_tso(txq, &txd, pkt);
		nfp_net_nfd3_tx_cksum(txq, &txd, pkt);

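		/*
		 * txd acts as a per-packet template; it is copied into every
		 * hardware descriptor used by this packet in the segment loop
		 * below.
		 */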
		if ((pkt->ol_flags & RTE_MBUF_F_TX_VLAN) &&
		    (hw->cap & NFP_NET_CFG_CTRL_TXVLAN)) {
			txd.flags |= PCIE_DESC_TX_VLAN;
			txd.vlan = pkt->vlan_tci;
		}

		/*
		 * mbuf data_len is the data in one segment and pkt_len is
		 * the data in the whole packet. When the packet is just one
		 * segment, data_len = pkt_len.
		 */
		pkt_size = pkt->pkt_len;

		while (pkt) {
			/* Copying TSO, VLAN and cksum info */
			*txds = txd;

			/* Releasing mbuf used by this descriptor previously */
			if (*lmbuf)
				rte_pktmbuf_free_seg(*lmbuf);

			/*
			 * Linking mbuf with descriptor for being released
			 * next time descriptor is used.
			 */
			*lmbuf = pkt;

			dma_size = pkt->data_len;
			dma_addr = rte_mbuf_data_iova(pkt);
			PMD_TX_LOG(DEBUG, "Working with mbuf at dma address:"
				   "%" PRIx64 "", dma_addr);

			/* Filling descriptors fields */
			txds->dma_len = dma_size;
			txds->data_len = txd.data_len;
			txds->dma_addr_hi = (dma_addr >> 32) & 0xff;
			txds->dma_addr_lo = (dma_addr & 0xffffffff);
			ASSERT(free_descs > 0);
			free_descs--;

			txq->wr_p++;
			if (unlikely(txq->wr_p == txq->tx_count)) /* Wrapping? */
				txq->wr_p = 0;

			pkt_size -= dma_size;

			/*
			 * Set EOP on the descriptor carrying the last segment
			 * of the packet; single-segment packets are the
			 * likely case.
			 */
			if (likely(!pkt_size))
				txds->offset_eop = PCIE_DESC_TX_EOP;
			else
				txds->offset_eop = 0;

			pkt = pkt->next;
			/* Referencing next free TX descriptor */
			txds = &txq->txds[txq->wr_p];
			lmbuf = &txq->txbufs[txq->wr_p].mbuf;
			issued_descs++;
		}
		i++;
	}

xmit_end:
	/* Increment write pointers. Force memory write before we let HW know */
	rte_wmb();
	nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, issued_descs);

	return i;
}

int
nfp_net_nfdk_tx_queue_setup(struct rte_eth_dev *dev,
		uint16_t queue_idx,
		uint16_t nb_desc,
		unsigned int socket_id,
		const struct rte_eth_txconf *tx_conf)
{
	const struct rte_memzone *tz;
	struct nfp_net_txq *txq;
	uint16_t tx_free_thresh;
	struct nfp_net_hw *hw;
	uint32_t tx_desc_sz;

	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	PMD_INIT_FUNC_TRACE();

	/* Validating number of descriptors */
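	/*
	 * A simple packet uses NFDK_TX_DESC_PER_SIMPLE_PKT hardware
	 * descriptors and descriptors are handled in blocks of
	 * NFDK_TX_DESC_BLOCK_CNT, which is what the extra modulo checks
	 * below enforce.
	 */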
	tx_desc_sz = nb_desc * sizeof(struct nfp_net_nfdk_tx_desc);
	if (((NFDK_TX_DESC_PER_SIMPLE_PKT * tx_desc_sz) % NFP_ALIGN_RING_DESC) != 0 ||
	    ((NFDK_TX_DESC_PER_SIMPLE_PKT * nb_desc) % NFDK_TX_DESC_BLOCK_CNT) != 0 ||
	      nb_desc > NFP_NET_MAX_TX_DESC || nb_desc < NFP_NET_MIN_TX_DESC) {
		PMD_DRV_LOG(ERR, "Wrong nb_desc value");
		return -EINVAL;
	}

	tx_free_thresh = (uint16_t)((tx_conf->tx_free_thresh) ?
				tx_conf->tx_free_thresh :
				DEFAULT_TX_FREE_THRESH);

	if (tx_free_thresh > nb_desc) {
		PMD_DRV_LOG(ERR,
			"tx_free_thresh must not be greater than the number "
			"of TX descriptors. (tx_free_thresh=%u port=%d "
			"queue=%d)", (unsigned int)tx_free_thresh,
			dev->data->port_id, (int)queue_idx);
		return -EINVAL;
	}

	/*
	 * Free memory prior to re-allocation if needed. This is the case after
	 * calling nfp_net_stop
	 */
	if (dev->data->tx_queues[queue_idx]) {
		PMD_TX_LOG(DEBUG, "Freeing memory prior to re-allocation %d",
				queue_idx);
		nfp_net_tx_queue_release(dev, queue_idx);
		dev->data->tx_queues[queue_idx] = NULL;
	}

	/* Allocating tx queue data structure */
	txq = rte_zmalloc_socket("ethdev TX queue", sizeof(struct nfp_net_txq),
			RTE_CACHE_LINE_SIZE, socket_id);
	if (txq == NULL) {
		PMD_DRV_LOG(ERR, "Error allocating tx dma");
		return -ENOMEM;
	}

	/*
	 * Allocate TX ring hardware descriptors. A memzone large enough to
	 * handle the maximum ring size is allocated in order to allow for
	 * resizing in later calls to the queue setup function.
	 */
	tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx,
				sizeof(struct nfp_net_nfdk_tx_desc) *
				NFDK_TX_DESC_PER_SIMPLE_PKT *
				NFP_NET_MAX_TX_DESC, NFP_MEMZONE_ALIGN,
				socket_id);
	if (tz == NULL) {
		PMD_DRV_LOG(ERR, "Error allocating tx dma");
		nfp_net_tx_queue_release(dev, queue_idx);
		return -ENOMEM;
	}

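	/*
	 * The ring is sized in hardware descriptors rather than packets:
	 * every simple packet consumes NFDK_TX_DESC_PER_SIMPLE_PKT of them.
	 */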
	txq->tx_count = nb_desc * NFDK_TX_DESC_PER_SIMPLE_PKT;
	txq->tx_free_thresh = tx_free_thresh;
	txq->tx_pthresh = tx_conf->tx_thresh.pthresh;
	txq->tx_hthresh = tx_conf->tx_thresh.hthresh;
	txq->tx_wthresh = tx_conf->tx_thresh.wthresh;

	/* queue mapping based on firmware configuration */
	txq->qidx = queue_idx;
	txq->tx_qcidx = queue_idx * hw->stride_tx;
	txq->qcp_q = hw->tx_bar + NFP_QCP_QUEUE_OFF(txq->tx_qcidx);

	txq->port_id = dev->data->port_id;

	/* Saving physical and virtual addresses for the TX ring */
	txq->dma = (uint64_t)tz->iova;
	txq->ktxds = (struct nfp_net_nfdk_tx_desc *)tz->addr;

	/* mbuf pointers array for referencing mbufs linked to TX descriptors */
	txq->txbufs = rte_zmalloc_socket("txq->txbufs",
				sizeof(*txq->txbufs) * txq->tx_count,
				RTE_CACHE_LINE_SIZE, socket_id);

	if (txq->txbufs == NULL) {
		nfp_net_tx_queue_release(dev, queue_idx);
		return -ENOMEM;
	}
	PMD_TX_LOG(DEBUG, "txbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
		txq->txbufs, txq->ktxds, (unsigned long)txq->dma);

	nfp_net_reset_tx_queue(txq);

	dev->data->tx_queues[queue_idx] = txq;
	txq->hw = hw;
	/*
	 * Telling the HW about the physical address of the TX ring and number
	 * of descriptors in log2 format
	 */
	nn_cfg_writeq(hw, NFP_NET_CFG_TXR_ADDR(queue_idx), txq->dma);
	nn_cfg_writeb(hw, NFP_NET_CFG_TXR_SZ(queue_idx), rte_log2_u32(txq->tx_count));

	return 0;
}