/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2024 Corigine, Inc.
 * All rights reserved.
 */

#include <stdbool.h>

#include <bus_pci_driver.h>
#include <ethdev_driver.h>
#include <rte_cpuflags.h>
#include <rte_vect.h>

#include "nfp_logs.h"
#include "nfp_net_common.h"
#include "nfp_net_meta.h"
#include "nfp_rxtx_vec.h"

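/*
 * Report whether the AVX2 datapath may be used: both the EAL maximum
 * SIMD bitwidth (--force-max-simd-bitwidth) and the runtime CPU flags
 * must allow 256-bit vectors. Typically consulted when the driver
 * chooses between the scalar and vector Rx/Tx burst functions.
 */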
bool
nfp_net_get_avx2_supported(void)
{
	if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
		return true;

	return false;
}

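/*
 * Refill one freelist descriptor with the newly allocated mbuf.
 *
 * The 16-byte load starting at &rxb->buf_addr picks up the buffer
 * address, RTE_PKTMBUF_HEADROOM is added, and the 32-bit halves are
 * swapped so the high word lands in the first descriptor word and the
 * low word in the second; the low 64 bits are then written with a
 * single 8-byte store. Only the virtual address ends up in the
 * descriptor, which presumes the PMD runs with IOVA-as-VA. The read
 * pointer wraps with a mask, which assumes rx_count is a power of two.
 *
 * A rough scalar sketch of the same store (illustrative only; the
 * field names are assumed from the freelist descriptor layout):
 *
 *	uint64_t dma_addr = (uintptr_t)rxb->buf_addr + RTE_PKTMBUF_HEADROOM;
 *
 *	rxds->fld.dma_addr_hi = (dma_addr >> 32) & 0xffff;
 *	rxds->fld.dma_addr_lo = dma_addr & 0xffffffff;
 */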
static inline void
nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf *rxb)
{
	__m128i dma;
	__m128i dma_hi;
	__m128i vaddr0;
	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);

	_mm_storel_epi64((void *)rxds, vaddr0);

	rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
}

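/*
 * Same as nfp_vec_avx2_recv_set_des1() but for four consecutive
 * descriptors: the four 64-bit descriptor words are merged into one
 * 256-bit register and written with a single aligned 32-byte store,
 * so the caller must pass a 32-byte aligned rxds pointer (checked in
 * the burst loop below).
 */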
static inline void
nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf **rxb)
{
	__m128i dma;
	__m128i dma_hi;
	__m128i vaddr0;
	__m128i vaddr1;
	__m128i vaddr2;
	__m128i vaddr3;
	__m128i vaddr0_1;
	__m128i vaddr2_3;
	__m256i vaddr0_3;
	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);

	vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
	vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);

	vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
			vaddr2_3, 1);

	_mm256_store_si256((void *)rxds, vaddr0_3);

	rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
}

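/*
 * Fill one mbuf from a completed Rx descriptor: the packet length
 * excludes the metadata prepended by the NIC, data_off skips the
 * headroom plus either the fixed rx_offset or the dynamic metadata
 * length, and the metadata, packet type and checksum flags are then
 * parsed into the mbuf.
 */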
static inline void
nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf *rx_pkt)
{
	struct nfp_net_hw *hw = rxq->hw;
	struct nfp_net_meta_parsed meta;

	rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
	/* Size of the whole packet. Only one segment is supported. */
	rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);

	/* Fill the received mbuf with packet info. */
	if (hw->rx_offset)
		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
	else
		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);

	rx_pkt->port = rxq->port_id;
	rx_pkt->nb_segs = 1;
	rx_pkt->next = NULL;

	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);

	nfp_net_parse_ptype(rxq, rxds, rx_pkt);

	/* Check the checksum flags. */
	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
}

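/*
 * Receive a single packet: allocate one replacement mbuf for the
 * software ring; on failure bump the queue's alloc-failed counter and
 * return -ENOMEM so the caller stops the burst. Otherwise fill the
 * returned mbuf from the descriptor and rewrite the freelist entry
 * with the new buffer.
 */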
static inline int
nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf **rxb,
		struct rte_mbuf *rx_pkt)
{
	/* Allocate a new mbuf into the software ring. */
	if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
		PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu.",
				rxq->port_id, rxq->qidx);
		nfp_net_mbuf_alloc_failed(rxq);
		return -ENOMEM;
	}

	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);

	nfp_vec_avx2_recv_set_des1(rxq, rxds, *rxb);

	return 0;
}

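/*
 * Receive a burst of four packets at once. On allocation failure the
 * descriptors are left untouched and -ENOMEM is returned; the burst
 * loop then falls back to the single-packet path for these
 * descriptors.
 */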
static inline int
nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf **rxb,
		struct rte_mbuf **rx_pkts)
{
	/* Allocate 4 new mbufs into the software ring. */
	if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
		PMD_RX_LOG(DEBUG, "RX mbuf bulk alloc failed port_id=%u queue_id=%hu.",
				rxq->port_id, rxq->qidx);
		return -ENOMEM;
	}

	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkts[0]);
	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 1, rx_pkts[1]);
	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 2, rx_pkts[2]);
	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 3, rx_pkts[3]);

	nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);

	return 0;
}

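/*
 * Check the DD (descriptor done) bit of four consecutive 8-byte Rx
 * descriptors with one 32-byte load; byte offsets 3, 11, 19 and 27
 * are the flag bytes of the four descriptors.
 */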
static inline bool
nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
{
	__m256i data = _mm256_loadu_si256((void *)rxds);

	if ((_mm256_extract_epi8(data, 3) & PCIE_DESC_RX_DD) == 0 ||
			(_mm256_extract_epi8(data, 11) & PCIE_DESC_RX_DD) == 0 ||
			(_mm256_extract_epi8(data, 19) & PCIE_DESC_RX_DD) == 0 ||
			(_mm256_extract_epi8(data, 27) & PCIE_DESC_RX_DD) == 0)
		return false;

	return true;
}

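/*
 * AVX2 Rx burst function. Descriptors are consumed four at a time
 * whenever possible (ring not about to wrap, at least four packets
 * requested, descriptor pointer 32-byte aligned and the previous
 * burst succeeded), otherwise one at a time. Completed buffers are
 * handed out and replaced in the software ring; the freelist write
 * pointer is only advanced once more than rx_free_thresh descriptors
 * are held back, after a write barrier. Intended to be installed as
 * the ethdev rx_pkt_burst callback when nfp_net_get_avx2_supported()
 * returns true.
 */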
uint16_t
nfp_net_vec_avx2_recv_pkts(void *rx_queue,
		struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	uint16_t avail;
	uint16_t nb_hold;
	bool burst_receive;
	struct rte_mbuf **rxb;
	struct nfp_net_rx_desc *rxds;
	struct nfp_net_rxq *rxq = rx_queue;

	if (unlikely(rxq == NULL)) {
		PMD_RX_LOG(ERR, "RX Bad queue.");
		return 0;
	}

	avail = 0;
	nb_hold = 0;
	burst_receive = true;
	while (avail < nb_pkts) {
		rxds = &rxq->rxds[rxq->rd_p];
		rxb = &rxq->rxbufs[rxq->rd_p].mbuf;

		if ((_mm_extract_epi8(_mm_loadu_si128((void *)(rxds)), 3)
				& PCIE_DESC_RX_DD) == 0)
			goto recv_end;

		rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);

		if ((rxq->rd_p & 0x3) == 0) {
			rte_prefetch0(&rxq->rxds[rxq->rd_p]);
			rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
		}

		if ((rxq->rd_p & 0x7) == 0) {
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
		}
		/*
		 * Fall back to receiving a single packet when a burst of 4
		 * cannot be done:
		 * 1. The Rx ring is about to wrap around.
		 * 2. Fewer than 4 packets were requested.
		 * 3. The descriptor pointer is not aligned on a 32-byte boundary.
		 * 4. The previous burst found fewer than 4 ready packets or
		 *    failed to allocate 4 mbufs.
		 */
		if ((rxq->rx_count - rxq->rd_p) < 4 ||
				(nb_pkts - avail) < 4 ||
				((uintptr_t)rxds & 0x1F) != 0 ||
				!burst_receive) {
			_mm_storel_epi64((void *)&rx_pkts[avail],
					_mm_loadu_si128((void *)rxb));

			if (nfp_vec_avx2_recv1(rxq, rxds, rxb, rx_pkts[avail]) != 0)
				goto recv_end;

			avail++;
			nb_hold++;
			continue;
		}

		burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
		if (!burst_receive)
			continue;

		_mm256_storeu_si256((void *)&rx_pkts[avail],
				_mm256_loadu_si256((void *)rxb));

		if (nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]) != 0) {
			burst_receive = false;
			continue;
		}

		avail += 4;
		nb_hold += 4;
	}

recv_end:
	if (nb_hold == 0)
		return nb_hold;

	PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received.",
			rxq->port_id, (unsigned int)rxq->qidx, nb_hold);

	nb_hold += rxq->nb_rx_hold;

	/*
	 * FL descriptors need to be written before incrementing the
	 * FL queue write pointer.
	 */
	rte_wmb();
	if (nb_hold > rxq->rx_free_thresh) {
		PMD_RX_LOG(DEBUG, "The port=%hu queue=%hu nb_hold=%hu avail=%hu.",
				rxq->port_id, rxq->qidx, nb_hold, avail);
		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
		nb_hold = 0;
	}
	rxq->nb_rx_hold = nb_hold;

	return avail;
}