/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2024 Corigine, Inc.
 * All rights reserved.
 */

#include <stdbool.h>

#include <bus_pci_driver.h>
#include <ethdev_driver.h>
#include <rte_cpuflags.h>
#include <rte_vect.h>

#include "nfp_logs.h"
#include "nfp_net_common.h"
#include "nfp_net_meta.h"
#include "nfp_rxtx_vec.h"

bool
nfp_net_get_avx2_supported(void)
{
	if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
			rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
		return true;

	return false;
}

static inline void
nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf *rxb)
{
	__m128i dma;
	__m128i dma_hi;
	__m128i vaddr0;
	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);

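	/*
	 * Refill one freelist descriptor: load the mbuf buffer address, add
	 * the headroom, then swap the 32-bit halves so the high word of the
	 * DMA address lands in the first descriptor word (dma_addr_hi) and
	 * the low word in the second (dma_addr_lo).
	 */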
	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);

	_mm_storel_epi64((void *)rxds, vaddr0);

	rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
}

static inline void
nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf **rxb)
{
	__m128i dma;
	__m128i dma_hi;
	__m128i vaddr0;
	__m128i vaddr1;
	__m128i vaddr2;
	__m128i vaddr3;
	__m128i vaddr0_1;
	__m128i vaddr2_3;
	__m256i vaddr0_3;
	__m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);

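	/*
	 * Build the same {dma_addr_hi, dma_addr_lo} word pair as the
	 * single-descriptor path, once for each of the four mbufs.
	 */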
	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);

	dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), hdr_room);
	dma_hi = _mm_srli_epi64(dma, 32);
	vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);

	vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
	vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);

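	/*
	 * Merge the four 8-byte descriptors into one 256-bit value and write
	 * them with a single aligned store; the caller only takes this path
	 * when rxds sits on a 32-byte boundary.
	 */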
	vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
			vaddr2_3, 1);

	_mm256_store_si256((void *)rxds, vaddr0_3);

	rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
}

static inline void
nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf *rx_pkt)
{
	struct nfp_net_hw *hw = rxq->hw;
	struct nfp_net_meta_parsed meta;

	rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
	/* Size of the whole packet. We only support 1 segment */
	rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);

	/* Filling the received mbuf with packet info */
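	/*
	 * The packet data starts either at the fixed offset advertised by the
	 * firmware (hw->rx_offset) or, with a dynamic offset, right after the
	 * per-packet metadata prepended by the NIC.
	 */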
	if (hw->rx_offset)
		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
	else
		rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + NFP_DESC_META_LEN(rxds);

	rx_pkt->port = rxq->port_id;
	rx_pkt->nb_segs = 1;
	rx_pkt->next = NULL;

	nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);

	nfp_net_parse_ptype(rxq, rxds, rx_pkt);

	/* Checking the checksum flag */
	nfp_net_rx_cksum(rxq, rxds, rx_pkt);
}

static inline int
nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf **rxb,
		struct rte_mbuf *rx_pkt)
{
	/* Allocate a new mbuf into the software ring. */
	if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
		PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u queue_id=%hu.",
				rxq->port_id, rxq->qidx);
		nfp_net_mbuf_alloc_failed(rxq);
		return -ENOMEM;
	}

	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);

	nfp_vec_avx2_recv_set_des1(rxq, rxds, *rxb);

	return 0;
}

static inline int
nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxds,
		struct rte_mbuf **rxb,
		struct rte_mbuf **rx_pkts)
{
	/* Allocate 4 new mbufs into the software ring. */
	if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
		PMD_RX_LOG(DEBUG, "RX mbuf bulk alloc failed port_id=%u queue_id=%hu.",
				rxq->port_id, rxq->qidx);
		return -ENOMEM;
	}

	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkts[0]);
	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 1, rx_pkts[1]);
	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 2, rx_pkts[2]);
	nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 3, rx_pkts[3]);

	nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);

	return 0;
}

static inline bool
nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
{
	__m256i data = _mm256_loadu_si256((void *)rxds);

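	/*
	 * Byte 3 of each 8-byte descriptor carries the DD (done) flag;
	 * all four descriptors must be done before a burst of 4 is read.
	 */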
	if ((_mm256_extract_epi8(data, 3) & PCIE_DESC_RX_DD) == 0 ||
			(_mm256_extract_epi8(data, 11) & PCIE_DESC_RX_DD) == 0 ||
			(_mm256_extract_epi8(data, 19) & PCIE_DESC_RX_DD) == 0 ||
			(_mm256_extract_epi8(data, 27) & PCIE_DESC_RX_DD) == 0)
		return false;

	return true;
}

uint16_t
nfp_net_vec_avx2_recv_pkts(void *rx_queue,
		struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	uint16_t avail;
	uint16_t nb_hold;
	bool burst_receive;
	struct rte_mbuf **rxb;
	struct nfp_net_rx_desc *rxds;
	struct nfp_net_rxq *rxq = rx_queue;

	if (unlikely(rxq == NULL)) {
		PMD_RX_LOG(ERR, "RX Bad queue.");
		return 0;
	}

	avail = 0;
	nb_hold = 0;
	burst_receive = true;
	while (avail < nb_pkts) {
		rxds = &rxq->rxds[rxq->rd_p];
		rxb = &rxq->rxbufs[rxq->rd_p].mbuf;

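		/* Stop when the device has not yet written back the next descriptor. */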
		if ((_mm_extract_epi8(_mm_loadu_si128((void *)(rxds)), 3)
				& PCIE_DESC_RX_DD) == 0)
			goto recv_end;

		rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);

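		/* Every few entries, prefetch the upcoming descriptors and software-ring mbufs. */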
		if ((rxq->rd_p & 0x3) == 0) {
			rte_prefetch0(&rxq->rxds[rxq->rd_p]);
			rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
		}

		if ((rxq->rd_p & 0x7) == 0) {
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
			rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
		}

		/*
		 * Receive a single packet instead of a burst of 4 when:
		 * 1. The Rx ring is about to wrap around to its end.
		 * 2. Fewer than 4 packets are still requested.
		 * 3. The descriptor pointer is not aligned on a 32-byte boundary.
		 * 4. The previous burst found fewer than 4 ready packets or
		 *    failed to allocate 4 mbufs.
		 */
		if ((rxq->rx_count - rxq->rd_p) < 4 ||
				(nb_pkts - avail) < 4 ||
				((uintptr_t)rxds & 0x1F) != 0 ||
				!burst_receive) {
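			/* Hand the mbuf currently in the software ring to the caller. */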
			_mm_storel_epi64((void *)&rx_pkts[avail],
					_mm_loadu_si128((void *)rxb));

			if (nfp_vec_avx2_recv1(rxq, rxds, rxb, rx_pkts[avail]) != 0)
				goto recv_end;

			avail++;
			nb_hold++;
			continue;
		}

		burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
		if (!burst_receive)
			continue;

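		/* Copy the 4 software-ring mbuf pointers to the caller's array in one store. */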
		_mm256_storeu_si256((void *)&rx_pkts[avail],
				_mm256_loadu_si256((void *)rxb));

		if (nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]) != 0) {
			burst_receive = false;
			continue;
		}

		avail += 4;
		nb_hold += 4;
	}

recv_end:
	if (nb_hold == 0)
		return nb_hold;

	PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received.",
			rxq->port_id, (unsigned int)rxq->qidx, nb_hold);

	nb_hold += rxq->nb_rx_hold;

	/*
	 * FL descriptors need to be written before incrementing the
	 * FL queue WR pointer.
	 */
	rte_wmb();
	if (nb_hold > rxq->rx_free_thresh) {
		PMD_RX_LOG(DEBUG, "The port=%hu queue=%hu nb_hold=%hu avail=%hu.",
				rxq->port_id, rxq->qidx, nb_hold, avail);
		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
		nb_hold = 0;
	}
	rxq->nb_rx_hold = nb_hold;

	return avail;
}
285