/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Hisilicon Limited.
 */

#ifndef _HNS3_RXTX_VEC_NEON_H_
#define _HNS3_RXTX_VEC_NEON_H_

#include <arm_neon.h>

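/*
 * The NEON load/store intrinsics take plain pointers, so the casts in
 * this file drop the volatile qualifier from the descriptor rings;
 * suppress the resulting -Wcast-qual warnings for the whole header.
 */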
#pragma GCC diagnostic ignored "-Wcast-qual"

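/*
 * Fill one TX buffer descriptor with two 16-byte NEON stores: the first
 * carries the DMA address and the send size, the second zeroes the
 * middle words and sets the default VLD/FE/BD-type flags in the
 * descriptor's last 32 bits.
 */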
static inline void
hns3_vec_tx(volatile struct hns3_desc *desc, struct rte_mbuf *pkt)
{
	uint64x2_t val1 = {
		pkt->buf_iova + pkt->data_off,
		((uint64_t)pkt->data_len) << HNS3_TXD_SEND_SIZE_SHIFT
	};
	uint64x2_t val2 = {
		0,
		((uint64_t)HNS3_TXD_DEFAULT_VLD_FE_BDTYPE) << HNS3_UINT32_BIT
	};
	vst1q_u64((uint64_t *)&desc->addr, val1);
	vst1q_u64((uint64_t *)&desc->tx.outer_vlan_tag, val2);
}
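
/*
 * Image of the 32-byte TX BD written above (a sketch; it assumes
 * tx.outer_vlan_tag starts at byte 16 of the descriptor):
 *
 *   bytes  0..7 : buf_iova + data_off                   (val1, lane 0)
 *   bytes  8..15: data_len << HNS3_TXD_SEND_SIZE_SHIFT  (val1, lane 1)
 *   bytes 16..23: 0                                     (val2, lane 0)
 *   bytes 24..31: VLD/FE/BD-type flags in the high word (val2, lane 1)
 */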

static uint16_t
hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
			  struct rte_mbuf **__restrict tx_pkts,
			  uint16_t nb_pkts)
{
	struct hns3_tx_queue *txq = (struct hns3_tx_queue *)tx_queue;
	volatile struct hns3_desc *tx_desc;
	struct hns3_entry *tx_entry;
	uint16_t next_to_use;
	uint16_t nb_commit;
	uint16_t nb_tx;
	uint16_t n, i;

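	/*
	 * Reclaim the mbufs of BDs the hardware has finished with once the
	 * number of ready BDs drops below the free threshold, then cap the
	 * burst at whatever is available; an empty window is counted as a
	 * queue-full event.
	 */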
	if (txq->tx_bd_ready < txq->tx_free_thresh)
		hns3_tx_free_buffers(txq);

	nb_commit = RTE_MIN(txq->tx_bd_ready, nb_pkts);
	if (unlikely(nb_commit == 0)) {
		txq->dfx_stats.queue_full_cnt++;
		return 0;
	}
	nb_tx = nb_commit;

	next_to_use = txq->next_to_use;
	tx_desc = &txq->tx_ring[next_to_use];
	tx_entry = &txq->sw_ring[next_to_use];

	/*
	 * If nb_commit crosses the end of tx_ring and sw_ring, handle the
	 * n = txq->nb_tx_desc - next_to_use descriptors up to the ring end
	 * first and then wrap around to the head; splitting the work this
	 * way keeps a per-descriptor wrap check out of the copy loops.
	 */
	n = txq->nb_tx_desc - next_to_use;
	if (nb_commit >= n) {
		for (i = 0; i < n; i++, tx_pkts++, tx_desc++) {
			hns3_vec_tx(tx_desc, *tx_pkts);
			tx_entry[i].mbuf = *tx_pkts;

			/* Increment bytes counter */
			txq->basic_stats.bytes += (*tx_pkts)->pkt_len;
		}

		nb_commit -= n;
		next_to_use = 0;
		tx_desc = &txq->tx_ring[next_to_use];
		tx_entry = &txq->sw_ring[next_to_use];
	}

	for (i = 0; i < nb_commit; i++, tx_pkts++, tx_desc++) {
		hns3_vec_tx(tx_desc, *tx_pkts);
		tx_entry[i].mbuf = *tx_pkts;

		/* Increment bytes counter */
		txq->basic_stats.bytes += (*tx_pkts)->pkt_len;
	}

	next_to_use += nb_commit;
	txq->next_to_use = next_to_use;
	txq->tx_bd_ready -= nb_tx;

	hns3_write_reg_opt(txq->io_tail_reg, nb_tx);

	return nb_tx;
}
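
/*
 * A sketch (not part of this file) of how a caller might drive
 * hns3_xmit_fixed_burst_vec() for bursts larger than the free BD count;
 * the chunk macro HNS3_DEFAULT_TX_BURST is assumed for illustration:
 *
 *	uint16_t
 *	xmit_pkts_vec_sketch(void *txq, struct rte_mbuf **pkts, uint16_t n)
 *	{
 *		uint16_t sent = 0;
 *
 *		while (n > 0) {
 *			uint16_t num = RTE_MIN(n, HNS3_DEFAULT_TX_BURST);
 *			uint16_t ret;
 *
 *			ret = hns3_xmit_fixed_burst_vec(txq, &pkts[sent], num);
 *			sent += ret;
 *			n -= ret;
 *			if (ret < num)
 *				break;  (ring full: stop early)
 *		}
 *		return sent;
 *	}
 */

/*
 * Post-process bd_vld_num packets: run the per-BD error checks, resolve
 * packet type and checksum flags, and account received bytes. Returns a
 * bitmask with bit i set for each BD whose parsing failed; the caller
 * folds it into the burst's bd_err_mask.
 */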

static inline uint32_t
hns3_desc_parse_field(struct hns3_rx_queue *rxq,
		      struct hns3_entry *sw_ring,
		      struct hns3_desc *rxdp,
		      uint32_t bd_vld_num)
{
	uint32_t l234_info, ol_info, bd_base_info;
	struct rte_mbuf *pkt;
	uint32_t retcode = 0;
	uint32_t cksum_err;
	uint32_t i;
	int ret;

	for (i = 0; i < bd_vld_num; i++) {
		pkt = sw_ring[i].mbuf;

		/* init the last 64 bits of rte_mbuf.rearm_data */
		pkt->ol_flags = PKT_RX_RSS_HASH;

		l234_info = rxdp[i].rx.l234_info;
		ol_info = rxdp[i].rx.ol_info;
		bd_base_info = rxdp[i].rx.bd_base_info;
		ret = hns3_handle_bdinfo(rxq, pkt, bd_base_info,
					 l234_info, &cksum_err);
		if (unlikely(ret)) {
			retcode |= 1u << i;
			continue;
		}

		pkt->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
		if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
			hns3_rx_set_cksum_flag(pkt, pkt->packet_type,
					       cksum_err);

		/* Increment bytes counter */
		rxq->basic_stats.bytes += pkt->pkt_len;
	}

	return retcode;
}

static inline uint16_t
hns3_recv_burst_vec(struct hns3_rx_queue *__restrict rxq,
		    struct rte_mbuf **__restrict rx_pkts,
		    uint16_t nb_pkts,
		    uint64_t *bd_err_mask)
{
	uint16_t rx_id = rxq->next_to_use;
	struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
	struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
	uint32_t bd_valid_num, parse_retcode;
	uint16_t nb_rx = 0;
	uint32_t pos;
	int offset;

	/* mask to shuffle from desc to mbuf's rx_descriptor_fields1 */
	uint8x16_t shuf_desc_fields_msk = {
		0xff, 0xff, 0xff, 0xff,  /* packet type init zero */
		22, 23, 0xff, 0xff,      /* rx.pkt_len to rte_mbuf.pkt_len */
		20, 21,                  /* size to rte_mbuf.data_len */
		0xff, 0xff,              /* rte_mbuf.vlan_tci init zero */
		8, 9, 10, 11,            /* rx.rss_hash to rte_mbuf.hash.rss */
	};
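
	/*
	 * In the vqtbl2q_u8() lookups below, every 0xff index is out of
	 * range for the 32-byte table and therefore yields zero, which is
	 * what clears the fields marked "init zero" above.
	 */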

	uint16x8_t crc_adjust = {
		0, 0,         /* ignore pkt_type field */
		rxq->crc_len, /* sub crc on pkt_len */
		0,            /* ignore high-16bits of pkt_len */
		rxq->crc_len, /* sub crc on data_len */
		0, 0, 0,      /* ignore non-length fields */
	};
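
	/*
	 * rxq->crc_len is zero unless the CRC has to be removed from the
	 * length fields, so in the common case the vector subtracts in the
	 * loop below are no-ops; otherwise one instruction strips the CRC
	 * from both pkt_len and data_len.
	 */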

	for (pos = 0; pos < nb_pkts; pos += HNS3_DEFAULT_DESCS_PER_LOOP,
				     rxdp += HNS3_DEFAULT_DESCS_PER_LOOP) {
		uint64x2x2_t descs[HNS3_DEFAULT_DESCS_PER_LOOP];
		uint8x16x2_t pkt_mbuf1, pkt_mbuf2, pkt_mbuf3, pkt_mbuf4;
		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		uint64x2_t mbp1, mbp2;
		uint16x4_t bd_vld = {0};
		uint16x8_t tmp;
		uint64_t stat;

		/* calculate how many BDs are valid */
		bd_vld = vset_lane_u16(rxdp[0].rx.bdtype_vld_udp0, bd_vld, 0);
		bd_vld = vset_lane_u16(rxdp[1].rx.bdtype_vld_udp0, bd_vld, 1);
		bd_vld = vset_lane_u16(rxdp[2].rx.bdtype_vld_udp0, bd_vld, 2);
		bd_vld = vset_lane_u16(rxdp[3].rx.bdtype_vld_udp0, bd_vld, 3);

		/* load the first two mbuf pointers */
		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);

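		/*
		 * Move each BD's VLD bit to bit 15, then arithmetic-shift
		 * right so every 16-bit lane becomes 0xffff (valid) or 0
		 * (invalid); inverting the 64-bit view turns the leading
		 * valid BDs into a run of zero bits that ctz counts below.
		 */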
		bd_vld = vshl_n_u16(bd_vld,
				    HNS3_UINT16_BIT - 1 - HNS3_RXD_VLD_B);
		bd_vld = vreinterpret_u16_s16(
				vshr_n_s16(vreinterpret_s16_u16(bd_vld),
					   HNS3_UINT16_BIT - 1));
		stat = ~vget_lane_u64(vreinterpret_u64_u16(bd_vld), 0);

		/* load the next two mbuf pointers */
		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

		if (likely(stat == 0))
			bd_valid_num = HNS3_DEFAULT_DESCS_PER_LOOP;
		else
			bd_valid_num = __builtin_ctzl(stat) / HNS3_UINT16_BIT;
		if (bd_valid_num == 0)
			break;

		/*
		 * Use offset (indexed by the number of valid BDs) to make
		 * the descriptor loads below data-dependent on the VLD
		 * check, so they cannot be reordered ahead of it.
		 */
		offset = rxq->offset_table[bd_valid_num];

		/* store the first two mbuf pointers into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

		/* read the first two descs */
		descs[0] = vld2q_u64((uint64_t *)(rxdp + offset));
		descs[1] = vld2q_u64((uint64_t *)(rxdp + offset + 1));

		/* store the next two mbuf pointers into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

		/* read the remaining two descs */
		descs[2] = vld2q_u64((uint64_t *)(rxdp + offset + 2));
		descs[3] = vld2q_u64((uint64_t *)(rxdp + offset + 3));

		pkt_mbuf1.val[0] = vreinterpretq_u8_u64(descs[0].val[0]);
		pkt_mbuf1.val[1] = vreinterpretq_u8_u64(descs[0].val[1]);
		pkt_mbuf2.val[0] = vreinterpretq_u8_u64(descs[1].val[0]);
		pkt_mbuf2.val[1] = vreinterpretq_u8_u64(descs[1].val[1]);

		/* pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb1 = vqtbl2q_u8(pkt_mbuf1, shuf_desc_fields_msk);
		pkt_mb2 = vqtbl2q_u8(pkt_mbuf2, shuf_desc_fields_msk);

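		/*
		 * rxq->mbuf_initializer is a 64-bit template of the mbuf
		 * rearm_data fields (data_off, refcnt, nb_segs, port),
		 * presumably precomputed at queue setup, so one scalar
		 * store reinitializes all of them at once.
		 */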
		/* store the first 8 bytes of pkt 1,2 mbuf's rearm_data */
		*(uint64_t *)&sw_ring[pos + 0].mbuf->rearm_data =
			rxq->mbuf_initializer;
		*(uint64_t *)&sw_ring[pos + 1].mbuf->rearm_data =
			rxq->mbuf_initializer;

		/* remove crc from pkt 1,2 */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
		pkt_mb1 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
		pkt_mb2 = vreinterpretq_u8_u16(tmp);

		pkt_mbuf3.val[0] = vreinterpretq_u8_u64(descs[2].val[0]);
		pkt_mbuf3.val[1] = vreinterpretq_u8_u64(descs[2].val[1]);
		pkt_mbuf4.val[0] = vreinterpretq_u8_u64(descs[3].val[0]);
		pkt_mbuf4.val[1] = vreinterpretq_u8_u64(descs[3].val[1]);

		/* pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb3 = vqtbl2q_u8(pkt_mbuf3, shuf_desc_fields_msk);
		pkt_mb4 = vqtbl2q_u8(pkt_mbuf4, shuf_desc_fields_msk);

		/* save pkt 1,2 descriptor fields to the rx_pkts mbufs */
		vst1q_u8((void *)&sw_ring[pos + 0].mbuf->rx_descriptor_fields1,
			 pkt_mb1);
		vst1q_u8((void *)&sw_ring[pos + 1].mbuf->rx_descriptor_fields1,
			 pkt_mb2);

		/* remove crc from pkt 3,4 */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
		pkt_mb3 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
		pkt_mb4 = vreinterpretq_u8_u16(tmp);

		/* store the first 8 bytes of pkt 3,4 mbuf's rearm_data */
		*(uint64_t *)&sw_ring[pos + 2].mbuf->rearm_data =
			rxq->mbuf_initializer;
		*(uint64_t *)&sw_ring[pos + 3].mbuf->rearm_data =
			rxq->mbuf_initializer;

		/* save pkt 3,4 descriptor fields to the rx_pkts mbufs */
		vst1q_u8((void *)&sw_ring[pos + 2].mbuf->rx_descriptor_fields1,
			 pkt_mb3);
		vst1q_u8((void *)&sw_ring[pos + 3].mbuf->rx_descriptor_fields1,
			 pkt_mb4);

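		/*
		 * Each descriptor is read only once per pass, so prefetch
		 * the next batch with a non-temporal hint to avoid
		 * displacing more useful cache lines.
		 */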
		rte_prefetch_non_temporal(rxdp + HNS3_DEFAULT_DESCS_PER_LOOP);

		parse_retcode = hns3_desc_parse_field(rxq, &sw_ring[pos],
			&rxdp[offset], bd_valid_num);
		if (unlikely(parse_retcode))
			(*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;

		rte_prefetch0(sw_ring[pos +
				      HNS3_DEFAULT_DESCS_PER_LOOP + 0].mbuf);
		rte_prefetch0(sw_ring[pos +
				      HNS3_DEFAULT_DESCS_PER_LOOP + 1].mbuf);
		rte_prefetch0(sw_ring[pos +
				      HNS3_DEFAULT_DESCS_PER_LOOP + 2].mbuf);
		rte_prefetch0(sw_ring[pos +
				      HNS3_DEFAULT_DESCS_PER_LOOP + 3].mbuf);

		nb_rx += bd_valid_num;
		if (bd_valid_num < HNS3_DEFAULT_DESCS_PER_LOOP)
			break;
	}

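	/*
	 * Account the consumed BDs for later re-arming and advance
	 * next_to_use, wrapping at the end of the ring.
	 */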
	rxq->rx_rearm_nb += nb_rx;
	rxq->next_to_use += nb_rx;
	if (rxq->next_to_use >= rxq->nb_rx_desc)
		rxq->next_to_use = 0;

	return nb_rx;
}
#endif /* _HNS3_RXTX_VEC_NEON_H_ */