/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2023 Marvell.
 */

#include "cnxk_ep_rx.h"

static __rte_always_inline void
cnxk_ep_process_pkts_vec_neon(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
			      uint16_t new_pkts)
{
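	/* Shuffle masks for vqtbl1q_u8(): replicate the low two bytes of each
	 * little-endian 32-bit packet size into both 32-bit halves of a 64-bit
	 * lane (out-of-range index 0xff yields zero), so that one 8-byte store
	 * can fill both pkt_len and data_len. mask0 expands the sizes of the
	 * first two packets, mask1 those of the last two. NEON types of
	 * different element widths are mixed below without explicit
	 * vreinterpretq casts; the driver builds with lax vector conversions.
	 */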
	const uint8x16_t mask0 = {0, 1, 0xff, 0xff, 0, 1, 0xff, 0xff,
				  4, 5, 0xff, 0xff, 4, 5, 0xff, 0xff};
	const uint8x16_t mask1 = {8,  9,  0xff, 0xff, 8,  9,  0xff, 0xff,
				  12, 13, 0xff, 0xff, 12, 13, 0xff, 0xff};
	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
	uint32_t pidx0, pidx1, pidx2, pidx3;
	struct rte_mbuf *m0, *m1, *m2, *m3;
	uint32_t read_idx = droq->read_idx;
	uint16_t nb_desc = droq->nb_desc;
	uint32_t idx0, idx1, idx2, idx3;
	uint64x2_t s01, s23;
	uint32x4_t bytes;
	uint16_t pkts = 0;

	idx0 = read_idx;
	s01 = vdupq_n_u64(0);
	bytes = vdupq_n_u32(0);
	while (pkts < new_pkts) {
		idx1 = otx_ep_incr_index(idx0, 1, nb_desc);
		idx2 = otx_ep_incr_index(idx1, 1, nb_desc);
		idx3 = otx_ep_incr_index(idx2, 1, nb_desc);

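		/* If at least one more iteration follows, prefetch the info
		 * headers of the next four receive buffers to hide the memory
		 * latency of the length loads below.
		 */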
		if (new_pkts - pkts > 4) {
			pidx0 = otx_ep_incr_index(idx3, 1, nb_desc);
			pidx1 = otx_ep_incr_index(pidx0, 1, nb_desc);
			pidx2 = otx_ep_incr_index(pidx1, 1, nb_desc);
			pidx3 = otx_ep_incr_index(pidx2, 1, nb_desc);

			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx0], void *));
			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx1], void *));
			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx2], void *));
			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx3], void *));
		}

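		/* Each posted buffer starts with an otx_ep_droq_info header
		 * written by hardware; the packet data follows it.
		 */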
		m0 = recv_buf_list[idx0];
		m1 = recv_buf_list[idx1];
		m2 = recv_buf_list[idx2];
		m3 = recv_buf_list[idx3];

		/* Load each packet's big-endian 16-bit size (the top 16 bits
		 * of info->length) into one 32-bit lane of s01.
		 */
		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48,
				     s01, 0);
		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
				     s01, 1);
		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
				     s01, 2);
		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
				     s01, 3);
		/* Convert to little-endian. */
		s01 = vrev16q_u8(s01);

		/* Vertical add, consolidate outside the loop. */
		bytes = vaddq_u32(bytes, s01);
		/* Separate into packet length and data length. */
		s23 = vqtbl1q_u8(s01, mask1);
		s01 = vqtbl1q_u8(s01, mask0);

		/* Store packet length and data length to mbuf: each 64-bit
		 * lane holds one size replicated as two 32-bit words, so a
		 * single 8-byte store sets both pkt_len and data_len (the
		 * surplus bytes beyond data_len are written with zero).
		 */
		*(uint64_t *)&m0->pkt_len = vgetq_lane_u64(s01, 0);
		*(uint64_t *)&m1->pkt_len = vgetq_lane_u64(s01, 1);
		*(uint64_t *)&m2->pkt_len = vgetq_lane_u64(s23, 0);
		*(uint64_t *)&m3->pkt_len = vgetq_lane_u64(s23, 1);

		/* Reset rearm data: one 8-byte store restores data_off,
		 * refcnt, nb_segs and port from the template prepared at
		 * queue setup.
		 */
		*(uint64_t *)&m0->rearm_data = droq->rearm_data;
		*(uint64_t *)&m1->rearm_data = droq->rearm_data;
		*(uint64_t *)&m2->rearm_data = droq->rearm_data;
		*(uint64_t *)&m3->rearm_data = droq->rearm_data;

		rx_pkts[pkts++] = m0;
		rx_pkts[pkts++] = m1;
		rx_pkts[pkts++] = m2;
		rx_pkts[pkts++] = m3;
		idx0 = otx_ep_incr_index(idx3, 1, nb_desc);
	}
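	/* idx0 now indexes the first descriptor the loop did not consume. */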
	droq->read_idx = idx0;

	droq->refill_count += new_pkts;
	droq->pkts_pending -= new_pkts;
	/* Stats */
	droq->stats.pkts_received += new_pkts;
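	/* vaddvq_u32() (across-vector add) is only available on AArch64;
	 * sum the lanes one by one when built for 32-bit Arm.
	 */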
#if defined(RTE_ARCH_32)
	droq->stats.bytes_received += vgetq_lane_u32(bytes, 0);
	droq->stats.bytes_received += vgetq_lane_u32(bytes, 1);
	droq->stats.bytes_received += vgetq_lane_u32(bytes, 2);
	droq->stats.bytes_received += vgetq_lane_u32(bytes, 3);
#else
	droq->stats.bytes_received += vaddvq_u32(bytes);
#endif
}

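/* NEON Rx burst handler: refill the ring when enough buffers have been
 * consumed, then process the largest multiple of four packets with the
 * vector loop and the remainder with the scalar path.
 */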
uint16_t __rte_noinline __rte_hot
cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
	uint16_t new_pkts, vpkts;

	/* Refill RX buffers */
	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
		cnxk_ep_rx_refill(droq);

	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
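	/* The NEON loop consumes four descriptors per iteration, the same
	 * count as the SSE loop, hence the shared per-loop constant; any
	 * remainder is handed to the scalar routine.
	 */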
	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

	return new_pkts;
}

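/* CN9K variant: identical to the handler above except that, when no refill
 * is due, it rewrites the doorbell register to work around an SDP output
 * DROP-state race (see the comment below).
 */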
uint16_t __rte_noinline __rte_hot
cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
	uint16_t new_pkts, vpkts;

	/* Refill RX buffers */
	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
		cnxk_ep_rx_refill(droq);
	} else {
		/* SDP output enters the DROP state when the output doorbell
		 * count falls below the drop count. Writing a doorbell count
		 * greater than the drop count should bring SDP output back
		 * out of the DROP state, but a race condition can prevent
		 * this. Writing 0 to the doorbell register again may force
		 * SDP output out of the DROP state.
		 */
		rte_write32(0, droq->pkts_credit_reg);
	}

	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

	return new_pkts;
}