xref: /dpdk/drivers/net/octeon_ep/cnxk_ep_rx_sse.c (revision f2b1510f19d7bfd386d130fa38123d6e2152cf80)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(C) 2023 Marvell.
3  */
4 
5 #include "cnxk_ep_rx.h"
6 
7 static __rte_always_inline uint32_t
8 hadd(__m128i x)
9 {
10 	__m128i hi64 = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
11 	__m128i sum64 = _mm_add_epi32(hi64, x);
12 	__m128i hi32 = _mm_shufflelo_epi16(sum64, _MM_SHUFFLE(1, 0, 3, 2));
13 	__m128i sum32 = _mm_add_epi32(sum64, hi32);
14 	return _mm_cvtsi128_si32(sum32);
15 }
16 
/*
 * Vectorized RX: convert new_pkts received descriptors into ready mbufs,
 * four per loop iteration, using SSE shuffles to extract each packet's
 * big-endian length field from the DROQ info header.
 *
 * Caller must pass a new_pkts that is a multiple of 4 (both callers floor
 * align to CNXK_EP_OQ_DESC_PER_LOOP_SSE first); the loop makes no attempt
 * to handle a partial group.
 */
static __rte_always_inline void
cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
{
	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
	uint32_t read_idx = droq->read_idx;
	struct rte_mbuf *m0, *m1, *m2, *m3;
	uint16_t nb_desc = droq->nb_desc;
	uint32_t idx0, idx1, idx2, idx3;
	uint16_t pkts = 0;
	__m128i bytes;

	idx0 = read_idx;
	/* Per-lane running byte counters; reduced once with hadd() after the loop. */
	bytes = _mm_setzero_si128();
	while (pkts < new_pkts) {
		/* Swap the two bytes of each 16-bit length and zero-extend it
		 * into a 32-bit lane (big-endian -> little-endian; 0xFF lanes
		 * select zero in _mm_shuffle_epi8).
		 */
		const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF,
							0xFF, 4, 5, 0xFF, 0xFF, 0, 1);
		/* Duplicate 32-bit lanes 0 and 2 into both halves of each
		 * 64-bit half, producing {len, len} pairs that match the
		 * adjacent pkt_len/data_len mbuf fields stored below.
		 */
		const __m128i cpy_mask = _mm_set_epi8(0xFF, 0xFF, 9, 8, 0xFF, 0xFF, 9, 8, 0xFF,
						      0xFF, 1, 0, 0xFF, 0xFF, 1, 0);
		__m128i s01, s23;

		idx1 = otx_ep_incr_index(idx0, 1, nb_desc);
		idx2 = otx_ep_incr_index(idx1, 1, nb_desc);
		idx3 = otx_ep_incr_index(idx2, 1, nb_desc);

		m0 = recv_buf_list[idx0];
		m1 = recv_buf_list[idx1];
		m2 = recv_buf_list[idx2];
		m3 = recv_buf_list[idx3];

		/* Load packet size big-endian. The >> 48 keeps the 16-bit
		 * length field from the top of the 64-bit info word. The
		 * deliberate (m3, m1, m2, m0) argument order puts m0/m1 in
		 * lanes 0/2 and m2/m3 in lanes 1/3, so the shuffles below
		 * split the quad into an (m0, m1) and an (m2, m3) vector.
		 */
		s01 = _mm_set_epi32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
				    cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
				    cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
				    cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
		/* Convert to little-endian. */
		s01 = _mm_shuffle_epi8(s01, bswap_mask);
		/* Vertical add, consolidate outside loop */
		bytes = _mm_add_epi32(bytes, s01);
		/* Separate into packet length and data length: s23 gets
		 * lanes 1/3 (m2, m3); s01 keeps lanes 0/2 (m0, m1); cpy_mask
		 * then duplicates each length within its 64-bit half.
		 */
		s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1));
		s01 = _mm_shuffle_epi8(s01, cpy_mask);
		s23 = _mm_shuffle_epi8(s23, cpy_mask);

		/* Store packet length and data length to mbuf: one 64-bit
		 * store covers the adjacent pkt_len and data_len fields.
		 */
		*(uint64_t *)&m0->pkt_len = ((rte_xmm_t)s01).u64[0];
		*(uint64_t *)&m1->pkt_len = ((rte_xmm_t)s01).u64[1];
		*(uint64_t *)&m2->pkt_len = ((rte_xmm_t)s23).u64[0];
		*(uint64_t *)&m3->pkt_len = ((rte_xmm_t)s23).u64[1];

		/* Reset rearm data: one 64-bit store of the precomputed
		 * template re-initializes each mbuf's rearm region.
		 */
		*(uint64_t *)&m0->rearm_data = droq->rearm_data;
		*(uint64_t *)&m1->rearm_data = droq->rearm_data;
		*(uint64_t *)&m2->rearm_data = droq->rearm_data;
		*(uint64_t *)&m3->rearm_data = droq->rearm_data;

		rx_pkts[pkts++] = m0;
		rx_pkts[pkts++] = m1;
		rx_pkts[pkts++] = m2;
		rx_pkts[pkts++] = m3;
		/* Advance past the group of four just consumed. */
		idx0 = otx_ep_incr_index(idx3, 1, nb_desc);
	}
	droq->read_idx = idx0;

	droq->refill_count += new_pkts;
	droq->pkts_pending -= new_pkts;
	/* Stats */
	droq->stats.pkts_received += new_pkts;
	droq->stats.bytes_received += hadd(bytes);
}
86 
87 uint16_t __rte_noinline __rte_hot
88 cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
89 {
90 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
91 	uint16_t new_pkts, vpkts;
92 
93 	/* Refill RX buffers */
94 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
95 		cnxk_ep_rx_refill(droq);
96 
97 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
98 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
99 	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
100 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
101 
102 	return new_pkts;
103 }
104 
105 uint16_t __rte_noinline __rte_hot
106 cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
107 {
108 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
109 	uint16_t new_pkts, vpkts;
110 
111 	/* Refill RX buffers */
112 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
113 		cnxk_ep_rx_refill(droq);
114 	} else {
115 		/* SDP output goes into DROP state when output doorbell count
116 		 * goes below drop count. When door bell count is written with
117 		 * a value greater than drop count SDP output should come out
118 		 * of DROP state. Due to a race condition this is not happening.
119 		 * Writing doorbell register with 0 again may make SDP output
120 		 * come out of this state.
121 		 */
122 
123 		rte_write32(0, droq->pkts_credit_reg);
124 	}
125 
126 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
127 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
128 	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
129 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
130 
131 	return new_pkts;
132 }
133