1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(C) 2023 Marvell. 3 */ 4 5 #include "cnxk_ep_rx.h" 6 7 static __rte_always_inline uint32_t 8 hadd(__m128i x) 9 { 10 __m128i hi64 = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); 11 __m128i sum64 = _mm_add_epi32(hi64, x); 12 __m128i hi32 = _mm_shufflelo_epi16(sum64, _MM_SHUFFLE(1, 0, 3, 2)); 13 __m128i sum32 = _mm_add_epi32(sum64, hi32); 14 return _mm_cvtsi128_si32(sum32); 15 } 16 17 static __rte_always_inline void 18 cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts) 19 { 20 struct rte_mbuf **recv_buf_list = droq->recv_buf_list; 21 uint32_t read_idx = droq->read_idx; 22 struct rte_mbuf *m0, *m1, *m2, *m3; 23 uint16_t nb_desc = droq->nb_desc; 24 uint32_t idx0, idx1, idx2, idx3; 25 uint16_t pkts = 0; 26 __m128i bytes; 27 28 idx0 = read_idx; 29 bytes = _mm_setzero_si128(); 30 while (pkts < new_pkts) { 31 const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF, 32 0xFF, 4, 5, 0xFF, 0xFF, 0, 1); 33 const __m128i cpy_mask = _mm_set_epi8(0xFF, 0xFF, 9, 8, 0xFF, 0xFF, 9, 8, 0xFF, 34 0xFF, 1, 0, 0xFF, 0xFF, 1, 0); 35 __m128i s01, s23; 36 37 idx1 = otx_ep_incr_index(idx0, 1, nb_desc); 38 idx2 = otx_ep_incr_index(idx1, 1, nb_desc); 39 idx3 = otx_ep_incr_index(idx2, 1, nb_desc); 40 41 m0 = recv_buf_list[idx0]; 42 m1 = recv_buf_list[idx1]; 43 m2 = recv_buf_list[idx2]; 44 m3 = recv_buf_list[idx3]; 45 46 /* Load packet size big-endian. */ 47 s01 = _mm_set_epi32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48, 48 cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48, 49 cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48, 50 cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48); 51 /* Convert to little-endian. */ 52 s01 = _mm_shuffle_epi8(s01, bswap_mask); 53 /* Vertical add, consolidate outside loop */ 54 bytes = _mm_add_epi32(bytes, s01); 55 /* Separate into packet length and data length. */ 56 s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1)); 57 s01 = _mm_shuffle_epi8(s01, cpy_mask); 58 s23 = _mm_shuffle_epi8(s23, cpy_mask); 59 60 /* Store packet length and data length to mbuf. */ 61 *(uint64_t *)&m0->pkt_len = ((rte_xmm_t)s01).u64[0]; 62 *(uint64_t *)&m1->pkt_len = ((rte_xmm_t)s01).u64[1]; 63 *(uint64_t *)&m2->pkt_len = ((rte_xmm_t)s23).u64[0]; 64 *(uint64_t *)&m3->pkt_len = ((rte_xmm_t)s23).u64[1]; 65 66 /* Reset rearm data. */ 67 *(uint64_t *)&m0->rearm_data = droq->rearm_data; 68 *(uint64_t *)&m1->rearm_data = droq->rearm_data; 69 *(uint64_t *)&m2->rearm_data = droq->rearm_data; 70 *(uint64_t *)&m3->rearm_data = droq->rearm_data; 71 72 rx_pkts[pkts++] = m0; 73 rx_pkts[pkts++] = m1; 74 rx_pkts[pkts++] = m2; 75 rx_pkts[pkts++] = m3; 76 idx0 = otx_ep_incr_index(idx3, 1, nb_desc); 77 } 78 droq->read_idx = idx0; 79 80 droq->refill_count += new_pkts; 81 droq->pkts_pending -= new_pkts; 82 /* Stats */ 83 droq->stats.pkts_received += new_pkts; 84 droq->stats.bytes_received += hadd(bytes); 85 } 86 87 uint16_t __rte_noinline __rte_hot 88 cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) 89 { 90 struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue; 91 uint16_t new_pkts, vpkts; 92 93 /* Refill RX buffers */ 94 if (droq->refill_count >= DROQ_REFILL_THRESHOLD) 95 cnxk_ep_rx_refill(droq); 96 97 new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts); 98 vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE); 99 cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts); 100 cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts); 101 102 return new_pkts; 103 } 104 105 uint16_t __rte_noinline __rte_hot 106 cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) 107 { 108 struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue; 109 uint16_t new_pkts, vpkts; 110 111 /* Refill RX buffers */ 112 if (droq->refill_count >= DROQ_REFILL_THRESHOLD) { 113 cnxk_ep_rx_refill(droq); 114 } else { 115 /* SDP output goes into DROP state when output doorbell count 116 * goes below drop count. When door bell count is written with 117 * a value greater than drop count SDP output should come out 118 * of DROP state. Due to a race condition this is not happening. 119 * Writing doorbell register with 0 again may make SDP output 120 * come out of this state. 121 */ 122 123 rte_write32(0, droq->pkts_credit_reg); 124 } 125 126 new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts); 127 vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE); 128 cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts); 129 cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts); 130 131 return new_pkts; 132 } 133