/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2023 Marvell.
 */

#include "cnxk_ep_rx.h"

static __rte_always_inline void
cnxk_ep_process_pkts_vec_neon(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
			      uint16_t new_pkts)
{
	/* Shuffle masks: copy each packet's 16-bit length into both the
	 * pkt_len and data_len positions of its 64-bit lane, zeroing the rest.
	 */
	const uint8x16_t mask0 = {0, 1, 0xff, 0xff, 0, 1, 0xff, 0xff,
				  4, 5, 0xff, 0xff, 4, 5, 0xff, 0xff};
	const uint8x16_t mask1 = {8, 9, 0xff, 0xff, 8, 9, 0xff, 0xff,
				  12, 13, 0xff, 0xff, 12, 13, 0xff, 0xff};
	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
	uint32_t pidx0, pidx1, pidx2, pidx3;
	struct rte_mbuf *m0, *m1, *m2, *m3;
	uint32_t read_idx = droq->read_idx;
	uint16_t nb_desc = droq->nb_desc;
	uint32_t idx0, idx1, idx2, idx3;
	uint32x4_t s01, s23;
	uint32x4_t bytes;
	uint16_t pkts = 0;

	idx0 = read_idx;
	s01 = vdupq_n_u32(0);
	bytes = vdupq_n_u32(0);
	while (pkts < new_pkts) {
		idx1 = otx_ep_incr_index(idx0, 1, nb_desc);
		idx2 = otx_ep_incr_index(idx1, 1, nb_desc);
		idx3 = otx_ep_incr_index(idx2, 1, nb_desc);

		/* Prefetch the next iteration's buffers while this batch is
		 * being processed.
		 */
		if (new_pkts - pkts > 4) {
			pidx0 = otx_ep_incr_index(idx3, 1, nb_desc);
			pidx1 = otx_ep_incr_index(pidx0, 1, nb_desc);
			pidx2 = otx_ep_incr_index(pidx1, 1, nb_desc);
			pidx3 = otx_ep_incr_index(pidx2, 1, nb_desc);

			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx0], void *));
			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx1], void *));
			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx2], void *));
			rte_prefetch_non_temporal(cnxk_pktmbuf_mtod(recv_buf_list[pidx3], void *));
		}

		m0 = recv_buf_list[idx0];
		m1 = recv_buf_list[idx1];
		m2 = recv_buf_list[idx2];
		m3 = recv_buf_list[idx3];

		/* Load packet size big-endian. */
		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48,
				     s01, 0);
		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
				     s01, 1);
		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
				     s01, 2);
		s01 = vsetq_lane_u32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
				     s01, 3);
		/* Convert to little-endian. */
		s01 = vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(s01)));

		/* Vertical add; consolidate into the stats outside the loop. */
		bytes = vaddq_u32(bytes, s01);
		/* Separate into packet length and data length. */
		s23 = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(s01), mask1));
		s01 = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(s01), mask0));

		/* Store packet length and data length to mbuf. */
		*(uint64_t *)&m0->pkt_len = vgetq_lane_u64(vreinterpretq_u64_u32(s01), 0);
		*(uint64_t *)&m1->pkt_len = vgetq_lane_u64(vreinterpretq_u64_u32(s01), 1);
		*(uint64_t *)&m2->pkt_len = vgetq_lane_u64(vreinterpretq_u64_u32(s23), 0);
		*(uint64_t *)&m3->pkt_len = vgetq_lane_u64(vreinterpretq_u64_u32(s23), 1);

		/* Reset rearm data. */
		*(uint64_t *)&m0->rearm_data = droq->rearm_data;
		*(uint64_t *)&m1->rearm_data = droq->rearm_data;
		*(uint64_t *)&m2->rearm_data = droq->rearm_data;
		*(uint64_t *)&m3->rearm_data = droq->rearm_data;

		rx_pkts[pkts++] = m0;
		rx_pkts[pkts++] = m1;
		rx_pkts[pkts++] = m2;
		rx_pkts[pkts++] = m3;
		idx0 = otx_ep_incr_index(idx3, 1, nb_desc);
	}
	droq->read_idx = idx0;

	droq->refill_count += new_pkts;
	droq->pkts_pending -= new_pkts;
	/* Stats */
	droq->stats.pkts_received += new_pkts;
#if defined(RTE_ARCH_32)
	/* vaddvq_u32() is AArch64-only; add the lanes individually on Arm32. */
	droq->stats.bytes_received += vgetq_lane_u32(bytes, 0);
	droq->stats.bytes_received += vgetq_lane_u32(bytes, 1);
	droq->stats.bytes_received += vgetq_lane_u32(bytes, 2);
	droq->stats.bytes_received += vgetq_lane_u32(bytes, 3);
#else
	droq->stats.bytes_received += vaddvq_u32(bytes);
#endif
}
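
/* Scalar reference for the length extraction above: a minimal sketch for
 * documentation only, not called by the driver, and the helper name is
 * illustrative. It assumes, as the vector path does, that the DROQ info
 * header stores the packet length in the low 16 bits of a big-endian 64-bit
 * word. A little-endian load of that word shifted right by 48 yields the
 * length with its two bytes swapped; vrev16q_u8() in the loop undoes that
 * swap sixteen bits at a time, and this helper does the same for one packet.
 */
static __rte_always_inline uint16_t
cnxk_ep_rx_pkt_len_sketch(const struct otx_ep_droq_info *info)
{
	/* Little-endian load of the big-endian length word. */
	uint16_t swapped = (uint16_t)(info->length >> 48);

	/* Swap the two bytes back, mirroring one lane of vrev16q_u8(). */
	return (uint16_t)(((uint16_t)(swapped << 8)) | (swapped >> 8));
}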

uint16_t __rte_noinline __rte_hot
cnxk_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
	uint16_t new_pkts, vpkts;

	/* Refill RX buffers */
	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
		cnxk_ep_rx_refill(droq);

	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
	/* The NEON loop consumes four descriptors per iteration, the same
	 * count as the SSE loop; hand any remainder to the scalar path.
	 */
	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

	return new_pkts;
}

uint16_t __rte_noinline __rte_hot
cn9k_ep_recv_pkts_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
	uint16_t new_pkts, vpkts;

	/* Refill RX buffers */
	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
		cnxk_ep_rx_refill(droq);
	} else {
		/* SDP output goes into DROP state when the output doorbell
		 * count goes below the drop count. When the doorbell count
		 * is written with a value greater than the drop count, SDP
		 * output should come out of DROP state, but due to a race
		 * condition this does not happen. Writing the doorbell
		 * register with 0 again may bring SDP output out of this
		 * state.
		 */
		rte_write32(0, droq->pkts_credit_reg);
	}

	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
	cnxk_ep_process_pkts_vec_neon(rx_pkts, droq, vpkts);
	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

	return new_pkts;
}