/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Hisilicon Limited.
 */

#ifndef _HNS3_RXTX_VEC_NEON_H_
#define _HNS3_RXTX_VEC_NEON_H_

#include <arm_neon.h>

#pragma GCC diagnostic ignored "-Wcast-qual"

static inline void
hns3_vec_tx(volatile struct hns3_desc *desc, struct rte_mbuf *pkt)
{
	uint64x2_t val1 = {
		pkt->buf_iova + pkt->data_off,
		((uint64_t)pkt->data_len) << HNS3_TXD_SEND_SIZE_SHIFT
	};
	uint64x2_t val2 = {
		0,
		((uint64_t)HNS3_TXD_DEFAULT_VLD_FE_BDTYPE) << HNS3_UINT32_BIT
	};
	vst1q_u64((uint64_t *)&desc->addr, val1);
	vst1q_u64((uint64_t *)&desc->tx.outer_vlan_tag, val2);
}

static uint16_t
hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
			  struct rte_mbuf **__restrict tx_pkts,
			  uint16_t nb_pkts)
{
	struct hns3_tx_queue *txq = (struct hns3_tx_queue *)tx_queue;
	volatile struct hns3_desc *tx_desc;
	struct hns3_entry *tx_entry;
	uint16_t next_to_use;
	uint16_t nb_commit;
	uint16_t nb_tx;
	uint16_t n, i;

	if (txq->tx_bd_ready < txq->tx_free_thresh)
		hns3_tx_free_buffers(txq);

	nb_commit = RTE_MIN(txq->tx_bd_ready, nb_pkts);
	if (unlikely(nb_commit == 0)) {
		txq->dfx_stats.queue_full_cnt++;
		return 0;
	}
	nb_tx = nb_commit;

	next_to_use = txq->next_to_use;
	tx_desc = &txq->tx_ring[next_to_use];
	tx_entry = &txq->sw_ring[next_to_use];

	/*
	 * We need to deal with n descriptors first for better performance,
	 * if nb_commit is greater than the difference between txq->nb_tx_desc
	 * and next_to_use in sw_ring and tx_ring.
	 */
	n = txq->nb_tx_desc - next_to_use;
	if (nb_commit >= n) {
		for (i = 0; i < n; i++, tx_pkts++, tx_desc++) {
			hns3_vec_tx(tx_desc, *tx_pkts);
			tx_entry[i].mbuf = *tx_pkts;

			/* Increment bytes counter */
			txq->basic_stats.bytes += (*tx_pkts)->pkt_len;
		}

		nb_commit -= n;
		next_to_use = 0;
		tx_desc = &txq->tx_ring[next_to_use];
		tx_entry = &txq->sw_ring[next_to_use];
	}

	for (i = 0; i < nb_commit; i++, tx_pkts++, tx_desc++) {
		hns3_vec_tx(tx_desc, *tx_pkts);
		tx_entry[i].mbuf = *tx_pkts;

		/* Increment bytes counter */
		txq->basic_stats.bytes += (*tx_pkts)->pkt_len;
	}

	next_to_use += nb_commit;
	txq->next_to_use = next_to_use;
	txq->tx_bd_ready -= nb_tx;

	hns3_write_reg_opt(txq->io_tail_reg, nb_tx);

	return nb_tx;
}

static inline uint32_t
hns3_desc_parse_field(struct hns3_rx_queue *rxq,
		      struct hns3_entry *sw_ring,
		      struct hns3_desc *rxdp,
		      uint32_t bd_vld_num)
{
	uint32_t l234_info, ol_info, bd_base_info;
	struct rte_mbuf *pkt;
	uint32_t retcode = 0;
	uint32_t cksum_err;
	uint32_t i;
	int ret;

	for (i = 0; i < bd_vld_num; i++) {
		pkt = sw_ring[i].mbuf;

		/* init rte_mbuf.rearm_data last 64-bit */
		pkt->ol_flags = PKT_RX_RSS_HASH;

		l234_info = rxdp[i].rx.l234_info;
		ol_info = rxdp[i].rx.ol_info;
		bd_base_info = rxdp[i].rx.bd_base_info;
		ret = hns3_handle_bdinfo(rxq, pkt, bd_base_info,
					 l234_info, &cksum_err);
		if (unlikely(ret)) {
			retcode |= 1u << i;
			continue;
		}

		pkt->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
		if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
			hns3_rx_set_cksum_flag(pkt, pkt->packet_type,
					       cksum_err);

		/* Increment bytes counter */
		rxq->basic_stats.bytes += pkt->pkt_len;
	}

	return retcode;
}

static inline uint16_t
hns3_recv_burst_vec(struct hns3_rx_queue *__restrict rxq,
		    struct rte_mbuf **__restrict rx_pkts,
		    uint16_t nb_pkts,
		    uint64_t *bd_err_mask)
{
	uint16_t rx_id = rxq->next_to_use;
	struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
	struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
	uint32_t bd_valid_num, parse_retcode;
	uint16_t nb_rx = 0;
	uint32_t pos;
	int offset;

	/* mask to shuffle from desc to mbuf's rx_descriptor_fields1 */
	uint8x16_t shuf_desc_fields_msk = {
		0xff, 0xff, 0xff, 0xff,	/* packet type init zero */
		22, 23, 0xff, 0xff,	/* rx.pkt_len to rte_mbuf.pkt_len */
		20, 21,			/* size to rte_mbuf.data_len */
		0xff, 0xff,		/* rte_mbuf.vlan_tci init zero */
		8, 9, 10, 11,		/* rx.rss_hash to rte_mbuf.hash.rss */
	};

	uint16x8_t crc_adjust = {
		0, 0,			/* ignore pkt_type field */
		rxq->crc_len,		/* sub crc on pkt_len */
		0,			/* ignore high-16bits of pkt_len */
		rxq->crc_len,		/* sub crc on data_len */
		0, 0, 0,		/* ignore non-length fields */
	};

	for (pos = 0; pos < nb_pkts; pos += HNS3_DEFAULT_DESCS_PER_LOOP,
				     rxdp += HNS3_DEFAULT_DESCS_PER_LOOP) {
		uint64x2x2_t descs[HNS3_DEFAULT_DESCS_PER_LOOP];
		uint8x16x2_t pkt_mbuf1, pkt_mbuf2, pkt_mbuf3, pkt_mbuf4;
		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		uint64x2_t mbp1, mbp2;
		uint16x4_t bd_vld = {0};
		uint16x8_t tmp;
		uint64_t stat;

		/* calc how many bd valid */
		bd_vld = vset_lane_u16(rxdp[0].rx.bdtype_vld_udp0, bd_vld, 0);
		bd_vld = vset_lane_u16(rxdp[1].rx.bdtype_vld_udp0, bd_vld, 1);
		bd_vld = vset_lane_u16(rxdp[2].rx.bdtype_vld_udp0, bd_vld, 2);
		bd_vld = vset_lane_u16(rxdp[3].rx.bdtype_vld_udp0, bd_vld, 3);

		/* load 2 mbuf pointer */
		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);

		bd_vld = vshl_n_u16(bd_vld,
				    HNS3_UINT16_BIT - 1 - HNS3_RXD_VLD_B);
		bd_vld = vreinterpret_u16_s16(
				vshr_n_s16(vreinterpret_s16_u16(bd_vld),
					   HNS3_UINT16_BIT - 1));
		stat = ~vget_lane_u64(vreinterpret_u64_u16(bd_vld), 0);

		/* load 2 mbuf pointer again */
		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

		if (likely(stat == 0))
			bd_valid_num = HNS3_DEFAULT_DESCS_PER_LOOP;
		else
			bd_valid_num = __builtin_ctzl(stat) / HNS3_UINT16_BIT;
		if (bd_valid_num == 0)
			break;

		/* use offset to control below data load oper ordering */
		offset = rxq->offset_table[bd_valid_num];

		/* store 2 mbuf pointer into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

		/* read first two descs */
		descs[0] = vld2q_u64((uint64_t *)(rxdp + offset));
		descs[1] = vld2q_u64((uint64_t *)(rxdp + offset + 1));

		/* store 2 mbuf pointer into rx_pkts again */
		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

		/* read remains two descs */
		descs[2] = vld2q_u64((uint64_t *)(rxdp + offset + 2));
		descs[3] = vld2q_u64((uint64_t *)(rxdp + offset + 3));

		pkt_mbuf1.val[0] = vreinterpretq_u8_u64(descs[0].val[0]);
		pkt_mbuf1.val[1] = vreinterpretq_u8_u64(descs[0].val[1]);
		pkt_mbuf2.val[0] = vreinterpretq_u8_u64(descs[1].val[0]);
		pkt_mbuf2.val[1] = vreinterpretq_u8_u64(descs[1].val[1]);

		/* pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb1 = vqtbl2q_u8(pkt_mbuf1, shuf_desc_fields_msk);
		pkt_mb2 = vqtbl2q_u8(pkt_mbuf2, shuf_desc_fields_msk);

		/* store the first 8 bytes of pkt 1,2 mbuf's rearm_data */
		*(uint64_t *)&sw_ring[pos + 0].mbuf->rearm_data =
			rxq->mbuf_initializer;
		*(uint64_t *)&sw_ring[pos + 1].mbuf->rearm_data =
			rxq->mbuf_initializer;

		/* pkt 1,2 remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
		pkt_mb1 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
		pkt_mb2 = vreinterpretq_u8_u16(tmp);

		pkt_mbuf3.val[0] = vreinterpretq_u8_u64(descs[2].val[0]);
		pkt_mbuf3.val[1] = vreinterpretq_u8_u64(descs[2].val[1]);
		pkt_mbuf4.val[0] = vreinterpretq_u8_u64(descs[3].val[0]);
		pkt_mbuf4.val[1] = vreinterpretq_u8_u64(descs[3].val[1]);

		/* pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb3 = vqtbl2q_u8(pkt_mbuf3, shuf_desc_fields_msk);
		pkt_mb4 = vqtbl2q_u8(pkt_mbuf4, shuf_desc_fields_msk);

		/* pkt 1,2 save to rx_pkts mbuf */
		vst1q_u8((void *)&sw_ring[pos + 0].mbuf->rx_descriptor_fields1,
			 pkt_mb1);
		vst1q_u8((void *)&sw_ring[pos + 1].mbuf->rx_descriptor_fields1,
			 pkt_mb2);

		/* pkt 3,4 remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
		pkt_mb3 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
		pkt_mb4 = vreinterpretq_u8_u16(tmp);

		/* store the first 8 bytes of pkt 3,4 mbuf's rearm_data */
		*(uint64_t *)&sw_ring[pos + 2].mbuf->rearm_data =
			rxq->mbuf_initializer;
		*(uint64_t *)&sw_ring[pos + 3].mbuf->rearm_data =
			rxq->mbuf_initializer;

		/* pkt 3,4 save to rx_pkts mbuf */
		vst1q_u8((void *)&sw_ring[pos + 2].mbuf->rx_descriptor_fields1,
			 pkt_mb3);
		vst1q_u8((void *)&sw_ring[pos + 3].mbuf->rx_descriptor_fields1,
			 pkt_mb4);

		rte_prefetch_non_temporal(rxdp + HNS3_DEFAULT_DESCS_PER_LOOP);

		parse_retcode = hns3_desc_parse_field(rxq, &sw_ring[pos],
						      &rxdp[offset],
						      bd_valid_num);
		if (unlikely(parse_retcode))
			(*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;

		rte_prefetch0(sw_ring[pos +
				      HNS3_DEFAULT_DESCS_PER_LOOP + 0].mbuf);
		rte_prefetch0(sw_ring[pos +
				      HNS3_DEFAULT_DESCS_PER_LOOP + 1].mbuf);
		rte_prefetch0(sw_ring[pos +
				      HNS3_DEFAULT_DESCS_PER_LOOP + 2].mbuf);
		rte_prefetch0(sw_ring[pos +
				      HNS3_DEFAULT_DESCS_PER_LOOP + 3].mbuf);

		nb_rx += bd_valid_num;
		if (bd_valid_num < HNS3_DEFAULT_DESCS_PER_LOOP)
			break;
	}

	rxq->rx_rearm_nb += nb_rx;
	rxq->next_to_use += nb_rx;
	if (rxq->next_to_use >= rxq->nb_rx_desc)
		rxq->next_to_use = 0;

	return nb_rx;
}
#endif /* _HNS3_RXTX_VEC_NEON_H_ */