/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2015-2024 Beijing WangXun Technology Co., Ltd.
 * Copyright(c) 2010-2015 Intel Corporation
 */

#include <ethdev_driver.h>
#include <rte_malloc.h>
#include <rte_vect.h>

#include "ngbe_type.h"
#include "ngbe_ethdev.h"
#include "ngbe_rxtx.h"
#include "ngbe_rxtx_vec_common.h"

static inline void
ngbe_rxq_rearm(struct ngbe_rx_queue *rxq)
{
	int i;
	uint16_t rx_id;
	volatile struct ngbe_rx_desc *rxdp;
	struct ngbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
	struct rte_mbuf *mb0, *mb1;
	uint64x2_t dma_addr0, dma_addr1;
	uint64x2_t zero = vdupq_n_u64(0);
	uint64_t paddr;
	uint8x8_t p;

	rxdp = rxq->rx_ring + rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (unlikely(rte_mempool_get_bulk(rxq->mb_pool,
					  (void *)rxep,
					  RTE_NGBE_RXQ_REARM_THRESH) < 0)) {
		if (rxq->rxrearm_nb + RTE_NGBE_RXQ_REARM_THRESH >=
		    rxq->nb_rx_desc) {
			for (i = 0; i < RTE_NGBE_DESCS_PER_LOOP; i++) {
				rxep[i].mbuf = &rxq->fake_mbuf;
				vst1q_u64(RTE_CAST_PTR(uint64_t *, &rxdp[i]), zero);
			}
		}
		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
			RTE_NGBE_RXQ_REARM_THRESH;
		return;
	}

	p = vld1_u8((uint8_t *)&rxq->mbuf_initializer);
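
	/* Note: 'p' holds the 8-byte rearm template prepared at queue setup
	 * (rxq->mbuf_initializer); storing it over each mbuf's rearm_data
	 * resets the per-packet metadata with a single 64-bit write.
	 */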
	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
	for (i = 0; i < RTE_NGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
		mb0 = rxep[0].mbuf;
		mb1 = rxep[1].mbuf;

		/*
		 * Flush mbuf with pkt template.
		 * Data to be rearmed is 6 bytes long.
		 */
		vst1_u8((uint8_t *)&mb0->rearm_data, p);
		paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
		dma_addr0 = vsetq_lane_u64(paddr, zero, 0);
		/* flush desc with pa dma_addr */
		vst1q_u64(RTE_CAST_PTR(uint64_t *, rxdp++), dma_addr0);

		vst1_u8((uint8_t *)&mb1->rearm_data, p);
		paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM;
		dma_addr1 = vsetq_lane_u64(paddr, zero, 0);
		vst1q_u64(RTE_CAST_PTR(uint64_t *, rxdp++), dma_addr1);
	}

	rxq->rxrearm_start += RTE_NGBE_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= RTE_NGBE_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	ngbe_set32(rxq->rdt_reg_addr, rx_id);
}

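/* Translate the status/error bytes of four descriptors into the ol_flags
 * of the corresponding mbufs (RSS hash, FDIR match, VLAN present, and
 * IP/L4 checksum results), using byte-shuffle lookup tables.
 */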
static inline void
desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2,
		  uint8x16_t staterr, uint8_t vlan_flags,
		  struct rte_mbuf **rx_pkts)
{
	uint8x16_t ptype;
	uint8x16_t vtag_lo, vtag_hi, vtag;
	uint8x16_t temp_csum, temp_vp;
	uint8x16_t vtag_mask = vdupq_n_u8(0x0F);
	uint32x4_t csum = {0, 0, 0, 0};

	union {
		uint16_t e[4];
		uint64_t word;
	} vol;

	const uint8x16_t rsstype_msk = {
			0x0F, 0x0F, 0x0F, 0x0F,
			0x00, 0x00, 0x00, 0x00,
			0x00, 0x00, 0x00, 0x00,
			0x00, 0x00, 0x00, 0x00};

	const uint8x16_t rss_flags = {
			0, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			0, RTE_MBUF_F_RX_RSS_HASH, 0, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, 0, 0, 0,
			0, 0, 0, RTE_MBUF_F_RX_FDIR};

	/* mask everything except vlan present and l4/ip csum error */
	const uint8x16_t vlan_csum_msk = {
			NGBE_RXD_STAT_VLAN, NGBE_RXD_STAT_VLAN,
			NGBE_RXD_STAT_VLAN, NGBE_RXD_STAT_VLAN,
			0, 0, 0, 0,
			0, 0, 0, 0,
			(NGBE_RXD_ERR_L4CS | NGBE_RXD_ERR_IPCS) >> 24,
			(NGBE_RXD_ERR_L4CS | NGBE_RXD_ERR_IPCS) >> 24,
			(NGBE_RXD_ERR_L4CS | NGBE_RXD_ERR_IPCS) >> 24,
			(NGBE_RXD_ERR_L4CS | NGBE_RXD_ERR_IPCS) >> 24};

	/* map vlan present and l4/ip csum error to ol_flags */
	const uint8x16_t vlan_csum_map_lo = {
			RTE_MBUF_F_RX_IP_CKSUM_GOOD,
			RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD,
			RTE_MBUF_F_RX_IP_CKSUM_BAD,
			RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD,
			0, 0, 0, 0,
			vlan_flags | RTE_MBUF_F_RX_IP_CKSUM_GOOD,
			vlan_flags | RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD,
			vlan_flags | RTE_MBUF_F_RX_IP_CKSUM_BAD,
			vlan_flags | RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD,
			0, 0, 0, 0};

	const uint8x16_t vlan_csum_map_hi = {
			RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,
			RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,
			RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,
			0, 0, 0, 0};

	ptype = vzipq_u8(sterr_tmp1.val[0], sterr_tmp2.val[0]).val[0];
	ptype = vandq_u8(ptype, rsstype_msk);
	ptype = vqtbl1q_u8(rss_flags, ptype);

	/* extract vlan_flags and csum_error from staterr */
	vtag = vandq_u8(staterr, vlan_csum_msk);

	/* The csum bits sit in the most significant bits of each byte; to
	 * use them as shuffle indices they must be shifted down. This
	 * changes the mask from 0xc0 to 0x03.
	 */
	temp_csum = vshrq_n_u8(vtag, 6);

	/* Change the vlan present mask from 0x20 to 0x08. */
	temp_vp = vshrq_n_u8(vtag, 2);
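
	/* Note: vqtbl1q_u8() selects table bytes by index, so every flag bit
	 * must land in the low nibble first; vtag_mask (0x0F) keeps the
	 * final lookup indices within the 16-entry tables above.
	 */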

	/* 'OR' the most significant 32 bits containing the checksum flags
	 * with the vlan present flags. The bit layout of each lane (8 bits)
	 * is then 'xxxx,VLAN,x,ERR_IPCS,ERR_L4CS'.
	 */
	csum = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u8(temp_csum), 3), csum, 0);
	vtag = vorrq_u8(vreinterpretq_u8_u32(csum), vtag);
	vtag = vorrq_u8(vtag, temp_vp);
	vtag = vandq_u8(vtag, vtag_mask);

	/* convert the L4 checksum validity to vtag_hi */
	vtag_hi = vqtbl1q_u8(vlan_csum_map_hi, vtag);
	vtag_hi = vshrq_n_u8(vtag_hi, 7);

	/* convert VP, IPE, L4E to vtag_lo */
	vtag_lo = vqtbl1q_u8(vlan_csum_map_lo, vtag);
	vtag_lo = vorrq_u8(ptype, vtag_lo);

	vtag = vzipq_u8(vtag_lo, vtag_hi).val[0];
	vol.word = vgetq_lane_u64(vreinterpretq_u64_u8(vtag), 0);

	rx_pkts[0]->ol_flags = vol.e[0];
	rx_pkts[1]->ol_flags = vol.e[1];
	rx_pkts[2]->ol_flags = vol.e[2];
	rx_pkts[3]->ol_flags = vol.e[3];
}

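/* In the Rx loop, one status byte per descriptor is packed into a 32-bit
 * word; 0x02 is the end-of-packet (EOP) status bit, so this mask selects
 * the EOP bit of four descriptors at once.
 */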
#define NGBE_VPMD_DESC_EOP_MASK	0x02020202
#define NGBE_UINT8_BIT		(CHAR_BIT * sizeof(uint8_t))

static inline void
desc_to_ptype_v(uint64x2_t descs[4], uint16_t pkt_type_mask,
		struct rte_mbuf **rx_pkts)
{
	uint32x4_t ptype_mask = vdupq_n_u32((uint32_t)pkt_type_mask);
	uint32x4_t ptype0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
				vreinterpretq_u32_u64(descs[2])).val[0];
	uint32x4_t ptype1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),
				vreinterpretq_u32_u64(descs[3])).val[0];

	/* interleave low 32 bits,
	 * now we have 4 ptypes in a NEON register
	 */
	ptype0 = vzipq_u32(ptype0, ptype1).val[0];

	/* shift right by NGBE_RXD_PTID_SHIFT, and apply ptype mask */
	ptype0 = vandq_u32(vshrq_n_u32(ptype0, NGBE_RXD_PTID_SHIFT), ptype_mask);

	rx_pkts[0]->packet_type = ngbe_decode_ptype(vgetq_lane_u32(ptype0, 0));
	rx_pkts[1]->packet_type = ngbe_decode_ptype(vgetq_lane_u32(ptype0, 1));
	rx_pkts[2]->packet_type = ngbe_decode_ptype(vgetq_lane_u32(ptype0, 2));
	rx_pkts[3]->packet_type = ngbe_decode_ptype(vgetq_lane_u32(ptype0, 3));
}

/**
 * vPMD raw receive routine; only accepts nb_pkts >= RTE_NGBE_DESCS_PER_LOOP.
 *
 * Notice:
 * - returns no packets if nb_pkts < RTE_NGBE_DESCS_PER_LOOP
 * - nb_pkts is floor-aligned to a multiple of RTE_NGBE_DESCS_PER_LOOP
 */
static inline uint16_t
_recv_raw_pkts_vec(struct ngbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile struct ngbe_rx_desc *rxdp;
	struct ngbe_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint8x16_t shuf_msk = {
		0xFF, 0xFF,
		0xFF, 0xFF,  /* skip 32 bits pkt_type */
		12, 13,      /* octet 12~13, low 16 bits pkt_len */
		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		12, 13,      /* octet 12~13, 16 bits data_len */
		14, 15,      /* octet 14~15, low 16 bits vlan_macip */
		4, 5, 6, 7   /* octet 4~7, 32 bits rss */
		};
	uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0,
				 rxq->crc_len, 0, 0, 0};
	uint8_t vlan_flags;
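
	/* Note: shuf_msk rearranges each 16-byte descriptor into the layout
	 * of mbuf->rx_descriptor_fields1 (pkt_len, data_len, vlan_tci, RSS
	 * hash), so a single vst1q_u8() per packet fills all these fields;
	 * crc_adjust then subtracts the CRC length from both length lanes.
	 */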

	/* nb_pkts has to be floor-aligned to RTE_NGBE_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_NGBE_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch_non_temporal(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_NGBE_RXQ_REARM_THRESH)
		ngbe_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->qw1.lo.status & rte_cpu_to_le_32(NGBE_RXD_STAT_DD)))
		return 0;

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* ensure these 2 flags are in the lower 8 bits */
	RTE_BUILD_BUG_ON((RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED) > UINT8_MAX);
	vlan_flags = rxq->vlan_flags & UINT8_MAX;

	/* A. load 4 packet descriptors in one loop
	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill mbuf fields from the descriptors
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_NGBE_DESCS_PER_LOOP,
			rxdp += RTE_NGBE_DESCS_PER_LOOP) {
		uint64x2_t descs[RTE_NGBE_DESCS_PER_LOOP];
		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		uint8x16x2_t sterr_tmp1, sterr_tmp2;
		uint64x2_t mbp1, mbp2;
		uint8x16_t staterr;
		uint16x8_t tmp;
		uint32_t stat;

		/* B.1 load 2 mbuf pointers */
		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);

		/* B.2 copy 2 mbuf pointers into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

		/* B.1 load 2 mbuf pointers */
		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

		/* A. load 4 pkts descs */
		descs[0] = vld1q_u64((uint64_t *)(uintptr_t)(rxdp));
		descs[1] = vld1q_u64((uint64_t *)(uintptr_t)(rxdp + 1));
		descs[2] = vld1q_u64((uint64_t *)(uintptr_t)(rxdp + 2));
		descs[3] = vld1q_u64((uint64_t *)(uintptr_t)(rxdp + 3));

		/* B.2 copy 2 mbuf pointers into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = vzipq_u8(vreinterpretq_u8_u64(descs[1]),
				      vreinterpretq_u8_u64(descs[3]));
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = vzipq_u8(vreinterpretq_u8_u64(descs[0]),
				      vreinterpretq_u8_u64(descs[2]));

		/* C.2 get 4 pkts staterr value */
		staterr = vzipq_u8(sterr_tmp1.val[1], sterr_tmp2.val[1]).val[0];

		/* set ol_flags with vlan packet type */
		desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr, vlan_flags,
				  &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
		pkt_mb4 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
		pkt_mb3 = vreinterpretq_u8_u16(tmp);

		/* D.3 copy final 3,4 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
			 pkt_mb4);
		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
			 pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
		pkt_mb2 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
		pkt_mb1 = vreinterpretq_u8_u16(tmp);

		/* C* extract and record EOP bit */
		if (split_packet) {
			stat = vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);
			/* and with mask to extract bits, flipping 1-0 */
			*(int *)split_packet = ~stat & NGBE_VPMD_DESC_EOP_MASK;

			split_packet += RTE_NGBE_DESCS_PER_LOOP;
		}
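
		/* Note: the C.4 step below replicates each packet's DD bit
		 * across its whole status byte (0xFF if done, 0x00 if not);
		 * after inverting the packed word, rte_ctz32() then counts
		 * 8 trailing zero bits per completed descriptor.
		 */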

		/* C.4 expand DD bit to saturate UINT8 */
		staterr = vshlq_n_u8(staterr, NGBE_UINT8_BIT - 1);
		staterr = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(staterr),
					NGBE_UINT8_BIT - 1));
		stat = ~vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);

		rte_prefetch_non_temporal(rxdp + RTE_NGBE_DESCS_PER_LOOP);

		/* D.3 copy final 1,2 data to rx_pkts */
		vst1q_u8((uint8_t *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb2);
		vst1q_u8((uint8_t *)&rx_pkts[pos]->rx_descriptor_fields1,
			 pkt_mb1);

		desc_to_ptype_v(descs, NGBE_PTID_MASK, &rx_pkts[pos]);

		/* C.5 calc available number of desc */
		if (unlikely(stat == 0)) {
			nb_pkts_recd += RTE_NGBE_DESCS_PER_LOOP;
		} else {
			nb_pkts_recd += rte_ctz32(stat) / NGBE_UINT8_BIT;
			break;
		}
	}
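
	/* Note: the mask-based wrap below assumes nb_rx_desc is a power of
	 * two.
	 */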

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}

/**
 * vPMD receive routine; only accepts nb_pkts >= RTE_NGBE_DESCS_PER_LOOP.
 *
 * Notice:
 * - returns no packets if nb_pkts < RTE_NGBE_DESCS_PER_LOOP
 * - nb_pkts is floor-aligned to a multiple of RTE_NGBE_DESCS_PER_LOOP
 */
uint16_t
ngbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts)
{
	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}

/**
 * vPMD receive routine that reassembles scattered packets
 *
 * Notice:
 * - returns no packets if nb_pkts < RTE_NGBE_DESCS_PER_LOOP
 * - nb_pkts is floor-aligned to a multiple of RTE_NGBE_DESCS_PER_LOOP
 */
static uint16_t
ngbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
			      uint16_t nb_pkts)
{
	struct ngbe_rx_queue *rxq = rx_queue;
	uint8_t split_flags[RTE_NGBE_MAX_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
			split_flags);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;
	if (rxq->pkt_first_seg == NULL &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;
	if (rxq->pkt_first_seg == NULL) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
		&split_flags[i]);
}

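/* The public scattered-Rx entry point below feeds the burst routine in
 * chunks of at most RTE_NGBE_MAX_RX_BURST packets, matching the size of
 * the per-burst split_flags[] array above.
 */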
451e94c20c3SJiawen Wu */ 452e94c20c3SJiawen Wu uint16_t 453e94c20c3SJiawen Wu ngbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, 454e94c20c3SJiawen Wu uint16_t nb_pkts) 455e94c20c3SJiawen Wu { 456e94c20c3SJiawen Wu uint16_t retval = 0; 457e94c20c3SJiawen Wu 458e94c20c3SJiawen Wu while (nb_pkts > RTE_NGBE_MAX_RX_BURST) { 459e94c20c3SJiawen Wu uint16_t burst; 460e94c20c3SJiawen Wu 461e94c20c3SJiawen Wu burst = ngbe_recv_scattered_burst_vec(rx_queue, 462e94c20c3SJiawen Wu rx_pkts + retval, 463e94c20c3SJiawen Wu RTE_NGBE_MAX_RX_BURST); 464e94c20c3SJiawen Wu retval += burst; 465e94c20c3SJiawen Wu nb_pkts -= burst; 466e94c20c3SJiawen Wu if (burst < RTE_NGBE_MAX_RX_BURST) 467e94c20c3SJiawen Wu return retval; 468e94c20c3SJiawen Wu } 469e94c20c3SJiawen Wu 470e94c20c3SJiawen Wu return retval + ngbe_recv_scattered_burst_vec(rx_queue, 471e94c20c3SJiawen Wu rx_pkts + retval, 472e94c20c3SJiawen Wu nb_pkts); 473e94c20c3SJiawen Wu } 474e94c20c3SJiawen Wu 475e94c20c3SJiawen Wu static inline void 476e94c20c3SJiawen Wu vtx1(volatile struct ngbe_tx_desc *txdp, 477e94c20c3SJiawen Wu struct rte_mbuf *pkt, uint64_t flags) 478e94c20c3SJiawen Wu { 4798d75bf03SJiawen Wu uint16_t pkt_len = pkt->data_len; 4808d75bf03SJiawen Wu 4818d75bf03SJiawen Wu if (pkt_len < RTE_ETHER_HDR_LEN) 4828d75bf03SJiawen Wu pkt_len = NGBE_FRAME_SIZE_DFT; 4838d75bf03SJiawen Wu 4848d75bf03SJiawen Wu uint64x2_t descriptor = {pkt->buf_iova + pkt->data_off, 4858d75bf03SJiawen Wu (uint64_t)pkt_len << 45 | flags | pkt_len}; 486e94c20c3SJiawen Wu 487*43fd3624SAndre Muezerie vst1q_u64(RTE_CAST_PTR(uint64_t *, txdp), descriptor); 488e94c20c3SJiawen Wu } 489e94c20c3SJiawen Wu 490e94c20c3SJiawen Wu static inline void 491e94c20c3SJiawen Wu vtx(volatile struct ngbe_tx_desc *txdp, 492e94c20c3SJiawen Wu struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags) 493e94c20c3SJiawen Wu { 494e94c20c3SJiawen Wu int i; 495e94c20c3SJiawen Wu 496e94c20c3SJiawen Wu for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt) 497e94c20c3SJiawen Wu vtx1(txdp, *pkt, flags); 498e94c20c3SJiawen Wu } 499e94c20c3SJiawen Wu 500e94c20c3SJiawen Wu uint16_t 501e94c20c3SJiawen Wu ngbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, 502e94c20c3SJiawen Wu uint16_t nb_pkts) 503e94c20c3SJiawen Wu { 504e94c20c3SJiawen Wu struct ngbe_tx_queue *txq = (struct ngbe_tx_queue *)tx_queue; 505e94c20c3SJiawen Wu volatile struct ngbe_tx_desc *txdp; 506e94c20c3SJiawen Wu struct ngbe_tx_entry_v *txep; 507e94c20c3SJiawen Wu uint16_t n, nb_commit, tx_id; 508e94c20c3SJiawen Wu uint64_t flags = NGBE_TXD_FLAGS; 509e94c20c3SJiawen Wu uint64_t rs = NGBE_TXD_FLAGS; 510e94c20c3SJiawen Wu int i; 511e94c20c3SJiawen Wu 512e94c20c3SJiawen Wu /* cross rx_thresh boundary is not allowed */ 513e94c20c3SJiawen Wu nb_pkts = RTE_MIN(nb_pkts, txq->tx_free_thresh); 514e94c20c3SJiawen Wu 515e94c20c3SJiawen Wu if (txq->nb_tx_free < txq->tx_free_thresh) 516e94c20c3SJiawen Wu ngbe_tx_free_bufs(txq); 517e94c20c3SJiawen Wu 518e94c20c3SJiawen Wu nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts); 519e94c20c3SJiawen Wu if (unlikely(nb_pkts == 0)) 520e94c20c3SJiawen Wu return 0; 521e94c20c3SJiawen Wu 522e94c20c3SJiawen Wu tx_id = txq->tx_tail; 523e94c20c3SJiawen Wu txdp = &txq->tx_ring[tx_id]; 524e94c20c3SJiawen Wu txep = &txq->sw_ring_v[tx_id]; 525e94c20c3SJiawen Wu 526e94c20c3SJiawen Wu txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts); 527e94c20c3SJiawen Wu 528e94c20c3SJiawen Wu n = (uint16_t)(txq->nb_tx_desc - tx_id); 529e94c20c3SJiawen Wu nb_commit = nb_pkts; 530e94c20c3SJiawen Wu if 
	if (nb_commit >= n) {
		tx_backlog_entry(txep, tx_pkts, n);

		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
			vtx1(txdp, *tx_pkts, flags);

		vtx1(txdp, *tx_pkts++, rs);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;

		/* avoid reaching the end of the ring */
		txdp = &txq->tx_ring[tx_id];
		txep = &txq->sw_ring_v[tx_id];
	}

	tx_backlog_entry(txep, tx_pkts, nb_commit);

	vtx(txdp, tx_pkts, nb_commit, flags);

	tx_id = (uint16_t)(tx_id + nb_commit);

	txq->tx_tail = tx_id;

	ngbe_set32(txq->tdt_reg_addr, txq->tx_tail);

	return nb_pkts;
}

static void __rte_cold
ngbe_tx_queue_release_mbufs_vec(struct ngbe_tx_queue *txq)
{
	_ngbe_tx_queue_release_mbufs_vec(txq);
}

void __rte_cold
ngbe_rx_queue_release_mbufs_vec(struct ngbe_rx_queue *rxq)
{
	_ngbe_rx_queue_release_mbufs_vec(rxq);
}

static void __rte_cold
ngbe_tx_free_swring(struct ngbe_tx_queue *txq)
{
	_ngbe_tx_free_swring_vec(txq);
}

static void __rte_cold
ngbe_reset_tx_queue(struct ngbe_tx_queue *txq)
{
	_ngbe_reset_tx_queue_vec(txq);
}

static const struct ngbe_txq_ops vec_txq_ops = {
	.release_mbufs = ngbe_tx_queue_release_mbufs_vec,
	.free_swring = ngbe_tx_free_swring,
	.reset = ngbe_reset_tx_queue,
};

int __rte_cold
ngbe_rxq_vec_setup(struct ngbe_rx_queue *rxq)
{
	return ngbe_rxq_vec_setup_default(rxq);
}

int __rte_cold
ngbe_txq_vec_setup(struct ngbe_tx_queue *txq)
{
	return ngbe_txq_vec_setup_default(txq, &vec_txq_ops);
}

int __rte_cold
ngbe_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)
{
	return ngbe_rx_vec_dev_conf_condition_check_default(dev);
}