/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include "ice_rxtx_vec_common.h"

#include <rte_vect.h>

static inline __m128i
ice_flex_rxd_to_fdir_flags_vec(const __m128i fdir_id0_3)
{
#define FDID_MIS_MAGIC 0xFFFFFFFF
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
	const __m128i pkt_fdir_bit = _mm_set1_epi32(RTE_MBUF_F_RX_FDIR |
			RTE_MBUF_F_RX_FDIR_ID);
	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
	const __m128i fdir_mis_mask = _mm_set1_epi32(FDID_MIS_MAGIC);
	__m128i fdir_mask = _mm_cmpeq_epi32(fdir_id0_3,
			fdir_mis_mask);
	/* XOR with all-ones inverts fdir_mask, so matched flow IDs keep the
	 * flag bits and mismatches are zeroed.
	 */
	fdir_mask = _mm_xor_si128(fdir_mask, fdir_mis_mask);
	const __m128i fdir_flags = _mm_and_si128(fdir_mask, pkt_fdir_bit);

	return fdir_flags;
}

static inline void
ice_rxq_rearm(struct ice_rx_queue *rxq)
{
	int i;
	uint16_t rx_id;
	volatile union ice_rx_flex_desc *rxdp;
	struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
	struct rte_mbuf *mb0, *mb1;
	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
			RTE_PKTMBUF_HEADROOM);
	__m128i dma_addr0, dma_addr1;

	rxdp = rxq->rx_ring + rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rxq->mp,
				 (void *)rxep,
				 ICE_RXQ_REARM_THRESH) < 0) {
		if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
		    rxq->nb_rx_desc) {
			dma_addr0 = _mm_setzero_si128();
			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
				rxep[i].mbuf = &rxq->fake_mbuf;
				_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i].read),
						dma_addr0);
			}
		}
		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
			ICE_RXQ_REARM_THRESH;
		return;
	}

	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
	for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
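		/* Each iteration: read the buf_addr/buf_iova pair from two
		 * mbufs, select the DMA-capable address (IOVA or VA depending
		 * on RTE_IOVA_IN_MBUF), add the headroom offset and write the
		 * two descriptor read-format entries back to the ring.
		 */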
		__m128i vaddr0, vaddr1;

		mb0 = rxep[0].mbuf;
		mb1 = rxep[1].mbuf;

#if RTE_IOVA_IN_MBUF
		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
				offsetof(struct rte_mbuf, buf_addr) + 8);
#endif
		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);

#if RTE_IOVA_IN_MBUF
		/* convert pa to dma_addr hdr/data */
		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
#else
		/* convert va to dma_addr hdr/data */
		dma_addr0 = _mm_unpacklo_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpacklo_epi64(vaddr1, vaddr1);
#endif

		/* add headroom to pa values */
		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);

		/* flush desc with pa dma_addr */
		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr0);
		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr1);
	}

	rxq->rxrearm_start += ICE_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= ICE_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
}

static inline void
ice_rx_desc_to_olflags_v(struct ice_rx_queue *rxq, __m128i descs[4],
			 struct rte_mbuf **rx_pkts)
{
	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;

	__m128i tmp_desc, flags, rss_vlan;

	/* mask everything except checksum, RSS and VLAN flags.
	 * bit6:4 for checksum.
	 * bit12 for RSS indication.
	 * bit13 for VLAN indication.
	 */
	const __m128i desc_mask = _mm_set_epi32(0x30f0, 0x30f0,
						0x30f0, 0x30f0);
	const __m128i cksum_mask = _mm_set_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
						 RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
						 RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
						 RTE_MBUF_F_RX_IP_CKSUM_MASK |
						 RTE_MBUF_F_RX_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
						 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);

	/* map the checksum, rss and vlan fields to the checksum, rss
	 * and vlan flag
	 */
	const __m128i cksum_flags =
		_mm_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		/**
		 * shift right 20 bits to use the low two bits to indicate
		 * outer checksum status
		 * shift right 1 bit to make sure it not exceed 255
		 */
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);

	const __m128i rss_vlan_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0);

	/* merge 4 descriptors */
	flags = _mm_unpackhi_epi32(descs[0], descs[1]);
	tmp_desc = _mm_unpackhi_epi32(descs[2], descs[3]);
	tmp_desc = _mm_unpacklo_epi64(flags, tmp_desc);
	tmp_desc = _mm_and_si128(tmp_desc, desc_mask);

	/* checksum flags */
	tmp_desc = _mm_srli_epi32(tmp_desc, 4);
	flags = _mm_shuffle_epi8(cksum_flags, tmp_desc);
	/* then we shift left 1 bit */
	flags = _mm_slli_epi32(flags, 1);

	__m128i l4_outer_mask = _mm_set_epi32(0x6, 0x6, 0x6, 0x6);
	__m128i l4_outer_flags = _mm_and_si128(flags, l4_outer_mask);
	l4_outer_flags = _mm_slli_epi32(l4_outer_flags, 20);

	__m128i l3_l4_mask = _mm_set_epi32(~0x6, ~0x6, ~0x6, ~0x6);
	__m128i l3_l4_flags = _mm_and_si128(flags, l3_l4_mask);
	flags = _mm_or_si128(l3_l4_flags, l4_outer_flags);
	/* we need to mask out the redundant bits introduced by RSS or
	 * VLAN fields.
	 */
	flags = _mm_and_si128(flags, cksum_mask);

	/* RSS, VLAN flag */
	tmp_desc = _mm_srli_epi32(tmp_desc, 8);
	rss_vlan = _mm_shuffle_epi8(rss_vlan_flags, tmp_desc);

	/* merge the flags */
	flags = _mm_or_si128(flags, rss_vlan);

	if (rxq->fdir_enabled) {
		const __m128i fdir_id0_1 =
			_mm_unpackhi_epi32(descs[0], descs[1]);

		const __m128i fdir_id2_3 =
			_mm_unpackhi_epi32(descs[2], descs[3]);

		const __m128i fdir_id0_3 =
			_mm_unpackhi_epi64(fdir_id0_1, fdir_id2_3);

		const __m128i fdir_flags =
			ice_flex_rxd_to_fdir_flags_vec(fdir_id0_3);

		/* merge with fdir_flags */
		flags = _mm_or_si128(flags, fdir_flags);

		/* write fdir_id to mbuf */
		rx_pkts[0]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 0);

		rx_pkts[1]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 1);

		rx_pkts[2]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 2);

		rx_pkts[3]->hash.fdir.hi =
			_mm_extract_epi32(fdir_id0_3, 3);
	} /* if() on fdir_enabled */

	/**
	 * At this point, we have the 4 sets of flags in the low 16-bits
	 * of each 32-bit value in flags.
	 * We want to extract these, and merge them with the mbuf init data
	 * so we can do a single 16-byte write to the mbuf to set the flags
	 * and all the other initialization fields. Extracting the
	 * appropriate flags means that we have to do a shift and blend for
	 * each mbuf before we do the write.
	 */
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(flags, 4), 0x30);

	/* write the rearm data and the olflags in one write */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
			 offsetof(struct rte_mbuf, rearm_data) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
			 RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
	_mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);
}

static inline void
ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
		       uint32_t *ptype_tbl)
{
	const __m128i ptype_mask = _mm_set_epi16(ICE_RX_FLEX_DESC_PTYPE_M, 0,
						 ICE_RX_FLEX_DESC_PTYPE_M, 0,
						 ICE_RX_FLEX_DESC_PTYPE_M, 0,
						 ICE_RX_FLEX_DESC_PTYPE_M, 0);
	__m128i ptype_01 = _mm_unpacklo_epi32(descs[0], descs[1]);
	__m128i ptype_23 = _mm_unpacklo_epi32(descs[2], descs[3]);
	__m128i ptype_all = _mm_unpacklo_epi64(ptype_01, ptype_23);

	ptype_all = _mm_and_si128(ptype_all, ptype_mask);

	rx_pkts[0]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 1)];
	rx_pkts[1]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 3)];
	rx_pkts[2]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 5)];
	rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 7)];
}

/**
 * vPMD raw receive routine; only accepts nb_pkts >= ICE_DESCS_PER_LOOP.
 *
 * Notice:
 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 * - nb_pkts is floor-aligned to a multiple of ICE_DESCS_PER_LOOP
 */
static inline uint16_t
_ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		       uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union ice_rx_flex_desc *rxdp;
	struct ice_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
	__m128i crc_adjust = _mm_set_epi16
		(0, 0, 0,       /* ignore non-length fields */
		 -rxq->crc_len, /* sub crc on data_len */
		 0,             /* ignore high-16bits of pkt_len */
		 -rxq->crc_len, /* sub crc on pkt_len */
		 0, 0           /* ignore pkt_type field */
		);
	const __m128i zero = _mm_setzero_si128();
	/* mask to shuffle from desc. to mbuf */
	const __m128i shuf_msk = _mm_set_epi8
		(0xFF, 0xFF,
		 0xFF, 0xFF,  /* rss hash parsed separately */
		 11, 10,      /* octet 10~11, 16 bits vlan_macip */
		 5, 4,        /* octet 4~5, 16 bits data_len */
		 0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		 5, 4,        /* octet 4~5, low 16 bits pkt_len */
		 0xFF, 0xFF,  /* pkt_type set as unknown */
		 0xFF, 0xFF   /* pkt_type set as unknown */
		);
	const __m128i eop_shuf_mask = _mm_set_epi8(0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0xFF, 0xFF,
						   0x04, 0x0C,
						   0x00, 0x08);

	/**
	 * compile-time check the above crc_adjust layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi16
	 * call above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);

	/* 4 packets DD mask */
	const __m128i dd_check = _mm_set_epi64x(0x0000000100000001LL,
						0x0000000100000001LL);
	/* 4 packets EOP mask */
	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
						 0x0000000200000002LL);

	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
		ice_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.status_error0 &
	      rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
		return 0;

	/**
	 * Compile-time verify the shuffle mask
	 * NOTE: some field positions already verified above, but duplicated
	 * here for completeness in case of future modifications.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packets in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */

	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
	     pos += ICE_DESCS_PER_LOOP,
	     rxdp += ICE_DESCS_PER_LOOP) {
		__m128i descs[ICE_DESCS_PER_LOOP];
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i staterr, sterr_tmp1, sterr_tmp2;
		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
		__m128i mbp1;
#if defined(RTE_ARCH_X86_64)
		__m128i mbp2;
#endif

		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load desc[3] */
		descs[3] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
		rte_compiler_barrier();

		/* B.2 copy 2 64 bit or 4 32 bit mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
		/* B.1 load 2 64 bit mbuf points */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]);
#endif

		/* A.1 load desc[2-0] */
		descs[2] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
		rte_compiler_barrier();
		descs[1] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
		rte_compiler_barrier();
		descs[0] = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, rxdp));

#if defined(RTE_ARCH_X86_64)
		/* B.2 copy 2 mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);
#endif

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb3 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb2 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb1 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb0 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		ice_rx_desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
		pkt_mb0 = _mm_add_epi16(pkt_mb0, crc_adjust);

#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
		/**
		 * needs to load 2nd 16B of each desc for RSS hash parsing,
		 * will cause performance drop to get into this context.
		 */
		if (rxq->vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
		    RTE_ETH_RX_OFFLOAD_RSS_HASH) {
			/* load bottom half of every 32B desc */
			const __m128i raw_desc_bh3 =
				_mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[3].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh2 =
				_mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[2].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh1 =
				_mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[1].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh0 =
				_mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[0].wb.status_error1));

			/**
			 * to shift the 32b RSS hash value to the
			 * highest 32b of each 128b before mask
			 */
			__m128i rss_hash3 = _mm_slli_epi64(raw_desc_bh3, 32);
			__m128i rss_hash2 = _mm_slli_epi64(raw_desc_bh2, 32);
			__m128i rss_hash1 = _mm_slli_epi64(raw_desc_bh1, 32);
			__m128i rss_hash0 = _mm_slli_epi64(raw_desc_bh0, 32);

			__m128i rss_hash_msk =
				_mm_set_epi32(0xFFFFFFFF, 0, 0, 0);

			rss_hash3 = _mm_and_si128(rss_hash3, rss_hash_msk);
			rss_hash2 = _mm_and_si128(rss_hash2, rss_hash_msk);
			rss_hash1 = _mm_and_si128(rss_hash1, rss_hash_msk);
			rss_hash0 = _mm_and_si128(rss_hash0, rss_hash_msk);

			pkt_mb3 = _mm_or_si128(pkt_mb3, rss_hash3);
			pkt_mb2 = _mm_or_si128(pkt_mb2, rss_hash2);
			pkt_mb1 = _mm_or_si128(pkt_mb1, rss_hash1);
			pkt_mb0 = _mm_or_si128(pkt_mb0, rss_hash0);
		} /* if() on RSS hash parsing */
#endif

		/* C.2 get 4 pkts staterr value */
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128
			((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
			 pkt_mb3);
		_mm_storeu_si128
			((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
			 pkt_mb2);

		/* C* extract and record EOP bit */
		if (split_packet) {
			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle.
			 * This also compresses the 32-bit values to 8-bit.
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += ICE_DESCS_PER_LOOP;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128
			((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb1);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				 pkt_mb0);
		ice_rx_desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc available number of desc */
		var = rte_popcount64(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != ICE_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}

/**
 * Notice:
 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
 *   numbers of DD bits
 */
uint16_t
ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		  uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}

/**
 * vPMD receive routine that reassembles single burst of 32 scattered packets
 *
 * Notice:
 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 */
static uint16_t
ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
			     uint16_t nb_pkts)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _ice_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
						  split_flags);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ice_rx_reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
					     &split_flags[i]);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 */
uint16_t
ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
			    uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > ICE_VPMD_RX_BURST) {
		uint16_t burst;

		burst = ice_recv_scattered_burst_vec(rx_queue,
						     rx_pkts + retval,
						     ICE_VPMD_RX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < ICE_VPMD_RX_BURST)
			return retval;
	}

	return retval + ice_recv_scattered_burst_vec(rx_queue,
						     rx_pkts + retval,
						     nb_pkts);
}

static inline void
ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
	 uint64_t flags)
{
	uint64_t high_qw =
		(ICE_TX_DESC_DTYPE_DATA |
		 ((uint64_t)flags << ICE_TXD_QW1_CMD_S) |
		 ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));

	__m128i descriptor = _mm_set_epi64x(high_qw, rte_pktmbuf_iova(pkt));
	_mm_store_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
}
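
/* Fill 'nb_pkts' consecutive data descriptors, one packet per descriptor,
 * all carrying the same command flags.
 */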
static inline void
ice_vtx(volatile struct ice_tx_desc *txdp, struct rte_mbuf **pkt,
	uint16_t nb_pkts, uint64_t flags)
{
	int i;

	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
		ice_vtx1(txdp, *pkt, flags);
}

static uint16_t
ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
			 uint16_t nb_pkts)
{
	struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;
	volatile struct ice_tx_desc *txdp;
	struct ice_tx_entry *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = ICE_TD_CMD;
	uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;
	int i;

	/* crossing the tx_rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

	if (txq->nb_tx_free < txq->tx_free_thresh)
		ice_tx_free_bufs_vec(txq);

	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
	nb_commit = nb_pkts;
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->tx_ring[tx_id];
	txep = &txq->sw_ring[tx_id];

	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		ice_tx_backlog_entry(txep, tx_pkts, n);

		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
			ice_vtx1(txdp, *tx_pkts, flags);

		ice_vtx1(txdp, *tx_pkts++, rs);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->tx_ring[tx_id];
		txep = &txq->sw_ring[tx_id];
	}

	ice_tx_backlog_entry(txep, tx_pkts, nb_commit);

	ice_vtx(txdp, tx_pkts, nb_commit, flags);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->tx_next_rs) {
		txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
					 ICE_TXD_QW1_CMD_S);
		txq->tx_next_rs =
			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
	}

	txq->tx_tail = tx_id;

	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

uint16_t
ice_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
		  uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = ice_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx], num);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}

int __rte_cold
ice_rxq_vec_setup(struct ice_rx_queue *rxq)
{
	if (!rxq)
		return -1;

	rxq->rx_rel_mbufs = _ice_rx_queue_release_mbufs_vec;
	return ice_rxq_vec_setup_default(rxq);
}

int __rte_cold
ice_txq_vec_setup(struct ice_tx_queue __rte_unused *txq)
{
	if (!txq)
		return -1;

	txq->tx_rel_mbufs = _ice_tx_queue_release_mbufs_vec;
	return 0;
}

int __rte_cold
ice_rx_vec_dev_check(struct rte_eth_dev *dev)
{
	return ice_rx_vec_dev_check_default(dev);
}

int __rte_cold
ice_tx_vec_dev_check(struct rte_eth_dev *dev)
{
	return ice_tx_vec_dev_check_default(dev);
}