/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include "ice_rxtx_vec_common.h"
#include "ice_rxtx_common_avx.h"

#include <rte_vect.h>

#define ICE_DESCS_PER_LOOP_AVX 8

static __rte_always_inline void
ice_rxq_rearm(struct ice_rx_queue *rxq)
{
	ice_rxq_rearm_common(rxq, true);
}

static inline __m256i
ice_flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
{
#define FDID_MIS_MAGIC 0xFFFFFFFF
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
	const __m256i pkt_fdir_bit = _mm256_set1_epi32(RTE_MBUF_F_RX_FDIR |
			RTE_MBUF_F_RX_FDIR_ID);
	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
	const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
	__m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
			fdir_mis_mask);
	/* this XOR inverts fdir_mask: lanes with a valid FDIR ID become all-ones */
	fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
	const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);

	return fdir_flags;
}

static __rte_always_inline uint16_t
_ice_recv_raw_pkts_vec_avx512(struct ice_rx_queue *rxq,
			      struct rte_mbuf **rx_pkts,
			      uint16_t nb_pkts,
			      uint8_t *split_packet,
			      bool do_offload)
{
	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
			0, rxq->mbuf_initializer);
	struct ice_rx_entry *sw_ring = &rxq->sw_ring[rxq->rx_tail];
	volatile union ice_rx_flex_desc *rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
		ice_rxq_rearm(rxq);
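
	/* Each iteration of the main loop below handles ICE_DESCS_PER_LOOP_AVX
	 * (8) descriptors: the first 16 bytes of each write-back descriptor
	 * are loaded and combined into two 512-bit registers of four
	 * descriptors each.
	 */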

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.status_error0 &
			rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
		return 0;

	/* constants used in processing loop */
	const __m512i crc_adjust =
		_mm512_set4_epi32
			(0,             /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0              /* ignore non-length fields */
			);

	/* 8 packets DD mask, LSB in each 32-bit value */
	const __m256i dd_check = _mm256_set1_epi32(1);

	/* 8 packets EOP mask, second-LSB in each 32-bit value */
	const __m256i eop_check = _mm256_slli_epi32(dd_check,
			ICE_RX_DESC_STATUS_EOF_S);

	/* mask to shuffle from desc. to mbuf (4 descriptors) */
	const __m512i shuf_msk =
		_mm512_set4_epi32
			(/* rss hash parsed separately */
			 0xFFFFFFFF,
			 /* octet 10~11, 16 bits vlan_macip */
			 /* octet 4~5, 16 bits data_len */
			 11 << 24 | 10 << 16 | 5 << 8 | 4,
			 /* skip hi 16 bits pkt_len, zero out */
			 /* octet 4~5, 16 bits pkt_len */
			 0xFFFF << 16 | 5 << 8 | 4,
			 /* pkt_type set as unknown */
			 0xFFFFFFFF
			);

	/**
	 * compile-time check the above crc and shuffle layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi
	 * calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
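
	/* The offload handling below is table driven: masked status bits are
	 * shifted down and used as per-byte indices into 256-bit shuffle
	 * tables (a 16-entry table per 128-bit lane), so a single
	 * _mm256_shuffle_epi8 looks up candidate ol_flags for all 8 packets.
	 * Table entries are pre-shifted right by one so each fits in a byte.
	 */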

	/* following code block is for Rx Checksum Offload */
	/* Status/Error flag masks */
	/**
	 * mask everything except Checksum Reports, RSS indication
	 * and VLAN indication.
	 * bit6:4 for IP/L4 checksum errors.
	 * bit12 is for RSS indication.
	 * bit13 is for VLAN indication.
	 */
	const __m256i flags_mask =
		_mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13));
	/**
	 * data to be shuffled by the result of the flags mask shifted by 4
	 * bits. This gives us the l3_l4 flags.
	 */
	const __m256i l3_l4_flags_shuf =
		_mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		  RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		/**
		 * second 128-bits
		 * shift right 20 bits to use the low two bits to indicate
		 * outer checksum status
		 * shift right 1 bit to make sure it does not exceed 255
		 */
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
	const __m256i cksum_mask =
		 _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
				   RTE_MBUF_F_RX_L4_CKSUM_MASK |
				   RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
				   RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK);
	/**
	 * data to be shuffled by result of flag mask, shifted down 12.
	 * If RSS(bit12)/VLAN(bit13) are set,
	 * shuffle moves appropriate flags in place.
	 */
	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0,
			/* 2nd 128-bits */
			0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0);

	uint16_t i, received;

	for (i = 0, received = 0; i < nb_pkts;
	     i += ICE_DESCS_PER_LOOP_AVX,
	     rxdp += ICE_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				    _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		_mm256_storeu_si256
			((void *)&rx_pkts[i + 4],
			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

		__m512i raw_desc0_3, raw_desc4_7;
		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;

		/* load in descriptors, in reverse order */
		const __m128i raw_desc7 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7));
		rte_compiler_barrier();
		const __m128i raw_desc6 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6));
		rte_compiler_barrier();
		const __m128i raw_desc5 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5));
		rte_compiler_barrier();
		const __m128i raw_desc4 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4));
		rte_compiler_barrier();
		const __m128i raw_desc3 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
		rte_compiler_barrier();
		const __m128i raw_desc2 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
		rte_compiler_barrier();
		const __m128i raw_desc1 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
		rte_compiler_barrier();
		const __m128i raw_desc0 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0));

		raw_desc6_7 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc6),
				 raw_desc7, 1);
		raw_desc4_5 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc4),
				 raw_desc5, 1);
		raw_desc2_3 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc2),
				 raw_desc3, 1);
		raw_desc0_1 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc0),
				 raw_desc1, 1);

		raw_desc4_7 =
			_mm512_inserti64x4
				(_mm512_castsi256_si512(raw_desc4_5),
				 raw_desc6_7, 1);
		raw_desc0_3 =
			_mm512_inserti64x4
				(_mm512_castsi256_si512(raw_desc0_1),
				 raw_desc2_3, 1);
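
		/* raw_desc0_3 and raw_desc4_7 now each hold four 16-byte
		 * descriptors, with the lowest-numbered descriptor in the
		 * least-significant 128-bit lane.
		 */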

		if (split_packet) {
			int j;

			for (j = 0; j < ICE_DESCS_PER_LOOP_AVX; j++)
				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
		}

		/**
		 * convert descriptors 0-7 into mbufs, re-arrange fields.
		 * Then write into the mbuf.
		 */
		__m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
		__m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);

		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);

		/**
		 * to get packet types, ptype is located in bit16-25
		 * of each 128bits
		 */
		const __m512i ptype_mask =
			_mm512_set1_epi16(ICE_RX_FLEX_DESC_PTYPE_M);

		const __m512i ptypes4_7 =
			_mm512_and_si512(raw_desc4_7, ptype_mask);
		const __m512i ptypes0_3 =
			_mm512_and_si512(raw_desc0_3, ptype_mask);

		const __m256i ptypes6_7 =
			_mm512_extracti64x4_epi64(ptypes4_7, 1);
		const __m256i ptypes4_5 =
			_mm512_extracti64x4_epi64(ptypes4_7, 0);
		const __m256i ptypes2_3 =
			_mm512_extracti64x4_epi64(ptypes0_3, 1);
		const __m256i ptypes0_1 =
			_mm512_extracti64x4_epi64(ptypes0_3, 0);
		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);

		const __m512i ptype4_7 = _mm512_set_epi32
			(0, 0, 0, ptype_tbl[ptype7],
			 0, 0, 0, ptype_tbl[ptype6],
			 0, 0, 0, ptype_tbl[ptype5],
			 0, 0, 0, ptype_tbl[ptype4]);
		const __m512i ptype0_3 = _mm512_set_epi32
			(0, 0, 0, ptype_tbl[ptype3],
			 0, 0, 0, ptype_tbl[ptype2],
			 0, 0, 0, ptype_tbl[ptype1],
			 0, 0, 0, ptype_tbl[ptype0]);

		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
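
		/* Each 512-bit result now holds rx_descriptor_fields1 for four
		 * packets (one per 128-bit lane), with the looked-up ptype
		 * blended into dword 0 of each lane. Split into 256-bit halves
		 * (one packet pair each) for the per-mbuf merge below.
		 */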

		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
		__m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);

		/**
		 * use permute/extract to get status content
		 * After the operations, the packets status flags are in the
		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
		 */
		/* merge the status bits into one register */
		const __m512i status_permute_msk = _mm512_set_epi32
			(0, 0, 0, 0,
			 0, 0, 0, 0,
			 22, 30, 6, 14,
			 18, 26, 2, 10);
		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
			(raw_desc4_7, status_permute_msk, raw_desc0_3);
		__m256i status0_7 = _mm512_extracti64x4_epi64
			(raw_status0_7, 0);

		__m256i mbuf_flags = _mm256_set1_epi32(0);

		if (do_offload) {
			/* now do flag manipulation */

			/* get only flag/error bits we want */
			const __m256i flag_bits =
				_mm256_and_si256(status0_7, flags_mask);
			/**
			 * l3_l4_error flags, shuffle, then shift to correct adjustment
			 * of flags in flags_shuf, and finally mask out extra bits
			 */
			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
					_mm256_srli_epi32(flag_bits, 4));
			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
			__m256i l4_outer_mask = _mm256_set1_epi32(0x6);
			__m256i l4_outer_flags =
				_mm256_and_si256(l3_l4_flags, l4_outer_mask);
			l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);

			__m256i l3_l4_mask = _mm256_set1_epi32(~0x6);

			l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);
			l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);
			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
			/* set rss and vlan flags */
			const __m256i rss_vlan_flag_bits =
				_mm256_srli_epi32(flag_bits, 12);
			const __m256i rss_vlan_flags =
				_mm256_shuffle_epi8(rss_vlan_flags_shuf,
						    rss_vlan_flag_bits);

			/* merge flags */
			mbuf_flags = _mm256_or_si256(l3_l4_flags,
						     rss_vlan_flags);
		}
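
		/* Flow Director: the 32-bit FDIR ID is dword 3 of each 16-byte
		 * descriptor. The two unpack steps below gather the eight IDs
		 * into one register ordered {6,4,2,0,7,5,3,1} (lo->hi), which
		 * is why the scalar extracts use those lane indices.
		 */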

		if (rxq->fdir_enabled) {
			const __m256i fdir_id4_7 =
				_mm256_unpackhi_epi32(raw_desc6_7, raw_desc4_5);

			const __m256i fdir_id0_3 =
				_mm256_unpackhi_epi32(raw_desc2_3, raw_desc0_1);

			const __m256i fdir_id0_7 =
				_mm256_unpackhi_epi64(fdir_id4_7, fdir_id0_3);

			if (do_offload) {
				const __m256i fdir_flags =
					ice_flex_rxd_to_fdir_flags_vec_avx512
						(fdir_id0_7);

				/* merge with fdir_flags */
				mbuf_flags = _mm256_or_si256
						(mbuf_flags, fdir_flags);
			} else {
				mbuf_flags =
					ice_flex_rxd_to_fdir_flags_vec_avx512
						(fdir_id0_7);
			}

			/* write to mbuf: have to use scalar store here */
			rx_pkts[i + 0]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 3);

			rx_pkts[i + 1]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 7);

			rx_pkts[i + 2]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 2);

			rx_pkts[i + 3]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 6);

			rx_pkts[i + 4]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 1);

			rx_pkts[i + 5]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 5);

			rx_pkts[i + 6]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 0);

			rx_pkts[i + 7]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 4);
		} /* if() on fdir_enabled */

		if (do_offload) {
#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
			/**
			 * needs to load 2nd 16B of each desc for RSS hash parsing,
			 * will cause performance drop to get into this context.
			 */
			if (rxq->vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
					RTE_ETH_RX_OFFLOAD_RSS_HASH) {
				/* load bottom half of every 32B desc */
				const __m128i raw_desc_bh7 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[7].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh6 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[6].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh5 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[5].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh4 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[4].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh3 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[3].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh2 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[2].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh1 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[1].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh0 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[0].wb.status_error1));

				__m256i raw_desc_bh6_7 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh6),
						 raw_desc_bh7, 1);
				__m256i raw_desc_bh4_5 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh4),
						 raw_desc_bh5, 1);
				__m256i raw_desc_bh2_3 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh2),
						 raw_desc_bh3, 1);
				__m256i raw_desc_bh0_1 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh0),
						 raw_desc_bh1, 1);

				/**
				 * to shift the 32b RSS hash value to the
				 * highest 32b of each 128b before mask
				 */
				__m256i rss_hash6_7 =
					_mm256_slli_epi64(raw_desc_bh6_7, 32);
				__m256i rss_hash4_5 =
					_mm256_slli_epi64(raw_desc_bh4_5, 32);
				__m256i rss_hash2_3 =
					_mm256_slli_epi64(raw_desc_bh2_3, 32);
				__m256i rss_hash0_1 =
					_mm256_slli_epi64(raw_desc_bh0_1, 32);

				__m256i rss_hash_msk =
					_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
							 0xFFFFFFFF, 0, 0, 0);

				rss_hash6_7 = _mm256_and_si256
						(rss_hash6_7, rss_hash_msk);
				rss_hash4_5 = _mm256_and_si256
						(rss_hash4_5, rss_hash_msk);
				rss_hash2_3 = _mm256_and_si256
						(rss_hash2_3, rss_hash_msk);
				rss_hash0_1 = _mm256_and_si256
						(rss_hash0_1, rss_hash_msk);

				mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
				mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
				mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
				mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
			} /* if() on RSS hash parsing */
#endif
		}

		/**
		 * At this point, we have the 8 sets of flags in the low 16-bits
		 * of each 32-bit value in vlan0.
		 * We want to extract these, and merge them with the mbuf init
		 * data so we can do a single write to the mbuf to set the flags
		 * and all the other initialization fields. Extracting the
		 * appropriate flags means that we have to do a shift and blend
		 * for each mbuf before we do the write. However, we can also
		 * add in the previously computed rx_descriptor fields to
		 * make a single 256-bit write per mbuf
		 */
		/* check the structure matches expectations */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
				offsetof(struct rte_mbuf, rearm_data) + 8);
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
				RTE_ALIGN(offsetof(struct rte_mbuf,
						   rearm_data),
					  16));
		/* build up data and do writes */
		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
			rearm6, rearm7;

		rearm6 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 8),
					    0x04);
		rearm4 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 4),
					    0x04);
		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
		rearm0 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(mbuf_flags, 4),
					    0x04);
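
		/* mbuf_flags carries the flag words in lane order {6,4,2,0}
		 * (low 128 bits) and {7,5,3,1} (high). ol_flags sits at byte 8
		 * of rearm_data, i.e. dword 2 of the 256-bit rearm write, so
		 * blend immediate 0x04 takes just that dword from a suitably
		 * shifted copy of mbuf_flags.
		 */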

		/* permute to add in the rx_descriptor e.g. rss fields */
		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);

		/* write to mbuf */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
				    rearm6);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
				    rearm4);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
				    rearm2);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
				    rearm0);

		/* repeat for the odd mbufs */
		const __m256i odd_flags =
			_mm256_castsi128_si256
				(_mm256_extracti128_si256(mbuf_flags, 1));
		rearm7 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 8),
					    0x04);
		rearm5 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 4),
					    0x04);
		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
		rearm1 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(odd_flags, 4),
					    0x04);

		/* since odd mbufs are already in hi 128-bits use blend */
		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
		/* again write to mbufs */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
				    rearm7);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
				    rearm5);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
				    rearm3);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
				    rearm1);

		/* extract and record EOP bit */
		if (split_packet) {
			const __m128i eop_mask =
				_mm_set1_epi16(1 << ICE_RX_DESC_STATUS_EOF_S);
			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
								     eop_check);
			/* pack status bits into a single 128-bit register */
			const __m128i eop_bits =
				_mm_packus_epi32
					(_mm256_castsi256_si128(eop_bits256),
					 _mm256_extractf128_si256(eop_bits256,
								  1));
			/**
			 * flip bits, and mask out the EOP bit, which is now
			 * a split-packet bit i.e. !EOP, rather than EOP one.
			 */
			__m128i split_bits = _mm_andnot_si128(eop_bits,
							      eop_mask);
			/**
			 * eop bits are out of order, so we need to shuffle them
			 * back into order again. In doing so, only use low 8
			 * bits, which acts like another pack instruction
			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
			 * [Since we use epi8, the 16-bit positions are
			 * multiplied by 2 in the eop_shuffle value.]
			 */
			__m128i eop_shuffle =
				_mm_set_epi8(/* zero hi 64b */
					     0xFF, 0xFF, 0xFF, 0xFF,
					     0xFF, 0xFF, 0xFF, 0xFF,
					     /* move values to lo 64b */
					     8, 0, 10, 2,
					     12, 4, 14, 6);
			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
			*(uint64_t *)split_packet =
				_mm_cvtsi128_si64(split_bits);
			split_packet += ICE_DESCS_PER_LOOP_AVX;
		}

		/* perform dd_check */
		status0_7 = _mm256_and_si256(status0_7, dd_check);
		status0_7 = _mm256_packs_epi32(status0_7,
					       _mm256_setzero_si256());

		uint64_t burst = rte_popcount64
					(_mm_cvtsi128_si64
						(_mm256_extracti128_si256
							(status0_7, 1)));
		burst += rte_popcount64
				(_mm_cvtsi128_si64
					(_mm256_castsi256_si128(status0_7)));
		received += burst;
		if (burst != ICE_DESCS_PER_LOOP_AVX)
			break;
	}

	/* update tail pointers */
	rxq->rx_tail += received;
	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
		rxq->rx_tail--;
		received--;
	}
	rxq->rxrearm_nb += received;
	return received;
}
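
/* The exported receive routines below are thin wrappers around
 * _ice_recv_raw_pkts_vec_avx512(): do_offload is a compile-time constant at
 * each call site, so the always-inline raw routine is specialized twice and
 * the unused offload handling is optimized away.
 */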

/**
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are received
 */
uint16_t
ice_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
			 uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL, false);
}

/**
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are received
 */
uint16_t
ice_recv_pkts_vec_avx512_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
				 uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts,
					     nb_pkts, NULL, true);
}
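
/* For the scattered paths below, the raw routine fills split_flags[] with the
 * inverted EOP bit per packet: a non-zero byte marks a buffer whose packet
 * continues in the next descriptor, and ice_rx_reassemble_packets() chains
 * those segments into multi-segment mbufs.
 */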

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are received
 */
static uint16_t
ice_recv_scattered_burst_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
				    uint16_t nb_pkts)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
							 split_flags, false);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble then */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ice_rx_reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
					     &split_flags[i]);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are received
 */
static uint16_t
ice_recv_scattered_burst_vec_avx512_offload(void *rx_queue,
					    struct rte_mbuf **rx_pkts,
					    uint16_t nb_pkts)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx512(rxq,
				rx_pkts, nb_pkts, split_flags, true);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble then */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ice_rx_reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
					     &split_flags[i]);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 * Main receive routine that can handle arbitrary burst sizes
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are received
 */
uint16_t
ice_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
				   uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > ICE_VPMD_RX_BURST) {
		uint16_t burst = ice_recv_scattered_burst_vec_avx512(rx_queue,
				rx_pkts + retval, ICE_VPMD_RX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < ICE_VPMD_RX_BURST)
			return retval;
	}
	return retval + ice_recv_scattered_burst_vec_avx512(rx_queue,
				rx_pkts + retval, nb_pkts);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 * Main receive routine that can handle arbitrary burst sizes
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are received
 */
uint16_t
ice_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
					   struct rte_mbuf **rx_pkts,
					   uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > ICE_VPMD_RX_BURST) {
		uint16_t burst =
			ice_recv_scattered_burst_vec_avx512_offload(rx_queue,
				rx_pkts + retval, ICE_VPMD_RX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < ICE_VPMD_RX_BURST)
			return retval;
	}
	return retval + ice_recv_scattered_burst_vec_avx512_offload(rx_queue,
				rx_pkts + retval, nb_pkts);
}

static __rte_always_inline int
ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
{
	struct ice_vec_tx_entry *txep;
	uint32_t n;
	uint32_t i;
	int nb_free = 0;
	struct rte_mbuf *m, *free[ICE_TX_MAX_FREE_BUF_SZ];

	/* check DD bits on threshold descriptor */
	if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
			rte_cpu_to_le_64(ICE_TXD_QW1_DTYPE_M)) !=
			rte_cpu_to_le_64(ICE_TX_DESC_DTYPE_DESC_DONE))
		return 0;

	n = txq->tx_rs_thresh;

	/* first buffer to free from S/W ring is at index
	 * tx_next_dd - (tx_rs_thresh - 1)
	 */
	txep = (void *)txq->sw_ring;
	txep += txq->tx_next_dd - (n - 1);

	if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
		struct rte_mempool *mp = txep[0].mbuf->pool;
		void **cache_objs;
		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
				rte_lcore_id());

		if (!cache || cache->len == 0)
			goto normal;

		cache_objs = &cache->objs[cache->len];

		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
			goto done;
		}
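
		/* Fast-free path: with MBUF_FAST_FREE all mbufs come from one
		 * mempool and have refcnt 1, so the sw-ring entries (bare mbuf
		 * pointers) can be copied straight into the local mempool
		 * cache; the (n & 31) == 0 check above keeps the
		 * 32-per-iteration copy loop exact. Bursts larger than the
		 * cache were pushed straight to the ring instead.
		 */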

		/* The cache works as follows:
		 * 1. Add the objects to the cache
		 * 2. Anything greater than the cache min value (if it
		 * crosses the cache flush threshold) is flushed to the ring.
		 */
		/* Add elements back into the cache */
		uint32_t copied = 0;
		/* n is multiple of 32 */
		while (copied < n) {
#ifdef RTE_ARCH_64
			const __m512i a = _mm512_loadu_si512(&txep[copied]);
			const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
			const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
			const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);

			_mm512_storeu_si512(&cache_objs[copied], a);
			_mm512_storeu_si512(&cache_objs[copied + 8], b);
			_mm512_storeu_si512(&cache_objs[copied + 16], c);
			_mm512_storeu_si512(&cache_objs[copied + 24], d);
#else
			const __m512i a = _mm512_loadu_si512(&txep[copied]);
			const __m512i b = _mm512_loadu_si512(&txep[copied + 16]);
			_mm512_storeu_si512(&cache_objs[copied], a);
			_mm512_storeu_si512(&cache_objs[copied + 16], b);
#endif
			copied += 32;
		}
		cache->len += n;

		if (cache->len >= cache->flushthresh) {
			rte_mempool_ops_enqueue_bulk
				(mp, &cache->objs[cache->size],
				 cache->len - cache->size);
			cache->len = cache->size;
		}
		goto done;
	}

normal:
	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
	if (likely(m)) {
		free[0] = m;
		nb_free = 1;
		for (i = 1; i < n; i++) {
			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
			if (likely(m)) {
				if (likely(m->pool == free[0]->pool)) {
					free[nb_free++] = m;
				} else {
					rte_mempool_put_bulk(free[0]->pool,
							     (void *)free,
							     nb_free);
					free[0] = m;
					nb_free = 1;
				}
			}
		}
		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
	} else {
		for (i = 1; i < n; i++) {
			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
			if (m)
				rte_mempool_put(m->pool, m);
		}
done:
	/* buffers were freed, update counters */
	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
	if (txq->tx_next_dd >= txq->nb_tx_desc)
		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);

	return txq->tx_rs_thresh;
}

static __rte_always_inline void
ice_vtx1(volatile struct ice_tx_desc *txdp,
	 struct rte_mbuf *pkt, uint64_t flags, bool do_offload)
{
	uint64_t high_qw =
		(ICE_TX_DESC_DTYPE_DATA |
		 ((uint64_t)flags << ICE_TXD_QW1_CMD_S) |
		 ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));

	if (do_offload)
		ice_txd_enable_offload(pkt, &high_qw);

	__m128i descriptor = _mm_set_epi64x(high_qw, rte_pktmbuf_iova(pkt));
	_mm_store_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
}

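/*
 * Write nb_pkts data descriptors. The main loop below builds four
 * descriptors at a time: each descriptor is 16 bytes (buffer address in
 * the low quadword, dtype/command/offset/buffer-size in the high
 * quadword), so four of them fill one 64-byte ZMM register and are
 * written with a single _mm512_storeu_si512(). Note that
 * _mm512_set_epi64() takes its arguments highest lane first, which is
 * why the quadwords are listed from hi_qw3 down to hi_qw0. Any leftover
 * packets (fewer than four) are written one at a time by ice_vtx1().
 */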
static __rte_always_inline void
ice_vtx(volatile struct ice_tx_desc *txdp, struct rte_mbuf **pkt,
	uint16_t nb_pkts, uint64_t flags, bool do_offload)
{
	const uint64_t hi_qw_tmpl = (ICE_TX_DESC_DTYPE_DATA |
			((uint64_t)flags << ICE_TXD_QW1_CMD_S));

	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
		uint64_t hi_qw3 =
			hi_qw_tmpl |
			((uint64_t)pkt[3]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (do_offload)
			ice_txd_enable_offload(pkt[3], &hi_qw3);
		uint64_t hi_qw2 =
			hi_qw_tmpl |
			((uint64_t)pkt[2]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (do_offload)
			ice_txd_enable_offload(pkt[2], &hi_qw2);
		uint64_t hi_qw1 =
			hi_qw_tmpl |
			((uint64_t)pkt[1]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (do_offload)
			ice_txd_enable_offload(pkt[1], &hi_qw1);
		uint64_t hi_qw0 =
			hi_qw_tmpl |
			((uint64_t)pkt[0]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (do_offload)
			ice_txd_enable_offload(pkt[0], &hi_qw0);

		__m512i desc0_3 =
			_mm512_set_epi64
				(hi_qw3, rte_pktmbuf_iova(pkt[3]),
				 hi_qw2, rte_pktmbuf_iova(pkt[2]),
				 hi_qw1, rte_pktmbuf_iova(pkt[1]),
				 hi_qw0, rte_pktmbuf_iova(pkt[0]));
		_mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3);
	}

	/* do any last ones */
	while (nb_pkts) {
		ice_vtx1(txdp, *pkt, flags, do_offload);
		txdp++, pkt++, nb_pkts--;
	}
}

static __rte_always_inline void
ice_tx_backlog_entry_avx512(struct ice_vec_tx_entry *txep,
			    struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	int i;

	for (i = 0; i < (int)nb_pkts; ++i)
		txep[i].mbuf = tx_pkts[i];
}

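/*
 * Transmit a single burst of at most tx_rs_thresh packets. If the burst
 * would run past the end of the descriptor ring, it is written in two
 * pieces: the tail of the ring first, with the RS bit set on the very
 * last descriptor before the wrap, then the remainder from index 0.
 * The RS (report status) bit is also requested on the tx_next_rs
 * descriptor so that completed descriptors can later be detected by the
 * DD check in ice_tx_free_bufs_avx512().
 */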
static __rte_always_inline uint16_t
ice_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
				uint16_t nb_pkts, bool do_offload)
{
	struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;
	volatile struct ice_tx_desc *txdp;
	struct ice_vec_tx_entry *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = ICE_TD_CMD;
	uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;

	/* crossing the tx_rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

	if (txq->nb_tx_free < txq->tx_free_thresh)
		ice_tx_free_bufs_avx512(txq);

	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->tx_ring[tx_id];
	txep = (void *)txq->sw_ring;
	txep += tx_id;

	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		ice_tx_backlog_entry_avx512(txep, tx_pkts, n);

		ice_vtx(txdp, tx_pkts, n - 1, flags, do_offload);
		tx_pkts += (n - 1);
		txdp += (n - 1);

		ice_vtx1(txdp, *tx_pkts++, rs, do_offload);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = txq->tx_ring;
		txep = (void *)txq->sw_ring;
	}

	ice_tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);

	ice_vtx(txdp, tx_pkts, nb_commit, flags, do_offload);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->tx_next_rs) {
		txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
					 ICE_TXD_QW1_CMD_S);
		txq->tx_next_rs =
			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
	}

	txq->tx_tail = tx_id;

	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

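/*
 * Public burst transmit entry points. Each call splits nb_pkts into
 * chunks of at most tx_rs_thresh packets and stops early once the
 * fixed-burst helper cannot accept a full chunk (i.e. the ring is
 * full). Because do_offload is passed as a compile-time constant to the
 * always-inlined helper, the compiler can emit separate offload and
 * non-offload specialisations of the same code path.
 */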
uint16_t
ice_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
			 uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = ice_xmit_fixed_burst_vec_avx512(tx_queue,
						      &tx_pkts[nb_tx], num, false);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}

uint16_t
ice_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
				 uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = ice_xmit_fixed_burst_vec_avx512(tx_queue,
						      &tx_pkts[nb_tx], num, true);

		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}