/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include "ice_rxtx_vec_common.h"
#include "ice_rxtx_common_avx.h"

#include <rte_vect.h>

static __rte_always_inline void
ice_rxq_rearm(struct ice_rx_queue *rxq)
{
	return ice_rxq_rearm_common(rxq, false);
}

static __rte_always_inline __m256i
ice_flex_rxd_to_fdir_flags_vec_avx2(const __m256i fdir_id0_7)
{
#define FDID_MIS_MAGIC 0xFFFFFFFF
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
	const __m256i pkt_fdir_bit = _mm256_set1_epi32(RTE_MBUF_F_RX_FDIR |
			RTE_MBUF_F_RX_FDIR_ID);
	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
	const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
	__m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
			fdir_mis_mask);
	/* XOR with all-ones inverts fdir_mask, so all-ones now marks
	 * descriptors with a valid flow director match
	 */
	fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
	const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);

	return fdir_flags;
}
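
/*
 * Main Rx loop, processing descriptors in batches of 8
 * (ICE_DESCS_PER_LOOP_AVX) and handling two 16B flex descriptors per 256-bit
 * register. The "offload" parameter is a compile-time constant at every call
 * site, so with __rte_always_inline the compiler can emit a lean no-offload
 * version and a full flag-parsing version of this routine.
 */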
static __rte_always_inline uint16_t
_ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
			    uint16_t nb_pkts, uint8_t *split_packet,
			    bool offload)
{
#define ICE_DESCS_PER_LOOP_AVX 8

	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
			0, rxq->mbuf_initializer);
	struct ice_rx_entry *sw_ring = &rxq->sw_ring[rxq->rx_tail];
	volatile union ice_rx_flex_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
	const int avx_aligned = ((rxq->rx_tail & 1) == 0);

	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
		ice_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.status_error0 &
			rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
		return 0;

	/* constants used in processing loop */
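	/*
	 * When CRC stripping is disabled, the hardware-reported lengths
	 * include the 4-byte Ethernet CRC, which must not be counted in
	 * pkt_len/data_len; crc_adjust subtracts rxq->crc_len (zero when
	 * stripping is enabled, making this a no-op) from both length fields
	 * of the two descriptors held in each 256-bit register.
	 */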
	const __m256i crc_adjust =
		_mm256_set_epi16
			(/* first descriptor */
			 0, 0, 0,       /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 0,             /* ignore high-16bits of pkt_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0, 0,          /* ignore pkt_type field */
			 /* second descriptor */
			 0, 0, 0,       /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 0,             /* ignore high-16bits of pkt_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0, 0           /* ignore pkt_type field */
			);

	/* 8 packets DD mask, LSB in each 32-bit value */
	const __m256i dd_check = _mm256_set1_epi32(1);

	/* 8 packets EOP mask, second-LSB in each 32-bit value */
	const __m256i eop_check = _mm256_slli_epi32(dd_check,
			ICE_RX_DESC_STATUS_EOF_S);

	/* mask to shuffle from desc. to mbuf (2 descriptors) */
	const __m256i shuf_msk =
		_mm256_set_epi8
			(/* first descriptor */
			 0xFF, 0xFF,
			 0xFF, 0xFF,    /* rss hash parsed separately */
			 11, 10,        /* octet 10~11, 16 bits vlan_macip */
			 5, 4,          /* octet 4~5, 16 bits data_len */
			 0xFF, 0xFF,    /* skip hi 16 bits pkt_len, zero out */
			 5, 4,          /* octet 4~5, 16 bits pkt_len */
			 0xFF, 0xFF,    /* pkt_type set as unknown */
			 0xFF, 0xFF,    /* pkt_type set as unknown */
			 /* second descriptor */
			 0xFF, 0xFF,
			 0xFF, 0xFF,    /* rss hash parsed separately */
			 11, 10,        /* octet 10~11, 16 bits vlan_macip */
			 5, 4,          /* octet 4~5, 16 bits data_len */
			 0xFF, 0xFF,    /* skip hi 16 bits pkt_len, zero out */
			 5, 4,          /* octet 4~5, 16 bits pkt_len */
			 0xFF, 0xFF,    /* pkt_type set as unknown */
			 0xFF, 0xFF     /* pkt_type set as unknown */
			);
	/**
	 * compile-time check the above crc and shuffle layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi
	 * calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Status/Error flag masks */
	/**
	 * mask everything except Checksum Reports, RSS indication
	 * and VLAN indication.
	 * bits 4:7 are the IP/L4/outer checksum reports.
	 * bit 12 is for RSS indication.
	 * bit 13 is for VLAN indication.
	 */
	const __m256i flags_mask =
		 _mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13));
	/**
	 * data to be shuffled by the result of the flags mask shifted by 4
	 * bits. This gives us the l3_l4 flags.
	 */
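	/*
	 * The table below is a 16-entry LUT (replicated across both 128-bit
	 * lanes) indexed by those four checksum-report bits. Each byte entry
	 * carries the 2-bit outer-L4 status in its low bits (moved down by
	 * 20 here, restored with a shift left by 20 in the loop), and the
	 * whole byte is pre-shifted right by one so the largest flag
	 * combination fits in eight bits; the loop shifts left by one to
	 * restore the real flag values.
	 */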
	const __m256i l3_l4_flags_shuf =
		_mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		  RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		/**
		 * second 128-bits
		 * shift right 20 bits to use the low two bits to indicate
		 * outer checksum status
		 * shift right 1 bit to make sure it does not exceed 255
		 */
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
	const __m256i cksum_mask =
		 _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
				   RTE_MBUF_F_RX_L4_CKSUM_MASK |
				   RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
				   RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK);
	/**
	 * data to be shuffled by result of flag mask, shifted down 12.
	 * If RSS(bit12)/VLAN(bit13) are set,
	 * shuffle moves appropriate flags in place.
	 */
	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0,
			/* second 128-bits */
			0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0);

	RTE_SET_USED(avx_aligned); /* for 32B descriptors we don't use this */

	uint16_t i, received;

	for (i = 0, received = 0; i < nb_pkts;
	     i += ICE_DESCS_PER_LOOP_AVX,
	     rxdp += ICE_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				    _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		_mm256_storeu_si256
			((void *)&rx_pkts[i + 4],
			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

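		/*
		 * Load the descriptors in reverse order (7 down to 0) with a
		 * compiler barrier between loads. Hardware writes descriptors
		 * back in order, so this guarantees the DD bits observed
		 * always form a contiguous run starting at descriptor 0 and
		 * the burst count below never includes an unfinished
		 * descriptor.
		 */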
		const __m128i raw_desc7 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 7));
		rte_compiler_barrier();
		const __m128i raw_desc6 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 6));
		rte_compiler_barrier();
		const __m128i raw_desc5 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 5));
		rte_compiler_barrier();
		const __m128i raw_desc4 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 4));
		rte_compiler_barrier();
		const __m128i raw_desc3 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 3));
		rte_compiler_barrier();
		const __m128i raw_desc2 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 2));
		rte_compiler_barrier();
		const __m128i raw_desc1 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 1));
		rte_compiler_barrier();
		const __m128i raw_desc0 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 0));

		const __m256i raw_desc6_7 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(raw_desc6), raw_desc7, 1);
		const __m256i raw_desc4_5 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(raw_desc4), raw_desc5, 1);
		const __m256i raw_desc2_3 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(raw_desc2), raw_desc3, 1);
		const __m256i raw_desc0_1 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(raw_desc0), raw_desc1, 1);

		if (split_packet) {
			int j;

			for (j = 0; j < ICE_DESCS_PER_LOOP_AVX; j++)
				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
		}

		/**
		 * convert descriptors 4-7 into mbufs, re-arrange fields.
		 * Then write into the mbuf.
		 */
		__m256i mb6_7 = _mm256_shuffle_epi8(raw_desc6_7, shuf_msk);
		__m256i mb4_5 = _mm256_shuffle_epi8(raw_desc4_5, shuf_msk);

		mb6_7 = _mm256_add_epi16(mb6_7, crc_adjust);
		mb4_5 = _mm256_add_epi16(mb4_5, crc_adjust);
		/**
		 * to get packet types: the 10-bit ptype is located in
		 * bits 16-25 of each 128-bit half
		 */
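		/*
		 * ptype_tbl maps the raw hardware packet type to the
		 * corresponding RTE_PTYPE_* value; the lookups are done with
		 * scalar extract/insert, since each 256-bit register only
		 * needs two table reads.
		 */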
		const __m256i ptype_mask =
			_mm256_set1_epi16(ICE_RX_FLEX_DESC_PTYPE_M);
		const __m256i ptypes6_7 =
			_mm256_and_si256(raw_desc6_7, ptype_mask);
		const __m256i ptypes4_5 =
			_mm256_and_si256(raw_desc4_5, ptype_mask);
		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);

		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype7], 4);
		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype6], 0);
		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype5], 4);
		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype4], 0);
		/* merge the status bits into one register */
		const __m256i status4_7 = _mm256_unpackhi_epi32(raw_desc6_7,
				raw_desc4_5);

		/**
		 * convert descriptors 0-3 into mbufs, re-arrange fields.
		 * Then write into the mbuf.
		 */
		__m256i mb2_3 = _mm256_shuffle_epi8(raw_desc2_3, shuf_msk);
		__m256i mb0_1 = _mm256_shuffle_epi8(raw_desc0_1, shuf_msk);

		mb2_3 = _mm256_add_epi16(mb2_3, crc_adjust);
		mb0_1 = _mm256_add_epi16(mb0_1, crc_adjust);
		/**
		 * to get packet types: the 10-bit ptype is located in
		 * bits 16-25 of each 128-bit half
		 */
		const __m256i ptypes2_3 =
			_mm256_and_si256(raw_desc2_3, ptype_mask);
		const __m256i ptypes0_1 =
			_mm256_and_si256(raw_desc0_1, ptype_mask);
		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);

		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype3], 4);
		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype2], 0);
		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype1], 4);
		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype0], 0);
		/* merge the status bits into one register */
		const __m256i status0_3 = _mm256_unpackhi_epi32(raw_desc2_3,
				raw_desc0_1);

		/**
		 * take the two sets of status bits and merge to one.
		 * After merge, the packets status flags are in the
		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
		 */
		__m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
				status0_3);
		__m256i mbuf_flags = _mm256_set1_epi32(0);

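		/*
		 * Flag pipeline for the offload path: isolate the interesting
		 * status bits, shuffle them through the LUTs built above,
		 * undo the LUT's ">> 1" packing, relocate the 2-bit outer-L4
		 * status up to its real flag position, then mask down to
		 * valid checksum flags before OR-ing in the RSS/VLAN flags.
		 */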
		if (offload) {
			/* now do flag manipulation */

			/* get only flag/error bits we want */
			const __m256i flag_bits =
				_mm256_and_si256(status0_7, flags_mask);
			/**
			 * l3_l4_error flags: shuffle, then shift to correct
			 * the adjustment made in l3_l4_flags_shuf, and
			 * finally mask out extra bits
			 */
			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
					_mm256_srli_epi32(flag_bits, 4));
			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);

			__m256i l4_outer_mask = _mm256_set1_epi32(0x6);
			__m256i l4_outer_flags =
					_mm256_and_si256(l3_l4_flags, l4_outer_mask);
			l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);

			__m256i l3_l4_mask = _mm256_set1_epi32(~0x6);

			l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);
			l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);
			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
			/* set rss and vlan flags */
			const __m256i rss_vlan_flag_bits =
				_mm256_srli_epi32(flag_bits, 12);
			const __m256i rss_vlan_flags =
				_mm256_shuffle_epi8(rss_vlan_flags_shuf,
						    rss_vlan_flag_bits);

			/* merge flags */
			mbuf_flags = _mm256_or_si256(l3_l4_flags,
						     rss_vlan_flags);
		}

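		/*
		 * Flow director: the unpackhi operations gather the 32-bit
		 * flow_id words of all eight descriptors into one register.
		 * The unpacking scrambles the packet order, which is why the
		 * scalar extracts below use lane indices 3, 7, 2, 6, 1, 5,
		 * 0, 4 for packets 0-7.
		 */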
		if (rxq->fdir_enabled) {
			const __m256i fdir_id4_7 =
				_mm256_unpackhi_epi32(raw_desc6_7, raw_desc4_5);

			const __m256i fdir_id0_3 =
				_mm256_unpackhi_epi32(raw_desc2_3, raw_desc0_1);

			const __m256i fdir_id0_7 =
				_mm256_unpackhi_epi64(fdir_id4_7, fdir_id0_3);

			const __m256i fdir_flags =
				ice_flex_rxd_to_fdir_flags_vec_avx2(fdir_id0_7);

			/* merge with fdir_flags */
			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags);

			/* write to mbuf: have to use scalar store here */
			rx_pkts[i + 0]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 3);

			rx_pkts[i + 1]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 7);

			rx_pkts[i + 2]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 2);

			rx_pkts[i + 3]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 6);

			rx_pkts[i + 4]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 1);

			rx_pkts[i + 5]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 5);

			rx_pkts[i + 6]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 0);

			rx_pkts[i + 7]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 4);
		} /* if() on fdir_enabled */

		if (offload) {
#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
			/**
			 * RSS hash parsing needs the second 16B of each
			 * descriptor to be loaded as well, which costs
			 * performance, so it is only done when the offload
			 * is actually enabled.
			 */
			if (rxq->vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
					RTE_ETH_RX_OFFLOAD_RSS_HASH) {
				/* load bottom half of every 32B desc */
				const __m128i raw_desc_bh7 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[7].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh6 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[6].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh5 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[5].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh4 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[4].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh3 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[3].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh2 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[2].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh1 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[1].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh0 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[0].wb.status_error1));

				__m256i raw_desc_bh6_7 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh6),
						 raw_desc_bh7, 1);
				__m256i raw_desc_bh4_5 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh4),
						 raw_desc_bh5, 1);
				__m256i raw_desc_bh2_3 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh2),
						 raw_desc_bh3, 1);
				__m256i raw_desc_bh0_1 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh0),
						 raw_desc_bh1, 1);

				/**
				 * to shift the 32b RSS hash value to the
				 * highest 32b of each 128b before mask
				 */
				__m256i rss_hash6_7 =
					_mm256_slli_epi64(raw_desc_bh6_7, 32);
				__m256i rss_hash4_5 =
					_mm256_slli_epi64(raw_desc_bh4_5, 32);
				__m256i rss_hash2_3 =
					_mm256_slli_epi64(raw_desc_bh2_3, 32);
				__m256i rss_hash0_1 =
					_mm256_slli_epi64(raw_desc_bh0_1, 32);

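				/*
				 * After the shift each hash occupies the top
				 * 32 bits of its 64-bit element; the mask
				 * keeps only the top dword of each 128-bit
				 * lane, which is where hash.rss sits within
				 * the mbuf rx_descriptor_fields1 layout
				 * (offset 12, per the build-time checks
				 * above).
				 */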
				__m256i rss_hash_msk =
					_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
							 0xFFFFFFFF, 0, 0, 0);

				rss_hash6_7 = _mm256_and_si256
						(rss_hash6_7, rss_hash_msk);
				rss_hash4_5 = _mm256_and_si256
						(rss_hash4_5, rss_hash_msk);
				rss_hash2_3 = _mm256_and_si256
						(rss_hash2_3, rss_hash_msk);
				rss_hash0_1 = _mm256_and_si256
						(rss_hash0_1, rss_hash_msk);

				mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
				mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
				mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
				mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
			} /* if() on RSS hash parsing */
#endif
		}

		/**
		 * At this point, we have the 8 sets of flags, one 32-bit
		 * value per packet, in mbuf_flags.
		 * We want to extract these, and merge them with the mbuf init
		 * data so we can do a single write to the mbuf to set the flags
		 * and all the other initialization fields. Extracting the
		 * appropriate flags means that we have to do a shift and blend
		 * for each mbuf before we do the write. However, we can also
		 * add in the previously computed rx_descriptor fields to
		 * make a single 256-bit write per mbuf
		 */
		/* check the structure matches expectations */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
				offsetof(struct rte_mbuf, rearm_data) + 8);
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
				RTE_ALIGN(offsetof(struct rte_mbuf,
						   rearm_data),
					  16));
		/* build up data and do writes */
		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
			rearm6, rearm7;
		rearm6 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 8),
					    0x04);
		rearm4 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 4),
					    0x04);
		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
		rearm0 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(mbuf_flags, 4),
					    0x04);
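		/*
		 * The low 128 bits of mbuf_flags hold the flags for packets
		 * 6, 4, 2 and 0 (dwords 0-3 respectively); the byte shifts
		 * above line each packet's flags up with dword 2 of the
		 * destination, and blend imm 0x04 drops that dword into the
		 * ol_flags position right after the 8 bytes of rearm data
		 * from mbuf_init.
		 */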
		/* permute to add in the rx_descriptor fields, e.g. rss hash */
		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
		/* write to mbuf */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
				    rearm6);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
				    rearm4);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
				    rearm2);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
				    rearm0);

		/* repeat for the odd mbufs */
		const __m256i odd_flags =
			_mm256_castsi128_si256
				(_mm256_extracti128_si256(mbuf_flags, 1));
		rearm7 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 8),
					    0x04);
		rearm5 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 4),
					    0x04);
		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
		rearm1 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(odd_flags, 4),
					    0x04);
		/* since odd mbufs are already in hi 128-bits use blend */
		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
		/* again write to mbufs */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
				    rearm7);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
				    rearm5);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
				    rearm3);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
				    rearm1);

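		/*
		 * For the scattered-Rx path the caller passes a split_packet
		 * array with one byte per descriptor; a non-zero byte means
		 * the descriptor lacked EOP, i.e. the packet continues in the
		 * following descriptor and must be reassembled by the caller.
		 */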
		/* extract and record EOP bit */
		if (split_packet) {
			const __m128i eop_mask =
				_mm_set1_epi16(1 << ICE_RX_DESC_STATUS_EOF_S);
			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
								     eop_check);
			/* pack status bits into a single 128-bit register */
			const __m128i eop_bits =
				_mm_packus_epi32
					(_mm256_castsi256_si128(eop_bits256),
					 _mm256_extractf128_si256(eop_bits256,
								  1));
			/**
			 * flip the bits and mask out the EOP bit, which is
			 * now a split-packet bit, i.e. !EOP rather than EOP.
			 */
			__m128i split_bits = _mm_andnot_si128(eop_bits,
					eop_mask);
			/**
			 * eop bits are out of order, so we need to shuffle them
			 * back into order again. In doing so, only use low 8
			 * bits, which acts like another pack instruction.
			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
			 * [Since we use epi8, the 16-bit positions are
			 * multiplied by 2 in the eop_shuffle value.]
			 */
			__m128i eop_shuffle =
				_mm_set_epi8(/* zero hi 64b */
					     0xFF, 0xFF, 0xFF, 0xFF,
					     0xFF, 0xFF, 0xFF, 0xFF,
					     /* move values to lo 64b */
					     8, 0, 10, 2,
					     12, 4, 14, 6);
			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
			*(uint64_t *)split_packet =
				_mm_cvtsi128_si64(split_bits);
			split_packet += ICE_DESCS_PER_LOOP_AVX;
		}

		/* perform dd_check */
		status0_7 = _mm256_and_si256(status0_7, dd_check);
		status0_7 = _mm256_packs_epi32(status0_7,
					       _mm256_setzero_si256());

		uint64_t burst = rte_popcount64
					(_mm_cvtsi128_si64
						(_mm256_extracti128_si256
							(status0_7, 1)));
		burst += rte_popcount64
				(_mm_cvtsi128_si64
					(_mm256_castsi256_si128(status0_7)));
		received += burst;
		if (burst != ICE_DESCS_PER_LOOP_AVX)
			break;
	}

	/* update tail pointers */
	rxq->rx_tail += received;
	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
		rxq->rx_tail--;
		received--;
	}
	rxq->rxrearm_nb += received;
	return received;
}
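
/*
 * Burst Rx entry points exposed to the ethdev layer when the AVX2 vector
 * path is chosen at queue-setup time (selection happens in
 * ice_set_rx_function() in ice_rxtx.c); the _offload variants additionally
 * parse the checksum/RSS/VLAN flag fields.
 */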
/**
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, returns no packets
 */
uint16_t
ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
		       uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts,
					   nb_pkts, NULL, false);
}

uint16_t
ice_recv_pkts_vec_avx2_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
			       uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts,
					   nb_pkts, NULL, true);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, returns no packets
 */
static __rte_always_inline uint16_t
ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
				  uint16_t nb_pkts, bool offload)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
						       split_flags, offload);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
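	/*
	 * ci_rx_reassemble_packets() walks the split flags from index i,
	 * chaining continuation segments onto their first segment (carried
	 * across calls in pkt_first_seg/pkt_last_seg), and returns the
	 * number of completed packets it produced.
	 */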
	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
			&rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 * Main receive routine that can handle arbitrary burst sizes.
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, returns no packets
 */
static __rte_always_inline uint16_t
ice_recv_scattered_pkts_vec_avx2_common(void *rx_queue,
					struct rte_mbuf **rx_pkts,
					uint16_t nb_pkts,
					bool offload)
{
	uint16_t retval = 0;

	while (nb_pkts > ICE_VPMD_RX_BURST) {
		uint16_t burst = ice_recv_scattered_burst_vec_avx2(rx_queue,
				rx_pkts + retval, ICE_VPMD_RX_BURST, offload);
		retval += burst;
		nb_pkts -= burst;
		if (burst < ICE_VPMD_RX_BURST)
			return retval;
	}
	return retval + ice_recv_scattered_burst_vec_avx2(rx_queue,
				rx_pkts + retval, nb_pkts, offload);
}

uint16_t
ice_recv_scattered_pkts_vec_avx2(void *rx_queue,
				 struct rte_mbuf **rx_pkts,
				 uint16_t nb_pkts)
{
	return ice_recv_scattered_pkts_vec_avx2_common(rx_queue,
						       rx_pkts,
						       nb_pkts,
						       false);
}

uint16_t
ice_recv_scattered_pkts_vec_avx2_offload(void *rx_queue,
					 struct rte_mbuf **rx_pkts,
					 uint16_t nb_pkts)
{
	return ice_recv_scattered_pkts_vec_avx2_common(rx_queue,
						       rx_pkts,
						       nb_pkts,
						       true);
}
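
/*
 * Tx descriptor writing: each 16B data descriptor takes the buffer DMA
 * address in its low quadword and the DTYPE, command flags and buffer size
 * (plus any offload fields) in its high quadword. ice_vtx1() writes a single
 * descriptor; ice_vtx() writes four per loop iteration using two 32B stores.
 */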
static __rte_always_inline void
ice_vtx1(volatile struct ice_tx_desc *txdp,
	 struct rte_mbuf *pkt, uint64_t flags, bool offload)
{
	uint64_t high_qw =
		(ICE_TX_DESC_DTYPE_DATA |
		 ((uint64_t)flags << ICE_TXD_QW1_CMD_S) |
		 ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));
	if (offload)
		ice_txd_enable_offload(pkt, &high_qw);

	__m128i descriptor = _mm_set_epi64x(high_qw, rte_pktmbuf_iova(pkt));
	_mm_store_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
}

static __rte_always_inline void
ice_vtx(volatile struct ice_tx_desc *txdp,
	struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags, bool offload)
{
	const uint64_t hi_qw_tmpl = (ICE_TX_DESC_DTYPE_DATA |
			((uint64_t)flags << ICE_TXD_QW1_CMD_S));

	/* if unaligned on a 32-byte boundary, do one to align */
	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
		ice_vtx1(txdp, *pkt, flags, offload);
		nb_pkts--, txdp++, pkt++;
	}

	/* do four at a time while possible, in bursts */
	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
		uint64_t hi_qw3 =
			hi_qw_tmpl |
			((uint64_t)pkt[3]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (offload)
			ice_txd_enable_offload(pkt[3], &hi_qw3);
		uint64_t hi_qw2 =
			hi_qw_tmpl |
			((uint64_t)pkt[2]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (offload)
			ice_txd_enable_offload(pkt[2], &hi_qw2);
		uint64_t hi_qw1 =
			hi_qw_tmpl |
			((uint64_t)pkt[1]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (offload)
			ice_txd_enable_offload(pkt[1], &hi_qw1);
		uint64_t hi_qw0 =
			hi_qw_tmpl |
			((uint64_t)pkt[0]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (offload)
			ice_txd_enable_offload(pkt[0], &hi_qw0);

		__m256i desc2_3 =
			_mm256_set_epi64x
				(hi_qw3, rte_pktmbuf_iova(pkt[3]),
				 hi_qw2, rte_pktmbuf_iova(pkt[2]));
		__m256i desc0_1 =
			_mm256_set_epi64x
				(hi_qw1, rte_pktmbuf_iova(pkt[1]),
				 hi_qw0, rte_pktmbuf_iova(pkt[0]));
		_mm256_store_si256(RTE_CAST_PTR(__m256i *, txdp + 2), desc2_3);
		_mm256_store_si256(RTE_CAST_PTR(__m256i *, txdp), desc0_1);
	}

	/* do any last ones */
	while (nb_pkts) {
		ice_vtx1(txdp, *pkt, flags, offload);
		txdp++, pkt++, nb_pkts--;
	}
}
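
/*
 * Transmit a burst of at most tx_rs_thresh packets: reclaim completed
 * descriptors if the free count is low, fill the descriptor ring (wrapping
 * at most once), set the RS bit at tx_rs_thresh intervals so completions can
 * be reclaimed later, and finally bump the queue tail register.
 */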
static __rte_always_inline uint16_t
ice_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
			      uint16_t nb_pkts, bool offload)
{
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
	volatile struct ice_tx_desc *txdp;
	struct ci_tx_entry_vec *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = ICE_TD_CMD;
	uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;

	/* crossing the tx_rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

	if (txq->nb_tx_free < txq->tx_free_thresh)
		ci_tx_free_bufs_vec(txq, ice_tx_desc_done, false);

	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->ice_tx_ring[tx_id];
	txep = &txq->sw_ring_vec[tx_id];

	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		ci_tx_backlog_entry_vec(txep, tx_pkts, n);

		ice_vtx(txdp, tx_pkts, n - 1, flags, offload);
		tx_pkts += (n - 1);
		txdp += (n - 1);

		ice_vtx1(txdp, *tx_pkts++, rs, offload);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->ice_tx_ring[tx_id];
		txep = &txq->sw_ring_vec[tx_id];
	}

	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);

	ice_vtx(txdp, tx_pkts, nb_commit, flags, offload);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->tx_next_rs) {
		txq->ice_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
					 ICE_TXD_QW1_CMD_S);
		txq->tx_next_rs =
			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
	}

	txq->tx_tail = tx_id;

	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}
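
/*
 * Outer Tx loop: split arbitrarily large bursts into chunks of at most
 * tx_rs_thresh packets so the fixed-burst routine never crosses an
 * RS-threshold boundary, stopping early once the ring is full.
 */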
static __rte_always_inline uint16_t
ice_xmit_pkts_vec_avx2_common(void *tx_queue, struct rte_mbuf **tx_pkts,
			      uint16_t nb_pkts, bool offload)
{
	uint16_t nb_tx = 0;
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = ice_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx],
						    num, offload);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}

uint16_t
ice_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
		       uint16_t nb_pkts)
{
	return ice_xmit_pkts_vec_avx2_common(tx_queue, tx_pkts, nb_pkts, false);
}

uint16_t
ice_xmit_pkts_vec_avx2_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
			       uint16_t nb_pkts)
{
	return ice_xmit_pkts_vec_avx2_common(tx_queue, tx_pkts, nb_pkts, true);
}