/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include "ice_rxtx_vec_common.h"
#include "ice_rxtx_common_avx.h"

#include <rte_vect.h>

#define ICE_DESCS_PER_LOOP_AVX 8

static __rte_always_inline void
ice_rxq_rearm(struct ice_rx_queue *rxq)
{
	ice_rxq_rearm_common(rxq, true);
}

static inline __m256i
ice_flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
{
#define FDID_MIS_MAGIC 0xFFFFFFFF
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
	const __m256i pkt_fdir_bit = _mm256_set1_epi32(RTE_MBUF_F_RX_FDIR |
			RTE_MBUF_F_RX_FDIR_ID);
	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
	const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
	__m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
			fdir_mis_mask);
	/* this XOR op inverts the fdir_mask */
	fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
	const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);

	return fdir_flags;
}
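
/*
 * For reference, a scalar sketch of the helper above, per descriptor
 * (illustrative only, not part of the fast path):
 *
 *	flags = (flow_id != FDID_MIS_MAGIC) ?
 *		(RTE_MBUF_F_RX_FDIR | RTE_MBUF_F_RX_FDIR_ID) : 0;
 *
 * The cmpeq/xor pair builds an all-ones mask for matching IDs, which
 * the final AND turns into the flag pair.
 */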

static __rte_always_inline uint16_t
_ice_recv_raw_pkts_vec_avx512(struct ice_rx_queue *rxq,
			      struct rte_mbuf **rx_pkts,
			      uint16_t nb_pkts,
			      uint8_t *split_packet,
			      bool do_offload)
{
	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
			0, rxq->mbuf_initializer);
	struct ice_rx_entry *sw_ring = &rxq->sw_ring[rxq->rx_tail];
	volatile union ice_rx_flex_desc *rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
		ice_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.status_error0 &
			rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
		return 0;

	/* constants used in processing loop */
	const __m512i crc_adjust =
		_mm512_set4_epi32
			(0,             /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0              /* ignore non-length fields */
			);

	/* 8 packets DD mask, LSB in each 32-bit value */
	const __m256i dd_check = _mm256_set1_epi32(1);

	/* 8 packets EOP mask, second-LSB in each 32-bit value */
	const __m256i eop_check = _mm256_slli_epi32(dd_check,
			ICE_RX_DESC_STATUS_EOF_S);

	/* mask to shuffle from desc. to mbuf (4 descriptors) */
	const __m512i shuf_msk =
		_mm512_set4_epi32
			(/* rss hash parsed separately */
			 0xFFFFFFFF,
			 /* octet 10~11, 16 bits vlan_macip */
			 /* octet 4~5, 16 bits data_len */
			 11 << 24 | 10 << 16 | 5 << 8 | 4,
			 /* skip hi 16 bits pkt_len, zero out */
			 /* octet 4~5, 16 bits pkt_len */
			 0xFFFF << 16 | 5 << 8 | 4,
			 /* pkt_type set as unknown */
			 0xFFFFFFFF
			);

	/**
	 * compile-time check the above crc and shuffle layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi
	 * calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
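
	/*
	 * For reference, shuf_msk maps each descriptor's 16 bytes onto
	 * rx_descriptor_fields1 as follows (0xFF selects a zero byte):
	 *
	 *	bytes  0-3  <- zeros                 (pkt_type, blended in later)
	 *	bytes  4-7  <- desc bytes 4-5        (pkt_len, high 16 bits zeroed)
	 *	bytes  8-11 <- desc bytes 4-5, 10-11 (data_len, vlan_tci)
	 *	bytes 12-15 <- zeros                 (rss hash, OR'd in later)
	 */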

	/* following code block is for Rx Checksum Offload */
	/* Status/Error flag masks */
	/**
	 * mask everything except Checksum Reports, RSS indication
	 * and VLAN indication.
	 * bit6:4 for IP/L4 checksum errors.
	 * bit12 is for RSS indication.
	 * bit13 is for VLAN indication.
	 */
	const __m256i flags_mask =
		_mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13));
	/**
	 * data to be shuffled by the result of the flags mask shifted by 4
	 * bits. This gives us the l3_l4 flags.
	 */
	const __m256i l3_l4_flags_shuf =
		_mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		/**
		 * second 128-bits
		 * shift right 20 bits to use the low two bits to indicate
		 * outer checksum status
		 * shift right 1 bit to make sure it does not exceed 255
		 */
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
	const __m256i cksum_mask =
		_mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
				  RTE_MBUF_F_RX_L4_CKSUM_MASK |
				  RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
				  RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK);
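
	/*
	 * Note on the table encoding: each byte above holds the final
	 * ol_flags bits pre-shifted right by 1 so the value fits in a byte
	 * for shuffle_epi8. After the lookup the result is shifted back
	 * left by 1; the outer-L4 status (then in bits 2:1, i.e. 0x6) is
	 * additionally shifted up by 20 bits into its real ol_flags
	 * position.
	 */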
	/**
	 * data to be shuffled by result of flag mask, shifted down by 12.
	 * If RSS(bit12)/VLAN(bit13) are set,
	 * shuffle moves appropriate flags in place.
	 */
	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN |
			RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0,
			/* 2nd 128-bits */
			0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN |
			RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0);

	uint16_t i, received;

	for (i = 0, received = 0; i < nb_pkts;
	     i += ICE_DESCS_PER_LOOP_AVX,
	     rxdp += ICE_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				    _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		_mm256_storeu_si256
			((void *)&rx_pkts[i + 4],
			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

		__m512i raw_desc0_3, raw_desc4_7;
		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;

		/* load in descriptors, in reverse order */
		const __m128i raw_desc7 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7));
		rte_compiler_barrier();
		const __m128i raw_desc6 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6));
		rte_compiler_barrier();
		const __m128i raw_desc5 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5));
		rte_compiler_barrier();
		const __m128i raw_desc4 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4));
		rte_compiler_barrier();
		const __m128i raw_desc3 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
		rte_compiler_barrier();
		const __m128i raw_desc2 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
		rte_compiler_barrier();
		const __m128i raw_desc1 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
		rte_compiler_barrier();
		const __m128i raw_desc0 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0));
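
		/*
		 * The loads above go from the highest descriptor to the
		 * lowest, with compiler barriers in between so the reads
		 * of the volatile ring are not merged or reordered: a
		 * set DD bit on a later descriptor then implies the
		 * earlier ones were already written back by hardware.
		 */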

		raw_desc6_7 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc6),
				 raw_desc7, 1);
		raw_desc4_5 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc4),
				 raw_desc5, 1);
		raw_desc2_3 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc2),
				 raw_desc3, 1);
		raw_desc0_1 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc0),
				 raw_desc1, 1);

		raw_desc4_7 =
			_mm512_inserti64x4
				(_mm512_castsi256_si512(raw_desc4_5),
				 raw_desc6_7, 1);
		raw_desc0_3 =
			_mm512_inserti64x4
				(_mm512_castsi256_si512(raw_desc0_1),
				 raw_desc2_3, 1);

		if (split_packet) {
			int j;

			for (j = 0; j < ICE_DESCS_PER_LOOP_AVX; j++)
				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
		}
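
		/*
		 * Register layout from here: raw_desc0_3 holds descriptors
		 * 0-3, one per 128-bit lane (0 in the lowest lane), and
		 * raw_desc4_7 holds descriptors 4-7 the same way.
		 */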

		/**
		 * convert descriptors 0-7 into mbufs, re-arrange fields.
		 * Then write into the mbuf.
		 */
		__m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
		__m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);

		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);

		/**
		 * to get packet types, ptype is located in bit16-25
		 * of each 128bits
		 */
		const __m512i ptype_mask =
			_mm512_set1_epi16(ICE_RX_FLEX_DESC_PTYPE_M);

		const __m512i ptypes4_7 =
			_mm512_and_si512(raw_desc4_7, ptype_mask);
		const __m512i ptypes0_3 =
			_mm512_and_si512(raw_desc0_3, ptype_mask);

		const __m256i ptypes6_7 =
			_mm512_extracti64x4_epi64(ptypes4_7, 1);
		const __m256i ptypes4_5 =
			_mm512_extracti64x4_epi64(ptypes4_7, 0);
		const __m256i ptypes2_3 =
			_mm512_extracti64x4_epi64(ptypes0_3, 1);
		const __m256i ptypes0_1 =
			_mm512_extracti64x4_epi64(ptypes0_3, 0);
		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);

		const __m512i ptype4_7 = _mm512_set_epi32
			(0, 0, 0, ptype_tbl[ptype7],
			 0, 0, 0, ptype_tbl[ptype6],
			 0, 0, 0, ptype_tbl[ptype5],
			 0, 0, 0, ptype_tbl[ptype4]);
		const __m512i ptype0_3 = _mm512_set_epi32
			(0, 0, 0, ptype_tbl[ptype3],
			 0, 0, 0, ptype_tbl[ptype2],
			 0, 0, 0, ptype_tbl[ptype1],
			 0, 0, 0, ptype_tbl[ptype0]);

		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
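
		/*
		 * The ptype translation stays scalar on purpose: ptype_tbl
		 * is an arbitrary lookup table indexed by the 10-bit
		 * hardware ptype, so the 8 values are extracted, translated
		 * one by one, and blended back into dword 0 of each 128-bit
		 * lane (blend mask 0x1111).
		 */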

		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
		__m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);

		/**
		 * use permute/extract to get status content
		 * After the operations, the packets status flags are in the
		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
		 */
		/* merge the status bits into one register */
		const __m512i status_permute_msk = _mm512_set_epi32
			(0, 0, 0, 0,
			 0, 0, 0, 0,
			 22, 30, 6, 14,
			 18, 26, 2, 10);
		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
			(raw_desc4_7, status_permute_msk, raw_desc0_3);
		__m256i status0_7 = _mm512_extracti64x4_epi64
			(raw_status0_7, 0);

		__m256i mbuf_flags = _mm256_set1_epi32(0);

		if (do_offload) {
			/* now do flag manipulation */

			/* get only flag/error bits we want */
			const __m256i flag_bits =
				_mm256_and_si256(status0_7, flags_mask);
			/**
			 * l3_l4_error flags, shuffle, then shift to correct adjustment
			 * of flags in flags_shuf, and finally mask out extra bits
			 */
			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
					_mm256_srli_epi32(flag_bits, 4));
			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
			__m256i l4_outer_mask = _mm256_set1_epi32(0x6);
			__m256i l4_outer_flags =
				_mm256_and_si256(l3_l4_flags, l4_outer_mask);
			l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);

			__m256i l3_l4_mask = _mm256_set1_epi32(~0x6);

			l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);
			l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);
			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
			/* set rss and vlan flags */
			const __m256i rss_vlan_flag_bits =
				_mm256_srli_epi32(flag_bits, 12);
			const __m256i rss_vlan_flags =
				_mm256_shuffle_epi8(rss_vlan_flags_shuf,
						    rss_vlan_flag_bits);

			/* merge flags */
			mbuf_flags = _mm256_or_si256(l3_l4_flags,
						     rss_vlan_flags);
		}

		if (rxq->fdir_enabled) {
			const __m256i fdir_id4_7 =
				_mm256_unpackhi_epi32(raw_desc6_7, raw_desc4_5);

			const __m256i fdir_id0_3 =
				_mm256_unpackhi_epi32(raw_desc2_3, raw_desc0_1);

			const __m256i fdir_id0_7 =
				_mm256_unpackhi_epi64(fdir_id4_7, fdir_id0_3);

			if (do_offload) {
				const __m256i fdir_flags =
					ice_flex_rxd_to_fdir_flags_vec_avx512
						(fdir_id0_7);

				/* merge with fdir_flags */
				mbuf_flags = _mm256_or_si256
						(mbuf_flags, fdir_flags);
			} else {
				mbuf_flags =
					ice_flex_rxd_to_fdir_flags_vec_avx512
						(fdir_id0_7);
			}

			/* write to mbuf: have to use scalar store here */
			rx_pkts[i + 0]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 3);

			rx_pkts[i + 1]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 7);

			rx_pkts[i + 2]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 2);

			rx_pkts[i + 3]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 6);

			rx_pkts[i + 4]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 1);

			rx_pkts[i + 5]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 5);

			rx_pkts[i + 6]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 0);

			rx_pkts[i + 7]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 4);
		} /* if() on fdir_enabled */
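
		/*
		 * The scrambled extract indices above follow from the two
		 * unpackhi steps: fdir_id0_7 ends up holding the flow IDs
		 * in dword order (lo->hi) [6, 4, 2, 0, 7, 5, 3, 1], so each
		 * packet's ID is pulled from its matching slot.
		 */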

		if (do_offload) {
#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
			/**
			 * need to load the 2nd 16B of each desc for RSS hash
			 * parsing; getting into this context will cause a
			 * performance drop.
			 */
			if (rxq->vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
					RTE_ETH_RX_OFFLOAD_RSS_HASH) {
				/* load bottom half of every 32B desc */
				const __m128i raw_desc_bh7 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[7].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh6 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[6].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh5 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[5].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh4 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[4].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh3 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[3].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh2 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[2].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh1 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[1].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh0 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[0].wb.status_error1));

				__m256i raw_desc_bh6_7 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh6),
						 raw_desc_bh7, 1);
				__m256i raw_desc_bh4_5 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh4),
						 raw_desc_bh5, 1);
				__m256i raw_desc_bh2_3 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh2),
						 raw_desc_bh3, 1);
				__m256i raw_desc_bh0_1 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh0),
						 raw_desc_bh1, 1);

				/**
				 * to shift the 32b RSS hash value to the
				 * highest 32b of each 128b before mask
				 */
				__m256i rss_hash6_7 =
					_mm256_slli_epi64(raw_desc_bh6_7, 32);
				__m256i rss_hash4_5 =
					_mm256_slli_epi64(raw_desc_bh4_5, 32);
				__m256i rss_hash2_3 =
					_mm256_slli_epi64(raw_desc_bh2_3, 32);
				__m256i rss_hash0_1 =
					_mm256_slli_epi64(raw_desc_bh0_1, 32);
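
				/*
				 * the mask below keeps only that top dword
				 * of each 128-bit half: it lines up with the
				 * hash.rss slot that shuf_msk left zeroed,
				 * so a plain OR drops the hash in place
				 */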
				__m256i rss_hash_msk =
					_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
							 0xFFFFFFFF, 0, 0, 0);

				rss_hash6_7 = _mm256_and_si256
						(rss_hash6_7, rss_hash_msk);
				rss_hash4_5 = _mm256_and_si256
						(rss_hash4_5, rss_hash_msk);
				rss_hash2_3 = _mm256_and_si256
						(rss_hash2_3, rss_hash_msk);
				rss_hash0_1 = _mm256_and_si256
						(rss_hash0_1, rss_hash_msk);

				mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
				mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
				mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
				mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
			} /* if() on RSS hash parsing */
#endif
		}

		/**
		 * At this point, we have the 8 sets of flags in the low 16-bits
		 * of each 32-bit value in mbuf_flags.
		 * We want to extract these, and merge them with the mbuf init
		 * data so we can do a single write to the mbuf to set the flags
		 * and all the other initialization fields. Extracting the
		 * appropriate flags means that we have to do a shift and blend
		 * for each mbuf before we do the write. However, we can also
		 * add in the previously computed rx_descriptor fields to
		 * make a single 256-bit write per mbuf
		 */
		/* check the structure matches expectations */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
				offsetof(struct rte_mbuf, rearm_data) + 8);
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
				RTE_ALIGN(offsetof(struct rte_mbuf,
						   rearm_data),
					  16));
		/* build up data and do writes */
		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
			rearm6, rearm7;

		rearm6 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 8),
					    0x04);
		rearm4 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 4),
					    0x04);
		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
		rearm0 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(mbuf_flags, 4),
					    0x04);

		/* permute to add in the rx_descriptor e.g. rss fields */
		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);

		/* write to mbuf */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
				    rearm6);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
				    rearm4);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
				    rearm2);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
				    rearm0);
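
		/*
		 * mbuf_flags keeps the packets in the permuted order
		 * (hi->lo) [1, 3, 5, 7, 0, 2, 4, 6]: the low 128-bit lane
		 * carries the even packets and the high lane the odd ones,
		 * which is why the even mbufs are written first and the odd
		 * ones follow after extracting the high lane.
		 */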
		/* repeat for the odd mbufs */
		const __m256i odd_flags =
			_mm256_castsi128_si256
				(_mm256_extracti128_si256(mbuf_flags, 1));
		rearm7 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 8),
					    0x04);
		rearm5 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 4),
					    0x04);
		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
		rearm1 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(odd_flags, 4),
					    0x04);

		/* since odd mbufs are already in hi 128-bits use blend */
		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
		/* again write to mbufs */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
				    rearm7);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
				    rearm5);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
				    rearm3);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
				    rearm1);
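
		/*
		 * Each 32-byte store above covers rearm_data through
		 * rx_descriptor_fields1 in one go: dword 2 (ol_flags) comes
		 * from the 0x04 blend with the flag vector, and the upper
		 * 16 bytes carry the shuffled descriptor fields (lengths,
		 * vlan_tci, ptype, and the rss hash when enabled).
		 */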

		/* extract and record EOP bit */
		if (split_packet) {
			const __m128i eop_mask =
				_mm_set1_epi16(1 << ICE_RX_DESC_STATUS_EOF_S);
			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
								     eop_check);
			/* pack status bits into a single 128-bit register */
			const __m128i eop_bits =
				_mm_packus_epi32
					(_mm256_castsi256_si128(eop_bits256),
					 _mm256_extractf128_si256(eop_bits256,
								  1));
			/**
			 * flip bits, and mask out the EOP bit, which is now
			 * a split-packet bit, i.e. !EOP rather than EOP.
			 */
			__m128i split_bits = _mm_andnot_si128(eop_bits,
							      eop_mask);
			/**
			 * eop bits are out of order, so we need to shuffle them
			 * back into order again. In doing so, only use low 8
			 * bits, which acts like another pack instruction
			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
			 * [Since we use epi8, the 16-bit positions are
			 * multiplied by 2 in the eop_shuffle value.]
			 */
			__m128i eop_shuffle =
				_mm_set_epi8(/* zero hi 64b */
					     0xFF, 0xFF, 0xFF, 0xFF,
					     0xFF, 0xFF, 0xFF, 0xFF,
					     /* move values to lo 64b */
					     8, 0, 10, 2,
					     12, 4, 14, 6);
			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
			*(uint64_t *)split_packet =
				_mm_cvtsi128_si64(split_bits);
			split_packet += ICE_DESCS_PER_LOOP_AVX;
		}
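
		/*
		 * split_packet thus records one byte per packet: nonzero
		 * means the descriptor had no EOP bit set, i.e. the packet
		 * continues in the next descriptor. The scattered Rx
		 * wrappers below scan these bytes (eight at a time through
		 * a uint64_t view) to find packets needing reassembly.
		 */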

		/* perform dd_check */
		status0_7 = _mm256_and_si256(status0_7, dd_check);
		status0_7 = _mm256_packs_epi32(status0_7,
					       _mm256_setzero_si256());

		uint64_t burst = rte_popcount64
					(_mm_cvtsi128_si64
						(_mm256_extracti128_si256
							(status0_7, 1)));
		burst += rte_popcount64
				(_mm_cvtsi128_si64
					(_mm256_castsi256_si128(status0_7)));
		received += burst;
		if (burst != ICE_DESCS_PER_LOOP_AVX)
			break;
	}

	/* update tail pointers */
	rxq->rx_tail += received;
	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
		rxq->rx_tail--;
		received--;
	}
	rxq->rxrearm_nb += received;
	return received;
}

/**
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
 */
uint16_t
ice_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
			 uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL, false);
}

/**
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
 */
uint16_t
ice_recv_pkts_vec_avx512_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
				 uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts,
					     nb_pkts, NULL, true);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
 */
static uint16_t
ice_recv_scattered_burst_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
				    uint16_t nb_pkts)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
							 split_flags, false);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble then */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
					    &rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
}
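
/*
 * Note: the scattered burst helpers are limited to ICE_VPMD_RX_BURST
 * packets per call, since split_flags holds one flag byte per packet
 * of the burst.
 */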

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
 */
static uint16_t
ice_recv_scattered_burst_vec_avx512_offload(void *rx_queue,
					    struct rte_mbuf **rx_pkts,
					    uint16_t nb_pkts)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx512(rxq,
				rx_pkts, nb_pkts, split_flags, true);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble then */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
					    &rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 * Main receive routine that can handle arbitrary burst sizes
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
 */
uint16_t
ice_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
				   uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > ICE_VPMD_RX_BURST) {
		uint16_t burst = ice_recv_scattered_burst_vec_avx512(rx_queue,
				rx_pkts + retval, ICE_VPMD_RX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < ICE_VPMD_RX_BURST)
			return retval;
	}
	return retval + ice_recv_scattered_burst_vec_avx512(rx_queue,
				rx_pkts + retval, nb_pkts);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 * Main receive routine that can handle arbitrary burst sizes
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
 */
uint16_t
ice_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
					   struct rte_mbuf **rx_pkts,
					   uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > ICE_VPMD_RX_BURST) {
		uint16_t burst =
			ice_recv_scattered_burst_vec_avx512_offload(rx_queue,
				rx_pkts + retval, ICE_VPMD_RX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < ICE_VPMD_RX_BURST)
			return retval;
	}
	return retval + ice_recv_scattered_burst_vec_avx512_offload(rx_queue,
				rx_pkts + retval, nb_pkts);
}

static __rte_always_inline void
ice_vtx1(volatile struct ice_tx_desc *txdp,
	 struct rte_mbuf *pkt, uint64_t flags, bool do_offload)
{
	uint64_t high_qw =
		(ICE_TX_DESC_DTYPE_DATA |
		 ((uint64_t)flags << ICE_TXD_QW1_CMD_S) |
		 ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));

	if (do_offload)
		ice_txd_enable_offload(pkt, &high_qw);

	__m128i descriptor = _mm_set_epi64x(high_qw, rte_pktmbuf_iova(pkt));
	_mm_store_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
}
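
/*
 * Each 16-byte Tx data descriptor is two quadwords: the low quadword
 * holds the buffer's DMA address, the high quadword packs the
 * descriptor type, command flags and buffer size (plus any offload
 * fields filled in by ice_txd_enable_offload()).
 */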

static __rte_always_inline void
ice_vtx(volatile struct ice_tx_desc *txdp, struct rte_mbuf **pkt,
	uint16_t nb_pkts, uint64_t flags, bool do_offload)
{
	const uint64_t hi_qw_tmpl = (ICE_TX_DESC_DTYPE_DATA |
			((uint64_t)flags << ICE_TXD_QW1_CMD_S));

	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
		uint64_t hi_qw3 =
			hi_qw_tmpl |
			((uint64_t)pkt[3]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (do_offload)
			ice_txd_enable_offload(pkt[3], &hi_qw3);
		uint64_t hi_qw2 =
			hi_qw_tmpl |
			((uint64_t)pkt[2]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (do_offload)
			ice_txd_enable_offload(pkt[2], &hi_qw2);
		uint64_t hi_qw1 =
			hi_qw_tmpl |
			((uint64_t)pkt[1]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (do_offload)
			ice_txd_enable_offload(pkt[1], &hi_qw1);
		uint64_t hi_qw0 =
			hi_qw_tmpl |
			((uint64_t)pkt[0]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (do_offload)
			ice_txd_enable_offload(pkt[0], &hi_qw0);
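
		/*
		 * build four descriptors (high qword, address) and write
		 * them to the ring with a single 64-byte store
		 */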
		__m512i desc0_3 =
			_mm512_set_epi64
				(hi_qw3, rte_pktmbuf_iova(pkt[3]),
				 hi_qw2, rte_pktmbuf_iova(pkt[2]),
				 hi_qw1, rte_pktmbuf_iova(pkt[1]),
				 hi_qw0, rte_pktmbuf_iova(pkt[0]));
		_mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3);
	}

	/* do any last ones */
	while (nb_pkts) {
		ice_vtx1(txdp, *pkt, flags, do_offload);
		txdp++, pkt++, nb_pkts--;
	}
}

static __rte_always_inline uint16_t
ice_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
				uint16_t nb_pkts, bool do_offload)
{
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
	volatile struct ice_tx_desc *txdp;
	struct ci_tx_entry_vec *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = ICE_TD_CMD;
	uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;

	/* crossing the tx_rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

	if (txq->nb_tx_free < txq->tx_free_thresh)
		ci_tx_free_bufs_vec(txq, ice_tx_desc_done, false);

	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->ice_tx_ring[tx_id];
	txep = (void *)txq->sw_ring;
	txep += tx_id;

	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		ci_tx_backlog_entry_vec(txep, tx_pkts, n);

		ice_vtx(txdp, tx_pkts, n - 1, flags, do_offload);
		tx_pkts += (n - 1);
		txdp += (n - 1);

		ice_vtx1(txdp, *tx_pkts++, rs, do_offload);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = txq->ice_tx_ring;
		txep = (void *)txq->sw_ring;
	}

	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);

	ice_vtx(txdp, tx_pkts, nb_commit, flags, do_offload);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->tx_next_rs) {
		txq->ice_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
					 ICE_TXD_QW1_CMD_S);
		txq->tx_next_rs =
			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
	}

	txq->tx_tail = tx_id;

	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

uint16_t
ice_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
			 uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = ice_xmit_fixed_burst_vec_avx512(tx_queue,
						      &tx_pkts[nb_tx], num, false);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}
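
/*
 * The offload variant below shares ice_xmit_fixed_burst_vec_avx512()
 * with the plain path above: do_offload is a compile-time constant at
 * each call site, and the always-inline helpers are expected to fold
 * the offload branches away, so neither path pays for the other.
 */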

uint16_t
ice_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
				 uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = ice_xmit_fixed_burst_vec_avx512(tx_queue,
						      &tx_pkts[nb_tx], num, true);

		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}