1c1d14583SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause 2c1d14583SBruce Richardson * Copyright(c) 2019 Intel Corporation 3c1d14583SBruce Richardson */ 4c1d14583SBruce Richardson 5c1d14583SBruce Richardson #include "iavf_rxtx_vec_common.h" 6c1d14583SBruce Richardson 7c1d14583SBruce Richardson #include <rte_vect.h> 8c1d14583SBruce Richardson 9c1d14583SBruce Richardson static __rte_always_inline void 10c1d14583SBruce Richardson iavf_rxq_rearm(struct iavf_rx_queue *rxq) 11c1d14583SBruce Richardson { 12c1d14583SBruce Richardson return iavf_rxq_rearm_common(rxq, false); 13c1d14583SBruce Richardson } 14c1d14583SBruce Richardson 15c1d14583SBruce Richardson #define PKTLEN_SHIFT 10 16c1d14583SBruce Richardson 17c1d14583SBruce Richardson static __rte_always_inline uint16_t 18c1d14583SBruce Richardson _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq, 19c1d14583SBruce Richardson struct rte_mbuf **rx_pkts, 20c1d14583SBruce Richardson uint16_t nb_pkts, uint8_t *split_packet, 21c1d14583SBruce Richardson bool offload) 22c1d14583SBruce Richardson { 23c1d14583SBruce Richardson #define IAVF_DESCS_PER_LOOP_AVX 8 24c1d14583SBruce Richardson 25c1d14583SBruce Richardson /* const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; */ 26c1d14583SBruce Richardson const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl; 27c1d14583SBruce Richardson 28c1d14583SBruce Richardson const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 29c1d14583SBruce Richardson 0, rxq->mbuf_initializer); 30c1d14583SBruce Richardson /* struct iavf_rx_entry *sw_ring = &rxq->sw_ring[rxq->rx_tail]; */ 31c1d14583SBruce Richardson struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail]; 32c1d14583SBruce Richardson volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail; 33c1d14583SBruce Richardson const int avx_aligned = ((rxq->rx_tail & 1) == 0); 34c1d14583SBruce Richardson 35c1d14583SBruce Richardson rte_prefetch0(rxdp); 36c1d14583SBruce Richardson 37c1d14583SBruce Richardson /* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */ 38c1d14583SBruce Richardson nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX); 39c1d14583SBruce Richardson 40c1d14583SBruce Richardson /* See if we need to rearm the RX queue - gives the prefetch a bit 41c1d14583SBruce Richardson * of time to act 42c1d14583SBruce Richardson */ 43c1d14583SBruce Richardson if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH) 44c1d14583SBruce Richardson iavf_rxq_rearm(rxq); 45c1d14583SBruce Richardson 46c1d14583SBruce Richardson /* Before we start moving massive data around, check to see if 47c1d14583SBruce Richardson * there is actually a packet available 48c1d14583SBruce Richardson */ 49c1d14583SBruce Richardson if (!(rxdp->wb.qword1.status_error_len & 50c1d14583SBruce Richardson rte_cpu_to_le_32(1 << IAVF_RX_DESC_STATUS_DD_SHIFT))) 51c1d14583SBruce Richardson return 0; 52c1d14583SBruce Richardson 53c1d14583SBruce Richardson /* constants used in processing loop */ 54c1d14583SBruce Richardson const __m256i crc_adjust = 55c1d14583SBruce Richardson _mm256_set_epi16 56c1d14583SBruce Richardson (/* first descriptor */ 57c1d14583SBruce Richardson 0, 0, 0, /* ignore non-length fields */ 58c1d14583SBruce Richardson -rxq->crc_len, /* sub crc on data_len */ 59c1d14583SBruce Richardson 0, /* ignore high-16bits of pkt_len */ 60c1d14583SBruce Richardson -rxq->crc_len, /* sub crc on pkt_len */ 61c1d14583SBruce Richardson 0, 0, /* ignore pkt_type field */ 62c1d14583SBruce Richardson /* second descriptor */ 63c1d14583SBruce Richardson 0, 0, 0, /* ignore non-length fields */ 64c1d14583SBruce Richardson -rxq->crc_len, /* sub crc on data_len */ 65c1d14583SBruce Richardson 0, /* ignore high-16bits of pkt_len */ 66c1d14583SBruce Richardson -rxq->crc_len, /* sub crc on pkt_len */ 67c1d14583SBruce Richardson 0, 0 /* ignore pkt_type field */ 68c1d14583SBruce Richardson ); 69c1d14583SBruce Richardson 70c1d14583SBruce Richardson /* 8 packets DD mask, LSB in each 32-bit value */ 71c1d14583SBruce Richardson const __m256i dd_check = _mm256_set1_epi32(1); 72c1d14583SBruce Richardson 73c1d14583SBruce Richardson /* 8 packets EOP mask, second-LSB in each 32-bit value */ 74c1d14583SBruce Richardson const __m256i eop_check = _mm256_slli_epi32(dd_check, 75c1d14583SBruce Richardson IAVF_RX_DESC_STATUS_EOF_SHIFT); 76c1d14583SBruce Richardson 77c1d14583SBruce Richardson /* mask to shuffle from desc. to mbuf (2 descriptors)*/ 78c1d14583SBruce Richardson const __m256i shuf_msk = 79c1d14583SBruce Richardson _mm256_set_epi8 80c1d14583SBruce Richardson (/* first descriptor */ 81c1d14583SBruce Richardson 7, 6, 5, 4, /* octet 4~7, 32bits rss */ 82c1d14583SBruce Richardson 3, 2, /* octet 2~3, low 16 bits vlan_macip */ 83c1d14583SBruce Richardson 15, 14, /* octet 15~14, 16 bits data_len */ 84c1d14583SBruce Richardson 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */ 85c1d14583SBruce Richardson 15, 14, /* octet 15~14, low 16 bits pkt_len */ 86c1d14583SBruce Richardson 0xFF, 0xFF, /* pkt_type set as unknown */ 87c1d14583SBruce Richardson 0xFF, 0xFF, /*pkt_type set as unknown */ 88c1d14583SBruce Richardson /* second descriptor */ 89c1d14583SBruce Richardson 7, 6, 5, 4, /* octet 4~7, 32bits rss */ 90c1d14583SBruce Richardson 3, 2, /* octet 2~3, low 16 bits vlan_macip */ 91c1d14583SBruce Richardson 15, 14, /* octet 15~14, 16 bits data_len */ 92c1d14583SBruce Richardson 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */ 93c1d14583SBruce Richardson 15, 14, /* octet 15~14, low 16 bits pkt_len */ 94c1d14583SBruce Richardson 0xFF, 0xFF, /* pkt_type set as unknown */ 95c1d14583SBruce Richardson 0xFF, 0xFF /*pkt_type set as unknown */ 96c1d14583SBruce Richardson ); 97c1d14583SBruce Richardson /** 98c1d14583SBruce Richardson * compile-time check the above crc and shuffle layout is correct. 99c1d14583SBruce Richardson * NOTE: the first field (lowest address) is given last in set_epi 100c1d14583SBruce Richardson * calls above. 101c1d14583SBruce Richardson */ 102c1d14583SBruce Richardson RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != 103c1d14583SBruce Richardson offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); 104c1d14583SBruce Richardson RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != 105c1d14583SBruce Richardson offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); 106c1d14583SBruce Richardson RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) != 107c1d14583SBruce Richardson offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10); 108c1d14583SBruce Richardson RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) != 109c1d14583SBruce Richardson offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); 110c1d14583SBruce Richardson 111c1d14583SBruce Richardson /* Status/Error flag masks */ 112c1d14583SBruce Richardson /** 113c1d14583SBruce Richardson * mask everything except RSS, flow director and VLAN flags 114c1d14583SBruce Richardson * bit2 is for VLAN tag, bit11 for flow director indication 115c1d14583SBruce Richardson * bit13:12 for RSS indication. Bits 3-5 of error 116c1d14583SBruce Richardson * field (bits 22-24) are for IP/L4 checksum errors 117c1d14583SBruce Richardson */ 118c1d14583SBruce Richardson const __m256i flags_mask = 119c1d14583SBruce Richardson _mm256_set1_epi32((1 << 2) | (1 << 11) | 120c1d14583SBruce Richardson (3 << 12) | (7 << 22)); 121c1d14583SBruce Richardson /** 122c1d14583SBruce Richardson * data to be shuffled by result of flag mask. If VLAN bit is set, 123c1d14583SBruce Richardson * (bit 2), then position 4 in this array will be used in the 124c1d14583SBruce Richardson * destination 125c1d14583SBruce Richardson */ 126c1d14583SBruce Richardson const __m256i vlan_flags_shuf = 127c1d14583SBruce Richardson _mm256_set_epi32(0, 0, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 0, 128c1d14583SBruce Richardson 0, 0, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 0); 129c1d14583SBruce Richardson /** 130c1d14583SBruce Richardson * data to be shuffled by result of flag mask, shifted down 11. 131c1d14583SBruce Richardson * If RSS/FDIR bits are set, shuffle moves appropriate flags in 132c1d14583SBruce Richardson * place. 133c1d14583SBruce Richardson */ 134c1d14583SBruce Richardson const __m256i rss_flags_shuf = 135c1d14583SBruce Richardson _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 136c1d14583SBruce Richardson RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_FDIR, RTE_MBUF_F_RX_RSS_HASH, 137c1d14583SBruce Richardson 0, 0, 0, 0, RTE_MBUF_F_RX_FDIR, 0,/* end up 128-bits */ 138c1d14583SBruce Richardson 0, 0, 0, 0, 0, 0, 0, 0, 139c1d14583SBruce Richardson RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_FDIR, RTE_MBUF_F_RX_RSS_HASH, 140c1d14583SBruce Richardson 0, 0, 0, 0, RTE_MBUF_F_RX_FDIR, 0); 141c1d14583SBruce Richardson 142c1d14583SBruce Richardson /** 143c1d14583SBruce Richardson * data to be shuffled by the result of the flags mask shifted by 22 144c1d14583SBruce Richardson * bits. This gives use the l3_l4 flags. 145c1d14583SBruce Richardson */ 146c1d14583SBruce Richardson const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 147c1d14583SBruce Richardson /* shift right 1 bit to make sure it not exceed 255 */ 148c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 149c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 150c1d14583SBruce Richardson (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 151c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1, 152c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 153c1d14583SBruce Richardson (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1, 154c1d14583SBruce Richardson (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 155c1d14583SBruce Richardson (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1, 156c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1, 157c1d14583SBruce Richardson (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1, 158c1d14583SBruce Richardson /* second 128-bits */ 159c1d14583SBruce Richardson 0, 0, 0, 0, 0, 0, 0, 0, 160c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 161c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 162c1d14583SBruce Richardson (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 163c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1, 164c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 165c1d14583SBruce Richardson (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1, 166c1d14583SBruce Richardson (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 167c1d14583SBruce Richardson (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1, 168c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1, 169c1d14583SBruce Richardson (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1); 170c1d14583SBruce Richardson 171c1d14583SBruce Richardson const __m256i cksum_mask = 172c1d14583SBruce Richardson _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD | 173c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 174c1d14583SBruce Richardson RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD); 175c1d14583SBruce Richardson 176c1d14583SBruce Richardson RTE_SET_USED(avx_aligned); /* for 32B descriptors we don't use this */ 177c1d14583SBruce Richardson 178c1d14583SBruce Richardson uint16_t i, received; 179c1d14583SBruce Richardson 180c1d14583SBruce Richardson for (i = 0, received = 0; i < nb_pkts; 181c1d14583SBruce Richardson i += IAVF_DESCS_PER_LOOP_AVX, 182c1d14583SBruce Richardson rxdp += IAVF_DESCS_PER_LOOP_AVX) { 183c1d14583SBruce Richardson /* step 1, copy over 8 mbuf pointers to rx_pkts array */ 184c1d14583SBruce Richardson _mm256_storeu_si256((void *)&rx_pkts[i], 185c1d14583SBruce Richardson _mm256_loadu_si256((void *)&sw_ring[i])); 186c1d14583SBruce Richardson #ifdef RTE_ARCH_X86_64 187c1d14583SBruce Richardson _mm256_storeu_si256 188c1d14583SBruce Richardson ((void *)&rx_pkts[i + 4], 189c1d14583SBruce Richardson _mm256_loadu_si256((void *)&sw_ring[i + 4])); 190c1d14583SBruce Richardson #endif 191c1d14583SBruce Richardson 192c1d14583SBruce Richardson const __m128i raw_desc7 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7)); 193c1d14583SBruce Richardson rte_compiler_barrier(); 194c1d14583SBruce Richardson const __m128i raw_desc6 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6)); 195c1d14583SBruce Richardson rte_compiler_barrier(); 196c1d14583SBruce Richardson const __m128i raw_desc5 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5)); 197c1d14583SBruce Richardson rte_compiler_barrier(); 198c1d14583SBruce Richardson const __m128i raw_desc4 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4)); 199c1d14583SBruce Richardson rte_compiler_barrier(); 200c1d14583SBruce Richardson const __m128i raw_desc3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3)); 201c1d14583SBruce Richardson rte_compiler_barrier(); 202c1d14583SBruce Richardson const __m128i raw_desc2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2)); 203c1d14583SBruce Richardson rte_compiler_barrier(); 204c1d14583SBruce Richardson const __m128i raw_desc1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1)); 205c1d14583SBruce Richardson rte_compiler_barrier(); 206c1d14583SBruce Richardson const __m128i raw_desc0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0)); 207c1d14583SBruce Richardson 208c1d14583SBruce Richardson const __m256i raw_desc6_7 = 209c1d14583SBruce Richardson _mm256_inserti128_si256(_mm256_castsi128_si256(raw_desc6), raw_desc7, 1); 210c1d14583SBruce Richardson const __m256i raw_desc4_5 = 211c1d14583SBruce Richardson _mm256_inserti128_si256(_mm256_castsi128_si256(raw_desc4), raw_desc5, 1); 212c1d14583SBruce Richardson const __m256i raw_desc2_3 = 213c1d14583SBruce Richardson _mm256_inserti128_si256(_mm256_castsi128_si256(raw_desc2), raw_desc3, 1); 214c1d14583SBruce Richardson const __m256i raw_desc0_1 = 215c1d14583SBruce Richardson _mm256_inserti128_si256(_mm256_castsi128_si256(raw_desc0), raw_desc1, 1); 216c1d14583SBruce Richardson 217c1d14583SBruce Richardson if (split_packet) { 218c1d14583SBruce Richardson int j; 219c1d14583SBruce Richardson 220c1d14583SBruce Richardson for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++) 221c1d14583SBruce Richardson rte_mbuf_prefetch_part2(rx_pkts[i + j]); 222c1d14583SBruce Richardson } 223c1d14583SBruce Richardson 224c1d14583SBruce Richardson /** 225c1d14583SBruce Richardson * convert descriptors 4-7 into mbufs, adjusting length and 226c1d14583SBruce Richardson * re-arranging fields. Then write into the mbuf 227c1d14583SBruce Richardson */ 228c1d14583SBruce Richardson const __m256i len6_7 = _mm256_slli_epi32(raw_desc6_7, 229c1d14583SBruce Richardson PKTLEN_SHIFT); 230c1d14583SBruce Richardson const __m256i len4_5 = _mm256_slli_epi32(raw_desc4_5, 231c1d14583SBruce Richardson PKTLEN_SHIFT); 232c1d14583SBruce Richardson const __m256i desc6_7 = _mm256_blend_epi16(raw_desc6_7, 233c1d14583SBruce Richardson len6_7, 0x80); 234c1d14583SBruce Richardson const __m256i desc4_5 = _mm256_blend_epi16(raw_desc4_5, 235c1d14583SBruce Richardson len4_5, 0x80); 236c1d14583SBruce Richardson __m256i mb6_7 = _mm256_shuffle_epi8(desc6_7, shuf_msk); 237c1d14583SBruce Richardson __m256i mb4_5 = _mm256_shuffle_epi8(desc4_5, shuf_msk); 238c1d14583SBruce Richardson 239c1d14583SBruce Richardson mb6_7 = _mm256_add_epi16(mb6_7, crc_adjust); 240c1d14583SBruce Richardson mb4_5 = _mm256_add_epi16(mb4_5, crc_adjust); 241c1d14583SBruce Richardson /** 242c1d14583SBruce Richardson * to get packet types, shift 64-bit values down 30 bits 243c1d14583SBruce Richardson * and so ptype is in lower 8-bits in each 244c1d14583SBruce Richardson */ 245c1d14583SBruce Richardson const __m256i ptypes6_7 = _mm256_srli_epi64(desc6_7, 30); 246c1d14583SBruce Richardson const __m256i ptypes4_5 = _mm256_srli_epi64(desc4_5, 30); 247c1d14583SBruce Richardson const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24); 248c1d14583SBruce Richardson const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8); 249c1d14583SBruce Richardson const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24); 250c1d14583SBruce Richardson const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8); 251c1d14583SBruce Richardson 252c1d14583SBruce Richardson mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype7], 4); 253c1d14583SBruce Richardson mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype6], 0); 254c1d14583SBruce Richardson mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype5], 4); 255c1d14583SBruce Richardson mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype4], 0); 256c1d14583SBruce Richardson /* merge the status bits into one register */ 257c1d14583SBruce Richardson const __m256i status4_7 = _mm256_unpackhi_epi32(desc6_7, 258c1d14583SBruce Richardson desc4_5); 259c1d14583SBruce Richardson 260c1d14583SBruce Richardson /** 261c1d14583SBruce Richardson * convert descriptors 0-3 into mbufs, adjusting length and 262c1d14583SBruce Richardson * re-arranging fields. Then write into the mbuf 263c1d14583SBruce Richardson */ 264c1d14583SBruce Richardson const __m256i len2_3 = _mm256_slli_epi32(raw_desc2_3, 265c1d14583SBruce Richardson PKTLEN_SHIFT); 266c1d14583SBruce Richardson const __m256i len0_1 = _mm256_slli_epi32(raw_desc0_1, 267c1d14583SBruce Richardson PKTLEN_SHIFT); 268c1d14583SBruce Richardson const __m256i desc2_3 = _mm256_blend_epi16(raw_desc2_3, 269c1d14583SBruce Richardson len2_3, 0x80); 270c1d14583SBruce Richardson const __m256i desc0_1 = _mm256_blend_epi16(raw_desc0_1, 271c1d14583SBruce Richardson len0_1, 0x80); 272c1d14583SBruce Richardson __m256i mb2_3 = _mm256_shuffle_epi8(desc2_3, shuf_msk); 273c1d14583SBruce Richardson __m256i mb0_1 = _mm256_shuffle_epi8(desc0_1, shuf_msk); 274c1d14583SBruce Richardson 275c1d14583SBruce Richardson mb2_3 = _mm256_add_epi16(mb2_3, crc_adjust); 276c1d14583SBruce Richardson mb0_1 = _mm256_add_epi16(mb0_1, crc_adjust); 277c1d14583SBruce Richardson /* get the packet types */ 278c1d14583SBruce Richardson const __m256i ptypes2_3 = _mm256_srli_epi64(desc2_3, 30); 279c1d14583SBruce Richardson const __m256i ptypes0_1 = _mm256_srli_epi64(desc0_1, 30); 280c1d14583SBruce Richardson const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24); 281c1d14583SBruce Richardson const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8); 282c1d14583SBruce Richardson const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24); 283c1d14583SBruce Richardson const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8); 284c1d14583SBruce Richardson 285c1d14583SBruce Richardson mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype3], 4); 286c1d14583SBruce Richardson mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype2], 0); 287c1d14583SBruce Richardson mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype1], 4); 288c1d14583SBruce Richardson mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype0], 0); 289c1d14583SBruce Richardson /* merge the status bits into one register */ 290c1d14583SBruce Richardson const __m256i status0_3 = _mm256_unpackhi_epi32(desc2_3, 291c1d14583SBruce Richardson desc0_1); 292c1d14583SBruce Richardson 293c1d14583SBruce Richardson /** 294c1d14583SBruce Richardson * take the two sets of status bits and merge to one 295c1d14583SBruce Richardson * After merge, the packets status flags are in the 296c1d14583SBruce Richardson * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6] 297c1d14583SBruce Richardson */ 298c1d14583SBruce Richardson __m256i status0_7 = _mm256_unpacklo_epi64(status4_7, 299c1d14583SBruce Richardson status0_3); 300c1d14583SBruce Richardson __m256i mbuf_flags = _mm256_set1_epi32(0); 301c1d14583SBruce Richardson 302c1d14583SBruce Richardson if (offload) { 303c1d14583SBruce Richardson /* now do flag manipulation */ 304c1d14583SBruce Richardson 305c1d14583SBruce Richardson /* get only flag/error bits we want */ 306c1d14583SBruce Richardson const __m256i flag_bits = 307c1d14583SBruce Richardson _mm256_and_si256(status0_7, flags_mask); 308c1d14583SBruce Richardson /* set vlan and rss flags */ 309c1d14583SBruce Richardson const __m256i vlan_flags = 310c1d14583SBruce Richardson _mm256_shuffle_epi8(vlan_flags_shuf, flag_bits); 311c1d14583SBruce Richardson const __m256i rss_flags = 312c1d14583SBruce Richardson _mm256_shuffle_epi8(rss_flags_shuf, 313c1d14583SBruce Richardson _mm256_srli_epi32(flag_bits, 11)); 314c1d14583SBruce Richardson /** 315c1d14583SBruce Richardson * l3_l4_error flags, shuffle, then shift to correct adjustment 316c1d14583SBruce Richardson * of flags in flags_shuf, and finally mask out extra bits 317c1d14583SBruce Richardson */ 318c1d14583SBruce Richardson __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf, 319c1d14583SBruce Richardson _mm256_srli_epi32(flag_bits, 22)); 320c1d14583SBruce Richardson l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1); 321c1d14583SBruce Richardson l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask); 322c1d14583SBruce Richardson 323c1d14583SBruce Richardson /* merge flags */ 324c1d14583SBruce Richardson mbuf_flags = _mm256_or_si256(l3_l4_flags, 325c1d14583SBruce Richardson _mm256_or_si256(rss_flags, vlan_flags)); 326c1d14583SBruce Richardson } 327c1d14583SBruce Richardson 328c1d14583SBruce Richardson /** 329c1d14583SBruce Richardson * At this point, we have the 8 sets of flags in the low 16-bits 330c1d14583SBruce Richardson * of each 32-bit value in vlan0. 331c1d14583SBruce Richardson * We want to extract these, and merge them with the mbuf init 332c1d14583SBruce Richardson * data so we can do a single write to the mbuf to set the flags 333c1d14583SBruce Richardson * and all the other initialization fields. Extracting the 334c1d14583SBruce Richardson * appropriate flags means that we have to do a shift and blend 335c1d14583SBruce Richardson * for each mbuf before we do the write. However, we can also 336c1d14583SBruce Richardson * add in the previously computed rx_descriptor fields to 337c1d14583SBruce Richardson * make a single 256-bit write per mbuf 338c1d14583SBruce Richardson */ 339c1d14583SBruce Richardson /* check the structure matches expectations */ 340c1d14583SBruce Richardson RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != 341c1d14583SBruce Richardson offsetof(struct rte_mbuf, rearm_data) + 8); 342c1d14583SBruce Richardson RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != 343c1d14583SBruce Richardson RTE_ALIGN(offsetof(struct rte_mbuf, 344c1d14583SBruce Richardson rearm_data), 345c1d14583SBruce Richardson 16)); 346c1d14583SBruce Richardson /* build up data and do writes */ 347c1d14583SBruce Richardson __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5, 348c1d14583SBruce Richardson rearm6, rearm7; 349c1d14583SBruce Richardson rearm6 = _mm256_blend_epi32(mbuf_init, 350c1d14583SBruce Richardson _mm256_slli_si256(mbuf_flags, 8), 351c1d14583SBruce Richardson 0x04); 352c1d14583SBruce Richardson rearm4 = _mm256_blend_epi32(mbuf_init, 353c1d14583SBruce Richardson _mm256_slli_si256(mbuf_flags, 4), 354c1d14583SBruce Richardson 0x04); 355c1d14583SBruce Richardson rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04); 356c1d14583SBruce Richardson rearm0 = _mm256_blend_epi32(mbuf_init, 357c1d14583SBruce Richardson _mm256_srli_si256(mbuf_flags, 4), 358c1d14583SBruce Richardson 0x04); 359c1d14583SBruce Richardson /* permute to add in the rx_descriptor e.g. rss fields */ 360c1d14583SBruce Richardson rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20); 361c1d14583SBruce Richardson rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20); 362c1d14583SBruce Richardson rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20); 363c1d14583SBruce Richardson rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20); 364c1d14583SBruce Richardson /* write to mbuf */ 365c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data, 366c1d14583SBruce Richardson rearm6); 367c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data, 368c1d14583SBruce Richardson rearm4); 369c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, 370c1d14583SBruce Richardson rearm2); 371c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, 372c1d14583SBruce Richardson rearm0); 373c1d14583SBruce Richardson 374c1d14583SBruce Richardson /* repeat for the odd mbufs */ 375c1d14583SBruce Richardson const __m256i odd_flags = 376c1d14583SBruce Richardson _mm256_castsi128_si256 377c1d14583SBruce Richardson (_mm256_extracti128_si256(mbuf_flags, 1)); 378c1d14583SBruce Richardson rearm7 = _mm256_blend_epi32(mbuf_init, 379c1d14583SBruce Richardson _mm256_slli_si256(odd_flags, 8), 380c1d14583SBruce Richardson 0x04); 381c1d14583SBruce Richardson rearm5 = _mm256_blend_epi32(mbuf_init, 382c1d14583SBruce Richardson _mm256_slli_si256(odd_flags, 4), 383c1d14583SBruce Richardson 0x04); 384c1d14583SBruce Richardson rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04); 385c1d14583SBruce Richardson rearm1 = _mm256_blend_epi32(mbuf_init, 386c1d14583SBruce Richardson _mm256_srli_si256(odd_flags, 4), 387c1d14583SBruce Richardson 0x04); 388c1d14583SBruce Richardson /* since odd mbufs are already in hi 128-bits use blend */ 389c1d14583SBruce Richardson rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0); 390c1d14583SBruce Richardson rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0); 391c1d14583SBruce Richardson rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0); 392c1d14583SBruce Richardson rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0); 393c1d14583SBruce Richardson /* again write to mbufs */ 394c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data, 395c1d14583SBruce Richardson rearm7); 396c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data, 397c1d14583SBruce Richardson rearm5); 398c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, 399c1d14583SBruce Richardson rearm3); 400c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, 401c1d14583SBruce Richardson rearm1); 402c1d14583SBruce Richardson 403c1d14583SBruce Richardson /* extract and record EOP bit */ 404c1d14583SBruce Richardson if (split_packet) { 405c1d14583SBruce Richardson const __m128i eop_mask = 406c1d14583SBruce Richardson _mm_set1_epi16(1 << IAVF_RX_DESC_STATUS_EOF_SHIFT); 407c1d14583SBruce Richardson const __m256i eop_bits256 = _mm256_and_si256(status0_7, 408c1d14583SBruce Richardson eop_check); 409c1d14583SBruce Richardson /* pack status bits into a single 128-bit register */ 410c1d14583SBruce Richardson const __m128i eop_bits = 411c1d14583SBruce Richardson _mm_packus_epi32 412c1d14583SBruce Richardson (_mm256_castsi256_si128(eop_bits256), 413c1d14583SBruce Richardson _mm256_extractf128_si256(eop_bits256, 414c1d14583SBruce Richardson 1)); 415c1d14583SBruce Richardson /** 416c1d14583SBruce Richardson * flip bits, and mask out the EOP bit, which is now 417c1d14583SBruce Richardson * a split-packet bit i.e. !EOP, rather than EOP one. 418c1d14583SBruce Richardson */ 419c1d14583SBruce Richardson __m128i split_bits = _mm_andnot_si128(eop_bits, 420c1d14583SBruce Richardson eop_mask); 421c1d14583SBruce Richardson /** 422c1d14583SBruce Richardson * eop bits are out of order, so we need to shuffle them 423c1d14583SBruce Richardson * back into order again. In doing so, only use low 8 424c1d14583SBruce Richardson * bits, which acts like another pack instruction 425c1d14583SBruce Richardson * The original order is (hi->lo): 1,3,5,7,0,2,4,6 426c1d14583SBruce Richardson * [Since we use epi8, the 16-bit positions are 427c1d14583SBruce Richardson * multiplied by 2 in the eop_shuffle value.] 428c1d14583SBruce Richardson */ 429c1d14583SBruce Richardson __m128i eop_shuffle = 430c1d14583SBruce Richardson _mm_set_epi8(/* zero hi 64b */ 431c1d14583SBruce Richardson 0xFF, 0xFF, 0xFF, 0xFF, 432c1d14583SBruce Richardson 0xFF, 0xFF, 0xFF, 0xFF, 433c1d14583SBruce Richardson /* move values to lo 64b */ 434c1d14583SBruce Richardson 8, 0, 10, 2, 435c1d14583SBruce Richardson 12, 4, 14, 6); 436c1d14583SBruce Richardson split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle); 437c1d14583SBruce Richardson *(uint64_t *)split_packet = 438c1d14583SBruce Richardson _mm_cvtsi128_si64(split_bits); 439c1d14583SBruce Richardson split_packet += IAVF_DESCS_PER_LOOP_AVX; 440c1d14583SBruce Richardson } 441c1d14583SBruce Richardson 442c1d14583SBruce Richardson /* perform dd_check */ 443c1d14583SBruce Richardson status0_7 = _mm256_and_si256(status0_7, dd_check); 444c1d14583SBruce Richardson status0_7 = _mm256_packs_epi32(status0_7, 445c1d14583SBruce Richardson _mm256_setzero_si256()); 446c1d14583SBruce Richardson 447c1d14583SBruce Richardson uint64_t burst = rte_popcount64 448c1d14583SBruce Richardson (_mm_cvtsi128_si64 449c1d14583SBruce Richardson (_mm256_extracti128_si256 450c1d14583SBruce Richardson (status0_7, 1))); 451c1d14583SBruce Richardson burst += rte_popcount64 452c1d14583SBruce Richardson (_mm_cvtsi128_si64 453c1d14583SBruce Richardson (_mm256_castsi256_si128(status0_7))); 454c1d14583SBruce Richardson received += burst; 455c1d14583SBruce Richardson if (burst != IAVF_DESCS_PER_LOOP_AVX) 456c1d14583SBruce Richardson break; 457c1d14583SBruce Richardson } 458c1d14583SBruce Richardson 459c1d14583SBruce Richardson /* update tail pointers */ 460c1d14583SBruce Richardson rxq->rx_tail += received; 461c1d14583SBruce Richardson rxq->rx_tail &= (rxq->nb_rx_desc - 1); 462c1d14583SBruce Richardson if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */ 463c1d14583SBruce Richardson rxq->rx_tail--; 464c1d14583SBruce Richardson received--; 465c1d14583SBruce Richardson } 466c1d14583SBruce Richardson rxq->rxrearm_nb += received; 467c1d14583SBruce Richardson return received; 468c1d14583SBruce Richardson } 469c1d14583SBruce Richardson 470c1d14583SBruce Richardson static inline __m256i 471c1d14583SBruce Richardson flex_rxd_to_fdir_flags_vec_avx2(const __m256i fdir_id0_7) 472c1d14583SBruce Richardson { 473c1d14583SBruce Richardson #define FDID_MIS_MAGIC 0xFFFFFFFF 474c1d14583SBruce Richardson RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2)); 475c1d14583SBruce Richardson RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13)); 476c1d14583SBruce Richardson const __m256i pkt_fdir_bit = _mm256_set1_epi32(RTE_MBUF_F_RX_FDIR | 477c1d14583SBruce Richardson RTE_MBUF_F_RX_FDIR_ID); 478c1d14583SBruce Richardson /* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */ 479c1d14583SBruce Richardson const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC); 480c1d14583SBruce Richardson __m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7, 481c1d14583SBruce Richardson fdir_mis_mask); 482c1d14583SBruce Richardson /* this XOR op results to bit-reverse the fdir_mask */ 483c1d14583SBruce Richardson fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask); 484c1d14583SBruce Richardson const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit); 485c1d14583SBruce Richardson 486c1d14583SBruce Richardson return fdir_flags; 487c1d14583SBruce Richardson } 488c1d14583SBruce Richardson 489c1d14583SBruce Richardson static __rte_always_inline uint16_t 490c1d14583SBruce Richardson _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq, 491c1d14583SBruce Richardson struct rte_mbuf **rx_pkts, 492c1d14583SBruce Richardson uint16_t nb_pkts, uint8_t *split_packet, 493c1d14583SBruce Richardson bool offload) 494c1d14583SBruce Richardson { 495c1d14583SBruce Richardson #define IAVF_DESCS_PER_LOOP_AVX 8 496c1d14583SBruce Richardson 497c1d14583SBruce Richardson struct iavf_adapter *adapter = rxq->vsi->adapter; 498c1d14583SBruce Richardson 499c1d14583SBruce Richardson #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC 500c1d14583SBruce Richardson uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads; 501c1d14583SBruce Richardson #endif 502c1d14583SBruce Richardson const uint32_t *type_table = adapter->ptype_tbl; 503c1d14583SBruce Richardson 504c1d14583SBruce Richardson const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 505c1d14583SBruce Richardson 0, rxq->mbuf_initializer); 506c1d14583SBruce Richardson struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail]; 507c1d14583SBruce Richardson volatile union iavf_rx_flex_desc *rxdp = 508c1d14583SBruce Richardson (volatile union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail; 509c1d14583SBruce Richardson 510c1d14583SBruce Richardson rte_prefetch0(rxdp); 511c1d14583SBruce Richardson 512c1d14583SBruce Richardson /* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */ 513c1d14583SBruce Richardson nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX); 514c1d14583SBruce Richardson 515c1d14583SBruce Richardson /* See if we need to rearm the RX queue - gives the prefetch a bit 516c1d14583SBruce Richardson * of time to act 517c1d14583SBruce Richardson */ 518c1d14583SBruce Richardson if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH) 519c1d14583SBruce Richardson iavf_rxq_rearm(rxq); 520c1d14583SBruce Richardson 521c1d14583SBruce Richardson /* Before we start moving massive data around, check to see if 522c1d14583SBruce Richardson * there is actually a packet available 523c1d14583SBruce Richardson */ 524c1d14583SBruce Richardson if (!(rxdp->wb.status_error0 & 525c1d14583SBruce Richardson rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S))) 526c1d14583SBruce Richardson return 0; 527c1d14583SBruce Richardson #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC 528c1d14583SBruce Richardson bool is_tsinit = false; 529c1d14583SBruce Richardson uint8_t inflection_point = 0; 530c1d14583SBruce Richardson __m256i hw_low_last = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, rxq->phc_time); 531c1d14583SBruce Richardson if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) { 532c1d14583SBruce Richardson uint64_t sw_cur_time = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000); 533c1d14583SBruce Richardson 534c1d14583SBruce Richardson if (unlikely(sw_cur_time - rxq->hw_time_update > 4)) { 535c1d14583SBruce Richardson hw_low_last = _mm256_setzero_si256(); 536c1d14583SBruce Richardson is_tsinit = 1; 537c1d14583SBruce Richardson } else { 538c1d14583SBruce Richardson hw_low_last = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, rxq->phc_time); 539c1d14583SBruce Richardson } 540c1d14583SBruce Richardson } 541c1d14583SBruce Richardson #endif 542c1d14583SBruce Richardson 543c1d14583SBruce Richardson /* constants used in processing loop */ 544c1d14583SBruce Richardson const __m256i crc_adjust = 545c1d14583SBruce Richardson _mm256_set_epi16 546c1d14583SBruce Richardson (/* first descriptor */ 547c1d14583SBruce Richardson 0, 0, 0, /* ignore non-length fields */ 548c1d14583SBruce Richardson -rxq->crc_len, /* sub crc on data_len */ 549c1d14583SBruce Richardson 0, /* ignore high-16bits of pkt_len */ 550c1d14583SBruce Richardson -rxq->crc_len, /* sub crc on pkt_len */ 551c1d14583SBruce Richardson 0, 0, /* ignore pkt_type field */ 552c1d14583SBruce Richardson /* second descriptor */ 553c1d14583SBruce Richardson 0, 0, 0, /* ignore non-length fields */ 554c1d14583SBruce Richardson -rxq->crc_len, /* sub crc on data_len */ 555c1d14583SBruce Richardson 0, /* ignore high-16bits of pkt_len */ 556c1d14583SBruce Richardson -rxq->crc_len, /* sub crc on pkt_len */ 557c1d14583SBruce Richardson 0, 0 /* ignore pkt_type field */ 558c1d14583SBruce Richardson ); 559c1d14583SBruce Richardson 560c1d14583SBruce Richardson /* 8 packets DD mask, LSB in each 32-bit value */ 561c1d14583SBruce Richardson const __m256i dd_check = _mm256_set1_epi32(1); 562c1d14583SBruce Richardson 563c1d14583SBruce Richardson /* 8 packets EOP mask, second-LSB in each 32-bit value */ 564c1d14583SBruce Richardson const __m256i eop_check = _mm256_slli_epi32(dd_check, 565c1d14583SBruce Richardson IAVF_RX_FLEX_DESC_STATUS0_EOF_S); 566c1d14583SBruce Richardson 567c1d14583SBruce Richardson /* mask to shuffle from desc. to mbuf (2 descriptors)*/ 568c1d14583SBruce Richardson const __m256i shuf_msk = 569c1d14583SBruce Richardson _mm256_set_epi8 570c1d14583SBruce Richardson (/* first descriptor */ 571c1d14583SBruce Richardson 0xFF, 0xFF, 572c1d14583SBruce Richardson 0xFF, 0xFF, /* rss hash parsed separately */ 573c1d14583SBruce Richardson 11, 10, /* octet 10~11, 16 bits vlan_macip */ 574c1d14583SBruce Richardson 5, 4, /* octet 4~5, 16 bits data_len */ 575c1d14583SBruce Richardson 0xFF, 0xFF, /* skip hi 16 bits pkt_len, zero out */ 576c1d14583SBruce Richardson 5, 4, /* octet 4~5, 16 bits pkt_len */ 577c1d14583SBruce Richardson 0xFF, 0xFF, /* pkt_type set as unknown */ 578c1d14583SBruce Richardson 0xFF, 0xFF, /*pkt_type set as unknown */ 579c1d14583SBruce Richardson /* second descriptor */ 580c1d14583SBruce Richardson 0xFF, 0xFF, 581c1d14583SBruce Richardson 0xFF, 0xFF, /* rss hash parsed separately */ 582c1d14583SBruce Richardson 11, 10, /* octet 10~11, 16 bits vlan_macip */ 583c1d14583SBruce Richardson 5, 4, /* octet 4~5, 16 bits data_len */ 584c1d14583SBruce Richardson 0xFF, 0xFF, /* skip hi 16 bits pkt_len, zero out */ 585c1d14583SBruce Richardson 5, 4, /* octet 4~5, 16 bits pkt_len */ 586c1d14583SBruce Richardson 0xFF, 0xFF, /* pkt_type set as unknown */ 587c1d14583SBruce Richardson 0xFF, 0xFF /*pkt_type set as unknown */ 588c1d14583SBruce Richardson ); 589c1d14583SBruce Richardson /** 590c1d14583SBruce Richardson * compile-time check the above crc and shuffle layout is correct. 591c1d14583SBruce Richardson * NOTE: the first field (lowest address) is given last in set_epi 592c1d14583SBruce Richardson * calls above. 593c1d14583SBruce Richardson */ 594c1d14583SBruce Richardson RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != 595c1d14583SBruce Richardson offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); 596c1d14583SBruce Richardson RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != 597c1d14583SBruce Richardson offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); 598c1d14583SBruce Richardson RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) != 599c1d14583SBruce Richardson offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10); 600c1d14583SBruce Richardson RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) != 601c1d14583SBruce Richardson offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); 602c1d14583SBruce Richardson 603c1d14583SBruce Richardson /* Status/Error flag masks */ 604c1d14583SBruce Richardson /** 605c1d14583SBruce Richardson * mask everything except Checksum Reports, RSS indication 606c1d14583SBruce Richardson * and VLAN indication. 607c1d14583SBruce Richardson * bit6:4 for IP/L4 checksum errors. 608c1d14583SBruce Richardson * bit12 is for RSS indication. 609c1d14583SBruce Richardson * bit13 is for VLAN indication. 610c1d14583SBruce Richardson */ 611c1d14583SBruce Richardson const __m256i flags_mask = 612c1d14583SBruce Richardson _mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13)); 613c1d14583SBruce Richardson /** 614c1d14583SBruce Richardson * data to be shuffled by the result of the flags mask shifted by 4 615c1d14583SBruce Richardson * bits. This gives use the l3_l4 flags. 616c1d14583SBruce Richardson */ 617c1d14583SBruce Richardson const __m256i l3_l4_flags_shuf = 618c1d14583SBruce Richardson _mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 619c1d14583SBruce Richardson RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 620c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 621c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 622c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 623c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 624c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 625c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 626c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 627c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | 628c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 629c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | 630c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 631c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 632c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 633c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 634c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 635c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 636c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 637c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 638c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 639c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 640c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 641c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 642c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 643c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | 644c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 645c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | 646c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 647c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 648c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 649c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 650c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 651c1d14583SBruce Richardson /** 652c1d14583SBruce Richardson * second 128-bits 653c1d14583SBruce Richardson * shift right 20 bits to use the low two bits to indicate 654c1d14583SBruce Richardson * outer checksum status 655c1d14583SBruce Richardson * shift right 1 bit to make sure it not exceed 255 656c1d14583SBruce Richardson */ 657c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 658c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 659c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 660c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 661c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 662c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 663c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 664c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 665c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | 666c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 667c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | 668c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 669c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 670c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 671c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 672c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 673c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 674c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 675c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 676c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 677c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 678c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 679c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 680c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 681c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | 682c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 683c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | 684c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 685c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 686c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 687c1d14583SBruce Richardson (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 688c1d14583SBruce Richardson RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); 689c1d14583SBruce Richardson const __m256i cksum_mask = 690c1d14583SBruce Richardson _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK | 691c1d14583SBruce Richardson RTE_MBUF_F_RX_L4_CKSUM_MASK | 692c1d14583SBruce Richardson RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 693c1d14583SBruce Richardson RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK); 694c1d14583SBruce Richardson /** 695c1d14583SBruce Richardson * data to be shuffled by result of flag mask, shifted down 12. 696c1d14583SBruce Richardson * If RSS(bit12)/VLAN(bit13) are set, 697c1d14583SBruce Richardson * shuffle moves appropriate flags in place. 698c1d14583SBruce Richardson */ 699c1d14583SBruce Richardson const __m256i rss_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 700c1d14583SBruce Richardson 0, 0, 0, 0, 701c1d14583SBruce Richardson 0, 0, 0, 0, 702c1d14583SBruce Richardson RTE_MBUF_F_RX_RSS_HASH, 0, 703c1d14583SBruce Richardson RTE_MBUF_F_RX_RSS_HASH, 0, 704c1d14583SBruce Richardson /* end up 128-bits */ 705c1d14583SBruce Richardson 0, 0, 0, 0, 706c1d14583SBruce Richardson 0, 0, 0, 0, 707c1d14583SBruce Richardson 0, 0, 0, 0, 708c1d14583SBruce Richardson RTE_MBUF_F_RX_RSS_HASH, 0, 709c1d14583SBruce Richardson RTE_MBUF_F_RX_RSS_HASH, 0); 710c1d14583SBruce Richardson 711c1d14583SBruce Richardson const __m256i vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 712c1d14583SBruce Richardson 0, 0, 0, 0, 713c1d14583SBruce Richardson 0, 0, 0, 0, 714c1d14583SBruce Richardson RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 715c1d14583SBruce Richardson RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 716c1d14583SBruce Richardson 0, 0, 717c1d14583SBruce Richardson /* end up 128-bits */ 718c1d14583SBruce Richardson 0, 0, 0, 0, 719c1d14583SBruce Richardson 0, 0, 0, 0, 720c1d14583SBruce Richardson 0, 0, 0, 0, 721c1d14583SBruce Richardson RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 722c1d14583SBruce Richardson RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 723c1d14583SBruce Richardson 0, 0); 724c1d14583SBruce Richardson 725c1d14583SBruce Richardson uint16_t i, received; 726c1d14583SBruce Richardson 727c1d14583SBruce Richardson for (i = 0, received = 0; i < nb_pkts; 728c1d14583SBruce Richardson i += IAVF_DESCS_PER_LOOP_AVX, 729c1d14583SBruce Richardson rxdp += IAVF_DESCS_PER_LOOP_AVX) { 730c1d14583SBruce Richardson /* step 1, copy over 8 mbuf pointers to rx_pkts array */ 731c1d14583SBruce Richardson _mm256_storeu_si256((void *)&rx_pkts[i], 732c1d14583SBruce Richardson _mm256_loadu_si256((void *)&sw_ring[i])); 733c1d14583SBruce Richardson #ifdef RTE_ARCH_X86_64 734c1d14583SBruce Richardson _mm256_storeu_si256 735c1d14583SBruce Richardson ((void *)&rx_pkts[i + 4], 736c1d14583SBruce Richardson _mm256_loadu_si256((void *)&sw_ring[i + 4])); 737c1d14583SBruce Richardson #endif 738c1d14583SBruce Richardson 739c1d14583SBruce Richardson __m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7; 740c1d14583SBruce Richardson 741c1d14583SBruce Richardson const __m128i raw_desc7 = 742c1d14583SBruce Richardson _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7)); 743c1d14583SBruce Richardson rte_compiler_barrier(); 744c1d14583SBruce Richardson const __m128i raw_desc6 = 745c1d14583SBruce Richardson _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6)); 746c1d14583SBruce Richardson rte_compiler_barrier(); 747c1d14583SBruce Richardson const __m128i raw_desc5 = 748c1d14583SBruce Richardson _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5)); 749c1d14583SBruce Richardson rte_compiler_barrier(); 750c1d14583SBruce Richardson const __m128i raw_desc4 = 751c1d14583SBruce Richardson _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4)); 752c1d14583SBruce Richardson rte_compiler_barrier(); 753c1d14583SBruce Richardson const __m128i raw_desc3 = 754c1d14583SBruce Richardson _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3)); 755c1d14583SBruce Richardson rte_compiler_barrier(); 756c1d14583SBruce Richardson const __m128i raw_desc2 = 757c1d14583SBruce Richardson _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2)); 758c1d14583SBruce Richardson rte_compiler_barrier(); 759c1d14583SBruce Richardson const __m128i raw_desc1 = 760c1d14583SBruce Richardson _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1)); 761c1d14583SBruce Richardson rte_compiler_barrier(); 762c1d14583SBruce Richardson const __m128i raw_desc0 = 763c1d14583SBruce Richardson _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0)); 764c1d14583SBruce Richardson 765c1d14583SBruce Richardson raw_desc6_7 = 766c1d14583SBruce Richardson _mm256_inserti128_si256 767c1d14583SBruce Richardson (_mm256_castsi128_si256(raw_desc6), 768c1d14583SBruce Richardson raw_desc7, 1); 769c1d14583SBruce Richardson raw_desc4_5 = 770c1d14583SBruce Richardson _mm256_inserti128_si256 771c1d14583SBruce Richardson (_mm256_castsi128_si256(raw_desc4), 772c1d14583SBruce Richardson raw_desc5, 1); 773c1d14583SBruce Richardson raw_desc2_3 = 774c1d14583SBruce Richardson _mm256_inserti128_si256 775c1d14583SBruce Richardson (_mm256_castsi128_si256(raw_desc2), 776c1d14583SBruce Richardson raw_desc3, 1); 777c1d14583SBruce Richardson raw_desc0_1 = 778c1d14583SBruce Richardson _mm256_inserti128_si256 779c1d14583SBruce Richardson (_mm256_castsi128_si256(raw_desc0), 780c1d14583SBruce Richardson raw_desc1, 1); 781c1d14583SBruce Richardson 782c1d14583SBruce Richardson if (split_packet) { 783c1d14583SBruce Richardson int j; 784c1d14583SBruce Richardson 785c1d14583SBruce Richardson for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++) 786c1d14583SBruce Richardson rte_mbuf_prefetch_part2(rx_pkts[i + j]); 787c1d14583SBruce Richardson } 788c1d14583SBruce Richardson 789c1d14583SBruce Richardson /** 790c1d14583SBruce Richardson * convert descriptors 4-7 into mbufs, re-arrange fields. 791c1d14583SBruce Richardson * Then write into the mbuf. 792c1d14583SBruce Richardson */ 793c1d14583SBruce Richardson __m256i mb6_7 = _mm256_shuffle_epi8(raw_desc6_7, shuf_msk); 794c1d14583SBruce Richardson __m256i mb4_5 = _mm256_shuffle_epi8(raw_desc4_5, shuf_msk); 795c1d14583SBruce Richardson 796c1d14583SBruce Richardson mb6_7 = _mm256_add_epi16(mb6_7, crc_adjust); 797c1d14583SBruce Richardson mb4_5 = _mm256_add_epi16(mb4_5, crc_adjust); 798c1d14583SBruce Richardson 799c1d14583SBruce Richardson /** 800c1d14583SBruce Richardson * to get packet types, ptype is located in bit16-25 801c1d14583SBruce Richardson * of each 128bits 802c1d14583SBruce Richardson */ 803c1d14583SBruce Richardson const __m256i ptype_mask = 804c1d14583SBruce Richardson _mm256_set1_epi16(IAVF_RX_FLEX_DESC_PTYPE_M); 805c1d14583SBruce Richardson const __m256i ptypes6_7 = 806c1d14583SBruce Richardson _mm256_and_si256(raw_desc6_7, ptype_mask); 807c1d14583SBruce Richardson const __m256i ptypes4_5 = 808c1d14583SBruce Richardson _mm256_and_si256(raw_desc4_5, ptype_mask); 809c1d14583SBruce Richardson const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9); 810c1d14583SBruce Richardson const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1); 811c1d14583SBruce Richardson const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9); 812c1d14583SBruce Richardson const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1); 813c1d14583SBruce Richardson 814c1d14583SBruce Richardson mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype7], 4); 815c1d14583SBruce Richardson mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype6], 0); 816c1d14583SBruce Richardson mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype5], 4); 817c1d14583SBruce Richardson mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype4], 0); 818c1d14583SBruce Richardson /* merge the status bits into one register */ 819c1d14583SBruce Richardson const __m256i status4_7 = _mm256_unpackhi_epi32(raw_desc6_7, 820c1d14583SBruce Richardson raw_desc4_5); 821c1d14583SBruce Richardson 822c1d14583SBruce Richardson /** 823c1d14583SBruce Richardson * convert descriptors 0-3 into mbufs, re-arrange fields. 824c1d14583SBruce Richardson * Then write into the mbuf. 825c1d14583SBruce Richardson */ 826c1d14583SBruce Richardson __m256i mb2_3 = _mm256_shuffle_epi8(raw_desc2_3, shuf_msk); 827c1d14583SBruce Richardson __m256i mb0_1 = _mm256_shuffle_epi8(raw_desc0_1, shuf_msk); 828c1d14583SBruce Richardson 829c1d14583SBruce Richardson mb2_3 = _mm256_add_epi16(mb2_3, crc_adjust); 830c1d14583SBruce Richardson mb0_1 = _mm256_add_epi16(mb0_1, crc_adjust); 831c1d14583SBruce Richardson /** 832c1d14583SBruce Richardson * to get packet types, ptype is located in bit16-25 833c1d14583SBruce Richardson * of each 128bits 834c1d14583SBruce Richardson */ 835c1d14583SBruce Richardson const __m256i ptypes2_3 = 836c1d14583SBruce Richardson _mm256_and_si256(raw_desc2_3, ptype_mask); 837c1d14583SBruce Richardson const __m256i ptypes0_1 = 838c1d14583SBruce Richardson _mm256_and_si256(raw_desc0_1, ptype_mask); 839c1d14583SBruce Richardson const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9); 840c1d14583SBruce Richardson const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1); 841c1d14583SBruce Richardson const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9); 842c1d14583SBruce Richardson const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1); 843c1d14583SBruce Richardson 844c1d14583SBruce Richardson mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype3], 4); 845c1d14583SBruce Richardson mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype2], 0); 846c1d14583SBruce Richardson mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype1], 4); 847c1d14583SBruce Richardson mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype0], 0); 848c1d14583SBruce Richardson /* merge the status bits into one register */ 849c1d14583SBruce Richardson const __m256i status0_3 = _mm256_unpackhi_epi32(raw_desc2_3, 850c1d14583SBruce Richardson raw_desc0_1); 851c1d14583SBruce Richardson 852c1d14583SBruce Richardson /** 853c1d14583SBruce Richardson * take the two sets of status bits and merge to one 854c1d14583SBruce Richardson * After merge, the packets status flags are in the 855c1d14583SBruce Richardson * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6] 856c1d14583SBruce Richardson */ 857c1d14583SBruce Richardson __m256i status0_7 = _mm256_unpacklo_epi64(status4_7, 858c1d14583SBruce Richardson status0_3); 859c1d14583SBruce Richardson __m256i mbuf_flags = _mm256_set1_epi32(0); 860c1d14583SBruce Richardson __m256i vlan_flags = _mm256_setzero_si256(); 861c1d14583SBruce Richardson 862c1d14583SBruce Richardson if (offload) { 863c1d14583SBruce Richardson /* now do flag manipulation */ 864c1d14583SBruce Richardson 865c1d14583SBruce Richardson /* get only flag/error bits we want */ 866c1d14583SBruce Richardson const __m256i flag_bits = 867c1d14583SBruce Richardson _mm256_and_si256(status0_7, flags_mask); 868c1d14583SBruce Richardson /** 869c1d14583SBruce Richardson * l3_l4_error flags, shuffle, then shift to correct adjustment 870c1d14583SBruce Richardson * of flags in flags_shuf, and finally mask out extra bits 871c1d14583SBruce Richardson */ 872c1d14583SBruce Richardson __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf, 873c1d14583SBruce Richardson _mm256_srli_epi32(flag_bits, 4)); 874c1d14583SBruce Richardson l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1); 875c1d14583SBruce Richardson __m256i l4_outer_mask = _mm256_set1_epi32(0x6); 876c1d14583SBruce Richardson __m256i l4_outer_flags = 877c1d14583SBruce Richardson _mm256_and_si256(l3_l4_flags, l4_outer_mask); 878c1d14583SBruce Richardson l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20); 879c1d14583SBruce Richardson 880c1d14583SBruce Richardson __m256i l3_l4_mask = _mm256_set1_epi32(~0x6); 881c1d14583SBruce Richardson 882c1d14583SBruce Richardson l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask); 883c1d14583SBruce Richardson l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags); 884c1d14583SBruce Richardson l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask); 885c1d14583SBruce Richardson 886c1d14583SBruce Richardson /* set rss and vlan flags */ 887c1d14583SBruce Richardson const __m256i rss_vlan_flag_bits = 888c1d14583SBruce Richardson _mm256_srli_epi32(flag_bits, 12); 889c1d14583SBruce Richardson const __m256i rss_flags = 890c1d14583SBruce Richardson _mm256_shuffle_epi8(rss_flags_shuf, 891c1d14583SBruce Richardson rss_vlan_flag_bits); 892c1d14583SBruce Richardson 893c1d14583SBruce Richardson if (rxq->rx_flags == IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG1) 894c1d14583SBruce Richardson vlan_flags = 895c1d14583SBruce Richardson _mm256_shuffle_epi8(vlan_flags_shuf, 896c1d14583SBruce Richardson rss_vlan_flag_bits); 897c1d14583SBruce Richardson 898c1d14583SBruce Richardson const __m256i rss_vlan_flags = 899c1d14583SBruce Richardson _mm256_or_si256(rss_flags, vlan_flags); 900c1d14583SBruce Richardson 901c1d14583SBruce Richardson /* merge flags */ 902c1d14583SBruce Richardson mbuf_flags = _mm256_or_si256(l3_l4_flags, 903c1d14583SBruce Richardson rss_vlan_flags); 904c1d14583SBruce Richardson } 905c1d14583SBruce Richardson 906c1d14583SBruce Richardson if (rxq->fdir_enabled) { 907c1d14583SBruce Richardson const __m256i fdir_id4_7 = 908c1d14583SBruce Richardson _mm256_unpackhi_epi32(raw_desc6_7, raw_desc4_5); 909c1d14583SBruce Richardson 910c1d14583SBruce Richardson const __m256i fdir_id0_3 = 911c1d14583SBruce Richardson _mm256_unpackhi_epi32(raw_desc2_3, raw_desc0_1); 912c1d14583SBruce Richardson 913c1d14583SBruce Richardson const __m256i fdir_id0_7 = 914c1d14583SBruce Richardson _mm256_unpackhi_epi64(fdir_id4_7, fdir_id0_3); 915c1d14583SBruce Richardson 916c1d14583SBruce Richardson const __m256i fdir_flags = 917c1d14583SBruce Richardson flex_rxd_to_fdir_flags_vec_avx2(fdir_id0_7); 918c1d14583SBruce Richardson 919c1d14583SBruce Richardson /* merge with fdir_flags */ 920c1d14583SBruce Richardson mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags); 921c1d14583SBruce Richardson 922c1d14583SBruce Richardson /* write to mbuf: have to use scalar store here */ 923c1d14583SBruce Richardson rx_pkts[i + 0]->hash.fdir.hi = 924c1d14583SBruce Richardson _mm256_extract_epi32(fdir_id0_7, 3); 925c1d14583SBruce Richardson 926c1d14583SBruce Richardson rx_pkts[i + 1]->hash.fdir.hi = 927c1d14583SBruce Richardson _mm256_extract_epi32(fdir_id0_7, 7); 928c1d14583SBruce Richardson 929c1d14583SBruce Richardson rx_pkts[i + 2]->hash.fdir.hi = 930c1d14583SBruce Richardson _mm256_extract_epi32(fdir_id0_7, 2); 931c1d14583SBruce Richardson 932c1d14583SBruce Richardson rx_pkts[i + 3]->hash.fdir.hi = 933c1d14583SBruce Richardson _mm256_extract_epi32(fdir_id0_7, 6); 934c1d14583SBruce Richardson 935c1d14583SBruce Richardson rx_pkts[i + 4]->hash.fdir.hi = 936c1d14583SBruce Richardson _mm256_extract_epi32(fdir_id0_7, 1); 937c1d14583SBruce Richardson 938c1d14583SBruce Richardson rx_pkts[i + 5]->hash.fdir.hi = 939c1d14583SBruce Richardson _mm256_extract_epi32(fdir_id0_7, 5); 940c1d14583SBruce Richardson 941c1d14583SBruce Richardson rx_pkts[i + 6]->hash.fdir.hi = 942c1d14583SBruce Richardson _mm256_extract_epi32(fdir_id0_7, 0); 943c1d14583SBruce Richardson 944c1d14583SBruce Richardson rx_pkts[i + 7]->hash.fdir.hi = 945c1d14583SBruce Richardson _mm256_extract_epi32(fdir_id0_7, 4); 946c1d14583SBruce Richardson } /* if() on fdir_enabled */ 947c1d14583SBruce Richardson 948c1d14583SBruce Richardson if (offload) { 949c1d14583SBruce Richardson #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC 950c1d14583SBruce Richardson /** 951c1d14583SBruce Richardson * needs to load 2nd 16B of each desc, 952c1d14583SBruce Richardson * will cause performance drop to get into this context. 953c1d14583SBruce Richardson */ 954c1d14583SBruce Richardson if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH || 955c1d14583SBruce Richardson offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP || 956c1d14583SBruce Richardson rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) { 957c1d14583SBruce Richardson /* load bottom half of every 32B desc */ 958c1d14583SBruce Richardson const __m128i raw_desc_bh7 = 959c1d14583SBruce Richardson _mm_load_si128 960c1d14583SBruce Richardson (RTE_CAST_PTR(const __m128i *, &rxdp[7].wb.status_error1)); 961c1d14583SBruce Richardson rte_compiler_barrier(); 962c1d14583SBruce Richardson const __m128i raw_desc_bh6 = 963c1d14583SBruce Richardson _mm_load_si128 964c1d14583SBruce Richardson (RTE_CAST_PTR(const __m128i *, &rxdp[6].wb.status_error1)); 965c1d14583SBruce Richardson rte_compiler_barrier(); 966c1d14583SBruce Richardson const __m128i raw_desc_bh5 = 967c1d14583SBruce Richardson _mm_load_si128 968c1d14583SBruce Richardson (RTE_CAST_PTR(const __m128i *, &rxdp[5].wb.status_error1)); 969c1d14583SBruce Richardson rte_compiler_barrier(); 970c1d14583SBruce Richardson const __m128i raw_desc_bh4 = 971c1d14583SBruce Richardson _mm_load_si128 972c1d14583SBruce Richardson (RTE_CAST_PTR(const __m128i *, &rxdp[4].wb.status_error1)); 973c1d14583SBruce Richardson rte_compiler_barrier(); 974c1d14583SBruce Richardson const __m128i raw_desc_bh3 = 975c1d14583SBruce Richardson _mm_load_si128 976c1d14583SBruce Richardson (RTE_CAST_PTR(const __m128i *, &rxdp[3].wb.status_error1)); 977c1d14583SBruce Richardson rte_compiler_barrier(); 978c1d14583SBruce Richardson const __m128i raw_desc_bh2 = 979c1d14583SBruce Richardson _mm_load_si128 980c1d14583SBruce Richardson (RTE_CAST_PTR(const __m128i *, &rxdp[2].wb.status_error1)); 981c1d14583SBruce Richardson rte_compiler_barrier(); 982c1d14583SBruce Richardson const __m128i raw_desc_bh1 = 983c1d14583SBruce Richardson _mm_load_si128 984c1d14583SBruce Richardson (RTE_CAST_PTR(const __m128i *, &rxdp[1].wb.status_error1)); 985c1d14583SBruce Richardson rte_compiler_barrier(); 986c1d14583SBruce Richardson const __m128i raw_desc_bh0 = 987c1d14583SBruce Richardson _mm_load_si128 988c1d14583SBruce Richardson (RTE_CAST_PTR(const __m128i *, &rxdp[0].wb.status_error1)); 989c1d14583SBruce Richardson 990c1d14583SBruce Richardson __m256i raw_desc_bh6_7 = 991c1d14583SBruce Richardson _mm256_inserti128_si256 992c1d14583SBruce Richardson (_mm256_castsi128_si256(raw_desc_bh6), 993c1d14583SBruce Richardson raw_desc_bh7, 1); 994c1d14583SBruce Richardson __m256i raw_desc_bh4_5 = 995c1d14583SBruce Richardson _mm256_inserti128_si256 996c1d14583SBruce Richardson (_mm256_castsi128_si256(raw_desc_bh4), 997c1d14583SBruce Richardson raw_desc_bh5, 1); 998c1d14583SBruce Richardson __m256i raw_desc_bh2_3 = 999c1d14583SBruce Richardson _mm256_inserti128_si256 1000c1d14583SBruce Richardson (_mm256_castsi128_si256(raw_desc_bh2), 1001c1d14583SBruce Richardson raw_desc_bh3, 1); 1002c1d14583SBruce Richardson __m256i raw_desc_bh0_1 = 1003c1d14583SBruce Richardson _mm256_inserti128_si256 1004c1d14583SBruce Richardson (_mm256_castsi128_si256(raw_desc_bh0), 1005c1d14583SBruce Richardson raw_desc_bh1, 1); 1006c1d14583SBruce Richardson 1007c1d14583SBruce Richardson if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH) { 1008c1d14583SBruce Richardson /** 1009c1d14583SBruce Richardson * to shift the 32b RSS hash value to the 1010c1d14583SBruce Richardson * highest 32b of each 128b before mask 1011c1d14583SBruce Richardson */ 1012c1d14583SBruce Richardson __m256i rss_hash6_7 = 1013c1d14583SBruce Richardson _mm256_slli_epi64(raw_desc_bh6_7, 32); 1014c1d14583SBruce Richardson __m256i rss_hash4_5 = 1015c1d14583SBruce Richardson _mm256_slli_epi64(raw_desc_bh4_5, 32); 1016c1d14583SBruce Richardson __m256i rss_hash2_3 = 1017c1d14583SBruce Richardson _mm256_slli_epi64(raw_desc_bh2_3, 32); 1018c1d14583SBruce Richardson __m256i rss_hash0_1 = 1019c1d14583SBruce Richardson _mm256_slli_epi64(raw_desc_bh0_1, 32); 1020c1d14583SBruce Richardson 1021c1d14583SBruce Richardson const __m256i rss_hash_msk = 1022c1d14583SBruce Richardson _mm256_set_epi32(0xFFFFFFFF, 0, 0, 0, 1023c1d14583SBruce Richardson 0xFFFFFFFF, 0, 0, 0); 1024c1d14583SBruce Richardson 1025c1d14583SBruce Richardson rss_hash6_7 = _mm256_and_si256 1026c1d14583SBruce Richardson (rss_hash6_7, rss_hash_msk); 1027c1d14583SBruce Richardson rss_hash4_5 = _mm256_and_si256 1028c1d14583SBruce Richardson (rss_hash4_5, rss_hash_msk); 1029c1d14583SBruce Richardson rss_hash2_3 = _mm256_and_si256 1030c1d14583SBruce Richardson (rss_hash2_3, rss_hash_msk); 1031c1d14583SBruce Richardson rss_hash0_1 = _mm256_and_si256 1032c1d14583SBruce Richardson (rss_hash0_1, rss_hash_msk); 1033c1d14583SBruce Richardson 1034c1d14583SBruce Richardson mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7); 1035c1d14583SBruce Richardson mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5); 1036c1d14583SBruce Richardson mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3); 1037c1d14583SBruce Richardson mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1); 1038c1d14583SBruce Richardson } /* if() on RSS hash parsing */ 1039c1d14583SBruce Richardson 1040c1d14583SBruce Richardson if (rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) { 1041c1d14583SBruce Richardson /* merge the status/error-1 bits into one register */ 1042c1d14583SBruce Richardson const __m256i status1_4_7 = 1043c1d14583SBruce Richardson _mm256_unpacklo_epi32(raw_desc_bh6_7, 1044c1d14583SBruce Richardson raw_desc_bh4_5); 1045c1d14583SBruce Richardson const __m256i status1_0_3 = 1046c1d14583SBruce Richardson _mm256_unpacklo_epi32(raw_desc_bh2_3, 1047c1d14583SBruce Richardson raw_desc_bh0_1); 1048c1d14583SBruce Richardson 1049c1d14583SBruce Richardson const __m256i status1_0_7 = 1050c1d14583SBruce Richardson _mm256_unpacklo_epi64(status1_4_7, 1051c1d14583SBruce Richardson status1_0_3); 1052c1d14583SBruce Richardson 1053c1d14583SBruce Richardson const __m256i l2tag2p_flag_mask = 1054c1d14583SBruce Richardson _mm256_set1_epi32 1055c1d14583SBruce Richardson (1 << IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S); 1056c1d14583SBruce Richardson 1057c1d14583SBruce Richardson __m256i l2tag2p_flag_bits = 1058c1d14583SBruce Richardson _mm256_and_si256 1059c1d14583SBruce Richardson (status1_0_7, l2tag2p_flag_mask); 1060c1d14583SBruce Richardson 1061c1d14583SBruce Richardson l2tag2p_flag_bits = 1062c1d14583SBruce Richardson _mm256_srli_epi32(l2tag2p_flag_bits, 1063c1d14583SBruce Richardson IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S); 1064c1d14583SBruce Richardson 1065c1d14583SBruce Richardson const __m256i l2tag2_flags_shuf = 1066c1d14583SBruce Richardson _mm256_set_epi8(0, 0, 0, 0, 1067c1d14583SBruce Richardson 0, 0, 0, 0, 1068c1d14583SBruce Richardson 0, 0, 0, 0, 1069c1d14583SBruce Richardson 0, 0, 1070c1d14583SBruce Richardson RTE_MBUF_F_RX_VLAN | 1071c1d14583SBruce Richardson RTE_MBUF_F_RX_VLAN_STRIPPED, 1072c1d14583SBruce Richardson 0, 1073c1d14583SBruce Richardson /* end up 128-bits */ 1074c1d14583SBruce Richardson 0, 0, 0, 0, 1075c1d14583SBruce Richardson 0, 0, 0, 0, 1076c1d14583SBruce Richardson 0, 0, 0, 0, 1077c1d14583SBruce Richardson 0, 0, 1078c1d14583SBruce Richardson RTE_MBUF_F_RX_VLAN | 1079c1d14583SBruce Richardson RTE_MBUF_F_RX_VLAN_STRIPPED, 1080c1d14583SBruce Richardson 0); 1081c1d14583SBruce Richardson 1082c1d14583SBruce Richardson vlan_flags = 1083c1d14583SBruce Richardson _mm256_shuffle_epi8(l2tag2_flags_shuf, 1084c1d14583SBruce Richardson l2tag2p_flag_bits); 1085c1d14583SBruce Richardson 1086c1d14583SBruce Richardson /* merge with vlan_flags */ 1087c1d14583SBruce Richardson mbuf_flags = _mm256_or_si256 1088c1d14583SBruce Richardson (mbuf_flags, vlan_flags); 1089c1d14583SBruce Richardson 1090c1d14583SBruce Richardson /* L2TAG2_2 */ 1091c1d14583SBruce Richardson __m256i vlan_tci6_7 = 1092c1d14583SBruce Richardson _mm256_slli_si256(raw_desc_bh6_7, 4); 1093c1d14583SBruce Richardson __m256i vlan_tci4_5 = 1094c1d14583SBruce Richardson _mm256_slli_si256(raw_desc_bh4_5, 4); 1095c1d14583SBruce Richardson __m256i vlan_tci2_3 = 1096c1d14583SBruce Richardson _mm256_slli_si256(raw_desc_bh2_3, 4); 1097c1d14583SBruce Richardson __m256i vlan_tci0_1 = 1098c1d14583SBruce Richardson _mm256_slli_si256(raw_desc_bh0_1, 4); 1099c1d14583SBruce Richardson 1100c1d14583SBruce Richardson const __m256i vlan_tci_msk = 1101c1d14583SBruce Richardson _mm256_set_epi32(0, 0xFFFF0000, 0, 0, 1102c1d14583SBruce Richardson 0, 0xFFFF0000, 0, 0); 1103c1d14583SBruce Richardson 1104c1d14583SBruce Richardson vlan_tci6_7 = _mm256_and_si256 1105c1d14583SBruce Richardson (vlan_tci6_7, vlan_tci_msk); 1106c1d14583SBruce Richardson vlan_tci4_5 = _mm256_and_si256 1107c1d14583SBruce Richardson (vlan_tci4_5, vlan_tci_msk); 1108c1d14583SBruce Richardson vlan_tci2_3 = _mm256_and_si256 1109c1d14583SBruce Richardson (vlan_tci2_3, vlan_tci_msk); 1110c1d14583SBruce Richardson vlan_tci0_1 = _mm256_and_si256 1111c1d14583SBruce Richardson (vlan_tci0_1, vlan_tci_msk); 1112c1d14583SBruce Richardson 1113c1d14583SBruce Richardson mb6_7 = _mm256_or_si256(mb6_7, vlan_tci6_7); 1114c1d14583SBruce Richardson mb4_5 = _mm256_or_si256(mb4_5, vlan_tci4_5); 1115c1d14583SBruce Richardson mb2_3 = _mm256_or_si256(mb2_3, vlan_tci2_3); 1116c1d14583SBruce Richardson mb0_1 = _mm256_or_si256(mb0_1, vlan_tci0_1); 1117c1d14583SBruce Richardson } /* if() on Vlan parsing */ 1118c1d14583SBruce Richardson 1119c1d14583SBruce Richardson if (offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) { 1120c1d14583SBruce Richardson uint32_t mask = 0xFFFFFFFF; 1121c1d14583SBruce Richardson __m256i ts; 1122c1d14583SBruce Richardson __m256i ts_low = _mm256_setzero_si256(); 1123c1d14583SBruce Richardson __m256i ts_low1; 1124c1d14583SBruce Richardson __m256i ts_low2; 1125c1d14583SBruce Richardson __m256i max_ret; 1126c1d14583SBruce Richardson __m256i cmp_ret; 1127c1d14583SBruce Richardson uint8_t ret = 0; 1128c1d14583SBruce Richardson uint8_t shift = 8; 1129c1d14583SBruce Richardson __m256i ts_desp_mask = _mm256_set_epi32(mask, 0, 0, 0, mask, 0, 0, 0); 1130c1d14583SBruce Richardson __m256i cmp_mask = _mm256_set1_epi32(mask); 1131c1d14583SBruce Richardson __m256i ts_permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); 1132c1d14583SBruce Richardson 1133c1d14583SBruce Richardson ts = _mm256_and_si256(raw_desc_bh0_1, ts_desp_mask); 1134c1d14583SBruce Richardson ts_low = _mm256_or_si256(ts_low, _mm256_srli_si256(ts, 3 * 4)); 1135c1d14583SBruce Richardson ts = _mm256_and_si256(raw_desc_bh2_3, ts_desp_mask); 1136c1d14583SBruce Richardson ts_low = _mm256_or_si256(ts_low, _mm256_srli_si256(ts, 2 * 4)); 1137c1d14583SBruce Richardson ts = _mm256_and_si256(raw_desc_bh4_5, ts_desp_mask); 1138c1d14583SBruce Richardson ts_low = _mm256_or_si256(ts_low, _mm256_srli_si256(ts, 4)); 1139c1d14583SBruce Richardson ts = _mm256_and_si256(raw_desc_bh6_7, ts_desp_mask); 1140c1d14583SBruce Richardson ts_low = _mm256_or_si256(ts_low, ts); 1141c1d14583SBruce Richardson 1142c1d14583SBruce Richardson ts_low1 = _mm256_permutevar8x32_epi32(ts_low, ts_permute_mask); 1143c1d14583SBruce Richardson ts_low2 = _mm256_permutevar8x32_epi32(ts_low1, 1144c1d14583SBruce Richardson _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7)); 1145c1d14583SBruce Richardson ts_low2 = _mm256_and_si256(ts_low2, 1146c1d14583SBruce Richardson _mm256_set_epi32(mask, mask, mask, mask, mask, mask, mask, 0)); 1147c1d14583SBruce Richardson ts_low2 = _mm256_or_si256(ts_low2, hw_low_last); 1148c1d14583SBruce Richardson hw_low_last = _mm256_and_si256(ts_low1, 1149c1d14583SBruce Richardson _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, mask)); 1150c1d14583SBruce Richardson 1151c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 0], 1152c1d14583SBruce Richardson iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 0); 1153c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 1], 1154c1d14583SBruce Richardson iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 1); 1155c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 2], 1156c1d14583SBruce Richardson iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 2); 1157c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 3], 1158c1d14583SBruce Richardson iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 3); 1159c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 4], 1160c1d14583SBruce Richardson iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 4); 1161c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 5], 1162c1d14583SBruce Richardson iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 5); 1163c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 6], 1164c1d14583SBruce Richardson iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 6); 1165c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 7], 1166c1d14583SBruce Richardson iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 7); 1167c1d14583SBruce Richardson 1168c1d14583SBruce Richardson if (unlikely(is_tsinit)) { 1169c1d14583SBruce Richardson uint32_t in_timestamp; 1170c1d14583SBruce Richardson if (iavf_get_phc_time(rxq)) 1171c1d14583SBruce Richardson PMD_DRV_LOG(ERR, "get physical time failed"); 1172c1d14583SBruce Richardson in_timestamp = *RTE_MBUF_DYNFIELD(rx_pkts[i + 0], 1173c1d14583SBruce Richardson iavf_timestamp_dynfield_offset, uint32_t *); 1174c1d14583SBruce Richardson rxq->phc_time = iavf_tstamp_convert_32b_64b(rxq->phc_time, in_timestamp); 1175c1d14583SBruce Richardson } 1176c1d14583SBruce Richardson 1177c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 0], 1178c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1179c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 1], 1180c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1181c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 2], 1182c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1183c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 3], 1184c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1185c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 4], 1186c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1187c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 5], 1188c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1189c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 6], 1190c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1191c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 7], 1192c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1193c1d14583SBruce Richardson 1194c1d14583SBruce Richardson max_ret = _mm256_max_epu32(ts_low2, ts_low1); 1195c1d14583SBruce Richardson cmp_ret = _mm256_andnot_si256(_mm256_cmpeq_epi32(max_ret, ts_low1), cmp_mask); 1196c1d14583SBruce Richardson 1197c1d14583SBruce Richardson if (_mm256_testz_si256(cmp_ret, cmp_mask)) { 1198c1d14583SBruce Richardson inflection_point = 0; 1199c1d14583SBruce Richardson } else { 1200c1d14583SBruce Richardson inflection_point = 1; 1201c1d14583SBruce Richardson while (shift > 1) { 1202c1d14583SBruce Richardson shift = shift >> 1; 1203c1d14583SBruce Richardson __m256i mask_low = _mm256_setzero_si256(); 1204c1d14583SBruce Richardson __m256i mask_high = _mm256_setzero_si256(); 1205c1d14583SBruce Richardson switch (shift) { 1206c1d14583SBruce Richardson case 4: 1207c1d14583SBruce Richardson mask_low = _mm256_set_epi32(0, 0, 0, 0, mask, mask, mask, mask); 1208c1d14583SBruce Richardson mask_high = _mm256_set_epi32(mask, mask, mask, mask, 0, 0, 0, 0); 1209c1d14583SBruce Richardson break; 1210c1d14583SBruce Richardson case 2: 1211c1d14583SBruce Richardson mask_low = _mm256_srli_si256(cmp_mask, 2 * 4); 1212c1d14583SBruce Richardson mask_high = _mm256_slli_si256(cmp_mask, 2 * 4); 1213c1d14583SBruce Richardson break; 1214c1d14583SBruce Richardson case 1: 1215c1d14583SBruce Richardson mask_low = _mm256_srli_si256(cmp_mask, 1 * 4); 1216c1d14583SBruce Richardson mask_high = _mm256_slli_si256(cmp_mask, 1 * 4); 1217c1d14583SBruce Richardson break; 1218c1d14583SBruce Richardson } 1219c1d14583SBruce Richardson ret = _mm256_testz_si256(cmp_ret, mask_low); 1220c1d14583SBruce Richardson if (ret) { 1221c1d14583SBruce Richardson ret = _mm256_testz_si256(cmp_ret, mask_high); 1222c1d14583SBruce Richardson inflection_point += ret ? 0 : shift; 1223c1d14583SBruce Richardson cmp_mask = mask_high; 1224c1d14583SBruce Richardson } else { 1225c1d14583SBruce Richardson cmp_mask = mask_low; 1226c1d14583SBruce Richardson } 1227c1d14583SBruce Richardson } 1228c1d14583SBruce Richardson } 1229c1d14583SBruce Richardson mbuf_flags = _mm256_or_si256(mbuf_flags, _mm256_set1_epi32(iavf_timestamp_dynflag)); 1230c1d14583SBruce Richardson } /* if() on Timestamp parsing */ 1231c1d14583SBruce Richardson } 1232c1d14583SBruce Richardson #endif 1233c1d14583SBruce Richardson } 1234c1d14583SBruce Richardson 1235c1d14583SBruce Richardson /** 1236c1d14583SBruce Richardson * At this point, we have the 8 sets of flags in the low 16-bits 1237c1d14583SBruce Richardson * of each 32-bit value in vlan0. 1238c1d14583SBruce Richardson * We want to extract these, and merge them with the mbuf init 1239c1d14583SBruce Richardson * data so we can do a single write to the mbuf to set the flags 1240c1d14583SBruce Richardson * and all the other initialization fields. Extracting the 1241c1d14583SBruce Richardson * appropriate flags means that we have to do a shift and blend 1242c1d14583SBruce Richardson * for each mbuf before we do the write. However, we can also 1243c1d14583SBruce Richardson * add in the previously computed rx_descriptor fields to 1244c1d14583SBruce Richardson * make a single 256-bit write per mbuf 1245c1d14583SBruce Richardson */ 1246c1d14583SBruce Richardson /* check the structure matches expectations */ 1247c1d14583SBruce Richardson RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != 1248c1d14583SBruce Richardson offsetof(struct rte_mbuf, rearm_data) + 8); 1249c1d14583SBruce Richardson RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != 1250c1d14583SBruce Richardson RTE_ALIGN(offsetof(struct rte_mbuf, 1251c1d14583SBruce Richardson rearm_data), 1252c1d14583SBruce Richardson 16)); 1253c1d14583SBruce Richardson /* build up data and do writes */ 1254c1d14583SBruce Richardson __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5, 1255c1d14583SBruce Richardson rearm6, rearm7; 1256c1d14583SBruce Richardson rearm6 = _mm256_blend_epi32(mbuf_init, 1257c1d14583SBruce Richardson _mm256_slli_si256(mbuf_flags, 8), 1258c1d14583SBruce Richardson 0x04); 1259c1d14583SBruce Richardson rearm4 = _mm256_blend_epi32(mbuf_init, 1260c1d14583SBruce Richardson _mm256_slli_si256(mbuf_flags, 4), 1261c1d14583SBruce Richardson 0x04); 1262c1d14583SBruce Richardson rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04); 1263c1d14583SBruce Richardson rearm0 = _mm256_blend_epi32(mbuf_init, 1264c1d14583SBruce Richardson _mm256_srli_si256(mbuf_flags, 4), 1265c1d14583SBruce Richardson 0x04); 1266c1d14583SBruce Richardson /* permute to add in the rx_descriptor e.g. rss fields */ 1267c1d14583SBruce Richardson rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20); 1268c1d14583SBruce Richardson rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20); 1269c1d14583SBruce Richardson rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20); 1270c1d14583SBruce Richardson rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20); 1271c1d14583SBruce Richardson /* write to mbuf */ 1272c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data, 1273c1d14583SBruce Richardson rearm6); 1274c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data, 1275c1d14583SBruce Richardson rearm4); 1276c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, 1277c1d14583SBruce Richardson rearm2); 1278c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, 1279c1d14583SBruce Richardson rearm0); 1280c1d14583SBruce Richardson 1281c1d14583SBruce Richardson /* repeat for the odd mbufs */ 1282c1d14583SBruce Richardson const __m256i odd_flags = 1283c1d14583SBruce Richardson _mm256_castsi128_si256 1284c1d14583SBruce Richardson (_mm256_extracti128_si256(mbuf_flags, 1)); 1285c1d14583SBruce Richardson rearm7 = _mm256_blend_epi32(mbuf_init, 1286c1d14583SBruce Richardson _mm256_slli_si256(odd_flags, 8), 1287c1d14583SBruce Richardson 0x04); 1288c1d14583SBruce Richardson rearm5 = _mm256_blend_epi32(mbuf_init, 1289c1d14583SBruce Richardson _mm256_slli_si256(odd_flags, 4), 1290c1d14583SBruce Richardson 0x04); 1291c1d14583SBruce Richardson rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04); 1292c1d14583SBruce Richardson rearm1 = _mm256_blend_epi32(mbuf_init, 1293c1d14583SBruce Richardson _mm256_srli_si256(odd_flags, 4), 1294c1d14583SBruce Richardson 0x04); 1295c1d14583SBruce Richardson /* since odd mbufs are already in hi 128-bits use blend */ 1296c1d14583SBruce Richardson rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0); 1297c1d14583SBruce Richardson rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0); 1298c1d14583SBruce Richardson rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0); 1299c1d14583SBruce Richardson rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0); 1300c1d14583SBruce Richardson /* again write to mbufs */ 1301c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data, 1302c1d14583SBruce Richardson rearm7); 1303c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data, 1304c1d14583SBruce Richardson rearm5); 1305c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, 1306c1d14583SBruce Richardson rearm3); 1307c1d14583SBruce Richardson _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, 1308c1d14583SBruce Richardson rearm1); 1309c1d14583SBruce Richardson 1310c1d14583SBruce Richardson /* extract and record EOP bit */ 1311c1d14583SBruce Richardson if (split_packet) { 1312c1d14583SBruce Richardson const __m128i eop_mask = 1313c1d14583SBruce Richardson _mm_set1_epi16(1 << 1314c1d14583SBruce Richardson IAVF_RX_FLEX_DESC_STATUS0_EOF_S); 1315c1d14583SBruce Richardson const __m256i eop_bits256 = _mm256_and_si256(status0_7, 1316c1d14583SBruce Richardson eop_check); 1317c1d14583SBruce Richardson /* pack status bits into a single 128-bit register */ 1318c1d14583SBruce Richardson const __m128i eop_bits = 1319c1d14583SBruce Richardson _mm_packus_epi32 1320c1d14583SBruce Richardson (_mm256_castsi256_si128(eop_bits256), 1321c1d14583SBruce Richardson _mm256_extractf128_si256(eop_bits256, 1322c1d14583SBruce Richardson 1)); 1323c1d14583SBruce Richardson /** 1324c1d14583SBruce Richardson * flip bits, and mask out the EOP bit, which is now 1325c1d14583SBruce Richardson * a split-packet bit i.e. !EOP, rather than EOP one. 1326c1d14583SBruce Richardson */ 1327c1d14583SBruce Richardson __m128i split_bits = _mm_andnot_si128(eop_bits, 1328c1d14583SBruce Richardson eop_mask); 1329c1d14583SBruce Richardson /** 1330c1d14583SBruce Richardson * eop bits are out of order, so we need to shuffle them 1331c1d14583SBruce Richardson * back into order again. In doing so, only use low 8 1332c1d14583SBruce Richardson * bits, which acts like another pack instruction 1333c1d14583SBruce Richardson * The original order is (hi->lo): 1,3,5,7,0,2,4,6 1334c1d14583SBruce Richardson * [Since we use epi8, the 16-bit positions are 1335c1d14583SBruce Richardson * multiplied by 2 in the eop_shuffle value.] 1336c1d14583SBruce Richardson */ 1337c1d14583SBruce Richardson __m128i eop_shuffle = 1338c1d14583SBruce Richardson _mm_set_epi8(/* zero hi 64b */ 1339c1d14583SBruce Richardson 0xFF, 0xFF, 0xFF, 0xFF, 1340c1d14583SBruce Richardson 0xFF, 0xFF, 0xFF, 0xFF, 1341c1d14583SBruce Richardson /* move values to lo 64b */ 1342c1d14583SBruce Richardson 8, 0, 10, 2, 1343c1d14583SBruce Richardson 12, 4, 14, 6); 1344c1d14583SBruce Richardson split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle); 1345c1d14583SBruce Richardson *(uint64_t *)split_packet = 1346c1d14583SBruce Richardson _mm_cvtsi128_si64(split_bits); 1347c1d14583SBruce Richardson split_packet += IAVF_DESCS_PER_LOOP_AVX; 1348c1d14583SBruce Richardson } 1349c1d14583SBruce Richardson 1350c1d14583SBruce Richardson /* perform dd_check */ 1351c1d14583SBruce Richardson status0_7 = _mm256_and_si256(status0_7, dd_check); 1352c1d14583SBruce Richardson status0_7 = _mm256_packs_epi32(status0_7, 1353c1d14583SBruce Richardson _mm256_setzero_si256()); 1354c1d14583SBruce Richardson 1355c1d14583SBruce Richardson uint64_t burst = rte_popcount64 1356c1d14583SBruce Richardson (_mm_cvtsi128_si64 1357c1d14583SBruce Richardson (_mm256_extracti128_si256 1358c1d14583SBruce Richardson (status0_7, 1))); 1359c1d14583SBruce Richardson burst += rte_popcount64 1360c1d14583SBruce Richardson (_mm_cvtsi128_si64 1361c1d14583SBruce Richardson (_mm256_castsi256_si128(status0_7))); 1362c1d14583SBruce Richardson received += burst; 1363c1d14583SBruce Richardson #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC 1364c1d14583SBruce Richardson if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) { 1365c1d14583SBruce Richardson inflection_point = (inflection_point <= burst) ? inflection_point : 0; 1366c1d14583SBruce Richardson switch (inflection_point) { 1367c1d14583SBruce Richardson case 1: 1368c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 0], 1369c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1370c1d14583SBruce Richardson /* fallthrough */ 1371c1d14583SBruce Richardson case 2: 1372c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 1], 1373c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1374c1d14583SBruce Richardson /* fallthrough */ 1375c1d14583SBruce Richardson case 3: 1376c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 2], 1377c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1378c1d14583SBruce Richardson /* fallthrough */ 1379c1d14583SBruce Richardson case 4: 1380c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 3], 1381c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1382c1d14583SBruce Richardson /* fallthrough */ 1383c1d14583SBruce Richardson case 5: 1384c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 4], 1385c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1386c1d14583SBruce Richardson /* fallthrough */ 1387c1d14583SBruce Richardson case 6: 1388c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 5], 1389c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1390c1d14583SBruce Richardson /* fallthrough */ 1391c1d14583SBruce Richardson case 7: 1392c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 6], 1393c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1394c1d14583SBruce Richardson /* fallthrough */ 1395c1d14583SBruce Richardson case 8: 1396c1d14583SBruce Richardson *RTE_MBUF_DYNFIELD(rx_pkts[i + 7], 1397c1d14583SBruce Richardson iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1398c1d14583SBruce Richardson rxq->phc_time += (uint64_t)1 << 32; 1399c1d14583SBruce Richardson /* fallthrough */ 1400c1d14583SBruce Richardson case 0: 1401c1d14583SBruce Richardson break; 1402c1d14583SBruce Richardson default: 1403c1d14583SBruce Richardson PMD_DRV_LOG(ERR, "invalid inflection point for rx timestamp"); 1404c1d14583SBruce Richardson break; 1405c1d14583SBruce Richardson } 1406c1d14583SBruce Richardson 1407c1d14583SBruce Richardson rxq->hw_time_update = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000); 1408c1d14583SBruce Richardson } 1409c1d14583SBruce Richardson #endif 1410c1d14583SBruce Richardson if (burst != IAVF_DESCS_PER_LOOP_AVX) 1411c1d14583SBruce Richardson break; 1412c1d14583SBruce Richardson } 1413c1d14583SBruce Richardson 1414c1d14583SBruce Richardson #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC 1415c1d14583SBruce Richardson if (received > 0 && (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP)) 1416c1d14583SBruce Richardson rxq->phc_time = *RTE_MBUF_DYNFIELD(rx_pkts[received - 1], iavf_timestamp_dynfield_offset, rte_mbuf_timestamp_t *); 1417c1d14583SBruce Richardson #endif 1418c1d14583SBruce Richardson 1419c1d14583SBruce Richardson /* update tail pointers */ 1420c1d14583SBruce Richardson rxq->rx_tail += received; 1421c1d14583SBruce Richardson rxq->rx_tail &= (rxq->nb_rx_desc - 1); 1422c1d14583SBruce Richardson if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */ 1423c1d14583SBruce Richardson rxq->rx_tail--; 1424c1d14583SBruce Richardson received--; 1425c1d14583SBruce Richardson } 1426c1d14583SBruce Richardson rxq->rxrearm_nb += received; 1427c1d14583SBruce Richardson return received; 1428c1d14583SBruce Richardson } 1429c1d14583SBruce Richardson 1430c1d14583SBruce Richardson /** 1431c1d14583SBruce Richardson * Notice: 1432c1d14583SBruce Richardson * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet 1433c1d14583SBruce Richardson */ 1434c1d14583SBruce Richardson uint16_t 1435c1d14583SBruce Richardson iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, 1436c1d14583SBruce Richardson uint16_t nb_pkts) 1437c1d14583SBruce Richardson { 1438c1d14583SBruce Richardson return _iavf_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, 1439c1d14583SBruce Richardson NULL, false); 1440c1d14583SBruce Richardson } 1441c1d14583SBruce Richardson 1442c1d14583SBruce Richardson uint16_t 1443c1d14583SBruce Richardson iavf_recv_pkts_vec_avx2_offload(void *rx_queue, struct rte_mbuf **rx_pkts, 1444c1d14583SBruce Richardson uint16_t nb_pkts) 1445c1d14583SBruce Richardson { 1446c1d14583SBruce Richardson return _iavf_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, 1447c1d14583SBruce Richardson NULL, true); 1448c1d14583SBruce Richardson } 1449c1d14583SBruce Richardson 1450c1d14583SBruce Richardson /** 1451c1d14583SBruce Richardson * Notice: 1452c1d14583SBruce Richardson * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet 1453c1d14583SBruce Richardson */ 1454c1d14583SBruce Richardson uint16_t 1455c1d14583SBruce Richardson iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts, 1456c1d14583SBruce Richardson uint16_t nb_pkts) 1457c1d14583SBruce Richardson { 1458c1d14583SBruce Richardson return _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rx_queue, rx_pkts, 1459c1d14583SBruce Richardson nb_pkts, NULL, false); 1460c1d14583SBruce Richardson } 1461c1d14583SBruce Richardson 1462c1d14583SBruce Richardson uint16_t 1463c1d14583SBruce Richardson iavf_recv_pkts_vec_avx2_flex_rxd_offload(void *rx_queue, struct rte_mbuf **rx_pkts, 1464c1d14583SBruce Richardson uint16_t nb_pkts) 1465c1d14583SBruce Richardson { 1466c1d14583SBruce Richardson return _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rx_queue, rx_pkts, 1467c1d14583SBruce Richardson nb_pkts, NULL, true); 1468c1d14583SBruce Richardson } 1469c1d14583SBruce Richardson 1470c1d14583SBruce Richardson /** 1471c1d14583SBruce Richardson * vPMD receive routine that reassembles single burst of 32 scattered packets 1472c1d14583SBruce Richardson * Notice: 1473c1d14583SBruce Richardson * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet 1474c1d14583SBruce Richardson */ 1475c1d14583SBruce Richardson static __rte_always_inline uint16_t 1476c1d14583SBruce Richardson iavf_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, 1477c1d14583SBruce Richardson uint16_t nb_pkts, bool offload) 1478c1d14583SBruce Richardson { 1479c1d14583SBruce Richardson struct iavf_rx_queue *rxq = rx_queue; 1480c1d14583SBruce Richardson uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0}; 1481c1d14583SBruce Richardson 1482c1d14583SBruce Richardson /* get some new buffers */ 1483c1d14583SBruce Richardson uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts, 1484c1d14583SBruce Richardson split_flags, offload); 1485c1d14583SBruce Richardson if (nb_bufs == 0) 1486c1d14583SBruce Richardson return 0; 1487c1d14583SBruce Richardson 1488c1d14583SBruce Richardson /* happy day case, full burst + no packets to be joined */ 1489c1d14583SBruce Richardson const uint64_t *split_fl64 = (uint64_t *)split_flags; 1490c1d14583SBruce Richardson 1491c1d14583SBruce Richardson if (!rxq->pkt_first_seg && 1492c1d14583SBruce Richardson split_fl64[0] == 0 && split_fl64[1] == 0 && 1493c1d14583SBruce Richardson split_fl64[2] == 0 && split_fl64[3] == 0) 1494c1d14583SBruce Richardson return nb_bufs; 1495c1d14583SBruce Richardson 1496c1d14583SBruce Richardson /* reassemble any packets that need reassembly*/ 1497c1d14583SBruce Richardson unsigned int i = 0; 1498c1d14583SBruce Richardson 1499c1d14583SBruce Richardson if (!rxq->pkt_first_seg) { 1500c1d14583SBruce Richardson /* find the first split flag, and only reassemble then*/ 1501c1d14583SBruce Richardson while (i < nb_bufs && !split_flags[i]) 1502c1d14583SBruce Richardson i++; 1503c1d14583SBruce Richardson if (i == nb_bufs) 1504c1d14583SBruce Richardson return nb_bufs; 1505c1d14583SBruce Richardson rxq->pkt_first_seg = rx_pkts[i]; 1506c1d14583SBruce Richardson } 150782fbc4a4SBruce Richardson return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i], 150882fbc4a4SBruce Richardson &rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len); 1509c1d14583SBruce Richardson } 1510c1d14583SBruce Richardson 1511c1d14583SBruce Richardson /** 1512c1d14583SBruce Richardson * vPMD receive routine that reassembles scattered packets. 1513c1d14583SBruce Richardson * Main receive routine that can handle arbitrary burst sizes 1514c1d14583SBruce Richardson * Notice: 1515c1d14583SBruce Richardson * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet 1516c1d14583SBruce Richardson */ 1517c1d14583SBruce Richardson static __rte_always_inline uint16_t 1518c1d14583SBruce Richardson iavf_recv_scattered_pkts_vec_avx2_common(void *rx_queue, struct rte_mbuf **rx_pkts, 1519c1d14583SBruce Richardson uint16_t nb_pkts, bool offload) 1520c1d14583SBruce Richardson { 1521c1d14583SBruce Richardson uint16_t retval = 0; 1522c1d14583SBruce Richardson 1523c1d14583SBruce Richardson while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) { 1524c1d14583SBruce Richardson uint16_t burst = iavf_recv_scattered_burst_vec_avx2(rx_queue, 1525c1d14583SBruce Richardson rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST, offload); 1526c1d14583SBruce Richardson retval += burst; 1527c1d14583SBruce Richardson nb_pkts -= burst; 1528c1d14583SBruce Richardson if (burst < IAVF_VPMD_RX_MAX_BURST) 1529c1d14583SBruce Richardson return retval; 1530c1d14583SBruce Richardson } 1531c1d14583SBruce Richardson return retval + iavf_recv_scattered_burst_vec_avx2(rx_queue, 1532c1d14583SBruce Richardson rx_pkts + retval, nb_pkts, offload); 1533c1d14583SBruce Richardson } 1534c1d14583SBruce Richardson 1535c1d14583SBruce Richardson uint16_t 1536c1d14583SBruce Richardson iavf_recv_scattered_pkts_vec_avx2(void *rx_queue, 1537c1d14583SBruce Richardson struct rte_mbuf **rx_pkts, 1538c1d14583SBruce Richardson uint16_t nb_pkts) 1539c1d14583SBruce Richardson { 1540c1d14583SBruce Richardson return iavf_recv_scattered_pkts_vec_avx2_common(rx_queue, 1541c1d14583SBruce Richardson rx_pkts, 1542c1d14583SBruce Richardson nb_pkts, 1543c1d14583SBruce Richardson false); 1544c1d14583SBruce Richardson } 1545c1d14583SBruce Richardson 1546c1d14583SBruce Richardson uint16_t 1547c1d14583SBruce Richardson iavf_recv_scattered_pkts_vec_avx2_offload(void *rx_queue, 1548c1d14583SBruce Richardson struct rte_mbuf **rx_pkts, 1549c1d14583SBruce Richardson uint16_t nb_pkts) 1550c1d14583SBruce Richardson { 1551c1d14583SBruce Richardson return iavf_recv_scattered_pkts_vec_avx2_common(rx_queue, 1552c1d14583SBruce Richardson rx_pkts, 1553c1d14583SBruce Richardson nb_pkts, 1554c1d14583SBruce Richardson true); 1555c1d14583SBruce Richardson } 1556c1d14583SBruce Richardson 1557c1d14583SBruce Richardson /** 1558c1d14583SBruce Richardson * vPMD receive routine that reassembles single burst of 1559c1d14583SBruce Richardson * 32 scattered packets for flex RxD 1560c1d14583SBruce Richardson * Notice: 1561c1d14583SBruce Richardson * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet 1562c1d14583SBruce Richardson */ 1563c1d14583SBruce Richardson static __rte_always_inline uint16_t 1564c1d14583SBruce Richardson iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue, 1565c1d14583SBruce Richardson struct rte_mbuf **rx_pkts, 1566c1d14583SBruce Richardson uint16_t nb_pkts, bool offload) 1567c1d14583SBruce Richardson { 1568c1d14583SBruce Richardson struct iavf_rx_queue *rxq = rx_queue; 1569c1d14583SBruce Richardson uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0}; 1570c1d14583SBruce Richardson 1571c1d14583SBruce Richardson /* get some new buffers */ 1572c1d14583SBruce Richardson uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rxq, 1573c1d14583SBruce Richardson rx_pkts, nb_pkts, split_flags, offload); 1574c1d14583SBruce Richardson if (nb_bufs == 0) 1575c1d14583SBruce Richardson return 0; 1576c1d14583SBruce Richardson 1577c1d14583SBruce Richardson /* happy day case, full burst + no packets to be joined */ 1578c1d14583SBruce Richardson const uint64_t *split_fl64 = (uint64_t *)split_flags; 1579c1d14583SBruce Richardson 1580c1d14583SBruce Richardson if (!rxq->pkt_first_seg && 1581c1d14583SBruce Richardson split_fl64[0] == 0 && split_fl64[1] == 0 && 1582c1d14583SBruce Richardson split_fl64[2] == 0 && split_fl64[3] == 0) 1583c1d14583SBruce Richardson return nb_bufs; 1584c1d14583SBruce Richardson 1585c1d14583SBruce Richardson /* reassemble any packets that need reassembly*/ 1586c1d14583SBruce Richardson unsigned int i = 0; 1587c1d14583SBruce Richardson 1588c1d14583SBruce Richardson if (!rxq->pkt_first_seg) { 1589c1d14583SBruce Richardson /* find the first split flag, and only reassemble then*/ 1590c1d14583SBruce Richardson while (i < nb_bufs && !split_flags[i]) 1591c1d14583SBruce Richardson i++; 1592c1d14583SBruce Richardson if (i == nb_bufs) 1593c1d14583SBruce Richardson return nb_bufs; 1594c1d14583SBruce Richardson rxq->pkt_first_seg = rx_pkts[i]; 1595c1d14583SBruce Richardson } 159682fbc4a4SBruce Richardson return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i], 159782fbc4a4SBruce Richardson &rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len); 1598c1d14583SBruce Richardson } 1599c1d14583SBruce Richardson 1600c1d14583SBruce Richardson /** 1601c1d14583SBruce Richardson * vPMD receive routine that reassembles scattered packets for flex RxD. 1602c1d14583SBruce Richardson * Main receive routine that can handle arbitrary burst sizes 1603c1d14583SBruce Richardson * Notice: 1604c1d14583SBruce Richardson * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet 1605c1d14583SBruce Richardson */ 1606c1d14583SBruce Richardson static __rte_always_inline uint16_t 1607c1d14583SBruce Richardson iavf_recv_scattered_pkts_vec_avx2_flex_rxd_common(void *rx_queue, 1608c1d14583SBruce Richardson struct rte_mbuf **rx_pkts, 1609c1d14583SBruce Richardson uint16_t nb_pkts, bool offload) 1610c1d14583SBruce Richardson { 1611c1d14583SBruce Richardson uint16_t retval = 0; 1612c1d14583SBruce Richardson 1613c1d14583SBruce Richardson while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) { 1614c1d14583SBruce Richardson uint16_t burst = 1615c1d14583SBruce Richardson iavf_recv_scattered_burst_vec_avx2_flex_rxd 1616c1d14583SBruce Richardson (rx_queue, rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST, 1617c1d14583SBruce Richardson offload); 1618c1d14583SBruce Richardson retval += burst; 1619c1d14583SBruce Richardson nb_pkts -= burst; 1620c1d14583SBruce Richardson if (burst < IAVF_VPMD_RX_MAX_BURST) 1621c1d14583SBruce Richardson return retval; 1622c1d14583SBruce Richardson } 1623c1d14583SBruce Richardson return retval + iavf_recv_scattered_burst_vec_avx2_flex_rxd(rx_queue, 1624c1d14583SBruce Richardson rx_pkts + retval, nb_pkts, offload); 1625c1d14583SBruce Richardson } 1626c1d14583SBruce Richardson 1627c1d14583SBruce Richardson uint16_t 1628c1d14583SBruce Richardson iavf_recv_scattered_pkts_vec_avx2_flex_rxd(void *rx_queue, 1629c1d14583SBruce Richardson struct rte_mbuf **rx_pkts, 1630c1d14583SBruce Richardson uint16_t nb_pkts) 1631c1d14583SBruce Richardson { 1632c1d14583SBruce Richardson return iavf_recv_scattered_pkts_vec_avx2_flex_rxd_common(rx_queue, 1633c1d14583SBruce Richardson rx_pkts, 1634c1d14583SBruce Richardson nb_pkts, 1635c1d14583SBruce Richardson false); 1636c1d14583SBruce Richardson } 1637c1d14583SBruce Richardson 1638c1d14583SBruce Richardson uint16_t 1639c1d14583SBruce Richardson iavf_recv_scattered_pkts_vec_avx2_flex_rxd_offload(void *rx_queue, 1640c1d14583SBruce Richardson struct rte_mbuf **rx_pkts, 1641c1d14583SBruce Richardson uint16_t nb_pkts) 1642c1d14583SBruce Richardson { 1643c1d14583SBruce Richardson return iavf_recv_scattered_pkts_vec_avx2_flex_rxd_common(rx_queue, 1644c1d14583SBruce Richardson rx_pkts, 1645c1d14583SBruce Richardson nb_pkts, 1646c1d14583SBruce Richardson true); 1647c1d14583SBruce Richardson } 1648c1d14583SBruce Richardson 1649c1d14583SBruce Richardson 1650c1d14583SBruce Richardson static __rte_always_inline void 1651c1d14583SBruce Richardson iavf_vtx1(volatile struct iavf_tx_desc *txdp, 1652c1d14583SBruce Richardson struct rte_mbuf *pkt, uint64_t flags, bool offload) 1653c1d14583SBruce Richardson { 1654c1d14583SBruce Richardson uint64_t high_qw = 1655c1d14583SBruce Richardson (IAVF_TX_DESC_DTYPE_DATA | 1656c1d14583SBruce Richardson ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT) | 1657c1d14583SBruce Richardson ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT)); 1658c1d14583SBruce Richardson if (offload) 1659c1d14583SBruce Richardson iavf_txd_enable_offload(pkt, &high_qw); 1660c1d14583SBruce Richardson 1661c1d14583SBruce Richardson __m128i descriptor = _mm_set_epi64x(high_qw, 1662c1d14583SBruce Richardson pkt->buf_iova + pkt->data_off); 1663c1d14583SBruce Richardson _mm_store_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor); 1664c1d14583SBruce Richardson } 1665c1d14583SBruce Richardson 1666c1d14583SBruce Richardson static __rte_always_inline void 1667c1d14583SBruce Richardson iavf_vtx(volatile struct iavf_tx_desc *txdp, 1668c1d14583SBruce Richardson struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags, bool offload) 1669c1d14583SBruce Richardson { 1670c1d14583SBruce Richardson const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA | 1671c1d14583SBruce Richardson ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT)); 1672c1d14583SBruce Richardson 1673c1d14583SBruce Richardson /* if unaligned on 32-bit boundary, do one to align */ 1674c1d14583SBruce Richardson if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) { 1675c1d14583SBruce Richardson iavf_vtx1(txdp, *pkt, flags, offload); 1676c1d14583SBruce Richardson nb_pkts--, txdp++, pkt++; 1677c1d14583SBruce Richardson } 1678c1d14583SBruce Richardson 1679c1d14583SBruce Richardson /* do two at a time while possible, in bursts */ 1680c1d14583SBruce Richardson for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) { 1681c1d14583SBruce Richardson uint64_t hi_qw3 = 1682c1d14583SBruce Richardson hi_qw_tmpl | 1683c1d14583SBruce Richardson ((uint64_t)pkt[3]->data_len << 1684c1d14583SBruce Richardson IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); 1685c1d14583SBruce Richardson if (offload) 1686c1d14583SBruce Richardson iavf_txd_enable_offload(pkt[3], &hi_qw3); 1687c1d14583SBruce Richardson uint64_t hi_qw2 = 1688c1d14583SBruce Richardson hi_qw_tmpl | 1689c1d14583SBruce Richardson ((uint64_t)pkt[2]->data_len << 1690c1d14583SBruce Richardson IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); 1691c1d14583SBruce Richardson if (offload) 1692c1d14583SBruce Richardson iavf_txd_enable_offload(pkt[2], &hi_qw2); 1693c1d14583SBruce Richardson uint64_t hi_qw1 = 1694c1d14583SBruce Richardson hi_qw_tmpl | 1695c1d14583SBruce Richardson ((uint64_t)pkt[1]->data_len << 1696c1d14583SBruce Richardson IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); 1697c1d14583SBruce Richardson if (offload) 1698c1d14583SBruce Richardson iavf_txd_enable_offload(pkt[1], &hi_qw1); 1699c1d14583SBruce Richardson uint64_t hi_qw0 = 1700c1d14583SBruce Richardson hi_qw_tmpl | 1701c1d14583SBruce Richardson ((uint64_t)pkt[0]->data_len << 1702c1d14583SBruce Richardson IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); 1703c1d14583SBruce Richardson if (offload) 1704c1d14583SBruce Richardson iavf_txd_enable_offload(pkt[0], &hi_qw0); 1705c1d14583SBruce Richardson 1706c1d14583SBruce Richardson __m256i desc2_3 = 1707c1d14583SBruce Richardson _mm256_set_epi64x 1708c1d14583SBruce Richardson (hi_qw3, 1709c1d14583SBruce Richardson pkt[3]->buf_iova + pkt[3]->data_off, 1710c1d14583SBruce Richardson hi_qw2, 1711c1d14583SBruce Richardson pkt[2]->buf_iova + pkt[2]->data_off); 1712c1d14583SBruce Richardson __m256i desc0_1 = 1713c1d14583SBruce Richardson _mm256_set_epi64x 1714c1d14583SBruce Richardson (hi_qw1, 1715c1d14583SBruce Richardson pkt[1]->buf_iova + pkt[1]->data_off, 1716c1d14583SBruce Richardson hi_qw0, 1717c1d14583SBruce Richardson pkt[0]->buf_iova + pkt[0]->data_off); 1718c1d14583SBruce Richardson _mm256_store_si256(RTE_CAST_PTR(__m256i *, txdp + 2), desc2_3); 1719c1d14583SBruce Richardson _mm256_store_si256(RTE_CAST_PTR(__m256i *, txdp), desc0_1); 1720c1d14583SBruce Richardson } 1721c1d14583SBruce Richardson 1722c1d14583SBruce Richardson /* do any last ones */ 1723c1d14583SBruce Richardson while (nb_pkts) { 1724c1d14583SBruce Richardson iavf_vtx1(txdp, *pkt, flags, offload); 1725c1d14583SBruce Richardson txdp++, pkt++, nb_pkts--; 1726c1d14583SBruce Richardson } 1727c1d14583SBruce Richardson } 1728c1d14583SBruce Richardson 1729c1d14583SBruce Richardson static __rte_always_inline uint16_t 1730c1d14583SBruce Richardson iavf_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, 1731c1d14583SBruce Richardson uint16_t nb_pkts, bool offload) 1732c1d14583SBruce Richardson { 1733b92babc2SBruce Richardson struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue; 1734c1d14583SBruce Richardson volatile struct iavf_tx_desc *txdp; 1735*7662502dSBruce Richardson struct ci_tx_entry_vec *txep; 1736c1d14583SBruce Richardson uint16_t n, nb_commit, tx_id; 1737c1d14583SBruce Richardson /* bit2 is reserved and must be set to 1 according to Spec */ 1738c1d14583SBruce Richardson uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC; 1739c1d14583SBruce Richardson uint64_t rs = IAVF_TX_DESC_CMD_RS | flags; 1740c1d14583SBruce Richardson 1741e61679e7SBruce Richardson if (txq->nb_tx_free < txq->tx_free_thresh) 1742*7662502dSBruce Richardson ci_tx_free_bufs_vec(txq, iavf_tx_desc_done, false); 1743c1d14583SBruce Richardson 1744e61679e7SBruce Richardson nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts); 1745c1d14583SBruce Richardson if (unlikely(nb_pkts == 0)) 1746c1d14583SBruce Richardson return 0; 1747e61679e7SBruce Richardson nb_commit = nb_pkts; 1748c1d14583SBruce Richardson 1749c1d14583SBruce Richardson tx_id = txq->tx_tail; 17504d0f54d9SBruce Richardson txdp = &txq->iavf_tx_ring[tx_id]; 1751*7662502dSBruce Richardson txep = &txq->sw_ring_vec[tx_id]; 1752c1d14583SBruce Richardson 1753e61679e7SBruce Richardson txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts); 1754c1d14583SBruce Richardson 1755c1d14583SBruce Richardson n = (uint16_t)(txq->nb_tx_desc - tx_id); 1756c1d14583SBruce Richardson if (nb_commit >= n) { 1757*7662502dSBruce Richardson ci_tx_backlog_entry_vec(txep, tx_pkts, n); 1758c1d14583SBruce Richardson 1759c1d14583SBruce Richardson iavf_vtx(txdp, tx_pkts, n - 1, flags, offload); 1760c1d14583SBruce Richardson tx_pkts += (n - 1); 1761c1d14583SBruce Richardson txdp += (n - 1); 1762c1d14583SBruce Richardson 1763c1d14583SBruce Richardson iavf_vtx1(txdp, *tx_pkts++, rs, offload); 1764c1d14583SBruce Richardson 1765c1d14583SBruce Richardson nb_commit = (uint16_t)(nb_commit - n); 1766c1d14583SBruce Richardson 1767c1d14583SBruce Richardson tx_id = 0; 1768e61679e7SBruce Richardson txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1); 1769c1d14583SBruce Richardson 1770c1d14583SBruce Richardson /* avoid reach the end of ring */ 17714d0f54d9SBruce Richardson txdp = &txq->iavf_tx_ring[tx_id]; 1772*7662502dSBruce Richardson txep = &txq->sw_ring_vec[tx_id]; 1773c1d14583SBruce Richardson } 1774c1d14583SBruce Richardson 1775*7662502dSBruce Richardson ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit); 1776c1d14583SBruce Richardson 1777c1d14583SBruce Richardson iavf_vtx(txdp, tx_pkts, nb_commit, flags, offload); 1778c1d14583SBruce Richardson 1779c1d14583SBruce Richardson tx_id = (uint16_t)(tx_id + nb_commit); 1780e61679e7SBruce Richardson if (tx_id > txq->tx_next_rs) { 17814d0f54d9SBruce Richardson txq->iavf_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |= 1782c1d14583SBruce Richardson rte_cpu_to_le_64(((uint64_t)IAVF_TX_DESC_CMD_RS) << 1783c1d14583SBruce Richardson IAVF_TXD_QW1_CMD_SHIFT); 1784e61679e7SBruce Richardson txq->tx_next_rs = 1785e61679e7SBruce Richardson (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh); 1786c1d14583SBruce Richardson } 1787c1d14583SBruce Richardson 1788c1d14583SBruce Richardson txq->tx_tail = tx_id; 1789c1d14583SBruce Richardson 1790c1d14583SBruce Richardson IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail); 1791c1d14583SBruce Richardson 1792c1d14583SBruce Richardson return nb_pkts; 1793c1d14583SBruce Richardson } 1794c1d14583SBruce Richardson 1795c1d14583SBruce Richardson static __rte_always_inline uint16_t 1796c1d14583SBruce Richardson iavf_xmit_pkts_vec_avx2_common(void *tx_queue, struct rte_mbuf **tx_pkts, 1797c1d14583SBruce Richardson uint16_t nb_pkts, bool offload) 1798c1d14583SBruce Richardson { 1799c1d14583SBruce Richardson uint16_t nb_tx = 0; 1800b92babc2SBruce Richardson struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue; 1801c1d14583SBruce Richardson 1802c1d14583SBruce Richardson while (nb_pkts) { 1803c1d14583SBruce Richardson uint16_t ret, num; 1804c1d14583SBruce Richardson 1805c1d14583SBruce Richardson /* cross rs_thresh boundary is not allowed */ 1806e61679e7SBruce Richardson num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh); 1807c1d14583SBruce Richardson ret = iavf_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx], 1808c1d14583SBruce Richardson num, offload); 1809c1d14583SBruce Richardson nb_tx += ret; 1810c1d14583SBruce Richardson nb_pkts -= ret; 1811c1d14583SBruce Richardson if (ret < num) 1812c1d14583SBruce Richardson break; 1813c1d14583SBruce Richardson } 1814c1d14583SBruce Richardson 1815c1d14583SBruce Richardson return nb_tx; 1816c1d14583SBruce Richardson } 1817c1d14583SBruce Richardson 1818c1d14583SBruce Richardson uint16_t 1819c1d14583SBruce Richardson iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, 1820c1d14583SBruce Richardson uint16_t nb_pkts) 1821c1d14583SBruce Richardson { 1822c1d14583SBruce Richardson return iavf_xmit_pkts_vec_avx2_common(tx_queue, tx_pkts, nb_pkts, false); 1823c1d14583SBruce Richardson } 1824c1d14583SBruce Richardson 1825c1d14583SBruce Richardson uint16_t 1826c1d14583SBruce Richardson iavf_xmit_pkts_vec_avx2_offload(void *tx_queue, struct rte_mbuf **tx_pkts, 1827c1d14583SBruce Richardson uint16_t nb_pkts) 1828c1d14583SBruce Richardson { 1829c1d14583SBruce Richardson return iavf_xmit_pkts_vec_avx2_common(tx_queue, tx_pkts, nb_pkts, true); 1830c1d14583SBruce Richardson } 1831