1c4e4c189SLance Richardson /* SPDX-License-Identifier: BSD-3-Clause */ 2e6e8f03eSRandy Schacher /* Copyright(c) 2019-2023 Broadcom All rights reserved. */ 3c4e4c189SLance Richardson 4c4e4c189SLance Richardson #include <inttypes.h> 5c4e4c189SLance Richardson #include <stdbool.h> 6c4e4c189SLance Richardson 7c4e4c189SLance Richardson #include <rte_bitmap.h> 8c4e4c189SLance Richardson #include <rte_byteorder.h> 9c4e4c189SLance Richardson #include <rte_malloc.h> 10c4e4c189SLance Richardson #include <rte_memory.h> 11c4e4c189SLance Richardson #include <rte_vect.h> 12c4e4c189SLance Richardson 13c4e4c189SLance Richardson #include "bnxt.h" 14c4e4c189SLance Richardson #include "bnxt_cpr.h" 15c4e4c189SLance Richardson #include "bnxt_ring.h" 16c4e4c189SLance Richardson 17c4e4c189SLance Richardson #include "bnxt_txq.h" 18c4e4c189SLance Richardson #include "bnxt_txr.h" 19c4e4c189SLance Richardson #include "bnxt_rxtx_vec_common.h" 20c4e4c189SLance Richardson #include <unistd.h> 21c4e4c189SLance Richardson 22c4e4c189SLance Richardson static uint16_t 23c4e4c189SLance Richardson recv_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) 24c4e4c189SLance Richardson { 25c4e4c189SLance Richardson struct bnxt_rx_queue *rxq = rx_queue; 26c4e4c189SLance Richardson const __m256i mbuf_init = 27c4e4c189SLance Richardson _mm256_set_epi64x(0, 0, 0, rxq->mbuf_initializer); 28c4e4c189SLance Richardson struct bnxt_cp_ring_info *cpr = rxq->cp_ring; 29c4e4c189SLance Richardson struct bnxt_rx_ring_info *rxr = rxq->rx_ring; 30c4e4c189SLance Richardson uint16_t cp_ring_size = cpr->cp_ring_struct->ring_size; 31c4e4c189SLance Richardson uint16_t rx_ring_size = rxr->rx_ring_struct->ring_size; 32c4e4c189SLance Richardson struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring; 33c4e4c189SLance Richardson uint64_t valid, desc_valid_mask = ~0ULL; 34c4e4c189SLance Richardson const __m256i info3_v_mask = _mm256_set1_epi32(CMPL_BASE_V); 35c4e4c189SLance Richardson uint32_t raw_cons = cpr->cp_raw_cons; 36c4e4c189SLance Richardson uint32_t cons, mbcons; 37c4e4c189SLance Richardson int nb_rx_pkts = 0; 38c4e4c189SLance Richardson int i; 39c4e4c189SLance Richardson const __m256i valid_target = 40c4e4c189SLance Richardson _mm256_set1_epi32(!!(raw_cons & cp_ring_size)); 41c4e4c189SLance Richardson const __m256i dsc_shuf_msk = 42c4e4c189SLance Richardson _mm256_set_epi8(0xff, 0xff, 0xff, 0xff, /* Zeroes. */ 43c4e4c189SLance Richardson 7, 6, /* metadata type */ 44c4e4c189SLance Richardson 9, 8, /* flags2 low 16 */ 45c4e4c189SLance Richardson 5, 4, /* vlan_tci */ 46c4e4c189SLance Richardson 1, 0, /* errors_v2 */ 47c4e4c189SLance Richardson 0xff, 0xff, 0xff, 0xff, /* Zeroes. */ 48c4e4c189SLance Richardson 0xff, 0xff, 0xff, 0xff, /* Zeroes. */ 49c4e4c189SLance Richardson 7, 6, /* metadata type */ 50c4e4c189SLance Richardson 9, 8, /* flags2 low 16 */ 51c4e4c189SLance Richardson 5, 4, /* vlan_tci */ 52c4e4c189SLance Richardson 1, 0, /* errors_v2 */ 53c4e4c189SLance Richardson 0xff, 0xff, 0xff, 0xff); /* Zeroes. */ 54c4e4c189SLance Richardson const __m256i shuf_msk = 55c4e4c189SLance Richardson _mm256_set_epi8(15, 14, 13, 12, /* rss */ 56c4e4c189SLance Richardson 7, 6, /* vlan_tci */ 57c4e4c189SLance Richardson 3, 2, /* data_len */ 58c4e4c189SLance Richardson 0xFF, 0xFF, 3, 2, /* pkt_len */ 59c4e4c189SLance Richardson 0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type (zeroes) */ 60c4e4c189SLance Richardson 15, 14, 13, 12, /* rss */ 61c4e4c189SLance Richardson 7, 6, /* vlan_tci */ 62c4e4c189SLance Richardson 3, 2, /* data_len */ 63c4e4c189SLance Richardson 0xFF, 0xFF, 3, 2, /* pkt_len */ 64c4e4c189SLance Richardson 0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */ 65c4e4c189SLance Richardson const __m256i flags_type_mask = 66c4e4c189SLance Richardson _mm256_set1_epi32(RX_PKT_CMPL_FLAGS_ITYPE_MASK); 67c4e4c189SLance Richardson const __m256i flags2_mask1 = 68c4e4c189SLance Richardson _mm256_set1_epi32(CMPL_FLAGS2_VLAN_TUN_MSK); 69c4e4c189SLance Richardson const __m256i flags2_mask2 = 70c4e4c189SLance Richardson _mm256_set1_epi32(RX_PKT_CMPL_FLAGS2_IP_TYPE); 71c4e4c189SLance Richardson const __m256i rss_mask = 72c4e4c189SLance Richardson _mm256_set1_epi32(RX_PKT_CMPL_FLAGS_RSS_VALID); 73c4e4c189SLance Richardson __m256i t0, t1, flags_type, flags2, index, errors; 74c4e4c189SLance Richardson __m256i ptype_idx, ptypes, is_tunnel; 75c4e4c189SLance Richardson __m256i mbuf01, mbuf23, mbuf45, mbuf67; 76c4e4c189SLance Richardson __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5, rearm6, rearm7; 77c4e4c189SLance Richardson __m256i ol_flags, ol_flags_hi; 78c4e4c189SLance Richardson __m256i rss_flags; 79c4e4c189SLance Richardson 80c4e4c189SLance Richardson /* Validate ptype table indexing at build time. */ 81c4e4c189SLance Richardson bnxt_check_ptype_constants(); 82c4e4c189SLance Richardson 83c4e4c189SLance Richardson /* If Rx Q was stopped return */ 84c4e4c189SLance Richardson if (unlikely(!rxq->rx_started)) 85c4e4c189SLance Richardson return 0; 86c4e4c189SLance Richardson 87c4e4c189SLance Richardson if (rxq->rxrearm_nb >= rxq->rx_free_thresh) 88c4e4c189SLance Richardson bnxt_rxq_rearm(rxq, rxr); 89c4e4c189SLance Richardson 90c4e4c189SLance Richardson nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, BNXT_RX_DESCS_PER_LOOP_VEC256); 91c4e4c189SLance Richardson 92c4e4c189SLance Richardson cons = raw_cons & (cp_ring_size - 1); 93c4e4c189SLance Richardson mbcons = (raw_cons / 2) & (rx_ring_size - 1); 94c4e4c189SLance Richardson 95e5f2b3ebSLance Richardson /* Return immediately if there is not at least one completed packet. */ 96e5f2b3ebSLance Richardson if (!bnxt_cpr_cmp_valid(&cp_desc_ring[cons], raw_cons, cp_ring_size)) 97e5f2b3ebSLance Richardson return 0; 98e5f2b3ebSLance Richardson 99c4e4c189SLance Richardson /* Ensure that we do not go past the ends of the rings. */ 100c4e4c189SLance Richardson nb_pkts = RTE_MIN(nb_pkts, RTE_MIN(rx_ring_size - mbcons, 101c4e4c189SLance Richardson (cp_ring_size - cons) / 2)); 102c4e4c189SLance Richardson /* 103c4e4c189SLance Richardson * If we are at the end of the ring, ensure that descriptors after the 104c4e4c189SLance Richardson * last valid entry are not treated as valid. Otherwise, force the 105c4e4c189SLance Richardson * maximum number of packets to receive to be a multiple of the per- 106c4e4c189SLance Richardson * loop count. 107c4e4c189SLance Richardson */ 108c4e4c189SLance Richardson if (nb_pkts < BNXT_RX_DESCS_PER_LOOP_VEC256) { 109c4e4c189SLance Richardson desc_valid_mask >>= 110c4e4c189SLance Richardson CHAR_BIT * (BNXT_RX_DESCS_PER_LOOP_VEC256 - nb_pkts); 111c4e4c189SLance Richardson } else { 112c4e4c189SLance Richardson nb_pkts = 113c4e4c189SLance Richardson RTE_ALIGN_FLOOR(nb_pkts, BNXT_RX_DESCS_PER_LOOP_VEC256); 114c4e4c189SLance Richardson } 115c4e4c189SLance Richardson 116c4e4c189SLance Richardson /* Handle RX burst request */ 117c4e4c189SLance Richardson for (i = 0; i < nb_pkts; i += BNXT_RX_DESCS_PER_LOOP_VEC256, 118c4e4c189SLance Richardson cons += BNXT_RX_DESCS_PER_LOOP_VEC256 * 2, 119c4e4c189SLance Richardson mbcons += BNXT_RX_DESCS_PER_LOOP_VEC256) { 120c4e4c189SLance Richardson __m256i desc0, desc1, desc2, desc3, desc4, desc5, desc6, desc7; 121c4e4c189SLance Richardson __m256i rxcmp0_1, rxcmp2_3, rxcmp4_5, rxcmp6_7, info3_v; 122c4e4c189SLance Richardson __m256i errors_v2; 123c4e4c189SLance Richardson uint32_t num_valid; 124c4e4c189SLance Richardson 125c4e4c189SLance Richardson /* Copy eight mbuf pointers to output array. */ 126c4e4c189SLance Richardson t0 = _mm256_loadu_si256((void *)&rxr->rx_buf_ring[mbcons]); 127c4e4c189SLance Richardson _mm256_storeu_si256((void *)&rx_pkts[i], t0); 128c4e4c189SLance Richardson #ifdef RTE_ARCH_X86_64 129c4e4c189SLance Richardson t0 = _mm256_loadu_si256((void *)&rxr->rx_buf_ring[mbcons + 4]); 130c4e4c189SLance Richardson _mm256_storeu_si256((void *)&rx_pkts[i + 4], t0); 131c4e4c189SLance Richardson #endif 132c4e4c189SLance Richardson 133c4e4c189SLance Richardson /* 134c4e4c189SLance Richardson * Load eight receive completion descriptors into 256-bit 135c4e4c189SLance Richardson * registers. Loads are issued in reverse order in order to 136c4e4c189SLance Richardson * ensure consistent state. 137c4e4c189SLance Richardson */ 138c4e4c189SLance Richardson desc7 = _mm256_load_si256((void *)&cp_desc_ring[cons + 14]); 139c4e4c189SLance Richardson rte_compiler_barrier(); 140c4e4c189SLance Richardson desc6 = _mm256_load_si256((void *)&cp_desc_ring[cons + 12]); 141c4e4c189SLance Richardson rte_compiler_barrier(); 142c4e4c189SLance Richardson desc5 = _mm256_load_si256((void *)&cp_desc_ring[cons + 10]); 143c4e4c189SLance Richardson rte_compiler_barrier(); 144c4e4c189SLance Richardson desc4 = _mm256_load_si256((void *)&cp_desc_ring[cons + 8]); 145c4e4c189SLance Richardson rte_compiler_barrier(); 146c4e4c189SLance Richardson desc3 = _mm256_load_si256((void *)&cp_desc_ring[cons + 6]); 147c4e4c189SLance Richardson rte_compiler_barrier(); 148c4e4c189SLance Richardson desc2 = _mm256_load_si256((void *)&cp_desc_ring[cons + 4]); 149c4e4c189SLance Richardson rte_compiler_barrier(); 150c4e4c189SLance Richardson desc1 = _mm256_load_si256((void *)&cp_desc_ring[cons + 2]); 151c4e4c189SLance Richardson rte_compiler_barrier(); 152c4e4c189SLance Richardson desc0 = _mm256_load_si256((void *)&cp_desc_ring[cons + 0]); 153c4e4c189SLance Richardson 154c4e4c189SLance Richardson /* 155c4e4c189SLance Richardson * Pack needed fields from each descriptor into a compressed 156c4e4c189SLance Richardson * 128-bit layout and pair two compressed descriptors into 157c4e4c189SLance Richardson * 256-bit registers. The 128-bit compressed layout is as 158c4e4c189SLance Richardson * follows: 159c4e4c189SLance Richardson * Bits 0-15: flags_type field from low completion record. 160c4e4c189SLance Richardson * Bits 16-31: len field from low completion record. 161c4e4c189SLance Richardson * Bits 32-47: flags2 (low 16 bits) from high completion. 162c4e4c189SLance Richardson * Bits 48-79: metadata from high completion record. 163c4e4c189SLance Richardson * Bits 80-95: errors_v2 from high completion record. 164c4e4c189SLance Richardson * Bits 96-127: rss hash from low completion record. 165c4e4c189SLance Richardson */ 166c4e4c189SLance Richardson t0 = _mm256_permute2f128_si256(desc6, desc7, 0x20); 167c4e4c189SLance Richardson t1 = _mm256_permute2f128_si256(desc6, desc7, 0x31); 168c4e4c189SLance Richardson t1 = _mm256_shuffle_epi8(t1, dsc_shuf_msk); 169c4e4c189SLance Richardson rxcmp6_7 = _mm256_blend_epi32(t0, t1, 0x66); 170c4e4c189SLance Richardson 171c4e4c189SLance Richardson t0 = _mm256_permute2f128_si256(desc4, desc5, 0x20); 172c4e4c189SLance Richardson t1 = _mm256_permute2f128_si256(desc4, desc5, 0x31); 173c4e4c189SLance Richardson t1 = _mm256_shuffle_epi8(t1, dsc_shuf_msk); 174c4e4c189SLance Richardson rxcmp4_5 = _mm256_blend_epi32(t0, t1, 0x66); 175c4e4c189SLance Richardson 176c4e4c189SLance Richardson t0 = _mm256_permute2f128_si256(desc2, desc3, 0x20); 177c4e4c189SLance Richardson t1 = _mm256_permute2f128_si256(desc2, desc3, 0x31); 178c4e4c189SLance Richardson t1 = _mm256_shuffle_epi8(t1, dsc_shuf_msk); 179c4e4c189SLance Richardson rxcmp2_3 = _mm256_blend_epi32(t0, t1, 0x66); 180c4e4c189SLance Richardson 181c4e4c189SLance Richardson t0 = _mm256_permute2f128_si256(desc0, desc1, 0x20); 182c4e4c189SLance Richardson t1 = _mm256_permute2f128_si256(desc0, desc1, 0x31); 183c4e4c189SLance Richardson t1 = _mm256_shuffle_epi8(t1, dsc_shuf_msk); 184c4e4c189SLance Richardson rxcmp0_1 = _mm256_blend_epi32(t0, t1, 0x66); 185c4e4c189SLance Richardson 186c4e4c189SLance Richardson /* Compute packet type table indices for eight packets. */ 187c4e4c189SLance Richardson t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3); 188c4e4c189SLance Richardson t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7); 189c4e4c189SLance Richardson flags_type = _mm256_unpacklo_epi64(t0, t1); 190c4e4c189SLance Richardson ptype_idx = _mm256_and_si256(flags_type, flags_type_mask); 191c4e4c189SLance Richardson ptype_idx = _mm256_srli_epi32(ptype_idx, 192c4e4c189SLance Richardson RX_PKT_CMPL_FLAGS_ITYPE_SFT - 193c4e4c189SLance Richardson BNXT_PTYPE_TBL_TYPE_SFT); 194c4e4c189SLance Richardson 195c4e4c189SLance Richardson t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3); 196c4e4c189SLance Richardson t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7); 197c4e4c189SLance Richardson flags2 = _mm256_unpackhi_epi64(t0, t1); 198c4e4c189SLance Richardson 199c4e4c189SLance Richardson t0 = _mm256_srli_epi32(_mm256_and_si256(flags2, flags2_mask1), 200c4e4c189SLance Richardson RX_PKT_CMPL_FLAGS2_META_FORMAT_SFT - 201c4e4c189SLance Richardson BNXT_PTYPE_TBL_VLAN_SFT); 202c4e4c189SLance Richardson ptype_idx = _mm256_or_si256(ptype_idx, t0); 203c4e4c189SLance Richardson 204c4e4c189SLance Richardson t0 = _mm256_srli_epi32(_mm256_and_si256(flags2, flags2_mask2), 205c4e4c189SLance Richardson RX_PKT_CMPL_FLAGS2_IP_TYPE_SFT - 206c4e4c189SLance Richardson BNXT_PTYPE_TBL_IP_VER_SFT); 207c4e4c189SLance Richardson ptype_idx = _mm256_or_si256(ptype_idx, t0); 208c4e4c189SLance Richardson 209c4e4c189SLance Richardson /* 210c4e4c189SLance Richardson * Load ptypes for eight packets using gather. Gather operations 211c4e4c189SLance Richardson * have extremely high latency (~19 cycles), execution and use 212c4e4c189SLance Richardson * of result should be separated as much as possible. 213c4e4c189SLance Richardson */ 214c4e4c189SLance Richardson ptypes = _mm256_i32gather_epi32((int *)bnxt_ptype_table, 215c4e4c189SLance Richardson ptype_idx, sizeof(uint32_t)); 216c4e4c189SLance Richardson /* 217c4e4c189SLance Richardson * Compute ol_flags and checksum error table indices for eight 218c4e4c189SLance Richardson * packets. 219c4e4c189SLance Richardson */ 220c4e4c189SLance Richardson is_tunnel = _mm256_and_si256(flags2, _mm256_set1_epi32(4)); 221c4e4c189SLance Richardson is_tunnel = _mm256_slli_epi32(is_tunnel, 3); 222c4e4c189SLance Richardson flags2 = _mm256_and_si256(flags2, _mm256_set1_epi32(0x1F)); 223c4e4c189SLance Richardson 224c4e4c189SLance Richardson /* Extract errors_v2 fields for eight packets. */ 225c4e4c189SLance Richardson t0 = _mm256_unpackhi_epi32(rxcmp0_1, rxcmp2_3); 226c4e4c189SLance Richardson t1 = _mm256_unpackhi_epi32(rxcmp4_5, rxcmp6_7); 227c4e4c189SLance Richardson errors_v2 = _mm256_unpacklo_epi64(t0, t1); 228c4e4c189SLance Richardson 229c4e4c189SLance Richardson errors = _mm256_srli_epi32(errors_v2, 4); 230c4e4c189SLance Richardson errors = _mm256_and_si256(errors, _mm256_set1_epi32(0xF)); 231c4e4c189SLance Richardson errors = _mm256_and_si256(errors, flags2); 232c4e4c189SLance Richardson 233c4e4c189SLance Richardson index = _mm256_andnot_si256(errors, flags2); 234c4e4c189SLance Richardson errors = _mm256_or_si256(errors, 235c4e4c189SLance Richardson _mm256_srli_epi32(is_tunnel, 1)); 236c4e4c189SLance Richardson index = _mm256_or_si256(index, is_tunnel); 237c4e4c189SLance Richardson 238c4e4c189SLance Richardson /* 239c4e4c189SLance Richardson * Load ol_flags for eight packets using gather. Gather 240c4e4c189SLance Richardson * operations have extremely high latency (~19 cycles), 241c4e4c189SLance Richardson * execution and use of result should be separated as much 242c4e4c189SLance Richardson * as possible. 243c4e4c189SLance Richardson */ 244c4e4c189SLance Richardson ol_flags = _mm256_i32gather_epi32((int *)rxr->ol_flags_table, 245c4e4c189SLance Richardson index, sizeof(uint32_t)); 246c4e4c189SLance Richardson errors = _mm256_i32gather_epi32((int *)rxr->ol_flags_err_table, 247c4e4c189SLance Richardson errors, sizeof(uint32_t)); 248c4e4c189SLance Richardson 249c4e4c189SLance Richardson /* 250c4e4c189SLance Richardson * Pack the 128-bit array of valid descriptor flags into 64 251c4e4c189SLance Richardson * bits and count the number of set bits in order to determine 252c4e4c189SLance Richardson * the number of valid descriptors. 253c4e4c189SLance Richardson */ 254c4e4c189SLance Richardson const __m256i perm_msk = 255c4e4c189SLance Richardson _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); 256c4e4c189SLance Richardson info3_v = _mm256_permutevar8x32_epi32(errors_v2, perm_msk); 257c4e4c189SLance Richardson info3_v = _mm256_and_si256(errors_v2, info3_v_mask); 258c4e4c189SLance Richardson info3_v = _mm256_xor_si256(info3_v, valid_target); 259c4e4c189SLance Richardson 260c4e4c189SLance Richardson info3_v = _mm256_packs_epi32(info3_v, _mm256_setzero_si256()); 261c4e4c189SLance Richardson valid = _mm_cvtsi128_si64(_mm256_extracti128_si256(info3_v, 1)); 262c4e4c189SLance Richardson valid = (valid << CHAR_BIT) | 263c4e4c189SLance Richardson _mm_cvtsi128_si64(_mm256_castsi256_si128(info3_v)); 2643d4e27fdSDavid Marchand num_valid = rte_popcount64(valid & desc_valid_mask); 265c4e4c189SLance Richardson 266c4e4c189SLance Richardson if (num_valid == 0) 267c4e4c189SLance Richardson break; 268c4e4c189SLance Richardson 269c4e4c189SLance Richardson /* Update mbuf rearm_data for eight packets. */ 270c4e4c189SLance Richardson mbuf01 = _mm256_shuffle_epi8(rxcmp0_1, shuf_msk); 271c4e4c189SLance Richardson mbuf23 = _mm256_shuffle_epi8(rxcmp2_3, shuf_msk); 272c4e4c189SLance Richardson mbuf45 = _mm256_shuffle_epi8(rxcmp4_5, shuf_msk); 273c4e4c189SLance Richardson mbuf67 = _mm256_shuffle_epi8(rxcmp6_7, shuf_msk); 274c4e4c189SLance Richardson 275c4e4c189SLance Richardson /* Blend in ptype field for two mbufs at a time. */ 276c4e4c189SLance Richardson mbuf01 = _mm256_blend_epi32(mbuf01, ptypes, 0x11); 277c4e4c189SLance Richardson mbuf23 = _mm256_blend_epi32(mbuf23, 278c4e4c189SLance Richardson _mm256_srli_si256(ptypes, 4), 0x11); 279c4e4c189SLance Richardson mbuf45 = _mm256_blend_epi32(mbuf45, 280c4e4c189SLance Richardson _mm256_srli_si256(ptypes, 8), 0x11); 281c4e4c189SLance Richardson mbuf67 = _mm256_blend_epi32(mbuf67, 282c4e4c189SLance Richardson _mm256_srli_si256(ptypes, 12), 0x11); 283c4e4c189SLance Richardson 284c4e4c189SLance Richardson /* Unpack rearm data, set fixed fields for first four mbufs. */ 285c4e4c189SLance Richardson rearm0 = _mm256_permute2f128_si256(mbuf_init, mbuf01, 0x20); 286c4e4c189SLance Richardson rearm1 = _mm256_blend_epi32(mbuf_init, mbuf01, 0xF0); 287c4e4c189SLance Richardson rearm2 = _mm256_permute2f128_si256(mbuf_init, mbuf23, 0x20); 288c4e4c189SLance Richardson rearm3 = _mm256_blend_epi32(mbuf_init, mbuf23, 0xF0); 289c4e4c189SLance Richardson 290c4e4c189SLance Richardson /* Compute final ol_flags values for eight packets. */ 291c4e4c189SLance Richardson rss_flags = _mm256_and_si256(flags_type, rss_mask); 292c4e4c189SLance Richardson rss_flags = _mm256_srli_epi32(rss_flags, 9); 293c4e4c189SLance Richardson ol_flags = _mm256_or_si256(ol_flags, errors); 294c4e4c189SLance Richardson ol_flags = _mm256_or_si256(ol_flags, rss_flags); 295c4e4c189SLance Richardson ol_flags_hi = _mm256_permute2f128_si256(ol_flags, 296c4e4c189SLance Richardson ol_flags, 0x11); 297c4e4c189SLance Richardson 298c4e4c189SLance Richardson /* Set ol_flags fields for first four packets. */ 299c4e4c189SLance Richardson rearm0 = _mm256_blend_epi32(rearm0, 300c4e4c189SLance Richardson _mm256_slli_si256(ol_flags, 8), 301c4e4c189SLance Richardson 0x04); 302c4e4c189SLance Richardson rearm1 = _mm256_blend_epi32(rearm1, 303c4e4c189SLance Richardson _mm256_slli_si256(ol_flags_hi, 8), 304c4e4c189SLance Richardson 0x04); 305c4e4c189SLance Richardson rearm2 = _mm256_blend_epi32(rearm2, 306c4e4c189SLance Richardson _mm256_slli_si256(ol_flags, 4), 307c4e4c189SLance Richardson 0x04); 308c4e4c189SLance Richardson rearm3 = _mm256_blend_epi32(rearm3, 309c4e4c189SLance Richardson _mm256_slli_si256(ol_flags_hi, 4), 310c4e4c189SLance Richardson 0x04); 311c4e4c189SLance Richardson 312c4e4c189SLance Richardson /* Store all mbuf fields for first four packets. */ 313c4e4c189SLance Richardson _mm256_storeu_si256((void *)&rx_pkts[i + 0]->rearm_data, 314c4e4c189SLance Richardson rearm0); 315c4e4c189SLance Richardson _mm256_storeu_si256((void *)&rx_pkts[i + 1]->rearm_data, 316c4e4c189SLance Richardson rearm1); 317c4e4c189SLance Richardson _mm256_storeu_si256((void *)&rx_pkts[i + 2]->rearm_data, 318c4e4c189SLance Richardson rearm2); 319c4e4c189SLance Richardson _mm256_storeu_si256((void *)&rx_pkts[i + 3]->rearm_data, 320c4e4c189SLance Richardson rearm3); 321c4e4c189SLance Richardson 322c4e4c189SLance Richardson /* Unpack rearm data, set fixed fields for final four mbufs. */ 323c4e4c189SLance Richardson rearm4 = _mm256_permute2f128_si256(mbuf_init, mbuf45, 0x20); 324c4e4c189SLance Richardson rearm5 = _mm256_blend_epi32(mbuf_init, mbuf45, 0xF0); 325c4e4c189SLance Richardson rearm6 = _mm256_permute2f128_si256(mbuf_init, mbuf67, 0x20); 326c4e4c189SLance Richardson rearm7 = _mm256_blend_epi32(mbuf_init, mbuf67, 0xF0); 327c4e4c189SLance Richardson 328c4e4c189SLance Richardson /* Set ol_flags fields for final four packets. */ 329c4e4c189SLance Richardson rearm4 = _mm256_blend_epi32(rearm4, ol_flags, 0x04); 330c4e4c189SLance Richardson rearm5 = _mm256_blend_epi32(rearm5, ol_flags_hi, 0x04); 331c4e4c189SLance Richardson rearm6 = _mm256_blend_epi32(rearm6, 332c4e4c189SLance Richardson _mm256_srli_si256(ol_flags, 4), 333c4e4c189SLance Richardson 0x04); 334c4e4c189SLance Richardson rearm7 = _mm256_blend_epi32(rearm7, 335c4e4c189SLance Richardson _mm256_srli_si256(ol_flags_hi, 4), 336c4e4c189SLance Richardson 0x04); 337c4e4c189SLance Richardson 338c4e4c189SLance Richardson /* Store all mbuf fields for final four packets. */ 339c4e4c189SLance Richardson _mm256_storeu_si256((void *)&rx_pkts[i + 4]->rearm_data, 340c4e4c189SLance Richardson rearm4); 341c4e4c189SLance Richardson _mm256_storeu_si256((void *)&rx_pkts[i + 5]->rearm_data, 342c4e4c189SLance Richardson rearm5); 343c4e4c189SLance Richardson _mm256_storeu_si256((void *)&rx_pkts[i + 6]->rearm_data, 344c4e4c189SLance Richardson rearm6); 345c4e4c189SLance Richardson _mm256_storeu_si256((void *)&rx_pkts[i + 7]->rearm_data, 346c4e4c189SLance Richardson rearm7); 347c4e4c189SLance Richardson 348c4e4c189SLance Richardson nb_rx_pkts += num_valid; 349c4e4c189SLance Richardson if (num_valid < BNXT_RX_DESCS_PER_LOOP_VEC256) 350c4e4c189SLance Richardson break; 351c4e4c189SLance Richardson } 352c4e4c189SLance Richardson 353c4e4c189SLance Richardson if (nb_rx_pkts) { 354c4e4c189SLance Richardson rxr->rx_raw_prod = RING_ADV(rxr->rx_raw_prod, nb_rx_pkts); 355c4e4c189SLance Richardson 356c4e4c189SLance Richardson rxq->rxrearm_nb += nb_rx_pkts; 357c4e4c189SLance Richardson cpr->cp_raw_cons += 2 * nb_rx_pkts; 358c4e4c189SLance Richardson bnxt_db_cq(cpr); 359c4e4c189SLance Richardson } 360c4e4c189SLance Richardson 361c4e4c189SLance Richardson return nb_rx_pkts; 362c4e4c189SLance Richardson } 363c4e4c189SLance Richardson 364d58c6c07SAjit Khaparde static uint16_t 365d58c6c07SAjit Khaparde crx_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) 366d58c6c07SAjit Khaparde { 367d58c6c07SAjit Khaparde struct bnxt_rx_queue *rxq = rx_queue; 368d58c6c07SAjit Khaparde const __m256i mbuf_init = 369d58c6c07SAjit Khaparde _mm256_set_epi64x(0, 0, 0, rxq->mbuf_initializer); 370d58c6c07SAjit Khaparde struct bnxt_cp_ring_info *cpr = rxq->cp_ring; 371d58c6c07SAjit Khaparde struct bnxt_rx_ring_info *rxr = rxq->rx_ring; 372d58c6c07SAjit Khaparde uint16_t cp_ring_size = cpr->cp_ring_struct->ring_size; 373d58c6c07SAjit Khaparde uint16_t rx_ring_size = rxr->rx_ring_struct->ring_size; 374d58c6c07SAjit Khaparde struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring; 375d58c6c07SAjit Khaparde uint64_t valid, desc_valid_mask = ~0ULL; 376d58c6c07SAjit Khaparde const __m256i info3_v_mask = _mm256_set1_epi32(CMPL_BASE_V); 377d58c6c07SAjit Khaparde uint32_t raw_cons = cpr->cp_raw_cons; 378d58c6c07SAjit Khaparde uint32_t cons, mbcons; 379d58c6c07SAjit Khaparde int nb_rx_pkts = 0; 380d58c6c07SAjit Khaparde int i; 381d58c6c07SAjit Khaparde const __m256i valid_target = 382d58c6c07SAjit Khaparde _mm256_set1_epi32(!!(raw_cons & cp_ring_size)); 383d58c6c07SAjit Khaparde const __m256i shuf_msk = 384d58c6c07SAjit Khaparde _mm256_set_epi8(15, 14, 13, 12, /* rss */ 385d58c6c07SAjit Khaparde 7, 6, /* vlan_tci */ 386d58c6c07SAjit Khaparde 3, 2, /* data_len */ 387d58c6c07SAjit Khaparde 0xFF, 0xFF, 3, 2, /* pkt_len */ 388d58c6c07SAjit Khaparde 0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type (zeroes) */ 389d58c6c07SAjit Khaparde 15, 14, 13, 12, /* rss */ 390d58c6c07SAjit Khaparde 7, 6, /* vlan_tci */ 391d58c6c07SAjit Khaparde 3, 2, /* data_len */ 392d58c6c07SAjit Khaparde 0xFF, 0xFF, 3, 2, /* pkt_len */ 393d58c6c07SAjit Khaparde 0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */ 394d58c6c07SAjit Khaparde const __m256i flags_type_mask = 395de7432f2SAjit Khaparde _mm256_set1_epi32(RX_PKT_COMPRESS_CMPL_FLAGS_ITYPE_MASK); 396d58c6c07SAjit Khaparde const __m256i flags2_mask1 = 397de7432f2SAjit Khaparde _mm256_set1_epi32(CMPL_FLAGS2_VLAN_TUN_MSK_CRX); 398d58c6c07SAjit Khaparde const __m256i flags2_mask2 = 399de7432f2SAjit Khaparde _mm256_set1_epi32(RX_PKT_COMPRESS_CMPL_FLAGS_IP_TYPE); 400d58c6c07SAjit Khaparde const __m256i rss_mask = 401de7432f2SAjit Khaparde _mm256_set1_epi32(RX_PKT_COMPRESS_CMPL_FLAGS_RSS_VALID); 402d58c6c07SAjit Khaparde __m256i t0, t1, flags_type, flags2, index, errors; 403d58c6c07SAjit Khaparde __m256i ptype_idx, ptypes, is_tunnel; 404d58c6c07SAjit Khaparde __m256i mbuf01, mbuf23, mbuf45, mbuf67; 405d58c6c07SAjit Khaparde __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5, rearm6, rearm7; 406d58c6c07SAjit Khaparde __m256i ol_flags, ol_flags_hi; 407d58c6c07SAjit Khaparde __m256i rss_flags; 408de7432f2SAjit Khaparde __m256i errors_v2; 409de7432f2SAjit Khaparde __m256i cs_err_v2; 410d58c6c07SAjit Khaparde 411d58c6c07SAjit Khaparde /* Validate ptype table indexing at build time. */ 412d58c6c07SAjit Khaparde bnxt_check_ptype_constants(); 413d58c6c07SAjit Khaparde 414d58c6c07SAjit Khaparde /* If Rx Q was stopped return */ 415d58c6c07SAjit Khaparde if (unlikely(!rxq->rx_started)) 416d58c6c07SAjit Khaparde return 0; 417d58c6c07SAjit Khaparde 418d58c6c07SAjit Khaparde if (rxq->rxrearm_nb >= rxq->rx_free_thresh) 419d58c6c07SAjit Khaparde bnxt_rxq_rearm(rxq, rxr); 420d58c6c07SAjit Khaparde 421d58c6c07SAjit Khaparde nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, BNXT_RX_DESCS_PER_LOOP_VEC256); 422d58c6c07SAjit Khaparde 423d58c6c07SAjit Khaparde cons = raw_cons & (cp_ring_size - 1); 424d58c6c07SAjit Khaparde mbcons = raw_cons & (rx_ring_size - 1); 425d58c6c07SAjit Khaparde 426d58c6c07SAjit Khaparde /* Return immediately if there is not at least one completed packet. */ 427d58c6c07SAjit Khaparde if (!bnxt_cpr_cmp_valid(&cp_desc_ring[cons], raw_cons, cp_ring_size)) 428d58c6c07SAjit Khaparde return 0; 429d58c6c07SAjit Khaparde 430d58c6c07SAjit Khaparde /* Ensure that we do not go past the ends of the rings. */ 431d58c6c07SAjit Khaparde nb_pkts = RTE_MIN(nb_pkts, RTE_MIN(rx_ring_size - mbcons, 432d58c6c07SAjit Khaparde cp_ring_size - cons)); 433d58c6c07SAjit Khaparde /* 434d58c6c07SAjit Khaparde * If we are at the end of the ring, ensure that descriptors after the 435d58c6c07SAjit Khaparde * last valid entry are not treated as valid. Otherwise, force the 436d58c6c07SAjit Khaparde * maximum number of packets to receive to be a multiple of the per- 437d58c6c07SAjit Khaparde * loop count. 438d58c6c07SAjit Khaparde */ 439d58c6c07SAjit Khaparde if (nb_pkts < BNXT_RX_DESCS_PER_LOOP_VEC256) { 440d58c6c07SAjit Khaparde desc_valid_mask >>= 441d58c6c07SAjit Khaparde CHAR_BIT * (BNXT_RX_DESCS_PER_LOOP_VEC256 - nb_pkts); 442d58c6c07SAjit Khaparde } else { 443d58c6c07SAjit Khaparde nb_pkts = 444d58c6c07SAjit Khaparde RTE_ALIGN_FLOOR(nb_pkts, BNXT_RX_DESCS_PER_LOOP_VEC256); 445d58c6c07SAjit Khaparde } 446d58c6c07SAjit Khaparde 447d58c6c07SAjit Khaparde /* Handle RX burst request */ 448d58c6c07SAjit Khaparde for (i = 0; i < nb_pkts; i += BNXT_RX_DESCS_PER_LOOP_VEC256, 449d58c6c07SAjit Khaparde cons += BNXT_RX_DESCS_PER_LOOP_VEC256, 450d58c6c07SAjit Khaparde mbcons += BNXT_RX_DESCS_PER_LOOP_VEC256) { 451d58c6c07SAjit Khaparde __m256i rxcmp0_1, rxcmp2_3, rxcmp4_5, rxcmp6_7, info3_v; 452d58c6c07SAjit Khaparde uint32_t num_valid; 453d58c6c07SAjit Khaparde 454d58c6c07SAjit Khaparde /* Copy eight mbuf pointers to output array. */ 455d58c6c07SAjit Khaparde t0 = _mm256_loadu_si256((void *)&rxr->rx_buf_ring[mbcons]); 456d58c6c07SAjit Khaparde _mm256_storeu_si256((void *)&rx_pkts[i], t0); 457d58c6c07SAjit Khaparde #ifdef RTE_ARCH_X86_64 458d58c6c07SAjit Khaparde t0 = _mm256_loadu_si256((void *)&rxr->rx_buf_ring[mbcons + 4]); 459d58c6c07SAjit Khaparde _mm256_storeu_si256((void *)&rx_pkts[i + 4], t0); 460d58c6c07SAjit Khaparde #endif 461d58c6c07SAjit Khaparde 462d58c6c07SAjit Khaparde /* 463d58c6c07SAjit Khaparde * Load eight receive completion descriptors into 256-bit 464d58c6c07SAjit Khaparde * registers. Loads are issued in reverse order in order to 465d58c6c07SAjit Khaparde * ensure consistent state. 466d58c6c07SAjit Khaparde */ 467d58c6c07SAjit Khaparde rxcmp6_7 = _mm256_loadu_si256((void *)&cp_desc_ring[cons + 6]); 468d58c6c07SAjit Khaparde rte_compiler_barrier(); 469d58c6c07SAjit Khaparde rxcmp4_5 = _mm256_loadu_si256((void *)&cp_desc_ring[cons + 4]); 470d58c6c07SAjit Khaparde rte_compiler_barrier(); 471d58c6c07SAjit Khaparde rxcmp2_3 = _mm256_loadu_si256((void *)&cp_desc_ring[cons + 2]); 472d58c6c07SAjit Khaparde rte_compiler_barrier(); 473d58c6c07SAjit Khaparde rxcmp0_1 = _mm256_loadu_si256((void *)&cp_desc_ring[cons + 0]); 474de7432f2SAjit Khaparde rte_compiler_barrier(); 475d58c6c07SAjit Khaparde 476d58c6c07SAjit Khaparde /* Compute packet type table indices for eight packets. */ 477d58c6c07SAjit Khaparde t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3); 478d58c6c07SAjit Khaparde t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7); 479d58c6c07SAjit Khaparde flags_type = _mm256_unpacklo_epi64(t0, t1); 480d58c6c07SAjit Khaparde ptype_idx = _mm256_and_si256(flags_type, flags_type_mask); 481d58c6c07SAjit Khaparde ptype_idx = _mm256_srli_epi32(ptype_idx, 482de7432f2SAjit Khaparde RX_PKT_COMPRESS_CMPL_FLAGS_ITYPE_SFT - 483d58c6c07SAjit Khaparde BNXT_PTYPE_TBL_TYPE_SFT); 484d58c6c07SAjit Khaparde 485de7432f2SAjit Khaparde t0 = _mm256_unpackhi_epi32(rxcmp0_1, rxcmp2_3); 486de7432f2SAjit Khaparde t1 = _mm256_unpackhi_epi32(rxcmp4_5, rxcmp6_7); 487de7432f2SAjit Khaparde cs_err_v2 = _mm256_unpacklo_epi64(t0, t1); 488d58c6c07SAjit Khaparde 489de7432f2SAjit Khaparde t0 = _mm256_srli_epi32(_mm256_and_si256(cs_err_v2, flags2_mask1), 490de7432f2SAjit Khaparde RX_PKT_COMPRESS_CMPL_METADATA1_SFT - 491d58c6c07SAjit Khaparde BNXT_PTYPE_TBL_VLAN_SFT); 492d58c6c07SAjit Khaparde ptype_idx = _mm256_or_si256(ptype_idx, t0); 493d58c6c07SAjit Khaparde 494de7432f2SAjit Khaparde t0 = _mm256_srli_epi32(_mm256_and_si256(cs_err_v2, flags2_mask2), 495d58c6c07SAjit Khaparde RX_PKT_CMPL_FLAGS2_IP_TYPE_SFT - 496d58c6c07SAjit Khaparde BNXT_PTYPE_TBL_IP_VER_SFT); 497d58c6c07SAjit Khaparde ptype_idx = _mm256_or_si256(ptype_idx, t0); 498d58c6c07SAjit Khaparde 499d58c6c07SAjit Khaparde /* 500d58c6c07SAjit Khaparde * Load ptypes for eight packets using gather. Gather operations 501d58c6c07SAjit Khaparde * have extremely high latency (~19 cycles), execution and use 502d58c6c07SAjit Khaparde * of result should be separated as much as possible. 503d58c6c07SAjit Khaparde */ 504d58c6c07SAjit Khaparde ptypes = _mm256_i32gather_epi32((int *)bnxt_ptype_table, 505d58c6c07SAjit Khaparde ptype_idx, sizeof(uint32_t)); 506d58c6c07SAjit Khaparde /* 507d58c6c07SAjit Khaparde * Compute ol_flags and checksum error table indices for eight 508d58c6c07SAjit Khaparde * packets. 509d58c6c07SAjit Khaparde */ 510de7432f2SAjit Khaparde is_tunnel = _mm256_and_si256(cs_err_v2, 511de7432f2SAjit Khaparde _mm256_set1_epi32(BNXT_CRX_TUN_CS_CALC)); 512d58c6c07SAjit Khaparde is_tunnel = _mm256_slli_epi32(is_tunnel, 3); 513de7432f2SAjit Khaparde 514de7432f2SAjit Khaparde flags2 = _mm256_and_si256(cs_err_v2, 515de7432f2SAjit Khaparde _mm256_set1_epi32(BNXT_CRX_CQE_CSUM_CALC_MASK)); 516de7432f2SAjit Khaparde flags2 = _mm256_srli_epi64(flags2, 8); 517d58c6c07SAjit Khaparde 518d58c6c07SAjit Khaparde /* Extract errors_v2 fields for eight packets. */ 519d58c6c07SAjit Khaparde t0 = _mm256_unpackhi_epi32(rxcmp0_1, rxcmp2_3); 520d58c6c07SAjit Khaparde t1 = _mm256_unpackhi_epi32(rxcmp4_5, rxcmp6_7); 521d58c6c07SAjit Khaparde errors_v2 = _mm256_unpacklo_epi64(t0, t1); 522d58c6c07SAjit Khaparde 523de7432f2SAjit Khaparde /* Compute errors out of cs_err_v2 to index into flags table. */ 524de7432f2SAjit Khaparde errors = _mm256_and_si256(cs_err_v2, _mm256_set1_epi32(0xF0)); 525de7432f2SAjit Khaparde errors = _mm256_srli_epi32(errors, 4); 526d58c6c07SAjit Khaparde errors = _mm256_and_si256(errors, flags2); 527d58c6c07SAjit Khaparde 528d58c6c07SAjit Khaparde index = _mm256_andnot_si256(errors, flags2); 529d58c6c07SAjit Khaparde errors = _mm256_or_si256(errors, 530d58c6c07SAjit Khaparde _mm256_srli_epi32(is_tunnel, 1)); 531d58c6c07SAjit Khaparde index = _mm256_or_si256(index, is_tunnel); 532d58c6c07SAjit Khaparde 533d58c6c07SAjit Khaparde /* 534d58c6c07SAjit Khaparde * Load ol_flags for eight packets using gather. Gather 535d58c6c07SAjit Khaparde * operations have extremely high latency (~19 cycles), 536d58c6c07SAjit Khaparde * execution and use of result should be separated as much 537d58c6c07SAjit Khaparde * as possible. 538d58c6c07SAjit Khaparde */ 539d58c6c07SAjit Khaparde ol_flags = _mm256_i32gather_epi32((int *)rxr->ol_flags_table, 540d58c6c07SAjit Khaparde index, sizeof(uint32_t)); 541d58c6c07SAjit Khaparde errors = _mm256_i32gather_epi32((int *)rxr->ol_flags_err_table, 542d58c6c07SAjit Khaparde errors, sizeof(uint32_t)); 543d58c6c07SAjit Khaparde 544d58c6c07SAjit Khaparde /* 545d58c6c07SAjit Khaparde * Pack the 128-bit array of valid descriptor flags into 64 546d58c6c07SAjit Khaparde * bits and count the number of set bits in order to determine 547d58c6c07SAjit Khaparde * the number of valid descriptors. 548d58c6c07SAjit Khaparde */ 549d58c6c07SAjit Khaparde const __m256i perm_msk = 550d58c6c07SAjit Khaparde _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); 551d58c6c07SAjit Khaparde info3_v = _mm256_permutevar8x32_epi32(errors_v2, perm_msk); 552d58c6c07SAjit Khaparde info3_v = _mm256_and_si256(errors_v2, info3_v_mask); 553d58c6c07SAjit Khaparde info3_v = _mm256_xor_si256(info3_v, valid_target); 554d58c6c07SAjit Khaparde 555d58c6c07SAjit Khaparde info3_v = _mm256_packs_epi32(info3_v, _mm256_setzero_si256()); 556d58c6c07SAjit Khaparde valid = _mm_cvtsi128_si64(_mm256_extracti128_si256(info3_v, 1)); 557d58c6c07SAjit Khaparde valid = (valid << CHAR_BIT) | 558d58c6c07SAjit Khaparde _mm_cvtsi128_si64(_mm256_castsi256_si128(info3_v)); 559d58c6c07SAjit Khaparde num_valid = rte_popcount64(valid & desc_valid_mask); 560d58c6c07SAjit Khaparde 561d58c6c07SAjit Khaparde if (num_valid == 0) 562d58c6c07SAjit Khaparde break; 563d58c6c07SAjit Khaparde 564d58c6c07SAjit Khaparde /* Update mbuf rearm_data for eight packets. */ 565d58c6c07SAjit Khaparde mbuf01 = _mm256_shuffle_epi8(rxcmp0_1, shuf_msk); 566d58c6c07SAjit Khaparde mbuf23 = _mm256_shuffle_epi8(rxcmp2_3, shuf_msk); 567d58c6c07SAjit Khaparde mbuf45 = _mm256_shuffle_epi8(rxcmp4_5, shuf_msk); 568d58c6c07SAjit Khaparde mbuf67 = _mm256_shuffle_epi8(rxcmp6_7, shuf_msk); 569d58c6c07SAjit Khaparde 570d58c6c07SAjit Khaparde /* Blend in ptype field for two mbufs at a time. */ 571d58c6c07SAjit Khaparde mbuf01 = _mm256_blend_epi32(mbuf01, ptypes, 0x11); 572d58c6c07SAjit Khaparde mbuf23 = _mm256_blend_epi32(mbuf23, 573d58c6c07SAjit Khaparde _mm256_srli_si256(ptypes, 4), 0x11); 574d58c6c07SAjit Khaparde mbuf45 = _mm256_blend_epi32(mbuf45, 575d58c6c07SAjit Khaparde _mm256_srli_si256(ptypes, 8), 0x11); 576d58c6c07SAjit Khaparde mbuf67 = _mm256_blend_epi32(mbuf67, 577d58c6c07SAjit Khaparde _mm256_srli_si256(ptypes, 12), 0x11); 578d58c6c07SAjit Khaparde 579d58c6c07SAjit Khaparde /* Unpack rearm data, set fixed fields for first four mbufs. */ 580d58c6c07SAjit Khaparde rearm0 = _mm256_permute2f128_si256(mbuf_init, mbuf01, 0x20); 581d58c6c07SAjit Khaparde rearm1 = _mm256_blend_epi32(mbuf_init, mbuf01, 0xF0); 582d58c6c07SAjit Khaparde rearm2 = _mm256_permute2f128_si256(mbuf_init, mbuf23, 0x20); 583d58c6c07SAjit Khaparde rearm3 = _mm256_blend_epi32(mbuf_init, mbuf23, 0xF0); 584d58c6c07SAjit Khaparde 585d58c6c07SAjit Khaparde /* Compute final ol_flags values for eight packets. */ 586d58c6c07SAjit Khaparde rss_flags = _mm256_and_si256(flags_type, rss_mask); 587d58c6c07SAjit Khaparde rss_flags = _mm256_srli_epi32(rss_flags, 9); 588d58c6c07SAjit Khaparde ol_flags = _mm256_or_si256(ol_flags, errors); 589d58c6c07SAjit Khaparde ol_flags = _mm256_or_si256(ol_flags, rss_flags); 590d58c6c07SAjit Khaparde ol_flags_hi = _mm256_permute2f128_si256(ol_flags, 591d58c6c07SAjit Khaparde ol_flags, 0x11); 592d58c6c07SAjit Khaparde 593d58c6c07SAjit Khaparde /* Set ol_flags fields for first four packets. */ 594d58c6c07SAjit Khaparde rearm0 = _mm256_blend_epi32(rearm0, 595d58c6c07SAjit Khaparde _mm256_slli_si256(ol_flags, 8), 596d58c6c07SAjit Khaparde 0x04); 597d58c6c07SAjit Khaparde rearm1 = _mm256_blend_epi32(rearm1, 598d58c6c07SAjit Khaparde _mm256_slli_si256(ol_flags_hi, 8), 599d58c6c07SAjit Khaparde 0x04); 600d58c6c07SAjit Khaparde rearm2 = _mm256_blend_epi32(rearm2, 601d58c6c07SAjit Khaparde _mm256_slli_si256(ol_flags, 4), 602d58c6c07SAjit Khaparde 0x04); 603d58c6c07SAjit Khaparde rearm3 = _mm256_blend_epi32(rearm3, 604d58c6c07SAjit Khaparde _mm256_slli_si256(ol_flags_hi, 4), 605d58c6c07SAjit Khaparde 0x04); 606d58c6c07SAjit Khaparde 607d58c6c07SAjit Khaparde /* Store all mbuf fields for first four packets. */ 608d58c6c07SAjit Khaparde _mm256_storeu_si256((void *)&rx_pkts[i + 0]->rearm_data, 609d58c6c07SAjit Khaparde rearm0); 610d58c6c07SAjit Khaparde _mm256_storeu_si256((void *)&rx_pkts[i + 1]->rearm_data, 611d58c6c07SAjit Khaparde rearm1); 612d58c6c07SAjit Khaparde _mm256_storeu_si256((void *)&rx_pkts[i + 2]->rearm_data, 613d58c6c07SAjit Khaparde rearm2); 614d58c6c07SAjit Khaparde _mm256_storeu_si256((void *)&rx_pkts[i + 3]->rearm_data, 615d58c6c07SAjit Khaparde rearm3); 616d58c6c07SAjit Khaparde 617d58c6c07SAjit Khaparde /* Unpack rearm data, set fixed fields for final four mbufs. */ 618d58c6c07SAjit Khaparde rearm4 = _mm256_permute2f128_si256(mbuf_init, mbuf45, 0x20); 619d58c6c07SAjit Khaparde rearm5 = _mm256_blend_epi32(mbuf_init, mbuf45, 0xF0); 620d58c6c07SAjit Khaparde rearm6 = _mm256_permute2f128_si256(mbuf_init, mbuf67, 0x20); 621d58c6c07SAjit Khaparde rearm7 = _mm256_blend_epi32(mbuf_init, mbuf67, 0xF0); 622d58c6c07SAjit Khaparde 623d58c6c07SAjit Khaparde /* Set ol_flags fields for final four packets. */ 624d58c6c07SAjit Khaparde rearm4 = _mm256_blend_epi32(rearm4, ol_flags, 0x04); 625d58c6c07SAjit Khaparde rearm5 = _mm256_blend_epi32(rearm5, ol_flags_hi, 0x04); 626d58c6c07SAjit Khaparde rearm6 = _mm256_blend_epi32(rearm6, 627d58c6c07SAjit Khaparde _mm256_srli_si256(ol_flags, 4), 628d58c6c07SAjit Khaparde 0x04); 629d58c6c07SAjit Khaparde rearm7 = _mm256_blend_epi32(rearm7, 630d58c6c07SAjit Khaparde _mm256_srli_si256(ol_flags_hi, 4), 631d58c6c07SAjit Khaparde 0x04); 632d58c6c07SAjit Khaparde 633d58c6c07SAjit Khaparde /* Store all mbuf fields for final four packets. */ 634d58c6c07SAjit Khaparde _mm256_storeu_si256((void *)&rx_pkts[i + 4]->rearm_data, 635d58c6c07SAjit Khaparde rearm4); 636d58c6c07SAjit Khaparde _mm256_storeu_si256((void *)&rx_pkts[i + 5]->rearm_data, 637d58c6c07SAjit Khaparde rearm5); 638d58c6c07SAjit Khaparde _mm256_storeu_si256((void *)&rx_pkts[i + 6]->rearm_data, 639d58c6c07SAjit Khaparde rearm6); 640d58c6c07SAjit Khaparde _mm256_storeu_si256((void *)&rx_pkts[i + 7]->rearm_data, 641d58c6c07SAjit Khaparde rearm7); 642d58c6c07SAjit Khaparde 643d58c6c07SAjit Khaparde nb_rx_pkts += num_valid; 644d58c6c07SAjit Khaparde if (num_valid < BNXT_RX_DESCS_PER_LOOP_VEC256) 645d58c6c07SAjit Khaparde break; 646d58c6c07SAjit Khaparde } 647d58c6c07SAjit Khaparde 648d58c6c07SAjit Khaparde if (nb_rx_pkts) { 649d58c6c07SAjit Khaparde rxr->rx_raw_prod = RING_ADV(rxr->rx_raw_prod, nb_rx_pkts); 650d58c6c07SAjit Khaparde 651d58c6c07SAjit Khaparde rxq->rxrearm_nb += nb_rx_pkts; 652d58c6c07SAjit Khaparde cpr->cp_raw_cons += nb_rx_pkts; 653d58c6c07SAjit Khaparde bnxt_db_cq(cpr); 654d58c6c07SAjit Khaparde } 655d58c6c07SAjit Khaparde 656d58c6c07SAjit Khaparde return nb_rx_pkts; 657d58c6c07SAjit Khaparde } 658d58c6c07SAjit Khaparde 659c4e4c189SLance Richardson uint16_t 660c4e4c189SLance Richardson bnxt_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, 661c4e4c189SLance Richardson uint16_t nb_pkts) 662c4e4c189SLance Richardson { 663c4e4c189SLance Richardson uint16_t cnt = 0; 664c4e4c189SLance Richardson 665c4e4c189SLance Richardson while (nb_pkts > RTE_BNXT_MAX_RX_BURST) { 666c4e4c189SLance Richardson uint16_t burst; 667c4e4c189SLance Richardson 668c4e4c189SLance Richardson burst = recv_burst_vec_avx2(rx_queue, rx_pkts + cnt, 669c4e4c189SLance Richardson RTE_BNXT_MAX_RX_BURST); 670c4e4c189SLance Richardson 671c4e4c189SLance Richardson cnt += burst; 672c4e4c189SLance Richardson nb_pkts -= burst; 673c4e4c189SLance Richardson 674c4e4c189SLance Richardson if (burst < RTE_BNXT_MAX_RX_BURST) 675c4e4c189SLance Richardson return cnt; 676c4e4c189SLance Richardson } 677c4e4c189SLance Richardson return cnt + recv_burst_vec_avx2(rx_queue, rx_pkts + cnt, nb_pkts); 678c4e4c189SLance Richardson } 679c4e4c189SLance Richardson 680d58c6c07SAjit Khaparde uint16_t 681d58c6c07SAjit Khaparde bnxt_crx_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, 682d58c6c07SAjit Khaparde uint16_t nb_pkts) 683d58c6c07SAjit Khaparde { 684d58c6c07SAjit Khaparde uint16_t cnt = 0; 685d58c6c07SAjit Khaparde 686d58c6c07SAjit Khaparde while (nb_pkts > RTE_BNXT_MAX_RX_BURST) { 687d58c6c07SAjit Khaparde uint16_t burst; 688d58c6c07SAjit Khaparde 689d58c6c07SAjit Khaparde burst = crx_burst_vec_avx2(rx_queue, rx_pkts + cnt, 690d58c6c07SAjit Khaparde RTE_BNXT_MAX_RX_BURST); 691d58c6c07SAjit Khaparde 692d58c6c07SAjit Khaparde cnt += burst; 693d58c6c07SAjit Khaparde nb_pkts -= burst; 694d58c6c07SAjit Khaparde 695d58c6c07SAjit Khaparde if (burst < RTE_BNXT_MAX_RX_BURST) 696d58c6c07SAjit Khaparde return cnt; 697d58c6c07SAjit Khaparde } 698d58c6c07SAjit Khaparde return cnt + crx_burst_vec_avx2(rx_queue, rx_pkts + cnt, nb_pkts); 699d58c6c07SAjit Khaparde } 700d58c6c07SAjit Khaparde 701c4e4c189SLance Richardson static void 702c4e4c189SLance Richardson bnxt_handle_tx_cp_vec(struct bnxt_tx_queue *txq) 703c4e4c189SLance Richardson { 704c4e4c189SLance Richardson struct bnxt_cp_ring_info *cpr = txq->cp_ring; 705c4e4c189SLance Richardson uint32_t raw_cons = cpr->cp_raw_cons; 706c4e4c189SLance Richardson uint32_t cons; 707c4e4c189SLance Richardson uint32_t nb_tx_pkts = 0; 708c4e4c189SLance Richardson struct tx_cmpl *txcmp; 709c4e4c189SLance Richardson struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring; 710c4e4c189SLance Richardson struct bnxt_ring *cp_ring_struct = cpr->cp_ring_struct; 711c4e4c189SLance Richardson uint32_t ring_mask = cp_ring_struct->ring_mask; 712c4e4c189SLance Richardson 713c4e4c189SLance Richardson do { 714c4e4c189SLance Richardson cons = RING_CMPL(ring_mask, raw_cons); 715c4e4c189SLance Richardson txcmp = (struct tx_cmpl *)&cp_desc_ring[cons]; 716c4e4c189SLance Richardson 7175ed30db8SLance Richardson if (!bnxt_cpr_cmp_valid(txcmp, raw_cons, ring_mask + 1)) 718c4e4c189SLance Richardson break; 719c4e4c189SLance Richardson 720c4e4c189SLance Richardson nb_tx_pkts += txcmp->opaque; 721c4e4c189SLance Richardson raw_cons = NEXT_RAW_CMP(raw_cons); 722c4e4c189SLance Richardson } while (nb_tx_pkts < ring_mask); 723c4e4c189SLance Richardson 724c4e4c189SLance Richardson if (nb_tx_pkts) { 725295968d1SFerruh Yigit if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) 726c4e4c189SLance Richardson bnxt_tx_cmp_vec_fast(txq, nb_tx_pkts); 727c4e4c189SLance Richardson else 728c4e4c189SLance Richardson bnxt_tx_cmp_vec(txq, nb_tx_pkts); 729c4e4c189SLance Richardson cpr->cp_raw_cons = raw_cons; 730c4e4c189SLance Richardson bnxt_db_cq(cpr); 731c4e4c189SLance Richardson } 732c4e4c189SLance Richardson } 733c4e4c189SLance Richardson 734c4e4c189SLance Richardson static inline void 735c4e4c189SLance Richardson bnxt_xmit_one(struct rte_mbuf *mbuf, struct tx_bd_long *txbd, 736c4e4c189SLance Richardson struct rte_mbuf **tx_buf) 737c4e4c189SLance Richardson { 738c4e4c189SLance Richardson uint64_t dsc_hi, dsc_lo; 739c4e4c189SLance Richardson __m128i desc; 740c4e4c189SLance Richardson 741c4e4c189SLance Richardson *tx_buf = mbuf; 742c4e4c189SLance Richardson 743c4e4c189SLance Richardson dsc_hi = mbuf->buf_iova + mbuf->data_off; 744c4e4c189SLance Richardson dsc_lo = (mbuf->data_len << 16) | 745c4e4c189SLance Richardson bnxt_xmit_flags_len(mbuf->data_len, TX_BD_FLAGS_NOCMPL); 746c4e4c189SLance Richardson 747c4e4c189SLance Richardson desc = _mm_set_epi64x(dsc_hi, dsc_lo); 748c4e4c189SLance Richardson _mm_store_si128((void *)txbd, desc); 749c4e4c189SLance Richardson } 750c4e4c189SLance Richardson 751c4e4c189SLance Richardson static uint16_t 752c4e4c189SLance Richardson bnxt_xmit_fixed_burst_vec(struct bnxt_tx_queue *txq, struct rte_mbuf **pkts, 753c4e4c189SLance Richardson uint16_t nb_pkts) 754c4e4c189SLance Richardson { 755c4e4c189SLance Richardson struct bnxt_tx_ring_info *txr = txq->tx_ring; 756c4e4c189SLance Richardson uint16_t tx_prod, tx_raw_prod = txr->tx_raw_prod; 757c4e4c189SLance Richardson struct tx_bd_long *txbd; 758c4e4c189SLance Richardson struct rte_mbuf **tx_buf; 759c4e4c189SLance Richardson uint16_t to_send; 760c4e4c189SLance Richardson 761c4e4c189SLance Richardson tx_prod = RING_IDX(txr->tx_ring_struct, tx_raw_prod); 762c4e4c189SLance Richardson txbd = &txr->tx_desc_ring[tx_prod]; 763c4e4c189SLance Richardson tx_buf = &txr->tx_buf_ring[tx_prod]; 764c4e4c189SLance Richardson 765c4e4c189SLance Richardson /* Prefetch next transmit buffer descriptors. */ 766c4e4c189SLance Richardson rte_prefetch0(txbd); 767c4e4c189SLance Richardson rte_prefetch0(txbd + 3); 768c4e4c189SLance Richardson 769c4e4c189SLance Richardson nb_pkts = RTE_MIN(nb_pkts, bnxt_tx_avail(txq)); 770c4e4c189SLance Richardson 771c4e4c189SLance Richardson if (unlikely(nb_pkts == 0)) 772c4e4c189SLance Richardson return 0; 773c4e4c189SLance Richardson 774c4e4c189SLance Richardson /* Handle TX burst request */ 775c4e4c189SLance Richardson to_send = nb_pkts; 776c4e4c189SLance Richardson 777c4e4c189SLance Richardson /* 778c4e4c189SLance Richardson * If current descriptor is not on a 32-byte boundary, send one packet 779c4e4c189SLance Richardson * to align for 32-byte stores. 780c4e4c189SLance Richardson */ 781c4e4c189SLance Richardson if (tx_prod & 1) { 782c4e4c189SLance Richardson bnxt_xmit_one(pkts[0], txbd++, tx_buf++); 783c4e4c189SLance Richardson to_send--; 784c4e4c189SLance Richardson pkts++; 785c4e4c189SLance Richardson } 786c4e4c189SLance Richardson 787c4e4c189SLance Richardson /* 788c4e4c189SLance Richardson * Send four packets per loop, with a single store for each pair 789c4e4c189SLance Richardson * of descriptors. 790c4e4c189SLance Richardson */ 791c4e4c189SLance Richardson while (to_send >= BNXT_TX_DESCS_PER_LOOP) { 792c4e4c189SLance Richardson uint64_t dsc0_hi, dsc0_lo, dsc1_hi, dsc1_lo; 793c4e4c189SLance Richardson uint64_t dsc2_hi, dsc2_lo, dsc3_hi, dsc3_lo; 794c4e4c189SLance Richardson __m256i dsc01, dsc23; 795c4e4c189SLance Richardson 796c4e4c189SLance Richardson /* Prefetch next transmit buffer descriptors. */ 797c4e4c189SLance Richardson rte_prefetch0(txbd + 4); 798c4e4c189SLance Richardson rte_prefetch0(txbd + 7); 799c4e4c189SLance Richardson 800c4e4c189SLance Richardson /* Copy four mbuf pointers to tx buf ring. */ 801c4e4c189SLance Richardson #ifdef RTE_ARCH_X86_64 802c4e4c189SLance Richardson __m256i tmp = _mm256_loadu_si256((void *)pkts); 803c4e4c189SLance Richardson _mm256_storeu_si256((void *)tx_buf, tmp); 804c4e4c189SLance Richardson #else 805c4e4c189SLance Richardson __m128i tmp = _mm_loadu_si128((void *)pkts); 806c4e4c189SLance Richardson _mm_storeu_si128((void *)tx_buf, tmp); 807c4e4c189SLance Richardson #endif 808c4e4c189SLance Richardson 809c4e4c189SLance Richardson dsc0_hi = tx_buf[0]->buf_iova + tx_buf[0]->data_off; 810c4e4c189SLance Richardson dsc0_lo = (tx_buf[0]->data_len << 16) | 811c4e4c189SLance Richardson bnxt_xmit_flags_len(tx_buf[0]->data_len, 812c4e4c189SLance Richardson TX_BD_FLAGS_NOCMPL); 813c4e4c189SLance Richardson 814c4e4c189SLance Richardson dsc1_hi = tx_buf[1]->buf_iova + tx_buf[1]->data_off; 815c4e4c189SLance Richardson dsc1_lo = (tx_buf[1]->data_len << 16) | 816c4e4c189SLance Richardson bnxt_xmit_flags_len(tx_buf[1]->data_len, 817c4e4c189SLance Richardson TX_BD_FLAGS_NOCMPL); 818c4e4c189SLance Richardson 819c4e4c189SLance Richardson dsc01 = _mm256_set_epi64x(dsc1_hi, dsc1_lo, dsc0_hi, dsc0_lo); 820c4e4c189SLance Richardson 821c4e4c189SLance Richardson dsc2_hi = tx_buf[2]->buf_iova + tx_buf[2]->data_off; 822c4e4c189SLance Richardson dsc2_lo = (tx_buf[2]->data_len << 16) | 823c4e4c189SLance Richardson bnxt_xmit_flags_len(tx_buf[2]->data_len, 824c4e4c189SLance Richardson TX_BD_FLAGS_NOCMPL); 825c4e4c189SLance Richardson 826c4e4c189SLance Richardson dsc3_hi = tx_buf[3]->buf_iova + tx_buf[3]->data_off; 827c4e4c189SLance Richardson dsc3_lo = (tx_buf[3]->data_len << 16) | 828c4e4c189SLance Richardson bnxt_xmit_flags_len(tx_buf[3]->data_len, 829c4e4c189SLance Richardson TX_BD_FLAGS_NOCMPL); 830c4e4c189SLance Richardson 831c4e4c189SLance Richardson dsc23 = _mm256_set_epi64x(dsc3_hi, dsc3_lo, dsc2_hi, dsc2_lo); 832c4e4c189SLance Richardson 833c4e4c189SLance Richardson _mm256_store_si256((void *)txbd, dsc01); 834c4e4c189SLance Richardson _mm256_store_si256((void *)(txbd + 2), dsc23); 835c4e4c189SLance Richardson 836c4e4c189SLance Richardson to_send -= BNXT_TX_DESCS_PER_LOOP; 837c4e4c189SLance Richardson pkts += BNXT_TX_DESCS_PER_LOOP; 838c4e4c189SLance Richardson txbd += BNXT_TX_DESCS_PER_LOOP; 839c4e4c189SLance Richardson tx_buf += BNXT_TX_DESCS_PER_LOOP; 840c4e4c189SLance Richardson } 841c4e4c189SLance Richardson 842c4e4c189SLance Richardson /* Send any remaining packets, writing each descriptor individually. */ 843c4e4c189SLance Richardson while (to_send) { 844c4e4c189SLance Richardson bnxt_xmit_one(pkts[0], txbd++, tx_buf++); 845c4e4c189SLance Richardson to_send--; 846c4e4c189SLance Richardson pkts++; 847c4e4c189SLance Richardson } 848c4e4c189SLance Richardson 849c4e4c189SLance Richardson /* Request a completion for the final packet of the burst. */ 850c4e4c189SLance Richardson txbd[-1].opaque = nb_pkts; 851c4e4c189SLance Richardson txbd[-1].flags_type &= ~TX_BD_LONG_FLAGS_NO_CMPL; 852c4e4c189SLance Richardson 853c4e4c189SLance Richardson tx_raw_prod += nb_pkts; 854c4e4c189SLance Richardson bnxt_db_write(&txr->tx_db, tx_raw_prod); 855c4e4c189SLance Richardson 856c4e4c189SLance Richardson txr->tx_raw_prod = tx_raw_prod; 857c4e4c189SLance Richardson 858c4e4c189SLance Richardson return nb_pkts; 859c4e4c189SLance Richardson } 860c4e4c189SLance Richardson 861c4e4c189SLance Richardson uint16_t 862c4e4c189SLance Richardson bnxt_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, 863c4e4c189SLance Richardson uint16_t nb_pkts) 864c4e4c189SLance Richardson { 865c4e4c189SLance Richardson int nb_sent = 0; 866c4e4c189SLance Richardson struct bnxt_tx_queue *txq = tx_queue; 867c4e4c189SLance Richardson struct bnxt_tx_ring_info *txr = txq->tx_ring; 868c4e4c189SLance Richardson uint16_t ring_size = txr->tx_ring_struct->ring_size; 869c4e4c189SLance Richardson 870c4e4c189SLance Richardson /* Tx queue was stopped; wait for it to be restarted */ 871c4e4c189SLance Richardson if (unlikely(!txq->tx_started)) { 872*e99981afSDavid Marchand PMD_DRV_LOG_LINE(DEBUG, "Tx q stopped;return"); 873c4e4c189SLance Richardson return 0; 874c4e4c189SLance Richardson } 875c4e4c189SLance Richardson 876c4e4c189SLance Richardson /* Handle TX completions */ 877c4e4c189SLance Richardson if (bnxt_tx_bds_in_hw(txq) >= txq->tx_free_thresh) 878c4e4c189SLance Richardson bnxt_handle_tx_cp_vec(txq); 879c4e4c189SLance Richardson 880c4e4c189SLance Richardson while (nb_pkts) { 881c4e4c189SLance Richardson uint16_t ret, num; 882c4e4c189SLance Richardson 883c4e4c189SLance Richardson /* 884c4e4c189SLance Richardson * Ensure that no more than RTE_BNXT_MAX_TX_BURST packets 885c4e4c189SLance Richardson * are transmitted before the next completion. 886c4e4c189SLance Richardson */ 887c4e4c189SLance Richardson num = RTE_MIN(nb_pkts, RTE_BNXT_MAX_TX_BURST); 888c4e4c189SLance Richardson 889c4e4c189SLance Richardson /* 890c4e4c189SLance Richardson * Ensure that a ring wrap does not occur within a call to 891c4e4c189SLance Richardson * bnxt_xmit_fixed_burst_vec(). 892c4e4c189SLance Richardson */ 893c4e4c189SLance Richardson num = RTE_MIN(num, ring_size - 894c4e4c189SLance Richardson (txr->tx_raw_prod & (ring_size - 1))); 895c4e4c189SLance Richardson ret = bnxt_xmit_fixed_burst_vec(txq, &tx_pkts[nb_sent], num); 896c4e4c189SLance Richardson nb_sent += ret; 897c4e4c189SLance Richardson nb_pkts -= ret; 898c4e4c189SLance Richardson if (ret < num) 899c4e4c189SLance Richardson break; 900c4e4c189SLance Richardson } 901c4e4c189SLance Richardson 902c4e4c189SLance Richardson return nb_sent; 903c4e4c189SLance Richardson } 904