/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include "ice_rxtx_vec_common.h"
#include "ice_rxtx_common_avx.h"

#include <rte_vect.h>

static __rte_always_inline void
ice_rxq_rearm(struct ice_rx_queue *rxq)
{
	return ice_rxq_rearm_common(rxq, false);
}

static __rte_always_inline __m256i
ice_flex_rxd_to_fdir_flags_vec_avx2(const __m256i fdir_id0_7)
{
#define FDID_MIS_MAGIC 0xFFFFFFFF
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
	const __m256i pkt_fdir_bit = _mm256_set1_epi32(RTE_MBUF_F_RX_FDIR |
			RTE_MBUF_F_RX_FDIR_ID);
	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
	const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
	__m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
			fdir_mis_mask);
	/* this XOR op inverts fdir_mask: lanes with a valid flow_id become all-ones */
	fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
	const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);

	return fdir_flags;
}

static __rte_always_inline uint16_t
_ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
			    uint16_t nb_pkts, uint8_t *split_packet,
			    bool offload)
{
#define ICE_DESCS_PER_LOOP_AVX 8

	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
			0, rxq->mbuf_initializer);
	struct ice_rx_entry *sw_ring = &rxq->sw_ring[rxq->rx_tail];
	volatile union ice_rx_flex_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
	const int avx_aligned = ((rxq->rx_tail & 1) == 0);

	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
		ice_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.status_error0 &
			rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
		return 0;

	/* constants used in processing loop */
	const __m256i crc_adjust =
		_mm256_set_epi16
			(/* first descriptor */
			 0, 0, 0,       /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 0,             /* ignore high-16bits of pkt_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0, 0,          /* ignore pkt_type field */
			 /* second descriptor */
			 0, 0, 0,       /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 0,             /* ignore high-16bits of pkt_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0, 0           /* ignore pkt_type field */
			);

	/* 8 packets DD mask, LSB in each 32-bit value */
	const __m256i dd_check = _mm256_set1_epi32(1);

	/* 8 packets EOP mask, second-LSB in each 32-bit value */
	const __m256i eop_check = _mm256_slli_epi32(dd_check,
			ICE_RX_DESC_STATUS_EOF_S);

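	/*
	 * Note: _mm256_shuffle_epi8 works independently on each 128-bit lane,
	 * so the mask below carries the same byte pattern twice, once per
	 * descriptor packed into a 256-bit register.
	 */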
	/* mask to shuffle from desc. to mbuf (2 descriptors) */
	const __m256i shuf_msk =
		_mm256_set_epi8
			(/* first descriptor */
			 0xFF, 0xFF,
			 0xFF, 0xFF,  /* rss hash parsed separately */
			 11, 10,      /* octet 10~11, 16 bits vlan_macip */
			 5, 4,        /* octet 4~5, 16 bits data_len */
			 0xFF, 0xFF,  /* skip hi 16 bits pkt_len, zero out */
			 5, 4,        /* octet 4~5, 16 bits pkt_len */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 /* second descriptor */
			 0xFF, 0xFF,
			 0xFF, 0xFF,  /* rss hash parsed separately */
			 11, 10,      /* octet 10~11, 16 bits vlan_macip */
			 5, 4,        /* octet 4~5, 16 bits data_len */
			 0xFF, 0xFF,  /* skip hi 16 bits pkt_len, zero out */
			 5, 4,        /* octet 4~5, 16 bits pkt_len */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 0xFF, 0xFF   /* pkt_type set as unknown */
			);
	/**
	 * compile-time check the above crc and shuffle layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi
	 * calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Status/Error flag masks */
	/**
	 * mask everything except Checksum Reports, RSS indication
	 * and VLAN indication.
	 * bit6:4 for IP/L4 checksum errors.
	 * bit12 is for RSS indication.
	 * bit13 is for VLAN indication.
	 */
	const __m256i flags_mask =
		_mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13));
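	/*
	 * After masking with flags_mask and shifting right by 4, the four
	 * checksum status bits end up in the low nibble of each 32-bit
	 * element and serve as the per-byte shuffle index (0..15) into the
	 * table below. The table entries are pre-shifted right by 1 (and the
	 * outer-L4 bits by a further 20) so each value fits in a byte; those
	 * shifts are undone after the lookup.
	 */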
	/**
	 * data to be shuffled by the result of the flags mask shifted by 4
	 * bits. This gives us the l3_l4 flags.
	 */
	const __m256i l3_l4_flags_shuf =
		_mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		/**
		 * second 128-bits
		 * shift right 20 bits to use the low two bits to indicate
		 * outer checksum status
		 * shift right 1 bit to make sure it does not exceed 255
		 */
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
	const __m256i cksum_mask =
		_mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
				  RTE_MBUF_F_RX_L4_CKSUM_MASK |
				  RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
				  RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK);
	/**
	 * data to be shuffled by result of flag mask, shifted down 12.
	 * If RSS(bit12)/VLAN(bit13) are set,
	 * shuffle moves appropriate flags in place.
	 */
	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0,
			/* second 128-bits */
			0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0);

	RTE_SET_USED(avx_aligned); /* for 32B descriptors we don't use this */

	uint16_t i, received;

	for (i = 0, received = 0; i < nb_pkts;
	     i += ICE_DESCS_PER_LOOP_AVX,
	     rxdp += ICE_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				    _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		_mm256_storeu_si256
			((void *)&rx_pkts[i + 4],
			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

		const __m128i raw_desc7 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 7));
		rte_compiler_barrier();
		const __m128i raw_desc6 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 6));
		rte_compiler_barrier();
		const __m128i raw_desc5 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 5));
		rte_compiler_barrier();
		const __m128i raw_desc4 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 4));
		rte_compiler_barrier();
		const __m128i raw_desc3 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 3));
		rte_compiler_barrier();
		const __m128i raw_desc2 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 2));
		rte_compiler_barrier();
		const __m128i raw_desc1 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 1));
		rte_compiler_barrier();
		const __m128i raw_desc0 = _mm_load_si128
			(RTE_CAST_PTR(const __m128i *, rxdp + 0));

		const __m256i raw_desc6_7 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(raw_desc6), raw_desc7, 1);
		const __m256i raw_desc4_5 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(raw_desc4), raw_desc5, 1);
		const __m256i raw_desc2_3 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(raw_desc2), raw_desc3, 1);
		const __m256i raw_desc0_1 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(raw_desc0), raw_desc1, 1);

		if (split_packet) {
			int j;

			for (j = 0; j < ICE_DESCS_PER_LOOP_AVX; j++)
				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
		}

		/**
		 * convert descriptors 4-7 into mbufs, re-arrange fields.
		 * Then write into the mbuf.
		 */
		__m256i mb6_7 = _mm256_shuffle_epi8(raw_desc6_7, shuf_msk);
		__m256i mb4_5 = _mm256_shuffle_epi8(raw_desc4_5, shuf_msk);

		mb6_7 = _mm256_add_epi16(mb6_7, crc_adjust);
		mb4_5 = _mm256_add_epi16(mb4_5, crc_adjust);
		/**
		 * to get packet types, ptype is located in bit16-25
		 * of each 128bits
		 */
		const __m256i ptype_mask =
			_mm256_set1_epi16(ICE_RX_FLEX_DESC_PTYPE_M);
		const __m256i ptypes6_7 =
			_mm256_and_si256(raw_desc6_7, ptype_mask);
		const __m256i ptypes4_5 =
			_mm256_and_si256(raw_desc4_5, ptype_mask);
		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);

		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype7], 4);
		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype6], 0);
		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype5], 4);
		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype4], 0);
		/* merge the status bits into one register */
		const __m256i status4_7 = _mm256_unpackhi_epi32(raw_desc6_7,
								raw_desc4_5);

		/**
		 * convert descriptors 0-3 into mbufs, re-arrange fields.
		 * Then write into the mbuf.
		 */
		__m256i mb2_3 = _mm256_shuffle_epi8(raw_desc2_3, shuf_msk);
		__m256i mb0_1 = _mm256_shuffle_epi8(raw_desc0_1, shuf_msk);

		mb2_3 = _mm256_add_epi16(mb2_3, crc_adjust);
		mb0_1 = _mm256_add_epi16(mb0_1, crc_adjust);
		/**
		 * to get packet types, ptype is located in bit16-25
		 * of each 128bits
		 */
		const __m256i ptypes2_3 =
			_mm256_and_si256(raw_desc2_3, ptype_mask);
		const __m256i ptypes0_1 =
			_mm256_and_si256(raw_desc0_1, ptype_mask);
		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);

		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype3], 4);
		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype2], 0);
		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype1], 4);
		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype0], 0);
		/* merge the status bits into one register */
		const __m256i status0_3 = _mm256_unpackhi_epi32(raw_desc2_3,
								raw_desc0_1);

		/**
		 * take the two sets of status bits and merge to one
		 * After merge, the packet status flags are in the
		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
		 */
		__m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
							  status0_3);
		__m256i mbuf_flags = _mm256_set1_epi32(0);

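		/*
		 * 'offload' is a compile-time constant at every call site (see
		 * the exported wrappers at the end of this file), so with the
		 * forced inlining the offload-only work below is expected to
		 * be compiled out of the non-offload paths.
		 */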
		if (offload) {
			/* now do flag manipulation */

			/* get only flag/error bits we want */
			const __m256i flag_bits =
				_mm256_and_si256(status0_7, flags_mask);
			/**
			 * l3_l4 error flags: shuffle, then shift to correct the
			 * adjustment applied in l3_l4_flags_shuf, and finally
			 * mask out extra bits.
			 */
			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
					_mm256_srli_epi32(flag_bits, 4));
			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);

			__m256i l4_outer_mask = _mm256_set1_epi32(0x6);
			__m256i l4_outer_flags =
				_mm256_and_si256(l3_l4_flags, l4_outer_mask);
			l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);

			__m256i l3_l4_mask = _mm256_set1_epi32(~0x6);

			l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);
			l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);
			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
			/* set rss and vlan flags */
			const __m256i rss_vlan_flag_bits =
				_mm256_srli_epi32(flag_bits, 12);
			const __m256i rss_vlan_flags =
				_mm256_shuffle_epi8(rss_vlan_flags_shuf,
						    rss_vlan_flag_bits);

			/* merge flags */
			mbuf_flags = _mm256_or_si256(l3_l4_flags,
						     rss_vlan_flags);
		}

		if (rxq->fdir_enabled) {
			const __m256i fdir_id4_7 =
				_mm256_unpackhi_epi32(raw_desc6_7, raw_desc4_5);

			const __m256i fdir_id0_3 =
				_mm256_unpackhi_epi32(raw_desc2_3, raw_desc0_1);

			const __m256i fdir_id0_7 =
				_mm256_unpackhi_epi64(fdir_id4_7, fdir_id0_3);

			const __m256i fdir_flags =
				ice_flex_rxd_to_fdir_flags_vec_avx2(fdir_id0_7);

			/* merge with fdir_flags */
			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags);

			/* write to mbuf: have to use scalar store here */
			rx_pkts[i + 0]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 3);

			rx_pkts[i + 1]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 7);

			rx_pkts[i + 2]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 2);

			rx_pkts[i + 3]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 6);

			rx_pkts[i + 4]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 1);

			rx_pkts[i + 5]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 5);

			rx_pkts[i + 6]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 0);

			rx_pkts[i + 7]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 4);
		} /* if() on fdir_enabled */

		if (offload) {
#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
			/**
			 * needs to load 2nd 16B of each desc for RSS hash parsing;
			 * getting into this branch causes a performance drop.
			 */
			if (rxq->vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
					RTE_ETH_RX_OFFLOAD_RSS_HASH) {
				/* load bottom half of every 32B desc */
				const __m128i raw_desc_bh7 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[7].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh6 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[6].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh5 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[5].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh4 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[4].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh3 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[3].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh2 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[2].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh1 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[1].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh0 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[0].wb.status_error1));

				__m256i raw_desc_bh6_7 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh6),
						 raw_desc_bh7, 1);
				__m256i raw_desc_bh4_5 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh4),
						 raw_desc_bh5, 1);
				__m256i raw_desc_bh2_3 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh2),
						 raw_desc_bh3, 1);
				__m256i raw_desc_bh0_1 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh0),
						 raw_desc_bh1, 1);

				/**
				 * to shift the 32b RSS hash value to the
				 * highest 32b of each 128b before mask
				 */
				__m256i rss_hash6_7 =
					_mm256_slli_epi64(raw_desc_bh6_7, 32);
				__m256i rss_hash4_5 =
					_mm256_slli_epi64(raw_desc_bh4_5, 32);
				__m256i rss_hash2_3 =
					_mm256_slli_epi64(raw_desc_bh2_3, 32);
				__m256i rss_hash0_1 =
					_mm256_slli_epi64(raw_desc_bh0_1, 32);

				__m256i rss_hash_msk =
					_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
							 0xFFFFFFFF, 0, 0, 0);

				rss_hash6_7 = _mm256_and_si256
						(rss_hash6_7, rss_hash_msk);
				rss_hash4_5 = _mm256_and_si256
						(rss_hash4_5, rss_hash_msk);
				rss_hash2_3 = _mm256_and_si256
						(rss_hash2_3, rss_hash_msk);
				rss_hash0_1 = _mm256_and_si256
						(rss_hash0_1, rss_hash_msk);

				mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
				mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
				mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
				mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
			} /* if() on RSS hash parsing */
#endif
		}

		/**
		 * At this point, we have the 8 sets of flags in the low 16-bits
		 * of each 32-bit value in mbuf_flags.
		 * We want to extract these, and merge them with the mbuf init
		 * data so we can do a single write to the mbuf to set the flags
		 * and all the other initialization fields. Extracting the
		 * appropriate flags means that we have to do a shift and blend
		 * for each mbuf before we do the write. However, we can also
		 * add in the previously computed rx_descriptor fields to
		 * make a single 256-bit write per mbuf.
		 */
		/* check the structure matches expectations */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
				 offsetof(struct rte_mbuf, rearm_data) + 8);
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
				 RTE_ALIGN(offsetof(struct rte_mbuf,
						    rearm_data),
					   16));
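		/*
		 * Blend mask 0x04 selects 32-bit lane 2 from the (shifted) flag
		 * register, i.e. bytes 8-11 of rearm_data, which is the low half
		 * of ol_flags (checked by the RTE_BUILD_BUG_ON above); the upper
		 * half of ol_flags stays zero from mbuf_init.
		 */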
		/* build up data and do writes */
		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
			rearm6, rearm7;
		rearm6 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 8),
					    0x04);
		rearm4 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 4),
					    0x04);
		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
		rearm0 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(mbuf_flags, 4),
					    0x04);
		/* permute to add in the rx_descriptor e.g. rss fields */
		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
		/* write to mbuf */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
				    rearm6);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
				    rearm4);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
				    rearm2);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
				    rearm0);

		/* repeat for the odd mbufs */
		const __m256i odd_flags =
			_mm256_castsi128_si256
				(_mm256_extracti128_si256(mbuf_flags, 1));
		rearm7 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 8),
					    0x04);
		rearm5 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 4),
					    0x04);
		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
		rearm1 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(odd_flags, 4),
					    0x04);
		/* since odd mbufs are already in hi 128-bits use blend */
		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
		/* again write to mbufs */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
				    rearm7);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
				    rearm5);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
				    rearm3);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
				    rearm1);

		/* extract and record EOP bit */
		if (split_packet) {
			const __m128i eop_mask =
				_mm_set1_epi16(1 << ICE_RX_DESC_STATUS_EOF_S);
			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
								     eop_check);
			/* pack status bits into a single 128-bit register */
			const __m128i eop_bits =
				_mm_packus_epi32
					(_mm256_castsi256_si128(eop_bits256),
					 _mm256_extractf128_si256(eop_bits256,
								  1));
			/**
			 * flip bits, and mask out the EOP bit, which is now
			 * a split-packet bit, i.e. !EOP, rather than an EOP one.
			 */
			__m128i split_bits = _mm_andnot_si128(eop_bits,
							      eop_mask);
			/**
			 * eop bits are out of order, so we need to shuffle them
			 * back into order again. In doing so, only use low 8
			 * bits, which acts like another pack instruction.
			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
			 * [Since we use epi8, the 16-bit positions are
			 * multiplied by 2 in the eop_shuffle value.]
			 */
			__m128i eop_shuffle =
				_mm_set_epi8(/* zero hi 64b */
					     0xFF, 0xFF, 0xFF, 0xFF,
					     0xFF, 0xFF, 0xFF, 0xFF,
					     /* move values to lo 64b */
					     8, 0, 10, 2,
					     12, 4, 14, 6);
			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
			*(uint64_t *)split_packet =
				_mm_cvtsi128_si64(split_bits);
			split_packet += ICE_DESCS_PER_LOOP_AVX;
		}

		/* perform dd_check */
		status0_7 = _mm256_and_si256(status0_7, dd_check);
		status0_7 = _mm256_packs_epi32(status0_7,
					       _mm256_setzero_si256());

		uint64_t burst = rte_popcount64
					(_mm_cvtsi128_si64
						(_mm256_extracti128_si256
							(status0_7, 1)));
		burst += rte_popcount64
				(_mm_cvtsi128_si64
					(_mm256_castsi256_si128(status0_7)));
		received += burst;
		if (burst != ICE_DESCS_PER_LOOP_AVX)
			break;
	}

	/* update tail pointers */
	rxq->rx_tail += received;
	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
		rxq->rx_tail--;
		received--;
	}
	rxq->rxrearm_nb += received;
	return received;
}

/**
 * Notice:
 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 */
uint16_t
ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
		       uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts,
					   nb_pkts, NULL, false);
}

uint16_t
ice_recv_pkts_vec_avx2_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
			       uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts,
					   nb_pkts, NULL, true);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 * Notice:
 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 */
static __rte_always_inline uint16_t
ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
				  uint16_t nb_pkts, bool offload)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
						       split_flags, offload);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
					    &rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 * Main receive routine that can handle arbitrary burst sizes
 * Notice:
 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 */
static __rte_always_inline uint16_t
ice_recv_scattered_pkts_vec_avx2_common(void *rx_queue,
					struct rte_mbuf **rx_pkts,
					uint16_t nb_pkts,
					bool offload)
{
	uint16_t retval = 0;

	while (nb_pkts > ICE_VPMD_RX_BURST) {
		uint16_t burst = ice_recv_scattered_burst_vec_avx2(rx_queue,
				rx_pkts + retval, ICE_VPMD_RX_BURST, offload);
		retval += burst;
		nb_pkts -= burst;
		if (burst < ICE_VPMD_RX_BURST)
			return retval;
	}
	return retval + ice_recv_scattered_burst_vec_avx2(rx_queue,
				rx_pkts + retval, nb_pkts, offload);
}

uint16_t
ice_recv_scattered_pkts_vec_avx2(void *rx_queue,
				 struct rte_mbuf **rx_pkts,
				 uint16_t nb_pkts)
{
	return ice_recv_scattered_pkts_vec_avx2_common(rx_queue,
						       rx_pkts,
						       nb_pkts,
						       false);
}

uint16_t
ice_recv_scattered_pkts_vec_avx2_offload(void *rx_queue,
					 struct rte_mbuf **rx_pkts,
					 uint16_t nb_pkts)
{
	return ice_recv_scattered_pkts_vec_avx2_common(rx_queue,
						       rx_pkts,
						       nb_pkts,
						       true);
}

static __rte_always_inline void
ice_vtx1(volatile struct ice_tx_desc *txdp,
	 struct rte_mbuf *pkt, uint64_t flags, bool offload)
{
	uint64_t high_qw =
		(ICE_TX_DESC_DTYPE_DATA |
		 ((uint64_t)flags << ICE_TXD_QW1_CMD_S) |
		 ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));
	if (offload)
		ice_txd_enable_offload(pkt, &high_qw);

	__m128i descriptor = _mm_set_epi64x(high_qw, rte_pktmbuf_iova(pkt));
	_mm_store_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
}

static __rte_always_inline void
ice_vtx(volatile struct ice_tx_desc *txdp,
	struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags, bool offload)
{
	const uint64_t hi_qw_tmpl = (ICE_TX_DESC_DTYPE_DATA |
			((uint64_t)flags << ICE_TXD_QW1_CMD_S));

	/* if unaligned on 32-byte boundary, do one to align */
	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
		ice_vtx1(txdp, *pkt, flags, offload);
		nb_pkts--, txdp++, pkt++;
	}

	/* do four at a time while possible, as two 256-bit stores of two descriptors each */
	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
		uint64_t hi_qw3 =
			hi_qw_tmpl |
			((uint64_t)pkt[3]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (offload)
			ice_txd_enable_offload(pkt[3], &hi_qw3);
		uint64_t hi_qw2 =
			hi_qw_tmpl |
			((uint64_t)pkt[2]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (offload)
			ice_txd_enable_offload(pkt[2], &hi_qw2);
		uint64_t hi_qw1 =
			hi_qw_tmpl |
			((uint64_t)pkt[1]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (offload)
			ice_txd_enable_offload(pkt[1], &hi_qw1);
		uint64_t hi_qw0 =
			hi_qw_tmpl |
			((uint64_t)pkt[0]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (offload)
			ice_txd_enable_offload(pkt[0], &hi_qw0);

		__m256i desc2_3 =
			_mm256_set_epi64x
				(hi_qw3, rte_pktmbuf_iova(pkt[3]),
				 hi_qw2, rte_pktmbuf_iova(pkt[2]));
		__m256i desc0_1 =
			_mm256_set_epi64x
				(hi_qw1, rte_pktmbuf_iova(pkt[1]),
				 hi_qw0, rte_pktmbuf_iova(pkt[0]));
		_mm256_store_si256(RTE_CAST_PTR(__m256i *, txdp + 2), desc2_3);
		_mm256_store_si256(RTE_CAST_PTR(__m256i *, txdp), desc0_1);
	}

	/* do any last ones */
	while (nb_pkts) {
		ice_vtx1(txdp, *pkt, flags, offload);
		txdp++, pkt++, nb_pkts--;
	}
}

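/**
 * Transmit a burst of at most tx_rs_thresh packets: free completed
 * descriptors if needed, write the new data descriptors with
 * ice_vtx()/ice_vtx1() (wrapping once at the end of the ring), and
 * bump the tail register a single time at the end.
 */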
static __rte_always_inline uint16_t
ice_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
			      uint16_t nb_pkts, bool offload)
{
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
	volatile struct ice_tx_desc *txdp;
	struct ci_tx_entry_vec *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = ICE_TD_CMD;
	uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;

	/* cross tx_rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

	if (txq->nb_tx_free < txq->tx_free_thresh)
		ci_tx_free_bufs_vec(txq, ice_tx_desc_done, false);

	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->ice_tx_ring[tx_id];
	txep = &txq->sw_ring_vec[tx_id];

	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		ci_tx_backlog_entry_vec(txep, tx_pkts, n);

		ice_vtx(txdp, tx_pkts, n - 1, flags, offload);
		tx_pkts += (n - 1);
		txdp += (n - 1);

		ice_vtx1(txdp, *tx_pkts++, rs, offload);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->ice_tx_ring[tx_id];
		txep = &txq->sw_ring_vec[tx_id];
	}

	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);

	ice_vtx(txdp, tx_pkts, nb_commit, flags, offload);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->tx_next_rs) {
		txq->ice_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
					 ICE_TXD_QW1_CMD_S);
		txq->tx_next_rs =
			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
	}

	txq->tx_tail = tx_id;

	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

static __rte_always_inline uint16_t
ice_xmit_pkts_vec_avx2_common(void *tx_queue, struct rte_mbuf **tx_pkts,
			      uint16_t nb_pkts, bool offload)
{
	uint16_t nb_tx = 0;
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = ice_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx],
						    num, offload);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}

uint16_t
ice_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
		       uint16_t nb_pkts)
{
	return ice_xmit_pkts_vec_avx2_common(tx_queue, tx_pkts, nb_pkts, false);
}

uint16_t
ice_xmit_pkts_vec_avx2_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
			       uint16_t nb_pkts)
{
	return ice_xmit_pkts_vec_avx2_common(tx_queue, tx_pkts, nb_pkts, true);
}