/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include "ice_rxtx_vec_common.h"
#include "ice_rxtx_common_avx.h"

#include <rte_vect.h>

#define ICE_DESCS_PER_LOOP_AVX 8

static __rte_always_inline void
ice_rxq_rearm(struct ice_rx_queue *rxq)
{
	ice_rxq_rearm_common(rxq, true);
}

static inline __m256i
ice_flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
{
#define FDID_MIS_MAGIC 0xFFFFFFFF
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
	const __m256i pkt_fdir_bit = _mm256_set1_epi32(RTE_MBUF_F_RX_FDIR |
			RTE_MBUF_F_RX_FDIR_ID);
	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
	const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
	__m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
			fdir_mis_mask);
	/* this XOR op inverts fdir_mask, so matched descriptors become all-ones */
	fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
	const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);

	return fdir_flags;
}

static __rte_always_inline uint16_t
_ice_recv_raw_pkts_vec_avx512(struct ice_rx_queue *rxq,
			      struct rte_mbuf **rx_pkts,
			      uint16_t nb_pkts,
			      uint8_t *split_packet,
			      bool do_offload)
{
	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
			0, rxq->mbuf_initializer);
	struct ice_rx_entry *sw_ring = &rxq->sw_ring[rxq->rx_tail];
	volatile union ice_rx_flex_desc *rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
		ice_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.status_error0 &
			rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
		return 0;

	/* constants used in processing loop */
	const __m512i crc_adjust =
		_mm512_set4_epi32
			(0,             /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0              /* ignore non-length fields */
			);

	/* 8 packets DD mask, LSB in each 32-bit value */
	const __m256i dd_check = _mm256_set1_epi32(1);

	/* 8 packets EOP mask, second-LSB in each 32-bit value */
	const __m256i eop_check = _mm256_slli_epi32(dd_check,
			ICE_RX_DESC_STATUS_EOF_S);

	/* mask to shuffle from desc. to mbuf (4 descriptors) */
	const __m512i shuf_msk =
		_mm512_set4_epi32
			(/* rss hash parsed separately */
			 0xFFFFFFFF,
			 /* octet 10~11, 16 bits vlan_macip */
			 /* octet 4~5, 16 bits data_len */
			 11 << 24 | 10 << 16 | 5 << 8 | 4,
			 /* skip hi 16 bits pkt_len, zero out */
			 /* octet 4~5, 16 bits pkt_len */
			 0xFFFF << 16 | 5 << 8 | 4,
			 /* pkt_type set as unknown */
			 0xFFFFFFFF
			);
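	/*
	 * Layout note: _mm512_shuffle_epi8 works per 128-bit lane, so each
	 * 32-bit element of shuf_msk above selects bytes out of one
	 * descriptor: descriptor bytes 4~5 (pkt_len) are copied into both
	 * mbuf pkt_len and data_len, bytes 10~11 (l2tag1) land in vlan_tci,
	 * and 0xFF selectors zero the destination byte (pkt_type and the
	 * RSS hash are filled in separately later in the loop).
	 */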
	/**
	 * compile-time check that the above crc and shuffle layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi
	 * calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* following code block is for Rx Checksum Offload */
	/* Status/Error flag masks */
	/**
	 * mask everything except Checksum Reports, RSS indication
	 * and VLAN indication.
	 * bit6:4 for IP/L4 checksum errors.
	 * bit12 is for RSS indication.
	 * bit13 is for VLAN indication.
	 */
	const __m256i flags_mask =
		_mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13));
	/**
	 * data to be shuffled by the result of the flags mask shifted by 4
	 * bits. This gives us the l3_l4 flags.
	 */
	const __m256i l3_l4_flags_shuf =
		_mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
		 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		/**
		 * second 128-bits
		 * shift right 20 bits to use the low two bits to indicate
		 * outer checksum status
		 * shift right 1 bit to make sure it does not exceed 255
		 */
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
	const __m256i cksum_mask =
		_mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
				  RTE_MBUF_F_RX_L4_CKSUM_MASK |
				  RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
				  RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK);
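	/*
	 * Note on the table above: each entry is pre-shifted right by 1 so
	 * the combined flags fit in a single byte for the epi8 shuffle, and
	 * the outer-L4 checksum status is packed down into bits 1-2 via the
	 * >> 20.  The receive loop undoes this by shifting the shuffled
	 * result left by 1 and moving bits 1-2 back up by 20 into the real
	 * mbuf flag positions.
	 */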
	/**
	 * data to be shuffled by result of flag mask, shifted down 12.
	 * If RSS(bit12)/VLAN(bit13) are set,
	 * shuffle moves appropriate flags in place.
	 */
	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0,
			/* 2nd 128-bits */
			0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_RSS_HASH, 0);

	uint16_t i, received;

	for (i = 0, received = 0; i < nb_pkts;
	     i += ICE_DESCS_PER_LOOP_AVX,
	     rxdp += ICE_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				_mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		_mm256_storeu_si256
			((void *)&rx_pkts[i + 4],
			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

		__m512i raw_desc0_3, raw_desc4_7;
		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
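		/*
		 * Note: hardware is expected to write descriptors back in
		 * ascending order, so the 16B loads below read them in
		 * descending order (7 down to 0) with compiler barriers in
		 * between.  The barriers keep the compiler from reordering
		 * or merging the loads, so the DD bits counted at the end of
		 * the loop describe a prefix of fully written descriptors.
		 */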
		/* load in descriptors, in reverse order */
		const __m128i raw_desc7 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7));
		rte_compiler_barrier();
		const __m128i raw_desc6 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6));
		rte_compiler_barrier();
		const __m128i raw_desc5 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5));
		rte_compiler_barrier();
		const __m128i raw_desc4 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4));
		rte_compiler_barrier();
		const __m128i raw_desc3 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
		rte_compiler_barrier();
		const __m128i raw_desc2 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
		rte_compiler_barrier();
		const __m128i raw_desc1 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
		rte_compiler_barrier();
		const __m128i raw_desc0 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0));

		raw_desc6_7 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc6),
				 raw_desc7, 1);
		raw_desc4_5 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc4),
				 raw_desc5, 1);
		raw_desc2_3 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc2),
				 raw_desc3, 1);
		raw_desc0_1 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc0),
				 raw_desc1, 1);

		raw_desc4_7 =
			_mm512_inserti64x4
				(_mm512_castsi256_si512(raw_desc4_5),
				 raw_desc6_7, 1);
		raw_desc0_3 =
			_mm512_inserti64x4
				(_mm512_castsi256_si512(raw_desc0_1),
				 raw_desc2_3, 1);

		if (split_packet) {
			int j;

			for (j = 0; j < ICE_DESCS_PER_LOOP_AVX; j++)
				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
		}

		/**
		 * convert descriptors 0-7 into mbufs, re-arrange fields.
		 * Then write into the mbuf.
		 */
		__m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
		__m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);

		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);

		/**
		 * to get packet types, the ptype is located in bits 16-25
		 * of each 128-bit lane
		 */
		const __m512i ptype_mask =
			_mm512_set1_epi16(ICE_RX_FLEX_DESC_PTYPE_M);

		const __m512i ptypes4_7 =
			_mm512_and_si512(raw_desc4_7, ptype_mask);
		const __m512i ptypes0_3 =
			_mm512_and_si512(raw_desc0_3, ptype_mask);

		const __m256i ptypes6_7 =
			_mm512_extracti64x4_epi64(ptypes4_7, 1);
		const __m256i ptypes4_5 =
			_mm512_extracti64x4_epi64(ptypes4_7, 0);
		const __m256i ptypes2_3 =
			_mm512_extracti64x4_epi64(ptypes0_3, 1);
		const __m256i ptypes0_1 =
			_mm512_extracti64x4_epi64(ptypes0_3, 0);
		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);

		const __m512i ptype4_7 = _mm512_set_epi32
			(0, 0, 0, ptype_tbl[ptype7],
			 0, 0, 0, ptype_tbl[ptype6],
			 0, 0, 0, ptype_tbl[ptype5],
			 0, 0, 0, ptype_tbl[ptype4]);
		const __m512i ptype0_3 = _mm512_set_epi32
			(0, 0, 0, ptype_tbl[ptype3],
			 0, 0, 0, ptype_tbl[ptype2],
			 0, 0, 0, ptype_tbl[ptype1],
			 0, 0, 0, ptype_tbl[ptype0]);

		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);

		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
		__m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);

		/**
		 * use permute/extract to get status content
		 * After the operations, the packet status flags are in the
		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
		 */
		/* merge the status bits into one register */
		const __m512i status_permute_msk = _mm512_set_epi32
			(0, 0, 0, 0,
			 0, 0, 0, 0,
			 22, 30, 6, 14,
			 18, 26, 2, 10);
		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
			(raw_desc4_7, status_permute_msk, raw_desc0_3);
		__m256i status0_7 = _mm512_extracti64x4_epi64
			(raw_status0_7, 0);

		__m256i mbuf_flags = _mm256_set1_epi32(0);

		if (do_offload) {
			/* now do flag manipulation */

			/* get only flag/error bits we want */
			const __m256i flag_bits =
				_mm256_and_si256(status0_7, flags_mask);
			/**
			 * l3_l4 error flags: shuffle, then shift to undo the
			 * pre-shift applied in l3_l4_flags_shuf, and finally
			 * mask out extra bits
			 */
			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
					_mm256_srli_epi32(flag_bits, 4));
			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
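			/*
			 * The outer-L4 status was packed into bits 1-2 of each
			 * l3_l4_flags_shuf entry (via the >> 20 there); pull
			 * those two bits out, shift them back up by 20 into
			 * the RTE_MBUF_F_RX_OUTER_L4_CKSUM_* positions, and
			 * clear them from the inner l3/l4 word before merging.
			 */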
			__m256i l4_outer_mask = _mm256_set1_epi32(0x6);
			__m256i l4_outer_flags =
				_mm256_and_si256(l3_l4_flags, l4_outer_mask);
			l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);

			__m256i l3_l4_mask = _mm256_set1_epi32(~0x6);

			l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);
			l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);
			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
			/* set rss and vlan flags */
			const __m256i rss_vlan_flag_bits =
				_mm256_srli_epi32(flag_bits, 12);
			const __m256i rss_vlan_flags =
				_mm256_shuffle_epi8(rss_vlan_flags_shuf,
						rss_vlan_flag_bits);

			/* merge flags */
			mbuf_flags = _mm256_or_si256(l3_l4_flags,
					rss_vlan_flags);
		}

		if (rxq->fdir_enabled) {
			const __m256i fdir_id4_7 =
				_mm256_unpackhi_epi32(raw_desc6_7, raw_desc4_5);

			const __m256i fdir_id0_3 =
				_mm256_unpackhi_epi32(raw_desc2_3, raw_desc0_1);

			const __m256i fdir_id0_7 =
				_mm256_unpackhi_epi64(fdir_id4_7, fdir_id0_3);

			if (do_offload) {
				const __m256i fdir_flags =
					ice_flex_rxd_to_fdir_flags_vec_avx512
						(fdir_id0_7);

				/* merge with fdir_flags */
				mbuf_flags = _mm256_or_si256
						(mbuf_flags, fdir_flags);
			} else {
				mbuf_flags =
					ice_flex_rxd_to_fdir_flags_vec_avx512
						(fdir_id0_7);
			}

			/* write to mbuf: have to use scalar store here */
			/*
			 * the unpacks above interleave the flow IDs, so the
			 * dword order in fdir_id0_7 is [6, 4, 2, 0, 7, 5, 3, 1];
			 * hence the extract indices used below
			 */
			rx_pkts[i + 0]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 3);

			rx_pkts[i + 1]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 7);

			rx_pkts[i + 2]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 2);

			rx_pkts[i + 3]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 6);

			rx_pkts[i + 4]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 1);

			rx_pkts[i + 5]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 5);

			rx_pkts[i + 6]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 0);

			rx_pkts[i + 7]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 4);
		} /* if() on fdir_enabled */

		if (do_offload) {
#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
			/**
			 * need to load the 2nd 16B of each descriptor to parse
			 * the RSS hash; taking this branch costs performance.
			 */
			if (rxq->vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
					RTE_ETH_RX_OFFLOAD_RSS_HASH) {
				/* load the second 16B of each 32B desc */
				const __m128i raw_desc_bh7 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[7].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh6 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[6].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh5 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[5].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh4 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[4].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh3 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[3].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh2 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[2].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh1 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[1].wb.status_error1));
				rte_compiler_barrier();
				const __m128i raw_desc_bh0 = _mm_load_si128
					(RTE_CAST_PTR(const __m128i *, &rxdp[0].wb.status_error1));

				__m256i raw_desc_bh6_7 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh6),
						 raw_desc_bh7, 1);
				__m256i raw_desc_bh4_5 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh4),
						 raw_desc_bh5, 1);
				__m256i raw_desc_bh2_3 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh2),
						 raw_desc_bh3, 1);
				__m256i raw_desc_bh0_1 =
					_mm256_inserti128_si256
						(_mm256_castsi128_si256(raw_desc_bh0),
						 raw_desc_bh1, 1);

				/**
				 * to shift the 32b RSS hash value to the
				 * highest 32b of each 128b before mask
				 */
				__m256i rss_hash6_7 =
					_mm256_slli_epi64(raw_desc_bh6_7, 32);
				__m256i rss_hash4_5 =
					_mm256_slli_epi64(raw_desc_bh4_5, 32);
				__m256i rss_hash2_3 =
					_mm256_slli_epi64(raw_desc_bh2_3, 32);
				__m256i rss_hash0_1 =
					_mm256_slli_epi64(raw_desc_bh0_1, 32);

				__m256i rss_hash_msk =
					_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
							 0xFFFFFFFF, 0, 0, 0);

				rss_hash6_7 = _mm256_and_si256
						(rss_hash6_7, rss_hash_msk);
				rss_hash4_5 = _mm256_and_si256
						(rss_hash4_5, rss_hash_msk);
				rss_hash2_3 = _mm256_and_si256
						(rss_hash2_3, rss_hash_msk);
				rss_hash0_1 = _mm256_and_si256
						(rss_hash0_1, rss_hash_msk);

				mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
				mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
				mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
				mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
			} /* if() on RSS hash parsing */
#endif
		}
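		/*
		 * Layout note for the rearm writes below: each 32B store
		 * covers rearm_data (8B), ol_flags (8B) and the 16B
		 * rx_descriptor_fields1 block of one mbuf.  The blends with
		 * immediate 0x04 drop the per-packet 32-bit flag word into
		 * dword 2, i.e. the low half of ol_flags, while the
		 * permute/blend pulls in the 16B prepared in mb0_1..mb6_7.
		 */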
		/**
		 * At this point, we have the 8 sets of flags in the low
		 * 16-bits of each 32-bit value in mbuf_flags.
		 * We want to extract these, and merge them with the mbuf init
		 * data so we can do a single write to the mbuf to set the flags
		 * and all the other initialization fields. Extracting the
		 * appropriate flags means that we have to do a shift and blend
		 * for each mbuf before we do the write. However, we can also
		 * add in the previously computed rx_descriptor fields to
		 * make a single 256-bit write per mbuf.
		 */
		/* check the structure matches expectations */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
				offsetof(struct rte_mbuf, rearm_data) + 8);
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
				RTE_ALIGN(offsetof(struct rte_mbuf,
						   rearm_data),
					  16));
		/* build up data and do writes */
		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
			rearm6, rearm7;

		rearm6 = _mm256_blend_epi32(mbuf_init,
				_mm256_slli_si256(mbuf_flags, 8),
				0x04);
		rearm4 = _mm256_blend_epi32(mbuf_init,
				_mm256_slli_si256(mbuf_flags, 4),
				0x04);
		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
		rearm0 = _mm256_blend_epi32(mbuf_init,
				_mm256_srli_si256(mbuf_flags, 4),
				0x04);

		/* permute to add in the rx_descriptor e.g. rss fields */
		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);

		/* write to mbuf */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
				rearm6);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
				rearm4);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
				rearm2);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
				rearm0);

		/* repeat for the odd mbufs */
		const __m256i odd_flags =
			_mm256_castsi128_si256
				(_mm256_extracti128_si256(mbuf_flags, 1));
		rearm7 = _mm256_blend_epi32(mbuf_init,
				_mm256_slli_si256(odd_flags, 8),
				0x04);
		rearm5 = _mm256_blend_epi32(mbuf_init,
				_mm256_slli_si256(odd_flags, 4),
				0x04);
		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
		rearm1 = _mm256_blend_epi32(mbuf_init,
				_mm256_srli_si256(odd_flags, 4),
				0x04);

		/* since odd mbufs are already in hi 128-bits use blend */
		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
		/* again write to mbufs */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
				rearm7);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
				rearm5);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
				rearm3);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
				rearm1);

		/* extract and record EOP bit */
		if (split_packet) {
			const __m128i eop_mask =
				_mm_set1_epi16(1 << ICE_RX_DESC_STATUS_EOF_S);
			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
					eop_check);
			/* pack status bits into a single 128-bit register */
			const __m128i eop_bits =
				_mm_packus_epi32
					(_mm256_castsi256_si128(eop_bits256),
					 _mm256_extractf128_si256(eop_bits256,
								  1));
			/**
			 * flip bits, and mask out the EOP bit, which is now
			 * a split-packet bit, i.e. !EOP rather than EOP.
			 */
			__m128i split_bits = _mm_andnot_si128(eop_bits,
					eop_mask);
			/**
			 * eop bits are out of order, so we need to shuffle
			 * them back into order again. In doing so, only the
			 * low 8 bits are used, which acts like another pack
			 * instruction.
			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
			 * [Since we use epi8, the 16-bit positions are
			 * multiplied by 2 in the eop_shuffle value.]
			 */
			__m128i eop_shuffle =
				_mm_set_epi8(/* zero hi 64b */
					     0xFF, 0xFF, 0xFF, 0xFF,
					     0xFF, 0xFF, 0xFF, 0xFF,
					     /* move values to lo 64b */
					     8, 0, 10, 2,
					     12, 4, 14, 6);
			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
			*(uint64_t *)split_packet =
				_mm_cvtsi128_si64(split_bits);
			split_packet += ICE_DESCS_PER_LOOP_AVX;
		}

		/* perform dd_check */
		status0_7 = _mm256_and_si256(status0_7, dd_check);
		status0_7 = _mm256_packs_epi32(status0_7,
				_mm256_setzero_si256());

		uint64_t burst = rte_popcount64
				(_mm_cvtsi128_si64
					(_mm256_extracti128_si256
						(status0_7, 1)));
		burst += rte_popcount64
				(_mm_cvtsi128_si64
					(_mm256_castsi256_si128(status0_7)));
		received += burst;
		if (burst != ICE_DESCS_PER_LOOP_AVX)
			break;
	}

	/* update tail pointers */
	rxq->rx_tail += received;
	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep rx_tail even, as the avx2 path does */
		rxq->rx_tail--;
		received--;
	}
	rxq->rxrearm_nb += received;
	return received;
}

/**
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, returns no packets
 */
uint16_t
ice_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
			 uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL, false);
}

/**
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, returns no packets
 */
uint16_t
ice_recv_pkts_vec_avx512_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
				 uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts,
					     nb_pkts, NULL, true);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, returns no packets
 */
static uint16_t
ice_recv_scattered_burst_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
				    uint16_t nb_pkts)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
							 split_flags, false);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
			&rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, returns no packets
 */
static uint16_t
ice_recv_scattered_burst_vec_avx512_offload(void *rx_queue,
					    struct rte_mbuf **rx_pkts,
					    uint16_t nb_pkts)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx512(rxq,
			rx_pkts, nb_pkts, split_flags, true);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
			&rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 * Main receive routine that can handle arbitrary burst sizes
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, returns no packets
 */
uint16_t
ice_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
				   uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > ICE_VPMD_RX_BURST) {
		uint16_t burst = ice_recv_scattered_burst_vec_avx512(rx_queue,
				rx_pkts + retval, ICE_VPMD_RX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < ICE_VPMD_RX_BURST)
			return retval;
	}
	return retval + ice_recv_scattered_burst_vec_avx512(rx_queue,
			rx_pkts + retval, nb_pkts);
}
/**
 * vPMD receive routine that reassembles scattered packets.
 * Main receive routine that can handle arbitrary burst sizes
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, returns no packets
 */
uint16_t
ice_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
					   struct rte_mbuf **rx_pkts,
					   uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > ICE_VPMD_RX_BURST) {
		uint16_t burst =
			ice_recv_scattered_burst_vec_avx512_offload(rx_queue,
					rx_pkts + retval, ICE_VPMD_RX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < ICE_VPMD_RX_BURST)
			return retval;
	}
	return retval + ice_recv_scattered_burst_vec_avx512_offload(rx_queue,
			rx_pkts + retval, nb_pkts);
}

static __rte_always_inline void
ice_vtx1(volatile struct ice_tx_desc *txdp,
	 struct rte_mbuf *pkt, uint64_t flags, bool do_offload)
{
	uint64_t high_qw =
		(ICE_TX_DESC_DTYPE_DATA |
		 ((uint64_t)flags << ICE_TXD_QW1_CMD_S) |
		 ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));

	if (do_offload)
		ice_txd_enable_offload(pkt, &high_qw);

	__m128i descriptor = _mm_set_epi64x(high_qw, rte_pktmbuf_iova(pkt));
	_mm_store_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
}

static __rte_always_inline void
ice_vtx(volatile struct ice_tx_desc *txdp, struct rte_mbuf **pkt,
	uint16_t nb_pkts, uint64_t flags, bool do_offload)
{
	const uint64_t hi_qw_tmpl = (ICE_TX_DESC_DTYPE_DATA |
			((uint64_t)flags << ICE_TXD_QW1_CMD_S));

	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
		uint64_t hi_qw3 =
			hi_qw_tmpl |
			((uint64_t)pkt[3]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (do_offload)
			ice_txd_enable_offload(pkt[3], &hi_qw3);
		uint64_t hi_qw2 =
			hi_qw_tmpl |
			((uint64_t)pkt[2]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (do_offload)
			ice_txd_enable_offload(pkt[2], &hi_qw2);
		uint64_t hi_qw1 =
			hi_qw_tmpl |
			((uint64_t)pkt[1]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (do_offload)
			ice_txd_enable_offload(pkt[1], &hi_qw1);
		uint64_t hi_qw0 =
			hi_qw_tmpl |
			((uint64_t)pkt[0]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		if (do_offload)
			ice_txd_enable_offload(pkt[0], &hi_qw0);

		__m512i desc0_3 =
			_mm512_set_epi64
				(hi_qw3, rte_pktmbuf_iova(pkt[3]),
				 hi_qw2, rte_pktmbuf_iova(pkt[2]),
				 hi_qw1, rte_pktmbuf_iova(pkt[1]),
				 hi_qw0, rte_pktmbuf_iova(pkt[0]));
		_mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3);
	}

	/* do any last ones */
	while (nb_pkts) {
		ice_vtx1(txdp, *pkt, flags, do_offload);
		txdp++, pkt++, nb_pkts--;
	}
}

static __rte_always_inline uint16_t
ice_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
				uint16_t nb_pkts, bool do_offload)
{
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
	volatile struct ice_tx_desc *txdp;
	struct ci_tx_entry_vec *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = ICE_TD_CMD;
	uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;

	/* crossing the tx_rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

	if (txq->nb_tx_free < txq->tx_free_thresh)
		ci_tx_free_bufs_vec(txq, ice_tx_desc_done, false);

	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->ice_tx_ring[tx_id];
	txep = (void *)txq->sw_ring;
	txep += tx_id;

	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
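	/*
	 * n is the number of descriptor slots left before the ring wraps.
	 * If this burst does not fit, fill the ring up to the end (the last
	 * descriptor written there requests a report-status) and continue
	 * from slot 0 below.
	 */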
	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		ci_tx_backlog_entry_vec(txep, tx_pkts, n);

		ice_vtx(txdp, tx_pkts, n - 1, flags, do_offload);
		tx_pkts += (n - 1);
		txdp += (n - 1);

		ice_vtx1(txdp, *tx_pkts++, rs, do_offload);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = txq->ice_tx_ring;
		txep = (void *)txq->sw_ring;
	}

	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);

	ice_vtx(txdp, tx_pkts, nb_commit, flags, do_offload);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->tx_next_rs) {
		txq->ice_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
					 ICE_TXD_QW1_CMD_S);
		txq->tx_next_rs =
			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
	}

	txq->tx_tail = tx_id;

	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

uint16_t
ice_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
			 uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = ice_xmit_fixed_burst_vec_avx512(tx_queue,
				&tx_pkts[nb_tx], num, false);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}

uint16_t
ice_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
				 uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = ice_xmit_fixed_burst_vec_avx512(tx_queue,
				&tx_pkts[nb_tx], num, true);

		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}