/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include "iavf_rxtx_vec_common.h"

#include <rte_vect.h>

#define IAVF_DESCS_PER_LOOP_AVX 8
#define PKTLEN_SHIFT 10

/******************************************************************************
 * If the application is known not to enable a specific offload, the
 * corresponding macro can be commented out to remove that work from the
 * fast path. The RX path currently supports the following six features:
 * 1. checksum offload
 * 2. VLAN/QINQ stripping
 * 3. RSS hash
 * 4. packet type analysis
 * 5. flow director ID report
 * 6. timestamp offload
 ******************************************************************************/
#define IAVF_RX_CSUM_OFFLOAD
#define IAVF_RX_VLAN_OFFLOAD
#define IAVF_RX_RSS_OFFLOAD
#define IAVF_RX_PTYPE_OFFLOAD
#define IAVF_RX_FDIR_OFFLOAD
#define IAVF_RX_TS_OFFLOAD

static __rte_always_inline void
iavf_rxq_rearm(struct iavf_rx_queue *rxq)
{
	iavf_rxq_rearm_common(rxq, true);
}

#define IAVF_RX_LEN_MASK 0x80808080
static __rte_always_inline uint16_t
_iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
			       struct rte_mbuf **rx_pkts,
			       uint16_t nb_pkts, uint8_t *split_packet,
			       bool offload)
{
#ifdef IAVF_RX_PTYPE_OFFLOAD
	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
#endif

	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
						    rxq->mbuf_initializer);
	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
		iavf_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
	      rte_cpu_to_le_32(1 << IAVF_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* constants used in processing loop */
	const __m512i crc_adjust =
		_mm512_set_epi32
			(/* 1st descriptor */
			 0,             /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0,             /* ignore pkt_type field */
			 /* 2nd descriptor */
			 0,             /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0,             /* ignore pkt_type field */
			 /* 3rd descriptor */
			 0,             /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0,             /* ignore pkt_type field */
			 /* 4th descriptor */
			 0,             /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0              /* ignore pkt_type field */
			);

	/* 8 packets DD mask, LSB in each 32-bit value */
	const __m256i dd_check = _mm256_set1_epi32(1);

	/* 8 packets EOP mask, second-LSB in each 32-bit value */
	const __m256i eop_check = _mm256_slli_epi32(dd_check,
			IAVF_RX_DESC_STATUS_EOF_SHIFT);

	/* mask to shuffle from desc.
to mbuf (4 descriptors)*/ 103 const __m512i shuf_msk = 104 _mm512_set_epi32 105 (/* 1st descriptor */ 106 0x07060504, /* octet 4~7, 32bits rss */ 107 0x03020F0E, /* octet 2~3, low 16 bits vlan_macip */ 108 /* octet 15~14, 16 bits data_len */ 109 0xFFFF0F0E, /* skip high 16 bits pkt_len, zero out */ 110 /* octet 15~14, low 16 bits pkt_len */ 111 0xFFFFFFFF, /* pkt_type set as unknown */ 112 /* 2nd descriptor */ 113 0x07060504, /* octet 4~7, 32bits rss */ 114 0x03020F0E, /* octet 2~3, low 16 bits vlan_macip */ 115 /* octet 15~14, 16 bits data_len */ 116 0xFFFF0F0E, /* skip high 16 bits pkt_len, zero out */ 117 /* octet 15~14, low 16 bits pkt_len */ 118 0xFFFFFFFF, /* pkt_type set as unknown */ 119 /* 3rd descriptor */ 120 0x07060504, /* octet 4~7, 32bits rss */ 121 0x03020F0E, /* octet 2~3, low 16 bits vlan_macip */ 122 /* octet 15~14, 16 bits data_len */ 123 0xFFFF0F0E, /* skip high 16 bits pkt_len, zero out */ 124 /* octet 15~14, low 16 bits pkt_len */ 125 0xFFFFFFFF, /* pkt_type set as unknown */ 126 /* 4th descriptor */ 127 0x07060504, /* octet 4~7, 32bits rss */ 128 0x03020F0E, /* octet 2~3, low 16 bits vlan_macip */ 129 /* octet 15~14, 16 bits data_len */ 130 0xFFFF0F0E, /* skip high 16 bits pkt_len, zero out */ 131 /* octet 15~14, low 16 bits pkt_len */ 132 0xFFFFFFFF /* pkt_type set as unknown */ 133 ); 134 /** 135 * compile-time check the above crc and shuffle layout is correct. 136 * NOTE: the first field (lowest address) is given last in set_epi 137 * calls above. 138 */ 139 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != 140 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); 141 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != 142 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); 143 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) != 144 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10); 145 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) != 146 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); 147 148 uint16_t i, received; 149 150 for (i = 0, received = 0; i < nb_pkts; 151 i += IAVF_DESCS_PER_LOOP_AVX, 152 rxdp += IAVF_DESCS_PER_LOOP_AVX) { 153 /* step 1, copy over 8 mbuf pointers to rx_pkts array */ 154 _mm256_storeu_si256((void *)&rx_pkts[i], 155 _mm256_loadu_si256((void *)&sw_ring[i])); 156 #ifdef RTE_ARCH_X86_64 157 _mm256_storeu_si256 158 ((void *)&rx_pkts[i + 4], 159 _mm256_loadu_si256((void *)&sw_ring[i + 4])); 160 #endif 161 162 __m512i raw_desc0_3, raw_desc4_7; 163 const __m128i raw_desc7 = 164 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7)); 165 rte_compiler_barrier(); 166 const __m128i raw_desc6 = 167 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6)); 168 rte_compiler_barrier(); 169 const __m128i raw_desc5 = 170 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5)); 171 rte_compiler_barrier(); 172 const __m128i raw_desc4 = 173 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4)); 174 rte_compiler_barrier(); 175 const __m128i raw_desc3 = 176 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3)); 177 rte_compiler_barrier(); 178 const __m128i raw_desc2 = 179 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2)); 180 rte_compiler_barrier(); 181 const __m128i raw_desc1 = 182 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1)); 183 rte_compiler_barrier(); 184 const __m128i raw_desc0 = 185 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0)); 186 187 raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4); 188 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1); 189 raw_desc4_7 = 
_mm512_inserti32x4(raw_desc4_7, raw_desc6, 2); 190 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3); 191 raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0); 192 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1); 193 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2); 194 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3); 195 196 if (split_packet) { 197 int j; 198 199 for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++) 200 rte_mbuf_prefetch_part2(rx_pkts[i + j]); 201 } 202 203 /** 204 * convert descriptors 4-7 into mbufs, adjusting length and 205 * re-arranging fields. Then write into the mbuf 206 */ 207 const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7, 208 PKTLEN_SHIFT); 209 const __m512i desc4_7 = _mm512_mask_blend_epi16(IAVF_RX_LEN_MASK, 210 raw_desc4_7, 211 len4_7); 212 __m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk); 213 214 mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust); 215 #ifdef IAVF_RX_PTYPE_OFFLOAD 216 /** 217 * to get packet types, shift 64-bit values down 30 bits 218 * and so ptype is in lower 8-bits in each 219 */ 220 const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 30); 221 const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1); 222 const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0); 223 const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24); 224 const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8); 225 const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24); 226 const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8); 227 228 const __m512i ptype4_7 = _mm512_set_epi32 229 (0, 0, 0, type_table[ptype7], 230 0, 0, 0, type_table[ptype6], 231 0, 0, 0, type_table[ptype5], 232 0, 0, 0, type_table[ptype4]); 233 mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7); 234 #endif 235 236 /** 237 * convert descriptors 0-3 into mbufs, adjusting length and 238 * re-arranging fields. 
Then write into the mbuf 239 */ 240 const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3, 241 PKTLEN_SHIFT); 242 const __m512i desc0_3 = _mm512_mask_blend_epi16(IAVF_RX_LEN_MASK, 243 raw_desc0_3, 244 len0_3); 245 __m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk); 246 247 mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust); 248 #ifdef IAVF_RX_PTYPE_OFFLOAD 249 /* get the packet types */ 250 const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30); 251 const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1); 252 const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0); 253 const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24); 254 const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8); 255 const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24); 256 const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8); 257 258 const __m512i ptype0_3 = _mm512_set_epi32 259 (0, 0, 0, type_table[ptype3], 260 0, 0, 0, type_table[ptype2], 261 0, 0, 0, type_table[ptype1], 262 0, 0, 0, type_table[ptype0]); 263 mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3); 264 #endif 265 266 /** 267 * use permute/extract to get status content 268 * After the operations, the packets status flags are in the 269 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6] 270 */ 271 /* merge the status bits into one register */ 272 const __m512i status_permute_msk = _mm512_set_epi32 273 (0, 0, 0, 0, 274 0, 0, 0, 0, 275 22, 30, 6, 14, 276 18, 26, 2, 10); 277 const __m512i raw_status0_7 = _mm512_permutex2var_epi32 278 (raw_desc4_7, status_permute_msk, raw_desc0_3); 279 __m256i status0_7 = _mm512_extracti64x4_epi64 280 (raw_status0_7, 0); 281 282 /* now do flag manipulation */ 283 284 /* merge flags */ 285 __m256i mbuf_flags = _mm256_set1_epi32(0); 286 287 if (offload) { 288 #if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD) 289 /* Status/Error flag masks */ 290 /** 291 * mask everything except RSS, flow director and VLAN flags 292 * bit2 is for VLAN tag, bit11 for flow director indication 293 * bit13:12 for RSS indication. Bits 3-5 of error 294 * field (bits 22-24) are for IP/L4 checksum errors 295 */ 296 const __m256i flags_mask = 297 _mm256_set1_epi32((1 << 2) | (1 << 11) | 298 (3 << 12) | (7 << 22)); 299 #endif 300 301 #ifdef IAVF_RX_VLAN_OFFLOAD 302 /** 303 * data to be shuffled by result of flag mask. If VLAN bit is set, 304 * (bit 2), then position 4 in this array will be used in the 305 * destination 306 */ 307 const __m256i vlan_flags_shuf = 308 _mm256_set_epi32(0, 0, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 0, 309 0, 0, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 0); 310 #endif 311 312 #ifdef IAVF_RX_RSS_OFFLOAD 313 /** 314 * data to be shuffled by result of flag mask, shifted down 11. 315 * If RSS/FDIR bits are set, shuffle moves appropriate flags in 316 * place. 317 */ 318 const __m256i rss_flags_shuf = 319 _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 320 RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_FDIR, RTE_MBUF_F_RX_RSS_HASH, 321 0, 0, 0, 0, RTE_MBUF_F_RX_FDIR, 0,/* end up 128-bits */ 322 0, 0, 0, 0, 0, 0, 0, 0, 323 RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_FDIR, RTE_MBUF_F_RX_RSS_HASH, 324 0, 0, 0, 0, RTE_MBUF_F_RX_FDIR, 0); 325 #endif 326 327 #ifdef IAVF_RX_CSUM_OFFLOAD 328 /** 329 * data to be shuffled by the result of the flags mask shifted by 22 330 * bits. This gives use the l3_l4 flags. 
331 */ 332 const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 333 /* shift right 1 bit to make sure it not exceed 255 */ 334 (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 335 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 336 (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 337 RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1, 338 (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 339 (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1, 340 (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 341 (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1, 342 RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1, 343 (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1, 344 /* second 128-bits */ 345 0, 0, 0, 0, 0, 0, 0, 0, 346 (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 347 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 348 (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 349 RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1, 350 (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 351 (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1, 352 (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 353 (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1, 354 RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1, 355 (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1); 356 357 const __m256i cksum_mask = 358 _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD | 359 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 360 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD); 361 #endif 362 363 #if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD) 364 /* get only flag/error bits we want */ 365 const __m256i flag_bits = 366 _mm256_and_si256(status0_7, flags_mask); 367 #endif 368 /* set vlan and rss flags */ 369 #ifdef IAVF_RX_VLAN_OFFLOAD 370 const __m256i vlan_flags = 371 _mm256_shuffle_epi8(vlan_flags_shuf, flag_bits); 372 #endif 373 #ifdef IAVF_RX_RSS_OFFLOAD 374 const __m256i rss_flags = 375 _mm256_shuffle_epi8(rss_flags_shuf, 376 _mm256_srli_epi32(flag_bits, 11)); 377 #endif 378 #ifdef IAVF_RX_CSUM_OFFLOAD 379 /** 380 * l3_l4_error flags, shuffle, then shift to correct adjustment 381 * of flags in flags_shuf, and finally mask out extra bits 382 */ 383 __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf, 384 _mm256_srli_epi32(flag_bits, 22)); 385 l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1); 386 l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask); 387 #endif 388 389 #ifdef IAVF_RX_CSUM_OFFLOAD 390 mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags); 391 #endif 392 #ifdef IAVF_RX_RSS_OFFLOAD 393 mbuf_flags = _mm256_or_si256(mbuf_flags, rss_flags); 394 #endif 395 #ifdef IAVF_RX_VLAN_OFFLOAD 396 mbuf_flags = _mm256_or_si256(mbuf_flags, vlan_flags); 397 #endif 398 } 399 400 /** 401 * At this point, we have the 8 sets of flags in the low 16-bits 402 * of each 32-bit value in vlan0. 403 * We want to extract these, and merge them with the mbuf init 404 * data so we can do a single write to the mbuf to set the flags 405 * and all the other initialization fields. Extracting the 406 * appropriate flags means that we have to do a shift and blend 407 * for each mbuf before we do the write. 
However, we can also 408 * add in the previously computed rx_descriptor fields to 409 * make a single 256-bit write per mbuf 410 */ 411 /* check the structure matches expectations */ 412 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != 413 offsetof(struct rte_mbuf, rearm_data) + 8); 414 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != 415 RTE_ALIGN(offsetof(struct rte_mbuf, 416 rearm_data), 417 16)); 418 /* build up data and do writes */ 419 __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5, 420 rearm6, rearm7; 421 const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0); 422 const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1); 423 const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0); 424 const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1); 425 426 if (offload) { 427 rearm6 = _mm256_blend_epi32(mbuf_init, 428 _mm256_slli_si256(mbuf_flags, 8), 429 0x04); 430 rearm4 = _mm256_blend_epi32(mbuf_init, 431 _mm256_slli_si256(mbuf_flags, 4), 432 0x04); 433 rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04); 434 rearm0 = _mm256_blend_epi32(mbuf_init, 435 _mm256_srli_si256(mbuf_flags, 4), 436 0x04); 437 /* permute to add in the rx_descriptor e.g. rss fields */ 438 rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20); 439 rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20); 440 rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20); 441 rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20); 442 } else { 443 rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20); 444 rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20); 445 rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20); 446 rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20); 447 } 448 /* write to mbuf */ 449 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data, 450 rearm6); 451 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data, 452 rearm4); 453 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, 454 rearm2); 455 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, 456 rearm0); 457 458 /* repeat for the odd mbufs */ 459 if (offload) { 460 const __m256i odd_flags = 461 _mm256_castsi128_si256 462 (_mm256_extracti128_si256(mbuf_flags, 1)); 463 rearm7 = _mm256_blend_epi32(mbuf_init, 464 _mm256_slli_si256(odd_flags, 8), 465 0x04); 466 rearm5 = _mm256_blend_epi32(mbuf_init, 467 _mm256_slli_si256(odd_flags, 4), 468 0x04); 469 rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04); 470 rearm1 = _mm256_blend_epi32(mbuf_init, 471 _mm256_srli_si256(odd_flags, 4), 472 0x04); 473 /* since odd mbufs are already in hi 128-bits use blend */ 474 rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0); 475 rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0); 476 rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0); 477 rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0); 478 } else { 479 rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0); 480 rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0); 481 rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0); 482 rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0); 483 } 484 /* again write to mbufs */ 485 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data, 486 rearm7); 487 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data, 488 rearm5); 489 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, 490 rearm3); 491 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, 492 rearm1); 493 494 /* extract and record EOP bit */ 495 if (split_packet) { 496 const __m128i eop_mask = 497 _mm_set1_epi16(1 << 
IAVF_RX_DESC_STATUS_EOF_SHIFT); 498 const __m256i eop_bits256 = _mm256_and_si256(status0_7, 499 eop_check); 500 /* pack status bits into a single 128-bit register */ 501 const __m128i eop_bits = 502 _mm_packus_epi32 503 (_mm256_castsi256_si128(eop_bits256), 504 _mm256_extractf128_si256(eop_bits256, 505 1)); 506 /** 507 * flip bits, and mask out the EOP bit, which is now 508 * a split-packet bit i.e. !EOP, rather than EOP one. 509 */ 510 __m128i split_bits = _mm_andnot_si128(eop_bits, 511 eop_mask); 512 /** 513 * eop bits are out of order, so we need to shuffle them 514 * back into order again. In doing so, only use low 8 515 * bits, which acts like another pack instruction 516 * The original order is (hi->lo): 1,3,5,7,0,2,4,6 517 * [Since we use epi8, the 16-bit positions are 518 * multiplied by 2 in the eop_shuffle value.] 519 */ 520 __m128i eop_shuffle = 521 _mm_set_epi8(/* zero hi 64b */ 522 0xFF, 0xFF, 0xFF, 0xFF, 523 0xFF, 0xFF, 0xFF, 0xFF, 524 /* move values to lo 64b */ 525 8, 0, 10, 2, 526 12, 4, 14, 6); 527 split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle); 528 *(uint64_t *)split_packet = 529 _mm_cvtsi128_si64(split_bits); 530 split_packet += IAVF_DESCS_PER_LOOP_AVX; 531 } 532 533 /* perform dd_check */ 534 status0_7 = _mm256_and_si256(status0_7, dd_check); 535 status0_7 = _mm256_packs_epi32(status0_7, 536 _mm256_setzero_si256()); 537 538 uint64_t burst = rte_popcount64 539 (_mm_cvtsi128_si64 540 (_mm256_extracti128_si256 541 (status0_7, 1))); 542 burst += rte_popcount64 543 (_mm_cvtsi128_si64 544 (_mm256_castsi256_si128(status0_7))); 545 received += burst; 546 if (burst != IAVF_DESCS_PER_LOOP_AVX) 547 break; 548 } 549 550 /* update tail pointers */ 551 rxq->rx_tail += received; 552 rxq->rx_tail &= (rxq->nb_rx_desc - 1); 553 if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */ 554 rxq->rx_tail--; 555 received--; 556 } 557 rxq->rxrearm_nb += received; 558 return received; 559 } 560 561 static __rte_always_inline __m256i 562 flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7) 563 { 564 #define FDID_MIS_MAGIC 0xFFFFFFFF 565 RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2)); 566 RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13)); 567 const __m256i pkt_fdir_bit = _mm256_set1_epi32(RTE_MBUF_F_RX_FDIR | 568 RTE_MBUF_F_RX_FDIR_ID); 569 /* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */ 570 const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC); 571 __m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7, 572 fdir_mis_mask); 573 /* this XOR op results to bit-reverse the fdir_mask */ 574 fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask); 575 const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit); 576 577 return fdir_flags; 578 } 579 580 static __rte_always_inline uint16_t 581 _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq, 582 struct rte_mbuf **rx_pkts, 583 uint16_t nb_pkts, 584 uint8_t *split_packet, 585 bool offload) 586 { 587 struct iavf_adapter *adapter = rxq->vsi->adapter; 588 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC 589 uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads; 590 #endif 591 #ifdef IAVF_RX_PTYPE_OFFLOAD 592 const uint32_t *type_table = adapter->ptype_tbl; 593 #endif 594 595 const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, 596 rxq->mbuf_initializer); 597 struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail]; 598 volatile union iavf_rx_flex_desc *rxdp = 599 (volatile union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail; 600 601 rte_prefetch0(rxdp); 602 603 /* nb_pkts has to be 
floor-aligned to IAVF_DESCS_PER_LOOP_AVX */ 604 nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX); 605 606 /* See if we need to rearm the RX queue - gives the prefetch a bit 607 * of time to act 608 */ 609 if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH) 610 iavf_rxq_rearm(rxq); 611 612 /* Before we start moving massive data around, check to see if 613 * there is actually a packet available 614 */ 615 if (!(rxdp->wb.status_error0 & 616 rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S))) 617 return 0; 618 619 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC 620 #ifdef IAVF_RX_TS_OFFLOAD 621 uint8_t inflection_point = 0; 622 bool is_tsinit = false; 623 __m256i hw_low_last = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (uint32_t)rxq->phc_time); 624 625 if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) { 626 uint64_t sw_cur_time = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000); 627 628 if (unlikely(sw_cur_time - rxq->hw_time_update > 4)) { 629 hw_low_last = _mm256_setzero_si256(); 630 is_tsinit = 1; 631 } else { 632 hw_low_last = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (uint32_t)rxq->phc_time); 633 } 634 } 635 #endif 636 #endif 637 638 /* constants used in processing loop */ 639 const __m512i crc_adjust = 640 _mm512_set_epi32 641 (/* 1st descriptor */ 642 0, /* ignore non-length fields */ 643 -rxq->crc_len, /* sub crc on data_len */ 644 -rxq->crc_len, /* sub crc on pkt_len */ 645 0, /* ignore pkt_type field */ 646 /* 2nd descriptor */ 647 0, /* ignore non-length fields */ 648 -rxq->crc_len, /* sub crc on data_len */ 649 -rxq->crc_len, /* sub crc on pkt_len */ 650 0, /* ignore pkt_type field */ 651 /* 3rd descriptor */ 652 0, /* ignore non-length fields */ 653 -rxq->crc_len, /* sub crc on data_len */ 654 -rxq->crc_len, /* sub crc on pkt_len */ 655 0, /* ignore pkt_type field */ 656 /* 4th descriptor */ 657 0, /* ignore non-length fields */ 658 -rxq->crc_len, /* sub crc on data_len */ 659 -rxq->crc_len, /* sub crc on pkt_len */ 660 0 /* ignore pkt_type field */ 661 ); 662 663 /* 8 packets DD mask, LSB in each 32-bit value */ 664 const __m256i dd_check = _mm256_set1_epi32(1); 665 666 /* 8 packets EOP mask, second-LSB in each 32-bit value */ 667 const __m256i eop_check = _mm256_slli_epi32(dd_check, 668 IAVF_RX_FLEX_DESC_STATUS0_EOF_S); 669 670 /* mask to shuffle from desc. 
to mbuf (4 descriptors)*/ 671 const __m512i shuf_msk = 672 _mm512_set_epi32 673 (/* 1st descriptor */ 674 0xFFFFFFFF, /* rss hash parsed separately */ 675 0x0B0A0504, /* octet 10~11, 16 bits vlan_macip */ 676 /* octet 4~5, 16 bits data_len */ 677 0xFFFF0504, /* skip hi 16 bits pkt_len, zero out */ 678 /* octet 4~5, 16 bits pkt_len */ 679 0xFFFFFFFF, /* pkt_type set as unknown */ 680 /* 2nd descriptor */ 681 0xFFFFFFFF, /* rss hash parsed separately */ 682 0x0B0A0504, /* octet 10~11, 16 bits vlan_macip */ 683 /* octet 4~5, 16 bits data_len */ 684 0xFFFF0504, /* skip hi 16 bits pkt_len, zero out */ 685 /* octet 4~5, 16 bits pkt_len */ 686 0xFFFFFFFF, /* pkt_type set as unknown */ 687 /* 3rd descriptor */ 688 0xFFFFFFFF, /* rss hash parsed separately */ 689 0x0B0A0504, /* octet 10~11, 16 bits vlan_macip */ 690 /* octet 4~5, 16 bits data_len */ 691 0xFFFF0504, /* skip hi 16 bits pkt_len, zero out */ 692 /* octet 4~5, 16 bits pkt_len */ 693 0xFFFFFFFF, /* pkt_type set as unknown */ 694 /* 4th descriptor */ 695 0xFFFFFFFF, /* rss hash parsed separately */ 696 0x0B0A0504, /* octet 10~11, 16 bits vlan_macip */ 697 /* octet 4~5, 16 bits data_len */ 698 0xFFFF0504, /* skip hi 16 bits pkt_len, zero out */ 699 /* octet 4~5, 16 bits pkt_len */ 700 0xFFFFFFFF /* pkt_type set as unknown */ 701 ); 702 /** 703 * compile-time check the above crc and shuffle layout is correct. 704 * NOTE: the first field (lowest address) is given last in set_epi 705 * calls above. 706 */ 707 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != 708 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); 709 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != 710 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); 711 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) != 712 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10); 713 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) != 714 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); 715 716 uint16_t i, received; 717 718 for (i = 0, received = 0; i < nb_pkts; 719 i += IAVF_DESCS_PER_LOOP_AVX, 720 rxdp += IAVF_DESCS_PER_LOOP_AVX) { 721 /* step 1, copy over 8 mbuf pointers to rx_pkts array */ 722 _mm256_storeu_si256((void *)&rx_pkts[i], 723 _mm256_loadu_si256((void *)&sw_ring[i])); 724 #ifdef RTE_ARCH_X86_64 725 _mm256_storeu_si256 726 ((void *)&rx_pkts[i + 4], 727 _mm256_loadu_si256((void *)&sw_ring[i + 4])); 728 #endif 729 730 __m512i raw_desc0_3, raw_desc4_7; 731 732 const __m128i raw_desc7 = 733 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7)); 734 rte_compiler_barrier(); 735 const __m128i raw_desc6 = 736 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6)); 737 rte_compiler_barrier(); 738 const __m128i raw_desc5 = 739 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5)); 740 rte_compiler_barrier(); 741 const __m128i raw_desc4 = 742 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4)); 743 rte_compiler_barrier(); 744 const __m128i raw_desc3 = 745 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3)); 746 rte_compiler_barrier(); 747 const __m128i raw_desc2 = 748 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2)); 749 rte_compiler_barrier(); 750 const __m128i raw_desc1 = 751 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1)); 752 rte_compiler_barrier(); 753 const __m128i raw_desc0 = 754 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0)); 755 756 raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4); 757 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1); 758 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2); 759 
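		/*
		 * Note on the gather being built here: each 16-byte flex
		 * descriptor was loaded above as a separate 128-bit value,
		 * highest index first, with compiler barriers between the
		 * loads.  Reading in descending order guarantees that when a
		 * descriptor's DD bit is observed as set, every lower-numbered
		 * descriptor (read afterwards) is complete as well, so a set
		 * DD bit is never paired with stale packet data.  Descriptor 4
		 * ends up in lane 0 via the broadcast, and the
		 * _mm512_inserti32x4() calls place descriptors 5/6/7 in lanes
		 * 1-3, giving one 512-bit register per group of four
		 * descriptors for the shuffles that follow.
		 */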
raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3); 760 raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0); 761 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1); 762 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2); 763 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3); 764 765 if (split_packet) { 766 int j; 767 768 for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++) 769 rte_mbuf_prefetch_part2(rx_pkts[i + j]); 770 } 771 772 /** 773 * convert descriptors 4-7 into mbufs, re-arrange fields. 774 * Then write into the mbuf. 775 */ 776 __m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk); 777 778 mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust); 779 #ifdef IAVF_RX_PTYPE_OFFLOAD 780 /** 781 * to get packet types, ptype is located in bit16-25 782 * of each 128bits 783 */ 784 const __m512i ptype_mask = 785 _mm512_set1_epi16(IAVF_RX_FLEX_DESC_PTYPE_M); 786 const __m512i ptypes4_7 = 787 _mm512_and_si512(raw_desc4_7, ptype_mask); 788 const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1); 789 const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0); 790 const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9); 791 const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1); 792 const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9); 793 const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1); 794 795 const __m512i ptype4_7 = _mm512_set_epi32 796 (0, 0, 0, type_table[ptype7], 797 0, 0, 0, type_table[ptype6], 798 0, 0, 0, type_table[ptype5], 799 0, 0, 0, type_table[ptype4]); 800 mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7); 801 #endif 802 803 /** 804 * convert descriptors 0-3 into mbufs, re-arrange fields. 805 * Then write into the mbuf. 806 */ 807 __m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk); 808 809 mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust); 810 #ifdef IAVF_RX_PTYPE_OFFLOAD 811 /** 812 * to get packet types, ptype is located in bit16-25 813 * of each 128bits 814 */ 815 const __m512i ptypes0_3 = 816 _mm512_and_si512(raw_desc0_3, ptype_mask); 817 const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1); 818 const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0); 819 const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9); 820 const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1); 821 const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9); 822 const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1); 823 824 const __m512i ptype0_3 = _mm512_set_epi32 825 (0, 0, 0, type_table[ptype3], 826 0, 0, 0, type_table[ptype2], 827 0, 0, 0, type_table[ptype1], 828 0, 0, 0, type_table[ptype0]); 829 mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3); 830 #endif 831 832 /** 833 * use permute/extract to get status content 834 * After the operations, the packets status flags are in the 835 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6] 836 */ 837 /* merge the status bits into one register */ 838 const __m512i status_permute_msk = _mm512_set_epi32 839 (0, 0, 0, 0, 840 0, 0, 0, 0, 841 22, 30, 6, 14, 842 18, 26, 2, 10); 843 const __m512i raw_status0_7 = _mm512_permutex2var_epi32 844 (raw_desc4_7, status_permute_msk, raw_desc0_3); 845 __m256i status0_7 = _mm512_extracti64x4_epi64 846 (raw_status0_7, 0); 847 848 /* now do flag manipulation */ 849 850 /* merge flags */ 851 __m256i mbuf_flags = _mm256_set1_epi32(0); 852 __m256i vlan_flags = _mm256_setzero_si256(); 853 854 if (offload) { 855 #if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD) 856 /* 
Status/Error flag masks */ 857 /** 858 * mask everything except Checksum Reports, RSS indication 859 * and VLAN indication. 860 * bit6:4 for IP/L4 checksum errors. 861 * bit12 is for RSS indication. 862 * bit13 is for VLAN indication. 863 */ 864 const __m256i flags_mask = 865 _mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13)); 866 #endif 867 #ifdef IAVF_RX_CSUM_OFFLOAD 868 /** 869 * data to be shuffled by the result of the flags mask shifted by 4 870 * bits. This gives use the l3_l4 flags. 871 */ 872 const __m256i l3_l4_flags_shuf = 873 _mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 874 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 875 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 876 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 877 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 878 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 879 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 880 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 881 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 882 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 883 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 884 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 885 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 886 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 887 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 888 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 889 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 890 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 891 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 892 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 893 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 894 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 895 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 896 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 897 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 898 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 899 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 900 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 901 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 902 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 903 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 904 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 905 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 906 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 907 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 908 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 909 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 910 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 911 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 912 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 913 /** 914 * second 128-bits 915 * shift right 20 bits to use the low two bits to indicate 916 * outer checksum status 917 * shift right 1 bit to make sure it not exceed 255 918 */ 919 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 920 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 921 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 922 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 923 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 924 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 925 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 926 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 927 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 928 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 929 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 930 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 931 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 932 
RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 933 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 934 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 935 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 936 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 937 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 938 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 939 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 940 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 941 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 942 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 943 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | 944 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 945 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 946 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 947 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 948 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 949 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | 950 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 951 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 952 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 953 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 954 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, 955 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 956 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, 957 (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 958 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); 959 const __m256i cksum_mask = 960 _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK | 961 RTE_MBUF_F_RX_L4_CKSUM_MASK | 962 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 963 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK); 964 #endif 965 #if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD) 966 /** 967 * data to be shuffled by result of flag mask, shifted down 12. 968 * If RSS(bit12)/VLAN(bit13) are set, 969 * shuffle moves appropriate flags in place. 
970 */ 971 const __m256i rss_flags_shuf = _mm256_set_epi8 972 (0, 0, 0, 0, 973 0, 0, 0, 0, 974 0, 0, 0, 0, 975 RTE_MBUF_F_RX_RSS_HASH, 0, 976 RTE_MBUF_F_RX_RSS_HASH, 0, 977 /* end up 128-bits */ 978 0, 0, 0, 0, 979 0, 0, 0, 0, 980 0, 0, 0, 0, 981 RTE_MBUF_F_RX_RSS_HASH, 0, 982 RTE_MBUF_F_RX_RSS_HASH, 0); 983 984 const __m256i vlan_flags_shuf = _mm256_set_epi8 985 (0, 0, 0, 0, 986 0, 0, 0, 0, 987 0, 0, 0, 0, 988 RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 989 RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 990 0, 0, 991 /* end up 128-bits */ 992 0, 0, 0, 0, 993 0, 0, 0, 0, 994 0, 0, 0, 0, 995 RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 996 RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 997 0, 0); 998 #endif 999 1000 #if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD) 1001 /* get only flag/error bits we want */ 1002 const __m256i flag_bits = 1003 _mm256_and_si256(status0_7, flags_mask); 1004 #endif 1005 #ifdef IAVF_RX_CSUM_OFFLOAD 1006 /** 1007 * l3_l4_error flags, shuffle, then shift to correct adjustment 1008 * of flags in flags_shuf, and finally mask out extra bits 1009 */ 1010 __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf, 1011 _mm256_srli_epi32(flag_bits, 4)); 1012 l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1); 1013 __m256i l4_outer_mask = _mm256_set1_epi32(0x6); 1014 __m256i l4_outer_flags = 1015 _mm256_and_si256(l3_l4_flags, l4_outer_mask); 1016 l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20); 1017 1018 __m256i l3_l4_mask = _mm256_set1_epi32(~0x6); 1019 1020 l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask); 1021 l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags); 1022 l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask); 1023 #endif 1024 #if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD) 1025 /* set rss and vlan flags */ 1026 const __m256i rss_vlan_flag_bits = 1027 _mm256_srli_epi32(flag_bits, 12); 1028 const __m256i rss_flags = 1029 _mm256_shuffle_epi8(rss_flags_shuf, 1030 rss_vlan_flag_bits); 1031 1032 if (rxq->rx_flags == IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG1) 1033 vlan_flags = 1034 _mm256_shuffle_epi8(vlan_flags_shuf, 1035 rss_vlan_flag_bits); 1036 1037 const __m256i rss_vlan_flags = 1038 _mm256_or_si256(rss_flags, vlan_flags); 1039 1040 #endif 1041 1042 #ifdef IAVF_RX_CSUM_OFFLOAD 1043 mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags); 1044 #endif 1045 #if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD) 1046 mbuf_flags = _mm256_or_si256(mbuf_flags, rss_vlan_flags); 1047 #endif 1048 } 1049 1050 #ifdef IAVF_RX_FDIR_OFFLOAD 1051 if (rxq->fdir_enabled) { 1052 const __m512i fdir_permute_mask = _mm512_set_epi32 1053 (0, 0, 0, 0, 1054 0, 0, 0, 0, 1055 7, 15, 23, 31, 1056 3, 11, 19, 27); 1057 __m512i fdir_tmp = _mm512_permutex2var_epi32 1058 (raw_desc0_3, fdir_permute_mask, raw_desc4_7); 1059 const __m256i fdir_id0_7 = _mm512_extracti64x4_epi64 1060 (fdir_tmp, 0); 1061 const __m256i fdir_flags = 1062 flex_rxd_to_fdir_flags_vec_avx512(fdir_id0_7); 1063 1064 /* merge with fdir_flags */ 1065 mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags); 1066 1067 /* write to mbuf: have to use scalar store here */ 1068 rx_pkts[i + 0]->hash.fdir.hi = 1069 _mm256_extract_epi32(fdir_id0_7, 3); 1070 1071 rx_pkts[i + 1]->hash.fdir.hi = 1072 _mm256_extract_epi32(fdir_id0_7, 7); 1073 1074 rx_pkts[i + 2]->hash.fdir.hi = 1075 _mm256_extract_epi32(fdir_id0_7, 2); 1076 1077 rx_pkts[i + 3]->hash.fdir.hi = 1078 _mm256_extract_epi32(fdir_id0_7, 6); 1079 1080 rx_pkts[i + 
4]->hash.fdir.hi = 1081 _mm256_extract_epi32(fdir_id0_7, 1); 1082 1083 rx_pkts[i + 5]->hash.fdir.hi = 1084 _mm256_extract_epi32(fdir_id0_7, 5); 1085 1086 rx_pkts[i + 6]->hash.fdir.hi = 1087 _mm256_extract_epi32(fdir_id0_7, 0); 1088 1089 rx_pkts[i + 7]->hash.fdir.hi = 1090 _mm256_extract_epi32(fdir_id0_7, 4); 1091 } /* if() on fdir_enabled */ 1092 #endif 1093 1094 __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0); 1095 __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1); 1096 __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0); 1097 __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1); 1098 1099 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC 1100 if (offload) { 1101 #if defined(IAVF_RX_RSS_OFFLOAD) || defined(IAVF_RX_TS_OFFLOAD) 1102 /** 1103 * needs to load 2nd 16B of each desc for RSS hash parsing, 1104 * will cause performance drop to get into this context. 1105 */ 1106 if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH || 1107 offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP || 1108 rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) { 1109 /* load bottom half of every 32B desc */ 1110 const __m128i raw_desc_bh7 = 1111 _mm_load_si128 1112 (RTE_CAST_PTR(const __m128i *, &rxdp[7].wb.status_error1)); 1113 rte_compiler_barrier(); 1114 const __m128i raw_desc_bh6 = 1115 _mm_load_si128 1116 (RTE_CAST_PTR(const __m128i *, &rxdp[6].wb.status_error1)); 1117 rte_compiler_barrier(); 1118 const __m128i raw_desc_bh5 = 1119 _mm_load_si128 1120 (RTE_CAST_PTR(const __m128i *, &rxdp[5].wb.status_error1)); 1121 rte_compiler_barrier(); 1122 const __m128i raw_desc_bh4 = 1123 _mm_load_si128 1124 (RTE_CAST_PTR(const __m128i *, &rxdp[4].wb.status_error1)); 1125 rte_compiler_barrier(); 1126 const __m128i raw_desc_bh3 = 1127 _mm_load_si128 1128 (RTE_CAST_PTR(const __m128i *, &rxdp[3].wb.status_error1)); 1129 rte_compiler_barrier(); 1130 const __m128i raw_desc_bh2 = 1131 _mm_load_si128 1132 (RTE_CAST_PTR(const __m128i *, &rxdp[2].wb.status_error1)); 1133 rte_compiler_barrier(); 1134 const __m128i raw_desc_bh1 = 1135 _mm_load_si128 1136 (RTE_CAST_PTR(const __m128i *, &rxdp[1].wb.status_error1)); 1137 rte_compiler_barrier(); 1138 const __m128i raw_desc_bh0 = 1139 _mm_load_si128 1140 (RTE_CAST_PTR(const __m128i *, &rxdp[0].wb.status_error1)); 1141 1142 __m256i raw_desc_bh6_7 = 1143 _mm256_inserti128_si256 1144 (_mm256_castsi128_si256(raw_desc_bh6), 1145 raw_desc_bh7, 1); 1146 __m256i raw_desc_bh4_5 = 1147 _mm256_inserti128_si256 1148 (_mm256_castsi128_si256(raw_desc_bh4), 1149 raw_desc_bh5, 1); 1150 __m256i raw_desc_bh2_3 = 1151 _mm256_inserti128_si256 1152 (_mm256_castsi128_si256(raw_desc_bh2), 1153 raw_desc_bh3, 1); 1154 __m256i raw_desc_bh0_1 = 1155 _mm256_inserti128_si256 1156 (_mm256_castsi128_si256(raw_desc_bh0), 1157 raw_desc_bh1, 1); 1158 1159 #ifdef IAVF_RX_RSS_OFFLOAD 1160 if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH) { 1161 /** 1162 * to shift the 32b RSS hash value to the 1163 * highest 32b of each 128b before mask 1164 */ 1165 __m256i rss_hash6_7 = 1166 _mm256_slli_epi64 1167 (raw_desc_bh6_7, 32); 1168 __m256i rss_hash4_5 = 1169 _mm256_slli_epi64 1170 (raw_desc_bh4_5, 32); 1171 __m256i rss_hash2_3 = 1172 _mm256_slli_epi64 1173 (raw_desc_bh2_3, 32); 1174 __m256i rss_hash0_1 = 1175 _mm256_slli_epi64 1176 (raw_desc_bh0_1, 32); 1177 1178 const __m256i rss_hash_msk = 1179 _mm256_set_epi32 1180 (0xFFFFFFFF, 0, 0, 0, 1181 0xFFFFFFFF, 0, 0, 0); 1182 1183 rss_hash6_7 = _mm256_and_si256 1184 (rss_hash6_7, rss_hash_msk); 1185 rss_hash4_5 = _mm256_and_si256 1186 (rss_hash4_5, rss_hash_msk); 1187 rss_hash2_3 = _mm256_and_si256 1188 
(rss_hash2_3, rss_hash_msk); 1189 rss_hash0_1 = _mm256_and_si256 1190 (rss_hash0_1, rss_hash_msk); 1191 1192 mb6_7 = _mm256_or_si256 1193 (mb6_7, rss_hash6_7); 1194 mb4_5 = _mm256_or_si256 1195 (mb4_5, rss_hash4_5); 1196 mb2_3 = _mm256_or_si256 1197 (mb2_3, rss_hash2_3); 1198 mb0_1 = _mm256_or_si256 1199 (mb0_1, rss_hash0_1); 1200 } 1201 1202 if (rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) { 1203 /* merge the status/error-1 bits into one register */ 1204 const __m256i status1_4_7 = 1205 _mm256_unpacklo_epi32 1206 (raw_desc_bh6_7, 1207 raw_desc_bh4_5); 1208 const __m256i status1_0_3 = 1209 _mm256_unpacklo_epi32 1210 (raw_desc_bh2_3, 1211 raw_desc_bh0_1); 1212 1213 const __m256i status1_0_7 = 1214 _mm256_unpacklo_epi64 1215 (status1_4_7, status1_0_3); 1216 1217 const __m256i l2tag2p_flag_mask = 1218 _mm256_set1_epi32 1219 (1 << IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S); 1220 1221 __m256i l2tag2p_flag_bits = 1222 _mm256_and_si256 1223 (status1_0_7, 1224 l2tag2p_flag_mask); 1225 1226 l2tag2p_flag_bits = 1227 _mm256_srli_epi32 1228 (l2tag2p_flag_bits, 1229 IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S); 1230 1231 const __m256i l2tag2_flags_shuf = 1232 _mm256_set_epi8 1233 (0, 0, 0, 0, 1234 0, 0, 0, 0, 1235 0, 0, 0, 0, 1236 0, 0, 1237 RTE_MBUF_F_RX_VLAN | 1238 RTE_MBUF_F_RX_VLAN_STRIPPED, 1239 0, 1240 /* end up 128-bits */ 1241 0, 0, 0, 0, 1242 0, 0, 0, 0, 1243 0, 0, 0, 0, 1244 0, 0, 1245 RTE_MBUF_F_RX_VLAN | 1246 RTE_MBUF_F_RX_VLAN_STRIPPED, 1247 0); 1248 1249 vlan_flags = 1250 _mm256_shuffle_epi8 1251 (l2tag2_flags_shuf, 1252 l2tag2p_flag_bits); 1253 1254 /* merge with vlan_flags */ 1255 mbuf_flags = _mm256_or_si256 1256 (mbuf_flags, 1257 vlan_flags); 1258 1259 /* L2TAG2_2 */ 1260 __m256i vlan_tci6_7 = 1261 _mm256_slli_si256 1262 (raw_desc_bh6_7, 4); 1263 __m256i vlan_tci4_5 = 1264 _mm256_slli_si256 1265 (raw_desc_bh4_5, 4); 1266 __m256i vlan_tci2_3 = 1267 _mm256_slli_si256 1268 (raw_desc_bh2_3, 4); 1269 __m256i vlan_tci0_1 = 1270 _mm256_slli_si256 1271 (raw_desc_bh0_1, 4); 1272 1273 const __m256i vlan_tci_msk = 1274 _mm256_set_epi32 1275 (0, 0xFFFF0000, 0, 0, 1276 0, 0xFFFF0000, 0, 0); 1277 1278 vlan_tci6_7 = _mm256_and_si256 1279 (vlan_tci6_7, 1280 vlan_tci_msk); 1281 vlan_tci4_5 = _mm256_and_si256 1282 (vlan_tci4_5, 1283 vlan_tci_msk); 1284 vlan_tci2_3 = _mm256_and_si256 1285 (vlan_tci2_3, 1286 vlan_tci_msk); 1287 vlan_tci0_1 = _mm256_and_si256 1288 (vlan_tci0_1, 1289 vlan_tci_msk); 1290 1291 mb6_7 = _mm256_or_si256 1292 (mb6_7, vlan_tci6_7); 1293 mb4_5 = _mm256_or_si256 1294 (mb4_5, vlan_tci4_5); 1295 mb2_3 = _mm256_or_si256 1296 (mb2_3, vlan_tci2_3); 1297 mb0_1 = _mm256_or_si256 1298 (mb0_1, vlan_tci0_1); 1299 } 1300 #endif /* IAVF_RX_RSS_OFFLOAD */ 1301 1302 #ifdef IAVF_RX_TS_OFFLOAD 1303 if (offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) { 1304 uint32_t mask = 0xFFFFFFFF; 1305 __m256i ts; 1306 __m256i ts_low = _mm256_setzero_si256(); 1307 __m256i ts_low1; 1308 __m256i ts_low2; 1309 __m256i max_ret; 1310 __m256i cmp_ret; 1311 uint8_t ret = 0; 1312 uint8_t shift = 8; 1313 __m256i ts_desp_mask = _mm256_set_epi32(mask, 0, 0, 0, mask, 0, 0, 0); 1314 __m256i cmp_mask = _mm256_set1_epi32(mask); 1315 __m256i ts_permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); 1316 1317 ts = _mm256_and_si256(raw_desc_bh0_1, ts_desp_mask); 1318 ts_low = _mm256_or_si256(ts_low, _mm256_srli_si256(ts, 3 * 4)); 1319 ts = _mm256_and_si256(raw_desc_bh2_3, ts_desp_mask); 1320 ts_low = _mm256_or_si256(ts_low, _mm256_srli_si256(ts, 2 * 4)); 1321 ts = _mm256_and_si256(raw_desc_bh4_5, ts_desp_mask); 1322 ts_low = 
_mm256_or_si256(ts_low, _mm256_srli_si256(ts, 4)); 1323 ts = _mm256_and_si256(raw_desc_bh6_7, ts_desp_mask); 1324 ts_low = _mm256_or_si256(ts_low, ts); 1325 1326 ts_low1 = _mm256_permutevar8x32_epi32(ts_low, ts_permute_mask); 1327 ts_low2 = _mm256_permutevar8x32_epi32(ts_low1, 1328 _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7)); 1329 ts_low2 = _mm256_and_si256(ts_low2, 1330 _mm256_set_epi32(mask, mask, mask, mask, mask, mask, mask, 0)); 1331 ts_low2 = _mm256_or_si256(ts_low2, hw_low_last); 1332 hw_low_last = _mm256_and_si256(ts_low1, 1333 _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, mask)); 1334 1335 *RTE_MBUF_DYNFIELD(rx_pkts[i + 0], 1336 iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 0); 1337 *RTE_MBUF_DYNFIELD(rx_pkts[i + 1], 1338 iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 1); 1339 *RTE_MBUF_DYNFIELD(rx_pkts[i + 2], 1340 iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 2); 1341 *RTE_MBUF_DYNFIELD(rx_pkts[i + 3], 1342 iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 3); 1343 *RTE_MBUF_DYNFIELD(rx_pkts[i + 4], 1344 iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 4); 1345 *RTE_MBUF_DYNFIELD(rx_pkts[i + 5], 1346 iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 5); 1347 *RTE_MBUF_DYNFIELD(rx_pkts[i + 6], 1348 iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 6); 1349 *RTE_MBUF_DYNFIELD(rx_pkts[i + 7], 1350 iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 7); 1351 1352 if (unlikely(is_tsinit)) { 1353 uint32_t in_timestamp; 1354 1355 if (iavf_get_phc_time(rxq)) 1356 PMD_DRV_LOG(ERR, "get physical time failed"); 1357 in_timestamp = *RTE_MBUF_DYNFIELD(rx_pkts[i + 0], 1358 iavf_timestamp_dynfield_offset, uint32_t *); 1359 rxq->phc_time = iavf_tstamp_convert_32b_64b(rxq->phc_time, in_timestamp); 1360 } 1361 1362 *RTE_MBUF_DYNFIELD(rx_pkts[i + 0], 1363 iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1364 *RTE_MBUF_DYNFIELD(rx_pkts[i + 1], 1365 iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1366 *RTE_MBUF_DYNFIELD(rx_pkts[i + 2], 1367 iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1368 *RTE_MBUF_DYNFIELD(rx_pkts[i + 3], 1369 iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1370 *RTE_MBUF_DYNFIELD(rx_pkts[i + 4], 1371 iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1372 *RTE_MBUF_DYNFIELD(rx_pkts[i + 5], 1373 iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1374 *RTE_MBUF_DYNFIELD(rx_pkts[i + 6], 1375 iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1376 *RTE_MBUF_DYNFIELD(rx_pkts[i + 7], 1377 iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32); 1378 1379 max_ret = _mm256_max_epu32(ts_low2, ts_low1); 1380 cmp_ret = _mm256_andnot_si256(_mm256_cmpeq_epi32(max_ret, ts_low1), cmp_mask); 1381 1382 if (_mm256_testz_si256(cmp_ret, cmp_mask)) { 1383 inflection_point = 0; 1384 } else { 1385 inflection_point = 1; 1386 while (shift > 1) { 1387 shift = shift >> 1; 1388 __m256i mask_low = _mm256_setzero_si256(); 1389 __m256i mask_high = _mm256_setzero_si256(); 1390 switch (shift) { 1391 case 4: 1392 mask_low = _mm256_set_epi32(0, 0, 0, 0, mask, mask, mask, mask); 1393 mask_high = _mm256_set_epi32(mask, mask, mask, mask, 0, 0, 0, 0); 
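						/*
						 * The shift==4 masks above select the low and high four
						 * lanes of cmp_ret.  cmp_ret has a non-zero lane wherever
						 * a timestamp is smaller than the one before it, i.e.
						 * where the 32-bit PHC value wrapped inside this burst.
						 * Each pass of the loop halves the window (8 -> 4 -> 2 ->
						 * 1 lanes), adding the half-width to inflection_point
						 * whenever the wrap lies in the upper half, so the switch
						 * further down can bump the high 32 bits of the timestamp
						 * for every packet from that point on.
						 */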
1394 break; 1395 case 2: 1396 mask_low = _mm256_srli_si256(cmp_mask, 2 * 4); 1397 mask_high = _mm256_slli_si256(cmp_mask, 2 * 4); 1398 break; 1399 case 1: 1400 mask_low = _mm256_srli_si256(cmp_mask, 1 * 4); 1401 mask_high = _mm256_slli_si256(cmp_mask, 1 * 4); 1402 break; 1403 } 1404 ret = _mm256_testz_si256(cmp_ret, mask_low); 1405 if (ret) { 1406 ret = _mm256_testz_si256(cmp_ret, mask_high); 1407 inflection_point += ret ? 0 : shift; 1408 cmp_mask = mask_high; 1409 } else { 1410 cmp_mask = mask_low; 1411 } 1412 } 1413 } 1414 mbuf_flags = _mm256_or_si256(mbuf_flags, 1415 _mm256_set1_epi32(iavf_timestamp_dynflag)); 1416 } 1417 #endif /* IAVF_RX_TS_OFFLOAD */ 1418 } /* if() on RSS hash or RX timestamp parsing */ 1419 #endif 1420 } 1421 #endif 1422 1423 /** 1424 * At this point, we have the 8 sets of flags in the low 16-bits 1425 * of each 32-bit value in vlan0. 1426 * We want to extract these, and merge them with the mbuf init 1427 * data so we can do a single write to the mbuf to set the flags 1428 * and all the other initialization fields. Extracting the 1429 * appropriate flags means that we have to do a shift and blend 1430 * for each mbuf before we do the write. However, we can also 1431 * add in the previously computed rx_descriptor fields to 1432 * make a single 256-bit write per mbuf 1433 */ 1434 /* check the structure matches expectations */ 1435 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != 1436 offsetof(struct rte_mbuf, rearm_data) + 8); 1437 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != 1438 RTE_ALIGN(offsetof(struct rte_mbuf, 1439 rearm_data), 1440 16)); 1441 /* build up data and do writes */ 1442 __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5, 1443 rearm6, rearm7; 1444 rearm6 = _mm256_blend_epi32(mbuf_init, 1445 _mm256_slli_si256(mbuf_flags, 8), 1446 0x04); 1447 rearm4 = _mm256_blend_epi32(mbuf_init, 1448 _mm256_slli_si256(mbuf_flags, 4), 1449 0x04); 1450 rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04); 1451 rearm0 = _mm256_blend_epi32(mbuf_init, 1452 _mm256_srli_si256(mbuf_flags, 4), 1453 0x04); 1454 /* permute to add in the rx_descriptor e.g. 
rss fields */ 1455 rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20); 1456 rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20); 1457 rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20); 1458 rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20); 1459 /* write to mbuf */ 1460 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data, 1461 rearm6); 1462 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data, 1463 rearm4); 1464 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, 1465 rearm2); 1466 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, 1467 rearm0); 1468 1469 /* repeat for the odd mbufs */ 1470 const __m256i odd_flags = 1471 _mm256_castsi128_si256 1472 (_mm256_extracti128_si256(mbuf_flags, 1)); 1473 rearm7 = _mm256_blend_epi32(mbuf_init, 1474 _mm256_slli_si256(odd_flags, 8), 1475 0x04); 1476 rearm5 = _mm256_blend_epi32(mbuf_init, 1477 _mm256_slli_si256(odd_flags, 4), 1478 0x04); 1479 rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04); 1480 rearm1 = _mm256_blend_epi32(mbuf_init, 1481 _mm256_srli_si256(odd_flags, 4), 1482 0x04); 1483 /* since odd mbufs are already in hi 128-bits use blend */ 1484 rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0); 1485 rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0); 1486 rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0); 1487 rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0); 1488 /* again write to mbufs */ 1489 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data, 1490 rearm7); 1491 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data, 1492 rearm5); 1493 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, 1494 rearm3); 1495 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, 1496 rearm1); 1497 1498 /* extract and record EOP bit */ 1499 if (split_packet) { 1500 const __m128i eop_mask = 1501 _mm_set1_epi16(1 << 1502 IAVF_RX_FLEX_DESC_STATUS0_EOF_S); 1503 const __m256i eop_bits256 = _mm256_and_si256(status0_7, 1504 eop_check); 1505 /* pack status bits into a single 128-bit register */ 1506 const __m128i eop_bits = 1507 _mm_packus_epi32 1508 (_mm256_castsi256_si128(eop_bits256), 1509 _mm256_extractf128_si256(eop_bits256, 1510 1)); 1511 /** 1512 * flip bits, and mask out the EOP bit, which is now 1513 * a split-packet bit i.e. !EOP, rather than EOP one. 1514 */ 1515 __m128i split_bits = _mm_andnot_si128(eop_bits, 1516 eop_mask); 1517 /** 1518 * eop bits are out of order, so we need to shuffle them 1519 * back into order again. In doing so, only use low 8 1520 * bits, which acts like another pack instruction 1521 * The original order is (hi->lo): 1,3,5,7,0,2,4,6 1522 * [Since we use epi8, the 16-bit positions are 1523 * multiplied by 2 in the eop_shuffle value.] 
1524 */ 1525 __m128i eop_shuffle = 1526 _mm_set_epi8(/* zero hi 64b */ 1527 0xFF, 0xFF, 0xFF, 0xFF, 1528 0xFF, 0xFF, 0xFF, 0xFF, 1529 /* move values to lo 64b */ 1530 8, 0, 10, 2, 1531 12, 4, 14, 6); 1532 split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle); 1533 *(uint64_t *)split_packet = 1534 _mm_cvtsi128_si64(split_bits); 1535 split_packet += IAVF_DESCS_PER_LOOP_AVX; 1536 } 1537 1538 /* perform dd_check */ 1539 status0_7 = _mm256_and_si256(status0_7, dd_check); 1540 status0_7 = _mm256_packs_epi32(status0_7, 1541 _mm256_setzero_si256()); 1542 1543 uint64_t burst = rte_popcount64 1544 (_mm_cvtsi128_si64 1545 (_mm256_extracti128_si256 1546 (status0_7, 1))); 1547 burst += rte_popcount64 1548 (_mm_cvtsi128_si64 1549 (_mm256_castsi256_si128(status0_7))); 1550 received += burst; 1551 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC 1552 #ifdef IAVF_RX_TS_OFFLOAD 1553 if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) { 1554 inflection_point = (inflection_point <= burst) ? inflection_point : 0; 1555 switch (inflection_point) { 1556 case 1: 1557 *RTE_MBUF_DYNFIELD(rx_pkts[i + 0], 1558 iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1559 /* fallthrough */ 1560 case 2: 1561 *RTE_MBUF_DYNFIELD(rx_pkts[i + 1], 1562 iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1563 /* fallthrough */ 1564 case 3: 1565 *RTE_MBUF_DYNFIELD(rx_pkts[i + 2], 1566 iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1567 /* fallthrough */ 1568 case 4: 1569 *RTE_MBUF_DYNFIELD(rx_pkts[i + 3], 1570 iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1571 /* fallthrough */ 1572 case 5: 1573 *RTE_MBUF_DYNFIELD(rx_pkts[i + 4], 1574 iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1575 /* fallthrough */ 1576 case 6: 1577 *RTE_MBUF_DYNFIELD(rx_pkts[i + 5], 1578 iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1579 /* fallthrough */ 1580 case 7: 1581 *RTE_MBUF_DYNFIELD(rx_pkts[i + 6], 1582 iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1583 /* fallthrough */ 1584 case 8: 1585 *RTE_MBUF_DYNFIELD(rx_pkts[i + 7], 1586 iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1; 1587 rxq->phc_time += (uint64_t)1 << 32; 1588 /* fallthrough */ 1589 case 0: 1590 break; 1591 default: 1592 PMD_DRV_LOG(ERR, "invalid inflection point for rx timestamp"); 1593 break; 1594 } 1595 1596 rxq->hw_time_update = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000); 1597 } 1598 #endif 1599 #endif 1600 if (burst != IAVF_DESCS_PER_LOOP_AVX) 1601 break; 1602 } 1603 1604 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC 1605 #ifdef IAVF_RX_TS_OFFLOAD 1606 if (received > 0 && (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP)) 1607 rxq->phc_time = *RTE_MBUF_DYNFIELD(rx_pkts[received - 1], 1608 iavf_timestamp_dynfield_offset, rte_mbuf_timestamp_t *); 1609 #endif 1610 #endif 1611 1612 /* update tail pointers */ 1613 rxq->rx_tail += received; 1614 rxq->rx_tail &= (rxq->nb_rx_desc - 1); 1615 if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */ 1616 rxq->rx_tail--; 1617 received--; 1618 } 1619 rxq->rxrearm_nb += received; 1620 return received; 1621 } 1622 1623 /** 1624 * Notice: 1625 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet 1626 */ 1627 uint16_t 1628 iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts, 1629 uint16_t nb_pkts) 1630 { 1631 return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, 1632 NULL, false); 1633 } 1634 1635 /** 1636 * Notice: 1637 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet 1638 */ 1639 uint16_t 1640 iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue, struct rte_mbuf 
**rx_pkts, 1641 uint16_t nb_pkts) 1642 { 1643 return _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rx_queue, rx_pkts, 1644 nb_pkts, NULL, false); 1645 } 1646 1647 /** 1648 * vPMD receive routine that reassembles single burst of 32 scattered packets 1649 * Notice: 1650 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet 1651 */ 1652 static __rte_always_inline uint16_t 1653 iavf_recv_scattered_burst_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts, 1654 uint16_t nb_pkts, bool offload) 1655 { 1656 struct iavf_rx_queue *rxq = rx_queue; 1657 uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0}; 1658 1659 /* get some new buffers */ 1660 uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts, 1661 split_flags, offload); 1662 if (nb_bufs == 0) 1663 return 0; 1664 1665 /* happy day case, full burst + no packets to be joined */ 1666 const uint64_t *split_fl64 = (uint64_t *)split_flags; 1667 1668 if (!rxq->pkt_first_seg && 1669 split_fl64[0] == 0 && split_fl64[1] == 0 && 1670 split_fl64[2] == 0 && split_fl64[3] == 0) 1671 return nb_bufs; 1672 1673 /* reassemble any packets that need reassembly*/ 1674 unsigned int i = 0; 1675 1676 if (!rxq->pkt_first_seg) { 1677 /* find the first split flag, and only reassemble then*/ 1678 while (i < nb_bufs && !split_flags[i]) 1679 i++; 1680 if (i == nb_bufs) 1681 return nb_bufs; 1682 rxq->pkt_first_seg = rx_pkts[i]; 1683 } 1684 return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i], 1685 &rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len); 1686 } 1687 1688 /** 1689 * vPMD receive routine that reassembles scattered packets. 1690 * Main receive routine that can handle arbitrary burst sizes 1691 * Notice: 1692 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet 1693 */ 1694 static __rte_always_inline uint16_t 1695 iavf_recv_scattered_pkts_vec_avx512_cmn(void *rx_queue, struct rte_mbuf **rx_pkts, 1696 uint16_t nb_pkts, bool offload) 1697 { 1698 uint16_t retval = 0; 1699 1700 while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) { 1701 uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue, 1702 rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST, offload); 1703 retval += burst; 1704 nb_pkts -= burst; 1705 if (burst < IAVF_VPMD_RX_MAX_BURST) 1706 return retval; 1707 } 1708 return retval + iavf_recv_scattered_burst_vec_avx512(rx_queue, 1709 rx_pkts + retval, nb_pkts, offload); 1710 } 1711 1712 uint16_t 1713 iavf_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts, 1714 uint16_t nb_pkts) 1715 { 1716 return iavf_recv_scattered_pkts_vec_avx512_cmn(rx_queue, rx_pkts, 1717 nb_pkts, false); 1718 } 1719 1720 /** 1721 * vPMD receive routine that reassembles single burst of 1722 * 32 scattered packets for flex RxD 1723 * Notice: 1724 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet 1725 */ 1726 static __rte_always_inline uint16_t 1727 iavf_recv_scattered_burst_vec_avx512_flex_rxd(void *rx_queue, 1728 struct rte_mbuf **rx_pkts, 1729 uint16_t nb_pkts, 1730 bool offload) 1731 { 1732 struct iavf_rx_queue *rxq = rx_queue; 1733 uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0}; 1734 1735 /* get some new buffers */ 1736 uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rxq, 1737 rx_pkts, nb_pkts, split_flags, offload); 1738 if (nb_bufs == 0) 1739 return 0; 1740 1741 /* happy day case, full burst + no packets to be joined */ 1742 const uint64_t *split_fl64 = (uint64_t *)split_flags; 1743 1744 if (!rxq->pkt_first_seg && 1745 split_fl64[0] == 0 && split_fl64[1] == 0 && 1746 split_fl64[2] == 0 && split_fl64[3] == 0) 1747 
return nb_bufs; 1748 1749 /* reassemble any packets that need reassembly*/ 1750 unsigned int i = 0; 1751 1752 if (!rxq->pkt_first_seg) { 1753 /* find the first split flag, and only reassemble then*/ 1754 while (i < nb_bufs && !split_flags[i]) 1755 i++; 1756 if (i == nb_bufs) 1757 return nb_bufs; 1758 rxq->pkt_first_seg = rx_pkts[i]; 1759 } 1760 return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i], 1761 &rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len); 1762 } 1763 1764 /** 1765 * vPMD receive routine that reassembles scattered packets for flex RxD. 1766 * Main receive routine that can handle arbitrary burst sizes 1767 * Notice: 1768 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet 1769 */ 1770 static __rte_always_inline uint16_t 1771 iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(void *rx_queue, 1772 struct rte_mbuf **rx_pkts, 1773 uint16_t nb_pkts, 1774 bool offload) 1775 { 1776 uint16_t retval = 0; 1777 1778 while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) { 1779 uint16_t burst = 1780 iavf_recv_scattered_burst_vec_avx512_flex_rxd 1781 (rx_queue, rx_pkts + retval, 1782 IAVF_VPMD_RX_MAX_BURST, offload); 1783 retval += burst; 1784 nb_pkts -= burst; 1785 if (burst < IAVF_VPMD_RX_MAX_BURST) 1786 return retval; 1787 } 1788 return retval + iavf_recv_scattered_burst_vec_avx512_flex_rxd(rx_queue, 1789 rx_pkts + retval, nb_pkts, offload); 1790 } 1791 1792 uint16_t 1793 iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue, 1794 struct rte_mbuf **rx_pkts, 1795 uint16_t nb_pkts) 1796 { 1797 return iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(rx_queue, 1798 rx_pkts, 1799 nb_pkts, 1800 false); 1801 } 1802 1803 uint16_t 1804 iavf_recv_pkts_vec_avx512_offload(void *rx_queue, struct rte_mbuf **rx_pkts, 1805 uint16_t nb_pkts) 1806 { 1807 return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, 1808 nb_pkts, NULL, true); 1809 } 1810 1811 uint16_t 1812 iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue, 1813 struct rte_mbuf **rx_pkts, 1814 uint16_t nb_pkts) 1815 { 1816 return iavf_recv_scattered_pkts_vec_avx512_cmn(rx_queue, rx_pkts, 1817 nb_pkts, true); 1818 } 1819 1820 uint16_t 1821 iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue, 1822 struct rte_mbuf **rx_pkts, 1823 uint16_t nb_pkts) 1824 { 1825 return _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rx_queue, 1826 rx_pkts, 1827 nb_pkts, 1828 NULL, 1829 true); 1830 } 1831 1832 uint16_t 1833 iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload(void *rx_queue, 1834 struct rte_mbuf **rx_pkts, 1835 uint16_t nb_pkts) 1836 { 1837 return iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(rx_queue, 1838 rx_pkts, 1839 nb_pkts, 1840 true); 1841 } 1842 1843 static __rte_always_inline void 1844 tx_backlog_entry_avx512(struct ci_tx_entry_vec *txep, 1845 struct rte_mbuf **tx_pkts, uint16_t nb_pkts) 1846 { 1847 int i; 1848 1849 for (i = 0; i < (int)nb_pkts; ++i) 1850 txep[i].mbuf = tx_pkts[i]; 1851 } 1852 1853 static __rte_always_inline void 1854 iavf_vtx1(volatile struct iavf_tx_desc *txdp, 1855 struct rte_mbuf *pkt, uint64_t flags, 1856 bool offload) 1857 { 1858 uint64_t high_qw = 1859 (IAVF_TX_DESC_DTYPE_DATA | 1860 ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT) | 1861 ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT)); 1862 if (offload) 1863 iavf_txd_enable_offload(pkt, &high_qw); 1864 1865 __m128i descriptor = _mm_set_epi64x(high_qw, 1866 pkt->buf_iova + pkt->data_off); 1867 _mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor); 1868 } 1869 1870 #define IAVF_TX_LEN_MASK 0xAA 1871 #define IAVF_TX_OFF_MASK 
0x55 1872 static __rte_always_inline void 1873 iavf_vtx(volatile struct iavf_tx_desc *txdp, 1874 struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags, 1875 bool offload) 1876 { 1877 const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA | 1878 ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT)); 1879 1880 /* if unaligned on 32-bit boundary, do one to align */ 1881 if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) { 1882 iavf_vtx1(txdp, *pkt, flags, offload); 1883 nb_pkts--, txdp++, pkt++; 1884 } 1885 1886 /* do 4 at a time while possible, in bursts */ 1887 for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) { 1888 uint64_t hi_qw3 = 1889 hi_qw_tmpl | 1890 ((uint64_t)pkt[3]->data_len << 1891 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); 1892 uint64_t hi_qw2 = 1893 hi_qw_tmpl | 1894 ((uint64_t)pkt[2]->data_len << 1895 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); 1896 uint64_t hi_qw1 = 1897 hi_qw_tmpl | 1898 ((uint64_t)pkt[1]->data_len << 1899 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); 1900 uint64_t hi_qw0 = 1901 hi_qw_tmpl | 1902 ((uint64_t)pkt[0]->data_len << 1903 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); 1904 if (offload) { 1905 iavf_txd_enable_offload(pkt[3], &hi_qw3); 1906 iavf_txd_enable_offload(pkt[2], &hi_qw2); 1907 iavf_txd_enable_offload(pkt[1], &hi_qw1); 1908 iavf_txd_enable_offload(pkt[0], &hi_qw0); 1909 } 1910 1911 __m512i desc0_3 = 1912 _mm512_set_epi64 1913 (hi_qw3, 1914 pkt[3]->buf_iova + pkt[3]->data_off, 1915 hi_qw2, 1916 pkt[2]->buf_iova + pkt[2]->data_off, 1917 hi_qw1, 1918 pkt[1]->buf_iova + pkt[1]->data_off, 1919 hi_qw0, 1920 pkt[0]->buf_iova + pkt[0]->data_off); 1921 _mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3); 1922 } 1923 1924 /* do any last ones */ 1925 while (nb_pkts) { 1926 iavf_vtx1(txdp, *pkt, flags, offload); 1927 txdp++, pkt++, nb_pkts--; 1928 } 1929 } 1930 1931 static __rte_always_inline void 1932 iavf_fill_ctx_desc_tunneling_avx512(uint64_t *low_ctx_qw, struct rte_mbuf *pkt) 1933 { 1934 if (pkt->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) { 1935 uint64_t eip_typ = IAVF_TX_CTX_DESC_EIPT_NONE; 1936 uint64_t eip_len = 0; 1937 uint64_t eip_noinc = 0; 1938 /* Default - IP_ID is increment in each segment of LSO */ 1939 1940 switch (pkt->ol_flags & (RTE_MBUF_F_TX_OUTER_IPV4 | 1941 RTE_MBUF_F_TX_OUTER_IPV6 | 1942 RTE_MBUF_F_TX_OUTER_IP_CKSUM)) { 1943 case RTE_MBUF_F_TX_OUTER_IPV4: 1944 eip_typ = IAVF_TX_CTX_DESC_EIPT_IPV4_NO_CHECKSUM_OFFLOAD; 1945 eip_len = pkt->outer_l3_len >> 2; 1946 break; 1947 case RTE_MBUF_F_TX_OUTER_IPV4 | RTE_MBUF_F_TX_OUTER_IP_CKSUM: 1948 eip_typ = IAVF_TX_CTX_DESC_EIPT_IPV4_CHECKSUM_OFFLOAD; 1949 eip_len = pkt->outer_l3_len >> 2; 1950 break; 1951 case RTE_MBUF_F_TX_OUTER_IPV6: 1952 eip_typ = IAVF_TX_CTX_DESC_EIPT_IPV6; 1953 eip_len = pkt->outer_l3_len >> 2; 1954 break; 1955 } 1956 1957 /* L4TUNT: L4 Tunneling Type */ 1958 switch (pkt->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) { 1959 case RTE_MBUF_F_TX_TUNNEL_IPIP: 1960 /* for non UDP / GRE tunneling, set to 00b */ 1961 break; 1962 case RTE_MBUF_F_TX_TUNNEL_VXLAN: 1963 case RTE_MBUF_F_TX_TUNNEL_VXLAN_GPE: 1964 case RTE_MBUF_F_TX_TUNNEL_GTP: 1965 case RTE_MBUF_F_TX_TUNNEL_GENEVE: 1966 eip_typ |= IAVF_TXD_CTX_UDP_TUNNELING; 1967 break; 1968 case RTE_MBUF_F_TX_TUNNEL_GRE: 1969 eip_typ |= IAVF_TXD_CTX_GRE_TUNNELING; 1970 break; 1971 default: 1972 PMD_TX_LOG(ERR, "Tunnel type not supported"); 1973 return; 1974 } 1975 1976 /* L4TUNLEN: L4 Tunneling Length, in Words 1977 * 1978 * We depend on app to set rte_mbuf.l2_len correctly. 
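		 * (As a concrete example of the rules below: for a VXLAN
		 * packet, i.e. MAC in UDP, l2_len would typically be the
		 * outer UDP header (8 B) + VXLAN header (8 B) + inner
		 * Ethernet header (14 B) = 30 bytes, i.e. 15 words.)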
1979 * For IP in GRE it should be set to the length of the GRE 1980 * header; 1981 * For MAC in GRE or MAC in UDP it should be set to the length 1982 * of the GRE or UDP headers plus the inner MAC up to including 1983 * its last Ethertype. 1984 * If MPLS labels exists, it should include them as well. 1985 */ 1986 eip_typ |= (pkt->l2_len >> 1) << IAVF_TXD_CTX_QW0_NATLEN_SHIFT; 1987 1988 /** 1989 * Calculate the tunneling UDP checksum. 1990 * Shall be set only if L4TUNT = 01b and EIPT is not zero 1991 */ 1992 if ((eip_typ & (IAVF_TX_CTX_EXT_IP_IPV4 | 1993 IAVF_TX_CTX_EXT_IP_IPV6 | 1994 IAVF_TX_CTX_EXT_IP_IPV4_NO_CSUM)) && 1995 (eip_typ & IAVF_TXD_CTX_UDP_TUNNELING) && 1996 (pkt->ol_flags & RTE_MBUF_F_TX_OUTER_UDP_CKSUM)) 1997 eip_typ |= IAVF_TXD_CTX_QW0_L4T_CS_MASK; 1998 1999 *low_ctx_qw = eip_typ << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIPT_SHIFT | 2000 eip_len << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIPLEN_SHIFT | 2001 eip_noinc << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIP_NOINC_SHIFT; 2002 2003 } else { 2004 *low_ctx_qw = 0; 2005 } 2006 } 2007 2008 static inline void 2009 iavf_fill_ctx_desc_tunnelling_field(volatile uint64_t *qw0, 2010 const struct rte_mbuf *m) 2011 { 2012 uint64_t eip_typ = IAVF_TX_CTX_DESC_EIPT_NONE; 2013 uint64_t eip_len = 0; 2014 uint64_t eip_noinc = 0; 2015 /* Default - IP_ID is increment in each segment of LSO */ 2016 2017 switch (m->ol_flags & (RTE_MBUF_F_TX_OUTER_IPV4 | 2018 RTE_MBUF_F_TX_OUTER_IPV6 | 2019 RTE_MBUF_F_TX_OUTER_IP_CKSUM)) { 2020 case RTE_MBUF_F_TX_OUTER_IPV4: 2021 eip_typ = IAVF_TX_CTX_DESC_EIPT_IPV4_NO_CHECKSUM_OFFLOAD; 2022 eip_len = m->outer_l3_len >> 2; 2023 break; 2024 case RTE_MBUF_F_TX_OUTER_IPV4 | RTE_MBUF_F_TX_OUTER_IP_CKSUM: 2025 eip_typ = IAVF_TX_CTX_DESC_EIPT_IPV4_CHECKSUM_OFFLOAD; 2026 eip_len = m->outer_l3_len >> 2; 2027 break; 2028 case RTE_MBUF_F_TX_OUTER_IPV6: 2029 eip_typ = IAVF_TX_CTX_DESC_EIPT_IPV6; 2030 eip_len = m->outer_l3_len >> 2; 2031 break; 2032 } 2033 2034 /* L4TUNT: L4 Tunneling Type */ 2035 switch (m->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) { 2036 case RTE_MBUF_F_TX_TUNNEL_IPIP: 2037 /* for non UDP / GRE tunneling, set to 00b */ 2038 break; 2039 case RTE_MBUF_F_TX_TUNNEL_VXLAN: 2040 case RTE_MBUF_F_TX_TUNNEL_VXLAN_GPE: 2041 case RTE_MBUF_F_TX_TUNNEL_GTP: 2042 case RTE_MBUF_F_TX_TUNNEL_GENEVE: 2043 eip_typ |= IAVF_TXD_CTX_UDP_TUNNELING; 2044 break; 2045 case RTE_MBUF_F_TX_TUNNEL_GRE: 2046 eip_typ |= IAVF_TXD_CTX_GRE_TUNNELING; 2047 break; 2048 default: 2049 PMD_TX_LOG(ERR, "Tunnel type not supported"); 2050 return; 2051 } 2052 2053 /* L4TUNLEN: L4 Tunneling Length, in Words 2054 * 2055 * We depend on app to set rte_mbuf.l2_len correctly. 2056 * For IP in GRE it should be set to the length of the GRE 2057 * header; 2058 * For MAC in GRE or MAC in UDP it should be set to the length 2059 * of the GRE or UDP headers plus the inner MAC up to including 2060 * its last Ethertype. 2061 * If MPLS labels exists, it should include them as well. 2062 */ 2063 eip_typ |= (m->l2_len >> 1) << IAVF_TXD_CTX_QW0_NATLEN_SHIFT; 2064 2065 /** 2066 * Calculate the tunneling UDP checksum. 
2067 * Shall be set only if L4TUNT = 01b and EIPT is not zero 2068 */ 2069 if ((eip_typ & (IAVF_TX_CTX_EXT_IP_IPV6 | 2070 IAVF_TX_CTX_EXT_IP_IPV4 | 2071 IAVF_TX_CTX_EXT_IP_IPV4_NO_CSUM)) && 2072 (eip_typ & IAVF_TXD_CTX_UDP_TUNNELING) && 2073 (m->ol_flags & RTE_MBUF_F_TX_OUTER_UDP_CKSUM)) 2074 eip_typ |= IAVF_TXD_CTX_QW0_L4T_CS_MASK; 2075 2076 *qw0 = eip_typ << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIPT_SHIFT | 2077 eip_len << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIPLEN_SHIFT | 2078 eip_noinc << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIP_NOINC_SHIFT; 2079 } 2080 2081 static __rte_always_inline void 2082 ctx_vtx1(volatile struct iavf_tx_desc *txdp, struct rte_mbuf *pkt, 2083 uint64_t flags, bool offload, uint8_t vlan_flag) 2084 { 2085 uint64_t high_ctx_qw = IAVF_TX_DESC_DTYPE_CONTEXT; 2086 uint64_t low_ctx_qw = 0; 2087 2088 if (((pkt->ol_flags & RTE_MBUF_F_TX_VLAN) || offload)) { 2089 if (offload) 2090 iavf_fill_ctx_desc_tunneling_avx512(&low_ctx_qw, pkt); 2091 if ((pkt->ol_flags & RTE_MBUF_F_TX_VLAN) || 2092 (vlan_flag & IAVF_TX_FLAGS_VLAN_TAG_LOC_L2TAG2)) { 2093 high_ctx_qw |= IAVF_TX_CTX_DESC_IL2TAG2 << IAVF_TXD_CTX_QW1_CMD_SHIFT; 2094 low_ctx_qw |= (uint64_t)pkt->vlan_tci << IAVF_TXD_CTX_QW0_L2TAG2_PARAM; 2095 } 2096 } 2097 if (IAVF_CHECK_TX_LLDP(pkt)) 2098 high_ctx_qw |= IAVF_TX_CTX_DESC_SWTCH_UPLINK 2099 << IAVF_TXD_CTX_QW1_CMD_SHIFT; 2100 uint64_t high_data_qw = (IAVF_TX_DESC_DTYPE_DATA | 2101 ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT) | 2102 ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT)); 2103 if (offload) 2104 iavf_txd_enable_offload(pkt, &high_data_qw); 2105 2106 __m256i ctx_data_desc = _mm256_set_epi64x(high_data_qw, pkt->buf_iova + pkt->data_off, 2107 high_ctx_qw, low_ctx_qw); 2108 2109 _mm256_storeu_si256(RTE_CAST_PTR(__m256i *, txdp), ctx_data_desc); 2110 } 2111 2112 static __rte_always_inline void 2113 ctx_vtx(volatile struct iavf_tx_desc *txdp, 2114 struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags, 2115 bool offload, uint8_t vlan_flag) 2116 { 2117 uint64_t hi_data_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA | 2118 ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT)); 2119 2120 /* if unaligned on 32-bit boundary, do one to align */ 2121 if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) { 2122 ctx_vtx1(txdp, *pkt, flags, offload, vlan_flag); 2123 nb_pkts--, txdp++, pkt++; 2124 } 2125 2126 for (; nb_pkts > 1; txdp += 4, pkt += 2, nb_pkts -= 2) { 2127 uint64_t hi_ctx_qw1 = IAVF_TX_DESC_DTYPE_CONTEXT; 2128 uint64_t hi_ctx_qw0 = IAVF_TX_DESC_DTYPE_CONTEXT; 2129 uint64_t low_ctx_qw1 = 0; 2130 uint64_t low_ctx_qw0 = 0; 2131 uint64_t hi_data_qw1 = 0; 2132 uint64_t hi_data_qw0 = 0; 2133 2134 hi_data_qw1 = hi_data_qw_tmpl | 2135 ((uint64_t)pkt[1]->data_len << 2136 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); 2137 hi_data_qw0 = hi_data_qw_tmpl | 2138 ((uint64_t)pkt[0]->data_len << 2139 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); 2140 2141 if (pkt[1]->ol_flags & RTE_MBUF_F_TX_VLAN) { 2142 if (vlan_flag & IAVF_TX_FLAGS_VLAN_TAG_LOC_L2TAG2) { 2143 hi_ctx_qw1 |= 2144 IAVF_TX_CTX_DESC_IL2TAG2 << IAVF_TXD_CTX_QW1_CMD_SHIFT; 2145 low_ctx_qw1 |= 2146 (uint64_t)pkt[1]->vlan_tci << IAVF_TXD_CTX_QW0_L2TAG2_PARAM; 2147 } else { 2148 hi_data_qw1 |= 2149 (uint64_t)pkt[1]->vlan_tci << IAVF_TXD_QW1_L2TAG1_SHIFT; 2150 } 2151 } 2152 if (IAVF_CHECK_TX_LLDP(pkt[1])) 2153 hi_ctx_qw1 |= IAVF_TX_CTX_DESC_SWTCH_UPLINK 2154 << IAVF_TXD_CTX_QW1_CMD_SHIFT; 2155 2156 if (pkt[0]->ol_flags & RTE_MBUF_F_TX_VLAN) { 2157 if (vlan_flag & IAVF_TX_FLAGS_VLAN_TAG_LOC_L2TAG2) { 2158 hi_ctx_qw0 |= 2159 IAVF_TX_CTX_DESC_IL2TAG2 << IAVF_TXD_CTX_QW1_CMD_SHIFT; 2160 low_ctx_qw0 |= 
2161 (uint64_t)pkt[0]->vlan_tci << IAVF_TXD_CTX_QW0_L2TAG2_PARAM; 2162 } else { 2163 hi_data_qw0 |= 2164 (uint64_t)pkt[0]->vlan_tci << IAVF_TXD_QW1_L2TAG1_SHIFT; 2165 } 2166 } 2167 if (IAVF_CHECK_TX_LLDP(pkt[0])) 2168 hi_ctx_qw0 |= IAVF_TX_CTX_DESC_SWTCH_UPLINK 2169 << IAVF_TXD_CTX_QW1_CMD_SHIFT; 2170 2171 if (offload) { 2172 iavf_txd_enable_offload(pkt[1], &hi_data_qw1); 2173 iavf_txd_enable_offload(pkt[0], &hi_data_qw0); 2174 iavf_fill_ctx_desc_tunnelling_field(&low_ctx_qw1, pkt[1]); 2175 iavf_fill_ctx_desc_tunnelling_field(&low_ctx_qw0, pkt[0]); 2176 } 2177 2178 __m512i desc0_3 = 2179 _mm512_set_epi64 2180 (hi_data_qw1, pkt[1]->buf_iova + pkt[1]->data_off, 2181 hi_ctx_qw1, low_ctx_qw1, 2182 hi_data_qw0, pkt[0]->buf_iova + pkt[0]->data_off, 2183 hi_ctx_qw0, low_ctx_qw0); 2184 _mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3); 2185 } 2186 2187 if (nb_pkts) 2188 ctx_vtx1(txdp, *pkt, flags, offload, vlan_flag); 2189 } 2190 2191 static __rte_always_inline uint16_t 2192 iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, 2193 uint16_t nb_pkts, bool offload) 2194 { 2195 struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue; 2196 volatile struct iavf_tx_desc *txdp; 2197 struct ci_tx_entry_vec *txep; 2198 uint16_t n, nb_commit, tx_id; 2199 /* bit2 is reserved and must be set to 1 according to Spec */ 2200 uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC; 2201 uint64_t rs = IAVF_TX_DESC_CMD_RS | flags; 2202 2203 if (txq->nb_tx_free < txq->tx_free_thresh) 2204 ci_tx_free_bufs_vec(txq, iavf_tx_desc_done, false); 2205 2206 nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts); 2207 if (unlikely(nb_pkts == 0)) 2208 return 0; 2209 nb_commit = nb_pkts; 2210 2211 tx_id = txq->tx_tail; 2212 txdp = &txq->iavf_tx_ring[tx_id]; 2213 txep = (void *)txq->sw_ring; 2214 txep += tx_id; 2215 2216 txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts); 2217 2218 n = (uint16_t)(txq->nb_tx_desc - tx_id); 2219 if (nb_commit >= n) { 2220 tx_backlog_entry_avx512(txep, tx_pkts, n); 2221 2222 iavf_vtx(txdp, tx_pkts, n - 1, flags, offload); 2223 tx_pkts += (n - 1); 2224 txdp += (n - 1); 2225 2226 iavf_vtx1(txdp, *tx_pkts++, rs, offload); 2227 2228 nb_commit = (uint16_t)(nb_commit - n); 2229 2230 tx_id = 0; 2231 txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1); 2232 2233 /* avoid reach the end of ring */ 2234 txdp = &txq->iavf_tx_ring[tx_id]; 2235 txep = (void *)txq->sw_ring; 2236 txep += tx_id; 2237 } 2238 2239 tx_backlog_entry_avx512(txep, tx_pkts, nb_commit); 2240 2241 iavf_vtx(txdp, tx_pkts, nb_commit, flags, offload); 2242 2243 tx_id = (uint16_t)(tx_id + nb_commit); 2244 if (tx_id > txq->tx_next_rs) { 2245 txq->iavf_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |= 2246 rte_cpu_to_le_64(((uint64_t)IAVF_TX_DESC_CMD_RS) << 2247 IAVF_TXD_QW1_CMD_SHIFT); 2248 txq->tx_next_rs = 2249 (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh); 2250 } 2251 2252 txq->tx_tail = tx_id; 2253 2254 IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail); 2255 2256 return nb_pkts; 2257 } 2258 2259 static __rte_always_inline uint16_t 2260 iavf_xmit_fixed_burst_vec_avx512_ctx(void *tx_queue, struct rte_mbuf **tx_pkts, 2261 uint16_t nb_pkts, bool offload) 2262 { 2263 struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue; 2264 volatile struct iavf_tx_desc *txdp; 2265 struct ci_tx_entry_vec *txep; 2266 uint16_t n, nb_commit, nb_mbuf, tx_id; 2267 /* bit2 is reserved and must be set to 1 according to Spec */ 2268 uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC; 2269 uint64_t rs = 
IAVF_TX_DESC_CMD_RS | flags; 2270 2271 if (txq->nb_tx_free < txq->tx_free_thresh) 2272 ci_tx_free_bufs_vec(txq, iavf_tx_desc_done, true); 2273 2274 nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts << 1); 2275 nb_commit &= 0xFFFE; 2276 if (unlikely(nb_commit == 0)) 2277 return 0; 2278 2279 nb_pkts = nb_commit >> 1; 2280 tx_id = txq->tx_tail; 2281 txdp = &txq->iavf_tx_ring[tx_id]; 2282 txep = (void *)txq->sw_ring; 2283 txep += (tx_id >> 1); 2284 2285 txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_commit); 2286 n = (uint16_t)(txq->nb_tx_desc - tx_id); 2287 2288 if (n != 0 && nb_commit >= n) { 2289 nb_mbuf = n >> 1; 2290 tx_backlog_entry_avx512(txep, tx_pkts, nb_mbuf); 2291 2292 ctx_vtx(txdp, tx_pkts, nb_mbuf - 1, flags, offload, txq->vlan_flag); 2293 tx_pkts += (nb_mbuf - 1); 2294 txdp += (n - 2); 2295 ctx_vtx1(txdp, *tx_pkts++, rs, offload, txq->vlan_flag); 2296 2297 nb_commit = (uint16_t)(nb_commit - n); 2298 2299 txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1); 2300 tx_id = 0; 2301 /* avoid reach the end of ring */ 2302 txdp = txq->iavf_tx_ring; 2303 txep = (void *)txq->sw_ring; 2304 } 2305 2306 nb_mbuf = nb_commit >> 1; 2307 tx_backlog_entry_avx512(txep, tx_pkts, nb_mbuf); 2308 2309 ctx_vtx(txdp, tx_pkts, nb_mbuf, flags, offload, txq->vlan_flag); 2310 tx_id = (uint16_t)(tx_id + nb_commit); 2311 2312 if (tx_id > txq->tx_next_rs) { 2313 txq->iavf_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |= 2314 rte_cpu_to_le_64(((uint64_t)IAVF_TX_DESC_CMD_RS) << 2315 IAVF_TXD_QW1_CMD_SHIFT); 2316 txq->tx_next_rs = 2317 (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh); 2318 } 2319 2320 txq->tx_tail = tx_id; 2321 2322 IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail); 2323 return nb_pkts; 2324 } 2325 2326 static __rte_always_inline uint16_t 2327 iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts, 2328 uint16_t nb_pkts, bool offload) 2329 { 2330 uint16_t nb_tx = 0; 2331 struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue; 2332 2333 while (nb_pkts) { 2334 uint16_t ret, num; 2335 2336 /* cross rs_thresh boundary is not allowed */ 2337 num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh); 2338 ret = iavf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx], 2339 num, offload); 2340 nb_tx += ret; 2341 nb_pkts -= ret; 2342 if (ret < num) 2343 break; 2344 } 2345 2346 return nb_tx; 2347 } 2348 2349 uint16_t 2350 iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, 2351 uint16_t nb_pkts) 2352 { 2353 return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, false); 2354 } 2355 2356 uint16_t 2357 iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts, 2358 uint16_t nb_pkts) 2359 { 2360 return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, true); 2361 } 2362 2363 static __rte_always_inline uint16_t 2364 iavf_xmit_pkts_vec_avx512_ctx_cmn(void *tx_queue, struct rte_mbuf **tx_pkts, 2365 uint16_t nb_pkts, bool offload) 2366 { 2367 uint16_t nb_tx = 0; 2368 struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue; 2369 2370 while (nb_pkts) { 2371 uint16_t ret, num; 2372 2373 /* cross rs_thresh boundary is not allowed */ 2374 num = (uint16_t)RTE_MIN(nb_pkts << 1, txq->tx_rs_thresh); 2375 num = num >> 1; 2376 ret = iavf_xmit_fixed_burst_vec_avx512_ctx(tx_queue, &tx_pkts[nb_tx], 2377 num, offload); 2378 nb_tx += ret; 2379 nb_pkts -= ret; 2380 if (ret < num) 2381 break; 2382 } 2383 2384 return nb_tx; 2385 } 2386 2387 uint16_t 2388 iavf_xmit_pkts_vec_avx512_ctx_offload(void *tx_queue, struct rte_mbuf **tx_pkts, 2389 uint16_t nb_pkts) 
{
	return iavf_xmit_pkts_vec_avx512_ctx_cmn(tx_queue, tx_pkts, nb_pkts, true);
}

uint16_t
iavf_xmit_pkts_vec_avx512_ctx(void *tx_queue, struct rte_mbuf **tx_pkts,
			      uint16_t nb_pkts)
{
	return iavf_xmit_pkts_vec_avx512_ctx_cmn(tx_queue, tx_pkts, nb_pkts, false);
}
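
/**
 * Usage sketch (illustrative only, not part of the driver): applications do
 * not call the AVX512 handlers above directly. They are installed as the
 * port's rx_pkt_burst/tx_pkt_burst callbacks when the queue configuration
 * and CPU support allow it, and are then reached through the generic burst
 * API. The port_id value below is a placeholder for the example.
 *
 *	struct rte_mbuf *bufs[32];
 *	uint16_t nb_rx = rte_eth_rx_burst(port_id, 0, bufs, 32);
 *	uint16_t nb_tx = nb_rx ? rte_eth_tx_burst(port_id, 0, bufs, nb_rx) : 0;
 *
 *	while (nb_tx < nb_rx)
 *		rte_pktmbuf_free(bufs[nb_tx++]);
 */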