/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2023 Intel Corporation
 */

#include <rte_vect.h>
#include <idpf_common_device.h>
#include <idpf_common_rxtx.h>

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

#define IDPF_DESCS_PER_LOOP_AVX 8
#define PKTLEN_SHIFT 10

static __rte_always_inline void
idpf_singleq_rearm_common(struct idpf_rx_queue *rxq)
{
	struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
	uint16_t rx_id;
	int i;

	rxdp += rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rxq->mp,
				 (void *)rxp,
				 IDPF_RXQ_REARM_THRESH) < 0) {
		if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
		    rxq->nb_rx_desc) {
			__m128i dma_addr0;

			dma_addr0 = _mm_setzero_si128();
			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
				rxp[i] = &rxq->fake_mbuf;
				_mm_store_si128((__m128i *)&rxdp[i].read,
						dma_addr0);
			}
		}
		__atomic_fetch_add(&rxq->rx_stats.mbuf_alloc_failed,
				   IDPF_RXQ_REARM_THRESH, __ATOMIC_RELAXED);
		return;
	}
	struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
	struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
	__m512i dma_addr0_3, dma_addr4_7;
	__m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
	for (i = 0; i < IDPF_RXQ_REARM_THRESH;
	     i += 8, rxp += 8, rxdp += 8) {
		__m128i vaddr0, vaddr1, vaddr2, vaddr3;
		__m128i vaddr4, vaddr5, vaddr6, vaddr7;
		__m256i vaddr0_1, vaddr2_3;
		__m256i vaddr4_5, vaddr6_7;
		__m512i vaddr0_3, vaddr4_7;

		mb0 = rxp[0];
		mb1 = rxp[1];
		mb2 = rxp[2];
		mb3 = rxp[3];
		mb4 = rxp[4];
		mb5 = rxp[5];
		mb6 = rxp[6];
		mb7 = rxp[7];

		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
				 offsetof(struct rte_mbuf, buf_addr) + 8);
		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
		vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
		vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
		vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
		vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
		vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
		vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);

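		/* For illustration: because buf_iova immediately follows
		 * buf_addr (checked above), each 128-bit load yields
		 * [ buf_iova | buf_addr ] for one mbuf, e.g. assuming
		 * mb0->buf_addr == A0 and mb0->buf_iova == I0:
		 *     vaddr0 = { hi: I0, lo: A0 }
		 * The merges below then pack four such pairs per 512-bit
		 * register, with mb0 in the lowest 128-bit lane of vaddr0_3.
		 */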
		/**
		 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
		 * into the high lanes. Similarly for 2 & 3, and so on.
		 */
		vaddr0_1 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
						vaddr1, 1);
		vaddr2_3 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
						vaddr3, 1);
		vaddr4_5 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
						vaddr5, 1);
		vaddr6_7 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
						vaddr7, 1);
		vaddr0_3 =
			_mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
					   vaddr2_3, 1);
		vaddr4_7 =
			_mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
					   vaddr6_7, 1);

		/* convert pa to dma_addr hdr/data */
		dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
		dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);

		/* add headroom to pa values */
		dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
		dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);

		/* flush desc with pa dma_addr */
		_mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3);
		_mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7);
	}

	rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
}

static __rte_always_inline void
idpf_singleq_rearm(struct idpf_rx_queue *rxq)
{
	int i;
	uint16_t rx_id;
	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
	struct rte_mempool_cache *cache =
		rte_mempool_default_cache(rxq->mp, rte_lcore_id());
	struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];

	rxdp += rxq->rxrearm_start;

	if (unlikely(cache == NULL))
		return idpf_singleq_rearm_common(rxq);

	/* We need to pull 'n' more MBUFs into the software ring from mempool.
	 * We inline the mempool function here, so we can vectorize the copy
	 * from the cache into the shadow ring.
	 */

	/* Can this be satisfied from the cache? */
	if (cache->len < IDPF_RXQ_REARM_THRESH) {
		/* No. Backfill the cache first, and then fill from it */
		uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
							cache->len);

		/* How many do we require: enough to refill the cache plus the request */
		int ret = rte_mempool_ops_dequeue_bulk
				(rxq->mp, &cache->objs[cache->len], req);
		if (ret == 0) {
			cache->len += req;
		} else {
			if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
			    rxq->nb_rx_desc) {
				__m128i dma_addr0;

				dma_addr0 = _mm_setzero_si128();
				for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
					rxp[i] = &rxq->fake_mbuf;
					_mm_storeu_si128((__m128i *)&rxdp[i].read,
							 dma_addr0);
				}
			}
			__atomic_fetch_add(&rxq->rx_stats.mbuf_alloc_failed,
					   IDPF_RXQ_REARM_THRESH, __ATOMIC_RELAXED);
			return;
		}
	}

	const __m512i iova_offsets = _mm512_set1_epi64(offsetof
						       (struct rte_mbuf, buf_iova));
	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
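
	/* For illustration: the loop below collects the IOVA of each of the 8
	 * mbufs with a single _mm512_i64gather_epi64, using
	 * mbuf_ptrs + iova_offsets as the index vector with a zero base and
	 * scale 1, i.e. each lane loads
	 * *(uint64_t *)((uintptr_t)rxp[k] + offsetof(struct rte_mbuf, buf_iova)),
	 * then adds the headroom and scatters the results into the descriptor
	 * ring slots via the permute mask defined next.
	 */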

	/* to shuffle the addresses to correct slots. Values 4-7 will contain
	 * zeros, so use 7 for a zero-value.
	 */
	const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);

	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
	 * from mempool cache and populating both shadow and HW rings
	 */
	for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) {
		const __m512i mbuf_ptrs = _mm512_loadu_si512
			(&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]);
		_mm512_storeu_si512(rxp, mbuf_ptrs);

		const __m512i iova_base_addrs = _mm512_i64gather_epi64
			(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
			 0, /* base */
			 1 /* scale */);
		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
							    headroom);
		const __m512i iovas0 = _mm512_castsi256_si512
			(_mm512_extracti64x4_epi64(iova_addrs, 0));
		const __m512i iovas1 = _mm512_castsi256_si512
			(_mm512_extracti64x4_epi64(iova_addrs, 1));

		/* permute leaves desc 2-3 addresses in header address slots 0-1
		 * but these are ignored by driver since header split not
		 * enabled. Similarly for desc 6 & 7.
		 */
		const __m512i desc0_1 = _mm512_permutexvar_epi64
			(permute_idx,
			 iovas0);
		const __m512i desc2_3 = _mm512_bsrli_epi128(desc0_1, 8);

		const __m512i desc4_5 = _mm512_permutexvar_epi64
			(permute_idx,
			 iovas1);
		const __m512i desc6_7 = _mm512_bsrli_epi128(desc4_5, 8);

		_mm512_storeu_si512((void *)rxdp, desc0_1);
		_mm512_storeu_si512((void *)(rxdp + 2), desc2_3);
		_mm512_storeu_si512((void *)(rxdp + 4), desc4_5);
		_mm512_storeu_si512((void *)(rxdp + 6), desc6_7);

		rxp += IDPF_DESCS_PER_LOOP_AVX;
		rxdp += IDPF_DESCS_PER_LOOP_AVX;
		cache->len -= IDPF_DESCS_PER_LOOP_AVX;
	}

	rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
}

#define IDPF_RX_LEN_MASK 0x80808080
static __rte_always_inline uint16_t
_idpf_singleq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq,
				   struct rte_mbuf **rx_pkts,
				   uint16_t nb_pkts)
{
	const uint32_t *type_table = rxq->adapter->ptype_tbl;

	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
						    rxq->mbuf_initializer);
	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;

	rxdp += rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
		idpf_singleq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if ((rxdp->flex_nic_wb.status_error0 &
	     rte_cpu_to_le_32(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_DD_S)) == 0)
		return 0;

	/* 8 packets DD mask, LSB in each 32-bit value */
	const __m256i dd_check = _mm256_set1_epi32(1);

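	/* For reference: the 0x04/0x05 control bytes in the shuffle mask below
	 * copy descriptor bytes 4-5 into both the pkt_len and data_len
	 * positions of rx_descriptor_fields1, while 0xFF control bytes zero
	 * the destination (packet_type and the hash are filled in separately
	 * further down).
	 */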
	/* mask to shuffle from desc. to mbuf (4 descriptors) */
	const __m512i shuf_msk =
		_mm512_set_epi32
			(/* 1st descriptor */
			 0xFFFFFFFF,	/* rss set as unknown */
			 0xFFFF0504,	/* vlan_macip set as unknown */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,	/* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF,	/* pkt_type set as unknown */
			 /* 2nd descriptor */
			 0xFFFFFFFF,	/* rss set as unknown */
			 0xFFFF0504,	/* vlan_macip set as unknown */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,	/* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF,	/* pkt_type set as unknown */
			 /* 3rd descriptor */
			 0xFFFFFFFF,	/* rss set as unknown */
			 0xFFFF0504,	/* vlan_macip set as unknown */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,	/* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF,	/* pkt_type set as unknown */
			 /* 4th descriptor */
			 0xFFFFFFFF,	/* rss set as unknown */
			 0xFFFF0504,	/* vlan_macip set as unknown */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,	/* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF	/* pkt_type set as unknown */
			);
	/**
	 * compile-time check the shuffle layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi
	 * calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	uint16_t i, received;

	for (i = 0, received = 0; i < nb_pkts;
	     i += IDPF_DESCS_PER_LOOP_AVX,
	     rxdp += IDPF_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				    _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		_mm256_storeu_si256
			((void *)&rx_pkts[i + 4],
			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

		__m512i raw_desc0_3, raw_desc4_7;
		const __m128i raw_desc7 =
			_mm_load_si128((void *)(rxdp + 7));
		rte_compiler_barrier();
		const __m128i raw_desc6 =
			_mm_load_si128((void *)(rxdp + 6));
		rte_compiler_barrier();
		const __m128i raw_desc5 =
			_mm_load_si128((void *)(rxdp + 5));
		rte_compiler_barrier();
		const __m128i raw_desc4 =
			_mm_load_si128((void *)(rxdp + 4));
		rte_compiler_barrier();
		const __m128i raw_desc3 =
			_mm_load_si128((void *)(rxdp + 3));
		rte_compiler_barrier();
		const __m128i raw_desc2 =
			_mm_load_si128((void *)(rxdp + 2));
		rte_compiler_barrier();
		const __m128i raw_desc1 =
			_mm_load_si128((void *)(rxdp + 1));
		rte_compiler_barrier();
		const __m128i raw_desc0 =
			_mm_load_si128((void *)(rxdp + 0));

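		/* Note: the descriptors above are deliberately read from 7
		 * down to 0, with compiler barriers preventing the loads from
		 * being reordered; the intent (as in other DPDK vector RX
		 * paths) is that if a later descriptor is seen as done, the
		 * earlier descriptors read afterwards cannot be stale.
		 */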
		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);

		/**
		 * convert descriptors 4-7 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7,
							 PKTLEN_SHIFT);
		const __m512i desc4_7 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK,
								raw_desc4_7,
								len4_7);
		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);

		/**
		 * to get packet types, shift 64-bit values right by 16 bits
		 * so that the ptype ends up in the low 8 bits of each
		 * descriptor lane
		 */
		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16);
		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16);
		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0);
		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16);
		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0);

		const __m512i ptype4_7 = _mm512_set_epi32
			(0, 0, 0, type_table[ptype7],
			 0, 0, 0, type_table[ptype6],
			 0, 0, 0, type_table[ptype5],
			 0, 0, 0, type_table[ptype4]);
		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);

		/**
		 * convert descriptors 0-3 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3,
							 PKTLEN_SHIFT);
		const __m512i desc0_3 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK,
								raw_desc0_3,
								len0_3);
		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);

		/* get the packet types */
		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16);
		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16);
		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0);
		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16);
		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0);

		const __m512i ptype0_3 = _mm512_set_epi32
			(0, 0, 0, type_table[ptype3],
			 0, 0, 0, type_table[ptype2],
			 0, 0, 0, type_table[ptype1],
			 0, 0, 0, type_table[ptype0]);
		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);

		/**
		 * use permute/extract to get status content
		 * After the operations, the packet status flags are in the
		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
		 */
		/* merge the status bits into one register */
		const __m512i status_permute_msk = _mm512_set_epi32
			(0, 0, 0, 0,
			 0, 0, 0, 0,
			 22, 30, 6, 14,
			 18, 26, 2, 10);
		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
			(raw_desc4_7, status_permute_msk, raw_desc0_3);
		__m256i status0_7 = _mm512_extracti64x4_epi64
			(raw_status0_7, 0);

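		/* How the permute mask above is derived (informal): each
		 * 16-byte descriptor contributes four 32-bit words, and the
		 * word holding status_error0 is word 2 of its descriptor, so
		 * its index is 4 * desc + 2 within raw_desc4_7 and
		 * 4 * desc + 2 + 16 within raw_desc0_3 (the second permute
		 * source). Only the low 8 results are used, hence the zero
		 * padding in the upper half.
		 */
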
		/* now do flag manipulation */

		/**
		 * At this point, we have the 8 sets of flags in the low 16-bits
		 * of each 32-bit value.
		 * We want to extract these, and merge them with the mbuf init
		 * data so we can do a single write to the mbuf to set the flags
		 * and all the other initialization fields. Extracting the
		 * appropriate flags means that we have to do a shift and blend
		 * for each mbuf before we do the write. However, we can also
		 * add in the previously computed rx_descriptor fields to
		 * make a single 256-bit write per mbuf
		 */
		/* check the structure matches expectations */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
				 offsetof(struct rte_mbuf, rearm_data) + 8);
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
				 RTE_ALIGN(offsetof(struct rte_mbuf,
						    rearm_data),
					   16));
		/* build up data and do writes */
		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
			rearm6, rearm7;
		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);

		rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
		rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
		rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
		rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);

		/* write to mbuf */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
				    rearm6);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
				    rearm4);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
				    rearm2);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
				    rearm0);

		rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
		rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
		rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
		rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);

		/* again write to mbufs */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
				    rearm7);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
				    rearm5);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
				    rearm3);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
				    rearm1);

		/* perform dd_check */
		status0_7 = _mm256_and_si256(status0_7, dd_check);
		status0_7 = _mm256_packs_epi32(status0_7,
					       _mm256_setzero_si256());

		uint64_t burst = __builtin_popcountll
					(_mm_cvtsi128_si64
						(_mm256_extracti128_si256
							(status0_7, 1)));
		burst += __builtin_popcountll
				(_mm_cvtsi128_si64
					(_mm256_castsi256_si128(status0_7)));
		received += burst;
		if (burst != IDPF_DESCS_PER_LOOP_AVX)
			break;
	}

	/* update tail pointers */
	rxq->rx_tail += received;
	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
		rxq->rx_tail--;
		received--;
	}
	rxq->rxrearm_nb += received;
	return received;
}

/**
 * Notice:
 * - nb_pkts < IDPF_DESCS_PER_LOOP, just return no packet
 */
uint16_t
idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
				 uint16_t nb_pkts)
{
	return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
}

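/* Buffer-queue rearm path for the split-queue model: only the packet buffer
 * address of each virtchnl2_rx_buf_desc is filled in here
 * (split_rd.pkt_addr), so the fallback below uses plain 64-bit stores per
 * descriptor instead of the paired 128-bit address stores used by the
 * single-queue rearm.
 */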
static __rte_always_inline void
idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
{
	struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
	volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
	uint16_t rx_id;
	int i;

	rxdp += rx_bufq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rx_bufq->mp,
				 (void *)rxp,
				 IDPF_RXQ_REARM_THRESH) < 0) {
		if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
		    rx_bufq->nb_rx_desc) {
			__m128i dma_addr0;

			dma_addr0 = _mm_setzero_si128();
			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
				rxp[i] = &rx_bufq->fake_mbuf;
				_mm_store_si128((__m128i *)&rxdp[i],
						dma_addr0);
			}
		}
		__atomic_fetch_add(&rx_bufq->rx_stats.mbuf_alloc_failed,
				   IDPF_RXQ_REARM_THRESH, __ATOMIC_RELAXED);
		return;
	}

	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
	for (i = 0; i < IDPF_RXQ_REARM_THRESH;
	     i += 8, rxp += 8, rxdp += 8) {
		rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
	}

	rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
	if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
		rx_bufq->rxrearm_start = 0;

	rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
			   (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
}

static __rte_always_inline void
idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
{
	int i;
	uint16_t rx_id;
	volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
	struct rte_mempool_cache *cache =
		rte_mempool_default_cache(rx_bufq->mp, rte_lcore_id());
	struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];

	rxdp += rx_bufq->rxrearm_start;

	if (unlikely(!cache))
		return idpf_splitq_rearm_common(rx_bufq);

	/* We need to pull 'n' more MBUFs into the software ring from mempool.
	 * We inline the mempool function here, so we can vectorize the copy
	 * from the cache into the shadow ring.
	 */

	/* Can this be satisfied from the cache? */
	if (cache->len < IDPF_RXQ_REARM_THRESH) {
		/* No. Backfill the cache first, and then fill from it */
		uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
							cache->len);

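		/* Illustrative example (values arbitrary): with
		 * cache->size == 256 and cache->len == 40, req becomes
		 * IDPF_RXQ_REARM_THRESH + 216; after a successful dequeue the
		 * cache holds IDPF_RXQ_REARM_THRESH + 256 objects, so the
		 * rearm loop below can consume IDPF_RXQ_REARM_THRESH of them
		 * and still leave a full cache behind.
		 */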
		/* How many do we require: enough to refill the cache plus the request */
		int ret = rte_mempool_ops_dequeue_bulk
				(rx_bufq->mp, &cache->objs[cache->len], req);
		if (ret == 0) {
			cache->len += req;
		} else {
			if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
			    rx_bufq->nb_rx_desc) {
				__m128i dma_addr0;

				dma_addr0 = _mm_setzero_si128();
				for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
					rxp[i] = &rx_bufq->fake_mbuf;
					_mm_storeu_si128((__m128i *)&rxdp[i],
							 dma_addr0);
				}
			}
			__atomic_fetch_add(&rx_bufq->rx_stats.mbuf_alloc_failed,
					   IDPF_RXQ_REARM_THRESH, __ATOMIC_RELAXED);
			return;
		}
	}

	const __m512i iova_offsets = _mm512_set1_epi64(offsetof
						       (struct rte_mbuf, buf_iova));
	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);

	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
	 * from mempool cache and populating both shadow and HW rings
	 */
	for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) {
		const __m512i mbuf_ptrs = _mm512_loadu_si512
			(&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]);
		_mm512_storeu_si512(rxp, mbuf_ptrs);

		const __m512i iova_base_addrs = _mm512_i64gather_epi64
			(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
			 0, /* base */
			 1 /* scale */);
		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
							    headroom);

		const __m512i iova_addrs_1 = _mm512_bsrli_epi128(iova_addrs, 8);

		rxdp[0].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 0));
		rxdp[1].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 0));
		rxdp[2].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 1));
		rxdp[3].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 1));
		rxdp[4].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 2));
		rxdp[5].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 2));
		rxdp[6].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 3));
		rxdp[7].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 3));

		rxp += IDPF_DESCS_PER_LOOP_AVX;
		rxdp += IDPF_DESCS_PER_LOOP_AVX;
		cache->len -= IDPF_DESCS_PER_LOOP_AVX;
	}

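	/* Illustrative note on the bookkeeping below: when rxrearm_start
	 * advances past the ring end it wraps to 0, and the tail written to
	 * hardware is then nb_rx_desc - 1, i.e. the last descriptor just
	 * refilled; otherwise the tail is rxrearm_start - 1.
	 */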
	rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
	if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
		rx_bufq->rxrearm_start = 0;

	rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
			   (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
}

static __rte_always_inline uint16_t
_idpf_splitq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq,
				  struct rte_mbuf **rx_pkts,
				  uint16_t nb_pkts)
{
	const uint32_t *type_table = rxq->adapter->ptype_tbl;
	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
						    rxq->bufq2->mbuf_initializer);
	/* only handle bufq2 here */
	struct rte_mbuf **sw_ring = &rxq->bufq2->sw_ring[rxq->rx_tail];
	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;

	rxdp += rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
		idpf_splitq_rearm(rxq->bufq2);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (((rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id &
	      VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) >>
	     VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) != rxq->expected_gen_id)
		return 0;

	const __m512i dd_check = _mm512_set1_epi64(1);
	const __m512i gen_check = _mm512_set1_epi64((uint64_t)1<<46);

	/* mask to shuffle from desc. to mbuf (4 descriptors)*/
	const __m512i shuf_msk =
		_mm512_set_epi32
			(/* 1st descriptor */
			 0xFFFFFFFF,	/* octet 4~7, 32bits rss */
			 0xFFFF0504,	/* octet 2~3, low 16 bits vlan_macip */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,	/* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF,	/* pkt_type set as unknown */
			 /* 2nd descriptor */
			 0xFFFFFFFF,	/* octet 4~7, 32bits rss */
			 0xFFFF0504,	/* octet 2~3, low 16 bits vlan_macip */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,	/* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF,	/* pkt_type set as unknown */
			 /* 3rd descriptor */
			 0xFFFFFFFF,	/* octet 4~7, 32bits rss */
			 0xFFFF0504,	/* octet 2~3, low 16 bits vlan_macip */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,	/* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF,	/* pkt_type set as unknown */
			 /* 4th descriptor */
			 0xFFFFFFFF,	/* octet 4~7, 32bits rss */
			 0xFFFF0504,	/* octet 2~3, low 16 bits vlan_macip */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,	/* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF	/* pkt_type set as unknown */
			);
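	/* Note: this mask selects the same descriptor bytes as the
	 * single-queue shuffle above; the difference in this path is that the
	 * packet length field is cleaned up by masking (len_mask in the loop
	 * below) rather than by the PKTLEN_SHIFT shift-and-blend used for the
	 * single-queue descriptors.
	 */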
	/**
	 * compile-time check the above shuffle layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi
	 * calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	uint16_t i, received;

	for (i = 0, received = 0; i < nb_pkts;
	     i += IDPF_DESCS_PER_LOOP_AVX,
	     rxdp += IDPF_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				    _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		_mm256_storeu_si256
			((void *)&rx_pkts[i + 4],
			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

		__m512i raw_desc0_3, raw_desc4_7;
		const __m128i raw_desc7 =
			_mm_load_si128((void *)(rxdp + 7));
		rte_compiler_barrier();
		const __m128i raw_desc6 =
			_mm_load_si128((void *)(rxdp + 6));
		rte_compiler_barrier();
		const __m128i raw_desc5 =
			_mm_load_si128((void *)(rxdp + 5));
		rte_compiler_barrier();
		const __m128i raw_desc4 =
			_mm_load_si128((void *)(rxdp + 4));
		rte_compiler_barrier();
		const __m128i raw_desc3 =
			_mm_load_si128((void *)(rxdp + 3));
		rte_compiler_barrier();
		const __m128i raw_desc2 =
			_mm_load_si128((void *)(rxdp + 2));
		rte_compiler_barrier();
		const __m128i raw_desc1 =
			_mm_load_si128((void *)(rxdp + 1));
		rte_compiler_barrier();
		const __m128i raw_desc0 =
			_mm_load_si128((void *)(rxdp + 0));

		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);

		/**
		 * convert descriptors 4-7 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m512i len_mask = _mm512_set_epi32(0xffffffff, 0xffffffff,
							  0xffff3fff, 0xffffffff,
							  0xffffffff, 0xffffffff,
							  0xffff3fff, 0xffffffff,
							  0xffffffff, 0xffffffff,
							  0xffff3fff, 0xffffffff,
							  0xffffffff, 0xffffffff,
							  0xffff3fff, 0xffffffff);
		const __m512i desc4_7 = _mm512_and_epi32(raw_desc4_7, len_mask);
		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);

		/**
		 * to get packet types, shift 64-bit values right by 16 bits
		 * so that the ptype ends up in the low 8 bits of each
		 * descriptor lane
		 */
		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16);
		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16);
		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0);
		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16);
		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0);

		const __m512i ptype4_7 = _mm512_set_epi32
			(0, 0, 0, type_table[ptype7],
			 0, 0, 0, type_table[ptype6],
			 0, 0, 0, type_table[ptype5],
			 0, 0, 0, type_table[ptype4]);
		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);

		/**
		 * convert descriptors 0-3 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m512i desc0_3 = _mm512_and_epi32(raw_desc0_3, len_mask);
		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);

		/* get the packet types */
		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16);
		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16);
		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0);
		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16);
		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0);

		const __m512i ptype0_3 = _mm512_set_epi32
			(0, 0, 0, type_table[ptype3],
			 0, 0, 0, type_table[ptype2],
			 0, 0, 0, type_table[ptype1],
			 0, 0, 0, type_table[ptype0]);
		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);

		/**
		 * use permute/extract to get status and generation bit content
		 * After the operations, the packet status flags are in the
		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
		 */

		const __m512i dd_permute_msk = _mm512_set_epi64
			(11, 15, 3, 7, 9, 13, 1, 5);
		const __m512i status0_7 = _mm512_permutex2var_epi64
			(raw_desc4_7, dd_permute_msk, raw_desc0_3);
		const __m512i gen_permute_msk = _mm512_set_epi64
			(10, 14, 2, 6, 8, 12, 0, 4);
		const __m512i raw_gen0_7 = _mm512_permutex2var_epi64
			(raw_desc4_7, gen_permute_msk, raw_desc0_3);

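		/* Informal note on the permute masks above: each descriptor is
		 * two 64-bit qwords, so qword 2 * desc + 1 (the odd qword)
		 * carries the DD status and qword 2 * desc (the even qword)
		 * carries pktlen_gen_bufq_id with the GEN bit; indices 8-15
		 * select from raw_desc0_3, the second permute source.
		 */
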
		/* now do flag manipulation */

		/**
		 * At this point, we have the 8 sets of flags in the low 16-bits
		 * of each 32-bit value.
		 * We want to extract these, and merge them with the mbuf init
		 * data so we can do a single write to the mbuf to set the flags
		 * and all the other initialization fields. Extracting the
		 * appropriate flags means that we have to do a shift and blend
		 * for each mbuf before we do the write. However, we can also
		 * add in the previously computed rx_descriptor fields to
		 * make a single 256-bit write per mbuf
		 */
		/* check the structure matches expectations */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
				 offsetof(struct rte_mbuf, rearm_data) + 8);
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
				 RTE_ALIGN(offsetof(struct rte_mbuf,
						    rearm_data),
					   16));
		/* build up data and do writes */
		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
			rearm6, rearm7;
		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);

		rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
		rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
		rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
		rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);

		/* write to mbuf */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
				    rearm6);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
				    rearm4);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
				    rearm2);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
				    rearm0);

		rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
		rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
		rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
		rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);

		/* again write to mbufs */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
				    rearm7);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
				    rearm5);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
				    rearm3);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
				    rearm1);

		const __mmask8 dd_mask = _mm512_cmpeq_epi64_mask(
			_mm512_and_epi64(status0_7, dd_check), dd_check);
		const __mmask8 gen_mask = _mm512_cmpeq_epi64_mask(
			_mm512_and_epi64(raw_gen0_7, gen_check),
			_mm512_set1_epi64((uint64_t)rxq->expected_gen_id << 46));
		const __mmask8 recv_mask = _kand_mask8(dd_mask, gen_mask);
		uint16_t burst = __builtin_popcount(_cvtmask8_u32(recv_mask));

		received += burst;
		if (burst != IDPF_DESCS_PER_LOOP_AVX)
			break;
	}

	/* update tail pointers */
	rxq->rx_tail += received;
	rxq->expected_gen_id ^= ((rxq->rx_tail & rxq->nb_rx_desc) != 0);
	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
		rxq->rx_tail--;
		received--;
	}

	rxq->bufq2->rxrearm_nb += received;
	return received;
}

/* only bufq2 can receive pkts */
uint16_t
idpf_dp_splitq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
				uint16_t nb_pkts)
{
	return _idpf_splitq_recv_raw_pkts_avx512(rx_queue, rx_pkts,
						 nb_pkts);
}

static __rte_always_inline int
idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
{
	struct idpf_tx_vec_entry *txep;
	uint32_t n;
	uint32_t i;
	int nb_free = 0;
	struct rte_mbuf *m, *free[txq->rs_thresh];

	/* check DD bits on threshold descriptor */
	if ((txq->tx_ring[txq->next_dd].qw1.cmd_dtype &
	     rte_cpu_to_le_64(IDPF_TXD_QW1_DTYPE_M)) !=
	    rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE))
		return 0;

	n = txq->rs_thresh;

	/* first buffer to free from S/W ring is at index
	 * tx_next_dd - (tx_rs_thresh-1)
	 */
	txep = (void *)txq->sw_ring;
	txep += txq->next_dd - (n - 1);

	if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
		struct rte_mempool *mp = txep[0].mbuf->pool;
		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
								rte_lcore_id());
		void **cache_objs;

		if (cache == NULL || cache->len == 0)
			goto normal;

		cache_objs = &cache->objs[cache->len];

		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
			goto done;
		}

		/* The cache works as follows:
		 * 1. Add the objects to the cache.
		 * 2. Anything greater than the cache min value (if it crosses
		 *    the cache flush threshold) is flushed to the ring.
		 */
		/* Add elements back into the cache */
		uint32_t copied = 0;
		/* n is multiple of 32 */
		while (copied < n) {
			const __m512i a = _mm512_loadu_si512(&txep[copied]);
			const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
			const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
			const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);

			_mm512_storeu_si512(&cache_objs[copied], a);
			_mm512_storeu_si512(&cache_objs[copied + 8], b);
			_mm512_storeu_si512(&cache_objs[copied + 16], c);
			_mm512_storeu_si512(&cache_objs[copied + 24], d);
			copied += 32;
		}
		cache->len += n;

		if (cache->len >= cache->flushthresh) {
			rte_mempool_ops_enqueue_bulk(mp,
						     &cache->objs[cache->size],
						     cache->len - cache->size);
			cache->len = cache->size;
		}
		goto done;
	}

normal:
	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
	if (likely(m != NULL)) {
		free[0] = m;
		nb_free = 1;
		for (i = 1; i < n; i++) {
			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
			if (likely(m != NULL)) {
				if (likely(m->pool == free[0]->pool)) {
					free[nb_free++] = m;
				} else {
					rte_mempool_put_bulk(free[0]->pool,
							     (void *)free,
							     nb_free);
					free[0] = m;
					nb_free = 1;
				}
			}
		}
		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
	} else {
		for (i = 1; i < n; i++) {
			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
			if (m != NULL)
				rte_mempool_put(m->pool, m);
		}
	}

done:
	/* buffers were freed, update counters */
	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
	if (txq->next_dd >= txq->nb_tx_desc)
		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);

	return txq->rs_thresh;
}

static __rte_always_inline void
tx_backlog_entry_avx512(struct idpf_tx_vec_entry *txep,
			struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	int i;

	for (i = 0; i < (int)nb_pkts; ++i)
		txep[i].mbuf = tx_pkts[i];
}

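/* For reference: the single-queue data descriptors below pack everything the
 * hardware needs into two quadwords: the low one holds the buffer DMA
 * address, the high one combines the descriptor type, the command flags and
 * the buffer size, as assembled in idpf_singleq_vtx1()/idpf_singleq_vtx().
 */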
#define IDPF_FLEX_TXD_QW1_BUF_SZ_S 48
static __rte_always_inline void
idpf_singleq_vtx1(volatile struct idpf_flex_tx_desc *txdp,
		  struct rte_mbuf *pkt, uint64_t flags)
{
	uint64_t high_qw =
		(IDPF_TX_DESC_DTYPE_FLEX_DATA << IDPF_FLEX_TXD_QW1_DTYPE_S |
		 ((uint64_t)flags << IDPF_FLEX_TXD_QW1_CMD_S) |
		 ((uint64_t)pkt->data_len << IDPF_FLEX_TXD_QW1_BUF_SZ_S));

	__m128i descriptor = _mm_set_epi64x(high_qw,
					    pkt->buf_iova + pkt->data_off);
	_mm_storeu_si128((__m128i *)txdp, descriptor);
}

#define IDPF_TX_LEN_MASK 0xAA
#define IDPF_TX_OFF_MASK 0x55
static __rte_always_inline void
idpf_singleq_vtx(volatile struct idpf_flex_tx_desc *txdp,
		 struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
{
	const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_FLEX_DATA |
				     ((uint64_t)flags << IDPF_FLEX_TXD_QW1_CMD_S));

	/* if unaligned on 32-bit boundary, do one to align */
	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
		idpf_singleq_vtx1(txdp, *pkt, flags);
		nb_pkts--, txdp++, pkt++;
	}

	/* do 4 at a time while possible, in bursts */
	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
		uint64_t hi_qw3 =
			hi_qw_tmpl |
			((uint64_t)pkt[3]->data_len <<
			 IDPF_FLEX_TXD_QW1_BUF_SZ_S);
		uint64_t hi_qw2 =
			hi_qw_tmpl |
			((uint64_t)pkt[2]->data_len <<
			 IDPF_FLEX_TXD_QW1_BUF_SZ_S);
		uint64_t hi_qw1 =
			hi_qw_tmpl |
			((uint64_t)pkt[1]->data_len <<
			 IDPF_FLEX_TXD_QW1_BUF_SZ_S);
		uint64_t hi_qw0 =
			hi_qw_tmpl |
			((uint64_t)pkt[0]->data_len <<
			 IDPF_FLEX_TXD_QW1_BUF_SZ_S);

		__m512i desc0_3 =
			_mm512_set_epi64
				(hi_qw3,
				 pkt[3]->buf_iova + pkt[3]->data_off,
				 hi_qw2,
				 pkt[2]->buf_iova + pkt[2]->data_off,
				 hi_qw1,
				 pkt[1]->buf_iova + pkt[1]->data_off,
				 hi_qw0,
				 pkt[0]->buf_iova + pkt[0]->data_off);
		_mm512_storeu_si512((void *)txdp, desc0_3);
	}

	/* do any last ones */
	while (nb_pkts) {
		idpf_singleq_vtx1(txdp, *pkt, flags);
		txdp++, pkt++, nb_pkts--;
	}
}

static __rte_always_inline uint16_t
idpf_singleq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
					 uint16_t nb_pkts)
{
	struct idpf_tx_queue *txq = tx_queue;
	volatile struct idpf_flex_tx_desc *txdp;
	struct idpf_tx_vec_entry *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = IDPF_TX_FLEX_DESC_CMD_EOP;
	uint64_t rs = IDPF_TX_FLEX_DESC_CMD_RS | flags;

	/* crossing the tx rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);

	if (txq->nb_free < txq->free_thresh)
		idpf_tx_singleq_free_bufs_avx512(txq);

	nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
	nb_commit = nb_pkts;
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->tx_ring[tx_id];
	txep = (void *)txq->sw_ring;
	txep += tx_id;

	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		tx_backlog_entry_avx512(txep, tx_pkts, n);

		idpf_singleq_vtx(txdp, tx_pkts, n - 1, flags);
		tx_pkts += (n - 1);
		txdp += (n - 1);

		idpf_singleq_vtx1(txdp, *tx_pkts++, rs);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->tx_ring[tx_id];
		txep = (void *)txq->sw_ring;
		txep += tx_id;
	}

	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);

	idpf_singleq_vtx(txdp, tx_pkts, nb_commit, flags);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->next_rs) {
		txq->tx_ring[txq->next_rs].qw1.cmd_dtype |=
			rte_cpu_to_le_64(((uint64_t)IDPF_TX_FLEX_DESC_CMD_RS) <<
					 IDPF_FLEX_TXD_QW1_CMD_S);
		txq->next_rs =
			(uint16_t)(txq->next_rs + txq->rs_thresh);
	}

	txq->tx_tail = tx_id;

	IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

static __rte_always_inline uint16_t
idpf_singleq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
				      uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct idpf_tx_queue *txq = tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
		ret = idpf_singleq_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
							       num);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}

uint16_t
idpf_dp_singleq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
				 uint16_t nb_pkts)
{
	return idpf_singleq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts);
}

static __rte_always_inline void
idpf_splitq_scan_cq_ring(struct idpf_tx_queue *cq)
{
	struct idpf_splitq_tx_compl_desc *compl_ring;
	struct idpf_tx_queue *txq;
	uint16_t genid, txq_qid, cq_qid, i;
	uint8_t ctype;

	cq_qid = cq->tx_tail;

	for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
		if (cq_qid == cq->nb_tx_desc) {
			cq_qid = 0;
			cq->expected_gen_id ^= 1;
		}
		compl_ring = &cq->compl_ring[cq_qid];
		genid = (compl_ring->qid_comptype_gen &
			 rte_cpu_to_le_64(IDPF_TXD_COMPLQ_GEN_M)) >> IDPF_TXD_COMPLQ_GEN_S;
		if (genid != cq->expected_gen_id)
			break;
		ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
			 IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
		txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
			   IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
		txq = cq->txqs[txq_qid - cq->tx_start_qid];
		txq->ctype[ctype]++;
		cq_qid++;
	}

	cq->tx_tail = cq_qid;
}

static __rte_always_inline int
idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
{
	struct idpf_tx_vec_entry *txep;
	uint32_t n;
	uint32_t i;
	int nb_free = 0;
	struct rte_mbuf *m, *free[txq->rs_thresh];

	n = txq->rs_thresh;

	/* first buffer to free from S/W ring is at index
	 * tx_next_dd - (tx_rs_thresh-1)
	 */
	txep = (void *)txq->sw_ring;
	txep += txq->next_dd - (n - 1);

	if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
		struct rte_mempool *mp = txep[0].mbuf->pool;
		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
								rte_lcore_id());
		void **cache_objs;

		if (!cache || cache->len == 0)
			goto normal;

		cache_objs = &cache->objs[cache->len];

		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
			goto done;
		}

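		/* The copy loop below moves 32 sw-ring entries per iteration
		 * into the mempool cache using four 512-bit loads/stores
		 * (8 pointers each); the "(n & 31) == 0" check above
		 * guarantees n is a multiple of 32.
		 */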
		/* The cache works as follows:
		 * 1. Add the objects to the cache.
		 * 2. Anything greater than the cache min value (if it crosses
		 *    the cache flush threshold) is flushed to the ring.
		 */
		/* Add elements back into the cache */
		uint32_t copied = 0;
		/* n is multiple of 32 */
		while (copied < n) {
			const __m512i a = _mm512_loadu_si512(&txep[copied]);
			const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
			const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
			const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);

			_mm512_storeu_si512(&cache_objs[copied], a);
			_mm512_storeu_si512(&cache_objs[copied + 8], b);
			_mm512_storeu_si512(&cache_objs[copied + 16], c);
			_mm512_storeu_si512(&cache_objs[copied + 24], d);
			copied += 32;
		}
		cache->len += n;

		if (cache->len >= cache->flushthresh) {
			rte_mempool_ops_enqueue_bulk(mp,
						     &cache->objs[cache->size],
						     cache->len - cache->size);
			cache->len = cache->size;
		}
		goto done;
	}

normal:
	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
	if (likely(m)) {
		free[0] = m;
		nb_free = 1;
		for (i = 1; i < n; i++) {
			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
			if (likely(m)) {
				if (likely(m->pool == free[0]->pool)) {
					free[nb_free++] = m;
				} else {
					rte_mempool_put_bulk(free[0]->pool,
							     (void *)free,
							     nb_free);
					free[0] = m;
					nb_free = 1;
				}
			}
		}
		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
	} else {
		for (i = 1; i < n; i++) {
			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
			if (m)
				rte_mempool_put(m->pool, m);
		}
	}

done:
	/* buffers were freed, update counters */
	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
	if (txq->next_dd >= txq->nb_tx_desc)
		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);
	txq->ctype[IDPF_TXD_COMPLT_RS] -= txq->rs_thresh;

	return txq->rs_thresh;
}

#define IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S 48

static __rte_always_inline void
idpf_splitq_vtx1(volatile struct idpf_flex_tx_sched_desc *txdp,
		 struct rte_mbuf *pkt, uint64_t flags)
{
	uint64_t high_qw =
		(IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
		 ((uint64_t)flags) |
		 ((uint64_t)pkt->data_len << IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S));

	__m128i descriptor = _mm_set_epi64x(high_qw,
					    pkt->buf_iova + pkt->data_off);
	_mm_storeu_si128((__m128i *)txdp, descriptor);
}

static __rte_always_inline void
idpf_splitq_vtx(volatile struct idpf_flex_tx_sched_desc *txdp,
		struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
{
	const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
				     ((uint64_t)flags));

	/* if unaligned on 32-bit boundary, do one to align */
	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
		idpf_splitq_vtx1(txdp, *pkt, flags);
		nb_pkts--, txdp++, pkt++;
	}

	/* do 4 at a time while possible, in bursts */
	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
		uint64_t hi_qw3 =
			hi_qw_tmpl |
			((uint64_t)pkt[3]->data_len <<
			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
		uint64_t hi_qw2 =
			hi_qw_tmpl |
			((uint64_t)pkt[2]->data_len <<
			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
		uint64_t hi_qw1 =
			hi_qw_tmpl |
			((uint64_t)pkt[1]->data_len <<
			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
		uint64_t hi_qw0 =
			hi_qw_tmpl |
			((uint64_t)pkt[0]->data_len <<
			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);

		__m512i desc0_3 =
			_mm512_set_epi64
				(hi_qw3,
				 pkt[3]->buf_iova + pkt[3]->data_off,
				 hi_qw2,
				 pkt[2]->buf_iova + pkt[2]->data_off,
				 hi_qw1,
				 pkt[1]->buf_iova + pkt[1]->data_off,
				 hi_qw0,
				 pkt[0]->buf_iova + pkt[0]->data_off);
		_mm512_storeu_si512((void *)txdp, desc0_3);
	}

	/* do any last ones */
	while (nb_pkts) {
		idpf_splitq_vtx1(txdp, *pkt, flags);
		txdp++, pkt++, nb_pkts--;
	}
}

static __rte_always_inline uint16_t
idpf_splitq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
					uint16_t nb_pkts)
{
	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
	volatile struct idpf_flex_tx_sched_desc *txdp;
	struct idpf_tx_vec_entry *txep;
	uint16_t n, nb_commit, tx_id;
	/* bit2 is reserved and must be set to 1 according to Spec */
	uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;

	tx_id = txq->tx_tail;

	/* crossing the tx rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);

	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->desc_ring[tx_id];
	txep = (void *)txq->sw_ring;
	txep += tx_id;

	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		tx_backlog_entry_avx512(txep, tx_pkts, n);

		idpf_splitq_vtx((void *)txdp, tx_pkts, n - 1, cmd_dtype);
		tx_pkts += (n - 1);
		txdp += (n - 1);

		idpf_splitq_vtx1((void *)txdp, *tx_pkts++, cmd_dtype);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->desc_ring[tx_id];
		txep = (void *)txq->sw_ring;
		txep += tx_id;
	}

	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);

	idpf_splitq_vtx((void *)txdp, tx_pkts, nb_commit, cmd_dtype);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->next_rs)
		txq->next_rs =
			(uint16_t)(txq->next_rs + txq->rs_thresh);

	txq->tx_tail = tx_id;

	IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

static __rte_always_inline uint16_t
idpf_splitq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
				     uint16_t nb_pkts)
{
	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
	uint16_t nb_tx = 0;

	while (nb_pkts) {
		uint16_t ret, num;

		idpf_splitq_scan_cq_ring(txq->complq);

		if (txq->ctype[IDPF_TXD_COMPLT_RS] > txq->free_thresh)
			idpf_tx_splitq_free_bufs_avx512(txq);

		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
		ret = idpf_splitq_xmit_fixed_burst_vec_avx512(tx_queue,
							      &tx_pkts[nb_tx],
							      num);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}

uint16_t
idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
				uint16_t nb_pkts)
{
	return idpf_splitq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts);
}

static inline void
idpf_tx_release_mbufs_avx512(struct idpf_tx_queue *txq)
{
	unsigned int i;
	const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1);
	struct idpf_tx_vec_entry *swr = (void *)txq->sw_ring;

	if (txq->sw_ring == NULL || txq->nb_free == max_desc)
		return;

	i = txq->next_dd - txq->rs_thresh + 1;
	if (txq->tx_tail < i) {
		for (; i < txq->nb_tx_desc; i++) {
			rte_pktmbuf_free_seg(swr[i].mbuf);
			swr[i].mbuf = NULL;
		}
		i = 0;
	}
}

static const struct idpf_txq_ops avx512_tx_vec_ops = {
	.release_mbufs = idpf_tx_release_mbufs_avx512,
};

int __rte_cold
idpf_qc_tx_vec_avx512_setup(struct idpf_tx_queue *txq)
{
	if (!txq)
		return 0;

	txq->ops = &avx512_tx_vec_ops;
	return 0;
}