1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2023 Intel Corporation 3 */ 4 5 #include <rte_vect.h> 6 #include "idpf_common_device.h" 7 #include "idpf_common_rxtx.h" 8 9 #define IDPF_DESCS_PER_LOOP_AVX 8 10 #define PKTLEN_SHIFT 10 11 12 static __rte_always_inline void 13 idpf_singleq_rearm_common(struct idpf_rx_queue *rxq) 14 { 15 struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start]; 16 volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring; 17 uint16_t rx_id; 18 int i; 19 20 rxdp += rxq->rxrearm_start; 21 22 /* Pull 'n' more MBUFs into the software ring */ 23 if (rte_mempool_get_bulk(rxq->mp, 24 (void *)rxp, 25 IDPF_RXQ_REARM_THRESH) < 0) { 26 if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= 27 rxq->nb_rx_desc) { 28 __m128i dma_addr0; 29 30 dma_addr0 = _mm_setzero_si128(); 31 for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) { 32 rxp[i] = &rxq->fake_mbuf; 33 _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i].read), 34 dma_addr0); 35 } 36 } 37 rte_atomic_fetch_add_explicit(&rxq->rx_stats.mbuf_alloc_failed, 38 IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed); 39 return; 40 } 41 struct rte_mbuf *mb0, *mb1, *mb2, *mb3; 42 struct rte_mbuf *mb4, *mb5, *mb6, *mb7; 43 __m512i dma_addr0_3, dma_addr4_7; 44 __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM); 45 /* Initialize the mbufs in vector, process 8 mbufs in one loop */ 46 for (i = 0; i < IDPF_RXQ_REARM_THRESH; 47 i += 8, rxp += 8, rxdp += 8) { 48 __m128i vaddr0, vaddr1, vaddr2, vaddr3; 49 __m128i vaddr4, vaddr5, vaddr6, vaddr7; 50 __m256i vaddr0_1, vaddr2_3; 51 __m256i vaddr4_5, vaddr6_7; 52 __m512i vaddr0_3, vaddr4_7; 53 54 mb0 = rxp[0]; 55 mb1 = rxp[1]; 56 mb2 = rxp[2]; 57 mb3 = rxp[3]; 58 mb4 = rxp[4]; 59 mb5 = rxp[5]; 60 mb6 = rxp[6]; 61 mb7 = rxp[7]; 62 63 /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ 64 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != 65 offsetof(struct rte_mbuf, buf_addr) + 8); 66 vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); 67 vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); 68 vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr); 69 vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr); 70 vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr); 71 vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr); 72 vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr); 73 vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr); 74 75 /** 76 * merge 0 & 1, by casting 0 to 256-bit and inserting 1 77 * into the high lanes. Similarly for 2 & 3, and so on. 78 */ 79 vaddr0_1 = 80 _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0), 81 vaddr1, 1); 82 vaddr2_3 = 83 _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2), 84 vaddr3, 1); 85 vaddr4_5 = 86 _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4), 87 vaddr5, 1); 88 vaddr6_7 = 89 _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6), 90 vaddr7, 1); 91 vaddr0_3 = 92 _mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1), 93 vaddr2_3, 1); 94 vaddr4_7 = 95 _mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5), 96 vaddr6_7, 1); 97 98 /* convert pa to dma_addr hdr/data */ 99 dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3); 100 dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7); 101 102 /* add headroom to pa values */ 103 dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room); 104 dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room); 105 106 /* flush desc with pa dma_addr */ 107 _mm512_store_si512(RTE_CAST_PTR(__m512i *, &rxdp->read), dma_addr0_3); 108 _mm512_store_si512(RTE_CAST_PTR(__m512i *, &(rxdp + 4)->read), dma_addr4_7); 109 } 110 111 rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH; 112 if (rxq->rxrearm_start >= rxq->nb_rx_desc) 113 rxq->rxrearm_start = 0; 114 115 rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH; 116 117 rx_id = (uint16_t)((rxq->rxrearm_start == 0) ? 118 (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); 119 120 /* Update the tail pointer on the NIC */ 121 IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id); 122 } 123 124 static __rte_always_inline void 125 idpf_singleq_rearm(struct idpf_rx_queue *rxq) 126 { 127 int i; 128 uint16_t rx_id; 129 volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring; 130 struct rte_mempool_cache *cache = 131 rte_mempool_default_cache(rxq->mp, rte_lcore_id()); 132 struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start]; 133 134 rxdp += rxq->rxrearm_start; 135 136 if (unlikely(cache == NULL)) 137 return idpf_singleq_rearm_common(rxq); 138 139 /* We need to pull 'n' more MBUFs into the software ring from mempool 140 * We inline the mempool function here, so we can vectorize the copy 141 * from the cache into the shadow ring. 142 */ 143 144 /* Can this be satisfied from the cache? */ 145 if (cache->len < IDPF_RXQ_REARM_THRESH) { 146 /* No. Backfill the cache first, and then fill from it */ 147 uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size - 148 cache->len); 149 150 /* How many do we require i.e. number to fill the cache + the request */ 151 int ret = rte_mempool_ops_dequeue_bulk 152 (rxq->mp, &cache->objs[cache->len], req); 153 if (ret == 0) { 154 cache->len += req; 155 } else { 156 if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= 157 rxq->nb_rx_desc) { 158 __m128i dma_addr0; 159 160 dma_addr0 = _mm_setzero_si128(); 161 for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) { 162 rxp[i] = &rxq->fake_mbuf; 163 _mm_storeu_si128(RTE_CAST_PTR 164 (__m128i *, &rxdp[i].read), dma_addr0); 165 } 166 } 167 rte_atomic_fetch_add_explicit(&rxq->rx_stats.mbuf_alloc_failed, 168 IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed); 169 return; 170 } 171 } 172 173 const __m512i iova_offsets = _mm512_set1_epi64(offsetof 174 (struct rte_mbuf, buf_iova)); 175 const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM); 176 177 /* to shuffle the addresses to correct slots. Values 4-7 will contain 178 * zeros, so use 7 for a zero-value. 179 */ 180 const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0); 181 182 /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking 183 * from mempool cache and populating both shadow and HW rings 184 */ 185 for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) { 186 const __m512i mbuf_ptrs = _mm512_loadu_si512 187 (&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]); 188 _mm512_storeu_si512(rxp, mbuf_ptrs); 189 190 const __m512i iova_base_addrs = _mm512_i64gather_epi64 191 (_mm512_add_epi64(mbuf_ptrs, iova_offsets), 192 0, /* base */ 193 1 /* scale */); 194 const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs, 195 headroom); 196 const __m512i iovas0 = _mm512_castsi256_si512 197 (_mm512_extracti64x4_epi64(iova_addrs, 0)); 198 const __m512i iovas1 = _mm512_castsi256_si512 199 (_mm512_extracti64x4_epi64(iova_addrs, 1)); 200 201 /* permute leaves desc 2-3 addresses in header address slots 0-1 202 * but these are ignored by driver since header split not 203 * enabled. Similarly for desc 6 & 7. 204 */ 205 const __m512i desc0_1 = _mm512_permutexvar_epi64 206 (permute_idx, 207 iovas0); 208 const __m512i desc2_3 = _mm512_bsrli_epi128(desc0_1, 8); 209 210 const __m512i desc4_5 = _mm512_permutexvar_epi64 211 (permute_idx, 212 iovas1); 213 const __m512i desc6_7 = _mm512_bsrli_epi128(desc4_5, 8); 214 215 _mm512_storeu_si512(RTE_CAST_PTR(void *, rxdp), desc0_1); 216 _mm512_storeu_si512(RTE_CAST_PTR(void *, (rxdp + 2)), desc2_3); 217 _mm512_storeu_si512(RTE_CAST_PTR(void *, (rxdp + 4)), desc4_5); 218 _mm512_storeu_si512(RTE_CAST_PTR(void *, (rxdp + 6)), desc6_7); 219 220 rxp += IDPF_DESCS_PER_LOOP_AVX; 221 rxdp += IDPF_DESCS_PER_LOOP_AVX; 222 cache->len -= IDPF_DESCS_PER_LOOP_AVX; 223 } 224 225 rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH; 226 if (rxq->rxrearm_start >= rxq->nb_rx_desc) 227 rxq->rxrearm_start = 0; 228 229 rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH; 230 231 rx_id = (uint16_t)((rxq->rxrearm_start == 0) ? 232 (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); 233 234 /* Update the tail pointer on the NIC */ 235 IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id); 236 } 237 238 #define IDPF_RX_LEN_MASK 0x80808080 239 static __rte_always_inline uint16_t 240 _idpf_singleq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq, 241 struct rte_mbuf **rx_pkts, 242 uint16_t nb_pkts) 243 { 244 const uint32_t *type_table = rxq->adapter->ptype_tbl; 245 246 const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, 247 rxq->mbuf_initializer); 248 struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail]; 249 volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring; 250 251 rxdp += rxq->rx_tail; 252 253 rte_prefetch0(rxdp); 254 255 /* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */ 256 nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX); 257 258 /* See if we need to rearm the RX queue - gives the prefetch a bit 259 * of time to act 260 */ 261 if (rxq->rxrearm_nb > IDPF_RXQ_REARM_THRESH) 262 idpf_singleq_rearm(rxq); 263 264 /* Before we start moving massive data around, check to see if 265 * there is actually a packet available 266 */ 267 if ((rxdp->flex_nic_wb.status_error0 & 268 rte_cpu_to_le_32(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_DD_S)) == 0) 269 return 0; 270 271 /* 8 packets DD mask, LSB in each 32-bit value */ 272 const __m256i dd_check = _mm256_set1_epi32(1); 273 274 /* mask to shuffle from desc. to mbuf (4 descriptors)*/ 275 const __m512i shuf_msk = 276 _mm512_set_epi32 277 (/* 1st descriptor */ 278 0xFFFFFFFF, /* rss set as unknown */ 279 0xFFFF0504, /* vlan_macip set as unknown */ 280 /* octet 15~14, 16 bits data_len */ 281 0xFFFF0504, /* skip high 16 bits pkt_len, zero out */ 282 /* octet 15~14, low 16 bits pkt_len */ 283 0xFFFFFFFF, /* pkt_type set as unknown */ 284 /* 2nd descriptor */ 285 0xFFFFFFFF, /* rss set as unknown */ 286 0xFFFF0504, /* vlan_macip set as unknown */ 287 /* octet 15~14, 16 bits data_len */ 288 0xFFFF0504, /* skip high 16 bits pkt_len, zero out */ 289 /* octet 15~14, low 16 bits pkt_len */ 290 0xFFFFFFFF, /* pkt_type set as unknown */ 291 /* 3rd descriptor */ 292 0xFFFFFFFF, /* rss set as unknown */ 293 0xFFFF0504, /* vlan_macip set as unknown */ 294 /* octet 15~14, 16 bits data_len */ 295 0xFFFF0504, /* skip high 16 bits pkt_len, zero out */ 296 /* octet 15~14, low 16 bits pkt_len */ 297 0xFFFFFFFF, /* pkt_type set as unknown */ 298 /* 4th descriptor */ 299 0xFFFFFFFF, /* rss set as unknown */ 300 0xFFFF0504, /* vlan_macip set as unknown */ 301 /* octet 15~14, 16 bits data_len */ 302 0xFFFF0504, /* skip high 16 bits pkt_len, zero out */ 303 /* octet 15~14, low 16 bits pkt_len */ 304 0xFFFFFFFF /* pkt_type set as unknown */ 305 ); 306 /** 307 * compile-time check the shuffle layout is correct. 308 * NOTE: the first field (lowest address) is given last in set_epi 309 * calls above. 310 */ 311 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != 312 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); 313 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != 314 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); 315 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) != 316 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10); 317 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) != 318 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); 319 320 uint16_t i, received; 321 322 for (i = 0, received = 0; i < nb_pkts; 323 i += IDPF_DESCS_PER_LOOP_AVX, 324 rxdp += IDPF_DESCS_PER_LOOP_AVX) { 325 /* step 1, copy over 8 mbuf pointers to rx_pkts array */ 326 _mm256_storeu_si256((void *)&rx_pkts[i], 327 _mm256_loadu_si256((void *)&sw_ring[i])); 328 #ifdef RTE_ARCH_X86_64 329 _mm256_storeu_si256 330 ((void *)&rx_pkts[i + 4], 331 _mm256_loadu_si256((void *)&sw_ring[i + 4])); 332 #endif 333 334 __m512i raw_desc0_3, raw_desc4_7; 335 const __m128i raw_desc7 = 336 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7)); 337 rte_compiler_barrier(); 338 const __m128i raw_desc6 = 339 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6)); 340 rte_compiler_barrier(); 341 const __m128i raw_desc5 = 342 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5)); 343 rte_compiler_barrier(); 344 const __m128i raw_desc4 = 345 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4)); 346 rte_compiler_barrier(); 347 const __m128i raw_desc3 = 348 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3)); 349 rte_compiler_barrier(); 350 const __m128i raw_desc2 = 351 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2)); 352 rte_compiler_barrier(); 353 const __m128i raw_desc1 = 354 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1)); 355 rte_compiler_barrier(); 356 const __m128i raw_desc0 = 357 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0)); 358 359 raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4); 360 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1); 361 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2); 362 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3); 363 raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0); 364 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1); 365 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2); 366 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3); 367 368 /** 369 * convert descriptors 4-7 into mbufs, adjusting length and 370 * re-arranging fields. Then write into the mbuf 371 */ 372 const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7, 373 PKTLEN_SHIFT); 374 const __m512i desc4_7 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK, 375 raw_desc4_7, 376 len4_7); 377 __m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk); 378 379 /** 380 * to get packet types, shift 64-bit values down 30 bits 381 * and so ptype is in lower 8-bits in each 382 */ 383 const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16); 384 const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1); 385 const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0); 386 const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16); 387 const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0); 388 const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16); 389 const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0); 390 391 const __m512i ptype4_7 = _mm512_set_epi32 392 (0, 0, 0, type_table[ptype7], 393 0, 0, 0, type_table[ptype6], 394 0, 0, 0, type_table[ptype5], 395 0, 0, 0, type_table[ptype4]); 396 mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7); 397 398 /** 399 * convert descriptors 0-3 into mbufs, adjusting length and 400 * re-arranging fields. Then write into the mbuf 401 */ 402 const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3, 403 PKTLEN_SHIFT); 404 const __m512i desc0_3 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK, 405 raw_desc0_3, 406 len0_3); 407 __m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk); 408 409 /* get the packet types */ 410 const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16); 411 const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1); 412 const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0); 413 const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16); 414 const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0); 415 const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16); 416 const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0); 417 418 const __m512i ptype0_3 = _mm512_set_epi32 419 (0, 0, 0, type_table[ptype3], 420 0, 0, 0, type_table[ptype2], 421 0, 0, 0, type_table[ptype1], 422 0, 0, 0, type_table[ptype0]); 423 mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3); 424 425 /** 426 * use permute/extract to get status content 427 * After the operations, the packets status flags are in the 428 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6] 429 */ 430 /* merge the status bits into one register */ 431 const __m512i status_permute_msk = _mm512_set_epi32 432 (0, 0, 0, 0, 433 0, 0, 0, 0, 434 22, 30, 6, 14, 435 18, 26, 2, 10); 436 const __m512i raw_status0_7 = _mm512_permutex2var_epi32 437 (raw_desc4_7, status_permute_msk, raw_desc0_3); 438 __m256i status0_7 = _mm512_extracti64x4_epi64 439 (raw_status0_7, 0); 440 441 /* now do flag manipulation */ 442 443 /** 444 * At this point, we have the 8 sets of flags in the low 16-bits 445 * of each 32-bit value. 446 * We want to extract these, and merge them with the mbuf init 447 * data so we can do a single write to the mbuf to set the flags 448 * and all the other initialization fields. Extracting the 449 * appropriate flags means that we have to do a shift and blend 450 * for each mbuf before we do the write. However, we can also 451 * add in the previously computed rx_descriptor fields to 452 * make a single 256-bit write per mbuf 453 */ 454 /* check the structure matches expectations */ 455 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != 456 offsetof(struct rte_mbuf, rearm_data) + 8); 457 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != 458 RTE_ALIGN(offsetof(struct rte_mbuf, 459 rearm_data), 460 16)); 461 /* build up data and do writes */ 462 __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5, 463 rearm6, rearm7; 464 const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0); 465 const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1); 466 const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0); 467 const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1); 468 469 rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20); 470 rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20); 471 rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20); 472 rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20); 473 474 /* write to mbuf */ 475 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data, 476 rearm6); 477 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data, 478 rearm4); 479 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, 480 rearm2); 481 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, 482 rearm0); 483 484 rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0); 485 rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0); 486 rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0); 487 rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0); 488 489 /* again write to mbufs */ 490 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data, 491 rearm7); 492 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data, 493 rearm5); 494 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, 495 rearm3); 496 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, 497 rearm1); 498 499 /* perform dd_check */ 500 status0_7 = _mm256_and_si256(status0_7, dd_check); 501 status0_7 = _mm256_packs_epi32(status0_7, 502 _mm256_setzero_si256()); 503 504 uint64_t burst = rte_popcount64 505 (_mm_cvtsi128_si64 506 (_mm256_extracti128_si256 507 (status0_7, 1))); 508 burst += rte_popcount64 509 (_mm_cvtsi128_si64 510 (_mm256_castsi256_si128(status0_7))); 511 received += burst; 512 if (burst != IDPF_DESCS_PER_LOOP_AVX) 513 break; 514 } 515 516 /* update tail pointers */ 517 rxq->rx_tail += received; 518 rxq->rx_tail &= (rxq->nb_rx_desc - 1); 519 if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */ 520 rxq->rx_tail--; 521 received--; 522 } 523 rxq->rxrearm_nb += received; 524 return received; 525 } 526 527 /** 528 * Notice: 529 * - nb_pkts < IDPF_DESCS_PER_LOOP, just return no packet 530 */ 531 uint16_t 532 idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts, 533 uint16_t nb_pkts) 534 { 535 return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts); 536 } 537 538 static __rte_always_inline void 539 idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq) 540 { 541 struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start]; 542 volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring; 543 uint16_t rx_id; 544 int i; 545 546 rxdp += rx_bufq->rxrearm_start; 547 548 /* Pull 'n' more MBUFs into the software ring */ 549 if (rte_mempool_get_bulk(rx_bufq->mp, 550 (void *)rxp, 551 IDPF_RXQ_REARM_THRESH) < 0) { 552 if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= 553 rx_bufq->nb_rx_desc) { 554 __m128i dma_addr0; 555 556 dma_addr0 = _mm_setzero_si128(); 557 for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) { 558 rxp[i] = &rx_bufq->fake_mbuf; 559 _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]), 560 dma_addr0); 561 } 562 } 563 rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed, 564 IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed); 565 return; 566 } 567 568 /* Initialize the mbufs in vector, process 8 mbufs in one loop */ 569 for (i = 0; i < IDPF_RXQ_REARM_THRESH; 570 i += 8, rxp += 8, rxdp += 8) { 571 rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM; 572 rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM; 573 rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM; 574 rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM; 575 rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM; 576 rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM; 577 rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM; 578 rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM; 579 } 580 581 rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH; 582 if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc) 583 rx_bufq->rxrearm_start = 0; 584 585 rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH; 586 587 rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ? 588 (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1)); 589 590 /* Update the tail pointer on the NIC */ 591 IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id); 592 } 593 594 static __rte_always_inline void 595 idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq) 596 { 597 int i; 598 uint16_t rx_id; 599 volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring; 600 struct rte_mempool_cache *cache = 601 rte_mempool_default_cache(rx_bufq->mp, rte_lcore_id()); 602 struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start]; 603 604 rxdp += rx_bufq->rxrearm_start; 605 606 if (unlikely(!cache)) 607 return idpf_splitq_rearm_common(rx_bufq); 608 609 /* We need to pull 'n' more MBUFs into the software ring from mempool 610 * We inline the mempool function here, so we can vectorize the copy 611 * from the cache into the shadow ring. 612 */ 613 614 /* Can this be satisfied from the cache? */ 615 if (cache->len < IDPF_RXQ_REARM_THRESH) { 616 /* No. Backfill the cache first, and then fill from it */ 617 uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size - 618 cache->len); 619 620 /* How many do we require i.e. number to fill the cache + the request */ 621 int ret = rte_mempool_ops_dequeue_bulk 622 (rx_bufq->mp, &cache->objs[cache->len], req); 623 if (ret == 0) { 624 cache->len += req; 625 } else { 626 if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= 627 rx_bufq->nb_rx_desc) { 628 __m128i dma_addr0; 629 630 dma_addr0 = _mm_setzero_si128(); 631 for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) { 632 rxp[i] = &rx_bufq->fake_mbuf; 633 _mm_storeu_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]), 634 dma_addr0); 635 } 636 } 637 rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed, 638 IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed); 639 return; 640 } 641 } 642 643 const __m512i iova_offsets = _mm512_set1_epi64(offsetof 644 (struct rte_mbuf, buf_iova)); 645 const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM); 646 647 /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking 648 * from mempool cache and populating both shadow and HW rings 649 */ 650 for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) { 651 const __m512i mbuf_ptrs = _mm512_loadu_si512 652 (&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]); 653 _mm512_storeu_si512(rxp, mbuf_ptrs); 654 655 const __m512i iova_base_addrs = _mm512_i64gather_epi64 656 (_mm512_add_epi64(mbuf_ptrs, iova_offsets), 657 0, /* base */ 658 1 /* scale */); 659 const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs, 660 headroom); 661 662 const __m512i iova_addrs_1 = _mm512_bsrli_epi128(iova_addrs, 8); 663 664 rxdp[0].split_rd.pkt_addr = 665 _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 0)); 666 rxdp[1].split_rd.pkt_addr = 667 _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 0)); 668 rxdp[2].split_rd.pkt_addr = 669 _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 1)); 670 rxdp[3].split_rd.pkt_addr = 671 _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 1)); 672 rxdp[4].split_rd.pkt_addr = 673 _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 2)); 674 rxdp[5].split_rd.pkt_addr = 675 _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 2)); 676 rxdp[6].split_rd.pkt_addr = 677 _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 3)); 678 rxdp[7].split_rd.pkt_addr = 679 _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 3)); 680 681 rxp += IDPF_DESCS_PER_LOOP_AVX; 682 rxdp += IDPF_DESCS_PER_LOOP_AVX; 683 cache->len -= IDPF_DESCS_PER_LOOP_AVX; 684 } 685 686 rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH; 687 if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc) 688 rx_bufq->rxrearm_start = 0; 689 690 rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH; 691 692 rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ? 693 (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1)); 694 695 /* Update the tail pointer on the NIC */ 696 IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id); 697 } 698 699 static __rte_always_inline uint16_t 700 _idpf_splitq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq, 701 struct rte_mbuf **rx_pkts, 702 uint16_t nb_pkts) 703 { 704 const uint32_t *type_table = rxq->adapter->ptype_tbl; 705 const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, 706 rxq->bufq2->mbuf_initializer); 707 /* only handle bufq2 here */ 708 struct rte_mbuf **sw_ring = &rxq->bufq2->sw_ring[rxq->rx_tail]; 709 volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring; 710 711 rxdp += rxq->rx_tail; 712 713 rte_prefetch0(rxdp); 714 715 /* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */ 716 nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX); 717 718 /* See if we need to rearm the RX queue - gives the prefetch a bit 719 * of time to act 720 */ 721 if (rxq->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH) 722 idpf_splitq_rearm(rxq->bufq2); 723 724 /* Before we start moving massive data around, check to see if 725 * there is actually a packet available 726 */ 727 if (((rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id & 728 VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) >> 729 VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) != rxq->expected_gen_id) 730 return 0; 731 732 const __m512i dd_check = _mm512_set1_epi64(1); 733 const __m512i gen_check = _mm512_set1_epi64((uint64_t)1<<46); 734 735 /* mask to shuffle from desc. to mbuf (4 descriptors)*/ 736 const __m512i shuf_msk = 737 _mm512_set_epi32 738 (/* 1st descriptor */ 739 0xFFFFFFFF, /* octet 4~7, 32bits rss */ 740 0xFFFF0504, /* octet 2~3, low 16 bits vlan_macip */ 741 /* octet 15~14, 16 bits data_len */ 742 0xFFFF0504, /* skip high 16 bits pkt_len, zero out */ 743 /* octet 15~14, low 16 bits pkt_len */ 744 0xFFFFFFFF, /* pkt_type set as unknown */ 745 /* 2nd descriptor */ 746 0xFFFFFFFF, /* octet 4~7, 32bits rss */ 747 0xFFFF0504, /* octet 2~3, low 16 bits vlan_macip */ 748 /* octet 15~14, 16 bits data_len */ 749 0xFFFF0504, /* skip high 16 bits pkt_len, zero out */ 750 /* octet 15~14, low 16 bits pkt_len */ 751 0xFFFFFFFF, /* pkt_type set as unknown */ 752 /* 3rd descriptor */ 753 0xFFFFFFFF, /* octet 4~7, 32bits rss */ 754 0xFFFF0504, /* octet 2~3, low 16 bits vlan_macip */ 755 /* octet 15~14, 16 bits data_len */ 756 0xFFFF0504, /* skip high 16 bits pkt_len, zero out */ 757 /* octet 15~14, low 16 bits pkt_len */ 758 0xFFFFFFFF, /* pkt_type set as unknown */ 759 /* 4th descriptor */ 760 0xFFFFFFFF, /* octet 4~7, 32bits rss */ 761 0xFFFF0504, /* octet 2~3, low 16 bits vlan_macip */ 762 /* octet 15~14, 16 bits data_len */ 763 0xFFFF0504, /* skip high 16 bits pkt_len, zero out */ 764 /* octet 15~14, low 16 bits pkt_len */ 765 0xFFFFFFFF /* pkt_type set as unknown */ 766 ); 767 /** 768 * compile-time check the above crc and shuffle layout is correct. 769 * NOTE: the first field (lowest address) is given last in set_epi 770 * calls above. 771 */ 772 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != 773 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); 774 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != 775 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); 776 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) != 777 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10); 778 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) != 779 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); 780 781 uint16_t i, received; 782 783 for (i = 0, received = 0; i < nb_pkts; 784 i += IDPF_DESCS_PER_LOOP_AVX, 785 rxdp += IDPF_DESCS_PER_LOOP_AVX) { 786 /* step 1, copy over 8 mbuf pointers to rx_pkts array */ 787 _mm256_storeu_si256((void *)&rx_pkts[i], 788 _mm256_loadu_si256((void *)&sw_ring[i])); 789 #ifdef RTE_ARCH_X86_64 790 _mm256_storeu_si256 791 ((void *)&rx_pkts[i + 4], 792 _mm256_loadu_si256((void *)&sw_ring[i + 4])); 793 #endif 794 795 __m512i raw_desc0_3, raw_desc4_7; 796 const __m128i raw_desc7 = 797 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7)); 798 rte_compiler_barrier(); 799 const __m128i raw_desc6 = 800 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6)); 801 rte_compiler_barrier(); 802 const __m128i raw_desc5 = 803 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5)); 804 rte_compiler_barrier(); 805 const __m128i raw_desc4 = 806 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4)); 807 rte_compiler_barrier(); 808 const __m128i raw_desc3 = 809 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3)); 810 rte_compiler_barrier(); 811 const __m128i raw_desc2 = 812 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2)); 813 rte_compiler_barrier(); 814 const __m128i raw_desc1 = 815 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1)); 816 rte_compiler_barrier(); 817 const __m128i raw_desc0 = 818 _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0)); 819 820 raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4); 821 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1); 822 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2); 823 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3); 824 raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0); 825 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1); 826 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2); 827 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3); 828 829 /** 830 * convert descriptors 4-7 into mbufs, adjusting length and 831 * re-arranging fields. Then write into the mbuf 832 */ 833 const __m512i len_mask = _mm512_set_epi32(0xffffffff, 0xffffffff, 834 0xffff3fff, 0xffffffff, 835 0xffffffff, 0xffffffff, 836 0xffff3fff, 0xffffffff, 837 0xffffffff, 0xffffffff, 838 0xffff3fff, 0xffffffff, 839 0xffffffff, 0xffffffff, 840 0xffff3fff, 0xffffffff); 841 const __m512i desc4_7 = _mm512_and_epi32(raw_desc4_7, len_mask); 842 __m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk); 843 844 /** 845 * to get packet types, shift 64-bit values down 30 bits 846 * and so ptype is in lower 8-bits in each 847 */ 848 const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16); 849 const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1); 850 const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0); 851 const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16); 852 const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0); 853 const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16); 854 const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0); 855 856 const __m512i ptype4_7 = _mm512_set_epi32 857 (0, 0, 0, type_table[ptype7], 858 0, 0, 0, type_table[ptype6], 859 0, 0, 0, type_table[ptype5], 860 0, 0, 0, type_table[ptype4]); 861 mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7); 862 863 /** 864 * convert descriptors 0-3 into mbufs, adjusting length and 865 * re-arranging fields. Then write into the mbuf 866 */ 867 const __m512i desc0_3 = _mm512_and_epi32(raw_desc0_3, len_mask); 868 __m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk); 869 870 /* get the packet types */ 871 const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16); 872 const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1); 873 const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0); 874 const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16); 875 const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0); 876 const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16); 877 const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0); 878 879 const __m512i ptype0_3 = _mm512_set_epi32 880 (0, 0, 0, type_table[ptype3], 881 0, 0, 0, type_table[ptype2], 882 0, 0, 0, type_table[ptype1], 883 0, 0, 0, type_table[ptype0]); 884 mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3); 885 886 /** 887 * use permute/extract to get status and generation bit content 888 * After the operations, the packets status flags are in the 889 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6] 890 */ 891 892 const __m512i dd_permute_msk = _mm512_set_epi64 893 (11, 15, 3, 7, 9, 13, 1, 5); 894 const __m512i status0_7 = _mm512_permutex2var_epi64 895 (raw_desc4_7, dd_permute_msk, raw_desc0_3); 896 const __m512i gen_permute_msk = _mm512_set_epi64 897 (10, 14, 2, 6, 8, 12, 0, 4); 898 const __m512i raw_gen0_7 = _mm512_permutex2var_epi64 899 (raw_desc4_7, gen_permute_msk, raw_desc0_3); 900 901 /* now do flag manipulation */ 902 903 /** 904 * At this point, we have the 8 sets of flags in the low 16-bits 905 * of each 32-bit value in vlan0. 906 * We want to extract these, and merge them with the mbuf init 907 * data so we can do a single write to the mbuf to set the flags 908 * and all the other initialization fields. Extracting the 909 * appropriate flags means that we have to do a shift and blend 910 * for each mbuf before we do the write. However, we can also 911 * add in the previously computed rx_descriptor fields to 912 * make a single 256-bit write per mbuf 913 */ 914 /* check the structure matches expectations */ 915 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != 916 offsetof(struct rte_mbuf, rearm_data) + 8); 917 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != 918 RTE_ALIGN(offsetof(struct rte_mbuf, 919 rearm_data), 920 16)); 921 /* build up data and do writes */ 922 __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5, 923 rearm6, rearm7; 924 const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0); 925 const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1); 926 const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0); 927 const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1); 928 929 rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20); 930 rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20); 931 rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20); 932 rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20); 933 934 /* write to mbuf */ 935 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data, 936 rearm6); 937 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data, 938 rearm4); 939 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, 940 rearm2); 941 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, 942 rearm0); 943 944 rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0); 945 rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0); 946 rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0); 947 rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0); 948 949 /* again write to mbufs */ 950 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data, 951 rearm7); 952 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data, 953 rearm5); 954 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, 955 rearm3); 956 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, 957 rearm1); 958 959 const __mmask8 dd_mask = _mm512_cmpeq_epi64_mask( 960 _mm512_and_epi64(status0_7, dd_check), dd_check); 961 const __mmask8 gen_mask = _mm512_cmpeq_epi64_mask( 962 _mm512_and_epi64(raw_gen0_7, gen_check), 963 _mm512_set1_epi64((uint64_t)rxq->expected_gen_id << 46)); 964 const __mmask8 recv_mask = _kand_mask8(dd_mask, gen_mask); 965 uint16_t burst = rte_popcount32(_cvtmask8_u32(recv_mask)); 966 967 received += burst; 968 if (burst != IDPF_DESCS_PER_LOOP_AVX) 969 break; 970 } 971 972 /* update tail pointers */ 973 rxq->rx_tail += received; 974 rxq->expected_gen_id ^= ((rxq->rx_tail & rxq->nb_rx_desc) != 0); 975 rxq->rx_tail &= (rxq->nb_rx_desc - 1); 976 if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */ 977 rxq->rx_tail--; 978 received--; 979 } 980 981 rxq->bufq2->rxrearm_nb += received; 982 return received; 983 } 984 985 /* only bufq2 can receive pkts */ 986 uint16_t 987 idpf_dp_splitq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts, 988 uint16_t nb_pkts) 989 { 990 return _idpf_splitq_recv_raw_pkts_avx512(rx_queue, rx_pkts, 991 nb_pkts); 992 } 993 994 static __rte_always_inline int 995 idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq) 996 { 997 struct idpf_tx_vec_entry *txep; 998 uint32_t n; 999 uint32_t i; 1000 int nb_free = 0; 1001 struct rte_mbuf *m, *free[txq->rs_thresh]; 1002 1003 /* check DD bits on threshold descriptor */ 1004 if ((txq->tx_ring[txq->next_dd].qw1 & 1005 rte_cpu_to_le_64(IDPF_TXD_QW1_DTYPE_M)) != 1006 rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE)) 1007 return 0; 1008 1009 n = txq->rs_thresh; 1010 1011 /* first buffer to free from S/W ring is at index 1012 * tx_next_dd - (tx_rs_thresh-1) 1013 */ 1014 txep = (void *)txq->sw_ring; 1015 txep += txq->next_dd - (n - 1); 1016 1017 if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) { 1018 struct rte_mempool *mp = txep[0].mbuf->pool; 1019 struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, 1020 rte_lcore_id()); 1021 void **cache_objs; 1022 1023 if (cache == NULL || cache->len == 0) 1024 goto normal; 1025 1026 cache_objs = &cache->objs[cache->len]; 1027 1028 if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) { 1029 rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n); 1030 goto done; 1031 } 1032 1033 /* The cache follows the following algorithm 1034 * 1. Add the objects to the cache 1035 * 2. Anything greater than the cache min value (if it crosses the 1036 * cache flush threshold) is flushed to the ring. 1037 */ 1038 /* Add elements back into the cache */ 1039 uint32_t copied = 0; 1040 /* n is multiple of 32 */ 1041 while (copied < n) { 1042 #ifdef RTE_ARCH_64 1043 const __m512i a = _mm512_loadu_si512(&txep[copied]); 1044 const __m512i b = _mm512_loadu_si512(&txep[copied + 8]); 1045 const __m512i c = _mm512_loadu_si512(&txep[copied + 16]); 1046 const __m512i d = _mm512_loadu_si512(&txep[copied + 24]); 1047 1048 _mm512_storeu_si512(&cache_objs[copied], a); 1049 _mm512_storeu_si512(&cache_objs[copied + 8], b); 1050 _mm512_storeu_si512(&cache_objs[copied + 16], c); 1051 _mm512_storeu_si512(&cache_objs[copied + 24], d); 1052 #else 1053 const __m512i a = _mm512_loadu_si512(&txep[copied]); 1054 const __m512i b = _mm512_loadu_si512(&txep[copied + 16]); 1055 _mm512_storeu_si512(&cache_objs[copied], a); 1056 _mm512_storeu_si512(&cache_objs[copied + 16], b); 1057 #endif 1058 copied += 32; 1059 } 1060 cache->len += n; 1061 1062 if (cache->len >= cache->flushthresh) { 1063 rte_mempool_ops_enqueue_bulk(mp, 1064 &cache->objs[cache->size], 1065 cache->len - cache->size); 1066 cache->len = cache->size; 1067 } 1068 goto done; 1069 } 1070 1071 normal: 1072 m = rte_pktmbuf_prefree_seg(txep[0].mbuf); 1073 if (likely(m != NULL)) { 1074 free[0] = m; 1075 nb_free = 1; 1076 for (i = 1; i < n; i++) { 1077 m = rte_pktmbuf_prefree_seg(txep[i].mbuf); 1078 if (likely(m != NULL)) { 1079 if (likely(m->pool == free[0]->pool)) { 1080 free[nb_free++] = m; 1081 } else { 1082 rte_mempool_put_bulk(free[0]->pool, 1083 (void *)free, 1084 nb_free); 1085 free[0] = m; 1086 nb_free = 1; 1087 } 1088 } 1089 } 1090 rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free); 1091 } else { 1092 for (i = 1; i < n; i++) { 1093 m = rte_pktmbuf_prefree_seg(txep[i].mbuf); 1094 if (m != NULL) 1095 rte_mempool_put(m->pool, m); 1096 } 1097 } 1098 1099 done: 1100 /* buffers were freed, update counters */ 1101 txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh); 1102 txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh); 1103 if (txq->next_dd >= txq->nb_tx_desc) 1104 txq->next_dd = (uint16_t)(txq->rs_thresh - 1); 1105 1106 return txq->rs_thresh; 1107 } 1108 1109 static __rte_always_inline void 1110 tx_backlog_entry_avx512(struct idpf_tx_vec_entry *txep, 1111 struct rte_mbuf **tx_pkts, uint16_t nb_pkts) 1112 { 1113 int i; 1114 1115 for (i = 0; i < (int)nb_pkts; ++i) 1116 txep[i].mbuf = tx_pkts[i]; 1117 } 1118 1119 static __rte_always_inline void 1120 idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp, 1121 struct rte_mbuf *pkt, uint64_t flags) 1122 { 1123 uint64_t high_qw = 1124 (IDPF_TX_DESC_DTYPE_DATA | 1125 ((uint64_t)flags << IDPF_TXD_QW1_CMD_S) | 1126 ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S)); 1127 1128 __m128i descriptor = _mm_set_epi64x(high_qw, 1129 pkt->buf_iova + pkt->data_off); 1130 _mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor); 1131 } 1132 1133 #define IDPF_TX_LEN_MASK 0xAA 1134 #define IDPF_TX_OFF_MASK 0x55 1135 static __rte_always_inline void 1136 idpf_singleq_vtx(volatile struct idpf_base_tx_desc *txdp, 1137 struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags) 1138 { 1139 const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_DATA | 1140 ((uint64_t)flags << IDPF_TXD_QW1_CMD_S)); 1141 1142 /* if unaligned on 32-bit boundary, do one to align */ 1143 if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) { 1144 idpf_singleq_vtx1(txdp, *pkt, flags); 1145 nb_pkts--, txdp++, pkt++; 1146 } 1147 1148 /* do 4 at a time while possible, in bursts */ 1149 for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) { 1150 uint64_t hi_qw3 = 1151 hi_qw_tmpl | 1152 ((uint64_t)pkt[3]->data_len << 1153 IDPF_TXD_QW1_TX_BUF_SZ_S); 1154 uint64_t hi_qw2 = 1155 hi_qw_tmpl | 1156 ((uint64_t)pkt[2]->data_len << 1157 IDPF_TXD_QW1_TX_BUF_SZ_S); 1158 uint64_t hi_qw1 = 1159 hi_qw_tmpl | 1160 ((uint64_t)pkt[1]->data_len << 1161 IDPF_TXD_QW1_TX_BUF_SZ_S); 1162 uint64_t hi_qw0 = 1163 hi_qw_tmpl | 1164 ((uint64_t)pkt[0]->data_len << 1165 IDPF_TXD_QW1_TX_BUF_SZ_S); 1166 1167 __m512i desc0_3 = 1168 _mm512_set_epi64 1169 (hi_qw3, 1170 pkt[3]->buf_iova + pkt[3]->data_off, 1171 hi_qw2, 1172 pkt[2]->buf_iova + pkt[2]->data_off, 1173 hi_qw1, 1174 pkt[1]->buf_iova + pkt[1]->data_off, 1175 hi_qw0, 1176 pkt[0]->buf_iova + pkt[0]->data_off); 1177 _mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3); 1178 } 1179 1180 /* do any last ones */ 1181 while (nb_pkts) { 1182 idpf_singleq_vtx1(txdp, *pkt, flags); 1183 txdp++, pkt++, nb_pkts--; 1184 } 1185 } 1186 1187 static __rte_always_inline uint16_t 1188 idpf_singleq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, 1189 uint16_t nb_pkts) 1190 { 1191 struct idpf_tx_queue *txq = tx_queue; 1192 volatile struct idpf_base_tx_desc *txdp; 1193 struct idpf_tx_vec_entry *txep; 1194 uint16_t n, nb_commit, tx_id; 1195 uint64_t flags = IDPF_TX_DESC_CMD_EOP; 1196 uint64_t rs = IDPF_TX_DESC_CMD_RS | flags; 1197 1198 /* cross rx_thresh boundary is not allowed */ 1199 nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh); 1200 1201 if (txq->nb_free < txq->free_thresh) 1202 idpf_tx_singleq_free_bufs_avx512(txq); 1203 1204 nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts); 1205 nb_commit = nb_pkts; 1206 if (unlikely(nb_pkts == 0)) 1207 return 0; 1208 1209 tx_id = txq->tx_tail; 1210 txdp = &txq->tx_ring[tx_id]; 1211 txep = (void *)txq->sw_ring; 1212 txep += tx_id; 1213 1214 txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts); 1215 1216 n = (uint16_t)(txq->nb_tx_desc - tx_id); 1217 if (nb_commit >= n) { 1218 tx_backlog_entry_avx512(txep, tx_pkts, n); 1219 1220 idpf_singleq_vtx(txdp, tx_pkts, n - 1, flags); 1221 tx_pkts += (n - 1); 1222 txdp += (n - 1); 1223 1224 idpf_singleq_vtx1(txdp, *tx_pkts++, rs); 1225 1226 nb_commit = (uint16_t)(nb_commit - n); 1227 1228 tx_id = 0; 1229 txq->next_rs = (uint16_t)(txq->rs_thresh - 1); 1230 1231 /* avoid reach the end of ring */ 1232 txdp = &txq->tx_ring[tx_id]; 1233 txep = (void *)txq->sw_ring; 1234 txep += tx_id; 1235 } 1236 1237 tx_backlog_entry_avx512(txep, tx_pkts, nb_commit); 1238 1239 idpf_singleq_vtx(txdp, tx_pkts, nb_commit, flags); 1240 1241 tx_id = (uint16_t)(tx_id + nb_commit); 1242 if (tx_id > txq->next_rs) { 1243 txq->tx_ring[txq->next_rs].qw1 |= 1244 rte_cpu_to_le_64(((uint64_t)IDPF_TX_DESC_CMD_RS) << 1245 IDPF_TXD_QW1_CMD_S); 1246 txq->next_rs = 1247 (uint16_t)(txq->next_rs + txq->rs_thresh); 1248 } 1249 1250 txq->tx_tail = tx_id; 1251 1252 IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail); 1253 1254 return nb_pkts; 1255 } 1256 1257 static __rte_always_inline uint16_t 1258 idpf_singleq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts, 1259 uint16_t nb_pkts) 1260 { 1261 uint16_t nb_tx = 0; 1262 struct idpf_tx_queue *txq = tx_queue; 1263 1264 while (nb_pkts) { 1265 uint16_t ret, num; 1266 1267 num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh); 1268 ret = idpf_singleq_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx], 1269 num); 1270 nb_tx += ret; 1271 nb_pkts -= ret; 1272 if (ret < num) 1273 break; 1274 } 1275 1276 return nb_tx; 1277 } 1278 1279 uint16_t 1280 idpf_dp_singleq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, 1281 uint16_t nb_pkts) 1282 { 1283 return idpf_singleq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts); 1284 } 1285 1286 static __rte_always_inline void 1287 idpf_splitq_scan_cq_ring(struct idpf_tx_queue *cq) 1288 { 1289 struct idpf_splitq_tx_compl_desc *compl_ring; 1290 struct idpf_tx_queue *txq; 1291 uint16_t genid, txq_qid, cq_qid, i; 1292 uint8_t ctype; 1293 1294 cq_qid = cq->tx_tail; 1295 1296 for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) { 1297 if (cq_qid == cq->nb_tx_desc) { 1298 cq_qid = 0; 1299 cq->expected_gen_id ^= 1; 1300 } 1301 compl_ring = &cq->compl_ring[cq_qid]; 1302 genid = (compl_ring->qid_comptype_gen & 1303 rte_cpu_to_le_64(IDPF_TXD_COMPLQ_GEN_M)) >> IDPF_TXD_COMPLQ_GEN_S; 1304 if (genid != cq->expected_gen_id) 1305 break; 1306 ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) & 1307 IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S; 1308 txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) & 1309 IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S; 1310 txq = cq->txqs[txq_qid - cq->tx_start_qid]; 1311 txq->ctype[ctype]++; 1312 cq_qid++; 1313 } 1314 1315 cq->tx_tail = cq_qid; 1316 } 1317 1318 static __rte_always_inline int 1319 idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq) 1320 { 1321 struct idpf_tx_vec_entry *txep; 1322 uint32_t n; 1323 uint32_t i; 1324 int nb_free = 0; 1325 struct rte_mbuf *m, *free[txq->rs_thresh]; 1326 1327 n = txq->rs_thresh; 1328 1329 /* first buffer to free from S/W ring is at index 1330 * tx_next_dd - (tx_rs_thresh-1) 1331 */ 1332 txep = (void *)txq->sw_ring; 1333 txep += txq->next_dd - (n - 1); 1334 1335 if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) { 1336 struct rte_mempool *mp = txep[0].mbuf->pool; 1337 struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, 1338 rte_lcore_id()); 1339 void **cache_objs; 1340 1341 if (!cache || cache->len == 0) 1342 goto normal; 1343 1344 cache_objs = &cache->objs[cache->len]; 1345 1346 if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) { 1347 rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n); 1348 goto done; 1349 } 1350 1351 /* The cache follows the following algorithm 1352 * 1. Add the objects to the cache 1353 * 2. Anything greater than the cache min value (if it crosses the 1354 * cache flush threshold) is flushed to the ring. 1355 */ 1356 /* Add elements back into the cache */ 1357 uint32_t copied = 0; 1358 /* n is multiple of 32 */ 1359 while (copied < n) { 1360 const __m512i a = _mm512_loadu_si512(&txep[copied]); 1361 const __m512i b = _mm512_loadu_si512(&txep[copied + 8]); 1362 const __m512i c = _mm512_loadu_si512(&txep[copied + 16]); 1363 const __m512i d = _mm512_loadu_si512(&txep[copied + 24]); 1364 1365 _mm512_storeu_si512(&cache_objs[copied], a); 1366 _mm512_storeu_si512(&cache_objs[copied + 8], b); 1367 _mm512_storeu_si512(&cache_objs[copied + 16], c); 1368 _mm512_storeu_si512(&cache_objs[copied + 24], d); 1369 copied += 32; 1370 } 1371 cache->len += n; 1372 1373 if (cache->len >= cache->flushthresh) { 1374 rte_mempool_ops_enqueue_bulk(mp, 1375 &cache->objs[cache->size], 1376 cache->len - cache->size); 1377 cache->len = cache->size; 1378 } 1379 goto done; 1380 } 1381 1382 normal: 1383 m = rte_pktmbuf_prefree_seg(txep[0].mbuf); 1384 if (likely(m)) { 1385 free[0] = m; 1386 nb_free = 1; 1387 for (i = 1; i < n; i++) { 1388 m = rte_pktmbuf_prefree_seg(txep[i].mbuf); 1389 if (likely(m)) { 1390 if (likely(m->pool == free[0]->pool)) { 1391 free[nb_free++] = m; 1392 } else { 1393 rte_mempool_put_bulk(free[0]->pool, 1394 (void *)free, 1395 nb_free); 1396 free[0] = m; 1397 nb_free = 1; 1398 } 1399 } 1400 } 1401 rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free); 1402 } else { 1403 for (i = 1; i < n; i++) { 1404 m = rte_pktmbuf_prefree_seg(txep[i].mbuf); 1405 if (m) 1406 rte_mempool_put(m->pool, m); 1407 } 1408 } 1409 1410 done: 1411 /* buffers were freed, update counters */ 1412 txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh); 1413 txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh); 1414 if (txq->next_dd >= txq->nb_tx_desc) 1415 txq->next_dd = (uint16_t)(txq->rs_thresh - 1); 1416 txq->ctype[IDPF_TXD_COMPLT_RS] -= txq->rs_thresh; 1417 1418 return txq->rs_thresh; 1419 } 1420 1421 #define IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S 48 1422 1423 static __rte_always_inline void 1424 idpf_splitq_vtx1(volatile struct idpf_flex_tx_sched_desc *txdp, 1425 struct rte_mbuf *pkt, uint64_t flags) 1426 { 1427 uint64_t high_qw = 1428 (IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE | 1429 ((uint64_t)flags) | 1430 ((uint64_t)pkt->data_len << IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S)); 1431 1432 __m128i descriptor = _mm_set_epi64x(high_qw, 1433 pkt->buf_iova + pkt->data_off); 1434 _mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor); 1435 } 1436 1437 static __rte_always_inline void 1438 idpf_splitq_vtx(volatile struct idpf_flex_tx_sched_desc *txdp, 1439 struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags) 1440 { 1441 const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE | 1442 ((uint64_t)flags)); 1443 1444 /* if unaligned on 32-bit boundary, do one to align */ 1445 if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) { 1446 idpf_splitq_vtx1(txdp, *pkt, flags); 1447 nb_pkts--, txdp++, pkt++; 1448 } 1449 1450 /* do 4 at a time while possible, in bursts */ 1451 for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) { 1452 uint64_t hi_qw3 = 1453 hi_qw_tmpl | 1454 ((uint64_t)pkt[3]->data_len << 1455 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S); 1456 uint64_t hi_qw2 = 1457 hi_qw_tmpl | 1458 ((uint64_t)pkt[2]->data_len << 1459 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S); 1460 uint64_t hi_qw1 = 1461 hi_qw_tmpl | 1462 ((uint64_t)pkt[1]->data_len << 1463 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S); 1464 uint64_t hi_qw0 = 1465 hi_qw_tmpl | 1466 ((uint64_t)pkt[0]->data_len << 1467 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S); 1468 1469 __m512i desc0_3 = 1470 _mm512_set_epi64 1471 (hi_qw3, 1472 pkt[3]->buf_iova + pkt[3]->data_off, 1473 hi_qw2, 1474 pkt[2]->buf_iova + pkt[2]->data_off, 1475 hi_qw1, 1476 pkt[1]->buf_iova + pkt[1]->data_off, 1477 hi_qw0, 1478 pkt[0]->buf_iova + pkt[0]->data_off); 1479 _mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3); 1480 } 1481 1482 /* do any last ones */ 1483 while (nb_pkts) { 1484 idpf_splitq_vtx1(txdp, *pkt, flags); 1485 txdp++, pkt++, nb_pkts--; 1486 } 1487 } 1488 1489 static __rte_always_inline uint16_t 1490 idpf_splitq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, 1491 uint16_t nb_pkts) 1492 { 1493 struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue; 1494 volatile struct idpf_flex_tx_sched_desc *txdp; 1495 struct idpf_tx_vec_entry *txep; 1496 uint16_t n, nb_commit, tx_id; 1497 /* bit2 is reserved and must be set to 1 according to Spec */ 1498 uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP; 1499 1500 tx_id = txq->tx_tail; 1501 1502 /* cross rx_thresh boundary is not allowed */ 1503 nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh); 1504 1505 nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts); 1506 if (unlikely(nb_pkts == 0)) 1507 return 0; 1508 1509 tx_id = txq->tx_tail; 1510 txdp = &txq->desc_ring[tx_id]; 1511 txep = (void *)txq->sw_ring; 1512 txep += tx_id; 1513 1514 txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts); 1515 1516 n = (uint16_t)(txq->nb_tx_desc - tx_id); 1517 if (nb_commit >= n) { 1518 tx_backlog_entry_avx512(txep, tx_pkts, n); 1519 1520 idpf_splitq_vtx(txdp, tx_pkts, n - 1, cmd_dtype); 1521 tx_pkts += (n - 1); 1522 txdp += (n - 1); 1523 1524 idpf_splitq_vtx1(txdp, *tx_pkts++, cmd_dtype); 1525 1526 nb_commit = (uint16_t)(nb_commit - n); 1527 1528 tx_id = 0; 1529 txq->next_rs = (uint16_t)(txq->rs_thresh - 1); 1530 1531 /* avoid reach the end of ring */ 1532 txdp = &txq->desc_ring[tx_id]; 1533 txep = (void *)txq->sw_ring; 1534 txep += tx_id; 1535 } 1536 1537 tx_backlog_entry_avx512(txep, tx_pkts, nb_commit); 1538 1539 idpf_splitq_vtx(txdp, tx_pkts, nb_commit, cmd_dtype); 1540 1541 tx_id = (uint16_t)(tx_id + nb_commit); 1542 if (tx_id > txq->next_rs) 1543 txq->next_rs = 1544 (uint16_t)(txq->next_rs + txq->rs_thresh); 1545 1546 txq->tx_tail = tx_id; 1547 1548 IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail); 1549 1550 return nb_pkts; 1551 } 1552 1553 static __rte_always_inline uint16_t 1554 idpf_splitq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts, 1555 uint16_t nb_pkts) 1556 { 1557 struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue; 1558 uint16_t nb_tx = 0; 1559 1560 while (nb_pkts) { 1561 uint16_t ret, num; 1562 1563 idpf_splitq_scan_cq_ring(txq->complq); 1564 1565 if (txq->ctype[IDPF_TXD_COMPLT_RS] > txq->free_thresh) 1566 idpf_tx_splitq_free_bufs_avx512(txq); 1567 1568 num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh); 1569 ret = idpf_splitq_xmit_fixed_burst_vec_avx512(tx_queue, 1570 &tx_pkts[nb_tx], 1571 num); 1572 nb_tx += ret; 1573 nb_pkts -= ret; 1574 if (ret < num) 1575 break; 1576 } 1577 1578 return nb_tx; 1579 } 1580 1581 uint16_t 1582 idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, 1583 uint16_t nb_pkts) 1584 { 1585 return idpf_splitq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts); 1586 } 1587 1588 static inline void 1589 idpf_tx_release_mbufs_avx512(struct idpf_tx_queue *txq) 1590 { 1591 unsigned int i; 1592 const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1); 1593 struct idpf_tx_vec_entry *swr = (void *)txq->sw_ring; 1594 1595 if (txq->sw_ring == NULL || txq->nb_free == max_desc) 1596 return; 1597 1598 i = txq->next_dd - txq->rs_thresh + 1; 1599 if (txq->tx_tail < i) { 1600 for (; i < txq->nb_tx_desc; i++) { 1601 rte_pktmbuf_free_seg(swr[i].mbuf); 1602 swr[i].mbuf = NULL; 1603 } 1604 i = 0; 1605 } 1606 for (; i < txq->tx_tail; i++) { 1607 rte_pktmbuf_free_seg(swr[i].mbuf); 1608 swr[i].mbuf = NULL; 1609 } 1610 } 1611 1612 static const struct idpf_txq_ops avx512_tx_vec_ops = { 1613 .release_mbufs = idpf_tx_release_mbufs_avx512, 1614 }; 1615 1616 int __rte_cold 1617 idpf_qc_tx_vec_avx512_setup(struct idpf_tx_queue *txq) 1618 { 1619 if (!txq) 1620 return 0; 1621 1622 txq->ops = &avx512_tx_vec_ops; 1623 return 0; 1624 } 1625