/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2023 Intel Corporation
 */

#include <rte_vect.h>
#include "idpf_common_device.h"
#include "idpf_common_rxtx.h"

#define IDPF_DESCS_PER_LOOP_AVX 8
#define PKTLEN_SHIFT 10

static __rte_always_inline void
idpf_singleq_rearm_common(struct idpf_rx_queue *rxq)
{
    struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
    volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
    uint16_t rx_id;
    int i;

    rxdp += rxq->rxrearm_start;

    /* Pull 'n' more MBUFs into the software ring */
    if (rte_mempool_get_bulk(rxq->mp,
                             (void *)rxp,
                             IDPF_RXQ_REARM_THRESH) < 0) {
        if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
            rxq->nb_rx_desc) {
            __m128i dma_addr0;

            dma_addr0 = _mm_setzero_si128();
            for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
                rxp[i] = &rxq->fake_mbuf;
                _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i].read),
                                dma_addr0);
            }
        }
        rte_atomic_fetch_add_explicit(&rxq->rx_stats.mbuf_alloc_failed,
                                      IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
        return;
    }
    struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
    struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
    __m512i dma_addr0_3, dma_addr4_7;
    __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
    /* Initialize the mbufs in vector, process 8 mbufs in one loop */
    for (i = 0; i < IDPF_RXQ_REARM_THRESH;
         i += 8, rxp += 8, rxdp += 8) {
        __m128i vaddr0, vaddr1, vaddr2, vaddr3;
        __m128i vaddr4, vaddr5, vaddr6, vaddr7;
        __m256i vaddr0_1, vaddr2_3;
        __m256i vaddr4_5, vaddr6_7;
        __m512i vaddr0_3, vaddr4_7;

        mb0 = rxp[0];
        mb1 = rxp[1];
        mb2 = rxp[2];
        mb3 = rxp[3];
        mb4 = rxp[4];
        mb5 = rxp[5];
        mb6 = rxp[6];
        mb7 = rxp[7];

        /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
                         offsetof(struct rte_mbuf, buf_addr) + 8);
        vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
        vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
        vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
        vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
        vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
        vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
        vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
        vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);

        /**
         * merge 0 & 1, by casting 0 to 256-bit and inserting 1
         * into the high lanes. Similarly for 2 & 3, and so on.
         */
        vaddr0_1 =
            _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
                                    vaddr1, 1);
        vaddr2_3 =
            _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
                                    vaddr3, 1);
        vaddr4_5 =
            _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
                                    vaddr5, 1);
        vaddr6_7 =
            _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
                                    vaddr7, 1);
        vaddr0_3 =
            _mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
                               vaddr2_3, 1);
        vaddr4_7 =
            _mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
                               vaddr6_7, 1);

        /* convert pa to dma_addr hdr/data */
        dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
        dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);

        /* add headroom to pa values */
        dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
        dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);

        /* flush desc with pa dma_addr */
        _mm512_store_si512(RTE_CAST_PTR(__m512i *, &rxdp->read), dma_addr0_3);
        _mm512_store_si512(RTE_CAST_PTR(__m512i *, &(rxdp + 4)->read), dma_addr4_7);
    }

    rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
    if (rxq->rxrearm_start >= rxq->nb_rx_desc)
        rxq->rxrearm_start = 0;

    rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;

    rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
                       (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

    /* Update the tail pointer on the NIC */
    IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
}

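/* Rearm the single queue RX ring from the per-lcore mempool cache,
 * inlining the cache dequeue so the copy into the shadow ring can be
 * vectorized; falls back to idpf_singleq_rearm_common() (bulk mempool get)
 * when no cache is configured.
 */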
static __rte_always_inline void
idpf_singleq_rearm(struct idpf_rx_queue *rxq)
{
    int i;
    uint16_t rx_id;
    volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
    struct rte_mempool_cache *cache =
        rte_mempool_default_cache(rxq->mp, rte_lcore_id());
    struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];

    rxdp += rxq->rxrearm_start;

    if (unlikely(cache == NULL))
        return idpf_singleq_rearm_common(rxq);

    /* We need to pull 'n' more MBUFs into the software ring from mempool
     * We inline the mempool function here, so we can vectorize the copy
     * from the cache into the shadow ring.
     */

    /* Can this be satisfied from the cache? */
    if (cache->len < IDPF_RXQ_REARM_THRESH) {
        /* No. Backfill the cache first, and then fill from it */
        uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
                                                cache->len);

        /* How many do we require i.e. number to fill the cache + the request */
        int ret = rte_mempool_ops_dequeue_bulk
                (rxq->mp, &cache->objs[cache->len], req);
        if (ret == 0) {
            cache->len += req;
        } else {
            if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
                rxq->nb_rx_desc) {
                __m128i dma_addr0;

                dma_addr0 = _mm_setzero_si128();
                for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
                    rxp[i] = &rxq->fake_mbuf;
                    _mm_storeu_si128(RTE_CAST_PTR
                            (__m128i *, &rxdp[i].read), dma_addr0);
                }
            }
            rte_atomic_fetch_add_explicit(&rxq->rx_stats.mbuf_alloc_failed,
                                          IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
            return;
        }
    }

    const __m512i iova_offsets = _mm512_set1_epi64(offsetof
                                                   (struct rte_mbuf, buf_iova));
    const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);

    /* to shuffle the addresses to correct slots. Values 4-7 will contain
     * zeros, so use 7 for a zero-value.
     */
    const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);

    /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
     * from mempool cache and populating both shadow and HW rings
     */
    for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) {
        const __m512i mbuf_ptrs = _mm512_loadu_si512
            (&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]);
        _mm512_storeu_si512(rxp, mbuf_ptrs);

        const __m512i iova_base_addrs = _mm512_i64gather_epi64
                (_mm512_add_epi64(mbuf_ptrs, iova_offsets),
                 0, /* base */
                 1 /* scale */);
        const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
                headroom);
        const __m512i iovas0 = _mm512_castsi256_si512
                (_mm512_extracti64x4_epi64(iova_addrs, 0));
        const __m512i iovas1 = _mm512_castsi256_si512
                (_mm512_extracti64x4_epi64(iova_addrs, 1));

        /* permute leaves desc 2-3 addresses in header address slots 0-1
         * but these are ignored by driver since header split not
         * enabled. Similarly for desc 6 & 7.
         */
        const __m512i desc0_1 = _mm512_permutexvar_epi64
                (permute_idx,
                 iovas0);
        const __m512i desc2_3 = _mm512_bsrli_epi128(desc0_1, 8);

        const __m512i desc4_5 = _mm512_permutexvar_epi64
                (permute_idx,
                 iovas1);
        const __m512i desc6_7 = _mm512_bsrli_epi128(desc4_5, 8);

        _mm512_storeu_si512(RTE_CAST_PTR(void *, rxdp), desc0_1);
        _mm512_storeu_si512(RTE_CAST_PTR(void *, (rxdp + 2)), desc2_3);
        _mm512_storeu_si512(RTE_CAST_PTR(void *, (rxdp + 4)), desc4_5);
        _mm512_storeu_si512(RTE_CAST_PTR(void *, (rxdp + 6)), desc6_7);

        rxp += IDPF_DESCS_PER_LOOP_AVX;
        rxdp += IDPF_DESCS_PER_LOOP_AVX;
        cache->len -= IDPF_DESCS_PER_LOOP_AVX;
    }

    rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
    if (rxq->rxrearm_start >= rxq->nb_rx_desc)
        rxq->rxrearm_start = 0;

    rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;

    rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
                       (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

    /* Update the tail pointer on the NIC */
    IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
}

#define IDPF_RX_LEN_MASK 0x80808080
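/* Vectorized single queue receive: converts flex NIC write-back descriptors
 * into mbufs, IDPF_DESCS_PER_LOOP_AVX (8) descriptors per loop iteration,
 * stopping at the first descriptor whose DD bit is not yet set.
 */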
static __rte_always_inline uint16_t
_idpf_singleq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq,
                                   struct rte_mbuf **rx_pkts,
                                   uint16_t nb_pkts)
{
    const uint32_t *type_table = rxq->adapter->ptype_tbl;

    const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
                                                rxq->mbuf_initializer);
    struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
    volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;

    rxdp += rxq->rx_tail;

    rte_prefetch0(rxdp);

    /* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
    nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);

    /* See if we need to rearm the RX queue - gives the prefetch a bit
     * of time to act
     */
    if (rxq->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
        idpf_singleq_rearm(rxq);

    /* Before we start moving massive data around, check to see if
     * there is actually a packet available
     */
    if ((rxdp->flex_nic_wb.status_error0 &
         rte_cpu_to_le_32(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_DD_S)) == 0)
        return 0;

    /* 8 packets DD mask, LSB in each 32-bit value */
    const __m256i dd_check = _mm256_set1_epi32(1);

    /* mask to shuffle from desc. to mbuf (4 descriptors)*/
    const __m512i shuf_msk =
        _mm512_set_epi32
            (/* 1st descriptor */
             0xFFFFFFFF,    /* rss set as unknown */
             0xFFFF0504,    /* vlan_macip set as unknown */
                            /* octet 15~14, 16 bits data_len */
             0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
                            /* octet 15~14, low 16 bits pkt_len */
             0xFFFFFFFF,    /* pkt_type set as unknown */
             /* 2nd descriptor */
             0xFFFFFFFF,    /* rss set as unknown */
             0xFFFF0504,    /* vlan_macip set as unknown */
                            /* octet 15~14, 16 bits data_len */
             0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
                            /* octet 15~14, low 16 bits pkt_len */
             0xFFFFFFFF,    /* pkt_type set as unknown */
             /* 3rd descriptor */
             0xFFFFFFFF,    /* rss set as unknown */
             0xFFFF0504,    /* vlan_macip set as unknown */
                            /* octet 15~14, 16 bits data_len */
             0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
                            /* octet 15~14, low 16 bits pkt_len */
             0xFFFFFFFF,    /* pkt_type set as unknown */
             /* 4th descriptor */
             0xFFFFFFFF,    /* rss set as unknown */
             0xFFFF0504,    /* vlan_macip set as unknown */
                            /* octet 15~14, 16 bits data_len */
             0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
                            /* octet 15~14, low 16 bits pkt_len */
             0xFFFFFFFF     /* pkt_type set as unknown */
            );
    /**
     * compile-time check the shuffle layout is correct.
     * NOTE: the first field (lowest address) is given last in set_epi
     * calls above.
     */
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
                     offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
                     offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
                     offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
                     offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

    uint16_t i, received;

    for (i = 0, received = 0; i < nb_pkts;
         i += IDPF_DESCS_PER_LOOP_AVX,
         rxdp += IDPF_DESCS_PER_LOOP_AVX) {
        /* step 1, copy over 8 mbuf pointers to rx_pkts array */
        _mm256_storeu_si256((void *)&rx_pkts[i],
                            _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
        _mm256_storeu_si256
            ((void *)&rx_pkts[i + 4],
             _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

        __m512i raw_desc0_3, raw_desc4_7;
        const __m128i raw_desc7 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7));
        rte_compiler_barrier();
        const __m128i raw_desc6 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6));
        rte_compiler_barrier();
        const __m128i raw_desc5 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5));
        rte_compiler_barrier();
        const __m128i raw_desc4 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4));
        rte_compiler_barrier();
        const __m128i raw_desc3 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
        rte_compiler_barrier();
        const __m128i raw_desc2 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
        rte_compiler_barrier();
        const __m128i raw_desc1 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
        rte_compiler_barrier();
        const __m128i raw_desc0 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0));

        raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
        raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
        raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
        raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
        raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
        raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
        raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
        raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);

        /**
         * convert descriptors 4-7 into mbufs, adjusting length and
         * re-arranging fields. Then write into the mbuf
         */
        const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7,
                                                 PKTLEN_SHIFT);
        const __m512i desc4_7 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK,
                                                        raw_desc4_7,
                                                        len4_7);
        __m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);

        /**
         * to get packet types, shift 64-bit values down 30 bits
         * and so ptype is in lower 8-bits in each
         */
        const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16);
        const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
        const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
        const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16);
        const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0);
        const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16);
        const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0);

        const __m512i ptype4_7 = _mm512_set_epi32
            (0, 0, 0, type_table[ptype7],
             0, 0, 0, type_table[ptype6],
             0, 0, 0, type_table[ptype5],
             0, 0, 0, type_table[ptype4]);
        mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);

        /**
         * convert descriptors 0-3 into mbufs, adjusting length and
         * re-arranging fields. Then write into the mbuf
         */
        const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3,
                                                 PKTLEN_SHIFT);
        const __m512i desc0_3 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK,
                                                        raw_desc0_3,
                                                        len0_3);
        __m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);

        /* get the packet types */
        const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16);
        const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
        const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
        const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16);
        const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0);
        const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16);
        const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0);

        const __m512i ptype0_3 = _mm512_set_epi32
            (0, 0, 0, type_table[ptype3],
             0, 0, 0, type_table[ptype2],
             0, 0, 0, type_table[ptype1],
             0, 0, 0, type_table[ptype0]);
        mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);

        /**
         * use permute/extract to get status content
         * After the operations, the packets status flags are in the
         * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
         */
        /* merge the status bits into one register */
        const __m512i status_permute_msk = _mm512_set_epi32
            (0, 0, 0, 0,
             0, 0, 0, 0,
             22, 30, 6, 14,
             18, 26, 2, 10);
        const __m512i raw_status0_7 = _mm512_permutex2var_epi32
            (raw_desc4_7, status_permute_msk, raw_desc0_3);
        __m256i status0_7 = _mm512_extracti64x4_epi64
            (raw_status0_7, 0);

        /* now do flag manipulation */

        /**
         * At this point, we have the 8 sets of flags in the low 16-bits
         * of each 32-bit value.
         * We want to extract these, and merge them with the mbuf init
         * data so we can do a single write to the mbuf to set the flags
         * and all the other initialization fields. Extracting the
         * appropriate flags means that we have to do a shift and blend
         * for each mbuf before we do the write. However, we can also
         * add in the previously computed rx_descriptor fields to
         * make a single 256-bit write per mbuf
         */
        /* check the structure matches expectations */
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
                         offsetof(struct rte_mbuf, rearm_data) + 8);
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
                         RTE_ALIGN(offsetof(struct rte_mbuf,
                                            rearm_data),
                                   16));
        /* build up data and do writes */
        __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
            rearm6, rearm7;
        const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
        const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
        const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
        const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);

        rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
        rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
        rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
        rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);

        /* write to mbuf */
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
                            rearm6);
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
                            rearm4);
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
                            rearm2);
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
                            rearm0);

        rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
        rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
        rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
        rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);

        /* again write to mbufs */
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
                            rearm7);
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
                            rearm5);
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
                            rearm3);
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
                            rearm1);

        /* perform dd_check */
        status0_7 = _mm256_and_si256(status0_7, dd_check);
        status0_7 = _mm256_packs_epi32(status0_7,
                                       _mm256_setzero_si256());

        uint64_t burst = rte_popcount64
                            (_mm_cvtsi128_si64
                                (_mm256_extracti128_si256
                                    (status0_7, 1)));
        burst += rte_popcount64
                    (_mm_cvtsi128_si64
                        (_mm256_castsi256_si128(status0_7)));
        received += burst;
        if (burst != IDPF_DESCS_PER_LOOP_AVX)
            break;
    }

    /* update tail pointers */
    rxq->rx_tail += received;
    rxq->rx_tail &= (rxq->nb_rx_desc - 1);
    if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
        rxq->rx_tail--;
        received--;
    }
    rxq->rxrearm_nb += received;
    return received;
}

/**
 * Notice:
 * - nb_pkts < IDPF_DESCS_PER_LOOP, just return no packet
 */
uint16_t
idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
                                 uint16_t nb_pkts)
{
    return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
}

static __rte_always_inline void
idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
{
    struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
    volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
    uint16_t rx_id;
    int i;

    rxdp += rx_bufq->rxrearm_start;

    /* Pull 'n' more MBUFs into the software ring */
    if (rte_mempool_get_bulk(rx_bufq->mp,
                             (void *)rxp,
                             IDPF_RXQ_REARM_THRESH) < 0) {
        if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
            rx_bufq->nb_rx_desc) {
            __m128i dma_addr0;

            dma_addr0 = _mm_setzero_si128();
            for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
                rxp[i] = &rx_bufq->fake_mbuf;
                _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
                                dma_addr0);
            }
        }
        rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
                                      IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
        return;
    }

    /* Initialize the mbufs in vector, process 8 mbufs in one loop */
    for (i = 0; i < IDPF_RXQ_REARM_THRESH;
         i += 8, rxp += 8, rxdp += 8) {
        rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
        rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
        rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
        rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
        rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
        rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
        rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
        rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
    }

    rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
    if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
        rx_bufq->rxrearm_start = 0;

    rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;

    rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
                       (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));

    /* Update the tail pointer on the NIC */
    IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
}

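/* Split queue model counterpart of idpf_singleq_rearm(): refill the buffer
 * queue from the per-lcore mempool cache and write the packet buffer
 * addresses into the split RX buffer descriptors; falls back to
 * idpf_splitq_rearm_common() when no cache is configured.
 */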
static __rte_always_inline void
idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
{
    int i;
    uint16_t rx_id;
    volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
    struct rte_mempool_cache *cache =
        rte_mempool_default_cache(rx_bufq->mp, rte_lcore_id());
    struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];

    rxdp += rx_bufq->rxrearm_start;

    if (unlikely(!cache))
        return idpf_splitq_rearm_common(rx_bufq);

    /* We need to pull 'n' more MBUFs into the software ring from mempool
     * We inline the mempool function here, so we can vectorize the copy
     * from the cache into the shadow ring.
     */

    /* Can this be satisfied from the cache? */
    if (cache->len < IDPF_RXQ_REARM_THRESH) {
        /* No. Backfill the cache first, and then fill from it */
        uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
                                                cache->len);

        /* How many do we require i.e. number to fill the cache + the request */
        int ret = rte_mempool_ops_dequeue_bulk
                (rx_bufq->mp, &cache->objs[cache->len], req);
        if (ret == 0) {
            cache->len += req;
        } else {
            if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
                rx_bufq->nb_rx_desc) {
                __m128i dma_addr0;

                dma_addr0 = _mm_setzero_si128();
                for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
                    rxp[i] = &rx_bufq->fake_mbuf;
                    _mm_storeu_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
                                     dma_addr0);
                }
            }
            rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
                                          IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
            return;
        }
    }

    const __m512i iova_offsets = _mm512_set1_epi64(offsetof
                                                   (struct rte_mbuf, buf_iova));
    const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);

    /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
     * from mempool cache and populating both shadow and HW rings
     */
    for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) {
        const __m512i mbuf_ptrs = _mm512_loadu_si512
            (&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]);
        _mm512_storeu_si512(rxp, mbuf_ptrs);

        const __m512i iova_base_addrs = _mm512_i64gather_epi64
                (_mm512_add_epi64(mbuf_ptrs, iova_offsets),
                 0, /* base */
                 1 /* scale */);
        const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
                headroom);

        const __m512i iova_addrs_1 = _mm512_bsrli_epi128(iova_addrs, 8);

        rxdp[0].split_rd.pkt_addr =
            _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 0));
        rxdp[1].split_rd.pkt_addr =
            _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 0));
        rxdp[2].split_rd.pkt_addr =
            _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 1));
        rxdp[3].split_rd.pkt_addr =
            _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 1));
        rxdp[4].split_rd.pkt_addr =
            _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 2));
        rxdp[5].split_rd.pkt_addr =
            _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 2));
        rxdp[6].split_rd.pkt_addr =
            _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 3));
        rxdp[7].split_rd.pkt_addr =
            _mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 3));

        rxp += IDPF_DESCS_PER_LOOP_AVX;
        rxdp += IDPF_DESCS_PER_LOOP_AVX;
        cache->len -= IDPF_DESCS_PER_LOOP_AVX;
    }

    rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
    if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
        rx_bufq->rxrearm_start = 0;

    rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;

    rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
                       (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));

    /* Update the tail pointer on the NIC */
    IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
}

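/* Vectorized split queue receive: completions are read from the RX queue
 * ring while the buffers themselves come from bufq2. A descriptor is only
 * consumed when its DD bit is set and its generation bit matches the
 * queue's expected_gen_id.
 */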
static __rte_always_inline uint16_t
_idpf_splitq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq,
                                  struct rte_mbuf **rx_pkts,
                                  uint16_t nb_pkts)
{
    const uint32_t *type_table = rxq->adapter->ptype_tbl;
    const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
                                                rxq->bufq2->mbuf_initializer);
    /* only handle bufq2 here */
    struct rte_mbuf **sw_ring = &rxq->bufq2->sw_ring[rxq->rx_tail];
    volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;

    rxdp += rxq->rx_tail;

    rte_prefetch0(rxdp);

    /* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
    nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);

    /* See if we need to rearm the RX queue - gives the prefetch a bit
     * of time to act
     */
    if (rxq->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
        idpf_splitq_rearm(rxq->bufq2);

    /* Before we start moving massive data around, check to see if
     * there is actually a packet available
     */
    if (((rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id &
          VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) >>
         VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) != rxq->expected_gen_id)
        return 0;

    const __m512i dd_check = _mm512_set1_epi64(1);
    const __m512i gen_check = _mm512_set1_epi64((uint64_t)1<<46);

    /* mask to shuffle from desc. to mbuf (4 descriptors)*/
    const __m512i shuf_msk =
        _mm512_set_epi32
            (/* 1st descriptor */
             0xFFFFFFFF,    /* octet 4~7, 32bits rss */
             0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
                            /* octet 15~14, 16 bits data_len */
             0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
                            /* octet 15~14, low 16 bits pkt_len */
             0xFFFFFFFF,    /* pkt_type set as unknown */
             /* 2nd descriptor */
             0xFFFFFFFF,    /* octet 4~7, 32bits rss */
             0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
                            /* octet 15~14, 16 bits data_len */
             0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
                            /* octet 15~14, low 16 bits pkt_len */
             0xFFFFFFFF,    /* pkt_type set as unknown */
             /* 3rd descriptor */
             0xFFFFFFFF,    /* octet 4~7, 32bits rss */
             0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
                            /* octet 15~14, 16 bits data_len */
             0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
                            /* octet 15~14, low 16 bits pkt_len */
             0xFFFFFFFF,    /* pkt_type set as unknown */
             /* 4th descriptor */
             0xFFFFFFFF,    /* octet 4~7, 32bits rss */
             0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
                            /* octet 15~14, 16 bits data_len */
             0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
                            /* octet 15~14, low 16 bits pkt_len */
             0xFFFFFFFF     /* pkt_type set as unknown */
            );
    /**
     * compile-time check the above crc and shuffle layout is correct.
     * NOTE: the first field (lowest address) is given last in set_epi
     * calls above.
     */
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
                     offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
                     offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
                     offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
                     offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

    uint16_t i, received;

    for (i = 0, received = 0; i < nb_pkts;
         i += IDPF_DESCS_PER_LOOP_AVX,
         rxdp += IDPF_DESCS_PER_LOOP_AVX) {
        /* step 1, copy over 8 mbuf pointers to rx_pkts array */
        _mm256_storeu_si256((void *)&rx_pkts[i],
                            _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
        _mm256_storeu_si256
            ((void *)&rx_pkts[i + 4],
             _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

        __m512i raw_desc0_3, raw_desc4_7;
        const __m128i raw_desc7 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7));
        rte_compiler_barrier();
        const __m128i raw_desc6 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6));
        rte_compiler_barrier();
        const __m128i raw_desc5 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5));
        rte_compiler_barrier();
        const __m128i raw_desc4 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4));
        rte_compiler_barrier();
        const __m128i raw_desc3 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
        rte_compiler_barrier();
        const __m128i raw_desc2 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
        rte_compiler_barrier();
        const __m128i raw_desc1 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
        rte_compiler_barrier();
        const __m128i raw_desc0 =
            _mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0));

        raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
        raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
        raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
        raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
        raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
        raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
        raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
        raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);

        /**
         * convert descriptors 4-7 into mbufs, adjusting length and
         * re-arranging fields. Then write into the mbuf
         */
        const __m512i len_mask = _mm512_set_epi32(0xffffffff, 0xffffffff,
                                                  0xffff3fff, 0xffffffff,
                                                  0xffffffff, 0xffffffff,
                                                  0xffff3fff, 0xffffffff,
                                                  0xffffffff, 0xffffffff,
                                                  0xffff3fff, 0xffffffff,
                                                  0xffffffff, 0xffffffff,
                                                  0xffff3fff, 0xffffffff);
        const __m512i desc4_7 = _mm512_and_epi32(raw_desc4_7, len_mask);
        __m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);

        /**
         * to get packet types, shift 64-bit values down 30 bits
         * and so ptype is in lower 8-bits in each
         */
        const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16);
        const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
        const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
        const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16);
        const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0);
        const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16);
        const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0);

        const __m512i ptype4_7 = _mm512_set_epi32
            (0, 0, 0, type_table[ptype7],
             0, 0, 0, type_table[ptype6],
             0, 0, 0, type_table[ptype5],
             0, 0, 0, type_table[ptype4]);
        mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);

        /**
         * convert descriptors 0-3 into mbufs, adjusting length and
         * re-arranging fields. Then write into the mbuf
         */
        const __m512i desc0_3 = _mm512_and_epi32(raw_desc0_3, len_mask);
        __m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);

        /* get the packet types */
        const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16);
        const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
        const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
        const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16);
        const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0);
        const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16);
        const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0);

        const __m512i ptype0_3 = _mm512_set_epi32
            (0, 0, 0, type_table[ptype3],
             0, 0, 0, type_table[ptype2],
             0, 0, 0, type_table[ptype1],
             0, 0, 0, type_table[ptype0]);
        mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);

        /**
         * use permute/extract to get status and generation bit content
         * After the operations, the packets status flags are in the
         * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
         */

        const __m512i dd_permute_msk = _mm512_set_epi64
            (11, 15, 3, 7, 9, 13, 1, 5);
        const __m512i status0_7 = _mm512_permutex2var_epi64
            (raw_desc4_7, dd_permute_msk, raw_desc0_3);
        const __m512i gen_permute_msk = _mm512_set_epi64
            (10, 14, 2, 6, 8, 12, 0, 4);
        const __m512i raw_gen0_7 = _mm512_permutex2var_epi64
            (raw_desc4_7, gen_permute_msk, raw_desc0_3);

        /* now do flag manipulation */

        /**
         * At this point, we have the 8 sets of flags in the low 16-bits
         * of each 32-bit value in vlan0.
         * We want to extract these, and merge them with the mbuf init
         * data so we can do a single write to the mbuf to set the flags
         * and all the other initialization fields. Extracting the
         * appropriate flags means that we have to do a shift and blend
         * for each mbuf before we do the write. However, we can also
         * add in the previously computed rx_descriptor fields to
         * make a single 256-bit write per mbuf
         */
        /* check the structure matches expectations */
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
                         offsetof(struct rte_mbuf, rearm_data) + 8);
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
                         RTE_ALIGN(offsetof(struct rte_mbuf,
                                            rearm_data),
                                   16));
        /* build up data and do writes */
        __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
            rearm6, rearm7;
        const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
        const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
        const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
        const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);

        rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
        rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
        rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
        rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);

        /* write to mbuf */
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
                            rearm6);
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
                            rearm4);
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
                            rearm2);
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
                            rearm0);

        rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
        rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
        rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
        rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);

        /* again write to mbufs */
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
                            rearm7);
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
                            rearm5);
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
                            rearm3);
        _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
                            rearm1);

        const __mmask8 dd_mask = _mm512_cmpeq_epi64_mask(
            _mm512_and_epi64(status0_7, dd_check), dd_check);
        const __mmask8 gen_mask = _mm512_cmpeq_epi64_mask(
            _mm512_and_epi64(raw_gen0_7, gen_check),
            _mm512_set1_epi64((uint64_t)rxq->expected_gen_id << 46));
        const __mmask8 recv_mask = _kand_mask8(dd_mask, gen_mask);
        uint16_t burst = rte_popcount32(_cvtmask8_u32(recv_mask));

        received += burst;
        if (burst != IDPF_DESCS_PER_LOOP_AVX)
            break;
    }

    /* update tail pointers */
    rxq->rx_tail += received;
    rxq->expected_gen_id ^= ((rxq->rx_tail & rxq->nb_rx_desc) != 0);
    rxq->rx_tail &= (rxq->nb_rx_desc - 1);
    if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
        rxq->rx_tail--;
        received--;
    }

    rxq->bufq2->rxrearm_nb += received;
    return received;
}

/* only bufq2 can receive pkts */
uint16_t
idpf_dp_splitq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
                                uint16_t nb_pkts)
{
    return _idpf_splitq_recv_raw_pkts_avx512(rx_queue, rx_pkts,
                                             nb_pkts);
}

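/* Free up to rs_thresh completed TX buffers once the descriptor at next_dd
 * reports DESC_DONE. With IDPF_TX_OFFLOAD_MBUF_FAST_FREE (and a threshold
 * that is a multiple of 32) the mbuf pointers are copied straight into the
 * mempool cache with AVX512 stores; otherwise each segment is released
 * through rte_pktmbuf_prefree_seg().
 */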
static __rte_always_inline int
idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
{
    struct idpf_tx_vec_entry *txep;
    uint32_t n;
    uint32_t i;
    int nb_free = 0;
    struct rte_mbuf *m, *free[txq->rs_thresh];

    /* check DD bits on threshold descriptor */
    if ((txq->tx_ring[txq->next_dd].qw1 &
         rte_cpu_to_le_64(IDPF_TXD_QW1_DTYPE_M)) !=
        rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE))
        return 0;

    n = txq->rs_thresh;

    /* first buffer to free from S/W ring is at index
     * tx_next_dd - (tx_rs_thresh-1)
     */
    txep = (void *)txq->sw_ring;
    txep += txq->next_dd - (n - 1);

    if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
        struct rte_mempool *mp = txep[0].mbuf->pool;
        struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
                                                rte_lcore_id());
        void **cache_objs;

        if (cache == NULL || cache->len == 0)
            goto normal;

        cache_objs = &cache->objs[cache->len];

        if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
            rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
            goto done;
        }

        /* The cache follows the following algorithm
         *   1. Add the objects to the cache
         *   2. Anything greater than the cache min value (if it crosses the
         *   cache flush threshold) is flushed to the ring.
         */
        /* Add elements back into the cache */
        uint32_t copied = 0;
        /* n is multiple of 32 */
        while (copied < n) {
#ifdef RTE_ARCH_64
            const __m512i a = _mm512_loadu_si512(&txep[copied]);
            const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
            const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
            const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);

            _mm512_storeu_si512(&cache_objs[copied], a);
            _mm512_storeu_si512(&cache_objs[copied + 8], b);
            _mm512_storeu_si512(&cache_objs[copied + 16], c);
            _mm512_storeu_si512(&cache_objs[copied + 24], d);
#else
            const __m512i a = _mm512_loadu_si512(&txep[copied]);
            const __m512i b = _mm512_loadu_si512(&txep[copied + 16]);
            _mm512_storeu_si512(&cache_objs[copied], a);
            _mm512_storeu_si512(&cache_objs[copied + 16], b);
#endif
            copied += 32;
        }
        cache->len += n;

        if (cache->len >= cache->flushthresh) {
            rte_mempool_ops_enqueue_bulk(mp,
                                         &cache->objs[cache->size],
                                         cache->len - cache->size);
            cache->len = cache->size;
        }
        goto done;
    }

normal:
    m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
    if (likely(m != NULL)) {
        free[0] = m;
        nb_free = 1;
        for (i = 1; i < n; i++) {
            m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
            if (likely(m != NULL)) {
                if (likely(m->pool == free[0]->pool)) {
                    free[nb_free++] = m;
                } else {
                    rte_mempool_put_bulk(free[0]->pool,
                                         (void *)free,
                                         nb_free);
                    free[0] = m;
                    nb_free = 1;
                }
            }
        }
        rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
    } else {
        for (i = 1; i < n; i++) {
            m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
            if (m != NULL)
                rte_mempool_put(m->pool, m);
        }
    }

done:
    /* buffers were freed, update counters */
    txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
    txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
    if (txq->next_dd >= txq->nb_tx_desc)
        txq->next_dd = (uint16_t)(txq->rs_thresh - 1);

    return
txq->rs_thresh; 11070fac6a1cSBeilei Xing } 11080fac6a1cSBeilei Xing 11090fac6a1cSBeilei Xing static __rte_always_inline void 11100fac6a1cSBeilei Xing tx_backlog_entry_avx512(struct idpf_tx_vec_entry *txep, 11110fac6a1cSBeilei Xing struct rte_mbuf **tx_pkts, uint16_t nb_pkts) 11120fac6a1cSBeilei Xing { 11130fac6a1cSBeilei Xing int i; 11140fac6a1cSBeilei Xing 11150fac6a1cSBeilei Xing for (i = 0; i < (int)nb_pkts; ++i) 11160fac6a1cSBeilei Xing txep[i].mbuf = tx_pkts[i]; 11170fac6a1cSBeilei Xing } 11180fac6a1cSBeilei Xing 11190fac6a1cSBeilei Xing static __rte_always_inline void 1120bab8149aSSimei Su idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp, 11210fac6a1cSBeilei Xing struct rte_mbuf *pkt, uint64_t flags) 11220fac6a1cSBeilei Xing { 11230fac6a1cSBeilei Xing uint64_t high_qw = 1124bab8149aSSimei Su (IDPF_TX_DESC_DTYPE_DATA | 1125bab8149aSSimei Su ((uint64_t)flags << IDPF_TXD_QW1_CMD_S) | 1126bab8149aSSimei Su ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S)); 11270fac6a1cSBeilei Xing 11280fac6a1cSBeilei Xing __m128i descriptor = _mm_set_epi64x(high_qw, 11290fac6a1cSBeilei Xing pkt->buf_iova + pkt->data_off); 1130*43fd3624SAndre Muezerie _mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor); 11310fac6a1cSBeilei Xing } 11320fac6a1cSBeilei Xing 11330fac6a1cSBeilei Xing #define IDPF_TX_LEN_MASK 0xAA 11340fac6a1cSBeilei Xing #define IDPF_TX_OFF_MASK 0x55 11350fac6a1cSBeilei Xing static __rte_always_inline void 1136bab8149aSSimei Su idpf_singleq_vtx(volatile struct idpf_base_tx_desc *txdp, 11370fac6a1cSBeilei Xing struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags) 11380fac6a1cSBeilei Xing { 1139bab8149aSSimei Su const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_DATA | 1140bab8149aSSimei Su ((uint64_t)flags << IDPF_TXD_QW1_CMD_S)); 11410fac6a1cSBeilei Xing 11420fac6a1cSBeilei Xing /* if unaligned on 32-bit boundary, do one to align */ 11430fac6a1cSBeilei Xing if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) { 1144e528d7c7SWenjun Wu idpf_singleq_vtx1(txdp, *pkt, flags); 11450fac6a1cSBeilei Xing nb_pkts--, txdp++, pkt++; 11460fac6a1cSBeilei Xing } 11470fac6a1cSBeilei Xing 11480fac6a1cSBeilei Xing /* do 4 at a time while possible, in bursts */ 11490fac6a1cSBeilei Xing for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) { 11500fac6a1cSBeilei Xing uint64_t hi_qw3 = 11510fac6a1cSBeilei Xing hi_qw_tmpl | 11520fac6a1cSBeilei Xing ((uint64_t)pkt[3]->data_len << 1153bab8149aSSimei Su IDPF_TXD_QW1_TX_BUF_SZ_S); 11540fac6a1cSBeilei Xing uint64_t hi_qw2 = 11550fac6a1cSBeilei Xing hi_qw_tmpl | 11560fac6a1cSBeilei Xing ((uint64_t)pkt[2]->data_len << 1157bab8149aSSimei Su IDPF_TXD_QW1_TX_BUF_SZ_S); 11580fac6a1cSBeilei Xing uint64_t hi_qw1 = 11590fac6a1cSBeilei Xing hi_qw_tmpl | 11600fac6a1cSBeilei Xing ((uint64_t)pkt[1]->data_len << 1161bab8149aSSimei Su IDPF_TXD_QW1_TX_BUF_SZ_S); 11620fac6a1cSBeilei Xing uint64_t hi_qw0 = 11630fac6a1cSBeilei Xing hi_qw_tmpl | 11640fac6a1cSBeilei Xing ((uint64_t)pkt[0]->data_len << 1165bab8149aSSimei Su IDPF_TXD_QW1_TX_BUF_SZ_S); 11660fac6a1cSBeilei Xing 11670fac6a1cSBeilei Xing __m512i desc0_3 = 11680fac6a1cSBeilei Xing _mm512_set_epi64 11690fac6a1cSBeilei Xing (hi_qw3, 11700fac6a1cSBeilei Xing pkt[3]->buf_iova + pkt[3]->data_off, 11710fac6a1cSBeilei Xing hi_qw2, 11720fac6a1cSBeilei Xing pkt[2]->buf_iova + pkt[2]->data_off, 11730fac6a1cSBeilei Xing hi_qw1, 11740fac6a1cSBeilei Xing pkt[1]->buf_iova + pkt[1]->data_off, 11750fac6a1cSBeilei Xing hi_qw0, 11760fac6a1cSBeilei Xing pkt[0]->buf_iova + pkt[0]->data_off); 1177*43fd3624SAndre Muezerie 
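		/*
		 * desc0_3 now holds four 16-byte data descriptors (buffer DMA
		 * address in the low quadword, DTYPE/CMD/buffer size in the
		 * high quadword), so the single 64-byte store below fills four
		 * ring entries at once.  The unaligned-store variant is used
		 * because the fixup at the top of the function only
		 * establishes 32-byte alignment of txdp.
		 */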
_mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3); 11780fac6a1cSBeilei Xing } 11790fac6a1cSBeilei Xing 11800fac6a1cSBeilei Xing /* do any last ones */ 11810fac6a1cSBeilei Xing while (nb_pkts) { 1182e528d7c7SWenjun Wu idpf_singleq_vtx1(txdp, *pkt, flags); 11830fac6a1cSBeilei Xing txdp++, pkt++, nb_pkts--; 11840fac6a1cSBeilei Xing } 11850fac6a1cSBeilei Xing } 11860fac6a1cSBeilei Xing 11870fac6a1cSBeilei Xing static __rte_always_inline uint16_t 1188e528d7c7SWenjun Wu idpf_singleq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, 11890fac6a1cSBeilei Xing uint16_t nb_pkts) 11900fac6a1cSBeilei Xing { 11910fac6a1cSBeilei Xing struct idpf_tx_queue *txq = tx_queue; 1192bab8149aSSimei Su volatile struct idpf_base_tx_desc *txdp; 11930fac6a1cSBeilei Xing struct idpf_tx_vec_entry *txep; 11940fac6a1cSBeilei Xing uint16_t n, nb_commit, tx_id; 1195bab8149aSSimei Su uint64_t flags = IDPF_TX_DESC_CMD_EOP; 1196bab8149aSSimei Su uint64_t rs = IDPF_TX_DESC_CMD_RS | flags; 11970fac6a1cSBeilei Xing 11980fac6a1cSBeilei Xing /* cross rx_thresh boundary is not allowed */ 11990fac6a1cSBeilei Xing nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh); 12000fac6a1cSBeilei Xing 12010fac6a1cSBeilei Xing if (txq->nb_free < txq->free_thresh) 1202e528d7c7SWenjun Wu idpf_tx_singleq_free_bufs_avx512(txq); 12030fac6a1cSBeilei Xing 12040fac6a1cSBeilei Xing nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts); 12050fac6a1cSBeilei Xing nb_commit = nb_pkts; 12060fac6a1cSBeilei Xing if (unlikely(nb_pkts == 0)) 12070fac6a1cSBeilei Xing return 0; 12080fac6a1cSBeilei Xing 12090fac6a1cSBeilei Xing tx_id = txq->tx_tail; 12100fac6a1cSBeilei Xing txdp = &txq->tx_ring[tx_id]; 12110fac6a1cSBeilei Xing txep = (void *)txq->sw_ring; 12120fac6a1cSBeilei Xing txep += tx_id; 12130fac6a1cSBeilei Xing 12140fac6a1cSBeilei Xing txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts); 12150fac6a1cSBeilei Xing 12160fac6a1cSBeilei Xing n = (uint16_t)(txq->nb_tx_desc - tx_id); 12170fac6a1cSBeilei Xing if (nb_commit >= n) { 12180fac6a1cSBeilei Xing tx_backlog_entry_avx512(txep, tx_pkts, n); 12190fac6a1cSBeilei Xing 1220e528d7c7SWenjun Wu idpf_singleq_vtx(txdp, tx_pkts, n - 1, flags); 12210fac6a1cSBeilei Xing tx_pkts += (n - 1); 12220fac6a1cSBeilei Xing txdp += (n - 1); 12230fac6a1cSBeilei Xing 1224e528d7c7SWenjun Wu idpf_singleq_vtx1(txdp, *tx_pkts++, rs); 12250fac6a1cSBeilei Xing 12260fac6a1cSBeilei Xing nb_commit = (uint16_t)(nb_commit - n); 12270fac6a1cSBeilei Xing 12280fac6a1cSBeilei Xing tx_id = 0; 12290fac6a1cSBeilei Xing txq->next_rs = (uint16_t)(txq->rs_thresh - 1); 12300fac6a1cSBeilei Xing 12310fac6a1cSBeilei Xing /* avoid reach the end of ring */ 12320fac6a1cSBeilei Xing txdp = &txq->tx_ring[tx_id]; 12330fac6a1cSBeilei Xing txep = (void *)txq->sw_ring; 12340fac6a1cSBeilei Xing txep += tx_id; 12350fac6a1cSBeilei Xing } 12360fac6a1cSBeilei Xing 12370fac6a1cSBeilei Xing tx_backlog_entry_avx512(txep, tx_pkts, nb_commit); 12380fac6a1cSBeilei Xing 1239e528d7c7SWenjun Wu idpf_singleq_vtx(txdp, tx_pkts, nb_commit, flags); 12400fac6a1cSBeilei Xing 12410fac6a1cSBeilei Xing tx_id = (uint16_t)(tx_id + nb_commit); 12420fac6a1cSBeilei Xing if (tx_id > txq->next_rs) { 1243bab8149aSSimei Su txq->tx_ring[txq->next_rs].qw1 |= 1244bab8149aSSimei Su rte_cpu_to_le_64(((uint64_t)IDPF_TX_DESC_CMD_RS) << 1245bab8149aSSimei Su IDPF_TXD_QW1_CMD_S); 12460fac6a1cSBeilei Xing txq->next_rs = 12470fac6a1cSBeilei Xing (uint16_t)(txq->next_rs + txq->rs_thresh); 12480fac6a1cSBeilei Xing } 12490fac6a1cSBeilei Xing 12500fac6a1cSBeilei Xing txq->tx_tail = tx_id; 
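	/*
	 * Publish the new tail to the device.  The descriptor writes above are
	 * ordinary stores; IDPF_PCI_REG_WRITE is expected to order them ahead
	 * of the doorbell update so the hardware never sees a stale
	 * descriptor.
	 */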
12510fac6a1cSBeilei Xing 12520fac6a1cSBeilei Xing IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail); 12530fac6a1cSBeilei Xing 12540fac6a1cSBeilei Xing return nb_pkts; 12550fac6a1cSBeilei Xing } 12560fac6a1cSBeilei Xing 12570fac6a1cSBeilei Xing static __rte_always_inline uint16_t 1258e528d7c7SWenjun Wu idpf_singleq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts, 12590fac6a1cSBeilei Xing uint16_t nb_pkts) 12600fac6a1cSBeilei Xing { 12610fac6a1cSBeilei Xing uint16_t nb_tx = 0; 12620fac6a1cSBeilei Xing struct idpf_tx_queue *txq = tx_queue; 12630fac6a1cSBeilei Xing 12640fac6a1cSBeilei Xing while (nb_pkts) { 12650fac6a1cSBeilei Xing uint16_t ret, num; 12660fac6a1cSBeilei Xing 12670fac6a1cSBeilei Xing num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh); 1268e528d7c7SWenjun Wu ret = idpf_singleq_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx], 12690fac6a1cSBeilei Xing num); 12700fac6a1cSBeilei Xing nb_tx += ret; 12710fac6a1cSBeilei Xing nb_pkts -= ret; 12720fac6a1cSBeilei Xing if (ret < num) 12730fac6a1cSBeilei Xing break; 12740fac6a1cSBeilei Xing } 12750fac6a1cSBeilei Xing 12760fac6a1cSBeilei Xing return nb_tx; 12770fac6a1cSBeilei Xing } 12780fac6a1cSBeilei Xing 12790fac6a1cSBeilei Xing uint16_t 12809ebf3f6bSBeilei Xing idpf_dp_singleq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, 12810fac6a1cSBeilei Xing uint16_t nb_pkts) 12820fac6a1cSBeilei Xing { 1283e528d7c7SWenjun Wu return idpf_singleq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts); 1284e528d7c7SWenjun Wu } 1285e528d7c7SWenjun Wu 1286e528d7c7SWenjun Wu static __rte_always_inline void 1287e528d7c7SWenjun Wu idpf_splitq_scan_cq_ring(struct idpf_tx_queue *cq) 1288e528d7c7SWenjun Wu { 1289e528d7c7SWenjun Wu struct idpf_splitq_tx_compl_desc *compl_ring; 1290e528d7c7SWenjun Wu struct idpf_tx_queue *txq; 1291e528d7c7SWenjun Wu uint16_t genid, txq_qid, cq_qid, i; 1292e528d7c7SWenjun Wu uint8_t ctype; 1293e528d7c7SWenjun Wu 1294e528d7c7SWenjun Wu cq_qid = cq->tx_tail; 1295e528d7c7SWenjun Wu 1296e528d7c7SWenjun Wu for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) { 1297e528d7c7SWenjun Wu if (cq_qid == cq->nb_tx_desc) { 1298e528d7c7SWenjun Wu cq_qid = 0; 1299e528d7c7SWenjun Wu cq->expected_gen_id ^= 1; 1300e528d7c7SWenjun Wu } 1301e528d7c7SWenjun Wu compl_ring = &cq->compl_ring[cq_qid]; 1302e528d7c7SWenjun Wu genid = (compl_ring->qid_comptype_gen & 1303e528d7c7SWenjun Wu rte_cpu_to_le_64(IDPF_TXD_COMPLQ_GEN_M)) >> IDPF_TXD_COMPLQ_GEN_S; 1304e528d7c7SWenjun Wu if (genid != cq->expected_gen_id) 1305e528d7c7SWenjun Wu break; 1306e528d7c7SWenjun Wu ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) & 1307e528d7c7SWenjun Wu IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S; 1308e528d7c7SWenjun Wu txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) & 1309e528d7c7SWenjun Wu IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S; 1310e528d7c7SWenjun Wu txq = cq->txqs[txq_qid - cq->tx_start_qid]; 1311e528d7c7SWenjun Wu txq->ctype[ctype]++; 1312e528d7c7SWenjun Wu cq_qid++; 1313e528d7c7SWenjun Wu } 1314e528d7c7SWenjun Wu 1315e528d7c7SWenjun Wu cq->tx_tail = cq_qid; 1316e528d7c7SWenjun Wu } 1317e528d7c7SWenjun Wu 1318e528d7c7SWenjun Wu static __rte_always_inline int 1319e528d7c7SWenjun Wu idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq) 1320e528d7c7SWenjun Wu { 1321e528d7c7SWenjun Wu struct idpf_tx_vec_entry *txep; 1322e528d7c7SWenjun Wu uint32_t n; 1323e528d7c7SWenjun Wu uint32_t i; 1324e528d7c7SWenjun Wu int nb_free = 0; 1325e528d7c7SWenjun Wu struct rte_mbuf *m, *free[txq->rs_thresh]; 
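	/*
	 * Unlike idpf_tx_singleq_free_bufs_avx512() above, no DD bit is
	 * checked here: in the split queue model completions arrive on the
	 * completion ring, and the caller only frees buffers after
	 * idpf_splitq_scan_cq_ring() has counted more than free_thresh RS
	 * completions in txq->ctype[IDPF_TXD_COMPLT_RS] (which is decremented
	 * again at the end of this function).
	 */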
1326e528d7c7SWenjun Wu 1327e528d7c7SWenjun Wu n = txq->rs_thresh; 1328e528d7c7SWenjun Wu 1329e528d7c7SWenjun Wu /* first buffer to free from S/W ring is at index 1330e528d7c7SWenjun Wu * tx_next_dd - (tx_rs_thresh-1) 1331e528d7c7SWenjun Wu */ 1332e528d7c7SWenjun Wu txep = (void *)txq->sw_ring; 1333e528d7c7SWenjun Wu txep += txq->next_dd - (n - 1); 1334e528d7c7SWenjun Wu 1335e528d7c7SWenjun Wu if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) { 1336e528d7c7SWenjun Wu struct rte_mempool *mp = txep[0].mbuf->pool; 1337e528d7c7SWenjun Wu struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, 1338e528d7c7SWenjun Wu rte_lcore_id()); 1339e528d7c7SWenjun Wu void **cache_objs; 1340e528d7c7SWenjun Wu 1341e528d7c7SWenjun Wu if (!cache || cache->len == 0) 1342e528d7c7SWenjun Wu goto normal; 1343e528d7c7SWenjun Wu 1344e528d7c7SWenjun Wu cache_objs = &cache->objs[cache->len]; 1345e528d7c7SWenjun Wu 1346e528d7c7SWenjun Wu if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) { 1347e528d7c7SWenjun Wu rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n); 1348e528d7c7SWenjun Wu goto done; 1349e528d7c7SWenjun Wu } 1350e528d7c7SWenjun Wu 1351e528d7c7SWenjun Wu /* The cache follows the following algorithm 1352e528d7c7SWenjun Wu * 1. Add the objects to the cache 1353e528d7c7SWenjun Wu * 2. Anything greater than the cache min value (if it crosses the 1354e528d7c7SWenjun Wu * cache flush threshold) is flushed to the ring. 1355e528d7c7SWenjun Wu */ 1356e528d7c7SWenjun Wu /* Add elements back into the cache */ 1357e528d7c7SWenjun Wu uint32_t copied = 0; 1358e528d7c7SWenjun Wu /* n is multiple of 32 */ 1359e528d7c7SWenjun Wu while (copied < n) { 1360e528d7c7SWenjun Wu const __m512i a = _mm512_loadu_si512(&txep[copied]); 1361e528d7c7SWenjun Wu const __m512i b = _mm512_loadu_si512(&txep[copied + 8]); 1362e528d7c7SWenjun Wu const __m512i c = _mm512_loadu_si512(&txep[copied + 16]); 1363e528d7c7SWenjun Wu const __m512i d = _mm512_loadu_si512(&txep[copied + 24]); 1364e528d7c7SWenjun Wu 1365e528d7c7SWenjun Wu _mm512_storeu_si512(&cache_objs[copied], a); 1366e528d7c7SWenjun Wu _mm512_storeu_si512(&cache_objs[copied + 8], b); 1367e528d7c7SWenjun Wu _mm512_storeu_si512(&cache_objs[copied + 16], c); 1368e528d7c7SWenjun Wu _mm512_storeu_si512(&cache_objs[copied + 24], d); 1369e528d7c7SWenjun Wu copied += 32; 1370e528d7c7SWenjun Wu } 1371e528d7c7SWenjun Wu cache->len += n; 1372e528d7c7SWenjun Wu 1373e528d7c7SWenjun Wu if (cache->len >= cache->flushthresh) { 1374e528d7c7SWenjun Wu rte_mempool_ops_enqueue_bulk(mp, 1375e528d7c7SWenjun Wu &cache->objs[cache->size], 1376e528d7c7SWenjun Wu cache->len - cache->size); 1377e528d7c7SWenjun Wu cache->len = cache->size; 1378e528d7c7SWenjun Wu } 1379e528d7c7SWenjun Wu goto done; 1380e528d7c7SWenjun Wu } 1381e528d7c7SWenjun Wu 1382e528d7c7SWenjun Wu normal: 1383e528d7c7SWenjun Wu m = rte_pktmbuf_prefree_seg(txep[0].mbuf); 1384e528d7c7SWenjun Wu if (likely(m)) { 1385e528d7c7SWenjun Wu free[0] = m; 1386e528d7c7SWenjun Wu nb_free = 1; 1387e528d7c7SWenjun Wu for (i = 1; i < n; i++) { 1388e528d7c7SWenjun Wu m = rte_pktmbuf_prefree_seg(txep[i].mbuf); 1389e528d7c7SWenjun Wu if (likely(m)) { 1390e528d7c7SWenjun Wu if (likely(m->pool == free[0]->pool)) { 1391e528d7c7SWenjun Wu free[nb_free++] = m; 1392e528d7c7SWenjun Wu } else { 1393e528d7c7SWenjun Wu rte_mempool_put_bulk(free[0]->pool, 1394e528d7c7SWenjun Wu (void *)free, 1395e528d7c7SWenjun Wu nb_free); 1396e528d7c7SWenjun Wu free[0] = m; 1397e528d7c7SWenjun Wu nb_free = 1; 1398e528d7c7SWenjun Wu } 1399e528d7c7SWenjun Wu } 1400e528d7c7SWenjun 
Wu } 1401e528d7c7SWenjun Wu rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free); 1402e528d7c7SWenjun Wu } else { 1403e528d7c7SWenjun Wu for (i = 1; i < n; i++) { 1404e528d7c7SWenjun Wu m = rte_pktmbuf_prefree_seg(txep[i].mbuf); 1405e528d7c7SWenjun Wu if (m) 1406e528d7c7SWenjun Wu rte_mempool_put(m->pool, m); 1407e528d7c7SWenjun Wu } 1408e528d7c7SWenjun Wu } 1409e528d7c7SWenjun Wu 1410e528d7c7SWenjun Wu done: 1411e528d7c7SWenjun Wu /* buffers were freed, update counters */ 1412e528d7c7SWenjun Wu txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh); 1413e528d7c7SWenjun Wu txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh); 1414e528d7c7SWenjun Wu if (txq->next_dd >= txq->nb_tx_desc) 1415e528d7c7SWenjun Wu txq->next_dd = (uint16_t)(txq->rs_thresh - 1); 1416e528d7c7SWenjun Wu txq->ctype[IDPF_TXD_COMPLT_RS] -= txq->rs_thresh; 1417e528d7c7SWenjun Wu 1418e528d7c7SWenjun Wu return txq->rs_thresh; 1419e528d7c7SWenjun Wu } 1420e528d7c7SWenjun Wu 1421e528d7c7SWenjun Wu #define IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S 48 1422e528d7c7SWenjun Wu 1423e528d7c7SWenjun Wu static __rte_always_inline void 1424e528d7c7SWenjun Wu idpf_splitq_vtx1(volatile struct idpf_flex_tx_sched_desc *txdp, 1425e528d7c7SWenjun Wu struct rte_mbuf *pkt, uint64_t flags) 1426e528d7c7SWenjun Wu { 1427e528d7c7SWenjun Wu uint64_t high_qw = 1428e528d7c7SWenjun Wu (IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE | 1429e528d7c7SWenjun Wu ((uint64_t)flags) | 1430e528d7c7SWenjun Wu ((uint64_t)pkt->data_len << IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S)); 1431e528d7c7SWenjun Wu 1432e528d7c7SWenjun Wu __m128i descriptor = _mm_set_epi64x(high_qw, 1433e528d7c7SWenjun Wu pkt->buf_iova + pkt->data_off); 1434*43fd3624SAndre Muezerie _mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor); 1435e528d7c7SWenjun Wu } 1436e528d7c7SWenjun Wu 1437e528d7c7SWenjun Wu static __rte_always_inline void 1438e528d7c7SWenjun Wu idpf_splitq_vtx(volatile struct idpf_flex_tx_sched_desc *txdp, 1439e528d7c7SWenjun Wu struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags) 1440e528d7c7SWenjun Wu { 1441e528d7c7SWenjun Wu const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE | 1442e528d7c7SWenjun Wu ((uint64_t)flags)); 1443e528d7c7SWenjun Wu 1444e528d7c7SWenjun Wu /* if unaligned on 32-bit boundary, do one to align */ 1445e528d7c7SWenjun Wu if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) { 1446e528d7c7SWenjun Wu idpf_splitq_vtx1(txdp, *pkt, flags); 1447e528d7c7SWenjun Wu nb_pkts--, txdp++, pkt++; 1448e528d7c7SWenjun Wu } 1449e528d7c7SWenjun Wu 1450e528d7c7SWenjun Wu /* do 4 at a time while possible, in bursts */ 1451e528d7c7SWenjun Wu for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) { 1452e528d7c7SWenjun Wu uint64_t hi_qw3 = 1453e528d7c7SWenjun Wu hi_qw_tmpl | 1454e528d7c7SWenjun Wu ((uint64_t)pkt[3]->data_len << 1455e528d7c7SWenjun Wu IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S); 1456e528d7c7SWenjun Wu uint64_t hi_qw2 = 1457e528d7c7SWenjun Wu hi_qw_tmpl | 1458e528d7c7SWenjun Wu ((uint64_t)pkt[2]->data_len << 1459e528d7c7SWenjun Wu IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S); 1460e528d7c7SWenjun Wu uint64_t hi_qw1 = 1461e528d7c7SWenjun Wu hi_qw_tmpl | 1462e528d7c7SWenjun Wu ((uint64_t)pkt[1]->data_len << 1463e528d7c7SWenjun Wu IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S); 1464e528d7c7SWenjun Wu uint64_t hi_qw0 = 1465e528d7c7SWenjun Wu hi_qw_tmpl | 1466e528d7c7SWenjun Wu ((uint64_t)pkt[0]->data_len << 1467e528d7c7SWenjun Wu IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S); 1468e528d7c7SWenjun Wu 1469e528d7c7SWenjun Wu __m512i desc0_3 = 1470e528d7c7SWenjun Wu _mm512_set_epi64 1471e528d7c7SWenjun Wu (hi_qw3, 
1472e528d7c7SWenjun Wu pkt[3]->buf_iova + pkt[3]->data_off, 1473e528d7c7SWenjun Wu hi_qw2, 1474e528d7c7SWenjun Wu pkt[2]->buf_iova + pkt[2]->data_off, 1475e528d7c7SWenjun Wu hi_qw1, 1476e528d7c7SWenjun Wu pkt[1]->buf_iova + pkt[1]->data_off, 1477e528d7c7SWenjun Wu hi_qw0, 1478e528d7c7SWenjun Wu pkt[0]->buf_iova + pkt[0]->data_off); 1479*43fd3624SAndre Muezerie _mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3); 1480e528d7c7SWenjun Wu } 1481e528d7c7SWenjun Wu 1482e528d7c7SWenjun Wu /* do any last ones */ 1483e528d7c7SWenjun Wu while (nb_pkts) { 1484e528d7c7SWenjun Wu idpf_splitq_vtx1(txdp, *pkt, flags); 1485e528d7c7SWenjun Wu txdp++, pkt++, nb_pkts--; 1486e528d7c7SWenjun Wu } 1487e528d7c7SWenjun Wu } 1488e528d7c7SWenjun Wu 1489e528d7c7SWenjun Wu static __rte_always_inline uint16_t 1490e528d7c7SWenjun Wu idpf_splitq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, 1491e528d7c7SWenjun Wu uint16_t nb_pkts) 1492e528d7c7SWenjun Wu { 1493e528d7c7SWenjun Wu struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue; 1494e528d7c7SWenjun Wu volatile struct idpf_flex_tx_sched_desc *txdp; 1495e528d7c7SWenjun Wu struct idpf_tx_vec_entry *txep; 1496e528d7c7SWenjun Wu uint16_t n, nb_commit, tx_id; 1497e528d7c7SWenjun Wu /* bit2 is reserved and must be set to 1 according to Spec */ 1498e528d7c7SWenjun Wu uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP; 1499e528d7c7SWenjun Wu 1500e528d7c7SWenjun Wu tx_id = txq->tx_tail; 1501e528d7c7SWenjun Wu 1502e528d7c7SWenjun Wu /* cross rx_thresh boundary is not allowed */ 1503e528d7c7SWenjun Wu nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh); 1504e528d7c7SWenjun Wu 1505e528d7c7SWenjun Wu nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts); 1506e528d7c7SWenjun Wu if (unlikely(nb_pkts == 0)) 1507e528d7c7SWenjun Wu return 0; 1508e528d7c7SWenjun Wu 1509e528d7c7SWenjun Wu tx_id = txq->tx_tail; 1510e528d7c7SWenjun Wu txdp = &txq->desc_ring[tx_id]; 1511e528d7c7SWenjun Wu txep = (void *)txq->sw_ring; 1512e528d7c7SWenjun Wu txep += tx_id; 1513e528d7c7SWenjun Wu 1514e528d7c7SWenjun Wu txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts); 1515e528d7c7SWenjun Wu 1516e528d7c7SWenjun Wu n = (uint16_t)(txq->nb_tx_desc - tx_id); 1517e528d7c7SWenjun Wu if (nb_commit >= n) { 1518e528d7c7SWenjun Wu tx_backlog_entry_avx512(txep, tx_pkts, n); 1519e528d7c7SWenjun Wu 1520*43fd3624SAndre Muezerie idpf_splitq_vtx(txdp, tx_pkts, n - 1, cmd_dtype); 1521e528d7c7SWenjun Wu tx_pkts += (n - 1); 1522e528d7c7SWenjun Wu txdp += (n - 1); 1523e528d7c7SWenjun Wu 1524*43fd3624SAndre Muezerie idpf_splitq_vtx1(txdp, *tx_pkts++, cmd_dtype); 1525e528d7c7SWenjun Wu 1526e528d7c7SWenjun Wu nb_commit = (uint16_t)(nb_commit - n); 1527e528d7c7SWenjun Wu 1528e528d7c7SWenjun Wu tx_id = 0; 1529e528d7c7SWenjun Wu txq->next_rs = (uint16_t)(txq->rs_thresh - 1); 1530e528d7c7SWenjun Wu 1531e528d7c7SWenjun Wu /* avoid reach the end of ring */ 1532e528d7c7SWenjun Wu txdp = &txq->desc_ring[tx_id]; 1533e528d7c7SWenjun Wu txep = (void *)txq->sw_ring; 1534e528d7c7SWenjun Wu txep += tx_id; 1535e528d7c7SWenjun Wu } 1536e528d7c7SWenjun Wu 1537e528d7c7SWenjun Wu tx_backlog_entry_avx512(txep, tx_pkts, nb_commit); 1538e528d7c7SWenjun Wu 1539*43fd3624SAndre Muezerie idpf_splitq_vtx(txdp, tx_pkts, nb_commit, cmd_dtype); 1540e528d7c7SWenjun Wu 1541e528d7c7SWenjun Wu tx_id = (uint16_t)(tx_id + nb_commit); 1542e528d7c7SWenjun Wu if (tx_id > txq->next_rs) 1543e528d7c7SWenjun Wu txq->next_rs = 1544e528d7c7SWenjun Wu (uint16_t)(txq->next_rs + txq->rs_thresh); 1545e528d7c7SWenjun Wu 1546e528d7c7SWenjun Wu 
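	/*
	 * Note that, unlike the single-queue path, no RS bit is written into a
	 * descriptor here; next_rs is only advanced as a bookkeeping
	 * threshold, since completions for flow-scheduled descriptors are
	 * reported through the completion ring (see
	 * idpf_splitq_scan_cq_ring()).
	 */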
txq->tx_tail = tx_id; 1547e528d7c7SWenjun Wu 1548e528d7c7SWenjun Wu IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail); 1549e528d7c7SWenjun Wu 1550e528d7c7SWenjun Wu return nb_pkts; 1551e528d7c7SWenjun Wu } 1552e528d7c7SWenjun Wu 1553e528d7c7SWenjun Wu static __rte_always_inline uint16_t 1554e528d7c7SWenjun Wu idpf_splitq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts, 1555e528d7c7SWenjun Wu uint16_t nb_pkts) 1556e528d7c7SWenjun Wu { 1557e528d7c7SWenjun Wu struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue; 1558e528d7c7SWenjun Wu uint16_t nb_tx = 0; 1559e528d7c7SWenjun Wu 1560e528d7c7SWenjun Wu while (nb_pkts) { 1561e528d7c7SWenjun Wu uint16_t ret, num; 1562e528d7c7SWenjun Wu 1563e528d7c7SWenjun Wu idpf_splitq_scan_cq_ring(txq->complq); 1564e528d7c7SWenjun Wu 1565e528d7c7SWenjun Wu if (txq->ctype[IDPF_TXD_COMPLT_RS] > txq->free_thresh) 1566e528d7c7SWenjun Wu idpf_tx_splitq_free_bufs_avx512(txq); 1567e528d7c7SWenjun Wu 1568e528d7c7SWenjun Wu num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh); 1569e528d7c7SWenjun Wu ret = idpf_splitq_xmit_fixed_burst_vec_avx512(tx_queue, 1570e528d7c7SWenjun Wu &tx_pkts[nb_tx], 1571e528d7c7SWenjun Wu num); 1572e528d7c7SWenjun Wu nb_tx += ret; 1573e528d7c7SWenjun Wu nb_pkts -= ret; 1574e528d7c7SWenjun Wu if (ret < num) 1575e528d7c7SWenjun Wu break; 1576e528d7c7SWenjun Wu } 1577e528d7c7SWenjun Wu 1578e528d7c7SWenjun Wu return nb_tx; 1579e528d7c7SWenjun Wu } 1580e528d7c7SWenjun Wu 1581e528d7c7SWenjun Wu uint16_t 1582e528d7c7SWenjun Wu idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, 1583e528d7c7SWenjun Wu uint16_t nb_pkts) 1584e528d7c7SWenjun Wu { 1585e528d7c7SWenjun Wu return idpf_splitq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts); 15860fac6a1cSBeilei Xing } 15870fac6a1cSBeilei Xing 15880fac6a1cSBeilei Xing static inline void 1589e528d7c7SWenjun Wu idpf_tx_release_mbufs_avx512(struct idpf_tx_queue *txq) 15900fac6a1cSBeilei Xing { 15910fac6a1cSBeilei Xing unsigned int i; 15920fac6a1cSBeilei Xing const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1); 15930fac6a1cSBeilei Xing struct idpf_tx_vec_entry *swr = (void *)txq->sw_ring; 15940fac6a1cSBeilei Xing 15950fac6a1cSBeilei Xing if (txq->sw_ring == NULL || txq->nb_free == max_desc) 15960fac6a1cSBeilei Xing return; 15970fac6a1cSBeilei Xing 15980fac6a1cSBeilei Xing i = txq->next_dd - txq->rs_thresh + 1; 15990fac6a1cSBeilei Xing if (txq->tx_tail < i) { 16000fac6a1cSBeilei Xing for (; i < txq->nb_tx_desc; i++) { 16010fac6a1cSBeilei Xing rte_pktmbuf_free_seg(swr[i].mbuf); 16020fac6a1cSBeilei Xing swr[i].mbuf = NULL; 16030fac6a1cSBeilei Xing } 16040fac6a1cSBeilei Xing i = 0; 16050fac6a1cSBeilei Xing } 1606b28f22e8SWenjun Wu for (; i < txq->tx_tail; i++) { 1607b28f22e8SWenjun Wu rte_pktmbuf_free_seg(swr[i].mbuf); 1608b28f22e8SWenjun Wu swr[i].mbuf = NULL; 1609b28f22e8SWenjun Wu } 16100fac6a1cSBeilei Xing } 16110fac6a1cSBeilei Xing 1612e528d7c7SWenjun Wu static const struct idpf_txq_ops avx512_tx_vec_ops = { 1613e528d7c7SWenjun Wu .release_mbufs = idpf_tx_release_mbufs_avx512, 16140fac6a1cSBeilei Xing }; 16150fac6a1cSBeilei Xing 16160fac6a1cSBeilei Xing int __rte_cold 1617e528d7c7SWenjun Wu idpf_qc_tx_vec_avx512_setup(struct idpf_tx_queue *txq) 16180fac6a1cSBeilei Xing { 1619e528d7c7SWenjun Wu if (!txq) 1620e528d7c7SWenjun Wu return 0; 1621e528d7c7SWenjun Wu 1622e528d7c7SWenjun Wu txq->ops = &avx512_tx_vec_ops; 16230fac6a1cSBeilei Xing return 0; 16240fac6a1cSBeilei Xing } 1625
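/*
 * Usage sketch (illustrative only, not part of the upstream file): a driver
 * that has already verified AVX-512 support would typically install these
 * entry points as its burst callbacks and run the per-queue setup hook, e.g.:
 *
 *	idpf_qc_tx_vec_avx512_setup(txq);
 *	eth_dev->rx_pkt_burst = idpf_dp_splitq_recv_pkts_avx512;
 *	eth_dev->tx_pkt_burst = idpf_dp_splitq_xmit_pkts_avx512;
 *
 * The actual selection logic lives in the idpf/cpfl ethdev drivers, not in
 * this common data path file.
 */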