/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2023 Intel Corporation
 */

#include <rte_vect.h>
#include "idpf_common_device.h"
#include "idpf_common_rxtx.h"

#define IDPF_DESCS_PER_LOOP_AVX 8
#define PKTLEN_SHIFT 10

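/* Slow-path descriptor rearm for the single queue model: bulk-allocate
 * IDPF_RXQ_REARM_THRESH mbufs from the mempool and write their DMA addresses
 * into the descriptor ring eight at a time with 512-bit stores. Used by
 * idpf_singleq_rearm() when no per-lcore mempool cache is available.
 */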
static __rte_always_inline void
idpf_singleq_rearm_common(struct idpf_rx_queue *rxq)
{
	struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
	uint16_t rx_id;
	int i;

	rxdp += rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rxq->mp,
				 (void *)rxp,
				 IDPF_RXQ_REARM_THRESH) < 0) {
		if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
		    rxq->nb_rx_desc) {
			__m128i dma_addr0;

			dma_addr0 = _mm_setzero_si128();
			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
				rxp[i] = &rxq->fake_mbuf;
				_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i].read),
						dma_addr0);
			}
		}
		rte_atomic_fetch_add_explicit(&rxq->rx_stats.mbuf_alloc_failed,
				   IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
		return;
	}
	struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
	struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
	__m512i dma_addr0_3, dma_addr4_7;
	__m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
	for (i = 0; i < IDPF_RXQ_REARM_THRESH;
			i += 8, rxp += 8, rxdp += 8) {
		__m128i vaddr0, vaddr1, vaddr2, vaddr3;
		__m128i vaddr4, vaddr5, vaddr6, vaddr7;
		__m256i vaddr0_1, vaddr2_3;
		__m256i vaddr4_5, vaddr6_7;
		__m512i vaddr0_3, vaddr4_7;

		mb0 = rxp[0];
		mb1 = rxp[1];
		mb2 = rxp[2];
		mb3 = rxp[3];
		mb4 = rxp[4];
		mb5 = rxp[5];
		mb6 = rxp[6];
		mb7 = rxp[7];

		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
				offsetof(struct rte_mbuf, buf_addr) + 8);
		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
		vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
		vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
		vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
		vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
		vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
		vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);

		/**
		 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
		 * into the high lanes. Similarly for 2 & 3, and so on.
		 */
		vaddr0_1 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
						vaddr1, 1);
		vaddr2_3 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
						vaddr3, 1);
		vaddr4_5 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
						vaddr5, 1);
		vaddr6_7 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
						vaddr7, 1);
		vaddr0_3 =
			_mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
						vaddr2_3, 1);
		vaddr4_7 =
			_mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
						vaddr6_7, 1);

		/* convert pa to dma_addr hdr/data */
		dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
		dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);

		/* add headroom to pa values */
		dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
		dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);

		/* flush desc with pa dma_addr */
		_mm512_store_si512(RTE_CAST_PTR(__m512i *, &rxdp->read), dma_addr0_3);
		_mm512_store_si512(RTE_CAST_PTR(__m512i *, &(rxdp + 4)->read), dma_addr4_7);
	}

	rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
}

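/* Fast-path descriptor rearm for the single queue model: pull mbufs straight
 * from the per-lcore mempool cache, gather their buffer IOVAs with an AVX-512
 * gather, and populate both the software ring and the HW descriptor ring
 * eight mbufs per iteration.
 */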
static __rte_always_inline void
idpf_singleq_rearm(struct idpf_rx_queue *rxq)
{
	int i;
	uint16_t rx_id;
	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
	struct rte_mempool_cache *cache =
		rte_mempool_default_cache(rxq->mp, rte_lcore_id());
	struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];

	rxdp += rxq->rxrearm_start;

	if (unlikely(cache == NULL))
		return idpf_singleq_rearm_common(rxq);

	/* We need to pull 'n' more MBUFs into the software ring from mempool
	 * We inline the mempool function here, so we can vectorize the copy
	 * from the cache into the shadow ring.
	 */

	/* Can this be satisfied from the cache? */
	if (cache->len < IDPF_RXQ_REARM_THRESH) {
		/* No. Backfill the cache first, and then fill from it */
		uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
							cache->len);

		/* How many do we require i.e. number to fill the cache + the request */
		int ret = rte_mempool_ops_dequeue_bulk
				(rxq->mp, &cache->objs[cache->len], req);
		if (ret == 0) {
			cache->len += req;
		} else {
			if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
			    rxq->nb_rx_desc) {
				__m128i dma_addr0;

				dma_addr0 = _mm_setzero_si128();
				for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
					rxp[i] = &rxq->fake_mbuf;
					_mm_storeu_si128(RTE_CAST_PTR
							(__m128i *, &rxdp[i].read), dma_addr0);
				}
			}
			rte_atomic_fetch_add_explicit(&rxq->rx_stats.mbuf_alloc_failed,
					   IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
			return;
		}
	}

	const __m512i iova_offsets =  _mm512_set1_epi64(offsetof
							(struct rte_mbuf, buf_iova));
	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);

	/* to shuffle the addresses to correct slots. Values 4-7 will contain
	 * zeros, so use 7 for a zero-value.
	 */
	const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);

	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
	 * from mempool cache and populating both shadow and HW rings
	 */
	for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) {
		const __m512i mbuf_ptrs = _mm512_loadu_si512
			(&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]);
		_mm512_storeu_si512(rxp, mbuf_ptrs);

		const __m512i iova_base_addrs = _mm512_i64gather_epi64
				(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
				 0, /* base */
				 1  /* scale */);
		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
				headroom);
		const __m512i iovas0 = _mm512_castsi256_si512
				(_mm512_extracti64x4_epi64(iova_addrs, 0));
		const __m512i iovas1 = _mm512_castsi256_si512
				(_mm512_extracti64x4_epi64(iova_addrs, 1));

		/* permute leaves desc 2-3 addresses in header address slots 0-1
		 * but these are ignored by driver since header split not
		 * enabled. Similarly for desc 6 & 7.
		 */
		const __m512i desc0_1 = _mm512_permutexvar_epi64
				(permute_idx,
				 iovas0);
		const __m512i desc2_3 = _mm512_bsrli_epi128(desc0_1, 8);

		const __m512i desc4_5 = _mm512_permutexvar_epi64
				(permute_idx,
				 iovas1);
		const __m512i desc6_7 = _mm512_bsrli_epi128(desc4_5, 8);

		_mm512_storeu_si512(RTE_CAST_PTR(void *, rxdp), desc0_1);
		_mm512_storeu_si512(RTE_CAST_PTR(void *, (rxdp + 2)), desc2_3);
		_mm512_storeu_si512(RTE_CAST_PTR(void *, (rxdp + 4)), desc4_5);
		_mm512_storeu_si512(RTE_CAST_PTR(void *, (rxdp + 6)), desc6_7);

		rxp += IDPF_DESCS_PER_LOOP_AVX;
		rxdp += IDPF_DESCS_PER_LOOP_AVX;
		cache->len -= IDPF_DESCS_PER_LOOP_AVX;
	}

	rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
}

#define IDPF_RX_LEN_MASK 0x80808080
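/* Vectorized single queue RX: process descriptors in groups of
 * IDPF_DESCS_PER_LOOP_AVX (8), shuffling each write-back descriptor into the
 * mbuf rx_descriptor_fields1/rearm_data layout and stopping at the first
 * group whose DD bits are not all set.
 */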
static __rte_always_inline uint16_t
_idpf_singleq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq,
				   struct rte_mbuf **rx_pkts,
				   uint16_t nb_pkts)
{
	const uint32_t *type_table = rxq->adapter->ptype_tbl;

	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
						    rxq->mbuf_initializer);
	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;

	rxdp += rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
		idpf_singleq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if ((rxdp->flex_nic_wb.status_error0 &
	      rte_cpu_to_le_32(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_DD_S)) == 0)
		return 0;

	/* 8 packets DD mask, LSB in each 32-bit value */
	const __m256i dd_check = _mm256_set1_epi32(1);

	/* mask to shuffle from desc. to mbuf (4 descriptors)*/
	const __m512i shuf_msk =
		_mm512_set_epi32
			(/* 1st descriptor */
			 0xFFFFFFFF,    /* rss set as unknown */
			 0xFFFF0504,    /* vlan_macip set as unknown */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF,    /* pkt_type set as unknown */
			 /* 2nd descriptor */
			 0xFFFFFFFF,    /* rss set as unknown */
			 0xFFFF0504,    /* vlan_macip set as unknown */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF,    /* pkt_type set as unknown */
			 /* 3rd descriptor */
			 0xFFFFFFFF,    /* rss set as unknown */
			 0xFFFF0504,    /* vlan_macip set as unknown */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF,    /* pkt_type set as unknown */
			 /* 4th descriptor */
			 0xFFFFFFFF,    /* rss set as unknown */
			 0xFFFF0504,    /* vlan_macip set as unknown */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF     /* pkt_type set as unknown */
			);
	/**
	 * compile-time check the shuffle layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi
	 * calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	uint16_t i, received;

	for (i = 0, received = 0; i < nb_pkts;
	     i += IDPF_DESCS_PER_LOOP_AVX,
	     rxdp += IDPF_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				    _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		_mm256_storeu_si256
			((void *)&rx_pkts[i + 4],
			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

		__m512i raw_desc0_3, raw_desc4_7;
		const __m128i raw_desc7 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7));
		rte_compiler_barrier();
		const __m128i raw_desc6 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6));
		rte_compiler_barrier();
		const __m128i raw_desc5 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5));
		rte_compiler_barrier();
		const __m128i raw_desc4 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4));
		rte_compiler_barrier();
		const __m128i raw_desc3 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
		rte_compiler_barrier();
		const __m128i raw_desc2 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
		rte_compiler_barrier();
		const __m128i raw_desc1 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
		rte_compiler_barrier();
		const __m128i raw_desc0 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0));

		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);

		/**
		 * convert descriptors 4-7 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7,
							 PKTLEN_SHIFT);
		const __m512i desc4_7 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK,
								raw_desc4_7,
								len4_7);
		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);

		/**
		 * to get packet types, shift 64-bit values down 16 bits
		 * and so ptype is in lower 8-bits in each
		 */
		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16);
		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16);
		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0);
		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16);
		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0);

		const __m512i ptype4_7 = _mm512_set_epi32
			(0, 0, 0, type_table[ptype7],
			 0, 0, 0, type_table[ptype6],
			 0, 0, 0, type_table[ptype5],
			 0, 0, 0, type_table[ptype4]);
		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);

		/**
		 * convert descriptors 0-3 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3,
							 PKTLEN_SHIFT);
		const __m512i desc0_3 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK,
								raw_desc0_3,
								len0_3);
		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);

		/* get the packet types */
		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16);
		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16);
		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0);
		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16);
		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0);

		const __m512i ptype0_3 = _mm512_set_epi32
			(0, 0, 0, type_table[ptype3],
			 0, 0, 0, type_table[ptype2],
			 0, 0, 0, type_table[ptype1],
			 0, 0, 0, type_table[ptype0]);
		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);

		/**
		 * use permute/extract to get status content
		 * After the operations, the packets status flags are in the
		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
		 */
		/* merge the status bits into one register */
		const __m512i status_permute_msk = _mm512_set_epi32
			(0, 0, 0, 0,
			 0, 0, 0, 0,
			 22, 30, 6, 14,
			 18, 26, 2, 10);
		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
			(raw_desc4_7, status_permute_msk, raw_desc0_3);
		__m256i status0_7 = _mm512_extracti64x4_epi64
			(raw_status0_7, 0);

		/* now do flag manipulation */

		/**
		 * At this point, we have the 8 sets of flags in the low 16-bits
		 * of each 32-bit value.
		 * We want to extract these, and merge them with the mbuf init
		 * data so we can do a single write to the mbuf to set the flags
		 * and all the other initialization fields. Extracting the
		 * appropriate flags means that we have to do a shift and blend
		 * for each mbuf before we do the write. However, we can also
		 * add in the previously computed rx_descriptor fields to
		 * make a single 256-bit write per mbuf
		 */
		/* check the structure matches expectations */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
				 offsetof(struct rte_mbuf, rearm_data) + 8);
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
				 RTE_ALIGN(offsetof(struct rte_mbuf,
						    rearm_data),
						    16));
		/* build up data and do writes */
		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
			rearm6, rearm7;
		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);

		rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
		rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
		rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
		rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);

		/* write to mbuf */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
				    rearm6);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
				    rearm4);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
				    rearm2);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
				    rearm0);

		rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
		rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
		rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
		rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);

		/* again write to mbufs */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
				    rearm7);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
				    rearm5);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
				    rearm3);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
				    rearm1);

		/* perform dd_check */
		status0_7 = _mm256_and_si256(status0_7, dd_check);
		status0_7 = _mm256_packs_epi32(status0_7,
					       _mm256_setzero_si256());

		uint64_t burst = rte_popcount64
					(_mm_cvtsi128_si64
						(_mm256_extracti128_si256
							(status0_7, 1)));
		burst += rte_popcount64
				(_mm_cvtsi128_si64
					(_mm256_castsi256_si128(status0_7)));
		received += burst;
		if (burst != IDPF_DESCS_PER_LOOP_AVX)
			break;
	}

	/* update tail pointers */
	rxq->rx_tail += received;
	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
		rxq->rx_tail--;
		received--;
	}
	rxq->rxrearm_nb += received;
	return received;
}

/**
 * Notice:
 * - if nb_pkts < IDPF_DESCS_PER_LOOP_AVX, no packets are returned
 */
uint16_t
idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
				 uint16_t nb_pkts)
{
	return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
}

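/* Slow-path rearm for a split queue model buffer queue: bulk-allocate mbufs
 * from the mempool and write the packet buffer addresses into the buffer
 * descriptors with scalar stores, eight per iteration. Used by
 * idpf_splitq_rearm() when no per-lcore mempool cache is available.
 */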
static __rte_always_inline void
idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
{
	struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
	volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
	uint16_t rx_id;
	int i;

	rxdp += rx_bufq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rx_bufq->mp,
				 (void *)rxp,
				 IDPF_RXQ_REARM_THRESH) < 0) {
		if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
		    rx_bufq->nb_rx_desc) {
			__m128i dma_addr0;

			dma_addr0 = _mm_setzero_si128();
			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
				rxp[i] = &rx_bufq->fake_mbuf;
				_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
						dma_addr0);
			}
		}
		rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
				   IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
		return;
	}

	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
	for (i = 0; i < IDPF_RXQ_REARM_THRESH;
			i += 8, rxp += 8, rxdp += 8) {
		rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
		rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
	}

	rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
	if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
		rx_bufq->rxrearm_start = 0;

	rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
			     (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
}

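/* Fast-path rearm for a split queue model buffer queue: pull mbufs straight
 * from the per-lcore mempool cache, compute buffer IOVAs with an AVX-512
 * gather, and extract them lane by lane into the buffer descriptors.
 */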
static __rte_always_inline void
idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
{
	int i;
	uint16_t rx_id;
	volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
	struct rte_mempool_cache *cache =
		rte_mempool_default_cache(rx_bufq->mp, rte_lcore_id());
	struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];

	rxdp += rx_bufq->rxrearm_start;

	if (unlikely(!cache))
		return idpf_splitq_rearm_common(rx_bufq);

	/* We need to pull 'n' more MBUFs into the software ring from mempool
	 * We inline the mempool function here, so we can vectorize the copy
	 * from the cache into the shadow ring.
	 */

	/* Can this be satisfied from the cache? */
	if (cache->len < IDPF_RXQ_REARM_THRESH) {
		/* No. Backfill the cache first, and then fill from it */
		uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
							cache->len);

		/* How many do we require i.e. number to fill the cache + the request */
		int ret = rte_mempool_ops_dequeue_bulk
				(rx_bufq->mp, &cache->objs[cache->len], req);
		if (ret == 0) {
			cache->len += req;
		} else {
			if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
			    rx_bufq->nb_rx_desc) {
				__m128i dma_addr0;

				dma_addr0 = _mm_setzero_si128();
				for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
					rxp[i] = &rx_bufq->fake_mbuf;
					_mm_storeu_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
							 dma_addr0);
				}
			}
			rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
					   IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
			return;
		}
	}

	const __m512i iova_offsets =  _mm512_set1_epi64(offsetof
							(struct rte_mbuf, buf_iova));
	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);

	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
	 * from mempool cache and populating both shadow and HW rings
	 */
	for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) {
		const __m512i mbuf_ptrs = _mm512_loadu_si512
			(&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]);
		_mm512_storeu_si512(rxp, mbuf_ptrs);

		const __m512i iova_base_addrs = _mm512_i64gather_epi64
				(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
				 0, /* base */
				 1  /* scale */);
		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
				headroom);

		const __m512i iova_addrs_1 = _mm512_bsrli_epi128(iova_addrs, 8);

		rxdp[0].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 0));
		rxdp[1].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 0));
		rxdp[2].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 1));
		rxdp[3].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 1));
		rxdp[4].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 2));
		rxdp[5].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 2));
		rxdp[6].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 3));
		rxdp[7].split_rd.pkt_addr =
			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 3));

		rxp += IDPF_DESCS_PER_LOOP_AVX;
		rxdp += IDPF_DESCS_PER_LOOP_AVX;
		cache->len -= IDPF_DESCS_PER_LOOP_AVX;
	}

	rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
	if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
		rx_bufq->rxrearm_start = 0;

	rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
			   (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
}

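/* Vectorized RX for the split queue model: mbufs are taken from bufq2's
 * software ring, descriptors are processed in groups of
 * IDPF_DESCS_PER_LOOP_AVX (8), and only descriptors whose DD bit is set and
 * whose generation bit matches rxq->expected_gen_id are counted.
 */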
static __rte_always_inline uint16_t
_idpf_splitq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq,
				  struct rte_mbuf **rx_pkts,
				  uint16_t nb_pkts)
{
	const uint32_t *type_table = rxq->adapter->ptype_tbl;
	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
						    rxq->bufq2->mbuf_initializer);
	/* only handle bufq2 here */
	struct rte_mbuf **sw_ring = &rxq->bufq2->sw_ring[rxq->rx_tail];
	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;

	rxdp += rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
		idpf_splitq_rearm(rxq->bufq2);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (((rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id &
	      VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) >>
	     VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) != rxq->expected_gen_id)
		return 0;

	const __m512i dd_check = _mm512_set1_epi64(1);
	const __m512i gen_check = _mm512_set1_epi64((uint64_t)1<<46);

	/* mask to shuffle from desc. to mbuf (4 descriptors)*/
	const __m512i shuf_msk =
		_mm512_set_epi32
			(/* 1st descriptor */
			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF,    /* pkt_type set as unknown */
			 /* 2nd descriptor */
			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF,    /* pkt_type set as unknown */
			 /* 3rd descriptor */
			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF,    /* pkt_type set as unknown */
			 /* 4th descriptor */
			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
					/* octet 15~14, 16 bits data_len */
			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
					/* octet 15~14, low 16 bits pkt_len */
			 0xFFFFFFFF     /* pkt_type set as unknown */
			);
	/**
	 * compile-time check the above crc and shuffle layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi
	 * calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	uint16_t i, received;

	for (i = 0, received = 0; i < nb_pkts;
	     i += IDPF_DESCS_PER_LOOP_AVX,
	     rxdp += IDPF_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				    _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		_mm256_storeu_si256
			((void *)&rx_pkts[i + 4],
			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

		__m512i raw_desc0_3, raw_desc4_7;
		const __m128i raw_desc7 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7));
		rte_compiler_barrier();
		const __m128i raw_desc6 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6));
		rte_compiler_barrier();
		const __m128i raw_desc5 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5));
		rte_compiler_barrier();
		const __m128i raw_desc4 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4));
		rte_compiler_barrier();
		const __m128i raw_desc3 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
		rte_compiler_barrier();
		const __m128i raw_desc2 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
		rte_compiler_barrier();
		const __m128i raw_desc1 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
		rte_compiler_barrier();
		const __m128i raw_desc0 =
			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0));

		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);

		/**
		 * convert descriptors 4-7 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m512i len_mask = _mm512_set_epi32(0xffffffff, 0xffffffff,
							  0xffff3fff, 0xffffffff,
							  0xffffffff, 0xffffffff,
							  0xffff3fff, 0xffffffff,
							  0xffffffff, 0xffffffff,
							  0xffff3fff, 0xffffffff,
							  0xffffffff, 0xffffffff,
							  0xffff3fff, 0xffffffff);
		const __m512i desc4_7 = _mm512_and_epi32(raw_desc4_7, len_mask);
		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);

		/**
		 * to get packet types, shift 64-bit values down 16 bits
		 * and so ptype is in lower 8-bits in each
		 */
		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16);
		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16);
		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0);
		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16);
		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0);

		const __m512i ptype4_7 = _mm512_set_epi32
			(0, 0, 0, type_table[ptype7],
			 0, 0, 0, type_table[ptype6],
			 0, 0, 0, type_table[ptype5],
			 0, 0, 0, type_table[ptype4]);
		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);

		/**
		 * convert descriptors 0-3 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m512i desc0_3 = _mm512_and_epi32(raw_desc0_3, len_mask);
		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);

		/* get the packet types */
		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16);
		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16);
		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0);
		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16);
		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0);

		const __m512i ptype0_3 = _mm512_set_epi32
			(0, 0, 0, type_table[ptype3],
			 0, 0, 0, type_table[ptype2],
			 0, 0, 0, type_table[ptype1],
			 0, 0, 0, type_table[ptype0]);
		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);

		/**
		 * use permute/extract to get status and generation bit content
		 * After the operations, the packets status flags are in the
		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
		 */

		const __m512i dd_permute_msk = _mm512_set_epi64
			(11, 15, 3, 7, 9, 13, 1, 5);
		const __m512i status0_7 = _mm512_permutex2var_epi64
			(raw_desc4_7, dd_permute_msk, raw_desc0_3);
		const __m512i gen_permute_msk = _mm512_set_epi64
			(10, 14, 2, 6, 8, 12, 0, 4);
		const __m512i raw_gen0_7 = _mm512_permutex2var_epi64
			(raw_desc4_7, gen_permute_msk, raw_desc0_3);

		/* now do flag manipulation */

		/**
		 * At this point, we have the 8 sets of flags in the low 16-bits
		 * of each 32-bit value.
906e528d7c7SWenjun Wu 		 * We want to extract these, and merge them with the mbuf init
907e528d7c7SWenjun Wu 		 * data so we can do a single write to the mbuf to set the flags
908e528d7c7SWenjun Wu 		 * and all the other initialization fields. Extracting the
909e528d7c7SWenjun Wu 		 * appropriate flags means that we have to do a shift and blend
910e528d7c7SWenjun Wu 		 * for each mbuf before we do the write. However, we can also
911e528d7c7SWenjun Wu 		 * add in the previously computed rx_descriptor fields to
912e528d7c7SWenjun Wu 		 * make a single 256-bit write per mbuf
913e528d7c7SWenjun Wu 		 */
914e528d7c7SWenjun Wu 		/* check the structure matches expectations */
915e528d7c7SWenjun Wu 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
916e528d7c7SWenjun Wu 				 offsetof(struct rte_mbuf, rearm_data) + 8);
917e528d7c7SWenjun Wu 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
918e528d7c7SWenjun Wu 				 RTE_ALIGN(offsetof(struct rte_mbuf,
919e528d7c7SWenjun Wu 						    rearm_data),
920e528d7c7SWenjun Wu 						    16));
921e528d7c7SWenjun Wu 				/* build up data and do writes */
922e528d7c7SWenjun Wu 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
923e528d7c7SWenjun Wu 			rearm6, rearm7;
924e528d7c7SWenjun Wu 		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
925e528d7c7SWenjun Wu 		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
926e528d7c7SWenjun Wu 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
927e528d7c7SWenjun Wu 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
928e528d7c7SWenjun Wu 
929e528d7c7SWenjun Wu 		rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
930e528d7c7SWenjun Wu 		rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
931e528d7c7SWenjun Wu 		rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
932e528d7c7SWenjun Wu 		rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
933e528d7c7SWenjun Wu 
934e528d7c7SWenjun Wu 		/* write to mbuf */
935e528d7c7SWenjun Wu 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
936e528d7c7SWenjun Wu 				    rearm6);
937e528d7c7SWenjun Wu 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
938e528d7c7SWenjun Wu 				    rearm4);
939e528d7c7SWenjun Wu 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
940e528d7c7SWenjun Wu 				    rearm2);
941e528d7c7SWenjun Wu 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
942e528d7c7SWenjun Wu 				    rearm0);
943e528d7c7SWenjun Wu 
944e528d7c7SWenjun Wu 		rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
945e528d7c7SWenjun Wu 		rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
946e528d7c7SWenjun Wu 		rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
947e528d7c7SWenjun Wu 		rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
948e528d7c7SWenjun Wu 
949e528d7c7SWenjun Wu 		/* again write to mbufs */
950e528d7c7SWenjun Wu 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
951e528d7c7SWenjun Wu 				    rearm7);
952e528d7c7SWenjun Wu 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
953e528d7c7SWenjun Wu 				    rearm5);
954e528d7c7SWenjun Wu 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
955e528d7c7SWenjun Wu 				    rearm3);
956e528d7c7SWenjun Wu 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
957e528d7c7SWenjun Wu 				    rearm1);
958e528d7c7SWenjun Wu 
959e528d7c7SWenjun Wu 		const __mmask8 dd_mask = _mm512_cmpeq_epi64_mask(
960e528d7c7SWenjun Wu 			_mm512_and_epi64(status0_7, dd_check), dd_check);
961e528d7c7SWenjun Wu 		const __mmask8 gen_mask = _mm512_cmpeq_epi64_mask(
962e528d7c7SWenjun Wu 			_mm512_and_epi64(raw_gen0_7, gen_check),
963e528d7c7SWenjun Wu 			_mm512_set1_epi64((uint64_t)rxq->expected_gen_id << 46));
964e528d7c7SWenjun Wu 		const __mmask8 recv_mask = _kand_mask8(dd_mask, gen_mask);
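		/*
		 * A descriptor only counts as received when its DD bit is set
		 * AND its GEN bit matches the queue's expected generation,
		 * which filters out stale descriptors left over from the
		 * previous pass over the ring.  The popcount of the combined
		 * mask is therefore the number of packets completed in this
		 * batch of 8.
		 */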
9653d4e27fdSDavid Marchand 		uint16_t burst = rte_popcount32(_cvtmask8_u32(recv_mask));
966e528d7c7SWenjun Wu 
967e528d7c7SWenjun Wu 		received += burst;
968e528d7c7SWenjun Wu 		if (burst != IDPF_DESCS_PER_LOOP_AVX)
969e528d7c7SWenjun Wu 			break;
970e528d7c7SWenjun Wu 	}
971e528d7c7SWenjun Wu 
972e528d7c7SWenjun Wu 	/* update tail pointers */
973e528d7c7SWenjun Wu 	rxq->rx_tail += received;
974e528d7c7SWenjun Wu 	rxq->expected_gen_id ^= ((rxq->rx_tail & rxq->nb_rx_desc) != 0);
975e528d7c7SWenjun Wu 	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
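	/*
	 * Because nb_rx_desc is a power of two, (rx_tail & nb_rx_desc) above
	 * is non-zero exactly when rx_tail has stepped past the end of the
	 * ring; e.g. with nb_rx_desc = 512, a tail of 516 flips
	 * expected_gen_id and is then masked back down to 4.
	 */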
976e528d7c7SWenjun Wu 	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep rx_tail even */
977e528d7c7SWenjun Wu 		rxq->rx_tail--;
978e528d7c7SWenjun Wu 		received--;
979e528d7c7SWenjun Wu 	}
980e528d7c7SWenjun Wu 
981e528d7c7SWenjun Wu 	rxq->bufq2->rxrearm_nb += received;
982e528d7c7SWenjun Wu 	return received;
983e528d7c7SWenjun Wu }
984e528d7c7SWenjun Wu 
985e528d7c7SWenjun Wu /* only bufq2 can receive pkts */
986e528d7c7SWenjun Wu uint16_t
987e528d7c7SWenjun Wu idpf_dp_splitq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
988e528d7c7SWenjun Wu 			     uint16_t nb_pkts)
989e528d7c7SWenjun Wu {
990e528d7c7SWenjun Wu 	return _idpf_splitq_recv_raw_pkts_avx512(rx_queue, rx_pkts,
991e528d7c7SWenjun Wu 						 nb_pkts);
992e528d7c7SWenjun Wu }
993e528d7c7SWenjun Wu 
9940fac6a1cSBeilei Xing static __rte_always_inline int
995e528d7c7SWenjun Wu idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
9960fac6a1cSBeilei Xing {
9970fac6a1cSBeilei Xing 	struct idpf_tx_vec_entry *txep;
9980fac6a1cSBeilei Xing 	uint32_t n;
9990fac6a1cSBeilei Xing 	uint32_t i;
10000fac6a1cSBeilei Xing 	int nb_free = 0;
10010fac6a1cSBeilei Xing 	struct rte_mbuf *m, *free[txq->rs_thresh];
10020fac6a1cSBeilei Xing 
10030fac6a1cSBeilei Xing 	/* check DD bits on threshold descriptor */
1004bab8149aSSimei Su 	if ((txq->tx_ring[txq->next_dd].qw1 &
10050fac6a1cSBeilei Xing 			rte_cpu_to_le_64(IDPF_TXD_QW1_DTYPE_M)) !=
10060fac6a1cSBeilei Xing 			rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE))
10070fac6a1cSBeilei Xing 		return 0;
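	/*
	 * The RS bit is set once every rs_thresh descriptors on the transmit
	 * side, so a DESC_DONE dtype at next_dd implies the whole batch of
	 * rs_thresh descriptors ending there has completed and its mbufs can
	 * be recycled.
	 */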
10080fac6a1cSBeilei Xing 
10090fac6a1cSBeilei Xing 	n = txq->rs_thresh;
10100fac6a1cSBeilei Xing 
10110fac6a1cSBeilei Xing 	 /* first buffer to free from the S/W ring is at index
10120fac6a1cSBeilei Xing 	  * next_dd - (rs_thresh - 1)
10130fac6a1cSBeilei Xing 	  */
10140fac6a1cSBeilei Xing 	txep = (void *)txq->sw_ring;
10150fac6a1cSBeilei Xing 	txep += txq->next_dd - (n - 1);
10160fac6a1cSBeilei Xing 
10170fac6a1cSBeilei Xing 	if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
10180fac6a1cSBeilei Xing 		struct rte_mempool *mp = txep[0].mbuf->pool;
10190fac6a1cSBeilei Xing 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
10200fac6a1cSBeilei Xing 								rte_lcore_id());
10210fac6a1cSBeilei Xing 		void **cache_objs;
10220fac6a1cSBeilei Xing 
10230fac6a1cSBeilei Xing 		if (cache == NULL || cache->len == 0)
10240fac6a1cSBeilei Xing 			goto normal;
10250fac6a1cSBeilei Xing 
10260fac6a1cSBeilei Xing 		cache_objs = &cache->objs[cache->len];
10270fac6a1cSBeilei Xing 
10280fac6a1cSBeilei Xing 		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
10290fac6a1cSBeilei Xing 			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
10300fac6a1cSBeilei Xing 			goto done;
10310fac6a1cSBeilei Xing 		}
10320fac6a1cSBeilei Xing 
10330fac6a1cSBeilei Xing 		/* The cache is refilled as follows:
10340fac6a1cSBeilei Xing 		 *   1. Add the freed objects to the cache.
10350fac6a1cSBeilei Xing 		 *   2. If the cache length then exceeds the flush threshold,
10360fac6a1cSBeilei Xing 		 *      everything above the nominal cache size is flushed back to the ring.
10370fac6a1cSBeilei Xing 		 */
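		/*
		 * For example, assuming the default flush threshold of
		 * 1.5 * cache->size: with size = 256, len = 370 and n = 32,
		 * len becomes 402 >= 384, so 402 - 256 = 146 objects are
		 * pushed back to the ring and len resets to 256.
		 */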
10380fac6a1cSBeilei Xing 		/* Add elements back into the cache */
10390fac6a1cSBeilei Xing 		uint32_t copied = 0;
10400fac6a1cSBeilei Xing 		/* n is a multiple of 32 */
10410fac6a1cSBeilei Xing 		while (copied < n) {
1042d16364e3SBruce Richardson #ifdef RTE_ARCH_64
10430fac6a1cSBeilei Xing 			const __m512i a = _mm512_loadu_si512(&txep[copied]);
10440fac6a1cSBeilei Xing 			const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
10450fac6a1cSBeilei Xing 			const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
10460fac6a1cSBeilei Xing 			const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);
10470fac6a1cSBeilei Xing 
10480fac6a1cSBeilei Xing 			_mm512_storeu_si512(&cache_objs[copied], a);
10490fac6a1cSBeilei Xing 			_mm512_storeu_si512(&cache_objs[copied + 8], b);
10500fac6a1cSBeilei Xing 			_mm512_storeu_si512(&cache_objs[copied + 16], c);
10510fac6a1cSBeilei Xing 			_mm512_storeu_si512(&cache_objs[copied + 24], d);
1052d16364e3SBruce Richardson #else
1053d16364e3SBruce Richardson 			const __m512i a = _mm512_loadu_si512(&txep[copied]);
1054d16364e3SBruce Richardson 			const __m512i b = _mm512_loadu_si512(&txep[copied + 16]);
1055d16364e3SBruce Richardson 			_mm512_storeu_si512(&cache_objs[copied], a);
1056d16364e3SBruce Richardson 			_mm512_storeu_si512(&cache_objs[copied + 16], b);
1057d16364e3SBruce Richardson #endif
10580fac6a1cSBeilei Xing 			copied += 32;
10590fac6a1cSBeilei Xing 		}
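		/*
		 * Each sw-ring entry holds a single mbuf pointer, so each pass
		 * of the loop above moves 32 entries: on 64-bit builds as
		 * 4 x 8 pointers (one 512-bit register each), on 32-bit builds
		 * as 2 x 16 pointers.
		 */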
10600fac6a1cSBeilei Xing 		cache->len += n;
10610fac6a1cSBeilei Xing 
10620fac6a1cSBeilei Xing 		if (cache->len >= cache->flushthresh) {
10630fac6a1cSBeilei Xing 			rte_mempool_ops_enqueue_bulk(mp,
10640fac6a1cSBeilei Xing 						     &cache->objs[cache->size],
10650fac6a1cSBeilei Xing 						     cache->len - cache->size);
10660fac6a1cSBeilei Xing 			cache->len = cache->size;
10670fac6a1cSBeilei Xing 		}
10680fac6a1cSBeilei Xing 		goto done;
10690fac6a1cSBeilei Xing 	}
10700fac6a1cSBeilei Xing 
10710fac6a1cSBeilei Xing normal:
10720fac6a1cSBeilei Xing 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
10730fac6a1cSBeilei Xing 	if (likely(m != NULL)) {
10740fac6a1cSBeilei Xing 		free[0] = m;
10750fac6a1cSBeilei Xing 		nb_free = 1;
10760fac6a1cSBeilei Xing 		for (i = 1; i < n; i++) {
10770fac6a1cSBeilei Xing 			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
10780fac6a1cSBeilei Xing 			if (likely(m != NULL)) {
10790fac6a1cSBeilei Xing 				if (likely(m->pool == free[0]->pool)) {
10800fac6a1cSBeilei Xing 					free[nb_free++] = m;
10810fac6a1cSBeilei Xing 				} else {
10820fac6a1cSBeilei Xing 					rte_mempool_put_bulk(free[0]->pool,
10830fac6a1cSBeilei Xing 							     (void *)free,
10840fac6a1cSBeilei Xing 							     nb_free);
10850fac6a1cSBeilei Xing 					free[0] = m;
10860fac6a1cSBeilei Xing 					nb_free = 1;
10870fac6a1cSBeilei Xing 				}
10880fac6a1cSBeilei Xing 			}
10890fac6a1cSBeilei Xing 		}
10900fac6a1cSBeilei Xing 		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
10910fac6a1cSBeilei Xing 	} else {
10920fac6a1cSBeilei Xing 		for (i = 1; i < n; i++) {
10930fac6a1cSBeilei Xing 			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
10940fac6a1cSBeilei Xing 			if (m != NULL)
10950fac6a1cSBeilei Xing 				rte_mempool_put(m->pool, m);
10960fac6a1cSBeilei Xing 		}
10970fac6a1cSBeilei Xing 	}
10980fac6a1cSBeilei Xing 
10990fac6a1cSBeilei Xing done:
11000fac6a1cSBeilei Xing 	/* buffers were freed, update counters */
11010fac6a1cSBeilei Xing 	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
11020fac6a1cSBeilei Xing 	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
11030fac6a1cSBeilei Xing 	if (txq->next_dd >= txq->nb_tx_desc)
11040fac6a1cSBeilei Xing 		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);
11050fac6a1cSBeilei Xing 
11060fac6a1cSBeilei Xing 	return txq->rs_thresh;
11070fac6a1cSBeilei Xing }
11080fac6a1cSBeilei Xing 
11090fac6a1cSBeilei Xing static __rte_always_inline void
11100fac6a1cSBeilei Xing tx_backlog_entry_avx512(struct idpf_tx_vec_entry *txep,
11110fac6a1cSBeilei Xing 			struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
11120fac6a1cSBeilei Xing {
11130fac6a1cSBeilei Xing 	int i;
11140fac6a1cSBeilei Xing 
11150fac6a1cSBeilei Xing 	for (i = 0; i < (int)nb_pkts; ++i)
11160fac6a1cSBeilei Xing 		txep[i].mbuf = tx_pkts[i];
11170fac6a1cSBeilei Xing }
11180fac6a1cSBeilei Xing 
11190fac6a1cSBeilei Xing static __rte_always_inline void
1120bab8149aSSimei Su idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
11210fac6a1cSBeilei Xing 	  struct rte_mbuf *pkt, uint64_t flags)
11220fac6a1cSBeilei Xing {
11230fac6a1cSBeilei Xing 	uint64_t high_qw =
1124bab8149aSSimei Su 		(IDPF_TX_DESC_DTYPE_DATA |
1125bab8149aSSimei Su 		 ((uint64_t)flags  << IDPF_TXD_QW1_CMD_S) |
1126bab8149aSSimei Su 		 ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S));
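	/*
	 * qword1 of the base data descriptor: DTYPE in the low bits, the
	 * command flags at IDPF_TXD_QW1_CMD_S and the buffer length at
	 * IDPF_TXD_QW1_TX_BUF_SZ_S.  qword0, set below, carries the buffer's
	 * DMA address.
	 */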
11270fac6a1cSBeilei Xing 
11280fac6a1cSBeilei Xing 	__m128i descriptor = _mm_set_epi64x(high_qw,
11290fac6a1cSBeilei Xing 					    pkt->buf_iova + pkt->data_off);
1130*43fd3624SAndre Muezerie 	_mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
11310fac6a1cSBeilei Xing }
11320fac6a1cSBeilei Xing 
11330fac6a1cSBeilei Xing #define IDPF_TX_LEN_MASK 0xAA
11340fac6a1cSBeilei Xing #define IDPF_TX_OFF_MASK 0x55
11350fac6a1cSBeilei Xing static __rte_always_inline void
1136bab8149aSSimei Su idpf_singleq_vtx(volatile struct idpf_base_tx_desc *txdp,
11370fac6a1cSBeilei Xing 	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
11380fac6a1cSBeilei Xing {
1139bab8149aSSimei Su 	const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_DATA  |
1140bab8149aSSimei Su 			((uint64_t)flags  << IDPF_TXD_QW1_CMD_S));
11410fac6a1cSBeilei Xing 
11420fac6a1cSBeilei Xing 	/* if not aligned on a 32-byte boundary, do one descriptor to align */
11430fac6a1cSBeilei Xing 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
1144e528d7c7SWenjun Wu 		idpf_singleq_vtx1(txdp, *pkt, flags);
11450fac6a1cSBeilei Xing 		nb_pkts--, txdp++, pkt++;
11460fac6a1cSBeilei Xing 	}
11470fac6a1cSBeilei Xing 
11480fac6a1cSBeilei Xing 	/* write 4 descriptors (one 64-byte store) at a time while possible */
11490fac6a1cSBeilei Xing 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
11500fac6a1cSBeilei Xing 		uint64_t hi_qw3 =
11510fac6a1cSBeilei Xing 			hi_qw_tmpl |
11520fac6a1cSBeilei Xing 			((uint64_t)pkt[3]->data_len <<
1153bab8149aSSimei Su 			 IDPF_TXD_QW1_TX_BUF_SZ_S);
11540fac6a1cSBeilei Xing 		uint64_t hi_qw2 =
11550fac6a1cSBeilei Xing 			hi_qw_tmpl |
11560fac6a1cSBeilei Xing 			((uint64_t)pkt[2]->data_len <<
1157bab8149aSSimei Su 			 IDPF_TXD_QW1_TX_BUF_SZ_S);
11580fac6a1cSBeilei Xing 		uint64_t hi_qw1 =
11590fac6a1cSBeilei Xing 			hi_qw_tmpl |
11600fac6a1cSBeilei Xing 			((uint64_t)pkt[1]->data_len <<
1161bab8149aSSimei Su 			 IDPF_TXD_QW1_TX_BUF_SZ_S);
11620fac6a1cSBeilei Xing 		uint64_t hi_qw0 =
11630fac6a1cSBeilei Xing 			hi_qw_tmpl |
11640fac6a1cSBeilei Xing 			((uint64_t)pkt[0]->data_len <<
1165bab8149aSSimei Su 			 IDPF_TXD_QW1_TX_BUF_SZ_S);
11660fac6a1cSBeilei Xing 
11670fac6a1cSBeilei Xing 		__m512i desc0_3 =
11680fac6a1cSBeilei Xing 			_mm512_set_epi64
11690fac6a1cSBeilei Xing 				(hi_qw3,
11700fac6a1cSBeilei Xing 				 pkt[3]->buf_iova + pkt[3]->data_off,
11710fac6a1cSBeilei Xing 				 hi_qw2,
11720fac6a1cSBeilei Xing 				 pkt[2]->buf_iova + pkt[2]->data_off,
11730fac6a1cSBeilei Xing 				 hi_qw1,
11740fac6a1cSBeilei Xing 				 pkt[1]->buf_iova + pkt[1]->data_off,
11750fac6a1cSBeilei Xing 				 hi_qw0,
11760fac6a1cSBeilei Xing 				 pkt[0]->buf_iova + pkt[0]->data_off);
1177*43fd3624SAndre Muezerie 		_mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3);
11780fac6a1cSBeilei Xing 	}
11790fac6a1cSBeilei Xing 
11800fac6a1cSBeilei Xing 	/* do any last ones */
11810fac6a1cSBeilei Xing 	while (nb_pkts) {
1182e528d7c7SWenjun Wu 		idpf_singleq_vtx1(txdp, *pkt, flags);
11830fac6a1cSBeilei Xing 		txdp++, pkt++, nb_pkts--;
11840fac6a1cSBeilei Xing 	}
11850fac6a1cSBeilei Xing }
11860fac6a1cSBeilei Xing 
11870fac6a1cSBeilei Xing static __rte_always_inline uint16_t
1188e528d7c7SWenjun Wu idpf_singleq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
11890fac6a1cSBeilei Xing 					 uint16_t nb_pkts)
11900fac6a1cSBeilei Xing {
11910fac6a1cSBeilei Xing 	struct idpf_tx_queue *txq = tx_queue;
1192bab8149aSSimei Su 	volatile struct idpf_base_tx_desc *txdp;
11930fac6a1cSBeilei Xing 	struct idpf_tx_vec_entry *txep;
11940fac6a1cSBeilei Xing 	uint16_t n, nb_commit, tx_id;
1195bab8149aSSimei Su 	uint64_t flags = IDPF_TX_DESC_CMD_EOP;
1196bab8149aSSimei Su 	uint64_t rs = IDPF_TX_DESC_CMD_RS | flags;
11970fac6a1cSBeilei Xing 
11980fac6a1cSBeilei Xing 	/* crossing the rs_thresh boundary is not allowed */
11990fac6a1cSBeilei Xing 	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
12000fac6a1cSBeilei Xing 
12010fac6a1cSBeilei Xing 	if (txq->nb_free < txq->free_thresh)
1202e528d7c7SWenjun Wu 		idpf_tx_singleq_free_bufs_avx512(txq);
12030fac6a1cSBeilei Xing 
12040fac6a1cSBeilei Xing 	nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
12050fac6a1cSBeilei Xing 	nb_commit = nb_pkts;
12060fac6a1cSBeilei Xing 	if (unlikely(nb_pkts == 0))
12070fac6a1cSBeilei Xing 		return 0;
12080fac6a1cSBeilei Xing 
12090fac6a1cSBeilei Xing 	tx_id = txq->tx_tail;
12100fac6a1cSBeilei Xing 	txdp = &txq->tx_ring[tx_id];
12110fac6a1cSBeilei Xing 	txep = (void *)txq->sw_ring;
12120fac6a1cSBeilei Xing 	txep += tx_id;
12130fac6a1cSBeilei Xing 
12140fac6a1cSBeilei Xing 	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
12150fac6a1cSBeilei Xing 
12160fac6a1cSBeilei Xing 	n = (uint16_t)(txq->nb_tx_desc - tx_id);
12170fac6a1cSBeilei Xing 	if (nb_commit >= n) {
12180fac6a1cSBeilei Xing 		tx_backlog_entry_avx512(txep, tx_pkts, n);
12190fac6a1cSBeilei Xing 
1220e528d7c7SWenjun Wu 		idpf_singleq_vtx(txdp, tx_pkts, n - 1, flags);
12210fac6a1cSBeilei Xing 		tx_pkts += (n - 1);
12220fac6a1cSBeilei Xing 		txdp += (n - 1);
12230fac6a1cSBeilei Xing 
1224e528d7c7SWenjun Wu 		idpf_singleq_vtx1(txdp, *tx_pkts++, rs);
12250fac6a1cSBeilei Xing 
12260fac6a1cSBeilei Xing 		nb_commit = (uint16_t)(nb_commit - n);
12270fac6a1cSBeilei Xing 
12280fac6a1cSBeilei Xing 		tx_id = 0;
12290fac6a1cSBeilei Xing 		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
12300fac6a1cSBeilei Xing 
12310fac6a1cSBeilei Xing 		/* wrap around to avoid running past the end of the ring */
12320fac6a1cSBeilei Xing 		txdp = &txq->tx_ring[tx_id];
12330fac6a1cSBeilei Xing 		txep = (void *)txq->sw_ring;
12340fac6a1cSBeilei Xing 		txep += tx_id;
12350fac6a1cSBeilei Xing 	}
12360fac6a1cSBeilei Xing 
12370fac6a1cSBeilei Xing 	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
12380fac6a1cSBeilei Xing 
1239e528d7c7SWenjun Wu 	idpf_singleq_vtx(txdp, tx_pkts, nb_commit, flags);
12400fac6a1cSBeilei Xing 
12410fac6a1cSBeilei Xing 	tx_id = (uint16_t)(tx_id + nb_commit);
12420fac6a1cSBeilei Xing 	if (tx_id > txq->next_rs) {
1243bab8149aSSimei Su 		txq->tx_ring[txq->next_rs].qw1 |=
1244bab8149aSSimei Su 			rte_cpu_to_le_64(((uint64_t)IDPF_TX_DESC_CMD_RS) <<
1245bab8149aSSimei Su 					 IDPF_TXD_QW1_CMD_S);
12460fac6a1cSBeilei Xing 		txq->next_rs =
12470fac6a1cSBeilei Xing 			(uint16_t)(txq->next_rs + txq->rs_thresh);
12480fac6a1cSBeilei Xing 	}
12490fac6a1cSBeilei Xing 
12500fac6a1cSBeilei Xing 	txq->tx_tail = tx_id;
12510fac6a1cSBeilei Xing 
12520fac6a1cSBeilei Xing 	IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
12530fac6a1cSBeilei Xing 
12540fac6a1cSBeilei Xing 	return nb_pkts;
12550fac6a1cSBeilei Xing }
12560fac6a1cSBeilei Xing 
12570fac6a1cSBeilei Xing static __rte_always_inline uint16_t
1258e528d7c7SWenjun Wu idpf_singleq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
12590fac6a1cSBeilei Xing 			      uint16_t nb_pkts)
12600fac6a1cSBeilei Xing {
12610fac6a1cSBeilei Xing 	uint16_t nb_tx = 0;
12620fac6a1cSBeilei Xing 	struct idpf_tx_queue *txq = tx_queue;
12630fac6a1cSBeilei Xing 
12640fac6a1cSBeilei Xing 	while (nb_pkts) {
12650fac6a1cSBeilei Xing 		uint16_t ret, num;
12660fac6a1cSBeilei Xing 
12670fac6a1cSBeilei Xing 		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
1268e528d7c7SWenjun Wu 		ret = idpf_singleq_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
12690fac6a1cSBeilei Xing 						       num);
12700fac6a1cSBeilei Xing 		nb_tx += ret;
12710fac6a1cSBeilei Xing 		nb_pkts -= ret;
12720fac6a1cSBeilei Xing 		if (ret < num)
12730fac6a1cSBeilei Xing 			break;
12740fac6a1cSBeilei Xing 	}
12750fac6a1cSBeilei Xing 
12760fac6a1cSBeilei Xing 	return nb_tx;
12770fac6a1cSBeilei Xing }
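/*
 * idpf_singleq_xmit_fixed_burst_vec_avx512() never accepts more than
 * rs_thresh packets, so the loop above feeds it rs_thresh-sized chunks;
 * e.g. with rs_thresh = 32 and nb_pkts = 100 it issues bursts of
 * 32, 32, 32 and 4, stopping early if a burst is cut short by a full ring.
 */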
12780fac6a1cSBeilei Xing 
12790fac6a1cSBeilei Xing uint16_t
12809ebf3f6bSBeilei Xing idpf_dp_singleq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
12810fac6a1cSBeilei Xing 				 uint16_t nb_pkts)
12820fac6a1cSBeilei Xing {
1283e528d7c7SWenjun Wu 	return idpf_singleq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts);
1284e528d7c7SWenjun Wu }
1285e528d7c7SWenjun Wu 
1286e528d7c7SWenjun Wu static __rte_always_inline void
1287e528d7c7SWenjun Wu idpf_splitq_scan_cq_ring(struct idpf_tx_queue *cq)
1288e528d7c7SWenjun Wu {
1289e528d7c7SWenjun Wu 	struct idpf_splitq_tx_compl_desc *compl_ring;
1290e528d7c7SWenjun Wu 	struct idpf_tx_queue *txq;
1291e528d7c7SWenjun Wu 	uint16_t genid, txq_qid, cq_qid, i;
1292e528d7c7SWenjun Wu 	uint8_t ctype;
1293e528d7c7SWenjun Wu 
1294e528d7c7SWenjun Wu 	cq_qid = cq->tx_tail;
1295e528d7c7SWenjun Wu 
1296e528d7c7SWenjun Wu 	for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
1297e528d7c7SWenjun Wu 		if (cq_qid == cq->nb_tx_desc) {
1298e528d7c7SWenjun Wu 			cq_qid = 0;
1299e528d7c7SWenjun Wu 			cq->expected_gen_id ^= 1;
1300e528d7c7SWenjun Wu 		}
1301e528d7c7SWenjun Wu 		compl_ring = &cq->compl_ring[cq_qid];
1302e528d7c7SWenjun Wu 		genid = (compl_ring->qid_comptype_gen &
1303e528d7c7SWenjun Wu 			rte_cpu_to_le_64(IDPF_TXD_COMPLQ_GEN_M)) >> IDPF_TXD_COMPLQ_GEN_S;
1304e528d7c7SWenjun Wu 		if (genid != cq->expected_gen_id)
1305e528d7c7SWenjun Wu 			break;
1306e528d7c7SWenjun Wu 		ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
1307e528d7c7SWenjun Wu 			IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
1308e528d7c7SWenjun Wu 		txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
1309e528d7c7SWenjun Wu 			IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
1310e528d7c7SWenjun Wu 		txq = cq->txqs[txq_qid - cq->tx_start_qid];
1311e528d7c7SWenjun Wu 		txq->ctype[ctype]++;
1312e528d7c7SWenjun Wu 		cq_qid++;
1313e528d7c7SWenjun Wu 	}
1314e528d7c7SWenjun Wu 
1315e528d7c7SWenjun Wu 	cq->tx_tail = cq_qid;
1316e528d7c7SWenjun Wu }
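/*
 * Each completion descriptor carries a generation bit, a completion type and
 * the id of the tx queue it refers to.  The scan above walks the completion
 * ring until the generation bit stops matching, bumping the per-type counter
 * of the owning tx queue; the splitq transmit path below uses the
 * IDPF_TXD_COMPLT_RS count to decide when mbufs can be freed.
 */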
1317e528d7c7SWenjun Wu 
1318e528d7c7SWenjun Wu static __rte_always_inline int
1319e528d7c7SWenjun Wu idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
1320e528d7c7SWenjun Wu {
1321e528d7c7SWenjun Wu 	struct idpf_tx_vec_entry *txep;
1322e528d7c7SWenjun Wu 	uint32_t n;
1323e528d7c7SWenjun Wu 	uint32_t i;
1324e528d7c7SWenjun Wu 	int nb_free = 0;
1325e528d7c7SWenjun Wu 	struct rte_mbuf *m, *free[txq->rs_thresh];
1326e528d7c7SWenjun Wu 
1327e528d7c7SWenjun Wu 	n = txq->rs_thresh;
1328e528d7c7SWenjun Wu 
1329e528d7c7SWenjun Wu 	 /* first buffer to free from the S/W ring is at index
1330e528d7c7SWenjun Wu 	  * next_dd - (rs_thresh - 1)
1331e528d7c7SWenjun Wu 	  */
1332e528d7c7SWenjun Wu 	txep = (void *)txq->sw_ring;
1333e528d7c7SWenjun Wu 	txep += txq->next_dd - (n - 1);
1334e528d7c7SWenjun Wu 
1335e528d7c7SWenjun Wu 	if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
1336e528d7c7SWenjun Wu 		struct rte_mempool *mp = txep[0].mbuf->pool;
1337e528d7c7SWenjun Wu 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
1338e528d7c7SWenjun Wu 								rte_lcore_id());
1339e528d7c7SWenjun Wu 		void **cache_objs;
1340e528d7c7SWenjun Wu 
1341e528d7c7SWenjun Wu 		if (!cache || cache->len == 0)
1342e528d7c7SWenjun Wu 			goto normal;
1343e528d7c7SWenjun Wu 
1344e528d7c7SWenjun Wu 		cache_objs = &cache->objs[cache->len];
1345e528d7c7SWenjun Wu 
1346e528d7c7SWenjun Wu 		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
1347e528d7c7SWenjun Wu 			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
1348e528d7c7SWenjun Wu 			goto done;
1349e528d7c7SWenjun Wu 		}
1350e528d7c7SWenjun Wu 
1351e528d7c7SWenjun Wu 		/* The cache is refilled as follows:
1352e528d7c7SWenjun Wu 		 *   1. Add the freed objects to the cache.
1353e528d7c7SWenjun Wu 		 *   2. If the cache length then exceeds the flush threshold,
1354e528d7c7SWenjun Wu 		 *      everything above the nominal cache size is flushed back to the ring.
1355e528d7c7SWenjun Wu 		 */
1356e528d7c7SWenjun Wu 		/* Add elements back into the cache */
1357e528d7c7SWenjun Wu 		uint32_t copied = 0;
1358e528d7c7SWenjun Wu 		/* n is a multiple of 32 */
1359e528d7c7SWenjun Wu 		while (copied < n) {
1360e528d7c7SWenjun Wu 			const __m512i a = _mm512_loadu_si512(&txep[copied]);
1361e528d7c7SWenjun Wu 			const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
1362e528d7c7SWenjun Wu 			const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
1363e528d7c7SWenjun Wu 			const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);
1364e528d7c7SWenjun Wu 
1365e528d7c7SWenjun Wu 			_mm512_storeu_si512(&cache_objs[copied], a);
1366e528d7c7SWenjun Wu 			_mm512_storeu_si512(&cache_objs[copied + 8], b);
1367e528d7c7SWenjun Wu 			_mm512_storeu_si512(&cache_objs[copied + 16], c);
1368e528d7c7SWenjun Wu 			_mm512_storeu_si512(&cache_objs[copied + 24], d);
1369e528d7c7SWenjun Wu 			copied += 32;
1370e528d7c7SWenjun Wu 		}
1371e528d7c7SWenjun Wu 		cache->len += n;
1372e528d7c7SWenjun Wu 
1373e528d7c7SWenjun Wu 		if (cache->len >= cache->flushthresh) {
1374e528d7c7SWenjun Wu 			rte_mempool_ops_enqueue_bulk(mp,
1375e528d7c7SWenjun Wu 						     &cache->objs[cache->size],
1376e528d7c7SWenjun Wu 						     cache->len - cache->size);
1377e528d7c7SWenjun Wu 			cache->len = cache->size;
1378e528d7c7SWenjun Wu 		}
1379e528d7c7SWenjun Wu 		goto done;
1380e528d7c7SWenjun Wu 	}
1381e528d7c7SWenjun Wu 
1382e528d7c7SWenjun Wu normal:
1383e528d7c7SWenjun Wu 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
1384e528d7c7SWenjun Wu 	if (likely(m)) {
1385e528d7c7SWenjun Wu 		free[0] = m;
1386e528d7c7SWenjun Wu 		nb_free = 1;
1387e528d7c7SWenjun Wu 		for (i = 1; i < n; i++) {
1388e528d7c7SWenjun Wu 			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
1389e528d7c7SWenjun Wu 			if (likely(m)) {
1390e528d7c7SWenjun Wu 				if (likely(m->pool == free[0]->pool)) {
1391e528d7c7SWenjun Wu 					free[nb_free++] = m;
1392e528d7c7SWenjun Wu 				} else {
1393e528d7c7SWenjun Wu 					rte_mempool_put_bulk(free[0]->pool,
1394e528d7c7SWenjun Wu 							     (void *)free,
1395e528d7c7SWenjun Wu 							     nb_free);
1396e528d7c7SWenjun Wu 					free[0] = m;
1397e528d7c7SWenjun Wu 					nb_free = 1;
1398e528d7c7SWenjun Wu 				}
1399e528d7c7SWenjun Wu 			}
1400e528d7c7SWenjun Wu 		}
1401e528d7c7SWenjun Wu 		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
1402e528d7c7SWenjun Wu 	} else {
1403e528d7c7SWenjun Wu 		for (i = 1; i < n; i++) {
1404e528d7c7SWenjun Wu 			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
1405e528d7c7SWenjun Wu 			if (m)
1406e528d7c7SWenjun Wu 				rte_mempool_put(m->pool, m);
1407e528d7c7SWenjun Wu 		}
1408e528d7c7SWenjun Wu 	}
1409e528d7c7SWenjun Wu 
1410e528d7c7SWenjun Wu done:
1411e528d7c7SWenjun Wu 	/* buffers were freed, update counters */
1412e528d7c7SWenjun Wu 	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
1413e528d7c7SWenjun Wu 	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
1414e528d7c7SWenjun Wu 	if (txq->next_dd >= txq->nb_tx_desc)
1415e528d7c7SWenjun Wu 		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);
1416e528d7c7SWenjun Wu 	txq->ctype[IDPF_TXD_COMPLT_RS] -= txq->rs_thresh;
1417e528d7c7SWenjun Wu 
1418e528d7c7SWenjun Wu 	return txq->rs_thresh;
1419e528d7c7SWenjun Wu }
1420e528d7c7SWenjun Wu 
1421e528d7c7SWenjun Wu #define IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S	48
1422e528d7c7SWenjun Wu 
1423e528d7c7SWenjun Wu static __rte_always_inline void
1424e528d7c7SWenjun Wu idpf_splitq_vtx1(volatile struct idpf_flex_tx_sched_desc *txdp,
1425e528d7c7SWenjun Wu 	  struct rte_mbuf *pkt, uint64_t flags)
1426e528d7c7SWenjun Wu {
1427e528d7c7SWenjun Wu 	uint64_t high_qw =
1428e528d7c7SWenjun Wu 		(IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
1429e528d7c7SWenjun Wu 		 ((uint64_t)flags) |
1430e528d7c7SWenjun Wu 		 ((uint64_t)pkt->data_len << IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S));
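	/*
	 * Flow-scheduling descriptor qword1: the FLEX_FLOW_SCHE dtype and the
	 * command flags sit in the low bits (no CMD shift here, unlike the
	 * base descriptor), and the buffer length is packed at bit 48.
	 */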
1431e528d7c7SWenjun Wu 
1432e528d7c7SWenjun Wu 	__m128i descriptor = _mm_set_epi64x(high_qw,
1433e528d7c7SWenjun Wu 					    pkt->buf_iova + pkt->data_off);
1434*43fd3624SAndre Muezerie 	_mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
1435e528d7c7SWenjun Wu }
1436e528d7c7SWenjun Wu 
1437e528d7c7SWenjun Wu static __rte_always_inline void
1438e528d7c7SWenjun Wu idpf_splitq_vtx(volatile struct idpf_flex_tx_sched_desc *txdp,
1439e528d7c7SWenjun Wu 	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
1440e528d7c7SWenjun Wu {
1441e528d7c7SWenjun Wu 	const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE  |
1442e528d7c7SWenjun Wu 			((uint64_t)flags));
1443e528d7c7SWenjun Wu 
1444e528d7c7SWenjun Wu 	/* if not aligned on a 32-byte boundary, do one descriptor to align */
1445e528d7c7SWenjun Wu 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
1446e528d7c7SWenjun Wu 		idpf_splitq_vtx1(txdp, *pkt, flags);
1447e528d7c7SWenjun Wu 		nb_pkts--, txdp++, pkt++;
1448e528d7c7SWenjun Wu 	}
1449e528d7c7SWenjun Wu 
1450e528d7c7SWenjun Wu 	/* write 4 descriptors (one 64-byte store) at a time while possible */
1451e528d7c7SWenjun Wu 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
1452e528d7c7SWenjun Wu 		uint64_t hi_qw3 =
1453e528d7c7SWenjun Wu 			hi_qw_tmpl |
1454e528d7c7SWenjun Wu 			((uint64_t)pkt[3]->data_len <<
1455e528d7c7SWenjun Wu 			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
1456e528d7c7SWenjun Wu 		uint64_t hi_qw2 =
1457e528d7c7SWenjun Wu 			hi_qw_tmpl |
1458e528d7c7SWenjun Wu 			((uint64_t)pkt[2]->data_len <<
1459e528d7c7SWenjun Wu 			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
1460e528d7c7SWenjun Wu 		uint64_t hi_qw1 =
1461e528d7c7SWenjun Wu 			hi_qw_tmpl |
1462e528d7c7SWenjun Wu 			((uint64_t)pkt[1]->data_len <<
1463e528d7c7SWenjun Wu 			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
1464e528d7c7SWenjun Wu 		uint64_t hi_qw0 =
1465e528d7c7SWenjun Wu 			hi_qw_tmpl |
1466e528d7c7SWenjun Wu 			((uint64_t)pkt[0]->data_len <<
1467e528d7c7SWenjun Wu 			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
1468e528d7c7SWenjun Wu 
1469e528d7c7SWenjun Wu 		__m512i desc0_3 =
1470e528d7c7SWenjun Wu 			_mm512_set_epi64
1471e528d7c7SWenjun Wu 				(hi_qw3,
1472e528d7c7SWenjun Wu 				 pkt[3]->buf_iova + pkt[3]->data_off,
1473e528d7c7SWenjun Wu 				 hi_qw2,
1474e528d7c7SWenjun Wu 				 pkt[2]->buf_iova + pkt[2]->data_off,
1475e528d7c7SWenjun Wu 				 hi_qw1,
1476e528d7c7SWenjun Wu 				 pkt[1]->buf_iova + pkt[1]->data_off,
1477e528d7c7SWenjun Wu 				 hi_qw0,
1478e528d7c7SWenjun Wu 				 pkt[0]->buf_iova + pkt[0]->data_off);
1479*43fd3624SAndre Muezerie 		_mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3);
1480e528d7c7SWenjun Wu 	}
1481e528d7c7SWenjun Wu 
1482e528d7c7SWenjun Wu 	/* do any last ones */
1483e528d7c7SWenjun Wu 	while (nb_pkts) {
1484e528d7c7SWenjun Wu 		idpf_splitq_vtx1(txdp, *pkt, flags);
1485e528d7c7SWenjun Wu 		txdp++, pkt++, nb_pkts--;
1486e528d7c7SWenjun Wu 	}
1487e528d7c7SWenjun Wu }
1488e528d7c7SWenjun Wu 
1489e528d7c7SWenjun Wu static __rte_always_inline uint16_t
1490e528d7c7SWenjun Wu idpf_splitq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1491e528d7c7SWenjun Wu 					uint16_t nb_pkts)
1492e528d7c7SWenjun Wu {
1493e528d7c7SWenjun Wu 	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
1494e528d7c7SWenjun Wu 	volatile struct idpf_flex_tx_sched_desc *txdp;
1495e528d7c7SWenjun Wu 	struct idpf_tx_vec_entry *txep;
1496e528d7c7SWenjun Wu 	uint16_t n, nb_commit, tx_id;
1497e528d7c7SWenjun Wu 	/* bit 2 is reserved and must be set to 1 according to the spec */
1498e528d7c7SWenjun Wu 	uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
1499e528d7c7SWenjun Wu 
1500e528d7c7SWenjun Wu 	tx_id = txq->tx_tail;
1501e528d7c7SWenjun Wu 
1502e528d7c7SWenjun Wu 	/* crossing the rs_thresh boundary is not allowed */
1503e528d7c7SWenjun Wu 	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
1504e528d7c7SWenjun Wu 
1505e528d7c7SWenjun Wu 	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
1506e528d7c7SWenjun Wu 	if (unlikely(nb_pkts == 0))
1507e528d7c7SWenjun Wu 		return 0;
1508e528d7c7SWenjun Wu 
1509e528d7c7SWenjun Wu 	tx_id = txq->tx_tail;
1510e528d7c7SWenjun Wu 	txdp = &txq->desc_ring[tx_id];
1511e528d7c7SWenjun Wu 	txep = (void *)txq->sw_ring;
1512e528d7c7SWenjun Wu 	txep += tx_id;
1513e528d7c7SWenjun Wu 
1514e528d7c7SWenjun Wu 	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
1515e528d7c7SWenjun Wu 
1516e528d7c7SWenjun Wu 	n = (uint16_t)(txq->nb_tx_desc - tx_id);
1517e528d7c7SWenjun Wu 	if (nb_commit >= n) {
1518e528d7c7SWenjun Wu 		tx_backlog_entry_avx512(txep, tx_pkts, n);
1519e528d7c7SWenjun Wu 
1520*43fd3624SAndre Muezerie 		idpf_splitq_vtx(txdp, tx_pkts, n - 1, cmd_dtype);
1521e528d7c7SWenjun Wu 		tx_pkts += (n - 1);
1522e528d7c7SWenjun Wu 		txdp += (n - 1);
1523e528d7c7SWenjun Wu 
1524*43fd3624SAndre Muezerie 		idpf_splitq_vtx1(txdp, *tx_pkts++, cmd_dtype);
1525e528d7c7SWenjun Wu 
1526e528d7c7SWenjun Wu 		nb_commit = (uint16_t)(nb_commit - n);
1527e528d7c7SWenjun Wu 
1528e528d7c7SWenjun Wu 		tx_id = 0;
1529e528d7c7SWenjun Wu 		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
1530e528d7c7SWenjun Wu 
1531e528d7c7SWenjun Wu 		/* wrap around to avoid running past the end of the ring */
1532e528d7c7SWenjun Wu 		txdp = &txq->desc_ring[tx_id];
1533e528d7c7SWenjun Wu 		txep = (void *)txq->sw_ring;
1534e528d7c7SWenjun Wu 		txep += tx_id;
1535e528d7c7SWenjun Wu 	}
1536e528d7c7SWenjun Wu 
1537e528d7c7SWenjun Wu 	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
1538e528d7c7SWenjun Wu 
1539*43fd3624SAndre Muezerie 	idpf_splitq_vtx(txdp, tx_pkts, nb_commit, cmd_dtype);
1540e528d7c7SWenjun Wu 
1541e528d7c7SWenjun Wu 	tx_id = (uint16_t)(tx_id + nb_commit);
1542e528d7c7SWenjun Wu 	if (tx_id > txq->next_rs)
1543e528d7c7SWenjun Wu 		txq->next_rs =
1544e528d7c7SWenjun Wu 			(uint16_t)(txq->next_rs + txq->rs_thresh);
1545e528d7c7SWenjun Wu 
1546e528d7c7SWenjun Wu 	txq->tx_tail = tx_id;
1547e528d7c7SWenjun Wu 
1548e528d7c7SWenjun Wu 	IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
1549e528d7c7SWenjun Wu 
1550e528d7c7SWenjun Wu 	return nb_pkts;
1551e528d7c7SWenjun Wu }
1552e528d7c7SWenjun Wu 
1553e528d7c7SWenjun Wu static __rte_always_inline uint16_t
1554e528d7c7SWenjun Wu idpf_splitq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
1555e528d7c7SWenjun Wu 				     uint16_t nb_pkts)
1556e528d7c7SWenjun Wu {
1557e528d7c7SWenjun Wu 	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
1558e528d7c7SWenjun Wu 	uint16_t nb_tx = 0;
1559e528d7c7SWenjun Wu 
1560e528d7c7SWenjun Wu 	while (nb_pkts) {
1561e528d7c7SWenjun Wu 		uint16_t ret, num;
1562e528d7c7SWenjun Wu 
1563e528d7c7SWenjun Wu 		idpf_splitq_scan_cq_ring(txq->complq);
1564e528d7c7SWenjun Wu 
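		/*
		 * Completions tallied by idpf_splitq_scan_cq_ring() accumulate
		 * in txq->ctype[]; once more than free_thresh RS completions
		 * are pending, the matching sw-ring mbufs are recycled before
		 * new packets are queued.
		 */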
1565e528d7c7SWenjun Wu 		if (txq->ctype[IDPF_TXD_COMPLT_RS] > txq->free_thresh)
1566e528d7c7SWenjun Wu 			idpf_tx_splitq_free_bufs_avx512(txq);
1567e528d7c7SWenjun Wu 
1568e528d7c7SWenjun Wu 		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
1569e528d7c7SWenjun Wu 		ret = idpf_splitq_xmit_fixed_burst_vec_avx512(tx_queue,
1570e528d7c7SWenjun Wu 							      &tx_pkts[nb_tx],
1571e528d7c7SWenjun Wu 							      num);
1572e528d7c7SWenjun Wu 		nb_tx += ret;
1573e528d7c7SWenjun Wu 		nb_pkts -= ret;
1574e528d7c7SWenjun Wu 		if (ret < num)
1575e528d7c7SWenjun Wu 			break;
1576e528d7c7SWenjun Wu 	}
1577e528d7c7SWenjun Wu 
1578e528d7c7SWenjun Wu 	return nb_tx;
1579e528d7c7SWenjun Wu }
1580e528d7c7SWenjun Wu 
1581e528d7c7SWenjun Wu uint16_t
1582e528d7c7SWenjun Wu idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1583e528d7c7SWenjun Wu 				uint16_t nb_pkts)
1584e528d7c7SWenjun Wu {
1585e528d7c7SWenjun Wu 	return idpf_splitq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts);
15860fac6a1cSBeilei Xing }
15870fac6a1cSBeilei Xing 
15880fac6a1cSBeilei Xing static inline void
1589e528d7c7SWenjun Wu idpf_tx_release_mbufs_avx512(struct idpf_tx_queue *txq)
15900fac6a1cSBeilei Xing {
15910fac6a1cSBeilei Xing 	unsigned int i;
15920fac6a1cSBeilei Xing 	const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1);
15930fac6a1cSBeilei Xing 	struct idpf_tx_vec_entry *swr = (void *)txq->sw_ring;
15940fac6a1cSBeilei Xing 
15950fac6a1cSBeilei Xing 	if (txq->sw_ring == NULL || txq->nb_free == max_desc)
15960fac6a1cSBeilei Xing 		return;
15970fac6a1cSBeilei Xing 
15980fac6a1cSBeilei Xing 	i = txq->next_dd - txq->rs_thresh + 1;
15990fac6a1cSBeilei Xing 	if (txq->tx_tail < i) {
16000fac6a1cSBeilei Xing 		for (; i < txq->nb_tx_desc; i++) {
16010fac6a1cSBeilei Xing 			rte_pktmbuf_free_seg(swr[i].mbuf);
16020fac6a1cSBeilei Xing 			swr[i].mbuf = NULL;
16030fac6a1cSBeilei Xing 		}
16040fac6a1cSBeilei Xing 		i = 0;
16050fac6a1cSBeilei Xing 	}
1606b28f22e8SWenjun Wu 	for (; i < txq->tx_tail; i++) {
1607b28f22e8SWenjun Wu 		rte_pktmbuf_free_seg(swr[i].mbuf);
1608b28f22e8SWenjun Wu 		swr[i].mbuf = NULL;
1609b28f22e8SWenjun Wu 	}
16100fac6a1cSBeilei Xing }
16110fac6a1cSBeilei Xing 
1612e528d7c7SWenjun Wu static const struct idpf_txq_ops avx512_tx_vec_ops = {
1613e528d7c7SWenjun Wu 	.release_mbufs = idpf_tx_release_mbufs_avx512,
16140fac6a1cSBeilei Xing };
16150fac6a1cSBeilei Xing 
16160fac6a1cSBeilei Xing int __rte_cold
1617e528d7c7SWenjun Wu idpf_qc_tx_vec_avx512_setup(struct idpf_tx_queue *txq)
16180fac6a1cSBeilei Xing {
1619e528d7c7SWenjun Wu 	if (!txq)
1620e528d7c7SWenjun Wu 		return 0;
1621e528d7c7SWenjun Wu 
1622e528d7c7SWenjun Wu 	txq->ops = &avx512_tx_vec_ops;
16230fac6a1cSBeilei Xing 	return 0;
16240fac6a1cSBeilei Xing }
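/*
 * A rough usage sketch (driver side, not part of this file): a tx queue that
 * passes the vector-path checks would typically do
 *
 *	idpf_qc_tx_vec_avx512_setup(txq);
 *	dev->tx_pkt_burst = idpf_dp_singleq_xmit_pkts_avx512;
 *
 * (or idpf_dp_splitq_xmit_pkts_avx512 for the split queue model), so that the
 * release_mbufs op installed above is used when the queue is torn down.
 */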
1625