xref: /dpdk/drivers/common/idpf/idpf_common_rxtx_avx512.c (revision 43fd3624fdfe3a33904a9b64d94306dd3d4f2c13)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2023 Intel Corporation
3  */
4 
5 #include <rte_vect.h>
6 #include "idpf_common_device.h"
7 #include "idpf_common_rxtx.h"
8 
9 #define IDPF_DESCS_PER_LOOP_AVX 8
10 #define PKTLEN_SHIFT 10
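/* IDPF_DESCS_PER_LOOP_AVX is the number of descriptors handled per iteration
 * of the AVX-512 receive and rearm loops below; PKTLEN_SHIFT is the left
 * shift used in the singleq receive path to line up the descriptor packet
 * length before the 16-bit blend with IDPF_RX_LEN_MASK.
 */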
11 
12 static __rte_always_inline void
13 idpf_singleq_rearm_common(struct idpf_rx_queue *rxq)
14 {
15 	struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
16 	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
17 	uint16_t rx_id;
18 	int i;
19 
20 	rxdp += rxq->rxrearm_start;
21 
22 	/* Pull 'n' more MBUFs into the software ring */
23 	if (rte_mempool_get_bulk(rxq->mp,
24 				 (void *)rxp,
25 				 IDPF_RXQ_REARM_THRESH) < 0) {
26 		if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
27 		    rxq->nb_rx_desc) {
28 			__m128i dma_addr0;
29 
30 			dma_addr0 = _mm_setzero_si128();
31 			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
32 				rxp[i] = &rxq->fake_mbuf;
33 				_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i].read),
34 						dma_addr0);
35 			}
36 		}
37 		rte_atomic_fetch_add_explicit(&rxq->rx_stats.mbuf_alloc_failed,
38 				   IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
39 		return;
40 	}
41 	struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
42 	struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
43 	__m512i dma_addr0_3, dma_addr4_7;
44 	__m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
45 	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
46 	for (i = 0; i < IDPF_RXQ_REARM_THRESH;
47 			i += 8, rxp += 8, rxdp += 8) {
48 		__m128i vaddr0, vaddr1, vaddr2, vaddr3;
49 		__m128i vaddr4, vaddr5, vaddr6, vaddr7;
50 		__m256i vaddr0_1, vaddr2_3;
51 		__m256i vaddr4_5, vaddr6_7;
52 		__m512i vaddr0_3, vaddr4_7;
53 
54 		mb0 = rxp[0];
55 		mb1 = rxp[1];
56 		mb2 = rxp[2];
57 		mb3 = rxp[3];
58 		mb4 = rxp[4];
59 		mb5 = rxp[5];
60 		mb6 = rxp[6];
61 		mb7 = rxp[7];
62 
63 		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
64 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
65 				offsetof(struct rte_mbuf, buf_addr) + 8);
66 		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
67 		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
68 		vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
69 		vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
70 		vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
71 		vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
72 		vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
73 		vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);
74 
75 		/**
76 		 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
77 		 * into the high lanes. Similarly for 2 & 3, and so on.
78 		 */
79 		vaddr0_1 =
80 			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
81 						vaddr1, 1);
82 		vaddr2_3 =
83 			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
84 						vaddr3, 1);
85 		vaddr4_5 =
86 			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
87 						vaddr5, 1);
88 		vaddr6_7 =
89 			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
90 						vaddr7, 1);
91 		vaddr0_3 =
92 			_mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
93 						vaddr2_3, 1);
94 		vaddr4_7 =
95 			_mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
96 						vaddr6_7, 1);
97 
98 		/* convert pa to dma_addr hdr/data */
99 		dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
100 		dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);
101 
102 		/* add headroom to pa values */
103 		dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
104 		dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);
105 
106 		/* flush desc with pa dma_addr */
107 		_mm512_store_si512(RTE_CAST_PTR(__m512i *, &rxdp->read), dma_addr0_3);
108 		_mm512_store_si512(RTE_CAST_PTR(__m512i *, &(rxdp + 4)->read), dma_addr4_7);
109 	}
110 
111 	rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
112 	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
113 		rxq->rxrearm_start = 0;
114 
115 	rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
116 
117 	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
118 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
119 
120 	/* Update the tail pointer on the NIC */
121 	IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
122 }
123 
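/* Cache-based rearm: refill the software ring straight from the lcore's
 * mempool cache so the copy into the shadow ring can be vectorized; when no
 * per-lcore cache is available, fall back to idpf_singleq_rearm_common().
 */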
124 static __rte_always_inline void
125 idpf_singleq_rearm(struct idpf_rx_queue *rxq)
126 {
127 	int i;
128 	uint16_t rx_id;
129 	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
130 	struct rte_mempool_cache *cache =
131 		rte_mempool_default_cache(rxq->mp, rte_lcore_id());
132 	struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
133 
134 	rxdp += rxq->rxrearm_start;
135 
136 	if (unlikely(cache == NULL))
137 		return idpf_singleq_rearm_common(rxq);
138 
139 	/* We need to pull 'n' more MBUFs into the software ring from the mempool.
140 	 * We inline the mempool function here so we can vectorize the copy
141 	 * from the cache into the shadow ring.
142 	 */
143 
144 	/* Can this be satisfied from the cache? */
145 	if (cache->len < IDPF_RXQ_REARM_THRESH) {
146 		/* No. Backfill the cache first, and then fill from it */
147 		uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
148 							cache->len);
149 
150 		/* How many do we require, i.e. enough to refill the cache plus the request */
151 		int ret = rte_mempool_ops_dequeue_bulk
152 				(rxq->mp, &cache->objs[cache->len], req);
153 		if (ret == 0) {
154 			cache->len += req;
155 		} else {
156 			if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
157 			    rxq->nb_rx_desc) {
158 				__m128i dma_addr0;
159 
160 				dma_addr0 = _mm_setzero_si128();
161 				for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
162 					rxp[i] = &rxq->fake_mbuf;
163 					_mm_storeu_si128(RTE_CAST_PTR
164 							(__m128i *, &rxdp[i].read), dma_addr0);
165 				}
166 			}
167 			rte_atomic_fetch_add_explicit(&rxq->rx_stats.mbuf_alloc_failed,
168 					   IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
169 			return;
170 		}
171 	}
172 
173 	const __m512i iova_offsets =  _mm512_set1_epi64(offsetof
174 							(struct rte_mbuf, buf_iova));
175 	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
176 
177 	/* permute mask to shuffle the addresses into the correct slots.
178 	 * Values 4-7 will contain zeros, so index 7 is used as a zero value.
179 	 */
180 	const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
181 
182 	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
183 	 * from mempool cache and populating both shadow and HW rings
184 	 */
185 	for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) {
186 		const __m512i mbuf_ptrs = _mm512_loadu_si512
187 			(&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]);
188 		_mm512_storeu_si512(rxp, mbuf_ptrs);
189 
190 		const __m512i iova_base_addrs = _mm512_i64gather_epi64
191 				(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
192 				 0, /* base */
193 				 1  /* scale */);
194 		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
195 				headroom);
196 		const __m512i iovas0 = _mm512_castsi256_si512
197 				(_mm512_extracti64x4_epi64(iova_addrs, 0));
198 		const __m512i iovas1 = _mm512_castsi256_si512
199 				(_mm512_extracti64x4_epi64(iova_addrs, 1));
200 
201 		/* The permute leaves desc 2-3 addresses in header address slots 0-1,
202 		 * but these are ignored by the driver since header split is not
203 		 * enabled. Similarly for desc 6 & 7.
204 		 */
205 		const __m512i desc0_1 = _mm512_permutexvar_epi64
206 				(permute_idx,
207 				 iovas0);
208 		const __m512i desc2_3 = _mm512_bsrli_epi128(desc0_1, 8);
209 
210 		const __m512i desc4_5 = _mm512_permutexvar_epi64
211 				(permute_idx,
212 				 iovas1);
213 		const __m512i desc6_7 = _mm512_bsrli_epi128(desc4_5, 8);
214 
215 		_mm512_storeu_si512(RTE_CAST_PTR(void *, rxdp), desc0_1);
216 		_mm512_storeu_si512(RTE_CAST_PTR(void *, (rxdp + 2)), desc2_3);
217 		_mm512_storeu_si512(RTE_CAST_PTR(void *, (rxdp + 4)), desc4_5);
218 		_mm512_storeu_si512(RTE_CAST_PTR(void *, (rxdp + 6)), desc6_7);
219 
220 		rxp += IDPF_DESCS_PER_LOOP_AVX;
221 		rxdp += IDPF_DESCS_PER_LOOP_AVX;
222 		cache->len -= IDPF_DESCS_PER_LOOP_AVX;
223 	}
224 
225 	rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
226 	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
227 		rxq->rxrearm_start = 0;
228 
229 	rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
230 
231 	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
232 			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
233 
234 	/* Update the tail pointer on the NIC */
235 	IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
236 }
237 
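/* Blend mask used in the singleq receive loop below: bits 7, 15, 23 and 31
 * select the top 16-bit word of each 128-bit descriptor, so the blend takes
 * the PKTLEN_SHIFT-adjusted packet length for those lanes.
 */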
238 #define IDPF_RX_LEN_MASK 0x80808080
239 static __rte_always_inline uint16_t
240 _idpf_singleq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq,
241 				   struct rte_mbuf **rx_pkts,
242 				   uint16_t nb_pkts)
243 {
244 	const uint32_t *type_table = rxq->adapter->ptype_tbl;
245 
246 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
247 						    rxq->mbuf_initializer);
248 	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
249 	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
250 
251 	rxdp += rxq->rx_tail;
252 
253 	rte_prefetch0(rxdp);
254 
255 	/* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
256 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);
257 
258 	/* See if we need to rearm the RX queue - gives the prefetch a bit
259 	 * of time to act
260 	 */
261 	if (rxq->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
262 		idpf_singleq_rearm(rxq);
263 
264 	/* Before we start moving massive data around, check to see if
265 	 * there is actually a packet available
266 	 */
267 	if ((rxdp->flex_nic_wb.status_error0  &
268 	      rte_cpu_to_le_32(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_DD_S)) == 0)
269 		return 0;
270 
271 	/* 8 packets DD mask, LSB in each 32-bit value */
272 	const __m256i dd_check = _mm256_set1_epi32(1);
273 
274 	/* mask to shuffle from desc. to mbuf (4 descriptors)*/
275 	const __m512i shuf_msk =
276 		_mm512_set_epi32
277 			(/* 1st descriptor */
278 			 0xFFFFFFFF,    /* rss set as unknown */
279 			 0xFFFF0504,    /* vlan_macip set as unknown */
280 					/* octet 15~14, 16 bits data_len */
281 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
282 					/* octet 15~14, low 16 bits pkt_len */
283 			 0xFFFFFFFF,    /* pkt_type set as unknown */
284 			 /* 2nd descriptor */
285 			 0xFFFFFFFF,    /* rss set as unknown */
286 			 0xFFFF0504,    /* vlan_macip set as unknown */
287 					/* octet 15~14, 16 bits data_len */
288 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
289 					/* octet 15~14, low 16 bits pkt_len */
290 			 0xFFFFFFFF,    /* pkt_type set as unknown */
291 			 /* 3rd descriptor */
292 			 0xFFFFFFFF,    /* rss set as unknown */
293 			 0xFFFF0504,    /* vlan_macip set as unknown */
294 					/* octet 15~14, 16 bits data_len */
295 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
296 					/* octet 15~14, low 16 bits pkt_len */
297 			 0xFFFFFFFF,    /* pkt_type set as unknown */
298 			 /* 4th descriptor */
299 			 0xFFFFFFFF,    /* rss set as unknown */
300 			 0xFFFF0504,    /* vlan_macip set as unknown */
301 					/* octet 15~14, 16 bits data_len */
302 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
303 					/* octet 15~14, low 16 bits pkt_len */
304 			 0xFFFFFFFF     /* pkt_type set as unknown */
305 			);
306 	/**
307 	 * compile-time check the shuffle layout is correct.
308 	 * NOTE: the first field (lowest address) is given last in set_epi
309 	 * calls above.
310 	 */
311 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
312 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
313 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
314 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
315 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
316 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
317 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
318 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
319 
320 	uint16_t i, received;
321 
322 	for (i = 0, received = 0; i < nb_pkts;
323 	     i += IDPF_DESCS_PER_LOOP_AVX,
324 	     rxdp += IDPF_DESCS_PER_LOOP_AVX) {
325 		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
326 		_mm256_storeu_si256((void *)&rx_pkts[i],
327 				    _mm256_loadu_si256((void *)&sw_ring[i]));
328 #ifdef RTE_ARCH_X86_64
329 		_mm256_storeu_si256
330 			((void *)&rx_pkts[i + 4],
331 			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
332 #endif
333 
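		/* Load the eight descriptors from the highest index down to 0,
		 * with compiler barriers in between so the loads are not
		 * reordered; descriptors written back later by hardware are
		 * read first, which keeps the DD-based burst count consistent.
		 */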
334 		__m512i raw_desc0_3, raw_desc4_7;
335 		const __m128i raw_desc7 =
336 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7));
337 		rte_compiler_barrier();
338 		const __m128i raw_desc6 =
339 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6));
340 		rte_compiler_barrier();
341 		const __m128i raw_desc5 =
342 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5));
343 		rte_compiler_barrier();
344 		const __m128i raw_desc4 =
345 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4));
346 		rte_compiler_barrier();
347 		const __m128i raw_desc3 =
348 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
349 		rte_compiler_barrier();
350 		const __m128i raw_desc2 =
351 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
352 		rte_compiler_barrier();
353 		const __m128i raw_desc1 =
354 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
355 		rte_compiler_barrier();
356 		const __m128i raw_desc0 =
357 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0));
358 
359 		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
360 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
361 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
362 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
363 		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
364 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
365 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
366 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
367 
368 		/**
369 		 * convert descriptors 4-7 into mbufs, adjusting length and
370 		 * re-arranging fields. Then write into the mbuf
371 		 */
372 		const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7,
373 							 PKTLEN_SHIFT);
374 		const __m512i desc4_7 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK,
375 								raw_desc4_7,
376 								len4_7);
377 		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
378 
379 		/**
380 		 * to get packet types, shift 64-bit values down 16 bits
381 		 * so that the ptype is in the lower 8 bits of each
382 		 */
383 		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16);
384 		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
385 		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
386 		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16);
387 		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0);
388 		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16);
389 		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0);
390 
391 		const __m512i ptype4_7 = _mm512_set_epi32
392 			(0, 0, 0, type_table[ptype7],
393 			 0, 0, 0, type_table[ptype6],
394 			 0, 0, 0, type_table[ptype5],
395 			 0, 0, 0, type_table[ptype4]);
396 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
397 
398 		/**
399 		 * convert descriptors 0-3 into mbufs, adjusting length and
400 		 * re-arranging fields. Then write into the mbuf
401 		 */
402 		const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3,
403 							 PKTLEN_SHIFT);
404 		const __m512i desc0_3 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK,
405 								raw_desc0_3,
406 								len0_3);
407 		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
408 
409 		/* get the packet types */
410 		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16);
411 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
412 		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
413 		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16);
414 		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0);
415 		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16);
416 		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0);
417 
418 		const __m512i ptype0_3 = _mm512_set_epi32
419 			(0, 0, 0, type_table[ptype3],
420 			 0, 0, 0, type_table[ptype2],
421 			 0, 0, 0, type_table[ptype1],
422 			 0, 0, 0, type_table[ptype0]);
423 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
424 
425 		/**
426 		 * use permute/extract to get status content
427 		 * After the operations, the packet status flags are in the
428 		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
429 		 */
430 		/* merge the status bits into one register */
431 		const __m512i status_permute_msk = _mm512_set_epi32
432 			(0, 0, 0, 0,
433 			 0, 0, 0, 0,
434 			 22, 30, 6, 14,
435 			 18, 26, 2, 10);
436 		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
437 			(raw_desc4_7, status_permute_msk, raw_desc0_3);
438 		__m256i status0_7 = _mm512_extracti64x4_epi64
439 			(raw_status0_7, 0);
440 
441 		/* now do flag manipulation */
442 
443 		/**
444 		 * At this point, we have the 8 sets of flags in the low 16-bits
445 		 * of each 32-bit value.
446 		 * We want to extract these, and merge them with the mbuf init
447 		 * data so we can do a single write to the mbuf to set the flags
448 		 * and all the other initialization fields. Extracting the
449 		 * appropriate flags means that we have to do a shift and blend
450 		 * for each mbuf before we do the write. However, we can also
451 		 * add in the previously computed rx_descriptor fields to
452 		 * make a single 256-bit write per mbuf
453 		 */
454 		/* check the structure matches expectations */
455 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
456 				 offsetof(struct rte_mbuf, rearm_data) + 8);
457 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
458 				 RTE_ALIGN(offsetof(struct rte_mbuf,
459 						    rearm_data),
460 						    16));
461 		/* build up data and do writes */
462 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
463 			rearm6, rearm7;
464 		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
465 		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
466 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
467 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
468 
469 		rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
470 		rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
471 		rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
472 		rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
473 
474 		/* write to mbuf */
475 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
476 				    rearm6);
477 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
478 				    rearm4);
479 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
480 				    rearm2);
481 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
482 				    rearm0);
483 
484 		rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
485 		rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
486 		rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
487 		rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
488 
489 		/* again write to mbufs */
490 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
491 				    rearm7);
492 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
493 				    rearm5);
494 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
495 				    rearm3);
496 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
497 				    rearm1);
498 
499 		/* perform dd_check */
500 		status0_7 = _mm256_and_si256(status0_7, dd_check);
501 		status0_7 = _mm256_packs_epi32(status0_7,
502 					       _mm256_setzero_si256());
503 
504 		uint64_t burst = rte_popcount64
505 					(_mm_cvtsi128_si64
506 						(_mm256_extracti128_si256
507 							(status0_7, 1)));
508 		burst += rte_popcount64
509 				(_mm_cvtsi128_si64
510 					(_mm256_castsi256_si128(status0_7)));
511 		received += burst;
512 		if (burst != IDPF_DESCS_PER_LOOP_AVX)
513 			break;
514 	}
515 
516 	/* update tail pointers */
517 	rxq->rx_tail += received;
518 	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
519 	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
520 		rxq->rx_tail--;
521 		received--;
522 	}
523 	rxq->rxrearm_nb += received;
524 	return received;
525 }
526 
527 /**
528  * Notice:
529  * - nb_pkts < IDPF_DESCS_PER_LOOP_AVX, just return no packet
530  */
531 uint16_t
532 idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
533 				 uint16_t nb_pkts)
534 {
535 	return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
536 }
537 
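/* Scalar rearm path for the split-queue buffer ring: each buffer descriptor's
 * pkt_addr is filled individually with buf_iova + RTE_PKTMBUF_HEADROOM.
 */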
538 static __rte_always_inline void
539 idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
540 {
541 	struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
542 	volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
543 	uint16_t rx_id;
544 	int i;
545 
546 	rxdp += rx_bufq->rxrearm_start;
547 
548 	/* Pull 'n' more MBUFs into the software ring */
549 	if (rte_mempool_get_bulk(rx_bufq->mp,
550 				 (void *)rxp,
551 				 IDPF_RXQ_REARM_THRESH) < 0) {
552 		if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
553 		    rx_bufq->nb_rx_desc) {
554 			__m128i dma_addr0;
555 
556 			dma_addr0 = _mm_setzero_si128();
557 			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
558 				rxp[i] = &rx_bufq->fake_mbuf;
559 				_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
560 						dma_addr0);
561 			}
562 		}
563 		rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
564 				   IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
565 		return;
566 	}
567 
568 	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
569 	for (i = 0; i < IDPF_RXQ_REARM_THRESH;
570 			i += 8, rxp += 8, rxdp += 8) {
571 		rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
572 		rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
573 		rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
574 		rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
575 		rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
576 		rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
577 		rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
578 		rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
579 	}
580 
581 	rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
582 	if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
583 		rx_bufq->rxrearm_start = 0;
584 
585 	rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
586 
587 	rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
588 			     (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
589 
590 	/* Update the tail pointer on the NIC */
591 	IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
592 }
593 
594 static __rte_always_inline void
595 idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
596 {
597 	int i;
598 	uint16_t rx_id;
599 	volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
600 	struct rte_mempool_cache *cache =
601 		rte_mempool_default_cache(rx_bufq->mp, rte_lcore_id());
602 	struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
603 
604 	rxdp += rx_bufq->rxrearm_start;
605 
606 	if (unlikely(!cache))
607 		return idpf_splitq_rearm_common(rx_bufq);
608 
609 	/* We need to pull 'n' more MBUFs into the software ring from the mempool.
610 	 * We inline the mempool function here so we can vectorize the copy
611 	 * from the cache into the shadow ring.
612 	 */
613 
614 	/* Can this be satisfied from the cache? */
615 	if (cache->len < IDPF_RXQ_REARM_THRESH) {
616 		/* No. Backfill the cache first, and then fill from it */
617 		uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
618 							cache->len);
619 
620 		/* How many do we require, i.e. enough to refill the cache plus the request */
621 		int ret = rte_mempool_ops_dequeue_bulk
622 				(rx_bufq->mp, &cache->objs[cache->len], req);
623 		if (ret == 0) {
624 			cache->len += req;
625 		} else {
626 			if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
627 			    rx_bufq->nb_rx_desc) {
628 				__m128i dma_addr0;
629 
630 				dma_addr0 = _mm_setzero_si128();
631 				for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
632 					rxp[i] = &rx_bufq->fake_mbuf;
633 					_mm_storeu_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]),
634 							 dma_addr0);
635 				}
636 			}
637 			rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
638 					   IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
639 			return;
640 		}
641 	}
642 
643 	const __m512i iova_offsets =  _mm512_set1_epi64(offsetof
644 							(struct rte_mbuf, buf_iova));
645 	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
646 
647 	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
648 	 * from mempool cache and populating both shadow and HW rings
649 	 */
650 	for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) {
651 		const __m512i mbuf_ptrs = _mm512_loadu_si512
652 			(&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]);
653 		_mm512_storeu_si512(rxp, mbuf_ptrs);
654 
655 		const __m512i iova_base_addrs = _mm512_i64gather_epi64
656 				(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
657 				 0, /* base */
658 				 1  /* scale */);
659 		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
660 				headroom);
661 
662 		const __m512i iova_addrs_1 = _mm512_bsrli_epi128(iova_addrs, 8);
663 
664 		rxdp[0].split_rd.pkt_addr =
665 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 0));
666 		rxdp[1].split_rd.pkt_addr =
667 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 0));
668 		rxdp[2].split_rd.pkt_addr =
669 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 1));
670 		rxdp[3].split_rd.pkt_addr =
671 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 1));
672 		rxdp[4].split_rd.pkt_addr =
673 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 2));
674 		rxdp[5].split_rd.pkt_addr =
675 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 2));
676 		rxdp[6].split_rd.pkt_addr =
677 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 3));
678 		rxdp[7].split_rd.pkt_addr =
679 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 3));
680 
681 		rxp += IDPF_DESCS_PER_LOOP_AVX;
682 		rxdp += IDPF_DESCS_PER_LOOP_AVX;
683 		cache->len -= IDPF_DESCS_PER_LOOP_AVX;
684 	}
685 
686 	rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
687 	if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
688 		rx_bufq->rxrearm_start = 0;
689 
690 	rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
691 
692 	rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
693 			   (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
694 
695 	/* Update the tail pointer on the NIC */
696 	IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
697 }
698 
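/* Split-queue receive: mbufs come from buffer queue 2's shadow ring, and a
 * descriptor is only counted as done when both its DD bit and its generation
 * bit match what the queue expects.
 */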
699 static __rte_always_inline uint16_t
700 _idpf_splitq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq,
701 				  struct rte_mbuf **rx_pkts,
702 				  uint16_t nb_pkts)
703 {
704 	const uint32_t *type_table = rxq->adapter->ptype_tbl;
705 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
706 						    rxq->bufq2->mbuf_initializer);
707 	/* only handle bufq2 here */
708 	struct rte_mbuf **sw_ring = &rxq->bufq2->sw_ring[rxq->rx_tail];
709 	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
710 
711 	rxdp += rxq->rx_tail;
712 
713 	rte_prefetch0(rxdp);
714 
715 	/* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
716 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);
717 
718 	/* See if we need to rearm the RX queue - gives the prefetch a bit
719 	 * of time to act
720 	 */
721 	if (rxq->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
722 		idpf_splitq_rearm(rxq->bufq2);
723 
724 	/* Before we start moving massive data around, check to see if
725 	 * there is actually a packet available
726 	 */
727 	if (((rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id &
728 	      VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) >>
729 	     VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) != rxq->expected_gen_id)
730 		return 0;
731 
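	/* dd_check isolates the DD bit in the status quadword of each
	 * descriptor; gen_check isolates the generation bit (bit 46 of the
	 * quadword holding pktlen_gen_bufq_id), which is compared against
	 * expected_gen_id after the descriptors are gathered below.
	 */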
732 	const __m512i dd_check = _mm512_set1_epi64(1);
733 	const __m512i gen_check = _mm512_set1_epi64((uint64_t)1<<46);
734 
735 	/* mask to shuffle from desc. to mbuf (4 descriptors)*/
736 	const __m512i shuf_msk =
737 		_mm512_set_epi32
738 			(/* 1st descriptor */
739 			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
740 			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
741 					/* octet 15~14, 16 bits data_len */
742 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
743 					/* octet 15~14, low 16 bits pkt_len */
744 			 0xFFFFFFFF,    /* pkt_type set as unknown */
745 			 /* 2nd descriptor */
746 			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
747 			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
748 					/* octet 15~14, 16 bits data_len */
749 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
750 					/* octet 15~14, low 16 bits pkt_len */
751 			 0xFFFFFFFF,    /* pkt_type set as unknown */
752 			 /* 3rd descriptor */
753 			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
754 			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
755 					/* octet 15~14, 16 bits data_len */
756 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
757 					/* octet 15~14, low 16 bits pkt_len */
758 			 0xFFFFFFFF,    /* pkt_type set as unknown */
759 			 /* 4th descriptor */
760 			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
761 			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
762 					/* octet 15~14, 16 bits data_len */
763 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
764 					/* octet 15~14, low 16 bits pkt_len */
765 			 0xFFFFFFFF     /* pkt_type set as unknown */
766 			);
767 	/**
768 	 * compile-time check the above shuffle layout is correct.
769 	 * NOTE: the first field (lowest address) is given last in set_epi
770 	 * calls above.
771 	 */
772 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
773 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
774 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
775 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
776 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
777 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
778 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
779 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
780 
781 	uint16_t i, received;
782 
783 	for (i = 0, received = 0; i < nb_pkts;
784 	     i += IDPF_DESCS_PER_LOOP_AVX,
785 	     rxdp += IDPF_DESCS_PER_LOOP_AVX) {
786 		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
787 		_mm256_storeu_si256((void *)&rx_pkts[i],
788 				    _mm256_loadu_si256((void *)&sw_ring[i]));
789 #ifdef RTE_ARCH_X86_64
790 		_mm256_storeu_si256
791 			((void *)&rx_pkts[i + 4],
792 			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
793 #endif
794 
795 		__m512i raw_desc0_3, raw_desc4_7;
796 		const __m128i raw_desc7 =
797 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7));
798 		rte_compiler_barrier();
799 		const __m128i raw_desc6 =
800 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6));
801 		rte_compiler_barrier();
802 		const __m128i raw_desc5 =
803 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5));
804 		rte_compiler_barrier();
805 		const __m128i raw_desc4 =
806 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4));
807 		rte_compiler_barrier();
808 		const __m128i raw_desc3 =
809 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
810 		rte_compiler_barrier();
811 		const __m128i raw_desc2 =
812 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
813 		rte_compiler_barrier();
814 		const __m128i raw_desc1 =
815 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
816 		rte_compiler_barrier();
817 		const __m128i raw_desc0 =
818 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0));
819 
820 		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
821 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
822 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
823 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
824 		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
825 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
826 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
827 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
828 
829 		/**
830 		 * convert descriptors 4-7 into mbufs, adjusting length and
831 		 * re-arranging fields. Then write into the mbuf
832 		 */
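		/* len_mask clears the generation and buffer-queue-id bits that
		 * share a 16-bit word with the 14-bit packet length, so only
		 * the length survives into the shuffle below.
		 */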
833 		const __m512i len_mask = _mm512_set_epi32(0xffffffff, 0xffffffff,
834 							  0xffff3fff, 0xffffffff,
835 							  0xffffffff, 0xffffffff,
836 							  0xffff3fff, 0xffffffff,
837 							  0xffffffff, 0xffffffff,
838 							  0xffff3fff, 0xffffffff,
839 							  0xffffffff, 0xffffffff,
840 							  0xffff3fff, 0xffffffff);
841 		const __m512i desc4_7 = _mm512_and_epi32(raw_desc4_7, len_mask);
842 		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
843 
844 		/**
845 		 * to get packet types, shift 64-bit values down 16 bits
846 		 * so that the ptype is in the lower 8 bits of each
847 		 */
848 		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16);
849 		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
850 		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
851 		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16);
852 		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0);
853 		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16);
854 		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0);
855 
856 		const __m512i ptype4_7 = _mm512_set_epi32
857 			(0, 0, 0, type_table[ptype7],
858 			 0, 0, 0, type_table[ptype6],
859 			 0, 0, 0, type_table[ptype5],
860 			 0, 0, 0, type_table[ptype4]);
861 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
862 
863 		/**
864 		 * convert descriptors 0-3 into mbufs, adjusting length and
865 		 * re-arranging fields. Then write into the mbuf
866 		 */
867 		const __m512i desc0_3 = _mm512_and_epi32(raw_desc0_3, len_mask);
868 		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
869 
870 		/* get the packet types */
871 		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16);
872 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
873 		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
874 		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16);
875 		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0);
876 		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16);
877 		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0);
878 
879 		const __m512i ptype0_3 = _mm512_set_epi32
880 			(0, 0, 0, type_table[ptype3],
881 			 0, 0, 0, type_table[ptype2],
882 			 0, 0, 0, type_table[ptype1],
883 			 0, 0, 0, type_table[ptype0]);
884 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
885 
886 		/**
887 		 * use permute/extract to get status and generation bit content
888 		 * After the operations, the packet status flags are in the
889 		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
890 		 */
891 
892 		const __m512i dd_permute_msk = _mm512_set_epi64
893 			(11, 15, 3, 7, 9, 13, 1, 5);
894 		const __m512i status0_7 = _mm512_permutex2var_epi64
895 			(raw_desc4_7, dd_permute_msk, raw_desc0_3);
896 		const __m512i gen_permute_msk = _mm512_set_epi64
897 			(10, 14, 2, 6, 8, 12, 0, 4);
898 		const __m512i raw_gen0_7 = _mm512_permutex2var_epi64
899 			(raw_desc4_7, gen_permute_msk, raw_desc0_3);
900 
901 		/* now do flag manipulation */
902 
903 		/**
904 		 * At this point, we have the 8 sets of flags in the low 16-bits
905 		 * of each 32-bit value.
906 		 * We want to extract these, and merge them with the mbuf init
907 		 * data so we can do a single write to the mbuf to set the flags
908 		 * and all the other initialization fields. Extracting the
909 		 * appropriate flags means that we have to do a shift and blend
910 		 * for each mbuf before we do the write. However, we can also
911 		 * add in the previously computed rx_descriptor fields to
912 		 * make a single 256-bit write per mbuf
913 		 */
914 		/* check the structure matches expectations */
915 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
916 				 offsetof(struct rte_mbuf, rearm_data) + 8);
917 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
918 				 RTE_ALIGN(offsetof(struct rte_mbuf,
919 						    rearm_data),
920 						    16));
921 		/* build up data and do writes */
922 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
923 			rearm6, rearm7;
924 		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
925 		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
926 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
927 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
928 
929 		rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
930 		rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
931 		rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
932 		rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
933 
934 		/* write to mbuf */
935 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
936 				    rearm6);
937 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
938 				    rearm4);
939 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
940 				    rearm2);
941 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
942 				    rearm0);
943 
944 		rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
945 		rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
946 		rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
947 		rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
948 
949 		/* again write to mbufs */
950 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
951 				    rearm7);
952 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
953 				    rearm5);
954 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
955 				    rearm3);
956 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
957 				    rearm1);
958 
959 		const __mmask8 dd_mask = _mm512_cmpeq_epi64_mask(
960 			_mm512_and_epi64(status0_7, dd_check), dd_check);
961 		const __mmask8 gen_mask = _mm512_cmpeq_epi64_mask(
962 			_mm512_and_epi64(raw_gen0_7, gen_check),
963 			_mm512_set1_epi64((uint64_t)rxq->expected_gen_id << 46));
964 		const __mmask8 recv_mask = _kand_mask8(dd_mask, gen_mask);
965 		uint16_t burst = rte_popcount32(_cvtmask8_u32(recv_mask));
966 
967 		received += burst;
968 		if (burst != IDPF_DESCS_PER_LOOP_AVX)
969 			break;
970 	}
971 
972 	/* update tail pointers */
973 	rxq->rx_tail += received;
974 	rxq->expected_gen_id ^= ((rxq->rx_tail & rxq->nb_rx_desc) != 0);
975 	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
976 	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
977 		rxq->rx_tail--;
978 		received--;
979 	}
980 
981 	rxq->bufq2->rxrearm_nb += received;
982 	return received;
983 }
984 
985 /* only bufq2 can receive pkts */
986 uint16_t
987 idpf_dp_splitq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
988 			     uint16_t nb_pkts)
989 {
990 	return _idpf_splitq_recv_raw_pkts_avx512(rx_queue, rx_pkts,
991 						 nb_pkts);
992 }
993 
994 static __rte_always_inline int
995 idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
996 {
997 	struct idpf_tx_vec_entry *txep;
998 	uint32_t n;
999 	uint32_t i;
1000 	int nb_free = 0;
1001 	struct rte_mbuf *m, *free[txq->rs_thresh];
1002 
1003 	/* check DD bits on threshold descriptor */
1004 	if ((txq->tx_ring[txq->next_dd].qw1 &
1005 			rte_cpu_to_le_64(IDPF_TXD_QW1_DTYPE_M)) !=
1006 			rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE))
1007 		return 0;
1008 
1009 	n = txq->rs_thresh;
1010 
1011 	 /* first buffer to free from S/W ring is at index
1012 	  * tx_next_dd - (tx_rs_thresh-1)
1013 	  */
1014 	txep = (void *)txq->sw_ring;
1015 	txep += txq->next_dd - (n - 1);
1016 
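	/* Fast-free path: with IDPF_TX_OFFLOAD_MBUF_FAST_FREE all mbufs come
	 * from the same mempool and need no per-segment prefree handling, so
	 * they are returned to the lcore cache in bulk using 512-bit copies,
	 * 32 software-ring entries at a time.
	 */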
1017 	if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
1018 		struct rte_mempool *mp = txep[0].mbuf->pool;
1019 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
1020 								rte_lcore_id());
1021 		void **cache_objs;
1022 
1023 		if (cache == NULL || cache->len == 0)
1024 			goto normal;
1025 
1026 		cache_objs = &cache->objs[cache->len];
1027 
1028 		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
1029 			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
1030 			goto done;
1031 		}
1032 
1033 		/* The cache follows this algorithm:
1034 		 *   1. Add the objects to the cache.
1035 		 *   2. If the cache length then crosses the flush threshold,
1036 		 *   everything above the nominal cache size is flushed back to the ring.
1037 		 */
1038 		/* Add elements back into the cache */
1039 		uint32_t copied = 0;
1040 		/* n is multiple of 32 */
1041 		while (copied < n) {
1042 #ifdef RTE_ARCH_64
1043 			const __m512i a = _mm512_loadu_si512(&txep[copied]);
1044 			const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
1045 			const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
1046 			const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);
1047 
1048 			_mm512_storeu_si512(&cache_objs[copied], a);
1049 			_mm512_storeu_si512(&cache_objs[copied + 8], b);
1050 			_mm512_storeu_si512(&cache_objs[copied + 16], c);
1051 			_mm512_storeu_si512(&cache_objs[copied + 24], d);
1052 #else
1053 			const __m512i a = _mm512_loadu_si512(&txep[copied]);
1054 			const __m512i b = _mm512_loadu_si512(&txep[copied + 16]);
1055 			_mm512_storeu_si512(&cache_objs[copied], a);
1056 			_mm512_storeu_si512(&cache_objs[copied + 16], b);
1057 #endif
1058 			copied += 32;
1059 		}
1060 		cache->len += n;
1061 
1062 		if (cache->len >= cache->flushthresh) {
1063 			rte_mempool_ops_enqueue_bulk(mp,
1064 						     &cache->objs[cache->size],
1065 						     cache->len - cache->size);
1066 			cache->len = cache->size;
1067 		}
1068 		goto done;
1069 	}
1070 
1071 normal:
1072 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
1073 	if (likely(m != NULL)) {
1074 		free[0] = m;
1075 		nb_free = 1;
1076 		for (i = 1; i < n; i++) {
1077 			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
1078 			if (likely(m != NULL)) {
1079 				if (likely(m->pool == free[0]->pool)) {
1080 					free[nb_free++] = m;
1081 				} else {
1082 					rte_mempool_put_bulk(free[0]->pool,
1083 							     (void *)free,
1084 							     nb_free);
1085 					free[0] = m;
1086 					nb_free = 1;
1087 				}
1088 			}
1089 		}
1090 		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
1091 	} else {
1092 		for (i = 1; i < n; i++) {
1093 			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
1094 			if (m != NULL)
1095 				rte_mempool_put(m->pool, m);
1096 		}
1097 	}
1098 
1099 done:
1100 	/* buffers were freed, update counters */
1101 	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
1102 	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
1103 	if (txq->next_dd >= txq->nb_tx_desc)
1104 		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);
1105 
1106 	return txq->rs_thresh;
1107 }
1108 
1109 static __rte_always_inline void
1110 tx_backlog_entry_avx512(struct idpf_tx_vec_entry *txep,
1111 			struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
1112 {
1113 	int i;
1114 
1115 	for (i = 0; i < (int)nb_pkts; ++i)
1116 		txep[i].mbuf = tx_pkts[i];
1117 }
1118 
1119 static __rte_always_inline void
1120 idpf_singleq_vtx1(volatile struct idpf_base_tx_desc *txdp,
1121 	  struct rte_mbuf *pkt, uint64_t flags)
1122 {
1123 	uint64_t high_qw =
1124 		(IDPF_TX_DESC_DTYPE_DATA |
1125 		 ((uint64_t)flags  << IDPF_TXD_QW1_CMD_S) |
1126 		 ((uint64_t)pkt->data_len << IDPF_TXD_QW1_TX_BUF_SZ_S));
1127 
1128 	__m128i descriptor = _mm_set_epi64x(high_qw,
1129 					    pkt->buf_iova + pkt->data_off);
1130 	_mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
1131 }
1132 
1133 #define IDPF_TX_LEN_MASK 0xAA
1134 #define IDPF_TX_OFF_MASK 0x55
1135 static __rte_always_inline void
1136 idpf_singleq_vtx(volatile struct idpf_base_tx_desc *txdp,
1137 	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
1138 {
1139 	const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_DATA  |
1140 			((uint64_t)flags  << IDPF_TXD_QW1_CMD_S));
1141 
1142 	/* if unaligned on a 32-byte boundary, do one to align */
1143 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
1144 		idpf_singleq_vtx1(txdp, *pkt, flags);
1145 		nb_pkts--, txdp++, pkt++;
1146 	}
1147 
1148 	/* do 4 at a time while possible, in bursts */
1149 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
1150 		uint64_t hi_qw3 =
1151 			hi_qw_tmpl |
1152 			((uint64_t)pkt[3]->data_len <<
1153 			 IDPF_TXD_QW1_TX_BUF_SZ_S);
1154 		uint64_t hi_qw2 =
1155 			hi_qw_tmpl |
1156 			((uint64_t)pkt[2]->data_len <<
1157 			 IDPF_TXD_QW1_TX_BUF_SZ_S);
1158 		uint64_t hi_qw1 =
1159 			hi_qw_tmpl |
1160 			((uint64_t)pkt[1]->data_len <<
1161 			 IDPF_TXD_QW1_TX_BUF_SZ_S);
1162 		uint64_t hi_qw0 =
1163 			hi_qw_tmpl |
1164 			((uint64_t)pkt[0]->data_len <<
1165 			 IDPF_TXD_QW1_TX_BUF_SZ_S);
1166 
1167 		__m512i desc0_3 =
1168 			_mm512_set_epi64
1169 				(hi_qw3,
1170 				 pkt[3]->buf_iova + pkt[3]->data_off,
1171 				 hi_qw2,
1172 				 pkt[2]->buf_iova + pkt[2]->data_off,
1173 				 hi_qw1,
1174 				 pkt[1]->buf_iova + pkt[1]->data_off,
1175 				 hi_qw0,
1176 				 pkt[0]->buf_iova + pkt[0]->data_off);
1177 		_mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3);
1178 	}
1179 
1180 	/* do any last ones */
1181 	while (nb_pkts) {
1182 		idpf_singleq_vtx1(txdp, *pkt, flags);
1183 		txdp++, pkt++, nb_pkts--;
1184 	}
1185 }
1186 
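/* Fixed-burst transmit: descriptors are written four at a time with 512-bit
 * stores. If the burst would run past the end of the ring, the first part is
 * programmed up to the wrap point, RS is set on the last descriptor before
 * the wrap, and the remainder restarts from index 0.
 */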
1187 static __rte_always_inline uint16_t
1188 idpf_singleq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1189 					 uint16_t nb_pkts)
1190 {
1191 	struct idpf_tx_queue *txq = tx_queue;
1192 	volatile struct idpf_base_tx_desc *txdp;
1193 	struct idpf_tx_vec_entry *txep;
1194 	uint16_t n, nb_commit, tx_id;
1195 	uint64_t flags = IDPF_TX_DESC_CMD_EOP;
1196 	uint64_t rs = IDPF_TX_DESC_CMD_RS | flags;
1197 
1198 	/* crossing the rs_thresh boundary is not allowed */
1199 	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
1200 
1201 	if (txq->nb_free < txq->free_thresh)
1202 		idpf_tx_singleq_free_bufs_avx512(txq);
1203 
1204 	nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
1205 	nb_commit = nb_pkts;
1206 	if (unlikely(nb_pkts == 0))
1207 		return 0;
1208 
1209 	tx_id = txq->tx_tail;
1210 	txdp = &txq->tx_ring[tx_id];
1211 	txep = (void *)txq->sw_ring;
1212 	txep += tx_id;
1213 
1214 	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
1215 
1216 	n = (uint16_t)(txq->nb_tx_desc - tx_id);
1217 	if (nb_commit >= n) {
1218 		tx_backlog_entry_avx512(txep, tx_pkts, n);
1219 
1220 		idpf_singleq_vtx(txdp, tx_pkts, n - 1, flags);
1221 		tx_pkts += (n - 1);
1222 		txdp += (n - 1);
1223 
1224 		idpf_singleq_vtx1(txdp, *tx_pkts++, rs);
1225 
1226 		nb_commit = (uint16_t)(nb_commit - n);
1227 
1228 		tx_id = 0;
1229 		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
1230 
1231 		/* avoid reaching the end of the ring */
1232 		txdp = &txq->tx_ring[tx_id];
1233 		txep = (void *)txq->sw_ring;
1234 		txep += tx_id;
1235 	}
1236 
1237 	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
1238 
1239 	idpf_singleq_vtx(txdp, tx_pkts, nb_commit, flags);
1240 
1241 	tx_id = (uint16_t)(tx_id + nb_commit);
1242 	if (tx_id > txq->next_rs) {
1243 		txq->tx_ring[txq->next_rs].qw1 |=
1244 			rte_cpu_to_le_64(((uint64_t)IDPF_TX_DESC_CMD_RS) <<
1245 					 IDPF_TXD_QW1_CMD_S);
1246 		txq->next_rs =
1247 			(uint16_t)(txq->next_rs + txq->rs_thresh);
1248 	}
1249 
1250 	txq->tx_tail = tx_id;
1251 
1252 	IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
1253 
1254 	return nb_pkts;
1255 }
1256 
1257 static __rte_always_inline uint16_t
1258 idpf_singleq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
1259 			      uint16_t nb_pkts)
1260 {
1261 	uint16_t nb_tx = 0;
1262 	struct idpf_tx_queue *txq = tx_queue;
1263 
1264 	while (nb_pkts) {
1265 		uint16_t ret, num;
1266 
1267 		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
1268 		ret = idpf_singleq_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
1269 						       num);
1270 		nb_tx += ret;
1271 		nb_pkts -= ret;
1272 		if (ret < num)
1273 			break;
1274 	}
1275 
1276 	return nb_tx;
1277 }
1278 
1279 uint16_t
1280 idpf_dp_singleq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1281 				 uint16_t nb_pkts)
1282 {
1283 	return idpf_singleq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts);
1284 }
1285 
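/* Walk the Tx completion ring: process at most IDPD_TXQ_SCAN_CQ_THRESH
 * entries, stop as soon as an entry's generation bit no longer matches
 * expected_gen_id, and credit each completion to its owning txq by
 * completion type.
 */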
1286 static __rte_always_inline void
1287 idpf_splitq_scan_cq_ring(struct idpf_tx_queue *cq)
1288 {
1289 	struct idpf_splitq_tx_compl_desc *compl_ring;
1290 	struct idpf_tx_queue *txq;
1291 	uint16_t genid, txq_qid, cq_qid, i;
1292 	uint8_t ctype;
1293 
1294 	cq_qid = cq->tx_tail;
1295 
1296 	for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
1297 		if (cq_qid == cq->nb_tx_desc) {
1298 			cq_qid = 0;
1299 			cq->expected_gen_id ^= 1;
1300 		}
1301 		compl_ring = &cq->compl_ring[cq_qid];
1302 		genid = (compl_ring->qid_comptype_gen &
1303 			rte_cpu_to_le_64(IDPF_TXD_COMPLQ_GEN_M)) >> IDPF_TXD_COMPLQ_GEN_S;
1304 		if (genid != cq->expected_gen_id)
1305 			break;
1306 		ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
1307 			IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
1308 		txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
1309 			IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
1310 		txq = cq->txqs[txq_qid - cq->tx_start_qid];
1311 		txq->ctype[ctype]++;
1312 		cq_qid++;
1313 	}
1314 
1315 	cq->tx_tail = cq_qid;
1316 }
1317 
1318 static __rte_always_inline int
1319 idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
1320 {
1321 	struct idpf_tx_vec_entry *txep;
1322 	uint32_t n;
1323 	uint32_t i;
1324 	int nb_free = 0;
1325 	struct rte_mbuf *m, *free[txq->rs_thresh];
1326 
1327 	n = txq->rs_thresh;
1328 
1329 	 /* first buffer to free from S/W ring is at index
1330 	  * tx_next_dd - (tx_rs_thresh-1)
1331 	  */
1332 	txep = (void *)txq->sw_ring;
1333 	txep += txq->next_dd - (n - 1);
1334 
1335 	if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
1336 		struct rte_mempool *mp = txep[0].mbuf->pool;
1337 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
1338 								rte_lcore_id());
1339 		void **cache_objs;
1340 
1341 		if (!cache || cache->len == 0)
1342 			goto normal;
1343 
1344 		cache_objs = &cache->objs[cache->len];
1345 
1346 		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
1347 			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
1348 			goto done;
1349 		}
1350 
1351 		/* The cache follows this algorithm:
1352 		 *   1. Add the objects to the cache.
1353 		 *   2. If the cache length then crosses the flush threshold,
1354 		 *   everything above the nominal cache size is flushed back to the ring.
1355 		 */
1356 		/* Add elements back into the cache */
1357 		uint32_t copied = 0;
1358 		/* n is multiple of 32 */
1359 		while (copied < n) {
1360 			const __m512i a = _mm512_loadu_si512(&txep[copied]);
1361 			const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
1362 			const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
1363 			const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);
1364 
1365 			_mm512_storeu_si512(&cache_objs[copied], a);
1366 			_mm512_storeu_si512(&cache_objs[copied + 8], b);
1367 			_mm512_storeu_si512(&cache_objs[copied + 16], c);
1368 			_mm512_storeu_si512(&cache_objs[copied + 24], d);
1369 			copied += 32;
1370 		}
1371 		cache->len += n;
1372 
1373 		if (cache->len >= cache->flushthresh) {
1374 			rte_mempool_ops_enqueue_bulk(mp,
1375 						     &cache->objs[cache->size],
1376 						     cache->len - cache->size);
1377 			cache->len = cache->size;
1378 		}
1379 		goto done;
1380 	}
1381 
1382 normal:
1383 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
1384 	if (likely(m)) {
1385 		free[0] = m;
1386 		nb_free = 1;
1387 		for (i = 1; i < n; i++) {
1388 			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
1389 			if (likely(m)) {
1390 				if (likely(m->pool == free[0]->pool)) {
1391 					free[nb_free++] = m;
1392 				} else {
1393 					rte_mempool_put_bulk(free[0]->pool,
1394 							     (void *)free,
1395 							     nb_free);
1396 					free[0] = m;
1397 					nb_free = 1;
1398 				}
1399 			}
1400 		}
1401 		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
1402 	} else {
1403 		for (i = 1; i < n; i++) {
1404 			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
1405 			if (m)
1406 				rte_mempool_put(m->pool, m);
1407 		}
1408 	}
1409 
1410 done:
1411 	/* buffers were freed, update counters */
1412 	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
1413 	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
1414 	if (txq->next_dd >= txq->nb_tx_desc)
1415 		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);
1416 	txq->ctype[IDPF_TXD_COMPLT_RS] -= txq->rs_thresh;
1417 
1418 	return txq->rs_thresh;
1419 }
1420 
1421 #define IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S	48
1422 
1423 static __rte_always_inline void
1424 idpf_splitq_vtx1(volatile struct idpf_flex_tx_sched_desc *txdp,
1425 	  struct rte_mbuf *pkt, uint64_t flags)
1426 {
1427 	uint64_t high_qw =
1428 		(IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
1429 		 ((uint64_t)flags) |
1430 		 ((uint64_t)pkt->data_len << IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S));
1431 
1432 	__m128i descriptor = _mm_set_epi64x(high_qw,
1433 					    pkt->buf_iova + pkt->data_off);
1434 	_mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
1435 }
1436 
1437 static __rte_always_inline void
1438 idpf_splitq_vtx(volatile struct idpf_flex_tx_sched_desc *txdp,
1439 	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
1440 {
1441 	const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE  |
1442 			((uint64_t)flags));
1443 
1444 	/* if unaligned on a 32-byte boundary, do one to align */
1445 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
1446 		idpf_splitq_vtx1(txdp, *pkt, flags);
1447 		nb_pkts--, txdp++, pkt++;
1448 	}
1449 
1450 	/* do 4 at a time while possible, in bursts */
1451 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
1452 		uint64_t hi_qw3 =
1453 			hi_qw_tmpl |
1454 			((uint64_t)pkt[3]->data_len <<
1455 			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
1456 		uint64_t hi_qw2 =
1457 			hi_qw_tmpl |
1458 			((uint64_t)pkt[2]->data_len <<
1459 			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
1460 		uint64_t hi_qw1 =
1461 			hi_qw_tmpl |
1462 			((uint64_t)pkt[1]->data_len <<
1463 			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
1464 		uint64_t hi_qw0 =
1465 			hi_qw_tmpl |
1466 			((uint64_t)pkt[0]->data_len <<
1467 			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
1468 
1469 		__m512i desc0_3 =
1470 			_mm512_set_epi64
1471 				(hi_qw3,
1472 				 pkt[3]->buf_iova + pkt[3]->data_off,
1473 				 hi_qw2,
1474 				 pkt[2]->buf_iova + pkt[2]->data_off,
1475 				 hi_qw1,
1476 				 pkt[1]->buf_iova + pkt[1]->data_off,
1477 				 hi_qw0,
1478 				 pkt[0]->buf_iova + pkt[0]->data_off);
1479 		_mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3);
1480 	}
1481 
1482 	/* do any last ones */
1483 	while (nb_pkts) {
1484 		idpf_splitq_vtx1(txdp, *pkt, flags);
1485 		txdp++, pkt++, nb_pkts--;
1486 	}
1487 }
1488 
1489 static __rte_always_inline uint16_t
1490 idpf_splitq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1491 					uint16_t nb_pkts)
1492 {
1493 	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
1494 	volatile struct idpf_flex_tx_sched_desc *txdp;
1495 	struct idpf_tx_vec_entry *txep;
1496 	uint16_t n, nb_commit, tx_id;
1497 	/* bit2 is reserved and must be set to 1 according to Spec */
1498 	uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
1499 
1502 	/* crossing the rs_thresh boundary is not allowed */
1503 	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
1504 
1505 	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
1506 	if (unlikely(nb_pkts == 0))
1507 		return 0;
1508 
1509 	tx_id = txq->tx_tail;
1510 	txdp = &txq->desc_ring[tx_id];
1511 	txep = (void *)txq->sw_ring;
1512 	txep += tx_id;
1513 
1514 	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
1515 
1516 	n = (uint16_t)(txq->nb_tx_desc - tx_id);
1517 	if (nb_commit >= n) {
1518 		tx_backlog_entry_avx512(txep, tx_pkts, n);
1519 
1520 		idpf_splitq_vtx(txdp, tx_pkts, n - 1, cmd_dtype);
1521 		tx_pkts += (n - 1);
1522 		txdp += (n - 1);
1523 
1524 		idpf_splitq_vtx1(txdp, *tx_pkts++, cmd_dtype);
1525 
1526 		nb_commit = (uint16_t)(nb_commit - n);
1527 
1528 		tx_id = 0;
1529 		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
1530 
1531 		/* avoid reaching the end of the ring */
1532 		txdp = &txq->desc_ring[tx_id];
1533 		txep = (void *)txq->sw_ring;
1534 		txep += tx_id;
1535 	}
1536 
1537 	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
1538 
1539 	idpf_splitq_vtx(txdp, tx_pkts, nb_commit, cmd_dtype);
1540 
1541 	tx_id = (uint16_t)(tx_id + nb_commit);
1542 	if (tx_id > txq->next_rs)
1543 		txq->next_rs =
1544 			(uint16_t)(txq->next_rs + txq->rs_thresh);
1545 
1546 	txq->tx_tail = tx_id;
1547 
1548 	IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
1549 
1550 	return nb_pkts;
1551 }
1552 
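/* Common splitq transmit loop: scan the completion queue, free completed
 * buffers once enough RS completions have accumulated, then transmit in
 * bursts of at most rs_thresh packets.
 */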
1553 static __rte_always_inline uint16_t
1554 idpf_splitq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
1555 				     uint16_t nb_pkts)
1556 {
1557 	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
1558 	uint16_t nb_tx = 0;
1559 
1560 	while (nb_pkts) {
1561 		uint16_t ret, num;
1562 
1563 		idpf_splitq_scan_cq_ring(txq->complq);
1564 
1565 		if (txq->ctype[IDPF_TXD_COMPLT_RS] > txq->free_thresh)
1566 			idpf_tx_splitq_free_bufs_avx512(txq);
1567 
1568 		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
1569 		ret = idpf_splitq_xmit_fixed_burst_vec_avx512(tx_queue,
1570 							      &tx_pkts[nb_tx],
1571 							      num);
1572 		nb_tx += ret;
1573 		nb_pkts -= ret;
1574 		if (ret < num)
1575 			break;
1576 	}
1577 
1578 	return nb_tx;
1579 }
1580 
1581 uint16_t
1582 idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1583 				uint16_t nb_pkts)
1584 {
1585 	return idpf_splitq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts);
1586 }
1587 
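/* Free every in-flight mbuf still referenced by the software ring, from the
 * last cleaned position (next_dd - rs_thresh + 1) up to tx_tail, wrapping
 * around the end of the ring when needed.
 */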
1588 static inline void
1589 idpf_tx_release_mbufs_avx512(struct idpf_tx_queue *txq)
1590 {
1591 	unsigned int i;
1592 	const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1);
1593 	struct idpf_tx_vec_entry *swr = (void *)txq->sw_ring;
1594 
1595 	if (txq->sw_ring == NULL || txq->nb_free == max_desc)
1596 		return;
1597 
1598 	i = txq->next_dd - txq->rs_thresh + 1;
1599 	if (txq->tx_tail < i) {
1600 		for (; i < txq->nb_tx_desc; i++) {
1601 			rte_pktmbuf_free_seg(swr[i].mbuf);
1602 			swr[i].mbuf = NULL;
1603 		}
1604 		i = 0;
1605 	}
1606 	for (; i < txq->tx_tail; i++) {
1607 		rte_pktmbuf_free_seg(swr[i].mbuf);
1608 		swr[i].mbuf = NULL;
1609 	}
1610 }
1611 
1612 static const struct idpf_txq_ops avx512_tx_vec_ops = {
1613 	.release_mbufs = idpf_tx_release_mbufs_avx512,
1614 };
1615 
1616 int __rte_cold
1617 idpf_qc_tx_vec_avx512_setup(struct idpf_tx_queue *txq)
1618 {
1619 	if (!txq)
1620 		return 0;
1621 
1622 	txq->ops = &avx512_tx_vec_ops;
1623 	return 0;
1624 }
1625