xref: /dpdk/drivers/common/idpf/idpf_common_rxtx_avx512.c (revision 665b49c51639a10c553433bc2bcd85c7331c631e)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2023 Intel Corporation
3  */
4 
5 #include <rte_vect.h>
6 #include <idpf_common_device.h>
7 #include <idpf_common_rxtx.h>
8 
9 #ifndef __INTEL_COMPILER
10 #pragma GCC diagnostic ignored "-Wcast-qual"
11 #endif
12 
13 #define IDPF_DESCS_PER_LOOP_AVX 8
14 #define PKTLEN_SHIFT 10
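/* Each descriptor loop below handles IDPF_DESCS_PER_LOOP_AVX (8) descriptors
 * per iteration; PKTLEN_SHIFT is the shift applied to the raw descriptor
 * words before the 16-bit length blend in the singleq receive path.
 */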
15 
16 static __rte_always_inline void
17 idpf_singleq_rearm_common(struct idpf_rx_queue *rxq)
18 {
19 	struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
20 	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
21 	uint16_t rx_id;
22 	int i;
23 
24 	rxdp += rxq->rxrearm_start;
25 
26 	/* Pull 'n' more MBUFs into the software ring */
27 	if (rte_mempool_get_bulk(rxq->mp,
28 				 (void *)rxp,
29 				 IDPF_RXQ_REARM_THRESH) < 0) {
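		/* Allocation failed: if nearly the whole ring is waiting to be
		 * rearmed, park the next few sw_ring slots on the dummy mbuf and
		 * zero their descriptor addresses so the hardware is never handed
		 * a stale buffer address, then count the failure and bail out.
		 */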
30 		if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
31 		    rxq->nb_rx_desc) {
32 			__m128i dma_addr0;
33 
34 			dma_addr0 = _mm_setzero_si128();
35 			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
36 				rxp[i] = &rxq->fake_mbuf;
37 				_mm_store_si128((__m128i *)&rxdp[i].read,
38 						dma_addr0);
39 			}
40 		}
41 		__atomic_fetch_add(&rxq->rx_stats.mbuf_alloc_failed,
42 				   IDPF_RXQ_REARM_THRESH, __ATOMIC_RELAXED);
43 		return;
44 	}
45 	struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
46 	struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
47 	__m512i dma_addr0_3, dma_addr4_7;
48 	__m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
49 	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
50 	for (i = 0; i < IDPF_RXQ_REARM_THRESH;
51 			i += 8, rxp += 8, rxdp += 8) {
52 		__m128i vaddr0, vaddr1, vaddr2, vaddr3;
53 		__m128i vaddr4, vaddr5, vaddr6, vaddr7;
54 		__m256i vaddr0_1, vaddr2_3;
55 		__m256i vaddr4_5, vaddr6_7;
56 		__m512i vaddr0_3, vaddr4_7;
57 
58 		mb0 = rxp[0];
59 		mb1 = rxp[1];
60 		mb2 = rxp[2];
61 		mb3 = rxp[3];
62 		mb4 = rxp[4];
63 		mb5 = rxp[5];
64 		mb6 = rxp[6];
65 		mb7 = rxp[7];
66 
67 		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
68 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
69 				offsetof(struct rte_mbuf, buf_addr) + 8);
70 		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
71 		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
72 		vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
73 		vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
74 		vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
75 		vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
76 		vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
77 		vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);
78 
79 		/**
80 		 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
81 		 * into the high lanes. Similarly for 2 & 3, and so on.
82 		 */
83 		vaddr0_1 =
84 			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
85 						vaddr1, 1);
86 		vaddr2_3 =
87 			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
88 						vaddr3, 1);
89 		vaddr4_5 =
90 			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
91 						vaddr5, 1);
92 		vaddr6_7 =
93 			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
94 						vaddr7, 1);
95 		vaddr0_3 =
96 			_mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
97 						vaddr2_3, 1);
98 		vaddr4_7 =
99 			_mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
100 						vaddr6_7, 1);
101 
102 		/* convert pa to dma_addr hdr/data */
103 		dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
104 		dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);
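		/* unpackhi copies each mbuf's buf_iova (the high qword of the
		 * loaded pair) into both 64-bit halves of its 128-bit lane, so
		 * the same address feeds the hdr/data slots noted above.
		 */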
105 
106 		/* add headroom to pa values */
107 		dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
108 		dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);
109 
110 		/* flush desc with pa dma_addr */
111 		_mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3);
112 		_mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7);
113 	}
114 
115 	rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
116 	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
117 		rxq->rxrearm_start = 0;
118 
119 	rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
120 
121 	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
122 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
123 
124 	/* Update the tail pointer on the NIC */
125 	IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
126 }
127 
128 static __rte_always_inline void
129 idpf_singleq_rearm(struct idpf_rx_queue *rxq)
130 {
131 	int i;
132 	uint16_t rx_id;
133 	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
134 	struct rte_mempool_cache *cache =
135 		rte_mempool_default_cache(rxq->mp, rte_lcore_id());
136 	struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
137 
138 	rxdp += rxq->rxrearm_start;
139 
140 	if (unlikely(cache == NULL))
141 		return idpf_singleq_rearm_common(rxq);
142 
143 	/* We need to pull 'n' more MBUFs into the software ring from the
144 	 * mempool. We inline the mempool function here so we can vectorize
145 	 * the copy from the cache into the shadow ring.
146 	 */
147 
148 	/* Can this be satisfied from the cache? */
149 	if (cache->len < IDPF_RXQ_REARM_THRESH) {
150 		/* No. Backfill the cache first, and then fill from it */
151 		uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
152 							cache->len);
153 
154 		/* How many do we require, i.e. the number to refill the cache plus the request */
155 		int ret = rte_mempool_ops_dequeue_bulk
156 				(rxq->mp, &cache->objs[cache->len], req);
157 		if (ret == 0) {
158 			cache->len += req;
159 		} else {
160 			if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
161 			    rxq->nb_rx_desc) {
162 				__m128i dma_addr0;
163 
164 				dma_addr0 = _mm_setzero_si128();
165 				for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
166 					rxp[i] = &rxq->fake_mbuf;
167 					_mm_storeu_si128((__m128i *)&rxdp[i].read,
168 							 dma_addr0);
169 				}
170 			}
171 			__atomic_fetch_add(&rxq->rx_stats.mbuf_alloc_failed,
172 					   IDPF_RXQ_REARM_THRESH, __ATOMIC_RELAXED);
173 			return;
174 		}
175 	}
176 
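	/* At this point the mempool cache holds at least IDPF_RXQ_REARM_THRESH
	 * objects, so the loop below can peel mbufs straight off the top of
	 * cache->objs eight at a time.
	 */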
177 	const __m512i iova_offsets =  _mm512_set1_epi64(offsetof
178 							(struct rte_mbuf, buf_iova));
179 	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
180 
181 	/* to shuffle the addresses to correct slots. Values 4-7 will contain
182 	 * zeros, so use 7 for a zero-value.
183 	 */
184 	const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
185 
186 	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
187 	 * from mempool cache and populating both shadow and HW rings
188 	 */
189 	for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) {
190 		const __m512i mbuf_ptrs = _mm512_loadu_si512
191 			(&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]);
192 		_mm512_storeu_si512(rxp, mbuf_ptrs);
193 
194 		const __m512i iova_base_addrs = _mm512_i64gather_epi64
195 				(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
196 				 0, /* base */
197 				 1  /* scale */);
198 		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
199 				headroom);
200 		const __m512i iovas0 = _mm512_castsi256_si512
201 				(_mm512_extracti64x4_epi64(iova_addrs, 0));
202 		const __m512i iovas1 = _mm512_castsi256_si512
203 				(_mm512_extracti64x4_epi64(iova_addrs, 1));
204 
205 		/* permute leaves desc 2-3 addresses in header address slots 0-1
206 		 * but these are ignored by driver since header split not
207 		 * enabled. Similarly for desc 6 & 7.
208 		 */
209 		const __m512i desc0_1 = _mm512_permutexvar_epi64
210 				(permute_idx,
211 				 iovas0);
212 		const __m512i desc2_3 = _mm512_bsrli_epi128(desc0_1, 8);
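		/* the byte shift drops the addresses parked in the header slots
		 * above into the pkt_addr slots of the next descriptor pair
		 */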
213 
214 		const __m512i desc4_5 = _mm512_permutexvar_epi64
215 				(permute_idx,
216 				 iovas1);
217 		const __m512i desc6_7 = _mm512_bsrli_epi128(desc4_5, 8);
218 
219 		_mm512_storeu_si512((void *)rxdp, desc0_1);
220 		_mm512_storeu_si512((void *)(rxdp + 2), desc2_3);
221 		_mm512_storeu_si512((void *)(rxdp + 4), desc4_5);
222 		_mm512_storeu_si512((void *)(rxdp + 6), desc6_7);
223 
224 		rxp += IDPF_DESCS_PER_LOOP_AVX;
225 		rxdp += IDPF_DESCS_PER_LOOP_AVX;
226 		cache->len -= IDPF_DESCS_PER_LOOP_AVX;
227 	}
228 
229 	rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
230 	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
231 		rxq->rxrearm_start = 0;
232 
233 	rxq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
234 
235 	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
236 			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
237 
238 	/* Update the tail pointer on the NIC */
239 	IDPF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
240 }
241 
242 #define IDPF_RX_LEN_MASK 0x80808080
243 static __rte_always_inline uint16_t
244 _idpf_singleq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq,
245 				   struct rte_mbuf **rx_pkts,
246 				   uint16_t nb_pkts)
247 {
248 	const uint32_t *type_table = rxq->adapter->ptype_tbl;
249 
250 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
251 						    rxq->mbuf_initializer);
252 	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
253 	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
254 
255 	rxdp += rxq->rx_tail;
256 
257 	rte_prefetch0(rxdp);
258 
259 	/* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
260 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);
261 
262 	/* See if we need to rearm the RX queue - gives the prefetch a bit
263 	 * of time to act
264 	 */
265 	if (rxq->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
266 		idpf_singleq_rearm(rxq);
267 
268 	/* Before we start moving massive data around, check to see if
269 	 * there is actually a packet available
270 	 */
271 	if ((rxdp->flex_nic_wb.status_error0  &
272 	      rte_cpu_to_le_32(1 << VIRTCHNL2_RX_FLEX_DESC_STATUS0_DD_S)) == 0)
273 		return 0;
274 
275 	/* 8 packets DD mask, LSB in each 32-bit value */
276 	const __m256i dd_check = _mm256_set1_epi32(1);
277 
278 	/* mask to shuffle from desc. to mbuf (4 descriptors)*/
279 	const __m512i shuf_msk =
280 		_mm512_set_epi32
281 			(/* 1st descriptor */
282 			 0xFFFFFFFF,    /* rss set as unknown */
283 			 0xFFFF0504,    /* vlan_macip set as unknown */
284 					/* octet 15~14, 16 bits data_len */
285 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
286 					/* octet 15~14, low 16 bits pkt_len */
287 			 0xFFFFFFFF,    /* pkt_type set as unknown */
288 			 /* 2nd descriptor */
289 			 0xFFFFFFFF,    /* rss set as unknown */
290 			 0xFFFF0504,    /* vlan_macip set as unknown */
291 					/* octet 15~14, 16 bits data_len */
292 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
293 					/* octet 15~14, low 16 bits pkt_len */
294 			 0xFFFFFFFF,    /* pkt_type set as unknown */
295 			 /* 3rd descriptor */
296 			 0xFFFFFFFF,    /* rss set as unknown */
297 			 0xFFFF0504,    /* vlan_macip set as unknown */
298 					/* octet 15~14, 16 bits data_len */
299 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
300 					/* octet 15~14, low 16 bits pkt_len */
301 			 0xFFFFFFFF,    /* pkt_type set as unknown */
302 			 /* 4th descriptor */
303 			 0xFFFFFFFF,    /* rss set as unknown */
304 			 0xFFFF0504,    /* vlan_macip set as unknown */
305 					/* octet 15~14, 16 bits data_len */
306 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
307 					/* octet 15~14, low 16 bits pkt_len */
308 			 0xFFFFFFFF     /* pkt_type set as unknown */
309 			);
310 	/**
311 	 * compile-time check the shuffle layout is correct.
312 	 * NOTE: the first field (lowest address) is given last in set_epi
313 	 * calls above.
314 	 */
315 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
316 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
317 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
318 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
319 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
320 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
321 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
322 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
323 
324 	uint16_t i, received;
325 
326 	for (i = 0, received = 0; i < nb_pkts;
327 	     i += IDPF_DESCS_PER_LOOP_AVX,
328 	     rxdp += IDPF_DESCS_PER_LOOP_AVX) {
329 		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
330 		_mm256_storeu_si256((void *)&rx_pkts[i],
331 				    _mm256_loadu_si256((void *)&sw_ring[i]));
332 #ifdef RTE_ARCH_X86_64
333 		_mm256_storeu_si256
334 			((void *)&rx_pkts[i + 4],
335 			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
336 #endif
337 
338 		__m512i raw_desc0_3, raw_desc4_7;
339 		const __m128i raw_desc7 =
340 			_mm_load_si128((void *)(rxdp + 7));
341 		rte_compiler_barrier();
342 		const __m128i raw_desc6 =
343 			_mm_load_si128((void *)(rxdp + 6));
344 		rte_compiler_barrier();
345 		const __m128i raw_desc5 =
346 			_mm_load_si128((void *)(rxdp + 5));
347 		rte_compiler_barrier();
348 		const __m128i raw_desc4 =
349 			_mm_load_si128((void *)(rxdp + 4));
350 		rte_compiler_barrier();
351 		const __m128i raw_desc3 =
352 			_mm_load_si128((void *)(rxdp + 3));
353 		rte_compiler_barrier();
354 		const __m128i raw_desc2 =
355 			_mm_load_si128((void *)(rxdp + 2));
356 		rte_compiler_barrier();
357 		const __m128i raw_desc1 =
358 			_mm_load_si128((void *)(rxdp + 1));
359 		rte_compiler_barrier();
360 		const __m128i raw_desc0 =
361 			_mm_load_si128((void *)(rxdp + 0));
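		/* Descriptors are loaded highest index first with compiler
		 * barriers in between, so the DD-based burst count computed
		 * below can never overestimate how many descriptors are done.
		 */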
362 
363 		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
364 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
365 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
366 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
367 		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
368 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
369 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
370 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
371 
372 		/**
373 		 * convert descriptors 4-7 into mbufs, adjusting length and
374 		 * re-arranging fields. Then write into the mbuf
375 		 */
376 		const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7,
377 							 PKTLEN_SHIFT);
378 		const __m512i desc4_7 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK,
379 								raw_desc4_7,
380 								len4_7);
381 		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
382 
383 		/**
384 		 * to get packet types, shift 64-bit values down 16 bits
385 		 * so the ptype index ends up in the lower 8 bits of each lane
386 		 */
387 		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16);
388 		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
389 		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
390 		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16);
391 		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0);
392 		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16);
393 		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0);
394 
395 		const __m512i ptype4_7 = _mm512_set_epi32
396 			(0, 0, 0, type_table[ptype7],
397 			 0, 0, 0, type_table[ptype6],
398 			 0, 0, 0, type_table[ptype5],
399 			 0, 0, 0, type_table[ptype4]);
400 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
401 
402 		/**
403 		 * convert descriptors 0-3 into mbufs, adjusting length and
404 		 * re-arranging fields. Then write into the mbuf
405 		 */
406 		const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3,
407 							 PKTLEN_SHIFT);
408 		const __m512i desc0_3 = _mm512_mask_blend_epi16(IDPF_RX_LEN_MASK,
409 								raw_desc0_3,
410 								len0_3);
411 		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
412 
413 		/* get the packet types */
414 		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16);
415 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
416 		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
417 		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16);
418 		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0);
419 		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16);
420 		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0);
421 
422 		const __m512i ptype0_3 = _mm512_set_epi32
423 			(0, 0, 0, type_table[ptype3],
424 			 0, 0, 0, type_table[ptype2],
425 			 0, 0, 0, type_table[ptype1],
426 			 0, 0, 0, type_table[ptype0]);
427 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
428 
429 		/**
430 		 * use permute/extract to get status content
431 		 * After the operations, the packets' status flags are in the
432 		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
433 		 */
434 		/* merge the status bits into one register */
435 		const __m512i status_permute_msk = _mm512_set_epi32
436 			(0, 0, 0, 0,
437 			 0, 0, 0, 0,
438 			 22, 30, 6, 14,
439 			 18, 26, 2, 10);
440 		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
441 			(raw_desc4_7, status_permute_msk, raw_desc0_3);
442 		__m256i status0_7 = _mm512_extracti64x4_epi64
443 			(raw_status0_7, 0);
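		/* only the low 256 bits of the permute result carry the eight
		 * gathered status dwords; the upper half is unused
		 */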
444 
445 		/* now do flag manipulation */
446 
447 		/**
448 		 * At this point, we have the 8 sets of flags in the low 16-bits
449 		 * of each 32-bit value.
450 		 * We want to extract these, and merge them with the mbuf init
451 		 * data so we can do a single write to the mbuf to set the flags
452 		 * and all the other initialization fields. Extracting the
453 		 * appropriate flags means that we have to do a shift and blend
454 		 * for each mbuf before we do the write. However, we can also
455 		 * add in the previously computed rx_descriptor fields to
456 		 * make a single 256-bit write per mbuf
457 		 */
458 		/* check the structure matches expectations */
459 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
460 				 offsetof(struct rte_mbuf, rearm_data) + 8);
461 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
462 				 RTE_ALIGN(offsetof(struct rte_mbuf,
463 						    rearm_data),
464 						    16));
465 		/* build up data and do writes */
466 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
467 			rearm6, rearm7;
468 		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
469 		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
470 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
471 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
472 
473 		rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
474 		rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
475 		rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
476 		rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
477 
478 		/* write to mbuf */
479 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
480 				    rearm6);
481 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
482 				    rearm4);
483 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
484 				    rearm2);
485 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
486 				    rearm0);
487 
488 		rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
489 		rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
490 		rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
491 		rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
492 
493 		/* again write to mbufs */
494 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
495 				    rearm7);
496 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
497 				    rearm5);
498 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
499 				    rearm3);
500 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
501 				    rearm1);
502 
503 		/* perform dd_check */
504 		status0_7 = _mm256_and_si256(status0_7, dd_check);
505 		status0_7 = _mm256_packs_epi32(status0_7,
506 					       _mm256_setzero_si256());
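		/* after the AND with dd_check and the pack, each completed
		 * descriptor contributes exactly one set bit, so the two 64-bit
		 * popcounts below count how many of the 8 descriptors are done
		 */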
507 
508 		uint64_t burst = __builtin_popcountll
509 					(_mm_cvtsi128_si64
510 						(_mm256_extracti128_si256
511 							(status0_7, 1)));
512 		burst += __builtin_popcountll
513 				(_mm_cvtsi128_si64
514 					(_mm256_castsi256_si128(status0_7)));
515 		received += burst;
516 		if (burst != IDPF_DESCS_PER_LOOP_AVX)
517 			break;
518 	}
519 
520 	/* update tail pointers */
521 	rxq->rx_tail += received;
522 	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
523 	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
524 		rxq->rx_tail--;
525 		received--;
526 	}
527 	rxq->rxrearm_nb += received;
528 	return received;
529 }
530 
531 /**
532  * Notice:
533  * - if nb_pkts < IDPF_DESCS_PER_LOOP_AVX, no packets are received
534  */
535 uint16_t
536 idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
537 				 uint16_t nb_pkts)
538 {
539 	return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts);
540 }
541 
542 static __rte_always_inline void
543 idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
544 {
545 	struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
546 	volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
547 	uint16_t rx_id;
548 	int i;
549 
550 	rxdp += rx_bufq->rxrearm_start;
551 
552 	/* Pull 'n' more MBUFs into the software ring */
553 	if (rte_mempool_get_bulk(rx_bufq->mp,
554 				 (void *)rxp,
555 				 IDPF_RXQ_REARM_THRESH) < 0) {
556 		if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
557 		    rx_bufq->nb_rx_desc) {
558 			__m128i dma_addr0;
559 
560 			dma_addr0 = _mm_setzero_si128();
561 			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
562 				rxp[i] = &rx_bufq->fake_mbuf;
563 				_mm_store_si128((__m128i *)&rxdp[i],
564 						dma_addr0);
565 			}
566 		}
567 		__atomic_fetch_add(&rx_bufq->rx_stats.mbuf_alloc_failed,
568 				   IDPF_RXQ_REARM_THRESH, __ATOMIC_RELAXED);
569 		return;
570 	}
571 
572 	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
573 	for (i = 0; i < IDPF_RXQ_REARM_THRESH;
574 			i += 8, rxp += 8, rxdp += 8) {
575 		rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM;
576 		rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM;
577 		rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM;
578 		rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM;
579 		rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM;
580 		rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM;
581 		rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM;
582 		rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM;
583 	}
584 
585 	rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
586 	if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
587 		rx_bufq->rxrearm_start = 0;
588 
589 	rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
590 
591 	rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
592 			     (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
593 
594 	/* Update the tail pointer on the NIC */
595 	IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
596 }
597 
598 static __rte_always_inline void
599 idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
600 {
601 	int i;
602 	uint16_t rx_id;
603 	volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
604 	struct rte_mempool_cache *cache =
605 		rte_mempool_default_cache(rx_bufq->mp, rte_lcore_id());
606 	struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
607 
608 	rxdp += rx_bufq->rxrearm_start;
609 
610 	if (unlikely(!cache))
611 		return idpf_splitq_rearm_common(rx_bufq);
612 
613 	/* We need to pull 'n' more MBUFs into the software ring from the
614 	 * mempool. We inline the mempool function here so we can vectorize
615 	 * the copy from the cache into the shadow ring.
616 	 */
617 
618 	/* Can this be satisfied from the cache? */
619 	if (cache->len < IDPF_RXQ_REARM_THRESH) {
620 		/* No. Backfill the cache first, and then fill from it */
621 		uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
622 							cache->len);
623 
624 		/* How many do we require, i.e. the number to refill the cache plus the request */
625 		int ret = rte_mempool_ops_dequeue_bulk
626 				(rx_bufq->mp, &cache->objs[cache->len], req);
627 		if (ret == 0) {
628 			cache->len += req;
629 		} else {
630 			if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
631 			    rx_bufq->nb_rx_desc) {
632 				__m128i dma_addr0;
633 
634 				dma_addr0 = _mm_setzero_si128();
635 				for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
636 					rxp[i] = &rx_bufq->fake_mbuf;
637 					_mm_storeu_si128((__m128i *)&rxdp[i],
638 							 dma_addr0);
639 				}
640 			}
641 			__atomic_fetch_add(&rx_bufq->rx_stats.mbuf_alloc_failed,
642 					   IDPF_RXQ_REARM_THRESH, __ATOMIC_RELAXED);
643 			return;
644 		}
645 	}
646 
647 	const __m512i iova_offsets =  _mm512_set1_epi64(offsetof
648 							(struct rte_mbuf, buf_iova));
649 	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
650 
651 	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
652 	 * from mempool cache and populating both shadow and HW rings
653 	 */
654 	for (i = 0; i < IDPF_RXQ_REARM_THRESH / IDPF_DESCS_PER_LOOP_AVX; i++) {
655 		const __m512i mbuf_ptrs = _mm512_loadu_si512
656 			(&cache->objs[cache->len - IDPF_DESCS_PER_LOOP_AVX]);
657 		_mm512_storeu_si512(rxp, mbuf_ptrs);
658 
659 		const __m512i iova_base_addrs = _mm512_i64gather_epi64
660 				(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
661 				 0, /* base */
662 				 1  /* scale */);
663 		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
664 				headroom);
665 
666 		const __m512i iova_addrs_1 = _mm512_bsrli_epi128(iova_addrs, 8);
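		/* iova_addrs holds addresses 0,2,4,6 in the low qword of each
		 * 128-bit lane; the byte-shifted copy exposes 1,3,5,7 there, so
		 * the extract+cvtsi sequence below fills the 8 buffer descriptors
		 * pairwise
		 */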
667 
668 		rxdp[0].split_rd.pkt_addr =
669 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 0));
670 		rxdp[1].split_rd.pkt_addr =
671 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 0));
672 		rxdp[2].split_rd.pkt_addr =
673 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 1));
674 		rxdp[3].split_rd.pkt_addr =
675 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 1));
676 		rxdp[4].split_rd.pkt_addr =
677 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 2));
678 		rxdp[5].split_rd.pkt_addr =
679 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 2));
680 		rxdp[6].split_rd.pkt_addr =
681 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs, 3));
682 		rxdp[7].split_rd.pkt_addr =
683 			_mm_cvtsi128_si64(_mm512_extracti32x4_epi32(iova_addrs_1, 3));
684 
685 		rxp += IDPF_DESCS_PER_LOOP_AVX;
686 		rxdp += IDPF_DESCS_PER_LOOP_AVX;
687 		cache->len -= IDPF_DESCS_PER_LOOP_AVX;
688 	}
689 
690 	rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
691 	if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
692 		rx_bufq->rxrearm_start = 0;
693 
694 	rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
695 
696 	rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
697 			   (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
698 
699 	/* Update the tail pointer on the NIC */
700 	IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
701 }
702 
703 static __rte_always_inline uint16_t
704 _idpf_splitq_recv_raw_pkts_avx512(struct idpf_rx_queue *rxq,
705 				  struct rte_mbuf **rx_pkts,
706 				  uint16_t nb_pkts)
707 {
708 	const uint32_t *type_table = rxq->adapter->ptype_tbl;
709 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
710 						    rxq->bufq2->mbuf_initializer);
711 	/* only handle bufq2 here */
712 	struct rte_mbuf **sw_ring = &rxq->bufq2->sw_ring[rxq->rx_tail];
713 	volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
714 
715 	rxdp += rxq->rx_tail;
716 
717 	rte_prefetch0(rxdp);
718 
719 	/* nb_pkts has to be floor-aligned to IDPF_DESCS_PER_LOOP_AVX */
720 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_DESCS_PER_LOOP_AVX);
721 
722 	/* See if we need to rearm the RX queue - gives the prefetch a bit
723 	 * of time to act
724 	 */
725 	if (rxq->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
726 		idpf_splitq_rearm(rxq->bufq2);
727 
728 	/* Before we start moving massive data around, check to see if
729 	 * there is actually a packet available
730 	 */
731 	if (((rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id &
732 	      VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) >>
733 	     VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) != rxq->expected_gen_id)
734 		return 0;
735 
736 	const __m512i dd_check = _mm512_set1_epi64(1);
737 	const __m512i gen_check = _mm512_set1_epi64((uint64_t)1<<46);
738 
739 	/* mask to shuffle from desc. to mbuf (4 descriptors)*/
740 	const __m512i shuf_msk =
741 		_mm512_set_epi32
742 			(/* 1st descriptor */
743 			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
744 			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
745 					/* octet 15~14, 16 bits data_len */
746 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
747 					/* octet 15~14, low 16 bits pkt_len */
748 			 0xFFFFFFFF,    /* pkt_type set as unknown */
749 			 /* 2nd descriptor */
750 			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
751 			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
752 					/* octet 15~14, 16 bits data_len */
753 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
754 					/* octet 15~14, low 16 bits pkt_len */
755 			 0xFFFFFFFF,    /* pkt_type set as unknown */
756 			 /* 3rd descriptor */
757 			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
758 			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
759 					/* octet 15~14, 16 bits data_len */
760 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
761 					/* octet 15~14, low 16 bits pkt_len */
762 			 0xFFFFFFFF,    /* pkt_type set as unknown */
763 			 /* 4th descriptor */
764 			 0xFFFFFFFF,    /* octet 4~7, 32bits rss */
765 			 0xFFFF0504,    /* octet 2~3, low 16 bits vlan_macip */
766 					/* octet 15~14, 16 bits data_len */
767 			 0xFFFF0504,    /* skip high 16 bits pkt_len, zero out */
768 					/* octet 15~14, low 16 bits pkt_len */
769 			 0xFFFFFFFF     /* pkt_type set as unknown */
770 			);
771 	/**
772 	 * compile-time check the above shuffle layout is correct.
773 	 * NOTE: the first field (lowest address) is given last in set_epi
774 	 * calls above.
775 	 */
776 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
777 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
778 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
779 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
780 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
781 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
782 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
783 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
784 
785 	uint16_t i, received;
786 
787 	for (i = 0, received = 0; i < nb_pkts;
788 	     i += IDPF_DESCS_PER_LOOP_AVX,
789 	     rxdp += IDPF_DESCS_PER_LOOP_AVX) {
790 		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
791 		_mm256_storeu_si256((void *)&rx_pkts[i],
792 				    _mm256_loadu_si256((void *)&sw_ring[i]));
793 #ifdef RTE_ARCH_X86_64
794 		_mm256_storeu_si256
795 			((void *)&rx_pkts[i + 4],
796 			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
797 #endif
798 
799 		__m512i raw_desc0_3, raw_desc4_7;
800 		const __m128i raw_desc7 =
801 			_mm_load_si128((void *)(rxdp + 7));
802 		rte_compiler_barrier();
803 		const __m128i raw_desc6 =
804 			_mm_load_si128((void *)(rxdp + 6));
805 		rte_compiler_barrier();
806 		const __m128i raw_desc5 =
807 			_mm_load_si128((void *)(rxdp + 5));
808 		rte_compiler_barrier();
809 		const __m128i raw_desc4 =
810 			_mm_load_si128((void *)(rxdp + 4));
811 		rte_compiler_barrier();
812 		const __m128i raw_desc3 =
813 			_mm_load_si128((void *)(rxdp + 3));
814 		rte_compiler_barrier();
815 		const __m128i raw_desc2 =
816 			_mm_load_si128((void *)(rxdp + 2));
817 		rte_compiler_barrier();
818 		const __m128i raw_desc1 =
819 			_mm_load_si128((void *)(rxdp + 1));
820 		rte_compiler_barrier();
821 		const __m128i raw_desc0 =
822 			_mm_load_si128((void *)(rxdp + 0));
823 
824 		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
825 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
826 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
827 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
828 		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
829 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
830 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
831 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
832 
833 		/**
834 		 * convert descriptors 4-7 into mbufs, adjusting length and
835 		 * re-arranging fields. Then write into the mbuf
836 		 */
837 		const __m512i len_mask = _mm512_set_epi32(0xffffffff, 0xffffffff,
838 							  0xffff3fff, 0xffffffff,
839 							  0xffffffff, 0xffffffff,
840 							  0xffff3fff, 0xffffffff,
841 							  0xffffffff, 0xffffffff,
842 							  0xffff3fff, 0xffffffff,
843 							  0xffffffff, 0xffffffff,
844 							  0xffff3fff, 0xffffffff);
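		/* the 0x3fff parts of the mask clear the GEN and bufq-id bits
		 * sitting above the 14-bit packet length, so the shuffle below
		 * copies a clean length into pkt_len/data_len
		 */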
845 		const __m512i desc4_7 = _mm512_and_epi32(raw_desc4_7, len_mask);
846 		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
847 
848 		/**
849 		 * to get packet types, shift 64-bit values down 16 bits
850 		 * so the ptype index ends up in the lower 8 bits of each lane
851 		 */
852 		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 16);
853 		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
854 		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
855 		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 16);
856 		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 0);
857 		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 16);
858 		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 0);
859 
860 		const __m512i ptype4_7 = _mm512_set_epi32
861 			(0, 0, 0, type_table[ptype7],
862 			 0, 0, 0, type_table[ptype6],
863 			 0, 0, 0, type_table[ptype5],
864 			 0, 0, 0, type_table[ptype4]);
865 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
866 
867 		/**
868 		 * convert descriptors 0-3 into mbufs, adjusting length and
869 		 * re-arranging fields. Then write into the mbuf
870 		 */
871 		const __m512i desc0_3 = _mm512_and_epi32(raw_desc0_3, len_mask);
872 		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
873 
874 		/* get the packet types */
875 		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 16);
876 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
877 		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
878 		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 16);
879 		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 0);
880 		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 16);
881 		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 0);
882 
883 		const __m512i ptype0_3 = _mm512_set_epi32
884 			(0, 0, 0, type_table[ptype3],
885 			 0, 0, 0, type_table[ptype2],
886 			 0, 0, 0, type_table[ptype1],
887 			 0, 0, 0, type_table[ptype0]);
888 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
889 
890 		/**
891 		 * use permute/extract to get status and generation bit content
892 		 * After the operations, the packets' status flags are in the
893 		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
894 		 */
895 
896 		const __m512i dd_permute_msk = _mm512_set_epi64
897 			(11, 15, 3, 7, 9, 13, 1, 5);
898 		const __m512i status0_7 = _mm512_permutex2var_epi64
899 			(raw_desc4_7, dd_permute_msk, raw_desc0_3);
900 		const __m512i gen_permute_msk = _mm512_set_epi64
901 			(10, 14, 2, 6, 8, 12, 0, 4);
902 		const __m512i raw_gen0_7 = _mm512_permutex2var_epi64
903 			(raw_desc4_7, gen_permute_msk, raw_desc0_3);
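		/* dd_permute_msk gathers the qword of each descriptor holding the
		 * DD bit (bit 0), gen_permute_msk the qword holding the GEN flag
		 * (bit 46), both in the [1, 3, 5, 7, 0, 2, 4, 6] order noted above
		 */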
904 
905 		/* now do flag manipulation */
906 
907 		/**
908 		 * At this point, we have the 8 sets of flags in the low 16-bits
909 		 * of each 32-bit value.
910 		 * We want to extract these, and merge them with the mbuf init
911 		 * data so we can do a single write to the mbuf to set the flags
912 		 * and all the other initialization fields. Extracting the
913 		 * appropriate flags means that we have to do a shift and blend
914 		 * for each mbuf before we do the write. However, we can also
915 		 * add in the previously computed rx_descriptor fields to
916 		 * make a single 256-bit write per mbuf
917 		 */
918 		/* check the structure matches expectations */
919 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
920 				 offsetof(struct rte_mbuf, rearm_data) + 8);
921 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
922 				 RTE_ALIGN(offsetof(struct rte_mbuf,
923 						    rearm_data),
924 						    16));
925 		/* build up data and do writes */
926 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
927 			rearm6, rearm7;
928 		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
929 		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
930 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
931 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
932 
933 		rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
934 		rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
935 		rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
936 		rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
937 
938 		/* write to mbuf */
939 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
940 				    rearm6);
941 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
942 				    rearm4);
943 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
944 				    rearm2);
945 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
946 				    rearm0);
947 
948 		rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
949 		rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
950 		rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
951 		rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
952 
953 		/* again write to mbufs */
954 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
955 				    rearm7);
956 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
957 				    rearm5);
958 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
959 				    rearm3);
960 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
961 				    rearm1);
962 
963 		const __mmask8 dd_mask = _mm512_cmpeq_epi64_mask(
964 			_mm512_and_epi64(status0_7, dd_check), dd_check);
965 		const __mmask8 gen_mask = _mm512_cmpeq_epi64_mask(
966 			_mm512_and_epi64(raw_gen0_7, gen_check),
967 			_mm512_set1_epi64((uint64_t)rxq->expected_gen_id << 46));
968 		const __mmask8 recv_mask = _kand_mask8(dd_mask, gen_mask);
969 		uint16_t burst = __builtin_popcount(_cvtmask8_u32(recv_mask));
970 
971 		received += burst;
972 		if (burst != IDPF_DESCS_PER_LOOP_AVX)
973 			break;
974 	}
975 
976 	/* update tail pointers */
977 	rxq->rx_tail += received;
978 	rxq->expected_gen_id ^= ((rxq->rx_tail & rxq->nb_rx_desc) != 0);
979 	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
980 	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
981 		rxq->rx_tail--;
982 		received--;
983 	}
984 
985 	rxq->bufq2->rxrearm_nb += received;
986 	return received;
987 }
988 
989 /* only bufq2 is used to receive packets in this path */
990 uint16_t
991 idpf_dp_splitq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
992 			     uint16_t nb_pkts)
993 {
994 	return _idpf_splitq_recv_raw_pkts_avx512(rx_queue, rx_pkts,
995 						 nb_pkts);
996 }
997 
998 static __rte_always_inline int
999 idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
1000 {
1001 	struct idpf_tx_vec_entry *txep;
1002 	uint32_t n;
1003 	uint32_t i;
1004 	int nb_free = 0;
1005 	struct rte_mbuf *m, *free[txq->rs_thresh];
1006 
1007 	/* check DD bits on threshold descriptor */
1008 	if ((txq->tx_ring[txq->next_dd].qw1.cmd_dtype &
1009 			rte_cpu_to_le_64(IDPF_TXD_QW1_DTYPE_M)) !=
1010 			rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE))
1011 		return 0;
1012 
1013 	n = txq->rs_thresh;
1014 
1015 	 /* first buffer to free from S/W ring is at index
1016 	  * next_dd - (rs_thresh - 1)
1017 	  */
1018 	txep = (void *)txq->sw_ring;
1019 	txep += txq->next_dd - (n - 1);
1020 
1021 	if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
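		/* The fast-free offload guarantees that all mbufs freed here come
		 * from the same mempool and have a reference count of 1, so their
		 * pointers can be copied straight back into the mempool cache;
		 * the (n & 31) check keeps the 64-byte copy loop below on whole
		 * multiples of 32 entries.
		 */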
1022 		struct rte_mempool *mp = txep[0].mbuf->pool;
1023 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
1024 								rte_lcore_id());
1025 		void **cache_objs;
1026 
1027 		if (cache == NULL || cache->len == 0)
1028 			goto normal;
1029 
1030 		cache_objs = &cache->objs[cache->len];
1031 
1032 		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
1033 			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
1034 			goto done;
1035 		}
1036 
1037 		/* The cache is filled as follows:
1038 		 *   1. Add the objects to the cache.
1039 		 *   2. Once the cache crosses its flush threshold, everything
1040 		 *   above the nominal cache size is flushed back to the ring.
1041 		 */
1042 		/* Add elements back into the cache */
1043 		uint32_t copied = 0;
1044 		/* n is multiple of 32 */
1045 		while (copied < n) {
1046 			const __m512i a = _mm512_loadu_si512(&txep[copied]);
1047 			const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
1048 			const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
1049 			const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);
1050 
1051 			_mm512_storeu_si512(&cache_objs[copied], a);
1052 			_mm512_storeu_si512(&cache_objs[copied + 8], b);
1053 			_mm512_storeu_si512(&cache_objs[copied + 16], c);
1054 			_mm512_storeu_si512(&cache_objs[copied + 24], d);
1055 			copied += 32;
1056 		}
1057 		cache->len += n;
1058 
1059 		if (cache->len >= cache->flushthresh) {
1060 			rte_mempool_ops_enqueue_bulk(mp,
1061 						     &cache->objs[cache->size],
1062 						     cache->len - cache->size);
1063 			cache->len = cache->size;
1064 		}
1065 		goto done;
1066 	}
1067 
1068 normal:
1069 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
1070 	if (likely(m != NULL)) {
1071 		free[0] = m;
1072 		nb_free = 1;
1073 		for (i = 1; i < n; i++) {
1074 			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
1075 			if (likely(m != NULL)) {
1076 				if (likely(m->pool == free[0]->pool)) {
1077 					free[nb_free++] = m;
1078 				} else {
1079 					rte_mempool_put_bulk(free[0]->pool,
1080 							     (void *)free,
1081 							     nb_free);
1082 					free[0] = m;
1083 					nb_free = 1;
1084 				}
1085 			}
1086 		}
1087 		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
1088 	} else {
1089 		for (i = 1; i < n; i++) {
1090 			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
1091 			if (m != NULL)
1092 				rte_mempool_put(m->pool, m);
1093 		}
1094 	}
1095 
1096 done:
1097 	/* buffers were freed, update counters */
1098 	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
1099 	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
1100 	if (txq->next_dd >= txq->nb_tx_desc)
1101 		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);
1102 
1103 	return txq->rs_thresh;
1104 }
1105 
1106 static __rte_always_inline void
1107 tx_backlog_entry_avx512(struct idpf_tx_vec_entry *txep,
1108 			struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
1109 {
1110 	int i;
1111 
1112 	for (i = 0; i < (int)nb_pkts; ++i)
1113 		txep[i].mbuf = tx_pkts[i];
1114 }
1115 
1116 #define IDPF_FLEX_TXD_QW1_BUF_SZ_S 48
1117 static __rte_always_inline void
1118 idpf_singleq_vtx1(volatile struct idpf_flex_tx_desc *txdp,
1119 	  struct rte_mbuf *pkt, uint64_t flags)
1120 {
1121 	uint64_t high_qw =
1122 		(IDPF_TX_DESC_DTYPE_FLEX_DATA << IDPF_FLEX_TXD_QW1_DTYPE_S |
1123 		 ((uint64_t)flags  << IDPF_FLEX_TXD_QW1_CMD_S) |
1124 		 ((uint64_t)pkt->data_len << IDPF_FLEX_TXD_QW1_BUF_SZ_S));
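	/* high_qw becomes qword 1 of the data descriptor (DTYPE, command flags
	 * and buffer size); qword 0 below carries the buffer's DMA address.
	 */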
1125 
1126 	__m128i descriptor = _mm_set_epi64x(high_qw,
1127 					    pkt->buf_iova + pkt->data_off);
1128 	_mm_storeu_si128((__m128i *)txdp, descriptor);
1129 }
1130 
1131 #define IDPF_TX_LEN_MASK 0xAA
1132 #define IDPF_TX_OFF_MASK 0x55
1133 static __rte_always_inline void
1134 idpf_singleq_vtx(volatile struct idpf_flex_tx_desc *txdp,
1135 	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
1136 {
1137 	const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_FLEX_DATA  |
1138 			((uint64_t)flags  << IDPF_FLEX_TXD_QW1_CMD_S));
1139 
1140 	/* if unaligned on a 32-byte boundary, do one to align */
1141 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
1142 		idpf_singleq_vtx1(txdp, *pkt, flags);
1143 		nb_pkts--, txdp++, pkt++;
1144 	}
1145 
1146 	/* do 4 at a time while possible, in bursts */
1147 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
1148 		uint64_t hi_qw3 =
1149 			hi_qw_tmpl |
1150 			((uint64_t)pkt[3]->data_len <<
1151 			 IDPF_FLEX_TXD_QW1_BUF_SZ_S);
1152 		uint64_t hi_qw2 =
1153 			hi_qw_tmpl |
1154 			((uint64_t)pkt[2]->data_len <<
1155 			 IDPF_FLEX_TXD_QW1_BUF_SZ_S);
1156 		uint64_t hi_qw1 =
1157 			hi_qw_tmpl |
1158 			((uint64_t)pkt[1]->data_len <<
1159 			 IDPF_FLEX_TXD_QW1_BUF_SZ_S);
1160 		uint64_t hi_qw0 =
1161 			hi_qw_tmpl |
1162 			((uint64_t)pkt[0]->data_len <<
1163 			 IDPF_FLEX_TXD_QW1_BUF_SZ_S);
1164 
1165 		__m512i desc0_3 =
1166 			_mm512_set_epi64
1167 				(hi_qw3,
1168 				 pkt[3]->buf_iova + pkt[3]->data_off,
1169 				 hi_qw2,
1170 				 pkt[2]->buf_iova + pkt[2]->data_off,
1171 				 hi_qw1,
1172 				 pkt[1]->buf_iova + pkt[1]->data_off,
1173 				 hi_qw0,
1174 				 pkt[0]->buf_iova + pkt[0]->data_off);
1175 		_mm512_storeu_si512((void *)txdp, desc0_3);
1176 	}
1177 
1178 	/* do any last ones */
1179 	while (nb_pkts) {
1180 		idpf_singleq_vtx1(txdp, *pkt, flags);
1181 		txdp++, pkt++, nb_pkts--;
1182 	}
1183 }
1184 
1185 static __rte_always_inline uint16_t
1186 idpf_singleq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1187 					 uint16_t nb_pkts)
1188 {
1189 	struct idpf_tx_queue *txq = tx_queue;
1190 	volatile struct idpf_flex_tx_desc *txdp;
1191 	struct idpf_tx_vec_entry *txep;
1192 	uint16_t n, nb_commit, tx_id;
1193 	uint64_t flags = IDPF_TX_FLEX_DESC_CMD_EOP;
1194 	uint64_t rs = IDPF_TX_FLEX_DESC_CMD_RS | flags;
1195 
1196 	/* crossing the rs_thresh boundary is not allowed */
1197 	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
1198 
1199 	if (txq->nb_free < txq->free_thresh)
1200 		idpf_tx_singleq_free_bufs_avx512(txq);
1201 
1202 	nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
1203 	nb_commit = nb_pkts;
1204 	if (unlikely(nb_pkts == 0))
1205 		return 0;
1206 
1207 	tx_id = txq->tx_tail;
1208 	txdp = &txq->tx_ring[tx_id];
1209 	txep = (void *)txq->sw_ring;
1210 	txep += tx_id;
1211 
1212 	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
1213 
1214 	n = (uint16_t)(txq->nb_tx_desc - tx_id);
1215 	if (nb_commit >= n) {
1216 		tx_backlog_entry_avx512(txep, tx_pkts, n);
1217 
1218 		idpf_singleq_vtx(txdp, tx_pkts, n - 1, flags);
1219 		tx_pkts += (n - 1);
1220 		txdp += (n - 1);
1221 
1222 		idpf_singleq_vtx1(txdp, *tx_pkts++, rs);
1223 
1224 		nb_commit = (uint16_t)(nb_commit - n);
1225 
1226 		tx_id = 0;
1227 		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
1228 
1229 		/* avoid reaching the end of the ring */
1230 		txdp = &txq->tx_ring[tx_id];
1231 		txep = (void *)txq->sw_ring;
1232 		txep += tx_id;
1233 	}
1234 
1235 	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
1236 
1237 	idpf_singleq_vtx(txdp, tx_pkts, nb_commit, flags);
1238 
1239 	tx_id = (uint16_t)(tx_id + nb_commit);
1240 	if (tx_id > txq->next_rs) {
1241 		txq->tx_ring[txq->next_rs].qw1.cmd_dtype |=
1242 			rte_cpu_to_le_64(((uint64_t)IDPF_TX_FLEX_DESC_CMD_RS) <<
1243 					 IDPF_FLEX_TXD_QW1_CMD_S);
1244 		txq->next_rs =
1245 			(uint16_t)(txq->next_rs + txq->rs_thresh);
1246 	}
1247 
1248 	txq->tx_tail = tx_id;
1249 
1250 	IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
1251 
1252 	return nb_pkts;
1253 }
1254 
1255 static __rte_always_inline uint16_t
1256 idpf_singleq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
1257 			      uint16_t nb_pkts)
1258 {
1259 	uint16_t nb_tx = 0;
1260 	struct idpf_tx_queue *txq = tx_queue;
1261 
1262 	while (nb_pkts) {
1263 		uint16_t ret, num;
1264 
1265 		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
1266 		ret = idpf_singleq_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
1267 						       num);
1268 		nb_tx += ret;
1269 		nb_pkts -= ret;
1270 		if (ret < num)
1271 			break;
1272 	}
1273 
1274 	return nb_tx;
1275 }
1276 
1277 uint16_t
1278 idpf_dp_singleq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1279 				 uint16_t nb_pkts)
1280 {
1281 	return idpf_singleq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts);
1282 }
1283 
1284 static __rte_always_inline void
1285 idpf_splitq_scan_cq_ring(struct idpf_tx_queue *cq)
1286 {
1287 	struct idpf_splitq_tx_compl_desc *compl_ring;
1288 	struct idpf_tx_queue *txq;
1289 	uint16_t genid, txq_qid, cq_qid, i;
1290 	uint8_t ctype;
1291 
1292 	cq_qid = cq->tx_tail;
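	/* Walk at most IDPD_TXQ_SCAN_CQ_THRESH completion descriptors, stopping
	 * as soon as the generation bit stops matching; each completion is
	 * credited to its owning TX queue by completion type.
	 */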
1293 
1294 	for (i = 0; i < IDPD_TXQ_SCAN_CQ_THRESH; i++) {
1295 		if (cq_qid == cq->nb_tx_desc) {
1296 			cq_qid = 0;
1297 			cq->expected_gen_id ^= 1;
1298 		}
1299 		compl_ring = &cq->compl_ring[cq_qid];
1300 		genid = (compl_ring->qid_comptype_gen &
1301 			rte_cpu_to_le_64(IDPF_TXD_COMPLQ_GEN_M)) >> IDPF_TXD_COMPLQ_GEN_S;
1302 		if (genid != cq->expected_gen_id)
1303 			break;
1304 		ctype = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
1305 			IDPF_TXD_COMPLQ_COMPL_TYPE_M) >> IDPF_TXD_COMPLQ_COMPL_TYPE_S;
1306 		txq_qid = (rte_le_to_cpu_16(compl_ring->qid_comptype_gen) &
1307 			IDPF_TXD_COMPLQ_QID_M) >> IDPF_TXD_COMPLQ_QID_S;
1308 		txq = cq->txqs[txq_qid - cq->tx_start_qid];
1309 		txq->ctype[ctype]++;
1310 		cq_qid++;
1311 	}
1312 
1313 	cq->tx_tail = cq_qid;
1314 }
1315 
1316 static __rte_always_inline int
1317 idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
1318 {
1319 	struct idpf_tx_vec_entry *txep;
1320 	uint32_t n;
1321 	uint32_t i;
1322 	int nb_free = 0;
1323 	struct rte_mbuf *m, *free[txq->rs_thresh];
1324 
1325 	n = txq->rs_thresh;
1326 
1327 	 /* first buffer to free from S/W ring is at index
1328 	  * next_dd - (rs_thresh - 1)
1329 	  */
1330 	txep = (void *)txq->sw_ring;
1331 	txep += txq->next_dd - (n - 1);
1332 
1333 	if (txq->offloads & IDPF_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
1334 		struct rte_mempool *mp = txep[0].mbuf->pool;
1335 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
1336 								rte_lcore_id());
1337 		void **cache_objs;
1338 
1339 		if (!cache || cache->len == 0)
1340 			goto normal;
1341 
1342 		cache_objs = &cache->objs[cache->len];
1343 
1344 		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
1345 			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
1346 			goto done;
1347 		}
1348 
1349 		/* The cache is filled as follows:
1350 		 *   1. Add the objects to the cache.
1351 		 *   2. Once the cache crosses its flush threshold, everything
1352 		 *   above the nominal cache size is flushed back to the ring.
1353 		 */
1354 		/* Add elements back into the cache */
1355 		uint32_t copied = 0;
1356 		/* n is multiple of 32 */
1357 		while (copied < n) {
1358 			const __m512i a = _mm512_loadu_si512(&txep[copied]);
1359 			const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
1360 			const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
1361 			const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);
1362 
1363 			_mm512_storeu_si512(&cache_objs[copied], a);
1364 			_mm512_storeu_si512(&cache_objs[copied + 8], b);
1365 			_mm512_storeu_si512(&cache_objs[copied + 16], c);
1366 			_mm512_storeu_si512(&cache_objs[copied + 24], d);
1367 			copied += 32;
1368 		}
1369 		cache->len += n;
1370 
1371 		if (cache->len >= cache->flushthresh) {
1372 			rte_mempool_ops_enqueue_bulk(mp,
1373 						     &cache->objs[cache->size],
1374 						     cache->len - cache->size);
1375 			cache->len = cache->size;
1376 		}
1377 		goto done;
1378 	}
1379 
1380 normal:
1381 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
1382 	if (likely(m)) {
1383 		free[0] = m;
1384 		nb_free = 1;
1385 		for (i = 1; i < n; i++) {
1386 			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
1387 			if (likely(m)) {
1388 				if (likely(m->pool == free[0]->pool)) {
1389 					free[nb_free++] = m;
1390 				} else {
1391 					rte_mempool_put_bulk(free[0]->pool,
1392 							     (void *)free,
1393 							     nb_free);
1394 					free[0] = m;
1395 					nb_free = 1;
1396 				}
1397 			}
1398 		}
1399 		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
1400 	} else {
1401 		for (i = 1; i < n; i++) {
1402 			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
1403 			if (m)
1404 				rte_mempool_put(m->pool, m);
1405 		}
1406 	}
1407 
1408 done:
1409 	/* buffers were freed, update counters */
1410 	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
1411 	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
1412 	if (txq->next_dd >= txq->nb_tx_desc)
1413 		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);
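	/* consume the rs_thresh RS completions that idpf_splitq_scan_cq_ring()
	 * counted for this queue
	 */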
1414 	txq->ctype[IDPF_TXD_COMPLT_RS] -= txq->rs_thresh;
1415 
1416 	return txq->rs_thresh;
1417 }
1418 
1419 #define IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S	48
1420 
1421 static __rte_always_inline void
1422 idpf_splitq_vtx1(volatile struct idpf_flex_tx_sched_desc *txdp,
1423 	  struct rte_mbuf *pkt, uint64_t flags)
1424 {
1425 	uint64_t high_qw =
1426 		(IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE |
1427 		 ((uint64_t)flags) |
1428 		 ((uint64_t)pkt->data_len << IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S));
1429 
1430 	__m128i descriptor = _mm_set_epi64x(high_qw,
1431 					    pkt->buf_iova + pkt->data_off);
1432 	_mm_storeu_si128((__m128i *)txdp, descriptor);
1433 }
1434 
1435 static __rte_always_inline void
1436 idpf_splitq_vtx(volatile struct idpf_flex_tx_sched_desc *txdp,
1437 	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
1438 {
1439 	const uint64_t hi_qw_tmpl = (IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE  |
1440 			((uint64_t)flags));
1441 
1442 	/* if unaligned on a 32-byte boundary, do one to align */
1443 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
1444 		idpf_splitq_vtx1(txdp, *pkt, flags);
1445 		nb_pkts--, txdp++, pkt++;
1446 	}
1447 
1448 	/* do 4 at a time while possible, in bursts */
1449 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
1450 		uint64_t hi_qw3 =
1451 			hi_qw_tmpl |
1452 			((uint64_t)pkt[3]->data_len <<
1453 			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
1454 		uint64_t hi_qw2 =
1455 			hi_qw_tmpl |
1456 			((uint64_t)pkt[2]->data_len <<
1457 			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
1458 		uint64_t hi_qw1 =
1459 			hi_qw_tmpl |
1460 			((uint64_t)pkt[1]->data_len <<
1461 			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
1462 		uint64_t hi_qw0 =
1463 			hi_qw_tmpl |
1464 			((uint64_t)pkt[0]->data_len <<
1465 			 IDPF_TXD_FLEX_QW1_TX_BUF_SZ_S);
1466 
1467 		__m512i desc0_3 =
1468 			_mm512_set_epi64
1469 				(hi_qw3,
1470 				 pkt[3]->buf_iova + pkt[3]->data_off,
1471 				 hi_qw2,
1472 				 pkt[2]->buf_iova + pkt[2]->data_off,
1473 				 hi_qw1,
1474 				 pkt[1]->buf_iova + pkt[1]->data_off,
1475 				 hi_qw0,
1476 				 pkt[0]->buf_iova + pkt[0]->data_off);
1477 		_mm512_storeu_si512((void *)txdp, desc0_3);
1478 	}
1479 
1480 	/* do any last ones */
1481 	while (nb_pkts) {
1482 		idpf_splitq_vtx1(txdp, *pkt, flags);
1483 		txdp++, pkt++, nb_pkts--;
1484 	}
1485 }
1486 
1487 static __rte_always_inline uint16_t
1488 idpf_splitq_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1489 					uint16_t nb_pkts)
1490 {
1491 	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
1492 	volatile struct idpf_flex_tx_sched_desc *txdp;
1493 	struct idpf_tx_vec_entry *txep;
1494 	uint16_t n, nb_commit, tx_id;
1495 	/* bit2 is reserved and must be set to 1 according to Spec */
1496 	uint64_t cmd_dtype = IDPF_TXD_FLEX_FLOW_CMD_EOP;
1497 
1498 	tx_id = txq->tx_tail;
1499 
1500 	/* crossing the rs_thresh boundary is not allowed */
1501 	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
1502 
1503 	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
1504 	if (unlikely(nb_pkts == 0))
1505 		return 0;
1506 
1507 	tx_id = txq->tx_tail;
1508 	txdp = &txq->desc_ring[tx_id];
1509 	txep = (void *)txq->sw_ring;
1510 	txep += tx_id;
1511 
1512 	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
1513 
1514 	n = (uint16_t)(txq->nb_tx_desc - tx_id);
1515 	if (nb_commit >= n) {
1516 		tx_backlog_entry_avx512(txep, tx_pkts, n);
1517 
1518 		idpf_splitq_vtx((void *)txdp, tx_pkts, n - 1, cmd_dtype);
1519 		tx_pkts += (n - 1);
1520 		txdp += (n - 1);
1521 
1522 		idpf_splitq_vtx1((void *)txdp, *tx_pkts++, cmd_dtype);
1523 
1524 		nb_commit = (uint16_t)(nb_commit - n);
1525 
1526 		tx_id = 0;
1527 		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
1528 
1529 		/* avoid reaching the end of the ring */
1530 		txdp = &txq->desc_ring[tx_id];
1531 		txep = (void *)txq->sw_ring;
1532 		txep += tx_id;
1533 	}
1534 
1535 	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
1536 
1537 	idpf_splitq_vtx((void *)txdp, tx_pkts, nb_commit, cmd_dtype);
1538 
1539 	tx_id = (uint16_t)(tx_id + nb_commit);
1540 	if (tx_id > txq->next_rs)
1541 		txq->next_rs =
1542 			(uint16_t)(txq->next_rs + txq->rs_thresh);
1543 
1544 	txq->tx_tail = tx_id;
1545 
1546 	IDPF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
1547 
1548 	return nb_pkts;
1549 }
1550 
1551 static __rte_always_inline uint16_t
1552 idpf_splitq_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
1553 				     uint16_t nb_pkts)
1554 {
1555 	struct idpf_tx_queue *txq = (struct idpf_tx_queue *)tx_queue;
1556 	uint16_t nb_tx = 0;
1557 
1558 	while (nb_pkts) {
1559 		uint16_t ret, num;
1560 
1561 		idpf_splitq_scan_cq_ring(txq->complq);
1562 
1563 		if (txq->ctype[IDPF_TXD_COMPLT_RS] > txq->free_thresh)
1564 			idpf_tx_splitq_free_bufs_avx512(txq);
1565 
1566 		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
1567 		ret = idpf_splitq_xmit_fixed_burst_vec_avx512(tx_queue,
1568 							      &tx_pkts[nb_tx],
1569 							      num);
1570 		nb_tx += ret;
1571 		nb_pkts -= ret;
1572 		if (ret < num)
1573 			break;
1574 	}
1575 
1576 	return nb_tx;
1577 }
1578 
1579 uint16_t
1580 idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1581 				uint16_t nb_pkts)
1582 {
1583 	return idpf_splitq_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts);
1584 }
1585 
1586 static inline void
1587 idpf_tx_release_mbufs_avx512(struct idpf_tx_queue *txq)
1588 {
1589 	unsigned int i;
1590 	const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1);
1591 	struct idpf_tx_vec_entry *swr = (void *)txq->sw_ring;
1592 
1593 	if (txq->sw_ring == NULL || txq->nb_free == max_desc)
1594 		return;
1595 
1596 	i = txq->next_dd - txq->rs_thresh + 1;
1597 	if (txq->tx_tail < i) {
1598 		for (; i < txq->nb_tx_desc; i++) {
1599 			rte_pktmbuf_free_seg(swr[i].mbuf);
1600 			swr[i].mbuf = NULL;
1601 		}
1602 		i = 0;
1603 	}
	/* also free the in-flight entries between i and tx_tail */
	for (; i < txq->tx_tail; i++) {
		rte_pktmbuf_free_seg(swr[i].mbuf);
		swr[i].mbuf = NULL;
	}
1604 }
1605 
1606 static const struct idpf_txq_ops avx512_tx_vec_ops = {
1607 	.release_mbufs = idpf_tx_release_mbufs_avx512,
1608 };
1609 
1610 int __rte_cold
1611 idpf_qc_tx_vec_avx512_setup(struct idpf_tx_queue *txq)
1612 {
1613 	if (!txq)
1614 		return 0;
1615 
1616 	txq->ops = &avx512_tx_vec_ops;
1617 	return 0;
1618 }
1619