xref: /dpdk/drivers/net/intel/iavf/iavf_rxtx_vec_avx512.c (revision 7662502d4c0344059903be75e9afa0ffe26865b3)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2020 Intel Corporation
3  */
4 
5 #include "iavf_rxtx_vec_common.h"
6 
7 #include <rte_vect.h>
8 
9 #define IAVF_DESCS_PER_LOOP_AVX 8
10 #define PKTLEN_SHIFT 10
11 
12 /******************************************************************************
13  * If the user knows that a specific offload is not enabled by the APP,
14  * the corresponding macro can be commented out to skip that work in the fast path.
15  * Currently the below 6 features are supported in the RX path:
16  * 1, checksum offload
17  * 2, VLAN/QINQ stripping
18  * 3, RSS hash
19  * 4, packet type analysis
20  * 5, flow director ID report
21  * 6, timestamp offload
22  ******************************************************************************/
23 #define IAVF_RX_CSUM_OFFLOAD
24 #define IAVF_RX_VLAN_OFFLOAD
25 #define IAVF_RX_RSS_OFFLOAD
26 #define IAVF_RX_PTYPE_OFFLOAD
27 #define IAVF_RX_FDIR_OFFLOAD
28 #define IAVF_RX_TS_OFFLOAD
29 
30 static __rte_always_inline void
31 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
32 {
33 	iavf_rxq_rearm_common(rxq, true);
34 }
35 
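/* 16-bit-lane blend mask for _mm512_mask_blend_epi16(): bits 7/15/23/31 select
 * the top 16-bit word of each 16B descriptor, where the packet length lands
 * after the PKTLEN_SHIFT left shift.
 */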
36 #define IAVF_RX_LEN_MASK 0x80808080
37 static __rte_always_inline uint16_t
38 _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
39 			       struct rte_mbuf **rx_pkts,
40 			       uint16_t nb_pkts, uint8_t *split_packet,
41 			       bool offload)
42 {
43 #ifdef IAVF_RX_PTYPE_OFFLOAD
44 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
45 #endif
46 
47 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
48 						    rxq->mbuf_initializer);
49 	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
50 	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
51 
52 	rte_prefetch0(rxdp);
53 
54 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
55 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
56 
57 	/* See if we need to rearm the RX queue - gives the prefetch a bit
58 	 * of time to act
59 	 */
60 	if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
61 		iavf_rxq_rearm(rxq);
62 
63 	/* Before we start moving massive data around, check to see if
64 	 * there is actually a packet available
65 	 */
66 	if (!(rxdp->wb.qword1.status_error_len &
67 	      rte_cpu_to_le_32(1 << IAVF_RX_DESC_STATUS_DD_SHIFT)))
68 		return 0;
69 
70 	/* constants used in processing loop */
71 	const __m512i crc_adjust =
72 		_mm512_set_epi32
73 			(/* 1st descriptor */
74 			 0,             /* ignore non-length fields */
75 			 -rxq->crc_len, /* sub crc on data_len */
76 			 -rxq->crc_len, /* sub crc on pkt_len */
77 			 0,             /* ignore pkt_type field */
78 			 /* 2nd descriptor */
79 			 0,             /* ignore non-length fields */
80 			 -rxq->crc_len, /* sub crc on data_len */
81 			 -rxq->crc_len, /* sub crc on pkt_len */
82 			 0,             /* ignore pkt_type field */
83 			 /* 3rd descriptor */
84 			 0,             /* ignore non-length fields */
85 			 -rxq->crc_len, /* sub crc on data_len */
86 			 -rxq->crc_len, /* sub crc on pkt_len */
87 			 0,             /* ignore pkt_type field */
88 			 /* 4th descriptor */
89 			 0,             /* ignore non-length fields */
90 			 -rxq->crc_len, /* sub crc on data_len */
91 			 -rxq->crc_len, /* sub crc on pkt_len */
92 			 0              /* ignore pkt_type field */
93 			);
94 
95 	/* 8 packets DD mask, LSB in each 32-bit value */
96 	const __m256i dd_check = _mm256_set1_epi32(1);
97 
98 	/* 8 packets EOP mask, second-LSB in each 32-bit value */
99 	const __m256i eop_check = _mm256_slli_epi32(dd_check,
100 			IAVF_RX_DESC_STATUS_EOF_SHIFT);
101 
102 	/* mask to shuffle from desc. to mbuf (4 descriptors) */
103 	const __m512i shuf_msk =
104 		_mm512_set_epi32
105 			(/* 1st descriptor */
106 			 0x07060504,    /* octet 4~7, 32bits rss */
107 			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
108 					/* octet 15~14, 16 bits data_len */
109 			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
110 					/* octet 15~14, low 16 bits pkt_len */
111 			 0xFFFFFFFF,    /* pkt_type set as unknown */
112 			 /* 2nd descriptor */
113 			 0x07060504,    /* octet 4~7, 32bits rss */
114 			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
115 					/* octet 15~14, 16 bits data_len */
116 			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
117 					/* octet 15~14, low 16 bits pkt_len */
118 			 0xFFFFFFFF,    /* pkt_type set as unknown */
119 			 /* 3rd descriptor */
120 			 0x07060504,    /* octet 4~7, 32bits rss */
121 			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
122 					/* octet 15~14, 16 bits data_len */
123 			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
124 					/* octet 15~14, low 16 bits pkt_len */
125 			 0xFFFFFFFF,    /* pkt_type set as unknown */
126 			 /* 4th descriptor */
127 			 0x07060504,    /* octet 4~7, 32bits rss */
128 			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
129 					/* octet 15~14, 16 bits data_len */
130 			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
131 					/* octet 15~14, low 16 bits pkt_len */
132 			 0xFFFFFFFF     /* pkt_type set as unknown */
133 			);
134 	/**
135 	 * compile-time check the above crc and shuffle layout is correct.
136 	 * NOTE: the first field (lowest address) is given last in set_epi
137 	 * calls above.
138 	 */
139 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
140 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
141 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
142 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
143 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
144 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
145 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
146 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
147 
148 	uint16_t i, received;
149 
150 	for (i = 0, received = 0; i < nb_pkts;
151 	     i += IAVF_DESCS_PER_LOOP_AVX,
152 	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
153 		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
154 		_mm256_storeu_si256((void *)&rx_pkts[i],
155 				    _mm256_loadu_si256((void *)&sw_ring[i]));
156 #ifdef RTE_ARCH_X86_64
157 		_mm256_storeu_si256
158 			((void *)&rx_pkts[i + 4],
159 			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
160 #endif
161 
162 		__m512i raw_desc0_3, raw_desc4_7;
163 		const __m128i raw_desc7 =
164 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7));
165 		rte_compiler_barrier();
166 		const __m128i raw_desc6 =
167 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6));
168 		rte_compiler_barrier();
169 		const __m128i raw_desc5 =
170 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5));
171 		rte_compiler_barrier();
172 		const __m128i raw_desc4 =
173 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4));
174 		rte_compiler_barrier();
175 		const __m128i raw_desc3 =
176 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
177 		rte_compiler_barrier();
178 		const __m128i raw_desc2 =
179 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
180 		rte_compiler_barrier();
181 		const __m128i raw_desc1 =
182 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
183 		rte_compiler_barrier();
184 		const __m128i raw_desc0 =
185 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0));
186 
187 		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
188 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
189 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
190 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
191 		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
192 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
193 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
194 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
195 
196 		if (split_packet) {
197 			int j;
198 
199 			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
200 				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
201 		}
202 
203 		/**
204 		 * convert descriptors 4-7 into mbufs, adjusting length and
205 		 * re-arranging fields. Then write into the mbuf
206 		 */
207 		const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7,
208 							 PKTLEN_SHIFT);
209 		const __m512i desc4_7 = _mm512_mask_blend_epi16(IAVF_RX_LEN_MASK,
210 								raw_desc4_7,
211 								len4_7);
212 		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
213 
214 		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
215 #ifdef IAVF_RX_PTYPE_OFFLOAD
216 		/**
217 		 * to get the packet types, shift the 64-bit values down by 30 bits
218 		 * so that the ptype sits in the lower 8 bits of each
219 		 */
220 		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 30);
221 		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
222 		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
223 		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24);
224 		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8);
225 		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24);
226 		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8);
227 
228 		const __m512i ptype4_7 = _mm512_set_epi32
229 			(0, 0, 0, type_table[ptype7],
230 			 0, 0, 0, type_table[ptype6],
231 			 0, 0, 0, type_table[ptype5],
232 			 0, 0, 0, type_table[ptype4]);
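		/* 0x1111 selects dword 0 of each 128-bit lane, i.e. the
		 * packet_type field of each mbuf's rx_descriptor_fields1.
		 */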
233 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
234 #endif
235 
236 		/**
237 		 * convert descriptors 0-3 into mbufs, adjusting length and
238 		 * re-arranging fields. Then write into the mbuf
239 		 */
240 		const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3,
241 							 PKTLEN_SHIFT);
242 		const __m512i desc0_3 = _mm512_mask_blend_epi16(IAVF_RX_LEN_MASK,
243 								raw_desc0_3,
244 								len0_3);
245 		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
246 
247 		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
248 #ifdef IAVF_RX_PTYPE_OFFLOAD
249 		/* get the packet types */
250 		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
251 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
252 		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
253 		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24);
254 		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8);
255 		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24);
256 		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8);
257 
258 		const __m512i ptype0_3 = _mm512_set_epi32
259 			(0, 0, 0, type_table[ptype3],
260 			 0, 0, 0, type_table[ptype2],
261 			 0, 0, 0, type_table[ptype1],
262 			 0, 0, 0, type_table[ptype0]);
263 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
264 #endif
265 
266 		/**
267 		 * use permute/extract to get status content
268 		 * After the operations, the packets status flags are in the
269 		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
270 		 */
271 		/* merge the status bits into one register */
272 		const __m512i status_permute_msk = _mm512_set_epi32
273 			(0, 0, 0, 0,
274 			 0, 0, 0, 0,
275 			 22, 30, 6, 14,
276 			 18, 26, 2, 10);
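		/* index values below 16 pick dwords from raw_desc4_7, values 16-31
		 * from raw_desc0_3; dword 2 of each 16B descriptor is the low half
		 * of qword1, which carries the status/error bits.
		 */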
277 		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
278 			(raw_desc4_7, status_permute_msk, raw_desc0_3);
279 		__m256i status0_7 = _mm512_extracti64x4_epi64
280 			(raw_status0_7, 0);
281 
282 		/* now do flag manipulation */
283 
284 		/* merge flags */
285 		__m256i mbuf_flags = _mm256_set1_epi32(0);
286 
287 		if (offload) {
288 #if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
289 			/* Status/Error flag masks */
290 			/**
291 			 * mask everything except the RSS, flow director and VLAN flags:
292 			 * bit 2 is the VLAN tag, bit 11 the flow director indication,
293 			 * bits 13:12 the RSS indication. Bits 3-5 of the error
294 			 * field (bits 22-24) report IP/L4 checksum errors.
295 			 */
296 			const __m256i flags_mask =
297 				_mm256_set1_epi32((1 << 2) | (1 << 11) |
298 						  (3 << 12) | (7 << 22));
299 #endif
300 
301 #ifdef IAVF_RX_VLAN_OFFLOAD
302 			/**
303 			 * data to be shuffled by the result of the flags mask. If the VLAN
304 			 * bit (bit 2) is set, then position 4 in this array will be used in
305 			 * the destination
306 			 */
307 			const __m256i vlan_flags_shuf =
308 				_mm256_set_epi32(0, 0, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 0,
309 						 0, 0, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 0);
310 #endif
311 
312 #ifdef IAVF_RX_RSS_OFFLOAD
313 			/**
314 			 * data to be shuffled by the result of the flags mask, shifted down by 11.
315 			 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
316 			 * place.
317 			 */
318 			const __m256i rss_flags_shuf =
319 				_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
320 						RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_FDIR, RTE_MBUF_F_RX_RSS_HASH,
321 						0, 0, 0, 0, RTE_MBUF_F_RX_FDIR, 0,/* end up 128-bits */
322 						0, 0, 0, 0, 0, 0, 0, 0,
323 						RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_FDIR, RTE_MBUF_F_RX_RSS_HASH,
324 						0, 0, 0, 0, RTE_MBUF_F_RX_FDIR, 0);
325 #endif
326 
327 #ifdef IAVF_RX_CSUM_OFFLOAD
328 			/**
329 			 * data to be shuffled by the result of the flags mask shifted by 22
330 			 * bits.  This gives us the l3_l4 flags.
331 			 */
332 			const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
333 					/* shift right 1 bit to make sure it does not exceed 255 */
334 					(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
335 					 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
336 					(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
337 					 RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
338 					(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
339 					(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1,
340 					(RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
341 					(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
342 					RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1,
343 					(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1,
344 					/* second 128-bits */
345 					0, 0, 0, 0, 0, 0, 0, 0,
346 					(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
347 					 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
348 					(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
349 					 RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
350 					(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
351 					(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1,
352 					(RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
353 					(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
354 					RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1,
355 					(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1);
356 
357 			const __m256i cksum_mask =
358 				_mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD |
359 						  RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
360 						  RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);
361 #endif
362 
363 #if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
364 			/* get only flag/error bits we want */
365 			const __m256i flag_bits =
366 				_mm256_and_si256(status0_7, flags_mask);
367 #endif
368 			/* set vlan and rss flags */
369 #ifdef IAVF_RX_VLAN_OFFLOAD
370 			const __m256i vlan_flags =
371 				_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
372 #endif
373 #ifdef IAVF_RX_RSS_OFFLOAD
374 			const __m256i rss_flags =
375 				_mm256_shuffle_epi8(rss_flags_shuf,
376 						    _mm256_srli_epi32(flag_bits, 11));
377 #endif
378 #ifdef IAVF_RX_CSUM_OFFLOAD
379 			/**
380 			 * l3_l4 error flags: shuffle, then shift to undo the adjustment
381 			 * made in l3_l4_flags_shuf, and finally mask out the extra bits
382 			 */
383 			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
384 							_mm256_srli_epi32(flag_bits, 22));
385 			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
386 			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
387 #endif
388 
389 #ifdef IAVF_RX_CSUM_OFFLOAD
390 			mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
391 #endif
392 #ifdef IAVF_RX_RSS_OFFLOAD
393 			mbuf_flags = _mm256_or_si256(mbuf_flags, rss_flags);
394 #endif
395 #ifdef IAVF_RX_VLAN_OFFLOAD
396 			mbuf_flags = _mm256_or_si256(mbuf_flags, vlan_flags);
397 #endif
398 		}
399 
400 		/**
401 		 * At this point, we have the 8 sets of flags in the low 16-bits
402 		 * of each 32-bit value in mbuf_flags.
403 		 * We want to extract these, and merge them with the mbuf init
404 		 * data so we can do a single write to the mbuf to set the flags
405 		 * and all the other initialization fields. Extracting the
406 		 * appropriate flags means that we have to do a shift and blend
407 		 * for each mbuf before we do the write. However, we can also
408 		 * add in the previously computed rx_descriptor fields to
409 		 * make a single 256-bit write per mbuf
410 		 */
411 		/* check the structure matches expectations */
412 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
413 				 offsetof(struct rte_mbuf, rearm_data) + 8);
414 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
415 				 RTE_ALIGN(offsetof(struct rte_mbuf,
416 						    rearm_data),
417 						    16));
418 		/* build up data and do writes */
419 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
420 			rearm6, rearm7;
421 		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
422 		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
423 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
424 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
425 
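		/* Each 256-bit rearm store below covers rearm_data (low 128 bits:
		 * mbuf_init with the low 32 bits of ol_flags blended into dword 2)
		 * plus rx_descriptor_fields1 (high 128 bits, taken from mb0_1..mb6_7).
		 */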
426 		if (offload) {
427 			rearm6 = _mm256_blend_epi32(mbuf_init,
428 						    _mm256_slli_si256(mbuf_flags, 8),
429 						    0x04);
430 			rearm4 = _mm256_blend_epi32(mbuf_init,
431 						    _mm256_slli_si256(mbuf_flags, 4),
432 						    0x04);
433 			rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
434 			rearm0 = _mm256_blend_epi32(mbuf_init,
435 						    _mm256_srli_si256(mbuf_flags, 4),
436 						    0x04);
437 			/* permute to add in the rx_descriptor e.g. rss fields */
438 			rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
439 			rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
440 			rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
441 			rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
442 		} else {
443 			rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
444 			rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
445 			rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
446 			rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
447 		}
448 		/* write to mbuf */
449 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
450 				    rearm6);
451 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
452 				    rearm4);
453 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
454 				    rearm2);
455 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
456 				    rearm0);
457 
458 		/* repeat for the odd mbufs */
459 		if (offload) {
460 			const __m256i odd_flags =
461 				_mm256_castsi128_si256
462 					(_mm256_extracti128_si256(mbuf_flags, 1));
463 			rearm7 = _mm256_blend_epi32(mbuf_init,
464 						    _mm256_slli_si256(odd_flags, 8),
465 						    0x04);
466 			rearm5 = _mm256_blend_epi32(mbuf_init,
467 						    _mm256_slli_si256(odd_flags, 4),
468 						    0x04);
469 			rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
470 			rearm1 = _mm256_blend_epi32(mbuf_init,
471 						    _mm256_srli_si256(odd_flags, 4),
472 						    0x04);
473 			/* since the odd mbufs' data is already in the high 128 bits, use blend */
474 			rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
475 			rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
476 			rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
477 			rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
478 		} else {
479 			rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
480 			rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
481 			rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
482 			rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
483 		}
484 		/* again write to mbufs */
485 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
486 				    rearm7);
487 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
488 				    rearm5);
489 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
490 				    rearm3);
491 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
492 				    rearm1);
493 
494 		/* extract and record EOP bit */
495 		if (split_packet) {
496 			const __m128i eop_mask =
497 				_mm_set1_epi16(1 << IAVF_RX_DESC_STATUS_EOF_SHIFT);
498 			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
499 								     eop_check);
500 			/* pack status bits into a single 128-bit register */
501 			const __m128i eop_bits =
502 				_mm_packus_epi32
503 					(_mm256_castsi256_si128(eop_bits256),
504 					 _mm256_extractf128_si256(eop_bits256,
505 								  1));
506 			/**
507 			 * flip bits, and mask out the EOP bit, which is now
508 			 * a split-packet bit, i.e. !EOP, rather than an EOP one.
509 			 */
510 			__m128i split_bits = _mm_andnot_si128(eop_bits,
511 							      eop_mask);
512 			/**
513 			 * eop bits are out of order, so we need to shuffle them
514 			 * back into order again. In doing so, only use low 8
515 			 * bits, which acts like another pack instruction
516 			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
517 			 * [Since we use epi8, the 16-bit positions are
518 			 * multiplied by 2 in the eop_shuffle value.]
519 			 */
520 			__m128i eop_shuffle =
521 				_mm_set_epi8(/* zero hi 64b */
522 					     0xFF, 0xFF, 0xFF, 0xFF,
523 					     0xFF, 0xFF, 0xFF, 0xFF,
524 					     /* move values to lo 64b */
525 					     8, 0, 10, 2,
526 					     12, 4, 14, 6);
527 			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
528 			*(uint64_t *)split_packet =
529 				_mm_cvtsi128_si64(split_bits);
530 			split_packet += IAVF_DESCS_PER_LOOP_AVX;
531 		}
532 
533 		/* perform dd_check */
534 		status0_7 = _mm256_and_si256(status0_7, dd_check);
535 		status0_7 = _mm256_packs_epi32(status0_7,
536 					       _mm256_setzero_si256());
537 
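		/* Count the DD bits: after the pack, each completed descriptor
		 * contributes exactly one set bit, so the popcount of both 64-bit
		 * halves gives the number of packets received in this iteration.
		 */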
538 		uint64_t burst = rte_popcount64
539 					(_mm_cvtsi128_si64
540 						(_mm256_extracti128_si256
541 							(status0_7, 1)));
542 		burst += rte_popcount64
543 				(_mm_cvtsi128_si64
544 					(_mm256_castsi256_si128(status0_7)));
545 		received += burst;
546 		if (burst != IAVF_DESCS_PER_LOOP_AVX)
547 			break;
548 	}
549 
550 	/* update tail pointers */
551 	rxq->rx_tail += received;
552 	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
553 	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
554 		rxq->rx_tail--;
555 		received--;
556 	}
557 	rxq->rxrearm_nb += received;
558 	return received;
559 }
560 
561 static __rte_always_inline __m256i
562 flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
563 {
564 #define FDID_MIS_MAGIC 0xFFFFFFFF
565 	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
566 	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
567 	const __m256i pkt_fdir_bit = _mm256_set1_epi32(RTE_MBUF_F_RX_FDIR |
568 						       RTE_MBUF_F_RX_FDIR_ID);
569 	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
570 	const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
571 	__m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
572 					       fdir_mis_mask);
573 	/* this XOR op inverts fdir_mask: lanes with a valid FDIR ID become all-ones */
574 	fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
575 	const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);
576 
577 	return fdir_flags;
578 }
579 
580 static __rte_always_inline uint16_t
581 _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
582 					struct rte_mbuf **rx_pkts,
583 					uint16_t nb_pkts,
584 					uint8_t *split_packet,
585 					bool offload)
586 {
587 	struct iavf_adapter *adapter = rxq->vsi->adapter;
588 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
589 	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
590 #endif
591 #ifdef IAVF_RX_PTYPE_OFFLOAD
592 	const uint32_t *type_table = adapter->ptype_tbl;
593 #endif
594 
595 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
596 						    rxq->mbuf_initializer);
597 	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
598 	volatile union iavf_rx_flex_desc *rxdp =
599 		(volatile union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
600 
601 	rte_prefetch0(rxdp);
602 
603 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
604 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
605 
606 	/* See if we need to rearm the RX queue - gives the prefetch a bit
607 	 * of time to act
608 	 */
609 	if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
610 		iavf_rxq_rearm(rxq);
611 
612 	/* Before we start moving massive data around, check to see if
613 	 * there is actually a packet available
614 	 */
615 	if (!(rxdp->wb.status_error0 &
616 	      rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
617 		return 0;
618 
619 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
620 #ifdef IAVF_RX_TS_OFFLOAD
621 	uint8_t inflection_point = 0;
622 	bool is_tsinit = false;
623 	__m256i hw_low_last = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (uint32_t)rxq->phc_time);
624 
625 	if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
626 		uint64_t sw_cur_time = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000);
627 
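		/* If more than 4 ms have passed since the PHC time was last
		 * refreshed, the cached low 32 bits of phc_time may be stale,
		 * so request a fresh PHC read for this burst.
		 */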
628 		if (unlikely(sw_cur_time - rxq->hw_time_update > 4)) {
629 			hw_low_last = _mm256_setzero_si256();
630 			is_tsinit = 1;
631 		} else {
632 			hw_low_last = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (uint32_t)rxq->phc_time);
633 		}
634 	}
635 #endif
636 #endif
637 
638 	/* constants used in processing loop */
639 	const __m512i crc_adjust =
640 		_mm512_set_epi32
641 			(/* 1st descriptor */
642 			 0,             /* ignore non-length fields */
643 			 -rxq->crc_len, /* sub crc on data_len */
644 			 -rxq->crc_len, /* sub crc on pkt_len */
645 			 0,             /* ignore pkt_type field */
646 			 /* 2nd descriptor */
647 			 0,             /* ignore non-length fields */
648 			 -rxq->crc_len, /* sub crc on data_len */
649 			 -rxq->crc_len, /* sub crc on pkt_len */
650 			 0,             /* ignore pkt_type field */
651 			 /* 3rd descriptor */
652 			 0,             /* ignore non-length fields */
653 			 -rxq->crc_len, /* sub crc on data_len */
654 			 -rxq->crc_len, /* sub crc on pkt_len */
655 			 0,             /* ignore pkt_type field */
656 			 /* 4th descriptor */
657 			 0,             /* ignore non-length fields */
658 			 -rxq->crc_len, /* sub crc on data_len */
659 			 -rxq->crc_len, /* sub crc on pkt_len */
660 			 0              /* ignore pkt_type field */
661 			);
662 
663 	/* 8 packets DD mask, LSB in each 32-bit value */
664 	const __m256i dd_check = _mm256_set1_epi32(1);
665 
666 	/* 8 packets EOP mask, second-LSB in each 32-bit value */
667 	const __m256i eop_check = _mm256_slli_epi32(dd_check,
668 			IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
669 
670 	/* mask to shuffle from desc. to mbuf (4 descriptors) */
671 	const __m512i shuf_msk =
672 		_mm512_set_epi32
673 			(/* 1st descriptor */
674 			 0xFFFFFFFF,    /* rss hash parsed separately */
675 			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
676 					/* octet 4~5, 16 bits data_len */
677 			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
678 					/* octet 4~5, 16 bits pkt_len */
679 			 0xFFFFFFFF,    /* pkt_type set as unknown */
680 			 /* 2nd descriptor */
681 			 0xFFFFFFFF,    /* rss hash parsed separately */
682 			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
683 					/* octet 4~5, 16 bits data_len */
684 			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
685 					/* octet 4~5, 16 bits pkt_len */
686 			 0xFFFFFFFF,    /* pkt_type set as unknown */
687 			 /* 3rd descriptor */
688 			 0xFFFFFFFF,    /* rss hash parsed separately */
689 			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
690 					/* octet 4~5, 16 bits data_len */
691 			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
692 					/* octet 4~5, 16 bits pkt_len */
693 			 0xFFFFFFFF,    /* pkt_type set as unknown */
694 			 /* 4th descriptor */
695 			 0xFFFFFFFF,    /* rss hash parsed separately */
696 			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
697 					/* octet 4~5, 16 bits data_len */
698 			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
699 					/* octet 4~5, 16 bits pkt_len */
700 			 0xFFFFFFFF     /* pkt_type set as unknown */
701 			);
702 	/**
703 	 * compile-time check the above crc and shuffle layout is correct.
704 	 * NOTE: the first field (lowest address) is given last in set_epi
705 	 * calls above.
706 	 */
707 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
708 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
709 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
710 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
711 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
712 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
713 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
714 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
715 
716 	uint16_t i, received;
717 
718 	for (i = 0, received = 0; i < nb_pkts;
719 	     i += IAVF_DESCS_PER_LOOP_AVX,
720 	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
721 		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
722 		_mm256_storeu_si256((void *)&rx_pkts[i],
723 				    _mm256_loadu_si256((void *)&sw_ring[i]));
724 #ifdef RTE_ARCH_X86_64
725 		_mm256_storeu_si256
726 			((void *)&rx_pkts[i + 4],
727 			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
728 #endif
729 
730 		__m512i raw_desc0_3, raw_desc4_7;
731 
732 		const __m128i raw_desc7 =
733 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 7));
734 		rte_compiler_barrier();
735 		const __m128i raw_desc6 =
736 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 6));
737 		rte_compiler_barrier();
738 		const __m128i raw_desc5 =
739 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 5));
740 		rte_compiler_barrier();
741 		const __m128i raw_desc4 =
742 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 4));
743 		rte_compiler_barrier();
744 		const __m128i raw_desc3 =
745 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 3));
746 		rte_compiler_barrier();
747 		const __m128i raw_desc2 =
748 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 2));
749 		rte_compiler_barrier();
750 		const __m128i raw_desc1 =
751 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 1));
752 		rte_compiler_barrier();
753 		const __m128i raw_desc0 =
754 			_mm_load_si128(RTE_CAST_PTR(const __m128i *, rxdp + 0));
755 
756 		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
757 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
758 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
759 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
760 		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
761 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
762 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
763 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
764 
765 		if (split_packet) {
766 			int j;
767 
768 			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
769 				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
770 		}
771 
772 		/**
773 		 * convert descriptors 4-7 into mbufs, re-arrange fields.
774 		 * Then write into the mbuf.
775 		 */
776 		__m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
777 
778 		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
779 #ifdef IAVF_RX_PTYPE_OFFLOAD
780 		/**
781 		 * to get the packet types: the ptype is located in bits 16-25
782 		 * of each 128-bit lane
783 		 */
784 		const __m512i ptype_mask =
785 			_mm512_set1_epi16(IAVF_RX_FLEX_DESC_PTYPE_M);
786 		const __m512i ptypes4_7 =
787 			_mm512_and_si512(raw_desc4_7, ptype_mask);
788 		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
789 		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
790 		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
791 		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
792 		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
793 		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
794 
795 		const __m512i ptype4_7 = _mm512_set_epi32
796 			(0, 0, 0, type_table[ptype7],
797 			 0, 0, 0, type_table[ptype6],
798 			 0, 0, 0, type_table[ptype5],
799 			 0, 0, 0, type_table[ptype4]);
800 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
801 #endif
802 
803 		/**
804 		 * convert descriptors 0-3 into mbufs, re-arrange fields.
805 		 * Then write into the mbuf.
806 		 */
807 		__m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);
808 
809 		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
810 #ifdef IAVF_RX_PTYPE_OFFLOAD
811 		/**
812 		 * to get the packet types: the ptype is located in bits 16-25
813 		 * of each 128-bit lane
814 		 */
815 		const __m512i ptypes0_3 =
816 			_mm512_and_si512(raw_desc0_3, ptype_mask);
817 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
818 		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
819 		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
820 		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
821 		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
822 		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);
823 
824 		const __m512i ptype0_3 = _mm512_set_epi32
825 			(0, 0, 0, type_table[ptype3],
826 			 0, 0, 0, type_table[ptype2],
827 			 0, 0, 0, type_table[ptype1],
828 			 0, 0, 0, type_table[ptype0]);
829 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
830 #endif
831 
832 		/**
833 		 * use permute/extract to get status content
834 		 * After the operations, the packets status flags are in the
835 		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
836 		 */
837 		/* merge the status bits into one register */
838 		const __m512i status_permute_msk = _mm512_set_epi32
839 			(0, 0, 0, 0,
840 			 0, 0, 0, 0,
841 			 22, 30, 6, 14,
842 			 18, 26, 2, 10);
843 		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
844 			(raw_desc4_7, status_permute_msk, raw_desc0_3);
845 		__m256i status0_7 = _mm512_extracti64x4_epi64
846 			(raw_status0_7, 0);
847 
848 		/* now do flag manipulation */
849 
850 		/* merge flags */
851 		__m256i mbuf_flags = _mm256_set1_epi32(0);
852 		__m256i vlan_flags = _mm256_setzero_si256();
853 
854 		if (offload) {
855 #if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
856 			/* Status/Error flag masks */
857 			/**
858 			 * mask everything except the checksum reports, the RSS indication
859 			 * and the VLAN indication.
860 			 * bits 7:4 report the IP/L4/outer checksum errors.
861 			 * bit 12 is the RSS indication.
862 			 * bit 13 is the VLAN indication.
863 			 */
864 			const __m256i flags_mask =
865 				 _mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13));
866 #endif
867 #ifdef IAVF_RX_CSUM_OFFLOAD
868 			/**
869 			 * data to be shuffled by the result of the flags mask shifted by 4
870 			 * bits.  This gives us the l3_l4 flags.
871 			 */
872 			const __m256i l3_l4_flags_shuf =
873 				_mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
874 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
875 				 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
876 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
877 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
878 				 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
879 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
880 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
881 				 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
882 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
883 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
884 				 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
885 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
886 				 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
887 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
888 				 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
889 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
890 				 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
891 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
892 				 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
893 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
894 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
895 				 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
896 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
897 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
898 				 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
899 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
900 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
901 				 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
902 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
903 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
904 				 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
905 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
906 				 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
907 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
908 				 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
909 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
910 				 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
911 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
912 				 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
913 				/**
914 				 * second 128-bits
915 				 * shift right 20 bits to use the low two bits to indicate
916 				 * outer checksum status
917 				 * shift right 1 bit to make sure it does not exceed 255
918 				 */
919 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
920 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
921 				 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
922 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
923 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
924 				 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
925 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
926 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
927 				 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
928 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
929 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
930 				 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
931 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
932 				 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
933 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
934 				 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
935 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
936 				 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
937 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
938 				 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
939 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
940 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
941 				 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
942 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
943 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
944 				 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
945 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
946 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
947 				 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
948 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
949 				 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
950 				 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
951 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
952 				 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
953 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
954 				 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
955 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
956 				 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
957 				(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 |
958 				 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
959 			const __m256i cksum_mask =
960 				 _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
961 						   RTE_MBUF_F_RX_L4_CKSUM_MASK |
962 						   RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
963 						   RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK);
964 #endif
965 #if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
966 			/**
967 			 * data to be shuffled by the result of the flags mask, shifted down by 12.
968 			 * If RSS(bit12)/VLAN(bit13) are set,
969 			 * shuffle moves appropriate flags in place.
970 			 */
971 			const __m256i rss_flags_shuf = _mm256_set_epi8
972 					(0, 0, 0, 0,
973 					 0, 0, 0, 0,
974 					 0, 0, 0, 0,
975 					 RTE_MBUF_F_RX_RSS_HASH, 0,
976 					 RTE_MBUF_F_RX_RSS_HASH, 0,
977 					 /* end up 128-bits */
978 					 0, 0, 0, 0,
979 					 0, 0, 0, 0,
980 					 0, 0, 0, 0,
981 					 RTE_MBUF_F_RX_RSS_HASH, 0,
982 					 RTE_MBUF_F_RX_RSS_HASH, 0);
983 
984 			const __m256i vlan_flags_shuf = _mm256_set_epi8
985 					(0, 0, 0, 0,
986 					 0, 0, 0, 0,
987 					 0, 0, 0, 0,
988 					 RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
989 					 RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
990 					 0, 0,
991 					 /* end up 128-bits */
992 					 0, 0, 0, 0,
993 					 0, 0, 0, 0,
994 					 0, 0, 0, 0,
995 					 RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
996 					 RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
997 					 0, 0);
998 #endif
999 
1000 #if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
1001 			/* get only flag/error bits we want */
1002 			const __m256i flag_bits =
1003 				_mm256_and_si256(status0_7, flags_mask);
1004 #endif
1005 #ifdef IAVF_RX_CSUM_OFFLOAD
1006 			/**
1007 			 * l3_l4 error flags: shuffle, then shift to undo the adjustment
1008 			 * made in l3_l4_flags_shuf, and finally mask out the extra bits
1009 			 */
1010 			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
1011 					_mm256_srli_epi32(flag_bits, 4));
1012 			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
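			/* The shuffle table stored the outer-L4 status as
			 * RTE_MBUF_F_RX_OUTER_L4_CKSUM_* >> 20 (then >> 1 with the rest);
			 * after the << 1 above it sits in bits 2:1, so isolate those two
			 * bits and shift them back up by 20 to their real flag positions.
			 */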
1013 			__m256i l4_outer_mask = _mm256_set1_epi32(0x6);
1014 			__m256i l4_outer_flags =
1015 					_mm256_and_si256(l3_l4_flags, l4_outer_mask);
1016 			l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);
1017 
1018 			__m256i l3_l4_mask = _mm256_set1_epi32(~0x6);
1019 
1020 			l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);
1021 			l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);
1022 			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
1023 #endif
1024 #if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
1025 			/* set rss and vlan flags */
1026 			const __m256i rss_vlan_flag_bits =
1027 				_mm256_srli_epi32(flag_bits, 12);
1028 			const __m256i rss_flags =
1029 				_mm256_shuffle_epi8(rss_flags_shuf,
1030 						    rss_vlan_flag_bits);
1031 
1032 			if (rxq->rx_flags == IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG1)
1033 				vlan_flags =
1034 					_mm256_shuffle_epi8(vlan_flags_shuf,
1035 							    rss_vlan_flag_bits);
1036 
1037 			const __m256i rss_vlan_flags =
1038 				_mm256_or_si256(rss_flags, vlan_flags);
1039 
1040 #endif
1041 
1042 #ifdef IAVF_RX_CSUM_OFFLOAD
1043 			mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
1044 #endif
1045 #if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
1046 			mbuf_flags = _mm256_or_si256(mbuf_flags, rss_vlan_flags);
1047 #endif
1048 		}
1049 
1050 #ifdef IAVF_RX_FDIR_OFFLOAD
1051 		if (rxq->fdir_enabled) {
1052 			const __m512i fdir_permute_mask = _mm512_set_epi32
1053 				(0, 0, 0, 0,
1054 				 0, 0, 0, 0,
1055 				 7, 15, 23, 31,
1056 				 3, 11, 19, 27);
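			/* Gather dword 3 (the FDIR flow ID field) of every descriptor:
			 * index values below 16 come from raw_desc0_3, 16-31 from
			 * raw_desc4_7. The resulting lane order is why the scalar
			 * extracts below are not sequential.
			 */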
1057 			__m512i fdir_tmp = _mm512_permutex2var_epi32
1058 				(raw_desc0_3, fdir_permute_mask, raw_desc4_7);
1059 			const __m256i fdir_id0_7 = _mm512_extracti64x4_epi64
1060 				(fdir_tmp, 0);
1061 			const __m256i fdir_flags =
1062 				flex_rxd_to_fdir_flags_vec_avx512(fdir_id0_7);
1063 
1064 			/* merge with fdir_flags */
1065 			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags);
1066 
1067 			/* write to mbuf: have to use scalar store here */
1068 			rx_pkts[i + 0]->hash.fdir.hi =
1069 				_mm256_extract_epi32(fdir_id0_7, 3);
1070 
1071 			rx_pkts[i + 1]->hash.fdir.hi =
1072 				_mm256_extract_epi32(fdir_id0_7, 7);
1073 
1074 			rx_pkts[i + 2]->hash.fdir.hi =
1075 				_mm256_extract_epi32(fdir_id0_7, 2);
1076 
1077 			rx_pkts[i + 3]->hash.fdir.hi =
1078 				_mm256_extract_epi32(fdir_id0_7, 6);
1079 
1080 			rx_pkts[i + 4]->hash.fdir.hi =
1081 				_mm256_extract_epi32(fdir_id0_7, 1);
1082 
1083 			rx_pkts[i + 5]->hash.fdir.hi =
1084 				_mm256_extract_epi32(fdir_id0_7, 5);
1085 
1086 			rx_pkts[i + 6]->hash.fdir.hi =
1087 				_mm256_extract_epi32(fdir_id0_7, 0);
1088 
1089 			rx_pkts[i + 7]->hash.fdir.hi =
1090 				_mm256_extract_epi32(fdir_id0_7, 4);
1091 		} /* if() on fdir_enabled */
1092 #endif
1093 
1094 		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
1095 		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
1096 		__m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
1097 		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
1098 
1099 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
1100 		if (offload) {
1101 #if defined(IAVF_RX_RSS_OFFLOAD) || defined(IAVF_RX_TS_OFFLOAD)
1102 			/**
1103 			 * needs to load the 2nd 16B of each descriptor for RSS hash parsing;
1104 			 * getting into this path will cause a performance drop.
1105 			 */
1106 			if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH ||
1107 				offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP ||
1108 			    rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) {
1109 				/* load bottom half of every 32B desc */
1110 				const __m128i raw_desc_bh7 =
1111 					_mm_load_si128
1112 					(RTE_CAST_PTR(const __m128i *, &rxdp[7].wb.status_error1));
1113 				rte_compiler_barrier();
1114 				const __m128i raw_desc_bh6 =
1115 					_mm_load_si128
1116 					(RTE_CAST_PTR(const __m128i *, &rxdp[6].wb.status_error1));
1117 				rte_compiler_barrier();
1118 				const __m128i raw_desc_bh5 =
1119 					_mm_load_si128
1120 					(RTE_CAST_PTR(const __m128i *, &rxdp[5].wb.status_error1));
1121 				rte_compiler_barrier();
1122 				const __m128i raw_desc_bh4 =
1123 					_mm_load_si128
1124 					(RTE_CAST_PTR(const __m128i *, &rxdp[4].wb.status_error1));
1125 				rte_compiler_barrier();
1126 				const __m128i raw_desc_bh3 =
1127 					_mm_load_si128
1128 					(RTE_CAST_PTR(const __m128i *, &rxdp[3].wb.status_error1));
1129 				rte_compiler_barrier();
1130 				const __m128i raw_desc_bh2 =
1131 					_mm_load_si128
1132 					(RTE_CAST_PTR(const __m128i *, &rxdp[2].wb.status_error1));
1133 				rte_compiler_barrier();
1134 				const __m128i raw_desc_bh1 =
1135 					_mm_load_si128
1136 					(RTE_CAST_PTR(const __m128i *, &rxdp[1].wb.status_error1));
1137 				rte_compiler_barrier();
1138 				const __m128i raw_desc_bh0 =
1139 					_mm_load_si128
1140 					(RTE_CAST_PTR(const __m128i *, &rxdp[0].wb.status_error1));
1141 
1142 				__m256i raw_desc_bh6_7 =
1143 					_mm256_inserti128_si256
1144 						(_mm256_castsi128_si256(raw_desc_bh6),
1145 						 raw_desc_bh7, 1);
1146 				__m256i raw_desc_bh4_5 =
1147 					_mm256_inserti128_si256
1148 						(_mm256_castsi128_si256(raw_desc_bh4),
1149 						 raw_desc_bh5, 1);
1150 				__m256i raw_desc_bh2_3 =
1151 					_mm256_inserti128_si256
1152 						(_mm256_castsi128_si256(raw_desc_bh2),
1153 						 raw_desc_bh3, 1);
1154 				__m256i raw_desc_bh0_1 =
1155 					_mm256_inserti128_si256
1156 						(_mm256_castsi128_si256(raw_desc_bh0),
1157 						 raw_desc_bh1, 1);
1158 
1159 #ifdef IAVF_RX_RSS_OFFLOAD
1160 				if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH) {
1161 					/**
1162 					 * shift the 32-bit RSS hash value to the
1163 					 * highest 32 bits of each 128-bit lane before masking
1164 					 */
1165 					__m256i rss_hash6_7 =
1166 						_mm256_slli_epi64
1167 						(raw_desc_bh6_7, 32);
1168 					__m256i rss_hash4_5 =
1169 						_mm256_slli_epi64
1170 						(raw_desc_bh4_5, 32);
1171 					__m256i rss_hash2_3 =
1172 						_mm256_slli_epi64
1173 						(raw_desc_bh2_3, 32);
1174 					__m256i rss_hash0_1 =
1175 						_mm256_slli_epi64
1176 						(raw_desc_bh0_1, 32);
1177 
1178 					const __m256i rss_hash_msk =
1179 						_mm256_set_epi32
1180 						(0xFFFFFFFF, 0, 0, 0,
1181 						 0xFFFFFFFF, 0, 0, 0);
1182 
1183 					rss_hash6_7 = _mm256_and_si256
1184 						(rss_hash6_7, rss_hash_msk);
1185 					rss_hash4_5 = _mm256_and_si256
1186 						(rss_hash4_5, rss_hash_msk);
1187 					rss_hash2_3 = _mm256_and_si256
1188 						(rss_hash2_3, rss_hash_msk);
1189 					rss_hash0_1 = _mm256_and_si256
1190 						(rss_hash0_1, rss_hash_msk);
1191 
1192 					mb6_7 = _mm256_or_si256
1193 						(mb6_7, rss_hash6_7);
1194 					mb4_5 = _mm256_or_si256
1195 						(mb4_5, rss_hash4_5);
1196 					mb2_3 = _mm256_or_si256
1197 						(mb2_3, rss_hash2_3);
1198 					mb0_1 = _mm256_or_si256
1199 						(mb0_1, rss_hash0_1);
1200 				}
1201 
1202 				if (rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) {
1203 					/* merge the status/error-1 bits into one register */
1204 					const __m256i status1_4_7 =
1205 						_mm256_unpacklo_epi32
1206 						(raw_desc_bh6_7,
1207 						 raw_desc_bh4_5);
1208 					const __m256i status1_0_3 =
1209 						_mm256_unpacklo_epi32
1210 						(raw_desc_bh2_3,
1211 						 raw_desc_bh0_1);
1212 
1213 					const __m256i status1_0_7 =
1214 						_mm256_unpacklo_epi64
1215 						(status1_4_7, status1_0_3);
1216 
1217 					const __m256i l2tag2p_flag_mask =
1218 						_mm256_set1_epi32
1219 						(1 << IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S);
1220 
1221 					__m256i l2tag2p_flag_bits =
1222 						_mm256_and_si256
1223 						(status1_0_7,
1224 						 l2tag2p_flag_mask);
1225 
1226 					l2tag2p_flag_bits =
1227 						_mm256_srli_epi32
1228 						(l2tag2p_flag_bits,
1229 						 IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S);
1230 
1231 					const __m256i l2tag2_flags_shuf =
1232 						_mm256_set_epi8
1233 							(0, 0, 0, 0,
1234 							 0, 0, 0, 0,
1235 							 0, 0, 0, 0,
1236 							 0, 0,
1237 							 RTE_MBUF_F_RX_VLAN |
1238 							 RTE_MBUF_F_RX_VLAN_STRIPPED,
1239 							 0,
1240 							 /* end up 128-bits */
1241 							 0, 0, 0, 0,
1242 							 0, 0, 0, 0,
1243 							 0, 0, 0, 0,
1244 							 0, 0,
1245 							 RTE_MBUF_F_RX_VLAN |
1246 							 RTE_MBUF_F_RX_VLAN_STRIPPED,
1247 							 0);
1248 
1249 					vlan_flags =
1250 						_mm256_shuffle_epi8
1251 							(l2tag2_flags_shuf,
1252 							 l2tag2p_flag_bits);
1253 
1254 					/* merge with vlan_flags */
1255 					mbuf_flags = _mm256_or_si256
1256 							(mbuf_flags,
1257 							 vlan_flags);
1258 
1259 					/* L2TAG2_2 */
1260 					__m256i vlan_tci6_7 =
1261 						_mm256_slli_si256
1262 							(raw_desc_bh6_7, 4);
1263 					__m256i vlan_tci4_5 =
1264 						_mm256_slli_si256
1265 							(raw_desc_bh4_5, 4);
1266 					__m256i vlan_tci2_3 =
1267 						_mm256_slli_si256
1268 							(raw_desc_bh2_3, 4);
1269 					__m256i vlan_tci0_1 =
1270 						_mm256_slli_si256
1271 							(raw_desc_bh0_1, 4);
1272 
1273 					const __m256i vlan_tci_msk =
1274 						_mm256_set_epi32
1275 						(0, 0xFFFF0000, 0, 0,
1276 						 0, 0xFFFF0000, 0, 0);
1277 
1278 					vlan_tci6_7 = _mm256_and_si256
1279 							(vlan_tci6_7,
1280 							 vlan_tci_msk);
1281 					vlan_tci4_5 = _mm256_and_si256
1282 							(vlan_tci4_5,
1283 							 vlan_tci_msk);
1284 					vlan_tci2_3 = _mm256_and_si256
1285 							(vlan_tci2_3,
1286 							 vlan_tci_msk);
1287 					vlan_tci0_1 = _mm256_and_si256
1288 							(vlan_tci0_1,
1289 							 vlan_tci_msk);
1290 
1291 					mb6_7 = _mm256_or_si256
1292 							(mb6_7, vlan_tci6_7);
1293 					mb4_5 = _mm256_or_si256
1294 							(mb4_5, vlan_tci4_5);
1295 					mb2_3 = _mm256_or_si256
1296 							(mb2_3, vlan_tci2_3);
1297 					mb0_1 = _mm256_or_si256
1298 							(mb0_1, vlan_tci0_1);
1299 				}
1300 #endif /* IAVF_RX_RSS_OFFLOAD */
1301 
1302 #ifdef IAVF_RX_TS_OFFLOAD
1303 				if (offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
1304 					uint32_t mask = 0xFFFFFFFF;
1305 					__m256i ts;
1306 					__m256i ts_low = _mm256_setzero_si256();
1307 					__m256i ts_low1;
1308 					__m256i ts_low2;
1309 					__m256i max_ret;
1310 					__m256i cmp_ret;
1311 					uint8_t ret = 0;
1312 					uint8_t shift = 8;
1313 					__m256i ts_desp_mask = _mm256_set_epi32(mask, 0, 0, 0, mask, 0, 0, 0);
1314 					__m256i cmp_mask = _mm256_set1_epi32(mask);
1315 					__m256i ts_permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
1316 
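					/* Gather the low 32 bits of each packet's timestamp
					 * (the last dword of the descriptor bottom half) and
					 * permute them into packet order in ts_low1; ts_low2
					 * holds each packet's predecessor, with lane 0 taken
					 * from the previous burst via hw_low_last.
					 */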
1317 					ts = _mm256_and_si256(raw_desc_bh0_1, ts_desp_mask);
1318 					ts_low = _mm256_or_si256(ts_low, _mm256_srli_si256(ts, 3 * 4));
1319 					ts = _mm256_and_si256(raw_desc_bh2_3, ts_desp_mask);
1320 					ts_low = _mm256_or_si256(ts_low, _mm256_srli_si256(ts, 2 * 4));
1321 					ts = _mm256_and_si256(raw_desc_bh4_5, ts_desp_mask);
1322 					ts_low = _mm256_or_si256(ts_low, _mm256_srli_si256(ts, 4));
1323 					ts = _mm256_and_si256(raw_desc_bh6_7, ts_desp_mask);
1324 					ts_low = _mm256_or_si256(ts_low, ts);
1325 
1326 					ts_low1 = _mm256_permutevar8x32_epi32(ts_low, ts_permute_mask);
1327 					ts_low2 = _mm256_permutevar8x32_epi32(ts_low1,
1328 								_mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7));
1329 					ts_low2 = _mm256_and_si256(ts_low2,
1330 								_mm256_set_epi32(mask, mask, mask, mask, mask, mask, mask, 0));
1331 					ts_low2 = _mm256_or_si256(ts_low2, hw_low_last);
1332 					hw_low_last = _mm256_and_si256(ts_low1,
1333 								_mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, mask));
1334 
1335 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 0],
1336 						iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 0);
1337 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 1],
1338 						iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 1);
1339 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 2],
1340 						iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 2);
1341 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 3],
1342 						iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 3);
1343 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 4],
1344 						iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 4);
1345 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 5],
1346 						iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 5);
1347 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 6],
1348 						iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 6);
1349 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 7],
1350 						iavf_timestamp_dynfield_offset, uint32_t *) = _mm256_extract_epi32(ts_low1, 7);
1351 
1352 					if (unlikely(is_tsinit)) {
1353 						uint32_t in_timestamp;
1354 
1355 						if (iavf_get_phc_time(rxq))
1356 							PMD_DRV_LOG(ERR, "get physical time failed");
1357 						in_timestamp = *RTE_MBUF_DYNFIELD(rx_pkts[i + 0],
1358 										iavf_timestamp_dynfield_offset, uint32_t *);
1359 						rxq->phc_time = iavf_tstamp_convert_32b_64b(rxq->phc_time, in_timestamp);
1360 					}
1361 
1362 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 0],
1363 						iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32);
1364 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 1],
1365 						iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32);
1366 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 2],
1367 						iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32);
1368 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 3],
1369 						iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32);
1370 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 4],
1371 						iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32);
1372 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 5],
1373 						iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32);
1374 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 6],
1375 						iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32);
1376 					*RTE_MBUF_DYNFIELD(rx_pkts[i + 7],
1377 						iavf_timestamp_dynfield_offset + 4, uint32_t *) = (uint32_t)(rxq->phc_time >> 32);
1378 
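					/* Detect a 32-bit timestamp rollover within this burst:
					 * a lane whose timestamp is smaller than its predecessor
					 * marks the wrap, and the loop below binary-searches
					 * cmp_ret to record that lane index in inflection_point.
					 */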
1379 					max_ret = _mm256_max_epu32(ts_low2, ts_low1);
1380 					cmp_ret = _mm256_andnot_si256(_mm256_cmpeq_epi32(max_ret, ts_low1), cmp_mask);
1381 
1382 					if (_mm256_testz_si256(cmp_ret, cmp_mask)) {
1383 						inflection_point = 0;
1384 					} else {
1385 						inflection_point = 1;
1386 						while (shift > 1) {
1387 							shift = shift >> 1;
1388 							__m256i mask_low = _mm256_setzero_si256();
1389 							__m256i mask_high = _mm256_setzero_si256();
1390 							switch (shift) {
1391 							case 4:
1392 								mask_low = _mm256_set_epi32(0, 0, 0, 0, mask, mask, mask, mask);
1393 								mask_high = _mm256_set_epi32(mask, mask, mask, mask, 0, 0, 0, 0);
1394 								break;
1395 							case 2:
1396 								mask_low = _mm256_srli_si256(cmp_mask, 2 * 4);
1397 								mask_high = _mm256_slli_si256(cmp_mask, 2 * 4);
1398 								break;
1399 							case 1:
1400 								mask_low = _mm256_srli_si256(cmp_mask, 1 * 4);
1401 								mask_high = _mm256_slli_si256(cmp_mask, 1 * 4);
1402 								break;
1403 							}
1404 							ret = _mm256_testz_si256(cmp_ret, mask_low);
1405 							if (ret) {
1406 								ret = _mm256_testz_si256(cmp_ret, mask_high);
1407 								inflection_point += ret ? 0 : shift;
1408 								cmp_mask = mask_high;
1409 							} else {
1410 								cmp_mask = mask_low;
1411 							}
1412 						}
1413 					}
1414 					mbuf_flags = _mm256_or_si256(mbuf_flags,
1415 						_mm256_set1_epi32(iavf_timestamp_dynflag));
1416 				}
1417 #endif /* IAVF_RX_TS_OFFLOAD */
1418 			} /* if() on RSS hash or RX timestamp parsing */
1419 #endif
1420 		}
1421 #endif
1422 
1423 		/**
1424 		 * At this point, we have the 8 sets of flags in the low 16 bits
1425 		 * of each 32-bit value in mbuf_flags.
1426 		 * We want to extract these, and merge them with the mbuf init
1427 		 * data so we can do a single write to the mbuf to set the flags
1428 		 * and all the other initialization fields. Extracting the
1429 		 * appropriate flags means that we have to do a shift and blend
1430 		 * for each mbuf before we do the write. However, we can also
1431 		 * add in the previously computed rx_descriptor fields to
1432 		 * make a single 256-bit write per mbuf.
1433 		 */
1434 		/* check the structure matches expectations */
1435 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
1436 				 offsetof(struct rte_mbuf, rearm_data) + 8);
1437 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
1438 				 RTE_ALIGN(offsetof(struct rte_mbuf,
1439 						    rearm_data),
1440 						    16));
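		/*
		 * Each rearm store below is a single 32-byte write per mbuf,
		 * roughly laid out as follows (see the offset asserts above):
		 *   bytes  0..7  - rearm_data (data_off/refcnt/nb_segs/port)
		 *                  from mbuf_init
		 *   bytes  8..15 - ol_flags (low 32 bits blended in from
		 *                  mbuf_flags, the rest from mbuf_init)
		 *   bytes 16..31 - descriptor-derived fields (ptype, lengths,
		 *                  vlan, RSS hash) from mb0_1..mb6_7
		 */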
1441 		/* build up data and do writes */
1442 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
1443 			rearm6, rearm7;
1444 		rearm6 = _mm256_blend_epi32(mbuf_init,
1445 					    _mm256_slli_si256(mbuf_flags, 8),
1446 					    0x04);
1447 		rearm4 = _mm256_blend_epi32(mbuf_init,
1448 					    _mm256_slli_si256(mbuf_flags, 4),
1449 					    0x04);
1450 		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
1451 		rearm0 = _mm256_blend_epi32(mbuf_init,
1452 					    _mm256_srli_si256(mbuf_flags, 4),
1453 					    0x04);
1454 		/* permute to add in the rx_descriptor e.g. rss fields */
1455 		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
1456 		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
1457 		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
1458 		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
1459 		/* write to mbuf */
1460 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
1461 				    rearm6);
1462 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
1463 				    rearm4);
1464 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
1465 				    rearm2);
1466 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
1467 				    rearm0);
1468 
1469 		/* repeat for the odd mbufs */
1470 		const __m256i odd_flags =
1471 			_mm256_castsi128_si256
1472 				(_mm256_extracti128_si256(mbuf_flags, 1));
1473 		rearm7 = _mm256_blend_epi32(mbuf_init,
1474 					    _mm256_slli_si256(odd_flags, 8),
1475 					    0x04);
1476 		rearm5 = _mm256_blend_epi32(mbuf_init,
1477 					    _mm256_slli_si256(odd_flags, 4),
1478 					    0x04);
1479 		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
1480 		rearm1 = _mm256_blend_epi32(mbuf_init,
1481 					    _mm256_srli_si256(odd_flags, 4),
1482 					    0x04);
1483 		/* since odd mbufs are already in the high 128 bits, use blend */
1484 		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
1485 		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
1486 		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
1487 		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
1488 		/* again write to mbufs */
1489 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
1490 				    rearm7);
1491 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
1492 				    rearm5);
1493 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
1494 				    rearm3);
1495 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
1496 				    rearm1);
1497 
1498 		/* extract and record EOP bit */
1499 		if (split_packet) {
1500 			const __m128i eop_mask =
1501 				_mm_set1_epi16(1 <<
1502 					       IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
1503 			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
1504 								     eop_check);
1505 			/* pack status bits into a single 128-bit register */
1506 			const __m128i eop_bits =
1507 				_mm_packus_epi32
1508 					(_mm256_castsi256_si128(eop_bits256),
1509 					 _mm256_extractf128_si256(eop_bits256,
1510 								  1));
1511 			/**
1512 			 * invert the bits and keep only the EOP bit position,
1513 			 * so each bit is now a split-packet indicator (!EOP)
1514 			 * rather than an EOP bit.
1515 			 */
1515 			__m128i split_bits = _mm_andnot_si128(eop_bits,
1516 							      eop_mask);
1517 			/**
1518 			 * The EOP bits are out of order, so shuffle them back
1519 			 * into packet order. Only the low 8 bits of each 16-bit
1520 			 * value are used, which acts like another pack
1521 			 * instruction. The original order is (hi->lo): 1,3,5,7,0,2,4,6
1522 			 * [Since we use epi8, the 16-bit positions are
1523 			 * multiplied by 2 in the eop_shuffle value.]
1524 			 */
1525 			__m128i eop_shuffle =
1526 				_mm_set_epi8(/* zero hi 64b */
1527 					     0xFF, 0xFF, 0xFF, 0xFF,
1528 					     0xFF, 0xFF, 0xFF, 0xFF,
1529 					     /* move values to lo 64b */
1530 					     8, 0, 10, 2,
1531 					     12, 4, 14, 6);
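			/*
			 * After the shuffle below, the low 8 bytes hold one
			 * split flag per packet in packet order: output byte 0
			 * <- input byte 6 (packet 0), byte 1 <- byte 14
			 * (packet 1), ..., byte 7 <- byte 8 (packet 7), so the
			 * 64-bit store records packets 0..7 directly.
			 */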
1532 			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
1533 			*(uint64_t *)split_packet =
1534 				_mm_cvtsi128_si64(split_bits);
1535 			split_packet += IAVF_DESCS_PER_LOOP_AVX;
1536 		}
1537 
1538 		/* perform dd_check */
1539 		status0_7 = _mm256_and_si256(status0_7, dd_check);
1540 		status0_7 = _mm256_packs_epi32(status0_7,
1541 					       _mm256_setzero_si256());
1542 
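		/*
		 * After the dd_check mask and pack above, each completed
		 * descriptor contributes exactly one set bit, so the two
		 * popcounts below sum to the number of descriptors (and hence
		 * packets) received in this iteration, 0..8.
		 */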
1543 		uint64_t burst = rte_popcount64
1544 					(_mm_cvtsi128_si64
1545 						(_mm256_extracti128_si256
1546 							(status0_7, 1)));
1547 		burst += rte_popcount64
1548 				(_mm_cvtsi128_si64
1549 					(_mm256_castsi256_si128(status0_7)));
1550 		received += burst;
1551 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
1552 #ifdef IAVF_RX_TS_OFFLOAD
1553 		if (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
1554 			inflection_point = (inflection_point <= burst) ? inflection_point : 0;
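			/*
			 * An inflection point of N (1..8) means the low 32-bit
			 * timestamp wrapped at packet N-1 of this group: the
			 * fallthrough ladder bumps the upper 32 bits of that
			 * packet and every later one, and the cached phc_time
			 * is advanced by 2^32 so subsequent packets start from
			 * the new epoch.
			 */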
1555 			switch (inflection_point) {
1556 			case 1:
1557 				*RTE_MBUF_DYNFIELD(rx_pkts[i + 0],
1558 					iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1;
1559 				/* fallthrough */
1560 			case 2:
1561 				*RTE_MBUF_DYNFIELD(rx_pkts[i + 1],
1562 					iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1;
1563 				/* fallthrough */
1564 			case 3:
1565 				*RTE_MBUF_DYNFIELD(rx_pkts[i + 2],
1566 					iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1;
1567 				/* fallthrough */
1568 			case 4:
1569 				*RTE_MBUF_DYNFIELD(rx_pkts[i + 3],
1570 					iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1;
1571 				/* fallthrough */
1572 			case 5:
1573 				*RTE_MBUF_DYNFIELD(rx_pkts[i + 4],
1574 					iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1;
1575 				/* fallthrough */
1576 			case 6:
1577 				*RTE_MBUF_DYNFIELD(rx_pkts[i + 5],
1578 					iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1;
1579 				/* fallthrough */
1580 			case 7:
1581 				*RTE_MBUF_DYNFIELD(rx_pkts[i + 6],
1582 					iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1;
1583 				/* fallthrough */
1584 			case 8:
1585 				*RTE_MBUF_DYNFIELD(rx_pkts[i + 7],
1586 					iavf_timestamp_dynfield_offset + 4, uint32_t *) += 1;
1587 				rxq->phc_time += (uint64_t)1 << 32;
1588 				/* fallthrough */
1589 			case 0:
1590 				break;
1591 			default:
1592 				PMD_DRV_LOG(ERR, "invalid inflection point for rx timestamp");
1593 				break;
1594 			}
1595 
1596 			rxq->hw_time_update = rte_get_timer_cycles() / (rte_get_timer_hz() / 1000);
1597 		}
1598 #endif
1599 #endif
1600 		if (burst != IAVF_DESCS_PER_LOOP_AVX)
1601 			break;
1602 	}
1603 
1604 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
1605 #ifdef IAVF_RX_TS_OFFLOAD
1606 	if (received > 0 && (rxq->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP))
1607 		rxq->phc_time = *RTE_MBUF_DYNFIELD(rx_pkts[received - 1],
1608 			iavf_timestamp_dynfield_offset, rte_mbuf_timestamp_t *);
1609 #endif
1610 #endif
1611 
1612 	/* update tail pointers */
1613 	rxq->rx_tail += received;
1614 	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
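	/*
	 * If an odd number of descriptors was consumed, hand the last one
	 * back so that rx_tail stays even; the main loop reads the ring in
	 * descriptor pairs and relies on this 2-descriptor alignment.
	 */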
1615 	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
1616 		rxq->rx_tail--;
1617 		received--;
1618 	}
1619 	rxq->rxrearm_nb += received;
1620 	return received;
1621 }
1622 
1623 /**
1624  * Notice:
1625  * - if nb_pkts < IAVF_DESCS_PER_LOOP, no packets are returned
1626  */
1627 uint16_t
1628 iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
1629 			  uint16_t nb_pkts)
1630 {
1631 	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts,
1632 					      NULL, false);
1633 }
1634 
1635 /**
1636  * Notice:
1637  * - if nb_pkts < IAVF_DESCS_PER_LOOP, no packets are returned
1638  */
1639 uint16_t
1640 iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
1641 				   uint16_t nb_pkts)
1642 {
1643 	return _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rx_queue, rx_pkts,
1644 						       nb_pkts, NULL, false);
1645 }
1646 
1647 /**
1648  * vPMD receive routine that reassembles a single burst of 32 scattered packets
1649  * Notice:
1650  * - if nb_pkts < IAVF_DESCS_PER_LOOP, no packets are returned
1651  */
1652 static __rte_always_inline uint16_t
1653 iavf_recv_scattered_burst_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
1654 				     uint16_t nb_pkts, bool offload)
1655 {
1656 	struct iavf_rx_queue *rxq = rx_queue;
1657 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
1658 
1659 	/* get some new buffers */
1660 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
1661 							  split_flags, offload);
1662 	if (nb_bufs == 0)
1663 		return 0;
1664 
1665 	/* happy day case, full burst + no packets to be joined */
1666 	const uint64_t *split_fl64 = (uint64_t *)split_flags;
1667 
1668 	if (!rxq->pkt_first_seg &&
1669 	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
1670 	    split_fl64[2] == 0 && split_fl64[3] == 0)
1671 		return nb_bufs;
1672 
1673 	/* reassemble any packets that need reassembly */
1674 	unsigned int i = 0;
1675 
1676 	if (!rxq->pkt_first_seg) {
1677 		/* find the first split flag, and only reassemble from there */
1678 		while (i < nb_bufs && !split_flags[i])
1679 			i++;
1680 		if (i == nb_bufs)
1681 			return nb_bufs;
1682 		rxq->pkt_first_seg = rx_pkts[i];
1683 	}
1684 	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
1685 		&rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
1686 }
1687 
1688 /**
1689  * vPMD receive routine that reassembles scattered packets.
1690  * Main receive routine that can handle arbitrary burst sizes.
1691  * Notice:
1692  * - if nb_pkts < IAVF_DESCS_PER_LOOP, no packets are returned
1693  */
1694 static __rte_always_inline uint16_t
1695 iavf_recv_scattered_pkts_vec_avx512_cmn(void *rx_queue, struct rte_mbuf **rx_pkts,
1696 					uint16_t nb_pkts, bool offload)
1697 {
1698 	uint16_t retval = 0;
1699 
1700 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
1701 		uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue,
1702 				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST, offload);
1703 		retval += burst;
1704 		nb_pkts -= burst;
1705 		if (burst < IAVF_VPMD_RX_MAX_BURST)
1706 			return retval;
1707 	}
1708 	return retval + iavf_recv_scattered_burst_vec_avx512(rx_queue,
1709 				rx_pkts + retval, nb_pkts, offload);
1710 }
1711 
1712 uint16_t
1713 iavf_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
1714 				    uint16_t nb_pkts)
1715 {
1716 	return iavf_recv_scattered_pkts_vec_avx512_cmn(rx_queue, rx_pkts,
1717 						       nb_pkts, false);
1718 }
1719 
1720 /**
1721  * vPMD receive routine that reassembles a single burst of
1722  * 32 scattered packets for flex RxD
1723  * Notice:
1724  * - if nb_pkts < IAVF_DESCS_PER_LOOP, no packets are returned
1725  */
1726 static __rte_always_inline uint16_t
1727 iavf_recv_scattered_burst_vec_avx512_flex_rxd(void *rx_queue,
1728 					      struct rte_mbuf **rx_pkts,
1729 					      uint16_t nb_pkts,
1730 					      bool offload)
1731 {
1732 	struct iavf_rx_queue *rxq = rx_queue;
1733 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
1734 
1735 	/* get some new buffers */
1736 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rxq,
1737 					rx_pkts, nb_pkts, split_flags, offload);
1738 	if (nb_bufs == 0)
1739 		return 0;
1740 
1741 	/* happy day case, full burst + no packets to be joined */
1742 	const uint64_t *split_fl64 = (uint64_t *)split_flags;
1743 
1744 	if (!rxq->pkt_first_seg &&
1745 	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
1746 	    split_fl64[2] == 0 && split_fl64[3] == 0)
1747 		return nb_bufs;
1748 
1749 	/* reassemble any packets that need reassembly */
1750 	unsigned int i = 0;
1751 
1752 	if (!rxq->pkt_first_seg) {
1753 		/* find the first split flag, and only reassemble from there */
1754 		while (i < nb_bufs && !split_flags[i])
1755 			i++;
1756 		if (i == nb_bufs)
1757 			return nb_bufs;
1758 		rxq->pkt_first_seg = rx_pkts[i];
1759 	}
1760 	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
1761 			&rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
1762 }
1763 
1764 /**
1765  * vPMD receive routine that reassembles scattered packets for flex RxD.
1766  * Main receive routine that can handle arbitrary burst sizes.
1767  * Notice:
1768  * - if nb_pkts < IAVF_DESCS_PER_LOOP, no packets are returned
1769  */
1770 static __rte_always_inline uint16_t
1771 iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(void *rx_queue,
1772 						 struct rte_mbuf **rx_pkts,
1773 						 uint16_t nb_pkts,
1774 						 bool offload)
1775 {
1776 	uint16_t retval = 0;
1777 
1778 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
1779 		uint16_t burst =
1780 			iavf_recv_scattered_burst_vec_avx512_flex_rxd
1781 				(rx_queue, rx_pkts + retval,
1782 				 IAVF_VPMD_RX_MAX_BURST, offload);
1783 		retval += burst;
1784 		nb_pkts -= burst;
1785 		if (burst < IAVF_VPMD_RX_MAX_BURST)
1786 			return retval;
1787 	}
1788 	return retval + iavf_recv_scattered_burst_vec_avx512_flex_rxd(rx_queue,
1789 				rx_pkts + retval, nb_pkts, offload);
1790 }
1791 
1792 uint16_t
1793 iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
1794 					     struct rte_mbuf **rx_pkts,
1795 					     uint16_t nb_pkts)
1796 {
1797 	return iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(rx_queue,
1798 								rx_pkts,
1799 								nb_pkts,
1800 								false);
1801 }
1802 
1803 uint16_t
1804 iavf_recv_pkts_vec_avx512_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
1805 				  uint16_t nb_pkts)
1806 {
1807 	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts,
1808 					      nb_pkts, NULL, true);
1809 }
1810 
1811 uint16_t
1812 iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
1813 					    struct rte_mbuf **rx_pkts,
1814 					    uint16_t nb_pkts)
1815 {
1816 	return iavf_recv_scattered_pkts_vec_avx512_cmn(rx_queue, rx_pkts,
1817 						       nb_pkts, true);
1818 }
1819 
1820 uint16_t
1821 iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
1822 					   struct rte_mbuf **rx_pkts,
1823 					   uint16_t nb_pkts)
1824 {
1825 	return _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rx_queue,
1826 						       rx_pkts,
1827 						       nb_pkts,
1828 						       NULL,
1829 						       true);
1830 }
1831 
1832 uint16_t
1833 iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
1834 						     struct rte_mbuf **rx_pkts,
1835 						     uint16_t nb_pkts)
1836 {
1837 	return iavf_recv_scattered_pkts_vec_avx512_flex_rxd_cmn(rx_queue,
1838 								rx_pkts,
1839 								nb_pkts,
1840 								true);
1841 }
1842 
1843 static __rte_always_inline void
1844 tx_backlog_entry_avx512(struct ci_tx_entry_vec *txep,
1845 			struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
1846 {
1847 	int i;
1848 
1849 	for (i = 0; i < (int)nb_pkts; ++i)
1850 		txep[i].mbuf = tx_pkts[i];
1851 }
1852 
1853 static __rte_always_inline void
1854 iavf_vtx1(volatile struct iavf_tx_desc *txdp,
1855 	  struct rte_mbuf *pkt, uint64_t flags,
1856 	  bool offload)
1857 {
1858 	uint64_t high_qw =
1859 		(IAVF_TX_DESC_DTYPE_DATA |
1860 		 ((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT) |
1861 		 ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));
1862 	if (offload)
1863 		iavf_txd_enable_offload(pkt, &high_qw);
1864 
1865 	__m128i descriptor = _mm_set_epi64x(high_qw,
1866 					    pkt->buf_iova + pkt->data_off);
1867 	_mm_storeu_si128(RTE_CAST_PTR(__m128i *, txdp), descriptor);
1868 }
1869 
1870 #define IAVF_TX_LEN_MASK 0xAA
1871 #define IAVF_TX_OFF_MASK 0x55
1872 static __rte_always_inline void
1873 iavf_vtx(volatile struct iavf_tx_desc *txdp,
1874 		struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags,
1875 		bool offload)
1876 {
1877 	const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
1878 			((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT));
1879 
1880 	/* if unaligned on a 32-byte boundary, do one to align */
1881 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
1882 		iavf_vtx1(txdp, *pkt, flags, offload);
1883 		nb_pkts--, txdp++, pkt++;
1884 	}
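	/*
	 * Each iavf_tx_desc is 16 bytes, so the single descriptor written
	 * above is enough to bring txdp to a 32-byte boundary for the 4-wide
	 * 64-byte stores below.
	 */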
1885 
1886 	/* do 4 at a time while possible, in bursts */
1887 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
1888 		uint64_t hi_qw3 =
1889 			hi_qw_tmpl |
1890 			((uint64_t)pkt[3]->data_len <<
1891 			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
1892 		uint64_t hi_qw2 =
1893 			hi_qw_tmpl |
1894 			((uint64_t)pkt[2]->data_len <<
1895 			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
1896 		uint64_t hi_qw1 =
1897 			hi_qw_tmpl |
1898 			((uint64_t)pkt[1]->data_len <<
1899 			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
1900 		uint64_t hi_qw0 =
1901 			hi_qw_tmpl |
1902 			((uint64_t)pkt[0]->data_len <<
1903 			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
1904 		if (offload) {
1905 			iavf_txd_enable_offload(pkt[3], &hi_qw3);
1906 			iavf_txd_enable_offload(pkt[2], &hi_qw2);
1907 			iavf_txd_enable_offload(pkt[1], &hi_qw1);
1908 			iavf_txd_enable_offload(pkt[0], &hi_qw0);
1909 		}
1910 
1911 		__m512i desc0_3 =
1912 			_mm512_set_epi64
1913 				(hi_qw3,
1914 				 pkt[3]->buf_iova + pkt[3]->data_off,
1915 				 hi_qw2,
1916 				 pkt[2]->buf_iova + pkt[2]->data_off,
1917 				 hi_qw1,
1918 				 pkt[1]->buf_iova + pkt[1]->data_off,
1919 				 hi_qw0,
1920 				 pkt[0]->buf_iova + pkt[0]->data_off);
1921 		_mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3);
1922 	}
1923 
1924 	/* do any last ones */
1925 	while (nb_pkts) {
1926 		iavf_vtx1(txdp, *pkt, flags, offload);
1927 		txdp++, pkt++, nb_pkts--;
1928 	}
1929 }
1930 
1931 static __rte_always_inline void
1932 iavf_fill_ctx_desc_tunneling_avx512(uint64_t *low_ctx_qw, struct rte_mbuf *pkt)
1933 {
1934 	if (pkt->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) {
1935 		uint64_t eip_typ = IAVF_TX_CTX_DESC_EIPT_NONE;
1936 		uint64_t eip_len = 0;
1937 		uint64_t eip_noinc = 0;
1938 		/* Default - IP_ID is incremented in each segment of LSO */
1939 
1940 		switch (pkt->ol_flags & (RTE_MBUF_F_TX_OUTER_IPV4 |
1941 				RTE_MBUF_F_TX_OUTER_IPV6 |
1942 				RTE_MBUF_F_TX_OUTER_IP_CKSUM)) {
1943 		case RTE_MBUF_F_TX_OUTER_IPV4:
1944 			eip_typ = IAVF_TX_CTX_DESC_EIPT_IPV4_NO_CHECKSUM_OFFLOAD;
1945 			eip_len = pkt->outer_l3_len >> 2;
1946 			break;
1947 		case RTE_MBUF_F_TX_OUTER_IPV4 | RTE_MBUF_F_TX_OUTER_IP_CKSUM:
1948 			eip_typ = IAVF_TX_CTX_DESC_EIPT_IPV4_CHECKSUM_OFFLOAD;
1949 			eip_len = pkt->outer_l3_len >> 2;
1950 			break;
1951 		case RTE_MBUF_F_TX_OUTER_IPV6:
1952 			eip_typ = IAVF_TX_CTX_DESC_EIPT_IPV6;
1953 			eip_len = pkt->outer_l3_len >> 2;
1954 			break;
1955 		}
1956 
1957 		/* L4TUNT: L4 Tunneling Type */
1958 		switch (pkt->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) {
1959 		case RTE_MBUF_F_TX_TUNNEL_IPIP:
1960 			/* for non UDP / GRE tunneling, set to 00b */
1961 			break;
1962 		case RTE_MBUF_F_TX_TUNNEL_VXLAN:
1963 		case RTE_MBUF_F_TX_TUNNEL_VXLAN_GPE:
1964 		case RTE_MBUF_F_TX_TUNNEL_GTP:
1965 		case RTE_MBUF_F_TX_TUNNEL_GENEVE:
1966 			eip_typ |= IAVF_TXD_CTX_UDP_TUNNELING;
1967 			break;
1968 		case RTE_MBUF_F_TX_TUNNEL_GRE:
1969 			eip_typ |= IAVF_TXD_CTX_GRE_TUNNELING;
1970 			break;
1971 		default:
1972 			PMD_TX_LOG(ERR, "Tunnel type not supported");
1973 			return;
1974 		}
1975 
1976 		/* L4TUNLEN: L4 Tunneling Length, in Words
1977 		 *
1978 		 * We depend on the app to set rte_mbuf.l2_len correctly.
1979 		 * For IP in GRE it should be set to the length of the GRE
1980 		 * header;
1981 		 * For MAC in GRE or MAC in UDP it should be set to the length
1982 		 * of the GRE or UDP headers plus the inner MAC up to and
1983 		 * including its last Ethertype.
1984 		 * If MPLS labels exist, they should be included as well.
1985 		 */
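		/*
		 * e.g. for a VXLAN-encapsulated frame this would typically be
		 * outer UDP (8) + VXLAN (8) + inner Ethernet (14) = 30 bytes,
		 * i.e. an L4TUNLEN of 15 words.
		 */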
1986 		eip_typ |= (pkt->l2_len >> 1) << IAVF_TXD_CTX_QW0_NATLEN_SHIFT;
1987 
1988 		/**
1989 		 * Request calculation of the tunneling (outer) UDP checksum.
1990 		 * The L4T_CS bit shall be set only if L4TUNT = 01b and EIPT is not zero
1991 		 */
1992 		if ((eip_typ & (IAVF_TX_CTX_EXT_IP_IPV4 |
1993 					IAVF_TX_CTX_EXT_IP_IPV6 |
1994 					IAVF_TX_CTX_EXT_IP_IPV4_NO_CSUM)) &&
1995 				(eip_typ & IAVF_TXD_CTX_UDP_TUNNELING) &&
1996 				(pkt->ol_flags & RTE_MBUF_F_TX_OUTER_UDP_CKSUM))
1997 			eip_typ |= IAVF_TXD_CTX_QW0_L4T_CS_MASK;
1998 
1999 		*low_ctx_qw = eip_typ << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIPT_SHIFT |
2000 			eip_len << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIPLEN_SHIFT |
2001 			eip_noinc << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIP_NOINC_SHIFT;
2002 
2003 	} else {
2004 		*low_ctx_qw = 0;
2005 	}
2006 }
2007 
2008 static inline void
2009 iavf_fill_ctx_desc_tunnelling_field(volatile uint64_t *qw0,
2010 		const struct rte_mbuf *m)
2011 {
2012 	uint64_t eip_typ = IAVF_TX_CTX_DESC_EIPT_NONE;
2013 	uint64_t eip_len = 0;
2014 	uint64_t eip_noinc = 0;
2015 	/* Default - IP_ID is incremented in each segment of LSO */
2016 
2017 	switch (m->ol_flags & (RTE_MBUF_F_TX_OUTER_IPV4 |
2018 			RTE_MBUF_F_TX_OUTER_IPV6 |
2019 			RTE_MBUF_F_TX_OUTER_IP_CKSUM)) {
2020 	case RTE_MBUF_F_TX_OUTER_IPV4:
2021 		eip_typ = IAVF_TX_CTX_DESC_EIPT_IPV4_NO_CHECKSUM_OFFLOAD;
2022 		eip_len = m->outer_l3_len >> 2;
2023 		break;
2024 	case RTE_MBUF_F_TX_OUTER_IPV4 | RTE_MBUF_F_TX_OUTER_IP_CKSUM:
2025 		eip_typ = IAVF_TX_CTX_DESC_EIPT_IPV4_CHECKSUM_OFFLOAD;
2026 		eip_len = m->outer_l3_len >> 2;
2027 		break;
2028 	case RTE_MBUF_F_TX_OUTER_IPV6:
2029 		eip_typ = IAVF_TX_CTX_DESC_EIPT_IPV6;
2030 		eip_len = m->outer_l3_len >> 2;
2031 		break;
2032 	}
2033 
2034 	/* L4TUNT: L4 Tunneling Type */
2035 	switch (m->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) {
2036 	case RTE_MBUF_F_TX_TUNNEL_IPIP:
2037 		/* for non UDP / GRE tunneling, set to 00b */
2038 		break;
2039 	case RTE_MBUF_F_TX_TUNNEL_VXLAN:
2040 	case RTE_MBUF_F_TX_TUNNEL_VXLAN_GPE:
2041 	case RTE_MBUF_F_TX_TUNNEL_GTP:
2042 	case RTE_MBUF_F_TX_TUNNEL_GENEVE:
2043 		eip_typ |= IAVF_TXD_CTX_UDP_TUNNELING;
2044 		break;
2045 	case RTE_MBUF_F_TX_TUNNEL_GRE:
2046 		eip_typ |= IAVF_TXD_CTX_GRE_TUNNELING;
2047 		break;
2048 	default:
2049 		PMD_TX_LOG(ERR, "Tunnel type not supported");
2050 		return;
2051 	}
2052 
2053 	/* L4TUNLEN: L4 Tunneling Length, in Words
2054 	 *
2055 	 * We depend on the app to set rte_mbuf.l2_len correctly.
2056 	 * For IP in GRE it should be set to the length of the GRE
2057 	 * header;
2058 	 * For MAC in GRE or MAC in UDP it should be set to the length
2059 	 * of the GRE or UDP headers plus the inner MAC up to and
2060 	 * including its last Ethertype.
2061 	 * If MPLS labels exist, they should be included as well.
2062 	 */
2063 	eip_typ |= (m->l2_len >> 1) << IAVF_TXD_CTX_QW0_NATLEN_SHIFT;
2064 
2065 	/**
2066 	 * Request calculation of the tunneling (outer) UDP checksum.
2067 	 * The L4T_CS bit shall be set only if L4TUNT = 01b and EIPT is not zero
2068 	 */
2069 	if ((eip_typ & (IAVF_TX_CTX_EXT_IP_IPV6 |
2070 				IAVF_TX_CTX_EXT_IP_IPV4 |
2071 				IAVF_TX_CTX_EXT_IP_IPV4_NO_CSUM)) &&
2072 			(eip_typ & IAVF_TXD_CTX_UDP_TUNNELING) &&
2073 			(m->ol_flags & RTE_MBUF_F_TX_OUTER_UDP_CKSUM))
2074 		eip_typ |= IAVF_TXD_CTX_QW0_L4T_CS_MASK;
2075 
2076 	*qw0 = eip_typ << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIPT_SHIFT |
2077 		eip_len << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIPLEN_SHIFT |
2078 		eip_noinc << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIP_NOINC_SHIFT;
2079 }
2080 
2081 static __rte_always_inline void
2082 ctx_vtx1(volatile struct iavf_tx_desc *txdp, struct rte_mbuf *pkt,
2083 		uint64_t flags, bool offload, uint8_t vlan_flag)
2084 {
2085 	uint64_t high_ctx_qw = IAVF_TX_DESC_DTYPE_CONTEXT;
2086 	uint64_t low_ctx_qw = 0;
2087 
2088 	if (((pkt->ol_flags & RTE_MBUF_F_TX_VLAN) || offload)) {
2089 		if (offload)
2090 			iavf_fill_ctx_desc_tunneling_avx512(&low_ctx_qw, pkt);
2091 		if ((pkt->ol_flags & RTE_MBUF_F_TX_VLAN) ||
2092 				(vlan_flag & IAVF_TX_FLAGS_VLAN_TAG_LOC_L2TAG2)) {
2093 			high_ctx_qw |= IAVF_TX_CTX_DESC_IL2TAG2 << IAVF_TXD_CTX_QW1_CMD_SHIFT;
2094 			low_ctx_qw |= (uint64_t)pkt->vlan_tci << IAVF_TXD_CTX_QW0_L2TAG2_PARAM;
2095 		}
2096 	}
2097 	if (IAVF_CHECK_TX_LLDP(pkt))
2098 		high_ctx_qw |= IAVF_TX_CTX_DESC_SWTCH_UPLINK
2099 			<< IAVF_TXD_CTX_QW1_CMD_SHIFT;
2100 	uint64_t high_data_qw = (IAVF_TX_DESC_DTYPE_DATA |
2101 				((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT) |
2102 				((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));
2103 	if (offload)
2104 		iavf_txd_enable_offload(pkt, &high_data_qw);
2105 
2106 	__m256i ctx_data_desc = _mm256_set_epi64x(high_data_qw, pkt->buf_iova + pkt->data_off,
2107 							high_ctx_qw, low_ctx_qw);
2108 
2109 	_mm256_storeu_si256(RTE_CAST_PTR(__m256i *, txdp), ctx_data_desc);
2110 }
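/*
 * In this context-descriptor path every packet consumes two 16-byte
 * descriptors: a context descriptor (tunnelling, L2TAG2 and LLDP bits)
 * immediately followed by the data descriptor, emitted above as a single
 * 32-byte store.
 */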
2111 
2112 static __rte_always_inline void
2113 ctx_vtx(volatile struct iavf_tx_desc *txdp,
2114 		struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags,
2115 		bool offload, uint8_t vlan_flag)
2116 {
2117 	uint64_t hi_data_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
2118 					((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT));
2119 
2120 	/* if unaligned on a 32-byte boundary, do one to align */
2121 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
2122 		ctx_vtx1(txdp, *pkt, flags, offload, vlan_flag);
2123 		nb_pkts--, txdp++, pkt++;
2124 	}
2125 
2126 	for (; nb_pkts > 1; txdp += 4, pkt += 2, nb_pkts -= 2) {
2127 		uint64_t hi_ctx_qw1 = IAVF_TX_DESC_DTYPE_CONTEXT;
2128 		uint64_t hi_ctx_qw0 = IAVF_TX_DESC_DTYPE_CONTEXT;
2129 		uint64_t low_ctx_qw1 = 0;
2130 		uint64_t low_ctx_qw0 = 0;
2131 		uint64_t hi_data_qw1 = 0;
2132 		uint64_t hi_data_qw0 = 0;
2133 
2134 		hi_data_qw1 = hi_data_qw_tmpl |
2135 				((uint64_t)pkt[1]->data_len <<
2136 					IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
2137 		hi_data_qw0 = hi_data_qw_tmpl |
2138 				((uint64_t)pkt[0]->data_len <<
2139 					IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
2140 
2141 		if (pkt[1]->ol_flags & RTE_MBUF_F_TX_VLAN) {
2142 			if (vlan_flag & IAVF_TX_FLAGS_VLAN_TAG_LOC_L2TAG2) {
2143 				hi_ctx_qw1 |=
2144 					IAVF_TX_CTX_DESC_IL2TAG2 << IAVF_TXD_CTX_QW1_CMD_SHIFT;
2145 				low_ctx_qw1 |=
2146 					(uint64_t)pkt[1]->vlan_tci << IAVF_TXD_CTX_QW0_L2TAG2_PARAM;
2147 			} else {
2148 				hi_data_qw1 |=
2149 					(uint64_t)pkt[1]->vlan_tci << IAVF_TXD_QW1_L2TAG1_SHIFT;
2150 			}
2151 		}
2152 		if (IAVF_CHECK_TX_LLDP(pkt[1]))
2153 			hi_ctx_qw1 |= IAVF_TX_CTX_DESC_SWTCH_UPLINK
2154 				<< IAVF_TXD_CTX_QW1_CMD_SHIFT;
2155 
2156 		if (pkt[0]->ol_flags & RTE_MBUF_F_TX_VLAN) {
2157 			if (vlan_flag & IAVF_TX_FLAGS_VLAN_TAG_LOC_L2TAG2) {
2158 				hi_ctx_qw0 |=
2159 					IAVF_TX_CTX_DESC_IL2TAG2 << IAVF_TXD_CTX_QW1_CMD_SHIFT;
2160 				low_ctx_qw0 |=
2161 					(uint64_t)pkt[0]->vlan_tci << IAVF_TXD_CTX_QW0_L2TAG2_PARAM;
2162 			} else {
2163 				hi_data_qw0 |=
2164 					(uint64_t)pkt[0]->vlan_tci << IAVF_TXD_QW1_L2TAG1_SHIFT;
2165 			}
2166 		}
2167 		if (IAVF_CHECK_TX_LLDP(pkt[0]))
2168 			hi_ctx_qw0 |= IAVF_TX_CTX_DESC_SWTCH_UPLINK
2169 				<< IAVF_TXD_CTX_QW1_CMD_SHIFT;
2170 
2171 		if (offload) {
2172 			iavf_txd_enable_offload(pkt[1], &hi_data_qw1);
2173 			iavf_txd_enable_offload(pkt[0], &hi_data_qw0);
2174 			iavf_fill_ctx_desc_tunnelling_field(&low_ctx_qw1, pkt[1]);
2175 			iavf_fill_ctx_desc_tunnelling_field(&low_ctx_qw0, pkt[0]);
2176 		}
2177 
2178 		__m512i desc0_3 =
2179 				_mm512_set_epi64
2180 						(hi_data_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
2181 						hi_ctx_qw1, low_ctx_qw1,
2182 						hi_data_qw0, pkt[0]->buf_iova + pkt[0]->data_off,
2183 						hi_ctx_qw0, low_ctx_qw0);
2184 		_mm512_storeu_si512(RTE_CAST_PTR(void *, txdp), desc0_3);
2185 	}
2186 
2187 	if (nb_pkts)
2188 		ctx_vtx1(txdp, *pkt, flags, offload, vlan_flag);
2189 }
2190 
2191 static __rte_always_inline uint16_t
2192 iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
2193 				 uint16_t nb_pkts, bool offload)
2194 {
2195 	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
2196 	volatile struct iavf_tx_desc *txdp;
2197 	struct ci_tx_entry_vec *txep;
2198 	uint16_t n, nb_commit, tx_id;
2199 	/* bit 2 is reserved and must be set to 1 according to the spec */
2200 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
2201 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
2202 
2203 	if (txq->nb_tx_free < txq->tx_free_thresh)
2204 		ci_tx_free_bufs_vec(txq, iavf_tx_desc_done, false);
2205 
2206 	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
2207 	if (unlikely(nb_pkts == 0))
2208 		return 0;
2209 	nb_commit = nb_pkts;
2210 
2211 	tx_id = txq->tx_tail;
2212 	txdp = &txq->iavf_tx_ring[tx_id];
2213 	txep = (void *)txq->sw_ring;
2214 	txep += tx_id;
2215 
2216 	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
2217 
2218 	n = (uint16_t)(txq->nb_tx_desc - tx_id);
2219 	if (nb_commit >= n) {
2220 		tx_backlog_entry_avx512(txep, tx_pkts, n);
2221 
2222 		iavf_vtx(txdp, tx_pkts, n - 1, flags, offload);
2223 		tx_pkts += (n - 1);
2224 		txdp += (n - 1);
2225 
2226 		iavf_vtx1(txdp, *tx_pkts++, rs, offload);
2227 
2228 		nb_commit = (uint16_t)(nb_commit - n);
2229 
2230 		tx_id = 0;
2231 		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
2232 
2233 		/* avoid reaching the end of the ring */
2234 		txdp = &txq->iavf_tx_ring[tx_id];
2235 		txep = (void *)txq->sw_ring;
2236 		txep += tx_id;
2237 	}
2238 
2239 	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
2240 
2241 	iavf_vtx(txdp, tx_pkts, nb_commit, flags, offload);
2242 
2243 	tx_id = (uint16_t)(tx_id + nb_commit);
2244 	if (tx_id > txq->tx_next_rs) {
2245 		txq->iavf_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
2246 			rte_cpu_to_le_64(((uint64_t)IAVF_TX_DESC_CMD_RS) <<
2247 					 IAVF_TXD_QW1_CMD_SHIFT);
2248 		txq->tx_next_rs =
2249 			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
2250 	}
2251 
2252 	txq->tx_tail = tx_id;
2253 
2254 	IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
2255 
2256 	return nb_pkts;
2257 }
2258 
2259 static __rte_always_inline uint16_t
2260 iavf_xmit_fixed_burst_vec_avx512_ctx(void *tx_queue, struct rte_mbuf **tx_pkts,
2261 				 uint16_t nb_pkts, bool offload)
2262 {
2263 	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
2264 	volatile struct iavf_tx_desc *txdp;
2265 	struct ci_tx_entry_vec *txep;
2266 	uint16_t n, nb_commit, nb_mbuf, tx_id;
2267 	/* bit 2 is reserved and must be set to 1 according to the spec */
2268 	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
2269 	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
2270 
2271 	if (txq->nb_tx_free < txq->tx_free_thresh)
2272 		ci_tx_free_bufs_vec(txq, iavf_tx_desc_done, true);
2273 
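	/*
	 * nb_commit and tx_id count descriptors - two per packet in this
	 * path - while nb_pkts and the software ring count packets, hence
	 * nb_commit being kept even and the >> 1 conversions below.
	 */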
2274 	nb_commit = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts << 1);
2275 	nb_commit &= 0xFFFE;
2276 	if (unlikely(nb_commit == 0))
2277 		return 0;
2278 
2279 	nb_pkts = nb_commit >> 1;
2280 	tx_id = txq->tx_tail;
2281 	txdp = &txq->iavf_tx_ring[tx_id];
2282 	txep = (void *)txq->sw_ring;
2283 	txep += (tx_id >> 1);
2284 
2285 	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_commit);
2286 	n = (uint16_t)(txq->nb_tx_desc - tx_id);
2287 
2288 	if (n != 0 && nb_commit >= n) {
2289 		nb_mbuf = n >> 1;
2290 		tx_backlog_entry_avx512(txep, tx_pkts, nb_mbuf);
2291 
2292 		ctx_vtx(txdp, tx_pkts, nb_mbuf - 1, flags, offload, txq->vlan_flag);
2293 		tx_pkts += (nb_mbuf - 1);
2294 		txdp += (n - 2);
2295 		ctx_vtx1(txdp, *tx_pkts++, rs, offload, txq->vlan_flag);
2296 
2297 		nb_commit = (uint16_t)(nb_commit - n);
2298 
2299 		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
2300 		tx_id = 0;
2301 		/* avoid reaching the end of the ring */
2302 		txdp = txq->iavf_tx_ring;
2303 		txep = (void *)txq->sw_ring;
2304 	}
2305 
2306 	nb_mbuf = nb_commit >> 1;
2307 	tx_backlog_entry_avx512(txep, tx_pkts, nb_mbuf);
2308 
2309 	ctx_vtx(txdp, tx_pkts, nb_mbuf, flags, offload, txq->vlan_flag);
2310 	tx_id = (uint16_t)(tx_id + nb_commit);
2311 
2312 	if (tx_id > txq->tx_next_rs) {
2313 		txq->iavf_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
2314 			rte_cpu_to_le_64(((uint64_t)IAVF_TX_DESC_CMD_RS) <<
2315 					 IAVF_TXD_QW1_CMD_SHIFT);
2316 		txq->tx_next_rs =
2317 			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
2318 	}
2319 
2320 	txq->tx_tail = tx_id;
2321 
2322 	IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
2323 	return nb_pkts;
2324 }
2325 
2326 static __rte_always_inline uint16_t
2327 iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
2328 			      uint16_t nb_pkts, bool offload)
2329 {
2330 	uint16_t nb_tx = 0;
2331 	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
2332 
2333 	while (nb_pkts) {
2334 		uint16_t ret, num;
2335 
2336 		/* crossing the rs_thresh boundary is not allowed */
2337 		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
2338 		ret = iavf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
2339 						       num, offload);
2340 		nb_tx += ret;
2341 		nb_pkts -= ret;
2342 		if (ret < num)
2343 			break;
2344 	}
2345 
2346 	return nb_tx;
2347 }
2348 
2349 uint16_t
2350 iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
2351 			  uint16_t nb_pkts)
2352 {
2353 	return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, false);
2354 }
2355 
2356 uint16_t
2357 iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
2358 				  uint16_t nb_pkts)
2359 {
2360 	return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, true);
2361 }
2362 
2363 static __rte_always_inline uint16_t
2364 iavf_xmit_pkts_vec_avx512_ctx_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
2365 				  uint16_t nb_pkts, bool offload)
2366 {
2367 	uint16_t nb_tx = 0;
2368 	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
2369 
2370 	while (nb_pkts) {
2371 		uint16_t ret, num;
2372 
2373 		/* crossing the rs_thresh boundary is not allowed */
2374 		num = (uint16_t)RTE_MIN(nb_pkts << 1, txq->tx_rs_thresh);
2375 		num = num >> 1;
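		/*
		 * num is first taken in descriptors (a context + data pair
		 * per packet) so one burst never spans more than tx_rs_thresh
		 * descriptors, then converted back to a packet count.
		 */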
2376 		ret = iavf_xmit_fixed_burst_vec_avx512_ctx(tx_queue, &tx_pkts[nb_tx],
2377 						       num, offload);
2378 		nb_tx += ret;
2379 		nb_pkts -= ret;
2380 		if (ret < num)
2381 			break;
2382 	}
2383 
2384 	return nb_tx;
2385 }
2386 
2387 uint16_t
2388 iavf_xmit_pkts_vec_avx512_ctx_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
2389 				  uint16_t nb_pkts)
2390 {
2391 	return iavf_xmit_pkts_vec_avx512_ctx_cmn(tx_queue, tx_pkts, nb_pkts, true);
2392 }
2393 
2394 uint16_t
2395 iavf_xmit_pkts_vec_avx512_ctx(void *tx_queue, struct rte_mbuf **tx_pkts,
2396 				  uint16_t nb_pkts)
2397 {
2398 	return iavf_xmit_pkts_vec_avx512_ctx_cmn(tx_queue, tx_pkts, nb_pkts, false);
2399 }
2400