xref: /dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h (revision 43fd3624fdfe3a33904a9b64d94306dd3d4f2c13)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2017 6WIND S.A.
3  * Copyright 2017 Mellanox Technologies, Ltd
4  */
5 
6 #ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
7 #define RTE_PMD_MLX5_RXTX_VEC_SSE_H_
8 
9 #include <stdint.h>
10 #include <string.h>
11 #include <stdlib.h>
12 #include <rte_vect.h>
13 
14 #include <rte_mbuf.h>
15 #include <rte_mempool.h>
16 #include <rte_prefetch.h>
17 
18 #include <mlx5_prm.h>
19 
20 #include "mlx5_defs.h"
21 #include "mlx5.h"
22 #include "mlx5_utils.h"
23 #include "mlx5_rxtx.h"
24 #include "mlx5_rxtx_vec.h"
25 #include "mlx5_autoconf.h"
26 
27 /**
28  * Store free buffers to RX SW ring.
29  *
30  * @param elts
31  *   Pointer to SW ring to be filled.
32  * @param pkts
33  *   Pointer to array of packets to be stored.
34  * @param n
35  *   Number of packets to be stored.
36  */
37 static inline void
38 rxq_copy_mbuf_v(struct rte_mbuf **elts, struct rte_mbuf **pkts, uint16_t n)
39 {
40 	unsigned int pos;
41 	uint16_t p = n & -2;
42 
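	/*
	 * Copy two 64-bit mbuf pointers per iteration with a single unaligned
	 * 128-bit load/store; p = n & -2 rounds n down to an even count and a
	 * trailing odd pointer, if any, is copied after the loop.
	 */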
43 	for (pos = 0; pos < p; pos += 2) {
44 		__m128i mbp;
45 
46 		mbp = _mm_loadu_si128((__m128i *)&elts[pos]);
47 		_mm_storeu_si128((__m128i *)&pkts[pos], mbp);
48 	}
49 	if (n & 1)
50 		pkts[pos] = elts[pos];
51 }
52 
53 /**
54  * Decompress a compressed completion and fill in mbufs in RX SW ring with data
55  * extracted from the title completion descriptor.
56  *
57  * @param rxq
58  *   Pointer to RX queue structure.
59  * @param cq
60  *   Pointer to completion array having a compressed completion at first.
61  * @param elts
62  *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
63  *   the title completion descriptor to be copied to the rest of mbufs.
64  * @param keep
65  *   Keep unzipping if the next CQE is the miniCQE array.
66  *
67  * @return
68  *   Number of mini-CQEs successfully decompressed.
69  */
70 static inline uint16_t
71 rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
72 		    struct rte_mbuf **elts, bool keep)
73 {
74 	volatile struct mlx5_mini_cqe8 *mcq =
75 		(volatile struct mlx5_mini_cqe8 *)(cq + !rxq->cqe_comp_layout);
76 	/* Title packet is pre-built. */
77 	struct rte_mbuf *t_pkt = rxq->cqe_comp_layout ? &rxq->title_pkt : elts[0];
78 	unsigned int pos;
79 	unsigned int i;
80 	unsigned int inv = 0;
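	/*
	 * Each 16-byte load below covers two 8-byte mini-CQEs: shuf_mask1
	 * extracts the fields of the first one and shuf_mask2 of the second,
	 * laying them out in rx_descriptor_fields1 order (pkt_len, data_len,
	 * rss hash).
	 */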
81 	/* Mask to shuffle from extracted mini CQE to mbuf. */
82 	const __m128i shuf_mask1 =
83 		_mm_set_epi8(0,  1,  2,  3, /* rss, bswap32 */
84 			    -1, -1,         /* skip vlan_tci */
85 			     6,  7,         /* data_len, bswap16 */
86 			    -1, -1,  6,  7, /* pkt_len, bswap16 */
87 			    -1, -1, -1, -1  /* skip packet_type */);
88 	const __m128i shuf_mask2 =
89 		_mm_set_epi8(8,  9, 10, 11, /* rss, bswap32 */
90 			    -1, -1,         /* skip vlan_tci */
91 			    14, 15,         /* data_len, bswap16 */
92 			    -1, -1, 14, 15, /* pkt_len, bswap16 */
93 			    -1, -1, -1, -1  /* skip packet_type */);
94 	/* Restore the compressed count. Must be 16 bits. */
95 	uint16_t mcqe_n = (rxq->cqe_comp_layout) ?
96 		(MLX5_CQE_NUM_MINIS(cq->op_own) + 1U) : rte_be_to_cpu_32(cq->byte_cnt);
97 	uint16_t pkts_n = mcqe_n;
98 	const __m128i rearm =
99 		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
100 	const __m128i rxdf =
101 		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
102 	const __m128i crc_adj =
103 		_mm_set_epi16(0, 0, 0,
104 			      rxq->crc_present * RTE_ETHER_CRC_LEN,
105 			      0,
106 			      rxq->crc_present * RTE_ETHER_CRC_LEN,
107 			      0, 0);
108 	__m128i ol_flags = _mm_setzero_si128();
109 	__m128i ol_flags_mask = _mm_setzero_si128();
110 #ifdef MLX5_PMD_SOFT_COUNTERS
111 	const __m128i zero = _mm_setzero_si128();
112 	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
113 	uint32_t rcvd_byte = 0;
114 	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
115 	const __m128i len_shuf_mask =
116 		_mm_set_epi8(-1, -1, -1, -1,
117 			     -1, -1, -1, -1,
118 			     14, 15,  6,  7,
119 			     10, 11,  2,  3);
120 #endif
121 	/*
122 	 * A. load mCQEs into a 128bit register.
123 	 * B. store rearm data to mbuf.
124 	 * C. combine data from mCQEs with rx_descriptor_fields1.
125 	 * D. store rx_descriptor_fields1.
126 	 * E. store flow tag (rte_flow mark).
127 	 */
128 cycle:
129 	if (rxq->cqe_comp_layout)
130 		rte_prefetch0((volatile void *)(cq + mcqe_n));
131 	for (pos = 0; pos < mcqe_n; ) {
132 		__m128i mcqe1, mcqe2;
133 		__m128i rxdf1, rxdf2;
134 #ifdef MLX5_PMD_SOFT_COUNTERS
135 		__m128i byte_cnt, invalid_mask;
136 #endif
137 
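		/*
		 * A 64-byte CQE slot holds eight 8-byte mini-CQEs, so mcq is
		 * re-based every eight entries and indexed with pos % 8; one
		 * iteration consumes four mini-CQEs (two per 128-bit load).
		 */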
138 		if (!rxq->cqe_comp_layout)
139 			for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
140 				if (likely(pos + i < mcqe_n))
141 					rte_prefetch0((volatile void *)(cq + pos + i));
142 		/* A.1 load mCQEs into a 128bit register. */
143 		mcqe1 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, &mcq[pos % 8]));
144 		mcqe2 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, &mcq[pos % 8 + 2]));
145 		/* B.1 store rearm data to mbuf. */
146 		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
147 		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
148 		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
149 		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
150 		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
151 		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
152 		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
153 		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
154 		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
155 		/* D.1 store rx_descriptor_fields1. */
156 		_mm_storeu_si128((__m128i *)
157 				  &elts[pos]->rx_descriptor_fields1,
158 				 rxdf1);
159 		_mm_storeu_si128((__m128i *)
160 				  &elts[pos + 1]->rx_descriptor_fields1,
161 				 rxdf2);
162 		/* B.1 store rearm data to mbuf. */
163 		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
164 		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
165 		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
166 		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
167 		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
168 		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
169 		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
170 		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
171 		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
172 		/* D.1 store rx_descriptor_fields1. */
173 		_mm_storeu_si128((__m128i *)
174 				  &elts[pos + 2]->rx_descriptor_fields1,
175 				 rxdf1);
176 		_mm_storeu_si128((__m128i *)
177 				  &elts[pos + 3]->rx_descriptor_fields1,
178 				 rxdf2);
179 #ifdef MLX5_PMD_SOFT_COUNTERS
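		/*
		 * Sum the byte counts of the (up to) four mini-CQEs handled in
		 * this iteration: lanes past the remaining count are cleared by
		 * invalid_mask before the horizontal adds accumulate rcvd_byte.
		 */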
180 		invalid_mask = _mm_set_epi64x(0,
181 					      (mcqe_n - pos) *
182 					      sizeof(uint16_t) * 8);
183 		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
184 		byte_cnt = _mm_blend_epi16(_mm_srli_si128(mcqe1, 4),
185 					   mcqe2, 0xcc);
186 		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
187 		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
188 		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
189 		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
190 #endif
191 		if (rxq->mark) {
192 			if (rxq->mcqe_format !=
193 				MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) {
194 				const uint32_t flow_tag = t_pkt->hash.fdir.hi;
195 
196 				/* E.1 store flow tag (rte_flow mark). */
197 				elts[pos]->hash.fdir.hi = flow_tag;
198 				elts[pos + 1]->hash.fdir.hi = flow_tag;
199 				elts[pos + 2]->hash.fdir.hi = flow_tag;
200 				elts[pos + 3]->hash.fdir.hi = flow_tag;
201 			} else {
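				/*
				 * FTAG_STRIDX mini-CQE format: every mini-CQE
				 * carries its own flow tag. Gather the four
				 * tags, set RTE_MBUF_F_RX_FDIR for non-zero
				 * tags, add the FDIR ID flag for tags that are
				 * neither zero nor the default mark, then store
				 * tag - 1 to hash.fdir.hi.
				 */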
202 				const __m128i flow_mark_adj =
203 					_mm_set_epi32(-1, -1, -1, -1);
204 				const __m128i flow_mark_shuf =
205 					_mm_set_epi8(-1,  9,  8, 12,
206 						     -1,  1,  0,  4,
207 						     -1, -1, -1, -1,
208 						     -1, -1, -1, -1);
209 				const __m128i ft_mask =
210 					_mm_set1_epi32(0xffffff00);
211 				const __m128i fdir_flags =
212 					_mm_set1_epi32(RTE_MBUF_F_RX_FDIR);
213 				const __m128i fdir_all_flags =
214 					_mm_set1_epi32(RTE_MBUF_F_RX_FDIR |
215 						       rxq->mark_flag);
216 				__m128i fdir_id_flags =
217 					_mm_set1_epi32(rxq->mark_flag);
218 
219 				/* Extract flow_tag field. */
220 				__m128i ftag0 =
221 					_mm_shuffle_epi8(mcqe1, flow_mark_shuf);
222 				__m128i ftag1 =
223 					_mm_shuffle_epi8(mcqe2, flow_mark_shuf);
224 				__m128i ftag =
225 					_mm_unpackhi_epi64(ftag0, ftag1);
226 				__m128i invalid_mask =
227 					_mm_cmpeq_epi32(ftag, zero);
228 
229 				ol_flags_mask = _mm_or_si128(ol_flags_mask,
230 							     fdir_all_flags);
231 				/* Set RTE_MBUF_F_RX_FDIR if flow tag is non-zero. */
232 				ol_flags = _mm_or_si128(ol_flags,
233 					_mm_andnot_si128(invalid_mask,
234 							 fdir_flags));
235 				/* Mask out invalid entries. */
236 				fdir_id_flags = _mm_andnot_si128(invalid_mask,
237 								 fdir_id_flags);
238 				/* Check if flow tag is MLX5_FLOW_MARK_DEFAULT. */
239 				ol_flags = _mm_or_si128(ol_flags,
240 					_mm_andnot_si128(_mm_cmpeq_epi32(ftag,
241 							 ft_mask),
242 					fdir_id_flags));
243 				ftag = _mm_add_epi32(ftag, flow_mark_adj);
244 				elts[pos]->hash.fdir.hi =
245 						_mm_extract_epi32(ftag, 0);
246 				elts[pos + 1]->hash.fdir.hi =
247 						_mm_extract_epi32(ftag, 1);
248 				elts[pos + 2]->hash.fdir.hi =
249 						_mm_extract_epi32(ftag, 2);
250 				elts[pos + 3]->hash.fdir.hi =
251 						_mm_extract_epi32(ftag, 3);
252 			}
253 		}
254 		if (unlikely(rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)) {
255 			if (rxq->mcqe_format ==
256 			    MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
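				/*
				 * L34H_STRIDX mini-CQE format: the first byte
				 * of each mini-CQE packs the L3/L4 header type.
				 * Bit 0 is the CQE VLAN-stripped indication;
				 * bits 7:2, combined with pkt_info from the
				 * title CQE, index mlx5_ptype_table.
				 */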
257 				const uint8_t pkt_info =
258 					(cq->pkt_info & 0x3) << 6;
259 				const uint8_t pkt_hdr0 =
260 					_mm_extract_epi8(mcqe1, 0);
261 				const uint8_t pkt_hdr1 =
262 					_mm_extract_epi8(mcqe1, 8);
263 				const uint8_t pkt_hdr2 =
264 					_mm_extract_epi8(mcqe2, 0);
265 				const uint8_t pkt_hdr3 =
266 					_mm_extract_epi8(mcqe2, 8);
267 				const __m128i vlan_mask =
268 					_mm_set1_epi32(RTE_MBUF_F_RX_VLAN |
269 						       RTE_MBUF_F_RX_VLAN_STRIPPED);
270 				const __m128i cv_mask =
271 					_mm_set1_epi32(MLX5_CQE_VLAN_STRIPPED);
272 				const __m128i pkt_cv =
273 					_mm_set_epi32(pkt_hdr0 & 0x1,
274 						      pkt_hdr1 & 0x1,
275 						      pkt_hdr2 & 0x1,
276 						      pkt_hdr3 & 0x1);
277 
278 				ol_flags_mask = _mm_or_si128(ol_flags_mask,
279 							     vlan_mask);
280 				ol_flags = _mm_or_si128(ol_flags,
281 					_mm_and_si128(_mm_cmpeq_epi32(pkt_cv,
282 					cv_mask), vlan_mask));
283 				elts[pos]->packet_type =
284 					mlx5_ptype_table[(pkt_hdr0 >> 2) |
285 							 pkt_info];
286 				elts[pos + 1]->packet_type =
287 					mlx5_ptype_table[(pkt_hdr1 >> 2) |
288 							 pkt_info];
289 				elts[pos + 2]->packet_type =
290 					mlx5_ptype_table[(pkt_hdr2 >> 2) |
291 							 pkt_info];
292 				elts[pos + 3]->packet_type =
293 					mlx5_ptype_table[(pkt_hdr3 >> 2) |
294 							 pkt_info];
295 				if (rxq->tunnel) {
296 					elts[pos]->packet_type |=
297 						!!(((pkt_hdr0 >> 2) |
298 						pkt_info) & (1 << 6));
299 					elts[pos + 1]->packet_type |=
300 						!!(((pkt_hdr1 >> 2) |
301 						pkt_info) & (1 << 6));
302 					elts[pos + 2]->packet_type |=
303 						!!(((pkt_hdr2 >> 2) |
304 						pkt_info) & (1 << 6));
305 					elts[pos + 3]->packet_type |=
306 						!!(((pkt_hdr3 >> 2) |
307 						pkt_info) & (1 << 6));
308 				}
309 			}
310 			const __m128i hash_flags =
311 				_mm_set1_epi32(RTE_MBUF_F_RX_RSS_HASH);
312 			const __m128i rearm_flags =
313 				_mm_set1_epi32((uint32_t)t_pkt->ol_flags);
314 
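			/*
			 * Bits recomputed from the mini-CQEs are tracked in
			 * ol_flags_mask; every other ol_flags bit is inherited
			 * from the title packet. RTE_MBUF_F_RX_RSS_HASH is
			 * added to the mask so it is not inherited, since these
			 * formats carry no hash value (hash.rss is zeroed below).
			 */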
315 			ol_flags_mask = _mm_or_si128(ol_flags_mask, hash_flags);
316 			ol_flags = _mm_or_si128(ol_flags,
317 				_mm_andnot_si128(ol_flags_mask, rearm_flags));
318 			elts[pos]->ol_flags =
319 				_mm_extract_epi32(ol_flags, 0);
320 			elts[pos + 1]->ol_flags =
321 				_mm_extract_epi32(ol_flags, 1);
322 			elts[pos + 2]->ol_flags =
323 				_mm_extract_epi32(ol_flags, 2);
324 			elts[pos + 3]->ol_flags =
325 				_mm_extract_epi32(ol_flags, 3);
326 			elts[pos]->hash.rss = 0;
327 			elts[pos + 1]->hash.rss = 0;
328 			elts[pos + 2]->hash.rss = 0;
329 			elts[pos + 3]->hash.rss = 0;
330 		}
331 		if (rxq->dynf_meta) {
332 			int32_t offs = rxq->flow_meta_offset;
333 			const uint32_t meta =
334 				*RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *);
335 
336 			/* Check if title packet has valid metadata. */
337 			if (meta) {
338 				MLX5_ASSERT(t_pkt->ol_flags &
339 					    rxq->flow_meta_mask);
340 				*RTE_MBUF_DYNFIELD(elts[pos], offs,
341 							uint32_t *) = meta;
342 				*RTE_MBUF_DYNFIELD(elts[pos + 1], offs,
343 							uint32_t *) = meta;
344 				*RTE_MBUF_DYNFIELD(elts[pos + 2], offs,
345 							uint32_t *) = meta;
346 				*RTE_MBUF_DYNFIELD(elts[pos + 3], offs,
347 							uint32_t *) = meta;
348 			}
349 		}
350 		pos += MLX5_VPMD_DESCS_PER_LOOP;
351 		/* Move to next CQE and invalidate consumed CQEs. */
352 		if (!rxq->cqe_comp_layout) {
353 			if (!(pos & 0x7) && pos < mcqe_n) {
354 				if (pos + 8 < mcqe_n)
355 					rte_prefetch0((volatile void *)(cq + pos + 8));
356 				mcq = (volatile struct mlx5_mini_cqe8 *)(cq + pos);
357 				for (i = 0; i < 8; ++i)
358 					cq[inv++].op_own = MLX5_CQE_INVALIDATE;
359 			}
360 		}
361 	}
362 	if (rxq->cqe_comp_layout && keep) {
363 		int ret;
364 		/* Keep unzipping if the next CQE is the miniCQE array. */
365 		cq = &cq[mcqe_n];
366 		ret = check_cqe_iteration(cq, rxq->cqe_n, rxq->cq_ci + pkts_n);
367 		if (ret == MLX5_CQE_STATUS_SW_OWN &&
368 		    MLX5_CQE_FORMAT(cq->op_own) == MLX5_COMPRESSED) {
369 			pos = 0;
370 			elts = &elts[mcqe_n];
371 			mcq = (volatile struct mlx5_mini_cqe8 *)cq;
372 			mcqe_n = MLX5_CQE_NUM_MINIS(cq->op_own) + 1;
373 			pkts_n += mcqe_n;
374 			goto cycle;
375 		}
376 	} else {
377 		/* Invalidate the rest of CQEs. */
378 		for (; inv < pkts_n; ++inv)
379 			cq[inv].op_own = MLX5_CQE_INVALIDATE;
380 	}
381 #ifdef MLX5_PMD_SOFT_COUNTERS
382 	rxq->stats.ipackets += pkts_n;
383 	rxq->stats.ibytes += rcvd_byte;
384 #endif
385 	return pkts_n;
386 }
387 
388 /**
389  * Calculate packet type and offload flag for mbuf and store it.
390  *
391  * @param rxq
392  *   Pointer to RX queue structure.
393  * @param cqes[4]
394  *   Array of four 16-byte completions extracted from the original completion
395  *   descriptor.
396  * @param op_err
397  *   Opcode vector having responder error status. Each field is 4B.
398  * @param pkts
399  *   Pointer to array of packets to be filled.
400  */
401 static inline void
402 rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
403 			 __m128i op_err, struct rte_mbuf **pkts)
404 {
405 	__m128i pinfo0, pinfo1;
406 	__m128i pinfo, ptype;
407 	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * RTE_MBUF_F_RX_RSS_HASH |
408 					  rxq->hw_timestamp * rxq->timestamp_rx_flag);
409 	__m128i cv_flags;
410 	const __m128i zero = _mm_setzero_si128();
411 	const __m128i ptype_mask = _mm_set1_epi32(0xfd06);
412 	const __m128i ptype_ol_mask = _mm_set1_epi32(0x106);
413 	const __m128i pinfo_mask = _mm_set1_epi32(0x3);
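	/*
	 * cv_flag_sel is a per-byte lookup table for _mm_shuffle_epi8(): the
	 * masked l3_ok/l4_ok bits select the checksum flags (pre-shifted right
	 * by one so they fit in a byte) and the cv bit selects the VLAN flags.
	 */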
414 	const __m128i cv_flag_sel =
415 		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
416 			     (uint8_t)((RTE_MBUF_F_RX_IP_CKSUM_GOOD |
417 					RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
418 			     0,
419 			     (uint8_t)(RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1),
420 			     0,
421 			     (uint8_t)(RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1),
422 			     (uint8_t)(RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED),
423 			     0);
424 	const __m128i cv_mask =
425 		_mm_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
426 			       RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);
427 	const __m128i mbuf_init =
428 		_mm_load_si128((__m128i *)&rxq->mbuf_initializer);
429 	__m128i rearm0, rearm1, rearm2, rearm3;
430 	uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;
431 
432 	/* Extract pkt_info field. */
433 	pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
434 	pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
435 	pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1);
436 	/* Extract hdr_type_etc field. */
437 	pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]);
438 	pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]);
439 	ptype = _mm_unpacklo_epi64(pinfo0, pinfo1);
440 	if (rxq->mark) {
441 		const __m128i pinfo_ft_mask = _mm_set1_epi32(0xffffff00);
442 		const __m128i fdir_flags = _mm_set1_epi32(RTE_MBUF_F_RX_FDIR);
443 		__m128i fdir_id_flags = _mm_set1_epi32(rxq->mark_flag);
444 		__m128i flow_tag, invalid_mask;
445 
446 		flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask);
447 		/* Set RTE_MBUF_F_RX_FDIR if the flow tag is non-zero. */
448 		invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
449 		ol_flags = _mm_or_si128(ol_flags,
450 					_mm_andnot_si128(invalid_mask,
451 							 fdir_flags));
452 		/* Mask out invalid entries. */
453 		fdir_id_flags = _mm_andnot_si128(invalid_mask, fdir_id_flags);
454 		/* Check if flow tag is MLX5_FLOW_MARK_DEFAULT. */
455 		ol_flags = _mm_or_si128(ol_flags,
456 					_mm_andnot_si128(
457 						_mm_cmpeq_epi32(flow_tag,
458 								pinfo_ft_mask),
459 						fdir_id_flags));
460 	}
461 	/*
462 	 * Merge the two fields to generate the following:
463 	 * bit[1]     = l3_ok
464 	 * bit[2]     = l4_ok
465 	 * bit[8]     = cv
466 	 * bit[11:10] = l3_hdr_type
467 	 * bit[14:12] = l4_hdr_type
468 	 * bit[15]    = ip_frag
469 	 * bit[16]    = tunneled
470 	 * bit[17]    = outer_l3_type
471 	 */
472 	ptype = _mm_and_si128(ptype, ptype_mask);
473 	pinfo = _mm_and_si128(pinfo, pinfo_mask);
474 	pinfo = _mm_slli_epi32(pinfo, 16);
475 	/* pinfo now holds the merged fields for the ol_flags calculation. */
476 	pinfo = _mm_or_si128(ptype, pinfo);
477 	ptype = _mm_srli_epi32(pinfo, 10);
478 	ptype = _mm_packs_epi32(ptype, zero);
479 	/* Errored packets will have RTE_PTYPE_ALL_MASK. */
480 	op_err = _mm_srli_epi16(op_err, 8);
481 	ptype = _mm_or_si128(ptype, op_err);
482 	pt_idx0 = _mm_extract_epi8(ptype, 0);
483 	pt_idx1 = _mm_extract_epi8(ptype, 2);
484 	pt_idx2 = _mm_extract_epi8(ptype, 4);
485 	pt_idx3 = _mm_extract_epi8(ptype, 6);
486 	pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] |
487 			       !!(pt_idx0 & (1 << 6)) * rxq->tunnel;
488 	pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] |
489 			       !!(pt_idx1 & (1 << 6)) * rxq->tunnel;
490 	pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] |
491 			       !!(pt_idx2 & (1 << 6)) * rxq->tunnel;
492 	pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] |
493 			       !!(pt_idx3 & (1 << 6)) * rxq->tunnel;
494 	/* Fill flags for checksum and VLAN. */
495 	pinfo = _mm_and_si128(pinfo, ptype_ol_mask);
496 	pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo);
497 	/* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
498 	cv_flags = _mm_slli_epi32(pinfo, 9);
499 	cv_flags = _mm_or_si128(pinfo, cv_flags);
500 	/* Move back flags to start from byte[0]. */
501 	cv_flags = _mm_srli_epi32(cv_flags, 8);
502 	/* Mask out garbage bits. */
503 	cv_flags = _mm_and_si128(cv_flags, cv_mask);
504 	/* Merge to ol_flags. */
505 	ol_flags = _mm_or_si128(ol_flags, cv_flags);
506 	/* Merge mbuf_init and ol_flags. */
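	/*
	 * Shift each 32-bit ol_flags lane so that it lands in bytes 8-11 of
	 * its mbuf's 16-byte rearm area; a single blend with mbuf_init and one
	 * store then write rearm_data and ol_flags together.
	 */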
507 	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
508 	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
509 	rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
510 	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
511 	/* Write 8B rearm_data and 8B ol_flags. */
512 	_mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
513 	_mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
514 	_mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
515 	_mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3);
516 }
517 
518 /**
519  * Process a non-compressed completion and fill in mbufs in RX SW ring
520  * with data extracted from the title completion descriptor.
521  *
522  * @param rxq
523  *   Pointer to RX queue structure.
524  * @param cq
525  *   Pointer to completion array having a non-compressed completion at first.
526  * @param elts
527  *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
528  *   the title completion descriptor to be copied to the rest of mbufs.
529  * @param[out] pkts
530  *   Array to store received packets.
531  * @param pkts_n
532  *   Maximum number of packets in array.
533  * @param[out] err
534  *   Pointer to a flag. Set to a non-zero value if the pkts array has at
535  *   least one error packet to handle.
536  * @param[out] comp
537  *   Pointer to an index. Set to the index of the first compressed completion, if any.
538  *
539  * @return
540  *   Number of CQEs successfully processed.
541  */
542 static inline uint16_t
543 rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
544 		 struct rte_mbuf **elts, struct rte_mbuf **pkts,
545 		 uint16_t pkts_n, uint64_t *err, uint64_t *comp)
546 {
547 	const uint16_t q_n = 1 << rxq->cqe_n;
548 	const uint16_t q_mask = q_n - 1;
549 	unsigned int pos, adj;
550 	uint64_t n = 0;
551 	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
552 	uint16_t nocmp_n = 0;
553 	const uint8_t vic = rxq->cq_ci >> rxq->cqe_n;
554 	const uint8_t own = !(rxq->cq_ci & (q_mask + 1));
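	/*
	 * A CQE is recognized as valid either by its validity iteration count
	 * byte (enhanced CQE compression layout) or by the classic ownership
	 * bit, which toggles on every wrap of the completion queue.
	 */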
555 	const __m128i vic_check = _mm_set1_epi64x(0x00ff000000ff0000LL);
556 	const __m128i owner_check = _mm_set1_epi64x(0x0100000001000000LL);
557 	const __m128i opcode_check = _mm_set1_epi64x(0xf0000000f0000000LL);
558 	const __m128i format_check = _mm_set1_epi64x(0x0c0000000c000000LL);
559 	const __m128i resp_err_check = _mm_set1_epi64x(0xe0000000e0000000LL);
560 #ifdef MLX5_PMD_SOFT_COUNTERS
561 	uint32_t rcvd_byte = 0;
562 	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
563 	const __m128i len_shuf_mask =
564 		_mm_set_epi8(-1, -1, -1, -1,
565 			     -1, -1, -1, -1,
566 			     12, 13,  8,  9,
567 			      4,  5,  0,  1);
568 #endif
569 	const __m128i validity =
570 		_mm_set_epi8(0, vic, 0, 0,
571 			     0, vic, 0, 0,
572 			     0, vic, 0, 0,
573 			     0, vic, 0, 0);
574 	const __m128i ownership =
575 		_mm_set_epi8(own, 0, 0, 0,
576 			     own, 0, 0, 0,
577 			     own, 0, 0, 0,
578 			     own, 0, 0, 0);
579 	/* Mask to shuffle from extracted CQE to mbuf. */
580 	const __m128i shuf_mask =
581 		_mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */
582 			     12, 13, 14, 15, /* rss, bswap32 */
583 			     10, 11,         /* vlan_tci, bswap16 */
584 			      4,  5,         /* data_len, bswap16 */
585 			     -1, -1,         /* zero out 2nd half of pkt_len */
586 			      4,  5          /* pkt_len, bswap16 */);
587 	/* Mask to blend from the last Qword to the first DQword. */
588 	const __m128i blend_mask =
589 		_mm_set_epi8(-1, -1, -1, -1,
590 			     -1, -1, -1, -1,
591 			      0,  0,  0,  0,
592 			      0,  0,  0, -1);
593 	const __m128i zero = _mm_setzero_si128();
594 	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
595 	const __m128i crc_adj =
596 		_mm_set_epi16(0, 0, 0, 0, 0,
597 			      rxq->crc_present * RTE_ETHER_CRC_LEN,
598 			      0,
599 			      rxq->crc_present * RTE_ETHER_CRC_LEN);
600 	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
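	/*
	 * flow_mark_adj adds -1 to the shuffled fdir.hi lane only, and only
	 * when the mark action is enabled on the Rx queue.
	 */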
601 	/*
602 	 * A. load first Qword (8bytes) in one loop.
603 	 * B. copy 4 mbuf pointers from elts ring to returning pkts.
604 	 * C. load remaining CQE data and extract necessary fields.
605 	 *    Final 16-byte cqes[] extracted from the original 64-byte CQE has the
606 	 *    following structure:
607 	 *        struct {
608 	 *          uint8_t  pkt_info;
609 	 *          uint8_t  flow_tag[3];
610 	 *          uint16_t byte_cnt;
611 	 *          uint8_t  validity_iteration_count;
612 	 *          uint8_t  op_own;
613 	 *          uint16_t hdr_type_etc;
614 	 *          uint16_t vlan_info;
615 	 *          uint32_t rx_has_res;
616 	 *        } c;
617 	 * D. fill in mbuf.
618 	 * E. get valid CQEs.
619 	 * F. find compressed CQE.
620 	 */
621 	for (pos = 0;
622 	     pos < pkts_n;
623 	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
624 		__m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
625 		__m128i cqe_tmp1, cqe_tmp2;
626 		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
627 		__m128i op_own, op_own_tmp1, op_own_tmp2;
628 		__m128i opcode, owner_mask, invalid_mask;
629 		__m128i comp_mask, mini_mask;
630 		__m128i mask;
631 #ifdef MLX5_PMD_SOFT_COUNTERS
632 		__m128i byte_cnt;
633 #endif
634 		__m128i mbp1, mbp2;
635 		__m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
636 		unsigned int p1, p2, p3;
637 
638 		/* Prefetch next 4 CQEs. */
639 		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
640 			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
641 			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
642 			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
643 			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
644 		}
645 		/* A.0 do not cross the end of CQ. */
646 		mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
647 		mask = _mm_sll_epi64(ones, mask);
648 		p = _mm_andnot_si128(mask, p);
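		/*
		 * Lanes past the remaining CQE count are cleared, so the extra
		 * loads below simply re-read cq[pos]; those entries are masked
		 * out as invalid in step E.4.
		 */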
649 		/* A.1 load cqes. */
650 		p3 = _mm_extract_epi16(p, 3);
651 		cqes[3] = _mm_loadl_epi64(RTE_CAST_PTR(const __m128i *,
652 					   &cq[pos + p3].sop_drop_qpn));
653 		rte_compiler_barrier();
654 		p2 = _mm_extract_epi16(p, 2);
655 		cqes[2] = _mm_loadl_epi64(RTE_CAST_PTR(const __m128i *,
656 					   &cq[pos + p2].sop_drop_qpn));
657 		rte_compiler_barrier();
658 		/* B.1 load mbuf pointers. */
659 		mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]);
660 		mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]);
661 		/* A.1 load a block having op_own. */
662 		p1 = _mm_extract_epi16(p, 1);
663 		cqes[1] = _mm_loadl_epi64(RTE_CAST_PTR(const __m128i *,
664 					   &cq[pos + p1].sop_drop_qpn));
665 		rte_compiler_barrier();
666 		cqes[0] = _mm_loadl_epi64(RTE_CAST_PTR(const __m128i *,
667 					   &cq[pos].sop_drop_qpn));
668 		/* B.2 copy mbuf pointers. */
669 		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
670 		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
671 		rte_io_rmb();
672 		/* C.1 load remaining CQE data and extract necessary fields. */
673 		cqe_tmp2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &cq[pos + p3]));
674 		cqe_tmp1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &cq[pos + p2]));
675 		cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
676 		cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
677 		cqe_tmp2 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, &cq[pos + p3].csum));
678 		cqe_tmp1 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, &cq[pos + p2].csum));
679 		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
680 		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
681 		cqe_tmp2 = _mm_loadl_epi64(RTE_CAST_PTR(const __m128i *, &cq[pos + p3].rsvd4[2]));
682 		cqe_tmp1 = _mm_loadl_epi64(RTE_CAST_PTR(const __m128i *, &cq[pos + p2].rsvd4[2]));
683 		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
684 		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
685 		/* C.2 generate final structure for mbuf with swapping bytes. */
686 		pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
687 		pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
688 		/* C.3 adjust CRC length. */
689 		pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
690 		pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
691 		/* C.4 adjust flow mark. */
692 		pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
693 		pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
694 		/* D.1 fill in mbuf - rx_descriptor_fields1. */
695 		_mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
696 		_mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
697 		/* E.1 extract op_own field. */
698 		op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
699 		/* C.1 load remaining CQE data and extract necessary fields. */
700 		cqe_tmp2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &cq[pos + p1]));
701 		cqe_tmp1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &cq[pos]));
702 		cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
703 		cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
704 		cqe_tmp2 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, &cq[pos + p1].csum));
705 		cqe_tmp1 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, &cq[pos].csum));
706 		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
707 		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
708 		cqe_tmp2 = _mm_loadl_epi64(RTE_CAST_PTR(const __m128i *, &cq[pos + p1].rsvd4[2]));
709 		cqe_tmp1 = _mm_loadl_epi64(RTE_CAST_PTR(const __m128i *, &cq[pos].rsvd4[2]));
710 		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
711 		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
712 		/* C.2 generate final structure for mbuf with swapping bytes. */
713 		pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
714 		pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
715 		/* C.3 adjust CRC length. */
716 		pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
717 		pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
718 		/* C.4 adjust flow mark. */
719 		pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
720 		pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
721 		/* E.1 extract op_own byte. */
722 		op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
723 		op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
724 		/* D.1 fill in mbuf - rx_descriptor_fields1. */
725 		_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
726 		_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
727 		/* E.2 mask out CQEs belonging to HW. */
728 		if (rxq->cqe_comp_layout) {
729 			owner_mask = _mm_and_si128(op_own, vic_check);
730 			owner_mask = _mm_cmpeq_epi32(owner_mask, validity);
731 			owner_mask = _mm_xor_si128(owner_mask, ones);
732 		} else {
733 			owner_mask = _mm_and_si128(op_own, owner_check);
734 			owner_mask = _mm_cmpeq_epi32(owner_mask, ownership);
735 		}
736 		owner_mask = _mm_packs_epi32(owner_mask, zero);
737 		/* E.3 get mask for invalidated CQEs. */
738 		opcode = _mm_and_si128(op_own, opcode_check);
739 		invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
740 		invalid_mask = _mm_packs_epi32(invalid_mask, zero);
741 		/* E.4 mask out beyond boundary. */
742 		invalid_mask = _mm_or_si128(invalid_mask, mask);
743 		/* E.5 merge invalid_mask with invalid owner. */
744 		invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
745 		/* F.1 find compressed CQE format. */
746 		comp_mask = _mm_and_si128(op_own, format_check);
747 		comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
748 		comp_mask = _mm_packs_epi32(comp_mask, zero);
749 		/* F.2 mask out invalid entries. */
750 		comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
751 		comp_idx = _mm_cvtsi128_si64(comp_mask);
752 		/* F.3 get the first compressed CQE. */
753 		comp_idx = comp_idx ?
754 				rte_ctz64(comp_idx) /
755 					(sizeof(uint16_t) * 8) :
756 				MLX5_VPMD_DESCS_PER_LOOP;
757 		/* E.6 mask out entries after the compressed CQE. */
758 		mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
759 		mask = _mm_sll_epi64(ones, mask);
760 		invalid_mask = _mm_or_si128(invalid_mask, mask);
761 		/* E.7 count non-compressed valid CQEs. */
762 		n = _mm_cvtsi128_si64(invalid_mask);
763 		n = n ? rte_ctz64(n) / (sizeof(uint16_t) * 8) :
764 			MLX5_VPMD_DESCS_PER_LOOP;
765 		nocmp_n += n;
766 		/* D.2 get the final invalid mask. */
767 		mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
768 		mask = _mm_sll_epi64(ones, mask);
769 		invalid_mask = _mm_or_si128(invalid_mask, mask);
770 		/* D.3 check error in opcode. */
771 		adj = (!rxq->cqe_comp_layout &&
772 		       comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
773 		mask = _mm_set_epi64x(0, adj * sizeof(uint16_t) * 8);
774 		mini_mask = _mm_sll_epi64(invalid_mask, mask);
775 		opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
776 		opcode = _mm_packs_epi32(opcode, zero);
777 		opcode = _mm_andnot_si128(mini_mask, opcode);
778 		/* D.4 mark if any error is set. */
779 		*err |= _mm_cvtsi128_si64(opcode);
780 		/* D.5 fill in mbuf - rearm_data and packet_type. */
781 		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
782 		if (unlikely(rxq->shared)) {
783 			pkts[pos]->port = cq[pos].user_index_low;
784 			pkts[pos + 1]->port = cq[pos + p1].user_index_low;
785 			pkts[pos + 2]->port = cq[pos + p2].user_index_low;
786 			pkts[pos + 3]->port = cq[pos + p3].user_index_low;
787 		}
788 		if (unlikely(rxq->hw_timestamp)) {
789 			int offset = rxq->timestamp_offset;
790 			if (rxq->rt_timestamp) {
791 				struct mlx5_dev_ctx_shared *sh = rxq->sh;
792 				uint64_t ts;
793 
794 				ts = rte_be_to_cpu_64(cq[pos].timestamp);
795 				mlx5_timestamp_set(pkts[pos], offset,
796 					mlx5_txpp_convert_rx_ts(sh, ts));
797 				ts = rte_be_to_cpu_64(cq[pos + p1].timestamp);
798 				mlx5_timestamp_set(pkts[pos + 1], offset,
799 					mlx5_txpp_convert_rx_ts(sh, ts));
800 				ts = rte_be_to_cpu_64(cq[pos + p2].timestamp);
801 				mlx5_timestamp_set(pkts[pos + 2], offset,
802 					mlx5_txpp_convert_rx_ts(sh, ts));
803 				ts = rte_be_to_cpu_64(cq[pos + p3].timestamp);
804 				mlx5_timestamp_set(pkts[pos + 3], offset,
805 					mlx5_txpp_convert_rx_ts(sh, ts));
806 			} else {
807 				mlx5_timestamp_set(pkts[pos], offset,
808 					rte_be_to_cpu_64(cq[pos].timestamp));
809 				mlx5_timestamp_set(pkts[pos + 1], offset,
810 					rte_be_to_cpu_64(cq[pos + p1].timestamp));
811 				mlx5_timestamp_set(pkts[pos + 2], offset,
812 					rte_be_to_cpu_64(cq[pos + p2].timestamp));
813 				mlx5_timestamp_set(pkts[pos + 3], offset,
814 					rte_be_to_cpu_64(cq[pos + p3].timestamp));
815 			}
816 		}
817 		if (rxq->dynf_meta) {
818 			/* This code is subject to further optimization. */
819 			int32_t offs = rxq->flow_meta_offset;
820 			uint32_t mask = rxq->flow_meta_port_mask;
821 
822 			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
823 				rte_be_to_cpu_32
824 				(cq[pos].flow_table_metadata) &	mask;
825 			*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
826 				rte_be_to_cpu_32
827 				(cq[pos + p1].flow_table_metadata) & mask;
828 			*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
829 				rte_be_to_cpu_32
830 				(cq[pos + p2].flow_table_metadata) & mask;
831 			*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
832 				rte_be_to_cpu_32
833 				(cq[pos + p3].flow_table_metadata) & mask;
834 			if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
835 				pkts[pos]->ol_flags |= rxq->flow_meta_mask;
836 			if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
837 				pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
838 			if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
839 				pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
840 			if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
841 				pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
842 		}
843 #ifdef MLX5_PMD_SOFT_COUNTERS
844 		/* Add up the received byte count. */
845 		byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
846 		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
847 		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
848 		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
849 #endif
850 		/*
851 		 * Break the loop unless more valid CQEs are expected, or if
852 		 * a compressed CQE was found.
853 		 */
854 		if (n != MLX5_VPMD_DESCS_PER_LOOP)
855 			break;
856 	}
857 #ifdef MLX5_PMD_SOFT_COUNTERS
858 	rxq->stats.ipackets += nocmp_n;
859 	rxq->stats.ibytes += rcvd_byte;
860 #endif
861 	if (comp_idx == n)
862 		*comp = comp_idx;
863 	return nocmp_n;
864 }
865 
866 #endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */
867