/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2017 6WIND S.A.
 * Copyright 2017 Mellanox Technologies, Ltd
 */

#ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
#define RTE_PMD_MLX5_RXTX_VEC_SSE_H_

#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <rte_vect.h>

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

#include <mlx5_prm.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_rxtx_vec.h"
#include "mlx5_autoconf.h"

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

/**
 * Copy mbuf pointers from the RX SW ring to the array of returned packets.
 *
 * @param elts
 *   Pointer to the SW ring to copy from.
 * @param pkts
 *   Pointer to the array of packets to be filled.
 * @param n
 *   Number of packets to copy.
 */
static inline void
rxq_copy_mbuf_v(struct rte_mbuf **elts, struct rte_mbuf **pkts, uint16_t n)
{
	unsigned int pos;
	uint16_t p = n & -2;

	for (pos = 0; pos < p; pos += 2) {
		__m128i mbp;

		mbp = _mm_loadu_si128((__m128i *)&elts[pos]);
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp);
	}
	if (n & 1)
		pkts[pos] = elts[pos];
}
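
/*
 * For reference, a scalar equivalent of the copy above would be:
 *
 *	for (pos = 0; pos < n; ++pos)
 *		pkts[pos] = elts[pos];
 *
 * The vector version instead moves pairs of mbuf pointers with a single
 * 128-bit load/store (assuming 8-byte pointers) and falls back to one
 * scalar copy for an odd trailing element.
 */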

/**
 * Decompress a compressed completion and fill in mbufs in RX SW ring with data
 * extracted from the title completion descriptor.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array having a compressed completion at first.
 * @param elts
 *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
 *   the title completion descriptor to be copied to the rest of mbufs.
 * @param keep
 *   Keep unzipping if the next CQE is the miniCQE array.
 *
 * @return
 *   Number of mini-CQEs successfully decompressed.
 */
static inline uint16_t
rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
		    struct rte_mbuf **elts, bool keep)
{
	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + !rxq->cqe_comp_layout);
	/* Title packet is pre-built. */
	struct rte_mbuf *t_pkt = rxq->cqe_comp_layout ? &rxq->title_pkt : elts[0];
	unsigned int pos;
	unsigned int i;
	unsigned int inv = 0;
	/* Mask to shuffle from extracted mini CQE to mbuf. */
	const __m128i shuf_mask1 =
		_mm_set_epi8(0,  1,  2,  3, /* rss, bswap32 */
			    -1, -1,         /* skip vlan_tci */
			     6,  7,         /* data_len, bswap16 */
			    -1, -1,  6,  7, /* pkt_len, bswap16 */
			    -1, -1, -1, -1  /* skip packet_type */);
	const __m128i shuf_mask2 =
		_mm_set_epi8(8,  9, 10, 11, /* rss, bswap32 */
			    -1, -1,         /* skip vlan_tci */
			    14, 15,         /* data_len, bswap16 */
			    -1, -1, 14, 15, /* pkt_len, bswap16 */
			    -1, -1, -1, -1  /* skip packet_type */);
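	/*
	 * Each 128-bit load of the mini-CQE array below covers a pair of
	 * 8-byte mini-CQEs; shuf_mask1 extracts the byte count and RSS hash
	 * of the first mini-CQE of the pair and shuf_mask2 those of the
	 * second one, byte-swapping from big-endian CQE order on the fly.
	 */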
	/* Restore the compressed count. Must be 16 bits. */
	uint16_t mcqe_n = (rxq->cqe_comp_layout) ?
		(MLX5_CQE_NUM_MINIS(cq->op_own) + 1) :
		t_pkt->data_len + (rxq->crc_present * RTE_ETHER_CRC_LEN);
	uint16_t pkts_n = mcqe_n;
	const __m128i rearm =
		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
	const __m128i rxdf =
		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0, 0);
	__m128i ol_flags = _mm_setzero_si128();
	__m128i ol_flags_mask = _mm_setzero_si128();
#ifdef MLX5_PMD_SOFT_COUNTERS
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     14, 15,  6,  7,
			     10, 11,  2,  3);
#endif
	/*
	 * A. load mCQEs into a 128bit register.
	 * B. store rearm data to mbuf.
	 * C. combine data from mCQEs with rx_descriptor_fields1.
	 * D. store rx_descriptor_fields1.
	 * E. store flow tag (rte_flow mark).
	 */
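	/*
	 * Each pass of the loop below expands MLX5_VPMD_DESCS_PER_LOOP (4)
	 * mini-CQEs: rearm data and the blended rx_descriptor_fields1 are
	 * replicated from the pre-built title packet, while packet length,
	 * byte count and RSS hash come from the individual mini-CQEs.
	 */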
cycle:
	if (rxq->cqe_comp_layout)
		rte_prefetch0((void *)(cq + mcqe_n));
	for (pos = 0; pos < mcqe_n; ) {
		__m128i mcqe1, mcqe2;
		__m128i rxdf1, rxdf2;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt, invalid_mask;
#endif

		if (!rxq->cqe_comp_layout)
			for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
				if (likely(pos + i < mcqe_n))
					rte_prefetch0((void *)(cq + pos + i));
		/* A.1 load mCQEs into a 128bit register. */
		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				  &elts[pos]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				  &elts[pos + 1]->rx_descriptor_fields1,
				 rxdf2);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				  &elts[pos + 2]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				  &elts[pos + 3]->rx_descriptor_fields1,
				 rxdf2);
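		/*
		 * For the software byte counters below, lanes past the end of
		 * the mini-CQE array are discarded by shifting an all-ones
		 * mask left by (remaining mini-CQEs * 16) bits: a shift count
		 * of 64 or more clears the mask entirely, so full groups of
		 * four are kept as-is.
		 */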
#ifdef MLX5_PMD_SOFT_COUNTERS
		invalid_mask = _mm_set_epi64x(0,
					      (mcqe_n - pos) *
					      sizeof(uint16_t) * 8);
		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
		byte_cnt = _mm_blend_epi16(_mm_srli_si128(mcqe1, 4),
					   mcqe2, 0xcc);
		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
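		/*
		 * Flow mark handling: unless the mini-CQE format carries a
		 * flow tag per entry (FTAG_STRIDX), every packet of the
		 * session inherits the flow tag of the title packet.
		 */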
		if (rxq->mark) {
			if (rxq->mcqe_format !=
				MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) {
				const uint32_t flow_tag = t_pkt->hash.fdir.hi;

				/* E.1 store flow tag (rte_flow mark). */
				elts[pos]->hash.fdir.hi = flow_tag;
				elts[pos + 1]->hash.fdir.hi = flow_tag;
				elts[pos + 2]->hash.fdir.hi = flow_tag;
				elts[pos + 3]->hash.fdir.hi = flow_tag;
			} else {
				const __m128i flow_mark_adj =
					_mm_set_epi32(-1, -1, -1, -1);
				const __m128i flow_mark_shuf =
					_mm_set_epi8(-1,  9,  8, 12,
						     -1,  1,  0,  4,
						     -1, -1, -1, -1,
						     -1, -1, -1, -1);
				const __m128i ft_mask =
					_mm_set1_epi32(0xffffff00);
				const __m128i fdir_flags =
					_mm_set1_epi32(RTE_MBUF_F_RX_FDIR);
				const __m128i fdir_all_flags =
					_mm_set1_epi32(RTE_MBUF_F_RX_FDIR |
						       rxq->mark_flag);
				__m128i fdir_id_flags =
					_mm_set1_epi32(rxq->mark_flag);

				/* Extract flow_tag field. */
				__m128i ftag0 =
					_mm_shuffle_epi8(mcqe1, flow_mark_shuf);
				__m128i ftag1 =
					_mm_shuffle_epi8(mcqe2, flow_mark_shuf);
				__m128i ftag =
					_mm_unpackhi_epi64(ftag0, ftag1);
				__m128i invalid_mask =
					_mm_cmpeq_epi32(ftag, zero);

				ol_flags_mask = _mm_or_si128(ol_flags_mask,
							     fdir_all_flags);
				/* Set RTE_MBUF_F_RX_FDIR if flow tag is non-zero. */
				ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
				/* Mask out invalid entries. */
				fdir_id_flags = _mm_andnot_si128(invalid_mask,
								 fdir_id_flags);
				/* Check if flow tag is MLX5_FLOW_MARK_DEFAULT. */
				ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(_mm_cmpeq_epi32(ftag,
							 ft_mask),
					fdir_id_flags));
				ftag = _mm_add_epi32(ftag, flow_mark_adj);
				elts[pos]->hash.fdir.hi =
						_mm_extract_epi32(ftag, 0);
				elts[pos + 1]->hash.fdir.hi =
						_mm_extract_epi32(ftag, 1);
				elts[pos + 2]->hash.fdir.hi =
						_mm_extract_epi32(ftag, 2);
				elts[pos + 3]->hash.fdir.hi =
						_mm_extract_epi32(ftag, 3);
			}
		}
		if (unlikely(rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)) {
			if (rxq->mcqe_format ==
			    MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
				const uint8_t pkt_info =
					(cq->pkt_info & 0x3) << 6;
				const uint8_t pkt_hdr0 =
					_mm_extract_epi8(mcqe1, 0);
				const uint8_t pkt_hdr1 =
					_mm_extract_epi8(mcqe1, 8);
				const uint8_t pkt_hdr2 =
					_mm_extract_epi8(mcqe2, 0);
				const uint8_t pkt_hdr3 =
					_mm_extract_epi8(mcqe2, 8);
				const __m128i vlan_mask =
					_mm_set1_epi32(RTE_MBUF_F_RX_VLAN |
						       RTE_MBUF_F_RX_VLAN_STRIPPED);
				const __m128i cv_mask =
					_mm_set1_epi32(MLX5_CQE_VLAN_STRIPPED);
				const __m128i pkt_cv =
					_mm_set_epi32(pkt_hdr0 & 0x1,
						      pkt_hdr1 & 0x1,
						      pkt_hdr2 & 0x1,
						      pkt_hdr3 & 0x1);

				ol_flags_mask = _mm_or_si128(ol_flags_mask,
							     vlan_mask);
				ol_flags = _mm_or_si128(ol_flags,
					_mm_and_si128(_mm_cmpeq_epi32(pkt_cv,
					cv_mask), vlan_mask));
				elts[pos]->packet_type =
					mlx5_ptype_table[(pkt_hdr0 >> 2) |
							 pkt_info];
				elts[pos + 1]->packet_type =
					mlx5_ptype_table[(pkt_hdr1 >> 2) |
							 pkt_info];
				elts[pos + 2]->packet_type =
					mlx5_ptype_table[(pkt_hdr2 >> 2) |
							 pkt_info];
				elts[pos + 3]->packet_type =
					mlx5_ptype_table[(pkt_hdr3 >> 2) |
							 pkt_info];
				if (rxq->tunnel) {
					elts[pos]->packet_type |=
						!!(((pkt_hdr0 >> 2) |
						pkt_info) & (1 << 6));
					elts[pos + 1]->packet_type |=
						!!(((pkt_hdr1 >> 2) |
						pkt_info) & (1 << 6));
					elts[pos + 2]->packet_type |=
						!!(((pkt_hdr2 >> 2) |
						pkt_info) & (1 << 6));
					elts[pos + 3]->packet_type |=
						!!(((pkt_hdr3 >> 2) |
						pkt_info) & (1 << 6));
				}
			}
			const __m128i hash_flags =
				_mm_set1_epi32(RTE_MBUF_F_RX_RSS_HASH);
			const __m128i rearm_flags =
				_mm_set1_epi32((uint32_t)t_pkt->ol_flags);

			ol_flags_mask = _mm_or_si128(ol_flags_mask, hash_flags);
			ol_flags = _mm_or_si128(ol_flags,
				_mm_andnot_si128(ol_flags_mask, rearm_flags));
			elts[pos]->ol_flags =
				_mm_extract_epi32(ol_flags, 0);
			elts[pos + 1]->ol_flags =
				_mm_extract_epi32(ol_flags, 1);
			elts[pos + 2]->ol_flags =
				_mm_extract_epi32(ol_flags, 2);
			elts[pos + 3]->ol_flags =
				_mm_extract_epi32(ol_flags, 3);
			elts[pos]->hash.rss = 0;
			elts[pos + 1]->hash.rss = 0;
			elts[pos + 2]->hash.rss = 0;
			elts[pos + 3]->hash.rss = 0;
		}
		if (rxq->dynf_meta) {
			int32_t offs = rxq->flow_meta_offset;
			const uint32_t meta =
				*RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *);

			/* Check if title packet has valid metadata. */
			if (meta) {
				MLX5_ASSERT(t_pkt->ol_flags &
					    rxq->flow_meta_mask);
				*RTE_MBUF_DYNFIELD(elts[pos], offs,
							uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 1], offs,
							uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 2], offs,
							uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 3], offs,
							uint32_t *) = meta;
			}
		}
		pos += MLX5_VPMD_DESCS_PER_LOOP;
		/* Move to next CQE and invalidate consumed CQEs. */
		if (!rxq->cqe_comp_layout) {
			if (!(pos & 0x7) && pos < mcqe_n) {
				if (pos + 8 < mcqe_n)
					rte_prefetch0((void *)(cq + pos + 8));
				mcq = (void *)(cq + pos);
				for (i = 0; i < 8; ++i)
					cq[inv++].op_own = MLX5_CQE_INVALIDATE;
			}
		}
	}
	if (rxq->cqe_comp_layout && keep) {
		int ret;
		/* Keep unzipping if the next CQE is the miniCQE array. */
		cq = &cq[mcqe_n];
		ret = check_cqe_iteration(cq, rxq->cqe_n, rxq->cq_ci + pkts_n);
		if (ret == MLX5_CQE_STATUS_SW_OWN &&
		    MLX5_CQE_FORMAT(cq->op_own) == MLX5_COMPRESSED) {
			pos = 0;
			elts = &elts[mcqe_n];
			mcq = (void *)cq;
			mcqe_n = MLX5_CQE_NUM_MINIS(cq->op_own) + 1;
			pkts_n += mcqe_n;
			goto cycle;
		}
	} else {
		/* Invalidate the rest of CQEs. */
		for (; inv < pkts_n; ++inv)
			cq[inv].op_own = MLX5_CQE_INVALIDATE;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += pkts_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	return pkts_n;
}

/**
 * Calculate packet type and offload flags for each mbuf and store them.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cqes[4]
 *   Array of four 16-byte completions extracted from the original completion
 *   descriptor.
 * @param op_err
 *   Opcode vector having responder error status. Each field is 4B.
 * @param pkts
 *   Pointer to array of packets to be filled.
 */
static inline void
rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
			 __m128i op_err, struct rte_mbuf **pkts)
{
	__m128i pinfo0, pinfo1;
	__m128i pinfo, ptype;
	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * RTE_MBUF_F_RX_RSS_HASH |
					  rxq->hw_timestamp * rxq->timestamp_rx_flag);
	__m128i cv_flags;
	const __m128i zero = _mm_setzero_si128();
	const __m128i ptype_mask = _mm_set1_epi32(0xfd06);
	const __m128i ptype_ol_mask = _mm_set1_epi32(0x106);
	const __m128i pinfo_mask = _mm_set1_epi32(0x3);
	const __m128i cv_flag_sel =
		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			     (uint8_t)((RTE_MBUF_F_RX_IP_CKSUM_GOOD |
					RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
			     0,
			     (uint8_t)(RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1),
			     0,
			     (uint8_t)(RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1),
			     (uint8_t)(RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED),
			     0);
	const __m128i cv_mask =
		_mm_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
			       RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);
	const __m128i mbuf_init =
		_mm_load_si128((__m128i *)&rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;
	uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;

	/* Extract pkt_info field. */
	pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
	pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1);
	/* Extract hdr_type_etc field. */
	pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]);
	ptype = _mm_unpacklo_epi64(pinfo0, pinfo1);
	if (rxq->mark) {
		const __m128i pinfo_ft_mask = _mm_set1_epi32(0xffffff00);
		const __m128i fdir_flags = _mm_set1_epi32(RTE_MBUF_F_RX_FDIR);
		__m128i fdir_id_flags = _mm_set1_epi32(rxq->mark_flag);
		__m128i flow_tag, invalid_mask;

		flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask);
		/* Check if flow tag is non-zero then set RTE_MBUF_F_RX_FDIR. */
		invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
		/* Mask out invalid entries. */
		fdir_id_flags = _mm_andnot_si128(invalid_mask, fdir_id_flags);
		/* Check if flow tag is MLX5_FLOW_MARK_DEFAULT. */
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(
						_mm_cmpeq_epi32(flow_tag,
								pinfo_ft_mask),
						fdir_id_flags));
	}
	/*
	 * Merge the two fields to generate the following:
	 * bit[1]     = l3_ok
	 * bit[2]     = l4_ok
	 * bit[8]     = cv
	 * bit[11:10] = l3_hdr_type
	 * bit[14:12] = l4_hdr_type
	 * bit[15]    = ip_frag
	 * bit[16]    = tunneled
	 * bit[17]    = outer_l3_type
	 */
	ptype = _mm_and_si128(ptype, ptype_mask);
	pinfo = _mm_and_si128(pinfo, pinfo_mask);
	pinfo = _mm_slli_epi32(pinfo, 16);
	/* Make pinfo carry the merged fields for the ol_flags calculation. */
	pinfo = _mm_or_si128(ptype, pinfo);
	ptype = _mm_srli_epi32(pinfo, 10);
	ptype = _mm_packs_epi32(ptype, zero);
	/* Errored packets will have RTE_PTYPE_ALL_MASK. */
	op_err = _mm_srli_epi16(op_err, 8);
	ptype = _mm_or_si128(ptype, op_err);
	pt_idx0 = _mm_extract_epi8(ptype, 0);
	pt_idx1 = _mm_extract_epi8(ptype, 2);
	pt_idx2 = _mm_extract_epi8(ptype, 4);
	pt_idx3 = _mm_extract_epi8(ptype, 6);
	pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] |
			       !!(pt_idx0 & (1 << 6)) * rxq->tunnel;
	pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] |
			       !!(pt_idx1 & (1 << 6)) * rxq->tunnel;
	pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] |
			       !!(pt_idx2 & (1 << 6)) * rxq->tunnel;
	pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] |
			       !!(pt_idx3 & (1 << 6)) * rxq->tunnel;
	/* Fill flags for checksum and VLAN. */
	pinfo = _mm_and_si128(pinfo, ptype_ol_mask);
	pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo);
	/* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
	cv_flags = _mm_slli_epi32(pinfo, 9);
	cv_flags = _mm_or_si128(pinfo, cv_flags);
	/* Move back flags to start from byte[0]. */
	cv_flags = _mm_srli_epi32(cv_flags, 8);
	/* Mask out garbage bits. */
	cv_flags = _mm_and_si128(cv_flags, cv_mask);
	/* Merge to ol_flags. */
	ol_flags = _mm_or_si128(ol_flags, cv_flags);
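	/*
	 * The 32-bit ol_flags of each packet is shifted so that it lands in
	 * bytes 8..11 of its target register; _mm_blend_epi16() with mask
	 * 0x30 then takes 16-bit words 4-5 from the shifted flags and the
	 * rest from mbuf_init, so each 16B store below writes 8B of
	 * rearm_data followed by 8B of ol_flags.
	 */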
	/* Merge mbuf_init and ol_flags. */
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
	/* Write 8B rearm_data and 8B ol_flags. */
	_mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3);
}

/**
 * Process non-compressed completions and fill in the mbufs of the RX SW ring
 * with data extracted from the corresponding completion descriptors.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array having a non-compressed completion at first.
 * @param elts
 *   Pointer to the SW ring holding the mbufs to be filled.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 * @param[out] err
 *   Pointer to a flag. Set to a non-zero value if the pkts array has at least
 *   one error packet to handle.
 * @param[out] comp
 *   Pointer to an index. Set it to the first compressed completion if any.
 *
 * @return
 *   Number of CQEs successfully processed.
 */
static inline uint16_t
rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
		 struct rte_mbuf **elts, struct rte_mbuf **pkts,
		 uint16_t pkts_n, uint64_t *err, uint64_t *comp)
{
	const uint16_t q_n = 1 << rxq->cqe_n;
	const uint16_t q_mask = q_n - 1;
	unsigned int pos, adj;
	uint64_t n = 0;
	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
	uint16_t nocmp_n = 0;
	const uint8_t vic = rxq->cq_ci >> rxq->cqe_n;
	const uint8_t own = !(rxq->cq_ci & (q_mask + 1));
	const __m128i vic_check = _mm_set1_epi64x(0x00ff000000ff0000LL);
	const __m128i owner_check = _mm_set1_epi64x(0x0100000001000000LL);
	const __m128i opcode_check = _mm_set1_epi64x(0xf0000000f0000000LL);
	const __m128i format_check = _mm_set1_epi64x(0x0c0000000c000000LL);
	const __m128i resp_err_check = _mm_set1_epi64x(0xe0000000e0000000LL);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     12, 13,  8,  9,
			      4,  5,  0,  1);
#endif
	const __m128i validity =
		_mm_set_epi8(0, vic, 0, 0,
			     0, vic, 0, 0,
			     0, vic, 0, 0,
			     0, vic, 0, 0);
	const __m128i ownership =
		_mm_set_epi8(own, 0, 0, 0,
			     own, 0, 0, 0,
			     own, 0, 0, 0,
			     own, 0, 0, 0);
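	/*
	 * CQEs still owned by hardware are detected either via the validity
	 * iteration count (enhanced CQE compression layout, compared against
	 * `vic` derived from cq_ci) or via the ownership bit (classic layout,
	 * compared against `own`); such CQEs are folded into invalid_mask
	 * further below.
	 */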
	/* Mask to shuffle from extracted CQE to mbuf. */
	const __m128i shuf_mask =
		_mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */
			     12, 13, 14, 15, /* rss, bswap32 */
			     10, 11,         /* vlan_tci, bswap16 */
			      4,  5,         /* data_len, bswap16 */
			     -1, -1,         /* zero out 2nd half of pkt_len */
			      4,  5          /* pkt_len, bswap16 */);
	/* Mask to blend from the last Qword to the first DQword. */
	const __m128i blend_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			      0,  0,  0,  0,
			      0,  0,  0, -1);
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0, 0, 0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN);
	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
	/*
	 * A. load first Qword (8bytes) in one loop.
	 * B. copy 4 mbuf pointers from elts ring to returning pkts.
	 * C. load remaining CQE data and extract necessary fields.
	 *    The final 16-byte cqes[] extracted from the original 64-byte CQE
	 *    has the following structure:
	 *        struct {
	 *          uint8_t  pkt_info;
	 *          uint8_t  flow_tag[3];
	 *          uint16_t byte_cnt;
	 *          uint8_t  validity_iteration_count;
	 *          uint8_t  op_own;
	 *          uint16_t hdr_type_etc;
	 *          uint16_t vlan_info;
	 *          uint32_t rx_has_res;
	 *        } c;
	 * D. fill in mbuf.
	 * E. get valid CQEs.
	 * F. find compressed CQE.
	 */
	for (pos = 0;
	     pos < pkts_n;
	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
		__m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
		__m128i cqe_tmp1, cqe_tmp2;
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i op_own, op_own_tmp1, op_own_tmp2;
		__m128i opcode, owner_mask, invalid_mask;
		__m128i comp_mask, mini_mask;
		__m128i mask;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt;
#endif
		__m128i mbp1, mbp2;
		__m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
		unsigned int p1, p2, p3;

		/* Prefetch next 4 CQEs. */
		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
		}
		/* A.0 do not cross the end of CQ. */
		mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		p = _mm_andnot_si128(mask, p);
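		/*
		 * `p` holds the per-lane CQE offsets {0, 1, 2, 3}; lanes
		 * beyond the number of packets left are cleared to 0 here,
		 * so the loads below simply re-read cq[pos] for them and the
		 * results are discarded later through invalid_mask.
		 */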
		/* A.1 load cqes. */
		p3 = _mm_extract_epi16(p, 3);
		cqes[3] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p3].sop_drop_qpn);
		rte_compiler_barrier();
		p2 = _mm_extract_epi16(p, 2);
		cqes[2] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p2].sop_drop_qpn);
		rte_compiler_barrier();
		/* B.1 load mbuf pointers. */
		mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]);
		mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]);
		/* A.1 load a block having op_own. */
		p1 = _mm_extract_epi16(p, 1);
		cqes[1] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p1].sop_drop_qpn);
		rte_compiler_barrier();
		cqes[0] = _mm_loadl_epi64((__m128i *)
					   &cq[pos].sop_drop_qpn);
		/* B.2 copy mbuf pointers. */
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
		rte_io_rmb();
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
		cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
		cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
		pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
		pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
		pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
		_mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
		/* E.1 extract op_own field. */
		op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
		cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
		cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
		pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
		pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
		pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
		/* E.1 extract op_own byte. */
		op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
		op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
		_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
		/* E.2 mask out CQEs belonging to HW. */
		if (rxq->cqe_comp_layout) {
			owner_mask = _mm_and_si128(op_own, vic_check);
			owner_mask = _mm_cmpeq_epi32(owner_mask, validity);
			owner_mask = _mm_xor_si128(owner_mask, ones);
		} else {
			owner_mask = _mm_and_si128(op_own, owner_check);
			owner_mask = _mm_cmpeq_epi32(owner_mask, ownership);
		}
		owner_mask = _mm_packs_epi32(owner_mask, zero);
		/* E.3 get mask for invalidated CQEs. */
		opcode = _mm_and_si128(op_own, opcode_check);
		invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
		invalid_mask = _mm_packs_epi32(invalid_mask, zero);
		/* E.4 mask out beyond boundary. */
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.5 merge invalid_mask with invalid owner. */
		invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
		/* F.1 find compressed CQE format. */
		comp_mask = _mm_and_si128(op_own, format_check);
		comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
		comp_mask = _mm_packs_epi32(comp_mask, zero);
		/* F.2 mask out invalid entries. */
		comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
		comp_idx = _mm_cvtsi128_si64(comp_mask);
		/* F.3 get the first compressed CQE. */
		comp_idx = comp_idx ?
				rte_ctz64(comp_idx) /
					(sizeof(uint16_t) * 8) :
				MLX5_VPMD_DESCS_PER_LOOP;
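		/*
		 * After packing, each CQE occupies one 16-bit lane of the
		 * mask registers, so comp_idx ends up as the index of the
		 * first compressed CQE within this group of four, or
		 * MLX5_VPMD_DESCS_PER_LOOP when none was found; n below is
		 * computed the same way for the first invalid CQE.
		 */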
		/* E.6 mask out entries after the compressed CQE. */
		mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.7 count non-compressed valid CQEs. */
		n = _mm_cvtsi128_si64(invalid_mask);
		n = n ? rte_ctz64(n) / (sizeof(uint16_t) * 8) :
			MLX5_VPMD_DESCS_PER_LOOP;
		nocmp_n += n;
		/* D.2 get the final invalid mask. */
		mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* D.3 check error in opcode. */
		adj = (!rxq->cqe_comp_layout &&
		       comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
		mask = _mm_set_epi64x(0, adj * sizeof(uint16_t) * 8);
		mini_mask = _mm_sll_epi64(invalid_mask, mask);
		opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
		opcode = _mm_packs_epi32(opcode, zero);
		opcode = _mm_andnot_si128(mini_mask, opcode);
		/* D.4 mark if any error is set. */
		*err |= _mm_cvtsi128_si64(opcode);
		/* D.5 fill in mbuf - rearm_data and packet_type. */
		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
		if (unlikely(rxq->shared)) {
			pkts[pos]->port = cq[pos].user_index_low;
			pkts[pos + p1]->port = cq[pos + p1].user_index_low;
			pkts[pos + p2]->port = cq[pos + p2].user_index_low;
			pkts[pos + p3]->port = cq[pos + p3].user_index_low;
		}
		if (unlikely(rxq->hw_timestamp)) {
			int offset = rxq->timestamp_offset;
			if (rxq->rt_timestamp) {
				struct mlx5_dev_ctx_shared *sh = rxq->sh;
				uint64_t ts;

				ts = rte_be_to_cpu_64(cq[pos].timestamp);
				mlx5_timestamp_set(pkts[pos], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
				ts = rte_be_to_cpu_64(cq[pos + p1].timestamp);
				mlx5_timestamp_set(pkts[pos + 1], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
				ts = rte_be_to_cpu_64(cq[pos + p2].timestamp);
				mlx5_timestamp_set(pkts[pos + 2], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
				ts = rte_be_to_cpu_64(cq[pos + p3].timestamp);
				mlx5_timestamp_set(pkts[pos + 3], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
			} else {
				mlx5_timestamp_set(pkts[pos], offset,
					rte_be_to_cpu_64(cq[pos].timestamp));
				mlx5_timestamp_set(pkts[pos + 1], offset,
					rte_be_to_cpu_64(cq[pos + p1].timestamp));
				mlx5_timestamp_set(pkts[pos + 2], offset,
					rte_be_to_cpu_64(cq[pos + p2].timestamp));
				mlx5_timestamp_set(pkts[pos + 3], offset,
					rte_be_to_cpu_64(cq[pos + p3].timestamp));
			}
		}
		if (rxq->dynf_meta) {
			/* This code is subject to further optimization. */
			int32_t offs = rxq->flow_meta_offset;
			uint32_t mask = rxq->flow_meta_port_mask;

			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos].flow_table_metadata) & mask;
			*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos + p1].flow_table_metadata) & mask;
			*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos + p2].flow_table_metadata) & mask;
			*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos + p3].flow_table_metadata) & mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
				pkts[pos]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
				pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
				pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
				pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
		}
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Add up received bytes count. */
		byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		/*
		 * Break the loop unless more valid CQEs are expected, or if
		 * there's a compressed CQE.
		 */
		if (n != MLX5_VPMD_DESCS_PER_LOOP)
			break;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += nocmp_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	if (comp_idx == n)
		*comp = comp_idx;
	return nocmp_n;
}
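
/*
 * Calling contract, inferred from the interfaces above (a simplified sketch
 * only; the actual burst routine lives outside this header):
 *
 *	nocmp_n = rxq_cq_process_v(rxq, cq, elts, pkts, pkts_n, &err, &comp);
 *	if (comp < MLX5_VPMD_DESCS_PER_LOOP)
 *		rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n], keep);
 *
 * i.e. the non-compressed completions are consumed first, then the mini-CQE
 * array that follows them is expanded into the SW ring using the pre-built
 * title packet, and rxq_copy_mbuf_v() returns the resulting mbufs to the
 * application.
 */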

#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */