/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2017 6WIND S.A.
 * Copyright 2017 Mellanox Technologies, Ltd
 */

#ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
#define RTE_PMD_MLX5_RXTX_VEC_SSE_H_

#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <rte_vect.h>

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

#include <mlx5_prm.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_rxtx_vec.h"
#include "mlx5_autoconf.h"

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

/**
 * Copy mbuf pointers from the RX SW ring to the array of packets to return.
 *
 * @param elts
 *   Pointer to the SW ring to copy the mbuf pointers from.
 * @param pkts
 *   Pointer to the array of packets to be filled.
 * @param n
 *   Number of packets to be copied.
 */
static inline void
rxq_copy_mbuf_v(struct rte_mbuf **elts, struct rte_mbuf **pkts, uint16_t n)
{
	unsigned int pos;
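	/*
	 * Copy two pointers (16 bytes) per iteration; n & -2 rounds the count
	 * down to an even number and the odd leftover is copied afterwards.
	 */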
	uint16_t p = n & -2;

	for (pos = 0; pos < p; pos += 2) {
		__m128i mbp;

		mbp = _mm_loadu_si128((__m128i *)&elts[pos]);
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp);
	}
	if (n & 1)
		pkts[pos] = elts[pos];
}

/**
 * Decompress a compressed completion and fill in mbufs in RX SW ring with data
 * extracted from the title completion descriptor.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array having a compressed completion as its first
 *   entry.
 * @param elts
 *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
 *   the title completion descriptor and is copied to the rest of the mbufs.
 * @param keep
 *   Keep unzipping if the next CQE is the miniCQE array.
 *
 * @return
 *   Number of mini-CQEs successfully decompressed.
 */
static inline uint16_t
rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
		    struct rte_mbuf **elts, bool keep)
{
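	/*
	 * When rxq->cqe_comp_layout is set, the mini-CQE array starts in the
	 * title CQE itself; otherwise it starts in the CQE that follows it.
	 */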
	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + !rxq->cqe_comp_layout);
	/* Title packet is pre-built. */
	struct rte_mbuf *t_pkt = rxq->cqe_comp_layout ? &rxq->title_pkt : elts[0];
	unsigned int pos;
	unsigned int i;
	unsigned int inv = 0;
	/* Mask to shuffle from extracted mini CQE to mbuf. */
	const __m128i shuf_mask1 =
		_mm_set_epi8(0,  1,  2,  3, /* rss, bswap32 */
			    -1, -1,         /* skip vlan_tci */
			     6,  7,         /* data_len, bswap16 */
			    -1, -1,  6,  7, /* pkt_len, bswap16 */
			    -1, -1, -1, -1  /* skip packet_type */);
	const __m128i shuf_mask2 =
		_mm_set_epi8(8,  9, 10, 11, /* rss, bswap32 */
			    -1, -1,         /* skip vlan_tci */
			    14, 15,         /* data_len, bswap16 */
			    -1, -1, 14, 15, /* pkt_len, bswap16 */
			    -1, -1, -1, -1  /* skip packet_type */);
	/* Restore the compressed count. Must be 16 bits. */
	uint16_t mcqe_n = (rxq->cqe_comp_layout) ?
		(MLX5_CQE_NUM_MINIS(cq->op_own) + 1U) : rte_be_to_cpu_32(cq->byte_cnt);
	uint16_t pkts_n = mcqe_n;
	const __m128i rearm =
		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
	const __m128i rxdf =
		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0, 0);
	__m128i ol_flags = _mm_setzero_si128();
	__m128i ol_flags_mask = _mm_setzero_si128();
#ifdef MLX5_PMD_SOFT_COUNTERS
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     14, 15,  6,  7,
			     10, 11,  2,  3);
#endif
	/*
	 * A. load mCQEs into a 128bit register.
	 * B. store rearm data to mbuf.
	 * C. combine data from mCQEs with rx_descriptor_fields1.
	 * D. store rx_descriptor_fields1.
	 * E. store flow tag (rte_flow mark).
	 */
cycle:
	if (rxq->cqe_comp_layout)
		rte_prefetch0((void *)(cq + mcqe_n));
	for (pos = 0; pos < mcqe_n; ) {
		__m128i mcqe1, mcqe2;
		__m128i rxdf1, rxdf2;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt, invalid_mask;
#endif

		if (!rxq->cqe_comp_layout)
			for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
				if (likely(pos + i < mcqe_n))
					rte_prefetch0((void *)(cq + pos + i));
		/* A.1 load mCQEs into a 128bit register. */
		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				  &elts[pos]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				  &elts[pos + 1]->rx_descriptor_fields1,
				 rxdf2);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				  &elts[pos + 2]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				  &elts[pos + 3]->rx_descriptor_fields1,
				 rxdf2);
#ifdef MLX5_PMD_SOFT_COUNTERS
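		/*
		 * Shift all-ones left by 16 bits per remaining mini-CQE so
		 * that byte counts of lanes beyond the tail are dropped from
		 * the statistics; a shift of 64 bits or more clears the mask.
		 */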
		invalid_mask = _mm_set_epi64x(0,
					      (mcqe_n - pos) *
					      sizeof(uint16_t) * 8);
		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
		byte_cnt = _mm_blend_epi16(_mm_srli_si128(mcqe1, 4),
					   mcqe2, 0xcc);
		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		if (rxq->mark) {
			if (rxq->mcqe_format !=
				MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) {
				const uint32_t flow_tag = t_pkt->hash.fdir.hi;

				/* E.1 store flow tag (rte_flow mark). */
				elts[pos]->hash.fdir.hi = flow_tag;
				elts[pos + 1]->hash.fdir.hi = flow_tag;
				elts[pos + 2]->hash.fdir.hi = flow_tag;
				elts[pos + 3]->hash.fdir.hi = flow_tag;
			} else {
				const __m128i flow_mark_adj =
					_mm_set_epi32(-1, -1, -1, -1);
				const __m128i flow_mark_shuf =
					_mm_set_epi8(-1,  9,  8, 12,
						     -1,  1,  0,  4,
						     -1, -1, -1, -1,
						     -1, -1, -1, -1);
				const __m128i ft_mask =
					_mm_set1_epi32(0xffffff00);
				const __m128i fdir_flags =
					_mm_set1_epi32(RTE_MBUF_F_RX_FDIR);
				const __m128i fdir_all_flags =
					_mm_set1_epi32(RTE_MBUF_F_RX_FDIR |
						       rxq->mark_flag);
				__m128i fdir_id_flags =
					_mm_set1_epi32(rxq->mark_flag);

				/* Extract flow_tag field. */
				__m128i ftag0 =
					_mm_shuffle_epi8(mcqe1, flow_mark_shuf);
				__m128i ftag1 =
					_mm_shuffle_epi8(mcqe2, flow_mark_shuf);
				__m128i ftag =
					_mm_unpackhi_epi64(ftag0, ftag1);
				__m128i invalid_mask =
					_mm_cmpeq_epi32(ftag, zero);

				ol_flags_mask = _mm_or_si128(ol_flags_mask,
							     fdir_all_flags);
				/* Set RTE_MBUF_F_RX_FDIR if flow tag is non-zero. */
				ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
				/* Mask out invalid entries. */
				fdir_id_flags = _mm_andnot_si128(invalid_mask,
								 fdir_id_flags);
				/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
				ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(_mm_cmpeq_epi32(ftag,
							 ft_mask),
					fdir_id_flags));
				ftag = _mm_add_epi32(ftag, flow_mark_adj);
				elts[pos]->hash.fdir.hi =
						_mm_extract_epi32(ftag, 0);
				elts[pos + 1]->hash.fdir.hi =
						_mm_extract_epi32(ftag, 1);
				elts[pos + 2]->hash.fdir.hi =
						_mm_extract_epi32(ftag, 2);
				elts[pos + 3]->hash.fdir.hi =
						_mm_extract_epi32(ftag, 3);
			}
		}
		if (unlikely(rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)) {
			if (rxq->mcqe_format ==
			    MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
				const uint8_t pkt_info =
					(cq->pkt_info & 0x3) << 6;
				const uint8_t pkt_hdr0 =
					_mm_extract_epi8(mcqe1, 0);
				const uint8_t pkt_hdr1 =
					_mm_extract_epi8(mcqe1, 8);
				const uint8_t pkt_hdr2 =
					_mm_extract_epi8(mcqe2, 0);
				const uint8_t pkt_hdr3 =
					_mm_extract_epi8(mcqe2, 8);
				const __m128i vlan_mask =
					_mm_set1_epi32(RTE_MBUF_F_RX_VLAN |
						       RTE_MBUF_F_RX_VLAN_STRIPPED);
				const __m128i cv_mask =
					_mm_set1_epi32(MLX5_CQE_VLAN_STRIPPED);
				const __m128i pkt_cv =
					_mm_set_epi32(pkt_hdr0 & 0x1,
						      pkt_hdr1 & 0x1,
						      pkt_hdr2 & 0x1,
						      pkt_hdr3 & 0x1);

				ol_flags_mask = _mm_or_si128(ol_flags_mask,
							     vlan_mask);
				ol_flags = _mm_or_si128(ol_flags,
					_mm_and_si128(_mm_cmpeq_epi32(pkt_cv,
					cv_mask), vlan_mask));
				elts[pos]->packet_type =
					mlx5_ptype_table[(pkt_hdr0 >> 2) |
							 pkt_info];
				elts[pos + 1]->packet_type =
					mlx5_ptype_table[(pkt_hdr1 >> 2) |
							 pkt_info];
				elts[pos + 2]->packet_type =
					mlx5_ptype_table[(pkt_hdr2 >> 2) |
							 pkt_info];
				elts[pos + 3]->packet_type =
					mlx5_ptype_table[(pkt_hdr3 >> 2) |
							 pkt_info];
				if (rxq->tunnel) {
					elts[pos]->packet_type |=
						!!(((pkt_hdr0 >> 2) |
						pkt_info) & (1 << 6));
					elts[pos + 1]->packet_type |=
						!!(((pkt_hdr1 >> 2) |
						pkt_info) & (1 << 6));
					elts[pos + 2]->packet_type |=
						!!(((pkt_hdr2 >> 2) |
						pkt_info) & (1 << 6));
					elts[pos + 3]->packet_type |=
						!!(((pkt_hdr3 >> 2) |
						pkt_info) & (1 << 6));
				}
			}
			const __m128i hash_flags =
				_mm_set1_epi32(RTE_MBUF_F_RX_RSS_HASH);
			const __m128i rearm_flags =
				_mm_set1_epi32((uint32_t)t_pkt->ol_flags);

			ol_flags_mask = _mm_or_si128(ol_flags_mask, hash_flags);
			ol_flags = _mm_or_si128(ol_flags,
				_mm_andnot_si128(ol_flags_mask, rearm_flags));
			elts[pos]->ol_flags =
				_mm_extract_epi32(ol_flags, 0);
			elts[pos + 1]->ol_flags =
				_mm_extract_epi32(ol_flags, 1);
			elts[pos + 2]->ol_flags =
				_mm_extract_epi32(ol_flags, 2);
			elts[pos + 3]->ol_flags =
				_mm_extract_epi32(ol_flags, 3);
			elts[pos]->hash.rss = 0;
			elts[pos + 1]->hash.rss = 0;
			elts[pos + 2]->hash.rss = 0;
			elts[pos + 3]->hash.rss = 0;
		}
		if (rxq->dynf_meta) {
			int32_t offs = rxq->flow_meta_offset;
			const uint32_t meta =
				*RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *);

			/* Check if title packet has valid metadata. */
			if (meta) {
				MLX5_ASSERT(t_pkt->ol_flags &
					    rxq->flow_meta_mask);
				*RTE_MBUF_DYNFIELD(elts[pos], offs,
							uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 1], offs,
							uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 2], offs,
							uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 3], offs,
							uint32_t *) = meta;
			}
		}
		pos += MLX5_VPMD_DESCS_PER_LOOP;
		/* Move to next CQE and invalidate consumed CQEs. */
		if (!rxq->cqe_comp_layout) {
			if (!(pos & 0x7) && pos < mcqe_n) {
				if (pos + 8 < mcqe_n)
					rte_prefetch0((void *)(cq + pos + 8));
				mcq = (void *)(cq + pos);
				for (i = 0; i < 8; ++i)
					cq[inv++].op_own = MLX5_CQE_INVALIDATE;
			}
		}
	}
	if (rxq->cqe_comp_layout && keep) {
		int ret;
		/* Keep unzipping if the next CQE is the miniCQE array. */
		cq = &cq[mcqe_n];
		ret = check_cqe_iteration(cq, rxq->cqe_n, rxq->cq_ci + pkts_n);
		if (ret == MLX5_CQE_STATUS_SW_OWN &&
		    MLX5_CQE_FORMAT(cq->op_own) == MLX5_COMPRESSED) {
			pos = 0;
			elts = &elts[mcqe_n];
			mcq = (void *)cq;
			mcqe_n = MLX5_CQE_NUM_MINIS(cq->op_own) + 1;
			pkts_n += mcqe_n;
			goto cycle;
		}
	} else {
		/* Invalidate the rest of CQEs. */
		for (; inv < pkts_n; ++inv)
			cq[inv].op_own = MLX5_CQE_INVALIDATE;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += pkts_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	return pkts_n;
}

/**
 * Calculate packet type and offload flags for the mbufs and store them.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cqes[4]
 *   Array of four 16-byte completions extracted from the original completion
 *   descriptor.
 * @param op_err
 *   Opcode vector having responder error status. Each field is 4B.
 * @param pkts
 *   Pointer to array of packets to be filled.
 */
static inline void
rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
			 __m128i op_err, struct rte_mbuf **pkts)
{
	__m128i pinfo0, pinfo1;
	__m128i pinfo, ptype;
	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * RTE_MBUF_F_RX_RSS_HASH |
					  rxq->hw_timestamp * rxq->timestamp_rx_flag);
	__m128i cv_flags;
	const __m128i zero = _mm_setzero_si128();
	const __m128i ptype_mask = _mm_set1_epi32(0xfd06);
	const __m128i ptype_ol_mask = _mm_set1_epi32(0x106);
	const __m128i pinfo_mask = _mm_set1_epi32(0x3);
	const __m128i cv_flag_sel =
		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			     (uint8_t)((RTE_MBUF_F_RX_IP_CKSUM_GOOD |
					RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
			     0,
			     (uint8_t)(RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1),
			     0,
			     (uint8_t)(RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1),
			     (uint8_t)(RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED),
			     0);
	const __m128i cv_mask =
		_mm_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
			       RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);
	const __m128i mbuf_init =
		_mm_load_si128((__m128i *)&rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;
	uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;

	/* Extract pkt_info field. */
	pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
	pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1);
	/* Extract hdr_type_etc field. */
	pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]);
	ptype = _mm_unpacklo_epi64(pinfo0, pinfo1);
	if (rxq->mark) {
		const __m128i pinfo_ft_mask = _mm_set1_epi32(0xffffff00);
		const __m128i fdir_flags = _mm_set1_epi32(RTE_MBUF_F_RX_FDIR);
		__m128i fdir_id_flags = _mm_set1_epi32(rxq->mark_flag);
		__m128i flow_tag, invalid_mask;

		flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask);
		/* Check if flow tag is non-zero then set RTE_MBUF_F_RX_FDIR. */
		invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
		/* Mask out invalid entries. */
		fdir_id_flags = _mm_andnot_si128(invalid_mask, fdir_id_flags);
		/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(
						_mm_cmpeq_epi32(flow_tag,
								pinfo_ft_mask),
						fdir_id_flags));
	}
	/*
	 * Merge the two fields to generate the following:
	 * bit[1]     = l3_ok
	 * bit[2]     = l4_ok
	 * bit[8]     = cv
	 * bit[11:10] = l3_hdr_type
	 * bit[14:12] = l4_hdr_type
	 * bit[15]    = ip_frag
	 * bit[16]    = tunneled
	 * bit[17]    = outer_l3_type
	 */
	ptype = _mm_and_si128(ptype, ptype_mask);
	pinfo = _mm_and_si128(pinfo, pinfo_mask);
	pinfo = _mm_slli_epi32(pinfo, 16);
	/* Make pinfo hold the merged fields for the ol_flags calculation. */
	pinfo = _mm_or_si128(ptype, pinfo);
	ptype = _mm_srli_epi32(pinfo, 10);
	ptype = _mm_packs_epi32(ptype, zero);
	/* Errored packets will have RTE_PTYPE_ALL_MASK. */
	op_err = _mm_srli_epi16(op_err, 8);
	ptype = _mm_or_si128(ptype, op_err);
	pt_idx0 = _mm_extract_epi8(ptype, 0);
	pt_idx1 = _mm_extract_epi8(ptype, 2);
	pt_idx2 = _mm_extract_epi8(ptype, 4);
	pt_idx3 = _mm_extract_epi8(ptype, 6);
	pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] |
			       !!(pt_idx0 & (1 << 6)) * rxq->tunnel;
	pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] |
			       !!(pt_idx1 & (1 << 6)) * rxq->tunnel;
	pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] |
			       !!(pt_idx2 & (1 << 6)) * rxq->tunnel;
	pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] |
			       !!(pt_idx3 & (1 << 6)) * rxq->tunnel;
	/* Fill flags for checksum and VLAN. */
	pinfo = _mm_and_si128(pinfo, ptype_ol_mask);
	pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo);
	/* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
	cv_flags = _mm_slli_epi32(pinfo, 9);
	cv_flags = _mm_or_si128(pinfo, cv_flags);
	/* Move back flags to start from byte[0]. */
	cv_flags = _mm_srli_epi32(cv_flags, 8);
	/* Mask out garbage bits. */
	cv_flags = _mm_and_si128(cv_flags, cv_mask);
	/* Merge to ol_flags. */
	ol_flags = _mm_or_si128(ol_flags, cv_flags);
	/* Merge mbuf_init and ol_flags. */
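	/*
	 * The 0x30 blend immediate takes 16-bit words 4-5 from the shifted
	 * ol_flags, placing each packet's 32-bit flags right after the 8-byte
	 * rearm_data within the 16-byte store below.
	 */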
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
	/* Write 8B rearm_data and 8B ol_flags. */
	_mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3);
}

/**
 * Process non-compressed completions and fill in mbufs in RX SW ring
 * with data extracted from the completion descriptors.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array having a non-compressed completion as its
 *   first entry.
 * @param elts
 *   Pointer to SW ring holding the mbufs to be filled.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 * @param[out] err
 *   Pointer to a flag. Set to a non-zero value if the pkts array has at least
 *   one error packet to handle.
 * @param[out] comp
 *   Pointer to an index. Set it to the first compressed completion, if any.
 *
 * @return
 *   Number of CQEs successfully processed.
 */
static inline uint16_t
rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
		 struct rte_mbuf **elts, struct rte_mbuf **pkts,
		 uint16_t pkts_n, uint64_t *err, uint64_t *comp)
{
	const uint16_t q_n = 1 << rxq->cqe_n;
	const uint16_t q_mask = q_n - 1;
	unsigned int pos, adj;
	uint64_t n = 0;
	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
	uint16_t nocmp_n = 0;
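	/*
	 * Expected validity_iteration_count (used with the newer compressed
	 * CQE layout) and ownership bit, both derived from the consumer index.
	 */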
	const uint8_t vic = rxq->cq_ci >> rxq->cqe_n;
	const uint8_t own = !(rxq->cq_ci & (q_mask + 1));
	const __m128i vic_check = _mm_set1_epi64x(0x00ff000000ff0000LL);
	const __m128i owner_check = _mm_set1_epi64x(0x0100000001000000LL);
	const __m128i opcode_check = _mm_set1_epi64x(0xf0000000f0000000LL);
	const __m128i format_check = _mm_set1_epi64x(0x0c0000000c000000LL);
	const __m128i resp_err_check = _mm_set1_epi64x(0xe0000000e0000000LL);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     12, 13,  8,  9,
			      4,  5,  0,  1);
#endif
	const __m128i validity =
		_mm_set_epi8(0, vic, 0, 0,
			     0, vic, 0, 0,
			     0, vic, 0, 0,
			     0, vic, 0, 0);
	const __m128i ownership =
		_mm_set_epi8(own, 0, 0, 0,
			     own, 0, 0, 0,
			     own, 0, 0, 0,
			     own, 0, 0, 0);
	/* Mask to shuffle from extracted CQE to mbuf. */
	const __m128i shuf_mask =
		_mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */
			     12, 13, 14, 15, /* rss, bswap32 */
			     10, 11,         /* vlan_tci, bswap16 */
			      4,  5,         /* data_len, bswap16 */
			     -1, -1,         /* zero out 2nd half of pkt_len */
			      4,  5          /* pkt_len, bswap16 */);
	/* Mask to blend from the last Qword to the first DQword. */
	const __m128i blend_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			      0,  0,  0,  0,
			      0,  0,  0, -1);
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0, 0, 0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN);
	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
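	/*
	 * With MARK enabled, -1 is added to fdir.hi: the flow tag in the CQE
	 * is the user mark offset by one in the driver, so this restores the
	 * user-visible value.
	 */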
	/*
	 * A. load first Qword (8bytes) in one loop.
	 * B. copy 4 mbuf pointers from elts ring to returning pkts.
	 * C. load remained CQE data and extract necessary fields.
	 *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
	 *    following structure:
	 *        struct {
	 *          uint8_t  pkt_info;
	 *          uint8_t  flow_tag[3];
	 *          uint16_t byte_cnt;
	 *          uint8_t  validity_iteration_count;
	 *          uint8_t  op_own;
	 *          uint16_t hdr_type_etc;
	 *          uint16_t vlan_info;
	 *          uint32_t rx_has_res;
	 *        } c;
	 * D. fill in mbuf.
	 * E. get valid CQEs.
	 * F. find compressed CQE.
	 */
	for (pos = 0;
	     pos < pkts_n;
	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
		__m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
		__m128i cqe_tmp1, cqe_tmp2;
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i op_own, op_own_tmp1, op_own_tmp2;
		__m128i opcode, owner_mask, invalid_mask;
		__m128i comp_mask, mini_mask;
		__m128i mask;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt;
#endif
		__m128i mbp1, mbp2;
		__m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
		unsigned int p1, p2, p3;
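		/*
		 * p holds per-lane CQE offsets (0..3); the A.0 mask below
		 * zeroes the offsets of lanes beyond the tail so that no load
		 * crosses the end of the CQ.
		 */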

		/* Prefetch next 4 CQEs. */
		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
		}
		/* A.0 do not cross the end of CQ. */
		mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		p = _mm_andnot_si128(mask, p);
		/* A.1 load cqes. */
		p3 = _mm_extract_epi16(p, 3);
		cqes[3] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p3].sop_drop_qpn);
		rte_compiler_barrier();
		p2 = _mm_extract_epi16(p, 2);
		cqes[2] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p2].sop_drop_qpn);
		rte_compiler_barrier();
		/* B.1 load mbuf pointers. */
		mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]);
		mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]);
		/* A.1 load a block having op_own. */
		p1 = _mm_extract_epi16(p, 1);
		cqes[1] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p1].sop_drop_qpn);
		rte_compiler_barrier();
		cqes[0] = _mm_loadl_epi64((__m128i *)
					   &cq[pos].sop_drop_qpn);
		/* B.2 copy mbuf pointers. */
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
		rte_io_rmb();
		/* C.1 load remained CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
		cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
		cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
		pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
		pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
		pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
		_mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
		/* E.1 extract op_own field. */
		op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
		/* C.1 load remained CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
		cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
		cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
		pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
		pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
		pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
		/* E.1 extract op_own byte. */
		op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
		op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
		_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
		/* E.2 mask out CQEs belonging to HW. */
		if (rxq->cqe_comp_layout) {
			owner_mask = _mm_and_si128(op_own, vic_check);
			owner_mask = _mm_cmpeq_epi32(owner_mask, validity);
			owner_mask = _mm_xor_si128(owner_mask, ones);
		} else {
			owner_mask = _mm_and_si128(op_own, owner_check);
			owner_mask = _mm_cmpeq_epi32(owner_mask, ownership);
		}
		owner_mask = _mm_packs_epi32(owner_mask, zero);
		/* E.3 get mask for invalidated CQEs. */
		opcode = _mm_and_si128(op_own, opcode_check);
		invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
		invalid_mask = _mm_packs_epi32(invalid_mask, zero);
		/* E.4 mask out beyond boundary. */
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.5 merge invalid_mask with invalid owner. */
		invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
		/* F.1 find compressed CQE format. */
		comp_mask = _mm_and_si128(op_own, format_check);
		comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
		comp_mask = _mm_packs_epi32(comp_mask, zero);
		/* F.2 mask out invalid entries. */
		comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
		comp_idx = _mm_cvtsi128_si64(comp_mask);
		/* F.3 get the first compressed CQE. */
		comp_idx = comp_idx ?
				rte_ctz64(comp_idx) /
					(sizeof(uint16_t) * 8) :
				MLX5_VPMD_DESCS_PER_LOOP;
		/* E.6 mask out entries after the compressed CQE. */
		mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.7 count non-compressed valid CQEs. */
		n = _mm_cvtsi128_si64(invalid_mask);
		n = n ? rte_ctz64(n) / (sizeof(uint16_t) * 8) :
			MLX5_VPMD_DESCS_PER_LOOP;
		nocmp_n += n;
		/* D.2 get the final invalid mask. */
		mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* D.3 check error in opcode. */
		adj = (!rxq->cqe_comp_layout &&
		       comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
		mask = _mm_set_epi64x(0, adj * sizeof(uint16_t) * 8);
		mini_mask = _mm_sll_epi64(invalid_mask, mask);
		opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
		opcode = _mm_packs_epi32(opcode, zero);
		opcode = _mm_andnot_si128(mini_mask, opcode);
		/* D.4 mark if any error is set. */
		*err |= _mm_cvtsi128_si64(opcode);
		/* D.5 fill in mbuf - rearm_data and packet_type. */
		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
		if (unlikely(rxq->shared)) {
			pkts[pos]->port = cq[pos].user_index_low;
			pkts[pos + 1]->port = cq[pos + p1].user_index_low;
			pkts[pos + 2]->port = cq[pos + p2].user_index_low;
			pkts[pos + 3]->port = cq[pos + p3].user_index_low;
		}
		if (unlikely(rxq->hw_timestamp)) {
			int offset = rxq->timestamp_offset;
			if (rxq->rt_timestamp) {
				struct mlx5_dev_ctx_shared *sh = rxq->sh;
				uint64_t ts;

				ts = rte_be_to_cpu_64(cq[pos].timestamp);
				mlx5_timestamp_set(pkts[pos], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
				ts = rte_be_to_cpu_64(cq[pos + p1].timestamp);
				mlx5_timestamp_set(pkts[pos + 1], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
				ts = rte_be_to_cpu_64(cq[pos + p2].timestamp);
				mlx5_timestamp_set(pkts[pos + 2], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
				ts = rte_be_to_cpu_64(cq[pos + p3].timestamp);
				mlx5_timestamp_set(pkts[pos + 3], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
			} else {
				mlx5_timestamp_set(pkts[pos], offset,
					rte_be_to_cpu_64(cq[pos].timestamp));
				mlx5_timestamp_set(pkts[pos + 1], offset,
					rte_be_to_cpu_64(cq[pos + p1].timestamp));
				mlx5_timestamp_set(pkts[pos + 2], offset,
					rte_be_to_cpu_64(cq[pos + p2].timestamp));
				mlx5_timestamp_set(pkts[pos + 3], offset,
					rte_be_to_cpu_64(cq[pos + p3].timestamp));
			}
		}
		if (rxq->dynf_meta) {
			/* This code is subject to further optimization. */
			int32_t offs = rxq->flow_meta_offset;
			uint32_t mask = rxq->flow_meta_port_mask;

			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos].flow_table_metadata) & mask;
			*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos + p1].flow_table_metadata) & mask;
			*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos + p2].flow_table_metadata) & mask;
			*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos + p3].flow_table_metadata) & mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
				pkts[pos]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
				pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
				pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
				pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
		}
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Add up received bytes count. */
		byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		/*
		 * Break the loop unless more valid CQEs are expected, or if
		 * there's a compressed CQE.
		 */
		if (n != MLX5_VPMD_DESCS_PER_LOOP)
			break;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += nocmp_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	if (comp_idx == n)
		*comp = comp_idx;
	return nocmp_n;
}

#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */