/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2017 6WIND S.A.
 * Copyright 2017 Mellanox Technologies, Ltd
 */

#ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
#define RTE_PMD_MLX5_RXTX_VEC_SSE_H_

#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <smmintrin.h>

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

#include <mlx5_prm.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_rxtx_vec.h"
#include "mlx5_autoconf.h"

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

/**
 * Copy mbuf pointers from the RX SW ring to the array of packets to return.
 *
 * @param elts
 *   Pointer to SW ring to copy the mbuf pointers from.
 * @param pkts
 *   Pointer to array of packets to be filled.
 * @param n
 *   Number of packets to be copied.
 */
static inline void
rxq_copy_mbuf_v(struct rte_mbuf **elts, struct rte_mbuf **pkts, uint16_t n)
{
	unsigned int pos;
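	/*
	 * Round the count down to an even number: mbuf pointers are copied
	 * two at a time with a single 16-byte load/store, which assumes
	 * 8-byte pointers.
	 */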
	uint16_t p = n & -2;

	for (pos = 0; pos < p; pos += 2) {
		__m128i mbp;

		mbp = _mm_loadu_si128((__m128i *)&elts[pos]);
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp);
	}
	if (n & 1)
		pkts[pos] = elts[pos];
}
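
/*
 * Illustrative use only (the caller expression below is an assumption, not
 * part of this file): the Rx burst path copies the completed mbuf pointers
 * out of the SW ring before post-processing the CQEs, e.g.:
 *
 *   rxq_copy_mbuf_v(&(*rxq->elts)[elts_idx], pkts, pkts_n);
 */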

/**
 * Decompress a compressed completion and fill in mbufs in the RX SW ring
 * with data extracted from the title completion descriptor.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array having a compressed completion at first.
 * @param elts
 *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
 *   the title completion descriptor so that it can be copied to the rest of
 *   the mbufs.
 *
 * @return
 *   Number of mini-CQEs successfully decompressed.
 */
static inline uint16_t
rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
		    struct rte_mbuf **elts)
{
	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + !rxq->cqe_comp_layout);
	/* Title packet is pre-built. */
	struct rte_mbuf *t_pkt = rxq->cqe_comp_layout ? &rxq->title_pkt : elts[0];
	unsigned int pos;
	unsigned int i;
	unsigned int inv = 0;
	/* Mask to shuffle from extracted mini CQE to mbuf. */
	const __m128i shuf_mask1 =
		_mm_set_epi8(0,  1,  2,  3, /* rss, bswap32 */
			    -1, -1,         /* skip vlan_tci */
			     6,  7,         /* data_len, bswap16 */
			    -1, -1,  6,  7, /* pkt_len, bswap16 */
			    -1, -1, -1, -1  /* skip packet_type */);
	const __m128i shuf_mask2 =
		_mm_set_epi8(8,  9, 10, 11, /* rss, bswap32 */
			    -1, -1,         /* skip vlan_tci */
			    14, 15,         /* data_len, bswap16 */
			    -1, -1, 14, 15, /* pkt_len, bswap16 */
			    -1, -1, -1, -1  /* skip packet_type */);
	/* Restore the compressed count. Must be 16 bits. */
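	/*
	 * Note: with the legacy layout the title mbuf's data_len was derived
	 * from the compressed CQE byte_cnt field, which holds the number of
	 * mini-CQEs, so adding back the stripped CRC length restores that
	 * count.
	 */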
	uint16_t mcqe_n = (rxq->cqe_comp_layout) ?
		(MLX5_CQE_NUM_MINIS(cq->op_own) + 1) :
		t_pkt->data_len + (rxq->crc_present * RTE_ETHER_CRC_LEN);
	uint16_t pkts_n = mcqe_n;
	const __m128i rearm =
		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
	const __m128i rxdf =
		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0, 0);
	__m128i ol_flags = _mm_setzero_si128();
	__m128i ol_flags_mask = _mm_setzero_si128();
#ifdef MLX5_PMD_SOFT_COUNTERS
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     14, 15,  6,  7,
			     10, 11,  2,  3);
#endif
	/*
	 * A. load mCQEs into a 128bit register.
	 * B. store rearm data to mbuf.
	 * C. combine data from mCQEs with rx_descriptor_fields1.
	 * D. store rx_descriptor_fields1.
	 * E. store flow tag (rte_flow mark).
	 */
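	/*
	 * Each iteration of the loop below fills MLX5_VPMD_DESCS_PER_LOOP (4)
	 * mbufs: two 8-byte mini-CQEs fit in one 128-bit register, so two
	 * loads cover one iteration.
	 */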
cycle:
	if (rxq->cqe_comp_layout)
		rte_prefetch0((void *)(cq + mcqe_n));
	for (pos = 0; pos < mcqe_n; ) {
		__m128i mcqe1, mcqe2;
		__m128i rxdf1, rxdf2;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt, invalid_mask;
#endif

		if (!rxq->cqe_comp_layout)
			for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
				if (likely(pos + i < mcqe_n))
					rte_prefetch0((void *)(cq + pos + i));
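		/*
		 * A 64B CQE slot holds up to 8 mini-CQEs; with the legacy
		 * layout mcq is re-based every 8 entries below, so pos % 8
		 * indexes into the current mini-CQE array.
		 */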
		/* A.1 load mCQEs into a 128bit register. */
		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				  &elts[pos]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				  &elts[pos + 1]->rx_descriptor_fields1,
				 rxdf2);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				  &elts[pos + 2]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				  &elts[pos + 3]->rx_descriptor_fields1,
				 rxdf2);
#ifdef MLX5_PMD_SOFT_COUNTERS
		invalid_mask = _mm_set_epi64x(0,
					      (mcqe_n - pos) *
					      sizeof(uint16_t) * 8);
		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
		byte_cnt = _mm_blend_epi16(_mm_srli_si128(mcqe1, 4),
					   mcqe2, 0xcc);
		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		if (rxq->mark) {
			if (rxq->mcqe_format !=
				MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) {
				const uint32_t flow_tag = t_pkt->hash.fdir.hi;

				/* E.1 store flow tag (rte_flow mark). */
				elts[pos]->hash.fdir.hi = flow_tag;
				elts[pos + 1]->hash.fdir.hi = flow_tag;
				elts[pos + 2]->hash.fdir.hi = flow_tag;
				elts[pos + 3]->hash.fdir.hi = flow_tag;
			} else {
				const __m128i flow_mark_adj =
					_mm_set_epi32(-1, -1, -1, -1);
				const __m128i flow_mark_shuf =
					_mm_set_epi8(-1,  9,  8, 12,
						     -1,  1,  0,  4,
						     -1, -1, -1, -1,
						     -1, -1, -1, -1);
				const __m128i ft_mask =
					_mm_set1_epi32(0xffffff00);
				const __m128i fdir_flags =
					_mm_set1_epi32(RTE_MBUF_F_RX_FDIR);
				const __m128i fdir_all_flags =
					_mm_set1_epi32(RTE_MBUF_F_RX_FDIR |
						       rxq->mark_flag);
				__m128i fdir_id_flags =
					_mm_set1_epi32(rxq->mark_flag);

				/* Extract flow_tag field. */
				__m128i ftag0 =
					_mm_shuffle_epi8(mcqe1, flow_mark_shuf);
				__m128i ftag1 =
					_mm_shuffle_epi8(mcqe2, flow_mark_shuf);
				__m128i ftag =
					_mm_unpackhi_epi64(ftag0, ftag1);
				__m128i invalid_mask =
					_mm_cmpeq_epi32(ftag, zero);

				ol_flags_mask = _mm_or_si128(ol_flags_mask,
							     fdir_all_flags);
				/* Set RTE_MBUF_F_RX_FDIR if flow tag is non-zero. */
				ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
				/* Mask out invalid entries. */
				fdir_id_flags = _mm_andnot_si128(invalid_mask,
								 fdir_id_flags);
				/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
				ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(_mm_cmpeq_epi32(ftag,
							 ft_mask),
					fdir_id_flags));
				ftag = _mm_add_epi32(ftag, flow_mark_adj);
				elts[pos]->hash.fdir.hi =
						_mm_extract_epi32(ftag, 0);
				elts[pos + 1]->hash.fdir.hi =
						_mm_extract_epi32(ftag, 1);
				elts[pos + 2]->hash.fdir.hi =
						_mm_extract_epi32(ftag, 2);
				elts[pos + 3]->hash.fdir.hi =
						_mm_extract_epi32(ftag, 3);
			}
		}
		if (unlikely(rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)) {
			if (rxq->mcqe_format ==
			    MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
				const uint8_t pkt_info =
					(cq->pkt_info & 0x3) << 6;
				const uint8_t pkt_hdr0 =
					_mm_extract_epi8(mcqe1, 0);
				const uint8_t pkt_hdr1 =
					_mm_extract_epi8(mcqe1, 8);
				const uint8_t pkt_hdr2 =
					_mm_extract_epi8(mcqe2, 0);
				const uint8_t pkt_hdr3 =
					_mm_extract_epi8(mcqe2, 8);
				const __m128i vlan_mask =
					_mm_set1_epi32(RTE_MBUF_F_RX_VLAN |
						       RTE_MBUF_F_RX_VLAN_STRIPPED);
				const __m128i cv_mask =
					_mm_set1_epi32(MLX5_CQE_VLAN_STRIPPED);
				const __m128i pkt_cv =
					_mm_set_epi32(pkt_hdr0 & 0x1,
						      pkt_hdr1 & 0x1,
						      pkt_hdr2 & 0x1,
						      pkt_hdr3 & 0x1);

				ol_flags_mask = _mm_or_si128(ol_flags_mask,
							     vlan_mask);
				ol_flags = _mm_or_si128(ol_flags,
					_mm_and_si128(_mm_cmpeq_epi32(pkt_cv,
					cv_mask), vlan_mask));
				elts[pos]->packet_type =
					mlx5_ptype_table[(pkt_hdr0 >> 2) |
							 pkt_info];
				elts[pos + 1]->packet_type =
					mlx5_ptype_table[(pkt_hdr1 >> 2) |
							 pkt_info];
				elts[pos + 2]->packet_type =
					mlx5_ptype_table[(pkt_hdr2 >> 2) |
							 pkt_info];
				elts[pos + 3]->packet_type =
					mlx5_ptype_table[(pkt_hdr3 >> 2) |
							 pkt_info];
				if (rxq->tunnel) {
					elts[pos]->packet_type |=
						!!(((pkt_hdr0 >> 2) |
						pkt_info) & (1 << 6));
					elts[pos + 1]->packet_type |=
						!!(((pkt_hdr1 >> 2) |
						pkt_info) & (1 << 6));
					elts[pos + 2]->packet_type |=
						!!(((pkt_hdr2 >> 2) |
						pkt_info) & (1 << 6));
					elts[pos + 3]->packet_type |=
						!!(((pkt_hdr3 >> 2) |
						pkt_info) & (1 << 6));
				}
			}
			const __m128i hash_flags =
				_mm_set1_epi32(RTE_MBUF_F_RX_RSS_HASH);
			const __m128i rearm_flags =
				_mm_set1_epi32((uint32_t)t_pkt->ol_flags);

			ol_flags_mask = _mm_or_si128(ol_flags_mask, hash_flags);
			ol_flags = _mm_or_si128(ol_flags,
				_mm_andnot_si128(ol_flags_mask, rearm_flags));
			elts[pos]->ol_flags =
				_mm_extract_epi32(ol_flags, 0);
			elts[pos + 1]->ol_flags =
				_mm_extract_epi32(ol_flags, 1);
			elts[pos + 2]->ol_flags =
				_mm_extract_epi32(ol_flags, 2);
			elts[pos + 3]->ol_flags =
				_mm_extract_epi32(ol_flags, 3);
			elts[pos]->hash.rss = 0;
			elts[pos + 1]->hash.rss = 0;
			elts[pos + 2]->hash.rss = 0;
			elts[pos + 3]->hash.rss = 0;
		}
		if (rxq->dynf_meta) {
			int32_t offs = rxq->flow_meta_offset;
			const uint32_t meta =
				*RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *);

			/* Check if title packet has valid metadata. */
			if (meta) {
				MLX5_ASSERT(t_pkt->ol_flags &
					    rxq->flow_meta_mask);
				*RTE_MBUF_DYNFIELD(elts[pos], offs,
							uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 1], offs,
							uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 2], offs,
							uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 3], offs,
							uint32_t *) = meta;
			}
		}
		pos += MLX5_VPMD_DESCS_PER_LOOP;
		/* Move to next CQE and invalidate consumed CQEs. */
		if (!rxq->cqe_comp_layout) {
			if (!(pos & 0x7) && pos < mcqe_n) {
				if (pos + 8 < mcqe_n)
					rte_prefetch0((void *)(cq + pos + 8));
				mcq = (void *)(cq + pos);
				for (i = 0; i < 8; ++i)
					cq[inv++].op_own = MLX5_CQE_INVALIDATE;
			}
		}
	}
	if (rxq->cqe_comp_layout) {
		int ret;
		/* Keep unzipping if the next CQE is the miniCQE array. */
		cq = &cq[mcqe_n];
		ret = check_cqe_iteration(cq, rxq->cqe_n, rxq->cq_ci + pkts_n);
		if (ret == MLX5_CQE_STATUS_SW_OWN &&
		    MLX5_CQE_FORMAT(cq->op_own) == MLX5_COMPRESSED) {
			pos = 0;
			elts = &elts[mcqe_n];
			mcq = (void *)cq;
			mcqe_n = MLX5_CQE_NUM_MINIS(cq->op_own) + 1;
			pkts_n += mcqe_n;
			goto cycle;
		}
	} else {
		/* Invalidate the rest of CQEs. */
		for (; inv < pkts_n; ++inv)
			cq[inv].op_own = MLX5_CQE_INVALIDATE;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += pkts_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	return pkts_n;
}

/**
 * Calculate packet type and offload flags for mbuf and store them.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cqes[4]
 *   Array of four 16-byte completions extracted from the original completion
 *   descriptor.
 * @param op_err
 *   Opcode vector having responder error status. Each field is 4B.
 * @param pkts
 *   Pointer to array of packets to be filled.
 */
static inline void
rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
			 __m128i op_err, struct rte_mbuf **pkts)
{
	__m128i pinfo0, pinfo1;
	__m128i pinfo, ptype;
	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * RTE_MBUF_F_RX_RSS_HASH |
					  rxq->hw_timestamp * rxq->timestamp_rx_flag);
	__m128i cv_flags;
	const __m128i zero = _mm_setzero_si128();
	const __m128i ptype_mask = _mm_set1_epi32(0xfd06);
	const __m128i ptype_ol_mask = _mm_set1_epi32(0x106);
	const __m128i pinfo_mask = _mm_set1_epi32(0x3);
	const __m128i cv_flag_sel =
		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			     (uint8_t)((RTE_MBUF_F_RX_IP_CKSUM_GOOD |
					RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
			     0,
			     (uint8_t)(RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1),
			     0,
			     (uint8_t)(RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1),
			     (uint8_t)(RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED),
			     0);
	const __m128i cv_mask =
		_mm_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
			       RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);
	const __m128i mbuf_init =
		_mm_load_si128((__m128i *)&rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;
	uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;

	/* Extract pkt_info field. */
	pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
	pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1);
	/* Extract hdr_type_etc field. */
	pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]);
	ptype = _mm_unpacklo_epi64(pinfo0, pinfo1);
	if (rxq->mark) {
		const __m128i pinfo_ft_mask = _mm_set1_epi32(0xffffff00);
		const __m128i fdir_flags = _mm_set1_epi32(RTE_MBUF_F_RX_FDIR);
		__m128i fdir_id_flags = _mm_set1_epi32(rxq->mark_flag);
		__m128i flow_tag, invalid_mask;

		flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask);
		/* Check if flow tag is non-zero then set RTE_MBUF_F_RX_FDIR. */
		invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
		/* Mask out invalid entries. */
		fdir_id_flags = _mm_andnot_si128(invalid_mask, fdir_id_flags);
		/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(
						_mm_cmpeq_epi32(flow_tag,
								pinfo_ft_mask),
						fdir_id_flags));
	}
	/*
	 * Merge the two fields to generate the following:
	 * bit[1]     = l3_ok
	 * bit[2]     = l4_ok
	 * bit[8]     = cv
	 * bit[11:10] = l3_hdr_type
	 * bit[14:12] = l4_hdr_type
	 * bit[15]    = ip_frag
	 * bit[16]    = tunneled
	 * bit[17]    = outer_l3_type
	 */
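	/*
	 * After the merge, a right shift by 10 leaves an 8-bit index of the
	 * form {outer_l3_type, tunneled, ip_frag, l4_hdr_type, l3_hdr_type}
	 * used to look up mlx5_ptype_table[]; bit 6 of that index marks a
	 * tunneled packet.
	 */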
	ptype = _mm_and_si128(ptype, ptype_mask);
	pinfo = _mm_and_si128(pinfo, pinfo_mask);
	pinfo = _mm_slli_epi32(pinfo, 16);
	/* Now pinfo holds the merged fields for the ol_flags calculation. */
	pinfo = _mm_or_si128(ptype, pinfo);
	ptype = _mm_srli_epi32(pinfo, 10);
	ptype = _mm_packs_epi32(ptype, zero);
	/* Errored packets will have RTE_PTYPE_ALL_MASK. */
	op_err = _mm_srli_epi16(op_err, 8);
	ptype = _mm_or_si128(ptype, op_err);
	pt_idx0 = _mm_extract_epi8(ptype, 0);
	pt_idx1 = _mm_extract_epi8(ptype, 2);
	pt_idx2 = _mm_extract_epi8(ptype, 4);
	pt_idx3 = _mm_extract_epi8(ptype, 6);
	pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] |
			       !!(pt_idx0 & (1 << 6)) * rxq->tunnel;
	pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] |
			       !!(pt_idx1 & (1 << 6)) * rxq->tunnel;
	pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] |
			       !!(pt_idx2 & (1 << 6)) * rxq->tunnel;
	pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] |
			       !!(pt_idx3 & (1 << 6)) * rxq->tunnel;
	/* Fill flags for checksum and VLAN. */
	pinfo = _mm_and_si128(pinfo, ptype_ol_mask);
	pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo);
	/* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
	cv_flags = _mm_slli_epi32(pinfo, 9);
	cv_flags = _mm_or_si128(pinfo, cv_flags);
	/* Move back flags to start from byte[0]. */
	cv_flags = _mm_srli_epi32(cv_flags, 8);
	/* Mask out garbage bits. */
	cv_flags = _mm_and_si128(cv_flags, cv_mask);
	/* Merge to ol_flags. */
	ol_flags = _mm_or_si128(ol_flags, cv_flags);
	/* Merge mbuf_init and ol_flags. */
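	/*
	 * The 0x30 blend overwrites 16-bit elements 4-5 (bytes 8..11), i.e.
	 * the lower half of ol_flags that follows the 8-byte rearm_data, so
	 * each 32-bit lane of ol_flags is first shifted into that position.
	 */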
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
	/* Write 8B rearm_data and 8B ol_flags. */
	_mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3);
}

/**
 * Process a non-compressed completion and fill in mbufs in the RX SW ring
 * with data extracted from the completion descriptors.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array having a non-compressed completion at first.
 * @param elts
 *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
 *   the title completion descriptor so that it can be copied to the rest of
 *   the mbufs.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 * @param[out] err
 *   Pointer to a flag. Set to a non-zero value if the pkts array has at least
 *   one error packet to handle.
 * @param[out] comp
 *   Pointer to an index. Set it to the index of the first compressed
 *   completion, if any.
 *
 * @return
 *   Number of CQEs successfully processed.
 */
static inline uint16_t
rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
		 struct rte_mbuf **elts, struct rte_mbuf **pkts,
		 uint16_t pkts_n, uint64_t *err, uint64_t *comp)
{
	const uint16_t q_n = 1 << rxq->cqe_n;
	const uint16_t q_mask = q_n - 1;
	unsigned int pos, adj;
	uint64_t n = 0;
	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
	uint16_t nocmp_n = 0;
	const uint8_t vic = rxq->cq_ci >> rxq->cqe_n;
	const uint8_t own = !(rxq->cq_ci & (q_mask + 1));
	const __m128i vic_check = _mm_set1_epi64x(0x00ff000000ff0000LL);
	const __m128i owner_check = _mm_set1_epi64x(0x0100000001000000LL);
	const __m128i opcode_check = _mm_set1_epi64x(0xf0000000f0000000LL);
	const __m128i format_check = _mm_set1_epi64x(0x0c0000000c000000LL);
	const __m128i resp_err_check = _mm_set1_epi64x(0xe0000000e0000000LL);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     12, 13,  8,  9,
			      4,  5,  0,  1);
#endif
	const __m128i validity =
		_mm_set_epi8(0, vic, 0, 0,
			     0, vic, 0, 0,
			     0, vic, 0, 0,
			     0, vic, 0, 0);
	const __m128i ownership =
		_mm_set_epi8(own, 0, 0, 0,
			     own, 0, 0, 0,
			     own, 0, 0, 0,
			     own, 0, 0, 0);
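	/*
	 * With the enhanced CQE compression layout a CQE belongs to SW when
	 * its validity_iteration_count matches vic (cq_ci >> cqe_n);
	 * otherwise the classic ownership bit is compared against own, which
	 * toggles on every CQ wraparound.
	 */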
	/* Mask to shuffle from extracted CQE to mbuf. */
	const __m128i shuf_mask =
		_mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */
			     12, 13, 14, 15, /* rss, bswap32 */
			     10, 11,         /* vlan_tci, bswap16 */
			      4,  5,         /* data_len, bswap16 */
			     -1, -1,         /* zero out 2nd half of pkt_len */
			      4,  5          /* pkt_len, bswap16 */);
	/* Mask to blend from the last Qword to the first DQword. */
	const __m128i blend_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			      0,  0,  0,  0,
			      0,  0,  0, -1);
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0, 0, 0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN);
	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
	/*
	 * A. load first Qword (8 bytes) in one loop.
	 * B. copy 4 mbuf pointers from elts ring to returning pkts.
	 * C. load remaining CQE data and extract necessary fields.
	 *    The final 16-byte cqes[] extracted from the original 64-byte CQE
	 *    has the following structure:
	 *        struct {
	 *          uint8_t  pkt_info;
	 *          uint8_t  flow_tag[3];
	 *          uint16_t byte_cnt;
	 *          uint8_t  validity_iteration_count;
	 *          uint8_t  op_own;
	 *          uint16_t hdr_type_etc;
	 *          uint16_t vlan_info;
	 *          uint32_t rx_has_res;
	 *        } c;
	 * D. fill in mbuf.
	 * E. get valid CQEs.
	 * F. find compressed CQE.
	 */
	for (pos = 0;
	     pos < pkts_n;
	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
		__m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
		__m128i cqe_tmp1, cqe_tmp2;
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i op_own, op_own_tmp1, op_own_tmp2;
		__m128i opcode, owner_mask, invalid_mask;
		__m128i comp_mask, mini_mask;
		__m128i mask;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt;
#endif
		__m128i mbp1, mbp2;
		__m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
		unsigned int p1, p2, p3;

		/* Prefetch next 4 CQEs. */
		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
		}
		/* A.0 do not cross the end of CQ. */
		mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		p = _mm_andnot_si128(mask, p);
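		/*
		 * Offsets in p that fall beyond the remaining CQE count are
		 * zeroed by the mask above, so those lanes simply re-read
		 * cq[pos] and are discarded later as out-of-range entries.
		 */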
		/* A.1 load cqes. */
		p3 = _mm_extract_epi16(p, 3);
		cqes[3] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p3].sop_drop_qpn);
		rte_compiler_barrier();
		p2 = _mm_extract_epi16(p, 2);
		cqes[2] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p2].sop_drop_qpn);
		rte_compiler_barrier();
		/* B.1 load mbuf pointers. */
		mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]);
		mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]);
		/* A.1 load a block having op_own. */
		p1 = _mm_extract_epi16(p, 1);
		cqes[1] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p1].sop_drop_qpn);
		rte_compiler_barrier();
		cqes[0] = _mm_loadl_epi64((__m128i *)
					   &cq[pos].sop_drop_qpn);
		/* B.2 copy mbuf pointers. */
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
		rte_io_rmb();
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
		cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
		cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
		pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
		pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
		pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
		_mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
		/* E.1 extract op_own field. */
		op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
		cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
		cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
		pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
		pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
		pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
		/* E.1 extract op_own byte. */
		op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
		op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
		_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
		/* E.2 mask out CQEs belonging to HW. */
		if (rxq->cqe_comp_layout) {
			owner_mask = _mm_and_si128(op_own, vic_check);
			owner_mask = _mm_cmpeq_epi32(owner_mask, validity);
			owner_mask = _mm_xor_si128(owner_mask, ones);
		} else {
			owner_mask = _mm_and_si128(op_own, owner_check);
			owner_mask = _mm_cmpeq_epi32(owner_mask, ownership);
		}
		owner_mask = _mm_packs_epi32(owner_mask, zero);
		/* E.3 get mask for invalidated CQEs. */
		opcode = _mm_and_si128(op_own, opcode_check);
		invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
		invalid_mask = _mm_packs_epi32(invalid_mask, zero);
		/* E.4 mask out beyond boundary. */
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.5 merge invalid_mask with invalid owner. */
		invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
		/* F.1 find compressed CQE format. */
		comp_mask = _mm_and_si128(op_own, format_check);
		comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
		comp_mask = _mm_packs_epi32(comp_mask, zero);
		/* F.2 mask out invalid entries. */
		comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
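		/*
		 * comp_idx and n below are counted in 16-bit lanes: after
		 * _mm_packs_epi32() each CQE occupies one 16-bit element of
		 * the mask, hence the rte_ctz64() results are divided by
		 * sizeof(uint16_t) * 8.
		 */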
		comp_idx = _mm_cvtsi128_si64(comp_mask);
		/* F.3 get the first compressed CQE. */
		comp_idx = comp_idx ?
				rte_ctz64(comp_idx) /
					(sizeof(uint16_t) * 8) :
				MLX5_VPMD_DESCS_PER_LOOP;
		/* E.6 mask out entries after the compressed CQE. */
		mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.7 count non-compressed valid CQEs. */
		n = _mm_cvtsi128_si64(invalid_mask);
		n = n ? rte_ctz64(n) / (sizeof(uint16_t) * 8) :
			MLX5_VPMD_DESCS_PER_LOOP;
		nocmp_n += n;
		/* D.2 get the final invalid mask. */
		mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* D.3 check error in opcode. */
		adj = (!rxq->cqe_comp_layout &&
		       comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
		mask = _mm_set_epi64x(0, adj * sizeof(uint16_t) * 8);
		mini_mask = _mm_sll_epi64(invalid_mask, mask);
		opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
		opcode = _mm_packs_epi32(opcode, zero);
		opcode = _mm_andnot_si128(mini_mask, opcode);
		/* D.4 mark if any error is set */
		*err |= _mm_cvtsi128_si64(opcode);
		/* D.5 fill in mbuf - rearm_data and packet_type. */
		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
		if (unlikely(rxq->shared)) {
			pkts[pos]->port = cq[pos].user_index_low;
			pkts[pos + p1]->port = cq[pos + p1].user_index_low;
			pkts[pos + p2]->port = cq[pos + p2].user_index_low;
			pkts[pos + p3]->port = cq[pos + p3].user_index_low;
		}
		if (unlikely(rxq->hw_timestamp)) {
			int offset = rxq->timestamp_offset;
			if (rxq->rt_timestamp) {
				struct mlx5_dev_ctx_shared *sh = rxq->sh;
				uint64_t ts;

				ts = rte_be_to_cpu_64(cq[pos].timestamp);
				mlx5_timestamp_set(pkts[pos], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
				ts = rte_be_to_cpu_64(cq[pos + p1].timestamp);
				mlx5_timestamp_set(pkts[pos + 1], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
				ts = rte_be_to_cpu_64(cq[pos + p2].timestamp);
				mlx5_timestamp_set(pkts[pos + 2], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
				ts = rte_be_to_cpu_64(cq[pos + p3].timestamp);
				mlx5_timestamp_set(pkts[pos + 3], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
			} else {
				mlx5_timestamp_set(pkts[pos], offset,
					rte_be_to_cpu_64(cq[pos].timestamp));
				mlx5_timestamp_set(pkts[pos + 1], offset,
					rte_be_to_cpu_64(cq[pos + p1].timestamp));
				mlx5_timestamp_set(pkts[pos + 2], offset,
					rte_be_to_cpu_64(cq[pos + p2].timestamp));
				mlx5_timestamp_set(pkts[pos + 3], offset,
					rte_be_to_cpu_64(cq[pos + p3].timestamp));
			}
		}
		if (rxq->dynf_meta) {
			/* This code is subject to further optimization. */
			int32_t offs = rxq->flow_meta_offset;
			uint32_t mask = rxq->flow_meta_port_mask;

			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos].flow_table_metadata) & mask;
			*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos + p1].flow_table_metadata) & mask;
			*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos + p2].flow_table_metadata) & mask;
			*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos + p3].flow_table_metadata) & mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
				pkts[pos]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
				pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
				pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
				pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
		}
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Add up received bytes count. */
		byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		/*
		 * Break the loop unless more valid CQEs are expected, or if
		 * there is a compressed CQE.
		 */
		if (n != MLX5_VPMD_DESCS_PER_LOOP)
			break;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += nocmp_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	if (comp_idx == n)
		*comp = comp_idx;
	return nocmp_n;
}

#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */