/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
#define RTE_PMD_MLX5_RXTX_VEC_SSE_H_

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <smmintrin.h>

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_rxtx_vec.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

/**
 * Fill in buffer descriptors in a multi-packet send descriptor.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param dseg
 *   Pointer to buffer descriptor to be written.
 * @param pkts
 *   Pointer to array of packets to be sent.
 * @param n
 *   Number of packets to be filled.
 */
static inline void
txq_wr_dseg_v(struct mlx5_txq_data *txq, __m128i *dseg,
	      struct rte_mbuf **pkts, unsigned int n)
{
	unsigned int pos;
	uintptr_t addr;
	const __m128i shuf_mask_dseg =
		_mm_set_epi8(8,  9, 10, 11, /* addr, bswap64 */
			     12, 13, 14, 15,
			     7,  6,  5,  4, /* lkey */
			     0,  1,  2,  3  /* length, bswap32 */);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t tx_byte = 0;
#endif

	for (pos = 0; pos < n; ++pos, ++dseg) {
		__m128i desc;
		struct rte_mbuf *pkt = pkts[pos];

		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
		desc = _mm_set_epi32(addr >> 32,
				     addr,
				     mlx5_tx_mb2mr(txq, pkt),
				     DATA_LEN(pkt));
		desc = _mm_shuffle_epi8(desc, shuf_mask_dseg);
		_mm_store_si128(dseg, desc);
#ifdef MLX5_PMD_SOFT_COUNTERS
		tx_byte += DATA_LEN(pkt);
#endif
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	txq->stats.obytes += tx_byte;
#endif
}

/**
 * Send multi-segmented packets until a single-segment packet is encountered
 * in the pkts list.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param pkts
 *   Pointer to array of packets to be sent.
 * @param pkts_n
 *   Number of packets to be sent.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
static uint16_t
txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
	      uint16_t pkts_n)
{
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	const uint16_t wq_n = 1 << txq->wqe_n;
	const uint16_t wq_mask = wq_n - 1;
	const unsigned int nb_dword_per_wqebb =
		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
	const unsigned int nb_dword_in_hdr =
		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
	unsigned int n;
	volatile struct mlx5_wqe *wqe = NULL;

	assert(elts_n > pkts_n);
	mlx5_tx_complete(txq);
	/* A CQE slot must always be available. */
	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
	if (unlikely(!pkts_n))
		return 0;
	for (n = 0; n < pkts_n; ++n) {
		struct rte_mbuf *buf = pkts[n];
		unsigned int segs_n = buf->nb_segs;
		unsigned int ds = nb_dword_in_hdr;
		unsigned int len = PKT_LEN(buf);
		uint16_t wqe_ci = txq->wqe_ci;
		const __m128i shuf_mask_ctrl =
			_mm_set_epi8(15, 14, 13, 12,
				     8,  9, 10, 11, /* bswap32 */
				     4,  5,  6,  7, /* bswap32 */
				     0,  1,  2,  3  /* bswap32 */);
		uint8_t cs_flags;
		uint16_t max_elts;
		uint16_t max_wqe;
		__m128i *t_wqe, *dseg;
		__m128i ctrl;

		assert(segs_n);
		max_elts = elts_n - (elts_head - txq->elts_tail);
		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
		/*
		 * An MPW session consumes 2 WQEs at most to
		 * include MLX5_MPW_DSEG_MAX pointers.
		 */
		if (segs_n == 1 ||
		    max_elts < segs_n || max_wqe < 2)
			break;
		if (segs_n > MLX5_MPW_DSEG_MAX) {
			txq->stats.oerrors++;
			break;
		}
		wqe = &((volatile struct mlx5_wqe64 *)
			txq->wqes)[wqe_ci & wq_mask].hdr;
		cs_flags = txq_ol_cksum_to_cs(txq, buf);
		/* Title WQEBB pointer. */
		t_wqe = (__m128i *)wqe;
		dseg = (__m128i *)(wqe + 1);
		do {
			if (!(ds++ % nb_dword_per_wqebb)) {
				dseg = (__m128i *)
					&((volatile struct mlx5_wqe64 *)
					  txq->wqes)[++wqe_ci & wq_mask];
			}
			txq_wr_dseg_v(txq, dseg++, &buf, 1);
			(*txq->elts)[elts_head++ & elts_m] = buf;
			buf = buf->next;
		} while (--segs_n);
		++wqe_ci;
		/* Fill CTRL in the header. */
		ctrl = _mm_set_epi32(0, 0, txq->qp_num_8s | ds,
				     MLX5_OPC_MOD_MPW << 24 |
				     txq->wqe_ci << 8 | MLX5_OPCODE_TSO);
		ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
		_mm_store_si128(t_wqe, ctrl);
		/* Fill ESEG in the header. */
		_mm_store_si128(t_wqe + 1,
				_mm_set_epi16(0, 0, 0, 0,
					      rte_cpu_to_be_16(len), cs_flags,
					      0, 0));
		txq->wqe_ci = wqe_ci;
	}
	if (!n)
		return 0;
	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
	txq->elts_head = elts_head;
	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
		wqe->ctrl[2] = rte_cpu_to_be_32(8);
		wqe->ctrl[3] = txq->elts_head;
		txq->elts_comp = 0;
#ifndef NDEBUG
		++txq->cq_pi;
#endif
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	txq->stats.opackets += n;
#endif
	mlx5_tx_dbrec(txq, wqe);
	return n;
}

/**
 * Send a burst of packets with Enhanced MPW. If a multi-segment packet is
 * encountered, it returns so that the packet can be processed by
 * txq_scatter_v(). All the packets in the pkts list should be single-segment
 * packets having the same offload flags. This must be checked by
 * txq_count_contig_single_seg() and txq_calc_offload().
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param pkts
 *   Pointer to array of packets to be sent.
 * @param pkts_n
 *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
 * @param cs_flags
 *   Checksum offload flags to be written in the descriptor.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
static inline uint16_t
txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
	    uint8_t cs_flags)
{
	struct rte_mbuf **elts;
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	const unsigned int nb_dword_per_wqebb =
		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
	const unsigned int nb_dword_in_hdr =
		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
	unsigned int n = 0;
	unsigned int pos;
	uint16_t max_elts;
	uint16_t max_wqe;
	uint32_t comp_req = 0;
	const uint16_t wq_n = 1 << txq->wqe_n;
	const uint16_t wq_mask = wq_n - 1;
	uint16_t wq_idx = txq->wqe_ci & wq_mask;
	volatile struct mlx5_wqe64 *wq =
		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
	const __m128i shuf_mask_ctrl =
		_mm_set_epi8(15, 14, 13, 12,
			     8,  9, 10, 11, /* bswap32 */
			     4,  5,  6,  7, /* bswap32 */
			     0,  1,  2,  3  /* bswap32 */);
	__m128i *t_wqe, *dseg;
	__m128i ctrl;

	/* Make sure all packets can fit into a single WQE. */
	assert(elts_n > pkts_n);
	mlx5_tx_complete(txq);
	max_elts = (elts_n - (elts_head - txq->elts_tail));
	/* A CQE slot must always be available. */
	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
	assert(pkts_n <= MLX5_DSEG_MAX - nb_dword_in_hdr);
	if (unlikely(!pkts_n))
		return 0;
	elts = &(*txq->elts)[elts_head & elts_m];
	/* Loop for available tailroom first. */
	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
	for (pos = 0; pos < (n & -2); pos += 2)
		_mm_storeu_si128((__m128i *)&elts[pos],
				 _mm_loadu_si128((__m128i *)&pkts[pos]));
	if (n & 1)
		elts[pos] = pkts[pos];
	/* Check if it crosses the end of the queue. */
	if (unlikely(n < pkts_n)) {
		elts = &(*txq->elts)[0];
		for (pos = 0; pos < pkts_n - n; ++pos)
			elts[pos] = pkts[n + pos];
	}
	txq->elts_head += pkts_n;
	/* Save title WQEBB pointer. */
	t_wqe = (__m128i *)wqe;
	dseg = (__m128i *)(wqe + 1);
	/* Calculate the number of entries to the end. */
	n = RTE_MIN(
		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
		pkts_n);
	/* Fill DSEGs. */
	txq_wr_dseg_v(txq, dseg, pkts, n);
	/* Check if it crosses the end of the queue. */
	if (n < pkts_n) {
		dseg = (__m128i *)txq->wqes;
		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
	}
	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
		txq->elts_comp += pkts_n;
	} else {
		/* Request a completion. */
		txq->elts_comp = 0;
#ifndef NDEBUG
		++txq->cq_pi;
#endif
		comp_req = 8;
	}
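	/*
	 * The control segment is built below as four 32-bit words:
	 * opcode/WQE index, QP number with the DS count, the completion
	 * request flag and the completion tag (elts_head). Only the first
	 * three words are byte-swapped to big-endian by shuf_mask_ctrl;
	 * the tag in ctrl[3] is kept in host byte order.
	 */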
	/* Fill CTRL in the header. */
	ctrl = _mm_set_epi32(txq->elts_head, comp_req,
			     txq->qp_num_8s | (pkts_n + 2),
			     MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
			     txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW);
	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
	_mm_store_si128(t_wqe, ctrl);
	/* Fill ESEG in the header. */
	_mm_store_si128(t_wqe + 1,
			_mm_set_epi8(0, 0, 0, 0,
				     0, 0, 0, 0,
				     0, 0, 0, cs_flags,
				     0, 0, 0, 0));
#ifdef MLX5_PMD_SOFT_COUNTERS
	txq->stats.opackets += pkts_n;
#endif
	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
		       nb_dword_per_wqebb;
	/* Ring QP doorbell. */
	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
	return pkts_n;
}

/**
 * Copy mbuf pointers from the RX SW ring to the array of packets to be
 * returned to the application.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param pkts
 *   Pointer to array of packets to be filled.
 * @param n
 *   Number of packets to be copied.
 */
static inline void
rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
{
	const uint16_t q_mask = (1 << rxq->elts_n) - 1;
	struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
	unsigned int pos;
	uint16_t p = n & -2;

	for (pos = 0; pos < p; pos += 2) {
		__m128i mbp;

		mbp = _mm_loadu_si128((__m128i *)&elts[pos]);
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp);
	}
	if (n & 1)
		pkts[pos] = elts[pos];
}

/**
 * Decompress a compressed completion and fill in mbufs in RX SW ring with data
 * extracted from the title completion descriptor.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array having a compressed completion at first.
 * @param elts
 *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
 *   the title completion descriptor to be copied to the rest of mbufs.
 */
static inline void
rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
		    struct rte_mbuf **elts)
{
	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1);
	struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
	unsigned int pos;
	unsigned int i;
	unsigned int inv = 0;
	/* Mask to shuffle from extracted mini CQE to mbuf. */
	const __m128i shuf_mask1 =
		_mm_set_epi8(0,  1,  2,  3, /* rss, bswap32 */
			     -1, -1,        /* skip vlan_tci */
			     6,  7,         /* data_len, bswap16 */
			     -1, -1,  6, 7, /* pkt_len, bswap16 */
			     -1, -1, -1, -1 /* skip packet_type */);
	const __m128i shuf_mask2 =
		_mm_set_epi8(8,  9, 10, 11, /* rss, bswap32 */
			     -1, -1,          /* skip vlan_tci */
			     14, 15,          /* data_len, bswap16 */
			     -1, -1, 14, 15,  /* pkt_len, bswap16 */
			     -1, -1, -1, -1   /* skip packet_type */);
	/* Restore the compressed count. Must be 16 bits. */
	const uint16_t mcqe_n = t_pkt->data_len +
				(rxq->crc_present * ETHER_CRC_LEN);
	const __m128i rearm =
		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
	const __m128i rxdf =
		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0,
			      rxq->crc_present * ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * ETHER_CRC_LEN,
			      0, 0);
	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
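	/*
	 * Every mbuf of the session reuses the rearm data, packet_type,
	 * vlan_tci and flow tag of the pre-built title packet; only pkt_len,
	 * data_len and the RSS hash are taken from each mini CQE in the
	 * loop below.
	 */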
#ifdef MLX5_PMD_SOFT_COUNTERS
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     14, 15,  6,  7,
			     10, 11,  2,  3);
#endif

	/*
	 * A. load mCQEs into a 128bit register.
	 * B. store rearm data to mbuf.
	 * C. combine data from mCQEs with rx_descriptor_fields1.
	 * D. store rx_descriptor_fields1.
	 * E. store flow tag (rte_flow mark).
	 */
	for (pos = 0; pos < mcqe_n; ) {
		__m128i mcqe1, mcqe2;
		__m128i rxdf1, rxdf2;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt, invalid_mask;
#endif

		if (!(pos & 0x7) && pos + 8 < mcqe_n)
			rte_prefetch0((void *)(cq + pos + 8));
		/* A.1 load mCQEs into a 128bit register. */
		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				 &elts[pos]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 1]->rx_descriptor_fields1,
				 rxdf2);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 2]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 3]->rx_descriptor_fields1,
				 rxdf2);
#ifdef MLX5_PMD_SOFT_COUNTERS
		invalid_mask = _mm_set_epi64x(0,
					      (mcqe_n - pos) *
					      sizeof(uint16_t) * 8);
		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
		mcqe1 = _mm_srli_si128(mcqe1, 4);
		byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc);
		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		if (rxq->mark) {
			/* E.1 store flow tag (rte_flow mark). */
			elts[pos]->hash.fdir.hi = flow_tag;
			elts[pos + 1]->hash.fdir.hi = flow_tag;
			elts[pos + 2]->hash.fdir.hi = flow_tag;
			elts[pos + 3]->hash.fdir.hi = flow_tag;
		}
		pos += MLX5_VPMD_DESCS_PER_LOOP;
		/* Move to next CQE and invalidate consumed CQEs. */
		if (!(pos & 0x7) && pos < mcqe_n) {
			mcq = (void *)(cq + pos);
			for (i = 0; i < 8; ++i)
				cq[inv++].op_own = MLX5_CQE_INVALIDATE;
		}
	}
	/* Invalidate the rest of CQEs. */
	for (; inv < mcqe_n; ++inv)
		cq[inv].op_own = MLX5_CQE_INVALIDATE;
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += mcqe_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	rxq->cq_ci += mcqe_n;
}

/**
 * Calculate packet type and offload flag for mbuf and store it.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cqes[4]
 *   Array of four 16-byte completions extracted from the original completion
 *   descriptor.
 * @param op_err
 *   Opcode vector having responder error status. Each field is 4B.
 * @param pkts
 *   Pointer to array of packets to be filled.
 */
static inline void
rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
			 __m128i op_err, struct rte_mbuf **pkts)
{
	__m128i pinfo0, pinfo1;
	__m128i pinfo, ptype;
	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH |
					  rxq->hw_timestamp * PKT_RX_TIMESTAMP);
	__m128i cv_flags;
	const __m128i zero = _mm_setzero_si128();
	const __m128i ptype_mask =
		_mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06);
	const __m128i ptype_ol_mask =
		_mm_set_epi32(0x106, 0x106, 0x106, 0x106);
	const __m128i pinfo_mask =
		_mm_set_epi32(0x3, 0x3, 0x3, 0x3);
	const __m128i cv_flag_sel =
		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			     (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
					PKT_RX_L4_CKSUM_GOOD) >> 1),
			     0,
			     (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
			     0,
			     (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
			     (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
			     0);
	const __m128i cv_mask =
		_mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
	const __m128i mbuf_init =
		_mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;

	/* Extract pkt_info field. */
	pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
	pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1);
	/* Extract hdr_type_etc field. */
	pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]);
	ptype = _mm_unpacklo_epi64(pinfo0, pinfo1);
	if (rxq->mark) {
		const __m128i pinfo_ft_mask =
			_mm_set_epi32(0xffffff00, 0xffffff00,
				      0xffffff00, 0xffffff00);
		const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR);
		const __m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID);
		__m128i flow_tag, invalid_mask;

		flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask);
		/* Check if flow tag is non-zero then set PKT_RX_FDIR. */
		invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
		/* Mask out invalid entries. */
		flow_tag = _mm_andnot_si128(invalid_mask, flow_tag);
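		/*
		 * PKT_RX_FDIR_ID is added below only when the masked tag
		 * differs from the default mark value.
		 */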
		/* Check if flow tag is MLX5_FLOW_MARK_DEFAULT. */
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(
						_mm_cmpeq_epi32(flow_tag,
								pinfo_ft_mask),
						fdir_id_flags));
	}
	/*
	 * Merge the two fields to generate the following:
	 * bit[1]     = l3_ok
	 * bit[2]     = l4_ok
	 * bit[8]     = cv
	 * bit[11:10] = l3_hdr_type
	 * bit[14:12] = l4_hdr_type
	 * bit[15]    = ip_frag
	 * bit[16]    = tunneled
	 * bit[17]    = outer_l3_type
	 */
	ptype = _mm_and_si128(ptype, ptype_mask);
	pinfo = _mm_and_si128(pinfo, pinfo_mask);
	pinfo = _mm_slli_epi32(pinfo, 16);
	/* Merge the fields into pinfo for the ol_flags calculation. */
	pinfo = _mm_or_si128(ptype, pinfo);
	ptype = _mm_srli_epi32(pinfo, 10);
	ptype = _mm_packs_epi32(ptype, zero);
	/* Errored packets will have RTE_PTYPE_ALL_MASK. */
	op_err = _mm_srli_epi16(op_err, 8);
	ptype = _mm_or_si128(ptype, op_err);
	pkts[0]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 0)];
	pkts[1]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 2)];
	pkts[2]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 4)];
	pkts[3]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 6)];
	/* Fill flags for checksum and VLAN. */
	pinfo = _mm_and_si128(pinfo, ptype_ol_mask);
	pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo);
	/* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
	cv_flags = _mm_slli_epi32(pinfo, 9);
	cv_flags = _mm_or_si128(pinfo, cv_flags);
	/* Move back flags to start from byte[0]. */
	cv_flags = _mm_srli_epi32(cv_flags, 8);
	/* Mask out garbage bits. */
	cv_flags = _mm_and_si128(cv_flags, cv_mask);
	/* Merge to ol_flags. */
	ol_flags = _mm_or_si128(ol_flags, cv_flags);
	/* Merge mbuf_init and ol_flags. */
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
	/* Write 8B rearm_data and 8B ol_flags. */
	_mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3);
}

/**
 * Receive a burst of packets. An errored completion also consumes an mbuf,
 * but the packet_type is set to RTE_PTYPE_ALL_MASK. Marked mbufs should be
 * freed before returning to the application.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets received including errors (<= pkts_n).
 */
static inline uint16_t
rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	const uint16_t q_n = 1 << rxq->cqe_n;
	const uint16_t q_mask = q_n - 1;
	volatile struct mlx5_cqe *cq;
	struct rte_mbuf **elts;
	unsigned int pos;
	uint64_t n;
	uint16_t repl_n;
	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
	uint16_t nocmp_n = 0;
	uint16_t rcvd_pkt = 0;
	unsigned int cq_idx = rxq->cq_ci & q_mask;
	unsigned int elts_idx;
	unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
	const __m128i owner_check =
		_mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL);
	const __m128i opcode_check =
		_mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL);
	const __m128i format_check =
		_mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL);
	const __m128i resp_err_check =
		_mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     12, 13,  8,  9,
			     4,  5,  0,  1);
#endif
	/* Mask to shuffle from extracted CQE to mbuf. */
	const __m128i shuf_mask =
		_mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */
			     12, 13, 14, 15, /* rss, bswap32 */
			     10, 11,         /* vlan_tci, bswap16 */
			     4,  5,          /* data_len, bswap16 */
			     -1, -1,         /* zero out 2nd half of pkt_len */
			     4,  5           /* pkt_len, bswap16 */);
	/* Mask to blend from the last Qword to the first DQword. */
	const __m128i blend_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     0,  0,  0,  0,
			     0,  0,  0, -1);
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0, 0, 0,
			      rxq->crc_present * ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * ETHER_CRC_LEN);
	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);

	assert(rxq->sges_n == 0);
	assert(rxq->cqe_n == rxq->elts_n);
	cq = &(*rxq->cqes)[cq_idx];
	rte_prefetch0(cq);
	rte_prefetch0(cq + 1);
	rte_prefetch0(cq + 2);
	rte_prefetch0(cq + 3);
	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
	/*
	 * Order of indexes:
	 *   rq_ci >= cq_ci >= rq_pi
	 * Definition of indexes:
	 *   rq_ci - cq_ci := # of buffers owned by HW (posted).
	 *   cq_ci - rq_pi := # of buffers not returned to app (decompressed).
	 *   N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished).
	 */
	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
	if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH)
		mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
	/* See if there are any unreturned mbufs from a compressed CQE. */
	rcvd_pkt = rxq->cq_ci - rxq->rq_pi;
	if (rcvd_pkt > 0) {
		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
		rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
		rxq->rq_pi += rcvd_pkt;
		pkts += rcvd_pkt;
	}
	elts_idx = rxq->rq_pi & q_mask;
	elts = &(*rxq->elts)[elts_idx];
	/* Do not overflow the pkts array. */
	pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP);
	/* Do not cross the queue end. */
	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
	if (!pkts_n)
		return rcvd_pkt;
	/* At this point, there shouldn't be any remaining packets. */
	assert(rxq->rq_pi == rxq->cq_ci);
	/*
	 * A. load first Qword (8bytes) in one loop.
	 * B. copy 4 mbuf pointers from elts ring to returning pkts.
	 * C. load remaining CQE data and extract necessary fields.
	 *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
	 *    following structure:
	 *        struct {
	 *          uint8_t  pkt_info;
	 *          uint8_t  flow_tag[3];
	 *          uint16_t byte_cnt;
	 *          uint8_t  rsvd4;
	 *          uint8_t  op_own;
	 *          uint16_t hdr_type_etc;
	 *          uint16_t vlan_info;
	 *          uint32_t rx_has_res;
	 *        } c;
	 * D. fill in mbuf.
	 * E. get valid CQEs.
	 * F. find compressed CQE.
	 */
	for (pos = 0;
	     pos < pkts_n;
	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
		__m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
		__m128i cqe_tmp1, cqe_tmp2;
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i op_own, op_own_tmp1, op_own_tmp2;
		__m128i opcode, owner_mask, invalid_mask;
		__m128i comp_mask;
		__m128i mask;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt;
#endif
		__m128i mbp1, mbp2;
		__m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
		unsigned int p1, p2, p3;

		/* Prefetch next 4 CQEs. */
		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
		}
		/* A.0 do not cross the end of CQ. */
		mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		p = _mm_andnot_si128(mask, p);
		/* A.1 load cqes. */
		p3 = _mm_extract_epi16(p, 3);
		cqes[3] = _mm_loadl_epi64((__m128i *)
					  &cq[pos + p3].sop_drop_qpn);
		rte_compiler_barrier();
		p2 = _mm_extract_epi16(p, 2);
		cqes[2] = _mm_loadl_epi64((__m128i *)
					  &cq[pos + p2].sop_drop_qpn);
		rte_compiler_barrier();
		/* B.1 load mbuf pointers. */
		mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]);
		mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]);
		/* A.1 load a block having op_own. */
		p1 = _mm_extract_epi16(p, 1);
		cqes[1] = _mm_loadl_epi64((__m128i *)
					  &cq[pos + p1].sop_drop_qpn);
		rte_compiler_barrier();
		cqes[0] = _mm_loadl_epi64((__m128i *)
					  &cq[pos].sop_drop_qpn);
		/* B.2 copy mbuf pointers. */
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
		rte_compiler_barrier();
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
		cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
		cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].rsvd1[3]);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].rsvd1[3]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd2[10]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd2[10]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
		pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
		pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
		pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
		_mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
		/* E.1 extract op_own field. */
		op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
		cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
		cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].rsvd1[3]);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].rsvd1[3]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd2[10]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd2[10]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
		pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
		pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
		pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
		/* E.1 extract op_own byte. */
		op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
		op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
		_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
		/* E.2 flip owner bit to mark CQEs from last round. */
		owner_mask = _mm_and_si128(op_own, owner_check);
		if (ownership)
			owner_mask = _mm_xor_si128(owner_mask, owner_check);
		owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);
		owner_mask = _mm_packs_epi32(owner_mask, zero);
		/* E.3 get mask for invalidated CQEs. */
		opcode = _mm_and_si128(op_own, opcode_check);
		invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
		invalid_mask = _mm_packs_epi32(invalid_mask, zero);
		/* E.4 mask out beyond boundary. */
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.5 merge invalid_mask with invalid owner. */
		invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
		/* F.1 find compressed CQE format. */
		comp_mask = _mm_and_si128(op_own, format_check);
		comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
		comp_mask = _mm_packs_epi32(comp_mask, zero);
		/* F.2 mask out invalid entries. */
		comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
		comp_idx = _mm_cvtsi128_si64(comp_mask);
		/* F.3 get the first compressed CQE. */
		comp_idx = comp_idx ?
			   __builtin_ctzll(comp_idx) /
			   (sizeof(uint16_t) * 8) :
			   MLX5_VPMD_DESCS_PER_LOOP;
		/* E.6 mask out entries after the compressed CQE. */
		mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.7 count non-compressed valid CQEs. */
		n = _mm_cvtsi128_si64(invalid_mask);
		n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
			MLX5_VPMD_DESCS_PER_LOOP;
		nocmp_n += n;
		/* D.2 get the final invalid mask. */
		mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* D.3 check error in opcode. */
		opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
		opcode = _mm_packs_epi32(opcode, zero);
		opcode = _mm_andnot_si128(invalid_mask, opcode);
		/* D.4 mark if any error is set. */
		rxq->pending_err |= !!_mm_cvtsi128_si64(opcode);
		/* D.5 fill in mbuf - rearm_data and packet_type. */
		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
		if (rxq->hw_timestamp) {
			pkts[pos]->timestamp =
				rte_be_to_cpu_64(cq[pos].timestamp);
			pkts[pos + 1]->timestamp =
				rte_be_to_cpu_64(cq[pos + p1].timestamp);
			pkts[pos + 2]->timestamp =
				rte_be_to_cpu_64(cq[pos + p2].timestamp);
			pkts[pos + 3]->timestamp =
				rte_be_to_cpu_64(cq[pos + p3].timestamp);
		}
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Add up received bytes count. */
		byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		/*
		 * Break the loop unless more valid CQEs are expected, or if
		 * there's a compressed CQE.
		 */
		if (n != MLX5_VPMD_DESCS_PER_LOOP)
			break;
	}
	/* If no new CQE seen, return without updating cq_db. */
	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
		return rcvd_pkt;
	/* Update the consumer indexes for non-compressed CQEs. */
	assert(nocmp_n <= pkts_n);
	rxq->cq_ci += nocmp_n;
	rxq->rq_pi += nocmp_n;
	rcvd_pkt += nocmp_n;
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += nocmp_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	/* Decompress the last CQE if compressed. */
	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
		assert(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
		rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);
		/* Return more packets if needed. */
		if (nocmp_n < pkts_n) {
			uint16_t n = rxq->cq_ci - rxq->rq_pi;

			n = RTE_MIN(n, pkts_n - nocmp_n);
			rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
			rxq->rq_pi += n;
			rcvd_pkt += n;
		}
	}
	rte_compiler_barrier();
	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
	return rcvd_pkt;
}

#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */