/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <smmintrin.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_rxtx_vec.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

/**
 * Fill in buffer descriptors in a multi-packet send descriptor.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param dseg
 *   Pointer to buffer descriptor to be written.
 * @param pkts
 *   Pointer to array of packets to be sent.
 * @param n
 *   Number of packets to be filled.
 */
static inline void
txq_wr_dseg_v(struct mlx5_txq_data *txq, __m128i *dseg,
	      struct rte_mbuf **pkts, unsigned int n)
{
	unsigned int pos;
	uintptr_t addr;
	const __m128i shuf_mask_dseg =
		_mm_set_epi8(8,  9, 10, 11, /* addr, bswap64 */
			     12, 13, 14, 15,
			     7,  6,  5,  4, /* lkey */
			     0,  1,  2,  3  /* length, bswap32 */);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t tx_byte = 0;
#endif

	for (pos = 0; pos < n; ++pos, ++dseg) {
		__m128i desc;
		struct rte_mbuf *pkt = pkts[pos];

		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
		desc = _mm_set_epi32(addr >> 32,
				     addr,
				     mlx5_tx_mb2mr(txq, pkt),
				     DATA_LEN(pkt));
		desc = _mm_shuffle_epi8(desc, shuf_mask_dseg);
		_mm_store_si128(dseg, desc);
#ifdef MLX5_PMD_SOFT_COUNTERS
		tx_byte += DATA_LEN(pkt);
#endif
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	txq->stats.obytes += tx_byte;
#endif
}

/**
 * Count the number of continuous single segment packets.
 *
 * @param pkts
 *   Pointer to array of packets.
 * @param pkts_n
 *   Number of packets.
 *
 * @return
 *   Number of continuous single segment packets.
 */
static inline unsigned int
txq_check_multiseg(struct rte_mbuf **pkts, uint16_t pkts_n)
{
	unsigned int pos;

	if (!pkts_n)
		return 0;
	/* Count the number of continuous single segment packets. */
	for (pos = 0; pos < pkts_n; ++pos)
		if (NB_SEGS(pkts[pos]) > 1)
			break;
	return pos;
}

/**
 * Count the number of packets having the same ol_flags and calculate cs_flags.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param pkts
 *   Pointer to array of packets.
 * @param pkts_n
 *   Number of packets.
 * @param cs_flags
 *   Pointer of flags to be returned.
 *
 * @return
 *   Number of packets having the same ol_flags.
 */
static inline unsigned int
txq_calc_offload(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
		 uint16_t pkts_n, uint8_t *cs_flags)
{
	unsigned int pos;
	const uint64_t ol_mask =
		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;

	if (!pkts_n)
		return 0;
	/* Count the number of packets having the same ol_flags. */
	for (pos = 1; pos < pkts_n; ++pos)
		if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask)
			break;
	/* Should open another MPW session for the rest. */
	if (pkts[0]->ol_flags &
	    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
		const uint64_t is_tunneled =
			pkts[0]->ol_flags &
			(PKT_TX_TUNNEL_GRE |
			 PKT_TX_TUNNEL_VXLAN);

		if (is_tunneled && txq->tunnel_en) {
			*cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
				    MLX5_ETH_WQE_L4_INNER_CSUM;
			if (pkts[0]->ol_flags & PKT_TX_OUTER_IP_CKSUM)
				*cs_flags |= MLX5_ETH_WQE_L3_CSUM;
		} else {
			*cs_flags = MLX5_ETH_WQE_L3_CSUM |
				    MLX5_ETH_WQE_L4_CSUM;
		}
	}
	return pos;
}

/**
 * Send multi-segmented packets until it encounters a single segment packet in
 * the pkts list.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param pkts
 *   Pointer to array of packets to be sent.
 * @param pkts_n
 *   Number of packets to be sent.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
static uint16_t
txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
	      uint16_t pkts_n)
{
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	const uint16_t wq_n = 1 << txq->wqe_n;
	const uint16_t wq_mask = wq_n - 1;
	const unsigned int nb_dword_per_wqebb =
		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
	const unsigned int nb_dword_in_hdr =
		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
	unsigned int n;
	volatile struct mlx5_wqe *wqe = NULL;

	assert(elts_n > pkts_n);
	mlx5_tx_complete(txq);
	if (unlikely(!pkts_n))
		return 0;
	for (n = 0; n < pkts_n; ++n) {
		struct rte_mbuf *buf = pkts[n];
		unsigned int segs_n = buf->nb_segs;
		unsigned int ds = nb_dword_in_hdr;
		unsigned int len = PKT_LEN(buf);
		uint16_t wqe_ci = txq->wqe_ci;
		const __m128i shuf_mask_ctrl =
			_mm_set_epi8(15, 14, 13, 12,
				     8,  9, 10, 11, /* bswap32 */
				     4,  5,  6,  7, /* bswap32 */
				     0,  1,  2,  3  /* bswap32 */);
		uint8_t cs_flags = 0;
		uint16_t max_elts;
		uint16_t max_wqe;
		__m128i *t_wqe, *dseg;
		__m128i ctrl;

		assert(segs_n);
		max_elts = elts_n - (elts_head - txq->elts_tail);
		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
		/*
		 * A MPW session consumes 2 WQEs at most to
		 * include MLX5_MPW_DSEG_MAX pointers.
		 */
		if (segs_n == 1 ||
		    max_elts < segs_n || max_wqe < 2)
			break;
		if (segs_n > MLX5_MPW_DSEG_MAX) {
			txq->stats.oerrors++;
			break;
		}
		wqe = &((volatile struct mlx5_wqe64 *)
			 txq->wqes)[wqe_ci & wq_mask].hdr;
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			const uint64_t is_tunneled = buf->ol_flags &
						     (PKT_TX_TUNNEL_GRE |
						      PKT_TX_TUNNEL_VXLAN);

			if (is_tunneled && txq->tunnel_en) {
				cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
					   MLX5_ETH_WQE_L4_INNER_CSUM;
				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
					cs_flags |= MLX5_ETH_WQE_L3_CSUM;
			} else {
				cs_flags = MLX5_ETH_WQE_L3_CSUM |
					   MLX5_ETH_WQE_L4_CSUM;
			}
		}
		/* Title WQEBB pointer. */
		t_wqe = (__m128i *)wqe;
		dseg = (__m128i *)(wqe + 1);
		do {
			if (!(ds++ % nb_dword_per_wqebb)) {
				dseg = (__m128i *)
					&((volatile struct mlx5_wqe64 *)
					   txq->wqes)[++wqe_ci & wq_mask];
			}
			txq_wr_dseg_v(txq, dseg++, &buf, 1);
			(*txq->elts)[elts_head++ & elts_m] = buf;
			buf = buf->next;
		} while (--segs_n);
		++wqe_ci;
		/* Fill CTRL in the header. */
		ctrl = _mm_set_epi32(0, 0, txq->qp_num_8s | ds,
				     MLX5_OPC_MOD_MPW << 24 |
				     txq->wqe_ci << 8 | MLX5_OPCODE_TSO);
		ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
		_mm_store_si128(t_wqe, ctrl);
		/* Fill ESEG in the header. */
		_mm_store_si128(t_wqe + 1,
				_mm_set_epi16(0, 0, 0, 0,
					      rte_cpu_to_be_16(len), cs_flags,
					      0, 0));
		txq->wqe_ci = wqe_ci;
	}
	if (!n)
		return 0;
	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
	txq->elts_head = elts_head;
	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
		wqe->ctrl[2] = rte_cpu_to_be_32(8);
		wqe->ctrl[3] = txq->elts_head;
		txq->elts_comp = 0;
		++txq->cq_pi;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	txq->stats.opackets += n;
#endif
	mlx5_tx_dbrec(txq, wqe);
	return n;
}

/**
 * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
 * it returns to make it processed by txq_scatter_v().
 * All the packets in the pkts list should be single segment packets having
 * the same offload flags. This must be checked by txq_check_multiseg() and
 * txq_calc_offload().
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param pkts
 *   Pointer to array of packets to be sent.
 * @param pkts_n
 *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
 * @param cs_flags
 *   Checksum offload flags to be written in the descriptor.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
static inline uint16_t
txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
	    uint8_t cs_flags)
{
	struct rte_mbuf **elts;
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	const unsigned int nb_dword_per_wqebb =
		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
	const unsigned int nb_dword_in_hdr =
		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
	unsigned int n = 0;
	unsigned int pos;
	uint16_t max_elts;
	uint16_t max_wqe;
	uint32_t comp_req = 0;
	const uint16_t wq_n = 1 << txq->wqe_n;
	const uint16_t wq_mask = wq_n - 1;
	uint16_t wq_idx = txq->wqe_ci & wq_mask;
	volatile struct mlx5_wqe64 *wq =
		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
	const __m128i shuf_mask_ctrl =
		_mm_set_epi8(15, 14, 13, 12,
			     8,  9, 10, 11, /* bswap32 */
			     4,  5,  6,  7, /* bswap32 */
			     0,  1,  2,  3  /* bswap32 */);
	__m128i *t_wqe, *dseg;
	__m128i ctrl;

	/* Make sure all packets can fit into a single WQE. */
	assert(elts_n > pkts_n);
	mlx5_tx_complete(txq);
	max_elts = (elts_n - (elts_head - txq->elts_tail));
	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
	assert(pkts_n <= MLX5_DSEG_MAX - nb_dword_in_hdr);
	if (unlikely(!pkts_n))
		return 0;
	elts = &(*txq->elts)[elts_head & elts_m];
	/* Loop for available tailroom first. */
	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
	for (pos = 0; pos < (n & -2); pos += 2)
		_mm_storeu_si128((__m128i *)&elts[pos],
				 _mm_loadu_si128((__m128i *)&pkts[pos]));
	if (n & 1)
		elts[pos] = pkts[pos];
	/* Check if it crosses the end of the queue. */
	if (unlikely(n < pkts_n)) {
		elts = &(*txq->elts)[0];
		for (pos = 0; pos < pkts_n - n; ++pos)
			elts[pos] = pkts[n + pos];
	}
	txq->elts_head += pkts_n;
	/* Save title WQEBB pointer. */
	t_wqe = (__m128i *)wqe;
	dseg = (__m128i *)(wqe + 1);
	/* Calculate the number of entries to the end. */
	n = RTE_MIN(
		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
		pkts_n);
	/* Fill DSEGs. */
	txq_wr_dseg_v(txq, dseg, pkts, n);
	/* Check if it crosses the end of the queue. */
	if (n < pkts_n) {
		dseg = (__m128i *)txq->wqes;
		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
	}
	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
		txq->elts_comp += pkts_n;
	} else {
		/* Request a completion. */
		txq->elts_comp = 0;
		++txq->cq_pi;
		comp_req = 8;
	}
	/* Fill CTRL in the header. */
	ctrl = _mm_set_epi32(txq->elts_head, comp_req,
			     txq->qp_num_8s | (pkts_n + 2),
			     MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
				txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW);
	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
	_mm_store_si128(t_wqe, ctrl);
	/* Fill ESEG in the header. */
	_mm_store_si128(t_wqe + 1,
			_mm_set_epi8(0, 0, 0, 0,
				     0, 0, 0, 0,
				     0, 0, 0, cs_flags,
				     0, 0, 0, 0));
#ifdef MLX5_PMD_SOFT_COUNTERS
	txq->stats.opackets += pkts_n;
#endif
	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
		       nb_dword_per_wqebb;
	/* Ring QP doorbell. */
	mlx5_tx_dbrec(txq, wqe);
	return pkts_n;
}

/**
 * DPDK callback for vectorized TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
		      uint16_t pkts_n)
{
	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
	uint16_t nb_tx = 0;

	while (pkts_n > nb_tx) {
		uint16_t n;
		uint16_t ret;

		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0);
		nb_tx += ret;
		if (!ret)
			break;
	}
	return nb_tx;
}

/**
 * DPDK callback for vectorized TX with multi-seg packets and offload.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
	uint16_t nb_tx = 0;

	while (pkts_n > nb_tx) {
		uint8_t cs_flags = 0;
		uint16_t n;
		uint16_t ret;

		/* Transmit multi-seg packets in the head of pkts list. */
		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) &&
		    NB_SEGS(pkts[nb_tx]) > 1)
			nb_tx += txq_scatter_v(txq,
					       &pkts[nb_tx],
					       pkts_n - nb_tx);
		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS))
			n = txq_check_multiseg(&pkts[nb_tx], n);
		if (!(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
			n = txq_calc_offload(txq, &pkts[nb_tx], n, &cs_flags);
		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags);
		nb_tx += ret;
		if (!ret)
			break;
	}
	return nb_tx;
}

/**
 * Store free buffers to RX SW ring.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param pkts
 *   Pointer to array of packets to be stored.
 * @param n
 *   Number of packets to be stored.
 */
static inline void
rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
{
	const uint16_t q_mask = (1 << rxq->elts_n) - 1;
	struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
	unsigned int pos;
	uint16_t p = n & -2;

	for (pos = 0; pos < p; pos += 2) {
		__m128i mbp;

		mbp = _mm_loadu_si128((__m128i *)&elts[pos]);
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp);
	}
	if (n & 1)
		pkts[pos] = elts[pos];
}

/**
 * Replenish buffers for RX in bulk.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param n
 *   Number of buffers to be replenished.
 */
static inline void
rxq_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
{
	const uint16_t q_n = 1 << rxq->elts_n;
	const uint16_t q_mask = q_n - 1;
	const uint16_t elts_idx = rxq->rq_ci & q_mask;
	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
	volatile struct mlx5_wqe_data_seg *wq = &(*rxq->wqes)[elts_idx];
	unsigned int i;

	assert(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH);
	assert(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi)));
	assert(MLX5_VPMD_RXQ_RPLNSH_THRESH > MLX5_VPMD_DESCS_PER_LOOP);
	/* Not to cross queue end. */
	n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
	if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
		rxq->stats.rx_nombuf += n;
		return;
	}
	for (i = 0; i < n; ++i)
		wq[i].addr = rte_cpu_to_be_64((uintptr_t)elts[i]->buf_addr +
					      RTE_PKTMBUF_HEADROOM);
	rxq->rq_ci += n;
	rte_io_wmb();
	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
}

/**
 * Decompress a compressed completion and fill in mbufs in RX SW ring with data
 * extracted from the title completion descriptor.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array having a compressed completion at first.
 * @param elts
 *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
 *   the title completion descriptor to be copied to the rest of mbufs.
 */
static inline void
rxq_cq_decompress_v(struct mlx5_rxq_data *rxq,
		    volatile struct mlx5_cqe *cq,
		    struct rte_mbuf **elts)
{
	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1);
	struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
	unsigned int pos;
	unsigned int i;
	unsigned int inv = 0;
	/* Mask to shuffle from extracted mini CQE to mbuf. */
	const __m128i shuf_mask1 =
		_mm_set_epi8(0,  1,  2,  3, /* rss, bswap32 */
			     -1, -1,        /* skip vlan_tci */
			     6,  7,         /* data_len, bswap16 */
			     -1, -1,  6,  7, /* pkt_len, bswap16 */
			     -1, -1, -1, -1 /* skip packet_type */);
	const __m128i shuf_mask2 =
		_mm_set_epi8(8,  9, 10, 11, /* rss, bswap32 */
			     -1, -1,        /* skip vlan_tci */
			     14, 15,        /* data_len, bswap16 */
			     -1, -1, 14, 15, /* pkt_len, bswap16 */
			     -1, -1, -1, -1 /* skip packet_type */);
	/* Restore the compressed count. Must be 16 bits. */
	const uint16_t mcqe_n = t_pkt->data_len +
				(rxq->crc_present * ETHER_CRC_LEN);
	const __m128i rearm =
		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
	const __m128i rxdf =
		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0,
			      rxq->crc_present * ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * ETHER_CRC_LEN,
			      0, 0);
	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
#ifdef MLX5_PMD_SOFT_COUNTERS
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     14, 15,  6,  7,
			     10, 11,  2,  3);
#endif

	/*
	 * Not to overflow elts array. Decompress next time after mbuf
	 * replenishment.
	 */
	if (unlikely(mcqe_n + MLX5_VPMD_DESCS_PER_LOOP >
		     (uint16_t)(rxq->rq_ci - rxq->cq_ci)))
		return;
	/*
	 * A. load mCQEs into a 128bit register.
	 * B. store rearm data to mbuf.
	 * C. combine data from mCQEs with rx_descriptor_fields1.
	 * D. store rx_descriptor_fields1.
	 * E. store flow tag (rte_flow mark).
	 */
	for (pos = 0; pos < mcqe_n; ) {
		__m128i mcqe1, mcqe2;
		__m128i rxdf1, rxdf2;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt, invalid_mask;
#endif

		if (!(pos & 0x7) && pos + 8 < mcqe_n)
			rte_prefetch0((void *)(cq + pos + 8));
		/* A.1 load mCQEs into a 128bit register. */
		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				 &elts[pos]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 1]->rx_descriptor_fields1,
				 rxdf2);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 2]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 3]->rx_descriptor_fields1,
				 rxdf2);
#ifdef MLX5_PMD_SOFT_COUNTERS
		invalid_mask = _mm_set_epi64x(0,
					      (mcqe_n - pos) *
					      sizeof(uint16_t) * 8);
		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
		mcqe1 = _mm_srli_si128(mcqe1, 4);
		byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc);
		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		if (rxq->mark) {
			/* E.1 store flow tag (rte_flow mark). */
			elts[pos]->hash.fdir.hi = flow_tag;
			elts[pos + 1]->hash.fdir.hi = flow_tag;
			elts[pos + 2]->hash.fdir.hi = flow_tag;
			elts[pos + 3]->hash.fdir.hi = flow_tag;
		}
		pos += MLX5_VPMD_DESCS_PER_LOOP;
		/* Move to next CQE and invalidate consumed CQEs. */
		if (!(pos & 0x7) && pos < mcqe_n) {
			mcq = (void *)(cq + pos);
			for (i = 0; i < 8; ++i)
				cq[inv++].op_own = MLX5_CQE_INVALIDATE;
		}
	}
	/* Invalidate the rest of CQEs. */
	for (; inv < mcqe_n; ++inv)
		cq[inv].op_own = MLX5_CQE_INVALIDATE;
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += mcqe_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	rxq->cq_ci += mcqe_n;
}

/**
 * Calculate packet type and offload flag for mbuf and store it.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cqes[4]
 *   Array of four 16bytes completions extracted from the original completion
 *   descriptor.
 * @param op_err
 *   Opcode vector having responder error status. Each field is 4B.
 * @param pkts
 *   Pointer to array of packets to be filled.
 */
static inline void
rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
			 __m128i op_err, struct rte_mbuf **pkts)
{
	__m128i pinfo0, pinfo1;
	__m128i pinfo, ptype;
	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH);
	__m128i cv_flags;
	const __m128i zero = _mm_setzero_si128();
	const __m128i ptype_mask =
		_mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06);
	const __m128i ptype_ol_mask =
		_mm_set_epi32(0x106, 0x106, 0x106, 0x106);
	const __m128i pinfo_mask =
		_mm_set_epi32(0x3, 0x3, 0x3, 0x3);
	const __m128i cv_flag_sel =
		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			     (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
					PKT_RX_L4_CKSUM_GOOD) >> 1),
			     0,
			     (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
			     0,
			     (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
			     (uint8_t)(PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED),
			     0);
	const __m128i cv_mask =
		_mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED);
	const __m128i mbuf_init =
		_mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;

	/* Extract pkt_info field. */
	pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
	pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1);
	/* Extract hdr_type_etc field. */
	pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]);
	ptype = _mm_unpacklo_epi64(pinfo0, pinfo1);
	if (rxq->mark) {
		const __m128i pinfo_ft_mask =
			_mm_set_epi32(0xffffff00, 0xffffff00,
				      0xffffff00, 0xffffff00);
		const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR);
		const __m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID);
		__m128i flow_tag, invalid_mask;

		flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask);
		/* Check if flow tag is non-zero then set PKT_RX_FDIR. */
		invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
		/* Mask out invalid entries. */
		flow_tag = _mm_andnot_si128(invalid_mask, flow_tag);
		/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(
						_mm_cmpeq_epi32(flow_tag,
								pinfo_ft_mask),
						fdir_id_flags));
	}
	/*
	 * Merge the two fields to generate the following:
	 * bit[1]     = l3_ok
	 * bit[2]     = l4_ok
	 * bit[8]     = cv
	 * bit[11:10] = l3_hdr_type
	 * bit[14:12] = l4_hdr_type
	 * bit[15]    = ip_frag
	 * bit[16]    = tunneled
	 * bit[17]    = outer_l3_type
	 */
	ptype = _mm_and_si128(ptype, ptype_mask);
	pinfo = _mm_and_si128(pinfo, pinfo_mask);
	pinfo = _mm_slli_epi32(pinfo, 16);
	/* Merge the fields into pinfo for the ol_flags calculation. */
	pinfo = _mm_or_si128(ptype, pinfo);
	ptype = _mm_srli_epi32(pinfo, 10);
	ptype = _mm_packs_epi32(ptype, zero);
	/* Errored packets will have RTE_PTYPE_ALL_MASK. */
	op_err = _mm_srli_epi16(op_err, 8);
	ptype = _mm_or_si128(ptype, op_err);
	pkts[0]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 0)];
	pkts[1]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 2)];
	pkts[2]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 4)];
	pkts[3]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 6)];
	/* Fill flags for checksum and VLAN. */
	pinfo = _mm_and_si128(pinfo, ptype_ol_mask);
	pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo);
	/* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
	cv_flags = _mm_slli_epi32(pinfo, 9);
	cv_flags = _mm_or_si128(pinfo, cv_flags);
	/* Move back flags to start from byte[0]. */
	cv_flags = _mm_srli_epi32(cv_flags, 8);
	/* Mask out garbage bits. */
	cv_flags = _mm_and_si128(cv_flags, cv_mask);
	/* Merge to ol_flags. */
	ol_flags = _mm_or_si128(ol_flags, cv_flags);
	/* Merge mbuf_init and ol_flags. */
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
	/* Write 8B rearm_data and 8B ol_flags. */
	_mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3);
}

/**
 * Skip error packets.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
static uint16_t
rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
			 uint16_t pkts_n)
{
	uint16_t n = 0;
	unsigned int i;
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t err_bytes = 0;
#endif

	for (i = 0; i < pkts_n; ++i) {
		struct rte_mbuf *pkt = pkts[i];

		if (pkt->packet_type == RTE_PTYPE_ALL_MASK) {
#ifdef MLX5_PMD_SOFT_COUNTERS
			err_bytes += PKT_LEN(pkt);
#endif
			rte_pktmbuf_free_seg(pkt);
		} else {
			pkts[n++] = pkt;
		}
	}
	rxq->stats.idropped += (pkts_n - n);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Correct counters of errored completions. */
	rxq->stats.ipackets -= (pkts_n - n);
	rxq->stats.ibytes -= err_bytes;
#endif
	rxq->pending_err = 0;
	return n;
}

/**
 * Receive burst of packets. An errored completion also consumes a mbuf, but the
 * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
 * before returning to application.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets received including errors (<= pkts_n).
 */
static inline uint16_t
rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	const uint16_t q_n = 1 << rxq->cqe_n;
	const uint16_t q_mask = q_n - 1;
	volatile struct mlx5_cqe *cq;
	struct rte_mbuf **elts;
	unsigned int pos;
	uint64_t n;
	uint16_t repl_n;
	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
	uint16_t nocmp_n = 0;
	uint16_t rcvd_pkt = 0;
	unsigned int cq_idx = rxq->cq_ci & q_mask;
	unsigned int elts_idx;
	unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
	const __m128i owner_check =
		_mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL);
	const __m128i opcode_check =
		_mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL);
	const __m128i format_check =
		_mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL);
	const __m128i resp_err_check =
		_mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     12, 13,  8,  9,
			     4,  5,  0,  1);
#endif
	/* Mask to shuffle from extracted CQE to mbuf. */
	const __m128i shuf_mask =
		_mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */
			     12, 13, 14, 15, /* rss, bswap32 */
			     10, 11,         /* vlan_tci, bswap16 */
			     4,  5,          /* data_len, bswap16 */
			     -1, -1,         /* zero out 2nd half of pkt_len */
			     4,  5           /* pkt_len, bswap16 */);
	/* Mask to blend from the last Qword to the first DQword. */
	const __m128i blend_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     0,  0,  0,  0,
			     0,  0,  0, -1);
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0, 0, 0,
			      rxq->crc_present * ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * ETHER_CRC_LEN);
	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);

	assert(rxq->sges_n == 0);
	assert(rxq->cqe_n == rxq->elts_n);
	cq = &(*rxq->cqes)[cq_idx];
	rte_prefetch0(cq);
	rte_prefetch0(cq + 1);
	rte_prefetch0(cq + 2);
	rte_prefetch0(cq + 3);
	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
	/*
	 * Order of indexes:
	 *   rq_ci >= cq_ci >= rq_pi
	 * Definition of indexes:
	 *   rq_ci - cq_ci := # of buffers owned by HW (posted).
	 *   cq_ci - rq_pi := # of buffers not returned to app (decompressed).
	 *   N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished).
	 */
	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
	if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH)
		rxq_replenish_bulk_mbuf(rxq, repl_n);
	/* See if there're unreturned mbufs from compressed CQE. */
	rcvd_pkt = rxq->cq_ci - rxq->rq_pi;
	if (rcvd_pkt > 0) {
		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
		rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
		rxq->rq_pi += rcvd_pkt;
		pkts += rcvd_pkt;
	}
	elts_idx = rxq->rq_pi & q_mask;
	elts = &(*rxq->elts)[elts_idx];
	pkts_n = RTE_MIN(pkts_n - rcvd_pkt,
			 (uint16_t)(rxq->rq_ci - rxq->cq_ci));
	/* Not to overflow pkts/elts array. */
	pkts_n = RTE_ALIGN_FLOOR(pkts_n, MLX5_VPMD_DESCS_PER_LOOP);
	/* Not to cross queue end. */
	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
	if (!pkts_n)
		return rcvd_pkt;
	/* At this point, there shouldn't be any remaining packets. */
	assert(rxq->rq_pi == rxq->cq_ci);
	/*
	 * A. load first Qword (8bytes) in one loop.
	 * B. copy 4 mbuf pointers from elts ring to returning pkts.
	 * C. load remaining CQE data and extract necessary fields.
	 *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
	 *    following structure:
	 *        struct {
	 *          uint8_t  pkt_info;
	 *          uint8_t  flow_tag[3];
	 *          uint16_t byte_cnt;
	 *          uint8_t  rsvd4;
	 *          uint8_t  op_own;
	 *          uint16_t hdr_type_etc;
	 *          uint16_t vlan_info;
	 *          uint32_t rx_has_res;
	 *        } c;
	 * D. fill in mbuf.
	 * E. get valid CQEs.
	 * F. find compressed CQE.
	 */
	for (pos = 0;
	     pos < pkts_n;
	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
		__m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
		__m128i cqe_tmp1, cqe_tmp2;
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i op_own, op_own_tmp1, op_own_tmp2;
		__m128i opcode, owner_mask, invalid_mask;
		__m128i comp_mask;
		__m128i mask;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt;
#endif
		__m128i mbp1, mbp2;
		__m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
		unsigned int p1, p2, p3;

		/* Prefetch next 4 CQEs. */
		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
		}
		/* A.0 do not cross the end of CQ. */
		mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		p = _mm_andnot_si128(mask, p);
		/* A.1 load cqes. */
		p3 = _mm_extract_epi16(p, 3);
		cqes[3] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p3].sop_drop_qpn);
		rte_compiler_barrier();
		p2 = _mm_extract_epi16(p, 2);
		cqes[2] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p2].sop_drop_qpn);
		rte_compiler_barrier();
		/* B.1 load mbuf pointers. */
		mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]);
		mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]);
		/* A.1 load a block having op_own. */
		p1 = _mm_extract_epi16(p, 1);
		cqes[1] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p1].sop_drop_qpn);
		rte_compiler_barrier();
		cqes[0] = _mm_loadl_epi64((__m128i *)
					   &cq[pos].sop_drop_qpn);
		/* B.2 copy mbuf pointers. */
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
		rte_compiler_barrier();
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
		cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
		cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].rsvd1[3]);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].rsvd1[3]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd2[10]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd2[10]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
		pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
		pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
		pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
		_mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
		/* E.1 extract op_own field. */
		op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
		cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
		cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].rsvd1[3]);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].rsvd1[3]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd2[10]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd2[10]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
		pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
		pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
		pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
		/* E.1 extract op_own byte. */
		op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
		op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
		_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
		/* E.2 flip owner bit to mark CQEs from last round. */
		owner_mask = _mm_and_si128(op_own, owner_check);
		if (ownership)
			owner_mask = _mm_xor_si128(owner_mask, owner_check);
		owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);
		owner_mask = _mm_packs_epi32(owner_mask, zero);
		/* E.3 get mask for invalidated CQEs. */
		opcode = _mm_and_si128(op_own, opcode_check);
		invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
		invalid_mask = _mm_packs_epi32(invalid_mask, zero);
		/* E.4 mask out beyond boundary. */
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.5 merge invalid_mask with invalid owner. */
		invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
		/* F.1 find compressed CQE format. */
		comp_mask = _mm_and_si128(op_own, format_check);
		comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
		comp_mask = _mm_packs_epi32(comp_mask, zero);
		/* F.2 mask out invalid entries. */
		comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
		comp_idx = _mm_cvtsi128_si64(comp_mask);
		/* F.3 get the first compressed CQE. */
		comp_idx = comp_idx ?
			   __builtin_ctzll(comp_idx) /
				(sizeof(uint16_t) * 8) :
			   MLX5_VPMD_DESCS_PER_LOOP;
		/* E.6 mask out entries after the compressed CQE. */
		mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.7 count non-compressed valid CQEs. */
		n = _mm_cvtsi128_si64(invalid_mask);
		n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
			MLX5_VPMD_DESCS_PER_LOOP;
		nocmp_n += n;
		/* D.2 get the final invalid mask. */
		mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* D.3 check error in opcode. */
		opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
		opcode = _mm_packs_epi32(opcode, zero);
		opcode = _mm_andnot_si128(invalid_mask, opcode);
		/* D.4 mark if any error is set. */
		rxq->pending_err |= !!_mm_cvtsi128_si64(opcode);
		/* D.5 fill in mbuf - rearm_data and packet_type. */
		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Add up received bytes count. */
		byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		/*
		 * Break the loop unless more valid CQE is expected, or if
		 * there's a compressed CQE.
		 */
		if (n != MLX5_VPMD_DESCS_PER_LOOP)
			break;
	}
	/* If no new CQE seen, return without updating cq_db. */
	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
		return rcvd_pkt;
	/* Update the consumer indexes for non-compressed CQEs. */
	assert(nocmp_n <= pkts_n);
	rxq->cq_ci += nocmp_n;
	rxq->rq_pi += nocmp_n;
	rcvd_pkt += nocmp_n;
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += nocmp_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	/* Decompress the last CQE if compressed. */
	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
		assert(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
		rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);
		/* Return more packets if needed. */
		if (nocmp_n < pkts_n) {
			uint16_t n = rxq->cq_ci - rxq->rq_pi;

			n = RTE_MIN(n, pkts_n - nocmp_n);
			rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
			rxq->rq_pi += n;
			rcvd_pkt += n;
		}
	}
	rte_compiler_barrier();
	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
	return rcvd_pkt;
}

/**
 * DPDK callback for vectorized RX.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct mlx5_rxq_data *rxq = dpdk_rxq;
	uint16_t nb_rx;

	nb_rx = rxq_burst_v(rxq, pkts, pkts_n);
	if (unlikely(rxq->pending_err))
		nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);
	return nb_rx;
}

/**
 * Check whether Tx queue flags are set for raw vectorized Tx.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   1 if supported, negative errno value if not.
 */
int __attribute__((cold))
priv_check_raw_vec_tx_support(struct priv *priv)
{
	uint16_t i;

	/* All the configured queues should support it. */
	for (i = 0; i < priv->txqs_n; ++i) {
		struct mlx5_txq_data *txq = (*priv->txqs)[i];

		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) ||
		    !(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
			break;
	}
	if (i != priv->txqs_n)
		return -ENOTSUP;
	return 1;
}

/**
 * Check whether a device can support vectorized TX.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   1 if supported, negative errno value if not.
 */
int __attribute__((cold))
priv_check_vec_tx_support(struct priv *priv)
{
	if (!priv->tx_vec_en ||
	    priv->txqs_n > MLX5_VPMD_MIN_TXQS ||
	    priv->mps != MLX5_MPW_ENHANCED ||
	    priv->tso)
		return -ENOTSUP;
	return 1;
}

/**
 * Check whether a RX queue can support vectorized RX.
 *
 * @param rxq
 *   Pointer to RX queue.
 *
 * @return
 *   1 if supported, negative errno value if not.
 */
int __attribute__((cold))
rxq_check_vec_support(struct mlx5_rxq_data *rxq)
{
	struct mlx5_rxq_ctrl *ctrl =
		container_of(rxq, struct mlx5_rxq_ctrl, rxq);

	if (!ctrl->priv->rx_vec_en || rxq->sges_n != 0)
		return -ENOTSUP;
	return 1;
}

/**
 * Check whether a device can support vectorized RX.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   1 if supported, negative errno value if not.
 */
int __attribute__((cold))
priv_check_vec_rx_support(struct priv *priv)
{
	uint16_t i;

	if (!priv->rx_vec_en)
		return -ENOTSUP;
	/* All the configured queues should support it. */
	for (i = 0; i < priv->rxqs_n; ++i) {
		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];

		if (!rxq)
			continue;
		if (rxq_check_vec_support(rxq) < 0)
			break;
	}
	if (i != priv->rxqs_n)
		return -ENOTSUP;
	return 1;
}