1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <assert.h> 35 #include <stdint.h> 36 #include <string.h> 37 #include <stdlib.h> 38 39 /* Verbs header. */ 40 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ 41 #ifdef PEDANTIC 42 #pragma GCC diagnostic ignored "-Wpedantic" 43 #endif 44 #include <infiniband/verbs.h> 45 #include <infiniband/mlx5_hw.h> 46 #include <infiniband/arch.h> 47 #ifdef PEDANTIC 48 #pragma GCC diagnostic error "-Wpedantic" 49 #endif 50 51 /* DPDK headers don't like -pedantic. */ 52 #ifdef PEDANTIC 53 #pragma GCC diagnostic ignored "-Wpedantic" 54 #endif 55 #include <rte_mbuf.h> 56 #include <rte_mempool.h> 57 #include <rte_prefetch.h> 58 #include <rte_common.h> 59 #include <rte_branch_prediction.h> 60 #include <rte_ether.h> 61 #ifdef PEDANTIC 62 #pragma GCC diagnostic error "-Wpedantic" 63 #endif 64 65 #include "mlx5.h" 66 #include "mlx5_utils.h" 67 #include "mlx5_rxtx.h" 68 #include "mlx5_autoconf.h" 69 #include "mlx5_defs.h" 70 #include "mlx5_prm.h" 71 72 static inline int 73 check_cqe(volatile struct mlx5_cqe *cqe, 74 unsigned int cqes_n, const uint16_t ci) 75 __attribute__((always_inline)); 76 77 static inline void 78 txq_complete(struct txq *txq) __attribute__((always_inline)); 79 80 static inline uint32_t 81 txq_mp2mr(struct txq *txq, struct rte_mempool *mp) 82 __attribute__((always_inline)); 83 84 static inline void 85 mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe) 86 __attribute__((always_inline)); 87 88 static inline uint32_t 89 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) 90 __attribute__((always_inline)); 91 92 static inline int 93 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe, 94 uint16_t cqe_cnt, uint32_t *rss_hash) 95 __attribute__((always_inline)); 96 97 static inline uint32_t 98 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe) 99 __attribute__((always_inline)); 100 101 #ifndef NDEBUG 102 103 /** 104 * Verify or set magic value in CQE. 105 * 106 * @param cqe 107 * Pointer to CQE. 108 * 109 * @return 110 * 0 the first time. 111 */ 112 static inline int 113 check_cqe_seen(volatile struct mlx5_cqe *cqe) 114 { 115 static const uint8_t magic[] = "seen"; 116 volatile uint8_t (*buf)[sizeof(cqe->rsvd0)] = &cqe->rsvd0; 117 int ret = 1; 118 unsigned int i; 119 120 for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i) 121 if (!ret || (*buf)[i] != magic[i]) { 122 ret = 0; 123 (*buf)[i] = magic[i]; 124 } 125 return ret; 126 } 127 128 #endif /* NDEBUG */ 129 130 /** 131 * Check whether CQE is valid. 132 * 133 * @param cqe 134 * Pointer to CQE. 135 * @param cqes_n 136 * Size of completion queue. 137 * @param ci 138 * Consumer index. 139 * 140 * @return 141 * 0 on success, 1 on failure. 142 */ 143 static inline int 144 check_cqe(volatile struct mlx5_cqe *cqe, 145 unsigned int cqes_n, const uint16_t ci) 146 { 147 uint16_t idx = ci & cqes_n; 148 uint8_t op_own = cqe->op_own; 149 uint8_t op_owner = MLX5_CQE_OWNER(op_own); 150 uint8_t op_code = MLX5_CQE_OPCODE(op_own); 151 152 if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID))) 153 return 1; /* No CQE. */ 154 #ifndef NDEBUG 155 if ((op_code == MLX5_CQE_RESP_ERR) || 156 (op_code == MLX5_CQE_REQ_ERR)) { 157 volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe; 158 uint8_t syndrome = err_cqe->syndrome; 159 160 if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) || 161 (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR)) 162 return 0; 163 if (!check_cqe_seen(cqe)) 164 ERROR("unexpected CQE error %u (0x%02x)" 165 " syndrome 0x%02x", 166 op_code, op_code, syndrome); 167 return 1; 168 } else if ((op_code != MLX5_CQE_RESP_SEND) && 169 (op_code != MLX5_CQE_REQ)) { 170 if (!check_cqe_seen(cqe)) 171 ERROR("unexpected CQE opcode %u (0x%02x)", 172 op_code, op_code); 173 return 1; 174 } 175 #endif /* NDEBUG */ 176 return 0; 177 } 178 179 /** 180 * Return the address of the WQE. 181 * 182 * @param txq 183 * Pointer to TX queue structure. 184 * @param wqe_ci 185 * WQE consumer index. 186 * 187 * @return 188 * WQE address. 189 */ 190 static inline uintptr_t * 191 tx_mlx5_wqe(struct txq *txq, uint16_t ci) 192 { 193 ci &= ((1 << txq->wqe_n) - 1); 194 return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE); 195 } 196 197 /** 198 * Return the size of tailroom of WQ. 199 * 200 * @param txq 201 * Pointer to TX queue structure. 202 * @param addr 203 * Pointer to tail of WQ. 204 * 205 * @return 206 * Size of tailroom. 207 */ 208 static inline size_t 209 tx_mlx5_wq_tailroom(struct txq *txq, void *addr) 210 { 211 size_t tailroom; 212 tailroom = (uintptr_t)(txq->wqes) + 213 (1 << txq->wqe_n) * MLX5_WQE_SIZE - 214 (uintptr_t)addr; 215 return tailroom; 216 } 217 218 /** 219 * Copy data to tailroom of circular queue. 220 * 221 * @param dst 222 * Pointer to destination. 223 * @param src 224 * Pointer to source. 225 * @param n 226 * Number of bytes to copy. 227 * @param base 228 * Pointer to head of queue. 229 * @param tailroom 230 * Size of tailroom from dst. 231 * 232 * @return 233 * Pointer after copied data. 234 */ 235 static inline void * 236 mlx5_copy_to_wq(void *dst, const void *src, size_t n, 237 void *base, size_t tailroom) 238 { 239 void *ret; 240 241 if (n > tailroom) { 242 rte_memcpy(dst, src, tailroom); 243 rte_memcpy(base, (void *)((uintptr_t)src + tailroom), 244 n - tailroom); 245 ret = (uint8_t *)base + n - tailroom; 246 } else { 247 rte_memcpy(dst, src, n); 248 ret = (n == tailroom) ? base : (uint8_t *)dst + n; 249 } 250 return ret; 251 } 252 253 /** 254 * Manage TX completions. 255 * 256 * When sending a burst, mlx5_tx_burst() posts several WRs. 257 * 258 * @param txq 259 * Pointer to TX queue structure. 260 */ 261 static inline void 262 txq_complete(struct txq *txq) 263 { 264 const unsigned int elts_n = 1 << txq->elts_n; 265 const unsigned int cqe_n = 1 << txq->cqe_n; 266 const unsigned int cqe_cnt = cqe_n - 1; 267 uint16_t elts_free = txq->elts_tail; 268 uint16_t elts_tail; 269 uint16_t cq_ci = txq->cq_ci; 270 volatile struct mlx5_cqe *cqe = NULL; 271 volatile struct mlx5_wqe_ctrl *ctrl; 272 273 do { 274 volatile struct mlx5_cqe *tmp; 275 276 tmp = &(*txq->cqes)[cq_ci & cqe_cnt]; 277 if (check_cqe(tmp, cqe_n, cq_ci)) 278 break; 279 cqe = tmp; 280 #ifndef NDEBUG 281 if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) { 282 if (!check_cqe_seen(cqe)) 283 ERROR("unexpected compressed CQE, TX stopped"); 284 return; 285 } 286 if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) || 287 (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) { 288 if (!check_cqe_seen(cqe)) 289 ERROR("unexpected error CQE, TX stopped"); 290 return; 291 } 292 #endif /* NDEBUG */ 293 ++cq_ci; 294 } while (1); 295 if (unlikely(cqe == NULL)) 296 return; 297 txq->wqe_pi = ntohs(cqe->wqe_counter); 298 ctrl = (volatile struct mlx5_wqe_ctrl *) 299 tx_mlx5_wqe(txq, txq->wqe_pi); 300 elts_tail = ctrl->ctrl3; 301 assert(elts_tail < (1 << txq->wqe_n)); 302 /* Free buffers. */ 303 while (elts_free != elts_tail) { 304 struct rte_mbuf *elt = (*txq->elts)[elts_free]; 305 unsigned int elts_free_next = 306 (elts_free + 1) & (elts_n - 1); 307 struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next]; 308 309 #ifndef NDEBUG 310 /* Poisoning. */ 311 memset(&(*txq->elts)[elts_free], 312 0x66, 313 sizeof((*txq->elts)[elts_free])); 314 #endif 315 RTE_MBUF_PREFETCH_TO_FREE(elt_next); 316 /* Only one segment needs to be freed. */ 317 rte_pktmbuf_free_seg(elt); 318 elts_free = elts_free_next; 319 } 320 txq->cq_ci = cq_ci; 321 txq->elts_tail = elts_tail; 322 /* Update the consumer index. */ 323 rte_wmb(); 324 *txq->cq_db = htonl(cq_ci); 325 } 326 327 /** 328 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which 329 * the cloned mbuf is allocated is returned instead. 330 * 331 * @param buf 332 * Pointer to mbuf. 333 * 334 * @return 335 * Memory pool where data is located for given mbuf. 336 */ 337 static struct rte_mempool * 338 txq_mb2mp(struct rte_mbuf *buf) 339 { 340 if (unlikely(RTE_MBUF_INDIRECT(buf))) 341 return rte_mbuf_from_indirect(buf)->pool; 342 return buf->pool; 343 } 344 345 /** 346 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[]. 347 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, 348 * remove an entry first. 349 * 350 * @param txq 351 * Pointer to TX queue structure. 352 * @param[in] mp 353 * Memory Pool for which a Memory Region lkey must be returned. 354 * 355 * @return 356 * mr->lkey on success, (uint32_t)-1 on failure. 357 */ 358 static inline uint32_t 359 txq_mp2mr(struct txq *txq, struct rte_mempool *mp) 360 { 361 unsigned int i; 362 uint32_t lkey = (uint32_t)-1; 363 364 for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { 365 if (unlikely(txq->mp2mr[i].mp == NULL)) { 366 /* Unknown MP, add a new MR for it. */ 367 break; 368 } 369 if (txq->mp2mr[i].mp == mp) { 370 assert(txq->mp2mr[i].lkey != (uint32_t)-1); 371 assert(htonl(txq->mp2mr[i].mr->lkey) == 372 txq->mp2mr[i].lkey); 373 lkey = txq->mp2mr[i].lkey; 374 break; 375 } 376 } 377 if (unlikely(lkey == (uint32_t)-1)) 378 lkey = txq_mp2mr_reg(txq, mp, i); 379 return lkey; 380 } 381 382 /** 383 * Ring TX queue doorbell. 384 * 385 * @param txq 386 * Pointer to TX queue structure. 387 * @param wqe 388 * Pointer to the last WQE posted in the NIC. 389 */ 390 static inline void 391 mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe) 392 { 393 uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg); 394 volatile uint64_t *src = ((volatile uint64_t *)wqe); 395 396 rte_wmb(); 397 *txq->qp_db = htonl(txq->wqe_ci); 398 /* Ensure ordering between DB record and BF copy. */ 399 rte_wmb(); 400 *dst = *src; 401 } 402 403 /** 404 * DPDK callback to check the status of a tx descriptor. 405 * 406 * @param tx_queue 407 * The tx queue. 408 * @param[in] offset 409 * The index of the descriptor in the ring. 410 * 411 * @return 412 * The status of the tx descriptor. 413 */ 414 int 415 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) 416 { 417 struct txq *txq = tx_queue; 418 const unsigned int elts_n = 1 << txq->elts_n; 419 const unsigned int elts_cnt = elts_n - 1; 420 unsigned int used; 421 422 txq_complete(txq); 423 used = (txq->elts_head - txq->elts_tail) & elts_cnt; 424 if (offset < used) 425 return RTE_ETH_TX_DESC_FULL; 426 return RTE_ETH_TX_DESC_DONE; 427 } 428 429 /** 430 * DPDK callback to check the status of a rx descriptor. 431 * 432 * @param rx_queue 433 * The rx queue. 434 * @param[in] offset 435 * The index of the descriptor in the ring. 436 * 437 * @return 438 * The status of the tx descriptor. 439 */ 440 int 441 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) 442 { 443 struct rxq *rxq = rx_queue; 444 struct rxq_zip *zip = &rxq->zip; 445 volatile struct mlx5_cqe *cqe; 446 const unsigned int cqe_n = (1 << rxq->cqe_n); 447 const unsigned int cqe_cnt = cqe_n - 1; 448 unsigned int cq_ci; 449 unsigned int used; 450 451 /* if we are processing a compressed cqe */ 452 if (zip->ai) { 453 used = zip->cqe_cnt - zip->ca; 454 cq_ci = zip->cq_ci; 455 } else { 456 used = 0; 457 cq_ci = rxq->cq_ci; 458 } 459 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 460 while (check_cqe(cqe, cqe_n, cq_ci) == 0) { 461 int8_t op_own; 462 unsigned int n; 463 464 op_own = cqe->op_own; 465 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) 466 n = ntohl(cqe->byte_cnt); 467 else 468 n = 1; 469 cq_ci += n; 470 used += n; 471 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 472 } 473 used = RTE_MIN(used, (1U << rxq->elts_n) - 1); 474 if (offset < used) 475 return RTE_ETH_RX_DESC_DONE; 476 return RTE_ETH_RX_DESC_AVAIL; 477 } 478 479 /** 480 * DPDK callback for TX. 481 * 482 * @param dpdk_txq 483 * Generic pointer to TX queue structure. 484 * @param[in] pkts 485 * Packets to transmit. 486 * @param pkts_n 487 * Number of packets in array. 488 * 489 * @return 490 * Number of packets successfully transmitted (<= pkts_n). 491 */ 492 uint16_t 493 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 494 { 495 struct txq *txq = (struct txq *)dpdk_txq; 496 uint16_t elts_head = txq->elts_head; 497 const unsigned int elts_n = 1 << txq->elts_n; 498 unsigned int i = 0; 499 unsigned int j = 0; 500 unsigned int k = 0; 501 unsigned int max; 502 uint16_t max_wqe; 503 unsigned int comp; 504 volatile struct mlx5_wqe_v *wqe = NULL; 505 unsigned int segs_n = 0; 506 struct rte_mbuf *buf = NULL; 507 uint8_t *raw; 508 509 if (unlikely(!pkts_n)) 510 return 0; 511 /* Prefetch first packet cacheline. */ 512 rte_prefetch0(*pkts); 513 /* Start processing. */ 514 txq_complete(txq); 515 max = (elts_n - (elts_head - txq->elts_tail)); 516 if (max > elts_n) 517 max -= elts_n; 518 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 519 if (unlikely(!max_wqe)) 520 return 0; 521 do { 522 volatile rte_v128u32_t *dseg = NULL; 523 uint32_t length; 524 unsigned int ds = 0; 525 uintptr_t addr; 526 uint64_t naddr; 527 uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2; 528 uint16_t tso_header_sz = 0; 529 uint16_t ehdr; 530 uint8_t cs_flags = 0; 531 uint64_t tso = 0; 532 #ifdef MLX5_PMD_SOFT_COUNTERS 533 uint32_t total_length = 0; 534 #endif 535 536 /* first_seg */ 537 buf = *(pkts++); 538 segs_n = buf->nb_segs; 539 /* 540 * Make sure there is enough room to store this packet and 541 * that one ring entry remains unused. 542 */ 543 assert(segs_n); 544 if (max < segs_n + 1) 545 break; 546 max -= segs_n; 547 --segs_n; 548 if (!segs_n) 549 --pkts_n; 550 if (unlikely(--max_wqe == 0)) 551 break; 552 wqe = (volatile struct mlx5_wqe_v *) 553 tx_mlx5_wqe(txq, txq->wqe_ci); 554 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 555 if (pkts_n > 1) 556 rte_prefetch0(*pkts); 557 addr = rte_pktmbuf_mtod(buf, uintptr_t); 558 length = DATA_LEN(buf); 559 ehdr = (((uint8_t *)addr)[1] << 8) | 560 ((uint8_t *)addr)[0]; 561 #ifdef MLX5_PMD_SOFT_COUNTERS 562 total_length = length; 563 #endif 564 if (length < (MLX5_WQE_DWORD_SIZE + 2)) 565 break; 566 /* Update element. */ 567 (*txq->elts)[elts_head] = buf; 568 elts_head = (elts_head + 1) & (elts_n - 1); 569 /* Prefetch next buffer data. */ 570 if (pkts_n > 1) { 571 volatile void *pkt_addr; 572 573 pkt_addr = rte_pktmbuf_mtod(*pkts, volatile void *); 574 rte_prefetch0(pkt_addr); 575 } 576 /* Should we enable HW CKSUM offload */ 577 if (buf->ol_flags & 578 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { 579 const uint64_t is_tunneled = buf->ol_flags & 580 (PKT_TX_TUNNEL_GRE | 581 PKT_TX_TUNNEL_VXLAN); 582 583 if (is_tunneled && txq->tunnel_en) { 584 cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM | 585 MLX5_ETH_WQE_L4_INNER_CSUM; 586 if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM) 587 cs_flags |= MLX5_ETH_WQE_L3_CSUM; 588 } else { 589 cs_flags = MLX5_ETH_WQE_L3_CSUM | 590 MLX5_ETH_WQE_L4_CSUM; 591 } 592 } 593 raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE; 594 /* Replace the Ethernet type by the VLAN if necessary. */ 595 if (buf->ol_flags & PKT_TX_VLAN_PKT) { 596 uint32_t vlan = htonl(0x81000000 | buf->vlan_tci); 597 unsigned int len = 2 * ETHER_ADDR_LEN - 2; 598 599 addr += 2; 600 length -= 2; 601 /* Copy Destination and source mac address. */ 602 memcpy((uint8_t *)raw, ((uint8_t *)addr), len); 603 /* Copy VLAN. */ 604 memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan)); 605 /* Copy missing two bytes to end the DSeg. */ 606 memcpy((uint8_t *)raw + len + sizeof(vlan), 607 ((uint8_t *)addr) + len, 2); 608 addr += len + 2; 609 length -= (len + 2); 610 } else { 611 memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2, 612 MLX5_WQE_DWORD_SIZE); 613 length -= pkt_inline_sz; 614 addr += pkt_inline_sz; 615 } 616 if (txq->tso_en) { 617 tso = buf->ol_flags & PKT_TX_TCP_SEG; 618 if (tso) { 619 uintptr_t end = (uintptr_t) 620 (((uintptr_t)txq->wqes) + 621 (1 << txq->wqe_n) * 622 MLX5_WQE_SIZE); 623 unsigned int copy_b; 624 uint8_t vlan_sz = (buf->ol_flags & 625 PKT_TX_VLAN_PKT) ? 4 : 0; 626 const uint64_t is_tunneled = 627 buf->ol_flags & 628 (PKT_TX_TUNNEL_GRE | 629 PKT_TX_TUNNEL_VXLAN); 630 631 tso_header_sz = buf->l2_len + vlan_sz + 632 buf->l3_len + buf->l4_len; 633 634 if (is_tunneled && txq->tunnel_en) { 635 tso_header_sz += buf->outer_l2_len + 636 buf->outer_l3_len; 637 cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM; 638 } else { 639 cs_flags |= MLX5_ETH_WQE_L4_CSUM; 640 } 641 if (unlikely(tso_header_sz > 642 MLX5_MAX_TSO_HEADER)) 643 break; 644 copy_b = tso_header_sz - pkt_inline_sz; 645 /* First seg must contain all headers. */ 646 assert(copy_b <= length); 647 raw += MLX5_WQE_DWORD_SIZE; 648 if (copy_b && 649 ((end - (uintptr_t)raw) > copy_b)) { 650 uint16_t n = (MLX5_WQE_DS(copy_b) - 651 1 + 3) / 4; 652 653 if (unlikely(max_wqe < n)) 654 break; 655 max_wqe -= n; 656 rte_memcpy((void *)raw, 657 (void *)addr, copy_b); 658 addr += copy_b; 659 length -= copy_b; 660 pkt_inline_sz += copy_b; 661 /* 662 * Another DWORD will be added 663 * in the inline part. 664 */ 665 raw += MLX5_WQE_DS(copy_b) * 666 MLX5_WQE_DWORD_SIZE - 667 MLX5_WQE_DWORD_SIZE; 668 } else { 669 /* NOP WQE. */ 670 wqe->ctrl = (rte_v128u32_t){ 671 htonl(txq->wqe_ci << 8), 672 htonl(txq->qp_num_8s | 1), 673 0, 674 0, 675 }; 676 ds = 1; 677 total_length = 0; 678 pkts--; 679 pkts_n++; 680 elts_head = (elts_head - 1) & 681 (elts_n - 1); 682 k++; 683 goto next_wqe; 684 } 685 } 686 } 687 /* Inline if enough room. */ 688 if (txq->inline_en || tso) { 689 uintptr_t end = (uintptr_t) 690 (((uintptr_t)txq->wqes) + 691 (1 << txq->wqe_n) * MLX5_WQE_SIZE); 692 unsigned int max_inline = txq->max_inline * 693 RTE_CACHE_LINE_SIZE - 694 (pkt_inline_sz - 2); 695 uintptr_t addr_end = (addr + max_inline) & 696 ~(RTE_CACHE_LINE_SIZE - 1); 697 unsigned int copy_b = (addr_end > addr) ? 698 RTE_MIN((addr_end - addr), length) : 699 0; 700 701 raw += MLX5_WQE_DWORD_SIZE; 702 if (copy_b && ((end - (uintptr_t)raw) > copy_b)) { 703 /* 704 * One Dseg remains in the current WQE. To 705 * keep the computation positive, it is 706 * removed after the bytes to Dseg conversion. 707 */ 708 uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4; 709 710 if (unlikely(max_wqe < n)) 711 break; 712 max_wqe -= n; 713 if (tso) { 714 uint32_t inl = 715 htonl(copy_b | MLX5_INLINE_SEG); 716 717 pkt_inline_sz = 718 MLX5_WQE_DS(tso_header_sz) * 719 MLX5_WQE_DWORD_SIZE; 720 rte_memcpy((void *)raw, 721 (void *)&inl, sizeof(inl)); 722 raw += sizeof(inl); 723 pkt_inline_sz += sizeof(inl); 724 } 725 rte_memcpy((void *)raw, (void *)addr, copy_b); 726 addr += copy_b; 727 length -= copy_b; 728 pkt_inline_sz += copy_b; 729 } 730 /* 731 * 2 DWORDs consumed by the WQE header + ETH segment + 732 * the size of the inline part of the packet. 733 */ 734 ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2); 735 if (length > 0) { 736 if (ds % (MLX5_WQE_SIZE / 737 MLX5_WQE_DWORD_SIZE) == 0) { 738 if (unlikely(--max_wqe == 0)) 739 break; 740 dseg = (volatile rte_v128u32_t *) 741 tx_mlx5_wqe(txq, txq->wqe_ci + 742 ds / 4); 743 } else { 744 dseg = (volatile rte_v128u32_t *) 745 ((uintptr_t)wqe + 746 (ds * MLX5_WQE_DWORD_SIZE)); 747 } 748 goto use_dseg; 749 } else if (!segs_n) { 750 goto next_pkt; 751 } else { 752 /* dseg will be advance as part of next_seg */ 753 dseg = (volatile rte_v128u32_t *) 754 ((uintptr_t)wqe + 755 ((ds - 1) * MLX5_WQE_DWORD_SIZE)); 756 goto next_seg; 757 } 758 } else { 759 /* 760 * No inline has been done in the packet, only the 761 * Ethernet Header as been stored. 762 */ 763 dseg = (volatile rte_v128u32_t *) 764 ((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE)); 765 ds = 3; 766 use_dseg: 767 /* Add the remaining packet as a simple ds. */ 768 naddr = htonll(addr); 769 *dseg = (rte_v128u32_t){ 770 htonl(length), 771 txq_mp2mr(txq, txq_mb2mp(buf)), 772 naddr, 773 naddr >> 32, 774 }; 775 ++ds; 776 if (!segs_n) 777 goto next_pkt; 778 } 779 next_seg: 780 assert(buf); 781 assert(ds); 782 assert(wqe); 783 /* 784 * Spill on next WQE when the current one does not have 785 * enough room left. Size of WQE must a be a multiple 786 * of data segment size. 787 */ 788 assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE)); 789 if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) { 790 if (unlikely(--max_wqe == 0)) 791 break; 792 dseg = (volatile rte_v128u32_t *) 793 tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4); 794 rte_prefetch0(tx_mlx5_wqe(txq, 795 txq->wqe_ci + ds / 4 + 1)); 796 } else { 797 ++dseg; 798 } 799 ++ds; 800 buf = buf->next; 801 assert(buf); 802 length = DATA_LEN(buf); 803 #ifdef MLX5_PMD_SOFT_COUNTERS 804 total_length += length; 805 #endif 806 /* Store segment information. */ 807 naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)); 808 *dseg = (rte_v128u32_t){ 809 htonl(length), 810 txq_mp2mr(txq, txq_mb2mp(buf)), 811 naddr, 812 naddr >> 32, 813 }; 814 (*txq->elts)[elts_head] = buf; 815 elts_head = (elts_head + 1) & (elts_n - 1); 816 ++j; 817 --segs_n; 818 if (segs_n) 819 goto next_seg; 820 else 821 --pkts_n; 822 next_pkt: 823 ++i; 824 /* Initialize known and common part of the WQE structure. */ 825 if (tso) { 826 wqe->ctrl = (rte_v128u32_t){ 827 htonl((txq->wqe_ci << 8) | MLX5_OPCODE_TSO), 828 htonl(txq->qp_num_8s | ds), 829 0, 830 0, 831 }; 832 wqe->eseg = (rte_v128u32_t){ 833 0, 834 cs_flags | (htons(buf->tso_segsz) << 16), 835 0, 836 (ehdr << 16) | htons(tso_header_sz), 837 }; 838 } else { 839 wqe->ctrl = (rte_v128u32_t){ 840 htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND), 841 htonl(txq->qp_num_8s | ds), 842 0, 843 0, 844 }; 845 wqe->eseg = (rte_v128u32_t){ 846 0, 847 cs_flags, 848 0, 849 (ehdr << 16) | htons(pkt_inline_sz), 850 }; 851 } 852 next_wqe: 853 txq->wqe_ci += (ds + 3) / 4; 854 #ifdef MLX5_PMD_SOFT_COUNTERS 855 /* Increment sent bytes counter. */ 856 txq->stats.obytes += total_length; 857 #endif 858 } while (pkts_n); 859 /* Take a shortcut if nothing must be sent. */ 860 if (unlikely((i + k) == 0)) 861 return 0; 862 /* Check whether completion threshold has been reached. */ 863 comp = txq->elts_comp + i + j + k; 864 if (comp >= MLX5_TX_COMP_THRESH) { 865 volatile struct mlx5_wqe_ctrl *w = 866 (volatile struct mlx5_wqe_ctrl *)wqe; 867 868 /* Request completion on last WQE. */ 869 w->ctrl2 = htonl(8); 870 /* Save elts_head in unused "immediate" field of WQE. */ 871 w->ctrl3 = elts_head; 872 txq->elts_comp = 0; 873 } else { 874 txq->elts_comp = comp; 875 } 876 #ifdef MLX5_PMD_SOFT_COUNTERS 877 /* Increment sent packets counter. */ 878 txq->stats.opackets += i; 879 #endif 880 /* Ring QP doorbell. */ 881 mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)wqe); 882 txq->elts_head = elts_head; 883 return i; 884 } 885 886 /** 887 * Open a MPW session. 888 * 889 * @param txq 890 * Pointer to TX queue structure. 891 * @param mpw 892 * Pointer to MPW session structure. 893 * @param length 894 * Packet length. 895 */ 896 static inline void 897 mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length) 898 { 899 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); 900 volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] = 901 (volatile struct mlx5_wqe_data_seg (*)[]) 902 tx_mlx5_wqe(txq, idx + 1); 903 904 mpw->state = MLX5_MPW_STATE_OPENED; 905 mpw->pkts_n = 0; 906 mpw->len = length; 907 mpw->total_len = 0; 908 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); 909 mpw->wqe->eseg.mss = htons(length); 910 mpw->wqe->eseg.inline_hdr_sz = 0; 911 mpw->wqe->eseg.rsvd0 = 0; 912 mpw->wqe->eseg.rsvd1 = 0; 913 mpw->wqe->eseg.rsvd2 = 0; 914 mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) | 915 (txq->wqe_ci << 8) | MLX5_OPCODE_TSO); 916 mpw->wqe->ctrl[2] = 0; 917 mpw->wqe->ctrl[3] = 0; 918 mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *) 919 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE)); 920 mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *) 921 (((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE)); 922 mpw->data.dseg[2] = &(*dseg)[0]; 923 mpw->data.dseg[3] = &(*dseg)[1]; 924 mpw->data.dseg[4] = &(*dseg)[2]; 925 } 926 927 /** 928 * Close a MPW session. 929 * 930 * @param txq 931 * Pointer to TX queue structure. 932 * @param mpw 933 * Pointer to MPW session structure. 934 */ 935 static inline void 936 mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw) 937 { 938 unsigned int num = mpw->pkts_n; 939 940 /* 941 * Store size in multiple of 16 bytes. Control and Ethernet segments 942 * count as 2. 943 */ 944 mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num)); 945 mpw->state = MLX5_MPW_STATE_CLOSED; 946 if (num < 3) 947 ++txq->wqe_ci; 948 else 949 txq->wqe_ci += 2; 950 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); 951 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 952 } 953 954 /** 955 * DPDK callback for TX with MPW support. 956 * 957 * @param dpdk_txq 958 * Generic pointer to TX queue structure. 959 * @param[in] pkts 960 * Packets to transmit. 961 * @param pkts_n 962 * Number of packets in array. 963 * 964 * @return 965 * Number of packets successfully transmitted (<= pkts_n). 966 */ 967 uint16_t 968 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 969 { 970 struct txq *txq = (struct txq *)dpdk_txq; 971 uint16_t elts_head = txq->elts_head; 972 const unsigned int elts_n = 1 << txq->elts_n; 973 unsigned int i = 0; 974 unsigned int j = 0; 975 unsigned int max; 976 uint16_t max_wqe; 977 unsigned int comp; 978 struct mlx5_mpw mpw = { 979 .state = MLX5_MPW_STATE_CLOSED, 980 }; 981 982 if (unlikely(!pkts_n)) 983 return 0; 984 /* Prefetch first packet cacheline. */ 985 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); 986 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 987 /* Start processing. */ 988 txq_complete(txq); 989 max = (elts_n - (elts_head - txq->elts_tail)); 990 if (max > elts_n) 991 max -= elts_n; 992 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 993 if (unlikely(!max_wqe)) 994 return 0; 995 do { 996 struct rte_mbuf *buf = *(pkts++); 997 unsigned int elts_head_next; 998 uint32_t length; 999 unsigned int segs_n = buf->nb_segs; 1000 uint32_t cs_flags = 0; 1001 1002 /* 1003 * Make sure there is enough room to store this packet and 1004 * that one ring entry remains unused. 1005 */ 1006 assert(segs_n); 1007 if (max < segs_n + 1) 1008 break; 1009 /* Do not bother with large packets MPW cannot handle. */ 1010 if (segs_n > MLX5_MPW_DSEG_MAX) 1011 break; 1012 max -= segs_n; 1013 --pkts_n; 1014 /* Should we enable HW CKSUM offload */ 1015 if (buf->ol_flags & 1016 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) 1017 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; 1018 /* Retrieve packet information. */ 1019 length = PKT_LEN(buf); 1020 assert(length); 1021 /* Start new session if packet differs. */ 1022 if ((mpw.state == MLX5_MPW_STATE_OPENED) && 1023 ((mpw.len != length) || 1024 (segs_n != 1) || 1025 (mpw.wqe->eseg.cs_flags != cs_flags))) 1026 mlx5_mpw_close(txq, &mpw); 1027 if (mpw.state == MLX5_MPW_STATE_CLOSED) { 1028 /* 1029 * Multi-Packet WQE consumes at most two WQE. 1030 * mlx5_mpw_new() expects to be able to use such 1031 * resources. 1032 */ 1033 if (unlikely(max_wqe < 2)) 1034 break; 1035 max_wqe -= 2; 1036 mlx5_mpw_new(txq, &mpw, length); 1037 mpw.wqe->eseg.cs_flags = cs_flags; 1038 } 1039 /* Multi-segment packets must be alone in their MPW. */ 1040 assert((segs_n == 1) || (mpw.pkts_n == 0)); 1041 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1042 length = 0; 1043 #endif 1044 do { 1045 volatile struct mlx5_wqe_data_seg *dseg; 1046 uintptr_t addr; 1047 1048 elts_head_next = (elts_head + 1) & (elts_n - 1); 1049 assert(buf); 1050 (*txq->elts)[elts_head] = buf; 1051 dseg = mpw.data.dseg[mpw.pkts_n]; 1052 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1053 *dseg = (struct mlx5_wqe_data_seg){ 1054 .byte_count = htonl(DATA_LEN(buf)), 1055 .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), 1056 .addr = htonll(addr), 1057 }; 1058 elts_head = elts_head_next; 1059 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1060 length += DATA_LEN(buf); 1061 #endif 1062 buf = buf->next; 1063 ++mpw.pkts_n; 1064 ++j; 1065 } while (--segs_n); 1066 assert(length == mpw.len); 1067 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) 1068 mlx5_mpw_close(txq, &mpw); 1069 elts_head = elts_head_next; 1070 #ifdef MLX5_PMD_SOFT_COUNTERS 1071 /* Increment sent bytes counter. */ 1072 txq->stats.obytes += length; 1073 #endif 1074 ++i; 1075 } while (pkts_n); 1076 /* Take a shortcut if nothing must be sent. */ 1077 if (unlikely(i == 0)) 1078 return 0; 1079 /* Check whether completion threshold has been reached. */ 1080 /* "j" includes both packets and segments. */ 1081 comp = txq->elts_comp + j; 1082 if (comp >= MLX5_TX_COMP_THRESH) { 1083 volatile struct mlx5_wqe *wqe = mpw.wqe; 1084 1085 /* Request completion on last WQE. */ 1086 wqe->ctrl[2] = htonl(8); 1087 /* Save elts_head in unused "immediate" field of WQE. */ 1088 wqe->ctrl[3] = elts_head; 1089 txq->elts_comp = 0; 1090 } else { 1091 txq->elts_comp = comp; 1092 } 1093 #ifdef MLX5_PMD_SOFT_COUNTERS 1094 /* Increment sent packets counter. */ 1095 txq->stats.opackets += i; 1096 #endif 1097 /* Ring QP doorbell. */ 1098 if (mpw.state == MLX5_MPW_STATE_OPENED) 1099 mlx5_mpw_close(txq, &mpw); 1100 mlx5_tx_dbrec(txq, mpw.wqe); 1101 txq->elts_head = elts_head; 1102 return i; 1103 } 1104 1105 /** 1106 * Open a MPW inline session. 1107 * 1108 * @param txq 1109 * Pointer to TX queue structure. 1110 * @param mpw 1111 * Pointer to MPW session structure. 1112 * @param length 1113 * Packet length. 1114 */ 1115 static inline void 1116 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length) 1117 { 1118 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); 1119 struct mlx5_wqe_inl_small *inl; 1120 1121 mpw->state = MLX5_MPW_INL_STATE_OPENED; 1122 mpw->pkts_n = 0; 1123 mpw->len = length; 1124 mpw->total_len = 0; 1125 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); 1126 mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) | 1127 (txq->wqe_ci << 8) | 1128 MLX5_OPCODE_TSO); 1129 mpw->wqe->ctrl[2] = 0; 1130 mpw->wqe->ctrl[3] = 0; 1131 mpw->wqe->eseg.mss = htons(length); 1132 mpw->wqe->eseg.inline_hdr_sz = 0; 1133 mpw->wqe->eseg.cs_flags = 0; 1134 mpw->wqe->eseg.rsvd0 = 0; 1135 mpw->wqe->eseg.rsvd1 = 0; 1136 mpw->wqe->eseg.rsvd2 = 0; 1137 inl = (struct mlx5_wqe_inl_small *) 1138 (((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE); 1139 mpw->data.raw = (uint8_t *)&inl->raw; 1140 } 1141 1142 /** 1143 * Close a MPW inline session. 1144 * 1145 * @param txq 1146 * Pointer to TX queue structure. 1147 * @param mpw 1148 * Pointer to MPW session structure. 1149 */ 1150 static inline void 1151 mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw) 1152 { 1153 unsigned int size; 1154 struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *) 1155 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE)); 1156 1157 size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len; 1158 /* 1159 * Store size in multiple of 16 bytes. Control and Ethernet segments 1160 * count as 2. 1161 */ 1162 mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size)); 1163 mpw->state = MLX5_MPW_STATE_CLOSED; 1164 inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG); 1165 txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE; 1166 } 1167 1168 /** 1169 * DPDK callback for TX with MPW inline support. 1170 * 1171 * @param dpdk_txq 1172 * Generic pointer to TX queue structure. 1173 * @param[in] pkts 1174 * Packets to transmit. 1175 * @param pkts_n 1176 * Number of packets in array. 1177 * 1178 * @return 1179 * Number of packets successfully transmitted (<= pkts_n). 1180 */ 1181 uint16_t 1182 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, 1183 uint16_t pkts_n) 1184 { 1185 struct txq *txq = (struct txq *)dpdk_txq; 1186 uint16_t elts_head = txq->elts_head; 1187 const unsigned int elts_n = 1 << txq->elts_n; 1188 unsigned int i = 0; 1189 unsigned int j = 0; 1190 unsigned int max; 1191 uint16_t max_wqe; 1192 unsigned int comp; 1193 unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE; 1194 struct mlx5_mpw mpw = { 1195 .state = MLX5_MPW_STATE_CLOSED, 1196 }; 1197 /* 1198 * Compute the maximum number of WQE which can be consumed by inline 1199 * code. 1200 * - 2 DSEG for: 1201 * - 1 control segment, 1202 * - 1 Ethernet segment, 1203 * - N Dseg from the inline request. 1204 */ 1205 const unsigned int wqe_inl_n = 1206 ((2 * MLX5_WQE_DWORD_SIZE + 1207 txq->max_inline * RTE_CACHE_LINE_SIZE) + 1208 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE; 1209 1210 if (unlikely(!pkts_n)) 1211 return 0; 1212 /* Prefetch first packet cacheline. */ 1213 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); 1214 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 1215 /* Start processing. */ 1216 txq_complete(txq); 1217 max = (elts_n - (elts_head - txq->elts_tail)); 1218 if (max > elts_n) 1219 max -= elts_n; 1220 do { 1221 struct rte_mbuf *buf = *(pkts++); 1222 unsigned int elts_head_next; 1223 uintptr_t addr; 1224 uint32_t length; 1225 unsigned int segs_n = buf->nb_segs; 1226 uint32_t cs_flags = 0; 1227 1228 /* 1229 * Make sure there is enough room to store this packet and 1230 * that one ring entry remains unused. 1231 */ 1232 assert(segs_n); 1233 if (max < segs_n + 1) 1234 break; 1235 /* Do not bother with large packets MPW cannot handle. */ 1236 if (segs_n > MLX5_MPW_DSEG_MAX) 1237 break; 1238 max -= segs_n; 1239 --pkts_n; 1240 /* 1241 * Compute max_wqe in case less WQE were consumed in previous 1242 * iteration. 1243 */ 1244 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 1245 /* Should we enable HW CKSUM offload */ 1246 if (buf->ol_flags & 1247 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) 1248 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; 1249 /* Retrieve packet information. */ 1250 length = PKT_LEN(buf); 1251 /* Start new session if packet differs. */ 1252 if (mpw.state == MLX5_MPW_STATE_OPENED) { 1253 if ((mpw.len != length) || 1254 (segs_n != 1) || 1255 (mpw.wqe->eseg.cs_flags != cs_flags)) 1256 mlx5_mpw_close(txq, &mpw); 1257 } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) { 1258 if ((mpw.len != length) || 1259 (segs_n != 1) || 1260 (length > inline_room) || 1261 (mpw.wqe->eseg.cs_flags != cs_flags)) { 1262 mlx5_mpw_inline_close(txq, &mpw); 1263 inline_room = 1264 txq->max_inline * RTE_CACHE_LINE_SIZE; 1265 } 1266 } 1267 if (mpw.state == MLX5_MPW_STATE_CLOSED) { 1268 if ((segs_n != 1) || 1269 (length > inline_room)) { 1270 /* 1271 * Multi-Packet WQE consumes at most two WQE. 1272 * mlx5_mpw_new() expects to be able to use 1273 * such resources. 1274 */ 1275 if (unlikely(max_wqe < 2)) 1276 break; 1277 max_wqe -= 2; 1278 mlx5_mpw_new(txq, &mpw, length); 1279 mpw.wqe->eseg.cs_flags = cs_flags; 1280 } else { 1281 if (unlikely(max_wqe < wqe_inl_n)) 1282 break; 1283 max_wqe -= wqe_inl_n; 1284 mlx5_mpw_inline_new(txq, &mpw, length); 1285 mpw.wqe->eseg.cs_flags = cs_flags; 1286 } 1287 } 1288 /* Multi-segment packets must be alone in their MPW. */ 1289 assert((segs_n == 1) || (mpw.pkts_n == 0)); 1290 if (mpw.state == MLX5_MPW_STATE_OPENED) { 1291 assert(inline_room == 1292 txq->max_inline * RTE_CACHE_LINE_SIZE); 1293 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1294 length = 0; 1295 #endif 1296 do { 1297 volatile struct mlx5_wqe_data_seg *dseg; 1298 1299 elts_head_next = 1300 (elts_head + 1) & (elts_n - 1); 1301 assert(buf); 1302 (*txq->elts)[elts_head] = buf; 1303 dseg = mpw.data.dseg[mpw.pkts_n]; 1304 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1305 *dseg = (struct mlx5_wqe_data_seg){ 1306 .byte_count = htonl(DATA_LEN(buf)), 1307 .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), 1308 .addr = htonll(addr), 1309 }; 1310 elts_head = elts_head_next; 1311 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1312 length += DATA_LEN(buf); 1313 #endif 1314 buf = buf->next; 1315 ++mpw.pkts_n; 1316 ++j; 1317 } while (--segs_n); 1318 assert(length == mpw.len); 1319 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) 1320 mlx5_mpw_close(txq, &mpw); 1321 } else { 1322 unsigned int max; 1323 1324 assert(mpw.state == MLX5_MPW_INL_STATE_OPENED); 1325 assert(length <= inline_room); 1326 assert(length == DATA_LEN(buf)); 1327 elts_head_next = (elts_head + 1) & (elts_n - 1); 1328 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1329 (*txq->elts)[elts_head] = buf; 1330 /* Maximum number of bytes before wrapping. */ 1331 max = ((((uintptr_t)(txq->wqes)) + 1332 (1 << txq->wqe_n) * 1333 MLX5_WQE_SIZE) - 1334 (uintptr_t)mpw.data.raw); 1335 if (length > max) { 1336 rte_memcpy((void *)(uintptr_t)mpw.data.raw, 1337 (void *)addr, 1338 max); 1339 mpw.data.raw = (volatile void *)txq->wqes; 1340 rte_memcpy((void *)(uintptr_t)mpw.data.raw, 1341 (void *)(addr + max), 1342 length - max); 1343 mpw.data.raw += length - max; 1344 } else { 1345 rte_memcpy((void *)(uintptr_t)mpw.data.raw, 1346 (void *)addr, 1347 length); 1348 1349 if (length == max) 1350 mpw.data.raw = 1351 (volatile void *)txq->wqes; 1352 else 1353 mpw.data.raw += length; 1354 } 1355 ++mpw.pkts_n; 1356 mpw.total_len += length; 1357 ++j; 1358 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) { 1359 mlx5_mpw_inline_close(txq, &mpw); 1360 inline_room = 1361 txq->max_inline * RTE_CACHE_LINE_SIZE; 1362 } else { 1363 inline_room -= length; 1364 } 1365 } 1366 elts_head = elts_head_next; 1367 #ifdef MLX5_PMD_SOFT_COUNTERS 1368 /* Increment sent bytes counter. */ 1369 txq->stats.obytes += length; 1370 #endif 1371 ++i; 1372 } while (pkts_n); 1373 /* Take a shortcut if nothing must be sent. */ 1374 if (unlikely(i == 0)) 1375 return 0; 1376 /* Check whether completion threshold has been reached. */ 1377 /* "j" includes both packets and segments. */ 1378 comp = txq->elts_comp + j; 1379 if (comp >= MLX5_TX_COMP_THRESH) { 1380 volatile struct mlx5_wqe *wqe = mpw.wqe; 1381 1382 /* Request completion on last WQE. */ 1383 wqe->ctrl[2] = htonl(8); 1384 /* Save elts_head in unused "immediate" field of WQE. */ 1385 wqe->ctrl[3] = elts_head; 1386 txq->elts_comp = 0; 1387 } else { 1388 txq->elts_comp = comp; 1389 } 1390 #ifdef MLX5_PMD_SOFT_COUNTERS 1391 /* Increment sent packets counter. */ 1392 txq->stats.opackets += i; 1393 #endif 1394 /* Ring QP doorbell. */ 1395 if (mpw.state == MLX5_MPW_INL_STATE_OPENED) 1396 mlx5_mpw_inline_close(txq, &mpw); 1397 else if (mpw.state == MLX5_MPW_STATE_OPENED) 1398 mlx5_mpw_close(txq, &mpw); 1399 mlx5_tx_dbrec(txq, mpw.wqe); 1400 txq->elts_head = elts_head; 1401 return i; 1402 } 1403 1404 /** 1405 * Open an Enhanced MPW session. 1406 * 1407 * @param txq 1408 * Pointer to TX queue structure. 1409 * @param mpw 1410 * Pointer to MPW session structure. 1411 * @param length 1412 * Packet length. 1413 */ 1414 static inline void 1415 mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding) 1416 { 1417 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); 1418 1419 mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED; 1420 mpw->pkts_n = 0; 1421 mpw->total_len = sizeof(struct mlx5_wqe); 1422 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); 1423 mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) | 1424 (txq->wqe_ci << 8) | 1425 MLX5_OPCODE_ENHANCED_MPSW); 1426 mpw->wqe->ctrl[2] = 0; 1427 mpw->wqe->ctrl[3] = 0; 1428 memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE); 1429 if (unlikely(padding)) { 1430 uintptr_t addr = (uintptr_t)(mpw->wqe + 1); 1431 1432 /* Pad the first 2 DWORDs with zero-length inline header. */ 1433 *(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG); 1434 *(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) = 1435 htonl(MLX5_INLINE_SEG); 1436 mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE; 1437 /* Start from the next WQEBB. */ 1438 mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1)); 1439 } else { 1440 mpw->data.raw = (volatile void *)(mpw->wqe + 1); 1441 } 1442 } 1443 1444 /** 1445 * Close an Enhanced MPW session. 1446 * 1447 * @param txq 1448 * Pointer to TX queue structure. 1449 * @param mpw 1450 * Pointer to MPW session structure. 1451 * 1452 * @return 1453 * Number of consumed WQEs. 1454 */ 1455 static inline uint16_t 1456 mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw) 1457 { 1458 uint16_t ret; 1459 1460 /* Store size in multiple of 16 bytes. Control and Ethernet segments 1461 * count as 2. 1462 */ 1463 mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len)); 1464 mpw->state = MLX5_MPW_STATE_CLOSED; 1465 ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE; 1466 txq->wqe_ci += ret; 1467 return ret; 1468 } 1469 1470 /** 1471 * DPDK callback for TX with Enhanced MPW support. 1472 * 1473 * @param dpdk_txq 1474 * Generic pointer to TX queue structure. 1475 * @param[in] pkts 1476 * Packets to transmit. 1477 * @param pkts_n 1478 * Number of packets in array. 1479 * 1480 * @return 1481 * Number of packets successfully transmitted (<= pkts_n). 1482 */ 1483 uint16_t 1484 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 1485 { 1486 struct txq *txq = (struct txq *)dpdk_txq; 1487 uint16_t elts_head = txq->elts_head; 1488 const unsigned int elts_n = 1 << txq->elts_n; 1489 unsigned int i = 0; 1490 unsigned int j = 0; 1491 unsigned int max_elts; 1492 uint16_t max_wqe; 1493 unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE; 1494 unsigned int mpw_room = 0; 1495 unsigned int inl_pad = 0; 1496 uint32_t inl_hdr; 1497 struct mlx5_mpw mpw = { 1498 .state = MLX5_MPW_STATE_CLOSED, 1499 }; 1500 1501 if (unlikely(!pkts_n)) 1502 return 0; 1503 /* Start processing. */ 1504 txq_complete(txq); 1505 max_elts = (elts_n - (elts_head - txq->elts_tail)); 1506 if (max_elts > elts_n) 1507 max_elts -= elts_n; 1508 /* A CQE slot must always be available. */ 1509 assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); 1510 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 1511 if (unlikely(!max_wqe)) 1512 return 0; 1513 do { 1514 struct rte_mbuf *buf = *(pkts++); 1515 unsigned int elts_head_next; 1516 uintptr_t addr; 1517 uint64_t naddr; 1518 unsigned int n; 1519 unsigned int do_inline = 0; /* Whether inline is possible. */ 1520 uint32_t length; 1521 unsigned int segs_n = buf->nb_segs; 1522 uint32_t cs_flags = 0; 1523 1524 /* 1525 * Make sure there is enough room to store this packet and 1526 * that one ring entry remains unused. 1527 */ 1528 assert(segs_n); 1529 if (max_elts - j < segs_n + 1) 1530 break; 1531 /* Do not bother with large packets MPW cannot handle. */ 1532 if (segs_n > MLX5_MPW_DSEG_MAX) 1533 break; 1534 /* Should we enable HW CKSUM offload. */ 1535 if (buf->ol_flags & 1536 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) 1537 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; 1538 /* Retrieve packet information. */ 1539 length = PKT_LEN(buf); 1540 /* Start new session if: 1541 * - multi-segment packet 1542 * - no space left even for a dseg 1543 * - next packet can be inlined with a new WQE 1544 * - cs_flag differs 1545 * It can't be MLX5_MPW_STATE_OPENED as always have a single 1546 * segmented packet. 1547 */ 1548 if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) { 1549 if ((segs_n != 1) || 1550 (inl_pad + sizeof(struct mlx5_wqe_data_seg) > 1551 mpw_room) || 1552 (length <= txq->inline_max_packet_sz && 1553 inl_pad + sizeof(inl_hdr) + length > 1554 mpw_room) || 1555 (mpw.wqe->eseg.cs_flags != cs_flags)) 1556 max_wqe -= mlx5_empw_close(txq, &mpw); 1557 } 1558 if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) { 1559 if (unlikely(segs_n != 1)) { 1560 /* Fall back to legacy MPW. 1561 * A MPW session consumes 2 WQEs at most to 1562 * include MLX5_MPW_DSEG_MAX pointers. 1563 */ 1564 if (unlikely(max_wqe < 2)) 1565 break; 1566 mlx5_mpw_new(txq, &mpw, length); 1567 } else { 1568 /* In Enhanced MPW, inline as much as the budget 1569 * is allowed. The remaining space is to be 1570 * filled with dsegs. If the title WQEBB isn't 1571 * padded, it will have 2 dsegs there. 1572 */ 1573 mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX, 1574 (max_inline ? max_inline : 1575 pkts_n * MLX5_WQE_DWORD_SIZE) + 1576 MLX5_WQE_SIZE); 1577 if (unlikely(max_wqe * MLX5_WQE_SIZE < 1578 mpw_room)) 1579 break; 1580 /* Don't pad the title WQEBB to not waste WQ. */ 1581 mlx5_empw_new(txq, &mpw, 0); 1582 mpw_room -= mpw.total_len; 1583 inl_pad = 0; 1584 do_inline = 1585 length <= txq->inline_max_packet_sz && 1586 sizeof(inl_hdr) + length <= mpw_room && 1587 !txq->mpw_hdr_dseg; 1588 } 1589 mpw.wqe->eseg.cs_flags = cs_flags; 1590 } else { 1591 /* Evaluate whether the next packet can be inlined. 1592 * Inlininig is possible when: 1593 * - length is less than configured value 1594 * - length fits for remaining space 1595 * - not required to fill the title WQEBB with dsegs 1596 */ 1597 do_inline = 1598 length <= txq->inline_max_packet_sz && 1599 inl_pad + sizeof(inl_hdr) + length <= 1600 mpw_room && 1601 (!txq->mpw_hdr_dseg || 1602 mpw.total_len >= MLX5_WQE_SIZE); 1603 } 1604 /* Multi-segment packets must be alone in their MPW. */ 1605 assert((segs_n == 1) || (mpw.pkts_n == 0)); 1606 if (unlikely(mpw.state == MLX5_MPW_STATE_OPENED)) { 1607 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1608 length = 0; 1609 #endif 1610 do { 1611 volatile struct mlx5_wqe_data_seg *dseg; 1612 1613 elts_head_next = 1614 (elts_head + 1) & (elts_n - 1); 1615 assert(buf); 1616 (*txq->elts)[elts_head] = buf; 1617 dseg = mpw.data.dseg[mpw.pkts_n]; 1618 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1619 *dseg = (struct mlx5_wqe_data_seg){ 1620 .byte_count = htonl(DATA_LEN(buf)), 1621 .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), 1622 .addr = htonll(addr), 1623 }; 1624 elts_head = elts_head_next; 1625 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1626 length += DATA_LEN(buf); 1627 #endif 1628 buf = buf->next; 1629 ++j; 1630 ++mpw.pkts_n; 1631 } while (--segs_n); 1632 /* A multi-segmented packet takes one MPW session. 1633 * TODO: Pack more multi-segmented packets if possible. 1634 */ 1635 mlx5_mpw_close(txq, &mpw); 1636 if (mpw.pkts_n < 3) 1637 max_wqe--; 1638 else 1639 max_wqe -= 2; 1640 } else if (do_inline) { 1641 /* Inline packet into WQE. */ 1642 unsigned int max; 1643 1644 assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED); 1645 assert(length == DATA_LEN(buf)); 1646 inl_hdr = htonl(length | MLX5_INLINE_SEG); 1647 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1648 mpw.data.raw = (volatile void *) 1649 ((uintptr_t)mpw.data.raw + inl_pad); 1650 max = tx_mlx5_wq_tailroom(txq, 1651 (void *)(uintptr_t)mpw.data.raw); 1652 /* Copy inline header. */ 1653 mpw.data.raw = (volatile void *) 1654 mlx5_copy_to_wq( 1655 (void *)(uintptr_t)mpw.data.raw, 1656 &inl_hdr, 1657 sizeof(inl_hdr), 1658 (void *)(uintptr_t)txq->wqes, 1659 max); 1660 max = tx_mlx5_wq_tailroom(txq, 1661 (void *)(uintptr_t)mpw.data.raw); 1662 /* Copy packet data. */ 1663 mpw.data.raw = (volatile void *) 1664 mlx5_copy_to_wq( 1665 (void *)(uintptr_t)mpw.data.raw, 1666 (void *)addr, 1667 length, 1668 (void *)(uintptr_t)txq->wqes, 1669 max); 1670 ++mpw.pkts_n; 1671 mpw.total_len += (inl_pad + sizeof(inl_hdr) + length); 1672 /* No need to get completion as the entire packet is 1673 * copied to WQ. Free the buf right away. 1674 */ 1675 elts_head_next = elts_head; 1676 rte_pktmbuf_free_seg(buf); 1677 mpw_room -= (inl_pad + sizeof(inl_hdr) + length); 1678 /* Add pad in the next packet if any. */ 1679 inl_pad = (((uintptr_t)mpw.data.raw + 1680 (MLX5_WQE_DWORD_SIZE - 1)) & 1681 ~(MLX5_WQE_DWORD_SIZE - 1)) - 1682 (uintptr_t)mpw.data.raw; 1683 } else { 1684 /* No inline. Load a dseg of packet pointer. */ 1685 volatile rte_v128u32_t *dseg; 1686 1687 assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED); 1688 assert((inl_pad + sizeof(*dseg)) <= mpw_room); 1689 assert(length == DATA_LEN(buf)); 1690 if (!tx_mlx5_wq_tailroom(txq, 1691 (void *)((uintptr_t)mpw.data.raw 1692 + inl_pad))) 1693 dseg = (volatile void *)txq->wqes; 1694 else 1695 dseg = (volatile void *) 1696 ((uintptr_t)mpw.data.raw + 1697 inl_pad); 1698 elts_head_next = (elts_head + 1) & (elts_n - 1); 1699 (*txq->elts)[elts_head] = buf; 1700 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1701 for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++) 1702 rte_prefetch2((void *)(addr + 1703 n * RTE_CACHE_LINE_SIZE)); 1704 naddr = htonll(addr); 1705 *dseg = (rte_v128u32_t) { 1706 htonl(length), 1707 txq_mp2mr(txq, txq_mb2mp(buf)), 1708 naddr, 1709 naddr >> 32, 1710 }; 1711 mpw.data.raw = (volatile void *)(dseg + 1); 1712 mpw.total_len += (inl_pad + sizeof(*dseg)); 1713 ++j; 1714 ++mpw.pkts_n; 1715 mpw_room -= (inl_pad + sizeof(*dseg)); 1716 inl_pad = 0; 1717 } 1718 elts_head = elts_head_next; 1719 #ifdef MLX5_PMD_SOFT_COUNTERS 1720 /* Increment sent bytes counter. */ 1721 txq->stats.obytes += length; 1722 #endif 1723 ++i; 1724 } while (i < pkts_n); 1725 /* Take a shortcut if nothing must be sent. */ 1726 if (unlikely(i == 0)) 1727 return 0; 1728 /* Check whether completion threshold has been reached. */ 1729 if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH || 1730 (uint16_t)(txq->wqe_ci - txq->mpw_comp) >= 1731 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) { 1732 volatile struct mlx5_wqe *wqe = mpw.wqe; 1733 1734 /* Request completion on last WQE. */ 1735 wqe->ctrl[2] = htonl(8); 1736 /* Save elts_head in unused "immediate" field of WQE. */ 1737 wqe->ctrl[3] = elts_head; 1738 txq->elts_comp = 0; 1739 txq->mpw_comp = txq->wqe_ci; 1740 txq->cq_pi++; 1741 } else { 1742 txq->elts_comp += j; 1743 } 1744 #ifdef MLX5_PMD_SOFT_COUNTERS 1745 /* Increment sent packets counter. */ 1746 txq->stats.opackets += i; 1747 #endif 1748 if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) 1749 mlx5_empw_close(txq, &mpw); 1750 else if (mpw.state == MLX5_MPW_STATE_OPENED) 1751 mlx5_mpw_close(txq, &mpw); 1752 /* Ring QP doorbell. */ 1753 mlx5_tx_dbrec(txq, mpw.wqe); 1754 txq->elts_head = elts_head; 1755 return i; 1756 } 1757 1758 /** 1759 * Translate RX completion flags to packet type. 1760 * 1761 * @param[in] cqe 1762 * Pointer to CQE. 1763 * 1764 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 1765 * 1766 * @return 1767 * Packet type for struct rte_mbuf. 1768 */ 1769 static inline uint32_t 1770 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) 1771 { 1772 uint32_t pkt_type; 1773 uint16_t flags = ntohs(cqe->hdr_type_etc); 1774 1775 if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) { 1776 pkt_type = 1777 TRANSPOSE(flags, 1778 MLX5_CQE_RX_IPV4_PACKET, 1779 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) | 1780 TRANSPOSE(flags, 1781 MLX5_CQE_RX_IPV6_PACKET, 1782 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN); 1783 pkt_type |= ((cqe->pkt_info & MLX5_CQE_RX_OUTER_PACKET) ? 1784 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN : 1785 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN); 1786 } else { 1787 pkt_type = 1788 TRANSPOSE(flags, 1789 MLX5_CQE_L3_HDR_TYPE_IPV6, 1790 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) | 1791 TRANSPOSE(flags, 1792 MLX5_CQE_L3_HDR_TYPE_IPV4, 1793 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN); 1794 } 1795 return pkt_type; 1796 } 1797 1798 /** 1799 * Get size of the next packet for a given CQE. For compressed CQEs, the 1800 * consumer index is updated only once all packets of the current one have 1801 * been processed. 1802 * 1803 * @param rxq 1804 * Pointer to RX queue. 1805 * @param cqe 1806 * CQE to process. 1807 * @param[out] rss_hash 1808 * Packet RSS Hash result. 1809 * 1810 * @return 1811 * Packet size in bytes (0 if there is none), -1 in case of completion 1812 * with error. 1813 */ 1814 static inline int 1815 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe, 1816 uint16_t cqe_cnt, uint32_t *rss_hash) 1817 { 1818 struct rxq_zip *zip = &rxq->zip; 1819 uint16_t cqe_n = cqe_cnt + 1; 1820 int len = 0; 1821 uint16_t idx, end; 1822 1823 /* Process compressed data in the CQE and mini arrays. */ 1824 if (zip->ai) { 1825 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1826 (volatile struct mlx5_mini_cqe8 (*)[8]) 1827 (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt]); 1828 1829 len = ntohl((*mc)[zip->ai & 7].byte_cnt); 1830 *rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result); 1831 if ((++zip->ai & 7) == 0) { 1832 /* Invalidate consumed CQEs */ 1833 idx = zip->ca; 1834 end = zip->na; 1835 while (idx != end) { 1836 (*rxq->cqes)[idx & cqe_cnt].op_own = 1837 MLX5_CQE_INVALIDATE; 1838 ++idx; 1839 } 1840 /* 1841 * Increment consumer index to skip the number of 1842 * CQEs consumed. Hardware leaves holes in the CQ 1843 * ring for software use. 1844 */ 1845 zip->ca = zip->na; 1846 zip->na += 8; 1847 } 1848 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 1849 /* Invalidate the rest */ 1850 idx = zip->ca; 1851 end = zip->cq_ci; 1852 1853 while (idx != end) { 1854 (*rxq->cqes)[idx & cqe_cnt].op_own = 1855 MLX5_CQE_INVALIDATE; 1856 ++idx; 1857 } 1858 rxq->cq_ci = zip->cq_ci; 1859 zip->ai = 0; 1860 } 1861 /* No compressed data, get next CQE and verify if it is compressed. */ 1862 } else { 1863 int ret; 1864 int8_t op_own; 1865 1866 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 1867 if (unlikely(ret == 1)) 1868 return 0; 1869 ++rxq->cq_ci; 1870 op_own = cqe->op_own; 1871 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 1872 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1873 (volatile struct mlx5_mini_cqe8 (*)[8]) 1874 (uintptr_t)(&(*rxq->cqes)[rxq->cq_ci & 1875 cqe_cnt]); 1876 1877 /* Fix endianness. */ 1878 zip->cqe_cnt = ntohl(cqe->byte_cnt); 1879 /* 1880 * Current mini array position is the one returned by 1881 * check_cqe64(). 1882 * 1883 * If completion comprises several mini arrays, as a 1884 * special case the second one is located 7 CQEs after 1885 * the initial CQE instead of 8 for subsequent ones. 1886 */ 1887 zip->ca = rxq->cq_ci; 1888 zip->na = zip->ca + 7; 1889 /* Compute the next non compressed CQE. */ 1890 --rxq->cq_ci; 1891 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 1892 /* Get packet size to return. */ 1893 len = ntohl((*mc)[0].byte_cnt); 1894 *rss_hash = ntohl((*mc)[0].rx_hash_result); 1895 zip->ai = 1; 1896 /* Prefetch all the entries to be invalidated */ 1897 idx = zip->ca; 1898 end = zip->cq_ci; 1899 while (idx != end) { 1900 rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]); 1901 ++idx; 1902 } 1903 } else { 1904 len = ntohl(cqe->byte_cnt); 1905 *rss_hash = ntohl(cqe->rx_hash_res); 1906 } 1907 /* Error while receiving packet. */ 1908 if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR)) 1909 return -1; 1910 } 1911 return len; 1912 } 1913 1914 /** 1915 * Translate RX completion flags to offload flags. 1916 * 1917 * @param[in] rxq 1918 * Pointer to RX queue structure. 1919 * @param[in] cqe 1920 * Pointer to CQE. 1921 * 1922 * @return 1923 * Offload flags (ol_flags) for struct rte_mbuf. 1924 */ 1925 static inline uint32_t 1926 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe) 1927 { 1928 uint32_t ol_flags = 0; 1929 uint16_t flags = ntohs(cqe->hdr_type_etc); 1930 1931 ol_flags = 1932 TRANSPOSE(flags, 1933 MLX5_CQE_RX_L3_HDR_VALID, 1934 PKT_RX_IP_CKSUM_GOOD) | 1935 TRANSPOSE(flags, 1936 MLX5_CQE_RX_L4_HDR_VALID, 1937 PKT_RX_L4_CKSUM_GOOD); 1938 if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) 1939 ol_flags |= 1940 TRANSPOSE(flags, 1941 MLX5_CQE_RX_L3_HDR_VALID, 1942 PKT_RX_IP_CKSUM_GOOD) | 1943 TRANSPOSE(flags, 1944 MLX5_CQE_RX_L4_HDR_VALID, 1945 PKT_RX_L4_CKSUM_GOOD); 1946 return ol_flags; 1947 } 1948 1949 /** 1950 * DPDK callback for RX. 1951 * 1952 * @param dpdk_rxq 1953 * Generic pointer to RX queue structure. 1954 * @param[out] pkts 1955 * Array to store received packets. 1956 * @param pkts_n 1957 * Maximum number of packets in array. 1958 * 1959 * @return 1960 * Number of packets successfully received (<= pkts_n). 1961 */ 1962 uint16_t 1963 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1964 { 1965 struct rxq *rxq = dpdk_rxq; 1966 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 1967 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 1968 const unsigned int sges_n = rxq->sges_n; 1969 struct rte_mbuf *pkt = NULL; 1970 struct rte_mbuf *seg = NULL; 1971 volatile struct mlx5_cqe *cqe = 1972 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1973 unsigned int i = 0; 1974 unsigned int rq_ci = rxq->rq_ci << sges_n; 1975 int len; /* keep its value across iterations. */ 1976 1977 while (pkts_n) { 1978 unsigned int idx = rq_ci & wqe_cnt; 1979 volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx]; 1980 struct rte_mbuf *rep = (*rxq->elts)[idx]; 1981 uint32_t rss_hash_res = 0; 1982 1983 if (pkt) 1984 NEXT(seg) = rep; 1985 seg = rep; 1986 rte_prefetch0(seg); 1987 rte_prefetch0(cqe); 1988 rte_prefetch0(wqe); 1989 rep = rte_mbuf_raw_alloc(rxq->mp); 1990 if (unlikely(rep == NULL)) { 1991 ++rxq->stats.rx_nombuf; 1992 if (!pkt) { 1993 /* 1994 * no buffers before we even started, 1995 * bail out silently. 1996 */ 1997 break; 1998 } 1999 while (pkt != seg) { 2000 assert(pkt != (*rxq->elts)[idx]); 2001 rep = NEXT(pkt); 2002 rte_mbuf_refcnt_set(pkt, 0); 2003 __rte_mbuf_raw_free(pkt); 2004 pkt = rep; 2005 } 2006 break; 2007 } 2008 if (!pkt) { 2009 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 2010 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, 2011 &rss_hash_res); 2012 if (!len) { 2013 rte_mbuf_refcnt_set(rep, 0); 2014 __rte_mbuf_raw_free(rep); 2015 break; 2016 } 2017 if (unlikely(len == -1)) { 2018 /* RX error, packet is likely too large. */ 2019 rte_mbuf_refcnt_set(rep, 0); 2020 __rte_mbuf_raw_free(rep); 2021 ++rxq->stats.idropped; 2022 goto skip; 2023 } 2024 pkt = seg; 2025 assert(len >= (rxq->crc_present << 2)); 2026 /* Update packet information. */ 2027 pkt->packet_type = 0; 2028 pkt->ol_flags = 0; 2029 if (rss_hash_res && rxq->rss_hash) { 2030 pkt->hash.rss = rss_hash_res; 2031 pkt->ol_flags = PKT_RX_RSS_HASH; 2032 } 2033 if (rxq->mark && (cqe->sop_drop_qpn != 2034 htonl(MLX5_FLOW_MARK_INVALID))) { 2035 pkt->ol_flags |= PKT_RX_FDIR; 2036 if (cqe->sop_drop_qpn != 2037 htonl(MLX5_FLOW_MARK_DEFAULT)) { 2038 uint32_t mark = cqe->sop_drop_qpn; 2039 2040 pkt->ol_flags |= PKT_RX_FDIR_ID; 2041 pkt->hash.fdir.hi = 2042 mlx5_flow_mark_get(mark); 2043 } 2044 } 2045 if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip | 2046 rxq->crc_present) { 2047 if (rxq->csum) { 2048 pkt->packet_type = 2049 rxq_cq_to_pkt_type(cqe); 2050 pkt->ol_flags |= 2051 rxq_cq_to_ol_flags(rxq, cqe); 2052 } 2053 if (ntohs(cqe->hdr_type_etc) & 2054 MLX5_CQE_VLAN_STRIPPED) { 2055 pkt->ol_flags |= PKT_RX_VLAN_PKT | 2056 PKT_RX_VLAN_STRIPPED; 2057 pkt->vlan_tci = ntohs(cqe->vlan_info); 2058 } 2059 if (rxq->crc_present) 2060 len -= ETHER_CRC_LEN; 2061 } 2062 PKT_LEN(pkt) = len; 2063 } 2064 DATA_LEN(rep) = DATA_LEN(seg); 2065 PKT_LEN(rep) = PKT_LEN(seg); 2066 SET_DATA_OFF(rep, DATA_OFF(seg)); 2067 NB_SEGS(rep) = NB_SEGS(seg); 2068 PORT(rep) = PORT(seg); 2069 NEXT(rep) = NULL; 2070 (*rxq->elts)[idx] = rep; 2071 /* 2072 * Fill NIC descriptor with the new buffer. The lkey and size 2073 * of the buffers are already known, only the buffer address 2074 * changes. 2075 */ 2076 wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t)); 2077 if (len > DATA_LEN(seg)) { 2078 len -= DATA_LEN(seg); 2079 ++NB_SEGS(pkt); 2080 ++rq_ci; 2081 continue; 2082 } 2083 DATA_LEN(seg) = len; 2084 #ifdef MLX5_PMD_SOFT_COUNTERS 2085 /* Increment bytes counter. */ 2086 rxq->stats.ibytes += PKT_LEN(pkt); 2087 #endif 2088 /* Return packet. */ 2089 *(pkts++) = pkt; 2090 pkt = NULL; 2091 --pkts_n; 2092 ++i; 2093 skip: 2094 /* Align consumer index to the next stride. */ 2095 rq_ci >>= sges_n; 2096 ++rq_ci; 2097 rq_ci <<= sges_n; 2098 } 2099 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) 2100 return 0; 2101 /* Update the consumer index. */ 2102 rxq->rq_ci = rq_ci >> sges_n; 2103 rte_wmb(); 2104 *rxq->cq_db = htonl(rxq->cq_ci); 2105 rte_wmb(); 2106 *rxq->rq_db = htonl(rxq->rq_ci); 2107 #ifdef MLX5_PMD_SOFT_COUNTERS 2108 /* Increment packets counter. */ 2109 rxq->stats.ipackets += i; 2110 #endif 2111 return i; 2112 } 2113 2114 /** 2115 * Dummy DPDK callback for TX. 2116 * 2117 * This function is used to temporarily replace the real callback during 2118 * unsafe control operations on the queue, or in case of error. 2119 * 2120 * @param dpdk_txq 2121 * Generic pointer to TX queue structure. 2122 * @param[in] pkts 2123 * Packets to transmit. 2124 * @param pkts_n 2125 * Number of packets in array. 2126 * 2127 * @return 2128 * Number of packets successfully transmitted (<= pkts_n). 2129 */ 2130 uint16_t 2131 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 2132 { 2133 (void)dpdk_txq; 2134 (void)pkts; 2135 (void)pkts_n; 2136 return 0; 2137 } 2138 2139 /** 2140 * Dummy DPDK callback for RX. 2141 * 2142 * This function is used to temporarily replace the real callback during 2143 * unsafe control operations on the queue, or in case of error. 2144 * 2145 * @param dpdk_rxq 2146 * Generic pointer to RX queue structure. 2147 * @param[out] pkts 2148 * Array to store received packets. 2149 * @param pkts_n 2150 * Maximum number of packets in array. 2151 * 2152 * @return 2153 * Number of packets successfully received (<= pkts_n). 2154 */ 2155 uint16_t 2156 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 2157 { 2158 (void)dpdk_rxq; 2159 (void)pkts; 2160 (void)pkts_n; 2161 return 0; 2162 } 2163 2164 /** 2165 * DPDK callback for rx queue interrupt enable. 2166 * 2167 * @param dev 2168 * Pointer to Ethernet device structure. 2169 * @param rx_queue_id 2170 * RX queue number 2171 * 2172 * @return 2173 * 0 on success, negative on failure. 2174 */ 2175 int 2176 mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id) 2177 { 2178 #ifdef HAVE_UPDATE_CQ_CI 2179 struct priv *priv = mlx5_get_priv(dev); 2180 struct rxq *rxq = (*priv->rxqs)[rx_queue_id]; 2181 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); 2182 struct ibv_cq *cq = rxq_ctrl->cq; 2183 uint16_t ci = rxq->cq_ci; 2184 int ret = 0; 2185 2186 ibv_mlx5_exp_update_cq_ci(cq, ci); 2187 ret = ibv_req_notify_cq(cq, 0); 2188 #else 2189 int ret = -1; 2190 (void)dev; 2191 (void)rx_queue_id; 2192 #endif 2193 if (ret) 2194 WARN("unable to arm interrupt on rx queue %d", rx_queue_id); 2195 return ret; 2196 } 2197 2198 /** 2199 * DPDK callback for rx queue interrupt disable. 2200 * 2201 * @param dev 2202 * Pointer to Ethernet device structure. 2203 * @param rx_queue_id 2204 * RX queue number 2205 * 2206 * @return 2207 * 0 on success, negative on failure. 2208 */ 2209 int 2210 mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id) 2211 { 2212 #ifdef HAVE_UPDATE_CQ_CI 2213 struct priv *priv = mlx5_get_priv(dev); 2214 struct rxq *rxq = (*priv->rxqs)[rx_queue_id]; 2215 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); 2216 struct ibv_cq *cq = rxq_ctrl->cq; 2217 struct ibv_cq *ev_cq; 2218 void *ev_ctx; 2219 int ret = 0; 2220 2221 ret = ibv_get_cq_event(cq->channel, &ev_cq, &ev_ctx); 2222 if (ret || ev_cq != cq) 2223 ret = -1; 2224 else 2225 ibv_ack_cq_events(cq, 1); 2226 #else 2227 int ret = -1; 2228 (void)dev; 2229 (void)rx_queue_id; 2230 #endif 2231 if (ret) 2232 WARN("unable to disable interrupt on rx queue %d", 2233 rx_queue_id); 2234 return ret; 2235 } 2236