1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <assert.h> 35 #include <stdint.h> 36 #include <string.h> 37 #include <stdlib.h> 38 39 /* Verbs header. */ 40 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ 41 #ifdef PEDANTIC 42 #pragma GCC diagnostic ignored "-Wpedantic" 43 #endif 44 #include <infiniband/verbs.h> 45 #include <infiniband/mlx5_hw.h> 46 #include <infiniband/arch.h> 47 #ifdef PEDANTIC 48 #pragma GCC diagnostic error "-Wpedantic" 49 #endif 50 51 /* DPDK headers don't like -pedantic. */ 52 #ifdef PEDANTIC 53 #pragma GCC diagnostic ignored "-Wpedantic" 54 #endif 55 #include <rte_mbuf.h> 56 #include <rte_mempool.h> 57 #include <rte_prefetch.h> 58 #include <rte_common.h> 59 #include <rte_branch_prediction.h> 60 #include <rte_ether.h> 61 #ifdef PEDANTIC 62 #pragma GCC diagnostic error "-Wpedantic" 63 #endif 64 65 #include "mlx5.h" 66 #include "mlx5_utils.h" 67 #include "mlx5_rxtx.h" 68 #include "mlx5_autoconf.h" 69 #include "mlx5_defs.h" 70 #include "mlx5_prm.h" 71 72 static inline int 73 check_cqe(volatile struct mlx5_cqe *cqe, 74 unsigned int cqes_n, const uint16_t ci) 75 __attribute__((always_inline)); 76 77 static inline void 78 txq_complete(struct txq *txq) __attribute__((always_inline)); 79 80 static inline uint32_t 81 txq_mp2mr(struct txq *txq, struct rte_mempool *mp) 82 __attribute__((always_inline)); 83 84 static inline void 85 mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe) 86 __attribute__((always_inline)); 87 88 static inline uint32_t 89 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) 90 __attribute__((always_inline)); 91 92 static inline int 93 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe, 94 uint16_t cqe_cnt, uint32_t *rss_hash) 95 __attribute__((always_inline)); 96 97 static inline uint32_t 98 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe) 99 __attribute__((always_inline)); 100 101 #ifndef NDEBUG 102 103 /** 104 * Verify or set magic value in CQE. 105 * 106 * @param cqe 107 * Pointer to CQE. 108 * 109 * @return 110 * 0 the first time. 111 */ 112 static inline int 113 check_cqe_seen(volatile struct mlx5_cqe *cqe) 114 { 115 static const uint8_t magic[] = "seen"; 116 volatile uint8_t (*buf)[sizeof(cqe->rsvd0)] = &cqe->rsvd0; 117 int ret = 1; 118 unsigned int i; 119 120 for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i) 121 if (!ret || (*buf)[i] != magic[i]) { 122 ret = 0; 123 (*buf)[i] = magic[i]; 124 } 125 return ret; 126 } 127 128 #endif /* NDEBUG */ 129 130 /** 131 * Check whether CQE is valid. 132 * 133 * @param cqe 134 * Pointer to CQE. 135 * @param cqes_n 136 * Size of completion queue. 137 * @param ci 138 * Consumer index. 139 * 140 * @return 141 * 0 on success, 1 on failure. 142 */ 143 static inline int 144 check_cqe(volatile struct mlx5_cqe *cqe, 145 unsigned int cqes_n, const uint16_t ci) 146 { 147 uint16_t idx = ci & cqes_n; 148 uint8_t op_own = cqe->op_own; 149 uint8_t op_owner = MLX5_CQE_OWNER(op_own); 150 uint8_t op_code = MLX5_CQE_OPCODE(op_own); 151 152 if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID))) 153 return 1; /* No CQE. */ 154 #ifndef NDEBUG 155 if ((op_code == MLX5_CQE_RESP_ERR) || 156 (op_code == MLX5_CQE_REQ_ERR)) { 157 volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe; 158 uint8_t syndrome = err_cqe->syndrome; 159 160 if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) || 161 (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR)) 162 return 0; 163 if (!check_cqe_seen(cqe)) 164 ERROR("unexpected CQE error %u (0x%02x)" 165 " syndrome 0x%02x", 166 op_code, op_code, syndrome); 167 return 1; 168 } else if ((op_code != MLX5_CQE_RESP_SEND) && 169 (op_code != MLX5_CQE_REQ)) { 170 if (!check_cqe_seen(cqe)) 171 ERROR("unexpected CQE opcode %u (0x%02x)", 172 op_code, op_code); 173 return 1; 174 } 175 #endif /* NDEBUG */ 176 return 0; 177 } 178 179 /** 180 * Return the address of the WQE. 181 * 182 * @param txq 183 * Pointer to TX queue structure. 184 * @param wqe_ci 185 * WQE consumer index. 186 * 187 * @return 188 * WQE address. 189 */ 190 static inline uintptr_t * 191 tx_mlx5_wqe(struct txq *txq, uint16_t ci) 192 { 193 ci &= ((1 << txq->wqe_n) - 1); 194 return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE); 195 } 196 197 /** 198 * Return the size of tailroom of WQ. 199 * 200 * @param txq 201 * Pointer to TX queue structure. 202 * @param addr 203 * Pointer to tail of WQ. 204 * 205 * @return 206 * Size of tailroom. 207 */ 208 static inline size_t 209 tx_mlx5_wq_tailroom(struct txq *txq, void *addr) 210 { 211 size_t tailroom; 212 tailroom = (uintptr_t)(txq->wqes) + 213 (1 << txq->wqe_n) * MLX5_WQE_SIZE - 214 (uintptr_t)addr; 215 return tailroom; 216 } 217 218 /** 219 * Copy data to tailroom of circular queue. 220 * 221 * @param dst 222 * Pointer to destination. 223 * @param src 224 * Pointer to source. 225 * @param n 226 * Number of bytes to copy. 227 * @param base 228 * Pointer to head of queue. 229 * @param tailroom 230 * Size of tailroom from dst. 231 * 232 * @return 233 * Pointer after copied data. 234 */ 235 static inline void * 236 mlx5_copy_to_wq(void *dst, const void *src, size_t n, 237 void *base, size_t tailroom) 238 { 239 void *ret; 240 241 if (n > tailroom) { 242 rte_memcpy(dst, src, tailroom); 243 rte_memcpy(base, (void *)((uintptr_t)src + tailroom), 244 n - tailroom); 245 ret = (uint8_t *)base + n - tailroom; 246 } else { 247 rte_memcpy(dst, src, n); 248 ret = (n == tailroom) ? base : (uint8_t *)dst + n; 249 } 250 return ret; 251 } 252 253 /** 254 * Manage TX completions. 255 * 256 * When sending a burst, mlx5_tx_burst() posts several WRs. 257 * 258 * @param txq 259 * Pointer to TX queue structure. 260 */ 261 static inline void 262 txq_complete(struct txq *txq) 263 { 264 const unsigned int elts_n = 1 << txq->elts_n; 265 const unsigned int cqe_n = 1 << txq->cqe_n; 266 const unsigned int cqe_cnt = cqe_n - 1; 267 uint16_t elts_free = txq->elts_tail; 268 uint16_t elts_tail; 269 uint16_t cq_ci = txq->cq_ci; 270 volatile struct mlx5_cqe *cqe = NULL; 271 volatile struct mlx5_wqe_ctrl *ctrl; 272 273 do { 274 volatile struct mlx5_cqe *tmp; 275 276 tmp = &(*txq->cqes)[cq_ci & cqe_cnt]; 277 if (check_cqe(tmp, cqe_n, cq_ci)) 278 break; 279 cqe = tmp; 280 #ifndef NDEBUG 281 if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) { 282 if (!check_cqe_seen(cqe)) 283 ERROR("unexpected compressed CQE, TX stopped"); 284 return; 285 } 286 if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) || 287 (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) { 288 if (!check_cqe_seen(cqe)) 289 ERROR("unexpected error CQE, TX stopped"); 290 return; 291 } 292 #endif /* NDEBUG */ 293 ++cq_ci; 294 } while (1); 295 if (unlikely(cqe == NULL)) 296 return; 297 txq->wqe_pi = ntohs(cqe->wqe_counter); 298 ctrl = (volatile struct mlx5_wqe_ctrl *) 299 tx_mlx5_wqe(txq, txq->wqe_pi); 300 elts_tail = ctrl->ctrl3; 301 assert(elts_tail < (1 << txq->wqe_n)); 302 /* Free buffers. */ 303 while (elts_free != elts_tail) { 304 struct rte_mbuf *elt = (*txq->elts)[elts_free]; 305 unsigned int elts_free_next = 306 (elts_free + 1) & (elts_n - 1); 307 struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next]; 308 309 #ifndef NDEBUG 310 /* Poisoning. */ 311 memset(&(*txq->elts)[elts_free], 312 0x66, 313 sizeof((*txq->elts)[elts_free])); 314 #endif 315 RTE_MBUF_PREFETCH_TO_FREE(elt_next); 316 /* Only one segment needs to be freed. */ 317 rte_pktmbuf_free_seg(elt); 318 elts_free = elts_free_next; 319 } 320 txq->cq_ci = cq_ci; 321 txq->elts_tail = elts_tail; 322 /* Update the consumer index. */ 323 rte_wmb(); 324 *txq->cq_db = htonl(cq_ci); 325 } 326 327 /** 328 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which 329 * the cloned mbuf is allocated is returned instead. 330 * 331 * @param buf 332 * Pointer to mbuf. 333 * 334 * @return 335 * Memory pool where data is located for given mbuf. 336 */ 337 static struct rte_mempool * 338 txq_mb2mp(struct rte_mbuf *buf) 339 { 340 if (unlikely(RTE_MBUF_INDIRECT(buf))) 341 return rte_mbuf_from_indirect(buf)->pool; 342 return buf->pool; 343 } 344 345 /** 346 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[]. 347 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, 348 * remove an entry first. 349 * 350 * @param txq 351 * Pointer to TX queue structure. 352 * @param[in] mp 353 * Memory Pool for which a Memory Region lkey must be returned. 354 * 355 * @return 356 * mr->lkey on success, (uint32_t)-1 on failure. 357 */ 358 static inline uint32_t 359 txq_mp2mr(struct txq *txq, struct rte_mempool *mp) 360 { 361 unsigned int i; 362 uint32_t lkey = (uint32_t)-1; 363 364 for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { 365 if (unlikely(txq->mp2mr[i].mp == NULL)) { 366 /* Unknown MP, add a new MR for it. */ 367 break; 368 } 369 if (txq->mp2mr[i].mp == mp) { 370 assert(txq->mp2mr[i].lkey != (uint32_t)-1); 371 assert(htonl(txq->mp2mr[i].mr->lkey) == 372 txq->mp2mr[i].lkey); 373 lkey = txq->mp2mr[i].lkey; 374 break; 375 } 376 } 377 if (unlikely(lkey == (uint32_t)-1)) 378 lkey = txq_mp2mr_reg(txq, mp, i); 379 return lkey; 380 } 381 382 /** 383 * Ring TX queue doorbell. 384 * 385 * @param txq 386 * Pointer to TX queue structure. 387 * @param wqe 388 * Pointer to the last WQE posted in the NIC. 389 */ 390 static inline void 391 mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe) 392 { 393 uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg); 394 volatile uint64_t *src = ((volatile uint64_t *)wqe); 395 396 rte_wmb(); 397 *txq->qp_db = htonl(txq->wqe_ci); 398 /* Ensure ordering between DB record and BF copy. */ 399 rte_wmb(); 400 *dst = *src; 401 } 402 403 /** 404 * DPDK callback to check the status of a tx descriptor. 405 * 406 * @param tx_queue 407 * The tx queue. 408 * @param[in] offset 409 * The index of the descriptor in the ring. 410 * 411 * @return 412 * The status of the tx descriptor. 413 */ 414 int 415 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) 416 { 417 struct txq *txq = tx_queue; 418 const unsigned int elts_n = 1 << txq->elts_n; 419 const unsigned int elts_cnt = elts_n - 1; 420 unsigned int used; 421 422 txq_complete(txq); 423 used = (txq->elts_head - txq->elts_tail) & elts_cnt; 424 if (offset < used) 425 return RTE_ETH_TX_DESC_FULL; 426 return RTE_ETH_TX_DESC_DONE; 427 } 428 429 /** 430 * DPDK callback to check the status of a rx descriptor. 431 * 432 * @param rx_queue 433 * The rx queue. 434 * @param[in] offset 435 * The index of the descriptor in the ring. 436 * 437 * @return 438 * The status of the tx descriptor. 439 */ 440 int 441 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) 442 { 443 struct rxq *rxq = rx_queue; 444 struct rxq_zip *zip = &rxq->zip; 445 volatile struct mlx5_cqe *cqe; 446 const unsigned int cqe_n = (1 << rxq->cqe_n); 447 const unsigned int cqe_cnt = cqe_n - 1; 448 unsigned int cq_ci; 449 unsigned int used; 450 451 /* if we are processing a compressed cqe */ 452 if (zip->ai) { 453 used = zip->cqe_cnt - zip->ca; 454 cq_ci = zip->cq_ci; 455 } else { 456 used = 0; 457 cq_ci = rxq->cq_ci; 458 } 459 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 460 while (check_cqe(cqe, cqe_n, cq_ci) == 0) { 461 int8_t op_own; 462 unsigned int n; 463 464 op_own = cqe->op_own; 465 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) 466 n = ntohl(cqe->byte_cnt); 467 else 468 n = 1; 469 cq_ci += n; 470 used += n; 471 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 472 } 473 used = RTE_MIN(used, (1U << rxq->elts_n) - 1); 474 if (offset < used) 475 return RTE_ETH_RX_DESC_DONE; 476 return RTE_ETH_RX_DESC_AVAIL; 477 } 478 479 /** 480 * DPDK callback for TX. 481 * 482 * @param dpdk_txq 483 * Generic pointer to TX queue structure. 484 * @param[in] pkts 485 * Packets to transmit. 486 * @param pkts_n 487 * Number of packets in array. 488 * 489 * @return 490 * Number of packets successfully transmitted (<= pkts_n). 491 */ 492 uint16_t 493 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 494 { 495 struct txq *txq = (struct txq *)dpdk_txq; 496 uint16_t elts_head = txq->elts_head; 497 const unsigned int elts_n = 1 << txq->elts_n; 498 unsigned int i = 0; 499 unsigned int j = 0; 500 unsigned int k = 0; 501 unsigned int max; 502 unsigned int max_inline = txq->max_inline; 503 const unsigned int inline_en = !!max_inline && txq->inline_en; 504 uint16_t max_wqe; 505 unsigned int comp; 506 volatile struct mlx5_wqe_v *wqe = NULL; 507 unsigned int segs_n = 0; 508 struct rte_mbuf *buf = NULL; 509 uint8_t *raw; 510 511 if (unlikely(!pkts_n)) 512 return 0; 513 /* Prefetch first packet cacheline. */ 514 rte_prefetch0(*pkts); 515 /* Start processing. */ 516 txq_complete(txq); 517 max = (elts_n - (elts_head - txq->elts_tail)); 518 if (max > elts_n) 519 max -= elts_n; 520 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 521 if (unlikely(!max_wqe)) 522 return 0; 523 do { 524 volatile rte_v128u32_t *dseg = NULL; 525 uint32_t length; 526 unsigned int ds = 0; 527 uintptr_t addr; 528 uint64_t naddr; 529 uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2; 530 uint16_t tso_header_sz = 0; 531 uint16_t ehdr; 532 uint8_t cs_flags = 0; 533 uint64_t tso = 0; 534 #ifdef MLX5_PMD_SOFT_COUNTERS 535 uint32_t total_length = 0; 536 #endif 537 538 /* first_seg */ 539 buf = *(pkts++); 540 segs_n = buf->nb_segs; 541 /* 542 * Make sure there is enough room to store this packet and 543 * that one ring entry remains unused. 544 */ 545 assert(segs_n); 546 if (max < segs_n + 1) 547 break; 548 max -= segs_n; 549 --segs_n; 550 if (!segs_n) 551 --pkts_n; 552 if (unlikely(--max_wqe == 0)) 553 break; 554 wqe = (volatile struct mlx5_wqe_v *) 555 tx_mlx5_wqe(txq, txq->wqe_ci); 556 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 557 if (pkts_n > 1) 558 rte_prefetch0(*pkts); 559 addr = rte_pktmbuf_mtod(buf, uintptr_t); 560 length = DATA_LEN(buf); 561 ehdr = (((uint8_t *)addr)[1] << 8) | 562 ((uint8_t *)addr)[0]; 563 #ifdef MLX5_PMD_SOFT_COUNTERS 564 total_length = length; 565 #endif 566 if (length < (MLX5_WQE_DWORD_SIZE + 2)) 567 break; 568 /* Update element. */ 569 (*txq->elts)[elts_head] = buf; 570 elts_head = (elts_head + 1) & (elts_n - 1); 571 /* Prefetch next buffer data. */ 572 if (pkts_n > 1) { 573 volatile void *pkt_addr; 574 575 pkt_addr = rte_pktmbuf_mtod(*pkts, volatile void *); 576 rte_prefetch0(pkt_addr); 577 } 578 /* Should we enable HW CKSUM offload */ 579 if (buf->ol_flags & 580 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { 581 const uint64_t is_tunneled = buf->ol_flags & 582 (PKT_TX_TUNNEL_GRE | 583 PKT_TX_TUNNEL_VXLAN); 584 585 if (is_tunneled && txq->tunnel_en) { 586 cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM | 587 MLX5_ETH_WQE_L4_INNER_CSUM; 588 if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM) 589 cs_flags |= MLX5_ETH_WQE_L3_CSUM; 590 } else { 591 cs_flags = MLX5_ETH_WQE_L3_CSUM | 592 MLX5_ETH_WQE_L4_CSUM; 593 } 594 } 595 raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE; 596 /* Replace the Ethernet type by the VLAN if necessary. */ 597 if (buf->ol_flags & PKT_TX_VLAN_PKT) { 598 uint32_t vlan = htonl(0x81000000 | buf->vlan_tci); 599 unsigned int len = 2 * ETHER_ADDR_LEN - 2; 600 601 addr += 2; 602 length -= 2; 603 /* Copy Destination and source mac address. */ 604 memcpy((uint8_t *)raw, ((uint8_t *)addr), len); 605 /* Copy VLAN. */ 606 memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan)); 607 /* Copy missing two bytes to end the DSeg. */ 608 memcpy((uint8_t *)raw + len + sizeof(vlan), 609 ((uint8_t *)addr) + len, 2); 610 addr += len + 2; 611 length -= (len + 2); 612 } else { 613 memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2, 614 MLX5_WQE_DWORD_SIZE); 615 length -= pkt_inline_sz; 616 addr += pkt_inline_sz; 617 } 618 if (txq->tso_en) { 619 tso = buf->ol_flags & PKT_TX_TCP_SEG; 620 if (tso) { 621 uintptr_t end = (uintptr_t) 622 (((uintptr_t)txq->wqes) + 623 (1 << txq->wqe_n) * 624 MLX5_WQE_SIZE); 625 unsigned int copy_b; 626 uint8_t vlan_sz = (buf->ol_flags & 627 PKT_TX_VLAN_PKT) ? 4 : 0; 628 const uint64_t is_tunneled = 629 buf->ol_flags & 630 (PKT_TX_TUNNEL_GRE | 631 PKT_TX_TUNNEL_VXLAN); 632 633 tso_header_sz = buf->l2_len + vlan_sz + 634 buf->l3_len + buf->l4_len; 635 636 if (is_tunneled && txq->tunnel_en) { 637 tso_header_sz += buf->outer_l2_len + 638 buf->outer_l3_len; 639 cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM; 640 } else { 641 cs_flags |= MLX5_ETH_WQE_L4_CSUM; 642 } 643 if (unlikely(tso_header_sz > 644 MLX5_MAX_TSO_HEADER)) 645 break; 646 copy_b = tso_header_sz - pkt_inline_sz; 647 /* First seg must contain all headers. */ 648 assert(copy_b <= length); 649 raw += MLX5_WQE_DWORD_SIZE; 650 if (copy_b && 651 ((end - (uintptr_t)raw) > copy_b)) { 652 uint16_t n = (MLX5_WQE_DS(copy_b) - 653 1 + 3) / 4; 654 655 if (unlikely(max_wqe < n)) 656 break; 657 max_wqe -= n; 658 rte_memcpy((void *)raw, 659 (void *)addr, copy_b); 660 addr += copy_b; 661 length -= copy_b; 662 pkt_inline_sz += copy_b; 663 /* 664 * Another DWORD will be added 665 * in the inline part. 666 */ 667 raw += MLX5_WQE_DS(copy_b) * 668 MLX5_WQE_DWORD_SIZE - 669 MLX5_WQE_DWORD_SIZE; 670 } else { 671 /* NOP WQE. */ 672 wqe->ctrl = (rte_v128u32_t){ 673 htonl(txq->wqe_ci << 8), 674 htonl(txq->qp_num_8s | 1), 675 0, 676 0, 677 }; 678 ds = 1; 679 total_length = 0; 680 pkts--; 681 pkts_n++; 682 elts_head = (elts_head - 1) & 683 (elts_n - 1); 684 k++; 685 goto next_wqe; 686 } 687 } 688 } 689 /* Inline if enough room. */ 690 if (inline_en || tso) { 691 uintptr_t end = (uintptr_t) 692 (((uintptr_t)txq->wqes) + 693 (1 << txq->wqe_n) * MLX5_WQE_SIZE); 694 unsigned int inline_room = max_inline * 695 RTE_CACHE_LINE_SIZE - 696 (pkt_inline_sz - 2); 697 uintptr_t addr_end = (addr + inline_room) & 698 ~(RTE_CACHE_LINE_SIZE - 1); 699 unsigned int copy_b = (addr_end > addr) ? 700 RTE_MIN((addr_end - addr), length) : 701 0; 702 703 raw += MLX5_WQE_DWORD_SIZE; 704 if (copy_b && ((end - (uintptr_t)raw) > copy_b)) { 705 /* 706 * One Dseg remains in the current WQE. To 707 * keep the computation positive, it is 708 * removed after the bytes to Dseg conversion. 709 */ 710 uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4; 711 712 if (unlikely(max_wqe < n)) 713 break; 714 max_wqe -= n; 715 if (tso) { 716 uint32_t inl = 717 htonl(copy_b | MLX5_INLINE_SEG); 718 719 pkt_inline_sz = 720 MLX5_WQE_DS(tso_header_sz) * 721 MLX5_WQE_DWORD_SIZE; 722 rte_memcpy((void *)raw, 723 (void *)&inl, sizeof(inl)); 724 raw += sizeof(inl); 725 pkt_inline_sz += sizeof(inl); 726 } 727 rte_memcpy((void *)raw, (void *)addr, copy_b); 728 addr += copy_b; 729 length -= copy_b; 730 pkt_inline_sz += copy_b; 731 } 732 /* 733 * 2 DWORDs consumed by the WQE header + ETH segment + 734 * the size of the inline part of the packet. 735 */ 736 ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2); 737 if (length > 0) { 738 if (ds % (MLX5_WQE_SIZE / 739 MLX5_WQE_DWORD_SIZE) == 0) { 740 if (unlikely(--max_wqe == 0)) 741 break; 742 dseg = (volatile rte_v128u32_t *) 743 tx_mlx5_wqe(txq, txq->wqe_ci + 744 ds / 4); 745 } else { 746 dseg = (volatile rte_v128u32_t *) 747 ((uintptr_t)wqe + 748 (ds * MLX5_WQE_DWORD_SIZE)); 749 } 750 goto use_dseg; 751 } else if (!segs_n) { 752 goto next_pkt; 753 } else { 754 /* dseg will be advance as part of next_seg */ 755 dseg = (volatile rte_v128u32_t *) 756 ((uintptr_t)wqe + 757 ((ds - 1) * MLX5_WQE_DWORD_SIZE)); 758 goto next_seg; 759 } 760 } else { 761 /* 762 * No inline has been done in the packet, only the 763 * Ethernet Header as been stored. 764 */ 765 dseg = (volatile rte_v128u32_t *) 766 ((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE)); 767 ds = 3; 768 use_dseg: 769 /* Add the remaining packet as a simple ds. */ 770 naddr = htonll(addr); 771 *dseg = (rte_v128u32_t){ 772 htonl(length), 773 txq_mp2mr(txq, txq_mb2mp(buf)), 774 naddr, 775 naddr >> 32, 776 }; 777 ++ds; 778 if (!segs_n) 779 goto next_pkt; 780 } 781 next_seg: 782 assert(buf); 783 assert(ds); 784 assert(wqe); 785 /* 786 * Spill on next WQE when the current one does not have 787 * enough room left. Size of WQE must a be a multiple 788 * of data segment size. 789 */ 790 assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE)); 791 if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) { 792 if (unlikely(--max_wqe == 0)) 793 break; 794 dseg = (volatile rte_v128u32_t *) 795 tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4); 796 rte_prefetch0(tx_mlx5_wqe(txq, 797 txq->wqe_ci + ds / 4 + 1)); 798 } else { 799 ++dseg; 800 } 801 ++ds; 802 buf = buf->next; 803 assert(buf); 804 length = DATA_LEN(buf); 805 #ifdef MLX5_PMD_SOFT_COUNTERS 806 total_length += length; 807 #endif 808 /* Store segment information. */ 809 naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)); 810 *dseg = (rte_v128u32_t){ 811 htonl(length), 812 txq_mp2mr(txq, txq_mb2mp(buf)), 813 naddr, 814 naddr >> 32, 815 }; 816 (*txq->elts)[elts_head] = buf; 817 elts_head = (elts_head + 1) & (elts_n - 1); 818 ++j; 819 --segs_n; 820 if (segs_n) 821 goto next_seg; 822 else 823 --pkts_n; 824 next_pkt: 825 ++i; 826 /* Initialize known and common part of the WQE structure. */ 827 if (tso) { 828 wqe->ctrl = (rte_v128u32_t){ 829 htonl((txq->wqe_ci << 8) | MLX5_OPCODE_TSO), 830 htonl(txq->qp_num_8s | ds), 831 0, 832 0, 833 }; 834 wqe->eseg = (rte_v128u32_t){ 835 0, 836 cs_flags | (htons(buf->tso_segsz) << 16), 837 0, 838 (ehdr << 16) | htons(tso_header_sz), 839 }; 840 } else { 841 wqe->ctrl = (rte_v128u32_t){ 842 htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND), 843 htonl(txq->qp_num_8s | ds), 844 0, 845 0, 846 }; 847 wqe->eseg = (rte_v128u32_t){ 848 0, 849 cs_flags, 850 0, 851 (ehdr << 16) | htons(pkt_inline_sz), 852 }; 853 } 854 next_wqe: 855 txq->wqe_ci += (ds + 3) / 4; 856 #ifdef MLX5_PMD_SOFT_COUNTERS 857 /* Increment sent bytes counter. */ 858 txq->stats.obytes += total_length; 859 #endif 860 } while (pkts_n); 861 /* Take a shortcut if nothing must be sent. */ 862 if (unlikely((i + k) == 0)) 863 return 0; 864 /* Check whether completion threshold has been reached. */ 865 comp = txq->elts_comp + i + j + k; 866 if (comp >= MLX5_TX_COMP_THRESH) { 867 volatile struct mlx5_wqe_ctrl *w = 868 (volatile struct mlx5_wqe_ctrl *)wqe; 869 870 /* Request completion on last WQE. */ 871 w->ctrl2 = htonl(8); 872 /* Save elts_head in unused "immediate" field of WQE. */ 873 w->ctrl3 = elts_head; 874 txq->elts_comp = 0; 875 } else { 876 txq->elts_comp = comp; 877 } 878 #ifdef MLX5_PMD_SOFT_COUNTERS 879 /* Increment sent packets counter. */ 880 txq->stats.opackets += i; 881 #endif 882 /* Ring QP doorbell. */ 883 mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)wqe); 884 txq->elts_head = elts_head; 885 return i; 886 } 887 888 /** 889 * Open a MPW session. 890 * 891 * @param txq 892 * Pointer to TX queue structure. 893 * @param mpw 894 * Pointer to MPW session structure. 895 * @param length 896 * Packet length. 897 */ 898 static inline void 899 mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length) 900 { 901 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); 902 volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] = 903 (volatile struct mlx5_wqe_data_seg (*)[]) 904 tx_mlx5_wqe(txq, idx + 1); 905 906 mpw->state = MLX5_MPW_STATE_OPENED; 907 mpw->pkts_n = 0; 908 mpw->len = length; 909 mpw->total_len = 0; 910 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); 911 mpw->wqe->eseg.mss = htons(length); 912 mpw->wqe->eseg.inline_hdr_sz = 0; 913 mpw->wqe->eseg.rsvd0 = 0; 914 mpw->wqe->eseg.rsvd1 = 0; 915 mpw->wqe->eseg.rsvd2 = 0; 916 mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) | 917 (txq->wqe_ci << 8) | MLX5_OPCODE_TSO); 918 mpw->wqe->ctrl[2] = 0; 919 mpw->wqe->ctrl[3] = 0; 920 mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *) 921 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE)); 922 mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *) 923 (((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE)); 924 mpw->data.dseg[2] = &(*dseg)[0]; 925 mpw->data.dseg[3] = &(*dseg)[1]; 926 mpw->data.dseg[4] = &(*dseg)[2]; 927 } 928 929 /** 930 * Close a MPW session. 931 * 932 * @param txq 933 * Pointer to TX queue structure. 934 * @param mpw 935 * Pointer to MPW session structure. 936 */ 937 static inline void 938 mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw) 939 { 940 unsigned int num = mpw->pkts_n; 941 942 /* 943 * Store size in multiple of 16 bytes. Control and Ethernet segments 944 * count as 2. 945 */ 946 mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num)); 947 mpw->state = MLX5_MPW_STATE_CLOSED; 948 if (num < 3) 949 ++txq->wqe_ci; 950 else 951 txq->wqe_ci += 2; 952 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); 953 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 954 } 955 956 /** 957 * DPDK callback for TX with MPW support. 958 * 959 * @param dpdk_txq 960 * Generic pointer to TX queue structure. 961 * @param[in] pkts 962 * Packets to transmit. 963 * @param pkts_n 964 * Number of packets in array. 965 * 966 * @return 967 * Number of packets successfully transmitted (<= pkts_n). 968 */ 969 uint16_t 970 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 971 { 972 struct txq *txq = (struct txq *)dpdk_txq; 973 uint16_t elts_head = txq->elts_head; 974 const unsigned int elts_n = 1 << txq->elts_n; 975 unsigned int i = 0; 976 unsigned int j = 0; 977 unsigned int max; 978 uint16_t max_wqe; 979 unsigned int comp; 980 struct mlx5_mpw mpw = { 981 .state = MLX5_MPW_STATE_CLOSED, 982 }; 983 984 if (unlikely(!pkts_n)) 985 return 0; 986 /* Prefetch first packet cacheline. */ 987 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); 988 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 989 /* Start processing. */ 990 txq_complete(txq); 991 max = (elts_n - (elts_head - txq->elts_tail)); 992 if (max > elts_n) 993 max -= elts_n; 994 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 995 if (unlikely(!max_wqe)) 996 return 0; 997 do { 998 struct rte_mbuf *buf = *(pkts++); 999 unsigned int elts_head_next; 1000 uint32_t length; 1001 unsigned int segs_n = buf->nb_segs; 1002 uint32_t cs_flags = 0; 1003 1004 /* 1005 * Make sure there is enough room to store this packet and 1006 * that one ring entry remains unused. 1007 */ 1008 assert(segs_n); 1009 if (max < segs_n + 1) 1010 break; 1011 /* Do not bother with large packets MPW cannot handle. */ 1012 if (segs_n > MLX5_MPW_DSEG_MAX) 1013 break; 1014 max -= segs_n; 1015 --pkts_n; 1016 /* Should we enable HW CKSUM offload */ 1017 if (buf->ol_flags & 1018 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) 1019 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; 1020 /* Retrieve packet information. */ 1021 length = PKT_LEN(buf); 1022 assert(length); 1023 /* Start new session if packet differs. */ 1024 if ((mpw.state == MLX5_MPW_STATE_OPENED) && 1025 ((mpw.len != length) || 1026 (segs_n != 1) || 1027 (mpw.wqe->eseg.cs_flags != cs_flags))) 1028 mlx5_mpw_close(txq, &mpw); 1029 if (mpw.state == MLX5_MPW_STATE_CLOSED) { 1030 /* 1031 * Multi-Packet WQE consumes at most two WQE. 1032 * mlx5_mpw_new() expects to be able to use such 1033 * resources. 1034 */ 1035 if (unlikely(max_wqe < 2)) 1036 break; 1037 max_wqe -= 2; 1038 mlx5_mpw_new(txq, &mpw, length); 1039 mpw.wqe->eseg.cs_flags = cs_flags; 1040 } 1041 /* Multi-segment packets must be alone in their MPW. */ 1042 assert((segs_n == 1) || (mpw.pkts_n == 0)); 1043 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1044 length = 0; 1045 #endif 1046 do { 1047 volatile struct mlx5_wqe_data_seg *dseg; 1048 uintptr_t addr; 1049 1050 elts_head_next = (elts_head + 1) & (elts_n - 1); 1051 assert(buf); 1052 (*txq->elts)[elts_head] = buf; 1053 dseg = mpw.data.dseg[mpw.pkts_n]; 1054 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1055 *dseg = (struct mlx5_wqe_data_seg){ 1056 .byte_count = htonl(DATA_LEN(buf)), 1057 .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), 1058 .addr = htonll(addr), 1059 }; 1060 elts_head = elts_head_next; 1061 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1062 length += DATA_LEN(buf); 1063 #endif 1064 buf = buf->next; 1065 ++mpw.pkts_n; 1066 ++j; 1067 } while (--segs_n); 1068 assert(length == mpw.len); 1069 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) 1070 mlx5_mpw_close(txq, &mpw); 1071 elts_head = elts_head_next; 1072 #ifdef MLX5_PMD_SOFT_COUNTERS 1073 /* Increment sent bytes counter. */ 1074 txq->stats.obytes += length; 1075 #endif 1076 ++i; 1077 } while (pkts_n); 1078 /* Take a shortcut if nothing must be sent. */ 1079 if (unlikely(i == 0)) 1080 return 0; 1081 /* Check whether completion threshold has been reached. */ 1082 /* "j" includes both packets and segments. */ 1083 comp = txq->elts_comp + j; 1084 if (comp >= MLX5_TX_COMP_THRESH) { 1085 volatile struct mlx5_wqe *wqe = mpw.wqe; 1086 1087 /* Request completion on last WQE. */ 1088 wqe->ctrl[2] = htonl(8); 1089 /* Save elts_head in unused "immediate" field of WQE. */ 1090 wqe->ctrl[3] = elts_head; 1091 txq->elts_comp = 0; 1092 } else { 1093 txq->elts_comp = comp; 1094 } 1095 #ifdef MLX5_PMD_SOFT_COUNTERS 1096 /* Increment sent packets counter. */ 1097 txq->stats.opackets += i; 1098 #endif 1099 /* Ring QP doorbell. */ 1100 if (mpw.state == MLX5_MPW_STATE_OPENED) 1101 mlx5_mpw_close(txq, &mpw); 1102 mlx5_tx_dbrec(txq, mpw.wqe); 1103 txq->elts_head = elts_head; 1104 return i; 1105 } 1106 1107 /** 1108 * Open a MPW inline session. 1109 * 1110 * @param txq 1111 * Pointer to TX queue structure. 1112 * @param mpw 1113 * Pointer to MPW session structure. 1114 * @param length 1115 * Packet length. 1116 */ 1117 static inline void 1118 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length) 1119 { 1120 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); 1121 struct mlx5_wqe_inl_small *inl; 1122 1123 mpw->state = MLX5_MPW_INL_STATE_OPENED; 1124 mpw->pkts_n = 0; 1125 mpw->len = length; 1126 mpw->total_len = 0; 1127 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); 1128 mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) | 1129 (txq->wqe_ci << 8) | 1130 MLX5_OPCODE_TSO); 1131 mpw->wqe->ctrl[2] = 0; 1132 mpw->wqe->ctrl[3] = 0; 1133 mpw->wqe->eseg.mss = htons(length); 1134 mpw->wqe->eseg.inline_hdr_sz = 0; 1135 mpw->wqe->eseg.cs_flags = 0; 1136 mpw->wqe->eseg.rsvd0 = 0; 1137 mpw->wqe->eseg.rsvd1 = 0; 1138 mpw->wqe->eseg.rsvd2 = 0; 1139 inl = (struct mlx5_wqe_inl_small *) 1140 (((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE); 1141 mpw->data.raw = (uint8_t *)&inl->raw; 1142 } 1143 1144 /** 1145 * Close a MPW inline session. 1146 * 1147 * @param txq 1148 * Pointer to TX queue structure. 1149 * @param mpw 1150 * Pointer to MPW session structure. 1151 */ 1152 static inline void 1153 mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw) 1154 { 1155 unsigned int size; 1156 struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *) 1157 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE)); 1158 1159 size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len; 1160 /* 1161 * Store size in multiple of 16 bytes. Control and Ethernet segments 1162 * count as 2. 1163 */ 1164 mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size)); 1165 mpw->state = MLX5_MPW_STATE_CLOSED; 1166 inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG); 1167 txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE; 1168 } 1169 1170 /** 1171 * DPDK callback for TX with MPW inline support. 1172 * 1173 * @param dpdk_txq 1174 * Generic pointer to TX queue structure. 1175 * @param[in] pkts 1176 * Packets to transmit. 1177 * @param pkts_n 1178 * Number of packets in array. 1179 * 1180 * @return 1181 * Number of packets successfully transmitted (<= pkts_n). 1182 */ 1183 uint16_t 1184 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, 1185 uint16_t pkts_n) 1186 { 1187 struct txq *txq = (struct txq *)dpdk_txq; 1188 uint16_t elts_head = txq->elts_head; 1189 const unsigned int elts_n = 1 << txq->elts_n; 1190 unsigned int i = 0; 1191 unsigned int j = 0; 1192 unsigned int max; 1193 uint16_t max_wqe; 1194 unsigned int comp; 1195 unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE; 1196 struct mlx5_mpw mpw = { 1197 .state = MLX5_MPW_STATE_CLOSED, 1198 }; 1199 /* 1200 * Compute the maximum number of WQE which can be consumed by inline 1201 * code. 1202 * - 2 DSEG for: 1203 * - 1 control segment, 1204 * - 1 Ethernet segment, 1205 * - N Dseg from the inline request. 1206 */ 1207 const unsigned int wqe_inl_n = 1208 ((2 * MLX5_WQE_DWORD_SIZE + 1209 txq->max_inline * RTE_CACHE_LINE_SIZE) + 1210 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE; 1211 1212 if (unlikely(!pkts_n)) 1213 return 0; 1214 /* Prefetch first packet cacheline. */ 1215 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); 1216 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 1217 /* Start processing. */ 1218 txq_complete(txq); 1219 max = (elts_n - (elts_head - txq->elts_tail)); 1220 if (max > elts_n) 1221 max -= elts_n; 1222 do { 1223 struct rte_mbuf *buf = *(pkts++); 1224 unsigned int elts_head_next; 1225 uintptr_t addr; 1226 uint32_t length; 1227 unsigned int segs_n = buf->nb_segs; 1228 uint32_t cs_flags = 0; 1229 1230 /* 1231 * Make sure there is enough room to store this packet and 1232 * that one ring entry remains unused. 1233 */ 1234 assert(segs_n); 1235 if (max < segs_n + 1) 1236 break; 1237 /* Do not bother with large packets MPW cannot handle. */ 1238 if (segs_n > MLX5_MPW_DSEG_MAX) 1239 break; 1240 max -= segs_n; 1241 --pkts_n; 1242 /* 1243 * Compute max_wqe in case less WQE were consumed in previous 1244 * iteration. 1245 */ 1246 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 1247 /* Should we enable HW CKSUM offload */ 1248 if (buf->ol_flags & 1249 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) 1250 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; 1251 /* Retrieve packet information. */ 1252 length = PKT_LEN(buf); 1253 /* Start new session if packet differs. */ 1254 if (mpw.state == MLX5_MPW_STATE_OPENED) { 1255 if ((mpw.len != length) || 1256 (segs_n != 1) || 1257 (mpw.wqe->eseg.cs_flags != cs_flags)) 1258 mlx5_mpw_close(txq, &mpw); 1259 } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) { 1260 if ((mpw.len != length) || 1261 (segs_n != 1) || 1262 (length > inline_room) || 1263 (mpw.wqe->eseg.cs_flags != cs_flags)) { 1264 mlx5_mpw_inline_close(txq, &mpw); 1265 inline_room = 1266 txq->max_inline * RTE_CACHE_LINE_SIZE; 1267 } 1268 } 1269 if (mpw.state == MLX5_MPW_STATE_CLOSED) { 1270 if ((segs_n != 1) || 1271 (length > inline_room)) { 1272 /* 1273 * Multi-Packet WQE consumes at most two WQE. 1274 * mlx5_mpw_new() expects to be able to use 1275 * such resources. 1276 */ 1277 if (unlikely(max_wqe < 2)) 1278 break; 1279 max_wqe -= 2; 1280 mlx5_mpw_new(txq, &mpw, length); 1281 mpw.wqe->eseg.cs_flags = cs_flags; 1282 } else { 1283 if (unlikely(max_wqe < wqe_inl_n)) 1284 break; 1285 max_wqe -= wqe_inl_n; 1286 mlx5_mpw_inline_new(txq, &mpw, length); 1287 mpw.wqe->eseg.cs_flags = cs_flags; 1288 } 1289 } 1290 /* Multi-segment packets must be alone in their MPW. */ 1291 assert((segs_n == 1) || (mpw.pkts_n == 0)); 1292 if (mpw.state == MLX5_MPW_STATE_OPENED) { 1293 assert(inline_room == 1294 txq->max_inline * RTE_CACHE_LINE_SIZE); 1295 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1296 length = 0; 1297 #endif 1298 do { 1299 volatile struct mlx5_wqe_data_seg *dseg; 1300 1301 elts_head_next = 1302 (elts_head + 1) & (elts_n - 1); 1303 assert(buf); 1304 (*txq->elts)[elts_head] = buf; 1305 dseg = mpw.data.dseg[mpw.pkts_n]; 1306 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1307 *dseg = (struct mlx5_wqe_data_seg){ 1308 .byte_count = htonl(DATA_LEN(buf)), 1309 .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), 1310 .addr = htonll(addr), 1311 }; 1312 elts_head = elts_head_next; 1313 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1314 length += DATA_LEN(buf); 1315 #endif 1316 buf = buf->next; 1317 ++mpw.pkts_n; 1318 ++j; 1319 } while (--segs_n); 1320 assert(length == mpw.len); 1321 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) 1322 mlx5_mpw_close(txq, &mpw); 1323 } else { 1324 unsigned int max; 1325 1326 assert(mpw.state == MLX5_MPW_INL_STATE_OPENED); 1327 assert(length <= inline_room); 1328 assert(length == DATA_LEN(buf)); 1329 elts_head_next = (elts_head + 1) & (elts_n - 1); 1330 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1331 (*txq->elts)[elts_head] = buf; 1332 /* Maximum number of bytes before wrapping. */ 1333 max = ((((uintptr_t)(txq->wqes)) + 1334 (1 << txq->wqe_n) * 1335 MLX5_WQE_SIZE) - 1336 (uintptr_t)mpw.data.raw); 1337 if (length > max) { 1338 rte_memcpy((void *)(uintptr_t)mpw.data.raw, 1339 (void *)addr, 1340 max); 1341 mpw.data.raw = (volatile void *)txq->wqes; 1342 rte_memcpy((void *)(uintptr_t)mpw.data.raw, 1343 (void *)(addr + max), 1344 length - max); 1345 mpw.data.raw += length - max; 1346 } else { 1347 rte_memcpy((void *)(uintptr_t)mpw.data.raw, 1348 (void *)addr, 1349 length); 1350 1351 if (length == max) 1352 mpw.data.raw = 1353 (volatile void *)txq->wqes; 1354 else 1355 mpw.data.raw += length; 1356 } 1357 ++mpw.pkts_n; 1358 mpw.total_len += length; 1359 ++j; 1360 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) { 1361 mlx5_mpw_inline_close(txq, &mpw); 1362 inline_room = 1363 txq->max_inline * RTE_CACHE_LINE_SIZE; 1364 } else { 1365 inline_room -= length; 1366 } 1367 } 1368 elts_head = elts_head_next; 1369 #ifdef MLX5_PMD_SOFT_COUNTERS 1370 /* Increment sent bytes counter. */ 1371 txq->stats.obytes += length; 1372 #endif 1373 ++i; 1374 } while (pkts_n); 1375 /* Take a shortcut if nothing must be sent. */ 1376 if (unlikely(i == 0)) 1377 return 0; 1378 /* Check whether completion threshold has been reached. */ 1379 /* "j" includes both packets and segments. */ 1380 comp = txq->elts_comp + j; 1381 if (comp >= MLX5_TX_COMP_THRESH) { 1382 volatile struct mlx5_wqe *wqe = mpw.wqe; 1383 1384 /* Request completion on last WQE. */ 1385 wqe->ctrl[2] = htonl(8); 1386 /* Save elts_head in unused "immediate" field of WQE. */ 1387 wqe->ctrl[3] = elts_head; 1388 txq->elts_comp = 0; 1389 } else { 1390 txq->elts_comp = comp; 1391 } 1392 #ifdef MLX5_PMD_SOFT_COUNTERS 1393 /* Increment sent packets counter. */ 1394 txq->stats.opackets += i; 1395 #endif 1396 /* Ring QP doorbell. */ 1397 if (mpw.state == MLX5_MPW_INL_STATE_OPENED) 1398 mlx5_mpw_inline_close(txq, &mpw); 1399 else if (mpw.state == MLX5_MPW_STATE_OPENED) 1400 mlx5_mpw_close(txq, &mpw); 1401 mlx5_tx_dbrec(txq, mpw.wqe); 1402 txq->elts_head = elts_head; 1403 return i; 1404 } 1405 1406 /** 1407 * Open an Enhanced MPW session. 1408 * 1409 * @param txq 1410 * Pointer to TX queue structure. 1411 * @param mpw 1412 * Pointer to MPW session structure. 1413 * @param length 1414 * Packet length. 1415 */ 1416 static inline void 1417 mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding) 1418 { 1419 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); 1420 1421 mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED; 1422 mpw->pkts_n = 0; 1423 mpw->total_len = sizeof(struct mlx5_wqe); 1424 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); 1425 mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) | 1426 (txq->wqe_ci << 8) | 1427 MLX5_OPCODE_ENHANCED_MPSW); 1428 mpw->wqe->ctrl[2] = 0; 1429 mpw->wqe->ctrl[3] = 0; 1430 memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE); 1431 if (unlikely(padding)) { 1432 uintptr_t addr = (uintptr_t)(mpw->wqe + 1); 1433 1434 /* Pad the first 2 DWORDs with zero-length inline header. */ 1435 *(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG); 1436 *(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) = 1437 htonl(MLX5_INLINE_SEG); 1438 mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE; 1439 /* Start from the next WQEBB. */ 1440 mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1)); 1441 } else { 1442 mpw->data.raw = (volatile void *)(mpw->wqe + 1); 1443 } 1444 } 1445 1446 /** 1447 * Close an Enhanced MPW session. 1448 * 1449 * @param txq 1450 * Pointer to TX queue structure. 1451 * @param mpw 1452 * Pointer to MPW session structure. 1453 * 1454 * @return 1455 * Number of consumed WQEs. 1456 */ 1457 static inline uint16_t 1458 mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw) 1459 { 1460 uint16_t ret; 1461 1462 /* Store size in multiple of 16 bytes. Control and Ethernet segments 1463 * count as 2. 1464 */ 1465 mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len)); 1466 mpw->state = MLX5_MPW_STATE_CLOSED; 1467 ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE; 1468 txq->wqe_ci += ret; 1469 return ret; 1470 } 1471 1472 /** 1473 * DPDK callback for TX with Enhanced MPW support. 1474 * 1475 * @param dpdk_txq 1476 * Generic pointer to TX queue structure. 1477 * @param[in] pkts 1478 * Packets to transmit. 1479 * @param pkts_n 1480 * Number of packets in array. 1481 * 1482 * @return 1483 * Number of packets successfully transmitted (<= pkts_n). 1484 */ 1485 uint16_t 1486 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 1487 { 1488 struct txq *txq = (struct txq *)dpdk_txq; 1489 uint16_t elts_head = txq->elts_head; 1490 const unsigned int elts_n = 1 << txq->elts_n; 1491 unsigned int i = 0; 1492 unsigned int j = 0; 1493 unsigned int max_elts; 1494 uint16_t max_wqe; 1495 unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE; 1496 unsigned int mpw_room = 0; 1497 unsigned int inl_pad = 0; 1498 uint32_t inl_hdr; 1499 struct mlx5_mpw mpw = { 1500 .state = MLX5_MPW_STATE_CLOSED, 1501 }; 1502 1503 if (unlikely(!pkts_n)) 1504 return 0; 1505 /* Start processing. */ 1506 txq_complete(txq); 1507 max_elts = (elts_n - (elts_head - txq->elts_tail)); 1508 if (max_elts > elts_n) 1509 max_elts -= elts_n; 1510 /* A CQE slot must always be available. */ 1511 assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); 1512 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 1513 if (unlikely(!max_wqe)) 1514 return 0; 1515 do { 1516 struct rte_mbuf *buf = *(pkts++); 1517 unsigned int elts_head_next; 1518 uintptr_t addr; 1519 uint64_t naddr; 1520 unsigned int n; 1521 unsigned int do_inline = 0; /* Whether inline is possible. */ 1522 uint32_t length; 1523 unsigned int segs_n = buf->nb_segs; 1524 uint32_t cs_flags = 0; 1525 1526 /* 1527 * Make sure there is enough room to store this packet and 1528 * that one ring entry remains unused. 1529 */ 1530 assert(segs_n); 1531 if (max_elts - j < segs_n + 1) 1532 break; 1533 /* Do not bother with large packets MPW cannot handle. */ 1534 if (segs_n > MLX5_MPW_DSEG_MAX) 1535 break; 1536 /* Should we enable HW CKSUM offload. */ 1537 if (buf->ol_flags & 1538 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) 1539 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; 1540 /* Retrieve packet information. */ 1541 length = PKT_LEN(buf); 1542 /* Start new session if: 1543 * - multi-segment packet 1544 * - no space left even for a dseg 1545 * - next packet can be inlined with a new WQE 1546 * - cs_flag differs 1547 * It can't be MLX5_MPW_STATE_OPENED as always have a single 1548 * segmented packet. 1549 */ 1550 if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) { 1551 if ((segs_n != 1) || 1552 (inl_pad + sizeof(struct mlx5_wqe_data_seg) > 1553 mpw_room) || 1554 (length <= txq->inline_max_packet_sz && 1555 inl_pad + sizeof(inl_hdr) + length > 1556 mpw_room) || 1557 (mpw.wqe->eseg.cs_flags != cs_flags)) 1558 max_wqe -= mlx5_empw_close(txq, &mpw); 1559 } 1560 if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) { 1561 if (unlikely(segs_n != 1)) { 1562 /* Fall back to legacy MPW. 1563 * A MPW session consumes 2 WQEs at most to 1564 * include MLX5_MPW_DSEG_MAX pointers. 1565 */ 1566 if (unlikely(max_wqe < 2)) 1567 break; 1568 mlx5_mpw_new(txq, &mpw, length); 1569 } else { 1570 /* In Enhanced MPW, inline as much as the budget 1571 * is allowed. The remaining space is to be 1572 * filled with dsegs. If the title WQEBB isn't 1573 * padded, it will have 2 dsegs there. 1574 */ 1575 mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX, 1576 (max_inline ? max_inline : 1577 pkts_n * MLX5_WQE_DWORD_SIZE) + 1578 MLX5_WQE_SIZE); 1579 if (unlikely(max_wqe * MLX5_WQE_SIZE < 1580 mpw_room)) 1581 break; 1582 /* Don't pad the title WQEBB to not waste WQ. */ 1583 mlx5_empw_new(txq, &mpw, 0); 1584 mpw_room -= mpw.total_len; 1585 inl_pad = 0; 1586 do_inline = 1587 length <= txq->inline_max_packet_sz && 1588 sizeof(inl_hdr) + length <= mpw_room && 1589 !txq->mpw_hdr_dseg; 1590 } 1591 mpw.wqe->eseg.cs_flags = cs_flags; 1592 } else { 1593 /* Evaluate whether the next packet can be inlined. 1594 * Inlininig is possible when: 1595 * - length is less than configured value 1596 * - length fits for remaining space 1597 * - not required to fill the title WQEBB with dsegs 1598 */ 1599 do_inline = 1600 length <= txq->inline_max_packet_sz && 1601 inl_pad + sizeof(inl_hdr) + length <= 1602 mpw_room && 1603 (!txq->mpw_hdr_dseg || 1604 mpw.total_len >= MLX5_WQE_SIZE); 1605 } 1606 /* Multi-segment packets must be alone in their MPW. */ 1607 assert((segs_n == 1) || (mpw.pkts_n == 0)); 1608 if (unlikely(mpw.state == MLX5_MPW_STATE_OPENED)) { 1609 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1610 length = 0; 1611 #endif 1612 do { 1613 volatile struct mlx5_wqe_data_seg *dseg; 1614 1615 elts_head_next = 1616 (elts_head + 1) & (elts_n - 1); 1617 assert(buf); 1618 (*txq->elts)[elts_head] = buf; 1619 dseg = mpw.data.dseg[mpw.pkts_n]; 1620 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1621 *dseg = (struct mlx5_wqe_data_seg){ 1622 .byte_count = htonl(DATA_LEN(buf)), 1623 .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), 1624 .addr = htonll(addr), 1625 }; 1626 elts_head = elts_head_next; 1627 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1628 length += DATA_LEN(buf); 1629 #endif 1630 buf = buf->next; 1631 ++j; 1632 ++mpw.pkts_n; 1633 } while (--segs_n); 1634 /* A multi-segmented packet takes one MPW session. 1635 * TODO: Pack more multi-segmented packets if possible. 1636 */ 1637 mlx5_mpw_close(txq, &mpw); 1638 if (mpw.pkts_n < 3) 1639 max_wqe--; 1640 else 1641 max_wqe -= 2; 1642 } else if (do_inline) { 1643 /* Inline packet into WQE. */ 1644 unsigned int max; 1645 1646 assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED); 1647 assert(length == DATA_LEN(buf)); 1648 inl_hdr = htonl(length | MLX5_INLINE_SEG); 1649 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1650 mpw.data.raw = (volatile void *) 1651 ((uintptr_t)mpw.data.raw + inl_pad); 1652 max = tx_mlx5_wq_tailroom(txq, 1653 (void *)(uintptr_t)mpw.data.raw); 1654 /* Copy inline header. */ 1655 mpw.data.raw = (volatile void *) 1656 mlx5_copy_to_wq( 1657 (void *)(uintptr_t)mpw.data.raw, 1658 &inl_hdr, 1659 sizeof(inl_hdr), 1660 (void *)(uintptr_t)txq->wqes, 1661 max); 1662 max = tx_mlx5_wq_tailroom(txq, 1663 (void *)(uintptr_t)mpw.data.raw); 1664 /* Copy packet data. */ 1665 mpw.data.raw = (volatile void *) 1666 mlx5_copy_to_wq( 1667 (void *)(uintptr_t)mpw.data.raw, 1668 (void *)addr, 1669 length, 1670 (void *)(uintptr_t)txq->wqes, 1671 max); 1672 ++mpw.pkts_n; 1673 mpw.total_len += (inl_pad + sizeof(inl_hdr) + length); 1674 /* No need to get completion as the entire packet is 1675 * copied to WQ. Free the buf right away. 1676 */ 1677 elts_head_next = elts_head; 1678 rte_pktmbuf_free_seg(buf); 1679 mpw_room -= (inl_pad + sizeof(inl_hdr) + length); 1680 /* Add pad in the next packet if any. */ 1681 inl_pad = (((uintptr_t)mpw.data.raw + 1682 (MLX5_WQE_DWORD_SIZE - 1)) & 1683 ~(MLX5_WQE_DWORD_SIZE - 1)) - 1684 (uintptr_t)mpw.data.raw; 1685 } else { 1686 /* No inline. Load a dseg of packet pointer. */ 1687 volatile rte_v128u32_t *dseg; 1688 1689 assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED); 1690 assert((inl_pad + sizeof(*dseg)) <= mpw_room); 1691 assert(length == DATA_LEN(buf)); 1692 if (!tx_mlx5_wq_tailroom(txq, 1693 (void *)((uintptr_t)mpw.data.raw 1694 + inl_pad))) 1695 dseg = (volatile void *)txq->wqes; 1696 else 1697 dseg = (volatile void *) 1698 ((uintptr_t)mpw.data.raw + 1699 inl_pad); 1700 elts_head_next = (elts_head + 1) & (elts_n - 1); 1701 (*txq->elts)[elts_head] = buf; 1702 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1703 for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++) 1704 rte_prefetch2((void *)(addr + 1705 n * RTE_CACHE_LINE_SIZE)); 1706 naddr = htonll(addr); 1707 *dseg = (rte_v128u32_t) { 1708 htonl(length), 1709 txq_mp2mr(txq, txq_mb2mp(buf)), 1710 naddr, 1711 naddr >> 32, 1712 }; 1713 mpw.data.raw = (volatile void *)(dseg + 1); 1714 mpw.total_len += (inl_pad + sizeof(*dseg)); 1715 ++j; 1716 ++mpw.pkts_n; 1717 mpw_room -= (inl_pad + sizeof(*dseg)); 1718 inl_pad = 0; 1719 } 1720 elts_head = elts_head_next; 1721 #ifdef MLX5_PMD_SOFT_COUNTERS 1722 /* Increment sent bytes counter. */ 1723 txq->stats.obytes += length; 1724 #endif 1725 ++i; 1726 } while (i < pkts_n); 1727 /* Take a shortcut if nothing must be sent. */ 1728 if (unlikely(i == 0)) 1729 return 0; 1730 /* Check whether completion threshold has been reached. */ 1731 if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH || 1732 (uint16_t)(txq->wqe_ci - txq->mpw_comp) >= 1733 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) { 1734 volatile struct mlx5_wqe *wqe = mpw.wqe; 1735 1736 /* Request completion on last WQE. */ 1737 wqe->ctrl[2] = htonl(8); 1738 /* Save elts_head in unused "immediate" field of WQE. */ 1739 wqe->ctrl[3] = elts_head; 1740 txq->elts_comp = 0; 1741 txq->mpw_comp = txq->wqe_ci; 1742 txq->cq_pi++; 1743 } else { 1744 txq->elts_comp += j; 1745 } 1746 #ifdef MLX5_PMD_SOFT_COUNTERS 1747 /* Increment sent packets counter. */ 1748 txq->stats.opackets += i; 1749 #endif 1750 if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) 1751 mlx5_empw_close(txq, &mpw); 1752 else if (mpw.state == MLX5_MPW_STATE_OPENED) 1753 mlx5_mpw_close(txq, &mpw); 1754 /* Ring QP doorbell. */ 1755 mlx5_tx_dbrec(txq, mpw.wqe); 1756 txq->elts_head = elts_head; 1757 return i; 1758 } 1759 1760 /** 1761 * Translate RX completion flags to packet type. 1762 * 1763 * @param[in] cqe 1764 * Pointer to CQE. 1765 * 1766 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 1767 * 1768 * @return 1769 * Packet type for struct rte_mbuf. 1770 */ 1771 static inline uint32_t 1772 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) 1773 { 1774 uint32_t pkt_type; 1775 uint16_t flags = ntohs(cqe->hdr_type_etc); 1776 1777 if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) { 1778 pkt_type = 1779 TRANSPOSE(flags, 1780 MLX5_CQE_RX_IPV4_PACKET, 1781 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) | 1782 TRANSPOSE(flags, 1783 MLX5_CQE_RX_IPV6_PACKET, 1784 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN); 1785 pkt_type |= ((cqe->pkt_info & MLX5_CQE_RX_OUTER_PACKET) ? 1786 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN : 1787 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN); 1788 } else { 1789 pkt_type = 1790 TRANSPOSE(flags, 1791 MLX5_CQE_L3_HDR_TYPE_IPV6, 1792 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) | 1793 TRANSPOSE(flags, 1794 MLX5_CQE_L3_HDR_TYPE_IPV4, 1795 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN); 1796 } 1797 return pkt_type; 1798 } 1799 1800 /** 1801 * Get size of the next packet for a given CQE. For compressed CQEs, the 1802 * consumer index is updated only once all packets of the current one have 1803 * been processed. 1804 * 1805 * @param rxq 1806 * Pointer to RX queue. 1807 * @param cqe 1808 * CQE to process. 1809 * @param[out] rss_hash 1810 * Packet RSS Hash result. 1811 * 1812 * @return 1813 * Packet size in bytes (0 if there is none), -1 in case of completion 1814 * with error. 1815 */ 1816 static inline int 1817 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe, 1818 uint16_t cqe_cnt, uint32_t *rss_hash) 1819 { 1820 struct rxq_zip *zip = &rxq->zip; 1821 uint16_t cqe_n = cqe_cnt + 1; 1822 int len = 0; 1823 uint16_t idx, end; 1824 1825 /* Process compressed data in the CQE and mini arrays. */ 1826 if (zip->ai) { 1827 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1828 (volatile struct mlx5_mini_cqe8 (*)[8]) 1829 (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt]); 1830 1831 len = ntohl((*mc)[zip->ai & 7].byte_cnt); 1832 *rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result); 1833 if ((++zip->ai & 7) == 0) { 1834 /* Invalidate consumed CQEs */ 1835 idx = zip->ca; 1836 end = zip->na; 1837 while (idx != end) { 1838 (*rxq->cqes)[idx & cqe_cnt].op_own = 1839 MLX5_CQE_INVALIDATE; 1840 ++idx; 1841 } 1842 /* 1843 * Increment consumer index to skip the number of 1844 * CQEs consumed. Hardware leaves holes in the CQ 1845 * ring for software use. 1846 */ 1847 zip->ca = zip->na; 1848 zip->na += 8; 1849 } 1850 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 1851 /* Invalidate the rest */ 1852 idx = zip->ca; 1853 end = zip->cq_ci; 1854 1855 while (idx != end) { 1856 (*rxq->cqes)[idx & cqe_cnt].op_own = 1857 MLX5_CQE_INVALIDATE; 1858 ++idx; 1859 } 1860 rxq->cq_ci = zip->cq_ci; 1861 zip->ai = 0; 1862 } 1863 /* No compressed data, get next CQE and verify if it is compressed. */ 1864 } else { 1865 int ret; 1866 int8_t op_own; 1867 1868 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 1869 if (unlikely(ret == 1)) 1870 return 0; 1871 ++rxq->cq_ci; 1872 op_own = cqe->op_own; 1873 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 1874 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1875 (volatile struct mlx5_mini_cqe8 (*)[8]) 1876 (uintptr_t)(&(*rxq->cqes)[rxq->cq_ci & 1877 cqe_cnt]); 1878 1879 /* Fix endianness. */ 1880 zip->cqe_cnt = ntohl(cqe->byte_cnt); 1881 /* 1882 * Current mini array position is the one returned by 1883 * check_cqe64(). 1884 * 1885 * If completion comprises several mini arrays, as a 1886 * special case the second one is located 7 CQEs after 1887 * the initial CQE instead of 8 for subsequent ones. 1888 */ 1889 zip->ca = rxq->cq_ci; 1890 zip->na = zip->ca + 7; 1891 /* Compute the next non compressed CQE. */ 1892 --rxq->cq_ci; 1893 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 1894 /* Get packet size to return. */ 1895 len = ntohl((*mc)[0].byte_cnt); 1896 *rss_hash = ntohl((*mc)[0].rx_hash_result); 1897 zip->ai = 1; 1898 /* Prefetch all the entries to be invalidated */ 1899 idx = zip->ca; 1900 end = zip->cq_ci; 1901 while (idx != end) { 1902 rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]); 1903 ++idx; 1904 } 1905 } else { 1906 len = ntohl(cqe->byte_cnt); 1907 *rss_hash = ntohl(cqe->rx_hash_res); 1908 } 1909 /* Error while receiving packet. */ 1910 if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR)) 1911 return -1; 1912 } 1913 return len; 1914 } 1915 1916 /** 1917 * Translate RX completion flags to offload flags. 1918 * 1919 * @param[in] rxq 1920 * Pointer to RX queue structure. 1921 * @param[in] cqe 1922 * Pointer to CQE. 1923 * 1924 * @return 1925 * Offload flags (ol_flags) for struct rte_mbuf. 1926 */ 1927 static inline uint32_t 1928 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe) 1929 { 1930 uint32_t ol_flags = 0; 1931 uint16_t flags = ntohs(cqe->hdr_type_etc); 1932 1933 ol_flags = 1934 TRANSPOSE(flags, 1935 MLX5_CQE_RX_L3_HDR_VALID, 1936 PKT_RX_IP_CKSUM_GOOD) | 1937 TRANSPOSE(flags, 1938 MLX5_CQE_RX_L4_HDR_VALID, 1939 PKT_RX_L4_CKSUM_GOOD); 1940 if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) 1941 ol_flags |= 1942 TRANSPOSE(flags, 1943 MLX5_CQE_RX_L3_HDR_VALID, 1944 PKT_RX_IP_CKSUM_GOOD) | 1945 TRANSPOSE(flags, 1946 MLX5_CQE_RX_L4_HDR_VALID, 1947 PKT_RX_L4_CKSUM_GOOD); 1948 return ol_flags; 1949 } 1950 1951 /** 1952 * DPDK callback for RX. 1953 * 1954 * @param dpdk_rxq 1955 * Generic pointer to RX queue structure. 1956 * @param[out] pkts 1957 * Array to store received packets. 1958 * @param pkts_n 1959 * Maximum number of packets in array. 1960 * 1961 * @return 1962 * Number of packets successfully received (<= pkts_n). 1963 */ 1964 uint16_t 1965 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1966 { 1967 struct rxq *rxq = dpdk_rxq; 1968 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 1969 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 1970 const unsigned int sges_n = rxq->sges_n; 1971 struct rte_mbuf *pkt = NULL; 1972 struct rte_mbuf *seg = NULL; 1973 volatile struct mlx5_cqe *cqe = 1974 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1975 unsigned int i = 0; 1976 unsigned int rq_ci = rxq->rq_ci << sges_n; 1977 int len = 0; /* keep its value across iterations. */ 1978 1979 while (pkts_n) { 1980 unsigned int idx = rq_ci & wqe_cnt; 1981 volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx]; 1982 struct rte_mbuf *rep = (*rxq->elts)[idx]; 1983 uint32_t rss_hash_res = 0; 1984 1985 if (pkt) 1986 NEXT(seg) = rep; 1987 seg = rep; 1988 rte_prefetch0(seg); 1989 rte_prefetch0(cqe); 1990 rte_prefetch0(wqe); 1991 rep = rte_mbuf_raw_alloc(rxq->mp); 1992 if (unlikely(rep == NULL)) { 1993 ++rxq->stats.rx_nombuf; 1994 if (!pkt) { 1995 /* 1996 * no buffers before we even started, 1997 * bail out silently. 1998 */ 1999 break; 2000 } 2001 while (pkt != seg) { 2002 assert(pkt != (*rxq->elts)[idx]); 2003 rep = NEXT(pkt); 2004 NEXT(pkt) = NULL; 2005 NB_SEGS(pkt) = 1; 2006 rte_mbuf_raw_free(pkt); 2007 pkt = rep; 2008 } 2009 break; 2010 } 2011 if (!pkt) { 2012 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 2013 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, 2014 &rss_hash_res); 2015 if (!len) { 2016 rte_mbuf_raw_free(rep); 2017 break; 2018 } 2019 if (unlikely(len == -1)) { 2020 /* RX error, packet is likely too large. */ 2021 rte_mbuf_raw_free(rep); 2022 ++rxq->stats.idropped; 2023 goto skip; 2024 } 2025 pkt = seg; 2026 assert(len >= (rxq->crc_present << 2)); 2027 /* Update packet information. */ 2028 pkt->packet_type = 0; 2029 pkt->ol_flags = 0; 2030 if (rss_hash_res && rxq->rss_hash) { 2031 pkt->hash.rss = rss_hash_res; 2032 pkt->ol_flags = PKT_RX_RSS_HASH; 2033 } 2034 if (rxq->mark && 2035 MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { 2036 pkt->ol_flags |= PKT_RX_FDIR; 2037 if (cqe->sop_drop_qpn != 2038 htonl(MLX5_FLOW_MARK_DEFAULT)) { 2039 uint32_t mark = cqe->sop_drop_qpn; 2040 2041 pkt->ol_flags |= PKT_RX_FDIR_ID; 2042 pkt->hash.fdir.hi = 2043 mlx5_flow_mark_get(mark); 2044 } 2045 } 2046 if (rxq->csum | rxq->csum_l2tun) { 2047 pkt->packet_type = rxq_cq_to_pkt_type(cqe); 2048 pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe); 2049 } 2050 if (rxq->vlan_strip && 2051 (cqe->hdr_type_etc & 2052 htons(MLX5_CQE_VLAN_STRIPPED))) { 2053 pkt->ol_flags |= PKT_RX_VLAN_PKT | 2054 PKT_RX_VLAN_STRIPPED; 2055 pkt->vlan_tci = ntohs(cqe->vlan_info); 2056 } 2057 if (rxq->crc_present) 2058 len -= ETHER_CRC_LEN; 2059 PKT_LEN(pkt) = len; 2060 } 2061 DATA_LEN(rep) = DATA_LEN(seg); 2062 PKT_LEN(rep) = PKT_LEN(seg); 2063 SET_DATA_OFF(rep, DATA_OFF(seg)); 2064 NB_SEGS(rep) = NB_SEGS(seg); 2065 PORT(rep) = PORT(seg); 2066 NEXT(rep) = NULL; 2067 (*rxq->elts)[idx] = rep; 2068 /* 2069 * Fill NIC descriptor with the new buffer. The lkey and size 2070 * of the buffers are already known, only the buffer address 2071 * changes. 2072 */ 2073 wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t)); 2074 if (len > DATA_LEN(seg)) { 2075 len -= DATA_LEN(seg); 2076 ++NB_SEGS(pkt); 2077 ++rq_ci; 2078 continue; 2079 } 2080 DATA_LEN(seg) = len; 2081 #ifdef MLX5_PMD_SOFT_COUNTERS 2082 /* Increment bytes counter. */ 2083 rxq->stats.ibytes += PKT_LEN(pkt); 2084 #endif 2085 /* Return packet. */ 2086 *(pkts++) = pkt; 2087 pkt = NULL; 2088 --pkts_n; 2089 ++i; 2090 skip: 2091 /* Align consumer index to the next stride. */ 2092 rq_ci >>= sges_n; 2093 ++rq_ci; 2094 rq_ci <<= sges_n; 2095 } 2096 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) 2097 return 0; 2098 /* Update the consumer index. */ 2099 rxq->rq_ci = rq_ci >> sges_n; 2100 rte_wmb(); 2101 *rxq->cq_db = htonl(rxq->cq_ci); 2102 rte_wmb(); 2103 *rxq->rq_db = htonl(rxq->rq_ci); 2104 #ifdef MLX5_PMD_SOFT_COUNTERS 2105 /* Increment packets counter. */ 2106 rxq->stats.ipackets += i; 2107 #endif 2108 return i; 2109 } 2110 2111 /** 2112 * Dummy DPDK callback for TX. 2113 * 2114 * This function is used to temporarily replace the real callback during 2115 * unsafe control operations on the queue, or in case of error. 2116 * 2117 * @param dpdk_txq 2118 * Generic pointer to TX queue structure. 2119 * @param[in] pkts 2120 * Packets to transmit. 2121 * @param pkts_n 2122 * Number of packets in array. 2123 * 2124 * @return 2125 * Number of packets successfully transmitted (<= pkts_n). 2126 */ 2127 uint16_t 2128 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 2129 { 2130 (void)dpdk_txq; 2131 (void)pkts; 2132 (void)pkts_n; 2133 return 0; 2134 } 2135 2136 /** 2137 * Dummy DPDK callback for RX. 2138 * 2139 * This function is used to temporarily replace the real callback during 2140 * unsafe control operations on the queue, or in case of error. 2141 * 2142 * @param dpdk_rxq 2143 * Generic pointer to RX queue structure. 2144 * @param[out] pkts 2145 * Array to store received packets. 2146 * @param pkts_n 2147 * Maximum number of packets in array. 2148 * 2149 * @return 2150 * Number of packets successfully received (<= pkts_n). 2151 */ 2152 uint16_t 2153 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 2154 { 2155 (void)dpdk_rxq; 2156 (void)pkts; 2157 (void)pkts_n; 2158 return 0; 2159 } 2160 2161 /** 2162 * DPDK callback for rx queue interrupt enable. 2163 * 2164 * @param dev 2165 * Pointer to Ethernet device structure. 2166 * @param rx_queue_id 2167 * RX queue number 2168 * 2169 * @return 2170 * 0 on success, negative on failure. 2171 */ 2172 int 2173 mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id) 2174 { 2175 #ifdef HAVE_UPDATE_CQ_CI 2176 struct priv *priv = mlx5_get_priv(dev); 2177 struct rxq *rxq = (*priv->rxqs)[rx_queue_id]; 2178 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); 2179 struct ibv_cq *cq = rxq_ctrl->cq; 2180 uint16_t ci = rxq->cq_ci; 2181 int ret = 0; 2182 2183 ibv_mlx5_exp_update_cq_ci(cq, ci); 2184 ret = ibv_req_notify_cq(cq, 0); 2185 #else 2186 int ret = -1; 2187 (void)dev; 2188 (void)rx_queue_id; 2189 #endif 2190 if (ret) 2191 WARN("unable to arm interrupt on rx queue %d", rx_queue_id); 2192 return ret; 2193 } 2194 2195 /** 2196 * DPDK callback for rx queue interrupt disable. 2197 * 2198 * @param dev 2199 * Pointer to Ethernet device structure. 2200 * @param rx_queue_id 2201 * RX queue number 2202 * 2203 * @return 2204 * 0 on success, negative on failure. 2205 */ 2206 int 2207 mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id) 2208 { 2209 #ifdef HAVE_UPDATE_CQ_CI 2210 struct priv *priv = mlx5_get_priv(dev); 2211 struct rxq *rxq = (*priv->rxqs)[rx_queue_id]; 2212 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); 2213 struct ibv_cq *cq = rxq_ctrl->cq; 2214 struct ibv_cq *ev_cq; 2215 void *ev_ctx; 2216 int ret = 0; 2217 2218 ret = ibv_get_cq_event(cq->channel, &ev_cq, &ev_ctx); 2219 if (ret || ev_cq != cq) 2220 ret = -1; 2221 else 2222 ibv_ack_cq_events(cq, 1); 2223 #else 2224 int ret = -1; 2225 (void)dev; 2226 (void)rx_queue_id; 2227 #endif 2228 if (ret) 2229 WARN("unable to disable interrupt on rx queue %d", 2230 rx_queue_id); 2231 return ret; 2232 } 2233