/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_memory.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"

/**
 * Manage TX completions.
 *
 * When sending a burst, mlx5_tx_burst() posts several WRs.
 * To improve performance, a completion event is only required once every
 * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
 * for other WRs, but this information would not be used anyway.
 *
 * @param txq
 *   Pointer to TX queue structure.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
txq_complete(struct txq *txq)
{
	unsigned int elts_comp = txq->elts_comp;
	unsigned int elts_tail = txq->elts_tail;
	unsigned int elts_free = txq->elts_tail;
	const unsigned int elts_n = txq->elts_n;
	int wcs_n;

	if (unlikely(elts_comp == 0))
		return 0;
#ifdef DEBUG_SEND
	DEBUG("%p: processing %u work requests completions",
	      (void *)txq, elts_comp);
#endif
	wcs_n = txq->poll_cnt(txq->cq, elts_comp);
	if (unlikely(wcs_n == 0))
		return 0;
	if (unlikely(wcs_n < 0)) {
		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
		      (void *)txq, wcs_n);
		return -1;
	}
	elts_comp -= wcs_n;
	assert(elts_comp <= txq->elts_comp);
	/*
	 * Assume WC status is successful as nothing can be done about it
	 * anyway.
	 */
	elts_tail += wcs_n * txq->elts_comp_cd_init;
	if (elts_tail >= elts_n)
		elts_tail -= elts_n;

	while (elts_free != elts_tail) {
		struct txq_elt *elt = &(*txq->elts)[elts_free];
		unsigned int elts_free_next =
			(((elts_free + 1) == elts_n) ? 0 : elts_free + 1);
		struct rte_mbuf *tmp = elt->buf;
		struct txq_elt *elt_next = &(*txq->elts)[elts_free_next];

#ifndef NDEBUG
		/* Poisoning. */
		memset(elt, 0x66, sizeof(*elt));
#endif
		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
		/* Faster than rte_pktmbuf_free(). */
		do {
			struct rte_mbuf *next = NEXT(tmp);

			rte_pktmbuf_free_seg(tmp);
			tmp = next;
		} while (tmp != NULL);
		elts_free = elts_free_next;
	}

	txq->elts_tail = elts_tail;
	txq->elts_comp = elts_comp;
	return 0;
}

/* For best performance, this function should not be inlined. */
struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, const struct rte_mempool *)
	__attribute__((noinline));

/**
 * Register mempool as a memory region.
 *
 * @param pd
 *   Pointer to protection domain.
 * @param mp
 *   Pointer to memory pool.
 *
 * @return
 *   Memory region pointer, NULL in case of error.
 */
struct ibv_mr *
mlx5_mp2mr(struct ibv_pd *pd, const struct rte_mempool *mp)
{
	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
	uintptr_t start = mp->elt_va_start;
	uintptr_t end = mp->elt_va_end;
	unsigned int i;

	DEBUG("mempool %p area start=%p end=%p size=%zu",
	      (const void *)mp, (void *)start, (void *)end,
	      (size_t)(end - start));
	/* Round start and end to page boundary if found in memory segments. */
	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
		uintptr_t addr = (uintptr_t)ms[i].addr;
		size_t len = ms[i].len;
		unsigned int align = ms[i].hugepage_sz;

		if ((start > addr) && (start < addr + len))
			start = RTE_ALIGN_FLOOR(start, align);
		if ((end > addr) && (end < addr + len))
			end = RTE_ALIGN_CEIL(end, align);
	}
	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
	      (const void *)mp, (void *)start, (void *)end,
	      (size_t)(end - start));
	return ibv_reg_mr(pd,
			  (void *)start,
			  end - start,
			  IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
}

/**
 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
 * the cloned mbuf is allocated is returned instead.
 *
 * @param buf
 *   Pointer to mbuf.
 *
 * @return
 *   Memory pool where data is located for given mbuf.
 */
static struct rte_mempool *
txq_mb2mp(struct rte_mbuf *buf)
{
	if (unlikely(RTE_MBUF_INDIRECT(buf)))
		return rte_mbuf_from_indirect(buf)->pool;
	return buf->pool;
}

/**
 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
 * remove an entry first.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] mp
 *   Memory Pool for which a Memory Region lkey must be returned.
 *
 * @return
 *   mr->lkey on success, (uint32_t)-1 on failure.
 */
static uint32_t
txq_mp2mr(struct txq *txq, const struct rte_mempool *mp)
{
	unsigned int i;
	struct ibv_mr *mr;

	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
			return txq->mp2mr[i].lkey;
		}
	}
	/* Add a new entry, register MR first. */
	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
	      (void *)txq, mp->name, (const void *)mp);
	mr = mlx5_mp2mr(txq->priv->pd, mp);
	if (unlikely(mr == NULL)) {
		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
		      (void *)txq);
		return (uint32_t)-1;
	}
	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
		/* Table is full, remove oldest entry. */
		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
		      (void *)txq);
		--i;
		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
	}
	/* Store the new entry. */
	txq->mp2mr[i].mp = mp;
	txq->mp2mr[i].mr = mr;
	txq->mp2mr[i].lkey = mr->lkey;
	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIx32,
	      (void *)txq, mp->name, (const void *)mp, txq->mp2mr[i].lkey);
	return txq->mp2mr[i].lkey;
}

struct txq_mp2mr_mbuf_check_data {
	const struct rte_mempool *mp;
	int ret;
};

/**
 * Callback function for rte_mempool_obj_iter() to check whether a given
 * mempool object looks like a mbuf.
 *
 * The result is reported through the context structure: ret is set to 0 when
 * the object is a mbuf, -1 otherwise.
 *
 * @param[in, out] arg
 *   Context data (struct txq_mp2mr_mbuf_check_data). Contains mempool pointer
 *   and return value.
 * @param[in] start
 *   Object start address.
 * @param[in] end
 *   Object end address.
 * @param index
 *   Unused.
 */
static void
txq_mp2mr_mbuf_check(void *arg, void *start, void *end,
		     uint32_t index __rte_unused)
{
	struct txq_mp2mr_mbuf_check_data *data = arg;
	struct rte_mbuf *buf =
		(void *)((uintptr_t)start + data->mp->header_size);

	(void)index;
	/* Check whether mbuf structure fits element size and whether mempool
	 * pointer is valid. */
	if (((uintptr_t)end >= (uintptr_t)(buf + 1)) &&
	    (buf->pool == data->mp))
		data->ret = 0;
	else
		data->ret = -1;
}

/**
 * Iterator function for rte_mempool_walk() to register existing mempools and
 * fill the MP to MR cache of a TX queue.
 *
 * @param[in] mp
 *   Memory Pool to register.
 * @param arg
 *   Pointer to TX queue structure.
 */
void
txq_mp2mr_iter(const struct rte_mempool *mp, void *arg)
{
	struct txq *txq = arg;
	struct txq_mp2mr_mbuf_check_data data = {
		.mp = mp,
		.ret = -1,
	};

	/* Discard empty mempools. */
	if (mp->size == 0)
		return;
	/* Register mempool only if the first element looks like a mbuf. */
	rte_mempool_obj_iter((void *)mp->elt_va_start,
			     1,
			     mp->header_size + mp->elt_size + mp->trailer_size,
			     1,
			     mp->elt_pa,
			     mp->pg_num,
			     mp->pg_shift,
			     txq_mp2mr_mbuf_check,
			     &data);
	if (data.ret)
		return;
	txq_mp2mr(txq, mp);
}

/**
 * Insert VLAN using mbuf headroom space.
 *
 * @param buf
 *   Buffer for VLAN insertion.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static inline int
insert_vlan_sw(struct rte_mbuf *buf)
{
	uintptr_t addr;
	uint32_t vlan;
	uint16_t head_room_len = rte_pktmbuf_headroom(buf);

	if (head_room_len < 4)
		return EINVAL;

	addr = rte_pktmbuf_mtod(buf, uintptr_t);
	vlan = htonl(0x81000000 | buf->vlan_tci);
	memmove((void *)(addr - 4), (void *)addr, 12);
	memcpy((void *)(addr + 8), &vlan, sizeof(vlan));

	SET_DATA_OFF(buf, head_room_len - 4);
	DATA_LEN(buf) += 4;

	return 0;
}

#if MLX5_PMD_SGE_WR_N > 1

/**
 * Copy scattered mbuf contents to a single linear buffer.
 *
 * @param[out] linear
 *   Linear output buffer.
 * @param[in] buf
 *   Scattered input buffer.
 *
 * @return
 *   Number of bytes copied to the output buffer or 0 if not large enough.
 */
static unsigned int
linearize_mbuf(linear_t *linear, struct rte_mbuf *buf)
{
	unsigned int size = 0;
	unsigned int offset;

	do {
		unsigned int len = DATA_LEN(buf);

		offset = size;
		size += len;
		if (unlikely(size > sizeof(*linear)))
			return 0;
		memcpy(&(*linear)[offset],
		       rte_pktmbuf_mtod(buf, uint8_t *),
		       len);
		buf = NEXT(buf);
	} while (buf != NULL);
	return size;
}

/**
 * Handle scattered buffers for mlx5_tx_burst().
 *
 * @param txq
 *   TX queue structure.
 * @param segs
 *   Number of segments in buf.
 * @param elt
 *   TX queue element to fill.
 * @param[in] buf
 *   Buffer to process.
 * @param elts_head
 *   Index of the linear buffer to use if necessary (normally txq->elts_head).
 * @param[out] sges
 *   Array filled with SGEs on success.
 *
 * @return
 *   A structure containing the processed packet size in bytes and the
 *   number of SGEs. Both fields are set to (unsigned int)-1 in case of
 *   failure.
 */
static struct tx_burst_sg_ret {
	unsigned int length;
	unsigned int num;
}
tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
	    struct rte_mbuf *buf, unsigned int elts_head,
	    struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N])
{
	unsigned int sent_size = 0;
	unsigned int j;
	int linearize = 0;

	/* When there are too many segments, extra segments are
	 * linearized in the last SGE. */
	if (unlikely(segs > RTE_DIM(*sges))) {
		segs = (RTE_DIM(*sges) - 1);
		linearize = 1;
	}
	/* Update element. */
	elt->buf = buf;
	/* Register segments as SGEs. */
	for (j = 0; (j != segs); ++j) {
		struct ibv_sge *sge = &(*sges)[j];
		uint32_t lkey;

		/* Retrieve Memory Region key for this memory pool. */
		lkey = txq_mp2mr(txq, txq_mb2mp(buf));
		if (unlikely(lkey == (uint32_t)-1)) {
			/* MR does not exist. */
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			/* Clean up TX element. */
			elt->buf = NULL;
			goto stop;
		}
		/* Update SGE. */
		sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
		if (txq->priv->vf)
			rte_prefetch0((volatile void *)
				      (uintptr_t)sge->addr);
		sge->length = DATA_LEN(buf);
		sge->lkey = lkey;
		sent_size += sge->length;
		buf = NEXT(buf);
	}
	/* If buf is not NULL here and is not going to be linearized,
	 * nb_segs is not valid. */
	assert(j == segs);
	assert((buf == NULL) || (linearize));
	/* Linearize extra segments. */
	if (linearize) {
		struct ibv_sge *sge = &(*sges)[segs];
		linear_t *linear = &(*txq->elts_linear)[elts_head];
		unsigned int size = linearize_mbuf(linear, buf);

		assert(segs == (RTE_DIM(*sges) - 1));
		if (size == 0) {
			/* Invalid packet. */
			DEBUG("%p: packet too large to be linearized.",
			      (void *)txq);
			/* Clean up TX element. */
			elt->buf = NULL;
			goto stop;
		}
		/* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */
		if (RTE_DIM(*sges) == 1) {
			do {
				struct rte_mbuf *next = NEXT(buf);

				rte_pktmbuf_free_seg(buf);
				buf = next;
			} while (buf != NULL);
			elt->buf = NULL;
		}
		/* Update SGE. */
		sge->addr = (uintptr_t)&(*linear)[0];
		sge->length = size;
		sge->lkey = txq->mr_linear->lkey;
		sent_size += size;
		/* Include last segment. */
		segs++;
	}
	return (struct tx_burst_sg_ret){
		.length = sent_size,
		.num = segs,
	};
stop:
	return (struct tx_burst_sg_ret){
		.length = -1,
		.num = -1,
	};
}

#endif /* MLX5_PMD_SGE_WR_N > 1 */

/**
 * DPDK callback for TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	unsigned int elts_head = txq->elts_head;
	const unsigned int elts_n = txq->elts_n;
	unsigned int elts_comp_cd = txq->elts_comp_cd;
	unsigned int elts_comp = 0;
	unsigned int i;
	unsigned int max;
	int err;
	struct rte_mbuf *buf = pkts[0];

	assert(elts_comp_cd != 0);
	/* Prefetch first packet cacheline. */
	rte_prefetch0(buf);
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	assert(max >= 1);
	assert(max <= elts_n);
	/* Always leave one free entry in the ring. */
	--max;
	if (max == 0)
		return 0;
	if (max > pkts_n)
		max = pkts_n;
	for (i = 0; (i != max); ++i) {
		struct rte_mbuf *buf_next = pkts[i + 1];
		unsigned int elts_head_next =
			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
		struct txq_elt *elt = &(*txq->elts)[elts_head];
		unsigned int segs = NB_SEGS(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		unsigned int sent_size = 0;
#endif
		uint32_t send_flags = 0;
#ifdef HAVE_VERBS_VLAN_INSERTION
		int insert_vlan = 0;
#endif /* HAVE_VERBS_VLAN_INSERTION */

		if (i + 1 < max)
			rte_prefetch0(buf_next);
		/* Request TX completion. */
		if (unlikely(--elts_comp_cd == 0)) {
			elts_comp_cd = txq->elts_comp_cd_init;
			++elts_comp;
			send_flags |= IBV_EXP_QP_BURST_SIGNALED;
		}
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
			/* HW does not support checksum offloads at arbitrary
			 * offsets but automatically recognizes the packet
			 * type. For inner L3/L4 checksums, only VXLAN (UDP)
			 * tunnels are currently supported. */
			if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
				send_flags |= IBV_EXP_QP_BURST_TUNNEL;
		}
		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
#ifdef HAVE_VERBS_VLAN_INSERTION
			if (!txq->priv->mps)
				insert_vlan = 1;
			else
#endif /* HAVE_VERBS_VLAN_INSERTION */
			{
				err = insert_vlan_sw(buf);
				if (unlikely(err))
					goto stop;
			}
		}
		if (likely(segs == 1)) {
			uintptr_t addr;
			uint32_t length;
			uint32_t lkey;
			uintptr_t buf_next_addr;

			/* Retrieve buffer information. */
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			length = DATA_LEN(buf);
			/* Update element. */
			elt->buf = buf;
			if (txq->priv->vf)
				rte_prefetch0((volatile void *)
					      (uintptr_t)addr);
			/* Prefetch next buffer data. */
			if (i + 1 < max) {
				buf_next_addr =
					rte_pktmbuf_mtod(buf_next, uintptr_t);
				rte_prefetch0((volatile void *)
					      (uintptr_t)buf_next_addr);
			}
			/* Put packet into send queue. */
#if MLX5_PMD_MAX_INLINE > 0
			if (length <= txq->max_inline) {
#ifdef HAVE_VERBS_VLAN_INSERTION
				if (insert_vlan)
					err = txq->send_pending_inline_vlan
						(txq->qp,
						 (void *)addr,
						 length,
						 send_flags,
						 &buf->vlan_tci);
				else
#endif /* HAVE_VERBS_VLAN_INSERTION */
					err = txq->send_pending_inline
						(txq->qp,
						 (void *)addr,
						 length,
						 send_flags);
			} else
#endif
			{
				/* Retrieve Memory Region key for this
				 * memory pool. */
				lkey = txq_mp2mr(txq, txq_mb2mp(buf));
				if (unlikely(lkey == (uint32_t)-1)) {
					/* MR does not exist. */
					DEBUG("%p: unable to get MP <-> MR"
					      " association", (void *)txq);
					/* Clean up TX element. */
					elt->buf = NULL;
					goto stop;
				}
#ifdef HAVE_VERBS_VLAN_INSERTION
				if (insert_vlan)
					err = txq->send_pending_vlan
						(txq->qp,
						 addr,
						 length,
						 lkey,
						 send_flags,
						 &buf->vlan_tci);
				else
#endif /* HAVE_VERBS_VLAN_INSERTION */
					err = txq->send_pending
						(txq->qp,
						 addr,
						 length,
						 lkey,
						 send_flags);
			}
			if (unlikely(err))
				goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
			sent_size += length;
#endif
		} else {
#if MLX5_PMD_SGE_WR_N > 1
			struct ibv_sge sges[MLX5_PMD_SGE_WR_N];
			struct tx_burst_sg_ret ret;

			ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
					  &sges);
			if (ret.length == (unsigned int)-1)
				goto stop;
			/* Put SG list into send queue. */
#ifdef HAVE_VERBS_VLAN_INSERTION
			if (insert_vlan)
				err = txq->send_pending_sg_list_vlan
					(txq->qp,
					 sges,
					 ret.num,
					 send_flags,
					 &buf->vlan_tci);
			else
#endif /* HAVE_VERBS_VLAN_INSERTION */
				err = txq->send_pending_sg_list
					(txq->qp,
					 sges,
					 ret.num,
					 send_flags);
			if (unlikely(err))
				goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
			sent_size += ret.length;
#endif
#else /* MLX5_PMD_SGE_WR_N > 1 */
			DEBUG("%p: TX scattered buffers support not"
			      " compiled in", (void *)txq);
			goto stop;
#endif /* MLX5_PMD_SGE_WR_N > 1 */
		}
		elts_head = elts_head_next;
		buf = buf_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += sent_size;
#endif
	}
stop:
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	err = txq->send_flush(txq->qp);
	if (unlikely(err)) {
		/* A nonzero value is not supposed to be returned.
		 * Nothing can be done about it. */
		DEBUG("%p: send_flush() failed with error %d",
		      (void *)txq, err);
	}
	txq->elts_head = elts_head;
	txq->elts_comp += elts_comp;
	txq->elts_comp_cd = elts_comp_cd;
	return i;
}

/**
 * Translate RX completion flags to packet type.
 *
 * @param flags
 *   RX completion flags returned by poll_length_flags().
 *
 * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(uint32_t flags)
{
	uint32_t pkt_type;

	if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_INNER_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_INNER_L3_IPV6);
	else
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6);
	return pkt_type;
}

/**
 * Translate RX completion flags to offload flags.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param flags
 *   RX completion flags returned by poll_length_flags().
 *
 * @return
 *   Offload flags (ol_flags) for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
{
	uint32_t ol_flags = 0;

	if (rxq->csum) {
		/* Set IP checksum flag only for IPv4/IPv6 packets. */
		if (flags &
		    (IBV_EXP_CQ_RX_IPV4_PACKET | IBV_EXP_CQ_RX_IPV6_PACKET))
			ol_flags |=
				TRANSPOSE(~flags,
					  IBV_EXP_CQ_RX_IP_CSUM_OK,
					  PKT_RX_IP_CKSUM_BAD);
#ifdef HAVE_EXP_CQ_RX_TCP_PACKET
		/* Set L4 checksum flag only for TCP/UDP packets. */
		if (flags &
		    (IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET))
#endif /* HAVE_EXP_CQ_RX_TCP_PACKET */
			ol_flags |=
				TRANSPOSE(~flags,
					  IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
					  PKT_RX_L4_CKSUM_BAD);
	}
	/*
	 * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
	 * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
	 * (its value is 0).
	 */
	if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
		ol_flags |=
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
				  PKT_RX_IP_CKSUM_BAD) |
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
				  PKT_RX_L4_CKSUM_BAD);
	return ol_flags;
}

/**
 * DPDK callback for RX with scattered packets support.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	if (unlikely(!rxq->sp))
		return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n);
	if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
		return 0;
	for (i = 0; (i != pkts_n); ++i) {
		struct rxq_elt_sp *elt = &(*elts)[elts_head];
		unsigned int len;
		unsigned int pkt_buf_len;
		struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
		struct rte_mbuf **pkt_buf_next = &pkt_buf;
		unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
		unsigned int j = 0;
		uint32_t flags;
		uint16_t vlan_tci;

		/* Sanity checks. */
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
		if (unlikely(ret < 0)) {
			struct ibv_wc wc;
			int wcs_n;

			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
			      (void *)rxq, ret);
			/* ibv_poll_cq() must be used in case of failure. */
			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
			if (unlikely(wcs_n == 0))
				break;
			if (unlikely(wcs_n < 0)) {
				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
				      (void *)rxq, wcs_n);
				break;
			}
			assert(wcs_n == 1);
			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
				/* Whatever, just repost the offending WR. */
				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
				      " completion status (%d): %s",
				      (void *)rxq, wc.wr_id, wc.status,
				      ibv_wc_status_str(wc.status));
#ifdef MLX5_PMD_SOFT_COUNTERS
				/* Increment dropped packets counter. */
				++rxq->stats.idropped;
#endif
				goto repost;
			}
			ret = wc.byte_len;
		}
		if (ret == 0)
			break;
		assert(ret >= (rxq->crc_present << 2));
		len = ret - (rxq->crc_present << 2);
		pkt_buf_len = len;
		/*
		 * Replace spent segments with new ones, concatenate and
		 * return them as pkt_buf.
		 */
		while (1) {
			struct ibv_sge *sge = &elt->sges[j];
			struct rte_mbuf *seg = elt->bufs[j];
			struct rte_mbuf *rep;
			unsigned int seg_tailroom;

			assert(seg != NULL);
			/*
			 * Fetch initial bytes of packet descriptor into a
			 * cacheline while allocating rep.
			 */
			rte_prefetch0(seg);
			rep = __rte_mbuf_raw_alloc(rxq->mp);
			if (unlikely(rep == NULL)) {
				/*
				 * Unable to allocate a replacement mbuf,
				 * repost WR.
				 */
				DEBUG("rxq=%p: can't allocate a new mbuf",
				      (void *)rxq);
				if (pkt_buf != NULL) {
					*pkt_buf_next = NULL;
					rte_pktmbuf_free(pkt_buf);
				}
				/* Increment out of memory counters. */
				++rxq->stats.rx_nombuf;
				++rxq->priv->dev->data->rx_mbuf_alloc_failed;
				goto repost;
			}
#ifndef NDEBUG
			/* Poison user-modifiable fields in rep. */
			NEXT(rep) = (void *)((uintptr_t)-1);
			SET_DATA_OFF(rep, 0xdead);
			DATA_LEN(rep) = 0xd00d;
			PKT_LEN(rep) = 0xdeadd00d;
			NB_SEGS(rep) = 0x2a;
			PORT(rep) = 0x2a;
			rep->ol_flags = -1;
#endif
			assert(rep->buf_len == seg->buf_len);
			assert(rep->buf_len == rxq->mb_len);
			/* Reconfigure sge to use rep instead of seg. */
			assert(sge->lkey == rxq->mr->lkey);
			sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
			elt->bufs[j] = rep;
			++j;
			/* Update pkt_buf if it's the first segment, or link
			 * seg to the previous one and update pkt_buf_next. */
			*pkt_buf_next = seg;
			pkt_buf_next = &NEXT(seg);
			/* Update seg information. */
			seg_tailroom = (seg->buf_len - seg_headroom);
			assert(sge->length == seg_tailroom);
			SET_DATA_OFF(seg, seg_headroom);
			if (likely(len <= seg_tailroom)) {
				/* Last segment. */
				DATA_LEN(seg) = len;
				PKT_LEN(seg) = len;
				/* Sanity check. */
				assert(rte_pktmbuf_headroom(seg) ==
				       seg_headroom);
				assert(rte_pktmbuf_tailroom(seg) ==
				       (seg_tailroom - len));
				break;
			}
			DATA_LEN(seg) = seg_tailroom;
			PKT_LEN(seg) = seg_tailroom;
			/* Sanity check. */
			assert(rte_pktmbuf_headroom(seg) == seg_headroom);
			assert(rte_pktmbuf_tailroom(seg) == 0);
			/* Fix len and clear headroom for next segments. */
			len -= seg_tailroom;
			seg_headroom = 0;
		}
		/* Update head and tail segments. */
		*pkt_buf_next = NULL;
		assert(pkt_buf != NULL);
		assert(j != 0);
		NB_SEGS(pkt_buf) = j;
		PORT(pkt_buf) = rxq->port_id;
		PKT_LEN(pkt_buf) = pkt_buf_len;
		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
			pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
			pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
				pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
				pkt_buf->vlan_tci = vlan_tci;
			}
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
		}

		/* Return packet. */
		*(pkts++) = pkt_buf;
		++pkts_ret;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += pkt_buf_len;
#endif
repost:
		ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges));
		if (unlikely(ret)) {
			/* Inability to repost WRs is fatal. */
			DEBUG("%p: recv_sg_list(): failed (ret=%d)",
			      (void *)rxq->priv,
			      ret);
			abort();
		}
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
	rxq->elts_head = elts_head;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += pkts_ret;
#endif
	return pkts_ret;
}

/**
 * DPDK callback for RX.
 *
 * The following function is the same as mlx5_rx_burst_sp(), except it doesn't
 * manage scattered packets. Improves performance when MRU is lower than the
 * size of the first segment.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	struct ibv_sge sges[pkts_n];
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	if (unlikely(rxq->sp))
		return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
	for (i = 0; (i != pkts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[elts_head];
		unsigned int len;
		struct rte_mbuf *seg = elt->buf;
		struct rte_mbuf *rep;
		uint32_t flags;
		uint16_t vlan_tci;

		/* Sanity checks. */
		assert(seg != NULL);
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		/*
		 * Fetch initial bytes of packet descriptor into a
		 * cacheline while allocating rep.
		 */
		rte_prefetch0(seg);
		rte_prefetch0(&seg->cacheline1);
		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
		if (unlikely(ret < 0)) {
			struct ibv_wc wc;
			int wcs_n;

			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
			      (void *)rxq, ret);
			/* ibv_poll_cq() must be used in case of failure. */
			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
			if (unlikely(wcs_n == 0))
				break;
			if (unlikely(wcs_n < 0)) {
				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
				      (void *)rxq, wcs_n);
				break;
			}
			assert(wcs_n == 1);
			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
				/* Whatever, just repost the offending WR. */
				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
				      " completion status (%d): %s",
				      (void *)rxq, wc.wr_id, wc.status,
				      ibv_wc_status_str(wc.status));
#ifdef MLX5_PMD_SOFT_COUNTERS
				/* Increment dropped packets counter. */
				++rxq->stats.idropped;
#endif
				/* Add SGE to array for repost. */
				sges[i] = elt->sge;
				goto repost;
			}
			ret = wc.byte_len;
		}
		if (ret == 0)
			break;
		assert(ret >= (rxq->crc_present << 2));
		len = ret - (rxq->crc_present << 2);
		rep = __rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			/*
			 * Unable to allocate a replacement mbuf,
			 * repost WR.
			 */
			DEBUG("rxq=%p: can't allocate a new mbuf",
			      (void *)rxq);
			/* Increment out of memory counters. */
			++rxq->stats.rx_nombuf;
			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
			goto repost;
		}

		/* Reconfigure sge to use rep instead of seg. */
		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
		assert(elt->sge.lkey == rxq->mr->lkey);
		elt->buf = rep;

		/* Add SGE to array for repost. */
		sges[i] = elt->sge;

		/* Update seg information. */
		SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
		NB_SEGS(seg) = 1;
		PORT(seg) = rxq->port_id;
		NEXT(seg) = NULL;
		PKT_LEN(seg) = len;
		DATA_LEN(seg) = len;
		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
			seg->packet_type = rxq_cq_to_pkt_type(flags);
			seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
				seg->ol_flags |= PKT_RX_VLAN_PKT;
				seg->vlan_tci = vlan_tci;
			}
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
		}
		/* Return packet. */
		*(pkts++) = seg;
		++pkts_ret;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += len;
#endif
repost:
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
	/* Repost WRs. */
#ifdef DEBUG_RECV
	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
#endif
	ret = rxq->recv(rxq->wq, sges, i);
	if (unlikely(ret)) {
		/* Inability to repost WRs is fatal. */
		DEBUG("%p: recv_burst(): failed (ret=%d)",
		      (void *)rxq->priv,
		      ret);
		abort();
	}
	rxq->elts_head = elts_head;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += pkts_ret;
#endif
	return pkts_ret;
}

/**
 * Dummy DPDK callback for TX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/**
 * Dummy DPDK callback for RX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}
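
/*
 * Usage note (illustrative sketch, not part of this driver): applications do
 * not call mlx5_tx_burst()/mlx5_rx_burst() directly. These functions are
 * installed as the device burst callbacks and are reached through the
 * generic ethdev API, roughly as below. Names such as "port_id", "QUEUE"
 * and "BURST_SZ" are placeholders chosen for this example.
 *
 *	struct rte_mbuf *pkts[BURST_SZ];
 *	uint16_t nb_rx, nb_tx, n;
 *
 *	// Receive a burst; ends up in mlx5_rx_burst() or mlx5_rx_burst_sp().
 *	nb_rx = rte_eth_rx_burst(port_id, QUEUE, pkts, BURST_SZ);
 *	// Transmit the same burst; ends up in mlx5_tx_burst().
 *	nb_tx = rte_eth_tx_burst(port_id, QUEUE, pkts, nb_rx);
 *	// Free mbufs the driver could not enqueue.
 *	for (n = nb_tx; n < nb_rx; ++n)
 *		rte_pktmbuf_free(pkts[n]);
 */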