/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_memory.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"

/**
 * Manage TX completions.
 *
 * When sending a burst, mlx5_tx_burst() posts several WRs.
 * To improve performance, a completion event is only required once every
 * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
 * for other WRs, but this information would not be used anyway.
 *
 * @param txq
 *   Pointer to TX queue structure.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
txq_complete(struct txq *txq)
{
	unsigned int elts_comp = txq->elts_comp;
	unsigned int elts_tail = txq->elts_tail;
	unsigned int elts_free = txq->elts_tail;
	const unsigned int elts_n = txq->elts_n;
	int wcs_n;

	if (unlikely(elts_comp == 0))
		return 0;
#ifdef DEBUG_SEND
	DEBUG("%p: processing %u work requests completions",
	      (void *)txq, elts_comp);
#endif
	wcs_n = txq->poll_cnt(txq->cq, elts_comp);
	if (unlikely(wcs_n == 0))
		return 0;
	if (unlikely(wcs_n < 0)) {
		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
		      (void *)txq, wcs_n);
		return -1;
	}
	elts_comp -= wcs_n;
	assert(elts_comp <= txq->elts_comp);
	/*
	 * Assume WC status is successful as nothing can be done about it
	 * anyway.
	 */
	elts_tail += wcs_n * txq->elts_comp_cd_init;
	if (elts_tail >= elts_n)
		elts_tail -= elts_n;

	while (elts_free != elts_tail) {
		struct txq_elt *elt = &(*txq->elts)[elts_free];
		unsigned int elts_free_next =
			(((elts_free + 1) == elts_n) ? 0 : elts_free + 1);
		struct rte_mbuf *tmp = elt->buf;
		struct txq_elt *elt_next = &(*txq->elts)[elts_free_next];

		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
		/* Faster than rte_pktmbuf_free(). */
		do {
			struct rte_mbuf *next = NEXT(tmp);

			rte_pktmbuf_free_seg(tmp);
			tmp = next;
		} while (tmp != NULL);
		elts_free = elts_free_next;
	}

	txq->elts_tail = elts_tail;
	txq->elts_comp = elts_comp;
	return 0;
}
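
/*
 * Illustration of the completion accounting above (values are an example
 * only, assuming elts_comp_cd_init was derived from MLX5_PMD_TX_PER_COMP_REQ):
 * with elts_comp_cd_init == 4, a completion is requested for every fourth
 * posted WR, so each work completion returned by poll_cnt() accounts for
 * four TX elements. If poll_cnt() reports wcs_n == 2, elts_tail advances by
 * 8 entries (modulo elts_n) and the mbufs of the freed elements are released.
 */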

/* For best performance, this function should not be inlined. */
struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, const struct rte_mempool *)
	__attribute__((noinline));

/**
 * Register mempool as a memory region.
 *
 * @param pd
 *   Pointer to protection domain.
 * @param mp
 *   Pointer to memory pool.
 *
 * @return
 *   Memory region pointer, NULL in case of error.
 */
struct ibv_mr *
mlx5_mp2mr(struct ibv_pd *pd, const struct rte_mempool *mp)
{
	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
	uintptr_t start = mp->elt_va_start;
	uintptr_t end = mp->elt_va_end;
	unsigned int i;

	DEBUG("mempool %p area start=%p end=%p size=%zu",
	      (const void *)mp, (void *)start, (void *)end,
	      (size_t)(end - start));
	/* Round start and end to page boundary if found in memory segments. */
	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
		uintptr_t addr = (uintptr_t)ms[i].addr;
		size_t len = ms[i].len;
		unsigned int align = ms[i].hugepage_sz;

		if ((start > addr) && (start < addr + len))
			start = RTE_ALIGN_FLOOR(start, align);
		if ((end > addr) && (end < addr + len))
			end = RTE_ALIGN_CEIL(end, align);
	}
	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
	      (const void *)mp, (void *)start, (void *)end,
	      (size_t)(end - start));
	return ibv_reg_mr(pd,
			  (void *)start,
			  end - start,
			  IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
}
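
/*
 * Rounding example for mlx5_mp2mr() (addresses are illustrative only): if the
 * mempool element area spans [0x7f4a00201000, 0x7f4a005ff000) inside a memory
 * segment backed by 2 MiB hugepages, start is rounded down to 0x7f4a00200000
 * and end is rounded up to 0x7f4a00600000, so the resulting MR covers whole
 * hugepages and no element straddles an unregistered boundary.
 */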

/**
 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
 * the cloned mbuf is allocated is returned instead.
 *
 * @param buf
 *   Pointer to mbuf.
 *
 * @return
 *   Memory pool where data is located for given mbuf.
 */
static struct rte_mempool *
txq_mb2mp(struct rte_mbuf *buf)
{
	if (unlikely(RTE_MBUF_INDIRECT(buf)))
		return rte_mbuf_from_indirect(buf)->pool;
	return buf->pool;
}

/**
 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
 * remove an entry first.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] mp
 *   Memory Pool for which a Memory Region lkey must be returned.
 *
 * @return
 *   mr->lkey on success, (uint32_t)-1 on failure.
 */
static uint32_t
txq_mp2mr(struct txq *txq, const struct rte_mempool *mp)
{
	unsigned int i;
	struct ibv_mr *mr;

	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
			return txq->mp2mr[i].lkey;
		}
	}
	/* Add a new entry, register MR first. */
	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
	      (void *)txq, mp->name, (const void *)mp);
	mr = mlx5_mp2mr(txq->priv->pd, mp);
	if (unlikely(mr == NULL)) {
		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
		      (void *)txq);
		return (uint32_t)-1;
	}
	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
		/* Table is full, remove oldest entry. */
		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
		      (void *)txq);
		--i;
		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
	}
	/* Store the new entry. */
	txq->mp2mr[i].mp = mp;
	txq->mp2mr[i].mr = mr;
	txq->mp2mr[i].lkey = mr->lkey;
	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIx32,
	      (void *)txq, mp->name, (const void *)mp, txq->mp2mr[i].lkey);
	return txq->mp2mr[i].lkey;
}
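
/*
 * Note on the cache above: txq->mp2mr[] is filled in order, so lookups scan
 * from the oldest to the most recently registered pool. When the table
 * overflows, the entry at index 0 (the oldest registration) is deregistered,
 * the remaining entries are shifted toward index 0, and the new pool takes
 * the last slot.
 */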

struct txq_mp2mr_mbuf_check_data {
	const struct rte_mempool *mp;
	int ret;
};

/**
 * Callback function for rte_mempool_obj_iter() to check whether a given
 * mempool object looks like a mbuf.
 *
 * @param[in, out] arg
 *   Context data (struct txq_mp2mr_mbuf_check_data). Contains the mempool
 *   pointer and the result: data->ret is set to 0 when the object looks
 *   like a mbuf, to -1 otherwise.
 * @param[in] start
 *   Object start address.
 * @param[in] end
 *   Object end address.
 * @param index
 *   Unused.
 */
static void
txq_mp2mr_mbuf_check(void *arg, void *start, void *end,
		     uint32_t index __rte_unused)
{
	struct txq_mp2mr_mbuf_check_data *data = arg;
	struct rte_mbuf *buf =
		(void *)((uintptr_t)start + data->mp->header_size);

	(void)index;
	/* Check whether mbuf structure fits element size and whether mempool
	 * pointer is valid. */
	if (((uintptr_t)end >= (uintptr_t)(buf + 1)) &&
	    (buf->pool == data->mp))
		data->ret = 0;
	else
		data->ret = -1;
}

/**
 * Iterator function for rte_mempool_walk() to register existing mempools and
 * fill the MP to MR cache of a TX queue.
 *
 * @param[in] mp
 *   Memory Pool to register.
 * @param *arg
 *   Pointer to TX queue structure.
 */
void
txq_mp2mr_iter(const struct rte_mempool *mp, void *arg)
{
	struct txq *txq = arg;
	struct txq_mp2mr_mbuf_check_data data = {
		.mp = mp,
		.ret = -1,
	};

	/* Discard empty mempools. */
	if (mp->size == 0)
		return;
	/* Register mempool only if the first element looks like a mbuf. */
	rte_mempool_obj_iter((void *)mp->elt_va_start,
			     1,
			     mp->header_size + mp->elt_size + mp->trailer_size,
			     1,
			     mp->elt_pa,
			     mp->pg_num,
			     mp->pg_shift,
			     txq_mp2mr_mbuf_check,
			     &data);
	if (data.ret)
		return;
	txq_mp2mr(txq, mp);
}

/**
 * Insert VLAN using mbuf headroom space.
 *
 * @param buf
 *   Buffer for VLAN insertion.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static inline int
insert_vlan_sw(struct rte_mbuf *buf)
{
	uintptr_t addr;
	uint32_t vlan;
	uint16_t head_room_len = rte_pktmbuf_headroom(buf);

	if (head_room_len < 4)
		return EINVAL;

	addr = rte_pktmbuf_mtod(buf, uintptr_t);
	vlan = htonl(0x81000000 | buf->vlan_tci);
	memmove((void *)(addr - 4), (void *)addr, 12);
	memcpy((void *)(addr + 8), &vlan, sizeof(vlan));

	SET_DATA_OFF(buf, head_room_len - 4);
	DATA_LEN(buf) += 4;

	return 0;
}
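
/*
 * Resulting frame layout after insert_vlan_sw() (offsets relative to the new
 * data start, which moved 4 bytes into the headroom):
 *
 *   before: | DMAC(6) | SMAC(6) | EtherType | payload ...
 *   after:  | DMAC(6) | SMAC(6) | 0x8100 | TCI(2) | EtherType | payload ...
 *
 * The 12 address bytes are shifted back by 4 bytes and the 802.1Q tag
 * (TPID 0x8100 followed by the mbuf's vlan_tci) is written at offset 12.
 */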

#if MLX5_PMD_SGE_WR_N > 1

/**
 * Copy scattered mbuf contents to a single linear buffer.
 *
 * @param[out] linear
 *   Linear output buffer.
 * @param[in] buf
 *   Scattered input buffer.
 *
 * @return
 *   Number of bytes copied to the output buffer or 0 if not large enough.
 */
static unsigned int
linearize_mbuf(linear_t *linear, struct rte_mbuf *buf)
{
	unsigned int size = 0;
	unsigned int offset;

	do {
		unsigned int len = DATA_LEN(buf);

		offset = size;
		size += len;
		if (unlikely(size > sizeof(*linear)))
			return 0;
		memcpy(&(*linear)[offset],
		       rte_pktmbuf_mtod(buf, uint8_t *),
		       len);
		buf = NEXT(buf);
	} while (buf != NULL);
	return size;
}

/**
 * Handle scattered buffers for mlx5_tx_burst().
 *
 * @param txq
 *   TX queue structure.
 * @param segs
 *   Number of segments in buf.
 * @param elt
 *   TX queue element to fill.
 * @param[in] buf
 *   Buffer to process.
 * @param elts_head
 *   Index of the linear buffer to use if necessary (normally txq->elts_head).
 * @param[out] sges
 *   Array filled with SGEs on success.
 *
 * @return
 *   A structure containing the processed packet size in bytes and the
 *   number of SGEs. Both fields are set to (unsigned int)-1 in case of
 *   failure.
 */
static struct tx_burst_sg_ret {
	unsigned int length;
	unsigned int num;
}
tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
	    struct rte_mbuf *buf, unsigned int elts_head,
	    struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N])
{
	unsigned int sent_size = 0;
	unsigned int j;
	int linearize = 0;

	/* When there are too many segments, extra segments are
	 * linearized in the last SGE. */
	if (unlikely(segs > RTE_DIM(*sges))) {
		segs = (RTE_DIM(*sges) - 1);
		linearize = 1;
	}
	/* Update element. */
	elt->buf = buf;
	/* Register segments as SGEs. */
	for (j = 0; (j != segs); ++j) {
		struct ibv_sge *sge = &(*sges)[j];
		uint32_t lkey;

		/* Retrieve Memory Region key for this memory pool. */
		lkey = txq_mp2mr(txq, txq_mb2mp(buf));
		if (unlikely(lkey == (uint32_t)-1)) {
			/* MR does not exist. */
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			/* Clean up TX element. */
			elt->buf = NULL;
			goto stop;
		}
		/* Update SGE. */
		sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
		if (txq->priv->vf)
			rte_prefetch0((volatile void *)
				      (uintptr_t)sge->addr);
		sge->length = DATA_LEN(buf);
		sge->lkey = lkey;
		sent_size += sge->length;
		buf = NEXT(buf);
	}
	/* If buf is not NULL here and is not going to be linearized,
	 * nb_segs is not valid. */
	assert(j == segs);
	assert((buf == NULL) || (linearize));
	/* Linearize extra segments. */
	if (linearize) {
		struct ibv_sge *sge = &(*sges)[segs];
		linear_t *linear = &(*txq->elts_linear)[elts_head];
		unsigned int size = linearize_mbuf(linear, buf);

		assert(segs == (RTE_DIM(*sges) - 1));
		if (size == 0) {
			/* Invalid packet. */
			DEBUG("%p: packet too large to be linearized.",
			      (void *)txq);
			/* Clean up TX element. */
			elt->buf = NULL;
			goto stop;
		}
		/* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */
		if (RTE_DIM(*sges) == 1) {
			do {
				struct rte_mbuf *next = NEXT(buf);

				rte_pktmbuf_free_seg(buf);
				buf = next;
			} while (buf != NULL);
			elt->buf = NULL;
		}
		/* Update SGE. */
		sge->addr = (uintptr_t)&(*linear)[0];
		sge->length = size;
		sge->lkey = txq->mr_linear->lkey;
		sent_size += size;
		/* Include last segment. */
		segs++;
	}
	return (struct tx_burst_sg_ret){
		.length = sent_size,
		.num = segs,
	};
stop:
	return (struct tx_burst_sg_ret){
		.length = -1,
		.num = -1,
	};
}

#endif /* MLX5_PMD_SGE_WR_N > 1 */

/**
 * DPDK callback for TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	unsigned int elts_head = txq->elts_head;
	const unsigned int elts_n = txq->elts_n;
	unsigned int elts_comp_cd = txq->elts_comp_cd;
	unsigned int elts_comp = 0;
	unsigned int i;
	unsigned int max;
	int err;
	struct rte_mbuf *buf = pkts[0];

	assert(elts_comp_cd != 0);
	/* Prefetch first packet cacheline. */
	rte_prefetch0(buf);
	txq_complete(txq);
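	/*
	 * The lines below compute how many TX ring entries are free. Indexes
	 * are not masked, so the subtraction relies on unsigned wrap-around;
	 * the result is brought back into [0, elts_n] before use. One entry
	 * is then always kept unused so that a completely full ring can be
	 * told apart from an empty one (head == tail).
	 */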
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	assert(max >= 1);
	assert(max <= elts_n);
	/* Always leave one free entry in the ring. */
	--max;
	if (max == 0)
		return 0;
	if (max > pkts_n)
		max = pkts_n;
	for (i = 0; (i != max); ++i) {
		struct rte_mbuf *buf_next = pkts[i + 1];
		unsigned int elts_head_next =
			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
		struct txq_elt *elt = &(*txq->elts)[elts_head];
		unsigned int segs = NB_SEGS(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		unsigned int sent_size = 0;
#endif
		uint32_t send_flags = 0;
#ifdef HAVE_VERBS_VLAN_INSERTION
		int insert_vlan = 0;
#endif /* HAVE_VERBS_VLAN_INSERTION */

		if (i + 1 < max)
			rte_prefetch0(buf_next);
		/* Request TX completion. */
		if (unlikely(--elts_comp_cd == 0)) {
			elts_comp_cd = txq->elts_comp_cd_init;
			++elts_comp;
			send_flags |= IBV_EXP_QP_BURST_SIGNALED;
		}
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
			/* HW does not support checksum offloads at arbitrary
			 * offsets but automatically recognizes the packet
			 * type. For inner L3/L4 checksums, only VXLAN (UDP)
			 * tunnels are currently supported. */
			if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
				send_flags |= IBV_EXP_QP_BURST_TUNNEL;
		}
		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
#ifdef HAVE_VERBS_VLAN_INSERTION
			if (!txq->priv->mps)
				insert_vlan = 1;
			else
#endif /* HAVE_VERBS_VLAN_INSERTION */
			{
				err = insert_vlan_sw(buf);
				if (unlikely(err))
					goto stop;
			}
		}
		if (likely(segs == 1)) {
			uintptr_t addr;
			uint32_t length;
			uint32_t lkey;
			uintptr_t buf_next_addr;

			/* Retrieve buffer information. */
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			length = DATA_LEN(buf);
			/* Update element. */
			elt->buf = buf;
			if (txq->priv->vf)
				rte_prefetch0((volatile void *)
					      (uintptr_t)addr);
			/* Prefetch next buffer data. */
			if (i + 1 < max) {
				buf_next_addr =
					rte_pktmbuf_mtod(buf_next, uintptr_t);
				rte_prefetch0((volatile void *)
					      (uintptr_t)buf_next_addr);
			}
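			/*
			 * Short packets (up to txq->max_inline bytes, when
			 * inline support is compiled in) are posted through
			 * the *_inline variants below: the payload is passed
			 * by address and no lkey is needed, since it ends up
			 * copied into the WQE rather than referenced through
			 * a Memory Region.
			 */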
			/* Put packet into send queue. */
#if MLX5_PMD_MAX_INLINE > 0
			if (length <= txq->max_inline) {
#ifdef HAVE_VERBS_VLAN_INSERTION
				if (insert_vlan)
					err = txq->send_pending_inline_vlan
						(txq->qp,
						 (void *)addr,
						 length,
						 send_flags,
						 &buf->vlan_tci);
				else
#endif /* HAVE_VERBS_VLAN_INSERTION */
					err = txq->send_pending_inline
						(txq->qp,
						 (void *)addr,
						 length,
						 send_flags);
			} else
#endif
			{
				/* Retrieve Memory Region key for this
				 * memory pool. */
				lkey = txq_mp2mr(txq, txq_mb2mp(buf));
				if (unlikely(lkey == (uint32_t)-1)) {
					/* MR does not exist. */
					DEBUG("%p: unable to get MP <-> MR"
					      " association", (void *)txq);
					/* Clean up TX element. */
					elt->buf = NULL;
					goto stop;
				}
#ifdef HAVE_VERBS_VLAN_INSERTION
				if (insert_vlan)
					err = txq->send_pending_vlan
						(txq->qp,
						 addr,
						 length,
						 lkey,
						 send_flags,
						 &buf->vlan_tci);
				else
#endif /* HAVE_VERBS_VLAN_INSERTION */
					err = txq->send_pending
						(txq->qp,
						 addr,
						 length,
						 lkey,
						 send_flags);
			}
			if (unlikely(err))
				goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
			sent_size += length;
#endif
		} else {
#if MLX5_PMD_SGE_WR_N > 1
			struct ibv_sge sges[MLX5_PMD_SGE_WR_N];
			struct tx_burst_sg_ret ret;

			ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
					  &sges);
			if (ret.length == (unsigned int)-1)
				goto stop;
			/* Put SG list into send queue. */
#ifdef HAVE_VERBS_VLAN_INSERTION
			if (insert_vlan)
				err = txq->send_pending_sg_list_vlan
					(txq->qp,
					 sges,
					 ret.num,
					 send_flags,
					 &buf->vlan_tci);
			else
#endif /* HAVE_VERBS_VLAN_INSERTION */
				err = txq->send_pending_sg_list
					(txq->qp,
					 sges,
					 ret.num,
					 send_flags);
			if (unlikely(err))
				goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
			sent_size += ret.length;
#endif
#else /* MLX5_PMD_SGE_WR_N > 1 */
			DEBUG("%p: TX scattered buffers support not"
			      " compiled in", (void *)txq);
			goto stop;
#endif /* MLX5_PMD_SGE_WR_N > 1 */
		}
		elts_head = elts_head_next;
		buf = buf_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += sent_size;
#endif
	}
stop:
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	err = txq->send_flush(txq->qp);
	if (unlikely(err)) {
		/* A nonzero value is not supposed to be returned.
		 * Nothing can be done about it. */
		DEBUG("%p: send_flush() failed with error %d",
		      (void *)txq, err);
	}
	txq->elts_head = elts_head;
	txq->elts_comp += elts_comp;
	txq->elts_comp_cd = elts_comp_cd;
	return i;
}

/**
 * Translate RX completion flags to packet type.
 *
 * @param flags
 *   RX completion flags returned by poll_length_flags().
 *
 * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(uint32_t flags)
{
	uint32_t pkt_type;

	if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_INNER_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_INNER_L3_IPV6);
	else
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6);
	return pkt_type;
}

/**
 * Translate RX completion flags to offload flags.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param flags
 *   RX completion flags returned by poll_length_flags().
 *
 * @return
 *   Offload flags (ol_flags) for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
{
	uint32_t ol_flags = 0;

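	/*
	 * The checks below apply TRANSPOSE() to the complement of the
	 * completion flags: the hardware reports *_CSUM_OK bits, while mbufs
	 * carry *_CKSUM_BAD bits, so an "OK" bit that is not set in flags
	 * turns into the corresponding *_CKSUM_BAD offload flag.
	 */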
	if (rxq->csum) {
		/* Set IP checksum flag only for IPv4/IPv6 packets. */
		if (flags &
		    (IBV_EXP_CQ_RX_IPV4_PACKET | IBV_EXP_CQ_RX_IPV6_PACKET))
			ol_flags |=
				TRANSPOSE(~flags,
					  IBV_EXP_CQ_RX_IP_CSUM_OK,
					  PKT_RX_IP_CKSUM_BAD);
#ifdef HAVE_EXP_CQ_RX_TCP_PACKET
		/* Set L4 checksum flag only for TCP/UDP packets. */
		if (flags &
		    (IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET))
#endif /* HAVE_EXP_CQ_RX_TCP_PACKET */
			ol_flags |=
				TRANSPOSE(~flags,
					  IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
					  PKT_RX_L4_CKSUM_BAD);
	}
	/*
	 * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
	 * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
	 * (its value is 0).
	 */
	if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
		ol_flags |=
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
				  PKT_RX_IP_CKSUM_BAD) |
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
				  PKT_RX_L4_CKSUM_BAD);
	return ol_flags;
}

/**
 * DPDK callback for RX with scattered packets support.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	if (unlikely(!rxq->sp))
		return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n);
	if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
		return 0;
	for (i = 0; (i != pkts_n); ++i) {
		struct rxq_elt_sp *elt = &(*elts)[elts_head];
		unsigned int len;
		unsigned int pkt_buf_len;
		struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
		struct rte_mbuf **pkt_buf_next = &pkt_buf;
		unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
		unsigned int j = 0;
		uint32_t flags;
		uint16_t vlan_tci;

		/* Sanity checks. */
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
		if (unlikely(ret < 0)) {
			struct ibv_wc wc;
			int wcs_n;

			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
			      (void *)rxq, ret);
			/* ibv_poll_cq() must be used in case of failure. */
			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
			if (unlikely(wcs_n == 0))
				break;
			if (unlikely(wcs_n < 0)) {
				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
				      (void *)rxq, wcs_n);
				break;
			}
			assert(wcs_n == 1);
			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
				/* Whatever, just repost the offending WR. */
				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
				      " completion status (%d): %s",
				      (void *)rxq, wc.wr_id, wc.status,
				      ibv_wc_status_str(wc.status));
#ifdef MLX5_PMD_SOFT_COUNTERS
				/* Increment dropped packets counter. */
				++rxq->stats.idropped;
#endif
				goto repost;
			}
			ret = wc.byte_len;
		}
		if (ret == 0)
			break;
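		/*
		 * When the device is configured to keep the Ethernet FCS
		 * (rxq->crc_present set), the completion byte count includes
		 * those 4 bytes (crc_present << 2); they are subtracted below
		 * so the reported packet length matches the stripped case.
		 */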
		assert(ret >= (rxq->crc_present << 2));
		len = ret - (rxq->crc_present << 2);
		pkt_buf_len = len;
		/*
		 * Replace spent segments with new ones, concatenate and
		 * return them as pkt_buf.
		 */
		while (1) {
			struct ibv_sge *sge = &elt->sges[j];
			struct rte_mbuf *seg = elt->bufs[j];
			struct rte_mbuf *rep;
			unsigned int seg_tailroom;

			assert(seg != NULL);
			/*
			 * Fetch initial bytes of packet descriptor into a
			 * cacheline while allocating rep.
			 */
			rte_prefetch0(seg);
			rep = __rte_mbuf_raw_alloc(rxq->mp);
			if (unlikely(rep == NULL)) {
				/*
				 * Unable to allocate a replacement mbuf,
				 * repost WR.
				 */
				DEBUG("rxq=%p: can't allocate a new mbuf",
				      (void *)rxq);
				if (pkt_buf != NULL) {
					*pkt_buf_next = NULL;
					rte_pktmbuf_free(pkt_buf);
				}
				/* Increment out of memory counters. */
				++rxq->stats.rx_nombuf;
				++rxq->priv->dev->data->rx_mbuf_alloc_failed;
				goto repost;
			}
#ifndef NDEBUG
			/* Poison user-modifiable fields in rep. */
			NEXT(rep) = (void *)((uintptr_t)-1);
			SET_DATA_OFF(rep, 0xdead);
			DATA_LEN(rep) = 0xd00d;
			PKT_LEN(rep) = 0xdeadd00d;
			NB_SEGS(rep) = 0x2a;
			PORT(rep) = 0x2a;
			rep->ol_flags = -1;
#endif
			assert(rep->buf_len == seg->buf_len);
			assert(rep->buf_len == rxq->mb_len);
			/* Reconfigure sge to use rep instead of seg. */
			assert(sge->lkey == rxq->mr->lkey);
			sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
			elt->bufs[j] = rep;
			++j;
			/* Update pkt_buf if it's the first segment, or link
			 * seg to the previous one and update pkt_buf_next. */
			*pkt_buf_next = seg;
			pkt_buf_next = &NEXT(seg);
			/* Update seg information. */
			seg_tailroom = (seg->buf_len - seg_headroom);
			assert(sge->length == seg_tailroom);
			SET_DATA_OFF(seg, seg_headroom);
			if (likely(len <= seg_tailroom)) {
				/* Last segment. */
				DATA_LEN(seg) = len;
				PKT_LEN(seg) = len;
				/* Sanity check. */
				assert(rte_pktmbuf_headroom(seg) ==
				       seg_headroom);
				assert(rte_pktmbuf_tailroom(seg) ==
				       (seg_tailroom - len));
				break;
			}
			DATA_LEN(seg) = seg_tailroom;
			PKT_LEN(seg) = seg_tailroom;
			/* Sanity check. */
			assert(rte_pktmbuf_headroom(seg) == seg_headroom);
			assert(rte_pktmbuf_tailroom(seg) == 0);
			/* Fix len and clear headroom for next segments. */
			len -= seg_tailroom;
			seg_headroom = 0;
		}
		/* Update head and tail segments. */
		*pkt_buf_next = NULL;
		assert(pkt_buf != NULL);
		assert(j != 0);
		NB_SEGS(pkt_buf) = j;
		PORT(pkt_buf) = rxq->port_id;
		PKT_LEN(pkt_buf) = pkt_buf_len;
		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
			pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
			pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
				pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
				pkt_buf->vlan_tci = vlan_tci;
			}
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
		}

		/* Return packet. */
		*(pkts++) = pkt_buf;
		++pkts_ret;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += pkt_buf_len;
#endif
repost:
		ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges));
		if (unlikely(ret)) {
			/* Inability to repost WRs is fatal. */
			DEBUG("%p: recv_sg_list(): failed (ret=%d)",
			      (void *)rxq->priv,
			      ret);
			abort();
		}
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
	rxq->elts_head = elts_head;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += pkts_ret;
#endif
	return pkts_ret;
}

/**
 * DPDK callback for RX.
 *
 * The following function is the same as mlx5_rx_burst_sp(), except it doesn't
 * manage scattered packets. Improves performance when MRU is lower than the
 * size of the first segment.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	struct ibv_sge sges[pkts_n];
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	if (unlikely(rxq->sp))
		return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
	for (i = 0; (i != pkts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[elts_head];
		unsigned int len;
		struct rte_mbuf *seg = elt->buf;
		struct rte_mbuf *rep;
		uint32_t flags;
		uint16_t vlan_tci;

		/* Sanity checks. */
		assert(seg != NULL);
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		/*
		 * Fetch initial bytes of packet descriptor into a
		 * cacheline while allocating rep.
		 */
		rte_prefetch0(seg);
		rte_prefetch0(&seg->cacheline1);
		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
		if (unlikely(ret < 0)) {
			struct ibv_wc wc;
			int wcs_n;

			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
			      (void *)rxq, ret);
			/* ibv_poll_cq() must be used in case of failure. */
			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
			if (unlikely(wcs_n == 0))
				break;
			if (unlikely(wcs_n < 0)) {
				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
				      (void *)rxq, wcs_n);
				break;
			}
			assert(wcs_n == 1);
			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
				/* Whatever, just repost the offending WR. */
				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
				      " completion status (%d): %s",
				      (void *)rxq, wc.wr_id, wc.status,
				      ibv_wc_status_str(wc.status));
#ifdef MLX5_PMD_SOFT_COUNTERS
				/* Increment dropped packets counter. */
				++rxq->stats.idropped;
#endif
				/* Add SGE to array for repost. */
				sges[i] = elt->sge;
				goto repost;
			}
			ret = wc.byte_len;
		}
		if (ret == 0)
			break;
		assert(ret >= (rxq->crc_present << 2));
		len = ret - (rxq->crc_present << 2);
		rep = __rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			/*
			 * Unable to allocate a replacement mbuf,
			 * repost WR.
			 */
			DEBUG("rxq=%p: can't allocate a new mbuf",
			      (void *)rxq);
			/* Increment out of memory counters. */
			++rxq->stats.rx_nombuf;
			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
			goto repost;
		}

		/* Reconfigure sge to use rep instead of seg. */
		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
		assert(elt->sge.lkey == rxq->mr->lkey);
		elt->buf = rep;

		/* Add SGE to array for repost. */
		sges[i] = elt->sge;

		/* Update seg information. */
		SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
		NB_SEGS(seg) = 1;
		PORT(seg) = rxq->port_id;
		NEXT(seg) = NULL;
		PKT_LEN(seg) = len;
		DATA_LEN(seg) = len;
		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
			seg->packet_type = rxq_cq_to_pkt_type(flags);
			seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
				seg->ol_flags |= PKT_RX_VLAN_PKT;
				seg->vlan_tci = vlan_tci;
			}
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
		}
		/* Return packet. */
		*(pkts++) = seg;
		++pkts_ret;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += len;
#endif
repost:
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
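	/*
	 * Unlike mlx5_rx_burst_sp(), which reposts each element as soon as it
	 * is processed, this path collects one SGE per consumed element in
	 * sges[] and reposts them all with a single recv() call below.
	 */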
	/* Repost WRs. */
#ifdef DEBUG_RECV
	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
#endif
	ret = rxq->recv(rxq->wq, sges, i);
	if (unlikely(ret)) {
		/* Inability to repost WRs is fatal. */
		DEBUG("%p: recv_burst(): failed (ret=%d)",
		      (void *)rxq->priv,
		      ret);
		abort();
	}
	rxq->elts_head = elts_head;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += pkts_ret;
#endif
	return pkts_ret;
}

/**
 * Dummy DPDK callback for TX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/**
 * Dummy DPDK callback for RX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}