/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_defs.h"

/**
 * Manage TX completions.
 *
 * When sending a burst, mlx5_tx_burst() posts several WRs.
 * To improve performance, a completion event is only required once every
 * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
 * for other WRs, but this information would not be used anyway.
 *
 * @param txq
 *   Pointer to TX queue structure.
 *
 * @return
 *   0 on success, -1 on failure.
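 *
 * @note
 *   Since a completion is requested only once every txq->elts_comp_cd_init
 *   sends, each work completion returned by poll_cnt() accounts for that
 *   many queue elements when elts_tail is advanced below.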
 */
static int
txq_complete(struct txq *txq)
{
	unsigned int elts_comp = txq->elts_comp;
	unsigned int elts_tail = txq->elts_tail;
	const unsigned int elts_n = txq->elts_n;
	int wcs_n;

	if (unlikely(elts_comp == 0))
		return 0;
#ifdef DEBUG_SEND
	DEBUG("%p: processing %u work request completions",
	      (void *)txq, elts_comp);
#endif
	wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp);
	if (unlikely(wcs_n == 0))
		return 0;
	if (unlikely(wcs_n < 0)) {
		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
		      (void *)txq, wcs_n);
		return -1;
	}
	elts_comp -= wcs_n;
	assert(elts_comp <= txq->elts_comp);
	/*
	 * Assume WC status is successful as nothing can be done about it
	 * anyway.
	 */
	elts_tail += wcs_n * txq->elts_comp_cd_init;
	if (elts_tail >= elts_n)
		elts_tail -= elts_n;
	txq->elts_tail = elts_tail;
	txq->elts_comp = elts_comp;
	return 0;
}

/**
 * Get Memory Pool (MP) from mbuf. If the mbuf is indirect, the pool of the
 * underlying direct mbuf (the one actually holding the data) is returned
 * instead.
 *
 * @param buf
 *   Pointer to mbuf.
 *
 * @return
 *   Memory pool where data is located for given mbuf.
 */
static struct rte_mempool *
txq_mb2mp(struct rte_mbuf *buf)
{
	if (unlikely(RTE_MBUF_INDIRECT(buf)))
		return rte_mbuf_from_indirect(buf)->pool;
	return buf->pool;
}

/**
 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
 * remove an entry first.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] mp
 *   Memory Pool for which a Memory Region lkey must be returned.
 *
 * @return
 *   mr->lkey on success, (uint32_t)-1 on failure.
 */
static uint32_t
txq_mp2mr(struct txq *txq, const struct rte_mempool *mp)
{
	unsigned int i;
	struct ibv_mr *mr;

	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
			return txq->mp2mr[i].lkey;
		}
	}
	/* Add a new entry, register MR first. */
	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
	      (void *)txq, mp->name, (const void *)mp);
	mr = ibv_reg_mr(txq->priv->pd,
			(void *)mp->elt_va_start,
			(mp->elt_va_end - mp->elt_va_start),
			(IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
	if (unlikely(mr == NULL)) {
		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
		      (void *)txq);
		return (uint32_t)-1;
	}
	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
		/* Table is full, remove oldest entry. */
		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
		      (void *)txq);
		--i;
		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
	}
	/* Store the new entry. */
	txq->mp2mr[i].mp = mp;
	txq->mp2mr[i].mr = mr;
	txq->mp2mr[i].lkey = mr->lkey;
	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIx32,
	      (void *)txq, mp->name, (const void *)mp, txq->mp2mr[i].lkey);
	return txq->mp2mr[i].lkey;
}

struct txq_mp2mr_mbuf_check_data {
	const struct rte_mempool *mp;
	int ret;
};

/**
 * Callback function for rte_mempool_obj_iter() to check whether a given
 * mempool object looks like a mbuf.
 *
 * @param[in, out] arg
 *   Context data (struct txq_mp2mr_mbuf_check_data). Contains the mempool
 *   pointer and the check result: data->ret is set to 0 when the object
 *   looks like a mbuf, to -1 otherwise.
 * @param[in] start
 *   Object start address.
 * @param[in] end
 *   Object end address.
 * @param index
 *   Unused.
 */
static void
txq_mp2mr_mbuf_check(void *arg, void *start, void *end,
		     uint32_t index __rte_unused)
{
	struct txq_mp2mr_mbuf_check_data *data = arg;
	struct rte_mbuf *buf =
		(void *)((uintptr_t)start + data->mp->header_size);

	(void)index;
	/* Check whether mbuf structure fits element size and whether mempool
	 * pointer is valid. */
	if (((uintptr_t)end >= (uintptr_t)(buf + 1)) &&
	    (buf->pool == data->mp))
		data->ret = 0;
	else
		data->ret = -1;
}

/**
 * Iterator function for rte_mempool_walk() to register existing mempools and
 * fill the MP to MR cache of a TX queue.
 *
 * @param[in] mp
 *   Memory Pool to register.
 * @param arg
 *   Pointer to TX queue structure.
 */
void
txq_mp2mr_iter(const struct rte_mempool *mp, void *arg)
{
	struct txq *txq = arg;
	struct txq_mp2mr_mbuf_check_data data = {
		.mp = mp,
		.ret = -1,
	};

	/* Discard empty mempools. */
	if (mp->size == 0)
		return;
	/* Register mempool only if the first element looks like a mbuf. */
	rte_mempool_obj_iter((void *)mp->elt_va_start,
			     1,
			     mp->header_size + mp->elt_size + mp->trailer_size,
			     1,
			     mp->elt_pa,
			     mp->pg_num,
			     mp->pg_shift,
			     txq_mp2mr_mbuf_check,
			     &data);
	if (data.ret)
		return;
	txq_mp2mr(txq, mp);
}

#if MLX5_PMD_SGE_WR_N > 1

/**
 * Copy scattered mbuf contents to a single linear buffer.
 *
 * @param[out] linear
 *   Linear output buffer.
 * @param[in] buf
 *   Scattered input buffer.
 *
 * @return
 *   Number of bytes copied to the output buffer or 0 if not large enough.
 */
static unsigned int
linearize_mbuf(linear_t *linear, struct rte_mbuf *buf)
{
	unsigned int size = 0;
	unsigned int offset;

	do {
		unsigned int len = DATA_LEN(buf);

		offset = size;
		size += len;
		if (unlikely(size > sizeof(*linear)))
			return 0;
		memcpy(&(*linear)[offset],
		       rte_pktmbuf_mtod(buf, uint8_t *),
		       len);
		buf = NEXT(buf);
	} while (buf != NULL);
	return size;
}

/**
 * Handle scattered buffers for mlx5_tx_burst().
 *
 * @param txq
 *   TX queue structure.
 * @param segs
 *   Number of segments in buf.
 * @param elt
 *   TX queue element to fill.
 * @param[in] buf
 *   Buffer to process.
 * @param elts_head
 *   Index of the linear buffer to use if necessary (normally txq->elts_head).
 * @param[out] sges
 *   Array filled with SGEs on success.
 *
 * @return
 *   A structure containing the processed packet size in bytes and the
 *   number of SGEs. Both fields are set to (unsigned int)-1 in case of
 *   failure.
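 *
 * @note
 *   When buf has more segments than fit in *sges, the extra segments are
 *   copied by linearize_mbuf() into (*txq->elts_linear)[elts_head] and
 *   posted as a single trailing SGE.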
 */
static struct tx_burst_sg_ret {
	unsigned int length;
	unsigned int num;
}
tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
	    struct rte_mbuf *buf, unsigned int elts_head,
	    struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N])
{
	unsigned int sent_size = 0;
	unsigned int j;
	int linearize = 0;

	/* When there are too many segments, extra segments are
	 * linearized in the last SGE. */
	if (unlikely(segs > RTE_DIM(*sges))) {
		segs = (RTE_DIM(*sges) - 1);
		linearize = 1;
	}
	/* Update element. */
	elt->buf = buf;
	/* Register segments as SGEs. */
	for (j = 0; (j != segs); ++j) {
		struct ibv_sge *sge = &(*sges)[j];
		uint32_t lkey;

		/* Retrieve Memory Region key for this memory pool. */
		lkey = txq_mp2mr(txq, txq_mb2mp(buf));
		if (unlikely(lkey == (uint32_t)-1)) {
			/* MR does not exist. */
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			/* Clean up TX element. */
			elt->buf = NULL;
			goto stop;
		}
		/* Update SGE. */
		sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
		if (txq->priv->vf)
			rte_prefetch0((volatile void *)
				      (uintptr_t)sge->addr);
		sge->length = DATA_LEN(buf);
		sge->lkey = lkey;
		sent_size += sge->length;
		buf = NEXT(buf);
	}
	/* If buf is not NULL here and is not going to be linearized,
	 * nb_segs is not valid. */
	assert(j == segs);
	assert((buf == NULL) || (linearize));
	/* Linearize extra segments. */
	if (linearize) {
		struct ibv_sge *sge = &(*sges)[segs];
		linear_t *linear = &(*txq->elts_linear)[elts_head];
		unsigned int size = linearize_mbuf(linear, buf);

		assert(segs == (RTE_DIM(*sges) - 1));
		if (size == 0) {
			/* Invalid packet. */
			DEBUG("%p: packet too large to be linearized.",
			      (void *)txq);
			/* Clean up TX element. */
			elt->buf = NULL;
			goto stop;
		}
		/* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */
		if (RTE_DIM(*sges) == 1) {
			do {
				struct rte_mbuf *next = NEXT(buf);

				rte_pktmbuf_free_seg(buf);
				buf = next;
			} while (buf != NULL);
			elt->buf = NULL;
		}
		/* Update SGE. */
		sge->addr = (uintptr_t)&(*linear)[0];
		sge->length = size;
		sge->lkey = txq->mr_linear->lkey;
		sent_size += size;
		/* Include last segment. */
		segs++;
	}
	return (struct tx_burst_sg_ret){
		.length = sent_size,
		.num = segs,
	};
stop:
	return (struct tx_burst_sg_ret){
		.length = -1,
		.num = -1,
	};
}

#endif /* MLX5_PMD_SGE_WR_N > 1 */

/**
 * DPDK callback for TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	unsigned int elts_head = txq->elts_head;
	const unsigned int elts_n = txq->elts_n;
	unsigned int elts_comp_cd = txq->elts_comp_cd;
	unsigned int elts_comp = 0;
	unsigned int i;
	unsigned int max;
	int err;

	assert(elts_comp_cd != 0);
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	assert(max >= 1);
	assert(max <= elts_n);
	/* Always leave one free entry in the ring. */
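	/* Keeping one element unused means elts_head == elts_tail can only
	 * happen when the ring is empty, never when it is full. */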
	--max;
	if (max == 0)
		return 0;
	if (max > pkts_n)
		max = pkts_n;
	for (i = 0; (i != max); ++i) {
		struct rte_mbuf *buf = pkts[i];
		unsigned int elts_head_next =
			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
		struct txq_elt *elt = &(*txq->elts)[elts_head];
		unsigned int segs = NB_SEGS(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		unsigned int sent_size = 0;
#endif
		uint32_t send_flags = 0;

		/* Clean up old buffer. */
		if (likely(elt->buf != NULL)) {
			struct rte_mbuf *tmp = elt->buf;

			/* Faster than rte_pktmbuf_free(). */
			do {
				struct rte_mbuf *next = NEXT(tmp);

				rte_pktmbuf_free_seg(tmp);
				tmp = next;
			} while (tmp != NULL);
		}
		/* Request TX completion. */
		if (unlikely(--elts_comp_cd == 0)) {
			elts_comp_cd = txq->elts_comp_cd_init;
			++elts_comp;
			send_flags |= IBV_EXP_QP_BURST_SIGNALED;
		}
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
			/* HW does not support checksum offloads at arbitrary
			 * offsets but automatically recognizes the packet
			 * type. For inner L3/L4 checksums, only VXLAN (UDP)
			 * tunnels are currently supported. */
			if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
				send_flags |= IBV_EXP_QP_BURST_TUNNEL;
		}
		if (likely(segs == 1)) {
			uintptr_t addr;
			uint32_t length;
			uint32_t lkey;

			/* Retrieve buffer information. */
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			length = DATA_LEN(buf);
			/* Retrieve Memory Region key for this memory pool. */
			lkey = txq_mp2mr(txq, txq_mb2mp(buf));
			if (unlikely(lkey == (uint32_t)-1)) {
				/* MR does not exist. */
				DEBUG("%p: unable to get MP <-> MR"
				      " association", (void *)txq);
				/* Clean up TX element. */
				elt->buf = NULL;
				goto stop;
			}
			/* Update element. */
			elt->buf = buf;
			if (txq->priv->vf)
				rte_prefetch0((volatile void *)
					      (uintptr_t)addr);
			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
			/* Put packet into send queue. */
#if MLX5_PMD_MAX_INLINE > 0
			if (length <= txq->max_inline)
				err = txq->if_qp->send_pending_inline
					(txq->qp,
					 (void *)addr,
					 length,
					 send_flags);
			else
#endif
				err = txq->if_qp->send_pending
					(txq->qp,
					 addr,
					 length,
					 lkey,
					 send_flags);
			if (unlikely(err))
				goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
			sent_size += length;
#endif
		} else {
#if MLX5_PMD_SGE_WR_N > 1
			struct ibv_sge sges[MLX5_PMD_SGE_WR_N];
			struct tx_burst_sg_ret ret;

			ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
					  &sges);
			if (ret.length == (unsigned int)-1)
				goto stop;
			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
			/* Put SG list into send queue. */
			err = txq->if_qp->send_pending_sg_list
				(txq->qp,
				 sges,
				 ret.num,
				 send_flags);
			if (unlikely(err))
				goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
			sent_size += ret.length;
#endif
#else /* MLX5_PMD_SGE_WR_N > 1 */
			DEBUG("%p: TX scattered buffers support not"
			      " compiled in", (void *)txq);
			goto stop;
#endif /* MLX5_PMD_SGE_WR_N > 1 */
		}
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += sent_size;
#endif
	}
stop:
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	err = txq->if_qp->send_flush(txq->qp);
	if (unlikely(err)) {
		/* A nonzero value is not supposed to be returned.
		 * Nothing can be done about it. */
		DEBUG("%p: send_flush() failed with error %d",
		      (void *)txq, err);
	}
	txq->elts_head = elts_head;
	txq->elts_comp += elts_comp;
	txq->elts_comp_cd = elts_comp_cd;
	return i;
}

/**
 * Translate RX completion flags to packet type.
 *
 * @param flags
 *   RX completion flags returned by poll_length_flags().
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(uint32_t flags)
{
	uint32_t pkt_type;

	if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_INNER_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_INNER_L3_IPV6);
	else
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6);
	return pkt_type;
}

/**
 * Translate RX completion flags to offload flags.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param flags
 *   RX completion flags returned by poll_length_flags().
 *
 * @return
 *   Offload flags (ol_flags) for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
{
	uint32_t ol_flags = 0;

	if (rxq->csum)
		ol_flags |=
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_IP_CSUM_OK,
				  PKT_RX_IP_CKSUM_BAD) |
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
				  PKT_RX_L4_CKSUM_BAD);
	/*
	 * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
	 * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
	 * (its value is 0).
	 */
	if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
		ol_flags |=
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
				  PKT_RX_IP_CKSUM_BAD) |
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
				  PKT_RX_L4_CKSUM_BAD);
	return ol_flags;
}

/**
 * DPDK callback for RX with scattered packets support.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	if (unlikely(!rxq->sp))
		return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n);
	if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
		return 0;
	for (i = 0; (i != pkts_n); ++i) {
		struct rxq_elt_sp *elt = &(*elts)[elts_head];
		unsigned int len;
		unsigned int pkt_buf_len;
		struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
		struct rte_mbuf **pkt_buf_next = &pkt_buf;
		unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
		unsigned int j = 0;
		uint32_t flags;

		/* Sanity checks. */
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
						    &flags);
		if (unlikely(ret < 0)) {
			struct ibv_wc wc;
			int wcs_n;

			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
			      (void *)rxq, ret);
			/* ibv_poll_cq() must be used in case of failure. */
			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
			if (unlikely(wcs_n == 0))
				break;
			if (unlikely(wcs_n < 0)) {
				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
				      (void *)rxq, wcs_n);
				break;
			}
			assert(wcs_n == 1);
			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
				/* Whatever, just repost the offending WR. */
				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
				      " completion status (%d): %s",
				      (void *)rxq, wc.wr_id, wc.status,
				      ibv_wc_status_str(wc.status));
#ifdef MLX5_PMD_SOFT_COUNTERS
				/* Increment dropped packets counter. */
				++rxq->stats.idropped;
#endif
				goto repost;
			}
			ret = wc.byte_len;
		}
		if (ret == 0)
			break;
		len = ret;
		pkt_buf_len = len;
		/*
		 * Replace spent segments with new ones, concatenate and
		 * return them as pkt_buf.
		 */
		while (1) {
			struct ibv_sge *sge = &elt->sges[j];
			struct rte_mbuf *seg = elt->bufs[j];
			struct rte_mbuf *rep;
			unsigned int seg_tailroom;

			assert(seg != NULL);
			/*
			 * Fetch initial bytes of packet descriptor into a
			 * cacheline while allocating rep.
			 */
			rte_prefetch0(seg);
			rep = __rte_mbuf_raw_alloc(rxq->mp);
			if (unlikely(rep == NULL)) {
				/*
				 * Unable to allocate a replacement mbuf,
				 * repost WR.
				 */
				DEBUG("rxq=%p: can't allocate a new mbuf",
				      (void *)rxq);
				if (pkt_buf != NULL) {
					*pkt_buf_next = NULL;
					rte_pktmbuf_free(pkt_buf);
				}
				/* Increment out of memory counters. */
				++rxq->stats.rx_nombuf;
				++rxq->priv->dev->data->rx_mbuf_alloc_failed;
				goto repost;
			}
#ifndef NDEBUG
			/* Poison user-modifiable fields in rep. */
			NEXT(rep) = (void *)((uintptr_t)-1);
			SET_DATA_OFF(rep, 0xdead);
			DATA_LEN(rep) = 0xd00d;
			PKT_LEN(rep) = 0xdeadd00d;
			NB_SEGS(rep) = 0x2a;
			PORT(rep) = 0x2a;
			rep->ol_flags = -1;
#endif
			assert(rep->buf_len == seg->buf_len);
			assert(rep->buf_len == rxq->mb_len);
			/* Reconfigure sge to use rep instead of seg. */
			assert(sge->lkey == rxq->mr->lkey);
			sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
			elt->bufs[j] = rep;
			++j;
			/* Update pkt_buf if it's the first segment, or link
			 * seg to the previous one and update pkt_buf_next. */
			*pkt_buf_next = seg;
			pkt_buf_next = &NEXT(seg);
			/* Update seg information. */
			seg_tailroom = (seg->buf_len - seg_headroom);
			assert(sge->length == seg_tailroom);
			SET_DATA_OFF(seg, seg_headroom);
			if (likely(len <= seg_tailroom)) {
				/* Last segment. */
				DATA_LEN(seg) = len;
				PKT_LEN(seg) = len;
				/* Sanity check. */
				assert(rte_pktmbuf_headroom(seg) ==
				       seg_headroom);
				assert(rte_pktmbuf_tailroom(seg) ==
				       (seg_tailroom - len));
				break;
			}
			DATA_LEN(seg) = seg_tailroom;
			PKT_LEN(seg) = seg_tailroom;
			/* Sanity check. */
			assert(rte_pktmbuf_headroom(seg) == seg_headroom);
			assert(rte_pktmbuf_tailroom(seg) == 0);
			/* Fix len and clear headroom for next segments. */
			len -= seg_tailroom;
			seg_headroom = 0;
		}
		/* Update head and tail segments. */
		*pkt_buf_next = NULL;
		assert(pkt_buf != NULL);
		assert(j != 0);
		NB_SEGS(pkt_buf) = j;
		PORT(pkt_buf) = rxq->port_id;
		PKT_LEN(pkt_buf) = pkt_buf_len;
		pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
		pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);

		/* Return packet. */
		*(pkts++) = pkt_buf;
		++pkts_ret;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += pkt_buf_len;
#endif
repost:
		ret = rxq->if_wq->recv_sg_list(rxq->wq,
					       elt->sges,
					       RTE_DIM(elt->sges));
		if (unlikely(ret)) {
			/* Inability to repost WRs is fatal. */
			DEBUG("%p: recv_sg_list(): failed (ret=%d)",
			      (void *)rxq->priv,
			      ret);
			abort();
		}
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
	rxq->elts_head = elts_head;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += pkts_ret;
#endif
	return pkts_ret;
}

/**
 * DPDK callback for RX.
 *
 * The following function is the same as mlx5_rx_burst_sp(), except it doesn't
 * manage scattered packets. Improves performance when MRU is lower than the
 * size of the first segment.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	struct ibv_sge sges[pkts_n];
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	if (unlikely(rxq->sp))
		return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
	for (i = 0; (i != pkts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[elts_head];
		unsigned int len;
		struct rte_mbuf *seg = elt->buf;
		struct rte_mbuf *rep;
		uint32_t flags;

		/* Sanity checks. */
		assert(seg != NULL);
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		/*
		 * Fetch initial bytes of packet descriptor into a
		 * cacheline while allocating rep.
		 */
		rte_prefetch0(seg);
		rte_prefetch0(&seg->cacheline1);
		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
						    &flags);
		if (unlikely(ret < 0)) {
			struct ibv_wc wc;
			int wcs_n;

			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
			      (void *)rxq, ret);
			/* ibv_poll_cq() must be used in case of failure. */
			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
			if (unlikely(wcs_n == 0))
				break;
			if (unlikely(wcs_n < 0)) {
				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
				      (void *)rxq, wcs_n);
				break;
			}
			assert(wcs_n == 1);
			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
				/* Whatever, just repost the offending WR. */
				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
				      " completion status (%d): %s",
				      (void *)rxq, wc.wr_id, wc.status,
				      ibv_wc_status_str(wc.status));
#ifdef MLX5_PMD_SOFT_COUNTERS
				/* Increment dropped packets counter. */
				++rxq->stats.idropped;
#endif
				/* Add SGE to array for repost. */
				sges[i] = elt->sge;
				goto repost;
			}
			ret = wc.byte_len;
		}
		if (ret == 0)
			break;
		len = ret;
		rep = __rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			/*
			 * Unable to allocate a replacement mbuf,
			 * repost WR.
			 */
			DEBUG("rxq=%p: can't allocate a new mbuf",
			      (void *)rxq);
			/* Increment out of memory counters. */
			++rxq->stats.rx_nombuf;
			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
			goto repost;
		}

		/* Reconfigure sge to use rep instead of seg. */
		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
		assert(elt->sge.lkey == rxq->mr->lkey);
		elt->buf = rep;

		/* Add SGE to array for repost. */
		sges[i] = elt->sge;

		/* Update seg information. */
		SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
		NB_SEGS(seg) = 1;
		PORT(seg) = rxq->port_id;
		NEXT(seg) = NULL;
		PKT_LEN(seg) = len;
		DATA_LEN(seg) = len;
		seg->packet_type = rxq_cq_to_pkt_type(flags);
		seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);

		/* Return packet. */
		*(pkts++) = seg;
		++pkts_ret;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += len;
#endif
repost:
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
	/* Repost WRs. */
#ifdef DEBUG_RECV
	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
#endif
	ret = rxq->if_wq->recv_burst(rxq->wq, sges, i);
	if (unlikely(ret)) {
		/* Inability to repost WRs is fatal. */
		DEBUG("%p: recv_burst(): failed (ret=%d)",
		      (void *)rxq->priv,
		      ret);
		abort();
	}
	rxq->elts_head = elts_head;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += pkts_ret;
#endif
	return pkts_ret;
}

/**
 * Dummy DPDK callback for TX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/**
 * Dummy DPDK callback for RX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}