/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_defs.h"
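
/*
 * Note on completion batching (illustrative only): only one send WR out of
 * every elts_comp_cd_init is posted with a completion request, so a single
 * work completion stands for a whole batch of sends. Assuming, for the sake
 * of the example, that elts_comp_cd_init is 4, two completions returned by
 * poll_cnt() (wcs_n == 2) release 2 * 4 = 8 TX elements and advance
 * elts_tail by 8, modulo elts_n.
 */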

/**
 * Manage TX completions.
 *
 * When sending a burst, mlx5_tx_burst() posts several WRs.
 * To improve performance, a completion event is only required once every
 * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
 * for other WRs, but this information would not be used anyway.
 *
 * @param txq
 *   Pointer to TX queue structure.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
txq_complete(struct txq *txq)
{
	unsigned int elts_comp = txq->elts_comp;
	unsigned int elts_tail = txq->elts_tail;
	const unsigned int elts_n = txq->elts_n;
	int wcs_n;

	if (unlikely(elts_comp == 0))
		return 0;
#ifdef DEBUG_SEND
	DEBUG("%p: processing %u work requests completions",
	      (void *)txq, elts_comp);
#endif
	wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp);
	if (unlikely(wcs_n == 0))
		return 0;
	if (unlikely(wcs_n < 0)) {
		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
		      (void *)txq, wcs_n);
		return -1;
	}
	elts_comp -= wcs_n;
	assert(elts_comp <= txq->elts_comp);
	/*
	 * Assume WC status is successful as nothing can be done about it
	 * anyway.
	 */
	elts_tail += wcs_n * txq->elts_comp_cd_init;
	if (elts_tail >= elts_n)
		elts_tail -= elts_n;
	txq->elts_tail = elts_tail;
	txq->elts_comp = elts_comp;
	return 0;
}

/**
 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
 * remove an entry first.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] mp
 *   Memory Pool for which a Memory Region lkey must be returned.
 *
 * @return
 *   mr->lkey on success, (uint32_t)-1 on failure.
 */
static uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
	unsigned int i;
	struct ibv_mr *mr;

	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
			return txq->mp2mr[i].lkey;
		}
	}
	/* Add a new entry, register MR first. */
	DEBUG("%p: discovered new memory pool %p", (void *)txq, (void *)mp);
	mr = ibv_reg_mr(txq->priv->pd,
			(void *)mp->elt_va_start,
			(mp->elt_va_end - mp->elt_va_start),
			(IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
	if (unlikely(mr == NULL)) {
		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
		      (void *)txq);
		return (uint32_t)-1;
	}
	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
		/* Table is full, remove oldest entry. */
		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
		      (void *)txq);
		--i;
		/* Deregister the oldest entry before it is overwritten. */
		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
	}
	/* Store the new entry. */
	txq->mp2mr[i].mp = mp;
	txq->mp2mr[i].mr = mr;
	txq->mp2mr[i].lkey = mr->lkey;
	DEBUG("%p: new MR lkey for MP %p: 0x%08" PRIx32,
	      (void *)txq, (void *)mp, txq->mp2mr[i].lkey);
	return txq->mp2mr[i].lkey;
}

#if MLX5_PMD_SGE_WR_N > 1
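
/*
 * The helpers below are only compiled in when a send WR may carry more than
 * one scatter/gather entry (MLX5_PMD_SGE_WR_N > 1). Multi-segment mbufs are
 * mapped segment by segment onto SGEs; segments beyond the last SGE are
 * copied ("linearized") into a per-element buffer owned by the TX queue,
 * see tx_burst_sg() below.
 */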

/**
 * Copy scattered mbuf contents to a single linear buffer.
 *
 * @param[out] linear
 *   Linear output buffer.
 * @param[in] buf
 *   Scattered input buffer.
 *
 * @return
 *   Number of bytes copied to the output buffer or 0 if not large enough.
 */
static unsigned int
linearize_mbuf(linear_t *linear, struct rte_mbuf *buf)
{
	unsigned int size = 0;
	unsigned int offset;

	do {
		unsigned int len = DATA_LEN(buf);

		offset = size;
		size += len;
		if (unlikely(size > sizeof(*linear)))
			return 0;
		memcpy(&(*linear)[offset],
		       rte_pktmbuf_mtod(buf, uint8_t *),
		       len);
		buf = NEXT(buf);
	} while (buf != NULL);
	return size;
}

/**
 * Handle scattered buffers for mlx5_tx_burst().
 *
 * @param txq
 *   TX queue structure.
 * @param segs
 *   Number of segments in buf.
 * @param elt
 *   TX queue element to fill.
 * @param[in] buf
 *   Buffer to process.
 * @param elts_head
 *   Index of the linear buffer to use if necessary (normally txq->elts_head).
 * @param[out] sges
 *   Array filled with SGEs on success.
 *
 * @return
 *   A structure containing the processed packet size in bytes and the
 *   number of SGEs. Both fields are set to (unsigned int)-1 in case of
 *   failure.
 */
static struct tx_burst_sg_ret {
	unsigned int length;
	unsigned int num;
}
tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
	    struct rte_mbuf *buf, unsigned int elts_head,
	    struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N])
{
	unsigned int sent_size = 0;
	unsigned int j;
	int linearize = 0;

	/* When there are too many segments, extra segments are
	 * linearized in the last SGE. */
	if (unlikely(segs > RTE_DIM(*sges))) {
		segs = (RTE_DIM(*sges) - 1);
		linearize = 1;
	}
	/* Update element. */
	elt->buf = buf;
	/* Register segments as SGEs. */
	for (j = 0; (j != segs); ++j) {
		struct ibv_sge *sge = &(*sges)[j];
		uint32_t lkey;

		/* Retrieve Memory Region key for this memory pool. */
		lkey = txq_mp2mr(txq, buf->pool);
		if (unlikely(lkey == (uint32_t)-1)) {
			/* MR does not exist. */
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			/* Clean up TX element. */
			elt->buf = NULL;
			goto stop;
		}
		/* Update SGE. */
		sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
		if (txq->priv->vf)
			rte_prefetch0((volatile void *)
				      (uintptr_t)sge->addr);
		sge->length = DATA_LEN(buf);
		sge->lkey = lkey;
		sent_size += sge->length;
		buf = NEXT(buf);
	}
	/* If buf is not NULL here and is not going to be linearized,
	 * nb_segs is not valid. */
	assert(j == segs);
	assert((buf == NULL) || (linearize));
	/* Linearize extra segments. */
	if (linearize) {
		struct ibv_sge *sge = &(*sges)[segs];
		linear_t *linear = &(*txq->elts_linear)[elts_head];
		unsigned int size = linearize_mbuf(linear, buf);

		assert(segs == (RTE_DIM(*sges) - 1));
		if (size == 0) {
			/* Invalid packet. */
			DEBUG("%p: packet too large to be linearized.",
			      (void *)txq);
			/* Clean up TX element. */
			elt->buf = NULL;
			goto stop;
		}
		/* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */
		if (RTE_DIM(*sges) == 1) {
			do {
				struct rte_mbuf *next = NEXT(buf);

				rte_pktmbuf_free_seg(buf);
				buf = next;
			} while (buf != NULL);
			elt->buf = NULL;
		}
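		/*
		 * The extra segments have been copied into the TX queue's
		 * own linear buffer for this element; point the last SGE at
		 * it. That buffer is covered by the dedicated mr_linear MR,
		 * so no MP <-> MR lookup is needed here.
		 */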
		/* Update SGE. */
		sge->addr = (uintptr_t)&(*linear)[0];
		sge->length = size;
		sge->lkey = txq->mr_linear->lkey;
		sent_size += size;
	}
	return (struct tx_burst_sg_ret){
		.length = sent_size,
		.num = segs,
	};
stop:
	return (struct tx_burst_sg_ret){
		.length = -1,
		.num = -1,
	};
}

#endif /* MLX5_PMD_SGE_WR_N > 1 */

/**
 * DPDK callback for TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	unsigned int elts_head = txq->elts_head;
	const unsigned int elts_tail = txq->elts_tail;
	const unsigned int elts_n = txq->elts_n;
	unsigned int elts_comp_cd = txq->elts_comp_cd;
	unsigned int elts_comp = 0;
	unsigned int i;
	unsigned int max;
	int err;

	assert(elts_comp_cd != 0);
	txq_complete(txq);
	max = (elts_n - (elts_head - elts_tail));
	if (max > elts_n)
		max -= elts_n;
	assert(max >= 1);
	assert(max <= elts_n);
	/* Always leave one free entry in the ring. */
	--max;
	if (max == 0)
		return 0;
	if (max > pkts_n)
		max = pkts_n;
	for (i = 0; (i != max); ++i) {
		struct rte_mbuf *buf = pkts[i];
		unsigned int elts_head_next =
			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
		struct txq_elt *elt = &(*txq->elts)[elts_head];
		unsigned int segs = NB_SEGS(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		unsigned int sent_size = 0;
#endif
		uint32_t send_flags = 0;

		/* Clean up old buffer. */
		if (likely(elt->buf != NULL)) {
			struct rte_mbuf *tmp = elt->buf;

			/* Faster than rte_pktmbuf_free(). */
			do {
				struct rte_mbuf *next = NEXT(tmp);

				rte_pktmbuf_free_seg(tmp);
				tmp = next;
			} while (tmp != NULL);
		}
		/* Request TX completion. */
		if (unlikely(--elts_comp_cd == 0)) {
			elts_comp_cd = txq->elts_comp_cd_init;
			++elts_comp;
			send_flags |= IBV_EXP_QP_BURST_SIGNALED;
		}
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
			/* HW does not support checksum offloads at arbitrary
			 * offsets but automatically recognizes the packet
			 * type. For inner L3/L4 checksums, only VXLAN (UDP)
			 * tunnels are currently supported. */
			if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
				send_flags |= IBV_EXP_QP_BURST_TUNNEL;
		}
		if (likely(segs == 1)) {
			uintptr_t addr;
			uint32_t length;
			uint32_t lkey;

			/* Retrieve buffer information. */
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			length = DATA_LEN(buf);
			/* Retrieve Memory Region key for this memory pool. */
			lkey = txq_mp2mr(txq, buf->pool);
			if (unlikely(lkey == (uint32_t)-1)) {
				/* MR does not exist. */
				DEBUG("%p: unable to get MP <-> MR"
				      " association", (void *)txq);
				/* Clean up TX element. */
				elt->buf = NULL;
				goto stop;
			}
			/* Update element. */
			elt->buf = buf;
			if (txq->priv->vf)
				rte_prefetch0((volatile void *)
					      (uintptr_t)addr);
			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
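			/*
			 * Two send paths: when inline support is compiled
			 * in and the packet is small enough,
			 * send_pending_inline() takes the data address and
			 * no lkey, the payload presumably being copied into
			 * the WQE; larger packets go through send_pending()
			 * with the lkey retrieved above.
			 */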
			/* Put packet into send queue. */
#if MLX5_PMD_MAX_INLINE > 0
			if (length <= txq->max_inline)
				err = txq->if_qp->send_pending_inline
					(txq->qp,
					 (void *)addr,
					 length,
					 send_flags);
			else
#endif
				err = txq->if_qp->send_pending
					(txq->qp,
					 addr,
					 length,
					 lkey,
					 send_flags);
			if (unlikely(err))
				goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
			sent_size += length;
#endif
		} else {
#if MLX5_PMD_SGE_WR_N > 1
			struct ibv_sge sges[MLX5_PMD_SGE_WR_N];
			struct tx_burst_sg_ret ret;

			ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
					  &sges);
			if (ret.length == (unsigned int)-1)
				goto stop;
			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
			/* Put SG list into send queue. */
			err = txq->if_qp->send_pending_sg_list
				(txq->qp,
				 sges,
				 ret.num,
				 send_flags);
			if (unlikely(err))
				goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
			sent_size += ret.length;
#endif
#else /* MLX5_PMD_SGE_WR_N > 1 */
			DEBUG("%p: TX scattered buffers support not"
			      " compiled in", (void *)txq);
			goto stop;
#endif /* MLX5_PMD_SGE_WR_N > 1 */
		}
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += sent_size;
#endif
	}
stop:
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	err = txq->if_qp->send_flush(txq->qp);
	if (unlikely(err)) {
		/* A nonzero value is not supposed to be returned.
		 * Nothing can be done about it. */
		DEBUG("%p: send_flush() failed with error %d",
		      (void *)txq, err);
	}
	txq->elts_head = elts_head;
	txq->elts_comp += elts_comp;
	txq->elts_comp_cd = elts_comp_cd;
	return i;
}

/**
 * Translate RX completion flags to packet type.
 *
 * @param flags
 *   RX completion flags returned by poll_length_flags().
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(uint32_t flags)
{
	uint32_t pkt_type;

	if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_INNER_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_INNER_L3_IPV6);
	else
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6);
	return pkt_type;
}
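
/*
 * Checksum status is reported by the completion as "OK" bits. The function
 * below transposes the *negated* flags so that a missing
 * IBV_EXP_CQ_RX_IP_CSUM_OK bit becomes PKT_RX_IP_CKSUM_BAD in ol_flags, and
 * likewise for L4 and for the outer headers of tunnelled packets.
 * TRANSPOSE() is assumed to move a bit from a source mask position to a
 * destination mask position (see the PMD headers for its definition).
 */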

/**
 * Translate RX completion flags to offload flags.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param flags
 *   RX completion flags returned by poll_length_flags().
 *
 * @return
 *   Offload flags (ol_flags) for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
{
	uint32_t ol_flags = 0;

	if (rxq->csum)
		ol_flags |=
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_IP_CSUM_OK,
				  PKT_RX_IP_CKSUM_BAD) |
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
				  PKT_RX_L4_CKSUM_BAD);
	/*
	 * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
	 * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
	 * (its value is 0).
	 */
	if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
		ol_flags |=
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
				  PKT_RX_IP_CKSUM_BAD) |
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
				  PKT_RX_L4_CKSUM_BAD);
	return ol_flags;
}

/**
 * DPDK callback for RX with scattered packets support.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	if (unlikely(!rxq->sp))
		return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n);
	if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
		return 0;
	for (i = 0; (i != pkts_n); ++i) {
		struct rxq_elt_sp *elt = &(*elts)[elts_head];
		unsigned int len;
		unsigned int pkt_buf_len;
		struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
		struct rte_mbuf **pkt_buf_next = &pkt_buf;
		unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
		unsigned int j = 0;
		uint32_t flags;

		/* Sanity checks. */
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
						    &flags);
		if (unlikely(ret < 0)) {
			struct ibv_wc wc;
			int wcs_n;

			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
			      (void *)rxq, ret);
			/* ibv_poll_cq() must be used in case of failure. */
			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
			if (unlikely(wcs_n == 0))
				break;
			if (unlikely(wcs_n < 0)) {
				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
				      (void *)rxq, wcs_n);
				break;
			}
			assert(wcs_n == 1);
			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
				/* Whatever, just repost the offending WR. */
				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
				      " completion status (%d): %s",
				      (void *)rxq, wc.wr_id, wc.status,
				      ibv_wc_status_str(wc.status));
#ifdef MLX5_PMD_SOFT_COUNTERS
				/* Increment dropped packets counter. */
				++rxq->stats.idropped;
#endif
				goto repost;
			}
			ret = wc.byte_len;
		}
		if (ret == 0)
			break;
		len = ret;
		pkt_buf_len = len;
		/*
		 * Replace spent segments with new ones, concatenate and
		 * return them as pkt_buf.
		 */
		while (1) {
			struct ibv_sge *sge = &elt->sges[j];
			struct rte_mbuf *seg = elt->bufs[j];
			struct rte_mbuf *rep;
			unsigned int seg_tailroom;

			assert(seg != NULL);
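			/*
			 * RX works by substitution: seg, which currently
			 * holds the received data, is handed over to the
			 * application while a freshly allocated mbuf (rep)
			 * takes its place in the SGE so the WR can be
			 * reposted. If allocation fails, the partially
			 * assembled packet is freed and the element is
			 * reposted with its current SGE list instead.
			 */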
			/*
			 * Fetch initial bytes of packet descriptor into a
			 * cacheline while allocating rep.
			 */
			rte_prefetch0(seg);
			rep = __rte_mbuf_raw_alloc(rxq->mp);
			if (unlikely(rep == NULL)) {
				/*
				 * Unable to allocate a replacement mbuf,
				 * repost WR.
				 */
				DEBUG("rxq=%p: can't allocate a new mbuf",
				      (void *)rxq);
				if (pkt_buf != NULL) {
					*pkt_buf_next = NULL;
					rte_pktmbuf_free(pkt_buf);
				}
				/* Increment out of memory counters. */
				++rxq->stats.rx_nombuf;
				++rxq->priv->dev->data->rx_mbuf_alloc_failed;
				goto repost;
			}
#ifndef NDEBUG
			/* Poison user-modifiable fields in rep. */
			NEXT(rep) = (void *)((uintptr_t)-1);
			SET_DATA_OFF(rep, 0xdead);
			DATA_LEN(rep) = 0xd00d;
			PKT_LEN(rep) = 0xdeadd00d;
			NB_SEGS(rep) = 0x2a;
			PORT(rep) = 0x2a;
			rep->ol_flags = -1;
#endif
			assert(rep->buf_len == seg->buf_len);
			assert(rep->buf_len == rxq->mb_len);
			/* Reconfigure sge to use rep instead of seg. */
			assert(sge->lkey == rxq->mr->lkey);
			sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
			elt->bufs[j] = rep;
			++j;
			/* Update pkt_buf if it's the first segment, or link
			 * seg to the previous one and update pkt_buf_next. */
			*pkt_buf_next = seg;
			pkt_buf_next = &NEXT(seg);
			/* Update seg information. */
			seg_tailroom = (seg->buf_len - seg_headroom);
			assert(sge->length == seg_tailroom);
			SET_DATA_OFF(seg, seg_headroom);
			if (likely(len <= seg_tailroom)) {
				/* Last segment. */
				DATA_LEN(seg) = len;
				PKT_LEN(seg) = len;
				/* Sanity check. */
				assert(rte_pktmbuf_headroom(seg) ==
				       seg_headroom);
				assert(rte_pktmbuf_tailroom(seg) ==
				       (seg_tailroom - len));
				break;
			}
			DATA_LEN(seg) = seg_tailroom;
			PKT_LEN(seg) = seg_tailroom;
			/* Sanity check. */
			assert(rte_pktmbuf_headroom(seg) == seg_headroom);
			assert(rte_pktmbuf_tailroom(seg) == 0);
			/* Fix len and clear headroom for next segments. */
			len -= seg_tailroom;
			seg_headroom = 0;
		}
		/* Update head and tail segments. */
		*pkt_buf_next = NULL;
		assert(pkt_buf != NULL);
		assert(j != 0);
		NB_SEGS(pkt_buf) = j;
		PORT(pkt_buf) = rxq->port_id;
		PKT_LEN(pkt_buf) = pkt_buf_len;
		pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
		pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);

		/* Return packet. */
		*(pkts++) = pkt_buf;
		++pkts_ret;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += pkt_buf_len;
#endif
repost:
		ret = rxq->if_wq->recv_sg_list(rxq->wq,
					       elt->sges,
					       RTE_DIM(elt->sges));
		if (unlikely(ret)) {
			/* Inability to repost WRs is fatal. */
			DEBUG("%p: recv_sg_list(): failed (ret=%d)",
			      (void *)rxq->priv,
			      ret);
			abort();
		}
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
	rxq->elts_head = elts_head;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += pkts_ret;
#endif
	return pkts_ret;
}
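
/*
 * The non-scattered RX path below uses the same mbuf substitution scheme
 * with a single SGE per queue element. Refreshed SGEs are collected in a
 * local array and reposted in a single recv_burst() call after the loop,
 * instead of one recv_sg_list() call per packet as in the scattered path
 * above.
 */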

/**
 * DPDK callback for RX.
 *
 * The following function is the same as mlx5_rx_burst_sp(), except it doesn't
 * manage scattered packets. Improves performance when MRU is lower than the
 * size of the first segment.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	struct ibv_sge sges[pkts_n];
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	if (unlikely(rxq->sp))
		return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
	for (i = 0; (i != pkts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[elts_head];
		unsigned int len;
		struct rte_mbuf *seg = elt->buf;
		struct rte_mbuf *rep;
		uint32_t flags;

		/* Sanity checks. */
		assert(seg != NULL);
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		/*
		 * Fetch initial bytes of packet descriptor into a
		 * cacheline while allocating rep.
		 */
		rte_prefetch0(seg);
		rte_prefetch0(&seg->cacheline1);
		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
						    &flags);
		if (unlikely(ret < 0)) {
			struct ibv_wc wc;
			int wcs_n;

			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
			      (void *)rxq, ret);
			/* ibv_poll_cq() must be used in case of failure. */
			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
			if (unlikely(wcs_n == 0))
				break;
			if (unlikely(wcs_n < 0)) {
				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
				      (void *)rxq, wcs_n);
				break;
			}
			assert(wcs_n == 1);
			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
				/* Whatever, just repost the offending WR. */
				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
				      " completion status (%d): %s",
				      (void *)rxq, wc.wr_id, wc.status,
				      ibv_wc_status_str(wc.status));
#ifdef MLX5_PMD_SOFT_COUNTERS
				/* Increment dropped packets counter. */
				++rxq->stats.idropped;
#endif
				/* Add SGE to array for repost. */
				sges[i] = elt->sge;
				goto repost;
			}
			ret = wc.byte_len;
		}
		if (ret == 0)
			break;
		len = ret;
		rep = __rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			/*
			 * Unable to allocate a replacement mbuf,
			 * repost WR.
			 */
			DEBUG("rxq=%p: can't allocate a new mbuf",
			      (void *)rxq);
			/* Increment out of memory counters. */
			++rxq->stats.rx_nombuf;
			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
			goto repost;
		}

		/* Reconfigure sge to use rep instead of seg. */
		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
		assert(elt->sge.lkey == rxq->mr->lkey);
		elt->buf = rep;

		/* Add SGE to array for repost. */
		sges[i] = elt->sge;

		/* Update seg information. */
		SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
		NB_SEGS(seg) = 1;
		PORT(seg) = rxq->port_id;
		NEXT(seg) = NULL;
		PKT_LEN(seg) = len;
		DATA_LEN(seg) = len;
		seg->packet_type = rxq_cq_to_pkt_type(flags);
		seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);

		/* Return packet. */
		*(pkts++) = seg;
		++pkts_ret;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += len;
#endif
repost:
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
	/* Repost WRs. */
#ifdef DEBUG_RECV
	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
#endif
	ret = rxq->if_wq->recv_burst(rxq->wq, sges, i);
	if (unlikely(ret)) {
		/* Inability to repost WRs is fatal. */
		DEBUG("%p: recv_burst(): failed (ret=%d)",
		      (void *)rxq->priv,
		      ret);
		abort();
	}
	rxq->elts_head = elts_head;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += pkts_ret;
#endif
	return pkts_ret;
}

/**
 * Dummy DPDK callback for TX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/**
 * Dummy DPDK callback for RX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}