1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <assert.h> 35 #include <stdint.h> 36 #include <string.h> 37 #include <stdlib.h> 38 39 /* Verbs header. */ 40 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ 41 #ifdef PEDANTIC 42 #pragma GCC diagnostic ignored "-Wpedantic" 43 #endif 44 #include <infiniband/verbs.h> 45 #include <infiniband/mlx5_hw.h> 46 #include <infiniband/arch.h> 47 #ifdef PEDANTIC 48 #pragma GCC diagnostic error "-Wpedantic" 49 #endif 50 51 /* DPDK headers don't like -pedantic. */ 52 #ifdef PEDANTIC 53 #pragma GCC diagnostic ignored "-Wpedantic" 54 #endif 55 #include <rte_mbuf.h> 56 #include <rte_mempool.h> 57 #include <rte_prefetch.h> 58 #include <rte_common.h> 59 #include <rte_branch_prediction.h> 60 #include <rte_ether.h> 61 #ifdef PEDANTIC 62 #pragma GCC diagnostic error "-Wpedantic" 63 #endif 64 65 #include "mlx5.h" 66 #include "mlx5_utils.h" 67 #include "mlx5_rxtx.h" 68 #include "mlx5_autoconf.h" 69 #include "mlx5_defs.h" 70 #include "mlx5_prm.h" 71 72 static inline int 73 check_cqe(volatile struct mlx5_cqe *cqe, 74 unsigned int cqes_n, const uint16_t ci) 75 __attribute__((always_inline)); 76 77 static inline void 78 txq_complete(struct txq *txq) __attribute__((always_inline)); 79 80 static inline uint32_t 81 txq_mp2mr(struct txq *txq, struct rte_mempool *mp) 82 __attribute__((always_inline)); 83 84 static inline void 85 mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe) 86 __attribute__((always_inline)); 87 88 static inline uint32_t 89 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) 90 __attribute__((always_inline)); 91 92 static inline int 93 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe, 94 uint16_t cqe_cnt, uint32_t *rss_hash) 95 __attribute__((always_inline)); 96 97 static inline uint32_t 98 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe) 99 __attribute__((always_inline)); 100 101 #ifndef NDEBUG 102 103 /** 104 * Verify or set magic value in CQE. 105 * 106 * @param cqe 107 * Pointer to CQE. 108 * 109 * @return 110 * 0 the first time. 111 */ 112 static inline int 113 check_cqe_seen(volatile struct mlx5_cqe *cqe) 114 { 115 static const uint8_t magic[] = "seen"; 116 volatile uint8_t (*buf)[sizeof(cqe->rsvd0)] = &cqe->rsvd0; 117 int ret = 1; 118 unsigned int i; 119 120 for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i) 121 if (!ret || (*buf)[i] != magic[i]) { 122 ret = 0; 123 (*buf)[i] = magic[i]; 124 } 125 return ret; 126 } 127 128 #endif /* NDEBUG */ 129 130 /** 131 * Check whether CQE is valid. 132 * 133 * @param cqe 134 * Pointer to CQE. 135 * @param cqes_n 136 * Size of completion queue. 137 * @param ci 138 * Consumer index. 139 * 140 * @return 141 * 0 on success, 1 on failure. 142 */ 143 static inline int 144 check_cqe(volatile struct mlx5_cqe *cqe, 145 unsigned int cqes_n, const uint16_t ci) 146 { 147 uint16_t idx = ci & cqes_n; 148 uint8_t op_own = cqe->op_own; 149 uint8_t op_owner = MLX5_CQE_OWNER(op_own); 150 uint8_t op_code = MLX5_CQE_OPCODE(op_own); 151 152 if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID))) 153 return 1; /* No CQE. */ 154 #ifndef NDEBUG 155 if ((op_code == MLX5_CQE_RESP_ERR) || 156 (op_code == MLX5_CQE_REQ_ERR)) { 157 volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe; 158 uint8_t syndrome = err_cqe->syndrome; 159 160 if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) || 161 (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR)) 162 return 0; 163 if (!check_cqe_seen(cqe)) 164 ERROR("unexpected CQE error %u (0x%02x)" 165 " syndrome 0x%02x", 166 op_code, op_code, syndrome); 167 return 1; 168 } else if ((op_code != MLX5_CQE_RESP_SEND) && 169 (op_code != MLX5_CQE_REQ)) { 170 if (!check_cqe_seen(cqe)) 171 ERROR("unexpected CQE opcode %u (0x%02x)", 172 op_code, op_code); 173 return 1; 174 } 175 #endif /* NDEBUG */ 176 return 0; 177 } 178 179 /** 180 * Return the address of the WQE. 181 * 182 * @param txq 183 * Pointer to TX queue structure. 184 * @param wqe_ci 185 * WQE consumer index. 186 * 187 * @return 188 * WQE address. 189 */ 190 static inline uintptr_t * 191 tx_mlx5_wqe(struct txq *txq, uint16_t ci) 192 { 193 ci &= ((1 << txq->wqe_n) - 1); 194 return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE); 195 } 196 197 /** 198 * Manage TX completions. 199 * 200 * When sending a burst, mlx5_tx_burst() posts several WRs. 201 * 202 * @param txq 203 * Pointer to TX queue structure. 204 */ 205 static inline void 206 txq_complete(struct txq *txq) 207 { 208 const unsigned int elts_n = 1 << txq->elts_n; 209 const unsigned int cqe_n = 1 << txq->cqe_n; 210 const unsigned int cqe_cnt = cqe_n - 1; 211 uint16_t elts_free = txq->elts_tail; 212 uint16_t elts_tail; 213 uint16_t cq_ci = txq->cq_ci; 214 volatile struct mlx5_cqe *cqe = NULL; 215 volatile struct mlx5_wqe_ctrl *ctrl; 216 217 do { 218 volatile struct mlx5_cqe *tmp; 219 220 tmp = &(*txq->cqes)[cq_ci & cqe_cnt]; 221 if (check_cqe(tmp, cqe_n, cq_ci)) 222 break; 223 cqe = tmp; 224 #ifndef NDEBUG 225 if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) { 226 if (!check_cqe_seen(cqe)) 227 ERROR("unexpected compressed CQE, TX stopped"); 228 return; 229 } 230 if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) || 231 (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) { 232 if (!check_cqe_seen(cqe)) 233 ERROR("unexpected error CQE, TX stopped"); 234 return; 235 } 236 #endif /* NDEBUG */ 237 ++cq_ci; 238 } while (1); 239 if (unlikely(cqe == NULL)) 240 return; 241 txq->wqe_pi = ntohs(cqe->wqe_counter); 242 ctrl = (volatile struct mlx5_wqe_ctrl *) 243 tx_mlx5_wqe(txq, txq->wqe_pi); 244 elts_tail = ctrl->ctrl3; 245 assert(elts_tail < (1 << txq->wqe_n)); 246 /* Free buffers. */ 247 while (elts_free != elts_tail) { 248 struct rte_mbuf *elt = (*txq->elts)[elts_free]; 249 unsigned int elts_free_next = 250 (elts_free + 1) & (elts_n - 1); 251 struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next]; 252 253 #ifndef NDEBUG 254 /* Poisoning. */ 255 memset(&(*txq->elts)[elts_free], 256 0x66, 257 sizeof((*txq->elts)[elts_free])); 258 #endif 259 RTE_MBUF_PREFETCH_TO_FREE(elt_next); 260 /* Only one segment needs to be freed. */ 261 rte_pktmbuf_free_seg(elt); 262 elts_free = elts_free_next; 263 } 264 txq->cq_ci = cq_ci; 265 txq->elts_tail = elts_tail; 266 /* Update the consumer index. */ 267 rte_wmb(); 268 *txq->cq_db = htonl(cq_ci); 269 } 270 271 /** 272 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which 273 * the cloned mbuf is allocated is returned instead. 274 * 275 * @param buf 276 * Pointer to mbuf. 277 * 278 * @return 279 * Memory pool where data is located for given mbuf. 280 */ 281 static struct rte_mempool * 282 txq_mb2mp(struct rte_mbuf *buf) 283 { 284 if (unlikely(RTE_MBUF_INDIRECT(buf))) 285 return rte_mbuf_from_indirect(buf)->pool; 286 return buf->pool; 287 } 288 289 /** 290 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[]. 291 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, 292 * remove an entry first. 293 * 294 * @param txq 295 * Pointer to TX queue structure. 296 * @param[in] mp 297 * Memory Pool for which a Memory Region lkey must be returned. 298 * 299 * @return 300 * mr->lkey on success, (uint32_t)-1 on failure. 301 */ 302 static inline uint32_t 303 txq_mp2mr(struct txq *txq, struct rte_mempool *mp) 304 { 305 unsigned int i; 306 uint32_t lkey = (uint32_t)-1; 307 308 for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { 309 if (unlikely(txq->mp2mr[i].mp == NULL)) { 310 /* Unknown MP, add a new MR for it. */ 311 break; 312 } 313 if (txq->mp2mr[i].mp == mp) { 314 assert(txq->mp2mr[i].lkey != (uint32_t)-1); 315 assert(htonl(txq->mp2mr[i].mr->lkey) == 316 txq->mp2mr[i].lkey); 317 lkey = txq->mp2mr[i].lkey; 318 break; 319 } 320 } 321 if (unlikely(lkey == (uint32_t)-1)) 322 lkey = txq_mp2mr_reg(txq, mp, i); 323 return lkey; 324 } 325 326 /** 327 * Ring TX queue doorbell. 328 * 329 * @param txq 330 * Pointer to TX queue structure. 331 * @param wqe 332 * Pointer to the last WQE posted in the NIC. 333 */ 334 static inline void 335 mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe) 336 { 337 uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg); 338 volatile uint64_t *src = ((volatile uint64_t *)wqe); 339 340 rte_wmb(); 341 *txq->qp_db = htonl(txq->wqe_ci); 342 /* Ensure ordering between DB record and BF copy. */ 343 rte_wmb(); 344 *dst = *src; 345 } 346 347 /** 348 * DPDK callback for TX. 349 * 350 * @param dpdk_txq 351 * Generic pointer to TX queue structure. 352 * @param[in] pkts 353 * Packets to transmit. 354 * @param pkts_n 355 * Number of packets in array. 356 * 357 * @return 358 * Number of packets successfully transmitted (<= pkts_n). 359 */ 360 uint16_t 361 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 362 { 363 struct txq *txq = (struct txq *)dpdk_txq; 364 uint16_t elts_head = txq->elts_head; 365 const unsigned int elts_n = 1 << txq->elts_n; 366 unsigned int i = 0; 367 unsigned int j = 0; 368 unsigned int max; 369 uint16_t max_wqe; 370 unsigned int comp; 371 volatile struct mlx5_wqe_v *wqe = NULL; 372 unsigned int segs_n = 0; 373 struct rte_mbuf *buf = NULL; 374 uint8_t *raw; 375 376 if (unlikely(!pkts_n)) 377 return 0; 378 /* Prefetch first packet cacheline. */ 379 rte_prefetch0(*pkts); 380 /* Start processing. */ 381 txq_complete(txq); 382 max = (elts_n - (elts_head - txq->elts_tail)); 383 if (max > elts_n) 384 max -= elts_n; 385 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 386 if (unlikely(!max_wqe)) 387 return 0; 388 do { 389 volatile rte_v128u32_t *dseg = NULL; 390 uint32_t length; 391 unsigned int ds = 0; 392 uintptr_t addr; 393 uint64_t naddr; 394 uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2; 395 uint16_t ehdr; 396 uint8_t cs_flags = 0; 397 #ifdef MLX5_PMD_SOFT_COUNTERS 398 uint32_t total_length = 0; 399 #endif 400 401 /* first_seg */ 402 buf = *(pkts++); 403 segs_n = buf->nb_segs; 404 /* 405 * Make sure there is enough room to store this packet and 406 * that one ring entry remains unused. 407 */ 408 assert(segs_n); 409 if (max < segs_n + 1) 410 break; 411 max -= segs_n; 412 --segs_n; 413 if (!segs_n) 414 --pkts_n; 415 if (unlikely(--max_wqe == 0)) 416 break; 417 wqe = (volatile struct mlx5_wqe_v *) 418 tx_mlx5_wqe(txq, txq->wqe_ci); 419 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 420 if (pkts_n > 1) 421 rte_prefetch0(*pkts); 422 addr = rte_pktmbuf_mtod(buf, uintptr_t); 423 length = DATA_LEN(buf); 424 ehdr = (((uint8_t *)addr)[1] << 8) | 425 ((uint8_t *)addr)[0]; 426 #ifdef MLX5_PMD_SOFT_COUNTERS 427 total_length = length; 428 #endif 429 assert(length >= MLX5_WQE_DWORD_SIZE); 430 /* Update element. */ 431 (*txq->elts)[elts_head] = buf; 432 elts_head = (elts_head + 1) & (elts_n - 1); 433 /* Prefetch next buffer data. */ 434 if (pkts_n > 1) { 435 volatile void *pkt_addr; 436 437 pkt_addr = rte_pktmbuf_mtod(*pkts, volatile void *); 438 rte_prefetch0(pkt_addr); 439 } 440 /* Should we enable HW CKSUM offload */ 441 if (buf->ol_flags & 442 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { 443 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; 444 } 445 raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE; 446 /* Replace the Ethernet type by the VLAN if necessary. */ 447 if (buf->ol_flags & PKT_TX_VLAN_PKT) { 448 uint32_t vlan = htonl(0x81000000 | buf->vlan_tci); 449 unsigned int len = 2 * ETHER_ADDR_LEN - 2; 450 451 addr += 2; 452 length -= 2; 453 /* Copy Destination and source mac address. */ 454 memcpy((uint8_t *)raw, ((uint8_t *)addr), len); 455 /* Copy VLAN. */ 456 memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan)); 457 /* Copy missing two bytes to end the DSeg. */ 458 memcpy((uint8_t *)raw + len + sizeof(vlan), 459 ((uint8_t *)addr) + len, 2); 460 addr += len + 2; 461 length -= (len + 2); 462 } else { 463 memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2, 464 MLX5_WQE_DWORD_SIZE); 465 length -= pkt_inline_sz; 466 addr += pkt_inline_sz; 467 } 468 /* Inline if enough room. */ 469 if (txq->max_inline) { 470 uintptr_t end = (uintptr_t) 471 (((uintptr_t)txq->wqes) + 472 (1 << txq->wqe_n) * MLX5_WQE_SIZE); 473 unsigned int max_inline = txq->max_inline * 474 RTE_CACHE_LINE_SIZE - 475 MLX5_WQE_DWORD_SIZE; 476 uintptr_t addr_end = (addr + max_inline) & 477 ~(RTE_CACHE_LINE_SIZE - 1); 478 unsigned int copy_b = (addr_end > addr) ? 479 RTE_MIN((addr_end - addr), length) : 480 0; 481 482 raw += MLX5_WQE_DWORD_SIZE; 483 if (copy_b && ((end - (uintptr_t)raw) > copy_b)) { 484 /* 485 * One Dseg remains in the current WQE. To 486 * keep the computation positive, it is 487 * removed after the bytes to Dseg conversion. 488 */ 489 uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4; 490 491 if (unlikely(max_wqe < n)) 492 break; 493 max_wqe -= n; 494 rte_memcpy((void *)raw, (void *)addr, copy_b); 495 addr += copy_b; 496 length -= copy_b; 497 pkt_inline_sz += copy_b; 498 } 499 /* 500 * 2 DWORDs consumed by the WQE header + ETH segment + 501 * the size of the inline part of the packet. 502 */ 503 ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2); 504 if (length > 0) { 505 if (ds % (MLX5_WQE_SIZE / 506 MLX5_WQE_DWORD_SIZE) == 0) { 507 if (unlikely(--max_wqe == 0)) 508 break; 509 dseg = (volatile rte_v128u32_t *) 510 tx_mlx5_wqe(txq, txq->wqe_ci + 511 ds / 4); 512 } else { 513 dseg = (volatile rte_v128u32_t *) 514 ((uintptr_t)wqe + 515 (ds * MLX5_WQE_DWORD_SIZE)); 516 } 517 goto use_dseg; 518 } else if (!segs_n) { 519 goto next_pkt; 520 } else { 521 /* dseg will be advance as part of next_seg */ 522 dseg = (volatile rte_v128u32_t *) 523 ((uintptr_t)wqe + 524 ((ds - 1) * MLX5_WQE_DWORD_SIZE)); 525 goto next_seg; 526 } 527 } else { 528 /* 529 * No inline has been done in the packet, only the 530 * Ethernet Header as been stored. 531 */ 532 dseg = (volatile rte_v128u32_t *) 533 ((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE)); 534 ds = 3; 535 use_dseg: 536 /* Add the remaining packet as a simple ds. */ 537 naddr = htonll(addr); 538 *dseg = (rte_v128u32_t){ 539 htonl(length), 540 txq_mp2mr(txq, txq_mb2mp(buf)), 541 naddr, 542 naddr >> 32, 543 }; 544 ++ds; 545 if (!segs_n) 546 goto next_pkt; 547 } 548 next_seg: 549 assert(buf); 550 assert(ds); 551 assert(wqe); 552 /* 553 * Spill on next WQE when the current one does not have 554 * enough room left. Size of WQE must a be a multiple 555 * of data segment size. 556 */ 557 assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE)); 558 if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) { 559 if (unlikely(--max_wqe == 0)) 560 break; 561 dseg = (volatile rte_v128u32_t *) 562 tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4); 563 rte_prefetch0(tx_mlx5_wqe(txq, 564 txq->wqe_ci + ds / 4 + 1)); 565 } else { 566 ++dseg; 567 } 568 ++ds; 569 buf = buf->next; 570 assert(buf); 571 length = DATA_LEN(buf); 572 #ifdef MLX5_PMD_SOFT_COUNTERS 573 total_length += length; 574 #endif 575 /* Store segment information. */ 576 naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)); 577 *dseg = (rte_v128u32_t){ 578 htonl(length), 579 txq_mp2mr(txq, txq_mb2mp(buf)), 580 naddr, 581 naddr >> 32, 582 }; 583 (*txq->elts)[elts_head] = buf; 584 elts_head = (elts_head + 1) & (elts_n - 1); 585 ++j; 586 --segs_n; 587 if (segs_n) 588 goto next_seg; 589 else 590 --pkts_n; 591 next_pkt: 592 ++i; 593 /* Initialize known and common part of the WQE structure. */ 594 wqe->ctrl = (rte_v128u32_t){ 595 htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND), 596 htonl(txq->qp_num_8s | ds), 597 0, 598 0, 599 }; 600 wqe->eseg = (rte_v128u32_t){ 601 0, 602 cs_flags, 603 0, 604 (ehdr << 16) | htons(pkt_inline_sz), 605 }; 606 txq->wqe_ci += (ds + 3) / 4; 607 #ifdef MLX5_PMD_SOFT_COUNTERS 608 /* Increment sent bytes counter. */ 609 txq->stats.obytes += total_length; 610 #endif 611 } while (pkts_n); 612 /* Take a shortcut if nothing must be sent. */ 613 if (unlikely(i == 0)) 614 return 0; 615 /* Check whether completion threshold has been reached. */ 616 comp = txq->elts_comp + i + j; 617 if (comp >= MLX5_TX_COMP_THRESH) { 618 volatile struct mlx5_wqe_ctrl *w = 619 (volatile struct mlx5_wqe_ctrl *)wqe; 620 621 /* Request completion on last WQE. */ 622 w->ctrl2 = htonl(8); 623 /* Save elts_head in unused "immediate" field of WQE. */ 624 w->ctrl3 = elts_head; 625 txq->elts_comp = 0; 626 } else { 627 txq->elts_comp = comp; 628 } 629 #ifdef MLX5_PMD_SOFT_COUNTERS 630 /* Increment sent packets counter. */ 631 txq->stats.opackets += i; 632 #endif 633 /* Ring QP doorbell. */ 634 mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)wqe); 635 txq->elts_head = elts_head; 636 return i; 637 } 638 639 /** 640 * Open a MPW session. 641 * 642 * @param txq 643 * Pointer to TX queue structure. 644 * @param mpw 645 * Pointer to MPW session structure. 646 * @param length 647 * Packet length. 648 */ 649 static inline void 650 mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length) 651 { 652 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); 653 volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] = 654 (volatile struct mlx5_wqe_data_seg (*)[]) 655 tx_mlx5_wqe(txq, idx + 1); 656 657 mpw->state = MLX5_MPW_STATE_OPENED; 658 mpw->pkts_n = 0; 659 mpw->len = length; 660 mpw->total_len = 0; 661 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); 662 mpw->wqe->eseg.mss = htons(length); 663 mpw->wqe->eseg.inline_hdr_sz = 0; 664 mpw->wqe->eseg.rsvd0 = 0; 665 mpw->wqe->eseg.rsvd1 = 0; 666 mpw->wqe->eseg.rsvd2 = 0; 667 mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) | 668 (txq->wqe_ci << 8) | MLX5_OPCODE_TSO); 669 mpw->wqe->ctrl[2] = 0; 670 mpw->wqe->ctrl[3] = 0; 671 mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *) 672 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE)); 673 mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *) 674 (((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE)); 675 mpw->data.dseg[2] = &(*dseg)[0]; 676 mpw->data.dseg[3] = &(*dseg)[1]; 677 mpw->data.dseg[4] = &(*dseg)[2]; 678 } 679 680 /** 681 * Close a MPW session. 682 * 683 * @param txq 684 * Pointer to TX queue structure. 685 * @param mpw 686 * Pointer to MPW session structure. 687 */ 688 static inline void 689 mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw) 690 { 691 unsigned int num = mpw->pkts_n; 692 693 /* 694 * Store size in multiple of 16 bytes. Control and Ethernet segments 695 * count as 2. 696 */ 697 mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num)); 698 mpw->state = MLX5_MPW_STATE_CLOSED; 699 if (num < 3) 700 ++txq->wqe_ci; 701 else 702 txq->wqe_ci += 2; 703 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); 704 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 705 } 706 707 /** 708 * DPDK callback for TX with MPW support. 709 * 710 * @param dpdk_txq 711 * Generic pointer to TX queue structure. 712 * @param[in] pkts 713 * Packets to transmit. 714 * @param pkts_n 715 * Number of packets in array. 716 * 717 * @return 718 * Number of packets successfully transmitted (<= pkts_n). 719 */ 720 uint16_t 721 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 722 { 723 struct txq *txq = (struct txq *)dpdk_txq; 724 uint16_t elts_head = txq->elts_head; 725 const unsigned int elts_n = 1 << txq->elts_n; 726 unsigned int i = 0; 727 unsigned int j = 0; 728 unsigned int max; 729 uint16_t max_wqe; 730 unsigned int comp; 731 struct mlx5_mpw mpw = { 732 .state = MLX5_MPW_STATE_CLOSED, 733 }; 734 735 if (unlikely(!pkts_n)) 736 return 0; 737 /* Prefetch first packet cacheline. */ 738 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); 739 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 740 /* Start processing. */ 741 txq_complete(txq); 742 max = (elts_n - (elts_head - txq->elts_tail)); 743 if (max > elts_n) 744 max -= elts_n; 745 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 746 if (unlikely(!max_wqe)) 747 return 0; 748 do { 749 struct rte_mbuf *buf = *(pkts++); 750 unsigned int elts_head_next; 751 uint32_t length; 752 unsigned int segs_n = buf->nb_segs; 753 uint32_t cs_flags = 0; 754 755 /* 756 * Make sure there is enough room to store this packet and 757 * that one ring entry remains unused. 758 */ 759 assert(segs_n); 760 if (max < segs_n + 1) 761 break; 762 /* Do not bother with large packets MPW cannot handle. */ 763 if (segs_n > MLX5_MPW_DSEG_MAX) 764 break; 765 max -= segs_n; 766 --pkts_n; 767 /* Should we enable HW CKSUM offload */ 768 if (buf->ol_flags & 769 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) 770 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; 771 /* Retrieve packet information. */ 772 length = PKT_LEN(buf); 773 assert(length); 774 /* Start new session if packet differs. */ 775 if ((mpw.state == MLX5_MPW_STATE_OPENED) && 776 ((mpw.len != length) || 777 (segs_n != 1) || 778 (mpw.wqe->eseg.cs_flags != cs_flags))) 779 mlx5_mpw_close(txq, &mpw); 780 if (mpw.state == MLX5_MPW_STATE_CLOSED) { 781 /* 782 * Multi-Packet WQE consumes at most two WQE. 783 * mlx5_mpw_new() expects to be able to use such 784 * resources. 785 */ 786 if (unlikely(max_wqe < 2)) 787 break; 788 max_wqe -= 2; 789 mlx5_mpw_new(txq, &mpw, length); 790 mpw.wqe->eseg.cs_flags = cs_flags; 791 } 792 /* Multi-segment packets must be alone in their MPW. */ 793 assert((segs_n == 1) || (mpw.pkts_n == 0)); 794 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 795 length = 0; 796 #endif 797 do { 798 volatile struct mlx5_wqe_data_seg *dseg; 799 uintptr_t addr; 800 801 elts_head_next = (elts_head + 1) & (elts_n - 1); 802 assert(buf); 803 (*txq->elts)[elts_head] = buf; 804 dseg = mpw.data.dseg[mpw.pkts_n]; 805 addr = rte_pktmbuf_mtod(buf, uintptr_t); 806 *dseg = (struct mlx5_wqe_data_seg){ 807 .byte_count = htonl(DATA_LEN(buf)), 808 .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), 809 .addr = htonll(addr), 810 }; 811 elts_head = elts_head_next; 812 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 813 length += DATA_LEN(buf); 814 #endif 815 buf = buf->next; 816 ++mpw.pkts_n; 817 ++j; 818 } while (--segs_n); 819 assert(length == mpw.len); 820 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) 821 mlx5_mpw_close(txq, &mpw); 822 elts_head = elts_head_next; 823 #ifdef MLX5_PMD_SOFT_COUNTERS 824 /* Increment sent bytes counter. */ 825 txq->stats.obytes += length; 826 #endif 827 ++i; 828 } while (pkts_n); 829 /* Take a shortcut if nothing must be sent. */ 830 if (unlikely(i == 0)) 831 return 0; 832 /* Check whether completion threshold has been reached. */ 833 /* "j" includes both packets and segments. */ 834 comp = txq->elts_comp + j; 835 if (comp >= MLX5_TX_COMP_THRESH) { 836 volatile struct mlx5_wqe *wqe = mpw.wqe; 837 838 /* Request completion on last WQE. */ 839 wqe->ctrl[2] = htonl(8); 840 /* Save elts_head in unused "immediate" field of WQE. */ 841 wqe->ctrl[3] = elts_head; 842 txq->elts_comp = 0; 843 } else { 844 txq->elts_comp = comp; 845 } 846 #ifdef MLX5_PMD_SOFT_COUNTERS 847 /* Increment sent packets counter. */ 848 txq->stats.opackets += i; 849 #endif 850 /* Ring QP doorbell. */ 851 if (mpw.state == MLX5_MPW_STATE_OPENED) 852 mlx5_mpw_close(txq, &mpw); 853 mlx5_tx_dbrec(txq, mpw.wqe); 854 txq->elts_head = elts_head; 855 return i; 856 } 857 858 /** 859 * Open a MPW inline session. 860 * 861 * @param txq 862 * Pointer to TX queue structure. 863 * @param mpw 864 * Pointer to MPW session structure. 865 * @param length 866 * Packet length. 867 */ 868 static inline void 869 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length) 870 { 871 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); 872 struct mlx5_wqe_inl_small *inl; 873 874 mpw->state = MLX5_MPW_INL_STATE_OPENED; 875 mpw->pkts_n = 0; 876 mpw->len = length; 877 mpw->total_len = 0; 878 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); 879 mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) | 880 (txq->wqe_ci << 8) | 881 MLX5_OPCODE_TSO); 882 mpw->wqe->ctrl[2] = 0; 883 mpw->wqe->ctrl[3] = 0; 884 mpw->wqe->eseg.mss = htons(length); 885 mpw->wqe->eseg.inline_hdr_sz = 0; 886 mpw->wqe->eseg.cs_flags = 0; 887 mpw->wqe->eseg.rsvd0 = 0; 888 mpw->wqe->eseg.rsvd1 = 0; 889 mpw->wqe->eseg.rsvd2 = 0; 890 inl = (struct mlx5_wqe_inl_small *) 891 (((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE); 892 mpw->data.raw = (uint8_t *)&inl->raw; 893 } 894 895 /** 896 * Close a MPW inline session. 897 * 898 * @param txq 899 * Pointer to TX queue structure. 900 * @param mpw 901 * Pointer to MPW session structure. 902 */ 903 static inline void 904 mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw) 905 { 906 unsigned int size; 907 struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *) 908 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE)); 909 910 size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len; 911 /* 912 * Store size in multiple of 16 bytes. Control and Ethernet segments 913 * count as 2. 914 */ 915 mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size)); 916 mpw->state = MLX5_MPW_STATE_CLOSED; 917 inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG); 918 txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE; 919 } 920 921 /** 922 * DPDK callback for TX with MPW inline support. 923 * 924 * @param dpdk_txq 925 * Generic pointer to TX queue structure. 926 * @param[in] pkts 927 * Packets to transmit. 928 * @param pkts_n 929 * Number of packets in array. 930 * 931 * @return 932 * Number of packets successfully transmitted (<= pkts_n). 933 */ 934 uint16_t 935 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, 936 uint16_t pkts_n) 937 { 938 struct txq *txq = (struct txq *)dpdk_txq; 939 uint16_t elts_head = txq->elts_head; 940 const unsigned int elts_n = 1 << txq->elts_n; 941 unsigned int i = 0; 942 unsigned int j = 0; 943 unsigned int max; 944 uint16_t max_wqe; 945 unsigned int comp; 946 unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE; 947 struct mlx5_mpw mpw = { 948 .state = MLX5_MPW_STATE_CLOSED, 949 }; 950 /* 951 * Compute the maximum number of WQE which can be consumed by inline 952 * code. 953 * - 2 DSEG for: 954 * - 1 control segment, 955 * - 1 Ethernet segment, 956 * - N Dseg from the inline request. 957 */ 958 const unsigned int wqe_inl_n = 959 ((2 * MLX5_WQE_DWORD_SIZE + 960 txq->max_inline * RTE_CACHE_LINE_SIZE) + 961 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE; 962 963 if (unlikely(!pkts_n)) 964 return 0; 965 /* Prefetch first packet cacheline. */ 966 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); 967 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 968 /* Start processing. */ 969 txq_complete(txq); 970 max = (elts_n - (elts_head - txq->elts_tail)); 971 if (max > elts_n) 972 max -= elts_n; 973 do { 974 struct rte_mbuf *buf = *(pkts++); 975 unsigned int elts_head_next; 976 uintptr_t addr; 977 uint32_t length; 978 unsigned int segs_n = buf->nb_segs; 979 uint32_t cs_flags = 0; 980 981 /* 982 * Make sure there is enough room to store this packet and 983 * that one ring entry remains unused. 984 */ 985 assert(segs_n); 986 if (max < segs_n + 1) 987 break; 988 /* Do not bother with large packets MPW cannot handle. */ 989 if (segs_n > MLX5_MPW_DSEG_MAX) 990 break; 991 max -= segs_n; 992 --pkts_n; 993 /* 994 * Compute max_wqe in case less WQE were consumed in previous 995 * iteration. 996 */ 997 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 998 /* Should we enable HW CKSUM offload */ 999 if (buf->ol_flags & 1000 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) 1001 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; 1002 /* Retrieve packet information. */ 1003 length = PKT_LEN(buf); 1004 /* Start new session if packet differs. */ 1005 if (mpw.state == MLX5_MPW_STATE_OPENED) { 1006 if ((mpw.len != length) || 1007 (segs_n != 1) || 1008 (mpw.wqe->eseg.cs_flags != cs_flags)) 1009 mlx5_mpw_close(txq, &mpw); 1010 } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) { 1011 if ((mpw.len != length) || 1012 (segs_n != 1) || 1013 (length > inline_room) || 1014 (mpw.wqe->eseg.cs_flags != cs_flags)) { 1015 mlx5_mpw_inline_close(txq, &mpw); 1016 inline_room = 1017 txq->max_inline * RTE_CACHE_LINE_SIZE; 1018 } 1019 } 1020 if (mpw.state == MLX5_MPW_STATE_CLOSED) { 1021 if ((segs_n != 1) || 1022 (length > inline_room)) { 1023 /* 1024 * Multi-Packet WQE consumes at most two WQE. 1025 * mlx5_mpw_new() expects to be able to use 1026 * such resources. 1027 */ 1028 if (unlikely(max_wqe < 2)) 1029 break; 1030 max_wqe -= 2; 1031 mlx5_mpw_new(txq, &mpw, length); 1032 mpw.wqe->eseg.cs_flags = cs_flags; 1033 } else { 1034 if (unlikely(max_wqe < wqe_inl_n)) 1035 break; 1036 max_wqe -= wqe_inl_n; 1037 mlx5_mpw_inline_new(txq, &mpw, length); 1038 mpw.wqe->eseg.cs_flags = cs_flags; 1039 } 1040 } 1041 /* Multi-segment packets must be alone in their MPW. */ 1042 assert((segs_n == 1) || (mpw.pkts_n == 0)); 1043 if (mpw.state == MLX5_MPW_STATE_OPENED) { 1044 assert(inline_room == 1045 txq->max_inline * RTE_CACHE_LINE_SIZE); 1046 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1047 length = 0; 1048 #endif 1049 do { 1050 volatile struct mlx5_wqe_data_seg *dseg; 1051 1052 elts_head_next = 1053 (elts_head + 1) & (elts_n - 1); 1054 assert(buf); 1055 (*txq->elts)[elts_head] = buf; 1056 dseg = mpw.data.dseg[mpw.pkts_n]; 1057 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1058 *dseg = (struct mlx5_wqe_data_seg){ 1059 .byte_count = htonl(DATA_LEN(buf)), 1060 .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), 1061 .addr = htonll(addr), 1062 }; 1063 elts_head = elts_head_next; 1064 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1065 length += DATA_LEN(buf); 1066 #endif 1067 buf = buf->next; 1068 ++mpw.pkts_n; 1069 ++j; 1070 } while (--segs_n); 1071 assert(length == mpw.len); 1072 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) 1073 mlx5_mpw_close(txq, &mpw); 1074 } else { 1075 unsigned int max; 1076 1077 assert(mpw.state == MLX5_MPW_INL_STATE_OPENED); 1078 assert(length <= inline_room); 1079 assert(length == DATA_LEN(buf)); 1080 elts_head_next = (elts_head + 1) & (elts_n - 1); 1081 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1082 (*txq->elts)[elts_head] = buf; 1083 /* Maximum number of bytes before wrapping. */ 1084 max = ((((uintptr_t)(txq->wqes)) + 1085 (1 << txq->wqe_n) * 1086 MLX5_WQE_SIZE) - 1087 (uintptr_t)mpw.data.raw); 1088 if (length > max) { 1089 rte_memcpy((void *)(uintptr_t)mpw.data.raw, 1090 (void *)addr, 1091 max); 1092 mpw.data.raw = (volatile void *)txq->wqes; 1093 rte_memcpy((void *)(uintptr_t)mpw.data.raw, 1094 (void *)(addr + max), 1095 length - max); 1096 mpw.data.raw += length - max; 1097 } else { 1098 rte_memcpy((void *)(uintptr_t)mpw.data.raw, 1099 (void *)addr, 1100 length); 1101 1102 if (length == max) 1103 mpw.data.raw = 1104 (volatile void *)txq->wqes; 1105 else 1106 mpw.data.raw += length; 1107 } 1108 ++mpw.pkts_n; 1109 mpw.total_len += length; 1110 ++j; 1111 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) { 1112 mlx5_mpw_inline_close(txq, &mpw); 1113 inline_room = 1114 txq->max_inline * RTE_CACHE_LINE_SIZE; 1115 } else { 1116 inline_room -= length; 1117 } 1118 } 1119 elts_head = elts_head_next; 1120 #ifdef MLX5_PMD_SOFT_COUNTERS 1121 /* Increment sent bytes counter. */ 1122 txq->stats.obytes += length; 1123 #endif 1124 ++i; 1125 } while (pkts_n); 1126 /* Take a shortcut if nothing must be sent. */ 1127 if (unlikely(i == 0)) 1128 return 0; 1129 /* Check whether completion threshold has been reached. */ 1130 /* "j" includes both packets and segments. */ 1131 comp = txq->elts_comp + j; 1132 if (comp >= MLX5_TX_COMP_THRESH) { 1133 volatile struct mlx5_wqe *wqe = mpw.wqe; 1134 1135 /* Request completion on last WQE. */ 1136 wqe->ctrl[2] = htonl(8); 1137 /* Save elts_head in unused "immediate" field of WQE. */ 1138 wqe->ctrl[3] = elts_head; 1139 txq->elts_comp = 0; 1140 } else { 1141 txq->elts_comp = comp; 1142 } 1143 #ifdef MLX5_PMD_SOFT_COUNTERS 1144 /* Increment sent packets counter. */ 1145 txq->stats.opackets += i; 1146 #endif 1147 /* Ring QP doorbell. */ 1148 if (mpw.state == MLX5_MPW_INL_STATE_OPENED) 1149 mlx5_mpw_inline_close(txq, &mpw); 1150 else if (mpw.state == MLX5_MPW_STATE_OPENED) 1151 mlx5_mpw_close(txq, &mpw); 1152 mlx5_tx_dbrec(txq, mpw.wqe); 1153 txq->elts_head = elts_head; 1154 return i; 1155 } 1156 1157 /** 1158 * Translate RX completion flags to packet type. 1159 * 1160 * @param[in] cqe 1161 * Pointer to CQE. 1162 * 1163 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 1164 * 1165 * @return 1166 * Packet type for struct rte_mbuf. 1167 */ 1168 static inline uint32_t 1169 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) 1170 { 1171 uint32_t pkt_type; 1172 uint16_t flags = ntohs(cqe->hdr_type_etc); 1173 1174 if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) { 1175 pkt_type = 1176 TRANSPOSE(flags, 1177 MLX5_CQE_RX_IPV4_PACKET, 1178 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) | 1179 TRANSPOSE(flags, 1180 MLX5_CQE_RX_IPV6_PACKET, 1181 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN); 1182 pkt_type |= ((cqe->pkt_info & MLX5_CQE_RX_OUTER_PACKET) ? 1183 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN : 1184 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN); 1185 } else { 1186 pkt_type = 1187 TRANSPOSE(flags, 1188 MLX5_CQE_L3_HDR_TYPE_IPV6, 1189 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) | 1190 TRANSPOSE(flags, 1191 MLX5_CQE_L3_HDR_TYPE_IPV4, 1192 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN); 1193 } 1194 return pkt_type; 1195 } 1196 1197 /** 1198 * Get size of the next packet for a given CQE. For compressed CQEs, the 1199 * consumer index is updated only once all packets of the current one have 1200 * been processed. 1201 * 1202 * @param rxq 1203 * Pointer to RX queue. 1204 * @param cqe 1205 * CQE to process. 1206 * @param[out] rss_hash 1207 * Packet RSS Hash result. 1208 * 1209 * @return 1210 * Packet size in bytes (0 if there is none), -1 in case of completion 1211 * with error. 1212 */ 1213 static inline int 1214 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe, 1215 uint16_t cqe_cnt, uint32_t *rss_hash) 1216 { 1217 struct rxq_zip *zip = &rxq->zip; 1218 uint16_t cqe_n = cqe_cnt + 1; 1219 int len = 0; 1220 uint16_t idx, end; 1221 1222 /* Process compressed data in the CQE and mini arrays. */ 1223 if (zip->ai) { 1224 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1225 (volatile struct mlx5_mini_cqe8 (*)[8]) 1226 (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt]); 1227 1228 len = ntohl((*mc)[zip->ai & 7].byte_cnt); 1229 *rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result); 1230 if ((++zip->ai & 7) == 0) { 1231 /* Invalidate consumed CQEs */ 1232 idx = zip->ca; 1233 end = zip->na; 1234 while (idx != end) { 1235 (*rxq->cqes)[idx & cqe_cnt].op_own = 1236 MLX5_CQE_INVALIDATE; 1237 ++idx; 1238 } 1239 /* 1240 * Increment consumer index to skip the number of 1241 * CQEs consumed. Hardware leaves holes in the CQ 1242 * ring for software use. 1243 */ 1244 zip->ca = zip->na; 1245 zip->na += 8; 1246 } 1247 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 1248 /* Invalidate the rest */ 1249 idx = zip->ca; 1250 end = zip->cq_ci; 1251 1252 while (idx != end) { 1253 (*rxq->cqes)[idx & cqe_cnt].op_own = 1254 MLX5_CQE_INVALIDATE; 1255 ++idx; 1256 } 1257 rxq->cq_ci = zip->cq_ci; 1258 zip->ai = 0; 1259 } 1260 /* No compressed data, get next CQE and verify if it is compressed. */ 1261 } else { 1262 int ret; 1263 int8_t op_own; 1264 1265 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 1266 if (unlikely(ret == 1)) 1267 return 0; 1268 ++rxq->cq_ci; 1269 op_own = cqe->op_own; 1270 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 1271 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1272 (volatile struct mlx5_mini_cqe8 (*)[8]) 1273 (uintptr_t)(&(*rxq->cqes)[rxq->cq_ci & 1274 cqe_cnt]); 1275 1276 /* Fix endianness. */ 1277 zip->cqe_cnt = ntohl(cqe->byte_cnt); 1278 /* 1279 * Current mini array position is the one returned by 1280 * check_cqe64(). 1281 * 1282 * If completion comprises several mini arrays, as a 1283 * special case the second one is located 7 CQEs after 1284 * the initial CQE instead of 8 for subsequent ones. 1285 */ 1286 zip->ca = rxq->cq_ci; 1287 zip->na = zip->ca + 7; 1288 /* Compute the next non compressed CQE. */ 1289 --rxq->cq_ci; 1290 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 1291 /* Get packet size to return. */ 1292 len = ntohl((*mc)[0].byte_cnt); 1293 *rss_hash = ntohl((*mc)[0].rx_hash_result); 1294 zip->ai = 1; 1295 /* Prefetch all the entries to be invalidated */ 1296 idx = zip->ca; 1297 end = zip->cq_ci; 1298 while (idx != end) { 1299 rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]); 1300 ++idx; 1301 } 1302 } else { 1303 len = ntohl(cqe->byte_cnt); 1304 *rss_hash = ntohl(cqe->rx_hash_res); 1305 } 1306 /* Error while receiving packet. */ 1307 if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR)) 1308 return -1; 1309 } 1310 return len; 1311 } 1312 1313 /** 1314 * Translate RX completion flags to offload flags. 1315 * 1316 * @param[in] rxq 1317 * Pointer to RX queue structure. 1318 * @param[in] cqe 1319 * Pointer to CQE. 1320 * 1321 * @return 1322 * Offload flags (ol_flags) for struct rte_mbuf. 1323 */ 1324 static inline uint32_t 1325 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe) 1326 { 1327 uint32_t ol_flags = 0; 1328 uint16_t flags = ntohs(cqe->hdr_type_etc); 1329 1330 ol_flags = 1331 TRANSPOSE(flags, 1332 MLX5_CQE_RX_L3_HDR_VALID, 1333 PKT_RX_IP_CKSUM_GOOD) | 1334 TRANSPOSE(flags, 1335 MLX5_CQE_RX_L4_HDR_VALID, 1336 PKT_RX_L4_CKSUM_GOOD); 1337 if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) 1338 ol_flags |= 1339 TRANSPOSE(flags, 1340 MLX5_CQE_RX_L3_HDR_VALID, 1341 PKT_RX_IP_CKSUM_GOOD) | 1342 TRANSPOSE(flags, 1343 MLX5_CQE_RX_L4_HDR_VALID, 1344 PKT_RX_L4_CKSUM_GOOD); 1345 return ol_flags; 1346 } 1347 1348 /** 1349 * DPDK callback for RX. 1350 * 1351 * @param dpdk_rxq 1352 * Generic pointer to RX queue structure. 1353 * @param[out] pkts 1354 * Array to store received packets. 1355 * @param pkts_n 1356 * Maximum number of packets in array. 1357 * 1358 * @return 1359 * Number of packets successfully received (<= pkts_n). 1360 */ 1361 uint16_t 1362 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1363 { 1364 struct rxq *rxq = dpdk_rxq; 1365 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 1366 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 1367 const unsigned int sges_n = rxq->sges_n; 1368 struct rte_mbuf *pkt = NULL; 1369 struct rte_mbuf *seg = NULL; 1370 volatile struct mlx5_cqe *cqe = 1371 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1372 unsigned int i = 0; 1373 unsigned int rq_ci = rxq->rq_ci << sges_n; 1374 int len; /* keep its value across iterations. */ 1375 1376 while (pkts_n) { 1377 unsigned int idx = rq_ci & wqe_cnt; 1378 volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx]; 1379 struct rte_mbuf *rep = (*rxq->elts)[idx]; 1380 uint32_t rss_hash_res = 0; 1381 1382 if (pkt) 1383 NEXT(seg) = rep; 1384 seg = rep; 1385 rte_prefetch0(seg); 1386 rte_prefetch0(cqe); 1387 rte_prefetch0(wqe); 1388 rep = rte_mbuf_raw_alloc(rxq->mp); 1389 if (unlikely(rep == NULL)) { 1390 ++rxq->stats.rx_nombuf; 1391 if (!pkt) { 1392 /* 1393 * no buffers before we even started, 1394 * bail out silently. 1395 */ 1396 break; 1397 } 1398 while (pkt != seg) { 1399 assert(pkt != (*rxq->elts)[idx]); 1400 rep = NEXT(pkt); 1401 rte_mbuf_refcnt_set(pkt, 0); 1402 __rte_mbuf_raw_free(pkt); 1403 pkt = rep; 1404 } 1405 break; 1406 } 1407 if (!pkt) { 1408 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1409 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, 1410 &rss_hash_res); 1411 if (!len) { 1412 rte_mbuf_refcnt_set(rep, 0); 1413 __rte_mbuf_raw_free(rep); 1414 break; 1415 } 1416 if (unlikely(len == -1)) { 1417 /* RX error, packet is likely too large. */ 1418 rte_mbuf_refcnt_set(rep, 0); 1419 __rte_mbuf_raw_free(rep); 1420 ++rxq->stats.idropped; 1421 goto skip; 1422 } 1423 pkt = seg; 1424 assert(len >= (rxq->crc_present << 2)); 1425 /* Update packet information. */ 1426 pkt->packet_type = 0; 1427 pkt->ol_flags = 0; 1428 if (rss_hash_res && rxq->rss_hash) { 1429 pkt->hash.rss = rss_hash_res; 1430 pkt->ol_flags = PKT_RX_RSS_HASH; 1431 } 1432 if (rxq->mark && 1433 ((cqe->sop_drop_qpn != 1434 htonl(MLX5_FLOW_MARK_INVALID)) || 1435 (cqe->sop_drop_qpn != 1436 htonl(MLX5_FLOW_MARK_DEFAULT)))) { 1437 pkt->hash.fdir.hi = 1438 mlx5_flow_mark_get(cqe->sop_drop_qpn); 1439 pkt->ol_flags &= ~PKT_RX_RSS_HASH; 1440 pkt->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID; 1441 } 1442 if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip | 1443 rxq->crc_present) { 1444 if (rxq->csum) { 1445 pkt->packet_type = 1446 rxq_cq_to_pkt_type(cqe); 1447 pkt->ol_flags |= 1448 rxq_cq_to_ol_flags(rxq, cqe); 1449 } 1450 if (cqe->hdr_type_etc & 1451 MLX5_CQE_VLAN_STRIPPED) { 1452 pkt->ol_flags |= PKT_RX_VLAN_PKT | 1453 PKT_RX_VLAN_STRIPPED; 1454 pkt->vlan_tci = ntohs(cqe->vlan_info); 1455 } 1456 if (rxq->crc_present) 1457 len -= ETHER_CRC_LEN; 1458 } 1459 PKT_LEN(pkt) = len; 1460 } 1461 DATA_LEN(rep) = DATA_LEN(seg); 1462 PKT_LEN(rep) = PKT_LEN(seg); 1463 SET_DATA_OFF(rep, DATA_OFF(seg)); 1464 NB_SEGS(rep) = NB_SEGS(seg); 1465 PORT(rep) = PORT(seg); 1466 NEXT(rep) = NULL; 1467 (*rxq->elts)[idx] = rep; 1468 /* 1469 * Fill NIC descriptor with the new buffer. The lkey and size 1470 * of the buffers are already known, only the buffer address 1471 * changes. 1472 */ 1473 wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t)); 1474 if (len > DATA_LEN(seg)) { 1475 len -= DATA_LEN(seg); 1476 ++NB_SEGS(pkt); 1477 ++rq_ci; 1478 continue; 1479 } 1480 DATA_LEN(seg) = len; 1481 #ifdef MLX5_PMD_SOFT_COUNTERS 1482 /* Increment bytes counter. */ 1483 rxq->stats.ibytes += PKT_LEN(pkt); 1484 #endif 1485 /* Return packet. */ 1486 *(pkts++) = pkt; 1487 pkt = NULL; 1488 --pkts_n; 1489 ++i; 1490 skip: 1491 /* Align consumer index to the next stride. */ 1492 rq_ci >>= sges_n; 1493 ++rq_ci; 1494 rq_ci <<= sges_n; 1495 } 1496 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) 1497 return 0; 1498 /* Update the consumer index. */ 1499 rxq->rq_ci = rq_ci >> sges_n; 1500 rte_wmb(); 1501 *rxq->cq_db = htonl(rxq->cq_ci); 1502 rte_wmb(); 1503 *rxq->rq_db = htonl(rxq->rq_ci); 1504 #ifdef MLX5_PMD_SOFT_COUNTERS 1505 /* Increment packets counter. */ 1506 rxq->stats.ipackets += i; 1507 #endif 1508 return i; 1509 } 1510 1511 /** 1512 * Dummy DPDK callback for TX. 1513 * 1514 * This function is used to temporarily replace the real callback during 1515 * unsafe control operations on the queue, or in case of error. 1516 * 1517 * @param dpdk_txq 1518 * Generic pointer to TX queue structure. 1519 * @param[in] pkts 1520 * Packets to transmit. 1521 * @param pkts_n 1522 * Number of packets in array. 1523 * 1524 * @return 1525 * Number of packets successfully transmitted (<= pkts_n). 1526 */ 1527 uint16_t 1528 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 1529 { 1530 (void)dpdk_txq; 1531 (void)pkts; 1532 (void)pkts_n; 1533 return 0; 1534 } 1535 1536 /** 1537 * Dummy DPDK callback for RX. 1538 * 1539 * This function is used to temporarily replace the real callback during 1540 * unsafe control operations on the queue, or in case of error. 1541 * 1542 * @param dpdk_rxq 1543 * Generic pointer to RX queue structure. 1544 * @param[out] pkts 1545 * Array to store received packets. 1546 * @param pkts_n 1547 * Maximum number of packets in array. 1548 * 1549 * @return 1550 * Number of packets successfully received (<= pkts_n). 1551 */ 1552 uint16_t 1553 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1554 { 1555 (void)dpdk_rxq; 1556 (void)pkts; 1557 (void)pkts_n; 1558 return 0; 1559 } 1560