/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5_hw.h>
#include <infiniband/arch.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"

static inline int
check_cqe(volatile struct mlx5_cqe *cqe,
	  unsigned int cqes_n, const uint16_t ci)
	  __attribute__((always_inline));

static inline void
txq_complete(struct txq *txq) __attribute__((always_inline));

static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
	  __attribute__((always_inline));

static inline void
mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
	      __attribute__((always_inline));

static inline uint32_t
rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
		   __attribute__((always_inline));

static inline int
mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
		 uint16_t cqe_cnt, uint32_t *rss_hash)
		 __attribute__((always_inline));

static inline uint32_t
rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
		   __attribute__((always_inline));

#ifndef NDEBUG

/**
 * Verify or set magic value in CQE.
 *
 * @param cqe
 *   Pointer to CQE.
 *
 * @return
 *   0 the first time.
 */
static inline int
check_cqe_seen(volatile struct mlx5_cqe *cqe)
{
	static const uint8_t magic[] = "seen";
	volatile uint8_t (*buf)[sizeof(cqe->rsvd0)] = &cqe->rsvd0;
	int ret = 1;
	unsigned int i;

	for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
		if (!ret || (*buf)[i] != magic[i]) {
			ret = 0;
			(*buf)[i] = magic[i];
		}
	return ret;
}

#endif /* NDEBUG */

/**
 * Check whether CQE is valid.
 *
 * @param cqe
 *   Pointer to CQE.
 * @param cqes_n
 *   Size of completion queue.
 * @param ci
 *   Consumer index.
 *
 * @return
 *   0 on success, 1 on failure.
 */
static inline int
check_cqe(volatile struct mlx5_cqe *cqe,
	  unsigned int cqes_n, const uint16_t ci)
{
	uint16_t idx = ci & cqes_n;
	uint8_t op_own = cqe->op_own;
	uint8_t op_owner = MLX5_CQE_OWNER(op_own);
	uint8_t op_code = MLX5_CQE_OPCODE(op_own);

	if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
		return 1; /* No CQE. */
#ifndef NDEBUG
	if ((op_code == MLX5_CQE_RESP_ERR) ||
	    (op_code == MLX5_CQE_REQ_ERR)) {
		volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
		uint8_t syndrome = err_cqe->syndrome;

		if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
		    (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
			return 0;
		if (!check_cqe_seen(cqe))
			ERROR("unexpected CQE error %u (0x%02x)"
			      " syndrome 0x%02x",
			      op_code, op_code, syndrome);
		return 1;
	} else if ((op_code != MLX5_CQE_RESP_SEND) &&
		   (op_code != MLX5_CQE_REQ)) {
		if (!check_cqe_seen(cqe))
			ERROR("unexpected CQE opcode %u (0x%02x)",
			      op_code, op_code);
		return 1;
	}
#endif /* NDEBUG */
	return 0;
}
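
/*
 * Note on the ownership test in check_cqe(): cqes_n is a power of two, so
 * (ci & cqes_n) extracts the bit that flips each time the consumer index
 * wraps around the CQ. The device is expected to toggle the CQE owner bit
 * on every pass over the ring, so a mismatch between the two means the
 * entry has not been written yet. E.g. with a 256-entry CQ, ci = 5 expects
 * owner 0 on the first pass and ci = 261 expects owner 1 on the second.
 */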

/**
 * Return the address of the WQE.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param ci
 *   WQE consumer index.
 *
 * @return
 *   WQE address.
 */
static inline uintptr_t *
tx_mlx5_wqe(struct txq *txq, uint16_t ci)
{
	ci &= ((1 << txq->wqe_n) - 1);
	return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
}

/**
 * Manage TX completions.
 *
 * When sending a burst, mlx5_tx_burst() posts several WRs.
 *
 * @param txq
 *   Pointer to TX queue structure.
 */
static inline void
txq_complete(struct txq *txq)
{
	const unsigned int elts_n = 1 << txq->elts_n;
	const unsigned int cqe_n = 1 << txq->cqe_n;
	const unsigned int cqe_cnt = cqe_n - 1;
	uint16_t elts_free = txq->elts_tail;
	uint16_t elts_tail;
	uint16_t cq_ci = txq->cq_ci;
	volatile struct mlx5_cqe *cqe = NULL;
	volatile struct mlx5_wqe_ctrl *ctrl;

	do {
		volatile struct mlx5_cqe *tmp;

		tmp = &(*txq->cqes)[cq_ci & cqe_cnt];
		if (check_cqe(tmp, cqe_n, cq_ci))
			break;
		cqe = tmp;
#ifndef NDEBUG
		if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
			if (!check_cqe_seen(cqe))
				ERROR("unexpected compressed CQE, TX stopped");
			return;
		}
		if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
		    (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
			if (!check_cqe_seen(cqe))
				ERROR("unexpected error CQE, TX stopped");
			return;
		}
#endif /* NDEBUG */
		++cq_ci;
	} while (1);
	if (unlikely(cqe == NULL))
		return;
	ctrl = (volatile struct mlx5_wqe_ctrl *)
		tx_mlx5_wqe(txq, ntohs(cqe->wqe_counter));
	elts_tail = ctrl->ctrl3;
	assert(elts_tail < (1 << txq->wqe_n));
	/* Free buffers. */
	while (elts_free != elts_tail) {
		struct rte_mbuf *elt = (*txq->elts)[elts_free];
		unsigned int elts_free_next =
			(elts_free + 1) & (elts_n - 1);
		struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];

#ifndef NDEBUG
		/* Poisoning. */
		memset(&(*txq->elts)[elts_free],
		       0x66,
		       sizeof((*txq->elts)[elts_free]));
#endif
		RTE_MBUF_PREFETCH_TO_FREE(elt_next);
		/* Only one segment needs to be freed. */
		rte_pktmbuf_free_seg(elt);
		elts_free = elts_free_next;
	}
	txq->cq_ci = cq_ci;
	txq->elts_tail = elts_tail;
	/* Update the consumer index. */
	rte_wmb();
	*txq->cq_db = htonl(cq_ci);
}
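
/*
 * TX completions are requested only once per MLX5_TX_COMP_THRESH packets
 * (see the burst functions below), so txq_complete() typically consumes a
 * single CQE per call. The index of the last completed element is not
 * derived from the CQE itself: it is read back from the "immediate" field
 * (ctrl3) of the signalled WQE, where the burst function stored elts_head
 * when it requested the completion.
 */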

/**
 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
 * the cloned mbuf is allocated is returned instead.
 *
 * @param buf
 *   Pointer to mbuf.
 *
 * @return
 *   Memory pool where data is located for given mbuf.
 */
static struct rte_mempool *
txq_mb2mp(struct rte_mbuf *buf)
{
	if (unlikely(RTE_MBUF_INDIRECT(buf)))
		return rte_mbuf_from_indirect(buf)->pool;
	return buf->pool;
}

/**
 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
 * remove an entry first.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] mp
 *   Memory Pool for which a Memory Region lkey must be returned.
 *
 * @return
 *   mr->lkey on success, (uint32_t)-1 on failure.
 */
static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
	unsigned int i;
	uint32_t lkey = (uint32_t)-1;

	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(htonl(txq->mp2mr[i].mr->lkey) ==
			       txq->mp2mr[i].lkey);
			lkey = txq->mp2mr[i].lkey;
			break;
		}
	}
	if (unlikely(lkey == (uint32_t)-1))
		lkey = txq_mp2mr_reg(txq, mp, i);
	return lkey;
}

/**
 * Ring TX queue doorbell.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param wqe
 *   Pointer to the last WQE posted in the NIC.
 */
static inline void
mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
{
	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
	volatile uint64_t *src = ((volatile uint64_t *)wqe);

	rte_wmb();
	*txq->qp_db = htonl(txq->wqe_ci);
	/* Ensure ordering between DB record and BF copy. */
	rte_wmb();
	*dst = *src;
}
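
/*
 * The doorbell sequence above is two-step: the WQE producer index is first
 * published in the doorbell record (qp_db), then the first 8 bytes of the
 * last WQE are written to the BlueFlame/doorbell register (bf_reg) to
 * notify the device. The intermediate rte_wmb() keeps the device from
 * observing the register write before the updated doorbell record.
 */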

/**
 * DPDK callback for TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = 1 << txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	volatile struct mlx5_wqe_v *wqe = NULL;
	unsigned int segs_n = 0;
	struct rte_mbuf *buf = NULL;
	uint8_t *raw;

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	rte_prefetch0(*pkts);
	/* Start processing. */
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		volatile rte_v128u32_t *dseg = NULL;
		uint32_t length;
		unsigned int ds = 0;
		uintptr_t addr;
		uint64_t naddr;
		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
		uint16_t ehdr;
		uint8_t cs_flags = 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
		uint32_t total_length = 0;
#endif

		/* first_seg */
		buf = *(pkts++);
		segs_n = buf->nb_segs;
		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		max -= segs_n;
		--segs_n;
		if (!segs_n)
			--pkts_n;
		wqe = (volatile struct mlx5_wqe_v *)
			tx_mlx5_wqe(txq, txq->wqe_ci);
		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
		if (pkts_n > 1)
			rte_prefetch0(*pkts);
		addr = rte_pktmbuf_mtod(buf, uintptr_t);
		length = DATA_LEN(buf);
		ehdr = (((uint8_t *)addr)[1] << 8) |
		       ((uint8_t *)addr)[0];
#ifdef MLX5_PMD_SOFT_COUNTERS
		total_length = length;
#endif
		assert(length >= MLX5_WQE_DWORD_SIZE);
		/* Update element. */
		(*txq->elts)[elts_head] = buf;
		elts_head = (elts_head + 1) & (elts_n - 1);
		/* Prefetch next buffer data. */
		if (pkts_n > 1) {
			volatile void *pkt_addr;

			pkt_addr = rte_pktmbuf_mtod(*pkts, volatile void *);
			rte_prefetch0(pkt_addr);
		}
		/* Should we enable HW CKSUM offload */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		}
		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
		/*
		 * Start by copying the Ethernet header minus the first two
		 * bytes which will be appended at the end of the Ethernet
		 * segment.
		 */
		memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2, 16);
		length -= MLX5_WQE_DWORD_SIZE;
		addr += MLX5_WQE_DWORD_SIZE;
		/* Replace the Ethernet type by the VLAN if necessary. */
		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);

			memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE - 2 -
					   sizeof(vlan)),
			       &vlan, sizeof(vlan));
			addr -= sizeof(vlan);
			length += sizeof(vlan);
		}
		/* Inline if enough room. */
		if (txq->max_inline != 0) {
			uintptr_t end = (uintptr_t)
				(((uintptr_t)txq->wqes) +
				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
			uint16_t max_inline =
				txq->max_inline * RTE_CACHE_LINE_SIZE;
			uint16_t room;

			/*
			 * raw starts two bytes before the boundary to
			 * continue the above copy of packet data.
			 */
			raw += MLX5_WQE_DWORD_SIZE - 2;
			room = end - (uintptr_t)raw;
			if (room > max_inline) {
				uintptr_t addr_end = (addr + max_inline) &
					~(RTE_CACHE_LINE_SIZE - 1);
				uint16_t copy_b = ((addr_end - addr) > length) ?
						  length :
						  (addr_end - addr);

				rte_memcpy((void *)raw, (void *)addr, copy_b);
				addr += copy_b;
				length -= copy_b;
				pkt_inline_sz += copy_b;
				/* Sanity check. */
				assert(addr <= addr_end);
			}
			/*
			 * 2 DWORDs consumed by the WQE header + ETH segment +
			 * the size of the inline part of the packet.
			 */
			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
			if (length > 0) {
				dseg = (volatile rte_v128u32_t *)
					((uintptr_t)wqe +
					 (ds * MLX5_WQE_DWORD_SIZE));
				if ((uintptr_t)dseg >= end)
					dseg = (volatile rte_v128u32_t *)
						txq->wqes;
				goto use_dseg;
			} else if (!segs_n) {
				goto next_pkt;
			} else {
				/* dseg will be advanced as part of next_seg */
				dseg = (volatile rte_v128u32_t *)
					((uintptr_t)wqe +
					 ((ds - 1) * MLX5_WQE_DWORD_SIZE));
				goto next_seg;
			}
		} else {
			/*
			 * No inline has been done in the packet, only the
			 * Ethernet header has been stored.
			 */
			dseg = (volatile rte_v128u32_t *)
				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
			ds = 3;
use_dseg:
			/* Add the remaining packet as a simple ds. */
			naddr = htonll(addr);
			*dseg = (rte_v128u32_t){
				htonl(length),
				txq_mp2mr(txq, txq_mb2mp(buf)),
				naddr,
				naddr >> 32,
			};
			++ds;
			if (!segs_n)
				goto next_pkt;
		}
next_seg:
		assert(buf);
		assert(ds);
		assert(wqe);
		/*
		 * Spill on next WQE when the current one does not have
		 * enough room left. Size of WQE must be a multiple
		 * of data segment size.
		 */
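		/*
		 * With the usual 64-byte WQE and 16-byte data segments,
		 * MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE is 4: every fourth
		 * data segment starts a new WQE, hence the modulo check
		 * below before advancing dseg.
		 */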
537 */ 538 assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE)); 539 if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) { 540 unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) & 541 ((1 << txq->wqe_n) - 1); 542 543 dseg = (volatile rte_v128u32_t *) 544 tx_mlx5_wqe(txq, n); 545 rte_prefetch0(tx_mlx5_wqe(txq, n + 1)); 546 } else { 547 ++dseg; 548 } 549 ++ds; 550 buf = buf->next; 551 assert(buf); 552 length = DATA_LEN(buf); 553 #ifdef MLX5_PMD_SOFT_COUNTERS 554 total_length += length; 555 #endif 556 /* Store segment information. */ 557 naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)); 558 *dseg = (rte_v128u32_t){ 559 htonl(length), 560 txq_mp2mr(txq, txq_mb2mp(buf)), 561 naddr, 562 naddr >> 32, 563 }; 564 (*txq->elts)[elts_head] = buf; 565 elts_head = (elts_head + 1) & (elts_n - 1); 566 ++j; 567 --segs_n; 568 if (segs_n) 569 goto next_seg; 570 else 571 --pkts_n; 572 next_pkt: 573 ++i; 574 /* Initialize known and common part of the WQE structure. */ 575 wqe->ctrl = (rte_v128u32_t){ 576 htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND), 577 htonl(txq->qp_num_8s | ds), 578 0, 579 0, 580 }; 581 wqe->eseg = (rte_v128u32_t){ 582 0, 583 cs_flags, 584 0, 585 (ehdr << 16) | htons(pkt_inline_sz), 586 }; 587 txq->wqe_ci += (ds + 3) / 4; 588 #ifdef MLX5_PMD_SOFT_COUNTERS 589 /* Increment sent bytes counter. */ 590 txq->stats.obytes += total_length; 591 #endif 592 } while (pkts_n); 593 /* Take a shortcut if nothing must be sent. */ 594 if (unlikely(i == 0)) 595 return 0; 596 /* Check whether completion threshold has been reached. */ 597 comp = txq->elts_comp + i + j; 598 if (comp >= MLX5_TX_COMP_THRESH) { 599 volatile struct mlx5_wqe_ctrl *w = 600 (volatile struct mlx5_wqe_ctrl *)wqe; 601 602 /* Request completion on last WQE. */ 603 w->ctrl2 = htonl(8); 604 /* Save elts_head in unused "immediate" field of WQE. */ 605 w->ctrl3 = elts_head; 606 txq->elts_comp = 0; 607 } else { 608 txq->elts_comp = comp; 609 } 610 #ifdef MLX5_PMD_SOFT_COUNTERS 611 /* Increment sent packets counter. */ 612 txq->stats.opackets += i; 613 #endif 614 /* Ring QP doorbell. */ 615 mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)wqe); 616 txq->elts_head = elts_head; 617 return i; 618 } 619 620 /** 621 * Open a MPW session. 622 * 623 * @param txq 624 * Pointer to TX queue structure. 625 * @param mpw 626 * Pointer to MPW session structure. 627 * @param length 628 * Packet length. 

/**
 * Open a MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 * @param length
 *   Packet length.
 */
static inline void
mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
{
	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
		(volatile struct mlx5_wqe_data_seg (*)[])
		tx_mlx5_wqe(txq, idx + 1);

	mpw->state = MLX5_MPW_STATE_OPENED;
	mpw->pkts_n = 0;
	mpw->len = length;
	mpw->total_len = 0;
	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
	mpw->wqe->eseg.mss = htons(length);
	mpw->wqe->eseg.inline_hdr_sz = 0;
	mpw->wqe->eseg.rsvd0 = 0;
	mpw->wqe->eseg.rsvd1 = 0;
	mpw->wqe->eseg.rsvd2 = 0;
	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
				  (txq->wqe_ci << 8) | MLX5_OPCODE_TSO);
	mpw->wqe->ctrl[2] = 0;
	mpw->wqe->ctrl[3] = 0;
	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
	mpw->data.dseg[2] = &(*dseg)[0];
	mpw->data.dseg[3] = &(*dseg)[1];
	mpw->data.dseg[4] = &(*dseg)[2];
}

/**
 * Close a MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 */
static inline void
mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
{
	unsigned int num = mpw->pkts_n;

	/*
	 * Store size in multiple of 16 bytes. Control and Ethernet segments
	 * count as 2.
	 */
	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	if (num < 3)
		++txq->wqe_ci;
	else
		txq->wqe_ci += 2;
	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
}

/**
 * DPDK callback for TX with MPW support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = 1 << txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	};

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
	/* Start processing. */
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		struct rte_mbuf *buf = *(pkts++);
		unsigned int elts_head_next;
		uint32_t length;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		/* Do not bother with large packets MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX)
			break;
		max -= segs_n;
		--pkts_n;
		/* Should we enable HW CKSUM offload */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		assert(length);
		/* Start new session if packet differs. */
		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
		    ((mpw.len != length) ||
		     (segs_n != 1) ||
		     (mpw.wqe->eseg.cs_flags != cs_flags)))
			mlx5_mpw_close(txq, &mpw);
		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
			mlx5_mpw_new(txq, &mpw, length);
			mpw.wqe->eseg.cs_flags = cs_flags;
		}
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
		length = 0;
#endif
		do {
			volatile struct mlx5_wqe_data_seg *dseg;
			uintptr_t addr;

			elts_head_next = (elts_head + 1) & (elts_n - 1);
			assert(buf);
			(*txq->elts)[elts_head] = buf;
			dseg = mpw.data.dseg[mpw.pkts_n];
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			*dseg = (struct mlx5_wqe_data_seg){
				.byte_count = htonl(DATA_LEN(buf)),
				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
				.addr = htonll(addr),
			};
			elts_head = elts_head_next;
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			length += DATA_LEN(buf);
#endif
			buf = buf->next;
			++mpw.pkts_n;
			++j;
		} while (--segs_n);
		assert(length == mpw.len);
		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
			mlx5_mpw_close(txq, &mpw);
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile struct mlx5_wqe *wqe = mpw.wqe;

		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq, mpw.wqe);
	txq->elts_head = elts_head;
	return i;
}
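
/*
 * Inline MPW sessions differ from regular MPW in that packet data is
 * copied directly into the WQE (after an inline header) instead of being
 * referenced through data segments, which spares the device a DMA read
 * for small packets. The copy budget per session is
 * txq->max_inline * RTE_CACHE_LINE_SIZE bytes.
 */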

/**
 * Open a MPW inline session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 * @param length
 *   Packet length.
 */
static inline void
mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
{
	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
	struct mlx5_wqe_inl_small *inl;

	mpw->state = MLX5_MPW_INL_STATE_OPENED;
	mpw->pkts_n = 0;
	mpw->len = length;
	mpw->total_len = 0;
	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
				  (txq->wqe_ci << 8) |
				  MLX5_OPCODE_TSO);
	mpw->wqe->ctrl[2] = 0;
	mpw->wqe->ctrl[3] = 0;
	mpw->wqe->eseg.mss = htons(length);
	mpw->wqe->eseg.inline_hdr_sz = 0;
	mpw->wqe->eseg.cs_flags = 0;
	mpw->wqe->eseg.rsvd0 = 0;
	mpw->wqe->eseg.rsvd1 = 0;
	mpw->wqe->eseg.rsvd2 = 0;
	inl = (struct mlx5_wqe_inl_small *)
		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
	mpw->data.raw = (uint8_t *)&inl->raw;
}

/**
 * Close a MPW inline session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 */
static inline void
mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
{
	unsigned int size;
	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));

	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
	/*
	 * Store size in multiple of 16 bytes. Control and Ethernet segments
	 * count as 2.
	 */
	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
}

/**
 * DPDK callback for TX with MPW inline support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
			 uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = 1 << txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	};

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
	/* Start processing. */
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		struct rte_mbuf *buf = *(pkts++);
		unsigned int elts_head_next;
		uintptr_t addr;
		uint32_t length;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		/* Do not bother with large packets MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX)
			break;
		max -= segs_n;
		--pkts_n;
		/* Should we enable HW CKSUM offload */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		/* Start new session if packet differs. */
		if (mpw.state == MLX5_MPW_STATE_OPENED) {
			if ((mpw.len != length) ||
			    (segs_n != 1) ||
			    (mpw.wqe->eseg.cs_flags != cs_flags))
				mlx5_mpw_close(txq, &mpw);
		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
			if ((mpw.len != length) ||
			    (segs_n != 1) ||
			    (length > inline_room) ||
			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
				mlx5_mpw_inline_close(txq, &mpw);
				inline_room =
					txq->max_inline * RTE_CACHE_LINE_SIZE;
			}
		}
		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
			if ((segs_n != 1) ||
			    (length > inline_room)) {
				mlx5_mpw_new(txq, &mpw, length);
				mpw.wqe->eseg.cs_flags = cs_flags;
			} else {
				mlx5_mpw_inline_new(txq, &mpw, length);
				mpw.wqe->eseg.cs_flags = cs_flags;
			}
		}
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
		if (mpw.state == MLX5_MPW_STATE_OPENED) {
			assert(inline_room ==
			       txq->max_inline * RTE_CACHE_LINE_SIZE);
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			length = 0;
#endif
			do {
				volatile struct mlx5_wqe_data_seg *dseg;

				elts_head_next =
					(elts_head + 1) & (elts_n - 1);
				assert(buf);
				(*txq->elts)[elts_head] = buf;
				dseg = mpw.data.dseg[mpw.pkts_n];
				addr = rte_pktmbuf_mtod(buf, uintptr_t);
				*dseg = (struct mlx5_wqe_data_seg){
					.byte_count = htonl(DATA_LEN(buf)),
					.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
					.addr = htonll(addr),
				};
				elts_head = elts_head_next;
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
				length += DATA_LEN(buf);
#endif
				buf = buf->next;
				++mpw.pkts_n;
				++j;
			} while (--segs_n);
			assert(length == mpw.len);
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
				mlx5_mpw_close(txq, &mpw);
		} else {
			unsigned int max;

			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
			assert(length <= inline_room);
			assert(length == DATA_LEN(buf));
			elts_head_next = (elts_head + 1) & (elts_n - 1);
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			(*txq->elts)[elts_head] = buf;
			/* Maximum number of bytes before wrapping. */
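			/*
			 * The inline data area is the WQE ring itself, so a
			 * packet may cross the end of the ring; in that case
			 * the copy below is split in two and resumes at the
			 * beginning of the ring (txq->wqes).
			 */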
			max = ((((uintptr_t)(txq->wqes)) +
				(1 << txq->wqe_n) *
				MLX5_WQE_SIZE) -
			       (uintptr_t)mpw.data.raw);
			if (length > max) {
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)addr,
					   max);
				mpw.data.raw = (volatile void *)txq->wqes;
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)(addr + max),
					   length - max);
				mpw.data.raw += length - max;
			} else {
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)addr,
					   length);
				mpw.data.raw += length;
			}
			if ((uintptr_t)mpw.data.raw ==
			    (uintptr_t)tx_mlx5_wqe(txq, 1 << txq->wqe_n))
				mpw.data.raw = (volatile void *)txq->wqes;
			++mpw.pkts_n;
			++j;
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
				mlx5_mpw_inline_close(txq, &mpw);
				inline_room =
					txq->max_inline * RTE_CACHE_LINE_SIZE;
			} else {
				inline_room -= length;
			}
		}
		mpw.total_len += length;
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile struct mlx5_wqe *wqe = mpw.wqe;

		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
		mlx5_mpw_inline_close(txq, &mpw);
	else if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq, mpw.wqe);
	txq->elts_head = elts_head;
	return i;
}

/**
 * Translate RX completion flags to packet type.
 *
 * @param[in] cqe
 *   Pointer to CQE.
 *
 * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
{
	uint32_t pkt_type;
	uint8_t flags = cqe->l4_hdr_type_etc;

	if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET)
		pkt_type =
			TRANSPOSE(flags,
				  MLX5_CQE_RX_OUTER_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4_EXT_UNKNOWN) |
			TRANSPOSE(flags,
				  MLX5_CQE_RX_OUTER_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) |
			TRANSPOSE(flags,
				  MLX5_CQE_RX_IPV4_PACKET,
				  RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) |
			TRANSPOSE(flags,
				  MLX5_CQE_RX_IPV6_PACKET,
				  RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN);
	else
		pkt_type =
			TRANSPOSE(flags,
				  MLX5_CQE_L3_HDR_TYPE_IPV6,
				  RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) |
			TRANSPOSE(flags,
				  MLX5_CQE_L3_HDR_TYPE_IPV4,
				  RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
	return pkt_type;
}

/**
 * Get size of the next packet for a given CQE. For compressed CQEs, the
 * consumer index is updated only once all packets of the current one have
 * been processed.
 *
 * @param rxq
 *   Pointer to RX queue.
 * @param cqe
 *   CQE to process.
 * @param cqe_cnt
 *   Completion queue size minus one (used as an index mask).
 * @param[out] rss_hash
 *   Packet RSS Hash result.
 *
 * @return
 *   Packet size in bytes (0 if there is none), -1 in case of completion
 *   with error.
 */
static inline int
mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
		 uint16_t cqe_cnt, uint32_t *rss_hash)
{
	struct rxq_zip *zip = &rxq->zip;
	uint16_t cqe_n = cqe_cnt + 1;
	int len = 0;

	/* Process compressed data in the CQE and mini arrays. */
	if (zip->ai) {
		volatile struct mlx5_mini_cqe8 (*mc)[8] =
			(volatile struct mlx5_mini_cqe8 (*)[8])
			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt]);

		len = ntohl((*mc)[zip->ai & 7].byte_cnt);
		*rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result);
		if ((++zip->ai & 7) == 0) {
			/*
			 * Increment consumer index to skip the number of
			 * CQEs consumed. Hardware leaves holes in the CQ
			 * ring for software use.
			 */
			zip->ca = zip->na;
			zip->na += 8;
		}
		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
			uint16_t idx = rxq->cq_ci + 1;
			uint16_t end = zip->cq_ci;

			while (idx != end) {
				(*rxq->cqes)[idx & cqe_cnt].op_own =
					MLX5_CQE_INVALIDATE;
				++idx;
			}
			rxq->cq_ci = zip->cq_ci;
			zip->ai = 0;
		}
	/* No compressed data, get next CQE and verify if it is compressed. */
	} else {
		int ret;
		int8_t op_own;

		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
		if (unlikely(ret == 1))
			return 0;
		++rxq->cq_ci;
		op_own = cqe->op_own;
		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
			volatile struct mlx5_mini_cqe8 (*mc)[8] =
				(volatile struct mlx5_mini_cqe8 (*)[8])
				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
							  cqe_cnt]);

			/* Fix endianness. */
			zip->cqe_cnt = ntohl(cqe->byte_cnt);
			/*
			 * Current mini array position is the one returned by
			 * check_cqe().
			 *
			 * If completion comprises several mini arrays, as a
			 * special case the second one is located 7 CQEs after
			 * the initial CQE instead of 8 for subsequent ones.
			 */
			zip->ca = rxq->cq_ci & cqe_cnt;
			zip->na = zip->ca + 7;
			/* Compute the next non compressed CQE. */
			--rxq->cq_ci;
			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
			/* Get packet size to return. */
			len = ntohl((*mc)[0].byte_cnt);
			*rss_hash = ntohl((*mc)[0].rx_hash_result);
			zip->ai = 1;
		} else {
			len = ntohl(cqe->byte_cnt);
			*rss_hash = ntohl(cqe->rx_hash_res);
		}
		/* Error while receiving packet. */
		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
			return -1;
	}
	return len;
}
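
/*
 * Compressed completions: when the device reports MLX5_COMPRESSED, a single
 * "title" CQE is followed by arrays of eight mini-CQEs carrying only byte
 * counts and RSS hashes. The rxq->zip state used above remembers the
 * position inside those arrays across calls so that rxq->cq_ci is advanced
 * only once the whole batch has been consumed.
 */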

/**
 * Translate RX completion flags to offload flags.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param[in] cqe
 *   Pointer to CQE.
 *
 * @return
 *   Offload flags (ol_flags) for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
{
	uint32_t ol_flags = 0;
	uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK;
	uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK;

	if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) ||
	    (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6))
		ol_flags |= TRANSPOSE(cqe->hds_ip_ext,
				      MLX5_CQE_L3_OK,
				      PKT_RX_IP_CKSUM_GOOD);
	if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) ||
	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) ||
	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) ||
	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP))
		ol_flags |= TRANSPOSE(cqe->hds_ip_ext,
				      MLX5_CQE_L4_OK,
				      PKT_RX_L4_CKSUM_GOOD);
	if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
		ol_flags |=
			TRANSPOSE(cqe->l4_hdr_type_etc,
				  MLX5_CQE_RX_OUTER_IP_CSUM_OK,
				  PKT_RX_IP_CKSUM_GOOD) |
			TRANSPOSE(cqe->l4_hdr_type_etc,
				  MLX5_CQE_RX_OUTER_TCP_UDP_CSUM_OK,
				  PKT_RX_L4_CKSUM_GOOD);
	return ol_flags;
}
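
/*
 * Note on scattered RX in the function below: each RX entry is made of
 * 2^rxq->sges_n consecutive WQE segments, one mbuf each. The ring index is
 * therefore handled in units of segments inside the loop
 * (rq_ci = rxq->rq_ci << sges_n) and realigned to the next stride once a
 * packet has been fully received.
 */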

/**
 * DPDK callback for RX.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = dpdk_rxq;
	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
	const unsigned int sges_n = rxq->sges_n;
	struct rte_mbuf *pkt = NULL;
	struct rte_mbuf *seg = NULL;
	volatile struct mlx5_cqe *cqe =
		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
	unsigned int i = 0;
	unsigned int rq_ci = rxq->rq_ci << sges_n;
	int len; /* keep its value across iterations. */

	while (pkts_n) {
		unsigned int idx = rq_ci & wqe_cnt;
		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
		struct rte_mbuf *rep = (*rxq->elts)[idx];
		uint32_t rss_hash_res = 0;

		if (pkt)
			NEXT(seg) = rep;
		seg = rep;
		rte_prefetch0(seg);
		rte_prefetch0(cqe);
		rte_prefetch0(wqe);
		rep = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			++rxq->stats.rx_nombuf;
			if (!pkt) {
				/*
				 * no buffers before we even started,
				 * bail out silently.
				 */
				break;
			}
			while (pkt != seg) {
				assert(pkt != (*rxq->elts)[idx]);
				rep = NEXT(pkt);
				rte_mbuf_refcnt_set(pkt, 0);
				__rte_mbuf_raw_free(pkt);
				pkt = rep;
			}
			break;
		}
		if (!pkt) {
			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
					       &rss_hash_res);
			if (!len) {
				rte_mbuf_refcnt_set(rep, 0);
				__rte_mbuf_raw_free(rep);
				break;
			}
			if (unlikely(len == -1)) {
				/* RX error, packet is likely too large. */
				rte_mbuf_refcnt_set(rep, 0);
				__rte_mbuf_raw_free(rep);
				++rxq->stats.idropped;
				goto skip;
			}
			pkt = seg;
			assert(len >= (rxq->crc_present << 2));
			/* Update packet information. */
			pkt->packet_type = 0;
			pkt->ol_flags = 0;
			if (rss_hash_res && rxq->rss_hash) {
				pkt->hash.rss = rss_hash_res;
				pkt->ol_flags = PKT_RX_RSS_HASH;
			}
			if (rxq->mark &&
			    ((cqe->sop_drop_qpn !=
			      htonl(MLX5_FLOW_MARK_INVALID)) &&
			     (cqe->sop_drop_qpn !=
			      htonl(MLX5_FLOW_MARK_DEFAULT)))) {
				pkt->hash.fdir.hi =
					mlx5_flow_mark_get(cqe->sop_drop_qpn);
				pkt->ol_flags &= ~PKT_RX_RSS_HASH;
				pkt->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
			}
			if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
			    rxq->crc_present) {
				if (rxq->csum) {
					pkt->packet_type =
						rxq_cq_to_pkt_type(cqe);
					pkt->ol_flags |=
						rxq_cq_to_ol_flags(rxq, cqe);
				}
				if (cqe->l4_hdr_type_etc &
				    MLX5_CQE_VLAN_STRIPPED) {
					pkt->ol_flags |= PKT_RX_VLAN_PKT |
						PKT_RX_VLAN_STRIPPED;
					pkt->vlan_tci = ntohs(cqe->vlan_info);
				}
				if (rxq->crc_present)
					len -= ETHER_CRC_LEN;
			}
			PKT_LEN(pkt) = len;
		}
		DATA_LEN(rep) = DATA_LEN(seg);
		PKT_LEN(rep) = PKT_LEN(seg);
		SET_DATA_OFF(rep, DATA_OFF(seg));
		NB_SEGS(rep) = NB_SEGS(seg);
		PORT(rep) = PORT(seg);
		NEXT(rep) = NULL;
		(*rxq->elts)[idx] = rep;
		/*
		 * Fill NIC descriptor with the new buffer. The lkey and size
		 * of the buffers are already known, only the buffer address
		 * changes.
		 */
		wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
		if (len > DATA_LEN(seg)) {
			len -= DATA_LEN(seg);
			++NB_SEGS(pkt);
			++rq_ci;
			continue;
		}
		DATA_LEN(seg) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += PKT_LEN(pkt);
#endif
		/* Return packet. */
		*(pkts++) = pkt;
		pkt = NULL;
		--pkts_n;
		++i;
skip:
		/* Align consumer index to the next stride. */
		rq_ci >>= sges_n;
		++rq_ci;
		rq_ci <<= sges_n;
	}
	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
		return 0;
	/* Update the consumer index. */
	rxq->rq_ci = rq_ci >> sges_n;
	rte_wmb();
	*rxq->cq_db = htonl(rxq->cq_ci);
	rte_wmb();
	*rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += i;
#endif
	return i;
}

/**
 * Dummy DPDK callback for TX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/**
 * Dummy DPDK callback for RX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}