/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5_hw.h>
#include <infiniband/arch.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"

#ifndef NDEBUG

/**
 * Verify or set magic value in CQE.
 *
 * @param cqe
 *   Pointer to CQE.
 *
 * @return
 *   0 the first time.
 */
static inline int
check_cqe_seen(volatile struct mlx5_cqe *cqe)
{
	static const uint8_t magic[] = "seen";
	volatile uint8_t (*buf)[sizeof(cqe->rsvd3)] = &cqe->rsvd3;
	int ret = 1;
	unsigned int i;

	for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
		if (!ret || (*buf)[i] != magic[i]) {
			ret = 0;
			(*buf)[i] = magic[i];
		}
	return ret;
}

#endif /* NDEBUG */

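/*
 * A CQE is considered valid by check_cqe() below when its ownership bit
 * matches the parity of the consumer index relative to the queue size
 * (ci & cqes_n) and its opcode is not MLX5_CQE_INVALID; the ownership bit
 * flips each time the consumer index wraps around the completion queue.
 */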
static inline int
check_cqe(volatile struct mlx5_cqe *cqe,
	  unsigned int cqes_n, const uint16_t ci)
	  __attribute__((always_inline));

/**
 * Check whether CQE is valid.
 *
 * @param cqe
 *   Pointer to CQE.
 * @param cqes_n
 *   Size of completion queue.
 * @param ci
 *   Consumer index.
 *
 * @return
 *   0 on success, 1 on failure.
 */
static inline int
check_cqe(volatile struct mlx5_cqe *cqe,
	  unsigned int cqes_n, const uint16_t ci)
{
	uint16_t idx = ci & cqes_n;
	uint8_t op_own = cqe->op_own;
	uint8_t op_owner = MLX5_CQE_OWNER(op_own);
	uint8_t op_code = MLX5_CQE_OPCODE(op_own);

	if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
		return 1; /* No CQE. */
#ifndef NDEBUG
	if ((op_code == MLX5_CQE_RESP_ERR) ||
	    (op_code == MLX5_CQE_REQ_ERR)) {
		volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
		uint8_t syndrome = err_cqe->syndrome;

		if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
		    (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
			return 0;
		if (!check_cqe_seen(cqe))
			ERROR("unexpected CQE error %u (0x%02x)"
			      " syndrome 0x%02x",
			      op_code, op_code, syndrome);
		return 1;
	} else if ((op_code != MLX5_CQE_RESP_SEND) &&
		   (op_code != MLX5_CQE_REQ)) {
		if (!check_cqe_seen(cqe))
			ERROR("unexpected CQE opcode %u (0x%02x)",
			      op_code, op_code);
		return 1;
	}
#endif /* NDEBUG */
	return 0;
}

static inline void
txq_complete(struct txq *txq) __attribute__((always_inline));

/**
 * Manage TX completions.
 *
 * When sending a burst, mlx5_tx_burst() posts several WRs.
 *
 * @param txq
 *   Pointer to TX queue structure.
 */
static inline void
txq_complete(struct txq *txq)
{
	const unsigned int elts_n = 1 << txq->elts_n;
	const unsigned int cqe_n = 1 << txq->cqe_n;
	const unsigned int cqe_cnt = cqe_n - 1;
	uint16_t elts_free = txq->elts_tail;
	uint16_t elts_tail;
	uint16_t cq_ci = txq->cq_ci;
	volatile struct mlx5_cqe *cqe = NULL;
	volatile struct mlx5_wqe *wqe;

	do {
		volatile struct mlx5_cqe *tmp;

		tmp = &(*txq->cqes)[cq_ci & cqe_cnt];
		if (check_cqe(tmp, cqe_n, cq_ci))
			break;
		cqe = tmp;
#ifndef NDEBUG
		if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
			if (!check_cqe_seen(cqe))
				ERROR("unexpected compressed CQE, TX stopped");
			return;
		}
		if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
		    (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
			if (!check_cqe_seen(cqe))
				ERROR("unexpected error CQE, TX stopped");
			return;
		}
#endif /* NDEBUG */
		++cq_ci;
	} while (1);
	if (unlikely(cqe == NULL))
		return;
	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) &
			    ((1 << txq->wqe_n) - 1)].hdr;
	elts_tail = wqe->ctrl[3];
	assert(elts_tail < (1 << txq->wqe_n));
	/* Free buffers. */
	while (elts_free != elts_tail) {
		struct rte_mbuf *elt = (*txq->elts)[elts_free];
		unsigned int elts_free_next =
			(elts_free + 1) & (elts_n - 1);
		struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];

#ifndef NDEBUG
		/* Poisoning. */
		memset(&(*txq->elts)[elts_free],
		       0x66,
		       sizeof((*txq->elts)[elts_free]));
#endif
		RTE_MBUF_PREFETCH_TO_FREE(elt_next);
		/* Only one segment needs to be freed. */
		rte_pktmbuf_free_seg(elt);
		elts_free = elts_free_next;
	}
	txq->cq_ci = cq_ci;
	txq->elts_tail = elts_tail;
	/* Update the consumer index. */
	rte_wmb();
	*txq->cq_db = htonl(cq_ci);
}

/**
 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
 * the cloned mbuf is allocated is returned instead.
 *
 * @param buf
 *   Pointer to mbuf.
 *
 * @return
 *   Memory pool where data is located for given mbuf.
 */
static struct rte_mempool *
txq_mb2mp(struct rte_mbuf *buf)
{
	if (unlikely(RTE_MBUF_INDIRECT(buf)))
		return rte_mbuf_from_indirect(buf)->pool;
	return buf->pool;
}

static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
	__attribute__((always_inline));

/**
 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
 * remove an entry first.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] mp
 *   Memory Pool for which a Memory Region lkey must be returned.
 *
 * @return
 *   mr->lkey on success, (uint32_t)-1 on failure.
 */
static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
	unsigned int i;
	uint32_t lkey = (uint32_t)-1;

	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(htonl(txq->mp2mr[i].mr->lkey) ==
			       txq->mp2mr[i].lkey);
			lkey = txq->mp2mr[i].lkey;
			break;
		}
	}
	if (unlikely(lkey == (uint32_t)-1))
		lkey = txq_mp2mr_reg(txq, mp, i);
	return lkey;
}

/**
 * Ring TX queue doorbell.
 *
 * @param txq
 *   Pointer to TX queue structure.
 */
static inline void
mlx5_tx_dbrec(struct txq *txq)
{
	uint8_t *dst = (uint8_t *)((uintptr_t)txq->bf_reg + txq->bf_offset);
	uint32_t data[4] = {
		htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
		htonl(txq->qp_num_8s),
		0,
		0,
	};
	rte_wmb();
	*txq->qp_db = htonl(txq->wqe_ci);
	/* Ensure ordering between DB record and BF copy. */
	rte_wmb();
	memcpy(dst, (uint8_t *)data, 16);
	txq->bf_offset ^= (1 << txq->bf_buf_size);
}

/**
 * Prefetch a CQE.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param cqe_ci
 *   CQE consumer index.
 */
static inline void
tx_prefetch_cqe(struct txq *txq, uint16_t ci)
{
	volatile struct mlx5_cqe *cqe;

	cqe = &(*txq->cqes)[ci & ((1 << txq->cqe_n) - 1)];
	rte_prefetch0(cqe);
}

/**
 * Prefetch a WQE.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param wqe_ci
 *   WQE consumer index.
 */
static inline void
tx_prefetch_wqe(struct txq *txq, uint16_t ci)
{
	volatile struct mlx5_wqe64 *wqe;

	wqe = &(*txq->wqes)[ci & ((1 << txq->wqe_n) - 1)];
	rte_prefetch0(wqe);
}

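/*
 * Work request sizes in the TX path below are accounted in 16-byte units
 * (MLX5_WQE_DWORD_SIZE): "ds" counts how many such units a packet consumes
 * (the control and Ethernet segments, any inlined data, and one data segment
 * per remaining buffer) and is stored in ctrl[1] together with the QP number.
 * Data segments that do not fit in the current WQE spill over into the next
 * ring entry.
 */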
/**
 * DPDK callback for TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = 1 << txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	volatile struct mlx5_wqe *wqe = NULL;
	unsigned int segs_n = 0;
	struct rte_mbuf *buf = NULL;
	uint8_t *raw;

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_cqe(txq, txq->cq_ci + 1);
	rte_prefetch0(*pkts);
	/* Start processing. */
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		volatile struct mlx5_wqe_data_seg *dseg = NULL;
		uint32_t length;
		unsigned int ds = 0;
		uintptr_t addr;
#ifdef MLX5_PMD_SOFT_COUNTERS
		uint32_t total_length = 0;
#endif

		/* first_seg */
		buf = *(pkts++);
		segs_n = buf->nb_segs;
		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		max -= segs_n;
		--segs_n;
		if (!segs_n)
			--pkts_n;
		wqe = &(*txq->wqes)[txq->wqe_ci &
				    ((1 << txq->wqe_n) - 1)].hdr;
		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
		if (pkts_n > 1)
			rte_prefetch0(*pkts);
		addr = rte_pktmbuf_mtod(buf, uintptr_t);
		length = DATA_LEN(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		total_length = length;
#endif
		assert(length >= MLX5_WQE_DWORD_SIZE);
		/* Update element. */
		(*txq->elts)[elts_head] = buf;
		elts_head = (elts_head + 1) & (elts_n - 1);
		/* Prefetch next buffer data. */
		if (pkts_n > 1) {
			volatile void *pkt_addr;

			pkt_addr = rte_pktmbuf_mtod(*pkts, volatile void *);
			rte_prefetch0(pkt_addr);
		}
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			wqe->eseg.cs_flags =
				MLX5_ETH_WQE_L3_CSUM |
				MLX5_ETH_WQE_L4_CSUM;
		} else {
			wqe->eseg.cs_flags = 0;
		}
		raw = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
		/* Start the known and common part of the WQE structure. */
		wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
		wqe->ctrl[2] = 0;
		wqe->ctrl[3] = 0;
		wqe->eseg.rsvd0 = 0;
		wqe->eseg.rsvd1 = 0;
		wqe->eseg.mss = 0;
		wqe->eseg.rsvd2 = 0;
		/* Start by copying the Ethernet header. */
		memcpy((uint8_t *)raw, ((uint8_t *)addr), 16);
		length -= MLX5_WQE_DWORD_SIZE;
		addr += MLX5_WQE_DWORD_SIZE;
		/* Replace the Ethernet type by the VLAN if necessary. */
		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);

			memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE -
					   sizeof(vlan)),
			       &vlan, sizeof(vlan));
			addr -= sizeof(vlan);
			length += sizeof(vlan);
		}
		/* Inline if enough room. */
		if (txq->max_inline != 0) {
			uintptr_t end =
				(uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
			uint16_t max_inline =
				txq->max_inline * RTE_CACHE_LINE_SIZE;
			uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
			uint16_t room;

			raw += MLX5_WQE_DWORD_SIZE;
			room = end - (uintptr_t)raw;
			if (room > max_inline) {
				uintptr_t addr_end = (addr + max_inline) &
					~(RTE_CACHE_LINE_SIZE - 1);
				uint16_t copy_b = ((addr_end - addr) > length) ?
						  length :
						  (addr_end - addr);

				rte_memcpy((void *)raw, (void *)addr, copy_b);
				addr += copy_b;
				length -= copy_b;
				pkt_inline_sz += copy_b;
				/* Sanity check. */
				assert(addr <= addr_end);
			}
			/* Store the inlined packet size in the WQE. */
			wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
			/*
			 * 2 DWORDs consumed by the WQE header + 1 DSEG +
			 * the size of the inline part of the packet.
			 */
			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
			if (length > 0) {
				dseg = (struct mlx5_wqe_data_seg *)
					((uintptr_t)wqe +
					 (ds * MLX5_WQE_DWORD_SIZE));
				if ((uintptr_t)dseg >= end)
					dseg = (struct mlx5_wqe_data_seg *)
						((uintptr_t)&(*txq->wqes)[0]);
				goto use_dseg;
			} else if (!segs_n) {
				goto next_pkt;
			} else {
				goto next_seg;
			}
		} else {
			/*
			 * No inline has been done in the packet, only the
			 * Ethernet header has been stored.
			 */
			wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
			dseg = (struct mlx5_wqe_data_seg *)
				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
			ds = 3;
use_dseg:
			/* Add the remaining packet as a simple ds. */
			*dseg = (struct mlx5_wqe_data_seg) {
				.addr = htonll(addr),
				.byte_count = htonl(length),
				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
			};
			++ds;
			if (!segs_n)
				goto next_pkt;
		}
next_seg:
		assert(buf);
		assert(ds);
		assert(wqe);
		/*
		 * Spill on next WQE when the current one does not have
		 * enough room left. Size of WQE must be a multiple
		 * of data segment size.
		 */
		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
			unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
				((1 << txq->wqe_n) - 1);

			dseg = (struct mlx5_wqe_data_seg *)
				((uintptr_t)&(*txq->wqes)[n]);
			tx_prefetch_wqe(txq, n + 1);
		} else {
			++dseg;
		}
		++ds;
		buf = buf->next;
		assert(buf);
		length = DATA_LEN(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		total_length += length;
#endif
		/* Store segment information. */
		*dseg = (struct mlx5_wqe_data_seg) {
			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
			.byte_count = htonl(length),
			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
		};
		(*txq->elts)[elts_head] = buf;
		elts_head = (elts_head + 1) & (elts_n - 1);
		++j;
		--segs_n;
		if (segs_n)
			goto next_seg;
		else
			--pkts_n;
next_pkt:
		++i;
		wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
		txq->wqe_ci += (ds + 3) / 4;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += total_length;
#endif
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	comp = txq->elts_comp + i + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	mlx5_tx_dbrec(txq);
	txq->elts_head = elts_head;
	return i;
}

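/*
 * Multi-Packet Write (MPW) support: the functions below let consecutive
 * packets of identical length and checksum flags share a single work
 * request. A session is opened for the first such packet, data segments
 * are appended for the following ones, and the session is closed as soon
 * as a packet differs, MLX5_MPW_DSEG_MAX segments have been used, or the
 * burst ends.
 */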
/**
 * Open an MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 * @param length
 *   Packet length.
 */
static inline void
mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
{
	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
		(volatile struct mlx5_wqe_data_seg (*)[])
		(uintptr_t)&(*txq->wqes)[(idx + 1) & ((1 << txq->wqe_n) - 1)];

	mpw->state = MLX5_MPW_STATE_OPENED;
	mpw->pkts_n = 0;
	mpw->len = length;
	mpw->total_len = 0;
	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
	mpw->wqe->eseg.mss = htons(length);
	mpw->wqe->eseg.inline_hdr_sz = 0;
	mpw->wqe->eseg.rsvd0 = 0;
	mpw->wqe->eseg.rsvd1 = 0;
	mpw->wqe->eseg.rsvd2 = 0;
	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
				  (txq->wqe_ci << 8) | MLX5_OPCODE_TSO);
	mpw->wqe->ctrl[2] = 0;
	mpw->wqe->ctrl[3] = 0;
	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
	mpw->data.dseg[2] = &(*dseg)[0];
	mpw->data.dseg[3] = &(*dseg)[1];
	mpw->data.dseg[4] = &(*dseg)[2];
}

/**
 * Close an MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 */
static inline void
mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
{
	unsigned int num = mpw->pkts_n;

	/*
	 * Store size in multiples of 16 bytes. Control and Ethernet segments
	 * count as 2.
	 */
	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	if (num < 3)
		++txq->wqe_ci;
	else
		txq->wqe_ci += 2;
	tx_prefetch_wqe(txq, txq->wqe_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
}

/**
 * DPDK callback for TX with MPW support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = 1 << txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	};

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
	/* Start processing. */
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		struct rte_mbuf *buf = *(pkts++);
		unsigned int elts_head_next;
		uint32_t length;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		/* Do not bother with large packets MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX)
			break;
		max -= segs_n;
		--pkts_n;
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		assert(length);
		/* Start new session if packet differs. */
		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
		    ((mpw.len != length) ||
		     (segs_n != 1) ||
		     (mpw.wqe->eseg.cs_flags != cs_flags)))
			mlx5_mpw_close(txq, &mpw);
		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
			mlx5_mpw_new(txq, &mpw, length);
			mpw.wqe->eseg.cs_flags = cs_flags;
		}
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
		length = 0;
#endif
		do {
			volatile struct mlx5_wqe_data_seg *dseg;
			uintptr_t addr;

			elts_head_next = (elts_head + 1) & (elts_n - 1);
			assert(buf);
			(*txq->elts)[elts_head] = buf;
			dseg = mpw.data.dseg[mpw.pkts_n];
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			*dseg = (struct mlx5_wqe_data_seg){
				.byte_count = htonl(DATA_LEN(buf)),
				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
				.addr = htonll(addr),
			};
			elts_head = elts_head_next;
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			length += DATA_LEN(buf);
#endif
			buf = buf->next;
			++mpw.pkts_n;
			++j;
		} while (--segs_n);
		assert(length == mpw.len);
		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
			mlx5_mpw_close(txq, &mpw);
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile struct mlx5_wqe *wqe = mpw.wqe;

		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq);
	txq->elts_head = elts_head;
	return i;
}

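/*
 * MPW inline sessions differ from plain MPW sessions in that packet data
 * is copied straight into the work queue behind an inline header instead
 * of being referenced through data segments. Only packets that fit in the
 * remaining inline room (txq->max_inline * RTE_CACHE_LINE_SIZE) are sent
 * this way; larger ones fall back to a regular MPW session.
 */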
/**
 * Open an MPW inline session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 * @param length
 *   Packet length.
 */
static inline void
mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
{
	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
	struct mlx5_wqe_inl_small *inl;

	mpw->state = MLX5_MPW_INL_STATE_OPENED;
	mpw->pkts_n = 0;
	mpw->len = length;
	mpw->total_len = 0;
	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
				  (txq->wqe_ci << 8) |
				  MLX5_OPCODE_TSO);
	mpw->wqe->ctrl[2] = 0;
	mpw->wqe->ctrl[3] = 0;
	mpw->wqe->eseg.mss = htons(length);
	mpw->wqe->eseg.inline_hdr_sz = 0;
	mpw->wqe->eseg.cs_flags = 0;
	mpw->wqe->eseg.rsvd0 = 0;
	mpw->wqe->eseg.rsvd1 = 0;
	mpw->wqe->eseg.rsvd2 = 0;
	inl = (struct mlx5_wqe_inl_small *)
		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
	mpw->data.raw = (uint8_t *)&inl->raw;
}

/**
 * Close an MPW inline session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 */
static inline void
mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
{
	unsigned int size;
	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));

	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
	/*
	 * Store size in multiples of 16 bytes. Control and Ethernet segments
	 * count as 2.
	 */
	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
}

/**
 * DPDK callback for TX with MPW inline support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
			 uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = 1 << txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	};

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
	/* Start processing. */
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		struct rte_mbuf *buf = *(pkts++);
		unsigned int elts_head_next;
		uintptr_t addr;
		uint32_t length;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		/* Do not bother with large packets MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX)
			break;
		max -= segs_n;
		--pkts_n;
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		/* Start new session if packet differs. */
		if (mpw.state == MLX5_MPW_STATE_OPENED) {
			if ((mpw.len != length) ||
			    (segs_n != 1) ||
			    (mpw.wqe->eseg.cs_flags != cs_flags))
				mlx5_mpw_close(txq, &mpw);
		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
			if ((mpw.len != length) ||
			    (segs_n != 1) ||
			    (length > inline_room) ||
			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
				mlx5_mpw_inline_close(txq, &mpw);
				inline_room =
					txq->max_inline * RTE_CACHE_LINE_SIZE;
			}
		}
		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
			if ((segs_n != 1) ||
			    (length > inline_room)) {
				mlx5_mpw_new(txq, &mpw, length);
				mpw.wqe->eseg.cs_flags = cs_flags;
			} else {
				mlx5_mpw_inline_new(txq, &mpw, length);
				mpw.wqe->eseg.cs_flags = cs_flags;
			}
		}
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
		if (mpw.state == MLX5_MPW_STATE_OPENED) {
			assert(inline_room ==
			       txq->max_inline * RTE_CACHE_LINE_SIZE);
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			length = 0;
#endif
			do {
				volatile struct mlx5_wqe_data_seg *dseg;

				elts_head_next =
					(elts_head + 1) & (elts_n - 1);
				assert(buf);
				(*txq->elts)[elts_head] = buf;
				dseg = mpw.data.dseg[mpw.pkts_n];
				addr = rte_pktmbuf_mtod(buf, uintptr_t);
				*dseg = (struct mlx5_wqe_data_seg){
					.byte_count = htonl(DATA_LEN(buf)),
					.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
					.addr = htonll(addr),
				};
				elts_head = elts_head_next;
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
				length += DATA_LEN(buf);
#endif
				buf = buf->next;
				++mpw.pkts_n;
				++j;
			} while (--segs_n);
			assert(length == mpw.len);
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
				mlx5_mpw_close(txq, &mpw);
		} else {
			unsigned int max;

			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
			assert(length <= inline_room);
			assert(length == DATA_LEN(buf));
			elts_head_next = (elts_head + 1) & (elts_n - 1);
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			(*txq->elts)[elts_head] = buf;
			/* Maximum number of bytes before wrapping. */
			max = ((uintptr_t)&(*txq->wqes)[1 << txq->wqe_n] -
			       (uintptr_t)mpw.data.raw);
			if (length > max) {
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)addr,
					   max);
				mpw.data.raw =
					(volatile void *)&(*txq->wqes)[0];
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)(addr + max),
					   length - max);
				mpw.data.raw += length - max;
			} else {
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)addr,
					   length);
				mpw.data.raw += length;
			}
			if ((uintptr_t)mpw.data.raw ==
			    (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n])
				mpw.data.raw =
					(volatile void *)&(*txq->wqes)[0];
			++mpw.pkts_n;
			++j;
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
				mlx5_mpw_inline_close(txq, &mpw);
				inline_room =
					txq->max_inline * RTE_CACHE_LINE_SIZE;
			} else {
				inline_room -= length;
			}
		}
		mpw.total_len += length;
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile struct mlx5_wqe *wqe = mpw.wqe;

		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
		mlx5_mpw_inline_close(txq, &mpw);
	else if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq);
	txq->elts_head = elts_head;
	return i;
}

/**
 * Translate RX completion flags to packet type.
 *
 * @param[in] cqe
 *   Pointer to CQE.
 *
 * @note: fix mlx5_dev_supported_ptypes_get() if anything changes here.
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
{
	uint32_t pkt_type;
	uint8_t flags = cqe->l4_hdr_type_etc;

	if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET)
		pkt_type =
			TRANSPOSE(flags,
				  MLX5_CQE_RX_OUTER_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  MLX5_CQE_RX_OUTER_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6) |
			TRANSPOSE(flags,
				  MLX5_CQE_RX_IPV4_PACKET,
				  RTE_PTYPE_INNER_L3_IPV4) |
			TRANSPOSE(flags,
				  MLX5_CQE_RX_IPV6_PACKET,
				  RTE_PTYPE_INNER_L3_IPV6);
	else
		pkt_type =
			TRANSPOSE(flags,
				  MLX5_CQE_L3_HDR_TYPE_IPV6,
				  RTE_PTYPE_L3_IPV6) |
			TRANSPOSE(flags,
				  MLX5_CQE_L3_HDR_TYPE_IPV4,
				  RTE_PTYPE_L3_IPV4);
	return pkt_type;
}

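/*
 * Compressed CQEs: the device may report several completions through a
 * single CQE followed by arrays of eight "mini CQEs" that only carry the
 * byte count and RSS hash result of each packet. The rxq_zip state used
 * below tracks the current position inside such a session so that the CQ
 * consumer index is advanced only once the whole block has been consumed.
 */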
/**
 * Get size of the next packet for a given CQE. For compressed CQEs, the
 * consumer index is updated only once all packets of the current one have
 * been processed.
 *
 * @param rxq
 *   Pointer to RX queue.
 * @param cqe
 *   CQE to process.
 * @param[out] rss_hash
 *   Packet RSS Hash result.
 *
 * @return
 *   Packet size in bytes (0 if there is none), -1 in case of completion
 *   with error.
 */
static inline int
mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
		 uint16_t cqe_cnt, uint32_t *rss_hash)
{
	struct rxq_zip *zip = &rxq->zip;
	uint16_t cqe_n = cqe_cnt + 1;
	int len = 0;

	/* Process compressed data in the CQE and mini arrays. */
	if (zip->ai) {
		volatile struct mlx5_mini_cqe8 (*mc)[8] =
			(volatile struct mlx5_mini_cqe8 (*)[8])
			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt]);

		len = ntohl((*mc)[zip->ai & 7].byte_cnt);
		*rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result);
		if ((++zip->ai & 7) == 0) {
			/*
			 * Increment consumer index to skip the number of
			 * CQEs consumed. Hardware leaves holes in the CQ
			 * ring for software use.
			 */
			zip->ca = zip->na;
			zip->na += 8;
		}
		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
			uint16_t idx = rxq->cq_ci;
			uint16_t end = zip->cq_ci;

			while (idx != end) {
				(*rxq->cqes)[idx & cqe_cnt].op_own =
					MLX5_CQE_INVALIDATE;
				++idx;
			}
			rxq->cq_ci = zip->cq_ci;
			zip->ai = 0;
		}
	/* No compressed data, get next CQE and verify if it is compressed. */
	} else {
		int ret;
		int8_t op_own;

		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
		if (unlikely(ret == 1))
			return 0;
		++rxq->cq_ci;
		op_own = cqe->op_own;
		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
			volatile struct mlx5_mini_cqe8 (*mc)[8] =
				(volatile struct mlx5_mini_cqe8 (*)[8])
				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
							  cqe_cnt]);

			/* Fix endianness. */
			zip->cqe_cnt = ntohl(cqe->byte_cnt);
			/*
			 * Current mini array position is the one returned by
			 * check_cqe().
			 *
			 * If completion comprises several mini arrays, as a
			 * special case the second one is located 7 CQEs after
			 * the initial CQE instead of 8 for subsequent ones.
			 */
			zip->ca = rxq->cq_ci & cqe_cnt;
			zip->na = zip->ca + 7;
			/* Compute the next non compressed CQE. */
			--rxq->cq_ci;
			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
			/* Get packet size to return. */
			len = ntohl((*mc)[0].byte_cnt);
			*rss_hash = ntohl((*mc)[0].rx_hash_result);
			zip->ai = 1;
		} else {
			len = ntohl(cqe->byte_cnt);
			*rss_hash = ntohl(cqe->rx_hash_res);
		}
		/* Error while receiving packet. */
		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
			return -1;
	}
	return len;
}

/**
 * Translate RX completion flags to offload flags.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param[in] cqe
 *   Pointer to CQE.
 *
 * @return
 *   Offload flags (ol_flags) for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
{
	uint32_t ol_flags = 0;
	uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK;
	uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK;

	if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) ||
	    (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6))
		ol_flags |= TRANSPOSE(cqe->hds_ip_ext,
				      MLX5_CQE_L3_OK,
				      PKT_RX_IP_CKSUM_GOOD);
	if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) ||
	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) ||
	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) ||
	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP))
		ol_flags |= TRANSPOSE(cqe->hds_ip_ext,
				      MLX5_CQE_L4_OK,
				      PKT_RX_L4_CKSUM_GOOD);
	if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
		ol_flags |=
			TRANSPOSE(cqe->l4_hdr_type_etc,
				  MLX5_CQE_RX_OUTER_IP_CSUM_OK,
				  PKT_RX_IP_CKSUM_GOOD) |
			TRANSPOSE(cqe->l4_hdr_type_etc,
				  MLX5_CQE_RX_OUTER_TCP_UDP_CSUM_OK,
				  PKT_RX_L4_CKSUM_GOOD);
	return ol_flags;
}

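/*
 * The RX ring is organized in strides of (1 << rxq->sges_n) descriptors so
 * that scattered packets can span several consecutive entries. The receive
 * loop below advances the ring index per segment while chaining mbufs and
 * re-aligns it to the next stride once a packet is complete.
 */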
/**
 * DPDK callback for RX.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = dpdk_rxq;
	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
	const unsigned int sges_n = rxq->sges_n;
	struct rte_mbuf *pkt = NULL;
	struct rte_mbuf *seg = NULL;
	volatile struct mlx5_cqe *cqe =
		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
	unsigned int i = 0;
	unsigned int rq_ci = rxq->rq_ci << sges_n;
	int len; /* keep its value across iterations. */

	while (pkts_n) {
		unsigned int idx = rq_ci & wqe_cnt;
		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
		struct rte_mbuf *rep = (*rxq->elts)[idx];
		uint32_t rss_hash_res = 0;

		if (pkt)
			NEXT(seg) = rep;
		seg = rep;
		rte_prefetch0(seg);
		rte_prefetch0(cqe);
		rte_prefetch0(wqe);
		rep = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			++rxq->stats.rx_nombuf;
			if (!pkt) {
				/*
				 * No buffers before we even started,
				 * bail out silently.
				 */
				break;
			}
			while (pkt != seg) {
				assert(pkt != (*rxq->elts)[idx]);
				seg = NEXT(pkt);
				rte_mbuf_refcnt_set(pkt, 0);
				__rte_mbuf_raw_free(pkt);
				pkt = seg;
			}
			break;
		}
		if (!pkt) {
			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
					       &rss_hash_res);
			if (!len) {
				rte_mbuf_refcnt_set(rep, 0);
				__rte_mbuf_raw_free(rep);
				break;
			}
			if (unlikely(len == -1)) {
				/* RX error, packet is likely too large. */
				rte_mbuf_refcnt_set(rep, 0);
				__rte_mbuf_raw_free(rep);
				++rxq->stats.idropped;
				goto skip;
			}
			pkt = seg;
			assert(len >= (rxq->crc_present << 2));
			/* Update packet information. */
			pkt->packet_type = 0;
			pkt->ol_flags = 0;
			if (rxq->rss_hash) {
				pkt->hash.rss = rss_hash_res;
				pkt->ol_flags = PKT_RX_RSS_HASH;
			}
			if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
			    rxq->crc_present) {
				if (rxq->csum) {
					pkt->packet_type =
						rxq_cq_to_pkt_type(cqe);
					pkt->ol_flags |=
						rxq_cq_to_ol_flags(rxq, cqe);
				}
				if (cqe->l4_hdr_type_etc &
				    MLX5_CQE_VLAN_STRIPPED) {
					pkt->ol_flags |= PKT_RX_VLAN_PKT |
						PKT_RX_VLAN_STRIPPED;
					pkt->vlan_tci = ntohs(cqe->vlan_info);
				}
				if (rxq->crc_present)
					len -= ETHER_CRC_LEN;
			}
			PKT_LEN(pkt) = len;
		}
		DATA_LEN(rep) = DATA_LEN(seg);
		PKT_LEN(rep) = PKT_LEN(seg);
		SET_DATA_OFF(rep, DATA_OFF(seg));
		NB_SEGS(rep) = NB_SEGS(seg);
		PORT(rep) = PORT(seg);
		NEXT(rep) = NULL;
		(*rxq->elts)[idx] = rep;
		/*
		 * Fill NIC descriptor with the new buffer. The lkey and size
		 * of the buffers are already known, only the buffer address
		 * changes.
		 */
		wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
		if (len > DATA_LEN(seg)) {
			len -= DATA_LEN(seg);
			++NB_SEGS(pkt);
			++rq_ci;
			continue;
		}
		DATA_LEN(seg) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += PKT_LEN(pkt);
#endif
		/* Return packet. */
		*(pkts++) = pkt;
		pkt = NULL;
		--pkts_n;
		++i;
skip:
		/* Align consumer index to the next stride. */
		rq_ci >>= sges_n;
		++rq_ci;
		rq_ci <<= sges_n;
	}
	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
		return 0;
	/* Update the consumer index. */
	rxq->rq_ci = rq_ci >> sges_n;
	rte_wmb();
	*rxq->cq_db = htonl(rxq->cq_ci);
	rte_wmb();
	*rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += i;
#endif
	return i;
}

/**
 * Dummy DPDK callback for TX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/**
 * Dummy DPDK callback for RX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}