/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5_hw.h>
#include <infiniband/arch.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"

#ifndef NDEBUG

/**
 * Verify or set magic value in CQE.
 *
 * @param cqe
 *   Pointer to CQE.
 *
 * @return
 *   0 the first time.
 */
static inline int
check_cqe64_seen(volatile struct mlx5_cqe64 *cqe)
{
	static const uint8_t magic[] = "seen";
	volatile uint8_t (*buf)[sizeof(cqe->rsvd40)] = &cqe->rsvd40;
	int ret = 1;
	unsigned int i;

	for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
		if (!ret || (*buf)[i] != magic[i]) {
			ret = 0;
			(*buf)[i] = magic[i];
		}
	return ret;
}

#endif /* NDEBUG */

static inline int
check_cqe64(volatile struct mlx5_cqe64 *cqe,
	    unsigned int cqes_n, const uint16_t ci)
	    __attribute__((always_inline));

/**
 * Check whether CQE is valid.
 *
 * @param cqe
 *   Pointer to CQE.
 * @param cqes_n
 *   Size of completion queue.
 * @param ci
 *   Consumer index.
 *
 * @return
 *   0 on success, 1 on failure.
 */
static inline int
check_cqe64(volatile struct mlx5_cqe64 *cqe,
	    unsigned int cqes_n, const uint16_t ci)
{
	uint16_t idx = ci & cqes_n;
	uint8_t op_own = cqe->op_own;
	uint8_t op_owner = MLX5_CQE_OWNER(op_own);
	uint8_t op_code = MLX5_CQE_OPCODE(op_own);

	if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
		return 1; /* No CQE. */
#ifndef NDEBUG
	if ((op_code == MLX5_CQE_RESP_ERR) ||
	    (op_code == MLX5_CQE_REQ_ERR)) {
		volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
		uint8_t syndrome = err_cqe->syndrome;

		if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
		    (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
			return 0;
		if (!check_cqe64_seen(cqe))
			ERROR("unexpected CQE error %u (0x%02x)"
			      " syndrome 0x%02x",
			      op_code, op_code, syndrome);
		return 1;
	} else if ((op_code != MLX5_CQE_RESP_SEND) &&
		   (op_code != MLX5_CQE_REQ)) {
		if (!check_cqe64_seen(cqe))
			ERROR("unexpected CQE opcode %u (0x%02x)",
			      op_code, op_code);
		return 1;
	}
#endif /* NDEBUG */
	return 0;
}

/**
 * Manage TX completions.
 *
 * When sending a burst, mlx5_tx_burst() posts several WRs.
 *
 * @param txq
 *   Pointer to TX queue structure.
 */
static void
txq_complete(struct txq *txq)
{
	const unsigned int elts_n = txq->elts_n;
	const unsigned int cqe_n = txq->cqe_n;
	const unsigned int cqe_cnt = cqe_n - 1;
	uint16_t elts_free = txq->elts_tail;
	uint16_t elts_tail;
	uint16_t cq_ci = txq->cq_ci;
	volatile struct mlx5_cqe64 *cqe = NULL;
	volatile union mlx5_wqe *wqe;

	do {
		volatile struct mlx5_cqe64 *tmp;

		tmp = &(*txq->cqes)[cq_ci & cqe_cnt].cqe64;
		if (check_cqe64(tmp, cqe_n, cq_ci))
			break;
		cqe = tmp;
#ifndef NDEBUG
		if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
			if (!check_cqe64_seen(cqe))
				ERROR("unexpected compressed CQE, TX stopped");
			return;
		}
		if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
		    (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
			if (!check_cqe64_seen(cqe))
				ERROR("unexpected error CQE, TX stopped");
			return;
		}
#endif /* NDEBUG */
		++cq_ci;
	} while (1);
	if (unlikely(cqe == NULL))
		return;
	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)];
	elts_tail = wqe->wqe.ctrl.data[3];
	assert(elts_tail < txq->wqe_n);
	/* Free buffers. */
	while (elts_free != elts_tail) {
		struct rte_mbuf *elt = (*txq->elts)[elts_free];
		unsigned int elts_free_next =
			(elts_free + 1) & (elts_n - 1);
		struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];

#ifndef NDEBUG
		/* Poisoning. */
		memset(&(*txq->elts)[elts_free],
		       0x66,
		       sizeof((*txq->elts)[elts_free]));
#endif
		RTE_MBUF_PREFETCH_TO_FREE(elt_next);
		/* Only one segment needs to be freed. */
		rte_pktmbuf_free_seg(elt);
		elts_free = elts_free_next;
	}
	txq->cq_ci = cq_ci;
	txq->elts_tail = elts_tail;
	/* Update the consumer index. */
	rte_wmb();
	*txq->cq_db = htonl(cq_ci);
}

/**
 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
 * the cloned mbuf is allocated is returned instead.
 *
 * @param buf
 *   Pointer to mbuf.
 *
 * @return
 *   Memory pool where data is located for given mbuf.
 */
static struct rte_mempool *
txq_mb2mp(struct rte_mbuf *buf)
{
	if (unlikely(RTE_MBUF_INDIRECT(buf)))
		return rte_mbuf_from_indirect(buf)->pool;
	return buf->pool;
}

static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
	__attribute__((always_inline));

/**
 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
 * remove an entry first.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] mp
 *   Memory Pool for which a Memory Region lkey must be returned.
 *
 * @return
 *   mr->lkey on success, (uint32_t)-1 on failure.
 */
static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
	unsigned int i;
	uint32_t lkey = (uint32_t)-1;

	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(htonl(txq->mp2mr[i].mr->lkey) ==
			       txq->mp2mr[i].lkey);
			lkey = txq->mp2mr[i].lkey;
			break;
		}
	}
	if (unlikely(lkey == (uint32_t)-1))
		lkey = txq_mp2mr_reg(txq, mp, i);
	return lkey;
}

/**
 * Write a regular WQE.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param wqe
 *   Pointer to the WQE to fill.
 * @param addr
 *   Buffer data address.
 * @param length
 *   Packet length.
 * @param lkey
 *   Memory region lkey.
 */
static inline void
mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
	       uintptr_t addr, uint32_t length, uint32_t lkey)
{
	wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
	wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
	wqe->wqe.ctrl.data[2] = 0;
	wqe->wqe.ctrl.data[3] = 0;
	wqe->inl.eseg.rsvd0 = 0;
	wqe->inl.eseg.rsvd1 = 0;
	wqe->inl.eseg.mss = 0;
	wqe->inl.eseg.rsvd2 = 0;
	wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
	/* Copy the first 16 bytes into inline header. */
	rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
		   (uint8_t *)(uintptr_t)addr,
		   MLX5_ETH_INLINE_HEADER_SIZE);
	addr += MLX5_ETH_INLINE_HEADER_SIZE;
	length -= MLX5_ETH_INLINE_HEADER_SIZE;
	/* Store remaining data in data segment. */
	wqe->wqe.dseg.byte_count = htonl(length);
	wqe->wqe.dseg.lkey = lkey;
	wqe->wqe.dseg.addr = htonll(addr);
	/* Increment consumer index. */
	++txq->wqe_ci;
}

/**
 * Write a regular WQE with VLAN.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param wqe
 *   Pointer to the WQE to fill.
 * @param addr
 *   Buffer data address.
 * @param length
 *   Packet length.
 * @param lkey
 *   Memory region lkey.
 * @param vlan_tci
 *   VLAN field to insert in packet.
 */
static inline void
mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
		    uintptr_t addr, uint32_t length, uint32_t lkey,
		    uint16_t vlan_tci)
{
	uint32_t vlan = htonl(0x81000000 | vlan_tci);

	wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
	wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
	wqe->wqe.ctrl.data[2] = 0;
	wqe->wqe.ctrl.data[3] = 0;
	wqe->inl.eseg.rsvd0 = 0;
	wqe->inl.eseg.rsvd1 = 0;
	wqe->inl.eseg.mss = 0;
	wqe->inl.eseg.rsvd2 = 0;
	wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
	/*
	 * Copy 12 bytes of source & destination MAC address.
	 * Copy 4 bytes of VLAN.
	 * Copy 2 bytes of Ether type.
	 */
	rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
		   (uint8_t *)(uintptr_t)addr, 12);
	rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 12),
		   &vlan, sizeof(vlan));
	rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 16),
		   (uint8_t *)((uintptr_t)addr + 12), 2);
	addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
	length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
	/* Store remaining data in data segment. */
	wqe->wqe.dseg.byte_count = htonl(length);
	wqe->wqe.dseg.lkey = lkey;
	wqe->wqe.dseg.addr = htonll(addr);
	/* Increment consumer index. */
	++txq->wqe_ci;
}

/**
 * Write an inline WQE.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param wqe
 *   Pointer to the WQE to fill.
 * @param addr
 *   Buffer data address.
 * @param length
 *   Packet length.
 */
static inline void
mlx5_wqe_write_inline(struct txq *txq, volatile union mlx5_wqe *wqe,
		      uintptr_t addr, uint32_t length)
{
	uint32_t size;
	uint16_t wqe_cnt = txq->wqe_n - 1;
	uint16_t wqe_ci = txq->wqe_ci + 1;

	/* Copy the first 16 bytes into inline header. */
	rte_memcpy((void *)(uintptr_t)wqe->inl.eseg.inline_hdr_start,
		   (void *)(uintptr_t)addr,
		   MLX5_ETH_INLINE_HEADER_SIZE);
	addr += MLX5_ETH_INLINE_HEADER_SIZE;
	length -= MLX5_ETH_INLINE_HEADER_SIZE;
	size = 3 + ((4 + length + 15) / 16);
	wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG);
	rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0],
		   (void *)addr, MLX5_WQE64_INL_DATA);
	addr += MLX5_WQE64_INL_DATA;
	length -= MLX5_WQE64_INL_DATA;
	while (length) {
		volatile union mlx5_wqe *wqe_next =
			&(*txq->wqes)[wqe_ci & wqe_cnt];
		uint32_t copy_bytes = (length > sizeof(*wqe)) ?
				      sizeof(*wqe) :
				      length;

		rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0],
			  (uint8_t *)addr);
		addr += copy_bytes;
		length -= copy_bytes;
		++wqe_ci;
	}
	assert(size < 64);
	wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
	wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
	wqe->inl.ctrl.data[2] = 0;
	wqe->inl.ctrl.data[3] = 0;
	wqe->inl.eseg.rsvd0 = 0;
	wqe->inl.eseg.rsvd1 = 0;
	wqe->inl.eseg.mss = 0;
	wqe->inl.eseg.rsvd2 = 0;
	wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
	/* Increment consumer index. */
	txq->wqe_ci = wqe_ci;
}

/**
 * Write an inline WQE with VLAN.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param wqe
 *   Pointer to the WQE to fill.
 * @param addr
 *   Buffer data address.
 * @param length
 *   Packet length.
 * @param vlan_tci
 *   VLAN field to insert in packet.
 */
static inline void
mlx5_wqe_write_inline_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
			   uintptr_t addr, uint32_t length, uint16_t vlan_tci)
{
	uint32_t size;
	uint32_t wqe_cnt = txq->wqe_n - 1;
	uint16_t wqe_ci = txq->wqe_ci + 1;
	uint32_t vlan = htonl(0x81000000 | vlan_tci);

	/*
	 * Copy 12 bytes of source & destination MAC address.
	 * Copy 4 bytes of VLAN.
	 * Copy 2 bytes of Ether type.
	 */
	rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start,
		   (uint8_t *)addr, 12);
	rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start + 12,
		   &vlan, sizeof(vlan));
	rte_memcpy((uint8_t *)((uintptr_t)wqe->inl.eseg.inline_hdr_start + 16),
		   (uint8_t *)(addr + 12), 2);
	addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
	length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
	size = (sizeof(wqe->inl.ctrl.ctrl) +
		sizeof(wqe->inl.eseg) +
		sizeof(wqe->inl.byte_cnt) +
		length + 15) / 16;
	wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG);
	rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0],
		   (void *)addr, MLX5_WQE64_INL_DATA);
	addr += MLX5_WQE64_INL_DATA;
	length -= MLX5_WQE64_INL_DATA;
	while (length) {
		volatile union mlx5_wqe *wqe_next =
			&(*txq->wqes)[wqe_ci & wqe_cnt];
		uint32_t copy_bytes = (length > sizeof(*wqe)) ?
				      sizeof(*wqe) :
				      length;

		rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0],
			  (uint8_t *)addr);
		addr += copy_bytes;
		length -= copy_bytes;
		++wqe_ci;
	}
	assert(size < 64);
	wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
	wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
	wqe->inl.ctrl.data[2] = 0;
	wqe->inl.ctrl.data[3] = 0;
	wqe->inl.eseg.rsvd0 = 0;
	wqe->inl.eseg.rsvd1 = 0;
	wqe->inl.eseg.mss = 0;
	wqe->inl.eseg.rsvd2 = 0;
	wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
	/* Increment consumer index. */
	txq->wqe_ci = wqe_ci;
}

/**
 * Ring TX queue doorbell.
 *
 * @param txq
 *   Pointer to TX queue structure.
 */
static inline void
mlx5_tx_dbrec(struct txq *txq)
{
	uint8_t *dst = (uint8_t *)((uintptr_t)txq->bf_reg + txq->bf_offset);
	uint32_t data[4] = {
		htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
		htonl(txq->qp_num_8s),
		0,
		0,
	};
	rte_wmb();
	*txq->qp_db = htonl(txq->wqe_ci);
	/* Ensure ordering between DB record and BF copy. */
	rte_wmb();
	rte_mov16(dst, (uint8_t *)data);
	txq->bf_offset ^= txq->bf_buf_size;
}

/**
 * Prefetch a CQE.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param ci
 *   CQE consumer index.
 */
static inline void
tx_prefetch_cqe(struct txq *txq, uint16_t ci)
{
	volatile struct mlx5_cqe64 *cqe;

	cqe = &(*txq->cqes)[ci & (txq->cqe_n - 1)].cqe64;
	rte_prefetch0(cqe);
}

/**
 * Prefetch a WQE.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param ci
 *   WQE consumer index.
 */
static inline void
tx_prefetch_wqe(struct txq *txq, uint16_t ci)
{
	volatile union mlx5_wqe *wqe;

	wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)];
	rte_prefetch0(wqe);
}

/**
 * DPDK callback for TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	volatile union mlx5_wqe *wqe = NULL;

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_cqe(txq, txq->cq_ci + 1);
	rte_prefetch0(*pkts);
	/* Start processing. */
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		struct rte_mbuf *buf = *(pkts++);
		unsigned int elts_head_next;
		uintptr_t addr;
		uint32_t length;
		uint32_t lkey;
		unsigned int segs_n = buf->nb_segs;
		volatile struct mlx5_wqe_data_seg *dseg;
		unsigned int ds = sizeof(*wqe) / 16;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		max -= segs_n;
		--pkts_n;
		elts_head_next = (elts_head + 1) & (elts_n - 1);
		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
		dseg = &wqe->wqe.dseg;
		rte_prefetch0(wqe);
		if (pkts_n)
			rte_prefetch0(*pkts);
		/* Retrieve buffer information. */
		addr = rte_pktmbuf_mtod(buf, uintptr_t);
		length = DATA_LEN(buf);
		/* Update element. */
		(*txq->elts)[elts_head] = buf;
		/* Prefetch next buffer data. */
		if (pkts_n)
			rte_prefetch0(rte_pktmbuf_mtod(*pkts,
						       volatile void *));
		/* Retrieve Memory Region key for this memory pool. */
		lkey = txq_mp2mr(txq, txq_mb2mp(buf));
		if (buf->ol_flags & PKT_TX_VLAN_PKT)
			mlx5_wqe_write_vlan(txq, wqe, addr, length, lkey,
					    buf->vlan_tci);
		else
			mlx5_wqe_write(txq, wqe, addr, length, lkey);
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			wqe->wqe.eseg.cs_flags =
				MLX5_ETH_WQE_L3_CSUM |
				MLX5_ETH_WQE_L4_CSUM;
		} else {
			wqe->wqe.eseg.cs_flags = 0;
		}
		while (--segs_n) {
			/*
			 * Spill on next WQE when the current one does not have
			 * enough room left. Size of WQE must be a multiple
			 * of data segment size.
			 */
			assert(!(sizeof(*wqe) % sizeof(*dseg)));
			if (!(ds % (sizeof(*wqe) / 16)))
				dseg = (volatile void *)
					&(*txq->wqes)[txq->wqe_ci++ &
						      (txq->wqe_n - 1)];
			else
				++dseg;
			++ds;
			buf = buf->next;
			assert(buf);
			/* Store segment information. */
			dseg->byte_count = htonl(DATA_LEN(buf));
			dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
			dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
			(*txq->elts)[elts_head_next] = buf;
			elts_head_next = (elts_head_next + 1) & (elts_n - 1);
#ifdef MLX5_PMD_SOFT_COUNTERS
			length += DATA_LEN(buf);
#endif
			++j;
		}
		/* Update DS field in WQE. */
		wqe->wqe.ctrl.data[1] &= htonl(0xffffffc0);
		wqe->wqe.ctrl.data[1] |= htonl(ds & 0x3f);
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		elts_head = elts_head_next;
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	comp = txq->elts_comp + i + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		/* Request completion on last WQE. */
		wqe->wqe.ctrl.data[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->wqe.ctrl.data[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	mlx5_tx_dbrec(txq);
	txq->elts_head = elts_head;
	return i;
}

/**
 * DPDK callback for TX with inline support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	volatile union mlx5_wqe *wqe = NULL;
	unsigned int max_inline = txq->max_inline;

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_cqe(txq, txq->cq_ci + 1);
	rte_prefetch0(*pkts);
	/* Start processing. */
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		struct rte_mbuf *buf = *(pkts++);
		unsigned int elts_head_next;
		uintptr_t addr;
		uint32_t length;
		uint32_t lkey;
		unsigned int segs_n = buf->nb_segs;
		volatile struct mlx5_wqe_data_seg *dseg;
		unsigned int ds = sizeof(*wqe) / 16;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		max -= segs_n;
		--pkts_n;
		elts_head_next = (elts_head + 1) & (elts_n - 1);
		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
		dseg = &wqe->wqe.dseg;
		tx_prefetch_wqe(txq, txq->wqe_ci);
		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
		if (pkts_n)
			rte_prefetch0(*pkts);
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			wqe->inl.eseg.cs_flags =
				MLX5_ETH_WQE_L3_CSUM |
				MLX5_ETH_WQE_L4_CSUM;
		} else {
			wqe->inl.eseg.cs_flags = 0;
		}
		/* Retrieve buffer information. */
		addr = rte_pktmbuf_mtod(buf, uintptr_t);
		length = DATA_LEN(buf);
		/* Update element. */
		(*txq->elts)[elts_head] = buf;
		/* Prefetch next buffer data. */
		if (pkts_n)
			rte_prefetch0(rte_pktmbuf_mtod(*pkts,
						       volatile void *));
		if ((length <= max_inline) && (segs_n == 1)) {
			if (buf->ol_flags & PKT_TX_VLAN_PKT)
				mlx5_wqe_write_inline_vlan(txq, wqe,
							   addr, length,
							   buf->vlan_tci);
			else
				mlx5_wqe_write_inline(txq, wqe, addr, length);
			goto skip_segs;
		} else {
			/* Retrieve Memory Region key for this memory pool. */
			lkey = txq_mp2mr(txq, txq_mb2mp(buf));
			if (buf->ol_flags & PKT_TX_VLAN_PKT)
				mlx5_wqe_write_vlan(txq, wqe, addr, length,
						    lkey, buf->vlan_tci);
			else
				mlx5_wqe_write(txq, wqe, addr, length, lkey);
		}
		while (--segs_n) {
			/*
			 * Spill on next WQE when the current one does not have
			 * enough room left. Size of WQE must be a multiple
			 * of data segment size.
			 */
			assert(!(sizeof(*wqe) % sizeof(*dseg)));
			if (!(ds % (sizeof(*wqe) / 16)))
				dseg = (volatile void *)
					&(*txq->wqes)[txq->wqe_ci++ &
						      (txq->wqe_n - 1)];
			else
				++dseg;
			++ds;
			buf = buf->next;
			assert(buf);
			/* Store segment information. */
			dseg->byte_count = htonl(DATA_LEN(buf));
			dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
			dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
			(*txq->elts)[elts_head_next] = buf;
			elts_head_next = (elts_head_next + 1) & (elts_n - 1);
#ifdef MLX5_PMD_SOFT_COUNTERS
			length += DATA_LEN(buf);
#endif
			++j;
		}
		/* Update DS field in WQE. */
		wqe->inl.ctrl.data[1] &= htonl(0xffffffc0);
		wqe->inl.ctrl.data[1] |= htonl(ds & 0x3f);
skip_segs:
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	comp = txq->elts_comp + i + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		/* Request completion on last WQE. */
		wqe->inl.ctrl.data[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->inl.ctrl.data[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	mlx5_tx_dbrec(txq);
	txq->elts_head = elts_head;
	return i;
}

/**
 * Open an MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 * @param length
 *   Packet length.
 */
static inline void
mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
{
	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
		(volatile struct mlx5_wqe_data_seg (*)[])
		(uintptr_t)&(*txq->wqes)[(idx + 1) & (txq->wqe_n - 1)];

	mpw->state = MLX5_MPW_STATE_OPENED;
	mpw->pkts_n = 0;
	mpw->len = length;
	mpw->total_len = 0;
	mpw->wqe = &(*txq->wqes)[idx];
	mpw->wqe->mpw.eseg.mss = htons(length);
	mpw->wqe->mpw.eseg.inline_hdr_sz = 0;
	mpw->wqe->mpw.eseg.rsvd0 = 0;
	mpw->wqe->mpw.eseg.rsvd1 = 0;
	mpw->wqe->mpw.eseg.rsvd2 = 0;
	mpw->wqe->mpw.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
					   (txq->wqe_ci << 8) |
					   MLX5_OPCODE_LSO_MPW);
	mpw->wqe->mpw.ctrl.data[2] = 0;
	mpw->wqe->mpw.ctrl.data[3] = 0;
	mpw->data.dseg[0] = &mpw->wqe->mpw.dseg[0];
	mpw->data.dseg[1] = &mpw->wqe->mpw.dseg[1];
	mpw->data.dseg[2] = &(*dseg)[0];
	mpw->data.dseg[3] = &(*dseg)[1];
	mpw->data.dseg[4] = &(*dseg)[2];
}

/**
 * Close an MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 */
static inline void
mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
{
	unsigned int num = mpw->pkts_n;

	/*
	 * Store size in multiple of 16 bytes. Control and Ethernet segments
	 * count as 2.
	 */
	mpw->wqe->mpw.ctrl.data[1] = htonl(txq->qp_num_8s | (2 + num));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	if (num < 3)
		++txq->wqe_ci;
	else
		txq->wqe_ci += 2;
	tx_prefetch_wqe(txq, txq->wqe_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
}

/**
 * DPDK callback for TX with MPW support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	};

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
	/* Start processing. */
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		struct rte_mbuf *buf = *(pkts++);
		unsigned int elts_head_next;
		uint32_t length;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		/* Do not bother with large packets MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX)
			break;
		max -= segs_n;
		--pkts_n;
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		assert(length);
		/* Start new session if packet differs. */
		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
		    ((mpw.len != length) ||
		     (segs_n != 1) ||
		     (mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
			mlx5_mpw_close(txq, &mpw);
		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
			mlx5_mpw_new(txq, &mpw, length);
			mpw.wqe->mpw.eseg.cs_flags = cs_flags;
		}
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
		length = 0;
#endif
		do {
			volatile struct mlx5_wqe_data_seg *dseg;
			uintptr_t addr;

			elts_head_next = (elts_head + 1) & (elts_n - 1);
			assert(buf);
			(*txq->elts)[elts_head] = buf;
			dseg = mpw.data.dseg[mpw.pkts_n];
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			*dseg = (struct mlx5_wqe_data_seg){
				.byte_count = htonl(DATA_LEN(buf)),
				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
				.addr = htonll(addr),
			};
			elts_head = elts_head_next;
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			length += DATA_LEN(buf);
#endif
			buf = buf->next;
			++mpw.pkts_n;
			++j;
		} while (--segs_n);
		assert(length == mpw.len);
		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
			mlx5_mpw_close(txq, &mpw);
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile union mlx5_wqe *wqe = mpw.wqe;

		/* Request completion on last WQE. */
		wqe->mpw.ctrl.data[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->mpw.ctrl.data[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq);
	txq->elts_head = elts_head;
	return i;
}

/**
 * Open an MPW inline session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 * @param length
 *   Packet length.
 */
static inline void
mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
{
	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);

	mpw->state = MLX5_MPW_INL_STATE_OPENED;
	mpw->pkts_n = 0;
	mpw->len = length;
	mpw->total_len = 0;
	mpw->wqe = &(*txq->wqes)[idx];
	mpw->wqe->mpw_inl.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
					       (txq->wqe_ci << 8) |
					       MLX5_OPCODE_LSO_MPW);
	mpw->wqe->mpw_inl.ctrl.data[2] = 0;
	mpw->wqe->mpw_inl.ctrl.data[3] = 0;
	mpw->wqe->mpw_inl.eseg.mss = htons(length);
	mpw->wqe->mpw_inl.eseg.inline_hdr_sz = 0;
	mpw->wqe->mpw_inl.eseg.cs_flags = 0;
	mpw->wqe->mpw_inl.eseg.rsvd0 = 0;
	mpw->wqe->mpw_inl.eseg.rsvd1 = 0;
	mpw->wqe->mpw_inl.eseg.rsvd2 = 0;
	mpw->data.raw = &mpw->wqe->mpw_inl.data[0];
}

/**
 * Close an MPW inline session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 */
static inline void
mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
{
	unsigned int size;

	size = sizeof(*mpw->wqe) - MLX5_MWQE64_INL_DATA + mpw->total_len;
	/*
	 * Store size in multiple of 16 bytes. Control and Ethernet segments
	 * count as 2.
	 */
	mpw->wqe->mpw_inl.ctrl.data[1] =
		htonl(txq->qp_num_8s | ((size + 15) / 16));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	mpw->wqe->mpw_inl.byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
	txq->wqe_ci += (size + (sizeof(*mpw->wqe) - 1)) / sizeof(*mpw->wqe);
}

/**
 * DPDK callback for TX with MPW inline support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
			 uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	unsigned int inline_room = txq->max_inline;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	};

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
	/* Start processing. */
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		struct rte_mbuf *buf = *(pkts++);
		unsigned int elts_head_next;
		uintptr_t addr;
		uint32_t length;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		/* Do not bother with large packets MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX)
			break;
		max -= segs_n;
		--pkts_n;
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		/* Start new session if packet differs. */
		if (mpw.state == MLX5_MPW_STATE_OPENED) {
			if ((mpw.len != length) ||
			    (segs_n != 1) ||
			    (mpw.wqe->mpw.eseg.cs_flags != cs_flags))
				mlx5_mpw_close(txq, &mpw);
		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
			if ((mpw.len != length) ||
			    (segs_n != 1) ||
			    (length > inline_room) ||
			    (mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
				mlx5_mpw_inline_close(txq, &mpw);
				inline_room = txq->max_inline;
			}
		}
		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
			if ((segs_n != 1) ||
			    (length > inline_room)) {
				mlx5_mpw_new(txq, &mpw, length);
				mpw.wqe->mpw.eseg.cs_flags = cs_flags;
			} else {
				mlx5_mpw_inline_new(txq, &mpw, length);
				mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
			}
		}
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
		if (mpw.state == MLX5_MPW_STATE_OPENED) {
			assert(inline_room == txq->max_inline);
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			length = 0;
#endif
			do {
				volatile struct mlx5_wqe_data_seg *dseg;

				elts_head_next =
					(elts_head + 1) & (elts_n - 1);
				assert(buf);
				(*txq->elts)[elts_head] = buf;
				dseg = mpw.data.dseg[mpw.pkts_n];
				addr = rte_pktmbuf_mtod(buf, uintptr_t);
				*dseg = (struct mlx5_wqe_data_seg){
					.byte_count = htonl(DATA_LEN(buf)),
					.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
					.addr = htonll(addr),
				};
				elts_head = elts_head_next;
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
				length += DATA_LEN(buf);
#endif
				buf = buf->next;
				++mpw.pkts_n;
				++j;
			} while (--segs_n);
			assert(length == mpw.len);
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
				mlx5_mpw_close(txq, &mpw);
		} else {
			unsigned int max;

			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
			assert(length <= inline_room);
			assert(length == DATA_LEN(buf));
			elts_head_next = (elts_head + 1) & (elts_n - 1);
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			(*txq->elts)[elts_head] = buf;
			/* Maximum number of bytes before wrapping. */
			max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
			       (uintptr_t)mpw.data.raw);
			if (length > max) {
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)addr,
					   max);
				mpw.data.raw =
					(volatile void *)&(*txq->wqes)[0];
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)(addr + max),
					   length - max);
				mpw.data.raw += length - max;
			} else {
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)addr,
					   length);
				mpw.data.raw += length;
			}
			if ((uintptr_t)mpw.data.raw ==
			    (uintptr_t)&(*txq->wqes)[txq->wqe_n])
				mpw.data.raw =
					(volatile void *)&(*txq->wqes)[0];
			++mpw.pkts_n;
			++j;
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
				mlx5_mpw_inline_close(txq, &mpw);
				inline_room = txq->max_inline;
			} else {
				inline_room -= length;
			}
		}
		mpw.total_len += length;
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile union mlx5_wqe *wqe = mpw.wqe;

		/* Request completion on last WQE. */
		wqe->mpw_inl.ctrl.data[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->mpw_inl.ctrl.data[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
		mlx5_mpw_inline_close(txq, &mpw);
	else if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq);
	txq->elts_head = elts_head;
	return i;
}

/**
 * Translate RX completion flags to packet type.
 *
 * @param[in] cqe
 *   Pointer to CQE.
 *
 * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
{
	uint32_t pkt_type;
	uint8_t flags = cqe->l4_hdr_type_etc;
	uint8_t info = cqe->rsvd0[0];

	if (info & IBV_EXP_CQ_RX_TUNNEL_PACKET)
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_INNER_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_INNER_L3_IPV6);
	else
		pkt_type =
			TRANSPOSE(flags,
				  MLX5_CQE_L3_HDR_TYPE_IPV6,
				  RTE_PTYPE_L3_IPV6) |
			TRANSPOSE(flags,
				  MLX5_CQE_L3_HDR_TYPE_IPV4,
				  RTE_PTYPE_L3_IPV4);
	return pkt_type;
}

/**
 * Get size of the next packet for a given CQE. For compressed CQEs, the
 * consumer index is updated only once all packets of the current one have
 * been processed.
 *
 * @param rxq
 *   Pointer to RX queue.
 * @param cqe
 *   CQE to process.
 *
 * @return
 *   Packet size in bytes (0 if there is none), -1 in case of completion
 *   with error.
 */
static inline int
mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe,
		 uint16_t cqe_cnt)
{
	struct rxq_zip *zip = &rxq->zip;
	uint16_t cqe_n = cqe_cnt + 1;
	int len = 0;

	/* Process compressed data in the CQE and mini arrays. */
	if (zip->ai) {
		volatile struct mlx5_mini_cqe8 (*mc)[8] =
			(volatile struct mlx5_mini_cqe8 (*)[8])
			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].cqe64);

		len = ntohl((*mc)[zip->ai & 7].byte_cnt);
		if ((++zip->ai & 7) == 0) {
			/*
			 * Increment consumer index to skip the number of
			 * CQEs consumed. Hardware leaves holes in the CQ
			 * ring for software use.
			 */
			zip->ca = zip->na;
			zip->na += 8;
		}
		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
			uint16_t idx = rxq->cq_ci;
			uint16_t end = zip->cq_ci;

			while (idx != end) {
				(*rxq->cqes)[idx & cqe_cnt].cqe64.op_own =
					MLX5_CQE_INVALIDATE;
				++idx;
			}
			rxq->cq_ci = zip->cq_ci;
			zip->ai = 0;
		}
	/* No compressed data, get next CQE and verify if it is compressed. */
	} else {
		int ret;
		int8_t op_own;

		ret = check_cqe64(cqe, cqe_n, rxq->cq_ci);
		if (unlikely(ret == 1))
			return 0;
		++rxq->cq_ci;
		op_own = cqe->op_own;
		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
			volatile struct mlx5_mini_cqe8 (*mc)[8] =
				(volatile struct mlx5_mini_cqe8 (*)[8])
				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
							  cqe_cnt].cqe64);

			/* Fix endianness. */
			zip->cqe_cnt = ntohl(cqe->byte_cnt);
			/*
			 * Current mini array position is the one returned by
			 * check_cqe64().
			 *
			 * If completion comprises several mini arrays, as a
			 * special case the second one is located 7 CQEs after
			 * the initial CQE instead of 8 for subsequent ones.
			 */
			zip->ca = rxq->cq_ci & cqe_cnt;
			zip->na = zip->ca + 7;
			/* Compute the next non compressed CQE. */
			--rxq->cq_ci;
			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
			/* Get packet size to return. */
			len = ntohl((*mc)[0].byte_cnt);
			zip->ai = 1;
		} else {
			len = ntohl(cqe->byte_cnt);
		}
		/* Error while receiving packet. */
		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
			return -1;
	}
	return len;
}

/**
 * Translate RX completion flags to offload flags.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param[in] cqe
 *   Pointer to CQE.
 *
 * @return
 *   Offload flags (ol_flags) for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
{
	uint32_t ol_flags = 0;
	uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK;
	uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK;
	uint8_t info = cqe->rsvd0[0];

	if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) ||
	    (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6))
		ol_flags |=
			(!(cqe->hds_ip_ext & MLX5_CQE_L3_OK) *
			 PKT_RX_IP_CKSUM_BAD);
	if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) ||
	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) ||
	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) ||
	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP))
		ol_flags |=
			(!(cqe->hds_ip_ext & MLX5_CQE_L4_OK) *
			 PKT_RX_L4_CKSUM_BAD);
	/*
	 * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
	 * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
	 * (its value is 0).
	 */
	if ((info & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
		ol_flags |=
			TRANSPOSE(~cqe->l4_hdr_type_etc,
				  IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
				  PKT_RX_IP_CKSUM_BAD) |
			TRANSPOSE(~cqe->l4_hdr_type_etc,
				  IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
				  PKT_RX_L4_CKSUM_BAD);
	return ol_flags;
}

/**
 * DPDK callback for RX.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = dpdk_rxq;
	const unsigned int wqe_cnt = rxq->elts_n - 1;
	const unsigned int cqe_cnt = rxq->cqe_n - 1;
	const unsigned int sges_n = rxq->sges_n;
	struct rte_mbuf *pkt = NULL;
	struct rte_mbuf *seg = NULL;
	volatile struct mlx5_cqe64 *cqe =
		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
	unsigned int i = 0;
	unsigned int rq_ci = rxq->rq_ci << sges_n;
	int len;

	while (pkts_n) {
		unsigned int idx = rq_ci & wqe_cnt;
		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
		struct rte_mbuf *rep = (*rxq->elts)[idx];

		if (pkt)
			NEXT(seg) = rep;
		seg = rep;
		rte_prefetch0(seg);
		rte_prefetch0(cqe);
		rte_prefetch0(wqe);
		rep = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			while (pkt != seg) {
				assert(pkt != (*rxq->elts)[idx]);
				seg = NEXT(pkt);
				rte_mbuf_refcnt_set(pkt, 0);
				__rte_mbuf_raw_free(pkt);
				pkt = seg;
			}
			++rxq->stats.rx_nombuf;
			break;
		}
		if (!pkt) {
			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
			if (len == 0) {
				rte_mbuf_refcnt_set(rep, 0);
				__rte_mbuf_raw_free(rep);
				break;
			}
			if (unlikely(len == -1)) {
				/* RX error, packet is likely too large. */
				rte_mbuf_refcnt_set(rep, 0);
				__rte_mbuf_raw_free(rep);
				++rxq->stats.idropped;
				goto skip;
			}
			pkt = seg;
			assert(len >= (rxq->crc_present << 2));
			/* Update packet information. */
			pkt->packet_type = 0;
			pkt->ol_flags = 0;
			if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
			    rxq->crc_present) {
				if (rxq->csum) {
					pkt->packet_type =
						rxq_cq_to_pkt_type(cqe);
					pkt->ol_flags =
						rxq_cq_to_ol_flags(rxq, cqe);
				}
				if (cqe->l4_hdr_type_etc &
				    MLX5_CQE_VLAN_STRIPPED) {
					pkt->ol_flags |= PKT_RX_VLAN_PKT |
						PKT_RX_VLAN_STRIPPED;
					pkt->vlan_tci = ntohs(cqe->vlan_info);
				}
				if (rxq->crc_present)
					len -= ETHER_CRC_LEN;
			}
			PKT_LEN(pkt) = len;
		}
		DATA_LEN(rep) = DATA_LEN(seg);
		PKT_LEN(rep) = PKT_LEN(seg);
		SET_DATA_OFF(rep, DATA_OFF(seg));
		NB_SEGS(rep) = NB_SEGS(seg);
		PORT(rep) = PORT(seg);
		NEXT(rep) = NULL;
		(*rxq->elts)[idx] = rep;
		/*
		 * Fill NIC descriptor with the new buffer. The lkey and size
		 * of the buffers are already known, only the buffer address
		 * changes.
		 */
		wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
		if (len > DATA_LEN(seg)) {
			len -= DATA_LEN(seg);
			++NB_SEGS(pkt);
			++rq_ci;
			continue;
		}
		DATA_LEN(seg) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += PKT_LEN(pkt);
#endif
		/* Return packet. */
		*(pkts++) = pkt;
		pkt = NULL;
		--pkts_n;
		++i;
skip:
		/* Align consumer index to the next stride. */
		rq_ci >>= sges_n;
		++rq_ci;
		rq_ci <<= sges_n;
	}
	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
		return 0;
	/* Update the consumer index. */
	rxq->rq_ci = rq_ci >> sges_n;
	rte_wmb();
	*rxq->cq_db = htonl(rxq->cq_ci);
	rte_wmb();
	*rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += i;
#endif
	return i;
}

/**
 * Dummy DPDK callback for TX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/**
 * Dummy DPDK callback for RX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}