/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5_hw.h>
#include <infiniband/arch.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"

static __rte_always_inline uint32_t
rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe);

static __rte_always_inline int
mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
		 uint16_t cqe_cnt, uint32_t *rss_hash);

static __rte_always_inline uint32_t
rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe);

uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
};

/**
 * Build a table to translate Rx completion flags to packet type.
 *
 * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
 */
void
mlx5_set_ptype_table(void)
{
	unsigned int i;
	uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;

	/* Last entry must not be overwritten, reserved for errored packet. */
	for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
		(*p)[i] = RTE_PTYPE_UNKNOWN;
	/*
	 * The index to the array should have:
	 * bit[1:0] = l3_hdr_type
	 * bit[4:2] = l4_hdr_type
	 * bit[5] = ip_frag
	 * bit[6] = tunneled
	 * bit[7] = outer_l3_type
	 */
	/* L3 */
	(*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_NONFRAG;
	(*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_NONFRAG;
	/* Fragmented */
	(*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_FRAG;
	(*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_FRAG;
	/* TCP */
	(*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_TCP;
	(*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_TCP;
	/* UDP */
	(*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_UDP;
	(*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_UDP;
	/* Repeat with outer_l3_type being set. Just in case. */
	(*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_NONFRAG;
	(*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_NONFRAG;
	(*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_FRAG;
	(*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_FRAG;
	(*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_TCP;
	(*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_TCP;
	(*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_UDP;
	(*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_UDP;
	/* Tunneled - L3 */
	(*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L4_NONFRAG;
	(*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L4_NONFRAG;
	(*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L4_NONFRAG;
	(*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L4_NONFRAG;
	/* Tunneled - Fragmented */
	(*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L4_FRAG;
	(*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L4_FRAG;
	(*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L4_FRAG;
	(*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L4_FRAG;
	/* Tunneled - TCP */
	(*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_TCP;
	(*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_TCP;
	(*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_TCP;
	(*p)[0xc6] = RTE_PTYPE_L2_ETHER |
		     RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_TCP;
	/* Tunneled - UDP */
	(*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_UDP;
	(*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_UDP;
	(*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_UDP;
	(*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_UDP;
}

/**
 * Return the size of tailroom of WQ.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param addr
 *   Pointer to tail of WQ.
 *
 * @return
 *   Size of tailroom.
 */
static inline size_t
tx_mlx5_wq_tailroom(struct txq *txq, void *addr)
{
	size_t tailroom;

	tailroom = (uintptr_t)(txq->wqes) +
		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
		   (uintptr_t)addr;
	return tailroom;
}

/**
 * Copy data to tailroom of circular queue.
 *
 * @param dst
 *   Pointer to destination.
 * @param src
 *   Pointer to source.
 * @param n
 *   Number of bytes to copy.
 * @param base
 *   Pointer to head of queue.
 * @param tailroom
 *   Size of tailroom from dst.
 *
 * @return
 *   Pointer after copied data.
 */
static inline void *
mlx5_copy_to_wq(void *dst, const void *src, size_t n,
		void *base, size_t tailroom)
{
	void *ret;

	if (n > tailroom) {
		rte_memcpy(dst, src, tailroom);
		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
			   n - tailroom);
		ret = (uint8_t *)base + n - tailroom;
	} else {
		rte_memcpy(dst, src, n);
		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
	}
	return ret;
}

/**
 * DPDK callback to check the status of a tx descriptor.
 *
 * @param tx_queue
 *   The tx queue.
 * @param[in] offset
 *   The index of the descriptor in the ring.
 *
 * @return
 *   The status of the tx descriptor.
 */
int
mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
{
	struct txq *txq = tx_queue;
	uint16_t used;

	mlx5_tx_complete(txq);
	used = txq->elts_head - txq->elts_tail;
	if (offset < used)
		return RTE_ETH_TX_DESC_FULL;
	return RTE_ETH_TX_DESC_DONE;
}

/**
 * DPDK callback to check the status of a rx descriptor.
 *
 * @param rx_queue
 *   The rx queue.
 * @param[in] offset
 *   The index of the descriptor in the ring.
 *
 * @return
 *   The status of the rx descriptor.
 */
int
mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
{
	struct rxq *rxq = rx_queue;
	struct rxq_zip *zip = &rxq->zip;
	volatile struct mlx5_cqe *cqe;
	const unsigned int cqe_n = (1 << rxq->cqe_n);
	const unsigned int cqe_cnt = cqe_n - 1;
	unsigned int cq_ci;
	unsigned int used;

	/* if we are processing a compressed cqe */
	if (zip->ai) {
		used = zip->cqe_cnt - zip->ca;
		cq_ci = zip->cq_ci;
	} else {
		used = 0;
		cq_ci = rxq->cq_ci;
	}
	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
	while (check_cqe(cqe, cqe_n, cq_ci) == 0) {
		int8_t op_own;
		unsigned int n;

		op_own = cqe->op_own;
		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
			n = ntohl(cqe->byte_cnt);
		else
			n = 1;
		cq_ci += n;
		used += n;
		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
	}
	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
	if (offset < used)
		return RTE_ETH_RX_DESC_DONE;
	return RTE_ETH_RX_DESC_AVAIL;
}

/**
 * DPDK callback for TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int k = 0;
	uint16_t max_elts;
	unsigned int max_inline = txq->max_inline;
	const unsigned int inline_en = !!max_inline && txq->inline_en;
	uint16_t max_wqe;
	unsigned int comp;
	volatile struct mlx5_wqe_v *wqe = NULL;
	volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
	unsigned int segs_n = 0;
	struct rte_mbuf *buf = NULL;
	uint8_t *raw;

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	rte_prefetch0(*pkts);
	/* Start processing. */
	mlx5_tx_complete(txq);
	max_elts = (elts_n - (elts_head - txq->elts_tail));
	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
	if (unlikely(!max_wqe))
		return 0;
	do {
		volatile rte_v128u32_t *dseg = NULL;
		uint32_t length;
		unsigned int ds = 0;
		unsigned int sg = 0; /* counter of additional segs attached. */
		uintptr_t addr;
		uint64_t naddr;
		uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
		uint16_t tso_header_sz = 0;
		uint16_t ehdr;
		uint8_t cs_flags = 0;
		uint64_t tso = 0;
		uint16_t tso_segsz = 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
		uint32_t total_length = 0;
#endif

		/* first_seg */
		buf = *pkts;
		segs_n = buf->nb_segs;
		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max_elts < segs_n)
			break;
		max_elts -= segs_n;
		--segs_n;
		if (unlikely(--max_wqe == 0))
			break;
		wqe = (volatile struct mlx5_wqe_v *)
			tx_mlx5_wqe(txq, txq->wqe_ci);
		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
		if (pkts_n - i > 1)
			rte_prefetch0(*(pkts + 1));
		addr = rte_pktmbuf_mtod(buf, uintptr_t);
		length = DATA_LEN(buf);
		ehdr = (((uint8_t *)addr)[1] << 8) |
		       ((uint8_t *)addr)[0];
#ifdef MLX5_PMD_SOFT_COUNTERS
		total_length = length;
#endif
		if (length < (MLX5_WQE_DWORD_SIZE + 2))
			break;
		/* Update element. */
		(*txq->elts)[elts_head & elts_m] = buf;
		/* Prefetch next buffer data. */
		if (pkts_n - i > 1)
			rte_prefetch0(
			    rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			const uint64_t is_tunneled = buf->ol_flags &
						     (PKT_TX_TUNNEL_GRE |
						      PKT_TX_TUNNEL_VXLAN);

			if (is_tunneled && txq->tunnel_en) {
				cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
					   MLX5_ETH_WQE_L4_INNER_CSUM;
				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
					cs_flags |= MLX5_ETH_WQE_L3_CSUM;
			} else {
				cs_flags = MLX5_ETH_WQE_L3_CSUM |
					   MLX5_ETH_WQE_L4_CSUM;
			}
		}
		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
		/* Replace the Ethernet type by the VLAN if necessary. */
		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
			unsigned int len = 2 * ETHER_ADDR_LEN - 2;

			addr += 2;
			length -= 2;
			/* Copy destination and source MAC address. */
			memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
			/* Copy VLAN. */
			memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
			/* Copy missing two bytes to end the DSeg. */
			memcpy((uint8_t *)raw + len + sizeof(vlan),
			       ((uint8_t *)addr) + len, 2);
			addr += len + 2;
			length -= (len + 2);
		} else {
			memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
			       MLX5_WQE_DWORD_SIZE);
			length -= pkt_inline_sz;
			addr += pkt_inline_sz;
		}
		if (txq->tso_en) {
			tso = buf->ol_flags & PKT_TX_TCP_SEG;
			if (tso) {
				uintptr_t end = (uintptr_t)
						(((uintptr_t)txq->wqes) +
						 (1 << txq->wqe_n) *
						 MLX5_WQE_SIZE);
				unsigned int copy_b;
				uint8_t vlan_sz = (buf->ol_flags &
						   PKT_TX_VLAN_PKT) ? 4 : 0;
				const uint64_t is_tunneled =
							buf->ol_flags &
							(PKT_TX_TUNNEL_GRE |
							 PKT_TX_TUNNEL_VXLAN);

				tso_header_sz = buf->l2_len + vlan_sz +
						buf->l3_len + buf->l4_len;
				tso_segsz = buf->tso_segsz;

				if (is_tunneled && txq->tunnel_en) {
					tso_header_sz += buf->outer_l2_len +
							 buf->outer_l3_len;
					cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM;
				} else {
					cs_flags |= MLX5_ETH_WQE_L4_CSUM;
				}
				if (unlikely(tso_header_sz >
					     MLX5_MAX_TSO_HEADER))
					break;
				copy_b = tso_header_sz - pkt_inline_sz;
				/* First seg must contain all headers. */
				assert(copy_b <= length);
				raw += MLX5_WQE_DWORD_SIZE;
				if (copy_b &&
				    ((end - (uintptr_t)raw) > copy_b)) {
					uint16_t n = (MLX5_WQE_DS(copy_b) -
						      1 + 3) / 4;

					if (unlikely(max_wqe < n))
						break;
					max_wqe -= n;
					rte_memcpy((void *)raw,
						   (void *)addr, copy_b);
					addr += copy_b;
					length -= copy_b;
					pkt_inline_sz += copy_b;
					/*
					 * Another DWORD will be added
					 * in the inline part.
					 */
					raw += MLX5_WQE_DS(copy_b) *
					       MLX5_WQE_DWORD_SIZE -
					       MLX5_WQE_DWORD_SIZE;
				} else {
					/* NOP WQE. */
					wqe->ctrl = (rte_v128u32_t){
						htonl(txq->wqe_ci << 8),
						htonl(txq->qp_num_8s | 1),
						0,
						0,
					};
					ds = 1;
					total_length = 0;
					k++;
					goto next_wqe;
				}
			}
		}
		/* Inline if enough room. */
		if (inline_en || tso) {
			uintptr_t end = (uintptr_t)
					(((uintptr_t)txq->wqes) +
					 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
			unsigned int inline_room = max_inline *
						   RTE_CACHE_LINE_SIZE -
						   (pkt_inline_sz - 2);
			uintptr_t addr_end = (addr + inline_room) &
					     ~(RTE_CACHE_LINE_SIZE - 1);
			unsigned int copy_b = (addr_end > addr) ?
				RTE_MIN((addr_end - addr), length) :
				0;

			raw += MLX5_WQE_DWORD_SIZE;
			if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
				/*
				 * One Dseg remains in the current WQE. To
				 * keep the computation positive, it is
				 * removed after the bytes to Dseg conversion.
				 */
				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;

				if (unlikely(max_wqe < n))
					break;
				max_wqe -= n;
				if (tso) {
					uint32_t inl =
						htonl(copy_b | MLX5_INLINE_SEG);

					pkt_inline_sz =
						MLX5_WQE_DS(tso_header_sz) *
						MLX5_WQE_DWORD_SIZE;
					rte_memcpy((void *)raw,
						   (void *)&inl, sizeof(inl));
					raw += sizeof(inl);
					pkt_inline_sz += sizeof(inl);
				}
				rte_memcpy((void *)raw, (void *)addr, copy_b);
				addr += copy_b;
				length -= copy_b;
				pkt_inline_sz += copy_b;
			}
			/*
			 * 2 DWORDs consumed by the WQE header + ETH segment +
			 * the size of the inline part of the packet.
			 */
			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
			if (length > 0) {
				if (ds % (MLX5_WQE_SIZE /
					  MLX5_WQE_DWORD_SIZE) == 0) {
					if (unlikely(--max_wqe == 0))
						break;
					dseg = (volatile rte_v128u32_t *)
					       tx_mlx5_wqe(txq, txq->wqe_ci +
							   ds / 4);
				} else {
					dseg = (volatile rte_v128u32_t *)
						((uintptr_t)wqe +
						 (ds * MLX5_WQE_DWORD_SIZE));
				}
				goto use_dseg;
			} else if (!segs_n) {
				goto next_pkt;
			} else {
				/* dseg will be advanced as part of next_seg */
				dseg = (volatile rte_v128u32_t *)
					((uintptr_t)wqe +
					 ((ds - 1) * MLX5_WQE_DWORD_SIZE));
				goto next_seg;
			}
		} else {
			/*
			 * No inline has been done in the packet, only the
			 * Ethernet Header has been stored.
			 */
			dseg = (volatile rte_v128u32_t *)
				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
			ds = 3;
use_dseg:
			/* Add the remaining packet as a simple ds. */
			naddr = htonll(addr);
			*dseg = (rte_v128u32_t){
				htonl(length),
				mlx5_tx_mb2mr(txq, buf),
				naddr,
				naddr >> 32,
			};
			++ds;
			if (!segs_n)
				goto next_pkt;
		}
next_seg:
		assert(buf);
		assert(ds);
		assert(wqe);
		/*
		 * Spill on next WQE when the current one does not have
		 * enough room left. Size of WQE must be a multiple
		 * of data segment size.
		 */
		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
			if (unlikely(--max_wqe == 0))
				break;
			dseg = (volatile rte_v128u32_t *)
			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
			rte_prefetch0(tx_mlx5_wqe(txq,
						  txq->wqe_ci + ds / 4 + 1));
		} else {
			++dseg;
		}
		++ds;
		buf = buf->next;
		assert(buf);
		length = DATA_LEN(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		total_length += length;
#endif
		/* Store segment information. */
		naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
		*dseg = (rte_v128u32_t){
			htonl(length),
			mlx5_tx_mb2mr(txq, buf),
			naddr,
			naddr >> 32,
		};
		(*txq->elts)[++elts_head & elts_m] = buf;
		++sg;
		/* Advance counter only if all segs are successfully posted. */
		if (sg < segs_n)
			goto next_seg;
		else
			j += sg;
next_pkt:
		++elts_head;
		++pkts;
		++i;
		/* Initialize known and common part of the WQE structure. */
		if (tso) {
			wqe->ctrl = (rte_v128u32_t){
				htonl((txq->wqe_ci << 8) | MLX5_OPCODE_TSO),
				htonl(txq->qp_num_8s | ds),
				0,
				0,
			};
			wqe->eseg = (rte_v128u32_t){
				0,
				cs_flags | (htons(tso_segsz) << 16),
				0,
				(ehdr << 16) | htons(tso_header_sz),
			};
		} else {
			wqe->ctrl = (rte_v128u32_t){
				htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
				htonl(txq->qp_num_8s | ds),
				0,
				0,
			};
			wqe->eseg = (rte_v128u32_t){
				0,
				cs_flags,
				0,
				(ehdr << 16) | htons(pkt_inline_sz),
			};
		}
next_wqe:
		txq->wqe_ci += (ds + 3) / 4;
		/* Save the last successful WQE for completion request */
		last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += total_length;
#endif
	} while (i < pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely((i + k) == 0))
		return 0;
	txq->elts_head += (i + j);
	/* Check whether completion threshold has been reached. */
	comp = txq->elts_comp + i + j + k;
	if (comp >= MLX5_TX_COMP_THRESH) {
		/* Request completion on last WQE. */
		last_wqe->ctrl2 = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		last_wqe->ctrl3 = txq->elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
	return i;
}

/**
 * Open an MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 * @param length
 *   Packet length.
 */
static inline void
mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
{
	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
		(volatile struct mlx5_wqe_data_seg (*)[])
		tx_mlx5_wqe(txq, idx + 1);

	mpw->state = MLX5_MPW_STATE_OPENED;
	mpw->pkts_n = 0;
	mpw->len = length;
	mpw->total_len = 0;
	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
	mpw->wqe->eseg.mss = htons(length);
	mpw->wqe->eseg.inline_hdr_sz = 0;
	mpw->wqe->eseg.rsvd0 = 0;
	mpw->wqe->eseg.rsvd1 = 0;
	mpw->wqe->eseg.rsvd2 = 0;
	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
				  (txq->wqe_ci << 8) | MLX5_OPCODE_TSO);
	mpw->wqe->ctrl[2] = 0;
	mpw->wqe->ctrl[3] = 0;
	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
	mpw->data.dseg[2] = &(*dseg)[0];
	mpw->data.dseg[3] = &(*dseg)[1];
	mpw->data.dseg[4] = &(*dseg)[2];
}
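/*
 * Illustration only (not part of the driver logic): a minimal sketch of how
 * mlx5_tx_burst_mpw() below drives a legacy MPW session. The max_wqe/elts
 * bookkeeping is omitted; names refer to the code in this file.
 *
 *	struct mlx5_mpw mpw = { .state = MLX5_MPW_STATE_CLOSED };
 *
 *	if (mpw.state == MLX5_MPW_STATE_CLOSED) {
 *		mlx5_mpw_new(txq, &mpw, length);  // may consume up to 2 WQEs
 *		mpw.wqe->eseg.cs_flags = cs_flags;
 *	}
 *	// One data segment per mbuf segment, up to MLX5_MPW_DSEG_MAX.
 *	*mpw.data.dseg[mpw.pkts_n++] = (struct mlx5_wqe_data_seg){ ... };
 *	if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
 *		mlx5_mpw_close(txq, &mpw);  // sets ctrl[1], advances wqe_ci
 */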
/**
 * Close an MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 */
static inline void
mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
{
	unsigned int num = mpw->pkts_n;

	/*
	 * Store size in multiple of 16 bytes. Control and Ethernet segments
	 * count as 2.
	 */
	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	if (num < 3)
		++txq->wqe_ci;
	else
		txq->wqe_ci += 2;
	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
}

/**
 * DPDK callback for TX with MPW support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	unsigned int i = 0;
	unsigned int j = 0;
	uint16_t max_elts;
	uint16_t max_wqe;
	unsigned int comp;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	};

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
	/* Start processing. */
	mlx5_tx_complete(txq);
	max_elts = (elts_n - (elts_head - txq->elts_tail));
	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
	if (unlikely(!max_wqe))
		return 0;
	do {
		struct rte_mbuf *buf = *(pkts++);
		uint32_t length;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max_elts < segs_n)
			break;
		/* Do not bother with large packets MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX)
			break;
		max_elts -= segs_n;
		--pkts_n;
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		assert(length);
		/* Start new session if packet differs. */
		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
		    ((mpw.len != length) ||
		     (segs_n != 1) ||
		     (mpw.wqe->eseg.cs_flags != cs_flags)))
			mlx5_mpw_close(txq, &mpw);
		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
			/*
			 * Multi-Packet WQE consumes at most two WQEs.
			 * mlx5_mpw_new() expects to be able to use such
			 * resources.
			 */
			if (unlikely(max_wqe < 2))
				break;
			max_wqe -= 2;
			mlx5_mpw_new(txq, &mpw, length);
			mpw.wqe->eseg.cs_flags = cs_flags;
		}
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
		length = 0;
#endif
		do {
			volatile struct mlx5_wqe_data_seg *dseg;
			uintptr_t addr;

			assert(buf);
			(*txq->elts)[elts_head++ & elts_m] = buf;
			dseg = mpw.data.dseg[mpw.pkts_n];
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			*dseg = (struct mlx5_wqe_data_seg){
				.byte_count = htonl(DATA_LEN(buf)),
				.lkey = mlx5_tx_mb2mr(txq, buf),
				.addr = htonll(addr),
			};
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			length += DATA_LEN(buf);
#endif
			buf = buf->next;
			++mpw.pkts_n;
			++j;
		} while (--segs_n);
		assert(length == mpw.len);
		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
			mlx5_mpw_close(txq, &mpw);
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile struct mlx5_wqe *wqe = mpw.wqe;

		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq, mpw.wqe);
	txq->elts_head = elts_head;
	return i;
}

/**
 * Open an MPW inline session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 * @param length
 *   Packet length.
 */
static inline void
mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
{
	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
	struct mlx5_wqe_inl_small *inl;

	mpw->state = MLX5_MPW_INL_STATE_OPENED;
	mpw->pkts_n = 0;
	mpw->len = length;
	mpw->total_len = 0;
	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
				  (txq->wqe_ci << 8) |
				  MLX5_OPCODE_TSO);
	mpw->wqe->ctrl[2] = 0;
	mpw->wqe->ctrl[3] = 0;
	mpw->wqe->eseg.mss = htons(length);
	mpw->wqe->eseg.inline_hdr_sz = 0;
	mpw->wqe->eseg.cs_flags = 0;
	mpw->wqe->eseg.rsvd0 = 0;
	mpw->wqe->eseg.rsvd1 = 0;
	mpw->wqe->eseg.rsvd2 = 0;
	inl = (struct mlx5_wqe_inl_small *)
		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
	mpw->data.raw = (uint8_t *)&inl->raw;
}

/**
 * Close an MPW inline session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 */
static inline void
mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
{
	unsigned int size;
	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));

	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
	/*
	 * Store size in multiple of 16 bytes. Control and Ethernet segments
	 * count as 2.
	 */
	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
}

/**
 * DPDK callback for TX with MPW inline support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
			 uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	unsigned int i = 0;
	unsigned int j = 0;
	uint16_t max_elts;
	uint16_t max_wqe;
	unsigned int comp;
	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	};
	/*
	 * Compute the maximum number of WQE which can be consumed by inline
	 * code.
	 * - 2 DSEG for:
	 *   - 1 control segment,
	 *   - 1 Ethernet segment,
	 * - N Dseg from the inline request.
	 */
	const unsigned int wqe_inl_n =
		((2 * MLX5_WQE_DWORD_SIZE +
		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
	rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
	/* Start processing. */
	mlx5_tx_complete(txq);
	max_elts = (elts_n - (elts_head - txq->elts_tail));
	do {
		struct rte_mbuf *buf = *(pkts++);
		uintptr_t addr;
		uint32_t length;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max_elts < segs_n)
			break;
		/* Do not bother with large packets MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX)
			break;
		max_elts -= segs_n;
		--pkts_n;
		/*
		 * Compute max_wqe in case fewer WQEs were consumed in the
		 * previous iteration.
		 */
		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		/* Start new session if packet differs. */
		if (mpw.state == MLX5_MPW_STATE_OPENED) {
			if ((mpw.len != length) ||
			    (segs_n != 1) ||
			    (mpw.wqe->eseg.cs_flags != cs_flags))
				mlx5_mpw_close(txq, &mpw);
		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
			if ((mpw.len != length) ||
			    (segs_n != 1) ||
			    (length > inline_room) ||
			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
				mlx5_mpw_inline_close(txq, &mpw);
				inline_room =
					txq->max_inline * RTE_CACHE_LINE_SIZE;
			}
		}
		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
			if ((segs_n != 1) ||
			    (length > inline_room)) {
				/*
				 * Multi-Packet WQE consumes at most two WQEs.
				 * mlx5_mpw_new() expects to be able to use
				 * such resources.
				 */
				if (unlikely(max_wqe < 2))
					break;
				max_wqe -= 2;
				mlx5_mpw_new(txq, &mpw, length);
				mpw.wqe->eseg.cs_flags = cs_flags;
			} else {
				if (unlikely(max_wqe < wqe_inl_n))
					break;
				max_wqe -= wqe_inl_n;
				mlx5_mpw_inline_new(txq, &mpw, length);
				mpw.wqe->eseg.cs_flags = cs_flags;
			}
		}
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
		if (mpw.state == MLX5_MPW_STATE_OPENED) {
			assert(inline_room ==
			       txq->max_inline * RTE_CACHE_LINE_SIZE);
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			length = 0;
#endif
			do {
				volatile struct mlx5_wqe_data_seg *dseg;

				assert(buf);
				(*txq->elts)[elts_head++ & elts_m] = buf;
				dseg = mpw.data.dseg[mpw.pkts_n];
				addr = rte_pktmbuf_mtod(buf, uintptr_t);
				*dseg = (struct mlx5_wqe_data_seg){
					.byte_count = htonl(DATA_LEN(buf)),
					.lkey = mlx5_tx_mb2mr(txq, buf),
					.addr = htonll(addr),
				};
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
				length += DATA_LEN(buf);
#endif
				buf = buf->next;
				++mpw.pkts_n;
				++j;
			} while (--segs_n);
			assert(length == mpw.len);
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
				mlx5_mpw_close(txq, &mpw);
		} else {
			unsigned int max;

			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
			assert(length <= inline_room);
			assert(length == DATA_LEN(buf));
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			(*txq->elts)[elts_head++ & elts_m] = buf;
			/* Maximum number of bytes before wrapping. */
			max = ((((uintptr_t)(txq->wqes)) +
				(1 << txq->wqe_n) *
				MLX5_WQE_SIZE) -
			       (uintptr_t)mpw.data.raw);
			if (length > max) {
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)addr,
					   max);
				mpw.data.raw = (volatile void *)txq->wqes;
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)(addr + max),
					   length - max);
				mpw.data.raw += length - max;
			} else {
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)addr,
					   length);

				if (length == max)
					mpw.data.raw =
						(volatile void *)txq->wqes;
				else
					mpw.data.raw += length;
			}
			++mpw.pkts_n;
			mpw.total_len += length;
			++j;
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
				mlx5_mpw_inline_close(txq, &mpw);
				inline_room =
					txq->max_inline * RTE_CACHE_LINE_SIZE;
			} else {
				inline_room -= length;
			}
		}
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile struct mlx5_wqe *wqe = mpw.wqe;

		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
		mlx5_mpw_inline_close(txq, &mpw);
	else if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq, mpw.wqe);
	txq->elts_head = elts_head;
	return i;
}
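/*
 * Illustration only (not part of the driver logic): how the Enhanced MPW
 * path in mlx5_tx_burst_empw() below inlines packet data into the WQ while
 * handling the wrap-around of the circular ring, using the helpers defined
 * near the top of this file. Padding and the length checks are omitted.
 *
 *	uint32_t inl_hdr = htonl(length | MLX5_INLINE_SEG);
 *	unsigned int max;
 *
 *	max = tx_mlx5_wq_tailroom(txq, (void *)(uintptr_t)mpw.data.raw);
 *	mpw.data.raw = mlx5_copy_to_wq((void *)(uintptr_t)mpw.data.raw,
 *				       &inl_hdr, sizeof(inl_hdr),
 *				       (void *)(uintptr_t)txq->wqes, max);
 *	max = tx_mlx5_wq_tailroom(txq, (void *)(uintptr_t)mpw.data.raw);
 *	mpw.data.raw = mlx5_copy_to_wq((void *)(uintptr_t)mpw.data.raw,
 *				       (void *)addr, length,
 *				       (void *)(uintptr_t)txq->wqes, max);
 */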
/**
 * Open an Enhanced MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 * @param padding
 *   Non-zero to pad the session start with a zero-length inline header.
 */
static inline void
mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding)
{
	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);

	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
	mpw->pkts_n = 0;
	mpw->total_len = sizeof(struct mlx5_wqe);
	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
				  (txq->wqe_ci << 8) |
				  MLX5_OPCODE_ENHANCED_MPSW);
	mpw->wqe->ctrl[2] = 0;
	mpw->wqe->ctrl[3] = 0;
	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
	if (unlikely(padding)) {
		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);

		/* Pad the first 2 DWORDs with zero-length inline header. */
		*(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG);
		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
			htonl(MLX5_INLINE_SEG);
		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
		/* Start from the next WQEBB. */
		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
	} else {
		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
	}
}

/**
 * Close an Enhanced MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 *
 * @return
 *   Number of consumed WQEs.
 */
static inline uint16_t
mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw)
{
	uint16_t ret;

	/*
	 * Store size in multiple of 16 bytes. Control and Ethernet segments
	 * count as 2.
	 */
	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
	txq->wqe_ci += ret;
	return ret;
}

/**
 * DPDK callback for TX with Enhanced MPW support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	unsigned int i = 0;
	unsigned int j = 0;
	uint16_t max_elts;
	uint16_t max_wqe;
	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
	unsigned int mpw_room = 0;
	unsigned int inl_pad = 0;
	uint32_t inl_hdr;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	};

	if (unlikely(!pkts_n))
		return 0;
	/* Start processing. */
	mlx5_tx_complete(txq);
	max_elts = (elts_n - (elts_head - txq->elts_tail));
	/* A CQE slot must always be available. */
	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
	if (unlikely(!max_wqe))
		return 0;
	do {
		struct rte_mbuf *buf = *(pkts++);
		uintptr_t addr;
		uint64_t naddr;
		unsigned int n;
		unsigned int do_inline = 0; /* Whether inline is possible. */
		uint32_t length;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max_elts - j < segs_n)
			break;
		/* Do not bother with large packets MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX)
			break;
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		/* Start new session if:
		 * - multi-segment packet
		 * - no space left even for a dseg
		 * - next packet can be inlined with a new WQE
		 * - cs_flag differs
		 * It can't be MLX5_MPW_STATE_OPENED as a legacy MPW session
		 * always holds a single segmented packet.
		 */
		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
			if ((segs_n != 1) ||
			    (inl_pad + sizeof(struct mlx5_wqe_data_seg) >
			     mpw_room) ||
			    (length <= txq->inline_max_packet_sz &&
			     inl_pad + sizeof(inl_hdr) + length >
			     mpw_room) ||
			    (mpw.wqe->eseg.cs_flags != cs_flags))
				max_wqe -= mlx5_empw_close(txq, &mpw);
		}
		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
			if (unlikely(segs_n != 1)) {
				/* Fall back to legacy MPW.
				 * An MPW session consumes 2 WQEs at most to
				 * include MLX5_MPW_DSEG_MAX pointers.
				 */
				if (unlikely(max_wqe < 2))
					break;
				mlx5_mpw_new(txq, &mpw, length);
			} else {
				/* In Enhanced MPW, inline as much as the
				 * budget allows. The remaining space is to
				 * be filled with dsegs. If the title WQEBB
				 * isn't padded, it will have 2 dsegs there.
				 */
				mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
					    (max_inline ? max_inline :
					     pkts_n * MLX5_WQE_DWORD_SIZE) +
					    MLX5_WQE_SIZE);
				if (unlikely(max_wqe * MLX5_WQE_SIZE <
					     mpw_room))
					break;
				/* Don't pad the title WQEBB to not waste WQ. */
				mlx5_empw_new(txq, &mpw, 0);
				mpw_room -= mpw.total_len;
				inl_pad = 0;
				do_inline =
					length <= txq->inline_max_packet_sz &&
					sizeof(inl_hdr) + length <= mpw_room &&
					!txq->mpw_hdr_dseg;
			}
			mpw.wqe->eseg.cs_flags = cs_flags;
		} else {
			/* Evaluate whether the next packet can be inlined.
			 * Inlining is possible when:
			 * - length is less than configured value
			 * - length fits for remaining space
			 * - not required to fill the title WQEBB with dsegs
			 */
			do_inline =
				length <= txq->inline_max_packet_sz &&
				inl_pad + sizeof(inl_hdr) + length <=
				mpw_room &&
				(!txq->mpw_hdr_dseg ||
				 mpw.total_len >= MLX5_WQE_SIZE);
		}
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
		if (unlikely(mpw.state == MLX5_MPW_STATE_OPENED)) {
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			length = 0;
#endif
			do {
				volatile struct mlx5_wqe_data_seg *dseg;

				assert(buf);
				(*txq->elts)[elts_head++ & elts_m] = buf;
				dseg = mpw.data.dseg[mpw.pkts_n];
				addr = rte_pktmbuf_mtod(buf, uintptr_t);
				*dseg = (struct mlx5_wqe_data_seg){
					.byte_count = htonl(DATA_LEN(buf)),
					.lkey = mlx5_tx_mb2mr(txq, buf),
					.addr = htonll(addr),
				};
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
				length += DATA_LEN(buf);
#endif
				buf = buf->next;
				++j;
				++mpw.pkts_n;
			} while (--segs_n);
			/* A multi-segmented packet takes one MPW session.
			 * TODO: Pack more multi-segmented packets if possible.
			 */
			mlx5_mpw_close(txq, &mpw);
			if (mpw.pkts_n < 3)
				max_wqe--;
			else
				max_wqe -= 2;
		} else if (do_inline) {
			/* Inline packet into WQE. */
			unsigned int max;

			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
			assert(length == DATA_LEN(buf));
			inl_hdr = htonl(length | MLX5_INLINE_SEG);
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			mpw.data.raw = (volatile void *)
				((uintptr_t)mpw.data.raw + inl_pad);
			max = tx_mlx5_wq_tailroom(txq,
					(void *)(uintptr_t)mpw.data.raw);
			/* Copy inline header. */
			mpw.data.raw = (volatile void *)
				mlx5_copy_to_wq(
					  (void *)(uintptr_t)mpw.data.raw,
					  &inl_hdr,
					  sizeof(inl_hdr),
					  (void *)(uintptr_t)txq->wqes,
					  max);
			max = tx_mlx5_wq_tailroom(txq,
					(void *)(uintptr_t)mpw.data.raw);
			/* Copy packet data. */
			mpw.data.raw = (volatile void *)
				mlx5_copy_to_wq(
					  (void *)(uintptr_t)mpw.data.raw,
					  (void *)addr,
					  length,
					  (void *)(uintptr_t)txq->wqes,
					  max);
			++mpw.pkts_n;
			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
			/* No need to get completion as the entire packet is
			 * copied to WQ. Free the buf right away.
			 */
			rte_pktmbuf_free_seg(buf);
			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
			/* Add pad in the next packet if any. */
			inl_pad = (((uintptr_t)mpw.data.raw +
				    (MLX5_WQE_DWORD_SIZE - 1)) &
				   ~(MLX5_WQE_DWORD_SIZE - 1)) -
				  (uintptr_t)mpw.data.raw;
		} else {
			/* No inline. Load a dseg of packet pointer. */
			volatile rte_v128u32_t *dseg;

			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
			assert(length == DATA_LEN(buf));
			if (!tx_mlx5_wq_tailroom(txq,
					(void *)((uintptr_t)mpw.data.raw
						+ inl_pad)))
				dseg = (volatile void *)txq->wqes;
			else
				dseg = (volatile void *)
					((uintptr_t)mpw.data.raw +
					 inl_pad);
			(*txq->elts)[elts_head++ & elts_m] = buf;
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++)
				rte_prefetch2((void *)(addr +
						n * RTE_CACHE_LINE_SIZE));
			naddr = htonll(addr);
			*dseg = (rte_v128u32_t) {
				htonl(length),
				mlx5_tx_mb2mr(txq, buf),
				naddr,
				naddr >> 32,
			};
			mpw.data.raw = (volatile void *)(dseg + 1);
			mpw.total_len += (inl_pad + sizeof(*dseg));
			++j;
			++mpw.pkts_n;
			mpw_room -= (inl_pad + sizeof(*dseg));
			inl_pad = 0;
		}
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (i < pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
	    (uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
	     (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
		volatile struct mlx5_wqe *wqe = mpw.wqe;

		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
		txq->elts_comp = 0;
		txq->mpw_comp = txq->wqe_ci;
		txq->cq_pi++;
	} else {
		txq->elts_comp += j;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
		mlx5_empw_close(txq, &mpw);
	else if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	/* Ring QP doorbell. */
	mlx5_tx_dbrec(txq, mpw.wqe);
	txq->elts_head = elts_head;
	return i;
}

/**
 * Translate RX completion flags to packet type.
 *
 * @param[in] cqe
 *   Pointer to CQE.
 *
 * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
{
	uint8_t idx;
	uint8_t pinfo = cqe->pkt_info;
	uint16_t ptype = cqe->hdr_type_etc;

	/*
	 * The index to the array should have:
	 * bit[1:0] = l3_hdr_type
	 * bit[4:2] = l4_hdr_type
	 * bit[5] = ip_frag
	 * bit[6] = tunneled
	 * bit[7] = outer_l3_type
	 */
	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
	return mlx5_ptype_table[idx];
}

/**
 * Get size of the next packet for a given CQE. For compressed CQEs, the
 * consumer index is updated only once all packets of the current CQE have
 * been processed.
 *
 * @param rxq
 *   Pointer to RX queue.
 * @param cqe
 *   CQE to process.
 * @param[out] rss_hash
 *   Packet RSS Hash result.
 *
 * @return
 *   Packet size in bytes (0 if there is none), -1 in case of completion
 *   with error.
 */
static inline int
mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
		 uint16_t cqe_cnt, uint32_t *rss_hash)
{
	struct rxq_zip *zip = &rxq->zip;
	uint16_t cqe_n = cqe_cnt + 1;
	int len = 0;
	uint16_t idx, end;

	/* Process compressed data in the CQE and mini arrays. */
	if (zip->ai) {
		volatile struct mlx5_mini_cqe8 (*mc)[8] =
			(volatile struct mlx5_mini_cqe8 (*)[8])
			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info);

		len = ntohl((*mc)[zip->ai & 7].byte_cnt);
		*rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result);
		if ((++zip->ai & 7) == 0) {
			/* Invalidate consumed CQEs */
			idx = zip->ca;
			end = zip->na;
			while (idx != end) {
				(*rxq->cqes)[idx & cqe_cnt].op_own =
					MLX5_CQE_INVALIDATE;
				++idx;
			}
			/*
			 * Increment consumer index to skip the number of
			 * CQEs consumed. Hardware leaves holes in the CQ
			 * ring for software use.
			 */
			zip->ca = zip->na;
			zip->na += 8;
		}
		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
			/* Invalidate the rest */
			idx = zip->ca;
			end = zip->cq_ci;

			while (idx != end) {
				(*rxq->cqes)[idx & cqe_cnt].op_own =
					MLX5_CQE_INVALIDATE;
				++idx;
			}
			rxq->cq_ci = zip->cq_ci;
			zip->ai = 0;
		}
	/* No compressed data, get next CQE and verify if it is compressed. */
	} else {
		int ret;
		int8_t op_own;

		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
		if (unlikely(ret == 1))
			return 0;
		++rxq->cq_ci;
		op_own = cqe->op_own;
		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
			volatile struct mlx5_mini_cqe8 (*mc)[8] =
				(volatile struct mlx5_mini_cqe8 (*)[8])
				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
							  cqe_cnt].pkt_info);

			/* Fix endianness. */
			zip->cqe_cnt = ntohl(cqe->byte_cnt);
			/*
			 * Current mini array position is the one returned by
			 * check_cqe().
			 *
			 * If completion comprises several mini arrays, as a
			 * special case the second one is located 7 CQEs after
			 * the initial CQE instead of 8 for subsequent ones.
			 */
			zip->ca = rxq->cq_ci;
			zip->na = zip->ca + 7;
			/* Compute the next non compressed CQE. */
			--rxq->cq_ci;
			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
			/* Get packet size to return. */
			len = ntohl((*mc)[0].byte_cnt);
			*rss_hash = ntohl((*mc)[0].rx_hash_result);
			zip->ai = 1;
			/* Prefetch all the entries to be invalidated */
			idx = zip->ca;
			end = zip->cq_ci;
			while (idx != end) {
				rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]);
				++idx;
			}
		} else {
			len = ntohl(cqe->byte_cnt);
			*rss_hash = ntohl(cqe->rx_hash_res);
		}
		/* Error while receiving packet. */
		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
			return -1;
	}
	return len;
}

/**
 * Translate RX completion flags to offload flags.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param[in] cqe
 *   Pointer to CQE.
 *
 * @return
 *   Offload flags (ol_flags) for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
{
	uint32_t ol_flags = 0;
	uint16_t flags = ntohs(cqe->hdr_type_etc);

	ol_flags =
		TRANSPOSE(flags,
			  MLX5_CQE_RX_L3_HDR_VALID,
			  PKT_RX_IP_CKSUM_GOOD) |
		TRANSPOSE(flags,
			  MLX5_CQE_RX_L4_HDR_VALID,
			  PKT_RX_L4_CKSUM_GOOD);
	if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
		ol_flags |=
			TRANSPOSE(flags,
				  MLX5_CQE_RX_L3_HDR_VALID,
				  PKT_RX_IP_CKSUM_GOOD) |
			TRANSPOSE(flags,
				  MLX5_CQE_RX_L4_HDR_VALID,
				  PKT_RX_L4_CKSUM_GOOD);
	return ol_flags;
}

/**
 * DPDK callback for RX.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = dpdk_rxq;
	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
	const unsigned int sges_n = rxq->sges_n;
	struct rte_mbuf *pkt = NULL;
	struct rte_mbuf *seg = NULL;
	volatile struct mlx5_cqe *cqe =
		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
	unsigned int i = 0;
	unsigned int rq_ci = rxq->rq_ci << sges_n;
	int len = 0; /* keep its value across iterations. */

	while (pkts_n) {
		unsigned int idx = rq_ci & wqe_cnt;
		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
		struct rte_mbuf *rep = (*rxq->elts)[idx];
		uint32_t rss_hash_res = 0;

		if (pkt)
			NEXT(seg) = rep;
		seg = rep;
		rte_prefetch0(seg);
		rte_prefetch0(cqe);
		rte_prefetch0(wqe);
		rep = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			++rxq->stats.rx_nombuf;
			if (!pkt) {
				/*
				 * no buffers before we even started,
				 * bail out silently.
				 */
				break;
			}
			while (pkt != seg) {
				assert(pkt != (*rxq->elts)[idx]);
				rep = NEXT(pkt);
				NEXT(pkt) = NULL;
				NB_SEGS(pkt) = 1;
				rte_mbuf_raw_free(pkt);
				pkt = rep;
			}
			break;
		}
		if (!pkt) {
			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
					       &rss_hash_res);
			if (!len) {
				rte_mbuf_raw_free(rep);
				break;
			}
			if (unlikely(len == -1)) {
				/* RX error, packet is likely too large. */
				rte_mbuf_raw_free(rep);
				++rxq->stats.idropped;
				goto skip;
			}
			pkt = seg;
			assert(len >= (rxq->crc_present << 2));
			/* Update packet information. */
			pkt->packet_type = rxq_cq_to_pkt_type(cqe);
			pkt->ol_flags = 0;
			if (rss_hash_res && rxq->rss_hash) {
				pkt->hash.rss = rss_hash_res;
				pkt->ol_flags = PKT_RX_RSS_HASH;
			}
			if (rxq->mark &&
			    MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
				pkt->ol_flags |= PKT_RX_FDIR;
				if (cqe->sop_drop_qpn !=
				    htonl(MLX5_FLOW_MARK_DEFAULT)) {
					uint32_t mark = cqe->sop_drop_qpn;

					pkt->ol_flags |= PKT_RX_FDIR_ID;
					pkt->hash.fdir.hi =
						mlx5_flow_mark_get(mark);
				}
			}
			if (rxq->csum | rxq->csum_l2tun)
				pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe);
			if (rxq->vlan_strip &&
			    (cqe->hdr_type_etc &
			     htons(MLX5_CQE_VLAN_STRIPPED))) {
				pkt->ol_flags |= PKT_RX_VLAN_PKT |
					PKT_RX_VLAN_STRIPPED;
				pkt->vlan_tci = ntohs(cqe->vlan_info);
			}
			if (rxq->crc_present)
				len -= ETHER_CRC_LEN;
			PKT_LEN(pkt) = len;
		}
		DATA_LEN(rep) = DATA_LEN(seg);
		PKT_LEN(rep) = PKT_LEN(seg);
		SET_DATA_OFF(rep, DATA_OFF(seg));
		PORT(rep) = PORT(seg);
		(*rxq->elts)[idx] = rep;
		/*
		 * Fill NIC descriptor with the new buffer. The lkey and size
		 * of the buffers are already known, only the buffer address
		 * changes.
		 */
		wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
		if (len > DATA_LEN(seg)) {
			len -= DATA_LEN(seg);
			++NB_SEGS(pkt);
			++rq_ci;
			continue;
		}
		DATA_LEN(seg) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += PKT_LEN(pkt);
#endif
		/* Return packet. */
		*(pkts++) = pkt;
		pkt = NULL;
		--pkts_n;
		++i;
skip:
		/* Align consumer index to the next stride. */
		rq_ci >>= sges_n;
		++rq_ci;
		rq_ci <<= sges_n;
	}
	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
		return 0;
	/* Update the consumer index. */
	rxq->rq_ci = rq_ci >> sges_n;
	rte_wmb();
	*rxq->cq_db = htonl(rxq->cq_ci);
	rte_wmb();
	*rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += i;
#endif
	return i;
}

/**
 * Dummy DPDK callback for TX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/**
 * Dummy DPDK callback for RX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/*
 * Vectorized Rx/Tx routines are not compiled in when required vector
 * instructions are not supported on a target architecture. The following
 * null stubs are needed for linkage when those routines are not provided
 * by another file (e.g. mlx5_rxtx_vec_sse.c for x86).
 */

uint16_t __attribute__((weak))
mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

uint16_t __attribute__((weak))
mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

uint16_t __attribute__((weak))
mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

int __attribute__((weak))
priv_check_raw_vec_tx_support(struct priv *priv)
{
	(void)priv;
	return -ENOTSUP;
}

int __attribute__((weak))
priv_check_vec_tx_support(struct priv *priv)
{
	(void)priv;
	return -ENOTSUP;
}

int __attribute__((weak))
rxq_check_vec_support(struct rxq *rxq)
{
	(void)rxq;
	return -ENOTSUP;
}

int __attribute__((weak))
priv_check_vec_rx_support(struct priv *priv)
{
	(void)priv;
	return -ENOTSUP;
}

void __attribute__((weak))
priv_prep_vec_rx_function(struct priv *priv)
{
	(void)priv;
}