1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <assert.h> 35 #include <stdint.h> 36 #include <string.h> 37 #include <stdlib.h> 38 39 /* Verbs header. */ 40 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ 41 #ifdef PEDANTIC 42 #pragma GCC diagnostic ignored "-Wpedantic" 43 #endif 44 #include <infiniband/verbs.h> 45 #include <infiniband/mlx5dv.h> 46 #ifdef PEDANTIC 47 #pragma GCC diagnostic error "-Wpedantic" 48 #endif 49 50 #include <rte_mbuf.h> 51 #include <rte_mempool.h> 52 #include <rte_prefetch.h> 53 #include <rte_common.h> 54 #include <rte_branch_prediction.h> 55 #include <rte_ether.h> 56 57 #include "mlx5.h" 58 #include "mlx5_utils.h" 59 #include "mlx5_rxtx.h" 60 #include "mlx5_autoconf.h" 61 #include "mlx5_defs.h" 62 #include "mlx5_prm.h" 63 64 static __rte_always_inline uint32_t 65 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe); 66 67 static __rte_always_inline int 68 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 69 uint16_t cqe_cnt, uint32_t *rss_hash); 70 71 static __rte_always_inline uint32_t 72 rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); 73 74 uint32_t mlx5_ptype_table[] __rte_cache_aligned = { 75 [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ 76 }; 77 78 /** 79 * Build a table to translate Rx completion flags to packet type. 80 * 81 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 82 */ 83 void 84 mlx5_set_ptype_table(void) 85 { 86 unsigned int i; 87 uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table; 88 89 /* Last entry must not be overwritten, reserved for errored packet. 
*/ 90 for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) 91 (*p)[i] = RTE_PTYPE_UNKNOWN; 92 /* 93 * The index to the array should have: 94 * bit[1:0] = l3_hdr_type 95 * bit[4:2] = l4_hdr_type 96 * bit[5] = ip_frag 97 * bit[6] = tunneled 98 * bit[7] = outer_l3_type 99 */ 100 /* L2 */ 101 (*p)[0x00] = RTE_PTYPE_L2_ETHER; 102 /* L3 */ 103 (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 104 RTE_PTYPE_L4_NONFRAG; 105 (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 106 RTE_PTYPE_L4_NONFRAG; 107 /* Fragmented */ 108 (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 109 RTE_PTYPE_L4_FRAG; 110 (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 111 RTE_PTYPE_L4_FRAG; 112 /* TCP */ 113 (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 114 RTE_PTYPE_L4_TCP; 115 (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 116 RTE_PTYPE_L4_TCP; 117 /* UDP */ 118 (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 119 RTE_PTYPE_L4_UDP; 120 (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 121 RTE_PTYPE_L4_UDP; 122 /* Repeat with outer_l3_type being set. Just in case. */ 123 (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 124 RTE_PTYPE_L4_NONFRAG; 125 (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 126 RTE_PTYPE_L4_NONFRAG; 127 (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 128 RTE_PTYPE_L4_FRAG; 129 (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 130 RTE_PTYPE_L4_FRAG; 131 (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 132 RTE_PTYPE_L4_TCP; 133 (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 134 RTE_PTYPE_L4_TCP; 135 (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 136 RTE_PTYPE_L4_UDP; 137 (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 138 RTE_PTYPE_L4_UDP; 139 /* Tunneled - L3 */ 140 (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 141 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 142 RTE_PTYPE_INNER_L4_NONFRAG; 143 (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 144 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 145 RTE_PTYPE_INNER_L4_NONFRAG; 146 (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 147 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 148 RTE_PTYPE_INNER_L4_NONFRAG; 149 (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 150 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 151 RTE_PTYPE_INNER_L4_NONFRAG; 152 /* Tunneled - Fragmented */ 153 (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 154 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 155 RTE_PTYPE_INNER_L4_FRAG; 156 (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 157 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 158 RTE_PTYPE_INNER_L4_FRAG; 159 (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 160 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 161 RTE_PTYPE_INNER_L4_FRAG; 162 (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 163 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 164 RTE_PTYPE_INNER_L4_FRAG; 165 /* Tunneled - TCP */ 166 (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 167 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 168 RTE_PTYPE_L4_TCP; 169 (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 170 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 171 RTE_PTYPE_L4_TCP; 172 (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 173 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 174 
		     RTE_PTYPE_L4_TCP;
	(*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_TCP;
	/* Tunneled - UDP */
	(*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_UDP;
	(*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_UDP;
	(*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_L4_UDP;
	(*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		     RTE_PTYPE_L4_UDP;
}

/**
 * Return the size of tailroom of WQ.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param addr
 *   Pointer to tail of WQ.
 *
 * @return
 *   Size of tailroom.
 */
static inline size_t
tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
{
	size_t tailroom;
	tailroom = (uintptr_t)(txq->wqes) +
		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
		   (uintptr_t)addr;
	return tailroom;
}

/**
 * Copy data to tailroom of circular queue.
 *
 * @param dst
 *   Pointer to destination.
 * @param src
 *   Pointer to source.
 * @param n
 *   Number of bytes to copy.
 * @param base
 *   Pointer to head of queue.
 * @param tailroom
 *   Size of tailroom from dst.
 *
 * @return
 *   Pointer after copied data.
 */
static inline void *
mlx5_copy_to_wq(void *dst, const void *src, size_t n,
		void *base, size_t tailroom)
{
	void *ret;

	if (n > tailroom) {
		rte_memcpy(dst, src, tailroom);
		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
			   n - tailroom);
		ret = (uint8_t *)base + n - tailroom;
	} else {
		rte_memcpy(dst, src, n);
		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
	}
	return ret;
}

/**
 * DPDK callback to check the status of a tx descriptor.
 *
 * @param tx_queue
 *   The tx queue.
 * @param[in] offset
 *   The index of the descriptor in the ring.
 *
 * @return
 *   The status of the tx descriptor.
 */
int
mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
{
	struct mlx5_txq_data *txq = tx_queue;
	uint16_t used;

	mlx5_tx_complete(txq);
	used = txq->elts_head - txq->elts_tail;
	if (offset < used)
		return RTE_ETH_TX_DESC_FULL;
	return RTE_ETH_TX_DESC_DONE;
}

/**
 * DPDK callback to check the status of a rx descriptor.
 *
 * @param rx_queue
 *   The rx queue.
 * @param[in] offset
 *   The index of the descriptor in the ring.
 *
 * @return
 *   The status of the rx descriptor.
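 *
 * @note
 *   Applications normally reach this handler through the generic ethdev API
 *   rather than by calling it directly. Illustrative sketch only, with
 *   port_id, queue_id, offset and handle_ready_packet() as placeholders:
 * @code
 *   if (rte_eth_rx_descriptor_status(port_id, queue_id, offset) ==
 *       RTE_ETH_RX_DESC_DONE)
 *           handle_ready_packet();
 * @endcode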
283 */ 284 int 285 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) 286 { 287 struct mlx5_rxq_data *rxq = rx_queue; 288 struct rxq_zip *zip = &rxq->zip; 289 volatile struct mlx5_cqe *cqe; 290 const unsigned int cqe_n = (1 << rxq->cqe_n); 291 const unsigned int cqe_cnt = cqe_n - 1; 292 unsigned int cq_ci; 293 unsigned int used; 294 295 /* if we are processing a compressed cqe */ 296 if (zip->ai) { 297 used = zip->cqe_cnt - zip->ca; 298 cq_ci = zip->cq_ci; 299 } else { 300 used = 0; 301 cq_ci = rxq->cq_ci; 302 } 303 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 304 while (check_cqe(cqe, cqe_n, cq_ci) == 0) { 305 int8_t op_own; 306 unsigned int n; 307 308 op_own = cqe->op_own; 309 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) 310 n = rte_be_to_cpu_32(cqe->byte_cnt); 311 else 312 n = 1; 313 cq_ci += n; 314 used += n; 315 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 316 } 317 used = RTE_MIN(used, (1U << rxq->elts_n) - 1); 318 if (offset < used) 319 return RTE_ETH_RX_DESC_DONE; 320 return RTE_ETH_RX_DESC_AVAIL; 321 } 322 323 /** 324 * DPDK callback for TX. 325 * 326 * @param dpdk_txq 327 * Generic pointer to TX queue structure. 328 * @param[in] pkts 329 * Packets to transmit. 330 * @param pkts_n 331 * Number of packets in array. 332 * 333 * @return 334 * Number of packets successfully transmitted (<= pkts_n). 335 */ 336 uint16_t 337 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 338 { 339 struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; 340 uint16_t elts_head = txq->elts_head; 341 const uint16_t elts_n = 1 << txq->elts_n; 342 const uint16_t elts_m = elts_n - 1; 343 unsigned int i = 0; 344 unsigned int j = 0; 345 unsigned int k = 0; 346 uint16_t max_elts; 347 unsigned int max_inline = txq->max_inline; 348 const unsigned int inline_en = !!max_inline && txq->inline_en; 349 uint16_t max_wqe; 350 unsigned int comp; 351 volatile struct mlx5_wqe_v *wqe = NULL; 352 volatile struct mlx5_wqe_ctrl *last_wqe = NULL; 353 unsigned int segs_n = 0; 354 struct rte_mbuf *buf = NULL; 355 uint8_t *raw; 356 357 if (unlikely(!pkts_n)) 358 return 0; 359 /* Prefetch first packet cacheline. */ 360 rte_prefetch0(*pkts); 361 /* Start processing. */ 362 mlx5_tx_complete(txq); 363 max_elts = (elts_n - (elts_head - txq->elts_tail)); 364 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 365 if (unlikely(!max_wqe)) 366 return 0; 367 do { 368 volatile rte_v128u32_t *dseg = NULL; 369 uint32_t length; 370 unsigned int ds = 0; 371 unsigned int sg = 0; /* counter of additional segs attached. */ 372 uintptr_t addr; 373 uint64_t naddr; 374 uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2; 375 uint16_t tso_header_sz = 0; 376 uint16_t ehdr; 377 uint8_t cs_flags = 0; 378 uint64_t tso = 0; 379 uint16_t tso_segsz = 0; 380 #ifdef MLX5_PMD_SOFT_COUNTERS 381 uint32_t total_length = 0; 382 #endif 383 384 /* first_seg */ 385 buf = *pkts; 386 segs_n = buf->nb_segs; 387 /* 388 * Make sure there is enough room to store this packet and 389 * that one ring entry remains unused. 
390 */ 391 assert(segs_n); 392 if (max_elts < segs_n) 393 break; 394 max_elts -= segs_n; 395 --segs_n; 396 if (unlikely(--max_wqe == 0)) 397 break; 398 wqe = (volatile struct mlx5_wqe_v *) 399 tx_mlx5_wqe(txq, txq->wqe_ci); 400 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 401 if (pkts_n - i > 1) 402 rte_prefetch0(*(pkts + 1)); 403 addr = rte_pktmbuf_mtod(buf, uintptr_t); 404 length = DATA_LEN(buf); 405 ehdr = (((uint8_t *)addr)[1] << 8) | 406 ((uint8_t *)addr)[0]; 407 #ifdef MLX5_PMD_SOFT_COUNTERS 408 total_length = length; 409 #endif 410 if (length < (MLX5_WQE_DWORD_SIZE + 2)) { 411 txq->stats.oerrors++; 412 break; 413 } 414 /* Update element. */ 415 (*txq->elts)[elts_head & elts_m] = buf; 416 /* Prefetch next buffer data. */ 417 if (pkts_n - i > 1) 418 rte_prefetch0( 419 rte_pktmbuf_mtod(*(pkts + 1), volatile void *)); 420 /* Should we enable HW CKSUM offload */ 421 if (buf->ol_flags & 422 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { 423 const uint64_t is_tunneled = buf->ol_flags & 424 (PKT_TX_TUNNEL_GRE | 425 PKT_TX_TUNNEL_VXLAN); 426 427 if (is_tunneled && txq->tunnel_en) { 428 cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM | 429 MLX5_ETH_WQE_L4_INNER_CSUM; 430 if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM) 431 cs_flags |= MLX5_ETH_WQE_L3_CSUM; 432 } else { 433 cs_flags = MLX5_ETH_WQE_L3_CSUM | 434 MLX5_ETH_WQE_L4_CSUM; 435 } 436 } 437 raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE; 438 /* Replace the Ethernet type by the VLAN if necessary. */ 439 if (buf->ol_flags & PKT_TX_VLAN_PKT) { 440 uint32_t vlan = rte_cpu_to_be_32(0x81000000 | 441 buf->vlan_tci); 442 unsigned int len = 2 * ETHER_ADDR_LEN - 2; 443 444 addr += 2; 445 length -= 2; 446 /* Copy Destination and source mac address. */ 447 memcpy((uint8_t *)raw, ((uint8_t *)addr), len); 448 /* Copy VLAN. */ 449 memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan)); 450 /* Copy missing two bytes to end the DSeg. */ 451 memcpy((uint8_t *)raw + len + sizeof(vlan), 452 ((uint8_t *)addr) + len, 2); 453 addr += len + 2; 454 length -= (len + 2); 455 } else { 456 memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2, 457 MLX5_WQE_DWORD_SIZE); 458 length -= pkt_inline_sz; 459 addr += pkt_inline_sz; 460 } 461 raw += MLX5_WQE_DWORD_SIZE; 462 if (txq->tso_en) { 463 tso = buf->ol_flags & PKT_TX_TCP_SEG; 464 if (tso) { 465 uintptr_t end = (uintptr_t) 466 (((uintptr_t)txq->wqes) + 467 (1 << txq->wqe_n) * 468 MLX5_WQE_SIZE); 469 unsigned int copy_b; 470 uint8_t vlan_sz = (buf->ol_flags & 471 PKT_TX_VLAN_PKT) ? 4 : 0; 472 const uint64_t is_tunneled = 473 buf->ol_flags & 474 (PKT_TX_TUNNEL_GRE | 475 PKT_TX_TUNNEL_VXLAN); 476 477 tso_header_sz = buf->l2_len + vlan_sz + 478 buf->l3_len + buf->l4_len; 479 tso_segsz = buf->tso_segsz; 480 if (unlikely(tso_segsz == 0)) { 481 txq->stats.oerrors++; 482 break; 483 } 484 if (is_tunneled && txq->tunnel_en) { 485 tso_header_sz += buf->outer_l2_len + 486 buf->outer_l3_len; 487 cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM; 488 } else { 489 cs_flags |= MLX5_ETH_WQE_L4_CSUM; 490 } 491 if (unlikely(tso_header_sz > 492 MLX5_MAX_TSO_HEADER)) { 493 txq->stats.oerrors++; 494 break; 495 } 496 copy_b = tso_header_sz - pkt_inline_sz; 497 /* First seg must contain all headers. */ 498 assert(copy_b <= length); 499 if (copy_b && 500 ((end - (uintptr_t)raw) > copy_b)) { 501 uint16_t n = (MLX5_WQE_DS(copy_b) - 502 1 + 3) / 4; 503 504 if (unlikely(max_wqe < n)) 505 break; 506 max_wqe -= n; 507 rte_memcpy((void *)raw, 508 (void *)addr, copy_b); 509 addr += copy_b; 510 length -= copy_b; 511 /* Include padding for TSO header. 
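					 * The inlined header size is rounded
					 * up to whole MLX5_WQE_DWORD_SIZE
					 * units before raw and pkt_inline_sz
					 * are advanced.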
					 */
					copy_b = MLX5_WQE_DS(copy_b) *
						 MLX5_WQE_DWORD_SIZE;
					pkt_inline_sz += copy_b;
					raw += copy_b;
				} else {
					/* NOP WQE. */
					wqe->ctrl = (rte_v128u32_t){
						rte_cpu_to_be_32(
							txq->wqe_ci << 8),
						rte_cpu_to_be_32(
							txq->qp_num_8s | 1),
						0,
						0,
					};
					ds = 1;
					total_length = 0;
					k++;
					goto next_wqe;
				}
			}
		}
		/* Inline if enough room. */
		if (inline_en || tso) {
			uint32_t inl;
			uintptr_t end = (uintptr_t)
				(((uintptr_t)txq->wqes) +
				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
			unsigned int inline_room = max_inline *
						   RTE_CACHE_LINE_SIZE -
						   (pkt_inline_sz - 2) -
						   !!tso * sizeof(inl);
			uintptr_t addr_end = (addr + inline_room) &
					     ~(RTE_CACHE_LINE_SIZE - 1);
			unsigned int copy_b = (addr_end > addr) ?
				RTE_MIN((addr_end - addr), length) :
				0;

			if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
				/*
				 * One Dseg remains in the current WQE. To
				 * keep the computation positive, it is
				 * removed after the bytes to Dseg conversion.
				 */
				uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;

				if (unlikely(max_wqe < n))
					break;
				max_wqe -= n;
				if (tso) {
					uint32_t inl =
						rte_cpu_to_be_32(copy_b |
							MLX5_INLINE_SEG);

					pkt_inline_sz =
						MLX5_WQE_DS(tso_header_sz) *
						MLX5_WQE_DWORD_SIZE;

					rte_memcpy((void *)raw,
						   (void *)&inl, sizeof(inl));
					raw += sizeof(inl);
					pkt_inline_sz += sizeof(inl);
				}
				rte_memcpy((void *)raw, (void *)addr, copy_b);
				addr += copy_b;
				length -= copy_b;
				pkt_inline_sz += copy_b;
			}
			/*
			 * 2 DWORDs consumed by the WQE header + ETH segment +
			 * the size of the inline part of the packet.
			 */
			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
			if (length > 0) {
				if (ds % (MLX5_WQE_SIZE /
					  MLX5_WQE_DWORD_SIZE) == 0) {
					if (unlikely(--max_wqe == 0))
						break;
					dseg = (volatile rte_v128u32_t *)
					       tx_mlx5_wqe(txq, txq->wqe_ci +
							   ds / 4);
				} else {
					dseg = (volatile rte_v128u32_t *)
						((uintptr_t)wqe +
						 (ds * MLX5_WQE_DWORD_SIZE));
				}
				goto use_dseg;
			} else if (!segs_n) {
				goto next_pkt;
			} else {
				/* dseg will be advanced as part of next_seg */
				dseg = (volatile rte_v128u32_t *)
					((uintptr_t)wqe +
					 ((ds - 1) * MLX5_WQE_DWORD_SIZE));
				goto next_seg;
			}
		} else {
			/*
			 * No inline has been done in the packet, only the
			 * Ethernet header has been stored.
			 */
			dseg = (volatile rte_v128u32_t *)
				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
			ds = 3;
use_dseg:
			/* Add the remaining packet as a simple ds. */
			naddr = rte_cpu_to_be_64(addr);
			*dseg = (rte_v128u32_t){
				rte_cpu_to_be_32(length),
				mlx5_tx_mb2mr(txq, buf),
				naddr,
				naddr >> 32,
			};
			++ds;
			if (!segs_n)
				goto next_pkt;
		}
next_seg:
		assert(buf);
		assert(ds);
		assert(wqe);
		/*
		 * Spill on next WQE when the current one does not have
		 * enough room left. The size of a WQE must be a multiple
		 * of the data segment size.
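		 * For instance, with the usual 64-byte WQEBB and 16-byte data
		 * segments, MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE is 4, so every
		 * fourth data segment starts a new WQEBB.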
636 */ 637 assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE)); 638 if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) { 639 if (unlikely(--max_wqe == 0)) 640 break; 641 dseg = (volatile rte_v128u32_t *) 642 tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4); 643 rte_prefetch0(tx_mlx5_wqe(txq, 644 txq->wqe_ci + ds / 4 + 1)); 645 } else { 646 ++dseg; 647 } 648 ++ds; 649 buf = buf->next; 650 assert(buf); 651 length = DATA_LEN(buf); 652 #ifdef MLX5_PMD_SOFT_COUNTERS 653 total_length += length; 654 #endif 655 /* Store segment information. */ 656 naddr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t)); 657 *dseg = (rte_v128u32_t){ 658 rte_cpu_to_be_32(length), 659 mlx5_tx_mb2mr(txq, buf), 660 naddr, 661 naddr >> 32, 662 }; 663 (*txq->elts)[++elts_head & elts_m] = buf; 664 ++sg; 665 /* Advance counter only if all segs are successfully posted. */ 666 if (sg < segs_n) 667 goto next_seg; 668 else 669 j += sg; 670 next_pkt: 671 if (ds > MLX5_DSEG_MAX) { 672 txq->stats.oerrors++; 673 break; 674 } 675 ++elts_head; 676 ++pkts; 677 ++i; 678 /* Initialize known and common part of the WQE structure. */ 679 if (tso) { 680 wqe->ctrl = (rte_v128u32_t){ 681 rte_cpu_to_be_32((txq->wqe_ci << 8) | 682 MLX5_OPCODE_TSO), 683 rte_cpu_to_be_32(txq->qp_num_8s | ds), 684 0, 685 0, 686 }; 687 wqe->eseg = (rte_v128u32_t){ 688 0, 689 cs_flags | (rte_cpu_to_be_16(tso_segsz) << 16), 690 0, 691 (ehdr << 16) | rte_cpu_to_be_16(tso_header_sz), 692 }; 693 } else { 694 wqe->ctrl = (rte_v128u32_t){ 695 rte_cpu_to_be_32((txq->wqe_ci << 8) | 696 MLX5_OPCODE_SEND), 697 rte_cpu_to_be_32(txq->qp_num_8s | ds), 698 0, 699 0, 700 }; 701 wqe->eseg = (rte_v128u32_t){ 702 0, 703 cs_flags, 704 0, 705 (ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz), 706 }; 707 } 708 next_wqe: 709 txq->wqe_ci += (ds + 3) / 4; 710 /* Save the last successful WQE for completion request */ 711 last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe; 712 #ifdef MLX5_PMD_SOFT_COUNTERS 713 /* Increment sent bytes counter. */ 714 txq->stats.obytes += total_length; 715 #endif 716 } while (i < pkts_n); 717 /* Take a shortcut if nothing must be sent. */ 718 if (unlikely((i + k) == 0)) 719 return 0; 720 txq->elts_head += (i + j); 721 /* Check whether completion threshold has been reached. */ 722 comp = txq->elts_comp + i + j + k; 723 if (comp >= MLX5_TX_COMP_THRESH) { 724 /* Request completion on last WQE. */ 725 last_wqe->ctrl2 = rte_cpu_to_be_32(8); 726 /* Save elts_head in unused "immediate" field of WQE. */ 727 last_wqe->ctrl3 = txq->elts_head; 728 txq->elts_comp = 0; 729 } else { 730 txq->elts_comp = comp; 731 } 732 #ifdef MLX5_PMD_SOFT_COUNTERS 733 /* Increment sent packets counter. */ 734 txq->stats.opackets += i; 735 #endif 736 /* Ring QP doorbell. */ 737 mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe); 738 return i; 739 } 740 741 /** 742 * Open a MPW session. 743 * 744 * @param txq 745 * Pointer to TX queue structure. 746 * @param mpw 747 * Pointer to MPW session structure. 748 * @param length 749 * Packet length. 
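 *
 * A legacy MPW session spans the WQE at the current index plus the next
 * WQEBB and can hold up to MLX5_MPW_DSEG_MAX data segments, as reflected by
 * the dseg[] pointers initialized below.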
750 */ 751 static inline void 752 mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length) 753 { 754 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); 755 volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] = 756 (volatile struct mlx5_wqe_data_seg (*)[]) 757 tx_mlx5_wqe(txq, idx + 1); 758 759 mpw->state = MLX5_MPW_STATE_OPENED; 760 mpw->pkts_n = 0; 761 mpw->len = length; 762 mpw->total_len = 0; 763 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); 764 mpw->wqe->eseg.mss = rte_cpu_to_be_16(length); 765 mpw->wqe->eseg.inline_hdr_sz = 0; 766 mpw->wqe->eseg.rsvd0 = 0; 767 mpw->wqe->eseg.rsvd1 = 0; 768 mpw->wqe->eseg.rsvd2 = 0; 769 mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) | 770 (txq->wqe_ci << 8) | 771 MLX5_OPCODE_TSO); 772 mpw->wqe->ctrl[2] = 0; 773 mpw->wqe->ctrl[3] = 0; 774 mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *) 775 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE)); 776 mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *) 777 (((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE)); 778 mpw->data.dseg[2] = &(*dseg)[0]; 779 mpw->data.dseg[3] = &(*dseg)[1]; 780 mpw->data.dseg[4] = &(*dseg)[2]; 781 } 782 783 /** 784 * Close a MPW session. 785 * 786 * @param txq 787 * Pointer to TX queue structure. 788 * @param mpw 789 * Pointer to MPW session structure. 790 */ 791 static inline void 792 mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw) 793 { 794 unsigned int num = mpw->pkts_n; 795 796 /* 797 * Store size in multiple of 16 bytes. Control and Ethernet segments 798 * count as 2. 799 */ 800 mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num)); 801 mpw->state = MLX5_MPW_STATE_CLOSED; 802 if (num < 3) 803 ++txq->wqe_ci; 804 else 805 txq->wqe_ci += 2; 806 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); 807 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 808 } 809 810 /** 811 * DPDK callback for TX with MPW support. 812 * 813 * @param dpdk_txq 814 * Generic pointer to TX queue structure. 815 * @param[in] pkts 816 * Packets to transmit. 817 * @param pkts_n 818 * Number of packets in array. 819 * 820 * @return 821 * Number of packets successfully transmitted (<= pkts_n). 822 */ 823 uint16_t 824 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 825 { 826 struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; 827 uint16_t elts_head = txq->elts_head; 828 const uint16_t elts_n = 1 << txq->elts_n; 829 const uint16_t elts_m = elts_n - 1; 830 unsigned int i = 0; 831 unsigned int j = 0; 832 uint16_t max_elts; 833 uint16_t max_wqe; 834 unsigned int comp; 835 struct mlx5_mpw mpw = { 836 .state = MLX5_MPW_STATE_CLOSED, 837 }; 838 839 if (unlikely(!pkts_n)) 840 return 0; 841 /* Prefetch first packet cacheline. */ 842 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); 843 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 844 /* Start processing. */ 845 mlx5_tx_complete(txq); 846 max_elts = (elts_n - (elts_head - txq->elts_tail)); 847 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 848 if (unlikely(!max_wqe)) 849 return 0; 850 do { 851 struct rte_mbuf *buf = *(pkts++); 852 uint32_t length; 853 unsigned int segs_n = buf->nb_segs; 854 uint32_t cs_flags = 0; 855 856 /* 857 * Make sure there is enough room to store this packet and 858 * that one ring entry remains unused. 859 */ 860 assert(segs_n); 861 if (max_elts < segs_n) 862 break; 863 /* Do not bother with large packets MPW cannot handle. 
*/ 864 if (segs_n > MLX5_MPW_DSEG_MAX) { 865 txq->stats.oerrors++; 866 break; 867 } 868 max_elts -= segs_n; 869 --pkts_n; 870 /* Should we enable HW CKSUM offload */ 871 if (buf->ol_flags & 872 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) 873 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; 874 /* Retrieve packet information. */ 875 length = PKT_LEN(buf); 876 assert(length); 877 /* Start new session if packet differs. */ 878 if ((mpw.state == MLX5_MPW_STATE_OPENED) && 879 ((mpw.len != length) || 880 (segs_n != 1) || 881 (mpw.wqe->eseg.cs_flags != cs_flags))) 882 mlx5_mpw_close(txq, &mpw); 883 if (mpw.state == MLX5_MPW_STATE_CLOSED) { 884 /* 885 * Multi-Packet WQE consumes at most two WQE. 886 * mlx5_mpw_new() expects to be able to use such 887 * resources. 888 */ 889 if (unlikely(max_wqe < 2)) 890 break; 891 max_wqe -= 2; 892 mlx5_mpw_new(txq, &mpw, length); 893 mpw.wqe->eseg.cs_flags = cs_flags; 894 } 895 /* Multi-segment packets must be alone in their MPW. */ 896 assert((segs_n == 1) || (mpw.pkts_n == 0)); 897 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 898 length = 0; 899 #endif 900 do { 901 volatile struct mlx5_wqe_data_seg *dseg; 902 uintptr_t addr; 903 904 assert(buf); 905 (*txq->elts)[elts_head++ & elts_m] = buf; 906 dseg = mpw.data.dseg[mpw.pkts_n]; 907 addr = rte_pktmbuf_mtod(buf, uintptr_t); 908 *dseg = (struct mlx5_wqe_data_seg){ 909 .byte_count = rte_cpu_to_be_32(DATA_LEN(buf)), 910 .lkey = mlx5_tx_mb2mr(txq, buf), 911 .addr = rte_cpu_to_be_64(addr), 912 }; 913 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 914 length += DATA_LEN(buf); 915 #endif 916 buf = buf->next; 917 ++mpw.pkts_n; 918 ++j; 919 } while (--segs_n); 920 assert(length == mpw.len); 921 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) 922 mlx5_mpw_close(txq, &mpw); 923 #ifdef MLX5_PMD_SOFT_COUNTERS 924 /* Increment sent bytes counter. */ 925 txq->stats.obytes += length; 926 #endif 927 ++i; 928 } while (pkts_n); 929 /* Take a shortcut if nothing must be sent. */ 930 if (unlikely(i == 0)) 931 return 0; 932 /* Check whether completion threshold has been reached. */ 933 /* "j" includes both packets and segments. */ 934 comp = txq->elts_comp + j; 935 if (comp >= MLX5_TX_COMP_THRESH) { 936 volatile struct mlx5_wqe *wqe = mpw.wqe; 937 938 /* Request completion on last WQE. */ 939 wqe->ctrl[2] = rte_cpu_to_be_32(8); 940 /* Save elts_head in unused "immediate" field of WQE. */ 941 wqe->ctrl[3] = elts_head; 942 txq->elts_comp = 0; 943 } else { 944 txq->elts_comp = comp; 945 } 946 #ifdef MLX5_PMD_SOFT_COUNTERS 947 /* Increment sent packets counter. */ 948 txq->stats.opackets += i; 949 #endif 950 /* Ring QP doorbell. */ 951 if (mpw.state == MLX5_MPW_STATE_OPENED) 952 mlx5_mpw_close(txq, &mpw); 953 mlx5_tx_dbrec(txq, mpw.wqe); 954 txq->elts_head = elts_head; 955 return i; 956 } 957 958 /** 959 * Open a MPW inline session. 960 * 961 * @param txq 962 * Pointer to TX queue structure. 963 * @param mpw 964 * Pointer to MPW session structure. 965 * @param length 966 * Packet length. 
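 *
 * Unlike a plain MPW session, an inline session copies packet bytes right
 * after the inline header (mpw->data.raw) instead of referencing mbuf data
 * through data segments.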
967 */ 968 static inline void 969 mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, 970 uint32_t length) 971 { 972 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); 973 struct mlx5_wqe_inl_small *inl; 974 975 mpw->state = MLX5_MPW_INL_STATE_OPENED; 976 mpw->pkts_n = 0; 977 mpw->len = length; 978 mpw->total_len = 0; 979 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); 980 mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) | 981 (txq->wqe_ci << 8) | 982 MLX5_OPCODE_TSO); 983 mpw->wqe->ctrl[2] = 0; 984 mpw->wqe->ctrl[3] = 0; 985 mpw->wqe->eseg.mss = rte_cpu_to_be_16(length); 986 mpw->wqe->eseg.inline_hdr_sz = 0; 987 mpw->wqe->eseg.cs_flags = 0; 988 mpw->wqe->eseg.rsvd0 = 0; 989 mpw->wqe->eseg.rsvd1 = 0; 990 mpw->wqe->eseg.rsvd2 = 0; 991 inl = (struct mlx5_wqe_inl_small *) 992 (((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE); 993 mpw->data.raw = (uint8_t *)&inl->raw; 994 } 995 996 /** 997 * Close a MPW inline session. 998 * 999 * @param txq 1000 * Pointer to TX queue structure. 1001 * @param mpw 1002 * Pointer to MPW session structure. 1003 */ 1004 static inline void 1005 mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw) 1006 { 1007 unsigned int size; 1008 struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *) 1009 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE)); 1010 1011 size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len; 1012 /* 1013 * Store size in multiple of 16 bytes. Control and Ethernet segments 1014 * count as 2. 1015 */ 1016 mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | 1017 MLX5_WQE_DS(size)); 1018 mpw->state = MLX5_MPW_STATE_CLOSED; 1019 inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG); 1020 txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE; 1021 } 1022 1023 /** 1024 * DPDK callback for TX with MPW inline support. 1025 * 1026 * @param dpdk_txq 1027 * Generic pointer to TX queue structure. 1028 * @param[in] pkts 1029 * Packets to transmit. 1030 * @param pkts_n 1031 * Number of packets in array. 1032 * 1033 * @return 1034 * Number of packets successfully transmitted (<= pkts_n). 1035 */ 1036 uint16_t 1037 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, 1038 uint16_t pkts_n) 1039 { 1040 struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; 1041 uint16_t elts_head = txq->elts_head; 1042 const uint16_t elts_n = 1 << txq->elts_n; 1043 const uint16_t elts_m = elts_n - 1; 1044 unsigned int i = 0; 1045 unsigned int j = 0; 1046 uint16_t max_elts; 1047 uint16_t max_wqe; 1048 unsigned int comp; 1049 unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE; 1050 struct mlx5_mpw mpw = { 1051 .state = MLX5_MPW_STATE_CLOSED, 1052 }; 1053 /* 1054 * Compute the maximum number of WQE which can be consumed by inline 1055 * code. 1056 * - 2 DSEG for: 1057 * - 1 control segment, 1058 * - 1 Ethernet segment, 1059 * - N Dseg from the inline request. 1060 */ 1061 const unsigned int wqe_inl_n = 1062 ((2 * MLX5_WQE_DWORD_SIZE + 1063 txq->max_inline * RTE_CACHE_LINE_SIZE) + 1064 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE; 1065 1066 if (unlikely(!pkts_n)) 1067 return 0; 1068 /* Prefetch first packet cacheline. */ 1069 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); 1070 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); 1071 /* Start processing. 
*/ 1072 mlx5_tx_complete(txq); 1073 max_elts = (elts_n - (elts_head - txq->elts_tail)); 1074 do { 1075 struct rte_mbuf *buf = *(pkts++); 1076 uintptr_t addr; 1077 uint32_t length; 1078 unsigned int segs_n = buf->nb_segs; 1079 uint32_t cs_flags = 0; 1080 1081 /* 1082 * Make sure there is enough room to store this packet and 1083 * that one ring entry remains unused. 1084 */ 1085 assert(segs_n); 1086 if (max_elts < segs_n) 1087 break; 1088 /* Do not bother with large packets MPW cannot handle. */ 1089 if (segs_n > MLX5_MPW_DSEG_MAX) { 1090 txq->stats.oerrors++; 1091 break; 1092 } 1093 max_elts -= segs_n; 1094 --pkts_n; 1095 /* 1096 * Compute max_wqe in case less WQE were consumed in previous 1097 * iteration. 1098 */ 1099 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); 1100 /* Should we enable HW CKSUM offload */ 1101 if (buf->ol_flags & 1102 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) 1103 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; 1104 /* Retrieve packet information. */ 1105 length = PKT_LEN(buf); 1106 /* Start new session if packet differs. */ 1107 if (mpw.state == MLX5_MPW_STATE_OPENED) { 1108 if ((mpw.len != length) || 1109 (segs_n != 1) || 1110 (mpw.wqe->eseg.cs_flags != cs_flags)) 1111 mlx5_mpw_close(txq, &mpw); 1112 } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) { 1113 if ((mpw.len != length) || 1114 (segs_n != 1) || 1115 (length > inline_room) || 1116 (mpw.wqe->eseg.cs_flags != cs_flags)) { 1117 mlx5_mpw_inline_close(txq, &mpw); 1118 inline_room = 1119 txq->max_inline * RTE_CACHE_LINE_SIZE; 1120 } 1121 } 1122 if (mpw.state == MLX5_MPW_STATE_CLOSED) { 1123 if ((segs_n != 1) || 1124 (length > inline_room)) { 1125 /* 1126 * Multi-Packet WQE consumes at most two WQE. 1127 * mlx5_mpw_new() expects to be able to use 1128 * such resources. 1129 */ 1130 if (unlikely(max_wqe < 2)) 1131 break; 1132 max_wqe -= 2; 1133 mlx5_mpw_new(txq, &mpw, length); 1134 mpw.wqe->eseg.cs_flags = cs_flags; 1135 } else { 1136 if (unlikely(max_wqe < wqe_inl_n)) 1137 break; 1138 max_wqe -= wqe_inl_n; 1139 mlx5_mpw_inline_new(txq, &mpw, length); 1140 mpw.wqe->eseg.cs_flags = cs_flags; 1141 } 1142 } 1143 /* Multi-segment packets must be alone in their MPW. */ 1144 assert((segs_n == 1) || (mpw.pkts_n == 0)); 1145 if (mpw.state == MLX5_MPW_STATE_OPENED) { 1146 assert(inline_room == 1147 txq->max_inline * RTE_CACHE_LINE_SIZE); 1148 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1149 length = 0; 1150 #endif 1151 do { 1152 volatile struct mlx5_wqe_data_seg *dseg; 1153 1154 assert(buf); 1155 (*txq->elts)[elts_head++ & elts_m] = buf; 1156 dseg = mpw.data.dseg[mpw.pkts_n]; 1157 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1158 *dseg = (struct mlx5_wqe_data_seg){ 1159 .byte_count = 1160 rte_cpu_to_be_32(DATA_LEN(buf)), 1161 .lkey = mlx5_tx_mb2mr(txq, buf), 1162 .addr = rte_cpu_to_be_64(addr), 1163 }; 1164 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1165 length += DATA_LEN(buf); 1166 #endif 1167 buf = buf->next; 1168 ++mpw.pkts_n; 1169 ++j; 1170 } while (--segs_n); 1171 assert(length == mpw.len); 1172 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) 1173 mlx5_mpw_close(txq, &mpw); 1174 } else { 1175 unsigned int max; 1176 1177 assert(mpw.state == MLX5_MPW_INL_STATE_OPENED); 1178 assert(length <= inline_room); 1179 assert(length == DATA_LEN(buf)); 1180 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1181 (*txq->elts)[elts_head++ & elts_m] = buf; 1182 /* Maximum number of bytes before wrapping. 
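			 * i.e. the tailroom between the current copy position
			 * and the end of the WQ ring. Data beyond this point
			 * wraps around to the ring base (txq->wqes) below.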
			 */
			max = ((((uintptr_t)(txq->wqes)) +
				(1 << txq->wqe_n) *
				MLX5_WQE_SIZE) -
			       (uintptr_t)mpw.data.raw);
			if (length > max) {
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)addr,
					   max);
				mpw.data.raw = (volatile void *)txq->wqes;
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)(addr + max),
					   length - max);
				mpw.data.raw += length - max;
			} else {
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)addr,
					   length);

				if (length == max)
					mpw.data.raw =
						(volatile void *)txq->wqes;
				else
					mpw.data.raw += length;
			}
			++mpw.pkts_n;
			mpw.total_len += length;
			++j;
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
				mlx5_mpw_inline_close(txq, &mpw);
				inline_room =
					txq->max_inline * RTE_CACHE_LINE_SIZE;
			} else {
				inline_room -= length;
			}
		}
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile struct mlx5_wqe *wqe = mpw.wqe;

		/* Request completion on last WQE. */
		wqe->ctrl[2] = rte_cpu_to_be_32(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
		mlx5_mpw_inline_close(txq, &mpw);
	else if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq, mpw.wqe);
	txq->elts_head = elts_head;
	return i;
}

/**
 * Open an Enhanced MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 * @param padding
 *   When non-zero, pad the beginning of the session with a zero-length
 *   inline header so that packet data starts on the next WQEBB.
 */
static inline void
mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
{
	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);

	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
	mpw->pkts_n = 0;
	mpw->total_len = sizeof(struct mlx5_wqe);
	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
	mpw->wqe->ctrl[0] =
		rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
				 (txq->wqe_ci << 8) |
				 MLX5_OPCODE_ENHANCED_MPSW);
	mpw->wqe->ctrl[2] = 0;
	mpw->wqe->ctrl[3] = 0;
	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
	if (unlikely(padding)) {
		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);

		/* Pad the first 2 DWORDs with zero-length inline header. */
		*(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
			rte_cpu_to_be_32(MLX5_INLINE_SEG);
		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
		/* Start from the next WQEBB. */
		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
	} else {
		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
	}
}

/**
 * Close an Enhanced MPW session.
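 *
 * Stores the session size (in 16-byte units) in ctrl[1] and advances wqe_ci
 * by the number of WQEBBs the session consumed.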
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 *
 * @return
 *   Number of consumed WQEs.
 */
static inline uint16_t
mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
{
	uint16_t ret;

	/* Store size in multiple of 16 bytes. Control and Ethernet segments
	 * count as 2.
	 */
	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
					     MLX5_WQE_DS(mpw->total_len));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
	txq->wqe_ci += ret;
	return ret;
}

/**
 * DPDK callback for TX with Enhanced MPW support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	unsigned int i = 0;
	unsigned int j = 0;
	uint16_t max_elts;
	uint16_t max_wqe;
	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
	unsigned int mpw_room = 0;
	unsigned int inl_pad = 0;
	uint32_t inl_hdr;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	};

	if (unlikely(!pkts_n))
		return 0;
	/* Start processing. */
	mlx5_tx_complete(txq);
	max_elts = (elts_n - (elts_head - txq->elts_tail));
	/* A CQE slot must always be available. */
	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
	if (unlikely(!max_wqe))
		return 0;
	do {
		struct rte_mbuf *buf = *(pkts++);
		uintptr_t addr;
		uint64_t naddr;
		unsigned int n;
		unsigned int do_inline = 0; /* Whether inline is possible. */
		uint32_t length;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max_elts - j < segs_n)
			break;
		/* Do not bother with large packets MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX) {
			txq->stats.oerrors++;
			break;
		}
		/* Should we enable HW CKSUM offload. */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		/* Start new session if:
		 * - multi-segment packet
		 * - no space left even for a dseg
		 * - next packet can be inlined with a new WQE
		 * - cs_flag differs
		 * The state cannot be MLX5_MPW_STATE_OPENED here, as a legacy
		 * MPW session always holds a single multi-segmented packet
		 * and is closed right after it is posted.
1400 */ 1401 if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) { 1402 if ((segs_n != 1) || 1403 (inl_pad + sizeof(struct mlx5_wqe_data_seg) > 1404 mpw_room) || 1405 (length <= txq->inline_max_packet_sz && 1406 inl_pad + sizeof(inl_hdr) + length > 1407 mpw_room) || 1408 (mpw.wqe->eseg.cs_flags != cs_flags)) 1409 max_wqe -= mlx5_empw_close(txq, &mpw); 1410 } 1411 if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) { 1412 if (unlikely(segs_n != 1)) { 1413 /* Fall back to legacy MPW. 1414 * A MPW session consumes 2 WQEs at most to 1415 * include MLX5_MPW_DSEG_MAX pointers. 1416 */ 1417 if (unlikely(max_wqe < 2)) 1418 break; 1419 mlx5_mpw_new(txq, &mpw, length); 1420 } else { 1421 /* In Enhanced MPW, inline as much as the budget 1422 * is allowed. The remaining space is to be 1423 * filled with dsegs. If the title WQEBB isn't 1424 * padded, it will have 2 dsegs there. 1425 */ 1426 mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX, 1427 (max_inline ? max_inline : 1428 pkts_n * MLX5_WQE_DWORD_SIZE) + 1429 MLX5_WQE_SIZE); 1430 if (unlikely(max_wqe * MLX5_WQE_SIZE < 1431 mpw_room)) 1432 break; 1433 /* Don't pad the title WQEBB to not waste WQ. */ 1434 mlx5_empw_new(txq, &mpw, 0); 1435 mpw_room -= mpw.total_len; 1436 inl_pad = 0; 1437 do_inline = 1438 length <= txq->inline_max_packet_sz && 1439 sizeof(inl_hdr) + length <= mpw_room && 1440 !txq->mpw_hdr_dseg; 1441 } 1442 mpw.wqe->eseg.cs_flags = cs_flags; 1443 } else { 1444 /* Evaluate whether the next packet can be inlined. 1445 * Inlininig is possible when: 1446 * - length is less than configured value 1447 * - length fits for remaining space 1448 * - not required to fill the title WQEBB with dsegs 1449 */ 1450 do_inline = 1451 length <= txq->inline_max_packet_sz && 1452 inl_pad + sizeof(inl_hdr) + length <= 1453 mpw_room && 1454 (!txq->mpw_hdr_dseg || 1455 mpw.total_len >= MLX5_WQE_SIZE); 1456 } 1457 /* Multi-segment packets must be alone in their MPW. */ 1458 assert((segs_n == 1) || (mpw.pkts_n == 0)); 1459 if (unlikely(mpw.state == MLX5_MPW_STATE_OPENED)) { 1460 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1461 length = 0; 1462 #endif 1463 do { 1464 volatile struct mlx5_wqe_data_seg *dseg; 1465 1466 assert(buf); 1467 (*txq->elts)[elts_head++ & elts_m] = buf; 1468 dseg = mpw.data.dseg[mpw.pkts_n]; 1469 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1470 *dseg = (struct mlx5_wqe_data_seg){ 1471 .byte_count = rte_cpu_to_be_32( 1472 DATA_LEN(buf)), 1473 .lkey = mlx5_tx_mb2mr(txq, buf), 1474 .addr = rte_cpu_to_be_64(addr), 1475 }; 1476 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) 1477 length += DATA_LEN(buf); 1478 #endif 1479 buf = buf->next; 1480 ++j; 1481 ++mpw.pkts_n; 1482 } while (--segs_n); 1483 /* A multi-segmented packet takes one MPW session. 1484 * TODO: Pack more multi-segmented packets if possible. 1485 */ 1486 mlx5_mpw_close(txq, &mpw); 1487 if (mpw.pkts_n < 3) 1488 max_wqe--; 1489 else 1490 max_wqe -= 2; 1491 } else if (do_inline) { 1492 /* Inline packet into WQE. */ 1493 unsigned int max; 1494 1495 assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED); 1496 assert(length == DATA_LEN(buf)); 1497 inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG); 1498 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1499 mpw.data.raw = (volatile void *) 1500 ((uintptr_t)mpw.data.raw + inl_pad); 1501 max = tx_mlx5_wq_tailroom(txq, 1502 (void *)(uintptr_t)mpw.data.raw); 1503 /* Copy inline header. 
*/ 1504 mpw.data.raw = (volatile void *) 1505 mlx5_copy_to_wq( 1506 (void *)(uintptr_t)mpw.data.raw, 1507 &inl_hdr, 1508 sizeof(inl_hdr), 1509 (void *)(uintptr_t)txq->wqes, 1510 max); 1511 max = tx_mlx5_wq_tailroom(txq, 1512 (void *)(uintptr_t)mpw.data.raw); 1513 /* Copy packet data. */ 1514 mpw.data.raw = (volatile void *) 1515 mlx5_copy_to_wq( 1516 (void *)(uintptr_t)mpw.data.raw, 1517 (void *)addr, 1518 length, 1519 (void *)(uintptr_t)txq->wqes, 1520 max); 1521 ++mpw.pkts_n; 1522 mpw.total_len += (inl_pad + sizeof(inl_hdr) + length); 1523 /* No need to get completion as the entire packet is 1524 * copied to WQ. Free the buf right away. 1525 */ 1526 rte_pktmbuf_free_seg(buf); 1527 mpw_room -= (inl_pad + sizeof(inl_hdr) + length); 1528 /* Add pad in the next packet if any. */ 1529 inl_pad = (((uintptr_t)mpw.data.raw + 1530 (MLX5_WQE_DWORD_SIZE - 1)) & 1531 ~(MLX5_WQE_DWORD_SIZE - 1)) - 1532 (uintptr_t)mpw.data.raw; 1533 } else { 1534 /* No inline. Load a dseg of packet pointer. */ 1535 volatile rte_v128u32_t *dseg; 1536 1537 assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED); 1538 assert((inl_pad + sizeof(*dseg)) <= mpw_room); 1539 assert(length == DATA_LEN(buf)); 1540 if (!tx_mlx5_wq_tailroom(txq, 1541 (void *)((uintptr_t)mpw.data.raw 1542 + inl_pad))) 1543 dseg = (volatile void *)txq->wqes; 1544 else 1545 dseg = (volatile void *) 1546 ((uintptr_t)mpw.data.raw + 1547 inl_pad); 1548 (*txq->elts)[elts_head++ & elts_m] = buf; 1549 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1550 for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++) 1551 rte_prefetch2((void *)(addr + 1552 n * RTE_CACHE_LINE_SIZE)); 1553 naddr = rte_cpu_to_be_64(addr); 1554 *dseg = (rte_v128u32_t) { 1555 rte_cpu_to_be_32(length), 1556 mlx5_tx_mb2mr(txq, buf), 1557 naddr, 1558 naddr >> 32, 1559 }; 1560 mpw.data.raw = (volatile void *)(dseg + 1); 1561 mpw.total_len += (inl_pad + sizeof(*dseg)); 1562 ++j; 1563 ++mpw.pkts_n; 1564 mpw_room -= (inl_pad + sizeof(*dseg)); 1565 inl_pad = 0; 1566 } 1567 #ifdef MLX5_PMD_SOFT_COUNTERS 1568 /* Increment sent bytes counter. */ 1569 txq->stats.obytes += length; 1570 #endif 1571 ++i; 1572 } while (i < pkts_n); 1573 /* Take a shortcut if nothing must be sent. */ 1574 if (unlikely(i == 0)) 1575 return 0; 1576 /* Check whether completion threshold has been reached. */ 1577 if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH || 1578 (uint16_t)(txq->wqe_ci - txq->mpw_comp) >= 1579 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) { 1580 volatile struct mlx5_wqe *wqe = mpw.wqe; 1581 1582 /* Request completion on last WQE. */ 1583 wqe->ctrl[2] = rte_cpu_to_be_32(8); 1584 /* Save elts_head in unused "immediate" field of WQE. */ 1585 wqe->ctrl[3] = elts_head; 1586 txq->elts_comp = 0; 1587 txq->mpw_comp = txq->wqe_ci; 1588 txq->cq_pi++; 1589 } else { 1590 txq->elts_comp += j; 1591 } 1592 #ifdef MLX5_PMD_SOFT_COUNTERS 1593 /* Increment sent packets counter. */ 1594 txq->stats.opackets += i; 1595 #endif 1596 if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) 1597 mlx5_empw_close(txq, &mpw); 1598 else if (mpw.state == MLX5_MPW_STATE_OPENED) 1599 mlx5_mpw_close(txq, &mpw); 1600 /* Ring QP doorbell. */ 1601 mlx5_tx_dbrec(txq, mpw.wqe); 1602 txq->elts_head = elts_head; 1603 return i; 1604 } 1605 1606 /** 1607 * Translate RX completion flags to packet type. 1608 * 1609 * @param[in] cqe 1610 * Pointer to CQE. 1611 * 1612 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 1613 * 1614 * @return 1615 * Packet type for struct rte_mbuf. 
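 *
 * For example, index 0x06 (l3_hdr_type = 2, l4_hdr_type = 1, no fragment,
 * no tunnel) maps to RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 * RTE_PTYPE_L4_TCP in mlx5_set_ptype_table().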
1616 */ 1617 static inline uint32_t 1618 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) 1619 { 1620 uint8_t idx; 1621 uint8_t pinfo = cqe->pkt_info; 1622 uint16_t ptype = cqe->hdr_type_etc; 1623 1624 /* 1625 * The index to the array should have: 1626 * bit[1:0] = l3_hdr_type 1627 * bit[4:2] = l4_hdr_type 1628 * bit[5] = ip_frag 1629 * bit[6] = tunneled 1630 * bit[7] = outer_l3_type 1631 */ 1632 idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); 1633 return mlx5_ptype_table[idx]; 1634 } 1635 1636 /** 1637 * Get size of the next packet for a given CQE. For compressed CQEs, the 1638 * consumer index is updated only once all packets of the current one have 1639 * been processed. 1640 * 1641 * @param rxq 1642 * Pointer to RX queue. 1643 * @param cqe 1644 * CQE to process. 1645 * @param[out] rss_hash 1646 * Packet RSS Hash result. 1647 * 1648 * @return 1649 * Packet size in bytes (0 if there is none), -1 in case of completion 1650 * with error. 1651 */ 1652 static inline int 1653 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 1654 uint16_t cqe_cnt, uint32_t *rss_hash) 1655 { 1656 struct rxq_zip *zip = &rxq->zip; 1657 uint16_t cqe_n = cqe_cnt + 1; 1658 int len = 0; 1659 uint16_t idx, end; 1660 1661 /* Process compressed data in the CQE and mini arrays. */ 1662 if (zip->ai) { 1663 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1664 (volatile struct mlx5_mini_cqe8 (*)[8]) 1665 (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info); 1666 1667 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); 1668 *rss_hash = rte_be_to_cpu_32((*mc)[zip->ai & 7].rx_hash_result); 1669 if ((++zip->ai & 7) == 0) { 1670 /* Invalidate consumed CQEs */ 1671 idx = zip->ca; 1672 end = zip->na; 1673 while (idx != end) { 1674 (*rxq->cqes)[idx & cqe_cnt].op_own = 1675 MLX5_CQE_INVALIDATE; 1676 ++idx; 1677 } 1678 /* 1679 * Increment consumer index to skip the number of 1680 * CQEs consumed. Hardware leaves holes in the CQ 1681 * ring for software use. 1682 */ 1683 zip->ca = zip->na; 1684 zip->na += 8; 1685 } 1686 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 1687 /* Invalidate the rest */ 1688 idx = zip->ca; 1689 end = zip->cq_ci; 1690 1691 while (idx != end) { 1692 (*rxq->cqes)[idx & cqe_cnt].op_own = 1693 MLX5_CQE_INVALIDATE; 1694 ++idx; 1695 } 1696 rxq->cq_ci = zip->cq_ci; 1697 zip->ai = 0; 1698 } 1699 /* No compressed data, get next CQE and verify if it is compressed. */ 1700 } else { 1701 int ret; 1702 int8_t op_own; 1703 1704 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 1705 if (unlikely(ret == 1)) 1706 return 0; 1707 ++rxq->cq_ci; 1708 op_own = cqe->op_own; 1709 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 1710 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1711 (volatile struct mlx5_mini_cqe8 (*)[8]) 1712 (uintptr_t)(&(*rxq->cqes)[rxq->cq_ci & 1713 cqe_cnt].pkt_info); 1714 1715 /* Fix endianness. */ 1716 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); 1717 /* 1718 * Current mini array position is the one returned by 1719 * check_cqe64(). 1720 * 1721 * If completion comprises several mini arrays, as a 1722 * special case the second one is located 7 CQEs after 1723 * the initial CQE instead of 8 for subsequent ones. 1724 */ 1725 zip->ca = rxq->cq_ci; 1726 zip->na = zip->ca + 7; 1727 /* Compute the next non compressed CQE. */ 1728 --rxq->cq_ci; 1729 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 1730 /* Get packet size to return. 
*/ 1731 len = rte_be_to_cpu_32((*mc)[0].byte_cnt); 1732 *rss_hash = rte_be_to_cpu_32((*mc)[0].rx_hash_result); 1733 zip->ai = 1; 1734 /* Prefetch all the entries to be invalidated */ 1735 idx = zip->ca; 1736 end = zip->cq_ci; 1737 while (idx != end) { 1738 rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]); 1739 ++idx; 1740 } 1741 } else { 1742 len = rte_be_to_cpu_32(cqe->byte_cnt); 1743 *rss_hash = rte_be_to_cpu_32(cqe->rx_hash_res); 1744 } 1745 /* Error while receiving packet. */ 1746 if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR)) 1747 return -1; 1748 } 1749 return len; 1750 } 1751 1752 /** 1753 * Translate RX completion flags to offload flags. 1754 * 1755 * @param[in] rxq 1756 * Pointer to RX queue structure. 1757 * @param[in] cqe 1758 * Pointer to CQE. 1759 * 1760 * @return 1761 * Offload flags (ol_flags) for struct rte_mbuf. 1762 */ 1763 static inline uint32_t 1764 rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) 1765 { 1766 uint32_t ol_flags = 0; 1767 uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); 1768 1769 ol_flags = 1770 TRANSPOSE(flags, 1771 MLX5_CQE_RX_L3_HDR_VALID, 1772 PKT_RX_IP_CKSUM_GOOD) | 1773 TRANSPOSE(flags, 1774 MLX5_CQE_RX_L4_HDR_VALID, 1775 PKT_RX_L4_CKSUM_GOOD); 1776 if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) 1777 ol_flags |= 1778 TRANSPOSE(flags, 1779 MLX5_CQE_RX_L3_HDR_VALID, 1780 PKT_RX_IP_CKSUM_GOOD) | 1781 TRANSPOSE(flags, 1782 MLX5_CQE_RX_L4_HDR_VALID, 1783 PKT_RX_L4_CKSUM_GOOD); 1784 return ol_flags; 1785 } 1786 1787 /** 1788 * DPDK callback for RX. 1789 * 1790 * @param dpdk_rxq 1791 * Generic pointer to RX queue structure. 1792 * @param[out] pkts 1793 * Array to store received packets. 1794 * @param pkts_n 1795 * Maximum number of packets in array. 1796 * 1797 * @return 1798 * Number of packets successfully received (<= pkts_n). 1799 */ 1800 uint16_t 1801 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1802 { 1803 struct mlx5_rxq_data *rxq = dpdk_rxq; 1804 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 1805 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 1806 const unsigned int sges_n = rxq->sges_n; 1807 struct rte_mbuf *pkt = NULL; 1808 struct rte_mbuf *seg = NULL; 1809 volatile struct mlx5_cqe *cqe = 1810 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1811 unsigned int i = 0; 1812 unsigned int rq_ci = rxq->rq_ci << sges_n; 1813 int len = 0; /* keep its value across iterations. */ 1814 1815 while (pkts_n) { 1816 unsigned int idx = rq_ci & wqe_cnt; 1817 volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx]; 1818 struct rte_mbuf *rep = (*rxq->elts)[idx]; 1819 uint32_t rss_hash_res = 0; 1820 1821 if (pkt) 1822 NEXT(seg) = rep; 1823 seg = rep; 1824 rte_prefetch0(seg); 1825 rte_prefetch0(cqe); 1826 rte_prefetch0(wqe); 1827 rep = rte_mbuf_raw_alloc(rxq->mp); 1828 if (unlikely(rep == NULL)) { 1829 ++rxq->stats.rx_nombuf; 1830 if (!pkt) { 1831 /* 1832 * no buffers before we even started, 1833 * bail out silently. 1834 */ 1835 break; 1836 } 1837 while (pkt != seg) { 1838 assert(pkt != (*rxq->elts)[idx]); 1839 rep = NEXT(pkt); 1840 NEXT(pkt) = NULL; 1841 NB_SEGS(pkt) = 1; 1842 rte_mbuf_raw_free(pkt); 1843 pkt = rep; 1844 } 1845 break; 1846 } 1847 if (!pkt) { 1848 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1849 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, 1850 &rss_hash_res); 1851 if (!len) { 1852 rte_mbuf_raw_free(rep); 1853 break; 1854 } 1855 if (unlikely(len == -1)) { 1856 /* RX error, packet is likely too large. 
*/ 1857 rte_mbuf_raw_free(rep); 1858 ++rxq->stats.idropped; 1859 goto skip; 1860 } 1861 pkt = seg; 1862 assert(len >= (rxq->crc_present << 2)); 1863 /* Update packet information. */ 1864 pkt->packet_type = rxq_cq_to_pkt_type(cqe); 1865 pkt->ol_flags = 0; 1866 if (rss_hash_res && rxq->rss_hash) { 1867 pkt->hash.rss = rss_hash_res; 1868 pkt->ol_flags = PKT_RX_RSS_HASH; 1869 } 1870 if (rxq->mark && 1871 MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { 1872 pkt->ol_flags |= PKT_RX_FDIR; 1873 if (cqe->sop_drop_qpn != 1874 rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { 1875 uint32_t mark = cqe->sop_drop_qpn; 1876 1877 pkt->ol_flags |= PKT_RX_FDIR_ID; 1878 pkt->hash.fdir.hi = 1879 mlx5_flow_mark_get(mark); 1880 } 1881 } 1882 if (rxq->csum | rxq->csum_l2tun) 1883 pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe); 1884 if (rxq->vlan_strip && 1885 (cqe->hdr_type_etc & 1886 rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { 1887 pkt->ol_flags |= PKT_RX_VLAN | 1888 PKT_RX_VLAN_STRIPPED; 1889 pkt->vlan_tci = 1890 rte_be_to_cpu_16(cqe->vlan_info); 1891 } 1892 if (rxq->hw_timestamp) { 1893 pkt->timestamp = 1894 rte_be_to_cpu_64(cqe->timestamp); 1895 pkt->ol_flags |= PKT_RX_TIMESTAMP; 1896 } 1897 if (rxq->crc_present) 1898 len -= ETHER_CRC_LEN; 1899 PKT_LEN(pkt) = len; 1900 } 1901 DATA_LEN(rep) = DATA_LEN(seg); 1902 PKT_LEN(rep) = PKT_LEN(seg); 1903 SET_DATA_OFF(rep, DATA_OFF(seg)); 1904 PORT(rep) = PORT(seg); 1905 (*rxq->elts)[idx] = rep; 1906 /* 1907 * Fill NIC descriptor with the new buffer. The lkey and size 1908 * of the buffers are already known, only the buffer address 1909 * changes. 1910 */ 1911 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); 1912 if (len > DATA_LEN(seg)) { 1913 len -= DATA_LEN(seg); 1914 ++NB_SEGS(pkt); 1915 ++rq_ci; 1916 continue; 1917 } 1918 DATA_LEN(seg) = len; 1919 #ifdef MLX5_PMD_SOFT_COUNTERS 1920 /* Increment bytes counter. */ 1921 rxq->stats.ibytes += PKT_LEN(pkt); 1922 #endif 1923 /* Return packet. */ 1924 *(pkts++) = pkt; 1925 pkt = NULL; 1926 --pkts_n; 1927 ++i; 1928 skip: 1929 /* Align consumer index to the next stride. */ 1930 rq_ci >>= sges_n; 1931 ++rq_ci; 1932 rq_ci <<= sges_n; 1933 } 1934 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) 1935 return 0; 1936 /* Update the consumer index. */ 1937 rxq->rq_ci = rq_ci >> sges_n; 1938 rte_io_wmb(); 1939 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1940 rte_io_wmb(); 1941 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1942 #ifdef MLX5_PMD_SOFT_COUNTERS 1943 /* Increment packets counter. */ 1944 rxq->stats.ipackets += i; 1945 #endif 1946 return i; 1947 } 1948 1949 /** 1950 * Dummy DPDK callback for TX. 1951 * 1952 * This function is used to temporarily replace the real callback during 1953 * unsafe control operations on the queue, or in case of error. 1954 * 1955 * @param dpdk_txq 1956 * Generic pointer to TX queue structure. 1957 * @param[in] pkts 1958 * Packets to transmit. 1959 * @param pkts_n 1960 * Number of packets in array. 1961 * 1962 * @return 1963 * Number of packets successfully transmitted (<= pkts_n). 1964 */ 1965 uint16_t 1966 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 1967 { 1968 (void)dpdk_txq; 1969 (void)pkts; 1970 (void)pkts_n; 1971 return 0; 1972 } 1973 1974 /** 1975 * Dummy DPDK callback for RX. 1976 * 1977 * This function is used to temporarily replace the real callback during 1978 * unsafe control operations on the queue, or in case of error. 1979 * 1980 * @param dpdk_rxq 1981 * Generic pointer to RX queue structure. 
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/*
 * Vectorized Rx/Tx routines are not compiled in when required vector
 * instructions are not supported on a target architecture. The following null
 * stubs are needed for linkage when those are not included outside of this
 * file (e.g. mlx5_rxtx_vec_sse.c for x86).
 */

uint16_t __attribute__((weak))
mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

uint16_t __attribute__((weak))
mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

uint16_t __attribute__((weak))
mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

int __attribute__((weak))
priv_check_raw_vec_tx_support(struct priv *priv)
{
	(void)priv;
	return -ENOTSUP;
}

int __attribute__((weak))
priv_check_vec_tx_support(struct priv *priv)
{
	(void)priv;
	return -ENOTSUP;
}

int __attribute__((weak))
rxq_check_vec_support(struct mlx5_rxq_data *rxq)
{
	(void)rxq;
	return -ENOTSUP;
}

int __attribute__((weak))
priv_check_vec_rx_support(struct priv *priv)
{
	(void)priv;
	return -ENOTSUP;
}