/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015-2019 Mellanox Technologies, Ltd
 */

#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#include <rte_cycles.h>
#include <rte_flow.h>

#include <mlx5_devx_cmds.h>
#include <mlx5_prm.h>
#include <mlx5_common.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"

/* TX burst subroutines return codes. */
enum mlx5_txcmp_code {
	MLX5_TXCMP_CODE_EXIT = 0,
	MLX5_TXCMP_CODE_ERROR,
	MLX5_TXCMP_CODE_SINGLE,
	MLX5_TXCMP_CODE_MULTI,
	MLX5_TXCMP_CODE_TSO,
	MLX5_TXCMP_CODE_EMPW,
};

/*
 * These defines are used to configure the Tx burst routine option set
 * supported at compile time. Options that are not specified are optimized
 * out, because the related if conditions can be evaluated at compile time.
 * Offloads with a bigger runtime check overhead (i.e. requiring more CPU
 * cycles to skip) should have a bigger index - this is needed to select
 * the better matching routine when there is no exact match and some
 * offloads are not actually requested.
 */
#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/
#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/
#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/
#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/
#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/
#define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported.*/

/* The most common offloads groups.
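 *
 * Illustrative sketch (comment only, not compiled): a routine specialized
 * for the FULL group defined below could be declared as
 * MLX5_TXOFF_DECL(full, MLX5_TXOFF_CONFIG_FULL), which expands roughly to
 * the wrapper shown here, so every MLX5_TXOFF_CONFIG() test inside
 * mlx5_tx_burst_tmpl() becomes a compile-time constant and the unused
 * branches are removed:
 *
 *   static uint16_t mlx5_tx_burst_full(void *txq,
 *                                      struct rte_mbuf **pkts,
 *                                      uint16_t pkts_n)
 *   {
 *           return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq,
 *                                     pkts, pkts_n,
 *                                     (MLX5_TXOFF_CONFIG_FULL));
 *   }
 *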
*/ 70 #define MLX5_TXOFF_CONFIG_NONE 0 71 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \ 72 MLX5_TXOFF_CONFIG_TSO | \ 73 MLX5_TXOFF_CONFIG_SWP | \ 74 MLX5_TXOFF_CONFIG_CSUM | \ 75 MLX5_TXOFF_CONFIG_INLINE | \ 76 MLX5_TXOFF_CONFIG_VLAN | \ 77 MLX5_TXOFF_CONFIG_METADATA) 78 79 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask) 80 81 #define MLX5_TXOFF_DECL(func, olx) \ 82 static uint16_t mlx5_tx_burst_##func(void *txq, \ 83 struct rte_mbuf **pkts, \ 84 uint16_t pkts_n) \ 85 { \ 86 return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \ 87 pkts, pkts_n, (olx)); \ 88 } 89 90 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx}, 91 92 static __rte_always_inline uint32_t 93 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); 94 95 static __rte_always_inline int 96 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 97 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe); 98 99 static __rte_always_inline uint32_t 100 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); 101 102 static __rte_always_inline void 103 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 104 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); 105 106 static __rte_always_inline void 107 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 108 const unsigned int strd_n); 109 110 static int 111 mlx5_queue_state_modify(struct rte_eth_dev *dev, 112 struct mlx5_mp_arg_queue_state_modify *sm); 113 114 static inline void 115 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, 116 volatile struct mlx5_cqe *restrict cqe, 117 uint32_t phcsum); 118 119 static inline void 120 mlx5_lro_update_hdr(uint8_t *restrict padd, 121 volatile struct mlx5_cqe *restrict cqe, 122 uint32_t len); 123 124 uint32_t mlx5_ptype_table[] __rte_cache_aligned = { 125 [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ 126 }; 127 128 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned; 129 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned; 130 131 uint64_t rte_net_mlx5_dynf_inline_mask; 132 #define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask 133 134 /** 135 * Build a table to translate Rx completion flags to packet type. 136 * 137 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 138 */ 139 void 140 mlx5_set_ptype_table(void) 141 { 142 unsigned int i; 143 uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table; 144 145 /* Last entry must not be overwritten, reserved for errored packet. 
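 *
 * Illustrative note (hypothetical lookup): the table is indexed by bits
 * extracted from the CQE as described below, e.g. entry 0x06 is filled as
 * RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
 * so translating a completion on the hot path is a single table load:
 *
 *   pkt->packet_type = mlx5_ptype_table[idx];
 *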
*/ 146 for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) 147 (*p)[i] = RTE_PTYPE_UNKNOWN; 148 /* 149 * The index to the array should have: 150 * bit[1:0] = l3_hdr_type 151 * bit[4:2] = l4_hdr_type 152 * bit[5] = ip_frag 153 * bit[6] = tunneled 154 * bit[7] = outer_l3_type 155 */ 156 /* L2 */ 157 (*p)[0x00] = RTE_PTYPE_L2_ETHER; 158 /* L3 */ 159 (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 160 RTE_PTYPE_L4_NONFRAG; 161 (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 162 RTE_PTYPE_L4_NONFRAG; 163 /* Fragmented */ 164 (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 165 RTE_PTYPE_L4_FRAG; 166 (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 167 RTE_PTYPE_L4_FRAG; 168 /* TCP */ 169 (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 170 RTE_PTYPE_L4_TCP; 171 (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 172 RTE_PTYPE_L4_TCP; 173 (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 174 RTE_PTYPE_L4_TCP; 175 (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 176 RTE_PTYPE_L4_TCP; 177 (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 178 RTE_PTYPE_L4_TCP; 179 (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 180 RTE_PTYPE_L4_TCP; 181 /* UDP */ 182 (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 183 RTE_PTYPE_L4_UDP; 184 (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 185 RTE_PTYPE_L4_UDP; 186 /* Repeat with outer_l3_type being set. Just in case. */ 187 (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 188 RTE_PTYPE_L4_NONFRAG; 189 (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 190 RTE_PTYPE_L4_NONFRAG; 191 (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 192 RTE_PTYPE_L4_FRAG; 193 (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 194 RTE_PTYPE_L4_FRAG; 195 (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 196 RTE_PTYPE_L4_TCP; 197 (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 198 RTE_PTYPE_L4_TCP; 199 (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 200 RTE_PTYPE_L4_TCP; 201 (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 202 RTE_PTYPE_L4_TCP; 203 (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 204 RTE_PTYPE_L4_TCP; 205 (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 206 RTE_PTYPE_L4_TCP; 207 (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 208 RTE_PTYPE_L4_UDP; 209 (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 210 RTE_PTYPE_L4_UDP; 211 /* Tunneled - L3 */ 212 (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; 213 (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 214 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 215 RTE_PTYPE_INNER_L4_NONFRAG; 216 (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 217 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 218 RTE_PTYPE_INNER_L4_NONFRAG; 219 (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; 220 (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 221 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 222 RTE_PTYPE_INNER_L4_NONFRAG; 223 (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 224 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 225 RTE_PTYPE_INNER_L4_NONFRAG; 226 /* Tunneled - Fragmented */ 227 (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 228 
RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 229 RTE_PTYPE_INNER_L4_FRAG; 230 (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 231 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 232 RTE_PTYPE_INNER_L4_FRAG; 233 (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 234 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 235 RTE_PTYPE_INNER_L4_FRAG; 236 (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 237 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 238 RTE_PTYPE_INNER_L4_FRAG; 239 /* Tunneled - TCP */ 240 (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 241 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 242 RTE_PTYPE_INNER_L4_TCP; 243 (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 244 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 245 RTE_PTYPE_INNER_L4_TCP; 246 (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 247 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 248 RTE_PTYPE_INNER_L4_TCP; 249 (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 250 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 251 RTE_PTYPE_INNER_L4_TCP; 252 (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 253 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 254 RTE_PTYPE_INNER_L4_TCP; 255 (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 256 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 257 RTE_PTYPE_INNER_L4_TCP; 258 (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 259 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 260 RTE_PTYPE_INNER_L4_TCP; 261 (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 262 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 263 RTE_PTYPE_INNER_L4_TCP; 264 (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 265 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 266 RTE_PTYPE_INNER_L4_TCP; 267 (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 268 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 269 RTE_PTYPE_INNER_L4_TCP; 270 (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 271 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 272 RTE_PTYPE_INNER_L4_TCP; 273 (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 274 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 275 RTE_PTYPE_INNER_L4_TCP; 276 /* Tunneled - UDP */ 277 (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 278 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 279 RTE_PTYPE_INNER_L4_UDP; 280 (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 281 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 282 RTE_PTYPE_INNER_L4_UDP; 283 (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 284 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 285 RTE_PTYPE_INNER_L4_UDP; 286 (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 287 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 288 RTE_PTYPE_INNER_L4_UDP; 289 } 290 291 /** 292 * Build a table to translate packet to checksum type of Verbs. 293 */ 294 void 295 mlx5_set_cksum_table(void) 296 { 297 unsigned int i; 298 uint8_t v; 299 300 /* 301 * The index should have: 302 * bit[0] = PKT_TX_TCP_SEG 303 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM 304 * bit[4] = PKT_TX_IP_CKSUM 305 * bit[8] = PKT_TX_OUTER_IP_CKSUM 306 * bit[9] = tunnel 307 */ 308 for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) { 309 v = 0; 310 if (i & (1 << 9)) { 311 /* Tunneled packet. */ 312 if (i & (1 << 8)) /* Outer IP. */ 313 v |= MLX5_ETH_WQE_L3_CSUM; 314 if (i & (1 << 4)) /* Inner IP. */ 315 v |= MLX5_ETH_WQE_L3_INNER_CSUM; 316 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 317 v |= MLX5_ETH_WQE_L4_INNER_CSUM; 318 } else { 319 /* No tunnel. 
*/ 320 if (i & (1 << 4)) /* IP. */ 321 v |= MLX5_ETH_WQE_L3_CSUM; 322 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 323 v |= MLX5_ETH_WQE_L4_CSUM; 324 } 325 mlx5_cksum_table[i] = v; 326 } 327 } 328 329 /** 330 * Build a table to translate packet type of mbuf to SWP type of Verbs. 331 */ 332 void 333 mlx5_set_swp_types_table(void) 334 { 335 unsigned int i; 336 uint8_t v; 337 338 /* 339 * The index should have: 340 * bit[0:1] = PKT_TX_L4_MASK 341 * bit[4] = PKT_TX_IPV6 342 * bit[8] = PKT_TX_OUTER_IPV6 343 * bit[9] = PKT_TX_OUTER_UDP 344 */ 345 for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) { 346 v = 0; 347 if (i & (1 << 8)) 348 v |= MLX5_ETH_WQE_L3_OUTER_IPV6; 349 if (i & (1 << 9)) 350 v |= MLX5_ETH_WQE_L4_OUTER_UDP; 351 if (i & (1 << 4)) 352 v |= MLX5_ETH_WQE_L3_INNER_IPV6; 353 if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52)) 354 v |= MLX5_ETH_WQE_L4_INNER_UDP; 355 mlx5_swp_types_table[i] = v; 356 } 357 } 358 359 /** 360 * Set Software Parser flags and offsets in Ethernet Segment of WQE. 361 * Flags must be preliminary initialized to zero. 362 * 363 * @param loc 364 * Pointer to burst routine local context. 365 * @param swp_flags 366 * Pointer to store Software Parser flags 367 * @param olx 368 * Configured Tx offloads mask. It is fully defined at 369 * compile time and may be used for optimization. 370 * 371 * @return 372 * Software Parser offsets packed in dword. 373 * Software Parser flags are set by pointer. 374 */ 375 static __rte_always_inline uint32_t 376 txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc, 377 uint8_t *swp_flags, 378 unsigned int olx) 379 { 380 uint64_t ol, tunnel; 381 unsigned int idx, off; 382 uint32_t set; 383 384 if (!MLX5_TXOFF_CONFIG(SWP)) 385 return 0; 386 ol = loc->mbuf->ol_flags; 387 tunnel = ol & PKT_TX_TUNNEL_MASK; 388 /* 389 * Check whether Software Parser is required. 390 * Only customized tunnels may ask for. 391 */ 392 if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP)) 393 return 0; 394 /* 395 * The index should have: 396 * bit[0:1] = PKT_TX_L4_MASK 397 * bit[4] = PKT_TX_IPV6 398 * bit[8] = PKT_TX_OUTER_IPV6 399 * bit[9] = PKT_TX_OUTER_UDP 400 */ 401 idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52; 402 idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0; 403 *swp_flags = mlx5_swp_types_table[idx]; 404 /* 405 * Set offsets for SW parser. Since ConnectX-5, SW parser just 406 * complements HW parser. SW parser starts to engage only if HW parser 407 * can't reach a header. For the older devices, HW parser will not kick 408 * in if any of SWP offsets is set. Therefore, all of the L3 offsets 409 * should be set regardless of HW offload. 410 */ 411 off = loc->mbuf->outer_l2_len; 412 if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT) 413 off += sizeof(struct rte_vlan_hdr); 414 set = (off >> 1) << 8; /* Outer L3 offset. */ 415 off += loc->mbuf->outer_l3_len; 416 if (tunnel == PKT_TX_TUNNEL_UDP) 417 set |= off >> 1; /* Outer L4 offset. */ 418 if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */ 419 const uint64_t csum = ol & PKT_TX_L4_MASK; 420 off += loc->mbuf->l2_len; 421 set |= (off >> 1) << 24; /* Inner L3 offset. */ 422 if (csum == PKT_TX_TCP_CKSUM || 423 csum == PKT_TX_UDP_CKSUM || 424 (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) { 425 off += loc->mbuf->l3_len; 426 set |= (off >> 1) << 16; /* Inner L4 offset. */ 427 } 428 } 429 set = rte_cpu_to_le_32(set); 430 return set; 431 } 432 433 /** 434 * Convert the Checksum offloads to Verbs. 435 * 436 * @param buf 437 * Pointer to the mbuf. 
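 *
 * Illustrative example (hypothetical mbuf, comment only): with
 * PKT_TX_IP_CKSUM and PKT_TX_TCP_CKSUM set and no tunnel flag, the index
 * computed below is ((PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM) >> 50), i.e.
 * bit[4] | bit[2], and mlx5_cksum_table[] was pre-filled for that index
 * with MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM by
 * mlx5_set_cksum_table().
 *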
438 * 439 * @return 440 * Converted checksum flags. 441 */ 442 static __rte_always_inline uint8_t 443 txq_ol_cksum_to_cs(struct rte_mbuf *buf) 444 { 445 uint32_t idx; 446 uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK); 447 const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK | 448 PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM; 449 450 /* 451 * The index should have: 452 * bit[0] = PKT_TX_TCP_SEG 453 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM 454 * bit[4] = PKT_TX_IP_CKSUM 455 * bit[8] = PKT_TX_OUTER_IP_CKSUM 456 * bit[9] = tunnel 457 */ 458 idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9); 459 return mlx5_cksum_table[idx]; 460 } 461 462 /** 463 * Internal function to compute the number of used descriptors in an RX queue 464 * 465 * @param rxq 466 * The Rx queue. 467 * 468 * @return 469 * The number of used rx descriptor. 470 */ 471 static uint32_t 472 rx_queue_count(struct mlx5_rxq_data *rxq) 473 { 474 struct rxq_zip *zip = &rxq->zip; 475 volatile struct mlx5_cqe *cqe; 476 const unsigned int cqe_n = (1 << rxq->cqe_n); 477 const unsigned int cqe_cnt = cqe_n - 1; 478 unsigned int cq_ci; 479 unsigned int used; 480 481 /* if we are processing a compressed cqe */ 482 if (zip->ai) { 483 used = zip->cqe_cnt - zip->ca; 484 cq_ci = zip->cq_ci; 485 } else { 486 used = 0; 487 cq_ci = rxq->cq_ci; 488 } 489 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 490 while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) { 491 int8_t op_own; 492 unsigned int n; 493 494 op_own = cqe->op_own; 495 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) 496 n = rte_be_to_cpu_32(cqe->byte_cnt); 497 else 498 n = 1; 499 cq_ci += n; 500 used += n; 501 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 502 } 503 used = RTE_MIN(used, (1U << rxq->elts_n) - 1); 504 return used; 505 } 506 507 /** 508 * DPDK callback to check the status of a rx descriptor. 509 * 510 * @param rx_queue 511 * The Rx queue. 512 * @param[in] offset 513 * The index of the descriptor in the ring. 514 * 515 * @return 516 * The status of the tx descriptor. 517 */ 518 int 519 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) 520 { 521 struct mlx5_rxq_data *rxq = rx_queue; 522 struct mlx5_rxq_ctrl *rxq_ctrl = 523 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 524 struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv); 525 526 if (dev->rx_pkt_burst != mlx5_rx_burst) { 527 rte_errno = ENOTSUP; 528 return -rte_errno; 529 } 530 if (offset >= (1 << rxq->elts_n)) { 531 rte_errno = EINVAL; 532 return -rte_errno; 533 } 534 if (offset < rx_queue_count(rxq)) 535 return RTE_ETH_RX_DESC_DONE; 536 return RTE_ETH_RX_DESC_AVAIL; 537 } 538 539 /** 540 * DPDK callback to get the RX queue information 541 * 542 * @param dev 543 * Pointer to the device structure. 544 * 545 * @param rx_queue_id 546 * Rx queue identificator. 547 * 548 * @param qinfo 549 * Pointer to the RX queue information structure. 550 * 551 * @return 552 * None. 553 */ 554 555 void 556 mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id, 557 struct rte_eth_rxq_info *qinfo) 558 { 559 struct mlx5_priv *priv = dev->data->dev_private; 560 struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id]; 561 struct mlx5_rxq_ctrl *rxq_ctrl = 562 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 563 564 if (!rxq) 565 return; 566 qinfo->mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? 
567 rxq->mprq_mp : rxq->mp; 568 qinfo->conf.rx_thresh.pthresh = 0; 569 qinfo->conf.rx_thresh.hthresh = 0; 570 qinfo->conf.rx_thresh.wthresh = 0; 571 qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh; 572 qinfo->conf.rx_drop_en = 1; 573 qinfo->conf.rx_deferred_start = rxq_ctrl ? 0 : 1; 574 qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads; 575 qinfo->scattered_rx = dev->data->scattered_rx; 576 qinfo->nb_desc = 1 << rxq->elts_n; 577 } 578 579 /** 580 * DPDK callback to get the RX packet burst mode information 581 * 582 * @param dev 583 * Pointer to the device structure. 584 * 585 * @param rx_queue_id 586 * Rx queue identificatior. 587 * 588 * @param mode 589 * Pointer to the burts mode information. 590 * 591 * @return 592 * 0 as success, -EINVAL as failure. 593 */ 594 595 int 596 mlx5_rx_burst_mode_get(struct rte_eth_dev *dev, 597 uint16_t rx_queue_id __rte_unused, 598 struct rte_eth_burst_mode *mode) 599 { 600 eth_rx_burst_t pkt_burst = dev->rx_pkt_burst; 601 602 if (pkt_burst == mlx5_rx_burst) { 603 snprintf(mode->info, sizeof(mode->info), "%s", "Scalar"); 604 } else if (pkt_burst == mlx5_rx_burst_mprq) { 605 snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ"); 606 } else if (pkt_burst == mlx5_rx_burst_vec) { 607 #if defined RTE_ARCH_X86_64 608 snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE"); 609 #elif defined RTE_ARCH_ARM64 610 snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon"); 611 #elif defined RTE_ARCH_PPC_64 612 snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec"); 613 #else 614 return -EINVAL; 615 #endif 616 } else { 617 return -EINVAL; 618 } 619 return 0; 620 } 621 622 /** 623 * DPDK callback to get the number of used descriptors in a RX queue 624 * 625 * @param dev 626 * Pointer to the device structure. 627 * 628 * @param rx_queue_id 629 * The Rx queue. 630 * 631 * @return 632 * The number of used rx descriptor. 633 * -EINVAL if the queue is invalid 634 */ 635 uint32_t 636 mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id) 637 { 638 struct mlx5_priv *priv = dev->data->dev_private; 639 struct mlx5_rxq_data *rxq; 640 641 if (dev->rx_pkt_burst != mlx5_rx_burst) { 642 rte_errno = ENOTSUP; 643 return -rte_errno; 644 } 645 rxq = (*priv->rxqs)[rx_queue_id]; 646 if (!rxq) { 647 rte_errno = EINVAL; 648 return -rte_errno; 649 } 650 return rx_queue_count(rxq); 651 } 652 653 #define MLX5_SYSTEM_LOG_DIR "/var/log" 654 /** 655 * Dump debug information to log file. 656 * 657 * @param fname 658 * The file name. 659 * @param hex_title 660 * If not NULL this string is printed as a header to the output 661 * and the output will be in hexadecimal view. 662 * @param buf 663 * This is the buffer address to print out. 664 * @param len 665 * The number of bytes to dump out. 
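 *
 * Usage sketch (comment only, mirrors the error paths below; the name
 * format and size variables are placeholders):
 *
 *   MKSTR(name, "dpdk_mlx5_port_%u_txq_%u", port_id, txq_idx);
 *   mlx5_dump_debug_information(name, NULL, err_str, 0);
 *   mlx5_dump_debug_information(name, "MLX5 Error CQ:", cqes, cq_size);
 *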
666 */ 667 void 668 mlx5_dump_debug_information(const char *fname, const char *hex_title, 669 const void *buf, unsigned int hex_len) 670 { 671 FILE *fd; 672 673 MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname); 674 fd = fopen(path, "a+"); 675 if (!fd) { 676 DRV_LOG(WARNING, "cannot open %s for debug dump", path); 677 MKSTR(path2, "./%s", fname); 678 fd = fopen(path2, "a+"); 679 if (!fd) { 680 DRV_LOG(ERR, "cannot open %s for debug dump", path2); 681 return; 682 } 683 DRV_LOG(INFO, "New debug dump in file %s", path2); 684 } else { 685 DRV_LOG(INFO, "New debug dump in file %s", path); 686 } 687 if (hex_title) 688 rte_hexdump(fd, hex_title, buf, hex_len); 689 else 690 fprintf(fd, "%s", (const char *)buf); 691 fprintf(fd, "\n\n\n"); 692 fclose(fd); 693 } 694 695 /** 696 * Move QP from error state to running state and initialize indexes. 697 * 698 * @param txq_ctrl 699 * Pointer to TX queue control structure. 700 * 701 * @return 702 * 0 on success, else -1. 703 */ 704 static int 705 tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl) 706 { 707 struct mlx5_mp_arg_queue_state_modify sm = { 708 .is_wq = 0, 709 .queue_id = txq_ctrl->txq.idx, 710 }; 711 712 if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm)) 713 return -1; 714 txq_ctrl->txq.wqe_ci = 0; 715 txq_ctrl->txq.wqe_pi = 0; 716 txq_ctrl->txq.elts_comp = 0; 717 return 0; 718 } 719 720 /* Return 1 if the error CQE is signed otherwise, sign it and return 0. */ 721 static int 722 check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe) 723 { 724 static const uint8_t magic[] = "seen"; 725 int ret = 1; 726 unsigned int i; 727 728 for (i = 0; i < sizeof(magic); ++i) 729 if (!ret || err_cqe->rsvd1[i] != magic[i]) { 730 ret = 0; 731 err_cqe->rsvd1[i] = magic[i]; 732 } 733 return ret; 734 } 735 736 /** 737 * Handle error CQE. 738 * 739 * @param txq 740 * Pointer to TX queue structure. 741 * @param error_cqe 742 * Pointer to the error CQE. 743 * 744 * @return 745 * Negative value if queue recovery failed, otherwise 746 * the error completion entry is handled successfully. 747 */ 748 static int 749 mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, 750 volatile struct mlx5_err_cqe *err_cqe) 751 { 752 if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { 753 const uint16_t wqe_m = ((1 << txq->wqe_n) - 1); 754 struct mlx5_txq_ctrl *txq_ctrl = 755 container_of(txq, struct mlx5_txq_ctrl, txq); 756 uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter); 757 int seen = check_err_cqe_seen(err_cqe); 758 759 if (!seen && txq_ctrl->dump_file_n < 760 txq_ctrl->priv->config.max_dump_files_num) { 761 MKSTR(err_str, "Unexpected CQE error syndrome " 762 "0x%02x CQN = %u SQN = %u wqe_counter = %u " 763 "wq_ci = %u cq_ci = %u", err_cqe->syndrome, 764 txq->cqe_s, txq->qp_num_8s >> 8, 765 rte_be_to_cpu_16(err_cqe->wqe_counter), 766 txq->wqe_ci, txq->cq_ci); 767 MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u", 768 PORT_ID(txq_ctrl->priv), txq->idx, 769 txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc()); 770 mlx5_dump_debug_information(name, NULL, err_str, 0); 771 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 772 (const void *)((uintptr_t) 773 txq->cqes), 774 sizeof(*err_cqe) * 775 (1 << txq->cqe_n)); 776 mlx5_dump_debug_information(name, "MLX5 Error SQ:", 777 (const void *)((uintptr_t) 778 txq->wqes), 779 MLX5_WQE_SIZE * 780 (1 << txq->wqe_n)); 781 txq_ctrl->dump_file_n++; 782 } 783 if (!seen) 784 /* 785 * Count errors in WQEs units. 
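 * (Illustrative arithmetic with hypothetical numbers: with wqe_n = 8,
 * hence wqe_m = 255, wqe_ci = 300 and a reported wqe_counter of 40, the
 * expression below accounts ((300 & 255) - 40) & 255 = 4 WQEs as
 * oerrors.)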
786 * Later it can be improved to count error packets, 787 * for example, by SQ parsing to find how much packets 788 * should be counted for each WQE. 789 */ 790 txq->stats.oerrors += ((txq->wqe_ci & wqe_m) - 791 new_wqe_pi) & wqe_m; 792 if (tx_recover_qp(txq_ctrl)) { 793 /* Recovering failed - retry later on the same WQE. */ 794 return -1; 795 } 796 /* Release all the remaining buffers. */ 797 txq_free_elts(txq_ctrl); 798 } 799 return 0; 800 } 801 802 /** 803 * Translate RX completion flags to packet type. 804 * 805 * @param[in] rxq 806 * Pointer to RX queue structure. 807 * @param[in] cqe 808 * Pointer to CQE. 809 * 810 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 811 * 812 * @return 813 * Packet type for struct rte_mbuf. 814 */ 815 static inline uint32_t 816 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) 817 { 818 uint8_t idx; 819 uint8_t pinfo = cqe->pkt_info; 820 uint16_t ptype = cqe->hdr_type_etc; 821 822 /* 823 * The index to the array should have: 824 * bit[1:0] = l3_hdr_type 825 * bit[4:2] = l4_hdr_type 826 * bit[5] = ip_frag 827 * bit[6] = tunneled 828 * bit[7] = outer_l3_type 829 */ 830 idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); 831 return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); 832 } 833 834 /** 835 * Initialize Rx WQ and indexes. 836 * 837 * @param[in] rxq 838 * Pointer to RX queue structure. 839 */ 840 void 841 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) 842 { 843 const unsigned int wqe_n = 1 << rxq->elts_n; 844 unsigned int i; 845 846 for (i = 0; (i != wqe_n); ++i) { 847 volatile struct mlx5_wqe_data_seg *scat; 848 uintptr_t addr; 849 uint32_t byte_count; 850 851 if (mlx5_rxq_mprq_enabled(rxq)) { 852 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i]; 853 854 scat = &((volatile struct mlx5_wqe_mprq *) 855 rxq->wqes)[i].dseg; 856 addr = (uintptr_t)mlx5_mprq_buf_addr(buf, 857 1 << rxq->strd_num_n); 858 byte_count = (1 << rxq->strd_sz_n) * 859 (1 << rxq->strd_num_n); 860 } else { 861 struct rte_mbuf *buf = (*rxq->elts)[i]; 862 863 scat = &((volatile struct mlx5_wqe_data_seg *) 864 rxq->wqes)[i]; 865 addr = rte_pktmbuf_mtod(buf, uintptr_t); 866 byte_count = DATA_LEN(buf); 867 } 868 /* scat->addr must be able to store a pointer. */ 869 MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t)); 870 *scat = (struct mlx5_wqe_data_seg){ 871 .addr = rte_cpu_to_be_64(addr), 872 .byte_count = rte_cpu_to_be_32(byte_count), 873 .lkey = mlx5_rx_addr2mr(rxq, addr), 874 }; 875 } 876 rxq->consumed_strd = 0; 877 rxq->decompressed = 0; 878 rxq->rq_pi = 0; 879 rxq->zip = (struct rxq_zip){ 880 .ai = 0, 881 }; 882 /* Update doorbell counter. */ 883 rxq->rq_ci = wqe_n >> rxq->sges_n; 884 rte_cio_wmb(); 885 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 886 } 887 888 /** 889 * Modify a Verbs/DevX queue state. 890 * This must be called from the primary process. 891 * 892 * @param dev 893 * Pointer to Ethernet device. 894 * @param sm 895 * State modify request parameters. 896 * 897 * @return 898 * 0 in case of success else non-zero value and rte_errno is set. 
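 *
 * Usage sketch (comment only, mirrors the Rx error recovery path in this
 * file):
 *
 *   struct mlx5_mp_arg_queue_state_modify sm = {
 *           .is_wq = 1,
 *           .queue_id = rxq->idx,
 *           .state = IBV_WQS_RESET,
 *   };
 *   if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm))
 *           return -1;
 *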
899 */ 900 int 901 mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, 902 const struct mlx5_mp_arg_queue_state_modify *sm) 903 { 904 int ret; 905 struct mlx5_priv *priv = dev->data->dev_private; 906 907 if (sm->is_wq) { 908 struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id]; 909 struct mlx5_rxq_ctrl *rxq_ctrl = 910 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 911 912 if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { 913 struct ibv_wq_attr mod = { 914 .attr_mask = IBV_WQ_ATTR_STATE, 915 .wq_state = sm->state, 916 }; 917 918 ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); 919 } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. */ 920 struct mlx5_devx_modify_rq_attr rq_attr; 921 922 memset(&rq_attr, 0, sizeof(rq_attr)); 923 if (sm->state == IBV_WQS_RESET) { 924 rq_attr.rq_state = MLX5_RQC_STATE_ERR; 925 rq_attr.state = MLX5_RQC_STATE_RST; 926 } else if (sm->state == IBV_WQS_RDY) { 927 rq_attr.rq_state = MLX5_RQC_STATE_RST; 928 rq_attr.state = MLX5_RQC_STATE_RDY; 929 } else if (sm->state == IBV_WQS_ERR) { 930 rq_attr.rq_state = MLX5_RQC_STATE_RDY; 931 rq_attr.state = MLX5_RQC_STATE_ERR; 932 } 933 ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, 934 &rq_attr); 935 } 936 if (ret) { 937 DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s", 938 sm->state, strerror(errno)); 939 rte_errno = errno; 940 return ret; 941 } 942 } else { 943 struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id]; 944 struct mlx5_txq_ctrl *txq_ctrl = 945 container_of(txq, struct mlx5_txq_ctrl, txq); 946 struct ibv_qp_attr mod = { 947 .qp_state = IBV_QPS_RESET, 948 .port_num = (uint8_t)priv->ibv_port, 949 }; 950 struct ibv_qp *qp = txq_ctrl->obj->qp; 951 952 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 953 if (ret) { 954 DRV_LOG(ERR, "Cannot change the Tx QP state to RESET " 955 "%s", strerror(errno)); 956 rte_errno = errno; 957 return ret; 958 } 959 mod.qp_state = IBV_QPS_INIT; 960 ret = mlx5_glue->modify_qp(qp, &mod, 961 (IBV_QP_STATE | IBV_QP_PORT)); 962 if (ret) { 963 DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s", 964 strerror(errno)); 965 rte_errno = errno; 966 return ret; 967 } 968 mod.qp_state = IBV_QPS_RTR; 969 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 970 if (ret) { 971 DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s", 972 strerror(errno)); 973 rte_errno = errno; 974 return ret; 975 } 976 mod.qp_state = IBV_QPS_RTS; 977 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 978 if (ret) { 979 DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s", 980 strerror(errno)); 981 rte_errno = errno; 982 return ret; 983 } 984 } 985 return 0; 986 } 987 988 /** 989 * Modify a Verbs queue state. 990 * 991 * @param dev 992 * Pointer to Ethernet device. 993 * @param sm 994 * State modify request parameters. 995 * 996 * @return 997 * 0 in case of success else non-zero value. 998 */ 999 static int 1000 mlx5_queue_state_modify(struct rte_eth_dev *dev, 1001 struct mlx5_mp_arg_queue_state_modify *sm) 1002 { 1003 int ret = 0; 1004 1005 switch (rte_eal_process_type()) { 1006 case RTE_PROC_PRIMARY: 1007 ret = mlx5_queue_state_modify_primary(dev, sm); 1008 break; 1009 case RTE_PROC_SECONDARY: 1010 ret = mlx5_mp_req_queue_state_modify(dev, sm); 1011 break; 1012 default: 1013 break; 1014 } 1015 return ret; 1016 } 1017 1018 /** 1019 * Handle a Rx error. 1020 * The function inserts the RQ state to reset when the first error CQE is 1021 * shown, then drains the CQ by the caller function loop. When the CQ is empty, 1022 * it moves the RQ state to ready and initializes the RQ. 
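 *
 * As an illustration (comment only, taken from the CQE poll path below),
 * the caller typically reacts to an error status like this:
 *
 *   if (unlikely(ret == MLX5_CQE_STATUS_ERR || rxq->err_state)) {
 *           ret = mlx5_rx_err_handle(rxq, 0);
 *           if (ret == MLX5_CQE_STATUS_HW_OWN || ret == -1)
 *                   return 0;
 *   }
 *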
1023 * Next CQE identification and error counting are in the caller responsibility. 1024 * 1025 * @param[in] rxq 1026 * Pointer to RX queue structure. 1027 * @param[in] vec 1028 * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ. 1029 * 0 when called from non-vectorized Rx burst. 1030 * 1031 * @return 1032 * -1 in case of recovery error, otherwise the CQE status. 1033 */ 1034 int 1035 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) 1036 { 1037 const uint16_t cqe_n = 1 << rxq->cqe_n; 1038 const uint16_t cqe_mask = cqe_n - 1; 1039 const unsigned int wqe_n = 1 << rxq->elts_n; 1040 struct mlx5_rxq_ctrl *rxq_ctrl = 1041 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 1042 union { 1043 volatile struct mlx5_cqe *cqe; 1044 volatile struct mlx5_err_cqe *err_cqe; 1045 } u = { 1046 .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask], 1047 }; 1048 struct mlx5_mp_arg_queue_state_modify sm; 1049 int ret; 1050 1051 switch (rxq->err_state) { 1052 case MLX5_RXQ_ERR_STATE_NO_ERROR: 1053 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET; 1054 /* Fall-through */ 1055 case MLX5_RXQ_ERR_STATE_NEED_RESET: 1056 sm.is_wq = 1; 1057 sm.queue_id = rxq->idx; 1058 sm.state = IBV_WQS_RESET; 1059 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm)) 1060 return -1; 1061 if (rxq_ctrl->dump_file_n < 1062 rxq_ctrl->priv->config.max_dump_files_num) { 1063 MKSTR(err_str, "Unexpected CQE error syndrome " 1064 "0x%02x CQN = %u RQN = %u wqe_counter = %u" 1065 " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome, 1066 rxq->cqn, rxq_ctrl->wqn, 1067 rte_be_to_cpu_16(u.err_cqe->wqe_counter), 1068 rxq->rq_ci << rxq->sges_n, rxq->cq_ci); 1069 MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u", 1070 rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc()); 1071 mlx5_dump_debug_information(name, NULL, err_str, 0); 1072 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 1073 (const void *)((uintptr_t) 1074 rxq->cqes), 1075 sizeof(*u.cqe) * cqe_n); 1076 mlx5_dump_debug_information(name, "MLX5 Error RQ:", 1077 (const void *)((uintptr_t) 1078 rxq->wqes), 1079 16 * wqe_n); 1080 rxq_ctrl->dump_file_n++; 1081 } 1082 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY; 1083 /* Fall-through */ 1084 case MLX5_RXQ_ERR_STATE_NEED_READY: 1085 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); 1086 if (ret == MLX5_CQE_STATUS_HW_OWN) { 1087 rte_cio_wmb(); 1088 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1089 rte_cio_wmb(); 1090 /* 1091 * The RQ consumer index must be zeroed while moving 1092 * from RESET state to RDY state. 1093 */ 1094 *rxq->rq_db = rte_cpu_to_be_32(0); 1095 rte_cio_wmb(); 1096 sm.is_wq = 1; 1097 sm.queue_id = rxq->idx; 1098 sm.state = IBV_WQS_RDY; 1099 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), 1100 &sm)) 1101 return -1; 1102 if (vec) { 1103 const uint16_t q_mask = wqe_n - 1; 1104 uint16_t elt_idx; 1105 struct rte_mbuf **elt; 1106 int i; 1107 unsigned int n = wqe_n - (rxq->rq_ci - 1108 rxq->rq_pi); 1109 1110 for (i = 0; i < (int)n; ++i) { 1111 elt_idx = (rxq->rq_ci + i) & q_mask; 1112 elt = &(*rxq->elts)[elt_idx]; 1113 *elt = rte_mbuf_raw_alloc(rxq->mp); 1114 if (!*elt) { 1115 for (i--; i >= 0; --i) { 1116 elt_idx = (rxq->rq_ci + 1117 i) & q_mask; 1118 elt = &(*rxq->elts) 1119 [elt_idx]; 1120 rte_pktmbuf_free_seg 1121 (*elt); 1122 } 1123 return -1; 1124 } 1125 } 1126 for (i = 0; i < (int)wqe_n; ++i) { 1127 elt = &(*rxq->elts)[i]; 1128 DATA_LEN(*elt) = 1129 (uint16_t)((*elt)->buf_len - 1130 rte_pktmbuf_headroom(*elt)); 1131 } 1132 /* Padding with a fake mbuf for vec Rx. 
*/ 1133 for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) 1134 (*rxq->elts)[wqe_n + i] = 1135 &rxq->fake_mbuf; 1136 } 1137 mlx5_rxq_initialize(rxq); 1138 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 1139 } 1140 return ret; 1141 default: 1142 return -1; 1143 } 1144 } 1145 1146 /** 1147 * Get size of the next packet for a given CQE. For compressed CQEs, the 1148 * consumer index is updated only once all packets of the current one have 1149 * been processed. 1150 * 1151 * @param rxq 1152 * Pointer to RX queue. 1153 * @param cqe 1154 * CQE to process. 1155 * @param[out] mcqe 1156 * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not 1157 * written. 1158 * 1159 * @return 1160 * 0 in case of empty CQE, otherwise the packet size in bytes. 1161 */ 1162 static inline int 1163 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 1164 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe) 1165 { 1166 struct rxq_zip *zip = &rxq->zip; 1167 uint16_t cqe_n = cqe_cnt + 1; 1168 int len; 1169 uint16_t idx, end; 1170 1171 do { 1172 len = 0; 1173 /* Process compressed data in the CQE and mini arrays. */ 1174 if (zip->ai) { 1175 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1176 (volatile struct mlx5_mini_cqe8 (*)[8]) 1177 (uintptr_t)(&(*rxq->cqes)[zip->ca & 1178 cqe_cnt].pkt_info); 1179 1180 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); 1181 *mcqe = &(*mc)[zip->ai & 7]; 1182 if ((++zip->ai & 7) == 0) { 1183 /* Invalidate consumed CQEs */ 1184 idx = zip->ca; 1185 end = zip->na; 1186 while (idx != end) { 1187 (*rxq->cqes)[idx & cqe_cnt].op_own = 1188 MLX5_CQE_INVALIDATE; 1189 ++idx; 1190 } 1191 /* 1192 * Increment consumer index to skip the number 1193 * of CQEs consumed. Hardware leaves holes in 1194 * the CQ ring for software use. 1195 */ 1196 zip->ca = zip->na; 1197 zip->na += 8; 1198 } 1199 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 1200 /* Invalidate the rest */ 1201 idx = zip->ca; 1202 end = zip->cq_ci; 1203 1204 while (idx != end) { 1205 (*rxq->cqes)[idx & cqe_cnt].op_own = 1206 MLX5_CQE_INVALIDATE; 1207 ++idx; 1208 } 1209 rxq->cq_ci = zip->cq_ci; 1210 zip->ai = 0; 1211 } 1212 /* 1213 * No compressed data, get next CQE and verify if it is 1214 * compressed. 1215 */ 1216 } else { 1217 int ret; 1218 int8_t op_own; 1219 1220 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 1221 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 1222 if (unlikely(ret == MLX5_CQE_STATUS_ERR || 1223 rxq->err_state)) { 1224 ret = mlx5_rx_err_handle(rxq, 0); 1225 if (ret == MLX5_CQE_STATUS_HW_OWN || 1226 ret == -1) 1227 return 0; 1228 } else { 1229 return 0; 1230 } 1231 } 1232 ++rxq->cq_ci; 1233 op_own = cqe->op_own; 1234 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 1235 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1236 (volatile struct mlx5_mini_cqe8 (*)[8]) 1237 (uintptr_t)(&(*rxq->cqes) 1238 [rxq->cq_ci & 1239 cqe_cnt].pkt_info); 1240 1241 /* Fix endianness. */ 1242 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); 1243 /* 1244 * Current mini array position is the one 1245 * returned by check_cqe64(). 1246 * 1247 * If completion comprises several mini arrays, 1248 * as a special case the second one is located 1249 * 7 CQEs after the initial CQE instead of 8 1250 * for subsequent ones. 1251 */ 1252 zip->ca = rxq->cq_ci; 1253 zip->na = zip->ca + 7; 1254 /* Compute the next non compressed CQE. */ 1255 --rxq->cq_ci; 1256 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 1257 /* Get packet size to return. 
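 * Illustrative bookkeeping (hypothetical indexes): for a compressed
 * session whose title CQE sits at cq_ci == 100 and reports
 * byte_cnt == 11 packets, the assignments above give zip->ca = 101
 * (first mini-CQE array), zip->na = 108 (the second array follows 7 CQEs
 * later) and zip->cq_ci = 100 + 11, where the CQ consumer resumes once
 * the whole session has been decompressed.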
*/ 1258 len = rte_be_to_cpu_32((*mc)[0].byte_cnt); 1259 *mcqe = &(*mc)[0]; 1260 zip->ai = 1; 1261 /* Prefetch all to be invalidated */ 1262 idx = zip->ca; 1263 end = zip->cq_ci; 1264 while (idx != end) { 1265 rte_prefetch0(&(*rxq->cqes)[(idx) & 1266 cqe_cnt]); 1267 ++idx; 1268 } 1269 } else { 1270 len = rte_be_to_cpu_32(cqe->byte_cnt); 1271 } 1272 } 1273 if (unlikely(rxq->err_state)) { 1274 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1275 ++rxq->stats.idropped; 1276 } else { 1277 return len; 1278 } 1279 } while (1); 1280 } 1281 1282 /** 1283 * Translate RX completion flags to offload flags. 1284 * 1285 * @param[in] cqe 1286 * Pointer to CQE. 1287 * 1288 * @return 1289 * Offload flags (ol_flags) for struct rte_mbuf. 1290 */ 1291 static inline uint32_t 1292 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) 1293 { 1294 uint32_t ol_flags = 0; 1295 uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); 1296 1297 ol_flags = 1298 TRANSPOSE(flags, 1299 MLX5_CQE_RX_L3_HDR_VALID, 1300 PKT_RX_IP_CKSUM_GOOD) | 1301 TRANSPOSE(flags, 1302 MLX5_CQE_RX_L4_HDR_VALID, 1303 PKT_RX_L4_CKSUM_GOOD); 1304 return ol_flags; 1305 } 1306 1307 /** 1308 * Fill in mbuf fields from RX completion flags. 1309 * Note that pkt->ol_flags should be initialized outside of this function. 1310 * 1311 * @param rxq 1312 * Pointer to RX queue. 1313 * @param pkt 1314 * mbuf to fill. 1315 * @param cqe 1316 * CQE to process. 1317 * @param rss_hash_res 1318 * Packet RSS Hash result. 1319 */ 1320 static inline void 1321 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 1322 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) 1323 { 1324 /* Update packet information. */ 1325 pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); 1326 if (rss_hash_res && rxq->rss_hash) { 1327 pkt->hash.rss = rss_hash_res; 1328 pkt->ol_flags |= PKT_RX_RSS_HASH; 1329 } 1330 if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { 1331 pkt->ol_flags |= PKT_RX_FDIR; 1332 if (cqe->sop_drop_qpn != 1333 rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { 1334 uint32_t mark = cqe->sop_drop_qpn; 1335 1336 pkt->ol_flags |= PKT_RX_FDIR_ID; 1337 pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); 1338 } 1339 } 1340 if (rte_flow_dynf_metadata_avail() && cqe->flow_table_metadata) { 1341 pkt->ol_flags |= PKT_RX_DYNF_METADATA; 1342 *RTE_FLOW_DYNF_METADATA(pkt) = cqe->flow_table_metadata; 1343 } 1344 if (rxq->csum) 1345 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); 1346 if (rxq->vlan_strip && 1347 (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { 1348 pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; 1349 pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); 1350 } 1351 if (rxq->hw_timestamp) { 1352 pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); 1353 pkt->ol_flags |= PKT_RX_TIMESTAMP; 1354 } 1355 } 1356 1357 /** 1358 * DPDK callback for RX. 1359 * 1360 * @param dpdk_rxq 1361 * Generic pointer to RX queue structure. 1362 * @param[out] pkts 1363 * Array to store received packets. 1364 * @param pkts_n 1365 * Maximum number of packets in array. 1366 * 1367 * @return 1368 * Number of packets successfully received (<= pkts_n). 
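 *
 * Application-side usage sketch (comment only; the port id and consume()
 * helper are hypothetical - this callback is normally reached through
 * rte_eth_rx_burst()):
 *
 *   struct rte_mbuf *bufs[32];
 *   uint16_t nb = rte_eth_rx_burst(port_id, 0, bufs, RTE_DIM(bufs));
 *   uint16_t k;
 *
 *   for (k = 0; k < nb; ++k)
 *           consume(bufs[k]);
 *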
1369 */ 1370 uint16_t 1371 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1372 { 1373 struct mlx5_rxq_data *rxq = dpdk_rxq; 1374 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 1375 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 1376 const unsigned int sges_n = rxq->sges_n; 1377 struct rte_mbuf *pkt = NULL; 1378 struct rte_mbuf *seg = NULL; 1379 volatile struct mlx5_cqe *cqe = 1380 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1381 unsigned int i = 0; 1382 unsigned int rq_ci = rxq->rq_ci << sges_n; 1383 int len = 0; /* keep its value across iterations. */ 1384 1385 while (pkts_n) { 1386 unsigned int idx = rq_ci & wqe_cnt; 1387 volatile struct mlx5_wqe_data_seg *wqe = 1388 &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; 1389 struct rte_mbuf *rep = (*rxq->elts)[idx]; 1390 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1391 uint32_t rss_hash_res; 1392 1393 if (pkt) 1394 NEXT(seg) = rep; 1395 seg = rep; 1396 rte_prefetch0(seg); 1397 rte_prefetch0(cqe); 1398 rte_prefetch0(wqe); 1399 rep = rte_mbuf_raw_alloc(rxq->mp); 1400 if (unlikely(rep == NULL)) { 1401 ++rxq->stats.rx_nombuf; 1402 if (!pkt) { 1403 /* 1404 * no buffers before we even started, 1405 * bail out silently. 1406 */ 1407 break; 1408 } 1409 while (pkt != seg) { 1410 MLX5_ASSERT(pkt != (*rxq->elts)[idx]); 1411 rep = NEXT(pkt); 1412 NEXT(pkt) = NULL; 1413 NB_SEGS(pkt) = 1; 1414 rte_mbuf_raw_free(pkt); 1415 pkt = rep; 1416 } 1417 break; 1418 } 1419 if (!pkt) { 1420 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1421 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe); 1422 if (!len) { 1423 rte_mbuf_raw_free(rep); 1424 break; 1425 } 1426 pkt = seg; 1427 MLX5_ASSERT(len >= (rxq->crc_present << 2)); 1428 pkt->ol_flags &= EXT_ATTACHED_MBUF; 1429 /* If compressed, take hash result from mini-CQE. */ 1430 rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? 1431 cqe->rx_hash_res : 1432 mcqe->rx_hash_result); 1433 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1434 if (rxq->crc_present) 1435 len -= RTE_ETHER_CRC_LEN; 1436 PKT_LEN(pkt) = len; 1437 if (cqe->lro_num_seg > 1) { 1438 mlx5_lro_update_hdr 1439 (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, 1440 len); 1441 pkt->ol_flags |= PKT_RX_LRO; 1442 pkt->tso_segsz = len / cqe->lro_num_seg; 1443 } 1444 } 1445 DATA_LEN(rep) = DATA_LEN(seg); 1446 PKT_LEN(rep) = PKT_LEN(seg); 1447 SET_DATA_OFF(rep, DATA_OFF(seg)); 1448 PORT(rep) = PORT(seg); 1449 (*rxq->elts)[idx] = rep; 1450 /* 1451 * Fill NIC descriptor with the new buffer. The lkey and size 1452 * of the buffers are already known, only the buffer address 1453 * changes. 1454 */ 1455 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); 1456 /* If there's only one MR, no need to replace LKey in WQE. */ 1457 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1458 wqe->lkey = mlx5_rx_mb2mr(rxq, rep); 1459 if (len > DATA_LEN(seg)) { 1460 len -= DATA_LEN(seg); 1461 ++NB_SEGS(pkt); 1462 ++rq_ci; 1463 continue; 1464 } 1465 DATA_LEN(seg) = len; 1466 #ifdef MLX5_PMD_SOFT_COUNTERS 1467 /* Increment bytes counter. */ 1468 rxq->stats.ibytes += PKT_LEN(pkt); 1469 #endif 1470 /* Return packet. */ 1471 *(pkts++) = pkt; 1472 pkt = NULL; 1473 --pkts_n; 1474 ++i; 1475 /* Align consumer index to the next stride. */ 1476 rq_ci >>= sges_n; 1477 ++rq_ci; 1478 rq_ci <<= sges_n; 1479 } 1480 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) 1481 return 0; 1482 /* Update the consumer index. 
*/ 1483 rxq->rq_ci = rq_ci >> sges_n; 1484 rte_cio_wmb(); 1485 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1486 rte_cio_wmb(); 1487 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1488 #ifdef MLX5_PMD_SOFT_COUNTERS 1489 /* Increment packets counter. */ 1490 rxq->stats.ipackets += i; 1491 #endif 1492 return i; 1493 } 1494 1495 /** 1496 * Update LRO packet TCP header. 1497 * The HW LRO feature doesn't update the TCP header after coalescing the 1498 * TCP segments but supplies information in CQE to fill it by SW. 1499 * 1500 * @param tcp 1501 * Pointer to the TCP header. 1502 * @param cqe 1503 * Pointer to the completion entry.. 1504 * @param phcsum 1505 * The L3 pseudo-header checksum. 1506 */ 1507 static inline void 1508 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, 1509 volatile struct mlx5_cqe *restrict cqe, 1510 uint32_t phcsum) 1511 { 1512 uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) & 1513 MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT; 1514 /* 1515 * The HW calculates only the TCP payload checksum, need to complete 1516 * the TCP header checksum and the L3 pseudo-header checksum. 1517 */ 1518 uint32_t csum = phcsum + cqe->csum; 1519 1520 if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK || 1521 l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) { 1522 tcp->tcp_flags |= RTE_TCP_ACK_FLAG; 1523 tcp->recv_ack = cqe->lro_ack_seq_num; 1524 tcp->rx_win = cqe->lro_tcp_win; 1525 } 1526 if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK) 1527 tcp->tcp_flags |= RTE_TCP_PSH_FLAG; 1528 tcp->cksum = 0; 1529 csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4); 1530 csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff); 1531 csum = (~csum) & 0xffff; 1532 if (csum == 0) 1533 csum = 0xffff; 1534 tcp->cksum = csum; 1535 } 1536 1537 /** 1538 * Update LRO packet headers. 1539 * The HW LRO feature doesn't update the L3/TCP headers after coalescing the 1540 * TCP segments but supply information in CQE to fill it by SW. 1541 * 1542 * @param padd 1543 * The packet address. 1544 * @param cqe 1545 * Pointer to the completion entry.. 1546 * @param len 1547 * The packet length. 
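 *
 * Illustrative note (hypothetical value): the final TCP checksum fix-up
 * done in mlx5_lro_update_tcp_hdr() folds the 32-bit sum into 16 bits,
 * e.g. a raw sum of 0x2345f becomes (0x0002 + 0x345f) = 0x3461, which is
 * complemented to 0xcb9e; a folded result of 0 is replaced by 0xffff.
 *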
1548 */ 1549 static inline void 1550 mlx5_lro_update_hdr(uint8_t *restrict padd, 1551 volatile struct mlx5_cqe *restrict cqe, 1552 uint32_t len) 1553 { 1554 union { 1555 struct rte_ether_hdr *eth; 1556 struct rte_vlan_hdr *vlan; 1557 struct rte_ipv4_hdr *ipv4; 1558 struct rte_ipv6_hdr *ipv6; 1559 struct rte_tcp_hdr *tcp; 1560 uint8_t *hdr; 1561 } h = { 1562 .hdr = padd, 1563 }; 1564 uint16_t proto = h.eth->ether_type; 1565 uint32_t phcsum; 1566 1567 h.eth++; 1568 while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) || 1569 proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) { 1570 proto = h.vlan->eth_proto; 1571 h.vlan++; 1572 } 1573 if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) { 1574 h.ipv4->time_to_live = cqe->lro_min_ttl; 1575 h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd)); 1576 h.ipv4->hdr_checksum = 0; 1577 h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4); 1578 phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0); 1579 h.ipv4++; 1580 } else { 1581 h.ipv6->hop_limits = cqe->lro_min_ttl; 1582 h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) - 1583 sizeof(*h.ipv6)); 1584 phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0); 1585 h.ipv6++; 1586 } 1587 mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum); 1588 } 1589 1590 void 1591 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) 1592 { 1593 struct mlx5_mprq_buf *buf = opaque; 1594 1595 if (rte_atomic16_read(&buf->refcnt) == 1) { 1596 rte_mempool_put(buf->mp, buf); 1597 } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { 1598 rte_atomic16_set(&buf->refcnt, 1); 1599 rte_mempool_put(buf->mp, buf); 1600 } 1601 } 1602 1603 void 1604 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) 1605 { 1606 mlx5_mprq_buf_free_cb(NULL, buf); 1607 } 1608 1609 static inline void 1610 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 1611 const unsigned int strd_n) 1612 { 1613 struct mlx5_mprq_buf *rep = rxq->mprq_repl; 1614 volatile struct mlx5_wqe_data_seg *wqe = 1615 &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; 1616 void *addr; 1617 1618 MLX5_ASSERT(rep != NULL); 1619 /* Replace MPRQ buf. */ 1620 (*rxq->mprq_bufs)[rq_idx] = rep; 1621 /* Replace WQE. */ 1622 addr = mlx5_mprq_buf_addr(rep, strd_n); 1623 wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); 1624 /* If there's only one MR, no need to replace LKey in WQE. */ 1625 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1626 wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); 1627 /* Stash a mbuf for next replacement. */ 1628 if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) 1629 rxq->mprq_repl = rep; 1630 else 1631 rxq->mprq_repl = NULL; 1632 } 1633 1634 /** 1635 * DPDK callback for RX with Multi-Packet RQ support. 1636 * 1637 * @param dpdk_rxq 1638 * Generic pointer to RX queue structure. 1639 * @param[out] pkts 1640 * Array to store received packets. 1641 * @param pkts_n 1642 * Maximum number of packets in array. 1643 * 1644 * @return 1645 * Number of packets successfully received (<= pkts_n). 
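 *
 * Illustrative stride layout (hypothetical configuration): with
 * strd_num_n = 6 and strd_sz_n = 11, every MPRQ WQE owns 64 strides of
 * 2048 bytes, and a completion reporting stride index 3 points at
 * offset = 3 * 2048 + strd_shift within the MPRQ buffer, exactly as
 * computed in the loop below.
 *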
1646 */ 1647 uint16_t 1648 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1649 { 1650 struct mlx5_rxq_data *rxq = dpdk_rxq; 1651 const unsigned int strd_n = 1 << rxq->strd_num_n; 1652 const unsigned int strd_sz = 1 << rxq->strd_sz_n; 1653 const unsigned int strd_shift = 1654 MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; 1655 const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; 1656 const unsigned int wq_mask = (1 << rxq->elts_n) - 1; 1657 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1658 unsigned int i = 0; 1659 uint32_t rq_ci = rxq->rq_ci; 1660 uint16_t consumed_strd = rxq->consumed_strd; 1661 uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM; 1662 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1663 1664 while (i < pkts_n) { 1665 struct rte_mbuf *pkt; 1666 void *addr; 1667 int ret; 1668 unsigned int len; 1669 uint16_t strd_cnt; 1670 uint16_t strd_idx; 1671 uint32_t offset; 1672 uint32_t byte_cnt; 1673 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1674 uint32_t rss_hash_res = 0; 1675 uint8_t lro_num_seg; 1676 1677 if (consumed_strd == strd_n) { 1678 /* Replace WQE only if the buffer is still in use. */ 1679 if (rte_atomic16_read(&buf->refcnt) > 1) { 1680 mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n); 1681 /* Release the old buffer. */ 1682 mlx5_mprq_buf_free(buf); 1683 } else if (unlikely(rxq->mprq_repl == NULL)) { 1684 struct mlx5_mprq_buf *rep; 1685 1686 /* 1687 * Currently, the MPRQ mempool is out of buffer 1688 * and doing memcpy regardless of the size of Rx 1689 * packet. Retry allocation to get back to 1690 * normal. 1691 */ 1692 if (!rte_mempool_get(rxq->mprq_mp, 1693 (void **)&rep)) 1694 rxq->mprq_repl = rep; 1695 } 1696 /* Advance to the next WQE. */ 1697 consumed_strd = 0; 1698 ++rq_ci; 1699 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1700 } 1701 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1702 ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe); 1703 if (!ret) 1704 break; 1705 byte_cnt = ret; 1706 strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> 1707 MLX5_MPRQ_STRIDE_NUM_SHIFT; 1708 MLX5_ASSERT(strd_cnt); 1709 consumed_strd += strd_cnt; 1710 if (byte_cnt & MLX5_MPRQ_FILLER_MASK) 1711 continue; 1712 if (mcqe == NULL) { 1713 rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); 1714 strd_idx = rte_be_to_cpu_16(cqe->wqe_counter); 1715 } else { 1716 /* mini-CQE for MPRQ doesn't have hash result. */ 1717 strd_idx = rte_be_to_cpu_16(mcqe->stride_idx); 1718 } 1719 MLX5_ASSERT(strd_idx < strd_n); 1720 MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & 1721 wq_mask)); 1722 lro_num_seg = cqe->lro_num_seg; 1723 /* 1724 * Currently configured to receive a packet per a stride. But if 1725 * MTU is adjusted through kernel interface, device could 1726 * consume multiple strides without raising an error. In this 1727 * case, the packet should be dropped because it is bigger than 1728 * the max_rx_pkt_len. 
1729 */ 1730 if (unlikely(!lro_num_seg && strd_cnt > 1)) { 1731 ++rxq->stats.idropped; 1732 continue; 1733 } 1734 pkt = rte_pktmbuf_alloc(rxq->mp); 1735 if (unlikely(pkt == NULL)) { 1736 ++rxq->stats.rx_nombuf; 1737 break; 1738 } 1739 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; 1740 MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); 1741 if (rxq->crc_present) 1742 len -= RTE_ETHER_CRC_LEN; 1743 offset = strd_idx * strd_sz + strd_shift; 1744 addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); 1745 /* 1746 * Memcpy packets to the target mbuf if: 1747 * - The size of packet is smaller than mprq_max_memcpy_len. 1748 * - Out of buffer in the Mempool for Multi-Packet RQ. 1749 */ 1750 if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) { 1751 /* 1752 * When memcpy'ing packet due to out-of-buffer, the 1753 * packet must be smaller than the target mbuf. 1754 */ 1755 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1756 rte_pktmbuf_free_seg(pkt); 1757 ++rxq->stats.idropped; 1758 continue; 1759 } 1760 rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len); 1761 DATA_LEN(pkt) = len; 1762 } else { 1763 rte_iova_t buf_iova; 1764 struct rte_mbuf_ext_shared_info *shinfo; 1765 uint16_t buf_len = strd_cnt * strd_sz; 1766 void *buf_addr; 1767 1768 /* Increment the refcnt of the whole chunk. */ 1769 rte_atomic16_add_return(&buf->refcnt, 1); 1770 MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <= 1771 strd_n + 1); 1772 buf_addr = RTE_PTR_SUB(addr, headroom_sz); 1773 /* 1774 * MLX5 device doesn't use iova but it is necessary in a 1775 * case where the Rx packet is transmitted via a 1776 * different PMD. 1777 */ 1778 buf_iova = rte_mempool_virt2iova(buf) + 1779 RTE_PTR_DIFF(buf_addr, buf); 1780 shinfo = &buf->shinfos[strd_idx]; 1781 rte_mbuf_ext_refcnt_set(shinfo, 1); 1782 /* 1783 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when 1784 * attaching the stride to mbuf and more offload flags 1785 * will be added below by calling rxq_cq_to_mbuf(). 1786 * Other fields will be overwritten. 1787 */ 1788 rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, 1789 buf_len, shinfo); 1790 /* Set mbuf head-room. */ 1791 pkt->data_off = headroom_sz; 1792 MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF); 1793 /* 1794 * Prevent potential overflow due to MTU change through 1795 * kernel interface. 1796 */ 1797 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1798 rte_pktmbuf_free_seg(pkt); 1799 ++rxq->stats.idropped; 1800 continue; 1801 } 1802 DATA_LEN(pkt) = len; 1803 /* 1804 * LRO packet may consume all the stride memory, in this 1805 * case packet head-room space is not guaranteed so must 1806 * to add an empty mbuf for the head-room. 1807 */ 1808 if (!rxq->strd_headroom_en) { 1809 struct rte_mbuf *headroom_mbuf = 1810 rte_pktmbuf_alloc(rxq->mp); 1811 1812 if (unlikely(headroom_mbuf == NULL)) { 1813 rte_pktmbuf_free_seg(pkt); 1814 ++rxq->stats.rx_nombuf; 1815 break; 1816 } 1817 PORT(pkt) = rxq->port_id; 1818 NEXT(headroom_mbuf) = pkt; 1819 pkt = headroom_mbuf; 1820 NB_SEGS(pkt) = 2; 1821 } 1822 } 1823 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1824 if (lro_num_seg > 1) { 1825 mlx5_lro_update_hdr(addr, cqe, len); 1826 pkt->ol_flags |= PKT_RX_LRO; 1827 pkt->tso_segsz = strd_sz; 1828 } 1829 PKT_LEN(pkt) = len; 1830 PORT(pkt) = rxq->port_id; 1831 #ifdef MLX5_PMD_SOFT_COUNTERS 1832 /* Increment bytes counter. */ 1833 rxq->stats.ibytes += PKT_LEN(pkt); 1834 #endif 1835 /* Return packet. */ 1836 *(pkts++) = pkt; 1837 ++i; 1838 } 1839 /* Update the consumer indexes. 
*/ 1840 rxq->consumed_strd = consumed_strd; 1841 rte_cio_wmb(); 1842 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1843 if (rq_ci != rxq->rq_ci) { 1844 rxq->rq_ci = rq_ci; 1845 rte_cio_wmb(); 1846 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1847 } 1848 #ifdef MLX5_PMD_SOFT_COUNTERS 1849 /* Increment packets counter. */ 1850 rxq->stats.ipackets += i; 1851 #endif 1852 return i; 1853 } 1854 1855 /** 1856 * Dummy DPDK callback for TX. 1857 * 1858 * This function is used to temporarily replace the real callback during 1859 * unsafe control operations on the queue, or in case of error. 1860 * 1861 * @param dpdk_txq 1862 * Generic pointer to TX queue structure. 1863 * @param[in] pkts 1864 * Packets to transmit. 1865 * @param pkts_n 1866 * Number of packets in array. 1867 * 1868 * @return 1869 * Number of packets successfully transmitted (<= pkts_n). 1870 */ 1871 uint16_t 1872 removed_tx_burst(void *dpdk_txq __rte_unused, 1873 struct rte_mbuf **pkts __rte_unused, 1874 uint16_t pkts_n __rte_unused) 1875 { 1876 rte_mb(); 1877 return 0; 1878 } 1879 1880 /** 1881 * Dummy DPDK callback for RX. 1882 * 1883 * This function is used to temporarily replace the real callback during 1884 * unsafe control operations on the queue, or in case of error. 1885 * 1886 * @param dpdk_rxq 1887 * Generic pointer to RX queue structure. 1888 * @param[out] pkts 1889 * Array to store received packets. 1890 * @param pkts_n 1891 * Maximum number of packets in array. 1892 * 1893 * @return 1894 * Number of packets successfully received (<= pkts_n). 1895 */ 1896 uint16_t 1897 removed_rx_burst(void *dpdk_txq __rte_unused, 1898 struct rte_mbuf **pkts __rte_unused, 1899 uint16_t pkts_n __rte_unused) 1900 { 1901 rte_mb(); 1902 return 0; 1903 } 1904 1905 /* 1906 * Vectorized Rx/Tx routines are not compiled in when required vector 1907 * instructions are not supported on a target architecture. The following null 1908 * stubs are needed for linkage when those are not included outside of this file 1909 * (e.g. mlx5_rxtx_vec_sse.c for x86). 1910 */ 1911 1912 __rte_weak uint16_t 1913 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, 1914 struct rte_mbuf **pkts __rte_unused, 1915 uint16_t pkts_n __rte_unused) 1916 { 1917 return 0; 1918 } 1919 1920 __rte_weak int 1921 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) 1922 { 1923 return -ENOTSUP; 1924 } 1925 1926 __rte_weak int 1927 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) 1928 { 1929 return -ENOTSUP; 1930 } 1931 1932 /** 1933 * Free the mbufs from the linear array of pointers. 1934 * 1935 * @param pkts 1936 * Pointer to array of packets to be free. 1937 * @param pkts_n 1938 * Number of packets to be freed. 1939 * @param olx 1940 * Configured Tx offloads mask. It is fully defined at 1941 * compile time and may be used for optimization. 1942 */ 1943 static __rte_always_inline void 1944 mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, 1945 unsigned int pkts_n, 1946 unsigned int olx __rte_unused) 1947 { 1948 struct rte_mempool *pool = NULL; 1949 struct rte_mbuf **p_free = NULL; 1950 struct rte_mbuf *mbuf; 1951 unsigned int n_free = 0; 1952 1953 /* 1954 * The implemented algorithm eliminates 1955 * copying pointers to temporary array 1956 * for rte_mempool_put_bulk() calls. 1957 */ 1958 MLX5_ASSERT(pkts); 1959 MLX5_ASSERT(pkts_n); 1960 for (;;) { 1961 for (;;) { 1962 /* 1963 * Decrement mbuf reference counter, detach 1964 * indirect and external buffers if needed. 
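 * (Illustrative behaviour for a hypothetical input: with pkts laid out as
 * {A0, A1, A2, B0, B1}, where A and B denote two different mempools, the
 * scan below ends up issuing two bulk releases,
 * rte_mempool_put_bulk(pool_A, ..., 3) followed by
 * rte_mempool_put_bulk(pool_B, ..., 2), without copying the pointers into
 * a temporary array.)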
1965 */ 1966 mbuf = rte_pktmbuf_prefree_seg(*pkts); 1967 if (likely(mbuf != NULL)) { 1968 MLX5_ASSERT(mbuf == *pkts); 1969 if (likely(n_free != 0)) { 1970 if (unlikely(pool != mbuf->pool)) 1971 /* From different pool. */ 1972 break; 1973 } else { 1974 /* Start new scan array. */ 1975 pool = mbuf->pool; 1976 p_free = pkts; 1977 } 1978 ++n_free; 1979 ++pkts; 1980 --pkts_n; 1981 if (unlikely(pkts_n == 0)) { 1982 mbuf = NULL; 1983 break; 1984 } 1985 } else { 1986 /* 1987 * This happens if mbuf is still referenced. 1988 * We can't put it back to the pool, skip. 1989 */ 1990 ++pkts; 1991 --pkts_n; 1992 if (unlikely(n_free != 0)) 1993 /* There is some array to free.*/ 1994 break; 1995 if (unlikely(pkts_n == 0)) 1996 /* Last mbuf, nothing to free. */ 1997 return; 1998 } 1999 } 2000 for (;;) { 2001 /* 2002 * This loop is implemented to avoid multiple 2003 * inlining of rte_mempool_put_bulk(). 2004 */ 2005 MLX5_ASSERT(pool); 2006 MLX5_ASSERT(p_free); 2007 MLX5_ASSERT(n_free); 2008 /* 2009 * Free the array of pre-freed mbufs 2010 * belonging to the same memory pool. 2011 */ 2012 rte_mempool_put_bulk(pool, (void *)p_free, n_free); 2013 if (unlikely(mbuf != NULL)) { 2014 /* There is the request to start new scan. */ 2015 pool = mbuf->pool; 2016 p_free = pkts++; 2017 n_free = 1; 2018 --pkts_n; 2019 if (likely(pkts_n != 0)) 2020 break; 2021 /* 2022 * This is the last mbuf to be freed. 2023 * Do one more loop iteration to complete. 2024 * This is rare case of the last unique mbuf. 2025 */ 2026 mbuf = NULL; 2027 continue; 2028 } 2029 if (likely(pkts_n == 0)) 2030 return; 2031 n_free = 0; 2032 break; 2033 } 2034 } 2035 } 2036 2037 /** 2038 * Free the mbuf from the elts ring buffer till new tail. 2039 * 2040 * @param txq 2041 * Pointer to Tx queue structure. 2042 * @param tail 2043 * Index in elts to free up to, becomes new elts tail. 2044 * @param olx 2045 * Configured Tx offloads mask. It is fully defined at 2046 * compile time and may be used for optimization. 2047 */ 2048 static __rte_always_inline void 2049 mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, 2050 uint16_t tail, 2051 unsigned int olx __rte_unused) 2052 { 2053 uint16_t n_elts = tail - txq->elts_tail; 2054 2055 MLX5_ASSERT(n_elts); 2056 MLX5_ASSERT(n_elts <= txq->elts_s); 2057 /* 2058 * Implement a loop to support ring buffer wraparound 2059 * with single inlining of mlx5_tx_free_mbuf(). 2060 */ 2061 do { 2062 unsigned int part; 2063 2064 part = txq->elts_s - (txq->elts_tail & txq->elts_m); 2065 part = RTE_MIN(part, n_elts); 2066 MLX5_ASSERT(part); 2067 MLX5_ASSERT(part <= txq->elts_s); 2068 mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m], 2069 part, olx); 2070 txq->elts_tail += part; 2071 n_elts -= part; 2072 } while (n_elts); 2073 } 2074 2075 /** 2076 * Store the mbuf being sent into elts ring buffer. 2077 * On Tx completion these mbufs will be freed. 2078 * 2079 * @param txq 2080 * Pointer to Tx queue structure. 2081 * @param pkts 2082 * Pointer to array of packets to be stored. 2083 * @param pkts_n 2084 * Number of packets to be stored. 2085 * @param olx 2086 * Configured Tx offloads mask. It is fully defined at 2087 * compile time and may be used for optimization. 
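 *
 * @note This states the caller contract as implied by the code below:
 *   the caller must guarantee that pkts_n does not exceed the free
 *   space in the elts ring, since only the wraparound of the copy is
 *   handled here, not ring overflow.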
2088 */ 2089 static __rte_always_inline void 2090 mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, 2091 struct rte_mbuf **restrict pkts, 2092 unsigned int pkts_n, 2093 unsigned int olx __rte_unused) 2094 { 2095 unsigned int part; 2096 struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts; 2097 2098 MLX5_ASSERT(pkts); 2099 MLX5_ASSERT(pkts_n); 2100 part = txq->elts_s - (txq->elts_head & txq->elts_m); 2101 MLX5_ASSERT(part); 2102 MLX5_ASSERT(part <= txq->elts_s); 2103 /* This code is a good candidate for vectorizing with SIMD. */ 2104 rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)), 2105 (void *)pkts, 2106 RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *)); 2107 txq->elts_head += pkts_n; 2108 if (unlikely(part < pkts_n)) 2109 /* The copy is wrapping around the elts array. */ 2110 rte_memcpy((void *)elts, (void *)(pkts + part), 2111 (pkts_n - part) * sizeof(struct rte_mbuf *)); 2112 } 2113 2114 /** 2115 * Update completion queue consuming index via doorbell 2116 * and flush the completed data buffers. 2117 * 2118 * @param txq 2119 * Pointer to TX queue structure. 2120 * @param valid CQE pointer 2121 * if not NULL update txq->wqe_pi and flush the buffers 2122 * @param olx 2123 * Configured Tx offloads mask. It is fully defined at 2124 * compile time and may be used for optimization. 2125 */ 2126 static __rte_always_inline void 2127 mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, 2128 volatile struct mlx5_cqe *last_cqe, 2129 unsigned int olx __rte_unused) 2130 { 2131 if (likely(last_cqe != NULL)) { 2132 uint16_t tail; 2133 2134 txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter); 2135 tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m]; 2136 if (likely(tail != txq->elts_tail)) { 2137 mlx5_tx_free_elts(txq, tail, olx); 2138 MLX5_ASSERT(tail == txq->elts_tail); 2139 } 2140 } 2141 } 2142 2143 /** 2144 * Manage TX completions. This routine checks the CQ for 2145 * arrived CQEs, deduces the last accomplished WQE in SQ, 2146 * updates SQ producing index and frees all completed mbufs. 2147 * 2148 * @param txq 2149 * Pointer to TX queue structure. 2150 * @param olx 2151 * Configured Tx offloads mask. It is fully defined at 2152 * compile time and may be used for optimization. 2153 * 2154 * NOTE: not inlined intentionally, it makes tx_burst 2155 * routine smaller, simple and faster - from experiments. 2156 */ 2157 static void 2158 mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, 2159 unsigned int olx __rte_unused) 2160 { 2161 unsigned int count = MLX5_TX_COMP_MAX_CQE; 2162 volatile struct mlx5_cqe *last_cqe = NULL; 2163 bool ring_doorbell = false; 2164 int ret; 2165 2166 static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); 2167 static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value"); 2168 do { 2169 volatile struct mlx5_cqe *cqe; 2170 2171 cqe = &txq->cqes[txq->cq_ci & txq->cqe_m]; 2172 ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci); 2173 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 2174 if (likely(ret != MLX5_CQE_STATUS_ERR)) { 2175 /* No new CQEs in completion queue. */ 2176 MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN); 2177 break; 2178 } 2179 /* 2180 * Some error occurred, try to restart. 2181 * We have no barrier after WQE related Doorbell 2182 * written, make sure all writes are completed 2183 * here, before we might perform SQ reset. 
2184 */ 2185 rte_wmb(); 2186 ret = mlx5_tx_error_cqe_handle 2187 (txq, (volatile struct mlx5_err_cqe *)cqe); 2188 if (unlikely(ret < 0)) { 2189 /* 2190 * Some error occurred on queue error 2191 * handling, we do not advance the index 2192 * here, allowing to retry on next call. 2193 */ 2194 return; 2195 } 2196 /* 2197 * We are going to fetch all entries with 2198 * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status. 2199 * The send queue is supposed to be empty. 2200 */ 2201 ring_doorbell = true; 2202 ++txq->cq_ci; 2203 txq->cq_pi = txq->cq_ci; 2204 last_cqe = NULL; 2205 continue; 2206 } 2207 /* Normal transmit completion. */ 2208 MLX5_ASSERT(txq->cq_ci != txq->cq_pi); 2209 MLX5_ASSERT((txq->fcqs[txq->cq_ci & txq->cqe_m] >> 16) == 2210 cqe->wqe_counter); 2211 ring_doorbell = true; 2212 ++txq->cq_ci; 2213 last_cqe = cqe; 2214 /* 2215 * We have to restrict the amount of processed CQEs 2216 * in one tx_burst routine call. The CQ may be large 2217 * and many CQEs may be updated by the NIC in one 2218 * transaction. Buffers freeing is time consuming, 2219 * multiple iterations may introduce significant 2220 * latency. 2221 */ 2222 if (likely(--count == 0)) 2223 break; 2224 } while (true); 2225 if (likely(ring_doorbell)) { 2226 /* Ring doorbell to notify hardware. */ 2227 rte_compiler_barrier(); 2228 *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci); 2229 mlx5_tx_comp_flush(txq, last_cqe, olx); 2230 } 2231 } 2232 2233 /** 2234 * Check if the completion request flag should be set in the last WQE. 2235 * Both pushed mbufs and WQEs are monitored and the completion request 2236 * flag is set if any of thresholds is reached. 2237 * 2238 * @param txq 2239 * Pointer to TX queue structure. 2240 * @param loc 2241 * Pointer to burst routine local context. 2242 * @param olx 2243 * Configured Tx offloads mask. It is fully defined at 2244 * compile time and may be used for optimization. 2245 */ 2246 static __rte_always_inline void 2247 mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, 2248 struct mlx5_txq_local *restrict loc, 2249 unsigned int olx) 2250 { 2251 uint16_t head = txq->elts_head; 2252 unsigned int part; 2253 2254 part = MLX5_TXOFF_CONFIG(INLINE) ? 2255 0 : loc->pkts_sent - loc->pkts_copy; 2256 head += part; 2257 if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH || 2258 (MLX5_TXOFF_CONFIG(INLINE) && 2259 (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) { 2260 volatile struct mlx5_wqe *last = loc->wqe_last; 2261 2262 MLX5_ASSERT(last); 2263 txq->elts_comp = head; 2264 if (MLX5_TXOFF_CONFIG(INLINE)) 2265 txq->wqe_comp = txq->wqe_ci; 2266 /* Request unconditional completion on last WQE. */ 2267 last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS << 2268 MLX5_COMP_MODE_OFFSET); 2269 /* Save elts_head in dedicated free on completion queue. */ 2270 #ifdef RTE_LIBRTE_MLX5_DEBUG 2271 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head | 2272 (last->cseg.opcode >> 8) << 16; 2273 #else 2274 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head; 2275 #endif 2276 /* A CQE slot must always be available. */ 2277 MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s); 2278 } 2279 } 2280 2281 /** 2282 * DPDK callback to check the status of a tx descriptor. 2283 * 2284 * @param tx_queue 2285 * The tx queue. 2286 * @param[in] offset 2287 * The index of the descriptor in the ring. 2288 * 2289 * @return 2290 * The status of the tx descriptor. 
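 *
 * A minimal usage sketch through the generic ethdev layer, which
 * dispatches to this callback; port_id, queue_id, offset and
 * handle_done() are application-side placeholders:
 *
 *   if (rte_eth_tx_descriptor_status(port_id, queue_id, offset) ==
 *       RTE_ETH_TX_DESC_DONE)
 *       handle_done();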
2291 */ 2292 int 2293 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) 2294 { 2295 struct mlx5_txq_data *restrict txq = tx_queue; 2296 uint16_t used; 2297 2298 mlx5_tx_handle_completion(txq, 0); 2299 used = txq->elts_head - txq->elts_tail; 2300 if (offset < used) 2301 return RTE_ETH_TX_DESC_FULL; 2302 return RTE_ETH_TX_DESC_DONE; 2303 } 2304 2305 /** 2306 * Build the Control Segment with specified opcode: 2307 * - MLX5_OPCODE_SEND 2308 * - MLX5_OPCODE_ENHANCED_MPSW 2309 * - MLX5_OPCODE_TSO 2310 * 2311 * @param txq 2312 * Pointer to TX queue structure. 2313 * @param loc 2314 * Pointer to burst routine local context. 2315 * @param wqe 2316 * Pointer to WQE to fill with built Control Segment. 2317 * @param ds 2318 * Supposed length of WQE in segments. 2319 * @param opcode 2320 * SQ WQE opcode to put into Control Segment. 2321 * @param olx 2322 * Configured Tx offloads mask. It is fully defined at 2323 * compile time and may be used for optimization. 2324 */ 2325 static __rte_always_inline void 2326 mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, 2327 struct mlx5_txq_local *restrict loc __rte_unused, 2328 struct mlx5_wqe *restrict wqe, 2329 unsigned int ds, 2330 unsigned int opcode, 2331 unsigned int olx __rte_unused) 2332 { 2333 struct mlx5_wqe_cseg *restrict cs = &wqe->cseg; 2334 2335 /* For legacy MPW replace the EMPW by TSO with modifier. */ 2336 if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW) 2337 opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24; 2338 cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode); 2339 cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 2340 cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << 2341 MLX5_COMP_MODE_OFFSET); 2342 cs->misc = RTE_BE32(0); 2343 } 2344 2345 /** 2346 * Build the Ethernet Segment without inlined data. 2347 * Supports Software Parser, Checksums and VLAN 2348 * insertion Tx offload features. 2349 * 2350 * @param txq 2351 * Pointer to TX queue structure. 2352 * @param loc 2353 * Pointer to burst routine local context. 2354 * @param wqe 2355 * Pointer to WQE to fill with built Ethernet Segment. 2356 * @param olx 2357 * Configured Tx offloads mask. It is fully defined at 2358 * compile time and may be used for optimization. 2359 */ 2360 static __rte_always_inline void 2361 mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused, 2362 struct mlx5_txq_local *restrict loc, 2363 struct mlx5_wqe *restrict wqe, 2364 unsigned int olx) 2365 { 2366 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2367 uint32_t csum; 2368 2369 /* 2370 * Calculate and set check sum flags first, dword field 2371 * in segment may be shared with Software Parser flags. 2372 */ 2373 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2374 es->flags = rte_cpu_to_le_32(csum); 2375 /* 2376 * Calculate and set Software Parser offsets and flags. 2377 * These flags a set for custom UDP and IP tunnel packets. 2378 */ 2379 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2380 /* Fill metadata field if needed. */ 2381 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2382 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2383 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2384 /* Engage VLAN tag insertion feature if requested. */ 2385 if (MLX5_TXOFF_CONFIG(VLAN) && 2386 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 2387 /* 2388 * We should get here only if device support 2389 * this feature correctly. 
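 * (i.e. the Tx queue was configured with hardware VLAN insertion
 * enabled, which the txq->vlan_en assertion below verifies).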
2390 */ 2391 MLX5_ASSERT(txq->vlan_en); 2392 es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT | 2393 loc->mbuf->vlan_tci); 2394 } else { 2395 es->inline_hdr = RTE_BE32(0); 2396 } 2397 } 2398 2399 /** 2400 * Build the Ethernet Segment with minimal inlined data 2401 * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is 2402 * used to fill the gap in single WQEBB WQEs. 2403 * Supports Software Parser, Checksums and VLAN 2404 * insertion Tx offload features. 2405 * 2406 * @param txq 2407 * Pointer to TX queue structure. 2408 * @param loc 2409 * Pointer to burst routine local context. 2410 * @param wqe 2411 * Pointer to WQE to fill with built Ethernet Segment. 2412 * @param vlan 2413 * Length of VLAN tag insertion if any. 2414 * @param olx 2415 * Configured Tx offloads mask. It is fully defined at 2416 * compile time and may be used for optimization. 2417 */ 2418 static __rte_always_inline void 2419 mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused, 2420 struct mlx5_txq_local *restrict loc, 2421 struct mlx5_wqe *restrict wqe, 2422 unsigned int vlan, 2423 unsigned int olx) 2424 { 2425 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2426 uint32_t csum; 2427 uint8_t *psrc, *pdst; 2428 2429 /* 2430 * Calculate and set check sum flags first, dword field 2431 * in segment may be shared with Software Parser flags. 2432 */ 2433 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2434 es->flags = rte_cpu_to_le_32(csum); 2435 /* 2436 * Calculate and set Software Parser offsets and flags. 2437 * These flags a set for custom UDP and IP tunnel packets. 2438 */ 2439 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2440 /* Fill metadata field if needed. */ 2441 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2442 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2443 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2444 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2445 (sizeof(uint16_t) + 2446 sizeof(rte_v128u32_t)), 2447 "invalid Ethernet Segment data size"); 2448 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2449 (sizeof(uint16_t) + 2450 sizeof(struct rte_vlan_hdr) + 2451 2 * RTE_ETHER_ADDR_LEN), 2452 "invalid Ethernet Segment data size"); 2453 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2454 es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE); 2455 es->inline_data = *(unaligned_uint16_t *)psrc; 2456 psrc += sizeof(uint16_t); 2457 pdst = (uint8_t *)(es + 1); 2458 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2459 /* Implement VLAN tag insertion as part inline data. */ 2460 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2461 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2462 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2463 /* Insert VLAN ethertype + VLAN tag. */ 2464 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2465 ((RTE_ETHER_TYPE_VLAN << 16) | 2466 loc->mbuf->vlan_tci); 2467 pdst += sizeof(struct rte_vlan_hdr); 2468 /* Copy the rest two bytes from packet data. */ 2469 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2470 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2471 } else { 2472 /* Fill the gap in the title WQEBB with inline data. */ 2473 rte_mov16(pdst, psrc); 2474 } 2475 } 2476 2477 /** 2478 * Build the Ethernet Segment with entire packet 2479 * data inlining. Checks the boundary of WQEBB and 2480 * ring buffer wrapping, supports Software Parser, 2481 * Checksums and VLAN insertion Tx offload features. 2482 * 2483 * @param txq 2484 * Pointer to TX queue structure. 
2485 * @param loc 2486 * Pointer to burst routine local context. 2487 * @param wqe 2488 * Pointer to WQE to fill with built Ethernet Segment. 2489 * @param vlan 2490 * Length of VLAN tag insertion if any. 2491 * @param inlen 2492 * Length of data to inline (VLAN included, if any). 2493 * @param tso 2494 * TSO flag, set mss field from the packet. 2495 * @param olx 2496 * Configured Tx offloads mask. It is fully defined at 2497 * compile time and may be used for optimization. 2498 * 2499 * @return 2500 * Pointer to the next Data Segment (aligned and wrapped around). 2501 */ 2502 static __rte_always_inline struct mlx5_wqe_dseg * 2503 mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, 2504 struct mlx5_txq_local *restrict loc, 2505 struct mlx5_wqe *restrict wqe, 2506 unsigned int vlan, 2507 unsigned int inlen, 2508 unsigned int tso, 2509 unsigned int olx) 2510 { 2511 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2512 uint32_t csum; 2513 uint8_t *psrc, *pdst; 2514 unsigned int part; 2515 2516 /* 2517 * Calculate and set check sum flags first, dword field 2518 * in segment may be shared with Software Parser flags. 2519 */ 2520 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2521 if (tso) { 2522 csum <<= 24; 2523 csum |= loc->mbuf->tso_segsz; 2524 es->flags = rte_cpu_to_be_32(csum); 2525 } else { 2526 es->flags = rte_cpu_to_le_32(csum); 2527 } 2528 /* 2529 * Calculate and set Software Parser offsets and flags. 2530 * These flags a set for custom UDP and IP tunnel packets. 2531 */ 2532 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2533 /* Fill metadata field if needed. */ 2534 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2535 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2536 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2537 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2538 (sizeof(uint16_t) + 2539 sizeof(rte_v128u32_t)), 2540 "invalid Ethernet Segment data size"); 2541 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2542 (sizeof(uint16_t) + 2543 sizeof(struct rte_vlan_hdr) + 2544 2 * RTE_ETHER_ADDR_LEN), 2545 "invalid Ethernet Segment data size"); 2546 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2547 es->inline_hdr_sz = rte_cpu_to_be_16(inlen); 2548 es->inline_data = *(unaligned_uint16_t *)psrc; 2549 psrc += sizeof(uint16_t); 2550 pdst = (uint8_t *)(es + 1); 2551 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2552 /* Implement VLAN tag insertion as part inline data. */ 2553 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2554 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2555 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2556 /* Insert VLAN ethertype + VLAN tag. */ 2557 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2558 ((RTE_ETHER_TYPE_VLAN << 16) | 2559 loc->mbuf->vlan_tci); 2560 pdst += sizeof(struct rte_vlan_hdr); 2561 /* Copy the rest two bytes from packet data. */ 2562 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2563 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2564 psrc += sizeof(uint16_t); 2565 } else { 2566 /* Fill the gap in the title WQEBB with inline data. */ 2567 rte_mov16(pdst, psrc); 2568 psrc += sizeof(rte_v128u32_t); 2569 } 2570 pdst = (uint8_t *)(es + 2); 2571 MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2572 MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end); 2573 inlen -= MLX5_ESEG_MIN_INLINE_SIZE; 2574 if (!inlen) { 2575 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 2576 return (struct mlx5_wqe_dseg *)pdst; 2577 } 2578 /* 2579 * The WQEBB space availability is checked by caller. 
2580 * Here we should be aware of WQE ring buffer wraparound only. 2581 */ 2582 part = (uint8_t *)txq->wqes_end - pdst; 2583 part = RTE_MIN(part, inlen); 2584 do { 2585 rte_memcpy(pdst, psrc, part); 2586 inlen -= part; 2587 if (likely(!inlen)) { 2588 /* 2589 * If return value is not used by the caller 2590 * the code below will be optimized out. 2591 */ 2592 pdst += part; 2593 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2594 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 2595 pdst = (uint8_t *)txq->wqes; 2596 return (struct mlx5_wqe_dseg *)pdst; 2597 } 2598 pdst = (uint8_t *)txq->wqes; 2599 psrc += part; 2600 part = inlen; 2601 } while (true); 2602 } 2603 2604 /** 2605 * Copy data from chain of mbuf to the specified linear buffer. 2606 * Checksums and VLAN insertion Tx offload features. If data 2607 * from some mbuf copied completely this mbuf is freed. Local 2608 * structure is used to keep the byte stream state. 2609 * 2610 * @param pdst 2611 * Pointer to the destination linear buffer. 2612 * @param loc 2613 * Pointer to burst routine local context. 2614 * @param len 2615 * Length of data to be copied. 2616 * @param must 2617 * Length of data to be copied ignoring no inline hint. 2618 * @param olx 2619 * Configured Tx offloads mask. It is fully defined at 2620 * compile time and may be used for optimization. 2621 * 2622 * @return 2623 * Number of actual copied data bytes. This is always greater than or 2624 * equal to must parameter and might be lesser than len in no inline 2625 * hint flag is encountered. 2626 */ 2627 static __rte_always_inline unsigned int 2628 mlx5_tx_mseg_memcpy(uint8_t *pdst, 2629 struct mlx5_txq_local *restrict loc, 2630 unsigned int len, 2631 unsigned int must, 2632 unsigned int olx __rte_unused) 2633 { 2634 struct rte_mbuf *mbuf; 2635 unsigned int part, dlen, copy = 0; 2636 uint8_t *psrc; 2637 2638 MLX5_ASSERT(len); 2639 MLX5_ASSERT(must <= len); 2640 do { 2641 /* Allow zero length packets, must check first. */ 2642 dlen = rte_pktmbuf_data_len(loc->mbuf); 2643 if (dlen <= loc->mbuf_off) { 2644 /* Exhausted packet, just free. */ 2645 mbuf = loc->mbuf; 2646 loc->mbuf = mbuf->next; 2647 rte_pktmbuf_free_seg(mbuf); 2648 loc->mbuf_off = 0; 2649 MLX5_ASSERT(loc->mbuf_nseg > 1); 2650 MLX5_ASSERT(loc->mbuf); 2651 --loc->mbuf_nseg; 2652 if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) { 2653 unsigned int diff; 2654 2655 if (copy >= must) { 2656 /* 2657 * We already copied the minimal 2658 * requested amount of data. 2659 */ 2660 return copy; 2661 } 2662 diff = must - copy; 2663 if (diff <= rte_pktmbuf_data_len(loc->mbuf)) { 2664 /* 2665 * Copy only the minimal required 2666 * part of the data buffer. 2667 */ 2668 len = diff; 2669 } 2670 } 2671 continue; 2672 } 2673 dlen -= loc->mbuf_off; 2674 psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 2675 loc->mbuf_off); 2676 part = RTE_MIN(len, dlen); 2677 rte_memcpy(pdst, psrc, part); 2678 copy += part; 2679 loc->mbuf_off += part; 2680 len -= part; 2681 if (!len) { 2682 if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) { 2683 loc->mbuf_off = 0; 2684 /* Exhausted packet, just free. */ 2685 mbuf = loc->mbuf; 2686 loc->mbuf = mbuf->next; 2687 rte_pktmbuf_free_seg(mbuf); 2688 loc->mbuf_off = 0; 2689 MLX5_ASSERT(loc->mbuf_nseg >= 1); 2690 --loc->mbuf_nseg; 2691 } 2692 return copy; 2693 } 2694 pdst += part; 2695 } while (true); 2696 } 2697 2698 /** 2699 * Build the Ethernet Segment with inlined data from 2700 * multi-segment packet. 
Checks the boundary of WQEBB 2701 * and ring buffer wrapping, supports Software Parser, 2702 * Checksums and VLAN insertion Tx offload features. 2703 * 2704 * @param txq 2705 * Pointer to TX queue structure. 2706 * @param loc 2707 * Pointer to burst routine local context. 2708 * @param wqe 2709 * Pointer to WQE to fill with built Ethernet Segment. 2710 * @param vlan 2711 * Length of VLAN tag insertion if any. 2712 * @param inlen 2713 * Length of data to inline (VLAN included, if any). 2714 * @param tso 2715 * TSO flag, set mss field from the packet. 2716 * @param olx 2717 * Configured Tx offloads mask. It is fully defined at 2718 * compile time and may be used for optimization. 2719 * 2720 * @return 2721 * Pointer to the next Data Segment (aligned and 2722 * possible NOT wrapped around - caller should do 2723 * wrapping check on its own). 2724 */ 2725 static __rte_always_inline struct mlx5_wqe_dseg * 2726 mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, 2727 struct mlx5_txq_local *restrict loc, 2728 struct mlx5_wqe *restrict wqe, 2729 unsigned int vlan, 2730 unsigned int inlen, 2731 unsigned int tso, 2732 unsigned int olx) 2733 { 2734 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2735 uint32_t csum; 2736 uint8_t *pdst; 2737 unsigned int part, tlen = 0; 2738 2739 /* 2740 * Calculate and set check sum flags first, uint32_t field 2741 * in segment may be shared with Software Parser flags. 2742 */ 2743 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2744 if (tso) { 2745 csum <<= 24; 2746 csum |= loc->mbuf->tso_segsz; 2747 es->flags = rte_cpu_to_be_32(csum); 2748 } else { 2749 es->flags = rte_cpu_to_le_32(csum); 2750 } 2751 /* 2752 * Calculate and set Software Parser offsets and flags. 2753 * These flags a set for custom UDP and IP tunnel packets. 2754 */ 2755 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2756 /* Fill metadata field if needed. */ 2757 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2758 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2759 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2760 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2761 (sizeof(uint16_t) + 2762 sizeof(rte_v128u32_t)), 2763 "invalid Ethernet Segment data size"); 2764 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2765 (sizeof(uint16_t) + 2766 sizeof(struct rte_vlan_hdr) + 2767 2 * RTE_ETHER_ADDR_LEN), 2768 "invalid Ethernet Segment data size"); 2769 MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2770 pdst = (uint8_t *)&es->inline_data; 2771 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2772 /* Implement VLAN tag insertion as part inline data. */ 2773 mlx5_tx_mseg_memcpy(pdst, loc, 2774 2 * RTE_ETHER_ADDR_LEN, 2775 2 * RTE_ETHER_ADDR_LEN, olx); 2776 pdst += 2 * RTE_ETHER_ADDR_LEN; 2777 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2778 ((RTE_ETHER_TYPE_VLAN << 16) | 2779 loc->mbuf->vlan_tci); 2780 pdst += sizeof(struct rte_vlan_hdr); 2781 tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr); 2782 } 2783 MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end); 2784 /* 2785 * The WQEBB space availability is checked by caller. 2786 * Here we should be aware of WQE ring buffer wraparound only. 2787 */ 2788 part = (uint8_t *)txq->wqes_end - pdst; 2789 part = RTE_MIN(part, inlen - tlen); 2790 MLX5_ASSERT(part); 2791 do { 2792 unsigned int copy; 2793 2794 /* 2795 * Copying may be interrupted inside the routine 2796 * if run into no inline hint flag. 2797 */ 2798 copy = tlen >= txq->inlen_mode ? 
0 : (txq->inlen_mode - tlen); 2799 copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx); 2800 tlen += copy; 2801 if (likely(inlen <= tlen) || copy < part) { 2802 es->inline_hdr_sz = rte_cpu_to_be_16(tlen); 2803 pdst += copy; 2804 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2805 return (struct mlx5_wqe_dseg *)pdst; 2806 } 2807 pdst = (uint8_t *)txq->wqes; 2808 part = inlen - tlen; 2809 } while (true); 2810 } 2811 2812 /** 2813 * Build the Data Segment of pointer type. 2814 * 2815 * @param txq 2816 * Pointer to TX queue structure. 2817 * @param loc 2818 * Pointer to burst routine local context. 2819 * @param dseg 2820 * Pointer to WQE to fill with built Data Segment. 2821 * @param buf 2822 * Data buffer to point. 2823 * @param len 2824 * Data buffer length. 2825 * @param olx 2826 * Configured Tx offloads mask. It is fully defined at 2827 * compile time and may be used for optimization. 2828 */ 2829 static __rte_always_inline void 2830 mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq, 2831 struct mlx5_txq_local *restrict loc, 2832 struct mlx5_wqe_dseg *restrict dseg, 2833 uint8_t *buf, 2834 unsigned int len, 2835 unsigned int olx __rte_unused) 2836 2837 { 2838 MLX5_ASSERT(len); 2839 dseg->bcount = rte_cpu_to_be_32(len); 2840 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2841 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2842 } 2843 2844 /** 2845 * Build the Data Segment of pointer type or inline 2846 * if data length is less than buffer in minimal 2847 * Data Segment size. 2848 * 2849 * @param txq 2850 * Pointer to TX queue structure. 2851 * @param loc 2852 * Pointer to burst routine local context. 2853 * @param dseg 2854 * Pointer to WQE to fill with built Data Segment. 2855 * @param buf 2856 * Data buffer to point. 2857 * @param len 2858 * Data buffer length. 2859 * @param olx 2860 * Configured Tx offloads mask. It is fully defined at 2861 * compile time and may be used for optimization. 2862 */ 2863 static __rte_always_inline void 2864 mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, 2865 struct mlx5_txq_local *restrict loc, 2866 struct mlx5_wqe_dseg *restrict dseg, 2867 uint8_t *buf, 2868 unsigned int len, 2869 unsigned int olx __rte_unused) 2870 2871 { 2872 uintptr_t dst, src; 2873 2874 MLX5_ASSERT(len); 2875 if (len > MLX5_DSEG_MIN_INLINE_SIZE) { 2876 dseg->bcount = rte_cpu_to_be_32(len); 2877 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2878 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2879 2880 return; 2881 } 2882 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2883 /* Unrolled implementation of generic rte_memcpy. 
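 * At most MLX5_DSEG_MIN_INLINE_SIZE bytes are copied here, so an
 * explicit unroll over the length bits (8/4/2/1) is expected to be
 * cheaper than a library call; RTE_ARCH_STRICT_ALIGN builds split
 * the 8-byte chunk into two 4-byte stores to keep the destination
 * accesses aligned.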
*/ 2884 dst = (uintptr_t)&dseg->inline_data[0]; 2885 src = (uintptr_t)buf; 2886 if (len & 0x08) { 2887 #ifdef RTE_ARCH_STRICT_ALIGN 2888 MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t))); 2889 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2890 dst += sizeof(uint32_t); 2891 src += sizeof(uint32_t); 2892 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2893 dst += sizeof(uint32_t); 2894 src += sizeof(uint32_t); 2895 #else 2896 *(uint64_t *)dst = *(unaligned_uint64_t *)src; 2897 dst += sizeof(uint64_t); 2898 src += sizeof(uint64_t); 2899 #endif 2900 } 2901 if (len & 0x04) { 2902 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2903 dst += sizeof(uint32_t); 2904 src += sizeof(uint32_t); 2905 } 2906 if (len & 0x02) { 2907 *(uint16_t *)dst = *(unaligned_uint16_t *)src; 2908 dst += sizeof(uint16_t); 2909 src += sizeof(uint16_t); 2910 } 2911 if (len & 0x01) 2912 *(uint8_t *)dst = *(uint8_t *)src; 2913 } 2914 2915 /** 2916 * Build the Data Segment of inlined data from single 2917 * segment packet, no VLAN insertion. 2918 * 2919 * @param txq 2920 * Pointer to TX queue structure. 2921 * @param loc 2922 * Pointer to burst routine local context. 2923 * @param dseg 2924 * Pointer to WQE to fill with built Data Segment. 2925 * @param buf 2926 * Data buffer to point. 2927 * @param len 2928 * Data buffer length. 2929 * @param olx 2930 * Configured Tx offloads mask. It is fully defined at 2931 * compile time and may be used for optimization. 2932 * 2933 * @return 2934 * Pointer to the next Data Segment after inlined data. 2935 * Ring buffer wraparound check is needed. We do not 2936 * do it here because it may not be needed for the 2937 * last packet in the eMPW session. 2938 */ 2939 static __rte_always_inline struct mlx5_wqe_dseg * 2940 mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, 2941 struct mlx5_txq_local *restrict loc __rte_unused, 2942 struct mlx5_wqe_dseg *restrict dseg, 2943 uint8_t *buf, 2944 unsigned int len, 2945 unsigned int olx __rte_unused) 2946 { 2947 unsigned int part; 2948 uint8_t *pdst; 2949 2950 if (!MLX5_TXOFF_CONFIG(MPW)) { 2951 /* Store the descriptor byte counter for eMPW sessions. */ 2952 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2953 pdst = &dseg->inline_data[0]; 2954 } else { 2955 /* The entire legacy MPW session counter is stored on close. */ 2956 pdst = (uint8_t *)dseg; 2957 } 2958 /* 2959 * The WQEBB space availability is checked by caller. 2960 * Here we should be aware of WQE ring buffer wraparound only. 2961 */ 2962 part = (uint8_t *)txq->wqes_end - pdst; 2963 part = RTE_MIN(part, len); 2964 do { 2965 rte_memcpy(pdst, buf, part); 2966 len -= part; 2967 if (likely(!len)) { 2968 pdst += part; 2969 if (!MLX5_TXOFF_CONFIG(MPW)) 2970 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2971 /* Note: no final wraparound check here. */ 2972 return (struct mlx5_wqe_dseg *)pdst; 2973 } 2974 pdst = (uint8_t *)txq->wqes; 2975 buf += part; 2976 part = len; 2977 } while (true); 2978 } 2979 2980 /** 2981 * Build the Data Segment of inlined data from single 2982 * segment packet with VLAN insertion. 2983 * 2984 * @param txq 2985 * Pointer to TX queue structure. 2986 * @param loc 2987 * Pointer to burst routine local context. 2988 * @param dseg 2989 * Pointer to the dseg fill with built Data Segment. 2990 * @param buf 2991 * Data buffer to point. 2992 * @param len 2993 * Data buffer length. 2994 * @param olx 2995 * Configured Tx offloads mask. It is fully defined at 2996 * compile time and may be used for optimization. 
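 * @note For enhanced MPW sessions the Data Segment byte count
 *   written below already accounts for the extra
 *   sizeof(struct rte_vlan_hdr) bytes of the tag being inserted.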
2997 * 2998 * @return 2999 * Pointer to the next Data Segment after inlined data. 3000 * Ring buffer wraparound check is needed. 3001 */ 3002 static __rte_always_inline struct mlx5_wqe_dseg * 3003 mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, 3004 struct mlx5_txq_local *restrict loc __rte_unused, 3005 struct mlx5_wqe_dseg *restrict dseg, 3006 uint8_t *buf, 3007 unsigned int len, 3008 unsigned int olx __rte_unused) 3009 3010 { 3011 unsigned int part; 3012 uint8_t *pdst; 3013 3014 MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE); 3015 static_assert(MLX5_DSEG_MIN_INLINE_SIZE == 3016 (2 * RTE_ETHER_ADDR_LEN), 3017 "invalid Data Segment data size"); 3018 if (!MLX5_TXOFF_CONFIG(MPW)) { 3019 /* Store the descriptor byte counter for eMPW sessions. */ 3020 dseg->bcount = rte_cpu_to_be_32 3021 ((len + sizeof(struct rte_vlan_hdr)) | 3022 MLX5_ETH_WQE_DATA_INLINE); 3023 pdst = &dseg->inline_data[0]; 3024 } else { 3025 /* The entire legacy MPW session counter is stored on close. */ 3026 pdst = (uint8_t *)dseg; 3027 } 3028 memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); 3029 buf += MLX5_DSEG_MIN_INLINE_SIZE; 3030 pdst += MLX5_DSEG_MIN_INLINE_SIZE; 3031 len -= MLX5_DSEG_MIN_INLINE_SIZE; 3032 /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */ 3033 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 3034 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 3035 pdst = (uint8_t *)txq->wqes; 3036 *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) | 3037 loc->mbuf->vlan_tci); 3038 pdst += sizeof(struct rte_vlan_hdr); 3039 /* 3040 * The WQEBB space availability is checked by caller. 3041 * Here we should be aware of WQE ring buffer wraparound only. 3042 */ 3043 part = (uint8_t *)txq->wqes_end - pdst; 3044 part = RTE_MIN(part, len); 3045 do { 3046 rte_memcpy(pdst, buf, part); 3047 len -= part; 3048 if (likely(!len)) { 3049 pdst += part; 3050 if (!MLX5_TXOFF_CONFIG(MPW)) 3051 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 3052 /* Note: no final wraparound check here. */ 3053 return (struct mlx5_wqe_dseg *)pdst; 3054 } 3055 pdst = (uint8_t *)txq->wqes; 3056 buf += part; 3057 part = len; 3058 } while (true); 3059 } 3060 3061 /** 3062 * Build the Ethernet Segment with optionally inlined data with 3063 * VLAN insertion and following Data Segments (if any) from 3064 * multi-segment packet. Used by ordinary send and TSO. 3065 * 3066 * @param txq 3067 * Pointer to TX queue structure. 3068 * @param loc 3069 * Pointer to burst routine local context. 3070 * @param wqe 3071 * Pointer to WQE to fill with built Ethernet/Data Segments. 3072 * @param vlan 3073 * Length of VLAN header to insert, 0 means no VLAN insertion. 3074 * @param inlen 3075 * Data length to inline. For TSO this parameter specifies 3076 * exact value, for ordinary send routine can be aligned by 3077 * caller to provide better WQE space saving and data buffer 3078 * start address alignment. This length includes VLAN header 3079 * being inserted. 3080 * @param tso 3081 * Zero means ordinary send, inlined data can be extended, 3082 * otherwise this is TSO, inlined data length is fixed. 3083 * @param olx 3084 * Configured Tx offloads mask. It is fully defined at 3085 * compile time and may be used for optimization. 3086 * 3087 * @return 3088 * Actual size of built WQE in segments. 
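 *   The value is expressed in WQE segments (WSEG units, 16 bytes
 *   each); the callers convert it to WQEBBs as (ds + 3) / 4 and
 *   store it into the Control Segment sq_ds field.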
3089 */ 3090 static __rte_always_inline unsigned int 3091 mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, 3092 struct mlx5_txq_local *restrict loc, 3093 struct mlx5_wqe *restrict wqe, 3094 unsigned int vlan, 3095 unsigned int inlen, 3096 unsigned int tso, 3097 unsigned int olx __rte_unused) 3098 { 3099 struct mlx5_wqe_dseg *restrict dseg; 3100 unsigned int ds; 3101 3102 MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen); 3103 loc->mbuf_nseg = NB_SEGS(loc->mbuf); 3104 loc->mbuf_off = 0; 3105 3106 dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx); 3107 if (!loc->mbuf_nseg) 3108 goto dseg_done; 3109 /* 3110 * There are still some mbuf remaining, not inlined. 3111 * The first mbuf may be partially inlined and we 3112 * must process the possible non-zero data offset. 3113 */ 3114 if (loc->mbuf_off) { 3115 unsigned int dlen; 3116 uint8_t *dptr; 3117 3118 /* 3119 * Exhausted packets must be dropped before. 3120 * Non-zero offset means there are some data 3121 * remained in the packet. 3122 */ 3123 MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf)); 3124 MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf)); 3125 dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 3126 loc->mbuf_off); 3127 dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off; 3128 /* 3129 * Build the pointer/minimal data Data Segment. 3130 * Do ring buffer wrapping check in advance. 3131 */ 3132 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3133 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3134 mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx); 3135 /* Store the mbuf to be freed on completion. */ 3136 MLX5_ASSERT(loc->elts_free); 3137 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3138 --loc->elts_free; 3139 ++dseg; 3140 if (--loc->mbuf_nseg == 0) 3141 goto dseg_done; 3142 loc->mbuf = loc->mbuf->next; 3143 loc->mbuf_off = 0; 3144 } 3145 do { 3146 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3147 struct rte_mbuf *mbuf; 3148 3149 /* Zero length segment found, just skip. */ 3150 mbuf = loc->mbuf; 3151 loc->mbuf = loc->mbuf->next; 3152 rte_pktmbuf_free_seg(mbuf); 3153 if (--loc->mbuf_nseg == 0) 3154 break; 3155 } else { 3156 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3157 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3158 mlx5_tx_dseg_iptr 3159 (txq, loc, dseg, 3160 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3161 rte_pktmbuf_data_len(loc->mbuf), olx); 3162 MLX5_ASSERT(loc->elts_free); 3163 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3164 --loc->elts_free; 3165 ++dseg; 3166 if (--loc->mbuf_nseg == 0) 3167 break; 3168 loc->mbuf = loc->mbuf->next; 3169 } 3170 } while (true); 3171 3172 dseg_done: 3173 /* Calculate actual segments used from the dseg pointer. */ 3174 if ((uintptr_t)wqe < (uintptr_t)dseg) 3175 ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE; 3176 else 3177 ds = (((uintptr_t)dseg - (uintptr_t)wqe) + 3178 txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE; 3179 return ds; 3180 } 3181 3182 /** 3183 * Tx one packet function for multi-segment TSO. Supports all 3184 * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs, 3185 * sends one packet per WQE. 3186 * 3187 * This routine is responsible for storing processed mbuf 3188 * into elts ring buffer and update elts_head. 3189 * 3190 * @param txq 3191 * Pointer to TX queue structure. 3192 * @param loc 3193 * Pointer to burst routine local context. 3194 * @param olx 3195 * Configured Tx offloads mask. It is fully defined at 3196 * compile time and may be used for optimization. 
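 * @note All TSO headers (l2_len + l3_len + l4_len, plus the outer
 *   headers for tunnel packets) are inlined into the Ethernet
 *   Segment; the remaining payload is attached with pointer/minimal
 *   inline Data Segments.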
3197 * 3198 * @return 3199 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3200 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3201 * Local context variables partially updated. 3202 */ 3203 static __rte_always_inline enum mlx5_txcmp_code 3204 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, 3205 struct mlx5_txq_local *restrict loc, 3206 unsigned int olx) 3207 { 3208 struct mlx5_wqe *restrict wqe; 3209 unsigned int ds, dlen, inlen, ntcp, vlan = 0; 3210 3211 /* 3212 * Calculate data length to be inlined to estimate 3213 * the required space in WQE ring buffer. 3214 */ 3215 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3216 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3217 vlan = sizeof(struct rte_vlan_hdr); 3218 inlen = loc->mbuf->l2_len + vlan + 3219 loc->mbuf->l3_len + loc->mbuf->l4_len; 3220 if (unlikely((!inlen || !loc->mbuf->tso_segsz))) 3221 return MLX5_TXCMP_CODE_ERROR; 3222 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3223 inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len; 3224 /* Packet must contain all TSO headers. */ 3225 if (unlikely(inlen > MLX5_MAX_TSO_HEADER || 3226 inlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3227 inlen > (dlen + vlan))) 3228 return MLX5_TXCMP_CODE_ERROR; 3229 MLX5_ASSERT(inlen >= txq->inlen_mode); 3230 /* 3231 * Check whether there are enough free WQEBBs: 3232 * - Control Segment 3233 * - Ethernet Segment 3234 * - First Segment of inlined Ethernet data 3235 * - ... data continued ... 3236 * - Data Segments of pointer/min inline type 3237 */ 3238 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3239 MLX5_ESEG_MIN_INLINE_SIZE + 3240 MLX5_WSEG_SIZE + 3241 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3242 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3243 return MLX5_TXCMP_CODE_EXIT; 3244 /* Check for maximal WQE size. */ 3245 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3246 return MLX5_TXCMP_CODE_ERROR; 3247 #ifdef MLX5_PMD_SOFT_COUNTERS 3248 /* Update sent data bytes/packets counters. */ 3249 ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) / 3250 loc->mbuf->tso_segsz; 3251 /* 3252 * One will be added for mbuf itself 3253 * at the end of the mlx5_tx_burst from 3254 * loc->pkts_sent field. 3255 */ 3256 --ntcp; 3257 txq->stats.opackets += ntcp; 3258 txq->stats.obytes += dlen + vlan + ntcp * inlen; 3259 #endif 3260 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3261 loc->wqe_last = wqe; 3262 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx); 3263 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx); 3264 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3265 txq->wqe_ci += (ds + 3) / 4; 3266 loc->wqe_free -= (ds + 3) / 4; 3267 return MLX5_TXCMP_CODE_MULTI; 3268 } 3269 3270 /** 3271 * Tx one packet function for multi-segment SEND. Supports all 3272 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3273 * sends one packet per WQE, without any data inlining in 3274 * Ethernet Segment. 3275 * 3276 * This routine is responsible for storing processed mbuf 3277 * into elts ring buffer and update elts_head. 3278 * 3279 * @param txq 3280 * Pointer to TX queue structure. 3281 * @param loc 3282 * Pointer to burst routine local context. 3283 * @param olx 3284 * Configured Tx offloads mask. It is fully defined at 3285 * compile time and may be used for optimization. 3286 * 3287 * @return 3288 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3289 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3290 * Local context variables partially updated. 
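 *   MLX5_TXCMP_CODE_MULTI - the WQE was built successfully and the
 *   burst loop may proceed with the next packet.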
3291 */ 3292 static __rte_always_inline enum mlx5_txcmp_code 3293 mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, 3294 struct mlx5_txq_local *restrict loc, 3295 unsigned int olx) 3296 { 3297 struct mlx5_wqe_dseg *restrict dseg; 3298 struct mlx5_wqe *restrict wqe; 3299 unsigned int ds, nseg; 3300 3301 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); 3302 /* 3303 * No inline at all, it means the CPU cycles saving 3304 * is prioritized at configuration, we should not 3305 * copy any packet data to WQE. 3306 */ 3307 nseg = NB_SEGS(loc->mbuf); 3308 ds = 2 + nseg; 3309 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3310 return MLX5_TXCMP_CODE_EXIT; 3311 /* Check for maximal WQE size. */ 3312 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3313 return MLX5_TXCMP_CODE_ERROR; 3314 /* 3315 * Some Tx offloads may cause an error if 3316 * packet is not long enough, check against 3317 * assumed minimal length. 3318 */ 3319 if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE) 3320 return MLX5_TXCMP_CODE_ERROR; 3321 #ifdef MLX5_PMD_SOFT_COUNTERS 3322 /* Update sent data bytes counter. */ 3323 txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf); 3324 if (MLX5_TXOFF_CONFIG(VLAN) && 3325 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3326 txq->stats.obytes += sizeof(struct rte_vlan_hdr); 3327 #endif 3328 /* 3329 * SEND WQE, one WQEBB: 3330 * - Control Segment, SEND opcode 3331 * - Ethernet Segment, optional VLAN, no inline 3332 * - Data Segments, pointer only type 3333 */ 3334 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3335 loc->wqe_last = wqe; 3336 mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx); 3337 mlx5_tx_eseg_none(txq, loc, wqe, olx); 3338 dseg = &wqe->dseg[0]; 3339 do { 3340 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3341 struct rte_mbuf *mbuf; 3342 3343 /* 3344 * Zero length segment found, have to 3345 * correct total size of WQE in segments. 3346 * It is supposed to be rare occasion, so 3347 * in normal case (no zero length segments) 3348 * we avoid extra writing to the Control 3349 * Segment. 3350 */ 3351 --ds; 3352 wqe->cseg.sq_ds -= RTE_BE32(1); 3353 mbuf = loc->mbuf; 3354 loc->mbuf = mbuf->next; 3355 rte_pktmbuf_free_seg(mbuf); 3356 if (--nseg == 0) 3357 break; 3358 } else { 3359 mlx5_tx_dseg_ptr 3360 (txq, loc, dseg, 3361 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3362 rte_pktmbuf_data_len(loc->mbuf), olx); 3363 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3364 --loc->elts_free; 3365 if (--nseg == 0) 3366 break; 3367 ++dseg; 3368 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3369 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3370 loc->mbuf = loc->mbuf->next; 3371 } 3372 } while (true); 3373 txq->wqe_ci += (ds + 3) / 4; 3374 loc->wqe_free -= (ds + 3) / 4; 3375 return MLX5_TXCMP_CODE_MULTI; 3376 } 3377 3378 /** 3379 * Tx one packet function for multi-segment SEND. Supports all 3380 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3381 * sends one packet per WQE, with data inlining in 3382 * Ethernet Segment and minimal Data Segments. 3383 * 3384 * This routine is responsible for storing processed mbuf 3385 * into elts ring buffer and update elts_head. 3386 * 3387 * @param txq 3388 * Pointer to TX queue structure. 3389 * @param loc 3390 * Pointer to burst routine local context. 3391 * @param olx 3392 * Configured Tx offloads mask. It is fully defined at 3393 * compile time and may be used for optimization. 3394 * 3395 * @return 3396 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 
3397 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3398 * Local context variables partially updated. 3399 */ 3400 static __rte_always_inline enum mlx5_txcmp_code 3401 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq, 3402 struct mlx5_txq_local *restrict loc, 3403 unsigned int olx) 3404 { 3405 struct mlx5_wqe *restrict wqe; 3406 unsigned int ds, inlen, dlen, vlan = 0; 3407 3408 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 3409 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); 3410 /* 3411 * First calculate data length to be inlined 3412 * to estimate the required space for WQE. 3413 */ 3414 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3415 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3416 vlan = sizeof(struct rte_vlan_hdr); 3417 inlen = dlen + vlan; 3418 /* Check against minimal length. */ 3419 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 3420 return MLX5_TXCMP_CODE_ERROR; 3421 MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); 3422 if (inlen > txq->inlen_send || 3423 loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) { 3424 struct rte_mbuf *mbuf; 3425 unsigned int nxlen; 3426 uintptr_t start; 3427 3428 /* 3429 * Packet length exceeds the allowed inline 3430 * data length, check whether the minimal 3431 * inlining is required. 3432 */ 3433 if (txq->inlen_mode) { 3434 MLX5_ASSERT(txq->inlen_mode >= 3435 MLX5_ESEG_MIN_INLINE_SIZE); 3436 MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send); 3437 inlen = txq->inlen_mode; 3438 } else { 3439 if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE || 3440 !vlan || txq->vlan_en) { 3441 /* 3442 * VLAN insertion will be done inside by HW. 3443 * It is not utmost effective - VLAN flag is 3444 * checked twice, but we should proceed the 3445 * inlining length correctly and take into 3446 * account the VLAN header being inserted. 3447 */ 3448 return mlx5_tx_packet_multi_send 3449 (txq, loc, olx); 3450 } 3451 inlen = MLX5_ESEG_MIN_INLINE_SIZE; 3452 } 3453 /* 3454 * Now we know the minimal amount of data is requested 3455 * to inline. Check whether we should inline the buffers 3456 * from the chain beginning to eliminate some mbufs. 3457 */ 3458 mbuf = loc->mbuf; 3459 nxlen = rte_pktmbuf_data_len(mbuf); 3460 if (unlikely(nxlen <= txq->inlen_send)) { 3461 /* We can inline first mbuf at least. */ 3462 if (nxlen < inlen) { 3463 unsigned int smlen; 3464 3465 /* Scan mbufs till inlen filled. */ 3466 do { 3467 smlen = nxlen; 3468 mbuf = NEXT(mbuf); 3469 MLX5_ASSERT(mbuf); 3470 nxlen = rte_pktmbuf_data_len(mbuf); 3471 nxlen += smlen; 3472 } while (unlikely(nxlen < inlen)); 3473 if (unlikely(nxlen > txq->inlen_send)) { 3474 /* We cannot inline entire mbuf. */ 3475 smlen = inlen - smlen; 3476 start = rte_pktmbuf_mtod_offset 3477 (mbuf, uintptr_t, smlen); 3478 goto do_align; 3479 } 3480 } 3481 do { 3482 inlen = nxlen; 3483 mbuf = NEXT(mbuf); 3484 /* There should be not end of packet. */ 3485 MLX5_ASSERT(mbuf); 3486 nxlen = inlen + rte_pktmbuf_data_len(mbuf); 3487 } while (unlikely(nxlen < txq->inlen_send)); 3488 } 3489 start = rte_pktmbuf_mtod(mbuf, uintptr_t); 3490 /* 3491 * Check whether we can do inline to align start 3492 * address of data buffer to cacheline. 3493 */ 3494 do_align: 3495 start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1); 3496 if (unlikely(start)) { 3497 start += inlen; 3498 if (start <= txq->inlen_send) 3499 inlen = start; 3500 } 3501 } 3502 /* 3503 * Check whether there are enough free WQEBBs: 3504 * - Control Segment 3505 * - Ethernet Segment 3506 * - First Segment of inlined Ethernet data 3507 * - ... data continued ... 
 * - Data Segments of pointer/min inline type
 *
 * Estimate the number of Data Segments conservatively,
 * assuming no mbufs are freed during inlining.
 */
	MLX5_ASSERT(inlen <= txq->inlen_send);
	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
				       MLX5_ESEG_MIN_INLINE_SIZE +
				       MLX5_WSEG_SIZE +
				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
		return MLX5_TXCMP_CODE_EXIT;
	/* Check for maximal WQE size. */
	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
		return MLX5_TXCMP_CODE_ERROR;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Update sent data bytes/packets counters. */
	txq->stats.obytes += dlen + vlan;
#endif
	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
	loc->wqe_last = wqe;
	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx);
	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx);
	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
	txq->wqe_ci += (ds + 3) / 4;
	loc->wqe_free -= (ds + 3) / 4;
	return MLX5_TXCMP_CODE_MULTI;
}

/**
 * Tx burst function for multi-segment packets. Supports all
 * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs,
 * sends one packet per WQE. The function stops sending if it
 * encounters a single-segment packet.
 *
 * This routine is responsible for storing the processed mbufs
 * into the elts ring buffer and updating elts_head.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 * @param loc
 *   Pointer to burst routine local context.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 *
 * @return
 *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
 *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
 *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
 *   MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered.
 *   Local context variables updated.
 */
static __rte_always_inline enum mlx5_txcmp_code
mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
		   struct rte_mbuf **restrict pkts,
		   unsigned int pkts_n,
		   struct mlx5_txq_local *restrict loc,
		   unsigned int olx)
{
	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
	MLX5_ASSERT(pkts_n > loc->pkts_sent);
	pkts += loc->pkts_sent + 1;
	pkts_n -= loc->pkts_sent;
	for (;;) {
		enum mlx5_txcmp_code ret;

		MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
		/*
		 * Estimate the number of free elts quickly but
		 * conservatively. Some segment may be fully inlined
		 * and freed, ignore this here - precise estimation
		 * is costly.
		 */
		if (loc->elts_free < NB_SEGS(loc->mbuf))
			return MLX5_TXCMP_CODE_EXIT;
		if (MLX5_TXOFF_CONFIG(TSO) &&
		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) {
			/* Proceed with multi-segment TSO. */
			ret = mlx5_tx_packet_multi_tso(txq, loc, olx);
		} else if (MLX5_TXOFF_CONFIG(INLINE)) {
			/* Proceed with multi-segment SEND with inlining. */
			ret = mlx5_tx_packet_multi_inline(txq, loc, olx);
		} else {
			/* Proceed with multi-segment SEND w/o inlining.
*/ 3597 ret = mlx5_tx_packet_multi_send(txq, loc, olx); 3598 } 3599 if (ret == MLX5_TXCMP_CODE_EXIT) 3600 return MLX5_TXCMP_CODE_EXIT; 3601 if (ret == MLX5_TXCMP_CODE_ERROR) 3602 return MLX5_TXCMP_CODE_ERROR; 3603 /* WQE is built, go to the next packet. */ 3604 ++loc->pkts_sent; 3605 --pkts_n; 3606 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3607 return MLX5_TXCMP_CODE_EXIT; 3608 loc->mbuf = *pkts++; 3609 if (pkts_n > 1) 3610 rte_prefetch0(*pkts); 3611 if (likely(NB_SEGS(loc->mbuf) > 1)) 3612 continue; 3613 /* Here ends the series of multi-segment packets. */ 3614 if (MLX5_TXOFF_CONFIG(TSO) && 3615 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3616 return MLX5_TXCMP_CODE_TSO; 3617 return MLX5_TXCMP_CODE_SINGLE; 3618 } 3619 MLX5_ASSERT(false); 3620 } 3621 3622 /** 3623 * Tx burst function for single-segment packets with TSO. 3624 * Supports all types of Tx offloads, except multi-packets. 3625 * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE. 3626 * Function stops sending if it encounters the multi-segment 3627 * packet or packet without TSO requested. 3628 * 3629 * The routine is responsible for storing processed mbuf 3630 * into elts ring buffer and update elts_head if inline 3631 * offloads is requested due to possible early freeing 3632 * of the inlined mbufs (can not store pkts array in elts 3633 * as a batch). 3634 * 3635 * @param txq 3636 * Pointer to TX queue structure. 3637 * @param[in] pkts 3638 * Packets to transmit. 3639 * @param pkts_n 3640 * Number of packets in array. 3641 * @param loc 3642 * Pointer to burst routine local context. 3643 * @param olx 3644 * Configured Tx offloads mask. It is fully defined at 3645 * compile time and may be used for optimization. 3646 * 3647 * @return 3648 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3649 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3650 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3651 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3652 * Local context variables updated. 3653 */ 3654 static __rte_always_inline enum mlx5_txcmp_code 3655 mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq, 3656 struct rte_mbuf **restrict pkts, 3657 unsigned int pkts_n, 3658 struct mlx5_txq_local *restrict loc, 3659 unsigned int olx) 3660 { 3661 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 3662 MLX5_ASSERT(pkts_n > loc->pkts_sent); 3663 pkts += loc->pkts_sent + 1; 3664 pkts_n -= loc->pkts_sent; 3665 for (;;) { 3666 struct mlx5_wqe_dseg *restrict dseg; 3667 struct mlx5_wqe *restrict wqe; 3668 unsigned int ds, dlen, hlen, ntcp, vlan = 0; 3669 uint8_t *dptr; 3670 3671 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 3672 dlen = rte_pktmbuf_data_len(loc->mbuf); 3673 if (MLX5_TXOFF_CONFIG(VLAN) && 3674 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 3675 vlan = sizeof(struct rte_vlan_hdr); 3676 } 3677 /* 3678 * First calculate the WQE size to check 3679 * whether we have enough space in ring buffer. 3680 */ 3681 hlen = loc->mbuf->l2_len + vlan + 3682 loc->mbuf->l3_len + loc->mbuf->l4_len; 3683 if (unlikely((!hlen || !loc->mbuf->tso_segsz))) 3684 return MLX5_TXCMP_CODE_ERROR; 3685 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3686 hlen += loc->mbuf->outer_l2_len + 3687 loc->mbuf->outer_l3_len; 3688 /* Segment must contain all TSO headers. 
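 * The check below requires hlen to not exceed the device limit
 * (MLX5_MAX_TSO_HEADER), to be larger than the minimal inline size
 * and to not exceed the actual data length including the VLAN tag
 * to be inserted.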
*/ 3689 if (unlikely(hlen > MLX5_MAX_TSO_HEADER || 3690 hlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3691 hlen > (dlen + vlan))) 3692 return MLX5_TXCMP_CODE_ERROR; 3693 /* 3694 * Check whether there are enough free WQEBBs: 3695 * - Control Segment 3696 * - Ethernet Segment 3697 * - First Segment of inlined Ethernet data 3698 * - ... data continued ... 3699 * - Finishing Data Segment of pointer type 3700 */ 3701 ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE + 3702 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3703 if (loc->wqe_free < ((ds + 3) / 4)) 3704 return MLX5_TXCMP_CODE_EXIT; 3705 #ifdef MLX5_PMD_SOFT_COUNTERS 3706 /* Update sent data bytes/packets counters. */ 3707 ntcp = (dlen + vlan - hlen + 3708 loc->mbuf->tso_segsz - 1) / 3709 loc->mbuf->tso_segsz; 3710 /* 3711 * One will be added for mbuf itself at the end 3712 * of the mlx5_tx_burst from loc->pkts_sent field. 3713 */ 3714 --ntcp; 3715 txq->stats.opackets += ntcp; 3716 txq->stats.obytes += dlen + vlan + ntcp * hlen; 3717 #endif 3718 /* 3719 * Build the TSO WQE: 3720 * - Control Segment 3721 * - Ethernet Segment with hlen bytes inlined 3722 * - Data Segment of pointer type 3723 */ 3724 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3725 loc->wqe_last = wqe; 3726 mlx5_tx_cseg_init(txq, loc, wqe, ds, 3727 MLX5_OPCODE_TSO, olx); 3728 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx); 3729 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan; 3730 dlen -= hlen - vlan; 3731 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 3732 /* 3733 * WQE is built, update the loop parameters 3734 * and go to the next packet. 3735 */ 3736 txq->wqe_ci += (ds + 3) / 4; 3737 loc->wqe_free -= (ds + 3) / 4; 3738 if (MLX5_TXOFF_CONFIG(INLINE)) 3739 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3740 --loc->elts_free; 3741 ++loc->pkts_sent; 3742 --pkts_n; 3743 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3744 return MLX5_TXCMP_CODE_EXIT; 3745 loc->mbuf = *pkts++; 3746 if (pkts_n > 1) 3747 rte_prefetch0(*pkts); 3748 if (MLX5_TXOFF_CONFIG(MULTI) && 3749 unlikely(NB_SEGS(loc->mbuf) > 1)) 3750 return MLX5_TXCMP_CODE_MULTI; 3751 if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) 3752 return MLX5_TXCMP_CODE_SINGLE; 3753 /* Continue with the next TSO packet. */ 3754 } 3755 MLX5_ASSERT(false); 3756 } 3757 3758 /** 3759 * Analyze the packet and select the best method to send. 3760 * 3761 * @param txq 3762 * Pointer to TX queue structure. 3763 * @param loc 3764 * Pointer to burst routine local context. 3765 * @param olx 3766 * Configured Tx offloads mask. It is fully defined at 3767 * compile time and may be used for optimization. 3768 * @param newp 3769 * The predefined flag whether do complete check for 3770 * multi-segment packets and TSO. 3771 * 3772 * @return 3773 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3774 * MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO. 3775 * MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND. 3776 * MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW. 3777 */ 3778 static __rte_always_inline enum mlx5_txcmp_code 3779 mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq, 3780 struct mlx5_txq_local *restrict loc, 3781 unsigned int olx, 3782 bool newp) 3783 { 3784 /* Check for multi-segment packet. */ 3785 if (newp && 3786 MLX5_TXOFF_CONFIG(MULTI) && 3787 unlikely(NB_SEGS(loc->mbuf) > 1)) 3788 return MLX5_TXCMP_CODE_MULTI; 3789 /* Check for TSO packet. 
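* TSO packets are built with the dedicated MLX5_OPCODE_TSO
* routine and can not be combined into an eMPW descriptor.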
*/
3790 if (newp &&
3791 MLX5_TXOFF_CONFIG(TSO) &&
3792 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
3793 return MLX5_TXCMP_CODE_TSO;
3794 /* Check if eMPW is enabled at all. */
3795 if (!MLX5_TXOFF_CONFIG(EMPW))
3796 return MLX5_TXCMP_CODE_SINGLE;
3797 /* Check if eMPW can be engaged. */
3798 if (MLX5_TXOFF_CONFIG(VLAN) &&
3799 unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) &&
3800 (!MLX5_TXOFF_CONFIG(INLINE) ||
3801 unlikely((rte_pktmbuf_data_len(loc->mbuf) +
3802 sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) {
3803 /*
3804 * eMPW does not support VLAN insertion offload,
3805 * we have to inline the entire packet but
3806 * packet is too long for inlining.
3807 */
3808 return MLX5_TXCMP_CODE_SINGLE;
3809 }
3810 return MLX5_TXCMP_CODE_EMPW;
3811 }
3812
3813 /**
3814 * Check the next packet attributes to match with the eMPW batch ones.
3815 * In addition, for legacy MPW the packet length is checked as well.
3816 *
3817 * @param txq
3818 * Pointer to TX queue structure.
3819 * @param es
3820 * Pointer to Ethernet Segment of eMPW batch.
3821 * @param loc
3822 * Pointer to burst routine local context.
3823 * @param dlen
3824 * Length of previous packet in MPW descriptor.
3825 * @param olx
3826 * Configured Tx offloads mask. It is fully defined at
3827 * compile time and may be used for optimization.
3828 *
3829 * @return
3830 * true - packet matches the eMPW batch attributes.
3831 * false - no match, eMPW should be restarted.
3832 */
3833 static __rte_always_inline bool
3834 mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
3835 struct mlx5_wqe_eseg *restrict es,
3836 struct mlx5_txq_local *restrict loc,
3837 uint32_t dlen,
3838 unsigned int olx)
3839 {
3840 uint8_t swp_flags = 0;
3841
3842 /* Compare the checksum flags, if any. */
3843 if (MLX5_TXOFF_CONFIG(CSUM) &&
3844 txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
3845 return false;
3846 /* Compare the Software Parser offsets and flags. */
3847 if (MLX5_TXOFF_CONFIG(SWP) &&
3848 (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
3849 es->swp_flags != swp_flags))
3850 return false;
3851 /* Compare the metadata, if any. */
3852 if (MLX5_TXOFF_CONFIG(METADATA) &&
3853 es->metadata != (loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
3854 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0))
3855 return false;
3856 /* Legacy MPW can send packets with the same length only. */
3857 if (MLX5_TXOFF_CONFIG(MPW) &&
3858 dlen != rte_pktmbuf_data_len(loc->mbuf))
3859 return false;
3860 /* There must be no VLAN packets in eMPW loop. */
3861 if (MLX5_TXOFF_CONFIG(VLAN))
3862 MLX5_ASSERT(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
3863 return true;
3864 }
3865
3866 /**
3867 * Update send loop variables and WQE for eMPW loop
3868 * without data inlining. Number of Data Segments is
3869 * equal to the number of sent packets.
3870 *
3871 * @param txq
3872 * Pointer to TX queue structure.
3873 * @param loc
3874 * Pointer to burst routine local context.
3875 * @param ds
3876 * Number of packets/Data Segments in the eMPW batch.
3877 * @param slen
3878 * Accumulated statistics, bytes sent.
3879 * @param olx
3880 * Configured Tx offloads mask. It is fully defined at
3881 * compile time and may be used for optimization.
3882 *
3883 * The routine returns no value. It updates the burst local
3884 * context (elts/WQE accounting) and completes the Control
3885 * Segment of the eMPW WQE.
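*
* For example (illustrative numbers): a batch of ds = 5 packets
* takes 5 pointer Data Segments plus the Control and Ethernet
* Segments, i.e. 7 WSEGs in total, occupying (7 + 3) / 4 = 2
* WQEBBs of the Send Queue.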
3886 */
3887 static __rte_always_inline void
3888 mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
3889 struct mlx5_txq_local *restrict loc,
3890 unsigned int ds,
3891 unsigned int slen,
3892 unsigned int olx __rte_unused)
3893 {
3894 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
3895 #ifdef MLX5_PMD_SOFT_COUNTERS
3896 /* Update sent data bytes counter. */
3897 txq->stats.obytes += slen;
3898 #else
3899 (void)slen;
3900 #endif
3901 loc->elts_free -= ds;
3902 loc->pkts_sent += ds;
3903 ds += 2;
3904 loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
3905 txq->wqe_ci += (ds + 3) / 4;
3906 loc->wqe_free -= (ds + 3) / 4;
3907 }
3908
3909 /**
3910 * Update send loop variables and WQE for eMPW loop
3911 * with data inlining. Takes the total size of the descriptors
3912 * and data pushed to the WQE.
3913 *
3914 * @param txq
3915 * Pointer to TX queue structure.
3916 * @param loc
3917 * Pointer to burst routine local context.
3918 * @param len
3919 * Total size of descriptor/data in bytes.
3920 * @param slen
3921 * Accumulated statistics, data bytes sent.
3922 * @param wqem
3923 * The base WQE for the eMPW/MPW descriptor.
3924 * @param olx
3925 * Configured Tx offloads mask. It is fully defined at
3926 * compile time and may be used for optimization.
3927 *
3928 * The routine returns no value. It updates the burst local
3929 * context (WQE accounting) and completes the Control Segment
3930 * of the eMPW/MPW WQE.
3931 */
3932 static __rte_always_inline void
3933 mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
3934 struct mlx5_txq_local *restrict loc,
3935 unsigned int len,
3936 unsigned int slen,
3937 struct mlx5_wqe *restrict wqem,
3938 unsigned int olx __rte_unused)
3939 {
3940 struct mlx5_wqe_dseg *dseg = &wqem->dseg[0];
3941
3942 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
3943 #ifdef MLX5_PMD_SOFT_COUNTERS
3944 /* Update sent data bytes counter. */
3945 txq->stats.obytes += slen;
3946 #else
3947 (void)slen;
3948 #endif
3949 if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
3950 /*
3951 * If the legacy MPW session contains the inline packets,
3952 * we should set the length of the single inline Data Segment
3953 * and align the total length to the segment size.
3954 */
3955 MLX5_ASSERT(len > sizeof(dseg->bcount));
3956 dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
3957 MLX5_ETH_WQE_DATA_INLINE);
3958 len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
3959 } else {
3960 /*
3961 * The session is not legacy MPW or contains the
3962 * data buffer pointer segments.
3963 */
3964 MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
3965 len = len / MLX5_WSEG_SIZE + 2;
3966 }
3967 wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
3968 txq->wqe_ci += (len + 3) / 4;
3969 loc->wqe_free -= (len + 3) / 4;
3970 loc->wqe_last = wqem;
3971 }
3972
3973 /**
3974 * The set of Tx burst functions for single-segment packets
3975 * without TSO and with Multi-Packet Writing feature support.
3976 * Supports all types of Tx offloads, except multi-segment
3977 * packets and TSO.
3978 *
3979 * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
3980 * as many packets per WQE as it can. If eMPW is not configured
3981 * or the packet can not be sent with eMPW (VLAN insertion), the
3982 * ordinary SEND opcode is used and only one packet is placed
3983 * in the WQE.
3984 *
3985 * The functions stop sending if they encounter a multi-segment
3986 * packet or a packet with TSO requested.
3987 *
3988 * The routines are responsible for storing the processed mbuf
3989 * in the elts ring buffer and updating elts_head if the inlining
3990 * offload is requested.
Otherwise the copying mbufs to elts 3991 * can be postponed and completed at the end of burst routine. 3992 * 3993 * @param txq 3994 * Pointer to TX queue structure. 3995 * @param[in] pkts 3996 * Packets to transmit. 3997 * @param pkts_n 3998 * Number of packets in array. 3999 * @param loc 4000 * Pointer to burst routine local context. 4001 * @param olx 4002 * Configured Tx offloads mask. It is fully defined at 4003 * compile time and may be used for optimization. 4004 * 4005 * @return 4006 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 4007 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 4008 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 4009 * MLX5_TXCMP_CODE_TSO - TSO packet encountered. 4010 * MLX5_TXCMP_CODE_SINGLE - used inside functions set. 4011 * MLX5_TXCMP_CODE_EMPW - used inside functions set. 4012 * 4013 * Local context variables updated. 4014 * 4015 * 4016 * The routine sends packets with MLX5_OPCODE_EMPW 4017 * without inlining, this is dedicated optimized branch. 4018 * No VLAN insertion is supported. 4019 */ 4020 static __rte_always_inline enum mlx5_txcmp_code 4021 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq, 4022 struct rte_mbuf **restrict pkts, 4023 unsigned int pkts_n, 4024 struct mlx5_txq_local *restrict loc, 4025 unsigned int olx) 4026 { 4027 /* 4028 * Subroutine is the part of mlx5_tx_burst_single() 4029 * and sends single-segment packet with eMPW opcode 4030 * without data inlining. 4031 */ 4032 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); 4033 MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); 4034 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4035 MLX5_ASSERT(pkts_n > loc->pkts_sent); 4036 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 4037 pkts += loc->pkts_sent + 1; 4038 pkts_n -= loc->pkts_sent; 4039 for (;;) { 4040 struct mlx5_wqe_dseg *restrict dseg; 4041 struct mlx5_wqe_eseg *restrict eseg; 4042 enum mlx5_txcmp_code ret; 4043 unsigned int part, loop; 4044 unsigned int slen = 0; 4045 4046 next_empw: 4047 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4048 part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? 4049 MLX5_MPW_MAX_PACKETS : 4050 MLX5_EMPW_MAX_PACKETS); 4051 if (unlikely(loc->elts_free < part)) { 4052 /* We have no enough elts to save all mbufs. */ 4053 if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS)) 4054 return MLX5_TXCMP_CODE_EXIT; 4055 /* But we still able to send at least minimal eMPW. */ 4056 part = loc->elts_free; 4057 } 4058 /* Check whether we have enough WQEs */ 4059 if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) { 4060 if (unlikely(loc->wqe_free < 4061 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 4062 return MLX5_TXCMP_CODE_EXIT; 4063 part = (loc->wqe_free * 4) - 2; 4064 } 4065 if (likely(part > 1)) 4066 rte_prefetch0(*pkts); 4067 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4068 /* 4069 * Build eMPW title WQEBB: 4070 * - Control Segment, eMPW opcode 4071 * - Ethernet Segment, no inline 4072 */ 4073 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2, 4074 MLX5_OPCODE_ENHANCED_MPSW, olx); 4075 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 4076 olx & ~MLX5_TXOFF_CONFIG_VLAN); 4077 eseg = &loc->wqe_last->eseg; 4078 dseg = &loc->wqe_last->dseg[0]; 4079 loop = part; 4080 /* Store the packet length for legacy MPW. */ 4081 if (MLX5_TXOFF_CONFIG(MPW)) 4082 eseg->mss = rte_cpu_to_be_16 4083 (rte_pktmbuf_data_len(loc->mbuf)); 4084 for (;;) { 4085 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 4086 #ifdef MLX5_PMD_SOFT_COUNTERS 4087 /* Update sent data bytes counter. 
*/ 4088 slen += dlen; 4089 #endif 4090 mlx5_tx_dseg_ptr 4091 (txq, loc, dseg, 4092 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 4093 dlen, olx); 4094 if (unlikely(--loop == 0)) 4095 break; 4096 loc->mbuf = *pkts++; 4097 if (likely(loop > 1)) 4098 rte_prefetch0(*pkts); 4099 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4100 /* 4101 * Unroll the completion code to avoid 4102 * returning variable value - it results in 4103 * unoptimized sequent checking in caller. 4104 */ 4105 if (ret == MLX5_TXCMP_CODE_MULTI) { 4106 part -= loop; 4107 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4108 if (unlikely(!loc->elts_free || 4109 !loc->wqe_free)) 4110 return MLX5_TXCMP_CODE_EXIT; 4111 return MLX5_TXCMP_CODE_MULTI; 4112 } 4113 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4114 if (ret == MLX5_TXCMP_CODE_TSO) { 4115 part -= loop; 4116 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4117 if (unlikely(!loc->elts_free || 4118 !loc->wqe_free)) 4119 return MLX5_TXCMP_CODE_EXIT; 4120 return MLX5_TXCMP_CODE_TSO; 4121 } 4122 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4123 part -= loop; 4124 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4125 if (unlikely(!loc->elts_free || 4126 !loc->wqe_free)) 4127 return MLX5_TXCMP_CODE_EXIT; 4128 return MLX5_TXCMP_CODE_SINGLE; 4129 } 4130 if (ret != MLX5_TXCMP_CODE_EMPW) { 4131 MLX5_ASSERT(false); 4132 part -= loop; 4133 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4134 return MLX5_TXCMP_CODE_ERROR; 4135 } 4136 /* 4137 * Check whether packet parameters coincide 4138 * within assumed eMPW batch: 4139 * - check sum settings 4140 * - metadata value 4141 * - software parser settings 4142 * - packets length (legacy MPW only) 4143 */ 4144 if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) { 4145 MLX5_ASSERT(loop); 4146 part -= loop; 4147 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4148 if (unlikely(!loc->elts_free || 4149 !loc->wqe_free)) 4150 return MLX5_TXCMP_CODE_EXIT; 4151 pkts_n -= part; 4152 goto next_empw; 4153 } 4154 /* Packet attributes match, continue the same eMPW. */ 4155 ++dseg; 4156 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 4157 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 4158 } 4159 /* eMPW is built successfully, update loop parameters. */ 4160 MLX5_ASSERT(!loop); 4161 MLX5_ASSERT(pkts_n >= part); 4162 #ifdef MLX5_PMD_SOFT_COUNTERS 4163 /* Update sent data bytes counter. */ 4164 txq->stats.obytes += slen; 4165 #endif 4166 loc->elts_free -= part; 4167 loc->pkts_sent += part; 4168 txq->wqe_ci += (2 + part + 3) / 4; 4169 loc->wqe_free -= (2 + part + 3) / 4; 4170 pkts_n -= part; 4171 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 4172 return MLX5_TXCMP_CODE_EXIT; 4173 loc->mbuf = *pkts++; 4174 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4175 if (unlikely(ret != MLX5_TXCMP_CODE_EMPW)) 4176 return ret; 4177 /* Continue sending eMPW batches. */ 4178 } 4179 MLX5_ASSERT(false); 4180 } 4181 4182 /** 4183 * The routine sends packets with MLX5_OPCODE_EMPW 4184 * with inlining, optionally supports VLAN insertion. 4185 */ 4186 static __rte_always_inline enum mlx5_txcmp_code 4187 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, 4188 struct rte_mbuf **restrict pkts, 4189 unsigned int pkts_n, 4190 struct mlx5_txq_local *restrict loc, 4191 unsigned int olx) 4192 { 4193 /* 4194 * Subroutine is the part of mlx5_tx_burst_single() 4195 * and sends single-segment packet with eMPW opcode 4196 * with data inlining. 
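* Short enough packets are copied into the shared WQE as inline
* Data Segments, longer packets (or packets with the no-inline
* hint set) are referenced by pointer Data Segments within the
* same session where the session type allows the intermix.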
4197 */ 4198 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 4199 MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); 4200 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4201 MLX5_ASSERT(pkts_n > loc->pkts_sent); 4202 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 4203 pkts += loc->pkts_sent + 1; 4204 pkts_n -= loc->pkts_sent; 4205 for (;;) { 4206 struct mlx5_wqe_dseg *restrict dseg; 4207 struct mlx5_wqe *restrict wqem; 4208 enum mlx5_txcmp_code ret; 4209 unsigned int room, part, nlim; 4210 unsigned int slen = 0; 4211 4212 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4213 /* 4214 * Limits the amount of packets in one WQE 4215 * to improve CQE latency generation. 4216 */ 4217 nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? 4218 MLX5_MPW_INLINE_MAX_PACKETS : 4219 MLX5_EMPW_MAX_PACKETS); 4220 /* Check whether we have minimal amount WQEs */ 4221 if (unlikely(loc->wqe_free < 4222 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 4223 return MLX5_TXCMP_CODE_EXIT; 4224 if (likely(pkts_n > 1)) 4225 rte_prefetch0(*pkts); 4226 wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4227 /* 4228 * Build eMPW title WQEBB: 4229 * - Control Segment, eMPW opcode, zero DS 4230 * - Ethernet Segment, no inline 4231 */ 4232 mlx5_tx_cseg_init(txq, loc, wqem, 0, 4233 MLX5_OPCODE_ENHANCED_MPSW, olx); 4234 mlx5_tx_eseg_none(txq, loc, wqem, 4235 olx & ~MLX5_TXOFF_CONFIG_VLAN); 4236 dseg = &wqem->dseg[0]; 4237 /* Store the packet length for legacy MPW. */ 4238 if (MLX5_TXOFF_CONFIG(MPW)) 4239 wqem->eseg.mss = rte_cpu_to_be_16 4240 (rte_pktmbuf_data_len(loc->mbuf)); 4241 room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE, 4242 loc->wqe_free) * MLX5_WQE_SIZE - 4243 MLX5_WQE_CSEG_SIZE - 4244 MLX5_WQE_ESEG_SIZE; 4245 /* Limit the room for legacy MPW sessions for performance. */ 4246 if (MLX5_TXOFF_CONFIG(MPW)) 4247 room = RTE_MIN(room, 4248 RTE_MAX(txq->inlen_empw + 4249 sizeof(dseg->bcount) + 4250 (MLX5_TXOFF_CONFIG(VLAN) ? 4251 sizeof(struct rte_vlan_hdr) : 0), 4252 MLX5_MPW_INLINE_MAX_PACKETS * 4253 MLX5_WQE_DSEG_SIZE)); 4254 /* Build WQE till we have space, packets and resources. */ 4255 part = room; 4256 for (;;) { 4257 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 4258 uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 4259 unsigned int tlen; 4260 4261 MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); 4262 MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0); 4263 MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end); 4264 /* 4265 * Some Tx offloads may cause an error if 4266 * packet is not long enough, check against 4267 * assumed minimal length. 4268 */ 4269 if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) { 4270 part -= room; 4271 if (unlikely(!part)) 4272 return MLX5_TXCMP_CODE_ERROR; 4273 /* 4274 * We have some successfully built 4275 * packet Data Segments to send. 4276 */ 4277 mlx5_tx_idone_empw(txq, loc, part, 4278 slen, wqem, olx); 4279 return MLX5_TXCMP_CODE_ERROR; 4280 } 4281 /* Inline or not inline - that's the Question. */ 4282 if (dlen > txq->inlen_empw || 4283 loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) 4284 goto pointer_empw; 4285 if (MLX5_TXOFF_CONFIG(MPW)) { 4286 if (dlen > txq->inlen_send) 4287 goto pointer_empw; 4288 tlen = dlen; 4289 if (part == room) { 4290 /* Open new inline MPW session. */ 4291 tlen += sizeof(dseg->bcount); 4292 dseg->bcount = RTE_BE32(0); 4293 dseg = RTE_PTR_ADD 4294 (dseg, sizeof(dseg->bcount)); 4295 } else { 4296 /* 4297 * No pointer and inline descriptor 4298 * intermix for legacy MPW sessions. 
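* If the session was opened with pointer Data Segments (the
* first segment carries a non-zero byte count), it has to be
* closed before inline data can be pushed, hence the break
* below.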
4299 */ 4300 if (wqem->dseg[0].bcount) 4301 break; 4302 } 4303 } else { 4304 tlen = sizeof(dseg->bcount) + dlen; 4305 } 4306 /* Inline entire packet, optional VLAN insertion. */ 4307 if (MLX5_TXOFF_CONFIG(VLAN) && 4308 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4309 /* 4310 * The packet length must be checked in 4311 * mlx5_tx_able_to_empw() and packet 4312 * fits into inline length guaranteed. 4313 */ 4314 MLX5_ASSERT((dlen + 4315 sizeof(struct rte_vlan_hdr)) <= 4316 txq->inlen_empw); 4317 tlen += sizeof(struct rte_vlan_hdr); 4318 if (room < tlen) 4319 break; 4320 dseg = mlx5_tx_dseg_vlan(txq, loc, dseg, 4321 dptr, dlen, olx); 4322 #ifdef MLX5_PMD_SOFT_COUNTERS 4323 /* Update sent data bytes counter. */ 4324 slen += sizeof(struct rte_vlan_hdr); 4325 #endif 4326 } else { 4327 if (room < tlen) 4328 break; 4329 dseg = mlx5_tx_dseg_empw(txq, loc, dseg, 4330 dptr, dlen, olx); 4331 } 4332 if (!MLX5_TXOFF_CONFIG(MPW)) 4333 tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE); 4334 MLX5_ASSERT(room >= tlen); 4335 room -= tlen; 4336 /* 4337 * Packet data are completely inlined, 4338 * free the packet immediately. 4339 */ 4340 rte_pktmbuf_free_seg(loc->mbuf); 4341 goto next_mbuf; 4342 pointer_empw: 4343 /* 4344 * No pointer and inline descriptor 4345 * intermix for legacy MPW sessions. 4346 */ 4347 if (MLX5_TXOFF_CONFIG(MPW) && 4348 part != room && 4349 wqem->dseg[0].bcount == RTE_BE32(0)) 4350 break; 4351 /* 4352 * Not inlinable VLAN packets are 4353 * proceeded outside of this routine. 4354 */ 4355 MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); 4356 if (MLX5_TXOFF_CONFIG(VLAN)) 4357 MLX5_ASSERT(!(loc->mbuf->ol_flags & 4358 PKT_TX_VLAN_PKT)); 4359 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 4360 /* We have to store mbuf in elts.*/ 4361 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 4362 room -= MLX5_WQE_DSEG_SIZE; 4363 /* Ring buffer wraparound is checked at the loop end.*/ 4364 ++dseg; 4365 next_mbuf: 4366 #ifdef MLX5_PMD_SOFT_COUNTERS 4367 /* Update sent data bytes counter. */ 4368 slen += dlen; 4369 #endif 4370 loc->pkts_sent++; 4371 loc->elts_free--; 4372 pkts_n--; 4373 if (unlikely(!pkts_n || !loc->elts_free)) { 4374 /* 4375 * We have no resources/packets to 4376 * continue build descriptors. 4377 */ 4378 part -= room; 4379 mlx5_tx_idone_empw(txq, loc, part, 4380 slen, wqem, olx); 4381 return MLX5_TXCMP_CODE_EXIT; 4382 } 4383 loc->mbuf = *pkts++; 4384 if (likely(pkts_n > 1)) 4385 rte_prefetch0(*pkts); 4386 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4387 /* 4388 * Unroll the completion code to avoid 4389 * returning variable value - it results in 4390 * unoptimized sequent checking in caller. 
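* Every branch below returns a distinct constant code, which
* lets the compiler fold the per-code checks in the caller for
* the configured offload set.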
4391 */ 4392 if (ret == MLX5_TXCMP_CODE_MULTI) { 4393 part -= room; 4394 mlx5_tx_idone_empw(txq, loc, part, 4395 slen, wqem, olx); 4396 if (unlikely(!loc->elts_free || 4397 !loc->wqe_free)) 4398 return MLX5_TXCMP_CODE_EXIT; 4399 return MLX5_TXCMP_CODE_MULTI; 4400 } 4401 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4402 if (ret == MLX5_TXCMP_CODE_TSO) { 4403 part -= room; 4404 mlx5_tx_idone_empw(txq, loc, part, 4405 slen, wqem, olx); 4406 if (unlikely(!loc->elts_free || 4407 !loc->wqe_free)) 4408 return MLX5_TXCMP_CODE_EXIT; 4409 return MLX5_TXCMP_CODE_TSO; 4410 } 4411 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4412 part -= room; 4413 mlx5_tx_idone_empw(txq, loc, part, 4414 slen, wqem, olx); 4415 if (unlikely(!loc->elts_free || 4416 !loc->wqe_free)) 4417 return MLX5_TXCMP_CODE_EXIT; 4418 return MLX5_TXCMP_CODE_SINGLE; 4419 } 4420 if (ret != MLX5_TXCMP_CODE_EMPW) { 4421 MLX5_ASSERT(false); 4422 part -= room; 4423 mlx5_tx_idone_empw(txq, loc, part, 4424 slen, wqem, olx); 4425 return MLX5_TXCMP_CODE_ERROR; 4426 } 4427 /* Check if we have minimal room left. */ 4428 nlim--; 4429 if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE)) 4430 break; 4431 /* 4432 * Check whether packet parameters coincide 4433 * within assumed eMPW batch: 4434 * - check sum settings 4435 * - metadata value 4436 * - software parser settings 4437 * - packets length (legacy MPW only) 4438 */ 4439 if (!mlx5_tx_match_empw(txq, &wqem->eseg, 4440 loc, dlen, olx)) 4441 break; 4442 /* Packet attributes match, continue the same eMPW. */ 4443 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 4444 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 4445 } 4446 /* 4447 * We get here to close an existing eMPW 4448 * session and start the new one. 4449 */ 4450 MLX5_ASSERT(pkts_n); 4451 part -= room; 4452 if (unlikely(!part)) 4453 return MLX5_TXCMP_CODE_EXIT; 4454 mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx); 4455 if (unlikely(!loc->elts_free || 4456 !loc->wqe_free)) 4457 return MLX5_TXCMP_CODE_EXIT; 4458 /* Continue the loop with new eMPW session. */ 4459 } 4460 MLX5_ASSERT(false); 4461 } 4462 4463 /** 4464 * The routine sends packets with ordinary MLX5_OPCODE_SEND. 4465 * Data inlining and VLAN insertion are supported. 4466 */ 4467 static __rte_always_inline enum mlx5_txcmp_code 4468 mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, 4469 struct rte_mbuf **restrict pkts, 4470 unsigned int pkts_n, 4471 struct mlx5_txq_local *restrict loc, 4472 unsigned int olx) 4473 { 4474 /* 4475 * Subroutine is the part of mlx5_tx_burst_single() 4476 * and sends single-segment packet with SEND opcode. 4477 */ 4478 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4479 MLX5_ASSERT(pkts_n > loc->pkts_sent); 4480 pkts += loc->pkts_sent + 1; 4481 pkts_n -= loc->pkts_sent; 4482 for (;;) { 4483 struct mlx5_wqe *restrict wqe; 4484 enum mlx5_txcmp_code ret; 4485 4486 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4487 if (MLX5_TXOFF_CONFIG(INLINE)) { 4488 unsigned int inlen, vlan = 0; 4489 4490 inlen = rte_pktmbuf_data_len(loc->mbuf); 4491 if (MLX5_TXOFF_CONFIG(VLAN) && 4492 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4493 vlan = sizeof(struct rte_vlan_hdr); 4494 inlen += vlan; 4495 static_assert((sizeof(struct rte_vlan_hdr) + 4496 sizeof(struct rte_ether_hdr)) == 4497 MLX5_ESEG_MIN_INLINE_SIZE, 4498 "invalid min inline data size"); 4499 } 4500 /* 4501 * If inlining is enabled at configuration time 4502 * the limit must be not less than minimal size. 4503 * Otherwise we would do extra check for data 4504 * size to avoid crashes due to length overflow. 
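* MLX5_ESEG_MIN_INLINE_SIZE bytes are always taken from the
* packet for the Ethernet Segment, shorter packets are rejected
* below before the remaining data length is computed.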
4505 */ 4506 MLX5_ASSERT(txq->inlen_send >= 4507 MLX5_ESEG_MIN_INLINE_SIZE); 4508 if (inlen <= txq->inlen_send) { 4509 unsigned int seg_n, wqe_n; 4510 4511 rte_prefetch0(rte_pktmbuf_mtod 4512 (loc->mbuf, uint8_t *)); 4513 /* Check against minimal length. */ 4514 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 4515 return MLX5_TXCMP_CODE_ERROR; 4516 if (loc->mbuf->ol_flags & 4517 PKT_TX_DYNF_NOINLINE) { 4518 /* 4519 * The hint flag not to inline packet 4520 * data is set. Check whether we can 4521 * follow the hint. 4522 */ 4523 if ((!MLX5_TXOFF_CONFIG(EMPW) && 4524 txq->inlen_mode) || 4525 (MLX5_TXOFF_CONFIG(MPW) && 4526 txq->inlen_mode)) { 4527 /* 4528 * The hardware requires the 4529 * minimal inline data header. 4530 */ 4531 goto single_min_inline; 4532 } 4533 if (MLX5_TXOFF_CONFIG(VLAN) && 4534 vlan && !txq->vlan_en) { 4535 /* 4536 * We must insert VLAN tag 4537 * by software means. 4538 */ 4539 goto single_part_inline; 4540 } 4541 goto single_no_inline; 4542 } 4543 /* 4544 * Completely inlined packet data WQE: 4545 * - Control Segment, SEND opcode 4546 * - Ethernet Segment, no VLAN insertion 4547 * - Data inlined, VLAN optionally inserted 4548 * - Alignment to MLX5_WSEG_SIZE 4549 * Have to estimate amount of WQEBBs 4550 */ 4551 seg_n = (inlen + 3 * MLX5_WSEG_SIZE - 4552 MLX5_ESEG_MIN_INLINE_SIZE + 4553 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4554 /* Check if there are enough WQEBBs. */ 4555 wqe_n = (seg_n + 3) / 4; 4556 if (wqe_n > loc->wqe_free) 4557 return MLX5_TXCMP_CODE_EXIT; 4558 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4559 loc->wqe_last = wqe; 4560 mlx5_tx_cseg_init(txq, loc, wqe, seg_n, 4561 MLX5_OPCODE_SEND, olx); 4562 mlx5_tx_eseg_data(txq, loc, wqe, 4563 vlan, inlen, 0, olx); 4564 txq->wqe_ci += wqe_n; 4565 loc->wqe_free -= wqe_n; 4566 /* 4567 * Packet data are completely inlined, 4568 * free the packet immediately. 4569 */ 4570 rte_pktmbuf_free_seg(loc->mbuf); 4571 } else if ((!MLX5_TXOFF_CONFIG(EMPW) || 4572 MLX5_TXOFF_CONFIG(MPW)) && 4573 txq->inlen_mode) { 4574 /* 4575 * If minimal inlining is requested the eMPW 4576 * feature should be disabled due to data is 4577 * inlined into Ethernet Segment, which can 4578 * not contain inlined data for eMPW due to 4579 * segment shared for all packets. 4580 */ 4581 struct mlx5_wqe_dseg *restrict dseg; 4582 unsigned int ds; 4583 uint8_t *dptr; 4584 4585 /* 4586 * The inline-mode settings require 4587 * to inline the specified amount of 4588 * data bytes to the Ethernet Segment. 4589 * We should check the free space in 4590 * WQE ring buffer to inline partially. 4591 */ 4592 single_min_inline: 4593 MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode); 4594 MLX5_ASSERT(inlen > txq->inlen_mode); 4595 MLX5_ASSERT(txq->inlen_mode >= 4596 MLX5_ESEG_MIN_INLINE_SIZE); 4597 /* 4598 * Check whether there are enough free WQEBBs: 4599 * - Control Segment 4600 * - Ethernet Segment 4601 * - First Segment of inlined Ethernet data 4602 * - ... data continued ... 
4603 * - Finishing Data Segment of pointer type 4604 */ 4605 ds = (MLX5_WQE_CSEG_SIZE + 4606 MLX5_WQE_ESEG_SIZE + 4607 MLX5_WQE_DSEG_SIZE + 4608 txq->inlen_mode - 4609 MLX5_ESEG_MIN_INLINE_SIZE + 4610 MLX5_WQE_DSEG_SIZE + 4611 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4612 if (loc->wqe_free < ((ds + 3) / 4)) 4613 return MLX5_TXCMP_CODE_EXIT; 4614 /* 4615 * Build the ordinary SEND WQE: 4616 * - Control Segment 4617 * - Ethernet Segment, inline inlen_mode bytes 4618 * - Data Segment of pointer type 4619 */ 4620 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4621 loc->wqe_last = wqe; 4622 mlx5_tx_cseg_init(txq, loc, wqe, ds, 4623 MLX5_OPCODE_SEND, olx); 4624 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, 4625 txq->inlen_mode, 4626 0, olx); 4627 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4628 txq->inlen_mode - vlan; 4629 inlen -= txq->inlen_mode; 4630 mlx5_tx_dseg_ptr(txq, loc, dseg, 4631 dptr, inlen, olx); 4632 /* 4633 * WQE is built, update the loop parameters 4634 * and got to the next packet. 4635 */ 4636 txq->wqe_ci += (ds + 3) / 4; 4637 loc->wqe_free -= (ds + 3) / 4; 4638 /* We have to store mbuf in elts.*/ 4639 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 4640 txq->elts[txq->elts_head++ & txq->elts_m] = 4641 loc->mbuf; 4642 --loc->elts_free; 4643 } else { 4644 uint8_t *dptr; 4645 unsigned int dlen; 4646 4647 /* 4648 * Partially inlined packet data WQE, we have 4649 * some space in title WQEBB, we can fill it 4650 * with some packet data. It takes one WQEBB, 4651 * it is available, no extra space check: 4652 * - Control Segment, SEND opcode 4653 * - Ethernet Segment, no VLAN insertion 4654 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data 4655 * - Data Segment, pointer type 4656 * 4657 * We also get here if VLAN insertion is not 4658 * supported by HW, the inline is enabled. 4659 */ 4660 single_part_inline: 4661 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4662 loc->wqe_last = wqe; 4663 mlx5_tx_cseg_init(txq, loc, wqe, 4, 4664 MLX5_OPCODE_SEND, olx); 4665 mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx); 4666 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4667 MLX5_ESEG_MIN_INLINE_SIZE - vlan; 4668 /* 4669 * The length check is performed above, by 4670 * comparing with txq->inlen_send. We should 4671 * not get overflow here. 4672 */ 4673 MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE); 4674 dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE; 4675 mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1], 4676 dptr, dlen, olx); 4677 ++txq->wqe_ci; 4678 --loc->wqe_free; 4679 /* We have to store mbuf in elts.*/ 4680 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 4681 txq->elts[txq->elts_head++ & txq->elts_m] = 4682 loc->mbuf; 4683 --loc->elts_free; 4684 } 4685 #ifdef MLX5_PMD_SOFT_COUNTERS 4686 /* Update sent data bytes counter. */ 4687 txq->stats.obytes += vlan + 4688 rte_pktmbuf_data_len(loc->mbuf); 4689 #endif 4690 } else { 4691 /* 4692 * No inline at all, it means the CPU cycles saving 4693 * is prioritized at configuration, we should not 4694 * copy any packet data to WQE. 
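* The mbuf data is referenced by a single pointer Data Segment,
* the mbuf pointers are copied to elts later by the caller in
* one batch (mlx5_tx_copy_elts()).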
4695 * 4696 * SEND WQE, one WQEBB: 4697 * - Control Segment, SEND opcode 4698 * - Ethernet Segment, optional VLAN, no inline 4699 * - Data Segment, pointer type 4700 */ 4701 single_no_inline: 4702 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4703 loc->wqe_last = wqe; 4704 mlx5_tx_cseg_init(txq, loc, wqe, 3, 4705 MLX5_OPCODE_SEND, olx); 4706 mlx5_tx_eseg_none(txq, loc, wqe, olx); 4707 mlx5_tx_dseg_ptr 4708 (txq, loc, &wqe->dseg[0], 4709 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 4710 rte_pktmbuf_data_len(loc->mbuf), olx); 4711 ++txq->wqe_ci; 4712 --loc->wqe_free; 4713 /* 4714 * We should not store mbuf pointer in elts 4715 * if no inlining is configured, this is done 4716 * by calling routine in a batch copy. 4717 */ 4718 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); 4719 --loc->elts_free; 4720 #ifdef MLX5_PMD_SOFT_COUNTERS 4721 /* Update sent data bytes counter. */ 4722 txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf); 4723 if (MLX5_TXOFF_CONFIG(VLAN) && 4724 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 4725 txq->stats.obytes += 4726 sizeof(struct rte_vlan_hdr); 4727 #endif 4728 } 4729 ++loc->pkts_sent; 4730 --pkts_n; 4731 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 4732 return MLX5_TXCMP_CODE_EXIT; 4733 loc->mbuf = *pkts++; 4734 if (pkts_n > 1) 4735 rte_prefetch0(*pkts); 4736 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4737 if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE)) 4738 return ret; 4739 } 4740 MLX5_ASSERT(false); 4741 } 4742 4743 static __rte_always_inline enum mlx5_txcmp_code 4744 mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq, 4745 struct rte_mbuf **restrict pkts, 4746 unsigned int pkts_n, 4747 struct mlx5_txq_local *restrict loc, 4748 unsigned int olx) 4749 { 4750 enum mlx5_txcmp_code ret; 4751 4752 ret = mlx5_tx_able_to_empw(txq, loc, olx, false); 4753 if (ret == MLX5_TXCMP_CODE_SINGLE) 4754 goto ordinary_send; 4755 MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW); 4756 for (;;) { 4757 /* Optimize for inline/no inline eMPW send. */ 4758 ret = (MLX5_TXOFF_CONFIG(INLINE)) ? 4759 mlx5_tx_burst_empw_inline 4760 (txq, pkts, pkts_n, loc, olx) : 4761 mlx5_tx_burst_empw_simple 4762 (txq, pkts, pkts_n, loc, olx); 4763 if (ret != MLX5_TXCMP_CODE_SINGLE) 4764 return ret; 4765 /* The resources to send one packet should remain. */ 4766 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4767 ordinary_send: 4768 ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx); 4769 MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE); 4770 if (ret != MLX5_TXCMP_CODE_EMPW) 4771 return ret; 4772 /* The resources to send one packet should remain. */ 4773 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4774 } 4775 } 4776 4777 /** 4778 * DPDK Tx callback template. This is configured template 4779 * used to generate routines optimized for specified offload setup. 4780 * One of this generated functions is chosen at SQ configuration 4781 * time. 4782 * 4783 * @param txq 4784 * Generic pointer to TX queue structure. 4785 * @param[in] pkts 4786 * Packets to transmit. 4787 * @param pkts_n 4788 * Number of packets in array. 4789 * @param olx 4790 * Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx 4791 * values. Should be static to take compile time static configuration 4792 * advantages. 4793 * 4794 * @return 4795 * Number of packets successfully transmitted (<= pkts_n). 
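*
* @note The template is instantiated by the MLX5_TXOFF_DECL()
* macros below with compile-time constant olx values, e.g. the
* existing MLX5_TXOFF_DECL(md_empw, ...) generates a routine
* where only the metadata and eMPW related branches stay active.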
4796 */ 4797 static __rte_always_inline uint16_t 4798 mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, 4799 struct rte_mbuf **restrict pkts, 4800 uint16_t pkts_n, 4801 unsigned int olx) 4802 { 4803 struct mlx5_txq_local loc; 4804 enum mlx5_txcmp_code ret; 4805 unsigned int part; 4806 4807 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4808 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4809 if (unlikely(!pkts_n)) 4810 return 0; 4811 loc.pkts_sent = 0; 4812 loc.pkts_copy = 0; 4813 loc.wqe_last = NULL; 4814 4815 send_loop: 4816 loc.pkts_loop = loc.pkts_sent; 4817 /* 4818 * Check if there are some CQEs, if any: 4819 * - process an encountered errors 4820 * - process the completed WQEs 4821 * - free related mbufs 4822 * - doorbell the NIC about processed CQEs 4823 */ 4824 rte_prefetch0(*(pkts + loc.pkts_sent)); 4825 mlx5_tx_handle_completion(txq, olx); 4826 /* 4827 * Calculate the number of available resources - elts and WQEs. 4828 * There are two possible different scenarios: 4829 * - no data inlining into WQEs, one WQEBB may contains upto 4830 * four packets, in this case elts become scarce resource 4831 * - data inlining into WQEs, one packet may require multiple 4832 * WQEBBs, the WQEs become the limiting factor. 4833 */ 4834 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4835 loc.elts_free = txq->elts_s - 4836 (uint16_t)(txq->elts_head - txq->elts_tail); 4837 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4838 loc.wqe_free = txq->wqe_s - 4839 (uint16_t)(txq->wqe_ci - txq->wqe_pi); 4840 if (unlikely(!loc.elts_free || !loc.wqe_free)) 4841 goto burst_exit; 4842 for (;;) { 4843 /* 4844 * Fetch the packet from array. Usually this is 4845 * the first packet in series of multi/single 4846 * segment packets. 4847 */ 4848 loc.mbuf = *(pkts + loc.pkts_sent); 4849 /* Dedicated branch for multi-segment packets. */ 4850 if (MLX5_TXOFF_CONFIG(MULTI) && 4851 unlikely(NB_SEGS(loc.mbuf) > 1)) { 4852 /* 4853 * Multi-segment packet encountered. 4854 * Hardware is able to process it only 4855 * with SEND/TSO opcodes, one packet 4856 * per WQE, do it in dedicated routine. 4857 */ 4858 enter_send_multi: 4859 MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy); 4860 part = loc.pkts_sent - loc.pkts_copy; 4861 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 4862 /* 4863 * There are some single-segment mbufs not 4864 * stored in elts. The mbufs must be in the 4865 * same order as WQEs, so we must copy the 4866 * mbufs to elts here, before the coming 4867 * multi-segment packet mbufs is appended. 4868 */ 4869 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, 4870 part, olx); 4871 loc.pkts_copy = loc.pkts_sent; 4872 } 4873 MLX5_ASSERT(pkts_n > loc.pkts_sent); 4874 ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx); 4875 if (!MLX5_TXOFF_CONFIG(INLINE)) 4876 loc.pkts_copy = loc.pkts_sent; 4877 /* 4878 * These returned code checks are supposed 4879 * to be optimized out due to routine inlining. 4880 */ 4881 if (ret == MLX5_TXCMP_CODE_EXIT) { 4882 /* 4883 * The routine returns this code when 4884 * all packets are sent or there is no 4885 * enough resources to complete request. 4886 */ 4887 break; 4888 } 4889 if (ret == MLX5_TXCMP_CODE_ERROR) { 4890 /* 4891 * The routine returns this code when 4892 * some error in the incoming packets 4893 * format occurred. 
4894 */ 4895 txq->stats.oerrors++; 4896 break; 4897 } 4898 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4899 /* 4900 * The single-segment packet was encountered 4901 * in the array, try to send it with the 4902 * best optimized way, possible engaging eMPW. 4903 */ 4904 goto enter_send_single; 4905 } 4906 if (MLX5_TXOFF_CONFIG(TSO) && 4907 ret == MLX5_TXCMP_CODE_TSO) { 4908 /* 4909 * The single-segment TSO packet was 4910 * encountered in the array. 4911 */ 4912 goto enter_send_tso; 4913 } 4914 /* We must not get here. Something is going wrong. */ 4915 MLX5_ASSERT(false); 4916 txq->stats.oerrors++; 4917 break; 4918 } 4919 /* Dedicated branch for single-segment TSO packets. */ 4920 if (MLX5_TXOFF_CONFIG(TSO) && 4921 unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) { 4922 /* 4923 * TSO might require special way for inlining 4924 * (dedicated parameters) and is sent with 4925 * MLX5_OPCODE_TSO opcode only, provide this 4926 * in dedicated branch. 4927 */ 4928 enter_send_tso: 4929 MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1); 4930 MLX5_ASSERT(pkts_n > loc.pkts_sent); 4931 ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx); 4932 /* 4933 * These returned code checks are supposed 4934 * to be optimized out due to routine inlining. 4935 */ 4936 if (ret == MLX5_TXCMP_CODE_EXIT) 4937 break; 4938 if (ret == MLX5_TXCMP_CODE_ERROR) { 4939 txq->stats.oerrors++; 4940 break; 4941 } 4942 if (ret == MLX5_TXCMP_CODE_SINGLE) 4943 goto enter_send_single; 4944 if (MLX5_TXOFF_CONFIG(MULTI) && 4945 ret == MLX5_TXCMP_CODE_MULTI) { 4946 /* 4947 * The multi-segment packet was 4948 * encountered in the array. 4949 */ 4950 goto enter_send_multi; 4951 } 4952 /* We must not get here. Something is going wrong. */ 4953 MLX5_ASSERT(false); 4954 txq->stats.oerrors++; 4955 break; 4956 } 4957 /* 4958 * The dedicated branch for the single-segment packets 4959 * without TSO. Often these ones can be sent using 4960 * MLX5_OPCODE_EMPW with multiple packets in one WQE. 4961 * The routine builds the WQEs till it encounters 4962 * the TSO or multi-segment packet (in case if these 4963 * offloads are requested at SQ configuration time). 4964 */ 4965 enter_send_single: 4966 MLX5_ASSERT(pkts_n > loc.pkts_sent); 4967 ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx); 4968 /* 4969 * These returned code checks are supposed 4970 * to be optimized out due to routine inlining. 4971 */ 4972 if (ret == MLX5_TXCMP_CODE_EXIT) 4973 break; 4974 if (ret == MLX5_TXCMP_CODE_ERROR) { 4975 txq->stats.oerrors++; 4976 break; 4977 } 4978 if (MLX5_TXOFF_CONFIG(MULTI) && 4979 ret == MLX5_TXCMP_CODE_MULTI) { 4980 /* 4981 * The multi-segment packet was 4982 * encountered in the array. 4983 */ 4984 goto enter_send_multi; 4985 } 4986 if (MLX5_TXOFF_CONFIG(TSO) && 4987 ret == MLX5_TXCMP_CODE_TSO) { 4988 /* 4989 * The single-segment TSO packet was 4990 * encountered in the array. 4991 */ 4992 goto enter_send_tso; 4993 } 4994 /* We must not get here. Something is going wrong. */ 4995 MLX5_ASSERT(false); 4996 txq->stats.oerrors++; 4997 break; 4998 } 4999 /* 5000 * Main Tx loop is completed, do the rest: 5001 * - set completion request if thresholds are reached 5002 * - doorbell the hardware 5003 * - copy the rest of mbufs to elts (if any) 5004 */ 5005 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) || 5006 loc.pkts_sent >= loc.pkts_copy); 5007 /* Take a shortcut if nothing is sent. */ 5008 if (unlikely(loc.pkts_sent == loc.pkts_loop)) 5009 goto burst_exit; 5010 /* Request CQE generation if limits are reached. 
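* The completion is requested for the last built WQE only when
* the used elts/WQE thresholds are crossed, so CQEs are produced
* at a limited rate instead of per packet.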
*/ 5011 mlx5_tx_request_completion(txq, &loc, olx); 5012 /* 5013 * Ring QP doorbell immediately after WQE building completion 5014 * to improve latencies. The pure software related data treatment 5015 * can be completed after doorbell. Tx CQEs for this SQ are 5016 * processed in this thread only by the polling. 5017 * 5018 * The rdma core library can map doorbell register in two ways, 5019 * depending on the environment variable "MLX5_SHUT_UP_BF": 5020 * 5021 * - as regular cached memory, the variable is either missing or 5022 * set to zero. This type of mapping may cause the significant 5023 * doorbell register writing latency and requires explicit 5024 * memory write barrier to mitigate this issue and prevent 5025 * write combining. 5026 * 5027 * - as non-cached memory, the variable is present and set to 5028 * not "0" value. This type of mapping may cause performance 5029 * impact under heavy loading conditions but the explicit write 5030 * memory barrier is not required and it may improve core 5031 * performance. 5032 * 5033 * - the legacy behaviour (prior 19.08 release) was to use some 5034 * heuristics to decide whether write memory barrier should 5035 * be performed. This behavior is supported with specifying 5036 * tx_db_nc=2, write barrier is skipped if application 5037 * provides the full recommended burst of packets, it 5038 * supposes the next packets are coming and the write barrier 5039 * will be issued on the next burst (after descriptor writing, 5040 * at least). 5041 */ 5042 mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc && 5043 (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST)); 5044 /* Not all of the mbufs may be stored into elts yet. */ 5045 part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy; 5046 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 5047 /* 5048 * There are some single-segment mbufs not stored in elts. 5049 * It can be only if the last packet was single-segment. 5050 * The copying is gathered into one place due to it is 5051 * a good opportunity to optimize that with SIMD. 5052 * Unfortunately if inlining is enabled the gaps in 5053 * pointer array may happen due to early freeing of the 5054 * inlined mbufs. 5055 */ 5056 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx); 5057 loc.pkts_copy = loc.pkts_sent; 5058 } 5059 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 5060 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 5061 if (pkts_n > loc.pkts_sent) { 5062 /* 5063 * If burst size is large there might be no enough CQE 5064 * fetched from completion queue and no enough resources 5065 * freed to send all the packets. 5066 */ 5067 goto send_loop; 5068 } 5069 burst_exit: 5070 #ifdef MLX5_PMD_SOFT_COUNTERS 5071 /* Increment sent packets counter. */ 5072 txq->stats.opackets += loc.pkts_sent; 5073 #endif 5074 return loc.pkts_sent; 5075 } 5076 5077 /* Generate routines with Enhanced Multi-Packet Write support. 
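* The suffix letters encode the enabled offloads: m - multi-segment,
* t - TSO, s - software parser (SWP), c - checksum, i - inline,
* v - VLAN insertion, md - metadata only, full/none - all/no
* offloads; the _empw variants add Enhanced Multi-Packet Write.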
*/ 5078 MLX5_TXOFF_DECL(full_empw, 5079 MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW) 5080 5081 MLX5_TXOFF_DECL(none_empw, 5082 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 5083 5084 MLX5_TXOFF_DECL(md_empw, 5085 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5086 5087 MLX5_TXOFF_DECL(mt_empw, 5088 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5089 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5090 5091 MLX5_TXOFF_DECL(mtsc_empw, 5092 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5093 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5094 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5095 5096 MLX5_TXOFF_DECL(mti_empw, 5097 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5098 MLX5_TXOFF_CONFIG_INLINE | 5099 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5100 5101 MLX5_TXOFF_DECL(mtv_empw, 5102 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5103 MLX5_TXOFF_CONFIG_VLAN | 5104 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5105 5106 MLX5_TXOFF_DECL(mtiv_empw, 5107 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5108 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5109 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5110 5111 MLX5_TXOFF_DECL(sc_empw, 5112 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5113 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5114 5115 MLX5_TXOFF_DECL(sci_empw, 5116 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5117 MLX5_TXOFF_CONFIG_INLINE | 5118 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5119 5120 MLX5_TXOFF_DECL(scv_empw, 5121 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5122 MLX5_TXOFF_CONFIG_VLAN | 5123 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5124 5125 MLX5_TXOFF_DECL(sciv_empw, 5126 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5127 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5128 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5129 5130 MLX5_TXOFF_DECL(i_empw, 5131 MLX5_TXOFF_CONFIG_INLINE | 5132 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5133 5134 MLX5_TXOFF_DECL(v_empw, 5135 MLX5_TXOFF_CONFIG_VLAN | 5136 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5137 5138 MLX5_TXOFF_DECL(iv_empw, 5139 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5140 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5141 5142 /* Generate routines without Enhanced Multi-Packet Write support. 
*/ 5143 MLX5_TXOFF_DECL(full, 5144 MLX5_TXOFF_CONFIG_FULL) 5145 5146 MLX5_TXOFF_DECL(none, 5147 MLX5_TXOFF_CONFIG_NONE) 5148 5149 MLX5_TXOFF_DECL(md, 5150 MLX5_TXOFF_CONFIG_METADATA) 5151 5152 MLX5_TXOFF_DECL(mt, 5153 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5154 MLX5_TXOFF_CONFIG_METADATA) 5155 5156 MLX5_TXOFF_DECL(mtsc, 5157 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5158 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5159 MLX5_TXOFF_CONFIG_METADATA) 5160 5161 MLX5_TXOFF_DECL(mti, 5162 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5163 MLX5_TXOFF_CONFIG_INLINE | 5164 MLX5_TXOFF_CONFIG_METADATA) 5165 5166 5167 MLX5_TXOFF_DECL(mtv, 5168 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5169 MLX5_TXOFF_CONFIG_VLAN | 5170 MLX5_TXOFF_CONFIG_METADATA) 5171 5172 5173 MLX5_TXOFF_DECL(mtiv, 5174 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5175 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5176 MLX5_TXOFF_CONFIG_METADATA) 5177 5178 MLX5_TXOFF_DECL(sc, 5179 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5180 MLX5_TXOFF_CONFIG_METADATA) 5181 5182 MLX5_TXOFF_DECL(sci, 5183 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5184 MLX5_TXOFF_CONFIG_INLINE | 5185 MLX5_TXOFF_CONFIG_METADATA) 5186 5187 5188 MLX5_TXOFF_DECL(scv, 5189 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5190 MLX5_TXOFF_CONFIG_VLAN | 5191 MLX5_TXOFF_CONFIG_METADATA) 5192 5193 5194 MLX5_TXOFF_DECL(sciv, 5195 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5196 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5197 MLX5_TXOFF_CONFIG_METADATA) 5198 5199 MLX5_TXOFF_DECL(i, 5200 MLX5_TXOFF_CONFIG_INLINE | 5201 MLX5_TXOFF_CONFIG_METADATA) 5202 5203 MLX5_TXOFF_DECL(v, 5204 MLX5_TXOFF_CONFIG_VLAN | 5205 MLX5_TXOFF_CONFIG_METADATA) 5206 5207 MLX5_TXOFF_DECL(iv, 5208 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5209 MLX5_TXOFF_CONFIG_METADATA) 5210 5211 /* 5212 * Generate routines with Legacy Multi-Packet Write support. 5213 * This mode is supported by ConnectX-4 Lx only and imposes 5214 * offload limitations, not supported: 5215 * - ACL/Flows (metadata are becoming meaningless) 5216 * - WQE Inline headers 5217 * - SRIOV (E-Switch offloads) 5218 * - VLAN insertion 5219 * - tunnel encapsulation/decapsulation 5220 * - TSO 5221 */ 5222 MLX5_TXOFF_DECL(none_mpw, 5223 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | 5224 MLX5_TXOFF_CONFIG_MPW) 5225 5226 MLX5_TXOFF_DECL(mci_mpw, 5227 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5228 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5229 MLX5_TXOFF_CONFIG_MPW) 5230 5231 MLX5_TXOFF_DECL(mc_mpw, 5232 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5233 MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) 5234 5235 MLX5_TXOFF_DECL(i_mpw, 5236 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5237 MLX5_TXOFF_CONFIG_MPW) 5238 5239 /* 5240 * Array of declared and compiled Tx burst function and corresponding 5241 * supported offloads set. The array is used to select the Tx burst 5242 * function for specified offloads set at Tx queue configuration time. 
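* mlx5_select_tx_function() scans this table and picks the entry
* whose offload set covers the requested one with the fewest and
* cheapest extra offloads enabled.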
5243 */ 5244 const struct { 5245 eth_tx_burst_t func; 5246 unsigned int olx; 5247 } txoff_func[] = { 5248 MLX5_TXOFF_INFO(full_empw, 5249 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5250 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5251 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5252 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5253 5254 MLX5_TXOFF_INFO(none_empw, 5255 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 5256 5257 MLX5_TXOFF_INFO(md_empw, 5258 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5259 5260 MLX5_TXOFF_INFO(mt_empw, 5261 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5262 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5263 5264 MLX5_TXOFF_INFO(mtsc_empw, 5265 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5266 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5267 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5268 5269 MLX5_TXOFF_INFO(mti_empw, 5270 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5271 MLX5_TXOFF_CONFIG_INLINE | 5272 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5273 5274 MLX5_TXOFF_INFO(mtv_empw, 5275 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5276 MLX5_TXOFF_CONFIG_VLAN | 5277 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5278 5279 MLX5_TXOFF_INFO(mtiv_empw, 5280 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5281 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5282 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5283 5284 MLX5_TXOFF_INFO(sc_empw, 5285 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5286 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5287 5288 MLX5_TXOFF_INFO(sci_empw, 5289 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5290 MLX5_TXOFF_CONFIG_INLINE | 5291 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5292 5293 MLX5_TXOFF_INFO(scv_empw, 5294 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5295 MLX5_TXOFF_CONFIG_VLAN | 5296 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5297 5298 MLX5_TXOFF_INFO(sciv_empw, 5299 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5300 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5301 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5302 5303 MLX5_TXOFF_INFO(i_empw, 5304 MLX5_TXOFF_CONFIG_INLINE | 5305 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5306 5307 MLX5_TXOFF_INFO(v_empw, 5308 MLX5_TXOFF_CONFIG_VLAN | 5309 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5310 5311 MLX5_TXOFF_INFO(iv_empw, 5312 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5313 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5314 5315 MLX5_TXOFF_INFO(full, 5316 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5317 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5318 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5319 MLX5_TXOFF_CONFIG_METADATA) 5320 5321 MLX5_TXOFF_INFO(none, 5322 MLX5_TXOFF_CONFIG_NONE) 5323 5324 MLX5_TXOFF_INFO(md, 5325 MLX5_TXOFF_CONFIG_METADATA) 5326 5327 MLX5_TXOFF_INFO(mt, 5328 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5329 MLX5_TXOFF_CONFIG_METADATA) 5330 5331 MLX5_TXOFF_INFO(mtsc, 5332 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5333 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5334 MLX5_TXOFF_CONFIG_METADATA) 5335 5336 MLX5_TXOFF_INFO(mti, 5337 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5338 MLX5_TXOFF_CONFIG_INLINE | 5339 MLX5_TXOFF_CONFIG_METADATA) 5340 5341 MLX5_TXOFF_INFO(mtv, 5342 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5343 MLX5_TXOFF_CONFIG_VLAN | 5344 MLX5_TXOFF_CONFIG_METADATA) 5345 5346 MLX5_TXOFF_INFO(mtiv, 5347 
MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5348 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5349 MLX5_TXOFF_CONFIG_METADATA) 5350 5351 MLX5_TXOFF_INFO(sc, 5352 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5353 MLX5_TXOFF_CONFIG_METADATA) 5354 5355 MLX5_TXOFF_INFO(sci, 5356 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5357 MLX5_TXOFF_CONFIG_INLINE | 5358 MLX5_TXOFF_CONFIG_METADATA) 5359 5360 MLX5_TXOFF_INFO(scv, 5361 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5362 MLX5_TXOFF_CONFIG_VLAN | 5363 MLX5_TXOFF_CONFIG_METADATA) 5364 5365 MLX5_TXOFF_INFO(sciv, 5366 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5367 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5368 MLX5_TXOFF_CONFIG_METADATA) 5369 5370 MLX5_TXOFF_INFO(i, 5371 MLX5_TXOFF_CONFIG_INLINE | 5372 MLX5_TXOFF_CONFIG_METADATA) 5373 5374 MLX5_TXOFF_INFO(v, 5375 MLX5_TXOFF_CONFIG_VLAN | 5376 MLX5_TXOFF_CONFIG_METADATA) 5377 5378 MLX5_TXOFF_INFO(iv, 5379 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5380 MLX5_TXOFF_CONFIG_METADATA) 5381 5382 MLX5_TXOFF_INFO(none_mpw, 5383 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | 5384 MLX5_TXOFF_CONFIG_MPW) 5385 5386 MLX5_TXOFF_INFO(mci_mpw, 5387 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5388 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5389 MLX5_TXOFF_CONFIG_MPW) 5390 5391 MLX5_TXOFF_INFO(mc_mpw, 5392 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5393 MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) 5394 5395 MLX5_TXOFF_INFO(i_mpw, 5396 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5397 MLX5_TXOFF_CONFIG_MPW) 5398 }; 5399 5400 /** 5401 * Configure the Tx function to use. The routine checks configured 5402 * Tx offloads for the device and selects appropriate Tx burst 5403 * routine. There are multiple Tx burst routines compiled from 5404 * the same template in the most optimal way for the dedicated 5405 * Tx offloads set. 5406 * 5407 * @param dev 5408 * Pointer to private data structure. 5409 * 5410 * @return 5411 * Pointer to selected Tx burst function. 5412 */ 5413 eth_tx_burst_t 5414 mlx5_select_tx_function(struct rte_eth_dev *dev) 5415 { 5416 struct mlx5_priv *priv = dev->data->dev_private; 5417 struct mlx5_dev_config *config = &priv->config; 5418 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 5419 unsigned int diff = 0, olx = 0, i, m; 5420 5421 static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <= 5422 MLX5_DSEG_MAX, "invalid WQE max size"); 5423 static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE, 5424 "invalid WQE Control Segment size"); 5425 static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE, 5426 "invalid WQE Ethernet Segment size"); 5427 static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE, 5428 "invalid WQE Data Segment size"); 5429 static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE, 5430 "invalid WQE size"); 5431 MLX5_ASSERT(priv); 5432 if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) { 5433 /* We should support Multi-Segment Packets. */ 5434 olx |= MLX5_TXOFF_CONFIG_MULTI; 5435 } 5436 if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | 5437 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 5438 DEV_TX_OFFLOAD_GRE_TNL_TSO | 5439 DEV_TX_OFFLOAD_IP_TNL_TSO | 5440 DEV_TX_OFFLOAD_UDP_TNL_TSO)) { 5441 /* We should support TCP Send Offload. */ 5442 olx |= MLX5_TXOFF_CONFIG_TSO; 5443 } 5444 if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | 5445 DEV_TX_OFFLOAD_UDP_TNL_TSO | 5446 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { 5447 /* We should support Software Parser for Tunnels. 
	/*
	 * Scan the routines table to find the minimal
	 * satisfying routine with requested offloads.
	 */
	m = RTE_DIM(txoff_func);
	for (i = 0; i < RTE_DIM(txoff_func); i++) {
		unsigned int tmp;

		tmp = txoff_func[i].olx;
		if (tmp == olx) {
			/* Meets requested offloads exactly. */
			m = i;
			break;
		}
		if ((tmp & olx) != olx) {
			/* Does not meet requested offloads at all. */
			continue;
		}
		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW)
			/* Do not enable eMPW if not configured. */
			continue;
		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
			/* Do not enable inlining if not configured. */
			continue;
		/*
		 * Some routine meets the requirements.
		 * Check whether it has the minimal amount
		 * of not requested offloads.
		 */
		tmp = __builtin_popcountl(tmp & ~olx);
		if (m >= RTE_DIM(txoff_func) || tmp < diff) {
			/* First or better match, save and continue. */
			m = i;
			diff = tmp;
			continue;
		}
		if (tmp == diff) {
			/*
			 * Same number of extra offloads, prefer the
			 * candidate whose lowest differing offload bit is
			 * lighter (offload bits are ordered by the runtime
			 * cost of skipping them).
			 */
			tmp = txoff_func[i].olx ^ txoff_func[m].olx;
			if (__builtin_ffsl(txoff_func[i].olx & tmp) <
			    __builtin_ffsl(txoff_func[m].olx & tmp)) {
				/* Lighter not requested offload. */
				m = i;
			}
		}
	}
	if (m >= RTE_DIM(txoff_func)) {
		DRV_LOG(DEBUG, "port %u has no selected Tx function"
			       " for requested offloads %04X",
			dev->data->port_id, olx);
		return NULL;
	}
	DRV_LOG(DEBUG, "port %u has selected Tx function"
		       " supporting offloads %04X/%04X",
		dev->data->port_id, olx, txoff_func[m].olx);
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI)
		DRV_LOG(DEBUG, "\tMULTI (multi segment)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO)
		DRV_LOG(DEBUG, "\tTSO (TCP send offload)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP)
		DRV_LOG(DEBUG, "\tSWP (software parser)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM)
		DRV_LOG(DEBUG, "\tCSUM (checksum offload)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE)
		DRV_LOG(DEBUG, "\tINLIN (inline data)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN)
		DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
		DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) {
		if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW)
			DRV_LOG(DEBUG, "\tMPW (Legacy MPW)");
		else
			DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)");
	}
	return txoff_func[m].func;
}
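
/*
 * Note: the routine selected above is what the device start path is
 * expected to install as dev->tx_pkt_burst; mlx5_tx_burst_mode_get()
 * below relies on this by matching dev->tx_pkt_burst against the same
 * txoff_func[] table when reporting the burst mode description.
 */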

/**
 * DPDK callback to get the TX queue information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param tx_queue_id
 *   Tx queue identifier.
 *
 * @param qinfo
 *   Pointer to the TX queue information structure.
 *
 * @return
 *   None.
 */
void
mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
		  struct rte_eth_txq_info *qinfo)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_txq_data *txq = (*priv->txqs)[tx_queue_id];
	struct mlx5_txq_ctrl *txq_ctrl =
			container_of(txq, struct mlx5_txq_ctrl, txq);

	if (!txq)
		return;
	qinfo->nb_desc = txq->elts_s;
	qinfo->conf.tx_thresh.pthresh = 0;
	qinfo->conf.tx_thresh.hthresh = 0;
	qinfo->conf.tx_thresh.wthresh = 0;
	qinfo->conf.tx_rs_thresh = 0;
	qinfo->conf.tx_free_thresh = 0;
	qinfo->conf.tx_deferred_start = txq_ctrl ? 0 : 1;
	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
}
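
/*
 * The threshold fields above are reported as zero, which presumably
 * reflects that this PMD does not use the generic prefetch/host/
 * write-back thresholds; only the descriptor count, deferred-start
 * state and configured offloads carry device-specific information.
 */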

/**
 * DPDK callback to get the TX packet burst mode information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param tx_queue_id
 *   Tx queue identifier.
 *
 * @param mode
 *   Pointer to the burst mode information.
 *
 * @return
 *   0 on success, -EINVAL on failure.
 */
int
mlx5_tx_burst_mode_get(struct rte_eth_dev *dev,
		       uint16_t tx_queue_id __rte_unused,
		       struct rte_eth_burst_mode *mode)
{
	eth_tx_burst_t pkt_burst = dev->tx_pkt_burst;
	unsigned int i, olx;

	for (i = 0; i < RTE_DIM(txoff_func); i++) {
		if (pkt_burst == txoff_func[i].func) {
			olx = txoff_func[i].olx;
			snprintf(mode->info, sizeof(mode->info),
				 "%s%s%s%s%s%s%s%s",
				 (olx & MLX5_TXOFF_CONFIG_EMPW) ?
				 ((olx & MLX5_TXOFF_CONFIG_MPW) ?
				 "Legacy MPW" : "Enhanced MPW") : "No MPW",
				 (olx & MLX5_TXOFF_CONFIG_MULTI) ?
				 " + MULTI" : "",
				 (olx & MLX5_TXOFF_CONFIG_TSO) ?
				 " + TSO" : "",
				 (olx & MLX5_TXOFF_CONFIG_SWP) ?
				 " + SWP" : "",
				 (olx & MLX5_TXOFF_CONFIG_CSUM) ?
				 " + CSUM" : "",
				 (olx & MLX5_TXOFF_CONFIG_INLINE) ?
				 " + INLINE" : "",
				 (olx & MLX5_TXOFF_CONFIG_VLAN) ?
				 " + VLAN" : "",
				 (olx & MLX5_TXOFF_CONFIG_METADATA) ?
				 " + METADATA" : "");
			return 0;
		}
	}
	return -EINVAL;
}
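
/*
 * Application-side sketch (not part of this driver): the information
 * filled in by the two callbacks above is normally retrieved through
 * the generic ethdev API, roughly as follows:
 *
 *	struct rte_eth_txq_info qinfo;
 *	struct rte_eth_burst_mode mode;
 *
 *	if (rte_eth_tx_queue_info_get(port_id, queue_id, &qinfo) == 0)
 *		printf("Tx queue %u: %u descriptors\n",
 *		       queue_id, qinfo.nb_desc);
 *	if (rte_eth_tx_burst_mode_get(port_id, queue_id, &mode) == 0)
 *		printf("Tx burst mode: %s\n", mode.info);
 */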