/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015-2019 Mellanox Technologies, Ltd
 */

#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#include <rte_cycles.h>
#include <rte_flow.h>

#include <mlx5_devx_cmds.h>
#include <mlx5_prm.h>
#include <mlx5_common.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"

/* TX burst subroutines return codes. */
enum mlx5_txcmp_code {
	MLX5_TXCMP_CODE_EXIT = 0,
	MLX5_TXCMP_CODE_ERROR,
	MLX5_TXCMP_CODE_SINGLE,
	MLX5_TXCMP_CODE_MULTI,
	MLX5_TXCMP_CODE_TSO,
	MLX5_TXCMP_CODE_EMPW,
};

/*
 * These defines are used to configure the set of Tx burst routine options
 * supported at compile time. The options that are not specified are
 * optimized out, because the related if conditions can be evaluated at
 * compile time. The offloads with a bigger runtime check overhead
 * (requiring more CPU cycles to skip) should have a bigger index - this is
 * needed to select the better matching routine when there is no exact match
 * and some offloads are not actually requested.
 */
#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/
#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/
#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/
#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/
#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/
#define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported.*/

/* The most common offload groups.
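 * For example (illustrative): MLX5_TXOFF_CONFIG_FULL below is the union of
 * all single-feature flags except the EMPW/MPW selectors, and a burst routine
 * generated with, say, olx = (MLX5_TXOFF_CONFIG_CSUM | MLX5_TXOFF_CONFIG_VLAN)
 * lets the compiler fold every other MLX5_TXOFF_CONFIG(...) test to constant
 * false and drop the corresponding code at compile time.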
*/ 70 #define MLX5_TXOFF_CONFIG_NONE 0 71 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \ 72 MLX5_TXOFF_CONFIG_TSO | \ 73 MLX5_TXOFF_CONFIG_SWP | \ 74 MLX5_TXOFF_CONFIG_CSUM | \ 75 MLX5_TXOFF_CONFIG_INLINE | \ 76 MLX5_TXOFF_CONFIG_VLAN | \ 77 MLX5_TXOFF_CONFIG_METADATA) 78 79 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask) 80 81 #define MLX5_TXOFF_DECL(func, olx) \ 82 static uint16_t mlx5_tx_burst_##func(void *txq, \ 83 struct rte_mbuf **pkts, \ 84 uint16_t pkts_n) \ 85 { \ 86 return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \ 87 pkts, pkts_n, (olx)); \ 88 } 89 90 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx}, 91 92 static __rte_always_inline uint32_t 93 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); 94 95 static __rte_always_inline int 96 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 97 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe); 98 99 static __rte_always_inline uint32_t 100 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); 101 102 static __rte_always_inline void 103 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 104 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); 105 106 static __rte_always_inline void 107 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 108 const unsigned int strd_n); 109 110 static int 111 mlx5_queue_state_modify(struct rte_eth_dev *dev, 112 struct mlx5_mp_arg_queue_state_modify *sm); 113 114 static inline void 115 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, 116 volatile struct mlx5_cqe *restrict cqe, 117 uint32_t phcsum); 118 119 static inline void 120 mlx5_lro_update_hdr(uint8_t *restrict padd, 121 volatile struct mlx5_cqe *restrict cqe, 122 uint32_t len); 123 124 uint32_t mlx5_ptype_table[] __rte_cache_aligned = { 125 [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ 126 }; 127 128 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned; 129 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned; 130 131 uint64_t rte_net_mlx5_dynf_inline_mask; 132 #define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask 133 134 /** 135 * Build a table to translate Rx completion flags to packet type. 136 * 137 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 138 */ 139 void 140 mlx5_set_ptype_table(void) 141 { 142 unsigned int i; 143 uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table; 144 145 /* Last entry must not be overwritten, reserved for errored packet. 
*/ 146 for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) 147 (*p)[i] = RTE_PTYPE_UNKNOWN; 148 /* 149 * The index to the array should have: 150 * bit[1:0] = l3_hdr_type 151 * bit[4:2] = l4_hdr_type 152 * bit[5] = ip_frag 153 * bit[6] = tunneled 154 * bit[7] = outer_l3_type 155 */ 156 /* L2 */ 157 (*p)[0x00] = RTE_PTYPE_L2_ETHER; 158 /* L3 */ 159 (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 160 RTE_PTYPE_L4_NONFRAG; 161 (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 162 RTE_PTYPE_L4_NONFRAG; 163 /* Fragmented */ 164 (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 165 RTE_PTYPE_L4_FRAG; 166 (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 167 RTE_PTYPE_L4_FRAG; 168 /* TCP */ 169 (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 170 RTE_PTYPE_L4_TCP; 171 (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 172 RTE_PTYPE_L4_TCP; 173 (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 174 RTE_PTYPE_L4_TCP; 175 (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 176 RTE_PTYPE_L4_TCP; 177 (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 178 RTE_PTYPE_L4_TCP; 179 (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 180 RTE_PTYPE_L4_TCP; 181 /* UDP */ 182 (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 183 RTE_PTYPE_L4_UDP; 184 (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 185 RTE_PTYPE_L4_UDP; 186 /* Repeat with outer_l3_type being set. Just in case. */ 187 (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 188 RTE_PTYPE_L4_NONFRAG; 189 (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 190 RTE_PTYPE_L4_NONFRAG; 191 (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 192 RTE_PTYPE_L4_FRAG; 193 (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 194 RTE_PTYPE_L4_FRAG; 195 (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 196 RTE_PTYPE_L4_TCP; 197 (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 198 RTE_PTYPE_L4_TCP; 199 (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 200 RTE_PTYPE_L4_TCP; 201 (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 202 RTE_PTYPE_L4_TCP; 203 (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 204 RTE_PTYPE_L4_TCP; 205 (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 206 RTE_PTYPE_L4_TCP; 207 (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 208 RTE_PTYPE_L4_UDP; 209 (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 210 RTE_PTYPE_L4_UDP; 211 /* Tunneled - L3 */ 212 (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; 213 (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 214 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 215 RTE_PTYPE_INNER_L4_NONFRAG; 216 (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 217 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 218 RTE_PTYPE_INNER_L4_NONFRAG; 219 (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; 220 (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 221 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 222 RTE_PTYPE_INNER_L4_NONFRAG; 223 (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 224 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 225 RTE_PTYPE_INNER_L4_NONFRAG; 226 /* Tunneled - Fragmented */ 227 (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 228 
RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 229 RTE_PTYPE_INNER_L4_FRAG; 230 (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 231 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 232 RTE_PTYPE_INNER_L4_FRAG; 233 (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 234 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 235 RTE_PTYPE_INNER_L4_FRAG; 236 (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 237 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 238 RTE_PTYPE_INNER_L4_FRAG; 239 /* Tunneled - TCP */ 240 (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 241 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 242 RTE_PTYPE_INNER_L4_TCP; 243 (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 244 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 245 RTE_PTYPE_INNER_L4_TCP; 246 (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 247 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 248 RTE_PTYPE_INNER_L4_TCP; 249 (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 250 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 251 RTE_PTYPE_INNER_L4_TCP; 252 (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 253 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 254 RTE_PTYPE_INNER_L4_TCP; 255 (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 256 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 257 RTE_PTYPE_INNER_L4_TCP; 258 (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 259 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 260 RTE_PTYPE_INNER_L4_TCP; 261 (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 262 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 263 RTE_PTYPE_INNER_L4_TCP; 264 (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 265 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 266 RTE_PTYPE_INNER_L4_TCP; 267 (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 268 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 269 RTE_PTYPE_INNER_L4_TCP; 270 (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 271 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 272 RTE_PTYPE_INNER_L4_TCP; 273 (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 274 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 275 RTE_PTYPE_INNER_L4_TCP; 276 /* Tunneled - UDP */ 277 (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 278 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 279 RTE_PTYPE_INNER_L4_UDP; 280 (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 281 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 282 RTE_PTYPE_INNER_L4_UDP; 283 (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 284 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 285 RTE_PTYPE_INNER_L4_UDP; 286 (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 287 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 288 RTE_PTYPE_INNER_L4_UDP; 289 } 290 291 /** 292 * Build a table to translate packet to checksum type of Verbs. 293 */ 294 void 295 mlx5_set_cksum_table(void) 296 { 297 unsigned int i; 298 uint8_t v; 299 300 /* 301 * The index should have: 302 * bit[0] = PKT_TX_TCP_SEG 303 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM 304 * bit[4] = PKT_TX_IP_CKSUM 305 * bit[8] = PKT_TX_OUTER_IP_CKSUM 306 * bit[9] = tunnel 307 */ 308 for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) { 309 v = 0; 310 if (i & (1 << 9)) { 311 /* Tunneled packet. */ 312 if (i & (1 << 8)) /* Outer IP. */ 313 v |= MLX5_ETH_WQE_L3_CSUM; 314 if (i & (1 << 4)) /* Inner IP. */ 315 v |= MLX5_ETH_WQE_L3_INNER_CSUM; 316 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 317 v |= MLX5_ETH_WQE_L4_INNER_CSUM; 318 } else { 319 /* No tunnel. 
*/ 320 if (i & (1 << 4)) /* IP. */ 321 v |= MLX5_ETH_WQE_L3_CSUM; 322 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 323 v |= MLX5_ETH_WQE_L4_CSUM; 324 } 325 mlx5_cksum_table[i] = v; 326 } 327 } 328 329 /** 330 * Build a table to translate packet type of mbuf to SWP type of Verbs. 331 */ 332 void 333 mlx5_set_swp_types_table(void) 334 { 335 unsigned int i; 336 uint8_t v; 337 338 /* 339 * The index should have: 340 * bit[0:1] = PKT_TX_L4_MASK 341 * bit[4] = PKT_TX_IPV6 342 * bit[8] = PKT_TX_OUTER_IPV6 343 * bit[9] = PKT_TX_OUTER_UDP 344 */ 345 for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) { 346 v = 0; 347 if (i & (1 << 8)) 348 v |= MLX5_ETH_WQE_L3_OUTER_IPV6; 349 if (i & (1 << 9)) 350 v |= MLX5_ETH_WQE_L4_OUTER_UDP; 351 if (i & (1 << 4)) 352 v |= MLX5_ETH_WQE_L3_INNER_IPV6; 353 if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52)) 354 v |= MLX5_ETH_WQE_L4_INNER_UDP; 355 mlx5_swp_types_table[i] = v; 356 } 357 } 358 359 /** 360 * Set Software Parser flags and offsets in Ethernet Segment of WQE. 361 * Flags must be preliminary initialized to zero. 362 * 363 * @param loc 364 * Pointer to burst routine local context. 365 * @param swp_flags 366 * Pointer to store Software Parser flags 367 * @param olx 368 * Configured Tx offloads mask. It is fully defined at 369 * compile time and may be used for optimization. 370 * 371 * @return 372 * Software Parser offsets packed in dword. 373 * Software Parser flags are set by pointer. 374 */ 375 static __rte_always_inline uint32_t 376 txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc, 377 uint8_t *swp_flags, 378 unsigned int olx) 379 { 380 uint64_t ol, tunnel; 381 unsigned int idx, off; 382 uint32_t set; 383 384 if (!MLX5_TXOFF_CONFIG(SWP)) 385 return 0; 386 ol = loc->mbuf->ol_flags; 387 tunnel = ol & PKT_TX_TUNNEL_MASK; 388 /* 389 * Check whether Software Parser is required. 390 * Only customized tunnels may ask for. 391 */ 392 if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP)) 393 return 0; 394 /* 395 * The index should have: 396 * bit[0:1] = PKT_TX_L4_MASK 397 * bit[4] = PKT_TX_IPV6 398 * bit[8] = PKT_TX_OUTER_IPV6 399 * bit[9] = PKT_TX_OUTER_UDP 400 */ 401 idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52; 402 idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0; 403 *swp_flags = mlx5_swp_types_table[idx]; 404 /* 405 * Set offsets for SW parser. Since ConnectX-5, SW parser just 406 * complements HW parser. SW parser starts to engage only if HW parser 407 * can't reach a header. For the older devices, HW parser will not kick 408 * in if any of SWP offsets is set. Therefore, all of the L3 offsets 409 * should be set regardless of HW offload. 410 */ 411 off = loc->mbuf->outer_l2_len; 412 if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT) 413 off += sizeof(struct rte_vlan_hdr); 414 set = (off >> 1) << 8; /* Outer L3 offset. */ 415 off += loc->mbuf->outer_l3_len; 416 if (tunnel == PKT_TX_TUNNEL_UDP) 417 set |= off >> 1; /* Outer L4 offset. */ 418 if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */ 419 const uint64_t csum = ol & PKT_TX_L4_MASK; 420 off += loc->mbuf->l2_len; 421 set |= (off >> 1) << 24; /* Inner L3 offset. */ 422 if (csum == PKT_TX_TCP_CKSUM || 423 csum == PKT_TX_UDP_CKSUM || 424 (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) { 425 off += loc->mbuf->l3_len; 426 set |= (off >> 1) << 16; /* Inner L4 offset. */ 427 } 428 } 429 set = rte_cpu_to_le_32(set); 430 return set; 431 } 432 433 /** 434 * Convert the Checksum offloads to Verbs. 435 * 436 * @param buf 437 * Pointer to the mbuf. 
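 *
 * Illustrative example: for a plain, non-tunneled packet requesting
 * PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM, the index computed below has only
 * bit[4] (IP checksum) and bits[3:2] = 01 (TCP checksum) set, so the
 * mlx5_cksum_table lookup yields MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM.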
 *
 * @return
 *   Converted checksum flags.
 */
static __rte_always_inline uint8_t
txq_ol_cksum_to_cs(struct rte_mbuf *buf)
{
	uint32_t idx;
	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;

	/*
	 * The index should have:
	 * bit[0] = PKT_TX_TCP_SEG
	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
	 * bit[4] = PKT_TX_IP_CKSUM
	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
	 * bit[9] = tunnel
	 */
	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
	return mlx5_cksum_table[idx];
}

/**
 * Internal function to compute the number of used descriptors in an RX queue.
 *
 * @param rxq
 *   The Rx queue.
 *
 * @return
 *   The number of used Rx descriptors.
 */
static uint32_t
rx_queue_count(struct mlx5_rxq_data *rxq)
{
	struct rxq_zip *zip = &rxq->zip;
	volatile struct mlx5_cqe *cqe;
	const unsigned int cqe_n = (1 << rxq->cqe_n);
	const unsigned int cqe_cnt = cqe_n - 1;
	unsigned int cq_ci;
	unsigned int used;

	/* if we are processing a compressed cqe */
	if (zip->ai) {
		used = zip->cqe_cnt - zip->ca;
		cq_ci = zip->cq_ci;
	} else {
		used = 0;
		cq_ci = rxq->cq_ci;
	}
	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
	while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
		int8_t op_own;
		unsigned int n;

		op_own = cqe->op_own;
		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
			n = rte_be_to_cpu_32(cqe->byte_cnt);
		else
			n = 1;
		cq_ci += n;
		used += n;
		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
	}
	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
	return used;
}

/**
 * DPDK callback to check the status of an Rx descriptor.
 *
 * @param rx_queue
 *   The Rx queue.
 * @param[in] offset
 *   The index of the descriptor in the ring.
 *
 * @return
 *   The status of the Rx descriptor.
 */
int
mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
{
	struct mlx5_rxq_data *rxq = rx_queue;
	struct mlx5_rxq_ctrl *rxq_ctrl =
			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
	struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);

	if (dev->rx_pkt_burst != mlx5_rx_burst) {
		rte_errno = ENOTSUP;
		return -rte_errno;
	}
	if (offset >= (1 << rxq->elts_n)) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	if (offset < rx_queue_count(rxq))
		return RTE_ETH_RX_DESC_DONE;
	return RTE_ETH_RX_DESC_AVAIL;
}

/**
 * DPDK callback to get the RX queue information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param rx_queue_id
 *   Rx queue identifier.
 *
 * @param qinfo
 *   Pointer to the RX queue information structure.
 *
 * @return
 *   None.
 */

void
mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
		  struct rte_eth_rxq_info *qinfo)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id];
	struct mlx5_rxq_ctrl *rxq_ctrl =
		container_of(rxq, struct mlx5_rxq_ctrl, rxq);

	if (!rxq)
		return;
	qinfo->mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
					rxq->mprq_mp : rxq->mp;
	qinfo->conf.rx_thresh.pthresh = 0;
	qinfo->conf.rx_thresh.hthresh = 0;
	qinfo->conf.rx_thresh.wthresh = 0;
	qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
	qinfo->conf.rx_drop_en = 1;
	qinfo->conf.rx_deferred_start = rxq_ctrl ? 0 : 1;
	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
	qinfo->scattered_rx = dev->data->scattered_rx;
	qinfo->nb_desc = 1 << rxq->elts_n;
}

/**
 * DPDK callback to get the RX packet burst mode information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param rx_queue_id
 *   Rx queue identifier.
 *
 * @param mode
 *   Pointer to the burst mode information.
 *
 * @return
 *   0 on success, -EINVAL on failure.
 */

int
mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
		       uint16_t rx_queue_id __rte_unused,
		       struct rte_eth_burst_mode *mode)
{
	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;

	if (pkt_burst == mlx5_rx_burst) {
		snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
	} else if (pkt_burst == mlx5_rx_burst_mprq) {
		snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ");
	} else if (pkt_burst == mlx5_rx_burst_vec) {
#if defined RTE_ARCH_X86_64
		snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE");
#elif defined RTE_ARCH_ARM64
		snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
#elif defined RTE_ARCH_PPC_64
		snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
#else
		return -EINVAL;
#endif
	} else {
		return -EINVAL;
	}
	return 0;
}

/**
 * DPDK callback to get the number of used descriptors in an RX queue.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param rx_queue_id
 *   The Rx queue.
 *
 * @return
 *   The number of used Rx descriptors.
 *   -EINVAL if the queue is invalid.
 */
uint32_t
mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_rxq_data *rxq;

	if (dev->rx_pkt_burst != mlx5_rx_burst) {
		rte_errno = ENOTSUP;
		return -rte_errno;
	}
	rxq = (*priv->rxqs)[rx_queue_id];
	if (!rxq) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	return rx_queue_count(rxq);
}

#define MLX5_SYSTEM_LOG_DIR "/var/log"
/**
 * Dump debug information to a log file.
 *
 * @param fname
 *   The file name.
 * @param hex_title
 *   If not NULL this string is printed as a header to the output
 *   and the output will be in hexadecimal view.
 * @param buf
 *   This is the buffer address to print out.
 * @param hex_len
 *   The number of bytes to dump out.
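 *
 * Illustrative usage (buffer and length names here are hypothetical): to dump
 * a completion queue in hexadecimal view one could call
 *   mlx5_dump_debug_information("dpdk_mlx5_dbg", "MLX5 CQ:", cq_buf, cq_size);
 * while passing hex_title == NULL prints the buffer as plain text instead.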
666 */ 667 void 668 mlx5_dump_debug_information(const char *fname, const char *hex_title, 669 const void *buf, unsigned int hex_len) 670 { 671 FILE *fd; 672 673 MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname); 674 fd = fopen(path, "a+"); 675 if (!fd) { 676 DRV_LOG(WARNING, "cannot open %s for debug dump", path); 677 MKSTR(path2, "./%s", fname); 678 fd = fopen(path2, "a+"); 679 if (!fd) { 680 DRV_LOG(ERR, "cannot open %s for debug dump", path2); 681 return; 682 } 683 DRV_LOG(INFO, "New debug dump in file %s", path2); 684 } else { 685 DRV_LOG(INFO, "New debug dump in file %s", path); 686 } 687 if (hex_title) 688 rte_hexdump(fd, hex_title, buf, hex_len); 689 else 690 fprintf(fd, "%s", (const char *)buf); 691 fprintf(fd, "\n\n\n"); 692 fclose(fd); 693 } 694 695 /** 696 * Move QP from error state to running state and initialize indexes. 697 * 698 * @param txq_ctrl 699 * Pointer to TX queue control structure. 700 * 701 * @return 702 * 0 on success, else -1. 703 */ 704 static int 705 tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl) 706 { 707 struct mlx5_mp_arg_queue_state_modify sm = { 708 .is_wq = 0, 709 .queue_id = txq_ctrl->txq.idx, 710 }; 711 712 if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm)) 713 return -1; 714 txq_ctrl->txq.wqe_ci = 0; 715 txq_ctrl->txq.wqe_pi = 0; 716 txq_ctrl->txq.elts_comp = 0; 717 return 0; 718 } 719 720 /* Return 1 if the error CQE is signed otherwise, sign it and return 0. */ 721 static int 722 check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe) 723 { 724 static const uint8_t magic[] = "seen"; 725 int ret = 1; 726 unsigned int i; 727 728 for (i = 0; i < sizeof(magic); ++i) 729 if (!ret || err_cqe->rsvd1[i] != magic[i]) { 730 ret = 0; 731 err_cqe->rsvd1[i] = magic[i]; 732 } 733 return ret; 734 } 735 736 /** 737 * Handle error CQE. 738 * 739 * @param txq 740 * Pointer to TX queue structure. 741 * @param error_cqe 742 * Pointer to the error CQE. 743 * 744 * @return 745 * Negative value if queue recovery failed, otherwise 746 * the error completion entry is handled successfully. 747 */ 748 static int 749 mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, 750 volatile struct mlx5_err_cqe *err_cqe) 751 { 752 if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { 753 const uint16_t wqe_m = ((1 << txq->wqe_n) - 1); 754 struct mlx5_txq_ctrl *txq_ctrl = 755 container_of(txq, struct mlx5_txq_ctrl, txq); 756 uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter); 757 int seen = check_err_cqe_seen(err_cqe); 758 759 if (!seen && txq_ctrl->dump_file_n < 760 txq_ctrl->priv->config.max_dump_files_num) { 761 MKSTR(err_str, "Unexpected CQE error syndrome " 762 "0x%02x CQN = %u SQN = %u wqe_counter = %u " 763 "wq_ci = %u cq_ci = %u", err_cqe->syndrome, 764 txq->cqe_s, txq->qp_num_8s >> 8, 765 rte_be_to_cpu_16(err_cqe->wqe_counter), 766 txq->wqe_ci, txq->cq_ci); 767 MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u", 768 PORT_ID(txq_ctrl->priv), txq->idx, 769 txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc()); 770 mlx5_dump_debug_information(name, NULL, err_str, 0); 771 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 772 (const void *)((uintptr_t) 773 txq->cqes), 774 sizeof(*err_cqe) * 775 (1 << txq->cqe_n)); 776 mlx5_dump_debug_information(name, "MLX5 Error SQ:", 777 (const void *)((uintptr_t) 778 txq->wqes), 779 MLX5_WQE_SIZE * 780 (1 << txq->wqe_n)); 781 txq_ctrl->dump_file_n++; 782 } 783 if (!seen) 784 /* 785 * Count errors in WQEs units. 
			 * Later it can be improved to count error packets,
			 * for example, by SQ parsing to find how many packets
			 * should be counted for each WQE.
			 */
			txq->stats.oerrors += ((txq->wqe_ci & wqe_m) -
						new_wqe_pi) & wqe_m;
		if (tx_recover_qp(txq_ctrl)) {
			/* Recovering failed - retry later on the same WQE. */
			return -1;
		}
		/* Release all the remaining buffers. */
		txq_free_elts(txq_ctrl);
	}
	return 0;
}

/**
 * Translate RX completion flags to packet type.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param[in] cqe
 *   Pointer to CQE.
 *
 * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
{
	uint8_t idx;
	uint8_t pinfo = cqe->pkt_info;
	uint16_t ptype = cqe->hdr_type_etc;

	/*
	 * The index to the array should have:
	 * bit[1:0] = l3_hdr_type
	 * bit[4:2] = l4_hdr_type
	 * bit[5] = ip_frag
	 * bit[6] = tunneled
	 * bit[7] = outer_l3_type
	 */
	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
	return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
}

/**
 * Initialize Rx WQ and indexes.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 */
void
mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
{
	const unsigned int wqe_n = 1 << rxq->elts_n;
	unsigned int i;

	for (i = 0; (i != wqe_n); ++i) {
		volatile struct mlx5_wqe_data_seg *scat;
		uintptr_t addr;
		uint32_t byte_count;

		if (mlx5_rxq_mprq_enabled(rxq)) {
			struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i];

			scat = &((volatile struct mlx5_wqe_mprq *)
				rxq->wqes)[i].dseg;
			addr = (uintptr_t)mlx5_mprq_buf_addr(buf,
						1 << rxq->strd_num_n);
			byte_count = (1 << rxq->strd_sz_n) *
					(1 << rxq->strd_num_n);
		} else {
			struct rte_mbuf *buf = (*rxq->elts)[i];

			scat = &((volatile struct mlx5_wqe_data_seg *)
					rxq->wqes)[i];
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			byte_count = DATA_LEN(buf);
		}
		/* scat->addr must be able to store a pointer. */
		MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t));
		*scat = (struct mlx5_wqe_data_seg){
			.addr = rte_cpu_to_be_64(addr),
			.byte_count = rte_cpu_to_be_32(byte_count),
			.lkey = mlx5_rx_addr2mr(rxq, addr),
		};
	}
	rxq->consumed_strd = 0;
	rxq->decompressed = 0;
	rxq->rq_pi = 0;
	rxq->zip = (struct rxq_zip){
		.ai = 0,
	};
	/* Update doorbell counter. */
	rxq->rq_ci = wqe_n >> rxq->sges_n;
	rte_cio_wmb();
	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
}

/**
 * Modify a Verbs/DevX queue state.
 * This must be called from the primary process.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param sm
 *   State modify request parameters.
 *
 * @return
 *   0 in case of success, otherwise a non-zero value; rte_errno is set.
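 *
 * For Tx queues the QP is driven through the standard Verbs state sequence
 * RESET -> INIT -> RTR -> RTS (see the body below); for Rx queues either the
 * Verbs WQ or the DevX RQ object is modified, depending on the object type.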
899 */ 900 int 901 mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, 902 const struct mlx5_mp_arg_queue_state_modify *sm) 903 { 904 int ret; 905 struct mlx5_priv *priv = dev->data->dev_private; 906 907 if (sm->is_wq) { 908 struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id]; 909 struct mlx5_rxq_ctrl *rxq_ctrl = 910 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 911 912 if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { 913 struct ibv_wq_attr mod = { 914 .attr_mask = IBV_WQ_ATTR_STATE, 915 .wq_state = sm->state, 916 }; 917 918 ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); 919 } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. */ 920 struct mlx5_devx_modify_rq_attr rq_attr; 921 922 memset(&rq_attr, 0, sizeof(rq_attr)); 923 if (sm->state == IBV_WQS_RESET) { 924 rq_attr.rq_state = MLX5_RQC_STATE_ERR; 925 rq_attr.state = MLX5_RQC_STATE_RST; 926 } else if (sm->state == IBV_WQS_RDY) { 927 rq_attr.rq_state = MLX5_RQC_STATE_RST; 928 rq_attr.state = MLX5_RQC_STATE_RDY; 929 } else if (sm->state == IBV_WQS_ERR) { 930 rq_attr.rq_state = MLX5_RQC_STATE_RDY; 931 rq_attr.state = MLX5_RQC_STATE_ERR; 932 } 933 ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, 934 &rq_attr); 935 } 936 if (ret) { 937 DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s", 938 sm->state, strerror(errno)); 939 rte_errno = errno; 940 return ret; 941 } 942 } else { 943 struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id]; 944 struct mlx5_txq_ctrl *txq_ctrl = 945 container_of(txq, struct mlx5_txq_ctrl, txq); 946 struct ibv_qp_attr mod = { 947 .qp_state = IBV_QPS_RESET, 948 .port_num = (uint8_t)priv->ibv_port, 949 }; 950 struct ibv_qp *qp = txq_ctrl->obj->qp; 951 952 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 953 if (ret) { 954 DRV_LOG(ERR, "Cannot change the Tx QP state to RESET " 955 "%s", strerror(errno)); 956 rte_errno = errno; 957 return ret; 958 } 959 mod.qp_state = IBV_QPS_INIT; 960 ret = mlx5_glue->modify_qp(qp, &mod, 961 (IBV_QP_STATE | IBV_QP_PORT)); 962 if (ret) { 963 DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s", 964 strerror(errno)); 965 rte_errno = errno; 966 return ret; 967 } 968 mod.qp_state = IBV_QPS_RTR; 969 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 970 if (ret) { 971 DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s", 972 strerror(errno)); 973 rte_errno = errno; 974 return ret; 975 } 976 mod.qp_state = IBV_QPS_RTS; 977 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 978 if (ret) { 979 DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s", 980 strerror(errno)); 981 rte_errno = errno; 982 return ret; 983 } 984 } 985 return 0; 986 } 987 988 /** 989 * Modify a Verbs queue state. 990 * 991 * @param dev 992 * Pointer to Ethernet device. 993 * @param sm 994 * State modify request parameters. 995 * 996 * @return 997 * 0 in case of success else non-zero value. 998 */ 999 static int 1000 mlx5_queue_state_modify(struct rte_eth_dev *dev, 1001 struct mlx5_mp_arg_queue_state_modify *sm) 1002 { 1003 int ret = 0; 1004 1005 switch (rte_eal_process_type()) { 1006 case RTE_PROC_PRIMARY: 1007 ret = mlx5_queue_state_modify_primary(dev, sm); 1008 break; 1009 case RTE_PROC_SECONDARY: 1010 ret = mlx5_mp_req_queue_state_modify(dev, sm); 1011 break; 1012 default: 1013 break; 1014 } 1015 return ret; 1016 } 1017 1018 /** 1019 * Handle a Rx error. 1020 * The function inserts the RQ state to reset when the first error CQE is 1021 * shown, then drains the CQ by the caller function loop. When the CQ is empty, 1022 * it moves the RQ state to ready and initializes the RQ. 
1023 * Next CQE identification and error counting are in the caller responsibility. 1024 * 1025 * @param[in] rxq 1026 * Pointer to RX queue structure. 1027 * @param[in] vec 1028 * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ. 1029 * 0 when called from non-vectorized Rx burst. 1030 * 1031 * @return 1032 * -1 in case of recovery error, otherwise the CQE status. 1033 */ 1034 int 1035 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) 1036 { 1037 const uint16_t cqe_n = 1 << rxq->cqe_n; 1038 const uint16_t cqe_mask = cqe_n - 1; 1039 const unsigned int wqe_n = 1 << rxq->elts_n; 1040 struct mlx5_rxq_ctrl *rxq_ctrl = 1041 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 1042 union { 1043 volatile struct mlx5_cqe *cqe; 1044 volatile struct mlx5_err_cqe *err_cqe; 1045 } u = { 1046 .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask], 1047 }; 1048 struct mlx5_mp_arg_queue_state_modify sm; 1049 int ret; 1050 1051 switch (rxq->err_state) { 1052 case MLX5_RXQ_ERR_STATE_NO_ERROR: 1053 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET; 1054 /* Fall-through */ 1055 case MLX5_RXQ_ERR_STATE_NEED_RESET: 1056 sm.is_wq = 1; 1057 sm.queue_id = rxq->idx; 1058 sm.state = IBV_WQS_RESET; 1059 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm)) 1060 return -1; 1061 if (rxq_ctrl->dump_file_n < 1062 rxq_ctrl->priv->config.max_dump_files_num) { 1063 MKSTR(err_str, "Unexpected CQE error syndrome " 1064 "0x%02x CQN = %u RQN = %u wqe_counter = %u" 1065 " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome, 1066 rxq->cqn, rxq_ctrl->wqn, 1067 rte_be_to_cpu_16(u.err_cqe->wqe_counter), 1068 rxq->rq_ci << rxq->sges_n, rxq->cq_ci); 1069 MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u", 1070 rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc()); 1071 mlx5_dump_debug_information(name, NULL, err_str, 0); 1072 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 1073 (const void *)((uintptr_t) 1074 rxq->cqes), 1075 sizeof(*u.cqe) * cqe_n); 1076 mlx5_dump_debug_information(name, "MLX5 Error RQ:", 1077 (const void *)((uintptr_t) 1078 rxq->wqes), 1079 16 * wqe_n); 1080 rxq_ctrl->dump_file_n++; 1081 } 1082 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY; 1083 /* Fall-through */ 1084 case MLX5_RXQ_ERR_STATE_NEED_READY: 1085 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); 1086 if (ret == MLX5_CQE_STATUS_HW_OWN) { 1087 rte_cio_wmb(); 1088 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1089 rte_cio_wmb(); 1090 /* 1091 * The RQ consumer index must be zeroed while moving 1092 * from RESET state to RDY state. 1093 */ 1094 *rxq->rq_db = rte_cpu_to_be_32(0); 1095 rte_cio_wmb(); 1096 sm.is_wq = 1; 1097 sm.queue_id = rxq->idx; 1098 sm.state = IBV_WQS_RDY; 1099 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), 1100 &sm)) 1101 return -1; 1102 if (vec) { 1103 const uint16_t q_mask = wqe_n - 1; 1104 uint16_t elt_idx; 1105 struct rte_mbuf **elt; 1106 int i; 1107 unsigned int n = wqe_n - (rxq->rq_ci - 1108 rxq->rq_pi); 1109 1110 for (i = 0; i < (int)n; ++i) { 1111 elt_idx = (rxq->rq_ci + i) & q_mask; 1112 elt = &(*rxq->elts)[elt_idx]; 1113 *elt = rte_mbuf_raw_alloc(rxq->mp); 1114 if (!*elt) { 1115 for (i--; i >= 0; --i) { 1116 elt_idx = (rxq->rq_ci + 1117 i) & q_mask; 1118 elt = &(*rxq->elts) 1119 [elt_idx]; 1120 rte_pktmbuf_free_seg 1121 (*elt); 1122 } 1123 return -1; 1124 } 1125 } 1126 for (i = 0; i < (int)wqe_n; ++i) { 1127 elt = &(*rxq->elts)[i]; 1128 DATA_LEN(*elt) = 1129 (uint16_t)((*elt)->buf_len - 1130 rte_pktmbuf_headroom(*elt)); 1131 } 1132 /* Padding with a fake mbuf for vec Rx. 
*/ 1133 for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) 1134 (*rxq->elts)[wqe_n + i] = 1135 &rxq->fake_mbuf; 1136 } 1137 mlx5_rxq_initialize(rxq); 1138 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 1139 } 1140 return ret; 1141 default: 1142 return -1; 1143 } 1144 } 1145 1146 /** 1147 * Get size of the next packet for a given CQE. For compressed CQEs, the 1148 * consumer index is updated only once all packets of the current one have 1149 * been processed. 1150 * 1151 * @param rxq 1152 * Pointer to RX queue. 1153 * @param cqe 1154 * CQE to process. 1155 * @param[out] mcqe 1156 * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not 1157 * written. 1158 * 1159 * @return 1160 * 0 in case of empty CQE, otherwise the packet size in bytes. 1161 */ 1162 static inline int 1163 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 1164 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe) 1165 { 1166 struct rxq_zip *zip = &rxq->zip; 1167 uint16_t cqe_n = cqe_cnt + 1; 1168 int len; 1169 uint16_t idx, end; 1170 1171 do { 1172 len = 0; 1173 /* Process compressed data in the CQE and mini arrays. */ 1174 if (zip->ai) { 1175 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1176 (volatile struct mlx5_mini_cqe8 (*)[8]) 1177 (uintptr_t)(&(*rxq->cqes)[zip->ca & 1178 cqe_cnt].pkt_info); 1179 1180 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); 1181 *mcqe = &(*mc)[zip->ai & 7]; 1182 if ((++zip->ai & 7) == 0) { 1183 /* Invalidate consumed CQEs */ 1184 idx = zip->ca; 1185 end = zip->na; 1186 while (idx != end) { 1187 (*rxq->cqes)[idx & cqe_cnt].op_own = 1188 MLX5_CQE_INVALIDATE; 1189 ++idx; 1190 } 1191 /* 1192 * Increment consumer index to skip the number 1193 * of CQEs consumed. Hardware leaves holes in 1194 * the CQ ring for software use. 1195 */ 1196 zip->ca = zip->na; 1197 zip->na += 8; 1198 } 1199 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 1200 /* Invalidate the rest */ 1201 idx = zip->ca; 1202 end = zip->cq_ci; 1203 1204 while (idx != end) { 1205 (*rxq->cqes)[idx & cqe_cnt].op_own = 1206 MLX5_CQE_INVALIDATE; 1207 ++idx; 1208 } 1209 rxq->cq_ci = zip->cq_ci; 1210 zip->ai = 0; 1211 } 1212 /* 1213 * No compressed data, get next CQE and verify if it is 1214 * compressed. 1215 */ 1216 } else { 1217 int ret; 1218 int8_t op_own; 1219 1220 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 1221 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 1222 if (unlikely(ret == MLX5_CQE_STATUS_ERR || 1223 rxq->err_state)) { 1224 ret = mlx5_rx_err_handle(rxq, 0); 1225 if (ret == MLX5_CQE_STATUS_HW_OWN || 1226 ret == -1) 1227 return 0; 1228 } else { 1229 return 0; 1230 } 1231 } 1232 ++rxq->cq_ci; 1233 op_own = cqe->op_own; 1234 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 1235 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1236 (volatile struct mlx5_mini_cqe8 (*)[8]) 1237 (uintptr_t)(&(*rxq->cqes) 1238 [rxq->cq_ci & 1239 cqe_cnt].pkt_info); 1240 1241 /* Fix endianness. */ 1242 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); 1243 /* 1244 * Current mini array position is the one 1245 * returned by check_cqe64(). 1246 * 1247 * If completion comprises several mini arrays, 1248 * as a special case the second one is located 1249 * 7 CQEs after the initial CQE instead of 8 1250 * for subsequent ones. 1251 */ 1252 zip->ca = rxq->cq_ci; 1253 zip->na = zip->ca + 7; 1254 /* Compute the next non compressed CQE. */ 1255 --rxq->cq_ci; 1256 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 1257 /* Get packet size to return. 
*/ 1258 len = rte_be_to_cpu_32((*mc)[0].byte_cnt); 1259 *mcqe = &(*mc)[0]; 1260 zip->ai = 1; 1261 /* Prefetch all to be invalidated */ 1262 idx = zip->ca; 1263 end = zip->cq_ci; 1264 while (idx != end) { 1265 rte_prefetch0(&(*rxq->cqes)[(idx) & 1266 cqe_cnt]); 1267 ++idx; 1268 } 1269 } else { 1270 len = rte_be_to_cpu_32(cqe->byte_cnt); 1271 } 1272 } 1273 if (unlikely(rxq->err_state)) { 1274 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1275 ++rxq->stats.idropped; 1276 } else { 1277 return len; 1278 } 1279 } while (1); 1280 } 1281 1282 /** 1283 * Translate RX completion flags to offload flags. 1284 * 1285 * @param[in] cqe 1286 * Pointer to CQE. 1287 * 1288 * @return 1289 * Offload flags (ol_flags) for struct rte_mbuf. 1290 */ 1291 static inline uint32_t 1292 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) 1293 { 1294 uint32_t ol_flags = 0; 1295 uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); 1296 1297 ol_flags = 1298 TRANSPOSE(flags, 1299 MLX5_CQE_RX_L3_HDR_VALID, 1300 PKT_RX_IP_CKSUM_GOOD) | 1301 TRANSPOSE(flags, 1302 MLX5_CQE_RX_L4_HDR_VALID, 1303 PKT_RX_L4_CKSUM_GOOD); 1304 return ol_flags; 1305 } 1306 1307 /** 1308 * Fill in mbuf fields from RX completion flags. 1309 * Note that pkt->ol_flags should be initialized outside of this function. 1310 * 1311 * @param rxq 1312 * Pointer to RX queue. 1313 * @param pkt 1314 * mbuf to fill. 1315 * @param cqe 1316 * CQE to process. 1317 * @param rss_hash_res 1318 * Packet RSS Hash result. 1319 */ 1320 static inline void 1321 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 1322 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) 1323 { 1324 /* Update packet information. */ 1325 pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); 1326 if (rss_hash_res && rxq->rss_hash) { 1327 pkt->hash.rss = rss_hash_res; 1328 pkt->ol_flags |= PKT_RX_RSS_HASH; 1329 } 1330 if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { 1331 pkt->ol_flags |= PKT_RX_FDIR; 1332 if (cqe->sop_drop_qpn != 1333 rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { 1334 uint32_t mark = cqe->sop_drop_qpn; 1335 1336 pkt->ol_flags |= PKT_RX_FDIR_ID; 1337 pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); 1338 } 1339 } 1340 if (rte_flow_dynf_metadata_avail() && cqe->flow_table_metadata) { 1341 pkt->ol_flags |= PKT_RX_DYNF_METADATA; 1342 *RTE_FLOW_DYNF_METADATA(pkt) = cqe->flow_table_metadata; 1343 } 1344 if (rxq->csum) 1345 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); 1346 if (rxq->vlan_strip && 1347 (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { 1348 pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; 1349 pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); 1350 } 1351 if (rxq->hw_timestamp) { 1352 pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); 1353 pkt->ol_flags |= PKT_RX_TIMESTAMP; 1354 } 1355 } 1356 1357 /** 1358 * DPDK callback for RX. 1359 * 1360 * @param dpdk_rxq 1361 * Generic pointer to RX queue structure. 1362 * @param[out] pkts 1363 * Array to store received packets. 1364 * @param pkts_n 1365 * Maximum number of packets in array. 1366 * 1367 * @return 1368 * Number of packets successfully received (<= pkts_n). 
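 *
 * @note A packet that spans several SGEs is returned as a chained mbuf: the
 * loop below links the segments with NEXT() and increments NB_SEGS() until
 * the byte count reported by the CQE has been consumed.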
1369 */ 1370 uint16_t 1371 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1372 { 1373 struct mlx5_rxq_data *rxq = dpdk_rxq; 1374 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 1375 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 1376 const unsigned int sges_n = rxq->sges_n; 1377 struct rte_mbuf *pkt = NULL; 1378 struct rte_mbuf *seg = NULL; 1379 volatile struct mlx5_cqe *cqe = 1380 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1381 unsigned int i = 0; 1382 unsigned int rq_ci = rxq->rq_ci << sges_n; 1383 int len = 0; /* keep its value across iterations. */ 1384 1385 while (pkts_n) { 1386 unsigned int idx = rq_ci & wqe_cnt; 1387 volatile struct mlx5_wqe_data_seg *wqe = 1388 &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; 1389 struct rte_mbuf *rep = (*rxq->elts)[idx]; 1390 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1391 uint32_t rss_hash_res; 1392 1393 if (pkt) 1394 NEXT(seg) = rep; 1395 seg = rep; 1396 rte_prefetch0(seg); 1397 rte_prefetch0(cqe); 1398 rte_prefetch0(wqe); 1399 rep = rte_mbuf_raw_alloc(rxq->mp); 1400 if (unlikely(rep == NULL)) { 1401 ++rxq->stats.rx_nombuf; 1402 if (!pkt) { 1403 /* 1404 * no buffers before we even started, 1405 * bail out silently. 1406 */ 1407 break; 1408 } 1409 while (pkt != seg) { 1410 MLX5_ASSERT(pkt != (*rxq->elts)[idx]); 1411 rep = NEXT(pkt); 1412 NEXT(pkt) = NULL; 1413 NB_SEGS(pkt) = 1; 1414 rte_mbuf_raw_free(pkt); 1415 pkt = rep; 1416 } 1417 break; 1418 } 1419 if (!pkt) { 1420 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1421 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe); 1422 if (!len) { 1423 rte_mbuf_raw_free(rep); 1424 break; 1425 } 1426 pkt = seg; 1427 MLX5_ASSERT(len >= (rxq->crc_present << 2)); 1428 pkt->ol_flags &= EXT_ATTACHED_MBUF; 1429 /* If compressed, take hash result from mini-CQE. */ 1430 rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? 1431 cqe->rx_hash_res : 1432 mcqe->rx_hash_result); 1433 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1434 if (rxq->crc_present) 1435 len -= RTE_ETHER_CRC_LEN; 1436 PKT_LEN(pkt) = len; 1437 if (cqe->lro_num_seg > 1) { 1438 mlx5_lro_update_hdr 1439 (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, 1440 len); 1441 pkt->ol_flags |= PKT_RX_LRO; 1442 pkt->tso_segsz = len / cqe->lro_num_seg; 1443 } 1444 } 1445 DATA_LEN(rep) = DATA_LEN(seg); 1446 PKT_LEN(rep) = PKT_LEN(seg); 1447 SET_DATA_OFF(rep, DATA_OFF(seg)); 1448 PORT(rep) = PORT(seg); 1449 (*rxq->elts)[idx] = rep; 1450 /* 1451 * Fill NIC descriptor with the new buffer. The lkey and size 1452 * of the buffers are already known, only the buffer address 1453 * changes. 1454 */ 1455 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); 1456 /* If there's only one MR, no need to replace LKey in WQE. */ 1457 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1458 wqe->lkey = mlx5_rx_mb2mr(rxq, rep); 1459 if (len > DATA_LEN(seg)) { 1460 len -= DATA_LEN(seg); 1461 ++NB_SEGS(pkt); 1462 ++rq_ci; 1463 continue; 1464 } 1465 DATA_LEN(seg) = len; 1466 #ifdef MLX5_PMD_SOFT_COUNTERS 1467 /* Increment bytes counter. */ 1468 rxq->stats.ibytes += PKT_LEN(pkt); 1469 #endif 1470 /* Return packet. */ 1471 *(pkts++) = pkt; 1472 pkt = NULL; 1473 --pkts_n; 1474 ++i; 1475 /* Align consumer index to the next stride. */ 1476 rq_ci >>= sges_n; 1477 ++rq_ci; 1478 rq_ci <<= sges_n; 1479 } 1480 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) 1481 return 0; 1482 /* Update the consumer index. 
 */
	rxq->rq_ci = rq_ci >> sges_n;
	rte_cio_wmb();
	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
	rte_cio_wmb();
	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += i;
#endif
	return i;
}

/**
 * Update LRO packet TCP header.
 * The HW LRO feature doesn't update the TCP header after coalescing the
 * TCP segments but supplies information in CQE to fill it by SW.
 *
 * @param tcp
 *   Pointer to the TCP header.
 * @param cqe
 *   Pointer to the completion entry.
 * @param phcsum
 *   The L3 pseudo-header checksum.
 */
static inline void
mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
			volatile struct mlx5_cqe *restrict cqe,
			uint32_t phcsum)
{
	uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
	/*
	 * The HW calculates only the TCP payload checksum, need to complete
	 * the TCP header checksum and the L3 pseudo-header checksum.
	 */
	uint32_t csum = phcsum + cqe->csum;

	if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK ||
	    l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) {
		tcp->tcp_flags |= RTE_TCP_ACK_FLAG;
		tcp->recv_ack = cqe->lro_ack_seq_num;
		tcp->rx_win = cqe->lro_tcp_win;
	}
	if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
		tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
	tcp->cksum = 0;
	csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4);
	csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
	csum = (~csum) & 0xffff;
	if (csum == 0)
		csum = 0xffff;
	tcp->cksum = csum;
}

/**
 * Update LRO packet headers.
 * The HW LRO feature doesn't update the L3/TCP headers after coalescing the
 * TCP segments but supplies information in CQE to fill it by SW.
 *
 * @param padd
 *   The packet address.
 * @param cqe
 *   Pointer to the completion entry.
 * @param len
 *   The packet length.
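 *
 * @note (illustrative walk-through) For an untagged IPv4/TCP packet the
 * routine rewrites time_to_live and total_length, recomputes the IPv4 header
 * checksum, and feeds the rebuilt pseudo-header checksum into
 * mlx5_lro_update_tcp_hdr(); VLAN/QinQ tags, if present, are skipped first.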
1548 */ 1549 static inline void 1550 mlx5_lro_update_hdr(uint8_t *restrict padd, 1551 volatile struct mlx5_cqe *restrict cqe, 1552 uint32_t len) 1553 { 1554 union { 1555 struct rte_ether_hdr *eth; 1556 struct rte_vlan_hdr *vlan; 1557 struct rte_ipv4_hdr *ipv4; 1558 struct rte_ipv6_hdr *ipv6; 1559 struct rte_tcp_hdr *tcp; 1560 uint8_t *hdr; 1561 } h = { 1562 .hdr = padd, 1563 }; 1564 uint16_t proto = h.eth->ether_type; 1565 uint32_t phcsum; 1566 1567 h.eth++; 1568 while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) || 1569 proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) { 1570 proto = h.vlan->eth_proto; 1571 h.vlan++; 1572 } 1573 if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) { 1574 h.ipv4->time_to_live = cqe->lro_min_ttl; 1575 h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd)); 1576 h.ipv4->hdr_checksum = 0; 1577 h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4); 1578 phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0); 1579 h.ipv4++; 1580 } else { 1581 h.ipv6->hop_limits = cqe->lro_min_ttl; 1582 h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) - 1583 sizeof(*h.ipv6)); 1584 phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0); 1585 h.ipv6++; 1586 } 1587 mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum); 1588 } 1589 1590 void 1591 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) 1592 { 1593 struct mlx5_mprq_buf *buf = opaque; 1594 1595 if (rte_atomic16_read(&buf->refcnt) == 1) { 1596 rte_mempool_put(buf->mp, buf); 1597 } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { 1598 rte_atomic16_set(&buf->refcnt, 1); 1599 rte_mempool_put(buf->mp, buf); 1600 } 1601 } 1602 1603 void 1604 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) 1605 { 1606 mlx5_mprq_buf_free_cb(NULL, buf); 1607 } 1608 1609 static inline void 1610 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 1611 const unsigned int strd_n) 1612 { 1613 struct mlx5_mprq_buf *rep = rxq->mprq_repl; 1614 volatile struct mlx5_wqe_data_seg *wqe = 1615 &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; 1616 void *addr; 1617 1618 MLX5_ASSERT(rep != NULL); 1619 /* Replace MPRQ buf. */ 1620 (*rxq->mprq_bufs)[rq_idx] = rep; 1621 /* Replace WQE. */ 1622 addr = mlx5_mprq_buf_addr(rep, strd_n); 1623 wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); 1624 /* If there's only one MR, no need to replace LKey in WQE. */ 1625 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1626 wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); 1627 /* Stash a mbuf for next replacement. */ 1628 if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) 1629 rxq->mprq_repl = rep; 1630 else 1631 rxq->mprq_repl = NULL; 1632 } 1633 1634 /** 1635 * DPDK callback for RX with Multi-Packet RQ support. 1636 * 1637 * @param dpdk_rxq 1638 * Generic pointer to RX queue structure. 1639 * @param[out] pkts 1640 * Array to store received packets. 1641 * @param pkts_n 1642 * Maximum number of packets in array. 1643 * 1644 * @return 1645 * Number of packets successfully received (<= pkts_n). 
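 *
 * @note Short completions (up to rxq->mprq_max_memcpy_len bytes, or any size
 * when the replacement mempool is exhausted) are copied into the newly
 * allocated mbuf; larger ones are attached to the Multi-Packet RQ buffer as
 * an external buffer instead of being copied.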
1646 */ 1647 uint16_t 1648 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1649 { 1650 struct mlx5_rxq_data *rxq = dpdk_rxq; 1651 const unsigned int strd_n = 1 << rxq->strd_num_n; 1652 const unsigned int strd_sz = 1 << rxq->strd_sz_n; 1653 const unsigned int strd_shift = 1654 MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; 1655 const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; 1656 const unsigned int wq_mask = (1 << rxq->elts_n) - 1; 1657 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1658 unsigned int i = 0; 1659 uint32_t rq_ci = rxq->rq_ci; 1660 uint16_t consumed_strd = rxq->consumed_strd; 1661 uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM; 1662 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1663 1664 while (i < pkts_n) { 1665 struct rte_mbuf *pkt; 1666 void *addr; 1667 int ret; 1668 unsigned int len; 1669 uint16_t strd_cnt; 1670 uint16_t strd_idx; 1671 uint32_t offset; 1672 uint32_t byte_cnt; 1673 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1674 uint32_t rss_hash_res = 0; 1675 uint8_t lro_num_seg; 1676 1677 if (consumed_strd == strd_n) { 1678 /* Replace WQE only if the buffer is still in use. */ 1679 if (rte_atomic16_read(&buf->refcnt) > 1) { 1680 mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n); 1681 /* Release the old buffer. */ 1682 mlx5_mprq_buf_free(buf); 1683 } else if (unlikely(rxq->mprq_repl == NULL)) { 1684 struct mlx5_mprq_buf *rep; 1685 1686 /* 1687 * Currently, the MPRQ mempool is out of buffer 1688 * and doing memcpy regardless of the size of Rx 1689 * packet. Retry allocation to get back to 1690 * normal. 1691 */ 1692 if (!rte_mempool_get(rxq->mprq_mp, 1693 (void **)&rep)) 1694 rxq->mprq_repl = rep; 1695 } 1696 /* Advance to the next WQE. */ 1697 consumed_strd = 0; 1698 ++rq_ci; 1699 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1700 } 1701 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1702 ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe); 1703 if (!ret) 1704 break; 1705 byte_cnt = ret; 1706 strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> 1707 MLX5_MPRQ_STRIDE_NUM_SHIFT; 1708 MLX5_ASSERT(strd_cnt); 1709 consumed_strd += strd_cnt; 1710 if (byte_cnt & MLX5_MPRQ_FILLER_MASK) 1711 continue; 1712 if (mcqe == NULL) { 1713 rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); 1714 strd_idx = rte_be_to_cpu_16(cqe->wqe_counter); 1715 } else { 1716 /* mini-CQE for MPRQ doesn't have hash result. */ 1717 strd_idx = rte_be_to_cpu_16(mcqe->stride_idx); 1718 } 1719 MLX5_ASSERT(strd_idx < strd_n); 1720 MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & 1721 wq_mask)); 1722 lro_num_seg = cqe->lro_num_seg; 1723 /* 1724 * Currently configured to receive a packet per a stride. But if 1725 * MTU is adjusted through kernel interface, device could 1726 * consume multiple strides without raising an error. In this 1727 * case, the packet should be dropped because it is bigger than 1728 * the max_rx_pkt_len. 
1729 */ 1730 if (unlikely(!lro_num_seg && strd_cnt > 1)) { 1731 ++rxq->stats.idropped; 1732 continue; 1733 } 1734 pkt = rte_pktmbuf_alloc(rxq->mp); 1735 if (unlikely(pkt == NULL)) { 1736 ++rxq->stats.rx_nombuf; 1737 break; 1738 } 1739 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; 1740 MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); 1741 if (rxq->crc_present) 1742 len -= RTE_ETHER_CRC_LEN; 1743 offset = strd_idx * strd_sz + strd_shift; 1744 addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); 1745 /* 1746 * Memcpy packets to the target mbuf if: 1747 * - The size of packet is smaller than mprq_max_memcpy_len. 1748 * - Out of buffer in the Mempool for Multi-Packet RQ. 1749 */ 1750 if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) { 1751 /* 1752 * When memcpy'ing packet due to out-of-buffer, the 1753 * packet must be smaller than the target mbuf. 1754 */ 1755 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1756 rte_pktmbuf_free_seg(pkt); 1757 ++rxq->stats.idropped; 1758 continue; 1759 } 1760 rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len); 1761 DATA_LEN(pkt) = len; 1762 } else { 1763 rte_iova_t buf_iova; 1764 struct rte_mbuf_ext_shared_info *shinfo; 1765 uint16_t buf_len = strd_cnt * strd_sz; 1766 void *buf_addr; 1767 1768 /* Increment the refcnt of the whole chunk. */ 1769 rte_atomic16_add_return(&buf->refcnt, 1); 1770 MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <= 1771 strd_n + 1); 1772 buf_addr = RTE_PTR_SUB(addr, headroom_sz); 1773 /* 1774 * MLX5 device doesn't use iova but it is necessary in a 1775 * case where the Rx packet is transmitted via a 1776 * different PMD. 1777 */ 1778 buf_iova = rte_mempool_virt2iova(buf) + 1779 RTE_PTR_DIFF(buf_addr, buf); 1780 shinfo = &buf->shinfos[strd_idx]; 1781 rte_mbuf_ext_refcnt_set(shinfo, 1); 1782 /* 1783 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when 1784 * attaching the stride to mbuf and more offload flags 1785 * will be added below by calling rxq_cq_to_mbuf(). 1786 * Other fields will be overwritten. 1787 */ 1788 rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, 1789 buf_len, shinfo); 1790 /* Set mbuf head-room. */ 1791 pkt->data_off = headroom_sz; 1792 MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF); 1793 /* 1794 * Prevent potential overflow due to MTU change through 1795 * kernel interface. 1796 */ 1797 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1798 rte_pktmbuf_free_seg(pkt); 1799 ++rxq->stats.idropped; 1800 continue; 1801 } 1802 DATA_LEN(pkt) = len; 1803 /* 1804 * LRO packet may consume all the stride memory, in this 1805 * case packet head-room space is not guaranteed so must 1806 * to add an empty mbuf for the head-room. 1807 */ 1808 if (!rxq->strd_headroom_en) { 1809 struct rte_mbuf *headroom_mbuf = 1810 rte_pktmbuf_alloc(rxq->mp); 1811 1812 if (unlikely(headroom_mbuf == NULL)) { 1813 rte_pktmbuf_free_seg(pkt); 1814 ++rxq->stats.rx_nombuf; 1815 break; 1816 } 1817 PORT(pkt) = rxq->port_id; 1818 NEXT(headroom_mbuf) = pkt; 1819 pkt = headroom_mbuf; 1820 NB_SEGS(pkt) = 2; 1821 } 1822 } 1823 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1824 if (lro_num_seg > 1) { 1825 mlx5_lro_update_hdr(addr, cqe, len); 1826 pkt->ol_flags |= PKT_RX_LRO; 1827 pkt->tso_segsz = strd_sz; 1828 } 1829 PKT_LEN(pkt) = len; 1830 PORT(pkt) = rxq->port_id; 1831 #ifdef MLX5_PMD_SOFT_COUNTERS 1832 /* Increment bytes counter. */ 1833 rxq->stats.ibytes += PKT_LEN(pkt); 1834 #endif 1835 /* Return packet. */ 1836 *(pkts++) = pkt; 1837 ++i; 1838 } 1839 /* Update the consumer indexes. 
*/ 1840 rxq->consumed_strd = consumed_strd; 1841 rte_cio_wmb(); 1842 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1843 if (rq_ci != rxq->rq_ci) { 1844 rxq->rq_ci = rq_ci; 1845 rte_cio_wmb(); 1846 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1847 } 1848 #ifdef MLX5_PMD_SOFT_COUNTERS 1849 /* Increment packets counter. */ 1850 rxq->stats.ipackets += i; 1851 #endif 1852 return i; 1853 } 1854 1855 /** 1856 * Dummy DPDK callback for TX. 1857 * 1858 * This function is used to temporarily replace the real callback during 1859 * unsafe control operations on the queue, or in case of error. 1860 * 1861 * @param dpdk_txq 1862 * Generic pointer to TX queue structure. 1863 * @param[in] pkts 1864 * Packets to transmit. 1865 * @param pkts_n 1866 * Number of packets in array. 1867 * 1868 * @return 1869 * Number of packets successfully transmitted (<= pkts_n). 1870 */ 1871 uint16_t 1872 removed_tx_burst(void *dpdk_txq __rte_unused, 1873 struct rte_mbuf **pkts __rte_unused, 1874 uint16_t pkts_n __rte_unused) 1875 { 1876 rte_mb(); 1877 return 0; 1878 } 1879 1880 /** 1881 * Dummy DPDK callback for RX. 1882 * 1883 * This function is used to temporarily replace the real callback during 1884 * unsafe control operations on the queue, or in case of error. 1885 * 1886 * @param dpdk_rxq 1887 * Generic pointer to RX queue structure. 1888 * @param[out] pkts 1889 * Array to store received packets. 1890 * @param pkts_n 1891 * Maximum number of packets in array. 1892 * 1893 * @return 1894 * Number of packets successfully received (<= pkts_n). 1895 */ 1896 uint16_t 1897 removed_rx_burst(void *dpdk_txq __rte_unused, 1898 struct rte_mbuf **pkts __rte_unused, 1899 uint16_t pkts_n __rte_unused) 1900 { 1901 rte_mb(); 1902 return 0; 1903 } 1904 1905 /* 1906 * Vectorized Rx/Tx routines are not compiled in when required vector 1907 * instructions are not supported on a target architecture. The following null 1908 * stubs are needed for linkage when those are not included outside of this file 1909 * (e.g. mlx5_rxtx_vec_sse.c for x86). 1910 */ 1911 1912 __rte_weak uint16_t 1913 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, 1914 struct rte_mbuf **pkts __rte_unused, 1915 uint16_t pkts_n __rte_unused) 1916 { 1917 return 0; 1918 } 1919 1920 __rte_weak int 1921 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) 1922 { 1923 return -ENOTSUP; 1924 } 1925 1926 __rte_weak int 1927 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) 1928 { 1929 return -ENOTSUP; 1930 } 1931 1932 /** 1933 * Free the mbufs from the linear array of pointers. 1934 * 1935 * @param pkts 1936 * Pointer to array of packets to be free. 1937 * @param pkts_n 1938 * Number of packets to be freed. 1939 * @param olx 1940 * Configured Tx offloads mask. It is fully defined at 1941 * compile time and may be used for optimization. 1942 */ 1943 static __rte_always_inline void 1944 mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, 1945 unsigned int pkts_n, 1946 unsigned int olx __rte_unused) 1947 { 1948 struct rte_mempool *pool = NULL; 1949 struct rte_mbuf **p_free = NULL; 1950 struct rte_mbuf *mbuf; 1951 unsigned int n_free = 0; 1952 1953 /* 1954 * The implemented algorithm eliminates 1955 * copying pointers to temporary array 1956 * for rte_mempool_put_bulk() calls. 1957 */ 1958 MLX5_ASSERT(pkts); 1959 MLX5_ASSERT(pkts_n); 1960 for (;;) { 1961 for (;;) { 1962 /* 1963 * Decrement mbuf reference counter, detach 1964 * indirect and external buffers if needed. 
1965 */ 1966 mbuf = rte_pktmbuf_prefree_seg(*pkts); 1967 if (likely(mbuf != NULL)) { 1968 MLX5_ASSERT(mbuf == *pkts); 1969 if (likely(n_free != 0)) { 1970 if (unlikely(pool != mbuf->pool)) 1971 /* From different pool. */ 1972 break; 1973 } else { 1974 /* Start new scan array. */ 1975 pool = mbuf->pool; 1976 p_free = pkts; 1977 } 1978 ++n_free; 1979 ++pkts; 1980 --pkts_n; 1981 if (unlikely(pkts_n == 0)) { 1982 mbuf = NULL; 1983 break; 1984 } 1985 } else { 1986 /* 1987 * This happens if mbuf is still referenced. 1988 * We can't put it back to the pool, skip. 1989 */ 1990 ++pkts; 1991 --pkts_n; 1992 if (unlikely(n_free != 0)) 1993 /* There is some array to free.*/ 1994 break; 1995 if (unlikely(pkts_n == 0)) 1996 /* Last mbuf, nothing to free. */ 1997 return; 1998 } 1999 } 2000 for (;;) { 2001 /* 2002 * This loop is implemented to avoid multiple 2003 * inlining of rte_mempool_put_bulk(). 2004 */ 2005 MLX5_ASSERT(pool); 2006 MLX5_ASSERT(p_free); 2007 MLX5_ASSERT(n_free); 2008 /* 2009 * Free the array of pre-freed mbufs 2010 * belonging to the same memory pool. 2011 */ 2012 rte_mempool_put_bulk(pool, (void *)p_free, n_free); 2013 if (unlikely(mbuf != NULL)) { 2014 /* There is the request to start new scan. */ 2015 pool = mbuf->pool; 2016 p_free = pkts++; 2017 n_free = 1; 2018 --pkts_n; 2019 if (likely(pkts_n != 0)) 2020 break; 2021 /* 2022 * This is the last mbuf to be freed. 2023 * Do one more loop iteration to complete. 2024 * This is rare case of the last unique mbuf. 2025 */ 2026 mbuf = NULL; 2027 continue; 2028 } 2029 if (likely(pkts_n == 0)) 2030 return; 2031 n_free = 0; 2032 break; 2033 } 2034 } 2035 } 2036 2037 /** 2038 * Free the mbuf from the elts ring buffer till new tail. 2039 * 2040 * @param txq 2041 * Pointer to Tx queue structure. 2042 * @param tail 2043 * Index in elts to free up to, becomes new elts tail. 2044 * @param olx 2045 * Configured Tx offloads mask. It is fully defined at 2046 * compile time and may be used for optimization. 2047 */ 2048 static __rte_always_inline void 2049 mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, 2050 uint16_t tail, 2051 unsigned int olx __rte_unused) 2052 { 2053 uint16_t n_elts = tail - txq->elts_tail; 2054 2055 MLX5_ASSERT(n_elts); 2056 MLX5_ASSERT(n_elts <= txq->elts_s); 2057 /* 2058 * Implement a loop to support ring buffer wraparound 2059 * with single inlining of mlx5_tx_free_mbuf(). 2060 */ 2061 do { 2062 unsigned int part; 2063 2064 part = txq->elts_s - (txq->elts_tail & txq->elts_m); 2065 part = RTE_MIN(part, n_elts); 2066 MLX5_ASSERT(part); 2067 MLX5_ASSERT(part <= txq->elts_s); 2068 mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m], 2069 part, olx); 2070 txq->elts_tail += part; 2071 n_elts -= part; 2072 } while (n_elts); 2073 } 2074 2075 /** 2076 * Store the mbuf being sent into elts ring buffer. 2077 * On Tx completion these mbufs will be freed. 2078 * 2079 * @param txq 2080 * Pointer to Tx queue structure. 2081 * @param pkts 2082 * Pointer to array of packets to be stored. 2083 * @param pkts_n 2084 * Number of packets to be stored. 2085 * @param olx 2086 * Configured Tx offloads mask. It is fully defined at 2087 * compile time and may be used for optimization. 
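 *
 * Usage sketch (illustrative only, not a verbatim excerpt; the local
 * names "part" and "loc" are assumptions borrowed from the burst
 * template later in this file):
 *
 *   part = loc.pkts_sent - loc.pkts_copy;
 *   if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
 *           // batch-store the sent, not yet copied mbufs for freeing
 *           mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
 *           loc.pkts_copy = loc.pkts_sent;
 *   }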
2088 */ 2089 static __rte_always_inline void 2090 mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, 2091 struct rte_mbuf **restrict pkts, 2092 unsigned int pkts_n, 2093 unsigned int olx __rte_unused) 2094 { 2095 unsigned int part; 2096 struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts; 2097 2098 MLX5_ASSERT(pkts); 2099 MLX5_ASSERT(pkts_n); 2100 part = txq->elts_s - (txq->elts_head & txq->elts_m); 2101 MLX5_ASSERT(part); 2102 MLX5_ASSERT(part <= txq->elts_s); 2103 /* This code is a good candidate for vectorizing with SIMD. */ 2104 rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)), 2105 (void *)pkts, 2106 RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *)); 2107 txq->elts_head += pkts_n; 2108 if (unlikely(part < pkts_n)) 2109 /* The copy is wrapping around the elts array. */ 2110 rte_memcpy((void *)elts, (void *)(pkts + part), 2111 (pkts_n - part) * sizeof(struct rte_mbuf *)); 2112 } 2113 2114 /** 2115 * Update completion queue consuming index via doorbell 2116 * and flush the completed data buffers. 2117 * 2118 * @param txq 2119 * Pointer to TX queue structure. 2120 * @param valid CQE pointer 2121 * if not NULL update txq->wqe_pi and flush the buffers 2122 * @param olx 2123 * Configured Tx offloads mask. It is fully defined at 2124 * compile time and may be used for optimization. 2125 */ 2126 static __rte_always_inline void 2127 mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, 2128 volatile struct mlx5_cqe *last_cqe, 2129 unsigned int olx __rte_unused) 2130 { 2131 if (likely(last_cqe != NULL)) { 2132 uint16_t tail; 2133 2134 txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter); 2135 tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m]; 2136 if (likely(tail != txq->elts_tail)) { 2137 mlx5_tx_free_elts(txq, tail, olx); 2138 MLX5_ASSERT(tail == txq->elts_tail); 2139 } 2140 } 2141 } 2142 2143 /** 2144 * Manage TX completions. This routine checks the CQ for 2145 * arrived CQEs, deduces the last accomplished WQE in SQ, 2146 * updates SQ producing index and frees all completed mbufs. 2147 * 2148 * @param txq 2149 * Pointer to TX queue structure. 2150 * @param olx 2151 * Configured Tx offloads mask. It is fully defined at 2152 * compile time and may be used for optimization. 2153 * 2154 * NOTE: not inlined intentionally, it makes tx_burst 2155 * routine smaller, simple and faster - from experiments. 2156 */ 2157 static void 2158 mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, 2159 unsigned int olx __rte_unused) 2160 { 2161 unsigned int count = MLX5_TX_COMP_MAX_CQE; 2162 volatile struct mlx5_cqe *last_cqe = NULL; 2163 uint16_t ci = txq->cq_ci; 2164 int ret; 2165 2166 static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); 2167 static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value"); 2168 do { 2169 volatile struct mlx5_cqe *cqe; 2170 2171 cqe = &txq->cqes[ci & txq->cqe_m]; 2172 ret = check_cqe(cqe, txq->cqe_s, ci); 2173 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 2174 if (likely(ret != MLX5_CQE_STATUS_ERR)) { 2175 /* No new CQEs in completion queue. */ 2176 MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN); 2177 break; 2178 } 2179 /* 2180 * Some error occurred, try to restart. 2181 * We have no barrier after WQE related Doorbell 2182 * written, make sure all writes are completed 2183 * here, before we might perform SQ reset. 
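 * A full rte_wmb() is used here (not just a compiler barrier) because
 * the error handler invoked below may reset the send queue, and every
 * WQE and doorbell store issued so far has to be globally visible
 * before the queue state is changed.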
2184 */ 2185 rte_wmb(); 2186 txq->cq_ci = ci; 2187 ret = mlx5_tx_error_cqe_handle 2188 (txq, (volatile struct mlx5_err_cqe *)cqe); 2189 if (unlikely(ret < 0)) { 2190 /* 2191 * Some error occurred on queue error 2192 * handling, we do not advance the index 2193 * here, allowing to retry on next call. 2194 */ 2195 return; 2196 } 2197 /* 2198 * We are going to fetch all entries with 2199 * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status. 2200 * The send queue is supposed to be empty. 2201 */ 2202 ++ci; 2203 txq->cq_pi = ci; 2204 last_cqe = NULL; 2205 continue; 2206 } 2207 /* Normal transmit completion. */ 2208 MLX5_ASSERT(ci != txq->cq_pi); 2209 MLX5_ASSERT((txq->fcqs[ci & txq->cqe_m] >> 16) == 2210 cqe->wqe_counter); 2211 ++ci; 2212 last_cqe = cqe; 2213 /* 2214 * We have to restrict the amount of processed CQEs 2215 * in one tx_burst routine call. The CQ may be large 2216 * and many CQEs may be updated by the NIC in one 2217 * transaction. Buffers freeing is time consuming, 2218 * multiple iterations may introduce significant 2219 * latency. 2220 */ 2221 if (likely(--count == 0)) 2222 break; 2223 } while (true); 2224 if (likely(ci != txq->cq_ci)) { 2225 /* 2226 * Update completion queue consuming index 2227 * and ring doorbell to notify hardware. 2228 */ 2229 rte_compiler_barrier(); 2230 txq->cq_ci = ci; 2231 *txq->cq_db = rte_cpu_to_be_32(ci); 2232 mlx5_tx_comp_flush(txq, last_cqe, olx); 2233 } 2234 } 2235 2236 /** 2237 * Check if the completion request flag should be set in the last WQE. 2238 * Both pushed mbufs and WQEs are monitored and the completion request 2239 * flag is set if any of thresholds is reached. 2240 * 2241 * @param txq 2242 * Pointer to TX queue structure. 2243 * @param loc 2244 * Pointer to burst routine local context. 2245 * @param olx 2246 * Configured Tx offloads mask. It is fully defined at 2247 * compile time and may be used for optimization. 2248 */ 2249 static __rte_always_inline void 2250 mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, 2251 struct mlx5_txq_local *restrict loc, 2252 unsigned int olx) 2253 { 2254 uint16_t head = txq->elts_head; 2255 unsigned int part; 2256 2257 part = MLX5_TXOFF_CONFIG(INLINE) ? 2258 0 : loc->pkts_sent - loc->pkts_copy; 2259 head += part; 2260 if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH || 2261 (MLX5_TXOFF_CONFIG(INLINE) && 2262 (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) { 2263 volatile struct mlx5_wqe *last = loc->wqe_last; 2264 2265 txq->elts_comp = head; 2266 if (MLX5_TXOFF_CONFIG(INLINE)) 2267 txq->wqe_comp = txq->wqe_ci; 2268 /* Request unconditional completion on last WQE. */ 2269 last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS << 2270 MLX5_COMP_MODE_OFFSET); 2271 /* Save elts_head in dedicated free on completion queue. */ 2272 #ifdef RTE_LIBRTE_MLX5_DEBUG 2273 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head | 2274 (last->cseg.opcode >> 8) << 16; 2275 #else 2276 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head; 2277 #endif 2278 /* A CQE slot must always be available. */ 2279 MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s); 2280 } 2281 } 2282 2283 /** 2284 * DPDK callback to check the status of a tx descriptor. 2285 * 2286 * @param tx_queue 2287 * The tx queue. 2288 * @param[in] offset 2289 * The index of the descriptor in the ring. 2290 * 2291 * @return 2292 * The status of the tx descriptor. 
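 *
 * Illustrative usage through the generic ethdev API (application-side
 * sketch; port_id, queue_id and offset are assumed variables):
 *
 *   if (rte_eth_tx_descriptor_status(port_id, queue_id, offset) ==
 *       RTE_ETH_TX_DESC_DONE)
 *           printf("descriptor %u is already transmitted\n", offset);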
2293 */ 2294 int 2295 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) 2296 { 2297 struct mlx5_txq_data *restrict txq = tx_queue; 2298 uint16_t used; 2299 2300 mlx5_tx_handle_completion(txq, 0); 2301 used = txq->elts_head - txq->elts_tail; 2302 if (offset < used) 2303 return RTE_ETH_TX_DESC_FULL; 2304 return RTE_ETH_TX_DESC_DONE; 2305 } 2306 2307 /** 2308 * Build the Control Segment with specified opcode: 2309 * - MLX5_OPCODE_SEND 2310 * - MLX5_OPCODE_ENHANCED_MPSW 2311 * - MLX5_OPCODE_TSO 2312 * 2313 * @param txq 2314 * Pointer to TX queue structure. 2315 * @param loc 2316 * Pointer to burst routine local context. 2317 * @param wqe 2318 * Pointer to WQE to fill with built Control Segment. 2319 * @param ds 2320 * Supposed length of WQE in segments. 2321 * @param opcode 2322 * SQ WQE opcode to put into Control Segment. 2323 * @param olx 2324 * Configured Tx offloads mask. It is fully defined at 2325 * compile time and may be used for optimization. 2326 */ 2327 static __rte_always_inline void 2328 mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, 2329 struct mlx5_txq_local *restrict loc __rte_unused, 2330 struct mlx5_wqe *restrict wqe, 2331 unsigned int ds, 2332 unsigned int opcode, 2333 unsigned int olx __rte_unused) 2334 { 2335 struct mlx5_wqe_cseg *restrict cs = &wqe->cseg; 2336 2337 /* For legacy MPW replace the EMPW by TSO with modifier. */ 2338 if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW) 2339 opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24; 2340 cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode); 2341 cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 2342 cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << 2343 MLX5_COMP_MODE_OFFSET); 2344 cs->misc = RTE_BE32(0); 2345 } 2346 2347 /** 2348 * Build the Ethernet Segment without inlined data. 2349 * Supports Software Parser, Checksums and VLAN 2350 * insertion Tx offload features. 2351 * 2352 * @param txq 2353 * Pointer to TX queue structure. 2354 * @param loc 2355 * Pointer to burst routine local context. 2356 * @param wqe 2357 * Pointer to WQE to fill with built Ethernet Segment. 2358 * @param olx 2359 * Configured Tx offloads mask. It is fully defined at 2360 * compile time and may be used for optimization. 2361 */ 2362 static __rte_always_inline void 2363 mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused, 2364 struct mlx5_txq_local *restrict loc, 2365 struct mlx5_wqe *restrict wqe, 2366 unsigned int olx) 2367 { 2368 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2369 uint32_t csum; 2370 2371 /* 2372 * Calculate and set check sum flags first, dword field 2373 * in segment may be shared with Software Parser flags. 2374 */ 2375 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2376 es->flags = rte_cpu_to_le_32(csum); 2377 /* 2378 * Calculate and set Software Parser offsets and flags. 2379 * These flags a set for custom UDP and IP tunnel packets. 2380 */ 2381 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2382 /* Fill metadata field if needed. */ 2383 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2384 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2385 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2386 /* Engage VLAN tag insertion feature if requested. */ 2387 if (MLX5_TXOFF_CONFIG(VLAN) && 2388 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 2389 /* 2390 * We should get here only if device support 2391 * this feature correctly. 
2392 */ 2393 MLX5_ASSERT(txq->vlan_en); 2394 es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT | 2395 loc->mbuf->vlan_tci); 2396 } else { 2397 es->inline_hdr = RTE_BE32(0); 2398 } 2399 } 2400 2401 /** 2402 * Build the Ethernet Segment with minimal inlined data 2403 * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is 2404 * used to fill the gap in single WQEBB WQEs. 2405 * Supports Software Parser, Checksums and VLAN 2406 * insertion Tx offload features. 2407 * 2408 * @param txq 2409 * Pointer to TX queue structure. 2410 * @param loc 2411 * Pointer to burst routine local context. 2412 * @param wqe 2413 * Pointer to WQE to fill with built Ethernet Segment. 2414 * @param vlan 2415 * Length of VLAN tag insertion if any. 2416 * @param olx 2417 * Configured Tx offloads mask. It is fully defined at 2418 * compile time and may be used for optimization. 2419 */ 2420 static __rte_always_inline void 2421 mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused, 2422 struct mlx5_txq_local *restrict loc, 2423 struct mlx5_wqe *restrict wqe, 2424 unsigned int vlan, 2425 unsigned int olx) 2426 { 2427 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2428 uint32_t csum; 2429 uint8_t *psrc, *pdst; 2430 2431 /* 2432 * Calculate and set check sum flags first, dword field 2433 * in segment may be shared with Software Parser flags. 2434 */ 2435 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2436 es->flags = rte_cpu_to_le_32(csum); 2437 /* 2438 * Calculate and set Software Parser offsets and flags. 2439 * These flags a set for custom UDP and IP tunnel packets. 2440 */ 2441 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2442 /* Fill metadata field if needed. */ 2443 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2444 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2445 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2446 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2447 (sizeof(uint16_t) + 2448 sizeof(rte_v128u32_t)), 2449 "invalid Ethernet Segment data size"); 2450 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2451 (sizeof(uint16_t) + 2452 sizeof(struct rte_vlan_hdr) + 2453 2 * RTE_ETHER_ADDR_LEN), 2454 "invalid Ethernet Segment data size"); 2455 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2456 es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE); 2457 es->inline_data = *(unaligned_uint16_t *)psrc; 2458 psrc += sizeof(uint16_t); 2459 pdst = (uint8_t *)(es + 1); 2460 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2461 /* Implement VLAN tag insertion as part inline data. */ 2462 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2463 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2464 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2465 /* Insert VLAN ethertype + VLAN tag. */ 2466 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2467 ((RTE_ETHER_TYPE_VLAN << 16) | 2468 loc->mbuf->vlan_tci); 2469 pdst += sizeof(struct rte_vlan_hdr); 2470 /* Copy the rest two bytes from packet data. */ 2471 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2472 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2473 } else { 2474 /* Fill the gap in the title WQEBB with inline data. */ 2475 rte_mov16(pdst, psrc); 2476 } 2477 } 2478 2479 /** 2480 * Build the Ethernet Segment with entire packet 2481 * data inlining. Checks the boundary of WQEBB and 2482 * ring buffer wrapping, supports Software Parser, 2483 * Checksums and VLAN insertion Tx offload features. 2484 * 2485 * @param txq 2486 * Pointer to TX queue structure. 
2487 * @param loc 2488 * Pointer to burst routine local context. 2489 * @param wqe 2490 * Pointer to WQE to fill with built Ethernet Segment. 2491 * @param vlan 2492 * Length of VLAN tag insertion if any. 2493 * @param inlen 2494 * Length of data to inline (VLAN included, if any). 2495 * @param tso 2496 * TSO flag, set mss field from the packet. 2497 * @param olx 2498 * Configured Tx offloads mask. It is fully defined at 2499 * compile time and may be used for optimization. 2500 * 2501 * @return 2502 * Pointer to the next Data Segment (aligned and wrapped around). 2503 */ 2504 static __rte_always_inline struct mlx5_wqe_dseg * 2505 mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, 2506 struct mlx5_txq_local *restrict loc, 2507 struct mlx5_wqe *restrict wqe, 2508 unsigned int vlan, 2509 unsigned int inlen, 2510 unsigned int tso, 2511 unsigned int olx) 2512 { 2513 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2514 uint32_t csum; 2515 uint8_t *psrc, *pdst; 2516 unsigned int part; 2517 2518 /* 2519 * Calculate and set check sum flags first, dword field 2520 * in segment may be shared with Software Parser flags. 2521 */ 2522 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2523 if (tso) { 2524 csum <<= 24; 2525 csum |= loc->mbuf->tso_segsz; 2526 es->flags = rte_cpu_to_be_32(csum); 2527 } else { 2528 es->flags = rte_cpu_to_le_32(csum); 2529 } 2530 /* 2531 * Calculate and set Software Parser offsets and flags. 2532 * These flags a set for custom UDP and IP tunnel packets. 2533 */ 2534 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2535 /* Fill metadata field if needed. */ 2536 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2537 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2538 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2539 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2540 (sizeof(uint16_t) + 2541 sizeof(rte_v128u32_t)), 2542 "invalid Ethernet Segment data size"); 2543 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2544 (sizeof(uint16_t) + 2545 sizeof(struct rte_vlan_hdr) + 2546 2 * RTE_ETHER_ADDR_LEN), 2547 "invalid Ethernet Segment data size"); 2548 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2549 es->inline_hdr_sz = rte_cpu_to_be_16(inlen); 2550 es->inline_data = *(unaligned_uint16_t *)psrc; 2551 psrc += sizeof(uint16_t); 2552 pdst = (uint8_t *)(es + 1); 2553 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2554 /* Implement VLAN tag insertion as part inline data. */ 2555 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2556 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2557 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2558 /* Insert VLAN ethertype + VLAN tag. */ 2559 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2560 ((RTE_ETHER_TYPE_VLAN << 16) | 2561 loc->mbuf->vlan_tci); 2562 pdst += sizeof(struct rte_vlan_hdr); 2563 /* Copy the rest two bytes from packet data. */ 2564 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2565 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2566 psrc += sizeof(uint16_t); 2567 } else { 2568 /* Fill the gap in the title WQEBB with inline data. */ 2569 rte_mov16(pdst, psrc); 2570 psrc += sizeof(rte_v128u32_t); 2571 } 2572 pdst = (uint8_t *)(es + 2); 2573 MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2574 MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end); 2575 inlen -= MLX5_ESEG_MIN_INLINE_SIZE; 2576 if (!inlen) { 2577 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 2578 return (struct mlx5_wqe_dseg *)pdst; 2579 } 2580 /* 2581 * The WQEBB space availability is checked by caller. 
2582 * Here we should be aware of WQE ring buffer wraparound only. 2583 */ 2584 part = (uint8_t *)txq->wqes_end - pdst; 2585 part = RTE_MIN(part, inlen); 2586 do { 2587 rte_memcpy(pdst, psrc, part); 2588 inlen -= part; 2589 if (likely(!inlen)) { 2590 /* 2591 * If return value is not used by the caller 2592 * the code below will be optimized out. 2593 */ 2594 pdst += part; 2595 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2596 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 2597 pdst = (uint8_t *)txq->wqes; 2598 return (struct mlx5_wqe_dseg *)pdst; 2599 } 2600 pdst = (uint8_t *)txq->wqes; 2601 psrc += part; 2602 part = inlen; 2603 } while (true); 2604 } 2605 2606 /** 2607 * Copy data from chain of mbuf to the specified linear buffer. 2608 * Checksums and VLAN insertion Tx offload features. If data 2609 * from some mbuf copied completely this mbuf is freed. Local 2610 * structure is used to keep the byte stream state. 2611 * 2612 * @param pdst 2613 * Pointer to the destination linear buffer. 2614 * @param loc 2615 * Pointer to burst routine local context. 2616 * @param len 2617 * Length of data to be copied. 2618 * @param must 2619 * Length of data to be copied ignoring no inline hint. 2620 * @param olx 2621 * Configured Tx offloads mask. It is fully defined at 2622 * compile time and may be used for optimization. 2623 * 2624 * @return 2625 * Number of actual copied data bytes. This is always greater than or 2626 * equal to must parameter and might be lesser than len in no inline 2627 * hint flag is encountered. 2628 */ 2629 static __rte_always_inline unsigned int 2630 mlx5_tx_mseg_memcpy(uint8_t *pdst, 2631 struct mlx5_txq_local *restrict loc, 2632 unsigned int len, 2633 unsigned int must, 2634 unsigned int olx __rte_unused) 2635 { 2636 struct rte_mbuf *mbuf; 2637 unsigned int part, dlen, copy = 0; 2638 uint8_t *psrc; 2639 2640 MLX5_ASSERT(len); 2641 MLX5_ASSERT(must <= len); 2642 do { 2643 /* Allow zero length packets, must check first. */ 2644 dlen = rte_pktmbuf_data_len(loc->mbuf); 2645 if (dlen <= loc->mbuf_off) { 2646 /* Exhausted packet, just free. */ 2647 mbuf = loc->mbuf; 2648 loc->mbuf = mbuf->next; 2649 rte_pktmbuf_free_seg(mbuf); 2650 loc->mbuf_off = 0; 2651 MLX5_ASSERT(loc->mbuf_nseg > 1); 2652 MLX5_ASSERT(loc->mbuf); 2653 --loc->mbuf_nseg; 2654 if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) { 2655 unsigned int diff; 2656 2657 if (copy >= must) { 2658 /* 2659 * We already copied the minimal 2660 * requested amount of data. 2661 */ 2662 return copy; 2663 } 2664 diff = must - copy; 2665 if (diff <= rte_pktmbuf_data_len(loc->mbuf)) { 2666 /* 2667 * Copy only the minimal required 2668 * part of the data buffer. 2669 */ 2670 len = diff; 2671 } 2672 } 2673 continue; 2674 } 2675 dlen -= loc->mbuf_off; 2676 psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 2677 loc->mbuf_off); 2678 part = RTE_MIN(len, dlen); 2679 rte_memcpy(pdst, psrc, part); 2680 copy += part; 2681 loc->mbuf_off += part; 2682 len -= part; 2683 if (!len) { 2684 if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) { 2685 loc->mbuf_off = 0; 2686 /* Exhausted packet, just free. */ 2687 mbuf = loc->mbuf; 2688 loc->mbuf = mbuf->next; 2689 rte_pktmbuf_free_seg(mbuf); 2690 loc->mbuf_off = 0; 2691 MLX5_ASSERT(loc->mbuf_nseg >= 1); 2692 --loc->mbuf_nseg; 2693 } 2694 return copy; 2695 } 2696 pdst += part; 2697 } while (true); 2698 } 2699 2700 /** 2701 * Build the Ethernet Segment with inlined data from 2702 * multi-segment packet. 
Checks the boundary of WQEBB 2703 * and ring buffer wrapping, supports Software Parser, 2704 * Checksums and VLAN insertion Tx offload features. 2705 * 2706 * @param txq 2707 * Pointer to TX queue structure. 2708 * @param loc 2709 * Pointer to burst routine local context. 2710 * @param wqe 2711 * Pointer to WQE to fill with built Ethernet Segment. 2712 * @param vlan 2713 * Length of VLAN tag insertion if any. 2714 * @param inlen 2715 * Length of data to inline (VLAN included, if any). 2716 * @param tso 2717 * TSO flag, set mss field from the packet. 2718 * @param olx 2719 * Configured Tx offloads mask. It is fully defined at 2720 * compile time and may be used for optimization. 2721 * 2722 * @return 2723 * Pointer to the next Data Segment (aligned and 2724 * possible NOT wrapped around - caller should do 2725 * wrapping check on its own). 2726 */ 2727 static __rte_always_inline struct mlx5_wqe_dseg * 2728 mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, 2729 struct mlx5_txq_local *restrict loc, 2730 struct mlx5_wqe *restrict wqe, 2731 unsigned int vlan, 2732 unsigned int inlen, 2733 unsigned int tso, 2734 unsigned int olx) 2735 { 2736 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2737 uint32_t csum; 2738 uint8_t *pdst; 2739 unsigned int part, tlen = 0; 2740 2741 /* 2742 * Calculate and set check sum flags first, uint32_t field 2743 * in segment may be shared with Software Parser flags. 2744 */ 2745 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2746 if (tso) { 2747 csum <<= 24; 2748 csum |= loc->mbuf->tso_segsz; 2749 es->flags = rte_cpu_to_be_32(csum); 2750 } else { 2751 es->flags = rte_cpu_to_le_32(csum); 2752 } 2753 /* 2754 * Calculate and set Software Parser offsets and flags. 2755 * These flags a set for custom UDP and IP tunnel packets. 2756 */ 2757 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2758 /* Fill metadata field if needed. */ 2759 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2760 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2761 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2762 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2763 (sizeof(uint16_t) + 2764 sizeof(rte_v128u32_t)), 2765 "invalid Ethernet Segment data size"); 2766 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2767 (sizeof(uint16_t) + 2768 sizeof(struct rte_vlan_hdr) + 2769 2 * RTE_ETHER_ADDR_LEN), 2770 "invalid Ethernet Segment data size"); 2771 MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2772 pdst = (uint8_t *)&es->inline_data; 2773 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2774 /* Implement VLAN tag insertion as part inline data. */ 2775 mlx5_tx_mseg_memcpy(pdst, loc, 2776 2 * RTE_ETHER_ADDR_LEN, 2777 2 * RTE_ETHER_ADDR_LEN, olx); 2778 pdst += 2 * RTE_ETHER_ADDR_LEN; 2779 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2780 ((RTE_ETHER_TYPE_VLAN << 16) | 2781 loc->mbuf->vlan_tci); 2782 pdst += sizeof(struct rte_vlan_hdr); 2783 tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr); 2784 } 2785 MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end); 2786 /* 2787 * The WQEBB space availability is checked by caller. 2788 * Here we should be aware of WQE ring buffer wraparound only. 2789 */ 2790 part = (uint8_t *)txq->wqes_end - pdst; 2791 part = RTE_MIN(part, inlen - tlen); 2792 MLX5_ASSERT(part); 2793 do { 2794 unsigned int copy; 2795 2796 /* 2797 * Copying may be interrupted inside the routine 2798 * if run into no inline hint flag. 2799 */ 2800 copy = tlen >= txq->inlen_mode ? 
0 : (txq->inlen_mode - tlen); 2801 copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx); 2802 tlen += copy; 2803 if (likely(inlen <= tlen) || copy < part) { 2804 es->inline_hdr_sz = rte_cpu_to_be_16(tlen); 2805 pdst += copy; 2806 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2807 return (struct mlx5_wqe_dseg *)pdst; 2808 } 2809 pdst = (uint8_t *)txq->wqes; 2810 part = inlen - tlen; 2811 } while (true); 2812 } 2813 2814 /** 2815 * Build the Data Segment of pointer type. 2816 * 2817 * @param txq 2818 * Pointer to TX queue structure. 2819 * @param loc 2820 * Pointer to burst routine local context. 2821 * @param dseg 2822 * Pointer to WQE to fill with built Data Segment. 2823 * @param buf 2824 * Data buffer to point. 2825 * @param len 2826 * Data buffer length. 2827 * @param olx 2828 * Configured Tx offloads mask. It is fully defined at 2829 * compile time and may be used for optimization. 2830 */ 2831 static __rte_always_inline void 2832 mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq, 2833 struct mlx5_txq_local *restrict loc, 2834 struct mlx5_wqe_dseg *restrict dseg, 2835 uint8_t *buf, 2836 unsigned int len, 2837 unsigned int olx __rte_unused) 2838 2839 { 2840 MLX5_ASSERT(len); 2841 dseg->bcount = rte_cpu_to_be_32(len); 2842 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2843 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2844 } 2845 2846 /** 2847 * Build the Data Segment of pointer type or inline 2848 * if data length is less than buffer in minimal 2849 * Data Segment size. 2850 * 2851 * @param txq 2852 * Pointer to TX queue structure. 2853 * @param loc 2854 * Pointer to burst routine local context. 2855 * @param dseg 2856 * Pointer to WQE to fill with built Data Segment. 2857 * @param buf 2858 * Data buffer to point. 2859 * @param len 2860 * Data buffer length. 2861 * @param olx 2862 * Configured Tx offloads mask. It is fully defined at 2863 * compile time and may be used for optimization. 2864 */ 2865 static __rte_always_inline void 2866 mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, 2867 struct mlx5_txq_local *restrict loc, 2868 struct mlx5_wqe_dseg *restrict dseg, 2869 uint8_t *buf, 2870 unsigned int len, 2871 unsigned int olx __rte_unused) 2872 2873 { 2874 uintptr_t dst, src; 2875 2876 MLX5_ASSERT(len); 2877 if (len > MLX5_DSEG_MIN_INLINE_SIZE) { 2878 dseg->bcount = rte_cpu_to_be_32(len); 2879 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2880 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2881 2882 return; 2883 } 2884 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2885 /* Unrolled implementation of generic rte_memcpy. 
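 * The inlined length never exceeds MLX5_DSEG_MIN_INLINE_SIZE here, so
 * a fixed sequence of at most one 8-, 4-, 2- and 1-byte move, selected
 * by the length bits, is cheaper than a generic memcpy call; the
 * RTE_ARCH_STRICT_ALIGN branch splits the 8-byte move into two 4-byte
 * stores for architectures that cannot do unaligned 64-bit accesses.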
*/ 2886 dst = (uintptr_t)&dseg->inline_data[0]; 2887 src = (uintptr_t)buf; 2888 if (len & 0x08) { 2889 #ifdef RTE_ARCH_STRICT_ALIGN 2890 MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t))); 2891 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2892 dst += sizeof(uint32_t); 2893 src += sizeof(uint32_t); 2894 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2895 dst += sizeof(uint32_t); 2896 src += sizeof(uint32_t); 2897 #else 2898 *(uint64_t *)dst = *(unaligned_uint64_t *)src; 2899 dst += sizeof(uint64_t); 2900 src += sizeof(uint64_t); 2901 #endif 2902 } 2903 if (len & 0x04) { 2904 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2905 dst += sizeof(uint32_t); 2906 src += sizeof(uint32_t); 2907 } 2908 if (len & 0x02) { 2909 *(uint16_t *)dst = *(unaligned_uint16_t *)src; 2910 dst += sizeof(uint16_t); 2911 src += sizeof(uint16_t); 2912 } 2913 if (len & 0x01) 2914 *(uint8_t *)dst = *(uint8_t *)src; 2915 } 2916 2917 /** 2918 * Build the Data Segment of inlined data from single 2919 * segment packet, no VLAN insertion. 2920 * 2921 * @param txq 2922 * Pointer to TX queue structure. 2923 * @param loc 2924 * Pointer to burst routine local context. 2925 * @param dseg 2926 * Pointer to WQE to fill with built Data Segment. 2927 * @param buf 2928 * Data buffer to point. 2929 * @param len 2930 * Data buffer length. 2931 * @param olx 2932 * Configured Tx offloads mask. It is fully defined at 2933 * compile time and may be used for optimization. 2934 * 2935 * @return 2936 * Pointer to the next Data Segment after inlined data. 2937 * Ring buffer wraparound check is needed. We do not 2938 * do it here because it may not be needed for the 2939 * last packet in the eMPW session. 2940 */ 2941 static __rte_always_inline struct mlx5_wqe_dseg * 2942 mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, 2943 struct mlx5_txq_local *restrict loc __rte_unused, 2944 struct mlx5_wqe_dseg *restrict dseg, 2945 uint8_t *buf, 2946 unsigned int len, 2947 unsigned int olx __rte_unused) 2948 { 2949 unsigned int part; 2950 uint8_t *pdst; 2951 2952 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2953 pdst = &dseg->inline_data[0]; 2954 /* 2955 * The WQEBB space availability is checked by caller. 2956 * Here we should be aware of WQE ring buffer wraparound only. 2957 */ 2958 part = (uint8_t *)txq->wqes_end - pdst; 2959 part = RTE_MIN(part, len); 2960 do { 2961 rte_memcpy(pdst, buf, part); 2962 len -= part; 2963 if (likely(!len)) { 2964 pdst += part; 2965 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2966 /* Note: no final wraparound check here. */ 2967 return (struct mlx5_wqe_dseg *)pdst; 2968 } 2969 pdst = (uint8_t *)txq->wqes; 2970 buf += part; 2971 part = len; 2972 } while (true); 2973 } 2974 2975 /** 2976 * Build the Data Segment of inlined data from single 2977 * segment packet with VLAN insertion. 2978 * 2979 * @param txq 2980 * Pointer to TX queue structure. 2981 * @param loc 2982 * Pointer to burst routine local context. 2983 * @param dseg 2984 * Pointer to the dseg fill with built Data Segment. 2985 * @param buf 2986 * Data buffer to point. 2987 * @param len 2988 * Data buffer length. 2989 * @param olx 2990 * Configured Tx offloads mask. It is fully defined at 2991 * compile time and may be used for optimization. 2992 * 2993 * @return 2994 * Pointer to the next Data Segment after inlined data. 2995 * Ring buffer wraparound check is needed. 
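 *
 * Resulting inline layout (sketch, offsets in bytes within the
 * inlined data, for a frame of "len" bytes before VLAN insertion):
 *
 *   [ 0..11]  destination and source MAC addresses from the packet
 *   [12..15]  0x8100 ethertype and vlan_tci written by this routine
 *   [16.. ]   remaining (len - 12) bytes of the original frame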
2996 */ 2997 static __rte_always_inline struct mlx5_wqe_dseg * 2998 mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, 2999 struct mlx5_txq_local *restrict loc __rte_unused, 3000 struct mlx5_wqe_dseg *restrict dseg, 3001 uint8_t *buf, 3002 unsigned int len, 3003 unsigned int olx __rte_unused) 3004 3005 { 3006 unsigned int part; 3007 uint8_t *pdst; 3008 3009 MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE); 3010 static_assert(MLX5_DSEG_MIN_INLINE_SIZE == 3011 (2 * RTE_ETHER_ADDR_LEN), 3012 "invalid Data Segment data size"); 3013 dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) | 3014 MLX5_ETH_WQE_DATA_INLINE); 3015 pdst = &dseg->inline_data[0]; 3016 memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); 3017 buf += MLX5_DSEG_MIN_INLINE_SIZE; 3018 pdst += MLX5_DSEG_MIN_INLINE_SIZE; 3019 len -= MLX5_DSEG_MIN_INLINE_SIZE; 3020 /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */ 3021 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 3022 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 3023 pdst = (uint8_t *)txq->wqes; 3024 *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) | 3025 loc->mbuf->vlan_tci); 3026 pdst += sizeof(struct rte_vlan_hdr); 3027 /* 3028 * The WQEBB space availability is checked by caller. 3029 * Here we should be aware of WQE ring buffer wraparound only. 3030 */ 3031 part = (uint8_t *)txq->wqes_end - pdst; 3032 part = RTE_MIN(part, len); 3033 do { 3034 rte_memcpy(pdst, buf, part); 3035 len -= part; 3036 if (likely(!len)) { 3037 pdst += part; 3038 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 3039 /* Note: no final wraparound check here. */ 3040 return (struct mlx5_wqe_dseg *)pdst; 3041 } 3042 pdst = (uint8_t *)txq->wqes; 3043 buf += part; 3044 part = len; 3045 } while (true); 3046 } 3047 3048 /** 3049 * Build the Ethernet Segment with optionally inlined data with 3050 * VLAN insertion and following Data Segments (if any) from 3051 * multi-segment packet. Used by ordinary send and TSO. 3052 * 3053 * @param txq 3054 * Pointer to TX queue structure. 3055 * @param loc 3056 * Pointer to burst routine local context. 3057 * @param wqe 3058 * Pointer to WQE to fill with built Ethernet/Data Segments. 3059 * @param vlan 3060 * Length of VLAN header to insert, 0 means no VLAN insertion. 3061 * @param inlen 3062 * Data length to inline. For TSO this parameter specifies 3063 * exact value, for ordinary send routine can be aligned by 3064 * caller to provide better WQE space saving and data buffer 3065 * start address alignment. This length includes VLAN header 3066 * being inserted. 3067 * @param tso 3068 * Zero means ordinary send, inlined data can be extended, 3069 * otherwise this is TSO, inlined data length is fixed. 3070 * @param olx 3071 * Configured Tx offloads mask. It is fully defined at 3072 * compile time and may be used for optimization. 3073 * 3074 * @return 3075 * Actual size of built WQE in segments. 
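 *
 * Sketch of how the return value is consumed by the callers in this
 * file (each WQEBB holds four 16-byte segments, hence the rounding):
 *
 *   ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, tso, olx);
 *   wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
 *   txq->wqe_ci += (ds + 3) / 4;
 *   loc->wqe_free -= (ds + 3) / 4;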
3076 */ 3077 static __rte_always_inline unsigned int 3078 mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, 3079 struct mlx5_txq_local *restrict loc, 3080 struct mlx5_wqe *restrict wqe, 3081 unsigned int vlan, 3082 unsigned int inlen, 3083 unsigned int tso, 3084 unsigned int olx __rte_unused) 3085 { 3086 struct mlx5_wqe_dseg *restrict dseg; 3087 unsigned int ds; 3088 3089 MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen); 3090 loc->mbuf_nseg = NB_SEGS(loc->mbuf); 3091 loc->mbuf_off = 0; 3092 3093 dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx); 3094 if (!loc->mbuf_nseg) 3095 goto dseg_done; 3096 /* 3097 * There are still some mbuf remaining, not inlined. 3098 * The first mbuf may be partially inlined and we 3099 * must process the possible non-zero data offset. 3100 */ 3101 if (loc->mbuf_off) { 3102 unsigned int dlen; 3103 uint8_t *dptr; 3104 3105 /* 3106 * Exhausted packets must be dropped before. 3107 * Non-zero offset means there are some data 3108 * remained in the packet. 3109 */ 3110 MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf)); 3111 MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf)); 3112 dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 3113 loc->mbuf_off); 3114 dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off; 3115 /* 3116 * Build the pointer/minimal data Data Segment. 3117 * Do ring buffer wrapping check in advance. 3118 */ 3119 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3120 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3121 mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx); 3122 /* Store the mbuf to be freed on completion. */ 3123 MLX5_ASSERT(loc->elts_free); 3124 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3125 --loc->elts_free; 3126 ++dseg; 3127 if (--loc->mbuf_nseg == 0) 3128 goto dseg_done; 3129 loc->mbuf = loc->mbuf->next; 3130 loc->mbuf_off = 0; 3131 } 3132 do { 3133 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3134 struct rte_mbuf *mbuf; 3135 3136 /* Zero length segment found, just skip. */ 3137 mbuf = loc->mbuf; 3138 loc->mbuf = loc->mbuf->next; 3139 rte_pktmbuf_free_seg(mbuf); 3140 if (--loc->mbuf_nseg == 0) 3141 break; 3142 } else { 3143 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3144 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3145 mlx5_tx_dseg_iptr 3146 (txq, loc, dseg, 3147 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3148 rte_pktmbuf_data_len(loc->mbuf), olx); 3149 MLX5_ASSERT(loc->elts_free); 3150 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3151 --loc->elts_free; 3152 ++dseg; 3153 if (--loc->mbuf_nseg == 0) 3154 break; 3155 loc->mbuf = loc->mbuf->next; 3156 } 3157 } while (true); 3158 3159 dseg_done: 3160 /* Calculate actual segments used from the dseg pointer. */ 3161 if ((uintptr_t)wqe < (uintptr_t)dseg) 3162 ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE; 3163 else 3164 ds = (((uintptr_t)dseg - (uintptr_t)wqe) + 3165 txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE; 3166 return ds; 3167 } 3168 3169 /** 3170 * Tx one packet function for multi-segment TSO. Supports all 3171 * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs, 3172 * sends one packet per WQE. 3173 * 3174 * This routine is responsible for storing processed mbuf 3175 * into elts ring buffer and update elts_head. 3176 * 3177 * @param txq 3178 * Pointer to TX queue structure. 3179 * @param loc 3180 * Pointer to burst routine local context. 3181 * @param olx 3182 * Configured Tx offloads mask. It is fully defined at 3183 * compile time and may be used for optimization. 
3184 * 3185 * @return 3186 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3187 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3188 * Local context variables partially updated. 3189 */ 3190 static __rte_always_inline enum mlx5_txcmp_code 3191 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, 3192 struct mlx5_txq_local *restrict loc, 3193 unsigned int olx) 3194 { 3195 struct mlx5_wqe *restrict wqe; 3196 unsigned int ds, dlen, inlen, ntcp, vlan = 0; 3197 3198 /* 3199 * Calculate data length to be inlined to estimate 3200 * the required space in WQE ring buffer. 3201 */ 3202 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3203 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3204 vlan = sizeof(struct rte_vlan_hdr); 3205 inlen = loc->mbuf->l2_len + vlan + 3206 loc->mbuf->l3_len + loc->mbuf->l4_len; 3207 if (unlikely((!inlen || !loc->mbuf->tso_segsz))) 3208 return MLX5_TXCMP_CODE_ERROR; 3209 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3210 inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len; 3211 /* Packet must contain all TSO headers. */ 3212 if (unlikely(inlen > MLX5_MAX_TSO_HEADER || 3213 inlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3214 inlen > (dlen + vlan))) 3215 return MLX5_TXCMP_CODE_ERROR; 3216 MLX5_ASSERT(inlen >= txq->inlen_mode); 3217 /* 3218 * Check whether there are enough free WQEBBs: 3219 * - Control Segment 3220 * - Ethernet Segment 3221 * - First Segment of inlined Ethernet data 3222 * - ... data continued ... 3223 * - Data Segments of pointer/min inline type 3224 */ 3225 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3226 MLX5_ESEG_MIN_INLINE_SIZE + 3227 MLX5_WSEG_SIZE + 3228 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3229 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3230 return MLX5_TXCMP_CODE_EXIT; 3231 /* Check for maximal WQE size. */ 3232 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3233 return MLX5_TXCMP_CODE_ERROR; 3234 #ifdef MLX5_PMD_SOFT_COUNTERS 3235 /* Update sent data bytes/packets counters. */ 3236 ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) / 3237 loc->mbuf->tso_segsz; 3238 /* 3239 * One will be added for mbuf itself 3240 * at the end of the mlx5_tx_burst from 3241 * loc->pkts_sent field. 3242 */ 3243 --ntcp; 3244 txq->stats.opackets += ntcp; 3245 txq->stats.obytes += dlen + vlan + ntcp * inlen; 3246 #endif 3247 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3248 loc->wqe_last = wqe; 3249 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx); 3250 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx); 3251 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3252 txq->wqe_ci += (ds + 3) / 4; 3253 loc->wqe_free -= (ds + 3) / 4; 3254 return MLX5_TXCMP_CODE_MULTI; 3255 } 3256 3257 /** 3258 * Tx one packet function for multi-segment SEND. Supports all 3259 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3260 * sends one packet per WQE, without any data inlining in 3261 * Ethernet Segment. 3262 * 3263 * This routine is responsible for storing processed mbuf 3264 * into elts ring buffer and update elts_head. 3265 * 3266 * @param txq 3267 * Pointer to TX queue structure. 3268 * @param loc 3269 * Pointer to burst routine local context. 3270 * @param olx 3271 * Configured Tx offloads mask. It is fully defined at 3272 * compile time and may be used for optimization. 3273 * 3274 * @return 3275 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3276 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3277 * Local context variables partially updated. 
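 *
 * Worked size example: a packet with 4 segments needs
 * ds = 2 + 4 = 6 sixteen-byte segments (Control + Ethernet + four
 * pointer Data Segments), i.e. (6 + 3) / 4 = 2 WQEBBs in the ring.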
3278 */ 3279 static __rte_always_inline enum mlx5_txcmp_code 3280 mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, 3281 struct mlx5_txq_local *restrict loc, 3282 unsigned int olx) 3283 { 3284 struct mlx5_wqe_dseg *restrict dseg; 3285 struct mlx5_wqe *restrict wqe; 3286 unsigned int ds, nseg; 3287 3288 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); 3289 /* 3290 * No inline at all, it means the CPU cycles saving 3291 * is prioritized at configuration, we should not 3292 * copy any packet data to WQE. 3293 */ 3294 nseg = NB_SEGS(loc->mbuf); 3295 ds = 2 + nseg; 3296 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3297 return MLX5_TXCMP_CODE_EXIT; 3298 /* Check for maximal WQE size. */ 3299 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3300 return MLX5_TXCMP_CODE_ERROR; 3301 /* 3302 * Some Tx offloads may cause an error if 3303 * packet is not long enough, check against 3304 * assumed minimal length. 3305 */ 3306 if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE) 3307 return MLX5_TXCMP_CODE_ERROR; 3308 #ifdef MLX5_PMD_SOFT_COUNTERS 3309 /* Update sent data bytes counter. */ 3310 txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf); 3311 if (MLX5_TXOFF_CONFIG(VLAN) && 3312 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3313 txq->stats.obytes += sizeof(struct rte_vlan_hdr); 3314 #endif 3315 /* 3316 * SEND WQE, one WQEBB: 3317 * - Control Segment, SEND opcode 3318 * - Ethernet Segment, optional VLAN, no inline 3319 * - Data Segments, pointer only type 3320 */ 3321 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3322 loc->wqe_last = wqe; 3323 mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx); 3324 mlx5_tx_eseg_none(txq, loc, wqe, olx); 3325 dseg = &wqe->dseg[0]; 3326 do { 3327 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3328 struct rte_mbuf *mbuf; 3329 3330 /* 3331 * Zero length segment found, have to 3332 * correct total size of WQE in segments. 3333 * It is supposed to be rare occasion, so 3334 * in normal case (no zero length segments) 3335 * we avoid extra writing to the Control 3336 * Segment. 3337 */ 3338 --ds; 3339 wqe->cseg.sq_ds -= RTE_BE32(1); 3340 mbuf = loc->mbuf; 3341 loc->mbuf = mbuf->next; 3342 rte_pktmbuf_free_seg(mbuf); 3343 if (--nseg == 0) 3344 break; 3345 } else { 3346 mlx5_tx_dseg_ptr 3347 (txq, loc, dseg, 3348 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3349 rte_pktmbuf_data_len(loc->mbuf), olx); 3350 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3351 --loc->elts_free; 3352 if (--nseg == 0) 3353 break; 3354 ++dseg; 3355 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3356 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3357 loc->mbuf = loc->mbuf->next; 3358 } 3359 } while (true); 3360 txq->wqe_ci += (ds + 3) / 4; 3361 loc->wqe_free -= (ds + 3) / 4; 3362 return MLX5_TXCMP_CODE_MULTI; 3363 } 3364 3365 /** 3366 * Tx one packet function for multi-segment SEND. Supports all 3367 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3368 * sends one packet per WQE, with data inlining in 3369 * Ethernet Segment and minimal Data Segments. 3370 * 3371 * This routine is responsible for storing processed mbuf 3372 * into elts ring buffer and update elts_head. 3373 * 3374 * @param txq 3375 * Pointer to TX queue structure. 3376 * @param loc 3377 * Pointer to burst routine local context. 3378 * @param olx 3379 * Configured Tx offloads mask. It is fully defined at 3380 * compile time and may be used for optimization. 3381 * 3382 * @return 3383 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 
3384 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3385 * Local context variables partially updated. 3386 */ 3387 static __rte_always_inline enum mlx5_txcmp_code 3388 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq, 3389 struct mlx5_txq_local *restrict loc, 3390 unsigned int olx) 3391 { 3392 struct mlx5_wqe *restrict wqe; 3393 unsigned int ds, inlen, dlen, vlan = 0; 3394 3395 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 3396 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); 3397 /* 3398 * First calculate data length to be inlined 3399 * to estimate the required space for WQE. 3400 */ 3401 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3402 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3403 vlan = sizeof(struct rte_vlan_hdr); 3404 inlen = dlen + vlan; 3405 /* Check against minimal length. */ 3406 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 3407 return MLX5_TXCMP_CODE_ERROR; 3408 MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); 3409 if (inlen > txq->inlen_send || 3410 loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) { 3411 struct rte_mbuf *mbuf; 3412 unsigned int nxlen; 3413 uintptr_t start; 3414 3415 /* 3416 * Packet length exceeds the allowed inline 3417 * data length, check whether the minimal 3418 * inlining is required. 3419 */ 3420 if (txq->inlen_mode) { 3421 MLX5_ASSERT(txq->inlen_mode >= 3422 MLX5_ESEG_MIN_INLINE_SIZE); 3423 MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send); 3424 inlen = txq->inlen_mode; 3425 } else { 3426 if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE || 3427 !vlan || txq->vlan_en) { 3428 /* 3429 * VLAN insertion will be done inside by HW. 3430 * It is not utmost effective - VLAN flag is 3431 * checked twice, but we should proceed the 3432 * inlining length correctly and take into 3433 * account the VLAN header being inserted. 3434 */ 3435 return mlx5_tx_packet_multi_send 3436 (txq, loc, olx); 3437 } 3438 inlen = MLX5_ESEG_MIN_INLINE_SIZE; 3439 } 3440 /* 3441 * Now we know the minimal amount of data is requested 3442 * to inline. Check whether we should inline the buffers 3443 * from the chain beginning to eliminate some mbufs. 3444 */ 3445 mbuf = loc->mbuf; 3446 nxlen = rte_pktmbuf_data_len(mbuf); 3447 if (unlikely(nxlen <= txq->inlen_send)) { 3448 /* We can inline first mbuf at least. */ 3449 if (nxlen < inlen) { 3450 unsigned int smlen; 3451 3452 /* Scan mbufs till inlen filled. */ 3453 do { 3454 smlen = nxlen; 3455 mbuf = NEXT(mbuf); 3456 MLX5_ASSERT(mbuf); 3457 nxlen = rte_pktmbuf_data_len(mbuf); 3458 nxlen += smlen; 3459 } while (unlikely(nxlen < inlen)); 3460 if (unlikely(nxlen > txq->inlen_send)) { 3461 /* We cannot inline entire mbuf. */ 3462 smlen = inlen - smlen; 3463 start = rte_pktmbuf_mtod_offset 3464 (mbuf, uintptr_t, smlen); 3465 goto do_align; 3466 } 3467 } 3468 do { 3469 inlen = nxlen; 3470 mbuf = NEXT(mbuf); 3471 /* There should be not end of packet. */ 3472 MLX5_ASSERT(mbuf); 3473 nxlen = inlen + rte_pktmbuf_data_len(mbuf); 3474 } while (unlikely(nxlen < txq->inlen_send)); 3475 } 3476 start = rte_pktmbuf_mtod(mbuf, uintptr_t); 3477 /* 3478 * Check whether we can do inline to align start 3479 * address of data buffer to cacheline. 3480 */ 3481 do_align: 3482 start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1); 3483 if (unlikely(start)) { 3484 start += inlen; 3485 if (start <= txq->inlen_send) 3486 inlen = start; 3487 } 3488 } 3489 /* 3490 * Check whether there are enough free WQEBBs: 3491 * - Control Segment 3492 * - Ethernet Segment 3493 * - First Segment of inlined Ethernet data 3494 * - ... data continued ... 
3495 * - Data Segments of pointer/min inline type 3496 * 3497 * Estimate the number of Data Segments conservatively, 3498 * supposing no any mbufs is being freed during inlining. 3499 */ 3500 MLX5_ASSERT(inlen <= txq->inlen_send); 3501 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3502 MLX5_ESEG_MIN_INLINE_SIZE + 3503 MLX5_WSEG_SIZE + 3504 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3505 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3506 return MLX5_TXCMP_CODE_EXIT; 3507 /* Check for maximal WQE size. */ 3508 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3509 return MLX5_TXCMP_CODE_ERROR; 3510 #ifdef MLX5_PMD_SOFT_COUNTERS 3511 /* Update sent data bytes/packets counters. */ 3512 txq->stats.obytes += dlen + vlan; 3513 #endif 3514 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3515 loc->wqe_last = wqe; 3516 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx); 3517 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx); 3518 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3519 txq->wqe_ci += (ds + 3) / 4; 3520 loc->wqe_free -= (ds + 3) / 4; 3521 return MLX5_TXCMP_CODE_MULTI; 3522 } 3523 3524 /** 3525 * Tx burst function for multi-segment packets. Supports all 3526 * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs, 3527 * sends one packet per WQE. Function stops sending if it 3528 * encounters the single-segment packet. 3529 * 3530 * This routine is responsible for storing processed mbuf 3531 * into elts ring buffer and update elts_head. 3532 * 3533 * @param txq 3534 * Pointer to TX queue structure. 3535 * @param[in] pkts 3536 * Packets to transmit. 3537 * @param pkts_n 3538 * Number of packets in array. 3539 * @param loc 3540 * Pointer to burst routine local context. 3541 * @param olx 3542 * Configured Tx offloads mask. It is fully defined at 3543 * compile time and may be used for optimization. 3544 * 3545 * @return 3546 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3547 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3548 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3549 * MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered. 3550 * Local context variables updated. 3551 */ 3552 static __rte_always_inline enum mlx5_txcmp_code 3553 mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq, 3554 struct rte_mbuf **restrict pkts, 3555 unsigned int pkts_n, 3556 struct mlx5_txq_local *restrict loc, 3557 unsigned int olx) 3558 { 3559 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 3560 MLX5_ASSERT(pkts_n > loc->pkts_sent); 3561 pkts += loc->pkts_sent + 1; 3562 pkts_n -= loc->pkts_sent; 3563 for (;;) { 3564 enum mlx5_txcmp_code ret; 3565 3566 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); 3567 /* 3568 * Estimate the number of free elts quickly but 3569 * conservatively. Some segment may be fully inlined 3570 * and freed, ignore this here - precise estimation 3571 * is costly. 3572 */ 3573 if (loc->elts_free < NB_SEGS(loc->mbuf)) 3574 return MLX5_TXCMP_CODE_EXIT; 3575 if (MLX5_TXOFF_CONFIG(TSO) && 3576 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) { 3577 /* Proceed with multi-segment TSO. */ 3578 ret = mlx5_tx_packet_multi_tso(txq, loc, olx); 3579 } else if (MLX5_TXOFF_CONFIG(INLINE)) { 3580 /* Proceed with multi-segment SEND with inlining. */ 3581 ret = mlx5_tx_packet_multi_inline(txq, loc, olx); 3582 } else { 3583 /* Proceed with multi-segment SEND w/o inlining. 
*/ 3584 ret = mlx5_tx_packet_multi_send(txq, loc, olx); 3585 } 3586 if (ret == MLX5_TXCMP_CODE_EXIT) 3587 return MLX5_TXCMP_CODE_EXIT; 3588 if (ret == MLX5_TXCMP_CODE_ERROR) 3589 return MLX5_TXCMP_CODE_ERROR; 3590 /* WQE is built, go to the next packet. */ 3591 ++loc->pkts_sent; 3592 --pkts_n; 3593 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3594 return MLX5_TXCMP_CODE_EXIT; 3595 loc->mbuf = *pkts++; 3596 if (pkts_n > 1) 3597 rte_prefetch0(*pkts); 3598 if (likely(NB_SEGS(loc->mbuf) > 1)) 3599 continue; 3600 /* Here ends the series of multi-segment packets. */ 3601 if (MLX5_TXOFF_CONFIG(TSO) && 3602 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3603 return MLX5_TXCMP_CODE_TSO; 3604 return MLX5_TXCMP_CODE_SINGLE; 3605 } 3606 MLX5_ASSERT(false); 3607 } 3608 3609 /** 3610 * Tx burst function for single-segment packets with TSO. 3611 * Supports all types of Tx offloads, except multi-packets. 3612 * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE. 3613 * Function stops sending if it encounters the multi-segment 3614 * packet or packet without TSO requested. 3615 * 3616 * The routine is responsible for storing processed mbuf 3617 * into elts ring buffer and update elts_head if inline 3618 * offloads is requested due to possible early freeing 3619 * of the inlined mbufs (can not store pkts array in elts 3620 * as a batch). 3621 * 3622 * @param txq 3623 * Pointer to TX queue structure. 3624 * @param[in] pkts 3625 * Packets to transmit. 3626 * @param pkts_n 3627 * Number of packets in array. 3628 * @param loc 3629 * Pointer to burst routine local context. 3630 * @param olx 3631 * Configured Tx offloads mask. It is fully defined at 3632 * compile time and may be used for optimization. 3633 * 3634 * @return 3635 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3636 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3637 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3638 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3639 * Local context variables updated. 3640 */ 3641 static __rte_always_inline enum mlx5_txcmp_code 3642 mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq, 3643 struct rte_mbuf **restrict pkts, 3644 unsigned int pkts_n, 3645 struct mlx5_txq_local *restrict loc, 3646 unsigned int olx) 3647 { 3648 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 3649 MLX5_ASSERT(pkts_n > loc->pkts_sent); 3650 pkts += loc->pkts_sent + 1; 3651 pkts_n -= loc->pkts_sent; 3652 for (;;) { 3653 struct mlx5_wqe_dseg *restrict dseg; 3654 struct mlx5_wqe *restrict wqe; 3655 unsigned int ds, dlen, hlen, ntcp, vlan = 0; 3656 uint8_t *dptr; 3657 3658 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 3659 dlen = rte_pktmbuf_data_len(loc->mbuf); 3660 if (MLX5_TXOFF_CONFIG(VLAN) && 3661 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 3662 vlan = sizeof(struct rte_vlan_hdr); 3663 } 3664 /* 3665 * First calculate the WQE size to check 3666 * whether we have enough space in ring buffer. 3667 */ 3668 hlen = loc->mbuf->l2_len + vlan + 3669 loc->mbuf->l3_len + loc->mbuf->l4_len; 3670 if (unlikely((!hlen || !loc->mbuf->tso_segsz))) 3671 return MLX5_TXCMP_CODE_ERROR; 3672 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3673 hlen += loc->mbuf->outer_l2_len + 3674 loc->mbuf->outer_l3_len; 3675 /* Segment must contain all TSO headers. 
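 * Worked example (non-tunnel case): with l2_len = 14, l3_len = 20 and
 * l4_len = 20 the header length is hlen = 54 bytes (58 with an
 * inserted VLAN tag); these bytes are inlined into the Ethernet
 * Segment and only the TCP payload is referenced by the trailing
 * pointer Data Segment.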
*/ 3676 if (unlikely(hlen > MLX5_MAX_TSO_HEADER || 3677 hlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3678 hlen > (dlen + vlan))) 3679 return MLX5_TXCMP_CODE_ERROR; 3680 /* 3681 * Check whether there are enough free WQEBBs: 3682 * - Control Segment 3683 * - Ethernet Segment 3684 * - First Segment of inlined Ethernet data 3685 * - ... data continued ... 3686 * - Finishing Data Segment of pointer type 3687 */ 3688 ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE + 3689 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3690 if (loc->wqe_free < ((ds + 3) / 4)) 3691 return MLX5_TXCMP_CODE_EXIT; 3692 #ifdef MLX5_PMD_SOFT_COUNTERS 3693 /* Update sent data bytes/packets counters. */ 3694 ntcp = (dlen + vlan - hlen + 3695 loc->mbuf->tso_segsz - 1) / 3696 loc->mbuf->tso_segsz; 3697 /* 3698 * One will be added for mbuf itself at the end 3699 * of the mlx5_tx_burst from loc->pkts_sent field. 3700 */ 3701 --ntcp; 3702 txq->stats.opackets += ntcp; 3703 txq->stats.obytes += dlen + vlan + ntcp * hlen; 3704 #endif 3705 /* 3706 * Build the TSO WQE: 3707 * - Control Segment 3708 * - Ethernet Segment with hlen bytes inlined 3709 * - Data Segment of pointer type 3710 */ 3711 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3712 loc->wqe_last = wqe; 3713 mlx5_tx_cseg_init(txq, loc, wqe, ds, 3714 MLX5_OPCODE_TSO, olx); 3715 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx); 3716 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan; 3717 dlen -= hlen - vlan; 3718 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 3719 /* 3720 * WQE is built, update the loop parameters 3721 * and go to the next packet. 3722 */ 3723 txq->wqe_ci += (ds + 3) / 4; 3724 loc->wqe_free -= (ds + 3) / 4; 3725 if (MLX5_TXOFF_CONFIG(INLINE)) 3726 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3727 --loc->elts_free; 3728 ++loc->pkts_sent; 3729 --pkts_n; 3730 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3731 return MLX5_TXCMP_CODE_EXIT; 3732 loc->mbuf = *pkts++; 3733 if (pkts_n > 1) 3734 rte_prefetch0(*pkts); 3735 if (MLX5_TXOFF_CONFIG(MULTI) && 3736 unlikely(NB_SEGS(loc->mbuf) > 1)) 3737 return MLX5_TXCMP_CODE_MULTI; 3738 if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) 3739 return MLX5_TXCMP_CODE_SINGLE; 3740 /* Continue with the next TSO packet. */ 3741 } 3742 MLX5_ASSERT(false); 3743 } 3744 3745 /** 3746 * Analyze the packet and select the best method to send. 3747 * 3748 * @param txq 3749 * Pointer to TX queue structure. 3750 * @param loc 3751 * Pointer to burst routine local context. 3752 * @param olx 3753 * Configured Tx offloads mask. It is fully defined at 3754 * compile time and may be used for optimization. 3755 * @param newp 3756 * The predefined flag whether do complete check for 3757 * multi-segment packets and TSO. 3758 * 3759 * @return 3760 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3761 * MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO. 3762 * MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND. 3763 * MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW. 3764 */ 3765 static __rte_always_inline enum mlx5_txcmp_code 3766 mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq, 3767 struct mlx5_txq_local *restrict loc, 3768 unsigned int olx, 3769 bool newp) 3770 { 3771 /* Check for multi-segment packet. */ 3772 if (newp && 3773 MLX5_TXOFF_CONFIG(MULTI) && 3774 unlikely(NB_SEGS(loc->mbuf) > 1)) 3775 return MLX5_TXCMP_CODE_MULTI; 3776 /* Check for TSO packet. 
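	 * (Such packets are not combined into eMPW sessions, they are
	 * expected to be handled by the dedicated mlx5_tx_burst_tso()
	 * branch instead.)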
	 */
	if (newp &&
	    MLX5_TXOFF_CONFIG(TSO) &&
	    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
		return MLX5_TXCMP_CODE_TSO;
	/* Check if eMPW is enabled at all. */
	if (!MLX5_TXOFF_CONFIG(EMPW))
		return MLX5_TXCMP_CODE_SINGLE;
	/* Check if eMPW can be engaged. */
	if (MLX5_TXOFF_CONFIG(VLAN) &&
	    unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) &&
	    (!MLX5_TXOFF_CONFIG(INLINE) ||
	     unlikely((rte_pktmbuf_data_len(loc->mbuf) +
		       sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) {
		/*
		 * eMPW does not support VLAN insertion offload,
		 * we would have to inline the entire packet,
		 * but the packet is too long for inlining.
		 */
		return MLX5_TXCMP_CODE_SINGLE;
	}
	return MLX5_TXCMP_CODE_EMPW;
}

/**
 * Check whether the next packet attributes match the eMPW batch ones.
 * In addition, for legacy MPW the packet length is checked as well.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param es
 *   Pointer to Ethernet Segment of eMPW batch.
 * @param loc
 *   Pointer to burst routine local context.
 * @param dlen
 *   Length of previous packet in MPW descriptor.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 *
 * @return
 *  true - packet matches the eMPW batch attributes.
 *  false - no match, the eMPW session should be restarted.
 */
static __rte_always_inline bool
mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
		   struct mlx5_wqe_eseg *restrict es,
		   struct mlx5_txq_local *restrict loc,
		   uint32_t dlen,
		   unsigned int olx)
{
	uint8_t swp_flags = 0;

	/* Compare the checksum flags, if any. */
	if (MLX5_TXOFF_CONFIG(CSUM) &&
	    txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
		return false;
	/* Compare the Software Parser offsets and flags. */
	if (MLX5_TXOFF_CONFIG(SWP) &&
	    (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
	     es->swp_flags != swp_flags))
		return false;
	/* Compare the metadata, if any. */
	if (MLX5_TXOFF_CONFIG(METADATA) &&
	    es->metadata != (loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
			     *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0))
		return false;
	/* Legacy MPW can send packets with the same length only. */
	if (MLX5_TXOFF_CONFIG(MPW) &&
	    dlen != rte_pktmbuf_data_len(loc->mbuf))
		return false;
	/* There must be no VLAN packets in eMPW loop. */
	if (MLX5_TXOFF_CONFIG(VLAN))
		MLX5_ASSERT(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
	return true;
}

/**
 * Update send loop variables and WQE for eMPW loop
 * without data inlining. The number of Data Segments is
 * equal to the number of sent packets.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param loc
 *   Pointer to burst routine local context.
 * @param ds
 *   Number of packets (and Data Segments) sent within the eMPW session.
 * @param slen
 *   Accumulated statistics, bytes sent.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
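 *
 * A minimal worked illustration (packet count assumed): after an eMPW
 * session of ds = 5 packets the WQE holds the Control Segment, the
 * Ethernet Segment and 5 Data Segments, i.e. ds + 2 = 7 WSEGs, so
 * sq_ds is set to 7 and wqe_ci advances by (7 + 3) / 4 = 2 WQEBBs.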
 */
static __rte_always_inline void
mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
		   struct mlx5_txq_local *restrict loc,
		   unsigned int ds,
		   unsigned int slen,
		   unsigned int olx __rte_unused)
{
	MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Update sent data bytes counter. */
	txq->stats.obytes += slen;
#else
	(void)slen;
#endif
	loc->elts_free -= ds;
	loc->pkts_sent += ds;
	ds += 2;
	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
	txq->wqe_ci += (ds + 3) / 4;
	loc->wqe_free -= (ds + 3) / 4;
}

/**
 * Update send loop variables and WQE for eMPW loop
 * with data inlining. Takes the size of the descriptors
 * and data pushed to the WQE.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param loc
 *   Pointer to burst routine local context.
 * @param len
 *   Total size of the built Data Segments and inlined data, in bytes.
 * @param slen
 *   Accumulated statistics, data bytes sent.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 */
static __rte_always_inline void
mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
		   struct mlx5_txq_local *restrict loc,
		   unsigned int len,
		   unsigned int slen,
		   unsigned int olx __rte_unused)
{
	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
	MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Update sent data bytes counter. */
	txq->stats.obytes += slen;
#else
	(void)slen;
#endif
	len = len / MLX5_WSEG_SIZE + 2;
	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
	txq->wqe_ci += (len + 3) / 4;
	loc->wqe_free -= (len + 3) / 4;
}

/**
 * The set of Tx burst functions for single-segment packets
 * without TSO and with Multi-Packet Writing feature support.
 * Supports all types of Tx offloads, except multi-segment
 * packets and TSO.
 *
 * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
 * as many packets per WQE as it can. If eMPW is not configured
 * or the packet cannot be sent with eMPW (VLAN insertion), the
 * ordinary SEND opcode is used and only one packet is placed
 * in the WQE.
 *
 * The routines stop sending if they encounter a multi-segment
 * packet or a packet with TSO requested.
 *
 * The routines are responsible for storing processed mbufs
 * into the elts ring buffer and updating elts_head if the inlining
 * offload is requested. Otherwise copying the mbufs to elts
 * can be postponed and completed at the end of the burst routine.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 * @param loc
 *   Pointer to burst routine local context.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 *
 * @return
 *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
 *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
 *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
 *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
3975 * MLX5_TXCMP_CODE_SINGLE - used inside functions set. 3976 * MLX5_TXCMP_CODE_EMPW - used inside functions set. 3977 * 3978 * Local context variables updated. 3979 * 3980 * 3981 * The routine sends packets with MLX5_OPCODE_EMPW 3982 * without inlining, this is dedicated optimized branch. 3983 * No VLAN insertion is supported. 3984 */ 3985 static __rte_always_inline enum mlx5_txcmp_code 3986 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq, 3987 struct rte_mbuf **restrict pkts, 3988 unsigned int pkts_n, 3989 struct mlx5_txq_local *restrict loc, 3990 unsigned int olx) 3991 { 3992 /* 3993 * Subroutine is the part of mlx5_tx_burst_single() 3994 * and sends single-segment packet with eMPW opcode 3995 * without data inlining. 3996 */ 3997 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); 3998 MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); 3999 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4000 MLX5_ASSERT(pkts_n > loc->pkts_sent); 4001 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 4002 pkts += loc->pkts_sent + 1; 4003 pkts_n -= loc->pkts_sent; 4004 for (;;) { 4005 struct mlx5_wqe_dseg *restrict dseg; 4006 struct mlx5_wqe_eseg *restrict eseg; 4007 enum mlx5_txcmp_code ret; 4008 unsigned int part, loop; 4009 unsigned int slen = 0; 4010 4011 next_empw: 4012 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4013 part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? 4014 MLX5_MPW_MAX_PACKETS : 4015 MLX5_EMPW_MAX_PACKETS); 4016 if (unlikely(loc->elts_free < part)) { 4017 /* We have no enough elts to save all mbufs. */ 4018 if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS)) 4019 return MLX5_TXCMP_CODE_EXIT; 4020 /* But we still able to send at least minimal eMPW. */ 4021 part = loc->elts_free; 4022 } 4023 /* Check whether we have enough WQEs */ 4024 if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) { 4025 if (unlikely(loc->wqe_free < 4026 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 4027 return MLX5_TXCMP_CODE_EXIT; 4028 part = (loc->wqe_free * 4) - 2; 4029 } 4030 if (likely(part > 1)) 4031 rte_prefetch0(*pkts); 4032 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4033 /* 4034 * Build eMPW title WQEBB: 4035 * - Control Segment, eMPW opcode 4036 * - Ethernet Segment, no inline 4037 */ 4038 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2, 4039 MLX5_OPCODE_ENHANCED_MPSW, olx); 4040 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 4041 olx & ~MLX5_TXOFF_CONFIG_VLAN); 4042 eseg = &loc->wqe_last->eseg; 4043 dseg = &loc->wqe_last->dseg[0]; 4044 loop = part; 4045 /* Store the packet length for legacy MPW. */ 4046 if (MLX5_TXOFF_CONFIG(MPW)) 4047 eseg->mss = rte_cpu_to_be_16 4048 (rte_pktmbuf_data_len(loc->mbuf)); 4049 for (;;) { 4050 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 4051 #ifdef MLX5_PMD_SOFT_COUNTERS 4052 /* Update sent data bytes counter. */ 4053 slen += dlen; 4054 #endif 4055 mlx5_tx_dseg_ptr 4056 (txq, loc, dseg, 4057 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 4058 dlen, olx); 4059 if (unlikely(--loop == 0)) 4060 break; 4061 loc->mbuf = *pkts++; 4062 if (likely(loop > 1)) 4063 rte_prefetch0(*pkts); 4064 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4065 /* 4066 * Unroll the completion code to avoid 4067 * returning variable value - it results in 4068 * unoptimized sequent checking in caller. 
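			 * In other words, "part" is the number of packets
			 * reserved for this eMPW session and "loop" counts
			 * down as Data Segments are built, so on any early
			 * return "part -= loop" yields the number of packets
			 * actually placed into the WQE and closed via
			 * mlx5_tx_sdone_empw().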
4069 */ 4070 if (ret == MLX5_TXCMP_CODE_MULTI) { 4071 part -= loop; 4072 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4073 if (unlikely(!loc->elts_free || 4074 !loc->wqe_free)) 4075 return MLX5_TXCMP_CODE_EXIT; 4076 return MLX5_TXCMP_CODE_MULTI; 4077 } 4078 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4079 if (ret == MLX5_TXCMP_CODE_TSO) { 4080 part -= loop; 4081 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4082 if (unlikely(!loc->elts_free || 4083 !loc->wqe_free)) 4084 return MLX5_TXCMP_CODE_EXIT; 4085 return MLX5_TXCMP_CODE_TSO; 4086 } 4087 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4088 part -= loop; 4089 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4090 if (unlikely(!loc->elts_free || 4091 !loc->wqe_free)) 4092 return MLX5_TXCMP_CODE_EXIT; 4093 return MLX5_TXCMP_CODE_SINGLE; 4094 } 4095 if (ret != MLX5_TXCMP_CODE_EMPW) { 4096 MLX5_ASSERT(false); 4097 part -= loop; 4098 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4099 return MLX5_TXCMP_CODE_ERROR; 4100 } 4101 /* 4102 * Check whether packet parameters coincide 4103 * within assumed eMPW batch: 4104 * - check sum settings 4105 * - metadata value 4106 * - software parser settings 4107 * - packets length (legacy MPW only) 4108 */ 4109 if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) { 4110 MLX5_ASSERT(loop); 4111 part -= loop; 4112 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4113 if (unlikely(!loc->elts_free || 4114 !loc->wqe_free)) 4115 return MLX5_TXCMP_CODE_EXIT; 4116 pkts_n -= part; 4117 goto next_empw; 4118 } 4119 /* Packet attributes match, continue the same eMPW. */ 4120 ++dseg; 4121 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 4122 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 4123 } 4124 /* eMPW is built successfully, update loop parameters. */ 4125 MLX5_ASSERT(!loop); 4126 MLX5_ASSERT(pkts_n >= part); 4127 #ifdef MLX5_PMD_SOFT_COUNTERS 4128 /* Update sent data bytes counter. */ 4129 txq->stats.obytes += slen; 4130 #endif 4131 loc->elts_free -= part; 4132 loc->pkts_sent += part; 4133 txq->wqe_ci += (2 + part + 3) / 4; 4134 loc->wqe_free -= (2 + part + 3) / 4; 4135 pkts_n -= part; 4136 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 4137 return MLX5_TXCMP_CODE_EXIT; 4138 loc->mbuf = *pkts++; 4139 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4140 if (unlikely(ret != MLX5_TXCMP_CODE_EMPW)) 4141 return ret; 4142 /* Continue sending eMPW batches. */ 4143 } 4144 MLX5_ASSERT(false); 4145 } 4146 4147 /** 4148 * The routine sends packets with MLX5_OPCODE_EMPW 4149 * with inlining, optionally supports VLAN insertion. 4150 */ 4151 static __rte_always_inline enum mlx5_txcmp_code 4152 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, 4153 struct rte_mbuf **restrict pkts, 4154 unsigned int pkts_n, 4155 struct mlx5_txq_local *restrict loc, 4156 unsigned int olx) 4157 { 4158 /* 4159 * Subroutine is the part of mlx5_tx_burst_single() 4160 * and sends single-segment packet with eMPW opcode 4161 * with data inlining. 
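	 * Unlike the non-inlining variant, the routine first computes
	 * "room" - the number of bytes available for Data Segments and
	 * inlined data in the largest WQE it may build - and keeps packing
	 * packets until the room, the packets or the resources are
	 * exhausted. An illustration (sizes assumed: one WQEBB is 64 bytes,
	 * Control and Ethernet Segments are 16 bytes each): with 4 WQEBBs
	 * available for the session, room = 4 * 64 - 16 - 16 = 224 bytes.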
4162 */ 4163 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 4164 MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); 4165 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4166 MLX5_ASSERT(pkts_n > loc->pkts_sent); 4167 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 4168 pkts += loc->pkts_sent + 1; 4169 pkts_n -= loc->pkts_sent; 4170 for (;;) { 4171 struct mlx5_wqe_dseg *restrict dseg; 4172 struct mlx5_wqe_eseg *restrict eseg; 4173 enum mlx5_txcmp_code ret; 4174 unsigned int room, part, nlim; 4175 unsigned int slen = 0; 4176 4177 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4178 /* 4179 * Limits the amount of packets in one WQE 4180 * to improve CQE latency generation. 4181 */ 4182 nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? 4183 MLX5_MPW_INLINE_MAX_PACKETS : 4184 MLX5_EMPW_MAX_PACKETS); 4185 /* Check whether we have minimal amount WQEs */ 4186 if (unlikely(loc->wqe_free < 4187 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 4188 return MLX5_TXCMP_CODE_EXIT; 4189 if (likely(pkts_n > 1)) 4190 rte_prefetch0(*pkts); 4191 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4192 /* 4193 * Build eMPW title WQEBB: 4194 * - Control Segment, eMPW opcode, zero DS 4195 * - Ethernet Segment, no inline 4196 */ 4197 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0, 4198 MLX5_OPCODE_ENHANCED_MPSW, olx); 4199 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 4200 olx & ~MLX5_TXOFF_CONFIG_VLAN); 4201 eseg = &loc->wqe_last->eseg; 4202 dseg = &loc->wqe_last->dseg[0]; 4203 /* Store the packet length for legacy MPW. */ 4204 if (MLX5_TXOFF_CONFIG(MPW)) 4205 eseg->mss = rte_cpu_to_be_16 4206 (rte_pktmbuf_data_len(loc->mbuf)); 4207 room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE, 4208 loc->wqe_free) * MLX5_WQE_SIZE - 4209 MLX5_WQE_CSEG_SIZE - 4210 MLX5_WQE_ESEG_SIZE; 4211 /* Build WQE till we have space, packets and resources. */ 4212 part = room; 4213 for (;;) { 4214 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 4215 uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 4216 unsigned int tlen; 4217 4218 MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); 4219 MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0); 4220 MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end); 4221 /* 4222 * Some Tx offloads may cause an error if 4223 * packet is not long enough, check against 4224 * assumed minimal length. 4225 */ 4226 if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) { 4227 part -= room; 4228 if (unlikely(!part)) 4229 return MLX5_TXCMP_CODE_ERROR; 4230 /* 4231 * We have some successfully built 4232 * packet Data Segments to send. 4233 */ 4234 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4235 return MLX5_TXCMP_CODE_ERROR; 4236 } 4237 /* Inline or not inline - that's the Question. */ 4238 if (dlen > txq->inlen_empw || 4239 loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) 4240 goto pointer_empw; 4241 /* Inline entire packet, optional VLAN insertion. */ 4242 tlen = sizeof(dseg->bcount) + dlen; 4243 if (MLX5_TXOFF_CONFIG(VLAN) && 4244 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4245 /* 4246 * The packet length must be checked in 4247 * mlx5_tx_able_to_empw() and packet 4248 * fits into inline length guaranteed. 4249 */ 4250 MLX5_ASSERT((dlen + 4251 sizeof(struct rte_vlan_hdr)) <= 4252 txq->inlen_empw); 4253 tlen += sizeof(struct rte_vlan_hdr); 4254 if (room < tlen) 4255 break; 4256 dseg = mlx5_tx_dseg_vlan(txq, loc, dseg, 4257 dptr, dlen, olx); 4258 #ifdef MLX5_PMD_SOFT_COUNTERS 4259 /* Update sent data bytes counter. 
*/ 4260 slen += sizeof(struct rte_vlan_hdr); 4261 #endif 4262 } else { 4263 if (room < tlen) 4264 break; 4265 dseg = mlx5_tx_dseg_empw(txq, loc, dseg, 4266 dptr, dlen, olx); 4267 } 4268 tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE); 4269 MLX5_ASSERT(room >= tlen); 4270 room -= tlen; 4271 /* 4272 * Packet data are completely inlined, 4273 * free the packet immediately. 4274 */ 4275 rte_pktmbuf_free_seg(loc->mbuf); 4276 goto next_mbuf; 4277 pointer_empw: 4278 /* 4279 * Not inlinable VLAN packets are 4280 * proceeded outside of this routine. 4281 */ 4282 MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); 4283 if (MLX5_TXOFF_CONFIG(VLAN)) 4284 MLX5_ASSERT(!(loc->mbuf->ol_flags & 4285 PKT_TX_VLAN_PKT)); 4286 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 4287 /* We have to store mbuf in elts.*/ 4288 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 4289 room -= MLX5_WQE_DSEG_SIZE; 4290 /* Ring buffer wraparound is checked at the loop end.*/ 4291 ++dseg; 4292 next_mbuf: 4293 #ifdef MLX5_PMD_SOFT_COUNTERS 4294 /* Update sent data bytes counter. */ 4295 slen += dlen; 4296 #endif 4297 loc->pkts_sent++; 4298 loc->elts_free--; 4299 pkts_n--; 4300 if (unlikely(!pkts_n || !loc->elts_free)) { 4301 /* 4302 * We have no resources/packets to 4303 * continue build descriptors. 4304 */ 4305 part -= room; 4306 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4307 return MLX5_TXCMP_CODE_EXIT; 4308 } 4309 loc->mbuf = *pkts++; 4310 if (likely(pkts_n > 1)) 4311 rte_prefetch0(*pkts); 4312 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4313 /* 4314 * Unroll the completion code to avoid 4315 * returning variable value - it results in 4316 * unoptimized sequent checking in caller. 4317 */ 4318 if (ret == MLX5_TXCMP_CODE_MULTI) { 4319 part -= room; 4320 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4321 if (unlikely(!loc->elts_free || 4322 !loc->wqe_free)) 4323 return MLX5_TXCMP_CODE_EXIT; 4324 return MLX5_TXCMP_CODE_MULTI; 4325 } 4326 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4327 if (ret == MLX5_TXCMP_CODE_TSO) { 4328 part -= room; 4329 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4330 if (unlikely(!loc->elts_free || 4331 !loc->wqe_free)) 4332 return MLX5_TXCMP_CODE_EXIT; 4333 return MLX5_TXCMP_CODE_TSO; 4334 } 4335 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4336 part -= room; 4337 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4338 if (unlikely(!loc->elts_free || 4339 !loc->wqe_free)) 4340 return MLX5_TXCMP_CODE_EXIT; 4341 return MLX5_TXCMP_CODE_SINGLE; 4342 } 4343 if (ret != MLX5_TXCMP_CODE_EMPW) { 4344 MLX5_ASSERT(false); 4345 part -= room; 4346 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4347 return MLX5_TXCMP_CODE_ERROR; 4348 } 4349 /* Check if we have minimal room left. */ 4350 nlim--; 4351 if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE)) 4352 break; 4353 /* 4354 * Check whether packet parameters coincide 4355 * within assumed eMPW batch: 4356 * - check sum settings 4357 * - metadata value 4358 * - software parser settings 4359 * - packets length (legacy MPW only) 4360 */ 4361 if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) 4362 break; 4363 /* Packet attributes match, continue the same eMPW. */ 4364 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 4365 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 4366 } 4367 /* 4368 * We get here to close an existing eMPW 4369 * session and start the new one. 
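		 * Note that "part" was initialized to "room", so after
		 * "part -= room" below it holds the number of bytes actually
		 * consumed by the Data Segments and inlined data;
		 * mlx5_tx_idone_empw() converts these bytes into the Data
		 * Segment count (len / MLX5_WSEG_SIZE + 2) used to close
		 * the WQE.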
4370 */ 4371 MLX5_ASSERT(pkts_n); 4372 part -= room; 4373 if (unlikely(!part)) 4374 return MLX5_TXCMP_CODE_EXIT; 4375 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4376 if (unlikely(!loc->elts_free || 4377 !loc->wqe_free)) 4378 return MLX5_TXCMP_CODE_EXIT; 4379 /* Continue the loop with new eMPW session. */ 4380 } 4381 MLX5_ASSERT(false); 4382 } 4383 4384 /** 4385 * The routine sends packets with ordinary MLX5_OPCODE_SEND. 4386 * Data inlining and VLAN insertion are supported. 4387 */ 4388 static __rte_always_inline enum mlx5_txcmp_code 4389 mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, 4390 struct rte_mbuf **restrict pkts, 4391 unsigned int pkts_n, 4392 struct mlx5_txq_local *restrict loc, 4393 unsigned int olx) 4394 { 4395 /* 4396 * Subroutine is the part of mlx5_tx_burst_single() 4397 * and sends single-segment packet with SEND opcode. 4398 */ 4399 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4400 MLX5_ASSERT(pkts_n > loc->pkts_sent); 4401 pkts += loc->pkts_sent + 1; 4402 pkts_n -= loc->pkts_sent; 4403 for (;;) { 4404 struct mlx5_wqe *restrict wqe; 4405 enum mlx5_txcmp_code ret; 4406 4407 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4408 if (MLX5_TXOFF_CONFIG(INLINE)) { 4409 unsigned int inlen, vlan = 0; 4410 4411 inlen = rte_pktmbuf_data_len(loc->mbuf); 4412 if (MLX5_TXOFF_CONFIG(VLAN) && 4413 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4414 vlan = sizeof(struct rte_vlan_hdr); 4415 inlen += vlan; 4416 static_assert((sizeof(struct rte_vlan_hdr) + 4417 sizeof(struct rte_ether_hdr)) == 4418 MLX5_ESEG_MIN_INLINE_SIZE, 4419 "invalid min inline data size"); 4420 } 4421 /* 4422 * If inlining is enabled at configuration time 4423 * the limit must be not less than minimal size. 4424 * Otherwise we would do extra check for data 4425 * size to avoid crashes due to length overflow. 4426 */ 4427 MLX5_ASSERT(txq->inlen_send >= 4428 MLX5_ESEG_MIN_INLINE_SIZE); 4429 if (inlen <= txq->inlen_send) { 4430 unsigned int seg_n, wqe_n; 4431 4432 rte_prefetch0(rte_pktmbuf_mtod 4433 (loc->mbuf, uint8_t *)); 4434 /* Check against minimal length. */ 4435 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 4436 return MLX5_TXCMP_CODE_ERROR; 4437 if (loc->mbuf->ol_flags & 4438 PKT_TX_DYNF_NOINLINE) { 4439 /* 4440 * The hint flag not to inline packet 4441 * data is set. Check whether we can 4442 * follow the hint. 4443 */ 4444 if ((!MLX5_TXOFF_CONFIG(EMPW) && 4445 txq->inlen_mode) || 4446 (MLX5_TXOFF_CONFIG(MPW) && 4447 txq->inlen_mode)) { 4448 /* 4449 * The hardware requires the 4450 * minimal inline data header. 4451 */ 4452 goto single_min_inline; 4453 } 4454 if (MLX5_TXOFF_CONFIG(VLAN) && 4455 vlan && !txq->vlan_en) { 4456 /* 4457 * We must insert VLAN tag 4458 * by software means. 4459 */ 4460 goto single_part_inline; 4461 } 4462 goto single_no_inline; 4463 } 4464 /* 4465 * Completely inlined packet data WQE: 4466 * - Control Segment, SEND opcode 4467 * - Ethernet Segment, no VLAN insertion 4468 * - Data inlined, VLAN optionally inserted 4469 * - Alignment to MLX5_WSEG_SIZE 4470 * Have to estimate amount of WQEBBs 4471 */ 4472 seg_n = (inlen + 3 * MLX5_WSEG_SIZE - 4473 MLX5_ESEG_MIN_INLINE_SIZE + 4474 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4475 /* Check if there are enough WQEBBs. 
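				 * A worked example (constant values assumed:
				 * MLX5_WSEG_SIZE = 16 and
				 * MLX5_ESEG_MIN_INLINE_SIZE = 18): for
				 * inlen = 100 bytes the formula above gives
				 * seg_n = (100 + 48 - 18 + 15) / 16 = 9 WSEGs,
				 * hence wqe_n = (9 + 3) / 4 = 3 WQEBBs must
				 * be free.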
*/ 4476 wqe_n = (seg_n + 3) / 4; 4477 if (wqe_n > loc->wqe_free) 4478 return MLX5_TXCMP_CODE_EXIT; 4479 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4480 loc->wqe_last = wqe; 4481 mlx5_tx_cseg_init(txq, loc, wqe, seg_n, 4482 MLX5_OPCODE_SEND, olx); 4483 mlx5_tx_eseg_data(txq, loc, wqe, 4484 vlan, inlen, 0, olx); 4485 txq->wqe_ci += wqe_n; 4486 loc->wqe_free -= wqe_n; 4487 /* 4488 * Packet data are completely inlined, 4489 * free the packet immediately. 4490 */ 4491 rte_pktmbuf_free_seg(loc->mbuf); 4492 } else if ((!MLX5_TXOFF_CONFIG(EMPW) || 4493 MLX5_TXOFF_CONFIG(MPW)) && 4494 txq->inlen_mode) { 4495 /* 4496 * If minimal inlining is requested the eMPW 4497 * feature should be disabled due to data is 4498 * inlined into Ethernet Segment, which can 4499 * not contain inlined data for eMPW due to 4500 * segment shared for all packets. 4501 */ 4502 struct mlx5_wqe_dseg *restrict dseg; 4503 unsigned int ds; 4504 uint8_t *dptr; 4505 4506 /* 4507 * The inline-mode settings require 4508 * to inline the specified amount of 4509 * data bytes to the Ethernet Segment. 4510 * We should check the free space in 4511 * WQE ring buffer to inline partially. 4512 */ 4513 single_min_inline: 4514 MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode); 4515 MLX5_ASSERT(inlen > txq->inlen_mode); 4516 MLX5_ASSERT(txq->inlen_mode >= 4517 MLX5_ESEG_MIN_INLINE_SIZE); 4518 /* 4519 * Check whether there are enough free WQEBBs: 4520 * - Control Segment 4521 * - Ethernet Segment 4522 * - First Segment of inlined Ethernet data 4523 * - ... data continued ... 4524 * - Finishing Data Segment of pointer type 4525 */ 4526 ds = (MLX5_WQE_CSEG_SIZE + 4527 MLX5_WQE_ESEG_SIZE + 4528 MLX5_WQE_DSEG_SIZE + 4529 txq->inlen_mode - 4530 MLX5_ESEG_MIN_INLINE_SIZE + 4531 MLX5_WQE_DSEG_SIZE + 4532 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4533 if (loc->wqe_free < ((ds + 3) / 4)) 4534 return MLX5_TXCMP_CODE_EXIT; 4535 /* 4536 * Build the ordinary SEND WQE: 4537 * - Control Segment 4538 * - Ethernet Segment, inline inlen_mode bytes 4539 * - Data Segment of pointer type 4540 */ 4541 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4542 loc->wqe_last = wqe; 4543 mlx5_tx_cseg_init(txq, loc, wqe, ds, 4544 MLX5_OPCODE_SEND, olx); 4545 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, 4546 txq->inlen_mode, 4547 0, olx); 4548 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4549 txq->inlen_mode - vlan; 4550 inlen -= txq->inlen_mode; 4551 mlx5_tx_dseg_ptr(txq, loc, dseg, 4552 dptr, inlen, olx); 4553 /* 4554 * WQE is built, update the loop parameters 4555 * and got to the next packet. 4556 */ 4557 txq->wqe_ci += (ds + 3) / 4; 4558 loc->wqe_free -= (ds + 3) / 4; 4559 /* We have to store mbuf in elts.*/ 4560 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 4561 txq->elts[txq->elts_head++ & txq->elts_m] = 4562 loc->mbuf; 4563 --loc->elts_free; 4564 } else { 4565 uint8_t *dptr; 4566 unsigned int dlen; 4567 4568 /* 4569 * Partially inlined packet data WQE, we have 4570 * some space in title WQEBB, we can fill it 4571 * with some packet data. It takes one WQEBB, 4572 * it is available, no extra space check: 4573 * - Control Segment, SEND opcode 4574 * - Ethernet Segment, no VLAN insertion 4575 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data 4576 * - Data Segment, pointer type 4577 * 4578 * We also get here if VLAN insertion is not 4579 * supported by HW, the inline is enabled. 
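				 * The resulting WQE is exactly one WQEBB
				 * (ds = 4): the Control Segment, the Ethernet
				 * Segment carrying MLX5_ESEG_MIN_INLINE_SIZE
				 * bytes of packet data (with the VLAN tag
				 * inserted by software when requested) and a
				 * single pointer-type Data Segment for the
				 * remaining payload.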
4580 */ 4581 single_part_inline: 4582 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4583 loc->wqe_last = wqe; 4584 mlx5_tx_cseg_init(txq, loc, wqe, 4, 4585 MLX5_OPCODE_SEND, olx); 4586 mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx); 4587 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4588 MLX5_ESEG_MIN_INLINE_SIZE - vlan; 4589 /* 4590 * The length check is performed above, by 4591 * comparing with txq->inlen_send. We should 4592 * not get overflow here. 4593 */ 4594 MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE); 4595 dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE; 4596 mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1], 4597 dptr, dlen, olx); 4598 ++txq->wqe_ci; 4599 --loc->wqe_free; 4600 /* We have to store mbuf in elts.*/ 4601 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 4602 txq->elts[txq->elts_head++ & txq->elts_m] = 4603 loc->mbuf; 4604 --loc->elts_free; 4605 } 4606 #ifdef MLX5_PMD_SOFT_COUNTERS 4607 /* Update sent data bytes counter. */ 4608 txq->stats.obytes += vlan + 4609 rte_pktmbuf_data_len(loc->mbuf); 4610 #endif 4611 } else { 4612 /* 4613 * No inline at all, it means the CPU cycles saving 4614 * is prioritized at configuration, we should not 4615 * copy any packet data to WQE. 4616 * 4617 * SEND WQE, one WQEBB: 4618 * - Control Segment, SEND opcode 4619 * - Ethernet Segment, optional VLAN, no inline 4620 * - Data Segment, pointer type 4621 */ 4622 single_no_inline: 4623 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4624 loc->wqe_last = wqe; 4625 mlx5_tx_cseg_init(txq, loc, wqe, 3, 4626 MLX5_OPCODE_SEND, olx); 4627 mlx5_tx_eseg_none(txq, loc, wqe, olx); 4628 mlx5_tx_dseg_ptr 4629 (txq, loc, &wqe->dseg[0], 4630 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 4631 rte_pktmbuf_data_len(loc->mbuf), olx); 4632 ++txq->wqe_ci; 4633 --loc->wqe_free; 4634 /* 4635 * We should not store mbuf pointer in elts 4636 * if no inlining is configured, this is done 4637 * by calling routine in a batch copy. 4638 */ 4639 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); 4640 --loc->elts_free; 4641 #ifdef MLX5_PMD_SOFT_COUNTERS 4642 /* Update sent data bytes counter. */ 4643 txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf); 4644 if (MLX5_TXOFF_CONFIG(VLAN) && 4645 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 4646 txq->stats.obytes += 4647 sizeof(struct rte_vlan_hdr); 4648 #endif 4649 } 4650 ++loc->pkts_sent; 4651 --pkts_n; 4652 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 4653 return MLX5_TXCMP_CODE_EXIT; 4654 loc->mbuf = *pkts++; 4655 if (pkts_n > 1) 4656 rte_prefetch0(*pkts); 4657 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4658 if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE)) 4659 return ret; 4660 } 4661 MLX5_ASSERT(false); 4662 } 4663 4664 static __rte_always_inline enum mlx5_txcmp_code 4665 mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq, 4666 struct rte_mbuf **restrict pkts, 4667 unsigned int pkts_n, 4668 struct mlx5_txq_local *restrict loc, 4669 unsigned int olx) 4670 { 4671 enum mlx5_txcmp_code ret; 4672 4673 ret = mlx5_tx_able_to_empw(txq, loc, olx, false); 4674 if (ret == MLX5_TXCMP_CODE_SINGLE) 4675 goto ordinary_send; 4676 MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW); 4677 for (;;) { 4678 /* Optimize for inline/no inline eMPW send. */ 4679 ret = (MLX5_TXOFF_CONFIG(INLINE)) ? 4680 mlx5_tx_burst_empw_inline 4681 (txq, pkts, pkts_n, loc, olx) : 4682 mlx5_tx_burst_empw_simple 4683 (txq, pkts, pkts_n, loc, olx); 4684 if (ret != MLX5_TXCMP_CODE_SINGLE) 4685 return ret; 4686 /* The resources to send one packet should remain. 
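		 * The subroutines hand control back and forth: the eMPW
		 * routines return MLX5_TXCMP_CODE_SINGLE when the next packet
		 * cannot join an eMPW session, and mlx5_tx_burst_single_send()
		 * returns MLX5_TXCMP_CODE_EMPW when eMPW becomes applicable
		 * again, so the two branches alternate until packets or
		 * resources are exhausted.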
*/ 4687 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4688 ordinary_send: 4689 ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx); 4690 MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE); 4691 if (ret != MLX5_TXCMP_CODE_EMPW) 4692 return ret; 4693 /* The resources to send one packet should remain. */ 4694 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4695 } 4696 } 4697 4698 /** 4699 * DPDK Tx callback template. This is configured template 4700 * used to generate routines optimized for specified offload setup. 4701 * One of this generated functions is chosen at SQ configuration 4702 * time. 4703 * 4704 * @param txq 4705 * Generic pointer to TX queue structure. 4706 * @param[in] pkts 4707 * Packets to transmit. 4708 * @param pkts_n 4709 * Number of packets in array. 4710 * @param olx 4711 * Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx 4712 * values. Should be static to take compile time static configuration 4713 * advantages. 4714 * 4715 * @return 4716 * Number of packets successfully transmitted (<= pkts_n). 4717 */ 4718 static __rte_always_inline uint16_t 4719 mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, 4720 struct rte_mbuf **restrict pkts, 4721 uint16_t pkts_n, 4722 unsigned int olx) 4723 { 4724 struct mlx5_txq_local loc; 4725 enum mlx5_txcmp_code ret; 4726 unsigned int part; 4727 4728 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4729 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4730 if (unlikely(!pkts_n)) 4731 return 0; 4732 loc.pkts_sent = 0; 4733 loc.pkts_copy = 0; 4734 loc.wqe_last = NULL; 4735 4736 send_loop: 4737 loc.pkts_loop = loc.pkts_sent; 4738 /* 4739 * Check if there are some CQEs, if any: 4740 * - process an encountered errors 4741 * - process the completed WQEs 4742 * - free related mbufs 4743 * - doorbell the NIC about processed CQEs 4744 */ 4745 rte_prefetch0(*(pkts + loc.pkts_sent)); 4746 mlx5_tx_handle_completion(txq, olx); 4747 /* 4748 * Calculate the number of available resources - elts and WQEs. 4749 * There are two possible different scenarios: 4750 * - no data inlining into WQEs, one WQEBB may contains upto 4751 * four packets, in this case elts become scarce resource 4752 * - data inlining into WQEs, one packet may require multiple 4753 * WQEBBs, the WQEs become the limiting factor. 4754 */ 4755 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4756 loc.elts_free = txq->elts_s - 4757 (uint16_t)(txq->elts_head - txq->elts_tail); 4758 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4759 loc.wqe_free = txq->wqe_s - 4760 (uint16_t)(txq->wqe_ci - txq->wqe_pi); 4761 if (unlikely(!loc.elts_free || !loc.wqe_free)) 4762 goto burst_exit; 4763 for (;;) { 4764 /* 4765 * Fetch the packet from array. Usually this is 4766 * the first packet in series of multi/single 4767 * segment packets. 4768 */ 4769 loc.mbuf = *(pkts + loc.pkts_sent); 4770 /* Dedicated branch for multi-segment packets. */ 4771 if (MLX5_TXOFF_CONFIG(MULTI) && 4772 unlikely(NB_SEGS(loc.mbuf) > 1)) { 4773 /* 4774 * Multi-segment packet encountered. 4775 * Hardware is able to process it only 4776 * with SEND/TSO opcodes, one packet 4777 * per WQE, do it in dedicated routine. 4778 */ 4779 enter_send_multi: 4780 MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy); 4781 part = loc.pkts_sent - loc.pkts_copy; 4782 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 4783 /* 4784 * There are some single-segment mbufs not 4785 * stored in elts. 
The mbufs must be in the 4786 * same order as WQEs, so we must copy the 4787 * mbufs to elts here, before the coming 4788 * multi-segment packet mbufs is appended. 4789 */ 4790 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, 4791 part, olx); 4792 loc.pkts_copy = loc.pkts_sent; 4793 } 4794 MLX5_ASSERT(pkts_n > loc.pkts_sent); 4795 ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx); 4796 if (!MLX5_TXOFF_CONFIG(INLINE)) 4797 loc.pkts_copy = loc.pkts_sent; 4798 /* 4799 * These returned code checks are supposed 4800 * to be optimized out due to routine inlining. 4801 */ 4802 if (ret == MLX5_TXCMP_CODE_EXIT) { 4803 /* 4804 * The routine returns this code when 4805 * all packets are sent or there is no 4806 * enough resources to complete request. 4807 */ 4808 break; 4809 } 4810 if (ret == MLX5_TXCMP_CODE_ERROR) { 4811 /* 4812 * The routine returns this code when 4813 * some error in the incoming packets 4814 * format occurred. 4815 */ 4816 txq->stats.oerrors++; 4817 break; 4818 } 4819 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4820 /* 4821 * The single-segment packet was encountered 4822 * in the array, try to send it with the 4823 * best optimized way, possible engaging eMPW. 4824 */ 4825 goto enter_send_single; 4826 } 4827 if (MLX5_TXOFF_CONFIG(TSO) && 4828 ret == MLX5_TXCMP_CODE_TSO) { 4829 /* 4830 * The single-segment TSO packet was 4831 * encountered in the array. 4832 */ 4833 goto enter_send_tso; 4834 } 4835 /* We must not get here. Something is going wrong. */ 4836 MLX5_ASSERT(false); 4837 txq->stats.oerrors++; 4838 break; 4839 } 4840 /* Dedicated branch for single-segment TSO packets. */ 4841 if (MLX5_TXOFF_CONFIG(TSO) && 4842 unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) { 4843 /* 4844 * TSO might require special way for inlining 4845 * (dedicated parameters) and is sent with 4846 * MLX5_OPCODE_TSO opcode only, provide this 4847 * in dedicated branch. 4848 */ 4849 enter_send_tso: 4850 MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1); 4851 MLX5_ASSERT(pkts_n > loc.pkts_sent); 4852 ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx); 4853 /* 4854 * These returned code checks are supposed 4855 * to be optimized out due to routine inlining. 4856 */ 4857 if (ret == MLX5_TXCMP_CODE_EXIT) 4858 break; 4859 if (ret == MLX5_TXCMP_CODE_ERROR) { 4860 txq->stats.oerrors++; 4861 break; 4862 } 4863 if (ret == MLX5_TXCMP_CODE_SINGLE) 4864 goto enter_send_single; 4865 if (MLX5_TXOFF_CONFIG(MULTI) && 4866 ret == MLX5_TXCMP_CODE_MULTI) { 4867 /* 4868 * The multi-segment packet was 4869 * encountered in the array. 4870 */ 4871 goto enter_send_multi; 4872 } 4873 /* We must not get here. Something is going wrong. */ 4874 MLX5_ASSERT(false); 4875 txq->stats.oerrors++; 4876 break; 4877 } 4878 /* 4879 * The dedicated branch for the single-segment packets 4880 * without TSO. Often these ones can be sent using 4881 * MLX5_OPCODE_EMPW with multiple packets in one WQE. 4882 * The routine builds the WQEs till it encounters 4883 * the TSO or multi-segment packet (in case if these 4884 * offloads are requested at SQ configuration time). 4885 */ 4886 enter_send_single: 4887 MLX5_ASSERT(pkts_n > loc.pkts_sent); 4888 ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx); 4889 /* 4890 * These returned code checks are supposed 4891 * to be optimized out due to routine inlining. 
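		 * Since olx is a compile-time constant in every generated
		 * routine, checks guarded by MLX5_TXOFF_CONFIG() are resolved
		 * at build time; e.g. in a routine generated without
		 * MLX5_TXOFF_CONFIG_TSO the TSO branch below disappears
		 * entirely.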
4892 */ 4893 if (ret == MLX5_TXCMP_CODE_EXIT) 4894 break; 4895 if (ret == MLX5_TXCMP_CODE_ERROR) { 4896 txq->stats.oerrors++; 4897 break; 4898 } 4899 if (MLX5_TXOFF_CONFIG(MULTI) && 4900 ret == MLX5_TXCMP_CODE_MULTI) { 4901 /* 4902 * The multi-segment packet was 4903 * encountered in the array. 4904 */ 4905 goto enter_send_multi; 4906 } 4907 if (MLX5_TXOFF_CONFIG(TSO) && 4908 ret == MLX5_TXCMP_CODE_TSO) { 4909 /* 4910 * The single-segment TSO packet was 4911 * encountered in the array. 4912 */ 4913 goto enter_send_tso; 4914 } 4915 /* We must not get here. Something is going wrong. */ 4916 MLX5_ASSERT(false); 4917 txq->stats.oerrors++; 4918 break; 4919 } 4920 /* 4921 * Main Tx loop is completed, do the rest: 4922 * - set completion request if thresholds are reached 4923 * - doorbell the hardware 4924 * - copy the rest of mbufs to elts (if any) 4925 */ 4926 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) || 4927 loc.pkts_sent >= loc.pkts_copy); 4928 /* Take a shortcut if nothing is sent. */ 4929 if (unlikely(loc.pkts_sent == loc.pkts_loop)) 4930 goto burst_exit; 4931 /* Request CQE generation if limits are reached. */ 4932 mlx5_tx_request_completion(txq, &loc, olx); 4933 /* 4934 * Ring QP doorbell immediately after WQE building completion 4935 * to improve latencies. The pure software related data treatment 4936 * can be completed after doorbell. Tx CQEs for this SQ are 4937 * processed in this thread only by the polling. 4938 * 4939 * The rdma core library can map doorbell register in two ways, 4940 * depending on the environment variable "MLX5_SHUT_UP_BF": 4941 * 4942 * - as regular cached memory, the variable is either missing or 4943 * set to zero. This type of mapping may cause the significant 4944 * doorbell register writing latency and requires explicit 4945 * memory write barrier to mitigate this issue and prevent 4946 * write combining. 4947 * 4948 * - as non-cached memory, the variable is present and set to 4949 * not "0" value. This type of mapping may cause performance 4950 * impact under heavy loading conditions but the explicit write 4951 * memory barrier is not required and it may improve core 4952 * performance. 4953 * 4954 * - the legacy behaviour (prior 19.08 release) was to use some 4955 * heuristics to decide whether write memory barrier should 4956 * be performed. This behavior is supported with specifying 4957 * tx_db_nc=2, write barrier is skipped if application 4958 * provides the full recommended burst of packets, it 4959 * supposes the next packets are coming and the write barrier 4960 * will be issued on the next burst (after descriptor writing, 4961 * at least). 4962 */ 4963 mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc && 4964 (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST)); 4965 /* Not all of the mbufs may be stored into elts yet. */ 4966 part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy; 4967 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 4968 /* 4969 * There are some single-segment mbufs not stored in elts. 4970 * It can be only if the last packet was single-segment. 4971 * The copying is gathered into one place due to it is 4972 * a good opportunity to optimize that with SIMD. 4973 * Unfortunately if inlining is enabled the gaps in 4974 * pointer array may happen due to early freeing of the 4975 * inlined mbufs. 
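		 * Here "part" equals loc.pkts_sent - loc.pkts_copy, i.e. the
		 * number of trailing single-segment mbufs whose pointers
		 * still live only in the pkts[] array; mlx5_tx_copy_elts()
		 * stores them into elts in one batch.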
4976 */ 4977 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx); 4978 loc.pkts_copy = loc.pkts_sent; 4979 } 4980 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4981 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4982 if (pkts_n > loc.pkts_sent) { 4983 /* 4984 * If burst size is large there might be no enough CQE 4985 * fetched from completion queue and no enough resources 4986 * freed to send all the packets. 4987 */ 4988 goto send_loop; 4989 } 4990 burst_exit: 4991 #ifdef MLX5_PMD_SOFT_COUNTERS 4992 /* Increment sent packets counter. */ 4993 txq->stats.opackets += loc.pkts_sent; 4994 #endif 4995 return loc.pkts_sent; 4996 } 4997 4998 /* Generate routines with Enhanced Multi-Packet Write support. */ 4999 MLX5_TXOFF_DECL(full_empw, 5000 MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW) 5001 5002 MLX5_TXOFF_DECL(none_empw, 5003 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 5004 5005 MLX5_TXOFF_DECL(md_empw, 5006 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5007 5008 MLX5_TXOFF_DECL(mt_empw, 5009 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5010 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5011 5012 MLX5_TXOFF_DECL(mtsc_empw, 5013 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5014 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5015 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5016 5017 MLX5_TXOFF_DECL(mti_empw, 5018 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5019 MLX5_TXOFF_CONFIG_INLINE | 5020 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5021 5022 MLX5_TXOFF_DECL(mtv_empw, 5023 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5024 MLX5_TXOFF_CONFIG_VLAN | 5025 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5026 5027 MLX5_TXOFF_DECL(mtiv_empw, 5028 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5029 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5030 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5031 5032 MLX5_TXOFF_DECL(sc_empw, 5033 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5034 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5035 5036 MLX5_TXOFF_DECL(sci_empw, 5037 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5038 MLX5_TXOFF_CONFIG_INLINE | 5039 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5040 5041 MLX5_TXOFF_DECL(scv_empw, 5042 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5043 MLX5_TXOFF_CONFIG_VLAN | 5044 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5045 5046 MLX5_TXOFF_DECL(sciv_empw, 5047 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5048 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5049 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5050 5051 MLX5_TXOFF_DECL(i_empw, 5052 MLX5_TXOFF_CONFIG_INLINE | 5053 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5054 5055 MLX5_TXOFF_DECL(v_empw, 5056 MLX5_TXOFF_CONFIG_VLAN | 5057 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5058 5059 MLX5_TXOFF_DECL(iv_empw, 5060 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5061 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5062 5063 /* Generate routines without Enhanced Multi-Packet Write support. 
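 * The generated function name suffixes encode the offload set, roughly:
 * m - multi-segment, t - TSO, s - software parser, c - checksum,
 * i - inline, v - VLAN insertion, md - metadata (metadata is also part
 * of most combined sets without its own letter).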
*/ 5064 MLX5_TXOFF_DECL(full, 5065 MLX5_TXOFF_CONFIG_FULL) 5066 5067 MLX5_TXOFF_DECL(none, 5068 MLX5_TXOFF_CONFIG_NONE) 5069 5070 MLX5_TXOFF_DECL(md, 5071 MLX5_TXOFF_CONFIG_METADATA) 5072 5073 MLX5_TXOFF_DECL(mt, 5074 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5075 MLX5_TXOFF_CONFIG_METADATA) 5076 5077 MLX5_TXOFF_DECL(mtsc, 5078 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5079 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5080 MLX5_TXOFF_CONFIG_METADATA) 5081 5082 MLX5_TXOFF_DECL(mti, 5083 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5084 MLX5_TXOFF_CONFIG_INLINE | 5085 MLX5_TXOFF_CONFIG_METADATA) 5086 5087 5088 MLX5_TXOFF_DECL(mtv, 5089 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5090 MLX5_TXOFF_CONFIG_VLAN | 5091 MLX5_TXOFF_CONFIG_METADATA) 5092 5093 5094 MLX5_TXOFF_DECL(mtiv, 5095 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5096 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5097 MLX5_TXOFF_CONFIG_METADATA) 5098 5099 MLX5_TXOFF_DECL(sc, 5100 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5101 MLX5_TXOFF_CONFIG_METADATA) 5102 5103 MLX5_TXOFF_DECL(sci, 5104 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5105 MLX5_TXOFF_CONFIG_INLINE | 5106 MLX5_TXOFF_CONFIG_METADATA) 5107 5108 5109 MLX5_TXOFF_DECL(scv, 5110 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5111 MLX5_TXOFF_CONFIG_VLAN | 5112 MLX5_TXOFF_CONFIG_METADATA) 5113 5114 5115 MLX5_TXOFF_DECL(sciv, 5116 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5117 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5118 MLX5_TXOFF_CONFIG_METADATA) 5119 5120 MLX5_TXOFF_DECL(i, 5121 MLX5_TXOFF_CONFIG_INLINE | 5122 MLX5_TXOFF_CONFIG_METADATA) 5123 5124 MLX5_TXOFF_DECL(v, 5125 MLX5_TXOFF_CONFIG_VLAN | 5126 MLX5_TXOFF_CONFIG_METADATA) 5127 5128 MLX5_TXOFF_DECL(iv, 5129 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5130 MLX5_TXOFF_CONFIG_METADATA) 5131 5132 /* 5133 * Generate routines with Legacy Multi-Packet Write support. 5134 * This mode is supported by ConnectX-4LX only and imposes 5135 * offload limitations, not supported: 5136 * - ACL/Flows (metadata are becoming meaningless) 5137 * - WQE Inline headers 5138 * - SRIOV (E-Switch offloads) 5139 * - VLAN insertion 5140 * - tunnel encapsulation/decapsulation 5141 * - TSO 5142 */ 5143 MLX5_TXOFF_DECL(none_mpw, 5144 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | 5145 MLX5_TXOFF_CONFIG_MPW) 5146 5147 MLX5_TXOFF_DECL(mci_mpw, 5148 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5149 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5150 MLX5_TXOFF_CONFIG_MPW) 5151 5152 MLX5_TXOFF_DECL(mc_mpw, 5153 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5154 MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) 5155 5156 MLX5_TXOFF_DECL(i_mpw, 5157 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5158 MLX5_TXOFF_CONFIG_MPW) 5159 5160 /* 5161 * Array of declared and compiled Tx burst function and corresponding 5162 * supported offloads set. The array is used to select the Tx burst 5163 * function for specified offloads set at Tx queue configuration time. 
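 * Both mlx5_select_tx_function() below and mlx5_tx_burst_mode_get()
 * walk this array: the former to pick the best matching routine for
 * the configured offload set, the latter to report which routine is
 * currently installed.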
5164 */ 5165 const struct { 5166 eth_tx_burst_t func; 5167 unsigned int olx; 5168 } txoff_func[] = { 5169 MLX5_TXOFF_INFO(full_empw, 5170 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5171 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5172 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5173 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5174 5175 MLX5_TXOFF_INFO(none_empw, 5176 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 5177 5178 MLX5_TXOFF_INFO(md_empw, 5179 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5180 5181 MLX5_TXOFF_INFO(mt_empw, 5182 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5183 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5184 5185 MLX5_TXOFF_INFO(mtsc_empw, 5186 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5187 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5188 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5189 5190 MLX5_TXOFF_INFO(mti_empw, 5191 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5192 MLX5_TXOFF_CONFIG_INLINE | 5193 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5194 5195 MLX5_TXOFF_INFO(mtv_empw, 5196 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5197 MLX5_TXOFF_CONFIG_VLAN | 5198 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5199 5200 MLX5_TXOFF_INFO(mtiv_empw, 5201 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5202 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5203 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5204 5205 MLX5_TXOFF_INFO(sc_empw, 5206 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5207 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5208 5209 MLX5_TXOFF_INFO(sci_empw, 5210 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5211 MLX5_TXOFF_CONFIG_INLINE | 5212 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5213 5214 MLX5_TXOFF_INFO(scv_empw, 5215 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5216 MLX5_TXOFF_CONFIG_VLAN | 5217 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5218 5219 MLX5_TXOFF_INFO(sciv_empw, 5220 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5221 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5222 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5223 5224 MLX5_TXOFF_INFO(i_empw, 5225 MLX5_TXOFF_CONFIG_INLINE | 5226 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5227 5228 MLX5_TXOFF_INFO(v_empw, 5229 MLX5_TXOFF_CONFIG_VLAN | 5230 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5231 5232 MLX5_TXOFF_INFO(iv_empw, 5233 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5234 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5235 5236 MLX5_TXOFF_INFO(full, 5237 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5238 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5239 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5240 MLX5_TXOFF_CONFIG_METADATA) 5241 5242 MLX5_TXOFF_INFO(none, 5243 MLX5_TXOFF_CONFIG_NONE) 5244 5245 MLX5_TXOFF_INFO(md, 5246 MLX5_TXOFF_CONFIG_METADATA) 5247 5248 MLX5_TXOFF_INFO(mt, 5249 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5250 MLX5_TXOFF_CONFIG_METADATA) 5251 5252 MLX5_TXOFF_INFO(mtsc, 5253 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5254 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5255 MLX5_TXOFF_CONFIG_METADATA) 5256 5257 MLX5_TXOFF_INFO(mti, 5258 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5259 MLX5_TXOFF_CONFIG_INLINE | 5260 MLX5_TXOFF_CONFIG_METADATA) 5261 5262 MLX5_TXOFF_INFO(mtv, 5263 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5264 MLX5_TXOFF_CONFIG_VLAN | 5265 MLX5_TXOFF_CONFIG_METADATA) 5266 5267 MLX5_TXOFF_INFO(mtiv, 5268 
MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5269 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5270 MLX5_TXOFF_CONFIG_METADATA) 5271 5272 MLX5_TXOFF_INFO(sc, 5273 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5274 MLX5_TXOFF_CONFIG_METADATA) 5275 5276 MLX5_TXOFF_INFO(sci, 5277 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5278 MLX5_TXOFF_CONFIG_INLINE | 5279 MLX5_TXOFF_CONFIG_METADATA) 5280 5281 MLX5_TXOFF_INFO(scv, 5282 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5283 MLX5_TXOFF_CONFIG_VLAN | 5284 MLX5_TXOFF_CONFIG_METADATA) 5285 5286 MLX5_TXOFF_INFO(sciv, 5287 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5288 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5289 MLX5_TXOFF_CONFIG_METADATA) 5290 5291 MLX5_TXOFF_INFO(i, 5292 MLX5_TXOFF_CONFIG_INLINE | 5293 MLX5_TXOFF_CONFIG_METADATA) 5294 5295 MLX5_TXOFF_INFO(v, 5296 MLX5_TXOFF_CONFIG_VLAN | 5297 MLX5_TXOFF_CONFIG_METADATA) 5298 5299 MLX5_TXOFF_INFO(iv, 5300 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5301 MLX5_TXOFF_CONFIG_METADATA) 5302 5303 MLX5_TXOFF_INFO(none_mpw, 5304 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | 5305 MLX5_TXOFF_CONFIG_MPW) 5306 5307 MLX5_TXOFF_INFO(mci_mpw, 5308 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5309 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5310 MLX5_TXOFF_CONFIG_MPW) 5311 5312 MLX5_TXOFF_INFO(mc_mpw, 5313 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5314 MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) 5315 5316 MLX5_TXOFF_INFO(i_mpw, 5317 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5318 MLX5_TXOFF_CONFIG_MPW) 5319 }; 5320 5321 /** 5322 * Configure the Tx function to use. The routine checks configured 5323 * Tx offloads for the device and selects appropriate Tx burst 5324 * routine. There are multiple Tx burst routines compiled from 5325 * the same template in the most optimal way for the dedicated 5326 * Tx offloads set. 5327 * 5328 * @param dev 5329 * Pointer to private data structure. 5330 * 5331 * @return 5332 * Pointer to selected Tx burst function. 5333 */ 5334 eth_tx_burst_t 5335 mlx5_select_tx_function(struct rte_eth_dev *dev) 5336 { 5337 struct mlx5_priv *priv = dev->data->dev_private; 5338 struct mlx5_dev_config *config = &priv->config; 5339 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 5340 unsigned int diff = 0, olx = 0, i, m; 5341 5342 static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <= 5343 MLX5_DSEG_MAX, "invalid WQE max size"); 5344 static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE, 5345 "invalid WQE Control Segment size"); 5346 static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE, 5347 "invalid WQE Ethernet Segment size"); 5348 static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE, 5349 "invalid WQE Data Segment size"); 5350 static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE, 5351 "invalid WQE size"); 5352 MLX5_ASSERT(priv); 5353 if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) { 5354 /* We should support Multi-Segment Packets. */ 5355 olx |= MLX5_TXOFF_CONFIG_MULTI; 5356 } 5357 if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | 5358 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 5359 DEV_TX_OFFLOAD_GRE_TNL_TSO | 5360 DEV_TX_OFFLOAD_IP_TNL_TSO | 5361 DEV_TX_OFFLOAD_UDP_TNL_TSO)) { 5362 /* We should support TCP Send Offload. */ 5363 olx |= MLX5_TXOFF_CONFIG_TSO; 5364 } 5365 if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | 5366 DEV_TX_OFFLOAD_UDP_TNL_TSO | 5367 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { 5368 /* We should support Software Parser for Tunnels. 
*/ 5369 olx |= MLX5_TXOFF_CONFIG_SWP; 5370 } 5371 if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM | 5372 DEV_TX_OFFLOAD_UDP_CKSUM | 5373 DEV_TX_OFFLOAD_TCP_CKSUM | 5374 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { 5375 /* We should support IP/TCP/UDP Checksums. */ 5376 olx |= MLX5_TXOFF_CONFIG_CSUM; 5377 } 5378 if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) { 5379 /* We should support VLAN insertion. */ 5380 olx |= MLX5_TXOFF_CONFIG_VLAN; 5381 } 5382 if (priv->txqs_n && (*priv->txqs)[0]) { 5383 struct mlx5_txq_data *txd = (*priv->txqs)[0]; 5384 5385 if (txd->inlen_send) { 5386 /* 5387 * Check the data inline requirements. Data inline 5388 * is enabled on per device basis, we can check 5389 * the first Tx queue only. 5390 * 5391 * If device does not support VLAN insertion in WQE 5392 * and some queues are requested to perform VLAN 5393 * insertion offload than inline must be enabled. 5394 */ 5395 olx |= MLX5_TXOFF_CONFIG_INLINE; 5396 } 5397 } 5398 if (config->mps == MLX5_MPW_ENHANCED && 5399 config->txq_inline_min <= 0) { 5400 /* 5401 * The NIC supports Enhanced Multi-Packet Write 5402 * and does not require minimal inline data. 5403 */ 5404 olx |= MLX5_TXOFF_CONFIG_EMPW; 5405 } 5406 if (rte_flow_dynf_metadata_avail()) { 5407 /* We should support Flow metadata. */ 5408 olx |= MLX5_TXOFF_CONFIG_METADATA; 5409 } 5410 if (config->mps == MLX5_MPW) { 5411 /* 5412 * The NIC supports Legacy Multi-Packet Write. 5413 * The MLX5_TXOFF_CONFIG_MPW controls the 5414 * descriptor building method in combination 5415 * with MLX5_TXOFF_CONFIG_EMPW. 5416 */ 5417 if (!(olx & (MLX5_TXOFF_CONFIG_TSO | 5418 MLX5_TXOFF_CONFIG_SWP | 5419 MLX5_TXOFF_CONFIG_VLAN | 5420 MLX5_TXOFF_CONFIG_METADATA))) 5421 olx |= MLX5_TXOFF_CONFIG_EMPW | 5422 MLX5_TXOFF_CONFIG_MPW; 5423 } 5424 /* 5425 * Scan the routines table to find the minimal 5426 * satisfying routine with requested offloads. 5427 */ 5428 m = RTE_DIM(txoff_func); 5429 for (i = 0; i < RTE_DIM(txoff_func); i++) { 5430 unsigned int tmp; 5431 5432 tmp = txoff_func[i].olx; 5433 if (tmp == olx) { 5434 /* Meets requested offloads exactly.*/ 5435 m = i; 5436 break; 5437 } 5438 if ((tmp & olx) != olx) { 5439 /* Does not meet requested offloads at all. */ 5440 continue; 5441 } 5442 if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW) 5443 /* Do not enable eMPW if not configured. */ 5444 continue; 5445 if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE) 5446 /* Do not enable inlining if not configured. */ 5447 continue; 5448 /* 5449 * Some routine meets the requirements. 5450 * Check whether it has minimal amount 5451 * of not requested offloads. 5452 */ 5453 tmp = __builtin_popcountl(tmp & ~olx); 5454 if (m >= RTE_DIM(txoff_func) || tmp < diff) { 5455 /* First or better match, save and continue. */ 5456 m = i; 5457 diff = tmp; 5458 continue; 5459 } 5460 if (tmp == diff) { 5461 tmp = txoff_func[i].olx ^ txoff_func[m].olx; 5462 if (__builtin_ffsl(txoff_func[i].olx & ~tmp) < 5463 __builtin_ffsl(txoff_func[m].olx & ~tmp)) { 5464 /* Lighter not requested offload. 
	if (m >= RTE_DIM(txoff_func)) {
		DRV_LOG(DEBUG, "port %u has no selected Tx function"
			       " for requested offloads %04X",
			dev->data->port_id, olx);
		return NULL;
	}
	DRV_LOG(DEBUG, "port %u has selected Tx function"
		       " supporting offloads %04X/%04X",
		dev->data->port_id, olx, txoff_func[m].olx);
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI)
		DRV_LOG(DEBUG, "\tMULTI (multi segment)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO)
		DRV_LOG(DEBUG, "\tTSO (TCP send offload)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP)
		DRV_LOG(DEBUG, "\tSWP (software parser)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM)
		DRV_LOG(DEBUG, "\tCSUM (checksum offload)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE)
		DRV_LOG(DEBUG, "\tINLIN (inline data)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN)
		DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
		DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) {
		if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW)
			DRV_LOG(DEBUG, "\tMPW (Legacy MPW)");
		else
			DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)");
	}
	return txoff_func[m].func;
}

/**
 * DPDK callback to get the Tx queue information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param tx_queue_id
 *   Tx queue identifier.
 *
 * @param qinfo
 *   Pointer to the Tx queue information structure.
 *
 * @return
 *   None.
 */
void
mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
		  struct rte_eth_txq_info *qinfo)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_txq_data *txq = (*priv->txqs)[tx_queue_id];
	struct mlx5_txq_ctrl *txq_ctrl =
			container_of(txq, struct mlx5_txq_ctrl, txq);

	if (!txq)
		return;
	qinfo->nb_desc = txq->elts_s;
	qinfo->conf.tx_thresh.pthresh = 0;
	qinfo->conf.tx_thresh.hthresh = 0;
	qinfo->conf.tx_thresh.wthresh = 0;
	qinfo->conf.tx_rs_thresh = 0;
	qinfo->conf.tx_free_thresh = 0;
	qinfo->conf.tx_deferred_start = txq_ctrl ? 0 : 1;
	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
}

/**
 * DPDK callback to get the Tx packet burst mode information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param tx_queue_id
 *   Tx queue identifier.
 *
 * @param mode
 *   Pointer to the burst mode information.
 *
 * @return
 *   0 on success, -EINVAL on failure.
 */
int
mlx5_tx_burst_mode_get(struct rte_eth_dev *dev,
		       uint16_t tx_queue_id __rte_unused,
		       struct rte_eth_burst_mode *mode)
{
	eth_tx_burst_t pkt_burst = dev->tx_pkt_burst;
	unsigned int i, olx;

	for (i = 0; i < RTE_DIM(txoff_func); i++) {
		if (pkt_burst == txoff_func[i].func) {
			olx = txoff_func[i].olx;
			snprintf(mode->info, sizeof(mode->info),
				 "%s%s%s%s%s%s%s%s",
				 (olx & MLX5_TXOFF_CONFIG_EMPW) ?
				 ((olx & MLX5_TXOFF_CONFIG_MPW) ?
				 "Legacy MPW" : "Enhanced MPW") : "No MPW",
				 (olx & MLX5_TXOFF_CONFIG_MULTI) ?
				 " + MULTI" : "",
				 (olx & MLX5_TXOFF_CONFIG_TSO) ?
				 " + TSO" : "",
				 (olx & MLX5_TXOFF_CONFIG_SWP) ?
				 " + SWP" : "",
				 (olx & MLX5_TXOFF_CONFIG_CSUM) ?
				 " + CSUM" : "",
				 (olx & MLX5_TXOFF_CONFIG_INLINE) ?
				 " + INLINE" : "",
				 (olx & MLX5_TXOFF_CONFIG_VLAN) ?
				 " + VLAN" : "",
				 (olx & MLX5_TXOFF_CONFIG_METADATA) ?
				 " + METADATA" : "");
			return 0;
		}
	}
	return -EINVAL;
}
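
/*
 * Usage sketch (illustrative only, not part of the PMD datapath): once a
 * port is configured, an application can query which Tx burst variant was
 * selected through the generic ethdev burst mode API, which ends up in
 * mlx5_tx_burst_mode_get() above. "port_id" below is a placeholder for the
 * application's port identifier; rte_ethdev.h provides the prototypes.
 *
 *	struct rte_eth_burst_mode mode;
 *
 *	if (rte_eth_tx_burst_mode_get(port_id, 0, &mode) == 0)
 *		printf("Tx burst mode: %s\n", mode.info);
 */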