/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015-2019 Mellanox Technologies, Ltd
 */

#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#include <rte_cycles.h>
#include <rte_flow.h>

#include <mlx5_devx_cmds.h>
#include <mlx5_prm.h>
#include <mlx5_common.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"

/* TX burst subroutines return codes. */
enum mlx5_txcmp_code {
	MLX5_TXCMP_CODE_EXIT = 0,
	MLX5_TXCMP_CODE_ERROR,
	MLX5_TXCMP_CODE_SINGLE,
	MLX5_TXCMP_CODE_MULTI,
	MLX5_TXCMP_CODE_TSO,
	MLX5_TXCMP_CODE_EMPW,
};

/*
 * These defines are used to configure the Tx burst routine option set
 * supported at compile time. Options that are not specified are optimized
 * out, because the related if conditions can be evaluated at compile time.
 * Offloads with a bigger runtime check overhead (requiring more CPU cycles
 * to skip) should have a bigger index - this is needed to select the best
 * matching routine when there is no exact match and some offloads are not
 * actually requested.
 */
#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/
#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/
#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/
#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/
#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/
#define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported.*/

/* The most common offloads groups.
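 * As a rough illustration of how the option masks above are consumed (the
 * helper macros are defined just below; the routine name here is only for
 * the example):
 *
 *   MLX5_TXOFF_DECL(mt, MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO)
 *
 * instantiates a Tx burst routine in which MLX5_TXOFF_CONFIG(INLINE)
 * folds to a compile-time zero, so the whole data inlining path is
 * optimized out, while the multi-segment and TSO branches are kept.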
*/ 70 #define MLX5_TXOFF_CONFIG_NONE 0 71 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \ 72 MLX5_TXOFF_CONFIG_TSO | \ 73 MLX5_TXOFF_CONFIG_SWP | \ 74 MLX5_TXOFF_CONFIG_CSUM | \ 75 MLX5_TXOFF_CONFIG_INLINE | \ 76 MLX5_TXOFF_CONFIG_VLAN | \ 77 MLX5_TXOFF_CONFIG_METADATA) 78 79 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask) 80 81 #define MLX5_TXOFF_DECL(func, olx) \ 82 static uint16_t mlx5_tx_burst_##func(void *txq, \ 83 struct rte_mbuf **pkts, \ 84 uint16_t pkts_n) \ 85 { \ 86 return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \ 87 pkts, pkts_n, (olx)); \ 88 } 89 90 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx}, 91 92 static __rte_always_inline uint32_t 93 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); 94 95 static __rte_always_inline int 96 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 97 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe); 98 99 static __rte_always_inline uint32_t 100 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); 101 102 static __rte_always_inline void 103 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 104 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); 105 106 static __rte_always_inline void 107 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 108 const unsigned int strd_n); 109 110 static int 111 mlx5_queue_state_modify(struct rte_eth_dev *dev, 112 struct mlx5_mp_arg_queue_state_modify *sm); 113 114 static inline void 115 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, 116 volatile struct mlx5_cqe *restrict cqe, 117 uint32_t phcsum); 118 119 static inline void 120 mlx5_lro_update_hdr(uint8_t *restrict padd, 121 volatile struct mlx5_cqe *restrict cqe, 122 uint32_t len); 123 124 uint32_t mlx5_ptype_table[] __rte_cache_aligned = { 125 [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ 126 }; 127 128 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned; 129 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned; 130 131 uint64_t rte_net_mlx5_dynf_inline_mask; 132 #define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask 133 134 /** 135 * Build a table to translate Rx completion flags to packet type. 136 * 137 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 138 */ 139 void 140 mlx5_set_ptype_table(void) 141 { 142 unsigned int i; 143 uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table; 144 145 /* Last entry must not be overwritten, reserved for errored packet. 
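 * The layout of the index bits is documented right after this loop; as a
 * worked example, index 0x06 encodes l3_hdr_type = 2 (IPv4) and
 * l4_hdr_type = 1 (TCP) with no fragmentation or tunneling, which is why
 * it is assigned RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 * RTE_PTYPE_L4_TCP below.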
*/ 146 for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) 147 (*p)[i] = RTE_PTYPE_UNKNOWN; 148 /* 149 * The index to the array should have: 150 * bit[1:0] = l3_hdr_type 151 * bit[4:2] = l4_hdr_type 152 * bit[5] = ip_frag 153 * bit[6] = tunneled 154 * bit[7] = outer_l3_type 155 */ 156 /* L2 */ 157 (*p)[0x00] = RTE_PTYPE_L2_ETHER; 158 /* L3 */ 159 (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 160 RTE_PTYPE_L4_NONFRAG; 161 (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 162 RTE_PTYPE_L4_NONFRAG; 163 /* Fragmented */ 164 (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 165 RTE_PTYPE_L4_FRAG; 166 (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 167 RTE_PTYPE_L4_FRAG; 168 /* TCP */ 169 (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 170 RTE_PTYPE_L4_TCP; 171 (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 172 RTE_PTYPE_L4_TCP; 173 (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 174 RTE_PTYPE_L4_TCP; 175 (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 176 RTE_PTYPE_L4_TCP; 177 (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 178 RTE_PTYPE_L4_TCP; 179 (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 180 RTE_PTYPE_L4_TCP; 181 /* UDP */ 182 (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 183 RTE_PTYPE_L4_UDP; 184 (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 185 RTE_PTYPE_L4_UDP; 186 /* Repeat with outer_l3_type being set. Just in case. */ 187 (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 188 RTE_PTYPE_L4_NONFRAG; 189 (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 190 RTE_PTYPE_L4_NONFRAG; 191 (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 192 RTE_PTYPE_L4_FRAG; 193 (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 194 RTE_PTYPE_L4_FRAG; 195 (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 196 RTE_PTYPE_L4_TCP; 197 (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 198 RTE_PTYPE_L4_TCP; 199 (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 200 RTE_PTYPE_L4_TCP; 201 (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 202 RTE_PTYPE_L4_TCP; 203 (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 204 RTE_PTYPE_L4_TCP; 205 (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 206 RTE_PTYPE_L4_TCP; 207 (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 208 RTE_PTYPE_L4_UDP; 209 (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 210 RTE_PTYPE_L4_UDP; 211 /* Tunneled - L3 */ 212 (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; 213 (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 214 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 215 RTE_PTYPE_INNER_L4_NONFRAG; 216 (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 217 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 218 RTE_PTYPE_INNER_L4_NONFRAG; 219 (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; 220 (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 221 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 222 RTE_PTYPE_INNER_L4_NONFRAG; 223 (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 224 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 225 RTE_PTYPE_INNER_L4_NONFRAG; 226 /* Tunneled - Fragmented */ 227 (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 228 
RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 229 RTE_PTYPE_INNER_L4_FRAG; 230 (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 231 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 232 RTE_PTYPE_INNER_L4_FRAG; 233 (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 234 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 235 RTE_PTYPE_INNER_L4_FRAG; 236 (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 237 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 238 RTE_PTYPE_INNER_L4_FRAG; 239 /* Tunneled - TCP */ 240 (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 241 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 242 RTE_PTYPE_INNER_L4_TCP; 243 (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 244 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 245 RTE_PTYPE_INNER_L4_TCP; 246 (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 247 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 248 RTE_PTYPE_INNER_L4_TCP; 249 (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 250 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 251 RTE_PTYPE_INNER_L4_TCP; 252 (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 253 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 254 RTE_PTYPE_INNER_L4_TCP; 255 (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 256 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 257 RTE_PTYPE_INNER_L4_TCP; 258 (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 259 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 260 RTE_PTYPE_INNER_L4_TCP; 261 (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 262 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 263 RTE_PTYPE_INNER_L4_TCP; 264 (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 265 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 266 RTE_PTYPE_INNER_L4_TCP; 267 (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 268 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 269 RTE_PTYPE_INNER_L4_TCP; 270 (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 271 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 272 RTE_PTYPE_INNER_L4_TCP; 273 (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 274 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 275 RTE_PTYPE_INNER_L4_TCP; 276 /* Tunneled - UDP */ 277 (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 278 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 279 RTE_PTYPE_INNER_L4_UDP; 280 (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 281 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 282 RTE_PTYPE_INNER_L4_UDP; 283 (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 284 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 285 RTE_PTYPE_INNER_L4_UDP; 286 (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 287 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 288 RTE_PTYPE_INNER_L4_UDP; 289 } 290 291 /** 292 * Build a table to translate packet to checksum type of Verbs. 293 */ 294 void 295 mlx5_set_cksum_table(void) 296 { 297 unsigned int i; 298 uint8_t v; 299 300 /* 301 * The index should have: 302 * bit[0] = PKT_TX_TCP_SEG 303 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM 304 * bit[4] = PKT_TX_IP_CKSUM 305 * bit[8] = PKT_TX_OUTER_IP_CKSUM 306 * bit[9] = tunnel 307 */ 308 for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) { 309 v = 0; 310 if (i & (1 << 9)) { 311 /* Tunneled packet. */ 312 if (i & (1 << 8)) /* Outer IP. */ 313 v |= MLX5_ETH_WQE_L3_CSUM; 314 if (i & (1 << 4)) /* Inner IP. */ 315 v |= MLX5_ETH_WQE_L3_INNER_CSUM; 316 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 317 v |= MLX5_ETH_WQE_L4_INNER_CSUM; 318 } else { 319 /* No tunnel. 
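		 * For example, a non-tunneled packet requesting
		 * PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM maps to index 0x14,
		 * which this branch turns into
		 * MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM.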
*/ 320 if (i & (1 << 4)) /* IP. */ 321 v |= MLX5_ETH_WQE_L3_CSUM; 322 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 323 v |= MLX5_ETH_WQE_L4_CSUM; 324 } 325 mlx5_cksum_table[i] = v; 326 } 327 } 328 329 /** 330 * Build a table to translate packet type of mbuf to SWP type of Verbs. 331 */ 332 void 333 mlx5_set_swp_types_table(void) 334 { 335 unsigned int i; 336 uint8_t v; 337 338 /* 339 * The index should have: 340 * bit[0:1] = PKT_TX_L4_MASK 341 * bit[4] = PKT_TX_IPV6 342 * bit[8] = PKT_TX_OUTER_IPV6 343 * bit[9] = PKT_TX_OUTER_UDP 344 */ 345 for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) { 346 v = 0; 347 if (i & (1 << 8)) 348 v |= MLX5_ETH_WQE_L3_OUTER_IPV6; 349 if (i & (1 << 9)) 350 v |= MLX5_ETH_WQE_L4_OUTER_UDP; 351 if (i & (1 << 4)) 352 v |= MLX5_ETH_WQE_L3_INNER_IPV6; 353 if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52)) 354 v |= MLX5_ETH_WQE_L4_INNER_UDP; 355 mlx5_swp_types_table[i] = v; 356 } 357 } 358 359 /** 360 * Set Software Parser flags and offsets in Ethernet Segment of WQE. 361 * Flags must be preliminary initialized to zero. 362 * 363 * @param loc 364 * Pointer to burst routine local context. 365 * @param swp_flags 366 * Pointer to store Software Parser flags 367 * @param olx 368 * Configured Tx offloads mask. It is fully defined at 369 * compile time and may be used for optimization. 370 * 371 * @return 372 * Software Parser offsets packed in dword. 373 * Software Parser flags are set by pointer. 374 */ 375 static __rte_always_inline uint32_t 376 txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc, 377 uint8_t *swp_flags, 378 unsigned int olx) 379 { 380 uint64_t ol, tunnel; 381 unsigned int idx, off; 382 uint32_t set; 383 384 if (!MLX5_TXOFF_CONFIG(SWP)) 385 return 0; 386 ol = loc->mbuf->ol_flags; 387 tunnel = ol & PKT_TX_TUNNEL_MASK; 388 /* 389 * Check whether Software Parser is required. 390 * Only customized tunnels may ask for. 391 */ 392 if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP)) 393 return 0; 394 /* 395 * The index should have: 396 * bit[0:1] = PKT_TX_L4_MASK 397 * bit[4] = PKT_TX_IPV6 398 * bit[8] = PKT_TX_OUTER_IPV6 399 * bit[9] = PKT_TX_OUTER_UDP 400 */ 401 idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52; 402 idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0; 403 *swp_flags = mlx5_swp_types_table[idx]; 404 /* 405 * Set offsets for SW parser. Since ConnectX-5, SW parser just 406 * complements HW parser. SW parser starts to engage only if HW parser 407 * can't reach a header. For the older devices, HW parser will not kick 408 * in if any of SWP offsets is set. Therefore, all of the L3 offsets 409 * should be set regardless of HW offload. 410 */ 411 off = loc->mbuf->outer_l2_len; 412 if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT) 413 off += sizeof(struct rte_vlan_hdr); 414 set = (off >> 1) << 8; /* Outer L3 offset. */ 415 off += loc->mbuf->outer_l3_len; 416 if (tunnel == PKT_TX_TUNNEL_UDP) 417 set |= off >> 1; /* Outer L4 offset. */ 418 if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */ 419 const uint64_t csum = ol & PKT_TX_L4_MASK; 420 off += loc->mbuf->l2_len; 421 set |= (off >> 1) << 24; /* Inner L3 offset. */ 422 if (csum == PKT_TX_TCP_CKSUM || 423 csum == PKT_TX_UDP_CKSUM || 424 (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) { 425 off += loc->mbuf->l3_len; 426 set |= (off >> 1) << 16; /* Inner L4 offset. */ 427 } 428 } 429 set = rte_cpu_to_le_32(set); 430 return set; 431 } 432 433 /** 434 * Convert the Checksum offloads to Verbs. 435 * 436 * @param buf 437 * Pointer to the mbuf. 
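 *   Only buf->ol_flags is examined. As a worked example, a tunneled mbuf
 *   with PKT_TX_OUTER_IP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM set
 *   selects index 0x31c, i.e. MLX5_ETH_WQE_L3_CSUM |
 *   MLX5_ETH_WQE_L3_INNER_CSUM | MLX5_ETH_WQE_L4_INNER_CSUM.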
438 * 439 * @return 440 * Converted checksum flags. 441 */ 442 static __rte_always_inline uint8_t 443 txq_ol_cksum_to_cs(struct rte_mbuf *buf) 444 { 445 uint32_t idx; 446 uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK); 447 const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK | 448 PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM; 449 450 /* 451 * The index should have: 452 * bit[0] = PKT_TX_TCP_SEG 453 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM 454 * bit[4] = PKT_TX_IP_CKSUM 455 * bit[8] = PKT_TX_OUTER_IP_CKSUM 456 * bit[9] = tunnel 457 */ 458 idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9); 459 return mlx5_cksum_table[idx]; 460 } 461 462 /** 463 * Internal function to compute the number of used descriptors in an RX queue 464 * 465 * @param rxq 466 * The Rx queue. 467 * 468 * @return 469 * The number of used rx descriptor. 470 */ 471 static uint32_t 472 rx_queue_count(struct mlx5_rxq_data *rxq) 473 { 474 struct rxq_zip *zip = &rxq->zip; 475 volatile struct mlx5_cqe *cqe; 476 const unsigned int cqe_n = (1 << rxq->cqe_n); 477 const unsigned int cqe_cnt = cqe_n - 1; 478 unsigned int cq_ci; 479 unsigned int used; 480 481 /* if we are processing a compressed cqe */ 482 if (zip->ai) { 483 used = zip->cqe_cnt - zip->ca; 484 cq_ci = zip->cq_ci; 485 } else { 486 used = 0; 487 cq_ci = rxq->cq_ci; 488 } 489 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 490 while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) { 491 int8_t op_own; 492 unsigned int n; 493 494 op_own = cqe->op_own; 495 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) 496 n = rte_be_to_cpu_32(cqe->byte_cnt); 497 else 498 n = 1; 499 cq_ci += n; 500 used += n; 501 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 502 } 503 used = RTE_MIN(used, (1U << rxq->elts_n) - 1); 504 return used; 505 } 506 507 /** 508 * DPDK callback to check the status of a rx descriptor. 509 * 510 * @param rx_queue 511 * The Rx queue. 512 * @param[in] offset 513 * The index of the descriptor in the ring. 514 * 515 * @return 516 * The status of the tx descriptor. 517 */ 518 int 519 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) 520 { 521 struct mlx5_rxq_data *rxq = rx_queue; 522 struct mlx5_rxq_ctrl *rxq_ctrl = 523 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 524 struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv); 525 526 if (dev->rx_pkt_burst != mlx5_rx_burst) { 527 rte_errno = ENOTSUP; 528 return -rte_errno; 529 } 530 if (offset >= (1 << rxq->elts_n)) { 531 rte_errno = EINVAL; 532 return -rte_errno; 533 } 534 if (offset < rx_queue_count(rxq)) 535 return RTE_ETH_RX_DESC_DONE; 536 return RTE_ETH_RX_DESC_AVAIL; 537 } 538 539 /** 540 * DPDK callback to get the RX queue information 541 * 542 * @param dev 543 * Pointer to the device structure. 544 * 545 * @param rx_queue_id 546 * Rx queue identificator. 547 * 548 * @param qinfo 549 * Pointer to the RX queue information structure. 550 * 551 * @return 552 * None. 553 */ 554 555 void 556 mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id, 557 struct rte_eth_rxq_info *qinfo) 558 { 559 struct mlx5_priv *priv = dev->data->dev_private; 560 struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id]; 561 struct mlx5_rxq_ctrl *rxq_ctrl = 562 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 563 564 if (!rxq) 565 return; 566 qinfo->mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? 
		       rxq->mprq_mp : rxq->mp;
	qinfo->conf.rx_thresh.pthresh = 0;
	qinfo->conf.rx_thresh.hthresh = 0;
	qinfo->conf.rx_thresh.wthresh = 0;
	qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
	qinfo->conf.rx_drop_en = 1;
	qinfo->conf.rx_deferred_start = rxq_ctrl ? 0 : 1;
	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
	qinfo->scattered_rx = dev->data->scattered_rx;
	qinfo->nb_desc = 1 << rxq->elts_n;
}

/**
 * DPDK callback to get the RX packet burst mode information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param rx_queue_id
 *   Rx queue identifier.
 *
 * @param mode
 *   Pointer to the burst mode information.
 *
 * @return
 *   0 on success, -EINVAL on failure.
 */

int
mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
		       uint16_t rx_queue_id __rte_unused,
		       struct rte_eth_burst_mode *mode)
{
	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;

	if (pkt_burst == mlx5_rx_burst) {
		snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
	} else if (pkt_burst == mlx5_rx_burst_mprq) {
		snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ");
	} else if (pkt_burst == mlx5_rx_burst_vec) {
#if defined RTE_ARCH_X86_64
		snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE");
#elif defined RTE_ARCH_ARM64
		snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
#elif defined RTE_ARCH_PPC_64
		snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
#else
		return -EINVAL;
#endif
	} else {
		return -EINVAL;
	}
	return 0;
}

/**
 * DPDK callback to get the number of used descriptors in a RX queue.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param rx_queue_id
 *   The Rx queue.
 *
 * @return
 *   The number of used Rx descriptors.
 *   -EINVAL if the queue is invalid.
 */
uint32_t
mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_rxq_data *rxq;

	if (dev->rx_pkt_burst != mlx5_rx_burst) {
		rte_errno = ENOTSUP;
		return -rte_errno;
	}
	rxq = (*priv->rxqs)[rx_queue_id];
	if (!rxq) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	return rx_queue_count(rxq);
}

#define MLX5_SYSTEM_LOG_DIR "/var/log"
/**
 * Dump debug information to a log file.
 *
 * @param fname
 *   The file name.
 * @param hex_title
 *   If not NULL this string is printed as a header to the output
 *   and the output will be in hexadecimal view.
 * @param buf
 *   This is the buffer address to print out.
 * @param hex_len
 *   The number of bytes to dump out.
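 *
 * Illustrative call, mirroring how the Tx error handler below dumps the
 * raw completion queue (passing a NULL hex_title instead appends the
 * buffer as plain text):
 *
 *   mlx5_dump_debug_information(name, "MLX5 Error CQ:",
 *                               (const void *)(uintptr_t)txq->cqes,
 *                               sizeof(struct mlx5_cqe) * (1 << txq->cqe_n));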
666 */ 667 void 668 mlx5_dump_debug_information(const char *fname, const char *hex_title, 669 const void *buf, unsigned int hex_len) 670 { 671 FILE *fd; 672 673 MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname); 674 fd = fopen(path, "a+"); 675 if (!fd) { 676 DRV_LOG(WARNING, "cannot open %s for debug dump", path); 677 MKSTR(path2, "./%s", fname); 678 fd = fopen(path2, "a+"); 679 if (!fd) { 680 DRV_LOG(ERR, "cannot open %s for debug dump", path2); 681 return; 682 } 683 DRV_LOG(INFO, "New debug dump in file %s", path2); 684 } else { 685 DRV_LOG(INFO, "New debug dump in file %s", path); 686 } 687 if (hex_title) 688 rte_hexdump(fd, hex_title, buf, hex_len); 689 else 690 fprintf(fd, "%s", (const char *)buf); 691 fprintf(fd, "\n\n\n"); 692 fclose(fd); 693 } 694 695 /** 696 * Move QP from error state to running state and initialize indexes. 697 * 698 * @param txq_ctrl 699 * Pointer to TX queue control structure. 700 * 701 * @return 702 * 0 on success, else -1. 703 */ 704 static int 705 tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl) 706 { 707 struct mlx5_mp_arg_queue_state_modify sm = { 708 .is_wq = 0, 709 .queue_id = txq_ctrl->txq.idx, 710 }; 711 712 if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm)) 713 return -1; 714 txq_ctrl->txq.wqe_ci = 0; 715 txq_ctrl->txq.wqe_pi = 0; 716 txq_ctrl->txq.elts_comp = 0; 717 return 0; 718 } 719 720 /* Return 1 if the error CQE is signed otherwise, sign it and return 0. */ 721 static int 722 check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe) 723 { 724 static const uint8_t magic[] = "seen"; 725 int ret = 1; 726 unsigned int i; 727 728 for (i = 0; i < sizeof(magic); ++i) 729 if (!ret || err_cqe->rsvd1[i] != magic[i]) { 730 ret = 0; 731 err_cqe->rsvd1[i] = magic[i]; 732 } 733 return ret; 734 } 735 736 /** 737 * Handle error CQE. 738 * 739 * @param txq 740 * Pointer to TX queue structure. 741 * @param error_cqe 742 * Pointer to the error CQE. 743 * 744 * @return 745 * Negative value if queue recovery failed, otherwise 746 * the error completion entry is handled successfully. 747 */ 748 static int 749 mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, 750 volatile struct mlx5_err_cqe *err_cqe) 751 { 752 if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { 753 const uint16_t wqe_m = ((1 << txq->wqe_n) - 1); 754 struct mlx5_txq_ctrl *txq_ctrl = 755 container_of(txq, struct mlx5_txq_ctrl, txq); 756 uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter); 757 int seen = check_err_cqe_seen(err_cqe); 758 759 if (!seen && txq_ctrl->dump_file_n < 760 txq_ctrl->priv->config.max_dump_files_num) { 761 MKSTR(err_str, "Unexpected CQE error syndrome " 762 "0x%02x CQN = %u SQN = %u wqe_counter = %u " 763 "wq_ci = %u cq_ci = %u", err_cqe->syndrome, 764 txq->cqe_s, txq->qp_num_8s >> 8, 765 rte_be_to_cpu_16(err_cqe->wqe_counter), 766 txq->wqe_ci, txq->cq_ci); 767 MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u", 768 PORT_ID(txq_ctrl->priv), txq->idx, 769 txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc()); 770 mlx5_dump_debug_information(name, NULL, err_str, 0); 771 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 772 (const void *)((uintptr_t) 773 txq->cqes), 774 sizeof(*err_cqe) * 775 (1 << txq->cqe_n)); 776 mlx5_dump_debug_information(name, "MLX5 Error SQ:", 777 (const void *)((uintptr_t) 778 txq->wqes), 779 MLX5_WQE_SIZE * 780 (1 << txq->wqe_n)); 781 txq_ctrl->dump_file_n++; 782 } 783 if (!seen) 784 /* 785 * Count errors in WQEs units. 
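				 * The increment below is the ring distance
				 * between the failed WQE counter reported in
				 * the CQE (new_wqe_pi) and the current
				 * wqe_ci, i.e. every WQE posted from the
				 * failure point onward is counted.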
786 * Later it can be improved to count error packets, 787 * for example, by SQ parsing to find how much packets 788 * should be counted for each WQE. 789 */ 790 txq->stats.oerrors += ((txq->wqe_ci & wqe_m) - 791 new_wqe_pi) & wqe_m; 792 if (tx_recover_qp(txq_ctrl)) { 793 /* Recovering failed - retry later on the same WQE. */ 794 return -1; 795 } 796 /* Release all the remaining buffers. */ 797 txq_free_elts(txq_ctrl); 798 } 799 return 0; 800 } 801 802 /** 803 * Translate RX completion flags to packet type. 804 * 805 * @param[in] rxq 806 * Pointer to RX queue structure. 807 * @param[in] cqe 808 * Pointer to CQE. 809 * 810 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 811 * 812 * @return 813 * Packet type for struct rte_mbuf. 814 */ 815 static inline uint32_t 816 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) 817 { 818 uint8_t idx; 819 uint8_t pinfo = cqe->pkt_info; 820 uint16_t ptype = cqe->hdr_type_etc; 821 822 /* 823 * The index to the array should have: 824 * bit[1:0] = l3_hdr_type 825 * bit[4:2] = l4_hdr_type 826 * bit[5] = ip_frag 827 * bit[6] = tunneled 828 * bit[7] = outer_l3_type 829 */ 830 idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); 831 return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); 832 } 833 834 /** 835 * Initialize Rx WQ and indexes. 836 * 837 * @param[in] rxq 838 * Pointer to RX queue structure. 839 */ 840 void 841 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) 842 { 843 const unsigned int wqe_n = 1 << rxq->elts_n; 844 unsigned int i; 845 846 for (i = 0; (i != wqe_n); ++i) { 847 volatile struct mlx5_wqe_data_seg *scat; 848 uintptr_t addr; 849 uint32_t byte_count; 850 851 if (mlx5_rxq_mprq_enabled(rxq)) { 852 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i]; 853 854 scat = &((volatile struct mlx5_wqe_mprq *) 855 rxq->wqes)[i].dseg; 856 addr = (uintptr_t)mlx5_mprq_buf_addr(buf, 857 1 << rxq->strd_num_n); 858 byte_count = (1 << rxq->strd_sz_n) * 859 (1 << rxq->strd_num_n); 860 } else { 861 struct rte_mbuf *buf = (*rxq->elts)[i]; 862 863 scat = &((volatile struct mlx5_wqe_data_seg *) 864 rxq->wqes)[i]; 865 addr = rte_pktmbuf_mtod(buf, uintptr_t); 866 byte_count = DATA_LEN(buf); 867 } 868 /* scat->addr must be able to store a pointer. */ 869 MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t)); 870 *scat = (struct mlx5_wqe_data_seg){ 871 .addr = rte_cpu_to_be_64(addr), 872 .byte_count = rte_cpu_to_be_32(byte_count), 873 .lkey = mlx5_rx_addr2mr(rxq, addr), 874 }; 875 } 876 rxq->consumed_strd = 0; 877 rxq->decompressed = 0; 878 rxq->rq_pi = 0; 879 rxq->zip = (struct rxq_zip){ 880 .ai = 0, 881 }; 882 /* Update doorbell counter. */ 883 rxq->rq_ci = wqe_n >> rxq->sges_n; 884 rte_cio_wmb(); 885 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 886 } 887 888 /** 889 * Modify a Verbs/DevX queue state. 890 * This must be called from the primary process. 891 * 892 * @param dev 893 * Pointer to Ethernet device. 894 * @param sm 895 * State modify request parameters. 896 * 897 * @return 898 * 0 in case of success else non-zero value and rte_errno is set. 
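 *
 * A typical request, as issued by the Rx error recovery path further
 * below (the same call is repeated with IBV_WQS_RDY once the CQ has been
 * drained):
 *
 *   struct mlx5_mp_arg_queue_state_modify sm = {
 *       .is_wq = 1,
 *       .queue_id = rxq->idx,
 *       .state = IBV_WQS_RESET,
 *   };
 *   mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm);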
899 */ 900 int 901 mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, 902 const struct mlx5_mp_arg_queue_state_modify *sm) 903 { 904 int ret; 905 struct mlx5_priv *priv = dev->data->dev_private; 906 907 if (sm->is_wq) { 908 struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id]; 909 struct mlx5_rxq_ctrl *rxq_ctrl = 910 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 911 912 if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { 913 struct ibv_wq_attr mod = { 914 .attr_mask = IBV_WQ_ATTR_STATE, 915 .wq_state = sm->state, 916 }; 917 918 ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); 919 } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. */ 920 struct mlx5_devx_modify_rq_attr rq_attr; 921 922 memset(&rq_attr, 0, sizeof(rq_attr)); 923 if (sm->state == IBV_WQS_RESET) { 924 rq_attr.rq_state = MLX5_RQC_STATE_ERR; 925 rq_attr.state = MLX5_RQC_STATE_RST; 926 } else if (sm->state == IBV_WQS_RDY) { 927 rq_attr.rq_state = MLX5_RQC_STATE_RST; 928 rq_attr.state = MLX5_RQC_STATE_RDY; 929 } else if (sm->state == IBV_WQS_ERR) { 930 rq_attr.rq_state = MLX5_RQC_STATE_RDY; 931 rq_attr.state = MLX5_RQC_STATE_ERR; 932 } 933 ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, 934 &rq_attr); 935 } 936 if (ret) { 937 DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s", 938 sm->state, strerror(errno)); 939 rte_errno = errno; 940 return ret; 941 } 942 } else { 943 struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id]; 944 struct mlx5_txq_ctrl *txq_ctrl = 945 container_of(txq, struct mlx5_txq_ctrl, txq); 946 struct ibv_qp_attr mod = { 947 .qp_state = IBV_QPS_RESET, 948 .port_num = (uint8_t)priv->ibv_port, 949 }; 950 struct ibv_qp *qp = txq_ctrl->obj->qp; 951 952 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 953 if (ret) { 954 DRV_LOG(ERR, "Cannot change the Tx QP state to RESET " 955 "%s", strerror(errno)); 956 rte_errno = errno; 957 return ret; 958 } 959 mod.qp_state = IBV_QPS_INIT; 960 ret = mlx5_glue->modify_qp(qp, &mod, 961 (IBV_QP_STATE | IBV_QP_PORT)); 962 if (ret) { 963 DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s", 964 strerror(errno)); 965 rte_errno = errno; 966 return ret; 967 } 968 mod.qp_state = IBV_QPS_RTR; 969 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 970 if (ret) { 971 DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s", 972 strerror(errno)); 973 rte_errno = errno; 974 return ret; 975 } 976 mod.qp_state = IBV_QPS_RTS; 977 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 978 if (ret) { 979 DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s", 980 strerror(errno)); 981 rte_errno = errno; 982 return ret; 983 } 984 } 985 return 0; 986 } 987 988 /** 989 * Modify a Verbs queue state. 990 * 991 * @param dev 992 * Pointer to Ethernet device. 993 * @param sm 994 * State modify request parameters. 995 * 996 * @return 997 * 0 in case of success else non-zero value. 998 */ 999 static int 1000 mlx5_queue_state_modify(struct rte_eth_dev *dev, 1001 struct mlx5_mp_arg_queue_state_modify *sm) 1002 { 1003 int ret = 0; 1004 1005 switch (rte_eal_process_type()) { 1006 case RTE_PROC_PRIMARY: 1007 ret = mlx5_queue_state_modify_primary(dev, sm); 1008 break; 1009 case RTE_PROC_SECONDARY: 1010 ret = mlx5_mp_req_queue_state_modify(dev, sm); 1011 break; 1012 default: 1013 break; 1014 } 1015 return ret; 1016 } 1017 1018 /** 1019 * Handle a Rx error. 1020 * The function inserts the RQ state to reset when the first error CQE is 1021 * shown, then drains the CQ by the caller function loop. When the CQ is empty, 1022 * it moves the RQ state to ready and initializes the RQ. 
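 * The recovery progresses through rxq->err_state:
 * MLX5_RXQ_ERR_STATE_NO_ERROR -> NEED_RESET -> NEED_READY -> NO_ERROR.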
1023 * Next CQE identification and error counting are in the caller responsibility. 1024 * 1025 * @param[in] rxq 1026 * Pointer to RX queue structure. 1027 * @param[in] vec 1028 * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ. 1029 * 0 when called from non-vectorized Rx burst. 1030 * 1031 * @return 1032 * -1 in case of recovery error, otherwise the CQE status. 1033 */ 1034 int 1035 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) 1036 { 1037 const uint16_t cqe_n = 1 << rxq->cqe_n; 1038 const uint16_t cqe_mask = cqe_n - 1; 1039 const unsigned int wqe_n = 1 << rxq->elts_n; 1040 struct mlx5_rxq_ctrl *rxq_ctrl = 1041 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 1042 union { 1043 volatile struct mlx5_cqe *cqe; 1044 volatile struct mlx5_err_cqe *err_cqe; 1045 } u = { 1046 .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask], 1047 }; 1048 struct mlx5_mp_arg_queue_state_modify sm; 1049 int ret; 1050 1051 switch (rxq->err_state) { 1052 case MLX5_RXQ_ERR_STATE_NO_ERROR: 1053 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET; 1054 /* Fall-through */ 1055 case MLX5_RXQ_ERR_STATE_NEED_RESET: 1056 sm.is_wq = 1; 1057 sm.queue_id = rxq->idx; 1058 sm.state = IBV_WQS_RESET; 1059 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm)) 1060 return -1; 1061 if (rxq_ctrl->dump_file_n < 1062 rxq_ctrl->priv->config.max_dump_files_num) { 1063 MKSTR(err_str, "Unexpected CQE error syndrome " 1064 "0x%02x CQN = %u RQN = %u wqe_counter = %u" 1065 " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome, 1066 rxq->cqn, rxq_ctrl->wqn, 1067 rte_be_to_cpu_16(u.err_cqe->wqe_counter), 1068 rxq->rq_ci << rxq->sges_n, rxq->cq_ci); 1069 MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u", 1070 rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc()); 1071 mlx5_dump_debug_information(name, NULL, err_str, 0); 1072 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 1073 (const void *)((uintptr_t) 1074 rxq->cqes), 1075 sizeof(*u.cqe) * cqe_n); 1076 mlx5_dump_debug_information(name, "MLX5 Error RQ:", 1077 (const void *)((uintptr_t) 1078 rxq->wqes), 1079 16 * wqe_n); 1080 rxq_ctrl->dump_file_n++; 1081 } 1082 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY; 1083 /* Fall-through */ 1084 case MLX5_RXQ_ERR_STATE_NEED_READY: 1085 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); 1086 if (ret == MLX5_CQE_STATUS_HW_OWN) { 1087 rte_cio_wmb(); 1088 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1089 rte_cio_wmb(); 1090 /* 1091 * The RQ consumer index must be zeroed while moving 1092 * from RESET state to RDY state. 1093 */ 1094 *rxq->rq_db = rte_cpu_to_be_32(0); 1095 rte_cio_wmb(); 1096 sm.is_wq = 1; 1097 sm.queue_id = rxq->idx; 1098 sm.state = IBV_WQS_RDY; 1099 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), 1100 &sm)) 1101 return -1; 1102 if (vec) { 1103 const uint16_t q_mask = wqe_n - 1; 1104 uint16_t elt_idx; 1105 struct rte_mbuf **elt; 1106 int i; 1107 unsigned int n = wqe_n - (rxq->rq_ci - 1108 rxq->rq_pi); 1109 1110 for (i = 0; i < (int)n; ++i) { 1111 elt_idx = (rxq->rq_ci + i) & q_mask; 1112 elt = &(*rxq->elts)[elt_idx]; 1113 *elt = rte_mbuf_raw_alloc(rxq->mp); 1114 if (!*elt) { 1115 for (i--; i >= 0; --i) { 1116 elt_idx = (rxq->rq_ci + 1117 i) & q_mask; 1118 elt = &(*rxq->elts) 1119 [elt_idx]; 1120 rte_pktmbuf_free_seg 1121 (*elt); 1122 } 1123 return -1; 1124 } 1125 } 1126 for (i = 0; i < (int)wqe_n; ++i) { 1127 elt = &(*rxq->elts)[i]; 1128 DATA_LEN(*elt) = 1129 (uint16_t)((*elt)->buf_len - 1130 rte_pktmbuf_headroom(*elt)); 1131 } 1132 /* Padding with a fake mbuf for vec Rx. 
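				 * The vectorized burst may speculatively read
				 * up to MLX5_VPMD_DESCS_PER_LOOP entries past
				 * the ring end, so the trailing slots are
				 * pointed at rxq->fake_mbuf to keep those
				 * accesses harmless.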
*/ 1133 for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) 1134 (*rxq->elts)[wqe_n + i] = 1135 &rxq->fake_mbuf; 1136 } 1137 mlx5_rxq_initialize(rxq); 1138 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 1139 } 1140 return ret; 1141 default: 1142 return -1; 1143 } 1144 } 1145 1146 /** 1147 * Get size of the next packet for a given CQE. For compressed CQEs, the 1148 * consumer index is updated only once all packets of the current one have 1149 * been processed. 1150 * 1151 * @param rxq 1152 * Pointer to RX queue. 1153 * @param cqe 1154 * CQE to process. 1155 * @param[out] mcqe 1156 * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not 1157 * written. 1158 * 1159 * @return 1160 * 0 in case of empty CQE, otherwise the packet size in bytes. 1161 */ 1162 static inline int 1163 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 1164 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe) 1165 { 1166 struct rxq_zip *zip = &rxq->zip; 1167 uint16_t cqe_n = cqe_cnt + 1; 1168 int len; 1169 uint16_t idx, end; 1170 1171 do { 1172 len = 0; 1173 /* Process compressed data in the CQE and mini arrays. */ 1174 if (zip->ai) { 1175 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1176 (volatile struct mlx5_mini_cqe8 (*)[8]) 1177 (uintptr_t)(&(*rxq->cqes)[zip->ca & 1178 cqe_cnt].pkt_info); 1179 1180 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); 1181 *mcqe = &(*mc)[zip->ai & 7]; 1182 if ((++zip->ai & 7) == 0) { 1183 /* Invalidate consumed CQEs */ 1184 idx = zip->ca; 1185 end = zip->na; 1186 while (idx != end) { 1187 (*rxq->cqes)[idx & cqe_cnt].op_own = 1188 MLX5_CQE_INVALIDATE; 1189 ++idx; 1190 } 1191 /* 1192 * Increment consumer index to skip the number 1193 * of CQEs consumed. Hardware leaves holes in 1194 * the CQ ring for software use. 1195 */ 1196 zip->ca = zip->na; 1197 zip->na += 8; 1198 } 1199 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 1200 /* Invalidate the rest */ 1201 idx = zip->ca; 1202 end = zip->cq_ci; 1203 1204 while (idx != end) { 1205 (*rxq->cqes)[idx & cqe_cnt].op_own = 1206 MLX5_CQE_INVALIDATE; 1207 ++idx; 1208 } 1209 rxq->cq_ci = zip->cq_ci; 1210 zip->ai = 0; 1211 } 1212 /* 1213 * No compressed data, get next CQE and verify if it is 1214 * compressed. 1215 */ 1216 } else { 1217 int ret; 1218 int8_t op_own; 1219 1220 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 1221 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 1222 if (unlikely(ret == MLX5_CQE_STATUS_ERR || 1223 rxq->err_state)) { 1224 ret = mlx5_rx_err_handle(rxq, 0); 1225 if (ret == MLX5_CQE_STATUS_HW_OWN || 1226 ret == -1) 1227 return 0; 1228 } else { 1229 return 0; 1230 } 1231 } 1232 ++rxq->cq_ci; 1233 op_own = cqe->op_own; 1234 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 1235 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1236 (volatile struct mlx5_mini_cqe8 (*)[8]) 1237 (uintptr_t)(&(*rxq->cqes) 1238 [rxq->cq_ci & 1239 cqe_cnt].pkt_info); 1240 1241 /* Fix endianness. */ 1242 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); 1243 /* 1244 * Current mini array position is the one 1245 * returned by check_cqe64(). 1246 * 1247 * If completion comprises several mini arrays, 1248 * as a special case the second one is located 1249 * 7 CQEs after the initial CQE instead of 8 1250 * for subsequent ones. 1251 */ 1252 zip->ca = rxq->cq_ci; 1253 zip->na = zip->ca + 7; 1254 /* Compute the next non compressed CQE. */ 1255 --rxq->cq_ci; 1256 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 1257 /* Get packet size to return. 
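				 * The size comes from the first mini-CQE;
				 * zip->ai is set to 1 so the next calls
				 * consume mini-CQEs 1..7 of this array
				 * before advancing.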
*/ 1258 len = rte_be_to_cpu_32((*mc)[0].byte_cnt); 1259 *mcqe = &(*mc)[0]; 1260 zip->ai = 1; 1261 /* Prefetch all to be invalidated */ 1262 idx = zip->ca; 1263 end = zip->cq_ci; 1264 while (idx != end) { 1265 rte_prefetch0(&(*rxq->cqes)[(idx) & 1266 cqe_cnt]); 1267 ++idx; 1268 } 1269 } else { 1270 len = rte_be_to_cpu_32(cqe->byte_cnt); 1271 } 1272 } 1273 if (unlikely(rxq->err_state)) { 1274 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1275 ++rxq->stats.idropped; 1276 } else { 1277 return len; 1278 } 1279 } while (1); 1280 } 1281 1282 /** 1283 * Translate RX completion flags to offload flags. 1284 * 1285 * @param[in] cqe 1286 * Pointer to CQE. 1287 * 1288 * @return 1289 * Offload flags (ol_flags) for struct rte_mbuf. 1290 */ 1291 static inline uint32_t 1292 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) 1293 { 1294 uint32_t ol_flags = 0; 1295 uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); 1296 1297 ol_flags = 1298 TRANSPOSE(flags, 1299 MLX5_CQE_RX_L3_HDR_VALID, 1300 PKT_RX_IP_CKSUM_GOOD) | 1301 TRANSPOSE(flags, 1302 MLX5_CQE_RX_L4_HDR_VALID, 1303 PKT_RX_L4_CKSUM_GOOD); 1304 return ol_flags; 1305 } 1306 1307 /** 1308 * Fill in mbuf fields from RX completion flags. 1309 * Note that pkt->ol_flags should be initialized outside of this function. 1310 * 1311 * @param rxq 1312 * Pointer to RX queue. 1313 * @param pkt 1314 * mbuf to fill. 1315 * @param cqe 1316 * CQE to process. 1317 * @param rss_hash_res 1318 * Packet RSS Hash result. 1319 */ 1320 static inline void 1321 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 1322 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) 1323 { 1324 /* Update packet information. */ 1325 pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); 1326 if (rss_hash_res && rxq->rss_hash) { 1327 pkt->hash.rss = rss_hash_res; 1328 pkt->ol_flags |= PKT_RX_RSS_HASH; 1329 } 1330 if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { 1331 pkt->ol_flags |= PKT_RX_FDIR; 1332 if (cqe->sop_drop_qpn != 1333 rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { 1334 uint32_t mark = cqe->sop_drop_qpn; 1335 1336 pkt->ol_flags |= PKT_RX_FDIR_ID; 1337 pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); 1338 } 1339 } 1340 if (rte_flow_dynf_metadata_avail() && cqe->flow_table_metadata) { 1341 pkt->ol_flags |= PKT_RX_DYNF_METADATA; 1342 *RTE_FLOW_DYNF_METADATA(pkt) = cqe->flow_table_metadata; 1343 } 1344 if (rxq->csum) 1345 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); 1346 if (rxq->vlan_strip && 1347 (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { 1348 pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; 1349 pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); 1350 } 1351 if (rxq->hw_timestamp) { 1352 pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); 1353 pkt->ol_flags |= PKT_RX_TIMESTAMP; 1354 } 1355 } 1356 1357 /** 1358 * DPDK callback for RX. 1359 * 1360 * @param dpdk_rxq 1361 * Generic pointer to RX queue structure. 1362 * @param[out] pkts 1363 * Array to store received packets. 1364 * @param pkts_n 1365 * Maximum number of packets in array. 1366 * 1367 * @return 1368 * Number of packets successfully received (<= pkts_n). 
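 *
 * This routine is installed as dev->rx_pkt_burst, so it is normally
 * reached through the generic ethdev API, e.g. (illustrative values):
 *
 *   struct rte_mbuf *burst[32];
 *   uint16_t nb = rte_eth_rx_burst(port_id, queue_id, burst, 32);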
1369 */ 1370 uint16_t 1371 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1372 { 1373 struct mlx5_rxq_data *rxq = dpdk_rxq; 1374 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 1375 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 1376 const unsigned int sges_n = rxq->sges_n; 1377 struct rte_mbuf *pkt = NULL; 1378 struct rte_mbuf *seg = NULL; 1379 volatile struct mlx5_cqe *cqe = 1380 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1381 unsigned int i = 0; 1382 unsigned int rq_ci = rxq->rq_ci << sges_n; 1383 int len = 0; /* keep its value across iterations. */ 1384 1385 while (pkts_n) { 1386 unsigned int idx = rq_ci & wqe_cnt; 1387 volatile struct mlx5_wqe_data_seg *wqe = 1388 &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; 1389 struct rte_mbuf *rep = (*rxq->elts)[idx]; 1390 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1391 uint32_t rss_hash_res; 1392 1393 if (pkt) 1394 NEXT(seg) = rep; 1395 seg = rep; 1396 rte_prefetch0(seg); 1397 rte_prefetch0(cqe); 1398 rte_prefetch0(wqe); 1399 rep = rte_mbuf_raw_alloc(rxq->mp); 1400 if (unlikely(rep == NULL)) { 1401 ++rxq->stats.rx_nombuf; 1402 if (!pkt) { 1403 /* 1404 * no buffers before we even started, 1405 * bail out silently. 1406 */ 1407 break; 1408 } 1409 while (pkt != seg) { 1410 MLX5_ASSERT(pkt != (*rxq->elts)[idx]); 1411 rep = NEXT(pkt); 1412 NEXT(pkt) = NULL; 1413 NB_SEGS(pkt) = 1; 1414 rte_mbuf_raw_free(pkt); 1415 pkt = rep; 1416 } 1417 break; 1418 } 1419 if (!pkt) { 1420 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1421 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe); 1422 if (!len) { 1423 rte_mbuf_raw_free(rep); 1424 break; 1425 } 1426 pkt = seg; 1427 MLX5_ASSERT(len >= (rxq->crc_present << 2)); 1428 pkt->ol_flags &= EXT_ATTACHED_MBUF; 1429 /* If compressed, take hash result from mini-CQE. */ 1430 rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? 1431 cqe->rx_hash_res : 1432 mcqe->rx_hash_result); 1433 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1434 if (rxq->crc_present) 1435 len -= RTE_ETHER_CRC_LEN; 1436 PKT_LEN(pkt) = len; 1437 if (cqe->lro_num_seg > 1) { 1438 mlx5_lro_update_hdr 1439 (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, 1440 len); 1441 pkt->ol_flags |= PKT_RX_LRO; 1442 pkt->tso_segsz = len / cqe->lro_num_seg; 1443 } 1444 } 1445 DATA_LEN(rep) = DATA_LEN(seg); 1446 PKT_LEN(rep) = PKT_LEN(seg); 1447 SET_DATA_OFF(rep, DATA_OFF(seg)); 1448 PORT(rep) = PORT(seg); 1449 (*rxq->elts)[idx] = rep; 1450 /* 1451 * Fill NIC descriptor with the new buffer. The lkey and size 1452 * of the buffers are already known, only the buffer address 1453 * changes. 1454 */ 1455 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); 1456 /* If there's only one MR, no need to replace LKey in WQE. */ 1457 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1458 wqe->lkey = mlx5_rx_mb2mr(rxq, rep); 1459 if (len > DATA_LEN(seg)) { 1460 len -= DATA_LEN(seg); 1461 ++NB_SEGS(pkt); 1462 ++rq_ci; 1463 continue; 1464 } 1465 DATA_LEN(seg) = len; 1466 #ifdef MLX5_PMD_SOFT_COUNTERS 1467 /* Increment bytes counter. */ 1468 rxq->stats.ibytes += PKT_LEN(pkt); 1469 #endif 1470 /* Return packet. */ 1471 *(pkts++) = pkt; 1472 pkt = NULL; 1473 --pkts_n; 1474 ++i; 1475 /* Align consumer index to the next stride. */ 1476 rq_ci >>= sges_n; 1477 ++rq_ci; 1478 rq_ci <<= sges_n; 1479 } 1480 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) 1481 return 0; 1482 /* Update the consumer index. 
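	 * The CQ doorbell is rung before the RQ doorbell, each store ordered
	 * by rte_cio_wmb(), so the released completions are visible to the
	 * device before the refilled buffers are handed back to it.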
*/ 1483 rxq->rq_ci = rq_ci >> sges_n; 1484 rte_cio_wmb(); 1485 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1486 rte_cio_wmb(); 1487 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1488 #ifdef MLX5_PMD_SOFT_COUNTERS 1489 /* Increment packets counter. */ 1490 rxq->stats.ipackets += i; 1491 #endif 1492 return i; 1493 } 1494 1495 /** 1496 * Update LRO packet TCP header. 1497 * The HW LRO feature doesn't update the TCP header after coalescing the 1498 * TCP segments but supplies information in CQE to fill it by SW. 1499 * 1500 * @param tcp 1501 * Pointer to the TCP header. 1502 * @param cqe 1503 * Pointer to the completion entry.. 1504 * @param phcsum 1505 * The L3 pseudo-header checksum. 1506 */ 1507 static inline void 1508 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, 1509 volatile struct mlx5_cqe *restrict cqe, 1510 uint32_t phcsum) 1511 { 1512 uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) & 1513 MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT; 1514 /* 1515 * The HW calculates only the TCP payload checksum, need to complete 1516 * the TCP header checksum and the L3 pseudo-header checksum. 1517 */ 1518 uint32_t csum = phcsum + cqe->csum; 1519 1520 if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK || 1521 l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) { 1522 tcp->tcp_flags |= RTE_TCP_ACK_FLAG; 1523 tcp->recv_ack = cqe->lro_ack_seq_num; 1524 tcp->rx_win = cqe->lro_tcp_win; 1525 } 1526 if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK) 1527 tcp->tcp_flags |= RTE_TCP_PSH_FLAG; 1528 tcp->cksum = 0; 1529 csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4); 1530 csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff); 1531 csum = (~csum) & 0xffff; 1532 if (csum == 0) 1533 csum = 0xffff; 1534 tcp->cksum = csum; 1535 } 1536 1537 /** 1538 * Update LRO packet headers. 1539 * The HW LRO feature doesn't update the L3/TCP headers after coalescing the 1540 * TCP segments but supply information in CQE to fill it by SW. 1541 * 1542 * @param padd 1543 * The packet address. 1544 * @param cqe 1545 * Pointer to the completion entry.. 1546 * @param len 1547 * The packet length. 
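 *
 * The function walks the Ethernet and optional VLAN/QinQ headers, fixes
 * the IP length field and the TTL/hop limit (taken from the CQE), and
 * then lets mlx5_lro_update_tcp_hdr() rebuild the TCP checksum using the
 * recomputed pseudo-header checksum.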
1548 */ 1549 static inline void 1550 mlx5_lro_update_hdr(uint8_t *restrict padd, 1551 volatile struct mlx5_cqe *restrict cqe, 1552 uint32_t len) 1553 { 1554 union { 1555 struct rte_ether_hdr *eth; 1556 struct rte_vlan_hdr *vlan; 1557 struct rte_ipv4_hdr *ipv4; 1558 struct rte_ipv6_hdr *ipv6; 1559 struct rte_tcp_hdr *tcp; 1560 uint8_t *hdr; 1561 } h = { 1562 .hdr = padd, 1563 }; 1564 uint16_t proto = h.eth->ether_type; 1565 uint32_t phcsum; 1566 1567 h.eth++; 1568 while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) || 1569 proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) { 1570 proto = h.vlan->eth_proto; 1571 h.vlan++; 1572 } 1573 if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) { 1574 h.ipv4->time_to_live = cqe->lro_min_ttl; 1575 h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd)); 1576 h.ipv4->hdr_checksum = 0; 1577 h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4); 1578 phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0); 1579 h.ipv4++; 1580 } else { 1581 h.ipv6->hop_limits = cqe->lro_min_ttl; 1582 h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) - 1583 sizeof(*h.ipv6)); 1584 phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0); 1585 h.ipv6++; 1586 } 1587 mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum); 1588 } 1589 1590 void 1591 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) 1592 { 1593 struct mlx5_mprq_buf *buf = opaque; 1594 1595 if (rte_atomic16_read(&buf->refcnt) == 1) { 1596 rte_mempool_put(buf->mp, buf); 1597 } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { 1598 rte_atomic16_set(&buf->refcnt, 1); 1599 rte_mempool_put(buf->mp, buf); 1600 } 1601 } 1602 1603 void 1604 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) 1605 { 1606 mlx5_mprq_buf_free_cb(NULL, buf); 1607 } 1608 1609 static inline void 1610 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 1611 const unsigned int strd_n) 1612 { 1613 struct mlx5_mprq_buf *rep = rxq->mprq_repl; 1614 volatile struct mlx5_wqe_data_seg *wqe = 1615 &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; 1616 void *addr; 1617 1618 MLX5_ASSERT(rep != NULL); 1619 /* Replace MPRQ buf. */ 1620 (*rxq->mprq_bufs)[rq_idx] = rep; 1621 /* Replace WQE. */ 1622 addr = mlx5_mprq_buf_addr(rep, strd_n); 1623 wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); 1624 /* If there's only one MR, no need to replace LKey in WQE. */ 1625 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1626 wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); 1627 /* Stash a mbuf for next replacement. */ 1628 if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) 1629 rxq->mprq_repl = rep; 1630 else 1631 rxq->mprq_repl = NULL; 1632 } 1633 1634 /** 1635 * DPDK callback for RX with Multi-Packet RQ support. 1636 * 1637 * @param dpdk_rxq 1638 * Generic pointer to RX queue structure. 1639 * @param[out] pkts 1640 * Array to store received packets. 1641 * @param pkts_n 1642 * Maximum number of packets in array. 1643 * 1644 * @return 1645 * Number of packets successfully received (<= pkts_n). 
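 *
 * Each received packet occupies one or more strides of a large
 * multi-packet buffer. Short packets (up to rxq->mprq_max_memcpy_len),
 * or any packet while the MPRQ mempool is exhausted, are copied into a
 * freshly allocated mbuf; larger ones are attached to the stride memory
 * as an external buffer via rte_pktmbuf_attach_extbuf().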
1646 */ 1647 uint16_t 1648 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1649 { 1650 struct mlx5_rxq_data *rxq = dpdk_rxq; 1651 const unsigned int strd_n = 1 << rxq->strd_num_n; 1652 const unsigned int strd_sz = 1 << rxq->strd_sz_n; 1653 const unsigned int strd_shift = 1654 MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; 1655 const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; 1656 const unsigned int wq_mask = (1 << rxq->elts_n) - 1; 1657 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1658 unsigned int i = 0; 1659 uint32_t rq_ci = rxq->rq_ci; 1660 uint16_t consumed_strd = rxq->consumed_strd; 1661 uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM; 1662 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1663 1664 while (i < pkts_n) { 1665 struct rte_mbuf *pkt; 1666 void *addr; 1667 int ret; 1668 unsigned int len; 1669 uint16_t strd_cnt; 1670 uint16_t strd_idx; 1671 uint32_t offset; 1672 uint32_t byte_cnt; 1673 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1674 uint32_t rss_hash_res = 0; 1675 uint8_t lro_num_seg; 1676 1677 if (consumed_strd == strd_n) { 1678 /* Replace WQE only if the buffer is still in use. */ 1679 if (rte_atomic16_read(&buf->refcnt) > 1) { 1680 mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n); 1681 /* Release the old buffer. */ 1682 mlx5_mprq_buf_free(buf); 1683 } else if (unlikely(rxq->mprq_repl == NULL)) { 1684 struct mlx5_mprq_buf *rep; 1685 1686 /* 1687 * Currently, the MPRQ mempool is out of buffer 1688 * and doing memcpy regardless of the size of Rx 1689 * packet. Retry allocation to get back to 1690 * normal. 1691 */ 1692 if (!rte_mempool_get(rxq->mprq_mp, 1693 (void **)&rep)) 1694 rxq->mprq_repl = rep; 1695 } 1696 /* Advance to the next WQE. */ 1697 consumed_strd = 0; 1698 ++rq_ci; 1699 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1700 } 1701 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1702 ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe); 1703 if (!ret) 1704 break; 1705 byte_cnt = ret; 1706 strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> 1707 MLX5_MPRQ_STRIDE_NUM_SHIFT; 1708 MLX5_ASSERT(strd_cnt); 1709 consumed_strd += strd_cnt; 1710 if (byte_cnt & MLX5_MPRQ_FILLER_MASK) 1711 continue; 1712 if (mcqe == NULL) { 1713 rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); 1714 strd_idx = rte_be_to_cpu_16(cqe->wqe_counter); 1715 } else { 1716 /* mini-CQE for MPRQ doesn't have hash result. */ 1717 strd_idx = rte_be_to_cpu_16(mcqe->stride_idx); 1718 } 1719 MLX5_ASSERT(strd_idx < strd_n); 1720 MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & 1721 wq_mask)); 1722 lro_num_seg = cqe->lro_num_seg; 1723 /* 1724 * Currently configured to receive a packet per a stride. But if 1725 * MTU is adjusted through kernel interface, device could 1726 * consume multiple strides without raising an error. In this 1727 * case, the packet should be dropped because it is bigger than 1728 * the max_rx_pkt_len. 
1729 */ 1730 if (unlikely(!lro_num_seg && strd_cnt > 1)) { 1731 ++rxq->stats.idropped; 1732 continue; 1733 } 1734 pkt = rte_pktmbuf_alloc(rxq->mp); 1735 if (unlikely(pkt == NULL)) { 1736 ++rxq->stats.rx_nombuf; 1737 break; 1738 } 1739 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; 1740 MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); 1741 if (rxq->crc_present) 1742 len -= RTE_ETHER_CRC_LEN; 1743 offset = strd_idx * strd_sz + strd_shift; 1744 addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); 1745 /* 1746 * Memcpy packets to the target mbuf if: 1747 * - The size of packet is smaller than mprq_max_memcpy_len. 1748 * - Out of buffer in the Mempool for Multi-Packet RQ. 1749 */ 1750 if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) { 1751 /* 1752 * When memcpy'ing packet due to out-of-buffer, the 1753 * packet must be smaller than the target mbuf. 1754 */ 1755 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1756 rte_pktmbuf_free_seg(pkt); 1757 ++rxq->stats.idropped; 1758 continue; 1759 } 1760 rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len); 1761 DATA_LEN(pkt) = len; 1762 } else { 1763 rte_iova_t buf_iova; 1764 struct rte_mbuf_ext_shared_info *shinfo; 1765 uint16_t buf_len = strd_cnt * strd_sz; 1766 void *buf_addr; 1767 1768 /* Increment the refcnt of the whole chunk. */ 1769 rte_atomic16_add_return(&buf->refcnt, 1); 1770 MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <= 1771 strd_n + 1); 1772 buf_addr = RTE_PTR_SUB(addr, headroom_sz); 1773 /* 1774 * MLX5 device doesn't use iova but it is necessary in a 1775 * case where the Rx packet is transmitted via a 1776 * different PMD. 1777 */ 1778 buf_iova = rte_mempool_virt2iova(buf) + 1779 RTE_PTR_DIFF(buf_addr, buf); 1780 shinfo = &buf->shinfos[strd_idx]; 1781 rte_mbuf_ext_refcnt_set(shinfo, 1); 1782 /* 1783 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when 1784 * attaching the stride to mbuf and more offload flags 1785 * will be added below by calling rxq_cq_to_mbuf(). 1786 * Other fields will be overwritten. 1787 */ 1788 rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, 1789 buf_len, shinfo); 1790 /* Set mbuf head-room. */ 1791 pkt->data_off = headroom_sz; 1792 MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF); 1793 /* 1794 * Prevent potential overflow due to MTU change through 1795 * kernel interface. 1796 */ 1797 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1798 rte_pktmbuf_free_seg(pkt); 1799 ++rxq->stats.idropped; 1800 continue; 1801 } 1802 DATA_LEN(pkt) = len; 1803 /* 1804 * LRO packet may consume all the stride memory, in this 1805 * case packet head-room space is not guaranteed so must 1806 * to add an empty mbuf for the head-room. 1807 */ 1808 if (!rxq->strd_headroom_en) { 1809 struct rte_mbuf *headroom_mbuf = 1810 rte_pktmbuf_alloc(rxq->mp); 1811 1812 if (unlikely(headroom_mbuf == NULL)) { 1813 rte_pktmbuf_free_seg(pkt); 1814 ++rxq->stats.rx_nombuf; 1815 break; 1816 } 1817 PORT(pkt) = rxq->port_id; 1818 NEXT(headroom_mbuf) = pkt; 1819 pkt = headroom_mbuf; 1820 NB_SEGS(pkt) = 2; 1821 } 1822 } 1823 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1824 if (lro_num_seg > 1) { 1825 mlx5_lro_update_hdr(addr, cqe, len); 1826 pkt->ol_flags |= PKT_RX_LRO; 1827 pkt->tso_segsz = strd_sz; 1828 } 1829 PKT_LEN(pkt) = len; 1830 PORT(pkt) = rxq->port_id; 1831 #ifdef MLX5_PMD_SOFT_COUNTERS 1832 /* Increment bytes counter. */ 1833 rxq->stats.ibytes += PKT_LEN(pkt); 1834 #endif 1835 /* Return packet. */ 1836 *(pkts++) = pkt; 1837 ++i; 1838 } 1839 /* Update the consumer indexes. 
*/ 1840 rxq->consumed_strd = consumed_strd; 1841 rte_cio_wmb(); 1842 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1843 if (rq_ci != rxq->rq_ci) { 1844 rxq->rq_ci = rq_ci; 1845 rte_cio_wmb(); 1846 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1847 } 1848 #ifdef MLX5_PMD_SOFT_COUNTERS 1849 /* Increment packets counter. */ 1850 rxq->stats.ipackets += i; 1851 #endif 1852 return i; 1853 } 1854 1855 /** 1856 * Dummy DPDK callback for TX. 1857 * 1858 * This function is used to temporarily replace the real callback during 1859 * unsafe control operations on the queue, or in case of error. 1860 * 1861 * @param dpdk_txq 1862 * Generic pointer to TX queue structure. 1863 * @param[in] pkts 1864 * Packets to transmit. 1865 * @param pkts_n 1866 * Number of packets in array. 1867 * 1868 * @return 1869 * Number of packets successfully transmitted (<= pkts_n). 1870 */ 1871 uint16_t 1872 removed_tx_burst(void *dpdk_txq __rte_unused, 1873 struct rte_mbuf **pkts __rte_unused, 1874 uint16_t pkts_n __rte_unused) 1875 { 1876 rte_mb(); 1877 return 0; 1878 } 1879 1880 /** 1881 * Dummy DPDK callback for RX. 1882 * 1883 * This function is used to temporarily replace the real callback during 1884 * unsafe control operations on the queue, or in case of error. 1885 * 1886 * @param dpdk_rxq 1887 * Generic pointer to RX queue structure. 1888 * @param[out] pkts 1889 * Array to store received packets. 1890 * @param pkts_n 1891 * Maximum number of packets in array. 1892 * 1893 * @return 1894 * Number of packets successfully received (<= pkts_n). 1895 */ 1896 uint16_t 1897 removed_rx_burst(void *dpdk_txq __rte_unused, 1898 struct rte_mbuf **pkts __rte_unused, 1899 uint16_t pkts_n __rte_unused) 1900 { 1901 rte_mb(); 1902 return 0; 1903 } 1904 1905 /* 1906 * Vectorized Rx/Tx routines are not compiled in when required vector 1907 * instructions are not supported on a target architecture. The following null 1908 * stubs are needed for linkage when those are not included outside of this file 1909 * (e.g. mlx5_rxtx_vec_sse.c for x86). 1910 */ 1911 1912 __rte_weak uint16_t 1913 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, 1914 struct rte_mbuf **pkts __rte_unused, 1915 uint16_t pkts_n __rte_unused) 1916 { 1917 return 0; 1918 } 1919 1920 __rte_weak int 1921 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) 1922 { 1923 return -ENOTSUP; 1924 } 1925 1926 __rte_weak int 1927 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) 1928 { 1929 return -ENOTSUP; 1930 } 1931 1932 /** 1933 * Free the mbufs from the linear array of pointers. 1934 * 1935 * @param pkts 1936 * Pointer to array of packets to be free. 1937 * @param pkts_n 1938 * Number of packets to be freed. 1939 * @param olx 1940 * Configured Tx offloads mask. It is fully defined at 1941 * compile time and may be used for optimization. 1942 */ 1943 static __rte_always_inline void 1944 mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, 1945 unsigned int pkts_n, 1946 unsigned int olx __rte_unused) 1947 { 1948 struct rte_mempool *pool = NULL; 1949 struct rte_mbuf **p_free = NULL; 1950 struct rte_mbuf *mbuf; 1951 unsigned int n_free = 0; 1952 1953 /* 1954 * The implemented algorithm eliminates 1955 * copying pointers to temporary array 1956 * for rte_mempool_put_bulk() calls. 1957 */ 1958 MLX5_ASSERT(pkts); 1959 MLX5_ASSERT(pkts_n); 1960 for (;;) { 1961 for (;;) { 1962 /* 1963 * Decrement mbuf reference counter, detach 1964 * indirect and external buffers if needed. 
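			 * rte_pktmbuf_prefree_seg() returns the mbuf when it
			 * can be returned to its pool and NULL when the
			 * segment is still referenced, which is what the two
			 * branches below distinguish.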
1965 */ 1966 mbuf = rte_pktmbuf_prefree_seg(*pkts); 1967 if (likely(mbuf != NULL)) { 1968 MLX5_ASSERT(mbuf == *pkts); 1969 if (likely(n_free != 0)) { 1970 if (unlikely(pool != mbuf->pool)) 1971 /* From different pool. */ 1972 break; 1973 } else { 1974 /* Start new scan array. */ 1975 pool = mbuf->pool; 1976 p_free = pkts; 1977 } 1978 ++n_free; 1979 ++pkts; 1980 --pkts_n; 1981 if (unlikely(pkts_n == 0)) { 1982 mbuf = NULL; 1983 break; 1984 } 1985 } else { 1986 /* 1987 * This happens if mbuf is still referenced. 1988 * We can't put it back to the pool, skip. 1989 */ 1990 ++pkts; 1991 --pkts_n; 1992 if (unlikely(n_free != 0)) 1993 /* There is some array to free.*/ 1994 break; 1995 if (unlikely(pkts_n == 0)) 1996 /* Last mbuf, nothing to free. */ 1997 return; 1998 } 1999 } 2000 for (;;) { 2001 /* 2002 * This loop is implemented to avoid multiple 2003 * inlining of rte_mempool_put_bulk(). 2004 */ 2005 MLX5_ASSERT(pool); 2006 MLX5_ASSERT(p_free); 2007 MLX5_ASSERT(n_free); 2008 /* 2009 * Free the array of pre-freed mbufs 2010 * belonging to the same memory pool. 2011 */ 2012 rte_mempool_put_bulk(pool, (void *)p_free, n_free); 2013 if (unlikely(mbuf != NULL)) { 2014 /* There is the request to start new scan. */ 2015 pool = mbuf->pool; 2016 p_free = pkts++; 2017 n_free = 1; 2018 --pkts_n; 2019 if (likely(pkts_n != 0)) 2020 break; 2021 /* 2022 * This is the last mbuf to be freed. 2023 * Do one more loop iteration to complete. 2024 * This is rare case of the last unique mbuf. 2025 */ 2026 mbuf = NULL; 2027 continue; 2028 } 2029 if (likely(pkts_n == 0)) 2030 return; 2031 n_free = 0; 2032 break; 2033 } 2034 } 2035 } 2036 2037 /** 2038 * Free the mbuf from the elts ring buffer till new tail. 2039 * 2040 * @param txq 2041 * Pointer to Tx queue structure. 2042 * @param tail 2043 * Index in elts to free up to, becomes new elts tail. 2044 * @param olx 2045 * Configured Tx offloads mask. It is fully defined at 2046 * compile time and may be used for optimization. 2047 */ 2048 static __rte_always_inline void 2049 mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, 2050 uint16_t tail, 2051 unsigned int olx __rte_unused) 2052 { 2053 uint16_t n_elts = tail - txq->elts_tail; 2054 2055 MLX5_ASSERT(n_elts); 2056 MLX5_ASSERT(n_elts <= txq->elts_s); 2057 /* 2058 * Implement a loop to support ring buffer wraparound 2059 * with single inlining of mlx5_tx_free_mbuf(). 2060 */ 2061 do { 2062 unsigned int part; 2063 2064 part = txq->elts_s - (txq->elts_tail & txq->elts_m); 2065 part = RTE_MIN(part, n_elts); 2066 MLX5_ASSERT(part); 2067 MLX5_ASSERT(part <= txq->elts_s); 2068 mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m], 2069 part, olx); 2070 txq->elts_tail += part; 2071 n_elts -= part; 2072 } while (n_elts); 2073 } 2074 2075 /** 2076 * Store the mbuf being sent into elts ring buffer. 2077 * On Tx completion these mbufs will be freed. 2078 * 2079 * @param txq 2080 * Pointer to Tx queue structure. 2081 * @param pkts 2082 * Pointer to array of packets to be stored. 2083 * @param pkts_n 2084 * Number of packets to be stored. 2085 * @param olx 2086 * Configured Tx offloads mask. It is fully defined at 2087 * compile time and may be used for optimization. 
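 *
 * @note Only the wraparound of the elts ring buffer is handled here;
 *   the caller is assumed to have reserved at least pkts_n free slots
 *   in elts beforehand.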
2088 */ 2089 static __rte_always_inline void 2090 mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, 2091 struct rte_mbuf **restrict pkts, 2092 unsigned int pkts_n, 2093 unsigned int olx __rte_unused) 2094 { 2095 unsigned int part; 2096 struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts; 2097 2098 MLX5_ASSERT(pkts); 2099 MLX5_ASSERT(pkts_n); 2100 part = txq->elts_s - (txq->elts_head & txq->elts_m); 2101 MLX5_ASSERT(part); 2102 MLX5_ASSERT(part <= txq->elts_s); 2103 /* This code is a good candidate for vectorizing with SIMD. */ 2104 rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)), 2105 (void *)pkts, 2106 RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *)); 2107 txq->elts_head += pkts_n; 2108 if (unlikely(part < pkts_n)) 2109 /* The copy is wrapping around the elts array. */ 2110 rte_memcpy((void *)elts, (void *)(pkts + part), 2111 (pkts_n - part) * sizeof(struct rte_mbuf *)); 2112 } 2113 2114 /** 2115 * Update completion queue consuming index via doorbell 2116 * and flush the completed data buffers. 2117 * 2118 * @param txq 2119 * Pointer to TX queue structure. 2120 * @param valid CQE pointer 2121 * if not NULL update txq->wqe_pi and flush the buffers 2122 * @param olx 2123 * Configured Tx offloads mask. It is fully defined at 2124 * compile time and may be used for optimization. 2125 */ 2126 static __rte_always_inline void 2127 mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, 2128 volatile struct mlx5_cqe *last_cqe, 2129 unsigned int olx __rte_unused) 2130 { 2131 if (likely(last_cqe != NULL)) { 2132 uint16_t tail; 2133 2134 txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter); 2135 tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m]; 2136 if (likely(tail != txq->elts_tail)) { 2137 mlx5_tx_free_elts(txq, tail, olx); 2138 MLX5_ASSERT(tail == txq->elts_tail); 2139 } 2140 } 2141 } 2142 2143 /** 2144 * Manage TX completions. This routine checks the CQ for 2145 * arrived CQEs, deduces the last accomplished WQE in SQ, 2146 * updates SQ producing index and frees all completed mbufs. 2147 * 2148 * @param txq 2149 * Pointer to TX queue structure. 2150 * @param olx 2151 * Configured Tx offloads mask. It is fully defined at 2152 * compile time and may be used for optimization. 2153 * 2154 * NOTE: not inlined intentionally, it makes tx_burst 2155 * routine smaller, simple and faster - from experiments. 2156 */ 2157 static void 2158 mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, 2159 unsigned int olx __rte_unused) 2160 { 2161 unsigned int count = MLX5_TX_COMP_MAX_CQE; 2162 volatile struct mlx5_cqe *last_cqe = NULL; 2163 uint16_t ci = txq->cq_ci; 2164 int ret; 2165 2166 static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); 2167 static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value"); 2168 do { 2169 volatile struct mlx5_cqe *cqe; 2170 2171 cqe = &txq->cqes[ci & txq->cqe_m]; 2172 ret = check_cqe(cqe, txq->cqe_s, ci); 2173 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 2174 if (likely(ret != MLX5_CQE_STATUS_ERR)) { 2175 /* No new CQEs in completion queue. */ 2176 MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN); 2177 break; 2178 } 2179 /* 2180 * Some error occurred, try to restart. 2181 * We have no barrier after WQE related Doorbell 2182 * written, make sure all writes are completed 2183 * here, before we might perform SQ reset. 
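			 * The consumer index is also stored to txq->cq_ci
			 * right after the barrier, so the error handler
			 * below operates on an up-to-date CQ state.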
2184 */ 2185 rte_wmb(); 2186 txq->cq_ci = ci; 2187 ret = mlx5_tx_error_cqe_handle 2188 (txq, (volatile struct mlx5_err_cqe *)cqe); 2189 if (unlikely(ret < 0)) { 2190 /* 2191 * Some error occurred on queue error 2192 * handling, we do not advance the index 2193 * here, allowing to retry on next call. 2194 */ 2195 return; 2196 } 2197 /* 2198 * We are going to fetch all entries with 2199 * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status. 2200 * The send queue is supposed to be empty. 2201 */ 2202 ++ci; 2203 txq->cq_pi = ci; 2204 last_cqe = NULL; 2205 continue; 2206 } 2207 /* Normal transmit completion. */ 2208 MLX5_ASSERT(ci != txq->cq_pi); 2209 MLX5_ASSERT((txq->fcqs[ci & txq->cqe_m] >> 16) == 2210 cqe->wqe_counter); 2211 ++ci; 2212 last_cqe = cqe; 2213 /* 2214 * We have to restrict the amount of processed CQEs 2215 * in one tx_burst routine call. The CQ may be large 2216 * and many CQEs may be updated by the NIC in one 2217 * transaction. Buffers freeing is time consuming, 2218 * multiple iterations may introduce significant 2219 * latency. 2220 */ 2221 if (likely(--count == 0)) 2222 break; 2223 } while (true); 2224 if (likely(ci != txq->cq_ci)) { 2225 /* 2226 * Update completion queue consuming index 2227 * and ring doorbell to notify hardware. 2228 */ 2229 rte_compiler_barrier(); 2230 txq->cq_ci = ci; 2231 *txq->cq_db = rte_cpu_to_be_32(ci); 2232 mlx5_tx_comp_flush(txq, last_cqe, olx); 2233 } 2234 } 2235 2236 /** 2237 * Check if the completion request flag should be set in the last WQE. 2238 * Both pushed mbufs and WQEs are monitored and the completion request 2239 * flag is set if any of thresholds is reached. 2240 * 2241 * @param txq 2242 * Pointer to TX queue structure. 2243 * @param loc 2244 * Pointer to burst routine local context. 2245 * @param olx 2246 * Configured Tx offloads mask. It is fully defined at 2247 * compile time and may be used for optimization. 2248 */ 2249 static __rte_always_inline void 2250 mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, 2251 struct mlx5_txq_local *restrict loc, 2252 unsigned int olx) 2253 { 2254 uint16_t head = txq->elts_head; 2255 unsigned int part; 2256 2257 part = MLX5_TXOFF_CONFIG(INLINE) ? 2258 0 : loc->pkts_sent - loc->pkts_copy; 2259 head += part; 2260 if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH || 2261 (MLX5_TXOFF_CONFIG(INLINE) && 2262 (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) { 2263 volatile struct mlx5_wqe *last = loc->wqe_last; 2264 2265 MLX5_ASSERT(last); 2266 txq->elts_comp = head; 2267 if (MLX5_TXOFF_CONFIG(INLINE)) 2268 txq->wqe_comp = txq->wqe_ci; 2269 /* Request unconditional completion on last WQE. */ 2270 last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS << 2271 MLX5_COMP_MODE_OFFSET); 2272 /* Save elts_head in dedicated free on completion queue. */ 2273 #ifdef RTE_LIBRTE_MLX5_DEBUG 2274 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head | 2275 (last->cseg.opcode >> 8) << 16; 2276 #else 2277 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head; 2278 #endif 2279 /* A CQE slot must always be available. */ 2280 MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s); 2281 } 2282 } 2283 2284 /** 2285 * DPDK callback to check the status of a tx descriptor. 2286 * 2287 * @param tx_queue 2288 * The tx queue. 2289 * @param[in] offset 2290 * The index of the descriptor in the ring. 2291 * 2292 * @return 2293 * The status of the tx descriptor. 
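 *   RTE_ETH_TX_DESC_FULL means the descriptor at the given offset is
 *   still in use by the PMD, RTE_ETH_TX_DESC_DONE means it has been
 *   completed; pending completions are polled before the check.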
2294 */ 2295 int 2296 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) 2297 { 2298 struct mlx5_txq_data *restrict txq = tx_queue; 2299 uint16_t used; 2300 2301 mlx5_tx_handle_completion(txq, 0); 2302 used = txq->elts_head - txq->elts_tail; 2303 if (offset < used) 2304 return RTE_ETH_TX_DESC_FULL; 2305 return RTE_ETH_TX_DESC_DONE; 2306 } 2307 2308 /** 2309 * Build the Control Segment with specified opcode: 2310 * - MLX5_OPCODE_SEND 2311 * - MLX5_OPCODE_ENHANCED_MPSW 2312 * - MLX5_OPCODE_TSO 2313 * 2314 * @param txq 2315 * Pointer to TX queue structure. 2316 * @param loc 2317 * Pointer to burst routine local context. 2318 * @param wqe 2319 * Pointer to WQE to fill with built Control Segment. 2320 * @param ds 2321 * Supposed length of WQE in segments. 2322 * @param opcode 2323 * SQ WQE opcode to put into Control Segment. 2324 * @param olx 2325 * Configured Tx offloads mask. It is fully defined at 2326 * compile time and may be used for optimization. 2327 */ 2328 static __rte_always_inline void 2329 mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, 2330 struct mlx5_txq_local *restrict loc __rte_unused, 2331 struct mlx5_wqe *restrict wqe, 2332 unsigned int ds, 2333 unsigned int opcode, 2334 unsigned int olx __rte_unused) 2335 { 2336 struct mlx5_wqe_cseg *restrict cs = &wqe->cseg; 2337 2338 /* For legacy MPW replace the EMPW by TSO with modifier. */ 2339 if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW) 2340 opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24; 2341 cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode); 2342 cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 2343 cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << 2344 MLX5_COMP_MODE_OFFSET); 2345 cs->misc = RTE_BE32(0); 2346 } 2347 2348 /** 2349 * Build the Ethernet Segment without inlined data. 2350 * Supports Software Parser, Checksums and VLAN 2351 * insertion Tx offload features. 2352 * 2353 * @param txq 2354 * Pointer to TX queue structure. 2355 * @param loc 2356 * Pointer to burst routine local context. 2357 * @param wqe 2358 * Pointer to WQE to fill with built Ethernet Segment. 2359 * @param olx 2360 * Configured Tx offloads mask. It is fully defined at 2361 * compile time and may be used for optimization. 2362 */ 2363 static __rte_always_inline void 2364 mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused, 2365 struct mlx5_txq_local *restrict loc, 2366 struct mlx5_wqe *restrict wqe, 2367 unsigned int olx) 2368 { 2369 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2370 uint32_t csum; 2371 2372 /* 2373 * Calculate and set check sum flags first, dword field 2374 * in segment may be shared with Software Parser flags. 2375 */ 2376 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2377 es->flags = rte_cpu_to_le_32(csum); 2378 /* 2379 * Calculate and set Software Parser offsets and flags. 2380 * These flags a set for custom UDP and IP tunnel packets. 2381 */ 2382 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2383 /* Fill metadata field if needed. */ 2384 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2385 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2386 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2387 /* Engage VLAN tag insertion feature if requested. */ 2388 if (MLX5_TXOFF_CONFIG(VLAN) && 2389 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 2390 /* 2391 * We should get here only if device support 2392 * this feature correctly. 
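		 * txq->vlan_en is presumably set at Tx queue setup when the
		 * device reports WQE VLAN insertion support, hence only an
		 * assertion here and no runtime fallback.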
		 */
		MLX5_ASSERT(txq->vlan_en);
		es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
						  loc->mbuf->vlan_tci);
	} else {
		es->inline_hdr = RTE_BE32(0);
	}
}

/**
 * Build the Ethernet Segment with minimal inlined data
 * of MLX5_ESEG_MIN_INLINE_SIZE bytes. This is
 * used to fill the gap in single WQEBB WQEs.
 * Supports Software Parser, Checksums and VLAN
 * insertion Tx offload features.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param loc
 *   Pointer to burst routine local context.
 * @param wqe
 *   Pointer to WQE to fill with built Ethernet Segment.
 * @param vlan
 *   Length of VLAN tag insertion if any.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 */
static __rte_always_inline void
mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused,
		  struct mlx5_txq_local *restrict loc,
		  struct mlx5_wqe *restrict wqe,
		  unsigned int vlan,
		  unsigned int olx)
{
	struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
	uint32_t csum;
	uint8_t *psrc, *pdst;

	/*
	 * Calculate and set check sum flags first, dword field
	 * in segment may be shared with Software Parser flags.
	 */
	csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
	es->flags = rte_cpu_to_le_32(csum);
	/*
	 * Calculate and set Software Parser offsets and flags.
	 * These flags are set for custom UDP and IP tunnel packets.
	 */
	es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
	/* Fill metadata field if needed. */
	es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
		       loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
		       *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
				(sizeof(uint16_t) +
				 sizeof(rte_v128u32_t)),
		      "invalid Ethernet Segment data size");
	static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
				(sizeof(uint16_t) +
				 sizeof(struct rte_vlan_hdr) +
				 2 * RTE_ETHER_ADDR_LEN),
		      "invalid Ethernet Segment data size");
	psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
	es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
	es->inline_data = *(unaligned_uint16_t *)psrc;
	psrc += sizeof(uint16_t);
	pdst = (uint8_t *)(es + 1);
	if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
		/* Implement VLAN tag insertion as part of the inline data. */
		memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
		pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
		psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
		/* Insert VLAN ethertype + VLAN tag. */
		*(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
						((RTE_ETHER_TYPE_VLAN << 16) |
						 loc->mbuf->vlan_tci);
		pdst += sizeof(struct rte_vlan_hdr);
		/* Copy the remaining two bytes from packet data. */
		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
		*(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
	} else {
		/* Fill the gap in the title WQEBB with inline data. */
		rte_mov16(pdst, psrc);
	}
}

/**
 * Build the Ethernet Segment with entire packet
 * data inlining. Checks the boundary of WQEBB and
 * ring buffer wrapping, supports Software Parser,
 * Checksums and VLAN insertion Tx offload features.
 *
 * @param txq
 *   Pointer to TX queue structure.
2488 * @param loc 2489 * Pointer to burst routine local context. 2490 * @param wqe 2491 * Pointer to WQE to fill with built Ethernet Segment. 2492 * @param vlan 2493 * Length of VLAN tag insertion if any. 2494 * @param inlen 2495 * Length of data to inline (VLAN included, if any). 2496 * @param tso 2497 * TSO flag, set mss field from the packet. 2498 * @param olx 2499 * Configured Tx offloads mask. It is fully defined at 2500 * compile time and may be used for optimization. 2501 * 2502 * @return 2503 * Pointer to the next Data Segment (aligned and wrapped around). 2504 */ 2505 static __rte_always_inline struct mlx5_wqe_dseg * 2506 mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, 2507 struct mlx5_txq_local *restrict loc, 2508 struct mlx5_wqe *restrict wqe, 2509 unsigned int vlan, 2510 unsigned int inlen, 2511 unsigned int tso, 2512 unsigned int olx) 2513 { 2514 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2515 uint32_t csum; 2516 uint8_t *psrc, *pdst; 2517 unsigned int part; 2518 2519 /* 2520 * Calculate and set check sum flags first, dword field 2521 * in segment may be shared with Software Parser flags. 2522 */ 2523 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2524 if (tso) { 2525 csum <<= 24; 2526 csum |= loc->mbuf->tso_segsz; 2527 es->flags = rte_cpu_to_be_32(csum); 2528 } else { 2529 es->flags = rte_cpu_to_le_32(csum); 2530 } 2531 /* 2532 * Calculate and set Software Parser offsets and flags. 2533 * These flags a set for custom UDP and IP tunnel packets. 2534 */ 2535 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2536 /* Fill metadata field if needed. */ 2537 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2538 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2539 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2540 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2541 (sizeof(uint16_t) + 2542 sizeof(rte_v128u32_t)), 2543 "invalid Ethernet Segment data size"); 2544 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2545 (sizeof(uint16_t) + 2546 sizeof(struct rte_vlan_hdr) + 2547 2 * RTE_ETHER_ADDR_LEN), 2548 "invalid Ethernet Segment data size"); 2549 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2550 es->inline_hdr_sz = rte_cpu_to_be_16(inlen); 2551 es->inline_data = *(unaligned_uint16_t *)psrc; 2552 psrc += sizeof(uint16_t); 2553 pdst = (uint8_t *)(es + 1); 2554 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2555 /* Implement VLAN tag insertion as part inline data. */ 2556 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2557 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2558 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2559 /* Insert VLAN ethertype + VLAN tag. */ 2560 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2561 ((RTE_ETHER_TYPE_VLAN << 16) | 2562 loc->mbuf->vlan_tci); 2563 pdst += sizeof(struct rte_vlan_hdr); 2564 /* Copy the rest two bytes from packet data. */ 2565 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2566 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2567 psrc += sizeof(uint16_t); 2568 } else { 2569 /* Fill the gap in the title WQEBB with inline data. */ 2570 rte_mov16(pdst, psrc); 2571 psrc += sizeof(rte_v128u32_t); 2572 } 2573 pdst = (uint8_t *)(es + 2); 2574 MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2575 MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end); 2576 inlen -= MLX5_ESEG_MIN_INLINE_SIZE; 2577 if (!inlen) { 2578 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 2579 return (struct mlx5_wqe_dseg *)pdst; 2580 } 2581 /* 2582 * The WQEBB space availability is checked by caller. 
2583 * Here we should be aware of WQE ring buffer wraparound only. 2584 */ 2585 part = (uint8_t *)txq->wqes_end - pdst; 2586 part = RTE_MIN(part, inlen); 2587 do { 2588 rte_memcpy(pdst, psrc, part); 2589 inlen -= part; 2590 if (likely(!inlen)) { 2591 /* 2592 * If return value is not used by the caller 2593 * the code below will be optimized out. 2594 */ 2595 pdst += part; 2596 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2597 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 2598 pdst = (uint8_t *)txq->wqes; 2599 return (struct mlx5_wqe_dseg *)pdst; 2600 } 2601 pdst = (uint8_t *)txq->wqes; 2602 psrc += part; 2603 part = inlen; 2604 } while (true); 2605 } 2606 2607 /** 2608 * Copy data from chain of mbuf to the specified linear buffer. 2609 * Checksums and VLAN insertion Tx offload features. If data 2610 * from some mbuf copied completely this mbuf is freed. Local 2611 * structure is used to keep the byte stream state. 2612 * 2613 * @param pdst 2614 * Pointer to the destination linear buffer. 2615 * @param loc 2616 * Pointer to burst routine local context. 2617 * @param len 2618 * Length of data to be copied. 2619 * @param must 2620 * Length of data to be copied ignoring no inline hint. 2621 * @param olx 2622 * Configured Tx offloads mask. It is fully defined at 2623 * compile time and may be used for optimization. 2624 * 2625 * @return 2626 * Number of actual copied data bytes. This is always greater than or 2627 * equal to must parameter and might be lesser than len in no inline 2628 * hint flag is encountered. 2629 */ 2630 static __rte_always_inline unsigned int 2631 mlx5_tx_mseg_memcpy(uint8_t *pdst, 2632 struct mlx5_txq_local *restrict loc, 2633 unsigned int len, 2634 unsigned int must, 2635 unsigned int olx __rte_unused) 2636 { 2637 struct rte_mbuf *mbuf; 2638 unsigned int part, dlen, copy = 0; 2639 uint8_t *psrc; 2640 2641 MLX5_ASSERT(len); 2642 MLX5_ASSERT(must <= len); 2643 do { 2644 /* Allow zero length packets, must check first. */ 2645 dlen = rte_pktmbuf_data_len(loc->mbuf); 2646 if (dlen <= loc->mbuf_off) { 2647 /* Exhausted packet, just free. */ 2648 mbuf = loc->mbuf; 2649 loc->mbuf = mbuf->next; 2650 rte_pktmbuf_free_seg(mbuf); 2651 loc->mbuf_off = 0; 2652 MLX5_ASSERT(loc->mbuf_nseg > 1); 2653 MLX5_ASSERT(loc->mbuf); 2654 --loc->mbuf_nseg; 2655 if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) { 2656 unsigned int diff; 2657 2658 if (copy >= must) { 2659 /* 2660 * We already copied the minimal 2661 * requested amount of data. 2662 */ 2663 return copy; 2664 } 2665 diff = must - copy; 2666 if (diff <= rte_pktmbuf_data_len(loc->mbuf)) { 2667 /* 2668 * Copy only the minimal required 2669 * part of the data buffer. 2670 */ 2671 len = diff; 2672 } 2673 } 2674 continue; 2675 } 2676 dlen -= loc->mbuf_off; 2677 psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 2678 loc->mbuf_off); 2679 part = RTE_MIN(len, dlen); 2680 rte_memcpy(pdst, psrc, part); 2681 copy += part; 2682 loc->mbuf_off += part; 2683 len -= part; 2684 if (!len) { 2685 if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) { 2686 loc->mbuf_off = 0; 2687 /* Exhausted packet, just free. */ 2688 mbuf = loc->mbuf; 2689 loc->mbuf = mbuf->next; 2690 rte_pktmbuf_free_seg(mbuf); 2691 loc->mbuf_off = 0; 2692 MLX5_ASSERT(loc->mbuf_nseg >= 1); 2693 --loc->mbuf_nseg; 2694 } 2695 return copy; 2696 } 2697 pdst += part; 2698 } while (true); 2699 } 2700 2701 /** 2702 * Build the Ethernet Segment with inlined data from 2703 * multi-segment packet. 
Checks the boundary of WQEBB 2704 * and ring buffer wrapping, supports Software Parser, 2705 * Checksums and VLAN insertion Tx offload features. 2706 * 2707 * @param txq 2708 * Pointer to TX queue structure. 2709 * @param loc 2710 * Pointer to burst routine local context. 2711 * @param wqe 2712 * Pointer to WQE to fill with built Ethernet Segment. 2713 * @param vlan 2714 * Length of VLAN tag insertion if any. 2715 * @param inlen 2716 * Length of data to inline (VLAN included, if any). 2717 * @param tso 2718 * TSO flag, set mss field from the packet. 2719 * @param olx 2720 * Configured Tx offloads mask. It is fully defined at 2721 * compile time and may be used for optimization. 2722 * 2723 * @return 2724 * Pointer to the next Data Segment (aligned and 2725 * possible NOT wrapped around - caller should do 2726 * wrapping check on its own). 2727 */ 2728 static __rte_always_inline struct mlx5_wqe_dseg * 2729 mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, 2730 struct mlx5_txq_local *restrict loc, 2731 struct mlx5_wqe *restrict wqe, 2732 unsigned int vlan, 2733 unsigned int inlen, 2734 unsigned int tso, 2735 unsigned int olx) 2736 { 2737 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2738 uint32_t csum; 2739 uint8_t *pdst; 2740 unsigned int part, tlen = 0; 2741 2742 /* 2743 * Calculate and set check sum flags first, uint32_t field 2744 * in segment may be shared with Software Parser flags. 2745 */ 2746 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2747 if (tso) { 2748 csum <<= 24; 2749 csum |= loc->mbuf->tso_segsz; 2750 es->flags = rte_cpu_to_be_32(csum); 2751 } else { 2752 es->flags = rte_cpu_to_le_32(csum); 2753 } 2754 /* 2755 * Calculate and set Software Parser offsets and flags. 2756 * These flags a set for custom UDP and IP tunnel packets. 2757 */ 2758 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2759 /* Fill metadata field if needed. */ 2760 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2761 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2762 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2763 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2764 (sizeof(uint16_t) + 2765 sizeof(rte_v128u32_t)), 2766 "invalid Ethernet Segment data size"); 2767 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2768 (sizeof(uint16_t) + 2769 sizeof(struct rte_vlan_hdr) + 2770 2 * RTE_ETHER_ADDR_LEN), 2771 "invalid Ethernet Segment data size"); 2772 MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2773 pdst = (uint8_t *)&es->inline_data; 2774 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2775 /* Implement VLAN tag insertion as part inline data. */ 2776 mlx5_tx_mseg_memcpy(pdst, loc, 2777 2 * RTE_ETHER_ADDR_LEN, 2778 2 * RTE_ETHER_ADDR_LEN, olx); 2779 pdst += 2 * RTE_ETHER_ADDR_LEN; 2780 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2781 ((RTE_ETHER_TYPE_VLAN << 16) | 2782 loc->mbuf->vlan_tci); 2783 pdst += sizeof(struct rte_vlan_hdr); 2784 tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr); 2785 } 2786 MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end); 2787 /* 2788 * The WQEBB space availability is checked by caller. 2789 * Here we should be aware of WQE ring buffer wraparound only. 2790 */ 2791 part = (uint8_t *)txq->wqes_end - pdst; 2792 part = RTE_MIN(part, inlen - tlen); 2793 MLX5_ASSERT(part); 2794 do { 2795 unsigned int copy; 2796 2797 /* 2798 * Copying may be interrupted inside the routine 2799 * if run into no inline hint flag. 2800 */ 2801 copy = tlen >= txq->inlen_mode ? 
0 : (txq->inlen_mode - tlen); 2802 copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx); 2803 tlen += copy; 2804 if (likely(inlen <= tlen) || copy < part) { 2805 es->inline_hdr_sz = rte_cpu_to_be_16(tlen); 2806 pdst += copy; 2807 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2808 return (struct mlx5_wqe_dseg *)pdst; 2809 } 2810 pdst = (uint8_t *)txq->wqes; 2811 part = inlen - tlen; 2812 } while (true); 2813 } 2814 2815 /** 2816 * Build the Data Segment of pointer type. 2817 * 2818 * @param txq 2819 * Pointer to TX queue structure. 2820 * @param loc 2821 * Pointer to burst routine local context. 2822 * @param dseg 2823 * Pointer to WQE to fill with built Data Segment. 2824 * @param buf 2825 * Data buffer to point. 2826 * @param len 2827 * Data buffer length. 2828 * @param olx 2829 * Configured Tx offloads mask. It is fully defined at 2830 * compile time and may be used for optimization. 2831 */ 2832 static __rte_always_inline void 2833 mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq, 2834 struct mlx5_txq_local *restrict loc, 2835 struct mlx5_wqe_dseg *restrict dseg, 2836 uint8_t *buf, 2837 unsigned int len, 2838 unsigned int olx __rte_unused) 2839 2840 { 2841 MLX5_ASSERT(len); 2842 dseg->bcount = rte_cpu_to_be_32(len); 2843 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2844 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2845 } 2846 2847 /** 2848 * Build the Data Segment of pointer type or inline 2849 * if data length is less than buffer in minimal 2850 * Data Segment size. 2851 * 2852 * @param txq 2853 * Pointer to TX queue structure. 2854 * @param loc 2855 * Pointer to burst routine local context. 2856 * @param dseg 2857 * Pointer to WQE to fill with built Data Segment. 2858 * @param buf 2859 * Data buffer to point. 2860 * @param len 2861 * Data buffer length. 2862 * @param olx 2863 * Configured Tx offloads mask. It is fully defined at 2864 * compile time and may be used for optimization. 2865 */ 2866 static __rte_always_inline void 2867 mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, 2868 struct mlx5_txq_local *restrict loc, 2869 struct mlx5_wqe_dseg *restrict dseg, 2870 uint8_t *buf, 2871 unsigned int len, 2872 unsigned int olx __rte_unused) 2873 2874 { 2875 uintptr_t dst, src; 2876 2877 MLX5_ASSERT(len); 2878 if (len > MLX5_DSEG_MIN_INLINE_SIZE) { 2879 dseg->bcount = rte_cpu_to_be_32(len); 2880 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2881 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2882 2883 return; 2884 } 2885 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2886 /* Unrolled implementation of generic rte_memcpy. 
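	 * The inline path is taken only for len <= MLX5_DSEG_MIN_INLINE_SIZE,
	 * so testing the 0x08/0x04/0x02/0x01 bits of len covers every
	 * possible length, e.g. len = 11 copies 8 + 2 + 1 bytes.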
*/ 2887 dst = (uintptr_t)&dseg->inline_data[0]; 2888 src = (uintptr_t)buf; 2889 if (len & 0x08) { 2890 #ifdef RTE_ARCH_STRICT_ALIGN 2891 MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t))); 2892 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2893 dst += sizeof(uint32_t); 2894 src += sizeof(uint32_t); 2895 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2896 dst += sizeof(uint32_t); 2897 src += sizeof(uint32_t); 2898 #else 2899 *(uint64_t *)dst = *(unaligned_uint64_t *)src; 2900 dst += sizeof(uint64_t); 2901 src += sizeof(uint64_t); 2902 #endif 2903 } 2904 if (len & 0x04) { 2905 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2906 dst += sizeof(uint32_t); 2907 src += sizeof(uint32_t); 2908 } 2909 if (len & 0x02) { 2910 *(uint16_t *)dst = *(unaligned_uint16_t *)src; 2911 dst += sizeof(uint16_t); 2912 src += sizeof(uint16_t); 2913 } 2914 if (len & 0x01) 2915 *(uint8_t *)dst = *(uint8_t *)src; 2916 } 2917 2918 /** 2919 * Build the Data Segment of inlined data from single 2920 * segment packet, no VLAN insertion. 2921 * 2922 * @param txq 2923 * Pointer to TX queue structure. 2924 * @param loc 2925 * Pointer to burst routine local context. 2926 * @param dseg 2927 * Pointer to WQE to fill with built Data Segment. 2928 * @param buf 2929 * Data buffer to point. 2930 * @param len 2931 * Data buffer length. 2932 * @param olx 2933 * Configured Tx offloads mask. It is fully defined at 2934 * compile time and may be used for optimization. 2935 * 2936 * @return 2937 * Pointer to the next Data Segment after inlined data. 2938 * Ring buffer wraparound check is needed. We do not 2939 * do it here because it may not be needed for the 2940 * last packet in the eMPW session. 2941 */ 2942 static __rte_always_inline struct mlx5_wqe_dseg * 2943 mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, 2944 struct mlx5_txq_local *restrict loc __rte_unused, 2945 struct mlx5_wqe_dseg *restrict dseg, 2946 uint8_t *buf, 2947 unsigned int len, 2948 unsigned int olx __rte_unused) 2949 { 2950 unsigned int part; 2951 uint8_t *pdst; 2952 2953 if (!MLX5_TXOFF_CONFIG(MPW)) { 2954 /* Store the descriptor byte counter for eMPW sessions. */ 2955 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2956 pdst = &dseg->inline_data[0]; 2957 } else { 2958 /* The entire legacy MPW session counter is stored on close. */ 2959 pdst = (uint8_t *)dseg; 2960 } 2961 /* 2962 * The WQEBB space availability is checked by caller. 2963 * Here we should be aware of WQE ring buffer wraparound only. 2964 */ 2965 part = (uint8_t *)txq->wqes_end - pdst; 2966 part = RTE_MIN(part, len); 2967 do { 2968 rte_memcpy(pdst, buf, part); 2969 len -= part; 2970 if (likely(!len)) { 2971 pdst += part; 2972 if (!MLX5_TXOFF_CONFIG(MPW)) 2973 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2974 /* Note: no final wraparound check here. */ 2975 return (struct mlx5_wqe_dseg *)pdst; 2976 } 2977 pdst = (uint8_t *)txq->wqes; 2978 buf += part; 2979 part = len; 2980 } while (true); 2981 } 2982 2983 /** 2984 * Build the Data Segment of inlined data from single 2985 * segment packet with VLAN insertion. 2986 * 2987 * @param txq 2988 * Pointer to TX queue structure. 2989 * @param loc 2990 * Pointer to burst routine local context. 2991 * @param dseg 2992 * Pointer to the dseg fill with built Data Segment. 2993 * @param buf 2994 * Data buffer to point. 2995 * @param len 2996 * Data buffer length. 2997 * @param olx 2998 * Configured Tx offloads mask. It is fully defined at 2999 * compile time and may be used for optimization. 
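 *
 * @note The first MLX5_DSEG_MIN_INLINE_SIZE bytes (the Ethernet
 *   addresses) are copied first and the VLAN header is inserted right
 *   after them, so the inlined size grows by sizeof(struct rte_vlan_hdr)
 *   relative to len.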
3000 * 3001 * @return 3002 * Pointer to the next Data Segment after inlined data. 3003 * Ring buffer wraparound check is needed. 3004 */ 3005 static __rte_always_inline struct mlx5_wqe_dseg * 3006 mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, 3007 struct mlx5_txq_local *restrict loc __rte_unused, 3008 struct mlx5_wqe_dseg *restrict dseg, 3009 uint8_t *buf, 3010 unsigned int len, 3011 unsigned int olx __rte_unused) 3012 3013 { 3014 unsigned int part; 3015 uint8_t *pdst; 3016 3017 MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE); 3018 static_assert(MLX5_DSEG_MIN_INLINE_SIZE == 3019 (2 * RTE_ETHER_ADDR_LEN), 3020 "invalid Data Segment data size"); 3021 if (!MLX5_TXOFF_CONFIG(MPW)) { 3022 /* Store the descriptor byte counter for eMPW sessions. */ 3023 dseg->bcount = rte_cpu_to_be_32 3024 ((len + sizeof(struct rte_vlan_hdr)) | 3025 MLX5_ETH_WQE_DATA_INLINE); 3026 pdst = &dseg->inline_data[0]; 3027 } else { 3028 /* The entire legacy MPW session counter is stored on close. */ 3029 pdst = (uint8_t *)dseg; 3030 } 3031 memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); 3032 buf += MLX5_DSEG_MIN_INLINE_SIZE; 3033 pdst += MLX5_DSEG_MIN_INLINE_SIZE; 3034 len -= MLX5_DSEG_MIN_INLINE_SIZE; 3035 /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */ 3036 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 3037 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 3038 pdst = (uint8_t *)txq->wqes; 3039 *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) | 3040 loc->mbuf->vlan_tci); 3041 pdst += sizeof(struct rte_vlan_hdr); 3042 /* 3043 * The WQEBB space availability is checked by caller. 3044 * Here we should be aware of WQE ring buffer wraparound only. 3045 */ 3046 part = (uint8_t *)txq->wqes_end - pdst; 3047 part = RTE_MIN(part, len); 3048 do { 3049 rte_memcpy(pdst, buf, part); 3050 len -= part; 3051 if (likely(!len)) { 3052 pdst += part; 3053 if (!MLX5_TXOFF_CONFIG(MPW)) 3054 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 3055 /* Note: no final wraparound check here. */ 3056 return (struct mlx5_wqe_dseg *)pdst; 3057 } 3058 pdst = (uint8_t *)txq->wqes; 3059 buf += part; 3060 part = len; 3061 } while (true); 3062 } 3063 3064 /** 3065 * Build the Ethernet Segment with optionally inlined data with 3066 * VLAN insertion and following Data Segments (if any) from 3067 * multi-segment packet. Used by ordinary send and TSO. 3068 * 3069 * @param txq 3070 * Pointer to TX queue structure. 3071 * @param loc 3072 * Pointer to burst routine local context. 3073 * @param wqe 3074 * Pointer to WQE to fill with built Ethernet/Data Segments. 3075 * @param vlan 3076 * Length of VLAN header to insert, 0 means no VLAN insertion. 3077 * @param inlen 3078 * Data length to inline. For TSO this parameter specifies 3079 * exact value, for ordinary send routine can be aligned by 3080 * caller to provide better WQE space saving and data buffer 3081 * start address alignment. This length includes VLAN header 3082 * being inserted. 3083 * @param tso 3084 * Zero means ordinary send, inlined data can be extended, 3085 * otherwise this is TSO, inlined data length is fixed. 3086 * @param olx 3087 * Configured Tx offloads mask. It is fully defined at 3088 * compile time and may be used for optimization. 3089 * 3090 * @return 3091 * Actual size of built WQE in segments. 
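 *   One WQEBB holds four segments, so callers convert this value to
 *   WQEBBs with (ds + 3) / 4, e.g. ds = 6 occupies two WQEBBs.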
3092 */ 3093 static __rte_always_inline unsigned int 3094 mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, 3095 struct mlx5_txq_local *restrict loc, 3096 struct mlx5_wqe *restrict wqe, 3097 unsigned int vlan, 3098 unsigned int inlen, 3099 unsigned int tso, 3100 unsigned int olx __rte_unused) 3101 { 3102 struct mlx5_wqe_dseg *restrict dseg; 3103 unsigned int ds; 3104 3105 MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen); 3106 loc->mbuf_nseg = NB_SEGS(loc->mbuf); 3107 loc->mbuf_off = 0; 3108 3109 dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx); 3110 if (!loc->mbuf_nseg) 3111 goto dseg_done; 3112 /* 3113 * There are still some mbuf remaining, not inlined. 3114 * The first mbuf may be partially inlined and we 3115 * must process the possible non-zero data offset. 3116 */ 3117 if (loc->mbuf_off) { 3118 unsigned int dlen; 3119 uint8_t *dptr; 3120 3121 /* 3122 * Exhausted packets must be dropped before. 3123 * Non-zero offset means there are some data 3124 * remained in the packet. 3125 */ 3126 MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf)); 3127 MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf)); 3128 dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 3129 loc->mbuf_off); 3130 dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off; 3131 /* 3132 * Build the pointer/minimal data Data Segment. 3133 * Do ring buffer wrapping check in advance. 3134 */ 3135 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3136 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3137 mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx); 3138 /* Store the mbuf to be freed on completion. */ 3139 MLX5_ASSERT(loc->elts_free); 3140 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3141 --loc->elts_free; 3142 ++dseg; 3143 if (--loc->mbuf_nseg == 0) 3144 goto dseg_done; 3145 loc->mbuf = loc->mbuf->next; 3146 loc->mbuf_off = 0; 3147 } 3148 do { 3149 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3150 struct rte_mbuf *mbuf; 3151 3152 /* Zero length segment found, just skip. */ 3153 mbuf = loc->mbuf; 3154 loc->mbuf = loc->mbuf->next; 3155 rte_pktmbuf_free_seg(mbuf); 3156 if (--loc->mbuf_nseg == 0) 3157 break; 3158 } else { 3159 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3160 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3161 mlx5_tx_dseg_iptr 3162 (txq, loc, dseg, 3163 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3164 rte_pktmbuf_data_len(loc->mbuf), olx); 3165 MLX5_ASSERT(loc->elts_free); 3166 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3167 --loc->elts_free; 3168 ++dseg; 3169 if (--loc->mbuf_nseg == 0) 3170 break; 3171 loc->mbuf = loc->mbuf->next; 3172 } 3173 } while (true); 3174 3175 dseg_done: 3176 /* Calculate actual segments used from the dseg pointer. */ 3177 if ((uintptr_t)wqe < (uintptr_t)dseg) 3178 ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE; 3179 else 3180 ds = (((uintptr_t)dseg - (uintptr_t)wqe) + 3181 txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE; 3182 return ds; 3183 } 3184 3185 /** 3186 * Tx one packet function for multi-segment TSO. Supports all 3187 * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs, 3188 * sends one packet per WQE. 3189 * 3190 * This routine is responsible for storing processed mbuf 3191 * into elts ring buffer and update elts_head. 3192 * 3193 * @param txq 3194 * Pointer to TX queue structure. 3195 * @param loc 3196 * Pointer to burst routine local context. 3197 * @param olx 3198 * Configured Tx offloads mask. It is fully defined at 3199 * compile time and may be used for optimization. 
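 *
 * @note Only the TSO headers (l2 + l3 + l4 and, for tunnels, the outer
 *   headers) are inlined; the payload is referenced by pointer or
 *   minimal-inline Data Segments built by mlx5_tx_mseg_build().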
3200 * 3201 * @return 3202 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3203 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3204 * Local context variables partially updated. 3205 */ 3206 static __rte_always_inline enum mlx5_txcmp_code 3207 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, 3208 struct mlx5_txq_local *restrict loc, 3209 unsigned int olx) 3210 { 3211 struct mlx5_wqe *restrict wqe; 3212 unsigned int ds, dlen, inlen, ntcp, vlan = 0; 3213 3214 /* 3215 * Calculate data length to be inlined to estimate 3216 * the required space in WQE ring buffer. 3217 */ 3218 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3219 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3220 vlan = sizeof(struct rte_vlan_hdr); 3221 inlen = loc->mbuf->l2_len + vlan + 3222 loc->mbuf->l3_len + loc->mbuf->l4_len; 3223 if (unlikely((!inlen || !loc->mbuf->tso_segsz))) 3224 return MLX5_TXCMP_CODE_ERROR; 3225 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3226 inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len; 3227 /* Packet must contain all TSO headers. */ 3228 if (unlikely(inlen > MLX5_MAX_TSO_HEADER || 3229 inlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3230 inlen > (dlen + vlan))) 3231 return MLX5_TXCMP_CODE_ERROR; 3232 MLX5_ASSERT(inlen >= txq->inlen_mode); 3233 /* 3234 * Check whether there are enough free WQEBBs: 3235 * - Control Segment 3236 * - Ethernet Segment 3237 * - First Segment of inlined Ethernet data 3238 * - ... data continued ... 3239 * - Data Segments of pointer/min inline type 3240 */ 3241 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3242 MLX5_ESEG_MIN_INLINE_SIZE + 3243 MLX5_WSEG_SIZE + 3244 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3245 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3246 return MLX5_TXCMP_CODE_EXIT; 3247 /* Check for maximal WQE size. */ 3248 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3249 return MLX5_TXCMP_CODE_ERROR; 3250 #ifdef MLX5_PMD_SOFT_COUNTERS 3251 /* Update sent data bytes/packets counters. */ 3252 ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) / 3253 loc->mbuf->tso_segsz; 3254 /* 3255 * One will be added for mbuf itself 3256 * at the end of the mlx5_tx_burst from 3257 * loc->pkts_sent field. 3258 */ 3259 --ntcp; 3260 txq->stats.opackets += ntcp; 3261 txq->stats.obytes += dlen + vlan + ntcp * inlen; 3262 #endif 3263 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3264 loc->wqe_last = wqe; 3265 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx); 3266 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx); 3267 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3268 txq->wqe_ci += (ds + 3) / 4; 3269 loc->wqe_free -= (ds + 3) / 4; 3270 return MLX5_TXCMP_CODE_MULTI; 3271 } 3272 3273 /** 3274 * Tx one packet function for multi-segment SEND. Supports all 3275 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3276 * sends one packet per WQE, without any data inlining in 3277 * Ethernet Segment. 3278 * 3279 * This routine is responsible for storing processed mbuf 3280 * into elts ring buffer and update elts_head. 3281 * 3282 * @param txq 3283 * Pointer to TX queue structure. 3284 * @param loc 3285 * Pointer to burst routine local context. 3286 * @param olx 3287 * Configured Tx offloads mask. It is fully defined at 3288 * compile time and may be used for optimization. 3289 * 3290 * @return 3291 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3292 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3293 * Local context variables partially updated. 
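 *   MLX5_TXCMP_CODE_MULTI is returned when the SEND WQE has been built
 *   and the caller may proceed with the next multi-segment packet.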
3294 */ 3295 static __rte_always_inline enum mlx5_txcmp_code 3296 mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, 3297 struct mlx5_txq_local *restrict loc, 3298 unsigned int olx) 3299 { 3300 struct mlx5_wqe_dseg *restrict dseg; 3301 struct mlx5_wqe *restrict wqe; 3302 unsigned int ds, nseg; 3303 3304 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); 3305 /* 3306 * No inline at all, it means the CPU cycles saving 3307 * is prioritized at configuration, we should not 3308 * copy any packet data to WQE. 3309 */ 3310 nseg = NB_SEGS(loc->mbuf); 3311 ds = 2 + nseg; 3312 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3313 return MLX5_TXCMP_CODE_EXIT; 3314 /* Check for maximal WQE size. */ 3315 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3316 return MLX5_TXCMP_CODE_ERROR; 3317 /* 3318 * Some Tx offloads may cause an error if 3319 * packet is not long enough, check against 3320 * assumed minimal length. 3321 */ 3322 if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE) 3323 return MLX5_TXCMP_CODE_ERROR; 3324 #ifdef MLX5_PMD_SOFT_COUNTERS 3325 /* Update sent data bytes counter. */ 3326 txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf); 3327 if (MLX5_TXOFF_CONFIG(VLAN) && 3328 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3329 txq->stats.obytes += sizeof(struct rte_vlan_hdr); 3330 #endif 3331 /* 3332 * SEND WQE, one WQEBB: 3333 * - Control Segment, SEND opcode 3334 * - Ethernet Segment, optional VLAN, no inline 3335 * - Data Segments, pointer only type 3336 */ 3337 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3338 loc->wqe_last = wqe; 3339 mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx); 3340 mlx5_tx_eseg_none(txq, loc, wqe, olx); 3341 dseg = &wqe->dseg[0]; 3342 do { 3343 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3344 struct rte_mbuf *mbuf; 3345 3346 /* 3347 * Zero length segment found, have to 3348 * correct total size of WQE in segments. 3349 * It is supposed to be rare occasion, so 3350 * in normal case (no zero length segments) 3351 * we avoid extra writing to the Control 3352 * Segment. 3353 */ 3354 --ds; 3355 wqe->cseg.sq_ds -= RTE_BE32(1); 3356 mbuf = loc->mbuf; 3357 loc->mbuf = mbuf->next; 3358 rte_pktmbuf_free_seg(mbuf); 3359 if (--nseg == 0) 3360 break; 3361 } else { 3362 mlx5_tx_dseg_ptr 3363 (txq, loc, dseg, 3364 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3365 rte_pktmbuf_data_len(loc->mbuf), olx); 3366 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3367 --loc->elts_free; 3368 if (--nseg == 0) 3369 break; 3370 ++dseg; 3371 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3372 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3373 loc->mbuf = loc->mbuf->next; 3374 } 3375 } while (true); 3376 txq->wqe_ci += (ds + 3) / 4; 3377 loc->wqe_free -= (ds + 3) / 4; 3378 return MLX5_TXCMP_CODE_MULTI; 3379 } 3380 3381 /** 3382 * Tx one packet function for multi-segment SEND. Supports all 3383 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3384 * sends one packet per WQE, with data inlining in 3385 * Ethernet Segment and minimal Data Segments. 3386 * 3387 * This routine is responsible for storing processed mbuf 3388 * into elts ring buffer and update elts_head. 3389 * 3390 * @param txq 3391 * Pointer to TX queue structure. 3392 * @param loc 3393 * Pointer to burst routine local context. 3394 * @param olx 3395 * Configured Tx offloads mask. It is fully defined at 3396 * compile time and may be used for optimization. 3397 * 3398 * @return 3399 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 
 *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
 *   Local context variables partially updated.
 */
static __rte_always_inline enum mlx5_txcmp_code
mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
			    struct mlx5_txq_local *restrict loc,
			    unsigned int olx)
{
	struct mlx5_wqe *restrict wqe;
	unsigned int ds, inlen, dlen, vlan = 0;

	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
	MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
	/*
	 * First calculate data length to be inlined
	 * to estimate the required space for WQE.
	 */
	dlen = rte_pktmbuf_pkt_len(loc->mbuf);
	if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
		vlan = sizeof(struct rte_vlan_hdr);
	inlen = dlen + vlan;
	/* Check against minimal length. */
	if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
		return MLX5_TXCMP_CODE_ERROR;
	MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
	if (inlen > txq->inlen_send ||
	    loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) {
		struct rte_mbuf *mbuf;
		unsigned int nxlen;
		uintptr_t start;

		/*
		 * Packet length exceeds the allowed inline
		 * data length, check whether the minimal
		 * inlining is required.
		 */
		if (txq->inlen_mode) {
			MLX5_ASSERT(txq->inlen_mode >=
				    MLX5_ESEG_MIN_INLINE_SIZE);
			MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send);
			inlen = txq->inlen_mode;
		} else {
			if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE ||
			    !vlan || txq->vlan_en) {
				/*
				 * VLAN insertion will be done inside by HW.
				 * It is not the most efficient way - the VLAN
				 * flag is checked twice, but we have to
				 * compute the inlining length correctly and
				 * take the inserted VLAN header into account.
				 */
				return mlx5_tx_packet_multi_send
						(txq, loc, olx);
			}
			inlen = MLX5_ESEG_MIN_INLINE_SIZE;
		}
		/*
		 * Now the minimal amount of data to be inlined is known.
		 * Check whether we should inline the buffers from the
		 * beginning of the chain to eliminate some mbufs.
		 */
		mbuf = loc->mbuf;
		nxlen = rte_pktmbuf_data_len(mbuf);
		if (unlikely(nxlen <= txq->inlen_send)) {
			/* We can inline at least the first mbuf. */
			if (nxlen < inlen) {
				unsigned int smlen;

				/* Scan mbufs until inlen is filled. */
				do {
					smlen = nxlen;
					mbuf = NEXT(mbuf);
					MLX5_ASSERT(mbuf);
					nxlen = rte_pktmbuf_data_len(mbuf);
					nxlen += smlen;
				} while (unlikely(nxlen < inlen));
				if (unlikely(nxlen > txq->inlen_send)) {
					/* We cannot inline the entire mbuf. */
					smlen = inlen - smlen;
					start = rte_pktmbuf_mtod_offset
						    (mbuf, uintptr_t, smlen);
					goto do_align;
				}
			}
			do {
				inlen = nxlen;
				mbuf = NEXT(mbuf);
				/* This must not be the end of the packet. */
				MLX5_ASSERT(mbuf);
				nxlen = inlen + rte_pktmbuf_data_len(mbuf);
			} while (unlikely(nxlen < txq->inlen_send));
		}
		start = rte_pktmbuf_mtod(mbuf, uintptr_t);
		/*
		 * Check whether we can do inline to align start
		 * address of data buffer to cacheline.
		 */
do_align:
		start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1);
		if (unlikely(start)) {
			start += inlen;
			if (start <= txq->inlen_send)
				inlen = start;
		}
	}
	/*
	 * Check whether there are enough free WQEBBs:
	 * - Control Segment
	 * - Ethernet Segment
	 * - First Segment of inlined Ethernet data
	 * - ... data continued ...
	 * - Data Segments of pointer/min inline type
	 *
	 * Estimate the number of Data Segments conservatively,
	 * assuming no mbufs are freed during the inlining.
	 */
	MLX5_ASSERT(inlen <= txq->inlen_send);
	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
				       MLX5_ESEG_MIN_INLINE_SIZE +
				       MLX5_WSEG_SIZE +
				       MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
	if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
		return MLX5_TXCMP_CODE_EXIT;
	/* Check for maximal WQE size. */
	if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
		return MLX5_TXCMP_CODE_ERROR;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Update sent data bytes/packets counters. */
	txq->stats.obytes += dlen + vlan;
#endif
	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
	loc->wqe_last = wqe;
	mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx);
	ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx);
	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
	txq->wqe_ci += (ds + 3) / 4;
	loc->wqe_free -= (ds + 3) / 4;
	return MLX5_TXCMP_CODE_MULTI;
}

/**
 * Tx burst function for multi-segment packets. Supports all
 * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs,
 * sends one packet per WQE. The function stops sending if it
 * encounters a single-segment packet.
 *
 * This routine is responsible for storing the processed mbufs
 * into the elts ring buffer and updating elts_head.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 * @param loc
 *   Pointer to burst routine local context.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 *
 * @return
 *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
 *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
 *   MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
 *   MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered.
 *   Local context variables updated.
 */
static __rte_always_inline enum mlx5_txcmp_code
mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
		   struct rte_mbuf **restrict pkts,
		   unsigned int pkts_n,
		   struct mlx5_txq_local *restrict loc,
		   unsigned int olx)
{
	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
	MLX5_ASSERT(pkts_n > loc->pkts_sent);
	pkts += loc->pkts_sent + 1;
	pkts_n -= loc->pkts_sent;
	for (;;) {
		enum mlx5_txcmp_code ret;

		MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
		/*
		 * Estimate the number of free elts quickly but
		 * conservatively. Some segment may be fully inlined
		 * and freed, ignore this here - precise estimation
		 * is costly.
		 */
		if (loc->elts_free < NB_SEGS(loc->mbuf))
			return MLX5_TXCMP_CODE_EXIT;
		if (MLX5_TXOFF_CONFIG(TSO) &&
		    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) {
			/* Proceed with multi-segment TSO. */
			ret = mlx5_tx_packet_multi_tso(txq, loc, olx);
		} else if (MLX5_TXOFF_CONFIG(INLINE)) {
			/* Proceed with multi-segment SEND with inlining. */
			ret = mlx5_tx_packet_multi_inline(txq, loc, olx);
		} else {
			/* Proceed with multi-segment SEND w/o inlining.
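			 * All data is referenced by pointer Data Segments
			 * and the mbufs stay in elts until completion.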
*/ 3600 ret = mlx5_tx_packet_multi_send(txq, loc, olx); 3601 } 3602 if (ret == MLX5_TXCMP_CODE_EXIT) 3603 return MLX5_TXCMP_CODE_EXIT; 3604 if (ret == MLX5_TXCMP_CODE_ERROR) 3605 return MLX5_TXCMP_CODE_ERROR; 3606 /* WQE is built, go to the next packet. */ 3607 ++loc->pkts_sent; 3608 --pkts_n; 3609 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3610 return MLX5_TXCMP_CODE_EXIT; 3611 loc->mbuf = *pkts++; 3612 if (pkts_n > 1) 3613 rte_prefetch0(*pkts); 3614 if (likely(NB_SEGS(loc->mbuf) > 1)) 3615 continue; 3616 /* Here ends the series of multi-segment packets. */ 3617 if (MLX5_TXOFF_CONFIG(TSO) && 3618 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3619 return MLX5_TXCMP_CODE_TSO; 3620 return MLX5_TXCMP_CODE_SINGLE; 3621 } 3622 MLX5_ASSERT(false); 3623 } 3624 3625 /** 3626 * Tx burst function for single-segment packets with TSO. 3627 * Supports all types of Tx offloads, except multi-packets. 3628 * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE. 3629 * Function stops sending if it encounters the multi-segment 3630 * packet or packet without TSO requested. 3631 * 3632 * The routine is responsible for storing processed mbuf 3633 * into elts ring buffer and update elts_head if inline 3634 * offloads is requested due to possible early freeing 3635 * of the inlined mbufs (can not store pkts array in elts 3636 * as a batch). 3637 * 3638 * @param txq 3639 * Pointer to TX queue structure. 3640 * @param[in] pkts 3641 * Packets to transmit. 3642 * @param pkts_n 3643 * Number of packets in array. 3644 * @param loc 3645 * Pointer to burst routine local context. 3646 * @param olx 3647 * Configured Tx offloads mask. It is fully defined at 3648 * compile time and may be used for optimization. 3649 * 3650 * @return 3651 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3652 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3653 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3654 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3655 * Local context variables updated. 3656 */ 3657 static __rte_always_inline enum mlx5_txcmp_code 3658 mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq, 3659 struct rte_mbuf **restrict pkts, 3660 unsigned int pkts_n, 3661 struct mlx5_txq_local *restrict loc, 3662 unsigned int olx) 3663 { 3664 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 3665 MLX5_ASSERT(pkts_n > loc->pkts_sent); 3666 pkts += loc->pkts_sent + 1; 3667 pkts_n -= loc->pkts_sent; 3668 for (;;) { 3669 struct mlx5_wqe_dseg *restrict dseg; 3670 struct mlx5_wqe *restrict wqe; 3671 unsigned int ds, dlen, hlen, ntcp, vlan = 0; 3672 uint8_t *dptr; 3673 3674 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 3675 dlen = rte_pktmbuf_data_len(loc->mbuf); 3676 if (MLX5_TXOFF_CONFIG(VLAN) && 3677 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 3678 vlan = sizeof(struct rte_vlan_hdr); 3679 } 3680 /* 3681 * First calculate the WQE size to check 3682 * whether we have enough space in ring buffer. 3683 */ 3684 hlen = loc->mbuf->l2_len + vlan + 3685 loc->mbuf->l3_len + loc->mbuf->l4_len; 3686 if (unlikely((!hlen || !loc->mbuf->tso_segsz))) 3687 return MLX5_TXCMP_CODE_ERROR; 3688 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3689 hlen += loc->mbuf->outer_l2_len + 3690 loc->mbuf->outer_l3_len; 3691 /* Segment must contain all TSO headers. 
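		 * i.e. hlen (l2 + l3 + l4 and, for tunnels, the outer
		 * headers) must not exceed MLX5_MAX_TSO_HEADER nor the
		 * actual packet data length.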
*/ 3692 if (unlikely(hlen > MLX5_MAX_TSO_HEADER || 3693 hlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3694 hlen > (dlen + vlan))) 3695 return MLX5_TXCMP_CODE_ERROR; 3696 /* 3697 * Check whether there are enough free WQEBBs: 3698 * - Control Segment 3699 * - Ethernet Segment 3700 * - First Segment of inlined Ethernet data 3701 * - ... data continued ... 3702 * - Finishing Data Segment of pointer type 3703 */ 3704 ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE + 3705 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3706 if (loc->wqe_free < ((ds + 3) / 4)) 3707 return MLX5_TXCMP_CODE_EXIT; 3708 #ifdef MLX5_PMD_SOFT_COUNTERS 3709 /* Update sent data bytes/packets counters. */ 3710 ntcp = (dlen + vlan - hlen + 3711 loc->mbuf->tso_segsz - 1) / 3712 loc->mbuf->tso_segsz; 3713 /* 3714 * One will be added for mbuf itself at the end 3715 * of the mlx5_tx_burst from loc->pkts_sent field. 3716 */ 3717 --ntcp; 3718 txq->stats.opackets += ntcp; 3719 txq->stats.obytes += dlen + vlan + ntcp * hlen; 3720 #endif 3721 /* 3722 * Build the TSO WQE: 3723 * - Control Segment 3724 * - Ethernet Segment with hlen bytes inlined 3725 * - Data Segment of pointer type 3726 */ 3727 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3728 loc->wqe_last = wqe; 3729 mlx5_tx_cseg_init(txq, loc, wqe, ds, 3730 MLX5_OPCODE_TSO, olx); 3731 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx); 3732 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan; 3733 dlen -= hlen - vlan; 3734 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 3735 /* 3736 * WQE is built, update the loop parameters 3737 * and go to the next packet. 3738 */ 3739 txq->wqe_ci += (ds + 3) / 4; 3740 loc->wqe_free -= (ds + 3) / 4; 3741 if (MLX5_TXOFF_CONFIG(INLINE)) 3742 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3743 --loc->elts_free; 3744 ++loc->pkts_sent; 3745 --pkts_n; 3746 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3747 return MLX5_TXCMP_CODE_EXIT; 3748 loc->mbuf = *pkts++; 3749 if (pkts_n > 1) 3750 rte_prefetch0(*pkts); 3751 if (MLX5_TXOFF_CONFIG(MULTI) && 3752 unlikely(NB_SEGS(loc->mbuf) > 1)) 3753 return MLX5_TXCMP_CODE_MULTI; 3754 if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) 3755 return MLX5_TXCMP_CODE_SINGLE; 3756 /* Continue with the next TSO packet. */ 3757 } 3758 MLX5_ASSERT(false); 3759 } 3760 3761 /** 3762 * Analyze the packet and select the best method to send. 3763 * 3764 * @param txq 3765 * Pointer to TX queue structure. 3766 * @param loc 3767 * Pointer to burst routine local context. 3768 * @param olx 3769 * Configured Tx offloads mask. It is fully defined at 3770 * compile time and may be used for optimization. 3771 * @param newp 3772 * The predefined flag whether do complete check for 3773 * multi-segment packets and TSO. 3774 * 3775 * @return 3776 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3777 * MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO. 3778 * MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND. 3779 * MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW. 3780 */ 3781 static __rte_always_inline enum mlx5_txcmp_code 3782 mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq, 3783 struct mlx5_txq_local *restrict loc, 3784 unsigned int olx, 3785 bool newp) 3786 { 3787 /* Check for multi-segment packet. */ 3788 if (newp && 3789 MLX5_TXOFF_CONFIG(MULTI) && 3790 unlikely(NB_SEGS(loc->mbuf) > 1)) 3791 return MLX5_TXCMP_CODE_MULTI; 3792 /* Check for TSO packet. 
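 * (e.g. a single-segment mbuf with PKT_TX_TCP_SEG set in
 * ol_flags is reported as MLX5_TXCMP_CODE_TSO here, so the
 * caller switches to the TSO routine; eMPW batches never carry
 * TSO packets.)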
*/ 3793 if (newp && 3794 MLX5_TXOFF_CONFIG(TSO) && 3795 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3796 return MLX5_TXCMP_CODE_TSO; 3797 /* Check if eMPW is enabled at all. */ 3798 if (!MLX5_TXOFF_CONFIG(EMPW)) 3799 return MLX5_TXCMP_CODE_SINGLE; 3800 /* Check if eMPW can be engaged. */ 3801 if (MLX5_TXOFF_CONFIG(VLAN) && 3802 unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) && 3803 (!MLX5_TXOFF_CONFIG(INLINE) || 3804 unlikely((rte_pktmbuf_data_len(loc->mbuf) + 3805 sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) { 3806 /* 3807 * eMPW does not support VLAN insertion offload, so we 3808 * would have to inline the entire packet, but the 3809 * packet is too long for inlining. 3810 */ 3811 return MLX5_TXCMP_CODE_SINGLE; 3812 } 3813 return MLX5_TXCMP_CODE_EMPW; 3814 } 3815 3816 /** 3817 * Check whether the next packet attributes match the eMPW batch ones. 3818 * In addition, for legacy MPW the packet length is checked as well. 3819 * 3820 * @param txq 3821 * Pointer to TX queue structure. 3822 * @param es 3823 * Pointer to Ethernet Segment of eMPW batch. 3824 * @param loc 3825 * Pointer to burst routine local context. 3826 * @param dlen 3827 * Length of previous packet in MPW descriptor. 3828 * @param olx 3829 * Configured Tx offloads mask. It is fully defined at 3830 * compile time and may be used for optimization. 3831 * 3832 * @return 3833 * true - packet matches the eMPW batch attributes. 3834 * false - no match, eMPW should be restarted. 3835 */ 3836 static __rte_always_inline bool 3837 mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused, 3838 struct mlx5_wqe_eseg *restrict es, 3839 struct mlx5_txq_local *restrict loc, 3840 uint32_t dlen, 3841 unsigned int olx) 3842 { 3843 uint8_t swp_flags = 0; 3844 3845 /* Compare the checksum flags, if any. */ 3846 if (MLX5_TXOFF_CONFIG(CSUM) && 3847 txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags) 3848 return false; 3849 /* Compare the Software Parser offsets and flags. */ 3850 if (MLX5_TXOFF_CONFIG(SWP) && 3851 (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) || 3852 es->swp_flags != swp_flags)) 3853 return false; 3854 /* Fill metadata field if needed. */ 3855 if (MLX5_TXOFF_CONFIG(METADATA) && 3856 es->metadata != (loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 3857 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0)) 3858 return false; 3859 /* Legacy MPW can send packets with the same length only. */ 3860 if (MLX5_TXOFF_CONFIG(MPW) && 3861 dlen != rte_pktmbuf_data_len(loc->mbuf)) 3862 return false; 3863 /* There must be no VLAN packets in eMPW loop. */ 3864 if (MLX5_TXOFF_CONFIG(VLAN)) 3865 MLX5_ASSERT(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)); 3866 return true; 3867 } 3868 3869 /* 3870 * Update send loop variables and WQE for eMPW loop 3871 * without data inlining. Number of Data Segments is 3872 * equal to the number of sent packets. 3873 * 3874 * @param txq 3875 * Pointer to TX queue structure. 3876 * @param loc 3877 * Pointer to burst routine local context. 3878 * @param ds 3879 * Number of packets (one Data Segment per packet). 3880 * @param slen 3881 * Accumulated statistics, bytes sent. 3882 * @param olx 3883 * Configured Tx offloads mask. It is fully defined at 3884 * compile time and may be used for optimization. 3885 * 3886 * @return 3887 * No return value, the WQE and the burst local context 3888 * are updated in place.
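 *
 * Worked illustration (hypothetical numbers): after ds = 5
 * packets are pushed without inlining, the WQE holds 5 pointer
 * Data Segments plus the Control and Ethernet Segments, i.e.
 * ds + 2 = 7 WQE segments, so the producer index advances by
 * (7 + 3) / 4 = 2 WQEBBs.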
3889 */ 3890 static __rte_always_inline void 3891 mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq, 3892 struct mlx5_txq_local *restrict loc, 3893 unsigned int ds, 3894 unsigned int slen, 3895 unsigned int olx __rte_unused) 3896 { 3897 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); 3898 #ifdef MLX5_PMD_SOFT_COUNTERS 3899 /* Update sent data bytes counter. */ 3900 txq->stats.obytes += slen; 3901 #else 3902 (void)slen; 3903 #endif 3904 loc->elts_free -= ds; 3905 loc->pkts_sent += ds; 3906 ds += 2; 3907 loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3908 txq->wqe_ci += (ds + 3) / 4; 3909 loc->wqe_free -= (ds + 3) / 4; 3910 } 3911 3912 /* 3913 * Update send loop variables and WQE for eMPW loop 3914 * with data inlining. Gets the size of pushed descriptors 3915 * and data to the WQE. 3916 * 3917 * @param txq 3918 * Pointer to TX queue structure. 3919 * @param loc 3920 * Pointer to burst routine local context. 3921 * @param len 3922 * Total size of descriptor/data in bytes. 3923 * @param slen 3924 * Accumulated statistics, data bytes sent. 3925 * @param wqem 3926 * The base WQE for the eMPW/MPW descriptor. 3927 * @param olx 3928 * Configured Tx offloads mask. It is fully defined at 3929 * compile time and may be used for optimization. 3930 * 3931 * @return 3932 * true - packet match with eMPW batch attributes. 3933 * false - no match, eMPW should be restarted. 3934 */ 3935 static __rte_always_inline void 3936 mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq, 3937 struct mlx5_txq_local *restrict loc, 3938 unsigned int len, 3939 unsigned int slen, 3940 struct mlx5_wqe *restrict wqem, 3941 unsigned int olx __rte_unused) 3942 { 3943 struct mlx5_wqe_dseg *dseg = &wqem->dseg[0]; 3944 3945 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 3946 #ifdef MLX5_PMD_SOFT_COUNTERS 3947 /* Update sent data bytes counter. */ 3948 txq->stats.obytes += slen; 3949 #else 3950 (void)slen; 3951 #endif 3952 if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) { 3953 /* 3954 * If the legacy MPW session contains the inline packets 3955 * we should set the only inline data segment length 3956 * and align the total length to the segment size. 3957 */ 3958 MLX5_ASSERT(len > sizeof(dseg->bcount)); 3959 dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) | 3960 MLX5_ETH_WQE_DATA_INLINE); 3961 len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2; 3962 } else { 3963 /* 3964 * The session is not legacy MPW or contains the 3965 * data buffer pointer segments. 3966 */ 3967 MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0); 3968 len = len / MLX5_WSEG_SIZE + 2; 3969 } 3970 wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len); 3971 txq->wqe_ci += (len + 3) / 4; 3972 loc->wqe_free -= (len + 3) / 4; 3973 loc->wqe_last = wqem; 3974 } 3975 3976 /** 3977 * The set of Tx burst functions for single-segment packets 3978 * without TSO and with Multi-Packet Writing feature support. 3979 * Supports all types of Tx offloads, except multi-packets 3980 * and TSO. 3981 * 3982 * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends 3983 * as many packet per WQE as it can. If eMPW is not configured 3984 * or packet can not be sent with eMPW (VLAN insertion) the 3985 * ordinary SEND opcode is used and only one packet placed 3986 * in WQE. 3987 * 3988 * Functions stop sending if it encounters the multi-segment 3989 * packet or packet with TSO requested. 3990 * 3991 * The routines are responsible for storing processed mbuf 3992 * into elts ring buffer and update elts_head if inlining 3993 * offload is requested. 
Otherwise the copying of mbufs to elts 3994 * can be postponed and completed at the end of the burst routine. 3995 * 3996 * @param txq 3997 * Pointer to TX queue structure. 3998 * @param[in] pkts 3999 * Packets to transmit. 4000 * @param pkts_n 4001 * Number of packets in array. 4002 * @param loc 4003 * Pointer to burst routine local context. 4004 * @param olx 4005 * Configured Tx offloads mask. It is fully defined at 4006 * compile time and may be used for optimization. 4007 * 4008 * @return 4009 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 4010 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 4011 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 4012 * MLX5_TXCMP_CODE_TSO - TSO packet encountered. 4013 * MLX5_TXCMP_CODE_SINGLE - used inside functions set. 4014 * MLX5_TXCMP_CODE_EMPW - used inside functions set. 4015 * 4016 * Local context variables updated. 4017 * 4018 * 4019 * The routine sends packets with MLX5_OPCODE_EMPW 4020 * without inlining, this is a dedicated optimized branch. 4021 * No VLAN insertion is supported. 4022 */ 4023 static __rte_always_inline enum mlx5_txcmp_code 4024 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq, 4025 struct rte_mbuf **restrict pkts, 4026 unsigned int pkts_n, 4027 struct mlx5_txq_local *restrict loc, 4028 unsigned int olx) 4029 { 4030 /* 4031 * The subroutine is a part of mlx5_tx_burst_single() 4032 * and sends single-segment packets with the eMPW opcode 4033 * without data inlining. 4034 */ 4035 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); 4036 MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); 4037 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4038 MLX5_ASSERT(pkts_n > loc->pkts_sent); 4039 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 4040 pkts += loc->pkts_sent + 1; 4041 pkts_n -= loc->pkts_sent; 4042 for (;;) { 4043 struct mlx5_wqe_dseg *restrict dseg; 4044 struct mlx5_wqe_eseg *restrict eseg; 4045 enum mlx5_txcmp_code ret; 4046 unsigned int part, loop; 4047 unsigned int slen = 0; 4048 4049 next_empw: 4050 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4051 part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? 4052 MLX5_MPW_MAX_PACKETS : 4053 MLX5_EMPW_MAX_PACKETS); 4054 if (unlikely(loc->elts_free < part)) { 4055 /* We do not have enough elts to store all the mbufs. */ 4056 if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS)) 4057 return MLX5_TXCMP_CODE_EXIT; 4058 /* But we are still able to send at least a minimal eMPW. */ 4059 part = loc->elts_free; 4060 } 4061 /* Check whether we have enough WQEs. */ 4062 if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) { 4063 if (unlikely(loc->wqe_free < 4064 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 4065 return MLX5_TXCMP_CODE_EXIT; 4066 part = (loc->wqe_free * 4) - 2; 4067 } 4068 if (likely(part > 1)) 4069 rte_prefetch0(*pkts); 4070 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4071 /* 4072 * Build eMPW title WQEBB: 4073 * - Control Segment, eMPW opcode 4074 * - Ethernet Segment, no inline 4075 */ 4076 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2, 4077 MLX5_OPCODE_ENHANCED_MPSW, olx); 4078 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 4079 olx & ~MLX5_TXOFF_CONFIG_VLAN); 4080 eseg = &loc->wqe_last->eseg; 4081 dseg = &loc->wqe_last->dseg[0]; 4082 loop = part; 4083 /* Store the packet length for legacy MPW. */ 4084 if (MLX5_TXOFF_CONFIG(MPW)) 4085 eseg->mss = rte_cpu_to_be_16 4086 (rte_pktmbuf_data_len(loc->mbuf)); 4087 for (;;) { 4088 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 4089 #ifdef MLX5_PMD_SOFT_COUNTERS 4090 /* Update sent data bytes counter.
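 * (slen accumulates the data length of every packet in the
 * session and is charged to txq->stats.obytes once, when the
 * session closes. For WQE sizing, illustratively, a session of
 * part = 14 packets occupies 2 + 14 = 16 WQE segments, i.e.
 * (2 + 14 + 3) / 4 = 4 WQEBBs, matching the wqe_free check above.)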
*/ 4091 slen += dlen; 4092 #endif 4093 mlx5_tx_dseg_ptr 4094 (txq, loc, dseg, 4095 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 4096 dlen, olx); 4097 if (unlikely(--loop == 0)) 4098 break; 4099 loc->mbuf = *pkts++; 4100 if (likely(loop > 1)) 4101 rte_prefetch0(*pkts); 4102 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4103 /* 4104 * Unroll the completion code to avoid 4105 * returning variable value - it results in 4106 * unoptimized sequent checking in caller. 4107 */ 4108 if (ret == MLX5_TXCMP_CODE_MULTI) { 4109 part -= loop; 4110 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4111 if (unlikely(!loc->elts_free || 4112 !loc->wqe_free)) 4113 return MLX5_TXCMP_CODE_EXIT; 4114 return MLX5_TXCMP_CODE_MULTI; 4115 } 4116 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4117 if (ret == MLX5_TXCMP_CODE_TSO) { 4118 part -= loop; 4119 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4120 if (unlikely(!loc->elts_free || 4121 !loc->wqe_free)) 4122 return MLX5_TXCMP_CODE_EXIT; 4123 return MLX5_TXCMP_CODE_TSO; 4124 } 4125 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4126 part -= loop; 4127 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4128 if (unlikely(!loc->elts_free || 4129 !loc->wqe_free)) 4130 return MLX5_TXCMP_CODE_EXIT; 4131 return MLX5_TXCMP_CODE_SINGLE; 4132 } 4133 if (ret != MLX5_TXCMP_CODE_EMPW) { 4134 MLX5_ASSERT(false); 4135 part -= loop; 4136 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4137 return MLX5_TXCMP_CODE_ERROR; 4138 } 4139 /* 4140 * Check whether packet parameters coincide 4141 * within assumed eMPW batch: 4142 * - check sum settings 4143 * - metadata value 4144 * - software parser settings 4145 * - packets length (legacy MPW only) 4146 */ 4147 if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) { 4148 MLX5_ASSERT(loop); 4149 part -= loop; 4150 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4151 if (unlikely(!loc->elts_free || 4152 !loc->wqe_free)) 4153 return MLX5_TXCMP_CODE_EXIT; 4154 pkts_n -= part; 4155 goto next_empw; 4156 } 4157 /* Packet attributes match, continue the same eMPW. */ 4158 ++dseg; 4159 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 4160 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 4161 } 4162 /* eMPW is built successfully, update loop parameters. */ 4163 MLX5_ASSERT(!loop); 4164 MLX5_ASSERT(pkts_n >= part); 4165 #ifdef MLX5_PMD_SOFT_COUNTERS 4166 /* Update sent data bytes counter. */ 4167 txq->stats.obytes += slen; 4168 #endif 4169 loc->elts_free -= part; 4170 loc->pkts_sent += part; 4171 txq->wqe_ci += (2 + part + 3) / 4; 4172 loc->wqe_free -= (2 + part + 3) / 4; 4173 pkts_n -= part; 4174 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 4175 return MLX5_TXCMP_CODE_EXIT; 4176 loc->mbuf = *pkts++; 4177 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4178 if (unlikely(ret != MLX5_TXCMP_CODE_EMPW)) 4179 return ret; 4180 /* Continue sending eMPW batches. */ 4181 } 4182 MLX5_ASSERT(false); 4183 } 4184 4185 /** 4186 * The routine sends packets with MLX5_OPCODE_EMPW 4187 * with inlining, optionally supports VLAN insertion. 4188 */ 4189 static __rte_always_inline enum mlx5_txcmp_code 4190 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, 4191 struct rte_mbuf **restrict pkts, 4192 unsigned int pkts_n, 4193 struct mlx5_txq_local *restrict loc, 4194 unsigned int olx) 4195 { 4196 /* 4197 * Subroutine is the part of mlx5_tx_burst_single() 4198 * and sends single-segment packet with eMPW opcode 4199 * with data inlining. 
4200 */ 4201 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 4202 MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); 4203 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4204 MLX5_ASSERT(pkts_n > loc->pkts_sent); 4205 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 4206 pkts += loc->pkts_sent + 1; 4207 pkts_n -= loc->pkts_sent; 4208 for (;;) { 4209 struct mlx5_wqe_dseg *restrict dseg; 4210 struct mlx5_wqe *restrict wqem; 4211 enum mlx5_txcmp_code ret; 4212 unsigned int room, part, nlim; 4213 unsigned int slen = 0; 4214 4215 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4216 /* 4217 * Limits the amount of packets in one WQE 4218 * to improve CQE latency generation. 4219 */ 4220 nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? 4221 MLX5_MPW_INLINE_MAX_PACKETS : 4222 MLX5_EMPW_MAX_PACKETS); 4223 /* Check whether we have minimal amount WQEs */ 4224 if (unlikely(loc->wqe_free < 4225 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 4226 return MLX5_TXCMP_CODE_EXIT; 4227 if (likely(pkts_n > 1)) 4228 rte_prefetch0(*pkts); 4229 wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4230 /* 4231 * Build eMPW title WQEBB: 4232 * - Control Segment, eMPW opcode, zero DS 4233 * - Ethernet Segment, no inline 4234 */ 4235 mlx5_tx_cseg_init(txq, loc, wqem, 0, 4236 MLX5_OPCODE_ENHANCED_MPSW, olx); 4237 mlx5_tx_eseg_none(txq, loc, wqem, 4238 olx & ~MLX5_TXOFF_CONFIG_VLAN); 4239 dseg = &wqem->dseg[0]; 4240 /* Store the packet length for legacy MPW. */ 4241 if (MLX5_TXOFF_CONFIG(MPW)) 4242 wqem->eseg.mss = rte_cpu_to_be_16 4243 (rte_pktmbuf_data_len(loc->mbuf)); 4244 room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE, 4245 loc->wqe_free) * MLX5_WQE_SIZE - 4246 MLX5_WQE_CSEG_SIZE - 4247 MLX5_WQE_ESEG_SIZE; 4248 /* Limit the room for legacy MPW sessions for performance. */ 4249 if (MLX5_TXOFF_CONFIG(MPW)) 4250 room = RTE_MIN(room, 4251 RTE_MAX(txq->inlen_empw + 4252 sizeof(dseg->bcount) + 4253 (MLX5_TXOFF_CONFIG(VLAN) ? 4254 sizeof(struct rte_vlan_hdr) : 0), 4255 MLX5_MPW_INLINE_MAX_PACKETS * 4256 MLX5_WQE_DSEG_SIZE)); 4257 /* Build WQE till we have space, packets and resources. */ 4258 part = room; 4259 for (;;) { 4260 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 4261 uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 4262 unsigned int tlen; 4263 4264 MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); 4265 MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0); 4266 MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end); 4267 /* 4268 * Some Tx offloads may cause an error if 4269 * packet is not long enough, check against 4270 * assumed minimal length. 4271 */ 4272 if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) { 4273 part -= room; 4274 if (unlikely(!part)) 4275 return MLX5_TXCMP_CODE_ERROR; 4276 /* 4277 * We have some successfully built 4278 * packet Data Segments to send. 4279 */ 4280 mlx5_tx_idone_empw(txq, loc, part, 4281 slen, wqem, olx); 4282 return MLX5_TXCMP_CODE_ERROR; 4283 } 4284 /* Inline or not inline - that's the Question. */ 4285 if (dlen > txq->inlen_empw || 4286 loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) 4287 goto pointer_empw; 4288 if (MLX5_TXOFF_CONFIG(MPW)) { 4289 if (dlen > txq->inlen_send) 4290 goto pointer_empw; 4291 tlen = dlen; 4292 if (part == room) { 4293 /* Open new inline MPW session. */ 4294 tlen += sizeof(dseg->bcount); 4295 dseg->bcount = RTE_BE32(0); 4296 dseg = RTE_PTR_ADD 4297 (dseg, sizeof(dseg->bcount)); 4298 } else { 4299 /* 4300 * No pointer and inline descriptor 4301 * intermix for legacy MPW sessions. 
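 * (Illustrative example, hypothetical frame size: the first
 * 60-byte packet opening a legacy MPW inline session consumes
 * tlen = 60 + 4 bytes, the payload plus the shared bcount
 * header; later matching packets add only their own data
 * length.)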
4302 */ 4303 if (wqem->dseg[0].bcount) 4304 break; 4305 } 4306 } else { 4307 tlen = sizeof(dseg->bcount) + dlen; 4308 } 4309 /* Inline entire packet, optional VLAN insertion. */ 4310 if (MLX5_TXOFF_CONFIG(VLAN) && 4311 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4312 /* 4313 * The packet length must be checked in 4314 * mlx5_tx_able_to_empw() and packet 4315 * fits into inline length guaranteed. 4316 */ 4317 MLX5_ASSERT((dlen + 4318 sizeof(struct rte_vlan_hdr)) <= 4319 txq->inlen_empw); 4320 tlen += sizeof(struct rte_vlan_hdr); 4321 if (room < tlen) 4322 break; 4323 dseg = mlx5_tx_dseg_vlan(txq, loc, dseg, 4324 dptr, dlen, olx); 4325 #ifdef MLX5_PMD_SOFT_COUNTERS 4326 /* Update sent data bytes counter. */ 4327 slen += sizeof(struct rte_vlan_hdr); 4328 #endif 4329 } else { 4330 if (room < tlen) 4331 break; 4332 dseg = mlx5_tx_dseg_empw(txq, loc, dseg, 4333 dptr, dlen, olx); 4334 } 4335 if (!MLX5_TXOFF_CONFIG(MPW)) 4336 tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE); 4337 MLX5_ASSERT(room >= tlen); 4338 room -= tlen; 4339 /* 4340 * Packet data are completely inlined, 4341 * free the packet immediately. 4342 */ 4343 rte_pktmbuf_free_seg(loc->mbuf); 4344 goto next_mbuf; 4345 pointer_empw: 4346 /* 4347 * No pointer and inline descriptor 4348 * intermix for legacy MPW sessions. 4349 */ 4350 if (MLX5_TXOFF_CONFIG(MPW) && 4351 part != room && 4352 wqem->dseg[0].bcount == RTE_BE32(0)) 4353 break; 4354 /* 4355 * Not inlinable VLAN packets are 4356 * proceeded outside of this routine. 4357 */ 4358 MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); 4359 if (MLX5_TXOFF_CONFIG(VLAN)) 4360 MLX5_ASSERT(!(loc->mbuf->ol_flags & 4361 PKT_TX_VLAN_PKT)); 4362 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 4363 /* We have to store mbuf in elts.*/ 4364 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 4365 room -= MLX5_WQE_DSEG_SIZE; 4366 /* Ring buffer wraparound is checked at the loop end.*/ 4367 ++dseg; 4368 next_mbuf: 4369 #ifdef MLX5_PMD_SOFT_COUNTERS 4370 /* Update sent data bytes counter. */ 4371 slen += dlen; 4372 #endif 4373 loc->pkts_sent++; 4374 loc->elts_free--; 4375 pkts_n--; 4376 if (unlikely(!pkts_n || !loc->elts_free)) { 4377 /* 4378 * We have no resources/packets to 4379 * continue build descriptors. 4380 */ 4381 part -= room; 4382 mlx5_tx_idone_empw(txq, loc, part, 4383 slen, wqem, olx); 4384 return MLX5_TXCMP_CODE_EXIT; 4385 } 4386 loc->mbuf = *pkts++; 4387 if (likely(pkts_n > 1)) 4388 rte_prefetch0(*pkts); 4389 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4390 /* 4391 * Unroll the completion code to avoid 4392 * returning variable value - it results in 4393 * unoptimized sequent checking in caller. 
4394 */ 4395 if (ret == MLX5_TXCMP_CODE_MULTI) { 4396 part -= room; 4397 mlx5_tx_idone_empw(txq, loc, part, 4398 slen, wqem, olx); 4399 if (unlikely(!loc->elts_free || 4400 !loc->wqe_free)) 4401 return MLX5_TXCMP_CODE_EXIT; 4402 return MLX5_TXCMP_CODE_MULTI; 4403 } 4404 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4405 if (ret == MLX5_TXCMP_CODE_TSO) { 4406 part -= room; 4407 mlx5_tx_idone_empw(txq, loc, part, 4408 slen, wqem, olx); 4409 if (unlikely(!loc->elts_free || 4410 !loc->wqe_free)) 4411 return MLX5_TXCMP_CODE_EXIT; 4412 return MLX5_TXCMP_CODE_TSO; 4413 } 4414 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4415 part -= room; 4416 mlx5_tx_idone_empw(txq, loc, part, 4417 slen, wqem, olx); 4418 if (unlikely(!loc->elts_free || 4419 !loc->wqe_free)) 4420 return MLX5_TXCMP_CODE_EXIT; 4421 return MLX5_TXCMP_CODE_SINGLE; 4422 } 4423 if (ret != MLX5_TXCMP_CODE_EMPW) { 4424 MLX5_ASSERT(false); 4425 part -= room; 4426 mlx5_tx_idone_empw(txq, loc, part, 4427 slen, wqem, olx); 4428 return MLX5_TXCMP_CODE_ERROR; 4429 } 4430 /* Check if we have minimal room left. */ 4431 nlim--; 4432 if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE)) 4433 break; 4434 /* 4435 * Check whether packet parameters coincide 4436 * within assumed eMPW batch: 4437 * - check sum settings 4438 * - metadata value 4439 * - software parser settings 4440 * - packets length (legacy MPW only) 4441 */ 4442 if (!mlx5_tx_match_empw(txq, &wqem->eseg, 4443 loc, dlen, olx)) 4444 break; 4445 /* Packet attributes match, continue the same eMPW. */ 4446 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 4447 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 4448 } 4449 /* 4450 * We get here to close an existing eMPW 4451 * session and start the new one. 4452 */ 4453 MLX5_ASSERT(pkts_n); 4454 part -= room; 4455 if (unlikely(!part)) 4456 return MLX5_TXCMP_CODE_EXIT; 4457 mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx); 4458 if (unlikely(!loc->elts_free || 4459 !loc->wqe_free)) 4460 return MLX5_TXCMP_CODE_EXIT; 4461 /* Continue the loop with new eMPW session. */ 4462 } 4463 MLX5_ASSERT(false); 4464 } 4465 4466 /** 4467 * The routine sends packets with ordinary MLX5_OPCODE_SEND. 4468 * Data inlining and VLAN insertion are supported. 4469 */ 4470 static __rte_always_inline enum mlx5_txcmp_code 4471 mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, 4472 struct rte_mbuf **restrict pkts, 4473 unsigned int pkts_n, 4474 struct mlx5_txq_local *restrict loc, 4475 unsigned int olx) 4476 { 4477 /* 4478 * Subroutine is the part of mlx5_tx_burst_single() 4479 * and sends single-segment packet with SEND opcode. 4480 */ 4481 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4482 MLX5_ASSERT(pkts_n > loc->pkts_sent); 4483 pkts += loc->pkts_sent + 1; 4484 pkts_n -= loc->pkts_sent; 4485 for (;;) { 4486 struct mlx5_wqe *restrict wqe; 4487 enum mlx5_txcmp_code ret; 4488 4489 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4490 if (MLX5_TXOFF_CONFIG(INLINE)) { 4491 unsigned int inlen, vlan = 0; 4492 4493 inlen = rte_pktmbuf_data_len(loc->mbuf); 4494 if (MLX5_TXOFF_CONFIG(VLAN) && 4495 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4496 vlan = sizeof(struct rte_vlan_hdr); 4497 inlen += vlan; 4498 static_assert((sizeof(struct rte_vlan_hdr) + 4499 sizeof(struct rte_ether_hdr)) == 4500 MLX5_ESEG_MIN_INLINE_SIZE, 4501 "invalid min inline data size"); 4502 } 4503 /* 4504 * If inlining is enabled at configuration time 4505 * the limit must be not less than minimal size. 4506 * Otherwise we would do extra check for data 4507 * size to avoid crashes due to length overflow. 
4508 */ 4509 MLX5_ASSERT(txq->inlen_send >= 4510 MLX5_ESEG_MIN_INLINE_SIZE); 4511 if (inlen <= txq->inlen_send) { 4512 unsigned int seg_n, wqe_n; 4513 4514 rte_prefetch0(rte_pktmbuf_mtod 4515 (loc->mbuf, uint8_t *)); 4516 /* Check against minimal length. */ 4517 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 4518 return MLX5_TXCMP_CODE_ERROR; 4519 if (loc->mbuf->ol_flags & 4520 PKT_TX_DYNF_NOINLINE) { 4521 /* 4522 * The hint flag not to inline packet 4523 * data is set. Check whether we can 4524 * follow the hint. 4525 */ 4526 if ((!MLX5_TXOFF_CONFIG(EMPW) && 4527 txq->inlen_mode) || 4528 (MLX5_TXOFF_CONFIG(MPW) && 4529 txq->inlen_mode)) { 4530 /* 4531 * The hardware requires the 4532 * minimal inline data header. 4533 */ 4534 goto single_min_inline; 4535 } 4536 if (MLX5_TXOFF_CONFIG(VLAN) && 4537 vlan && !txq->vlan_en) { 4538 /* 4539 * We must insert VLAN tag 4540 * by software means. 4541 */ 4542 goto single_part_inline; 4543 } 4544 goto single_no_inline; 4545 } 4546 /* 4547 * Completely inlined packet data WQE: 4548 * - Control Segment, SEND opcode 4549 * - Ethernet Segment, no VLAN insertion 4550 * - Data inlined, VLAN optionally inserted 4551 * - Alignment to MLX5_WSEG_SIZE 4552 * Have to estimate amount of WQEBBs 4553 */ 4554 seg_n = (inlen + 3 * MLX5_WSEG_SIZE - 4555 MLX5_ESEG_MIN_INLINE_SIZE + 4556 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4557 /* Check if there are enough WQEBBs. */ 4558 wqe_n = (seg_n + 3) / 4; 4559 if (wqe_n > loc->wqe_free) 4560 return MLX5_TXCMP_CODE_EXIT; 4561 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4562 loc->wqe_last = wqe; 4563 mlx5_tx_cseg_init(txq, loc, wqe, seg_n, 4564 MLX5_OPCODE_SEND, olx); 4565 mlx5_tx_eseg_data(txq, loc, wqe, 4566 vlan, inlen, 0, olx); 4567 txq->wqe_ci += wqe_n; 4568 loc->wqe_free -= wqe_n; 4569 /* 4570 * Packet data are completely inlined, 4571 * free the packet immediately. 4572 */ 4573 rte_pktmbuf_free_seg(loc->mbuf); 4574 } else if ((!MLX5_TXOFF_CONFIG(EMPW) || 4575 MLX5_TXOFF_CONFIG(MPW)) && 4576 txq->inlen_mode) { 4577 /* 4578 * If minimal inlining is requested the eMPW 4579 * feature should be disabled due to data is 4580 * inlined into Ethernet Segment, which can 4581 * not contain inlined data for eMPW due to 4582 * segment shared for all packets. 4583 */ 4584 struct mlx5_wqe_dseg *restrict dseg; 4585 unsigned int ds; 4586 uint8_t *dptr; 4587 4588 /* 4589 * The inline-mode settings require 4590 * to inline the specified amount of 4591 * data bytes to the Ethernet Segment. 4592 * We should check the free space in 4593 * WQE ring buffer to inline partially. 4594 */ 4595 single_min_inline: 4596 MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode); 4597 MLX5_ASSERT(inlen > txq->inlen_mode); 4598 MLX5_ASSERT(txq->inlen_mode >= 4599 MLX5_ESEG_MIN_INLINE_SIZE); 4600 /* 4601 * Check whether there are enough free WQEBBs: 4602 * - Control Segment 4603 * - Ethernet Segment 4604 * - First Segment of inlined Ethernet data 4605 * - ... data continued ... 
4606 * - Finishing Data Segment of pointer type 4607 */ 4608 ds = (MLX5_WQE_CSEG_SIZE + 4609 MLX5_WQE_ESEG_SIZE + 4610 MLX5_WQE_DSEG_SIZE + 4611 txq->inlen_mode - 4612 MLX5_ESEG_MIN_INLINE_SIZE + 4613 MLX5_WQE_DSEG_SIZE + 4614 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4615 if (loc->wqe_free < ((ds + 3) / 4)) 4616 return MLX5_TXCMP_CODE_EXIT; 4617 /* 4618 * Build the ordinary SEND WQE: 4619 * - Control Segment 4620 * - Ethernet Segment, inline inlen_mode bytes 4621 * - Data Segment of pointer type 4622 */ 4623 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4624 loc->wqe_last = wqe; 4625 mlx5_tx_cseg_init(txq, loc, wqe, ds, 4626 MLX5_OPCODE_SEND, olx); 4627 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, 4628 txq->inlen_mode, 4629 0, olx); 4630 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4631 txq->inlen_mode - vlan; 4632 inlen -= txq->inlen_mode; 4633 mlx5_tx_dseg_ptr(txq, loc, dseg, 4634 dptr, inlen, olx); 4635 /* 4636 * WQE is built, update the loop parameters 4637 * and got to the next packet. 4638 */ 4639 txq->wqe_ci += (ds + 3) / 4; 4640 loc->wqe_free -= (ds + 3) / 4; 4641 /* We have to store mbuf in elts.*/ 4642 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 4643 txq->elts[txq->elts_head++ & txq->elts_m] = 4644 loc->mbuf; 4645 --loc->elts_free; 4646 } else { 4647 uint8_t *dptr; 4648 unsigned int dlen; 4649 4650 /* 4651 * Partially inlined packet data WQE, we have 4652 * some space in title WQEBB, we can fill it 4653 * with some packet data. It takes one WQEBB, 4654 * it is available, no extra space check: 4655 * - Control Segment, SEND opcode 4656 * - Ethernet Segment, no VLAN insertion 4657 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data 4658 * - Data Segment, pointer type 4659 * 4660 * We also get here if VLAN insertion is not 4661 * supported by HW, the inline is enabled. 4662 */ 4663 single_part_inline: 4664 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4665 loc->wqe_last = wqe; 4666 mlx5_tx_cseg_init(txq, loc, wqe, 4, 4667 MLX5_OPCODE_SEND, olx); 4668 mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx); 4669 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4670 MLX5_ESEG_MIN_INLINE_SIZE - vlan; 4671 /* 4672 * The length check is performed above, by 4673 * comparing with txq->inlen_send. We should 4674 * not get overflow here. 4675 */ 4676 MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE); 4677 dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE; 4678 mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1], 4679 dptr, dlen, olx); 4680 ++txq->wqe_ci; 4681 --loc->wqe_free; 4682 /* We have to store mbuf in elts.*/ 4683 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 4684 txq->elts[txq->elts_head++ & txq->elts_m] = 4685 loc->mbuf; 4686 --loc->elts_free; 4687 } 4688 #ifdef MLX5_PMD_SOFT_COUNTERS 4689 /* Update sent data bytes counter. */ 4690 txq->stats.obytes += vlan + 4691 rte_pktmbuf_data_len(loc->mbuf); 4692 #endif 4693 } else { 4694 /* 4695 * No inline at all, it means the CPU cycles saving 4696 * is prioritized at configuration, we should not 4697 * copy any packet data to WQE. 
4698 * 4699 * SEND WQE, one WQEBB: 4700 * - Control Segment, SEND opcode 4701 * - Ethernet Segment, optional VLAN, no inline 4702 * - Data Segment, pointer type 4703 */ 4704 single_no_inline: 4705 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4706 loc->wqe_last = wqe; 4707 mlx5_tx_cseg_init(txq, loc, wqe, 3, 4708 MLX5_OPCODE_SEND, olx); 4709 mlx5_tx_eseg_none(txq, loc, wqe, olx); 4710 mlx5_tx_dseg_ptr 4711 (txq, loc, &wqe->dseg[0], 4712 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 4713 rte_pktmbuf_data_len(loc->mbuf), olx); 4714 ++txq->wqe_ci; 4715 --loc->wqe_free; 4716 /* 4717 * We should not store mbuf pointer in elts 4718 * if no inlining is configured, this is done 4719 * by calling routine in a batch copy. 4720 */ 4721 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); 4722 --loc->elts_free; 4723 #ifdef MLX5_PMD_SOFT_COUNTERS 4724 /* Update sent data bytes counter. */ 4725 txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf); 4726 if (MLX5_TXOFF_CONFIG(VLAN) && 4727 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 4728 txq->stats.obytes += 4729 sizeof(struct rte_vlan_hdr); 4730 #endif 4731 } 4732 ++loc->pkts_sent; 4733 --pkts_n; 4734 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 4735 return MLX5_TXCMP_CODE_EXIT; 4736 loc->mbuf = *pkts++; 4737 if (pkts_n > 1) 4738 rte_prefetch0(*pkts); 4739 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4740 if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE)) 4741 return ret; 4742 } 4743 MLX5_ASSERT(false); 4744 } 4745 4746 static __rte_always_inline enum mlx5_txcmp_code 4747 mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq, 4748 struct rte_mbuf **restrict pkts, 4749 unsigned int pkts_n, 4750 struct mlx5_txq_local *restrict loc, 4751 unsigned int olx) 4752 { 4753 enum mlx5_txcmp_code ret; 4754 4755 ret = mlx5_tx_able_to_empw(txq, loc, olx, false); 4756 if (ret == MLX5_TXCMP_CODE_SINGLE) 4757 goto ordinary_send; 4758 MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW); 4759 for (;;) { 4760 /* Optimize for inline/no inline eMPW send. */ 4761 ret = (MLX5_TXOFF_CONFIG(INLINE)) ? 4762 mlx5_tx_burst_empw_inline 4763 (txq, pkts, pkts_n, loc, olx) : 4764 mlx5_tx_burst_empw_simple 4765 (txq, pkts, pkts_n, loc, olx); 4766 if (ret != MLX5_TXCMP_CODE_SINGLE) 4767 return ret; 4768 /* The resources to send one packet should remain. */ 4769 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4770 ordinary_send: 4771 ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx); 4772 MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE); 4773 if (ret != MLX5_TXCMP_CODE_EMPW) 4774 return ret; 4775 /* The resources to send one packet should remain. */ 4776 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4777 } 4778 } 4779 4780 /** 4781 * DPDK Tx callback template. This is configured template 4782 * used to generate routines optimized for specified offload setup. 4783 * One of this generated functions is chosen at SQ configuration 4784 * time. 4785 * 4786 * @param txq 4787 * Generic pointer to TX queue structure. 4788 * @param[in] pkts 4789 * Packets to transmit. 4790 * @param pkts_n 4791 * Number of packets in array. 4792 * @param olx 4793 * Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx 4794 * values. Should be static to take compile time static configuration 4795 * advantages. 4796 * 4797 * @return 4798 * Number of packets successfully transmitted (<= pkts_n). 
4799 */ 4800 static __rte_always_inline uint16_t 4801 mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, 4802 struct rte_mbuf **restrict pkts, 4803 uint16_t pkts_n, 4804 unsigned int olx) 4805 { 4806 struct mlx5_txq_local loc; 4807 enum mlx5_txcmp_code ret; 4808 unsigned int part; 4809 4810 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4811 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4812 if (unlikely(!pkts_n)) 4813 return 0; 4814 loc.pkts_sent = 0; 4815 loc.pkts_copy = 0; 4816 loc.wqe_last = NULL; 4817 4818 send_loop: 4819 loc.pkts_loop = loc.pkts_sent; 4820 /* 4821 * Check if there are some CQEs, if any: 4822 * - process an encountered errors 4823 * - process the completed WQEs 4824 * - free related mbufs 4825 * - doorbell the NIC about processed CQEs 4826 */ 4827 rte_prefetch0(*(pkts + loc.pkts_sent)); 4828 mlx5_tx_handle_completion(txq, olx); 4829 /* 4830 * Calculate the number of available resources - elts and WQEs. 4831 * There are two possible different scenarios: 4832 * - no data inlining into WQEs, one WQEBB may contains upto 4833 * four packets, in this case elts become scarce resource 4834 * - data inlining into WQEs, one packet may require multiple 4835 * WQEBBs, the WQEs become the limiting factor. 4836 */ 4837 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4838 loc.elts_free = txq->elts_s - 4839 (uint16_t)(txq->elts_head - txq->elts_tail); 4840 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4841 loc.wqe_free = txq->wqe_s - 4842 (uint16_t)(txq->wqe_ci - txq->wqe_pi); 4843 if (unlikely(!loc.elts_free || !loc.wqe_free)) 4844 goto burst_exit; 4845 for (;;) { 4846 /* 4847 * Fetch the packet from array. Usually this is 4848 * the first packet in series of multi/single 4849 * segment packets. 4850 */ 4851 loc.mbuf = *(pkts + loc.pkts_sent); 4852 /* Dedicated branch for multi-segment packets. */ 4853 if (MLX5_TXOFF_CONFIG(MULTI) && 4854 unlikely(NB_SEGS(loc.mbuf) > 1)) { 4855 /* 4856 * Multi-segment packet encountered. 4857 * Hardware is able to process it only 4858 * with SEND/TSO opcodes, one packet 4859 * per WQE, do it in dedicated routine. 4860 */ 4861 enter_send_multi: 4862 MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy); 4863 part = loc.pkts_sent - loc.pkts_copy; 4864 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 4865 /* 4866 * There are some single-segment mbufs not 4867 * stored in elts. The mbufs must be in the 4868 * same order as WQEs, so we must copy the 4869 * mbufs to elts here, before the coming 4870 * multi-segment packet mbufs is appended. 4871 */ 4872 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, 4873 part, olx); 4874 loc.pkts_copy = loc.pkts_sent; 4875 } 4876 MLX5_ASSERT(pkts_n > loc.pkts_sent); 4877 ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx); 4878 if (!MLX5_TXOFF_CONFIG(INLINE)) 4879 loc.pkts_copy = loc.pkts_sent; 4880 /* 4881 * These returned code checks are supposed 4882 * to be optimized out due to routine inlining. 4883 */ 4884 if (ret == MLX5_TXCMP_CODE_EXIT) { 4885 /* 4886 * The routine returns this code when 4887 * all packets are sent or there is no 4888 * enough resources to complete request. 4889 */ 4890 break; 4891 } 4892 if (ret == MLX5_TXCMP_CODE_ERROR) { 4893 /* 4894 * The routine returns this code when 4895 * some error in the incoming packets 4896 * format occurred. 
4897 */ 4898 txq->stats.oerrors++; 4899 break; 4900 } 4901 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4902 /* 4903 * The single-segment packet was encountered 4904 * in the array, try to send it with the 4905 * best optimized way, possible engaging eMPW. 4906 */ 4907 goto enter_send_single; 4908 } 4909 if (MLX5_TXOFF_CONFIG(TSO) && 4910 ret == MLX5_TXCMP_CODE_TSO) { 4911 /* 4912 * The single-segment TSO packet was 4913 * encountered in the array. 4914 */ 4915 goto enter_send_tso; 4916 } 4917 /* We must not get here. Something is going wrong. */ 4918 MLX5_ASSERT(false); 4919 txq->stats.oerrors++; 4920 break; 4921 } 4922 /* Dedicated branch for single-segment TSO packets. */ 4923 if (MLX5_TXOFF_CONFIG(TSO) && 4924 unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) { 4925 /* 4926 * TSO might require special way for inlining 4927 * (dedicated parameters) and is sent with 4928 * MLX5_OPCODE_TSO opcode only, provide this 4929 * in dedicated branch. 4930 */ 4931 enter_send_tso: 4932 MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1); 4933 MLX5_ASSERT(pkts_n > loc.pkts_sent); 4934 ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx); 4935 /* 4936 * These returned code checks are supposed 4937 * to be optimized out due to routine inlining. 4938 */ 4939 if (ret == MLX5_TXCMP_CODE_EXIT) 4940 break; 4941 if (ret == MLX5_TXCMP_CODE_ERROR) { 4942 txq->stats.oerrors++; 4943 break; 4944 } 4945 if (ret == MLX5_TXCMP_CODE_SINGLE) 4946 goto enter_send_single; 4947 if (MLX5_TXOFF_CONFIG(MULTI) && 4948 ret == MLX5_TXCMP_CODE_MULTI) { 4949 /* 4950 * The multi-segment packet was 4951 * encountered in the array. 4952 */ 4953 goto enter_send_multi; 4954 } 4955 /* We must not get here. Something is going wrong. */ 4956 MLX5_ASSERT(false); 4957 txq->stats.oerrors++; 4958 break; 4959 } 4960 /* 4961 * The dedicated branch for the single-segment packets 4962 * without TSO. Often these ones can be sent using 4963 * MLX5_OPCODE_EMPW with multiple packets in one WQE. 4964 * The routine builds the WQEs till it encounters 4965 * the TSO or multi-segment packet (in case if these 4966 * offloads are requested at SQ configuration time). 4967 */ 4968 enter_send_single: 4969 MLX5_ASSERT(pkts_n > loc.pkts_sent); 4970 ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx); 4971 /* 4972 * These returned code checks are supposed 4973 * to be optimized out due to routine inlining. 4974 */ 4975 if (ret == MLX5_TXCMP_CODE_EXIT) 4976 break; 4977 if (ret == MLX5_TXCMP_CODE_ERROR) { 4978 txq->stats.oerrors++; 4979 break; 4980 } 4981 if (MLX5_TXOFF_CONFIG(MULTI) && 4982 ret == MLX5_TXCMP_CODE_MULTI) { 4983 /* 4984 * The multi-segment packet was 4985 * encountered in the array. 4986 */ 4987 goto enter_send_multi; 4988 } 4989 if (MLX5_TXOFF_CONFIG(TSO) && 4990 ret == MLX5_TXCMP_CODE_TSO) { 4991 /* 4992 * The single-segment TSO packet was 4993 * encountered in the array. 4994 */ 4995 goto enter_send_tso; 4996 } 4997 /* We must not get here. Something is going wrong. */ 4998 MLX5_ASSERT(false); 4999 txq->stats.oerrors++; 5000 break; 5001 } 5002 /* 5003 * Main Tx loop is completed, do the rest: 5004 * - set completion request if thresholds are reached 5005 * - doorbell the hardware 5006 * - copy the rest of mbufs to elts (if any) 5007 */ 5008 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) || 5009 loc.pkts_sent >= loc.pkts_copy); 5010 /* Take a shortcut if nothing is sent. */ 5011 if (unlikely(loc.pkts_sent == loc.pkts_loop)) 5012 goto burst_exit; 5013 /* Request CQE generation if limits are reached. 
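 * (Illustrative flow: if e.g. 32 single-segment packets were sent
 * without inlining, pkts_sent - pkts_copy may still be 32 here;
 * the whole pointer batch is then copied to elts in a single
 * mlx5_tx_copy_elts() call after the doorbell below.)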
*/ 5014 mlx5_tx_request_completion(txq, &loc, olx); 5015 /* 5016 * Ring the QP doorbell immediately after WQE building completion 5017 * to improve latency. The purely software-related data treatment 5018 * can be completed after the doorbell. Tx CQEs for this SQ are 5019 * processed in this thread only by polling. 5020 * 5021 * The rdma core library can map the doorbell register in two ways, 5022 * depending on the environment variable "MLX5_SHUT_UP_BF": 5023 * 5024 * - as regular cached memory, if the variable is either missing or 5025 * set to zero. This type of mapping may cause significant 5026 * doorbell register write latency and requires an explicit 5027 * memory write barrier to mitigate this issue and prevent 5028 * write combining. 5029 * 5030 * - as non-cached memory, if the variable is present and set to 5031 * a non-zero value. This type of mapping may cause a performance 5032 * impact under heavy load conditions but does not require an 5033 * explicit write memory barrier and may improve core 5034 * performance. 5035 * 5036 * - the legacy behaviour (prior to the 19.08 release) was to use 5037 * heuristics to decide whether the write memory barrier should 5038 * be performed. This behavior is selected by specifying 5039 * tx_db_nc=2: the write barrier is skipped if the application 5040 * provides the full recommended burst of packets, on the 5041 * assumption that more packets are coming and the write barrier 5042 * will be issued on the next burst (after descriptor writing, 5043 * at least). 5044 */ 5045 mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc && 5046 (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST)); 5047 /* Not all of the mbufs may be stored into elts yet. */ 5048 part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy; 5049 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 5050 /* 5051 * There are some single-segment mbufs not stored in elts. 5052 * This is only possible if the last packet was single-segment. 5053 * The copying is gathered into one place because it is 5054 * a good opportunity to optimize it with SIMD. 5055 * Unfortunately, if inlining is enabled, gaps in the 5056 * pointer array may appear due to early freeing of the 5057 * inlined mbufs. 5058 */ 5059 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx); 5060 loc.pkts_copy = loc.pkts_sent; 5061 } 5062 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 5063 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 5064 if (pkts_n > loc.pkts_sent) { 5065 /* 5066 * If the burst size is large there might not be enough CQEs 5067 * fetched from the completion queue and not enough resources 5068 * freed to send all the packets. 5069 */ 5070 goto send_loop; 5071 } 5072 burst_exit: 5073 #ifdef MLX5_PMD_SOFT_COUNTERS 5074 /* Increment sent packets counter. */ 5075 txq->stats.opackets += loc.pkts_sent; 5076 #endif 5077 return loc.pkts_sent; 5078 } 5079 5080 /* Generate routines with Enhanced Multi-Packet Write support.
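 * The short suffixes encode the offload set passed to each
 * declaration, e.g. "mtsc_empw" below stands for MULTI + TSO +
 * SWP + CSUM (+ METADATA) with eMPW enabled.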
*/ 5081 MLX5_TXOFF_DECL(full_empw, 5082 MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW) 5083 5084 MLX5_TXOFF_DECL(none_empw, 5085 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 5086 5087 MLX5_TXOFF_DECL(md_empw, 5088 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5089 5090 MLX5_TXOFF_DECL(mt_empw, 5091 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5092 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5093 5094 MLX5_TXOFF_DECL(mtsc_empw, 5095 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5096 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5097 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5098 5099 MLX5_TXOFF_DECL(mti_empw, 5100 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5101 MLX5_TXOFF_CONFIG_INLINE | 5102 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5103 5104 MLX5_TXOFF_DECL(mtv_empw, 5105 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5106 MLX5_TXOFF_CONFIG_VLAN | 5107 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5108 5109 MLX5_TXOFF_DECL(mtiv_empw, 5110 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5111 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5112 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5113 5114 MLX5_TXOFF_DECL(sc_empw, 5115 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5116 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5117 5118 MLX5_TXOFF_DECL(sci_empw, 5119 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5120 MLX5_TXOFF_CONFIG_INLINE | 5121 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5122 5123 MLX5_TXOFF_DECL(scv_empw, 5124 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5125 MLX5_TXOFF_CONFIG_VLAN | 5126 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5127 5128 MLX5_TXOFF_DECL(sciv_empw, 5129 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5130 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5131 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5132 5133 MLX5_TXOFF_DECL(i_empw, 5134 MLX5_TXOFF_CONFIG_INLINE | 5135 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5136 5137 MLX5_TXOFF_DECL(v_empw, 5138 MLX5_TXOFF_CONFIG_VLAN | 5139 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5140 5141 MLX5_TXOFF_DECL(iv_empw, 5142 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5143 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5144 5145 /* Generate routines without Enhanced Multi-Packet Write support. 
*/ 5146 MLX5_TXOFF_DECL(full, 5147 MLX5_TXOFF_CONFIG_FULL) 5148 5149 MLX5_TXOFF_DECL(none, 5150 MLX5_TXOFF_CONFIG_NONE) 5151 5152 MLX5_TXOFF_DECL(md, 5153 MLX5_TXOFF_CONFIG_METADATA) 5154 5155 MLX5_TXOFF_DECL(mt, 5156 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5157 MLX5_TXOFF_CONFIG_METADATA) 5158 5159 MLX5_TXOFF_DECL(mtsc, 5160 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5161 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5162 MLX5_TXOFF_CONFIG_METADATA) 5163 5164 MLX5_TXOFF_DECL(mti, 5165 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5166 MLX5_TXOFF_CONFIG_INLINE | 5167 MLX5_TXOFF_CONFIG_METADATA) 5168 5169 5170 MLX5_TXOFF_DECL(mtv, 5171 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5172 MLX5_TXOFF_CONFIG_VLAN | 5173 MLX5_TXOFF_CONFIG_METADATA) 5174 5175 5176 MLX5_TXOFF_DECL(mtiv, 5177 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5178 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5179 MLX5_TXOFF_CONFIG_METADATA) 5180 5181 MLX5_TXOFF_DECL(sc, 5182 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5183 MLX5_TXOFF_CONFIG_METADATA) 5184 5185 MLX5_TXOFF_DECL(sci, 5186 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5187 MLX5_TXOFF_CONFIG_INLINE | 5188 MLX5_TXOFF_CONFIG_METADATA) 5189 5190 5191 MLX5_TXOFF_DECL(scv, 5192 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5193 MLX5_TXOFF_CONFIG_VLAN | 5194 MLX5_TXOFF_CONFIG_METADATA) 5195 5196 5197 MLX5_TXOFF_DECL(sciv, 5198 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5199 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5200 MLX5_TXOFF_CONFIG_METADATA) 5201 5202 MLX5_TXOFF_DECL(i, 5203 MLX5_TXOFF_CONFIG_INLINE | 5204 MLX5_TXOFF_CONFIG_METADATA) 5205 5206 MLX5_TXOFF_DECL(v, 5207 MLX5_TXOFF_CONFIG_VLAN | 5208 MLX5_TXOFF_CONFIG_METADATA) 5209 5210 MLX5_TXOFF_DECL(iv, 5211 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5212 MLX5_TXOFF_CONFIG_METADATA) 5213 5214 /* 5215 * Generate routines with Legacy Multi-Packet Write support. 5216 * This mode is supported by ConnectX-4LX only and imposes 5217 * offload limitations, not supported: 5218 * - ACL/Flows (metadata are becoming meaningless) 5219 * - WQE Inline headers 5220 * - SRIOV (E-Switch offloads) 5221 * - VLAN insertion 5222 * - tunnel encapsulation/decapsulation 5223 * - TSO 5224 */ 5225 MLX5_TXOFF_DECL(none_mpw, 5226 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | 5227 MLX5_TXOFF_CONFIG_MPW) 5228 5229 MLX5_TXOFF_DECL(mci_mpw, 5230 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5231 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5232 MLX5_TXOFF_CONFIG_MPW) 5233 5234 MLX5_TXOFF_DECL(mc_mpw, 5235 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5236 MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) 5237 5238 MLX5_TXOFF_DECL(i_mpw, 5239 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5240 MLX5_TXOFF_CONFIG_MPW) 5241 5242 /* 5243 * Array of declared and compiled Tx burst function and corresponding 5244 * supported offloads set. The array is used to select the Tx burst 5245 * function for specified offloads set at Tx queue configuration time. 
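 * Selection example (illustrative): a configuration requesting
 * only DEV_TX_OFFLOAD_VLAN_INSERT, with flow metadata and eMPW
 * available, maps to olx = VLAN | METADATA | EMPW and matches the
 * "v_empw" entry exactly; when no exact match exists, the entry
 * with the fewest extra offloads is chosen by
 * mlx5_select_tx_function() below.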
5246 */ 5247 const struct { 5248 eth_tx_burst_t func; 5249 unsigned int olx; 5250 } txoff_func[] = { 5251 MLX5_TXOFF_INFO(full_empw, 5252 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5253 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5254 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5255 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5256 5257 MLX5_TXOFF_INFO(none_empw, 5258 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 5259 5260 MLX5_TXOFF_INFO(md_empw, 5261 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5262 5263 MLX5_TXOFF_INFO(mt_empw, 5264 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5265 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5266 5267 MLX5_TXOFF_INFO(mtsc_empw, 5268 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5269 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5270 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5271 5272 MLX5_TXOFF_INFO(mti_empw, 5273 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5274 MLX5_TXOFF_CONFIG_INLINE | 5275 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5276 5277 MLX5_TXOFF_INFO(mtv_empw, 5278 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5279 MLX5_TXOFF_CONFIG_VLAN | 5280 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5281 5282 MLX5_TXOFF_INFO(mtiv_empw, 5283 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5284 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5285 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5286 5287 MLX5_TXOFF_INFO(sc_empw, 5288 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5289 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5290 5291 MLX5_TXOFF_INFO(sci_empw, 5292 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5293 MLX5_TXOFF_CONFIG_INLINE | 5294 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5295 5296 MLX5_TXOFF_INFO(scv_empw, 5297 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5298 MLX5_TXOFF_CONFIG_VLAN | 5299 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5300 5301 MLX5_TXOFF_INFO(sciv_empw, 5302 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5303 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5304 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5305 5306 MLX5_TXOFF_INFO(i_empw, 5307 MLX5_TXOFF_CONFIG_INLINE | 5308 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5309 5310 MLX5_TXOFF_INFO(v_empw, 5311 MLX5_TXOFF_CONFIG_VLAN | 5312 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5313 5314 MLX5_TXOFF_INFO(iv_empw, 5315 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5316 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5317 5318 MLX5_TXOFF_INFO(full, 5319 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5320 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5321 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5322 MLX5_TXOFF_CONFIG_METADATA) 5323 5324 MLX5_TXOFF_INFO(none, 5325 MLX5_TXOFF_CONFIG_NONE) 5326 5327 MLX5_TXOFF_INFO(md, 5328 MLX5_TXOFF_CONFIG_METADATA) 5329 5330 MLX5_TXOFF_INFO(mt, 5331 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5332 MLX5_TXOFF_CONFIG_METADATA) 5333 5334 MLX5_TXOFF_INFO(mtsc, 5335 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5336 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5337 MLX5_TXOFF_CONFIG_METADATA) 5338 5339 MLX5_TXOFF_INFO(mti, 5340 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5341 MLX5_TXOFF_CONFIG_INLINE | 5342 MLX5_TXOFF_CONFIG_METADATA) 5343 5344 MLX5_TXOFF_INFO(mtv, 5345 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5346 MLX5_TXOFF_CONFIG_VLAN | 5347 MLX5_TXOFF_CONFIG_METADATA) 5348 5349 MLX5_TXOFF_INFO(mtiv, 5350 
MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5351 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5352 MLX5_TXOFF_CONFIG_METADATA) 5353 5354 MLX5_TXOFF_INFO(sc, 5355 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5356 MLX5_TXOFF_CONFIG_METADATA) 5357 5358 MLX5_TXOFF_INFO(sci, 5359 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5360 MLX5_TXOFF_CONFIG_INLINE | 5361 MLX5_TXOFF_CONFIG_METADATA) 5362 5363 MLX5_TXOFF_INFO(scv, 5364 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5365 MLX5_TXOFF_CONFIG_VLAN | 5366 MLX5_TXOFF_CONFIG_METADATA) 5367 5368 MLX5_TXOFF_INFO(sciv, 5369 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5370 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5371 MLX5_TXOFF_CONFIG_METADATA) 5372 5373 MLX5_TXOFF_INFO(i, 5374 MLX5_TXOFF_CONFIG_INLINE | 5375 MLX5_TXOFF_CONFIG_METADATA) 5376 5377 MLX5_TXOFF_INFO(v, 5378 MLX5_TXOFF_CONFIG_VLAN | 5379 MLX5_TXOFF_CONFIG_METADATA) 5380 5381 MLX5_TXOFF_INFO(iv, 5382 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5383 MLX5_TXOFF_CONFIG_METADATA) 5384 5385 MLX5_TXOFF_INFO(none_mpw, 5386 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | 5387 MLX5_TXOFF_CONFIG_MPW) 5388 5389 MLX5_TXOFF_INFO(mci_mpw, 5390 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5391 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5392 MLX5_TXOFF_CONFIG_MPW) 5393 5394 MLX5_TXOFF_INFO(mc_mpw, 5395 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5396 MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) 5397 5398 MLX5_TXOFF_INFO(i_mpw, 5399 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5400 MLX5_TXOFF_CONFIG_MPW) 5401 }; 5402 5403 /** 5404 * Configure the Tx function to use. The routine checks configured 5405 * Tx offloads for the device and selects appropriate Tx burst 5406 * routine. There are multiple Tx burst routines compiled from 5407 * the same template in the most optimal way for the dedicated 5408 * Tx offloads set. 5409 * 5410 * @param dev 5411 * Pointer to private data structure. 5412 * 5413 * @return 5414 * Pointer to selected Tx burst function. 5415 */ 5416 eth_tx_burst_t 5417 mlx5_select_tx_function(struct rte_eth_dev *dev) 5418 { 5419 struct mlx5_priv *priv = dev->data->dev_private; 5420 struct mlx5_dev_config *config = &priv->config; 5421 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 5422 unsigned int diff = 0, olx = 0, i, m; 5423 5424 static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <= 5425 MLX5_DSEG_MAX, "invalid WQE max size"); 5426 static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE, 5427 "invalid WQE Control Segment size"); 5428 static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE, 5429 "invalid WQE Ethernet Segment size"); 5430 static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE, 5431 "invalid WQE Data Segment size"); 5432 static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE, 5433 "invalid WQE size"); 5434 MLX5_ASSERT(priv); 5435 if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) { 5436 /* We should support Multi-Segment Packets. */ 5437 olx |= MLX5_TXOFF_CONFIG_MULTI; 5438 } 5439 if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | 5440 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 5441 DEV_TX_OFFLOAD_GRE_TNL_TSO | 5442 DEV_TX_OFFLOAD_IP_TNL_TSO | 5443 DEV_TX_OFFLOAD_UDP_TNL_TSO)) { 5444 /* We should support TCP Send Offload. */ 5445 olx |= MLX5_TXOFF_CONFIG_TSO; 5446 } 5447 if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | 5448 DEV_TX_OFFLOAD_UDP_TNL_TSO | 5449 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { 5450 /* We should support Software Parser for Tunnels. 
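 * (e.g. a configuration requesting DEV_TX_OFFLOAD_IP_TNL_TSO sets
 * both the TSO bit above and the SWP bit here, as tunnel TSO
 * relies on the software parser.)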
        /*
         * Scan the routines table to find the minimal
         * satisfying routine with requested offloads.
         */
        m = RTE_DIM(txoff_func);
        for (i = 0; i < RTE_DIM(txoff_func); i++) {
                unsigned int tmp;

                tmp = txoff_func[i].olx;
                if (tmp == olx) {
                        /* Meets requested offloads exactly. */
                        m = i;
                        break;
                }
                if ((tmp & olx) != olx) {
                        /* Does not meet requested offloads at all. */
                        continue;
                }
                if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW)
                        /* Do not enable eMPW if not configured. */
                        continue;
                if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
                        /* Do not enable inlining if not configured. */
                        continue;
                /*
                 * Some routine meets the requirements.
                 * Check whether it has the minimal amount
                 * of not requested offloads.
                 */
                tmp = __builtin_popcountl(tmp & ~olx);
                if (m >= RTE_DIM(txoff_func) || tmp < diff) {
                        /* First or better match, save and continue. */
                        m = i;
                        diff = tmp;
                        continue;
                }
                if (tmp == diff) {
                        tmp = txoff_func[i].olx ^ txoff_func[m].olx;
                        if (__builtin_ffsl(txoff_func[i].olx & ~tmp) <
                            __builtin_ffsl(txoff_func[m].olx & ~tmp)) {
                                /* Prefer the lighter not requested offload. */
                                m = i;
                        }
                }
        }
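        /*
         * For instance, an exact request of MULTI | TSO | METADATA | EMPW
         * stops the scan early at the mt_empw entry, while the
         * CSUM | VLAN | METADATA | EMPW example above has no exact match
         * and resolves to scv_empw, the superset routine carrying the
         * fewest not requested offload bits (only SWP).
         */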
        if (m >= RTE_DIM(txoff_func)) {
                DRV_LOG(DEBUG, "port %u has no selected Tx function"
                        " for requested offloads %04X",
                        dev->data->port_id, olx);
                return NULL;
        }
        DRV_LOG(DEBUG, "port %u has selected Tx function"
                " supporting offloads %04X/%04X",
                dev->data->port_id, olx, txoff_func[m].olx);
        if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI)
                DRV_LOG(DEBUG, "\tMULTI (multi segment)");
        if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO)
                DRV_LOG(DEBUG, "\tTSO (TCP send offload)");
        if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP)
                DRV_LOG(DEBUG, "\tSWP (software parser)");
        if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM)
                DRV_LOG(DEBUG, "\tCSUM (checksum offload)");
        if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE)
                DRV_LOG(DEBUG, "\tINLIN (inline data)");
        if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN)
                DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
        if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
                DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
        if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) {
                if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW)
                        DRV_LOG(DEBUG, "\tMPW (Legacy MPW)");
                else
                        DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)");
        }
        return txoff_func[m].func;
}

/**
 * DPDK callback to get the Tx queue information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param tx_queue_id
 *   Tx queue identifier.
 *
 * @param qinfo
 *   Pointer to the Tx queue information structure.
 *
 * @return
 *   None.
 */
void
mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
                  struct rte_eth_txq_info *qinfo)
{
        struct mlx5_priv *priv = dev->data->dev_private;
        struct mlx5_txq_data *txq = (*priv->txqs)[tx_queue_id];
        struct mlx5_txq_ctrl *txq_ctrl =
                        container_of(txq, struct mlx5_txq_ctrl, txq);

        if (!txq)
                return;
        qinfo->nb_desc = txq->elts_s;
        qinfo->conf.tx_thresh.pthresh = 0;
        qinfo->conf.tx_thresh.hthresh = 0;
        qinfo->conf.tx_thresh.wthresh = 0;
        qinfo->conf.tx_rs_thresh = 0;
        qinfo->conf.tx_free_thresh = 0;
        qinfo->conf.tx_deferred_start = txq_ctrl ? 0 : 1;
        qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
}
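/*
 * Usage sketch (illustrative; "port_id" is a placeholder): applications
 * reach the callback above through the generic ethdev API, e.g.:
 *
 *      struct rte_eth_txq_info qinfo;
 *
 *      if (rte_eth_tx_queue_info_get(port_id, 0, &qinfo) == 0)
 *              printf("Tx queue 0 has %u descriptors\n",
 *                     (unsigned int)qinfo.nb_desc);
 */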
/**
 * DPDK callback to get the Tx packet burst mode information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param tx_queue_id
 *   Tx queue identifier.
 *
 * @param mode
 *   Pointer to the burst mode information.
 *
 * @return
 *   0 on success, -EINVAL on failure.
 */
int
mlx5_tx_burst_mode_get(struct rte_eth_dev *dev,
                       uint16_t tx_queue_id __rte_unused,
                       struct rte_eth_burst_mode *mode)
{
        eth_tx_burst_t pkt_burst = dev->tx_pkt_burst;
        unsigned int i, olx;

        for (i = 0; i < RTE_DIM(txoff_func); i++) {
                if (pkt_burst == txoff_func[i].func) {
                        olx = txoff_func[i].olx;
                        snprintf(mode->info, sizeof(mode->info),
                                 "%s%s%s%s%s%s%s%s",
                                 (olx & MLX5_TXOFF_CONFIG_EMPW) ?
                                 ((olx & MLX5_TXOFF_CONFIG_MPW) ?
                                 "Legacy MPW" : "Enhanced MPW") : "No MPW",
                                 (olx & MLX5_TXOFF_CONFIG_MULTI) ?
                                 " + MULTI" : "",
                                 (olx & MLX5_TXOFF_CONFIG_TSO) ?
                                 " + TSO" : "",
                                 (olx & MLX5_TXOFF_CONFIG_SWP) ?
                                 " + SWP" : "",
                                 (olx & MLX5_TXOFF_CONFIG_CSUM) ?
                                 " + CSUM" : "",
                                 (olx & MLX5_TXOFF_CONFIG_INLINE) ?
                                 " + INLINE" : "",
                                 (olx & MLX5_TXOFF_CONFIG_VLAN) ?
                                 " + VLAN" : "",
                                 (olx & MLX5_TXOFF_CONFIG_METADATA) ?
                                 " + METADATA" : "");
                        return 0;
                }
        }
        return -EINVAL;
}
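/*
 * Usage sketch (illustrative; "port_id" is a placeholder): the burst mode
 * string assembled above can be queried through the generic ethdev API, e.g.:
 *
 *      struct rte_eth_burst_mode mode;
 *
 *      if (rte_eth_tx_burst_mode_get(port_id, 0, &mode) == 0)
 *              printf("Tx burst mode: %s\n", mode.info);
 */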