/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015-2019 Mellanox Technologies, Ltd
 */

#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#include <rte_cycles.h>
#include <rte_flow.h>

#include <mlx5_devx_cmds.h>
#include <mlx5_prm.h>
#include <mlx5_common.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"

/* TX burst subroutines return codes. */
enum mlx5_txcmp_code {
	MLX5_TXCMP_CODE_EXIT = 0,
	MLX5_TXCMP_CODE_ERROR,
	MLX5_TXCMP_CODE_SINGLE,
	MLX5_TXCMP_CODE_MULTI,
	MLX5_TXCMP_CODE_TSO,
	MLX5_TXCMP_CODE_EMPW,
};

/*
 * These defines are used to configure the Tx burst routine option set
 * supported at compile time. Options that are not specified are optimized
 * out because the related if conditions can be evaluated at compile time.
 * The offloads with a bigger runtime check overhead (requiring more CPU
 * cycles to skip) should have the bigger index - this is needed to select
 * the better matching routine when there is no exact match and some
 * offloads are not actually requested.
 */
#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/
#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/
#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/
#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/
#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/
#define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported.*/

/* The most common offloads groups.
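 * These groups, together with the individual bits above, are passed as the
 * olx argument of the Tx burst template and tested there with
 * MLX5_TXOFF_CONFIG(); since olx is a compile-time constant, the compiler
 * drops the disabled branches entirely. Illustrative sketch (not the actual
 * template code):
 *
 *   if (MLX5_TXOFF_CONFIG(TSO) && (loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
 *           ... TSO-specific path, removed when the TSO bit is not in olx ...
 *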
*/ 70 #define MLX5_TXOFF_CONFIG_NONE 0 71 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \ 72 MLX5_TXOFF_CONFIG_TSO | \ 73 MLX5_TXOFF_CONFIG_SWP | \ 74 MLX5_TXOFF_CONFIG_CSUM | \ 75 MLX5_TXOFF_CONFIG_INLINE | \ 76 MLX5_TXOFF_CONFIG_VLAN | \ 77 MLX5_TXOFF_CONFIG_METADATA) 78 79 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask) 80 81 #define MLX5_TXOFF_DECL(func, olx) \ 82 static uint16_t mlx5_tx_burst_##func(void *txq, \ 83 struct rte_mbuf **pkts, \ 84 uint16_t pkts_n) \ 85 { \ 86 return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \ 87 pkts, pkts_n, (olx)); \ 88 } 89 90 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx}, 91 92 static __rte_always_inline uint32_t 93 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); 94 95 static __rte_always_inline int 96 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 97 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe); 98 99 static __rte_always_inline uint32_t 100 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); 101 102 static __rte_always_inline void 103 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 104 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); 105 106 static __rte_always_inline void 107 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 108 const unsigned int strd_n); 109 110 static int 111 mlx5_queue_state_modify(struct rte_eth_dev *dev, 112 struct mlx5_mp_arg_queue_state_modify *sm); 113 114 static inline void 115 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, 116 volatile struct mlx5_cqe *restrict cqe, 117 uint32_t phcsum); 118 119 static inline void 120 mlx5_lro_update_hdr(uint8_t *restrict padd, 121 volatile struct mlx5_cqe *restrict cqe, 122 uint32_t len); 123 124 uint32_t mlx5_ptype_table[] __rte_cache_aligned = { 125 [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ 126 }; 127 128 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned; 129 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned; 130 131 uint64_t rte_net_mlx5_dynf_inline_mask; 132 #define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask 133 134 /** 135 * Build a table to translate Rx completion flags to packet type. 136 * 137 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 138 */ 139 void 140 mlx5_set_ptype_table(void) 141 { 142 unsigned int i; 143 uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table; 144 145 /* Last entry must not be overwritten, reserved for errored packet. 
*/ 146 for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) 147 (*p)[i] = RTE_PTYPE_UNKNOWN; 148 /* 149 * The index to the array should have: 150 * bit[1:0] = l3_hdr_type 151 * bit[4:2] = l4_hdr_type 152 * bit[5] = ip_frag 153 * bit[6] = tunneled 154 * bit[7] = outer_l3_type 155 */ 156 /* L2 */ 157 (*p)[0x00] = RTE_PTYPE_L2_ETHER; 158 /* L3 */ 159 (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 160 RTE_PTYPE_L4_NONFRAG; 161 (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 162 RTE_PTYPE_L4_NONFRAG; 163 /* Fragmented */ 164 (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 165 RTE_PTYPE_L4_FRAG; 166 (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 167 RTE_PTYPE_L4_FRAG; 168 /* TCP */ 169 (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 170 RTE_PTYPE_L4_TCP; 171 (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 172 RTE_PTYPE_L4_TCP; 173 (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 174 RTE_PTYPE_L4_TCP; 175 (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 176 RTE_PTYPE_L4_TCP; 177 (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 178 RTE_PTYPE_L4_TCP; 179 (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 180 RTE_PTYPE_L4_TCP; 181 /* UDP */ 182 (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 183 RTE_PTYPE_L4_UDP; 184 (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 185 RTE_PTYPE_L4_UDP; 186 /* Repeat with outer_l3_type being set. Just in case. */ 187 (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 188 RTE_PTYPE_L4_NONFRAG; 189 (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 190 RTE_PTYPE_L4_NONFRAG; 191 (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 192 RTE_PTYPE_L4_FRAG; 193 (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 194 RTE_PTYPE_L4_FRAG; 195 (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 196 RTE_PTYPE_L4_TCP; 197 (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 198 RTE_PTYPE_L4_TCP; 199 (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 200 RTE_PTYPE_L4_TCP; 201 (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 202 RTE_PTYPE_L4_TCP; 203 (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 204 RTE_PTYPE_L4_TCP; 205 (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 206 RTE_PTYPE_L4_TCP; 207 (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 208 RTE_PTYPE_L4_UDP; 209 (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 210 RTE_PTYPE_L4_UDP; 211 /* Tunneled - L3 */ 212 (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; 213 (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 214 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 215 RTE_PTYPE_INNER_L4_NONFRAG; 216 (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 217 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 218 RTE_PTYPE_INNER_L4_NONFRAG; 219 (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; 220 (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 221 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 222 RTE_PTYPE_INNER_L4_NONFRAG; 223 (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 224 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 225 RTE_PTYPE_INNER_L4_NONFRAG; 226 /* Tunneled - Fragmented */ 227 (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 228 
RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 229 RTE_PTYPE_INNER_L4_FRAG; 230 (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 231 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 232 RTE_PTYPE_INNER_L4_FRAG; 233 (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 234 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 235 RTE_PTYPE_INNER_L4_FRAG; 236 (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 237 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 238 RTE_PTYPE_INNER_L4_FRAG; 239 /* Tunneled - TCP */ 240 (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 241 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 242 RTE_PTYPE_INNER_L4_TCP; 243 (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 244 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 245 RTE_PTYPE_INNER_L4_TCP; 246 (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 247 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 248 RTE_PTYPE_INNER_L4_TCP; 249 (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 250 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 251 RTE_PTYPE_INNER_L4_TCP; 252 (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 253 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 254 RTE_PTYPE_INNER_L4_TCP; 255 (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 256 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 257 RTE_PTYPE_INNER_L4_TCP; 258 (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 259 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 260 RTE_PTYPE_INNER_L4_TCP; 261 (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 262 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 263 RTE_PTYPE_INNER_L4_TCP; 264 (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 265 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 266 RTE_PTYPE_INNER_L4_TCP; 267 (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 268 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 269 RTE_PTYPE_INNER_L4_TCP; 270 (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 271 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 272 RTE_PTYPE_INNER_L4_TCP; 273 (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 274 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 275 RTE_PTYPE_INNER_L4_TCP; 276 /* Tunneled - UDP */ 277 (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 278 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 279 RTE_PTYPE_INNER_L4_UDP; 280 (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 281 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 282 RTE_PTYPE_INNER_L4_UDP; 283 (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 284 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 285 RTE_PTYPE_INNER_L4_UDP; 286 (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 287 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 288 RTE_PTYPE_INNER_L4_UDP; 289 } 290 291 /** 292 * Build a table to translate packet to checksum type of Verbs. 293 */ 294 void 295 mlx5_set_cksum_table(void) 296 { 297 unsigned int i; 298 uint8_t v; 299 300 /* 301 * The index should have: 302 * bit[0] = PKT_TX_TCP_SEG 303 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM 304 * bit[4] = PKT_TX_IP_CKSUM 305 * bit[8] = PKT_TX_OUTER_IP_CKSUM 306 * bit[9] = tunnel 307 */ 308 for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) { 309 v = 0; 310 if (i & (1 << 9)) { 311 /* Tunneled packet. */ 312 if (i & (1 << 8)) /* Outer IP. */ 313 v |= MLX5_ETH_WQE_L3_CSUM; 314 if (i & (1 << 4)) /* Inner IP. */ 315 v |= MLX5_ETH_WQE_L3_INNER_CSUM; 316 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 317 v |= MLX5_ETH_WQE_L4_INNER_CSUM; 318 } else { 319 /* No tunnel. 
*/ 320 if (i & (1 << 4)) /* IP. */ 321 v |= MLX5_ETH_WQE_L3_CSUM; 322 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 323 v |= MLX5_ETH_WQE_L4_CSUM; 324 } 325 mlx5_cksum_table[i] = v; 326 } 327 } 328 329 /** 330 * Build a table to translate packet type of mbuf to SWP type of Verbs. 331 */ 332 void 333 mlx5_set_swp_types_table(void) 334 { 335 unsigned int i; 336 uint8_t v; 337 338 /* 339 * The index should have: 340 * bit[0:1] = PKT_TX_L4_MASK 341 * bit[4] = PKT_TX_IPV6 342 * bit[8] = PKT_TX_OUTER_IPV6 343 * bit[9] = PKT_TX_OUTER_UDP 344 */ 345 for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) { 346 v = 0; 347 if (i & (1 << 8)) 348 v |= MLX5_ETH_WQE_L3_OUTER_IPV6; 349 if (i & (1 << 9)) 350 v |= MLX5_ETH_WQE_L4_OUTER_UDP; 351 if (i & (1 << 4)) 352 v |= MLX5_ETH_WQE_L3_INNER_IPV6; 353 if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52)) 354 v |= MLX5_ETH_WQE_L4_INNER_UDP; 355 mlx5_swp_types_table[i] = v; 356 } 357 } 358 359 /** 360 * Set Software Parser flags and offsets in Ethernet Segment of WQE. 361 * Flags must be preliminary initialized to zero. 362 * 363 * @param loc 364 * Pointer to burst routine local context. 365 * @param swp_flags 366 * Pointer to store Software Parser flags 367 * @param olx 368 * Configured Tx offloads mask. It is fully defined at 369 * compile time and may be used for optimization. 370 * 371 * @return 372 * Software Parser offsets packed in dword. 373 * Software Parser flags are set by pointer. 374 */ 375 static __rte_always_inline uint32_t 376 txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc, 377 uint8_t *swp_flags, 378 unsigned int olx) 379 { 380 uint64_t ol, tunnel; 381 unsigned int idx, off; 382 uint32_t set; 383 384 if (!MLX5_TXOFF_CONFIG(SWP)) 385 return 0; 386 ol = loc->mbuf->ol_flags; 387 tunnel = ol & PKT_TX_TUNNEL_MASK; 388 /* 389 * Check whether Software Parser is required. 390 * Only customized tunnels may ask for. 391 */ 392 if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP)) 393 return 0; 394 /* 395 * The index should have: 396 * bit[0:1] = PKT_TX_L4_MASK 397 * bit[4] = PKT_TX_IPV6 398 * bit[8] = PKT_TX_OUTER_IPV6 399 * bit[9] = PKT_TX_OUTER_UDP 400 */ 401 idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52; 402 idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0; 403 *swp_flags = mlx5_swp_types_table[idx]; 404 /* 405 * Set offsets for SW parser. Since ConnectX-5, SW parser just 406 * complements HW parser. SW parser starts to engage only if HW parser 407 * can't reach a header. For the older devices, HW parser will not kick 408 * in if any of SWP offsets is set. Therefore, all of the L3 offsets 409 * should be set regardless of HW offload. 410 */ 411 off = loc->mbuf->outer_l2_len; 412 if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT) 413 off += sizeof(struct rte_vlan_hdr); 414 set = (off >> 1) << 8; /* Outer L3 offset. */ 415 off += loc->mbuf->outer_l3_len; 416 if (tunnel == PKT_TX_TUNNEL_UDP) 417 set |= off >> 1; /* Outer L4 offset. */ 418 if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */ 419 const uint64_t csum = ol & PKT_TX_L4_MASK; 420 off += loc->mbuf->l2_len; 421 set |= (off >> 1) << 24; /* Inner L3 offset. */ 422 if (csum == PKT_TX_TCP_CKSUM || 423 csum == PKT_TX_UDP_CKSUM || 424 (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) { 425 off += loc->mbuf->l3_len; 426 set |= (off >> 1) << 16; /* Inner L4 offset. */ 427 } 428 } 429 set = rte_cpu_to_le_32(set); 430 return set; 431 } 432 433 /** 434 * Convert the Checksum offloads to Verbs. 435 * 436 * @param buf 437 * Pointer to the mbuf. 
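 *
 * Worked example (illustrative, relying on the bit layout described in the
 * body below): a non-tunneled mbuf carrying PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM
 * maps to idx = (1 << 4) | (1 << 2), so mlx5_cksum_table[idx] resolves to
 * MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM.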
 *
 * @return
 *   Converted checksum flags.
 */
static __rte_always_inline uint8_t
txq_ol_cksum_to_cs(struct rte_mbuf *buf)
{
	uint32_t idx;
	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;

	/*
	 * The index should have:
	 * bit[0] = PKT_TX_TCP_SEG
	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
	 * bit[4] = PKT_TX_IP_CKSUM
	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
	 * bit[9] = tunnel
	 */
	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
	return mlx5_cksum_table[idx];
}

/**
 * Internal function to compute the number of used descriptors in an RX queue.
 *
 * @param rxq
 *   The Rx queue.
 *
 * @return
 *   The number of used Rx descriptors.
 */
static uint32_t
rx_queue_count(struct mlx5_rxq_data *rxq)
{
	struct rxq_zip *zip = &rxq->zip;
	volatile struct mlx5_cqe *cqe;
	const unsigned int cqe_n = (1 << rxq->cqe_n);
	const unsigned int cqe_cnt = cqe_n - 1;
	unsigned int cq_ci;
	unsigned int used;

	/* If we are processing a compressed CQE. */
	if (zip->ai) {
		used = zip->cqe_cnt - zip->ca;
		cq_ci = zip->cq_ci;
	} else {
		used = 0;
		cq_ci = rxq->cq_ci;
	}
	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
	while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
		int8_t op_own;
		unsigned int n;

		op_own = cqe->op_own;
		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
			n = rte_be_to_cpu_32(cqe->byte_cnt);
		else
			n = 1;
		cq_ci += n;
		used += n;
		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
	}
	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
	return used;
}

/**
 * DPDK callback to check the status of an Rx descriptor.
 *
 * @param rx_queue
 *   The Rx queue.
 * @param[in] offset
 *   The index of the descriptor in the ring.
 *
 * @return
 *   The status of the Rx descriptor.
 */
int
mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
{
	struct mlx5_rxq_data *rxq = rx_queue;
	struct mlx5_rxq_ctrl *rxq_ctrl =
			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
	struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);

	if (dev->rx_pkt_burst != mlx5_rx_burst) {
		rte_errno = ENOTSUP;
		return -rte_errno;
	}
	if (offset >= (1 << rxq->elts_n)) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	if (offset < rx_queue_count(rxq))
		return RTE_ETH_RX_DESC_DONE;
	return RTE_ETH_RX_DESC_AVAIL;
}

/**
 * DPDK callback to get the RX queue information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param rx_queue_id
 *   Rx queue identifier.
 *
 * @param qinfo
 *   Pointer to the RX queue information structure.
 *
 * @return
 *   None.
 */

void
mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
		  struct rte_eth_rxq_info *qinfo)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id];
	struct mlx5_rxq_ctrl *rxq_ctrl =
		container_of(rxq, struct mlx5_rxq_ctrl, rxq);

	if (!rxq)
		return;
	qinfo->mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
					rxq->mprq_mp : rxq->mp;
	qinfo->conf.rx_thresh.pthresh = 0;
	qinfo->conf.rx_thresh.hthresh = 0;
	qinfo->conf.rx_thresh.wthresh = 0;
	qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
	qinfo->conf.rx_drop_en = 1;
	qinfo->conf.rx_deferred_start = rxq_ctrl ? 0 : 1;
	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
	qinfo->scattered_rx = dev->data->scattered_rx;
	qinfo->nb_desc = 1 << rxq->elts_n;
}

/**
 * DPDK callback to get the RX packet burst mode information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param rx_queue_id
 *   Rx queue identifier.
 *
 * @param mode
 *   Pointer to the burst mode information.
 *
 * @return
 *   0 on success, -EINVAL on failure.
 */

int
mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
		       uint16_t rx_queue_id __rte_unused,
		       struct rte_eth_burst_mode *mode)
{
	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;

	if (pkt_burst == mlx5_rx_burst) {
		snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
	} else if (pkt_burst == mlx5_rx_burst_mprq) {
		snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ");
	} else if (pkt_burst == mlx5_rx_burst_vec) {
#if defined RTE_ARCH_X86_64
		snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE");
#elif defined RTE_ARCH_ARM64
		snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
#elif defined RTE_ARCH_PPC_64
		snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
#else
		return -EINVAL;
#endif
	} else {
		return -EINVAL;
	}
	return 0;
}

/**
 * DPDK callback to get the number of used descriptors in an RX queue.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param rx_queue_id
 *   The Rx queue.
 *
 * @return
 *   The number of used Rx descriptors.
 *   -EINVAL if the queue is invalid.
 */
uint32_t
mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_rxq_data *rxq;

	if (dev->rx_pkt_burst != mlx5_rx_burst) {
		rte_errno = ENOTSUP;
		return -rte_errno;
	}
	rxq = (*priv->rxqs)[rx_queue_id];
	if (!rxq) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	return rx_queue_count(rxq);
}

#define MLX5_SYSTEM_LOG_DIR "/var/log"
/**
 * Dump debug information to a log file.
 *
 * @param fname
 *   The file name.
 * @param hex_title
 *   If not NULL this string is printed as a header to the output
 *   and the output will be in hexadecimal view.
 * @param buf
 *   This is the buffer address to print out.
 * @param hex_len
 *   The number of bytes to dump out.
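 *
 * Usage sketch (illustrative arguments; see the Tx/Rx error handlers below
 * for the real call sites):
 *
 * @code
 * mlx5_dump_debug_information(name, "MLX5 Error CQ:",
 *                             (const void *)cq_base, cq_size_in_bytes);
 * @endcode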
666 */ 667 void 668 mlx5_dump_debug_information(const char *fname, const char *hex_title, 669 const void *buf, unsigned int hex_len) 670 { 671 FILE *fd; 672 673 MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname); 674 fd = fopen(path, "a+"); 675 if (!fd) { 676 DRV_LOG(WARNING, "cannot open %s for debug dump", path); 677 MKSTR(path2, "./%s", fname); 678 fd = fopen(path2, "a+"); 679 if (!fd) { 680 DRV_LOG(ERR, "cannot open %s for debug dump", path2); 681 return; 682 } 683 DRV_LOG(INFO, "New debug dump in file %s", path2); 684 } else { 685 DRV_LOG(INFO, "New debug dump in file %s", path); 686 } 687 if (hex_title) 688 rte_hexdump(fd, hex_title, buf, hex_len); 689 else 690 fprintf(fd, "%s", (const char *)buf); 691 fprintf(fd, "\n\n\n"); 692 fclose(fd); 693 } 694 695 /** 696 * Move QP from error state to running state and initialize indexes. 697 * 698 * @param txq_ctrl 699 * Pointer to TX queue control structure. 700 * 701 * @return 702 * 0 on success, else -1. 703 */ 704 static int 705 tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl) 706 { 707 struct mlx5_mp_arg_queue_state_modify sm = { 708 .is_wq = 0, 709 .queue_id = txq_ctrl->txq.idx, 710 }; 711 712 if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm)) 713 return -1; 714 txq_ctrl->txq.wqe_ci = 0; 715 txq_ctrl->txq.wqe_pi = 0; 716 txq_ctrl->txq.elts_comp = 0; 717 return 0; 718 } 719 720 /* Return 1 if the error CQE is signed otherwise, sign it and return 0. */ 721 static int 722 check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe) 723 { 724 static const uint8_t magic[] = "seen"; 725 int ret = 1; 726 unsigned int i; 727 728 for (i = 0; i < sizeof(magic); ++i) 729 if (!ret || err_cqe->rsvd1[i] != magic[i]) { 730 ret = 0; 731 err_cqe->rsvd1[i] = magic[i]; 732 } 733 return ret; 734 } 735 736 /** 737 * Handle error CQE. 738 * 739 * @param txq 740 * Pointer to TX queue structure. 741 * @param error_cqe 742 * Pointer to the error CQE. 743 * 744 * @return 745 * Negative value if queue recovery failed, otherwise 746 * the error completion entry is handled successfully. 747 */ 748 static int 749 mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, 750 volatile struct mlx5_err_cqe *err_cqe) 751 { 752 if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { 753 const uint16_t wqe_m = ((1 << txq->wqe_n) - 1); 754 struct mlx5_txq_ctrl *txq_ctrl = 755 container_of(txq, struct mlx5_txq_ctrl, txq); 756 uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter); 757 int seen = check_err_cqe_seen(err_cqe); 758 759 if (!seen && txq_ctrl->dump_file_n < 760 txq_ctrl->priv->config.max_dump_files_num) { 761 MKSTR(err_str, "Unexpected CQE error syndrome " 762 "0x%02x CQN = %u SQN = %u wqe_counter = %u " 763 "wq_ci = %u cq_ci = %u", err_cqe->syndrome, 764 txq->cqe_s, txq->qp_num_8s >> 8, 765 rte_be_to_cpu_16(err_cqe->wqe_counter), 766 txq->wqe_ci, txq->cq_ci); 767 MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u", 768 PORT_ID(txq_ctrl->priv), txq->idx, 769 txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc()); 770 mlx5_dump_debug_information(name, NULL, err_str, 0); 771 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 772 (const void *)((uintptr_t) 773 txq->cqes), 774 sizeof(*err_cqe) * 775 (1 << txq->cqe_n)); 776 mlx5_dump_debug_information(name, "MLX5 Error SQ:", 777 (const void *)((uintptr_t) 778 txq->wqes), 779 MLX5_WQE_SIZE * 780 (1 << txq->wqe_n)); 781 txq_ctrl->dump_file_n++; 782 } 783 if (!seen) 784 /* 785 * Count errors in WQEs units. 
786 * Later it can be improved to count error packets, 787 * for example, by SQ parsing to find how much packets 788 * should be counted for each WQE. 789 */ 790 txq->stats.oerrors += ((txq->wqe_ci & wqe_m) - 791 new_wqe_pi) & wqe_m; 792 if (tx_recover_qp(txq_ctrl)) { 793 /* Recovering failed - retry later on the same WQE. */ 794 return -1; 795 } 796 /* Release all the remaining buffers. */ 797 txq_free_elts(txq_ctrl); 798 } 799 return 0; 800 } 801 802 /** 803 * Translate RX completion flags to packet type. 804 * 805 * @param[in] rxq 806 * Pointer to RX queue structure. 807 * @param[in] cqe 808 * Pointer to CQE. 809 * 810 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 811 * 812 * @return 813 * Packet type for struct rte_mbuf. 814 */ 815 static inline uint32_t 816 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) 817 { 818 uint8_t idx; 819 uint8_t pinfo = cqe->pkt_info; 820 uint16_t ptype = cqe->hdr_type_etc; 821 822 /* 823 * The index to the array should have: 824 * bit[1:0] = l3_hdr_type 825 * bit[4:2] = l4_hdr_type 826 * bit[5] = ip_frag 827 * bit[6] = tunneled 828 * bit[7] = outer_l3_type 829 */ 830 idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); 831 return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); 832 } 833 834 /** 835 * Initialize Rx WQ and indexes. 836 * 837 * @param[in] rxq 838 * Pointer to RX queue structure. 839 */ 840 void 841 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) 842 { 843 const unsigned int wqe_n = 1 << rxq->elts_n; 844 unsigned int i; 845 846 for (i = 0; (i != wqe_n); ++i) { 847 volatile struct mlx5_wqe_data_seg *scat; 848 uintptr_t addr; 849 uint32_t byte_count; 850 851 if (mlx5_rxq_mprq_enabled(rxq)) { 852 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i]; 853 854 scat = &((volatile struct mlx5_wqe_mprq *) 855 rxq->wqes)[i].dseg; 856 addr = (uintptr_t)mlx5_mprq_buf_addr(buf, 857 1 << rxq->strd_num_n); 858 byte_count = (1 << rxq->strd_sz_n) * 859 (1 << rxq->strd_num_n); 860 } else { 861 struct rte_mbuf *buf = (*rxq->elts)[i]; 862 863 scat = &((volatile struct mlx5_wqe_data_seg *) 864 rxq->wqes)[i]; 865 addr = rte_pktmbuf_mtod(buf, uintptr_t); 866 byte_count = DATA_LEN(buf); 867 } 868 /* scat->addr must be able to store a pointer. */ 869 MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t)); 870 *scat = (struct mlx5_wqe_data_seg){ 871 .addr = rte_cpu_to_be_64(addr), 872 .byte_count = rte_cpu_to_be_32(byte_count), 873 .lkey = mlx5_rx_addr2mr(rxq, addr), 874 }; 875 } 876 rxq->consumed_strd = 0; 877 rxq->decompressed = 0; 878 rxq->rq_pi = 0; 879 rxq->zip = (struct rxq_zip){ 880 .ai = 0, 881 }; 882 /* Update doorbell counter. */ 883 rxq->rq_ci = wqe_n >> rxq->sges_n; 884 rte_cio_wmb(); 885 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 886 } 887 888 /** 889 * Modify a Verbs/DevX queue state. 890 * This must be called from the primary process. 891 * 892 * @param dev 893 * Pointer to Ethernet device. 894 * @param sm 895 * State modify request parameters. 896 * 897 * @return 898 * 0 in case of success else non-zero value and rte_errno is set. 
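 *
 * @note For Rx queues the WQ/RQ is moved directly to the requested state,
 *       either through Verbs or through DevX depending on the queue object
 *       type. For Tx queues the QP is driven through the full
 *       RESET -> INIT -> RTR -> RTS sequence.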
899 */ 900 int 901 mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, 902 const struct mlx5_mp_arg_queue_state_modify *sm) 903 { 904 int ret; 905 struct mlx5_priv *priv = dev->data->dev_private; 906 907 if (sm->is_wq) { 908 struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id]; 909 struct mlx5_rxq_ctrl *rxq_ctrl = 910 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 911 912 if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { 913 struct ibv_wq_attr mod = { 914 .attr_mask = IBV_WQ_ATTR_STATE, 915 .wq_state = sm->state, 916 }; 917 918 ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); 919 } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. */ 920 struct mlx5_devx_modify_rq_attr rq_attr; 921 922 memset(&rq_attr, 0, sizeof(rq_attr)); 923 if (sm->state == IBV_WQS_RESET) { 924 rq_attr.rq_state = MLX5_RQC_STATE_ERR; 925 rq_attr.state = MLX5_RQC_STATE_RST; 926 } else if (sm->state == IBV_WQS_RDY) { 927 rq_attr.rq_state = MLX5_RQC_STATE_RST; 928 rq_attr.state = MLX5_RQC_STATE_RDY; 929 } else if (sm->state == IBV_WQS_ERR) { 930 rq_attr.rq_state = MLX5_RQC_STATE_RDY; 931 rq_attr.state = MLX5_RQC_STATE_ERR; 932 } 933 ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, 934 &rq_attr); 935 } 936 if (ret) { 937 DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s", 938 sm->state, strerror(errno)); 939 rte_errno = errno; 940 return ret; 941 } 942 } else { 943 struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id]; 944 struct mlx5_txq_ctrl *txq_ctrl = 945 container_of(txq, struct mlx5_txq_ctrl, txq); 946 struct ibv_qp_attr mod = { 947 .qp_state = IBV_QPS_RESET, 948 .port_num = (uint8_t)priv->ibv_port, 949 }; 950 struct ibv_qp *qp = txq_ctrl->obj->qp; 951 952 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 953 if (ret) { 954 DRV_LOG(ERR, "Cannot change the Tx QP state to RESET " 955 "%s", strerror(errno)); 956 rte_errno = errno; 957 return ret; 958 } 959 mod.qp_state = IBV_QPS_INIT; 960 ret = mlx5_glue->modify_qp(qp, &mod, 961 (IBV_QP_STATE | IBV_QP_PORT)); 962 if (ret) { 963 DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s", 964 strerror(errno)); 965 rte_errno = errno; 966 return ret; 967 } 968 mod.qp_state = IBV_QPS_RTR; 969 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 970 if (ret) { 971 DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s", 972 strerror(errno)); 973 rte_errno = errno; 974 return ret; 975 } 976 mod.qp_state = IBV_QPS_RTS; 977 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 978 if (ret) { 979 DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s", 980 strerror(errno)); 981 rte_errno = errno; 982 return ret; 983 } 984 } 985 return 0; 986 } 987 988 /** 989 * Modify a Verbs queue state. 990 * 991 * @param dev 992 * Pointer to Ethernet device. 993 * @param sm 994 * State modify request parameters. 995 * 996 * @return 997 * 0 in case of success else non-zero value. 998 */ 999 static int 1000 mlx5_queue_state_modify(struct rte_eth_dev *dev, 1001 struct mlx5_mp_arg_queue_state_modify *sm) 1002 { 1003 int ret = 0; 1004 1005 switch (rte_eal_process_type()) { 1006 case RTE_PROC_PRIMARY: 1007 ret = mlx5_queue_state_modify_primary(dev, sm); 1008 break; 1009 case RTE_PROC_SECONDARY: 1010 ret = mlx5_mp_req_queue_state_modify(dev, sm); 1011 break; 1012 default: 1013 break; 1014 } 1015 return ret; 1016 } 1017 1018 /** 1019 * Handle a Rx error. 1020 * The function inserts the RQ state to reset when the first error CQE is 1021 * shown, then drains the CQ by the caller function loop. When the CQ is empty, 1022 * it moves the RQ state to ready and initializes the RQ. 
1023 * Next CQE identification and error counting are in the caller responsibility. 1024 * 1025 * @param[in] rxq 1026 * Pointer to RX queue structure. 1027 * @param[in] vec 1028 * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ. 1029 * 0 when called from non-vectorized Rx burst. 1030 * 1031 * @return 1032 * -1 in case of recovery error, otherwise the CQE status. 1033 */ 1034 int 1035 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) 1036 { 1037 const uint16_t cqe_n = 1 << rxq->cqe_n; 1038 const uint16_t cqe_mask = cqe_n - 1; 1039 const unsigned int wqe_n = 1 << rxq->elts_n; 1040 struct mlx5_rxq_ctrl *rxq_ctrl = 1041 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 1042 union { 1043 volatile struct mlx5_cqe *cqe; 1044 volatile struct mlx5_err_cqe *err_cqe; 1045 } u = { 1046 .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask], 1047 }; 1048 struct mlx5_mp_arg_queue_state_modify sm; 1049 int ret; 1050 1051 switch (rxq->err_state) { 1052 case MLX5_RXQ_ERR_STATE_NO_ERROR: 1053 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET; 1054 /* Fall-through */ 1055 case MLX5_RXQ_ERR_STATE_NEED_RESET: 1056 sm.is_wq = 1; 1057 sm.queue_id = rxq->idx; 1058 sm.state = IBV_WQS_RESET; 1059 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm)) 1060 return -1; 1061 if (rxq_ctrl->dump_file_n < 1062 rxq_ctrl->priv->config.max_dump_files_num) { 1063 MKSTR(err_str, "Unexpected CQE error syndrome " 1064 "0x%02x CQN = %u RQN = %u wqe_counter = %u" 1065 " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome, 1066 rxq->cqn, rxq_ctrl->wqn, 1067 rte_be_to_cpu_16(u.err_cqe->wqe_counter), 1068 rxq->rq_ci << rxq->sges_n, rxq->cq_ci); 1069 MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u", 1070 rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc()); 1071 mlx5_dump_debug_information(name, NULL, err_str, 0); 1072 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 1073 (const void *)((uintptr_t) 1074 rxq->cqes), 1075 sizeof(*u.cqe) * cqe_n); 1076 mlx5_dump_debug_information(name, "MLX5 Error RQ:", 1077 (const void *)((uintptr_t) 1078 rxq->wqes), 1079 16 * wqe_n); 1080 rxq_ctrl->dump_file_n++; 1081 } 1082 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY; 1083 /* Fall-through */ 1084 case MLX5_RXQ_ERR_STATE_NEED_READY: 1085 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); 1086 if (ret == MLX5_CQE_STATUS_HW_OWN) { 1087 rte_cio_wmb(); 1088 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1089 rte_cio_wmb(); 1090 /* 1091 * The RQ consumer index must be zeroed while moving 1092 * from RESET state to RDY state. 1093 */ 1094 *rxq->rq_db = rte_cpu_to_be_32(0); 1095 rte_cio_wmb(); 1096 sm.is_wq = 1; 1097 sm.queue_id = rxq->idx; 1098 sm.state = IBV_WQS_RDY; 1099 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), 1100 &sm)) 1101 return -1; 1102 if (vec) { 1103 const uint16_t q_mask = wqe_n - 1; 1104 uint16_t elt_idx; 1105 struct rte_mbuf **elt; 1106 int i; 1107 unsigned int n = wqe_n - (rxq->rq_ci - 1108 rxq->rq_pi); 1109 1110 for (i = 0; i < (int)n; ++i) { 1111 elt_idx = (rxq->rq_ci + i) & q_mask; 1112 elt = &(*rxq->elts)[elt_idx]; 1113 *elt = rte_mbuf_raw_alloc(rxq->mp); 1114 if (!*elt) { 1115 for (i--; i >= 0; --i) { 1116 elt_idx = (rxq->rq_ci + 1117 i) & q_mask; 1118 elt = &(*rxq->elts) 1119 [elt_idx]; 1120 rte_pktmbuf_free_seg 1121 (*elt); 1122 } 1123 return -1; 1124 } 1125 } 1126 for (i = 0; i < (int)wqe_n; ++i) { 1127 elt = &(*rxq->elts)[i]; 1128 DATA_LEN(*elt) = 1129 (uint16_t)((*elt)->buf_len - 1130 rte_pktmbuf_headroom(*elt)); 1131 } 1132 /* Padding with a fake mbuf for vec Rx. 
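			 * The extra MLX5_VPMD_DESCS_PER_LOOP entries point to
			 * rxq->fake_mbuf so that the vectorized burst may
			 * safely over-read past the end of the elts[] ring
			 * without additional bound checks.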
*/ 1133 for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) 1134 (*rxq->elts)[wqe_n + i] = 1135 &rxq->fake_mbuf; 1136 } 1137 mlx5_rxq_initialize(rxq); 1138 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 1139 } 1140 return ret; 1141 default: 1142 return -1; 1143 } 1144 } 1145 1146 /** 1147 * Get size of the next packet for a given CQE. For compressed CQEs, the 1148 * consumer index is updated only once all packets of the current one have 1149 * been processed. 1150 * 1151 * @param rxq 1152 * Pointer to RX queue. 1153 * @param cqe 1154 * CQE to process. 1155 * @param[out] mcqe 1156 * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not 1157 * written. 1158 * 1159 * @return 1160 * 0 in case of empty CQE, otherwise the packet size in bytes. 1161 */ 1162 static inline int 1163 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 1164 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe) 1165 { 1166 struct rxq_zip *zip = &rxq->zip; 1167 uint16_t cqe_n = cqe_cnt + 1; 1168 int len; 1169 uint16_t idx, end; 1170 1171 do { 1172 len = 0; 1173 /* Process compressed data in the CQE and mini arrays. */ 1174 if (zip->ai) { 1175 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1176 (volatile struct mlx5_mini_cqe8 (*)[8]) 1177 (uintptr_t)(&(*rxq->cqes)[zip->ca & 1178 cqe_cnt].pkt_info); 1179 1180 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); 1181 *mcqe = &(*mc)[zip->ai & 7]; 1182 if ((++zip->ai & 7) == 0) { 1183 /* Invalidate consumed CQEs */ 1184 idx = zip->ca; 1185 end = zip->na; 1186 while (idx != end) { 1187 (*rxq->cqes)[idx & cqe_cnt].op_own = 1188 MLX5_CQE_INVALIDATE; 1189 ++idx; 1190 } 1191 /* 1192 * Increment consumer index to skip the number 1193 * of CQEs consumed. Hardware leaves holes in 1194 * the CQ ring for software use. 1195 */ 1196 zip->ca = zip->na; 1197 zip->na += 8; 1198 } 1199 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 1200 /* Invalidate the rest */ 1201 idx = zip->ca; 1202 end = zip->cq_ci; 1203 1204 while (idx != end) { 1205 (*rxq->cqes)[idx & cqe_cnt].op_own = 1206 MLX5_CQE_INVALIDATE; 1207 ++idx; 1208 } 1209 rxq->cq_ci = zip->cq_ci; 1210 zip->ai = 0; 1211 } 1212 /* 1213 * No compressed data, get next CQE and verify if it is 1214 * compressed. 1215 */ 1216 } else { 1217 int ret; 1218 int8_t op_own; 1219 1220 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 1221 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 1222 if (unlikely(ret == MLX5_CQE_STATUS_ERR || 1223 rxq->err_state)) { 1224 ret = mlx5_rx_err_handle(rxq, 0); 1225 if (ret == MLX5_CQE_STATUS_HW_OWN || 1226 ret == -1) 1227 return 0; 1228 } else { 1229 return 0; 1230 } 1231 } 1232 ++rxq->cq_ci; 1233 op_own = cqe->op_own; 1234 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 1235 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1236 (volatile struct mlx5_mini_cqe8 (*)[8]) 1237 (uintptr_t)(&(*rxq->cqes) 1238 [rxq->cq_ci & 1239 cqe_cnt].pkt_info); 1240 1241 /* Fix endianness. */ 1242 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); 1243 /* 1244 * Current mini array position is the one 1245 * returned by check_cqe64(). 1246 * 1247 * If completion comprises several mini arrays, 1248 * as a special case the second one is located 1249 * 7 CQEs after the initial CQE instead of 8 1250 * for subsequent ones. 1251 */ 1252 zip->ca = rxq->cq_ci; 1253 zip->na = zip->ca + 7; 1254 /* Compute the next non compressed CQE. */ 1255 --rxq->cq_ci; 1256 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 1257 /* Get packet size to return. 
*/ 1258 len = rte_be_to_cpu_32((*mc)[0].byte_cnt); 1259 *mcqe = &(*mc)[0]; 1260 zip->ai = 1; 1261 /* Prefetch all to be invalidated */ 1262 idx = zip->ca; 1263 end = zip->cq_ci; 1264 while (idx != end) { 1265 rte_prefetch0(&(*rxq->cqes)[(idx) & 1266 cqe_cnt]); 1267 ++idx; 1268 } 1269 } else { 1270 len = rte_be_to_cpu_32(cqe->byte_cnt); 1271 } 1272 } 1273 if (unlikely(rxq->err_state)) { 1274 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1275 ++rxq->stats.idropped; 1276 } else { 1277 return len; 1278 } 1279 } while (1); 1280 } 1281 1282 /** 1283 * Translate RX completion flags to offload flags. 1284 * 1285 * @param[in] cqe 1286 * Pointer to CQE. 1287 * 1288 * @return 1289 * Offload flags (ol_flags) for struct rte_mbuf. 1290 */ 1291 static inline uint32_t 1292 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) 1293 { 1294 uint32_t ol_flags = 0; 1295 uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); 1296 1297 ol_flags = 1298 TRANSPOSE(flags, 1299 MLX5_CQE_RX_L3_HDR_VALID, 1300 PKT_RX_IP_CKSUM_GOOD) | 1301 TRANSPOSE(flags, 1302 MLX5_CQE_RX_L4_HDR_VALID, 1303 PKT_RX_L4_CKSUM_GOOD); 1304 return ol_flags; 1305 } 1306 1307 /** 1308 * Fill in mbuf fields from RX completion flags. 1309 * Note that pkt->ol_flags should be initialized outside of this function. 1310 * 1311 * @param rxq 1312 * Pointer to RX queue. 1313 * @param pkt 1314 * mbuf to fill. 1315 * @param cqe 1316 * CQE to process. 1317 * @param rss_hash_res 1318 * Packet RSS Hash result. 1319 */ 1320 static inline void 1321 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 1322 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) 1323 { 1324 /* Update packet information. */ 1325 pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); 1326 if (rss_hash_res && rxq->rss_hash) { 1327 pkt->hash.rss = rss_hash_res; 1328 pkt->ol_flags |= PKT_RX_RSS_HASH; 1329 } 1330 if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { 1331 pkt->ol_flags |= PKT_RX_FDIR; 1332 if (cqe->sop_drop_qpn != 1333 rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { 1334 uint32_t mark = cqe->sop_drop_qpn; 1335 1336 pkt->ol_flags |= PKT_RX_FDIR_ID; 1337 pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); 1338 } 1339 } 1340 if (rte_flow_dynf_metadata_avail() && cqe->flow_table_metadata) { 1341 pkt->ol_flags |= PKT_RX_DYNF_METADATA; 1342 *RTE_FLOW_DYNF_METADATA(pkt) = cqe->flow_table_metadata; 1343 } 1344 if (rxq->csum) 1345 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); 1346 if (rxq->vlan_strip && 1347 (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { 1348 pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; 1349 pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); 1350 } 1351 if (rxq->hw_timestamp) { 1352 pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); 1353 pkt->ol_flags |= PKT_RX_TIMESTAMP; 1354 } 1355 } 1356 1357 /** 1358 * DPDK callback for RX. 1359 * 1360 * @param dpdk_rxq 1361 * Generic pointer to RX queue structure. 1362 * @param[out] pkts 1363 * Array to store received packets. 1364 * @param pkts_n 1365 * Maximum number of packets in array. 1366 * 1367 * @return 1368 * Number of packets successfully received (<= pkts_n). 
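 *
 * This routine is installed as the device rx_pkt_burst callback, so an
 * application normally reaches it through the generic ethdev API, e.g.
 * (illustrative port/queue identifiers):
 *
 * @code
 * struct rte_mbuf *bufs[32];
 * uint16_t nb_rx = rte_eth_rx_burst(port_id, queue_id, bufs, 32);
 * @endcode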
1369 */ 1370 uint16_t 1371 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1372 { 1373 struct mlx5_rxq_data *rxq = dpdk_rxq; 1374 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 1375 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 1376 const unsigned int sges_n = rxq->sges_n; 1377 struct rte_mbuf *pkt = NULL; 1378 struct rte_mbuf *seg = NULL; 1379 volatile struct mlx5_cqe *cqe = 1380 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1381 unsigned int i = 0; 1382 unsigned int rq_ci = rxq->rq_ci << sges_n; 1383 int len = 0; /* keep its value across iterations. */ 1384 1385 while (pkts_n) { 1386 unsigned int idx = rq_ci & wqe_cnt; 1387 volatile struct mlx5_wqe_data_seg *wqe = 1388 &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; 1389 struct rte_mbuf *rep = (*rxq->elts)[idx]; 1390 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1391 uint32_t rss_hash_res; 1392 1393 if (pkt) 1394 NEXT(seg) = rep; 1395 seg = rep; 1396 rte_prefetch0(seg); 1397 rte_prefetch0(cqe); 1398 rte_prefetch0(wqe); 1399 rep = rte_mbuf_raw_alloc(rxq->mp); 1400 if (unlikely(rep == NULL)) { 1401 ++rxq->stats.rx_nombuf; 1402 if (!pkt) { 1403 /* 1404 * no buffers before we even started, 1405 * bail out silently. 1406 */ 1407 break; 1408 } 1409 while (pkt != seg) { 1410 MLX5_ASSERT(pkt != (*rxq->elts)[idx]); 1411 rep = NEXT(pkt); 1412 NEXT(pkt) = NULL; 1413 NB_SEGS(pkt) = 1; 1414 rte_mbuf_raw_free(pkt); 1415 pkt = rep; 1416 } 1417 break; 1418 } 1419 if (!pkt) { 1420 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1421 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe); 1422 if (!len) { 1423 rte_mbuf_raw_free(rep); 1424 break; 1425 } 1426 pkt = seg; 1427 MLX5_ASSERT(len >= (rxq->crc_present << 2)); 1428 pkt->ol_flags &= EXT_ATTACHED_MBUF; 1429 /* If compressed, take hash result from mini-CQE. */ 1430 rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? 1431 cqe->rx_hash_res : 1432 mcqe->rx_hash_result); 1433 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1434 if (rxq->crc_present) 1435 len -= RTE_ETHER_CRC_LEN; 1436 PKT_LEN(pkt) = len; 1437 if (cqe->lro_num_seg > 1) { 1438 mlx5_lro_update_hdr 1439 (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, 1440 len); 1441 pkt->ol_flags |= PKT_RX_LRO; 1442 pkt->tso_segsz = len / cqe->lro_num_seg; 1443 } 1444 } 1445 DATA_LEN(rep) = DATA_LEN(seg); 1446 PKT_LEN(rep) = PKT_LEN(seg); 1447 SET_DATA_OFF(rep, DATA_OFF(seg)); 1448 PORT(rep) = PORT(seg); 1449 (*rxq->elts)[idx] = rep; 1450 /* 1451 * Fill NIC descriptor with the new buffer. The lkey and size 1452 * of the buffers are already known, only the buffer address 1453 * changes. 1454 */ 1455 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); 1456 /* If there's only one MR, no need to replace LKey in WQE. */ 1457 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1458 wqe->lkey = mlx5_rx_mb2mr(rxq, rep); 1459 if (len > DATA_LEN(seg)) { 1460 len -= DATA_LEN(seg); 1461 ++NB_SEGS(pkt); 1462 ++rq_ci; 1463 continue; 1464 } 1465 DATA_LEN(seg) = len; 1466 #ifdef MLX5_PMD_SOFT_COUNTERS 1467 /* Increment bytes counter. */ 1468 rxq->stats.ibytes += PKT_LEN(pkt); 1469 #endif 1470 /* Return packet. */ 1471 *(pkts++) = pkt; 1472 pkt = NULL; 1473 --pkts_n; 1474 ++i; 1475 /* Align consumer index to the next stride. */ 1476 rq_ci >>= sges_n; 1477 ++rq_ci; 1478 rq_ci <<= sges_n; 1479 } 1480 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) 1481 return 0; 1482 /* Update the consumer index. 
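	 * The doorbell records below are written only after rte_cio_wmb()
	 * so the device cannot observe a doorbell value ahead of the
	 * corresponding descriptor and index updates made above.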
	 */
	rxq->rq_ci = rq_ci >> sges_n;
	rte_cio_wmb();
	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
	rte_cio_wmb();
	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += i;
#endif
	return i;
}

/**
 * Update LRO packet TCP header.
 * The HW LRO feature doesn't update the TCP header after coalescing the
 * TCP segments but supplies information in the CQE to fill it in by SW.
 *
 * @param tcp
 *   Pointer to the TCP header.
 * @param cqe
 *   Pointer to the completion entry.
 * @param phcsum
 *   The L3 pseudo-header checksum.
 */
static inline void
mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
			volatile struct mlx5_cqe *restrict cqe,
			uint32_t phcsum)
{
	uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
	/*
	 * The HW calculates only the TCP payload checksum, need to complete
	 * the TCP header checksum and the L3 pseudo-header checksum.
	 */
	uint32_t csum = phcsum + cqe->csum;

	if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK ||
	    l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) {
		tcp->tcp_flags |= RTE_TCP_ACK_FLAG;
		tcp->recv_ack = cqe->lro_ack_seq_num;
		tcp->rx_win = cqe->lro_tcp_win;
	}
	if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
		tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
	tcp->cksum = 0;
	csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4);
	csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
	csum = (~csum) & 0xffff;
	if (csum == 0)
		csum = 0xffff;
	tcp->cksum = csum;
}

/**
 * Update LRO packet headers.
 * The HW LRO feature doesn't update the L3/TCP headers after coalescing the
 * TCP segments but supplies information in the CQE to fill them in by SW.
 *
 * @param padd
 *   The packet address.
 * @param cqe
 *   Pointer to the completion entry.
 * @param len
 *   The packet length.
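 *
 * @note The function walks Ethernet -> optional VLAN/QinQ tags -> IPv4 or
 *       IPv6, refreshes the TTL/hop limit and length fields (plus the IPv4
 *       header checksum) and then delegates the TCP header fix-up to
 *       mlx5_lro_update_tcp_hdr().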
1548 */ 1549 static inline void 1550 mlx5_lro_update_hdr(uint8_t *restrict padd, 1551 volatile struct mlx5_cqe *restrict cqe, 1552 uint32_t len) 1553 { 1554 union { 1555 struct rte_ether_hdr *eth; 1556 struct rte_vlan_hdr *vlan; 1557 struct rte_ipv4_hdr *ipv4; 1558 struct rte_ipv6_hdr *ipv6; 1559 struct rte_tcp_hdr *tcp; 1560 uint8_t *hdr; 1561 } h = { 1562 .hdr = padd, 1563 }; 1564 uint16_t proto = h.eth->ether_type; 1565 uint32_t phcsum; 1566 1567 h.eth++; 1568 while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) || 1569 proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) { 1570 proto = h.vlan->eth_proto; 1571 h.vlan++; 1572 } 1573 if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) { 1574 h.ipv4->time_to_live = cqe->lro_min_ttl; 1575 h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd)); 1576 h.ipv4->hdr_checksum = 0; 1577 h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4); 1578 phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0); 1579 h.ipv4++; 1580 } else { 1581 h.ipv6->hop_limits = cqe->lro_min_ttl; 1582 h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) - 1583 sizeof(*h.ipv6)); 1584 phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0); 1585 h.ipv6++; 1586 } 1587 mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum); 1588 } 1589 1590 void 1591 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) 1592 { 1593 struct mlx5_mprq_buf *buf = opaque; 1594 1595 if (rte_atomic16_read(&buf->refcnt) == 1) { 1596 rte_mempool_put(buf->mp, buf); 1597 } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { 1598 rte_atomic16_set(&buf->refcnt, 1); 1599 rte_mempool_put(buf->mp, buf); 1600 } 1601 } 1602 1603 void 1604 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) 1605 { 1606 mlx5_mprq_buf_free_cb(NULL, buf); 1607 } 1608 1609 static inline void 1610 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 1611 const unsigned int strd_n) 1612 { 1613 struct mlx5_mprq_buf *rep = rxq->mprq_repl; 1614 volatile struct mlx5_wqe_data_seg *wqe = 1615 &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; 1616 void *addr; 1617 1618 MLX5_ASSERT(rep != NULL); 1619 /* Replace MPRQ buf. */ 1620 (*rxq->mprq_bufs)[rq_idx] = rep; 1621 /* Replace WQE. */ 1622 addr = mlx5_mprq_buf_addr(rep, strd_n); 1623 wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); 1624 /* If there's only one MR, no need to replace LKey in WQE. */ 1625 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1626 wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); 1627 /* Stash a mbuf for next replacement. */ 1628 if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) 1629 rxq->mprq_repl = rep; 1630 else 1631 rxq->mprq_repl = NULL; 1632 } 1633 1634 /** 1635 * DPDK callback for RX with Multi-Packet RQ support. 1636 * 1637 * @param dpdk_rxq 1638 * Generic pointer to RX queue structure. 1639 * @param[out] pkts 1640 * Array to store received packets. 1641 * @param pkts_n 1642 * Maximum number of packets in array. 1643 * 1644 * @return 1645 * Number of packets successfully received (<= pkts_n). 
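 *
 * @note Packets not larger than mprq_max_memcpy_len, or received while the
 *       replacement mempool is exhausted, are copied into a regular mbuf;
 *       larger packets are attached to the Multi-Packet RQ buffer as
 *       external buffers.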
1646 */ 1647 uint16_t 1648 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1649 { 1650 struct mlx5_rxq_data *rxq = dpdk_rxq; 1651 const unsigned int strd_n = 1 << rxq->strd_num_n; 1652 const unsigned int strd_sz = 1 << rxq->strd_sz_n; 1653 const unsigned int strd_shift = 1654 MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; 1655 const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; 1656 const unsigned int wq_mask = (1 << rxq->elts_n) - 1; 1657 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1658 unsigned int i = 0; 1659 uint32_t rq_ci = rxq->rq_ci; 1660 uint16_t consumed_strd = rxq->consumed_strd; 1661 uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM; 1662 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1663 1664 while (i < pkts_n) { 1665 struct rte_mbuf *pkt; 1666 void *addr; 1667 int ret; 1668 unsigned int len; 1669 uint16_t strd_cnt; 1670 uint16_t strd_idx; 1671 uint32_t offset; 1672 uint32_t byte_cnt; 1673 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1674 uint32_t rss_hash_res = 0; 1675 uint8_t lro_num_seg; 1676 1677 if (consumed_strd == strd_n) { 1678 /* Replace WQE only if the buffer is still in use. */ 1679 if (rte_atomic16_read(&buf->refcnt) > 1) { 1680 mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n); 1681 /* Release the old buffer. */ 1682 mlx5_mprq_buf_free(buf); 1683 } else if (unlikely(rxq->mprq_repl == NULL)) { 1684 struct mlx5_mprq_buf *rep; 1685 1686 /* 1687 * Currently, the MPRQ mempool is out of buffer 1688 * and doing memcpy regardless of the size of Rx 1689 * packet. Retry allocation to get back to 1690 * normal. 1691 */ 1692 if (!rte_mempool_get(rxq->mprq_mp, 1693 (void **)&rep)) 1694 rxq->mprq_repl = rep; 1695 } 1696 /* Advance to the next WQE. */ 1697 consumed_strd = 0; 1698 ++rq_ci; 1699 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1700 } 1701 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1702 ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe); 1703 if (!ret) 1704 break; 1705 byte_cnt = ret; 1706 strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> 1707 MLX5_MPRQ_STRIDE_NUM_SHIFT; 1708 MLX5_ASSERT(strd_cnt); 1709 consumed_strd += strd_cnt; 1710 if (byte_cnt & MLX5_MPRQ_FILLER_MASK) 1711 continue; 1712 if (mcqe == NULL) { 1713 rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); 1714 strd_idx = rte_be_to_cpu_16(cqe->wqe_counter); 1715 } else { 1716 /* mini-CQE for MPRQ doesn't have hash result. */ 1717 strd_idx = rte_be_to_cpu_16(mcqe->stride_idx); 1718 } 1719 MLX5_ASSERT(strd_idx < strd_n); 1720 MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & 1721 wq_mask)); 1722 lro_num_seg = cqe->lro_num_seg; 1723 /* 1724 * Currently configured to receive a packet per a stride. But if 1725 * MTU is adjusted through kernel interface, device could 1726 * consume multiple strides without raising an error. In this 1727 * case, the packet should be dropped because it is bigger than 1728 * the max_rx_pkt_len. 
1729 */ 1730 if (unlikely(!lro_num_seg && strd_cnt > 1)) { 1731 ++rxq->stats.idropped; 1732 continue; 1733 } 1734 pkt = rte_pktmbuf_alloc(rxq->mp); 1735 if (unlikely(pkt == NULL)) { 1736 ++rxq->stats.rx_nombuf; 1737 break; 1738 } 1739 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; 1740 MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); 1741 if (rxq->crc_present) 1742 len -= RTE_ETHER_CRC_LEN; 1743 offset = strd_idx * strd_sz + strd_shift; 1744 addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); 1745 /* 1746 * Memcpy packets to the target mbuf if: 1747 * - The size of packet is smaller than mprq_max_memcpy_len. 1748 * - Out of buffer in the Mempool for Multi-Packet RQ. 1749 */ 1750 if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) { 1751 /* 1752 * When memcpy'ing packet due to out-of-buffer, the 1753 * packet must be smaller than the target mbuf. 1754 */ 1755 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1756 rte_pktmbuf_free_seg(pkt); 1757 ++rxq->stats.idropped; 1758 continue; 1759 } 1760 rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len); 1761 DATA_LEN(pkt) = len; 1762 } else { 1763 rte_iova_t buf_iova; 1764 struct rte_mbuf_ext_shared_info *shinfo; 1765 uint16_t buf_len = strd_cnt * strd_sz; 1766 void *buf_addr; 1767 1768 /* Increment the refcnt of the whole chunk. */ 1769 rte_atomic16_add_return(&buf->refcnt, 1); 1770 MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <= 1771 strd_n + 1); 1772 buf_addr = RTE_PTR_SUB(addr, headroom_sz); 1773 /* 1774 * MLX5 device doesn't use iova but it is necessary in a 1775 * case where the Rx packet is transmitted via a 1776 * different PMD. 1777 */ 1778 buf_iova = rte_mempool_virt2iova(buf) + 1779 RTE_PTR_DIFF(buf_addr, buf); 1780 shinfo = &buf->shinfos[strd_idx]; 1781 rte_mbuf_ext_refcnt_set(shinfo, 1); 1782 /* 1783 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when 1784 * attaching the stride to mbuf and more offload flags 1785 * will be added below by calling rxq_cq_to_mbuf(). 1786 * Other fields will be overwritten. 1787 */ 1788 rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, 1789 buf_len, shinfo); 1790 /* Set mbuf head-room. */ 1791 pkt->data_off = headroom_sz; 1792 MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF); 1793 /* 1794 * Prevent potential overflow due to MTU change through 1795 * kernel interface. 1796 */ 1797 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1798 rte_pktmbuf_free_seg(pkt); 1799 ++rxq->stats.idropped; 1800 continue; 1801 } 1802 DATA_LEN(pkt) = len; 1803 /* 1804 * LRO packet may consume all the stride memory, in this 1805 * case packet head-room space is not guaranteed so must 1806 * to add an empty mbuf for the head-room. 1807 */ 1808 if (!rxq->strd_headroom_en) { 1809 struct rte_mbuf *headroom_mbuf = 1810 rte_pktmbuf_alloc(rxq->mp); 1811 1812 if (unlikely(headroom_mbuf == NULL)) { 1813 rte_pktmbuf_free_seg(pkt); 1814 ++rxq->stats.rx_nombuf; 1815 break; 1816 } 1817 PORT(pkt) = rxq->port_id; 1818 NEXT(headroom_mbuf) = pkt; 1819 pkt = headroom_mbuf; 1820 NB_SEGS(pkt) = 2; 1821 } 1822 } 1823 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1824 if (lro_num_seg > 1) { 1825 mlx5_lro_update_hdr(addr, cqe, len); 1826 pkt->ol_flags |= PKT_RX_LRO; 1827 pkt->tso_segsz = strd_sz; 1828 } 1829 PKT_LEN(pkt) = len; 1830 PORT(pkt) = rxq->port_id; 1831 #ifdef MLX5_PMD_SOFT_COUNTERS 1832 /* Increment bytes counter. */ 1833 rxq->stats.ibytes += PKT_LEN(pkt); 1834 #endif 1835 /* Return packet. */ 1836 *(pkts++) = pkt; 1837 ++i; 1838 } 1839 /* Update the consumer indexes. 
*/ 1840 rxq->consumed_strd = consumed_strd; 1841 rte_cio_wmb(); 1842 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1843 if (rq_ci != rxq->rq_ci) { 1844 rxq->rq_ci = rq_ci; 1845 rte_cio_wmb(); 1846 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1847 } 1848 #ifdef MLX5_PMD_SOFT_COUNTERS 1849 /* Increment packets counter. */ 1850 rxq->stats.ipackets += i; 1851 #endif 1852 return i; 1853 } 1854 1855 /** 1856 * Dummy DPDK callback for TX. 1857 * 1858 * This function is used to temporarily replace the real callback during 1859 * unsafe control operations on the queue, or in case of error. 1860 * 1861 * @param dpdk_txq 1862 * Generic pointer to TX queue structure. 1863 * @param[in] pkts 1864 * Packets to transmit. 1865 * @param pkts_n 1866 * Number of packets in array. 1867 * 1868 * @return 1869 * Number of packets successfully transmitted (<= pkts_n). 1870 */ 1871 uint16_t 1872 removed_tx_burst(void *dpdk_txq __rte_unused, 1873 struct rte_mbuf **pkts __rte_unused, 1874 uint16_t pkts_n __rte_unused) 1875 { 1876 rte_mb(); 1877 return 0; 1878 } 1879 1880 /** 1881 * Dummy DPDK callback for RX. 1882 * 1883 * This function is used to temporarily replace the real callback during 1884 * unsafe control operations on the queue, or in case of error. 1885 * 1886 * @param dpdk_rxq 1887 * Generic pointer to RX queue structure. 1888 * @param[out] pkts 1889 * Array to store received packets. 1890 * @param pkts_n 1891 * Maximum number of packets in array. 1892 * 1893 * @return 1894 * Number of packets successfully received (<= pkts_n). 1895 */ 1896 uint16_t 1897 removed_rx_burst(void *dpdk_txq __rte_unused, 1898 struct rte_mbuf **pkts __rte_unused, 1899 uint16_t pkts_n __rte_unused) 1900 { 1901 rte_mb(); 1902 return 0; 1903 } 1904 1905 /* 1906 * Vectorized Rx/Tx routines are not compiled in when required vector 1907 * instructions are not supported on a target architecture. The following null 1908 * stubs are needed for linkage when those are not included outside of this file 1909 * (e.g. mlx5_rxtx_vec_sse.c for x86). 1910 */ 1911 1912 __rte_weak uint16_t 1913 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, 1914 struct rte_mbuf **pkts __rte_unused, 1915 uint16_t pkts_n __rte_unused) 1916 { 1917 return 0; 1918 } 1919 1920 __rte_weak int 1921 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) 1922 { 1923 return -ENOTSUP; 1924 } 1925 1926 __rte_weak int 1927 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) 1928 { 1929 return -ENOTSUP; 1930 } 1931 1932 /** 1933 * Free the mbufs from the linear array of pointers. 1934 * 1935 * @param pkts 1936 * Pointer to array of packets to be free. 1937 * @param pkts_n 1938 * Number of packets to be freed. 1939 * @param olx 1940 * Configured Tx offloads mask. It is fully defined at 1941 * compile time and may be used for optimization. 1942 */ 1943 static __rte_always_inline void 1944 mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, 1945 unsigned int pkts_n, 1946 unsigned int olx __rte_unused) 1947 { 1948 struct rte_mempool *pool = NULL; 1949 struct rte_mbuf **p_free = NULL; 1950 struct rte_mbuf *mbuf; 1951 unsigned int n_free = 0; 1952 1953 /* 1954 * The implemented algorithm eliminates 1955 * copying pointers to temporary array 1956 * for rte_mempool_put_bulk() calls. 1957 */ 1958 MLX5_ASSERT(pkts); 1959 MLX5_ASSERT(pkts_n); 1960 for (;;) { 1961 for (;;) { 1962 /* 1963 * Decrement mbuf reference counter, detach 1964 * indirect and external buffers if needed. 
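			 * rte_pktmbuf_prefree_seg() returns the mbuf itself
			 * when the segment is no longer referenced and can be
			 * recycled into its pool, or NULL when it is still in
			 * use elsewhere.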
1965 */ 1966 mbuf = rte_pktmbuf_prefree_seg(*pkts); 1967 if (likely(mbuf != NULL)) { 1968 MLX5_ASSERT(mbuf == *pkts); 1969 if (likely(n_free != 0)) { 1970 if (unlikely(pool != mbuf->pool)) 1971 /* From different pool. */ 1972 break; 1973 } else { 1974 /* Start new scan array. */ 1975 pool = mbuf->pool; 1976 p_free = pkts; 1977 } 1978 ++n_free; 1979 ++pkts; 1980 --pkts_n; 1981 if (unlikely(pkts_n == 0)) { 1982 mbuf = NULL; 1983 break; 1984 } 1985 } else { 1986 /* 1987 * This happens if mbuf is still referenced. 1988 * We can't put it back to the pool, skip. 1989 */ 1990 ++pkts; 1991 --pkts_n; 1992 if (unlikely(n_free != 0)) 1993 /* There is some array to free.*/ 1994 break; 1995 if (unlikely(pkts_n == 0)) 1996 /* Last mbuf, nothing to free. */ 1997 return; 1998 } 1999 } 2000 for (;;) { 2001 /* 2002 * This loop is implemented to avoid multiple 2003 * inlining of rte_mempool_put_bulk(). 2004 */ 2005 MLX5_ASSERT(pool); 2006 MLX5_ASSERT(p_free); 2007 MLX5_ASSERT(n_free); 2008 /* 2009 * Free the array of pre-freed mbufs 2010 * belonging to the same memory pool. 2011 */ 2012 rte_mempool_put_bulk(pool, (void *)p_free, n_free); 2013 if (unlikely(mbuf != NULL)) { 2014 /* There is the request to start new scan. */ 2015 pool = mbuf->pool; 2016 p_free = pkts++; 2017 n_free = 1; 2018 --pkts_n; 2019 if (likely(pkts_n != 0)) 2020 break; 2021 /* 2022 * This is the last mbuf to be freed. 2023 * Do one more loop iteration to complete. 2024 * This is rare case of the last unique mbuf. 2025 */ 2026 mbuf = NULL; 2027 continue; 2028 } 2029 if (likely(pkts_n == 0)) 2030 return; 2031 n_free = 0; 2032 break; 2033 } 2034 } 2035 } 2036 2037 /** 2038 * Free the mbuf from the elts ring buffer till new tail. 2039 * 2040 * @param txq 2041 * Pointer to Tx queue structure. 2042 * @param tail 2043 * Index in elts to free up to, becomes new elts tail. 2044 * @param olx 2045 * Configured Tx offloads mask. It is fully defined at 2046 * compile time and may be used for optimization. 2047 */ 2048 static __rte_always_inline void 2049 mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, 2050 uint16_t tail, 2051 unsigned int olx __rte_unused) 2052 { 2053 uint16_t n_elts = tail - txq->elts_tail; 2054 2055 MLX5_ASSERT(n_elts); 2056 MLX5_ASSERT(n_elts <= txq->elts_s); 2057 /* 2058 * Implement a loop to support ring buffer wraparound 2059 * with single inlining of mlx5_tx_free_mbuf(). 2060 */ 2061 do { 2062 unsigned int part; 2063 2064 part = txq->elts_s - (txq->elts_tail & txq->elts_m); 2065 part = RTE_MIN(part, n_elts); 2066 MLX5_ASSERT(part); 2067 MLX5_ASSERT(part <= txq->elts_s); 2068 mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m], 2069 part, olx); 2070 txq->elts_tail += part; 2071 n_elts -= part; 2072 } while (n_elts); 2073 } 2074 2075 /** 2076 * Store the mbuf being sent into elts ring buffer. 2077 * On Tx completion these mbufs will be freed. 2078 * 2079 * @param txq 2080 * Pointer to Tx queue structure. 2081 * @param pkts 2082 * Pointer to array of packets to be stored. 2083 * @param pkts_n 2084 * Number of packets to be stored. 2085 * @param olx 2086 * Configured Tx offloads mask. It is fully defined at 2087 * compile time and may be used for optimization. 
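 *
 * A worked example with illustrative numbers: for elts_s = 256
 * (elts_m = 255), elts_head = 250 and pkts_n = 10 the first copy
 * stores 6 pointers into slots 250..255 and the wrapping copy stores
 * the remaining 4 pointers into slots 0..3.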
2088 */ 2089 static __rte_always_inline void 2090 mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, 2091 struct rte_mbuf **restrict pkts, 2092 unsigned int pkts_n, 2093 unsigned int olx __rte_unused) 2094 { 2095 unsigned int part; 2096 struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts; 2097 2098 MLX5_ASSERT(pkts); 2099 MLX5_ASSERT(pkts_n); 2100 part = txq->elts_s - (txq->elts_head & txq->elts_m); 2101 MLX5_ASSERT(part); 2102 MLX5_ASSERT(part <= txq->elts_s); 2103 /* This code is a good candidate for vectorizing with SIMD. */ 2104 rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)), 2105 (void *)pkts, 2106 RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *)); 2107 txq->elts_head += pkts_n; 2108 if (unlikely(part < pkts_n)) 2109 /* The copy is wrapping around the elts array. */ 2110 rte_memcpy((void *)elts, (void *)(pkts + part), 2111 (pkts_n - part) * sizeof(struct rte_mbuf *)); 2112 } 2113 2114 /** 2115 * Update completion queue consuming index via doorbell 2116 * and flush the completed data buffers. 2117 * 2118 * @param txq 2119 * Pointer to TX queue structure. 2120 * @param valid CQE pointer 2121 * if not NULL update txq->wqe_pi and flush the buffers 2122 * @param olx 2123 * Configured Tx offloads mask. It is fully defined at 2124 * compile time and may be used for optimization. 2125 */ 2126 static __rte_always_inline void 2127 mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, 2128 volatile struct mlx5_cqe *last_cqe, 2129 unsigned int olx __rte_unused) 2130 { 2131 if (likely(last_cqe != NULL)) { 2132 uint16_t tail; 2133 2134 txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter); 2135 tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m]; 2136 if (likely(tail != txq->elts_tail)) { 2137 mlx5_tx_free_elts(txq, tail, olx); 2138 MLX5_ASSERT(tail == txq->elts_tail); 2139 } 2140 } 2141 } 2142 2143 /** 2144 * Manage TX completions. This routine checks the CQ for 2145 * arrived CQEs, deduces the last accomplished WQE in SQ, 2146 * updates SQ producing index and frees all completed mbufs. 2147 * 2148 * @param txq 2149 * Pointer to TX queue structure. 2150 * @param olx 2151 * Configured Tx offloads mask. It is fully defined at 2152 * compile time and may be used for optimization. 2153 * 2154 * NOTE: not inlined intentionally, it makes tx_burst 2155 * routine smaller, simple and faster - from experiments. 2156 */ 2157 static void 2158 mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, 2159 unsigned int olx __rte_unused) 2160 { 2161 unsigned int count = MLX5_TX_COMP_MAX_CQE; 2162 volatile struct mlx5_cqe *last_cqe = NULL; 2163 uint16_t ci = txq->cq_ci; 2164 int ret; 2165 2166 static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); 2167 static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value"); 2168 do { 2169 volatile struct mlx5_cqe *cqe; 2170 2171 cqe = &txq->cqes[ci & txq->cqe_m]; 2172 ret = check_cqe(cqe, txq->cqe_s, ci); 2173 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 2174 if (likely(ret != MLX5_CQE_STATUS_ERR)) { 2175 /* No new CQEs in completion queue. */ 2176 MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN); 2177 break; 2178 } 2179 /* 2180 * Some error occurred, try to restart. 2181 * We have no barrier after WQE related Doorbell 2182 * written, make sure all writes are completed 2183 * here, before we might perform SQ reset. 
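 * The rte_wmb() below provides that ordering before the error CQE
 * handler is invoked and the queue state may be modified.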
2184 */ 2185 rte_wmb(); 2186 txq->cq_ci = ci; 2187 ret = mlx5_tx_error_cqe_handle 2188 (txq, (volatile struct mlx5_err_cqe *)cqe); 2189 if (unlikely(ret < 0)) { 2190 /* 2191 * Some error occurred on queue error 2192 * handling, we do not advance the index 2193 * here, allowing to retry on next call. 2194 */ 2195 return; 2196 } 2197 /* 2198 * We are going to fetch all entries with 2199 * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status. 2200 * The send queue is supposed to be empty. 2201 */ 2202 ++ci; 2203 txq->cq_pi = ci; 2204 last_cqe = NULL; 2205 continue; 2206 } 2207 /* Normal transmit completion. */ 2208 MLX5_ASSERT(ci != txq->cq_pi); 2209 MLX5_ASSERT((txq->fcqs[ci & txq->cqe_m] >> 16) == 2210 cqe->wqe_counter); 2211 ++ci; 2212 last_cqe = cqe; 2213 /* 2214 * We have to restrict the amount of processed CQEs 2215 * in one tx_burst routine call. The CQ may be large 2216 * and many CQEs may be updated by the NIC in one 2217 * transaction. Buffers freeing is time consuming, 2218 * multiple iterations may introduce significant 2219 * latency. 2220 */ 2221 if (likely(--count == 0)) 2222 break; 2223 } while (true); 2224 if (likely(ci != txq->cq_ci)) { 2225 /* 2226 * Update completion queue consuming index 2227 * and ring doorbell to notify hardware. 2228 */ 2229 rte_compiler_barrier(); 2230 txq->cq_ci = ci; 2231 *txq->cq_db = rte_cpu_to_be_32(ci); 2232 mlx5_tx_comp_flush(txq, last_cqe, olx); 2233 } 2234 } 2235 2236 /** 2237 * Check if the completion request flag should be set in the last WQE. 2238 * Both pushed mbufs and WQEs are monitored and the completion request 2239 * flag is set if any of thresholds is reached. 2240 * 2241 * @param txq 2242 * Pointer to TX queue structure. 2243 * @param loc 2244 * Pointer to burst routine local context. 2245 * @param olx 2246 * Configured Tx offloads mask. It is fully defined at 2247 * compile time and may be used for optimization. 2248 */ 2249 static __rte_always_inline void 2250 mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, 2251 struct mlx5_txq_local *restrict loc, 2252 unsigned int olx) 2253 { 2254 uint16_t head = txq->elts_head; 2255 unsigned int part; 2256 2257 part = MLX5_TXOFF_CONFIG(INLINE) ? 2258 0 : loc->pkts_sent - loc->pkts_copy; 2259 head += part; 2260 if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH || 2261 (MLX5_TXOFF_CONFIG(INLINE) && 2262 (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) { 2263 volatile struct mlx5_wqe *last = loc->wqe_last; 2264 2265 txq->elts_comp = head; 2266 if (MLX5_TXOFF_CONFIG(INLINE)) 2267 txq->wqe_comp = txq->wqe_ci; 2268 /* Request unconditional completion on last WQE. */ 2269 last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS << 2270 MLX5_COMP_MODE_OFFSET); 2271 /* Save elts_head in dedicated free on completion queue. */ 2272 #ifdef RTE_LIBRTE_MLX5_DEBUG 2273 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head | 2274 (last->cseg.opcode >> 8) << 16; 2275 #else 2276 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head; 2277 #endif 2278 /* A CQE slot must always be available. */ 2279 MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s); 2280 } 2281 } 2282 2283 /** 2284 * DPDK callback to check the status of a tx descriptor. 2285 * 2286 * @param tx_queue 2287 * The tx queue. 2288 * @param[in] offset 2289 * The index of the descriptor in the ring. 2290 * 2291 * @return 2292 * The status of the tx descriptor. 
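 *
 * A minimal usage sketch through the generic ethdev wrapper that ends
 * up in this callback; port_id, queue_id and idle_slots are
 * illustrative names:
 *
 * @code
 * int st = rte_eth_tx_descriptor_status(port_id, queue_id, 0);
 *
 * if (st == RTE_ETH_TX_DESC_DONE)
 *         idle_slots++; // descriptor at offset 0 has been transmitted
 * @endcode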
2293 */ 2294 int 2295 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) 2296 { 2297 struct mlx5_txq_data *restrict txq = tx_queue; 2298 uint16_t used; 2299 2300 mlx5_tx_handle_completion(txq, 0); 2301 used = txq->elts_head - txq->elts_tail; 2302 if (offset < used) 2303 return RTE_ETH_TX_DESC_FULL; 2304 return RTE_ETH_TX_DESC_DONE; 2305 } 2306 2307 /** 2308 * Build the Control Segment with specified opcode: 2309 * - MLX5_OPCODE_SEND 2310 * - MLX5_OPCODE_ENHANCED_MPSW 2311 * - MLX5_OPCODE_TSO 2312 * 2313 * @param txq 2314 * Pointer to TX queue structure. 2315 * @param loc 2316 * Pointer to burst routine local context. 2317 * @param wqe 2318 * Pointer to WQE to fill with built Control Segment. 2319 * @param ds 2320 * Supposed length of WQE in segments. 2321 * @param opcode 2322 * SQ WQE opcode to put into Control Segment. 2323 * @param olx 2324 * Configured Tx offloads mask. It is fully defined at 2325 * compile time and may be used for optimization. 2326 */ 2327 static __rte_always_inline void 2328 mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, 2329 struct mlx5_txq_local *restrict loc __rte_unused, 2330 struct mlx5_wqe *restrict wqe, 2331 unsigned int ds, 2332 unsigned int opcode, 2333 unsigned int olx __rte_unused) 2334 { 2335 struct mlx5_wqe_cseg *restrict cs = &wqe->cseg; 2336 2337 /* For legacy MPW replace the EMPW by TSO with modifier. */ 2338 if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW) 2339 opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24; 2340 cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode); 2341 cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 2342 cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << 2343 MLX5_COMP_MODE_OFFSET); 2344 cs->misc = RTE_BE32(0); 2345 } 2346 2347 /** 2348 * Build the Ethernet Segment without inlined data. 2349 * Supports Software Parser, Checksums and VLAN 2350 * insertion Tx offload features. 2351 * 2352 * @param txq 2353 * Pointer to TX queue structure. 2354 * @param loc 2355 * Pointer to burst routine local context. 2356 * @param wqe 2357 * Pointer to WQE to fill with built Ethernet Segment. 2358 * @param olx 2359 * Configured Tx offloads mask. It is fully defined at 2360 * compile time and may be used for optimization. 2361 */ 2362 static __rte_always_inline void 2363 mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused, 2364 struct mlx5_txq_local *restrict loc, 2365 struct mlx5_wqe *restrict wqe, 2366 unsigned int olx) 2367 { 2368 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2369 uint32_t csum; 2370 2371 /* 2372 * Calculate and set check sum flags first, dword field 2373 * in segment may be shared with Software Parser flags. 2374 */ 2375 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2376 es->flags = rte_cpu_to_le_32(csum); 2377 /* 2378 * Calculate and set Software Parser offsets and flags. 2379 * These flags a set for custom UDP and IP tunnel packets. 2380 */ 2381 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2382 /* Fill metadata field if needed. */ 2383 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2384 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2385 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2386 /* Engage VLAN tag insertion feature if requested. */ 2387 if (MLX5_TXOFF_CONFIG(VLAN) && 2388 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 2389 /* 2390 * We should get here only if device support 2391 * this feature correctly. 
2392 */ 2393 MLX5_ASSERT(txq->vlan_en); 2394 es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT | 2395 loc->mbuf->vlan_tci); 2396 } else { 2397 es->inline_hdr = RTE_BE32(0); 2398 } 2399 } 2400 2401 /** 2402 * Build the Ethernet Segment with minimal inlined data 2403 * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is 2404 * used to fill the gap in single WQEBB WQEs. 2405 * Supports Software Parser, Checksums and VLAN 2406 * insertion Tx offload features. 2407 * 2408 * @param txq 2409 * Pointer to TX queue structure. 2410 * @param loc 2411 * Pointer to burst routine local context. 2412 * @param wqe 2413 * Pointer to WQE to fill with built Ethernet Segment. 2414 * @param vlan 2415 * Length of VLAN tag insertion if any. 2416 * @param olx 2417 * Configured Tx offloads mask. It is fully defined at 2418 * compile time and may be used for optimization. 2419 */ 2420 static __rte_always_inline void 2421 mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused, 2422 struct mlx5_txq_local *restrict loc, 2423 struct mlx5_wqe *restrict wqe, 2424 unsigned int vlan, 2425 unsigned int olx) 2426 { 2427 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2428 uint32_t csum; 2429 uint8_t *psrc, *pdst; 2430 2431 /* 2432 * Calculate and set check sum flags first, dword field 2433 * in segment may be shared with Software Parser flags. 2434 */ 2435 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2436 es->flags = rte_cpu_to_le_32(csum); 2437 /* 2438 * Calculate and set Software Parser offsets and flags. 2439 * These flags a set for custom UDP and IP tunnel packets. 2440 */ 2441 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2442 /* Fill metadata field if needed. */ 2443 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2444 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2445 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2446 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2447 (sizeof(uint16_t) + 2448 sizeof(rte_v128u32_t)), 2449 "invalid Ethernet Segment data size"); 2450 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2451 (sizeof(uint16_t) + 2452 sizeof(struct rte_vlan_hdr) + 2453 2 * RTE_ETHER_ADDR_LEN), 2454 "invalid Ethernet Segment data size"); 2455 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2456 es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE); 2457 es->inline_data = *(unaligned_uint16_t *)psrc; 2458 psrc += sizeof(uint16_t); 2459 pdst = (uint8_t *)(es + 1); 2460 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2461 /* Implement VLAN tag insertion as part inline data. */ 2462 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2463 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2464 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2465 /* Insert VLAN ethertype + VLAN tag. */ 2466 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2467 ((RTE_ETHER_TYPE_VLAN << 16) | 2468 loc->mbuf->vlan_tci); 2469 pdst += sizeof(struct rte_vlan_hdr); 2470 /* Copy the rest two bytes from packet data. */ 2471 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2472 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2473 } else { 2474 /* Fill the gap in the title WQEBB with inline data. */ 2475 rte_mov16(pdst, psrc); 2476 } 2477 } 2478 2479 /** 2480 * Build the Ethernet Segment with entire packet 2481 * data inlining. Checks the boundary of WQEBB and 2482 * ring buffer wrapping, supports Software Parser, 2483 * Checksums and VLAN insertion Tx offload features. 2484 * 2485 * @param txq 2486 * Pointer to TX queue structure. 
2487 * @param loc 2488 * Pointer to burst routine local context. 2489 * @param wqe 2490 * Pointer to WQE to fill with built Ethernet Segment. 2491 * @param vlan 2492 * Length of VLAN tag insertion if any. 2493 * @param inlen 2494 * Length of data to inline (VLAN included, if any). 2495 * @param tso 2496 * TSO flag, set mss field from the packet. 2497 * @param olx 2498 * Configured Tx offloads mask. It is fully defined at 2499 * compile time and may be used for optimization. 2500 * 2501 * @return 2502 * Pointer to the next Data Segment (aligned and wrapped around). 2503 */ 2504 static __rte_always_inline struct mlx5_wqe_dseg * 2505 mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, 2506 struct mlx5_txq_local *restrict loc, 2507 struct mlx5_wqe *restrict wqe, 2508 unsigned int vlan, 2509 unsigned int inlen, 2510 unsigned int tso, 2511 unsigned int olx) 2512 { 2513 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2514 uint32_t csum; 2515 uint8_t *psrc, *pdst; 2516 unsigned int part; 2517 2518 /* 2519 * Calculate and set check sum flags first, dword field 2520 * in segment may be shared with Software Parser flags. 2521 */ 2522 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2523 if (tso) { 2524 csum <<= 24; 2525 csum |= loc->mbuf->tso_segsz; 2526 es->flags = rte_cpu_to_be_32(csum); 2527 } else { 2528 es->flags = rte_cpu_to_le_32(csum); 2529 } 2530 /* 2531 * Calculate and set Software Parser offsets and flags. 2532 * These flags a set for custom UDP and IP tunnel packets. 2533 */ 2534 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2535 /* Fill metadata field if needed. */ 2536 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2537 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2538 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2539 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2540 (sizeof(uint16_t) + 2541 sizeof(rte_v128u32_t)), 2542 "invalid Ethernet Segment data size"); 2543 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2544 (sizeof(uint16_t) + 2545 sizeof(struct rte_vlan_hdr) + 2546 2 * RTE_ETHER_ADDR_LEN), 2547 "invalid Ethernet Segment data size"); 2548 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2549 es->inline_hdr_sz = rte_cpu_to_be_16(inlen); 2550 es->inline_data = *(unaligned_uint16_t *)psrc; 2551 psrc += sizeof(uint16_t); 2552 pdst = (uint8_t *)(es + 1); 2553 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2554 /* Implement VLAN tag insertion as part inline data. */ 2555 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2556 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2557 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2558 /* Insert VLAN ethertype + VLAN tag. */ 2559 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2560 ((RTE_ETHER_TYPE_VLAN << 16) | 2561 loc->mbuf->vlan_tci); 2562 pdst += sizeof(struct rte_vlan_hdr); 2563 /* Copy the rest two bytes from packet data. */ 2564 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2565 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2566 psrc += sizeof(uint16_t); 2567 } else { 2568 /* Fill the gap in the title WQEBB with inline data. */ 2569 rte_mov16(pdst, psrc); 2570 psrc += sizeof(rte_v128u32_t); 2571 } 2572 pdst = (uint8_t *)(es + 2); 2573 MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2574 MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end); 2575 inlen -= MLX5_ESEG_MIN_INLINE_SIZE; 2576 if (!inlen) { 2577 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 2578 return (struct mlx5_wqe_dseg *)pdst; 2579 } 2580 /* 2581 * The WQEBB space availability is checked by caller. 
2582 * Here we should be aware of WQE ring buffer wraparound only. 2583 */ 2584 part = (uint8_t *)txq->wqes_end - pdst; 2585 part = RTE_MIN(part, inlen); 2586 do { 2587 rte_memcpy(pdst, psrc, part); 2588 inlen -= part; 2589 if (likely(!inlen)) { 2590 /* 2591 * If return value is not used by the caller 2592 * the code below will be optimized out. 2593 */ 2594 pdst += part; 2595 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2596 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 2597 pdst = (uint8_t *)txq->wqes; 2598 return (struct mlx5_wqe_dseg *)pdst; 2599 } 2600 pdst = (uint8_t *)txq->wqes; 2601 psrc += part; 2602 part = inlen; 2603 } while (true); 2604 } 2605 2606 /** 2607 * Copy data from chain of mbuf to the specified linear buffer. 2608 * Checksums and VLAN insertion Tx offload features. If data 2609 * from some mbuf copied completely this mbuf is freed. Local 2610 * structure is used to keep the byte stream state. 2611 * 2612 * @param pdst 2613 * Pointer to the destination linear buffer. 2614 * @param loc 2615 * Pointer to burst routine local context. 2616 * @param len 2617 * Length of data to be copied. 2618 * @param must 2619 * Length of data to be copied ignoring no inline hint. 2620 * @param olx 2621 * Configured Tx offloads mask. It is fully defined at 2622 * compile time and may be used for optimization. 2623 * 2624 * @return 2625 * Number of actual copied data bytes. This is always greater than or 2626 * equal to must parameter and might be lesser than len in no inline 2627 * hint flag is encountered. 2628 */ 2629 static __rte_always_inline unsigned int 2630 mlx5_tx_mseg_memcpy(uint8_t *pdst, 2631 struct mlx5_txq_local *restrict loc, 2632 unsigned int len, 2633 unsigned int must, 2634 unsigned int olx __rte_unused) 2635 { 2636 struct rte_mbuf *mbuf; 2637 unsigned int part, dlen, copy = 0; 2638 uint8_t *psrc; 2639 2640 MLX5_ASSERT(len); 2641 MLX5_ASSERT(must <= len); 2642 do { 2643 /* Allow zero length packets, must check first. */ 2644 dlen = rte_pktmbuf_data_len(loc->mbuf); 2645 if (dlen <= loc->mbuf_off) { 2646 /* Exhausted packet, just free. */ 2647 mbuf = loc->mbuf; 2648 loc->mbuf = mbuf->next; 2649 rte_pktmbuf_free_seg(mbuf); 2650 loc->mbuf_off = 0; 2651 MLX5_ASSERT(loc->mbuf_nseg > 1); 2652 MLX5_ASSERT(loc->mbuf); 2653 --loc->mbuf_nseg; 2654 if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) { 2655 unsigned int diff; 2656 2657 if (copy >= must) { 2658 /* 2659 * We already copied the minimal 2660 * requested amount of data. 2661 */ 2662 return copy; 2663 } 2664 diff = must - copy; 2665 if (diff <= rte_pktmbuf_data_len(loc->mbuf)) { 2666 /* 2667 * Copy only the minimal required 2668 * part of the data buffer. 2669 */ 2670 len = diff; 2671 } 2672 } 2673 continue; 2674 } 2675 dlen -= loc->mbuf_off; 2676 psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 2677 loc->mbuf_off); 2678 part = RTE_MIN(len, dlen); 2679 rte_memcpy(pdst, psrc, part); 2680 copy += part; 2681 loc->mbuf_off += part; 2682 len -= part; 2683 if (!len) { 2684 if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) { 2685 loc->mbuf_off = 0; 2686 /* Exhausted packet, just free. */ 2687 mbuf = loc->mbuf; 2688 loc->mbuf = mbuf->next; 2689 rte_pktmbuf_free_seg(mbuf); 2690 loc->mbuf_off = 0; 2691 MLX5_ASSERT(loc->mbuf_nseg >= 1); 2692 --loc->mbuf_nseg; 2693 } 2694 return copy; 2695 } 2696 pdst += part; 2697 } while (true); 2698 } 2699 2700 /** 2701 * Build the Ethernet Segment with inlined data from 2702 * multi-segment packet. 
Checks the boundary of WQEBB 2703 * and ring buffer wrapping, supports Software Parser, 2704 * Checksums and VLAN insertion Tx offload features. 2705 * 2706 * @param txq 2707 * Pointer to TX queue structure. 2708 * @param loc 2709 * Pointer to burst routine local context. 2710 * @param wqe 2711 * Pointer to WQE to fill with built Ethernet Segment. 2712 * @param vlan 2713 * Length of VLAN tag insertion if any. 2714 * @param inlen 2715 * Length of data to inline (VLAN included, if any). 2716 * @param tso 2717 * TSO flag, set mss field from the packet. 2718 * @param olx 2719 * Configured Tx offloads mask. It is fully defined at 2720 * compile time and may be used for optimization. 2721 * 2722 * @return 2723 * Pointer to the next Data Segment (aligned and 2724 * possible NOT wrapped around - caller should do 2725 * wrapping check on its own). 2726 */ 2727 static __rte_always_inline struct mlx5_wqe_dseg * 2728 mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, 2729 struct mlx5_txq_local *restrict loc, 2730 struct mlx5_wqe *restrict wqe, 2731 unsigned int vlan, 2732 unsigned int inlen, 2733 unsigned int tso, 2734 unsigned int olx) 2735 { 2736 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2737 uint32_t csum; 2738 uint8_t *pdst; 2739 unsigned int part, tlen = 0; 2740 2741 /* 2742 * Calculate and set check sum flags first, uint32_t field 2743 * in segment may be shared with Software Parser flags. 2744 */ 2745 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2746 if (tso) { 2747 csum <<= 24; 2748 csum |= loc->mbuf->tso_segsz; 2749 es->flags = rte_cpu_to_be_32(csum); 2750 } else { 2751 es->flags = rte_cpu_to_le_32(csum); 2752 } 2753 /* 2754 * Calculate and set Software Parser offsets and flags. 2755 * These flags a set for custom UDP and IP tunnel packets. 2756 */ 2757 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2758 /* Fill metadata field if needed. */ 2759 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2760 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2761 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2762 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2763 (sizeof(uint16_t) + 2764 sizeof(rte_v128u32_t)), 2765 "invalid Ethernet Segment data size"); 2766 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2767 (sizeof(uint16_t) + 2768 sizeof(struct rte_vlan_hdr) + 2769 2 * RTE_ETHER_ADDR_LEN), 2770 "invalid Ethernet Segment data size"); 2771 MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2772 pdst = (uint8_t *)&es->inline_data; 2773 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2774 /* Implement VLAN tag insertion as part inline data. */ 2775 mlx5_tx_mseg_memcpy(pdst, loc, 2776 2 * RTE_ETHER_ADDR_LEN, 2777 2 * RTE_ETHER_ADDR_LEN, olx); 2778 pdst += 2 * RTE_ETHER_ADDR_LEN; 2779 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2780 ((RTE_ETHER_TYPE_VLAN << 16) | 2781 loc->mbuf->vlan_tci); 2782 pdst += sizeof(struct rte_vlan_hdr); 2783 tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr); 2784 } 2785 MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end); 2786 /* 2787 * The WQEBB space availability is checked by caller. 2788 * Here we should be aware of WQE ring buffer wraparound only. 2789 */ 2790 part = (uint8_t *)txq->wqes_end - pdst; 2791 part = RTE_MIN(part, inlen - tlen); 2792 MLX5_ASSERT(part); 2793 do { 2794 unsigned int copy; 2795 2796 /* 2797 * Copying may be interrupted inside the routine 2798 * if run into no inline hint flag. 2799 */ 2800 copy = tlen >= txq->inlen_mode ? 
0 : (txq->inlen_mode - tlen); 2801 copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx); 2802 tlen += copy; 2803 if (likely(inlen <= tlen) || copy < part) { 2804 es->inline_hdr_sz = rte_cpu_to_be_16(tlen); 2805 pdst += copy; 2806 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2807 return (struct mlx5_wqe_dseg *)pdst; 2808 } 2809 pdst = (uint8_t *)txq->wqes; 2810 part = inlen - tlen; 2811 } while (true); 2812 } 2813 2814 /** 2815 * Build the Data Segment of pointer type. 2816 * 2817 * @param txq 2818 * Pointer to TX queue structure. 2819 * @param loc 2820 * Pointer to burst routine local context. 2821 * @param dseg 2822 * Pointer to WQE to fill with built Data Segment. 2823 * @param buf 2824 * Data buffer to point. 2825 * @param len 2826 * Data buffer length. 2827 * @param olx 2828 * Configured Tx offloads mask. It is fully defined at 2829 * compile time and may be used for optimization. 2830 */ 2831 static __rte_always_inline void 2832 mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq, 2833 struct mlx5_txq_local *restrict loc, 2834 struct mlx5_wqe_dseg *restrict dseg, 2835 uint8_t *buf, 2836 unsigned int len, 2837 unsigned int olx __rte_unused) 2838 2839 { 2840 MLX5_ASSERT(len); 2841 dseg->bcount = rte_cpu_to_be_32(len); 2842 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2843 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2844 } 2845 2846 /** 2847 * Build the Data Segment of pointer type or inline 2848 * if data length is less than buffer in minimal 2849 * Data Segment size. 2850 * 2851 * @param txq 2852 * Pointer to TX queue structure. 2853 * @param loc 2854 * Pointer to burst routine local context. 2855 * @param dseg 2856 * Pointer to WQE to fill with built Data Segment. 2857 * @param buf 2858 * Data buffer to point. 2859 * @param len 2860 * Data buffer length. 2861 * @param olx 2862 * Configured Tx offloads mask. It is fully defined at 2863 * compile time and may be used for optimization. 2864 */ 2865 static __rte_always_inline void 2866 mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, 2867 struct mlx5_txq_local *restrict loc, 2868 struct mlx5_wqe_dseg *restrict dseg, 2869 uint8_t *buf, 2870 unsigned int len, 2871 unsigned int olx __rte_unused) 2872 2873 { 2874 uintptr_t dst, src; 2875 2876 MLX5_ASSERT(len); 2877 if (len > MLX5_DSEG_MIN_INLINE_SIZE) { 2878 dseg->bcount = rte_cpu_to_be_32(len); 2879 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2880 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2881 2882 return; 2883 } 2884 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2885 /* Unrolled implementation of generic rte_memcpy. 
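 * The inlined length never exceeds MLX5_DSEG_MIN_INLINE_SIZE bytes on
 * this path, so the 8/4/2/1-byte tail copies below cover every
 * possible size without a loop.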
*/ 2886 dst = (uintptr_t)&dseg->inline_data[0]; 2887 src = (uintptr_t)buf; 2888 if (len & 0x08) { 2889 #ifdef RTE_ARCH_STRICT_ALIGN 2890 MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t))); 2891 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2892 dst += sizeof(uint32_t); 2893 src += sizeof(uint32_t); 2894 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2895 dst += sizeof(uint32_t); 2896 src += sizeof(uint32_t); 2897 #else 2898 *(uint64_t *)dst = *(unaligned_uint64_t *)src; 2899 dst += sizeof(uint64_t); 2900 src += sizeof(uint64_t); 2901 #endif 2902 } 2903 if (len & 0x04) { 2904 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2905 dst += sizeof(uint32_t); 2906 src += sizeof(uint32_t); 2907 } 2908 if (len & 0x02) { 2909 *(uint16_t *)dst = *(unaligned_uint16_t *)src; 2910 dst += sizeof(uint16_t); 2911 src += sizeof(uint16_t); 2912 } 2913 if (len & 0x01) 2914 *(uint8_t *)dst = *(uint8_t *)src; 2915 } 2916 2917 /** 2918 * Build the Data Segment of inlined data from single 2919 * segment packet, no VLAN insertion. 2920 * 2921 * @param txq 2922 * Pointer to TX queue structure. 2923 * @param loc 2924 * Pointer to burst routine local context. 2925 * @param dseg 2926 * Pointer to WQE to fill with built Data Segment. 2927 * @param buf 2928 * Data buffer to point. 2929 * @param len 2930 * Data buffer length. 2931 * @param olx 2932 * Configured Tx offloads mask. It is fully defined at 2933 * compile time and may be used for optimization. 2934 * 2935 * @return 2936 * Pointer to the next Data Segment after inlined data. 2937 * Ring buffer wraparound check is needed. We do not 2938 * do it here because it may not be needed for the 2939 * last packet in the eMPW session. 2940 */ 2941 static __rte_always_inline struct mlx5_wqe_dseg * 2942 mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, 2943 struct mlx5_txq_local *restrict loc __rte_unused, 2944 struct mlx5_wqe_dseg *restrict dseg, 2945 uint8_t *buf, 2946 unsigned int len, 2947 unsigned int olx __rte_unused) 2948 { 2949 unsigned int part; 2950 uint8_t *pdst; 2951 2952 if (!MLX5_TXOFF_CONFIG(MPW)) { 2953 /* Store the descriptor byte counter for eMPW sessions. */ 2954 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2955 pdst = &dseg->inline_data[0]; 2956 } else { 2957 /* The entire legacy MPW session counter is stored on close. */ 2958 pdst = (uint8_t *)dseg; 2959 } 2960 /* 2961 * The WQEBB space availability is checked by caller. 2962 * Here we should be aware of WQE ring buffer wraparound only. 2963 */ 2964 part = (uint8_t *)txq->wqes_end - pdst; 2965 part = RTE_MIN(part, len); 2966 do { 2967 rte_memcpy(pdst, buf, part); 2968 len -= part; 2969 if (likely(!len)) { 2970 pdst += part; 2971 if (!MLX5_TXOFF_CONFIG(MPW)) 2972 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2973 /* Note: no final wraparound check here. */ 2974 return (struct mlx5_wqe_dseg *)pdst; 2975 } 2976 pdst = (uint8_t *)txq->wqes; 2977 buf += part; 2978 part = len; 2979 } while (true); 2980 } 2981 2982 /** 2983 * Build the Data Segment of inlined data from single 2984 * segment packet with VLAN insertion. 2985 * 2986 * @param txq 2987 * Pointer to TX queue structure. 2988 * @param loc 2989 * Pointer to burst routine local context. 2990 * @param dseg 2991 * Pointer to the dseg fill with built Data Segment. 2992 * @param buf 2993 * Data buffer to point. 2994 * @param len 2995 * Data buffer length. 2996 * @param olx 2997 * Configured Tx offloads mask. It is fully defined at 2998 * compile time and may be used for optimization. 
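 *
 * For illustration (hypothetical values): inserting tci 0x0123 copies
 * the 12 bytes of destination and source MAC addresses first, then
 * writes the VLAN ethertype 0x8100 followed by the tag 0x0123 as a
 * single 4-byte word, and only then inlines the rest of the packet,
 * so the inlined size grows by sizeof(struct rte_vlan_hdr).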
2999 * 3000 * @return 3001 * Pointer to the next Data Segment after inlined data. 3002 * Ring buffer wraparound check is needed. 3003 */ 3004 static __rte_always_inline struct mlx5_wqe_dseg * 3005 mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, 3006 struct mlx5_txq_local *restrict loc __rte_unused, 3007 struct mlx5_wqe_dseg *restrict dseg, 3008 uint8_t *buf, 3009 unsigned int len, 3010 unsigned int olx __rte_unused) 3011 3012 { 3013 unsigned int part; 3014 uint8_t *pdst; 3015 3016 MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE); 3017 static_assert(MLX5_DSEG_MIN_INLINE_SIZE == 3018 (2 * RTE_ETHER_ADDR_LEN), 3019 "invalid Data Segment data size"); 3020 if (!MLX5_TXOFF_CONFIG(MPW)) { 3021 /* Store the descriptor byte counter for eMPW sessions. */ 3022 dseg->bcount = rte_cpu_to_be_32 3023 ((len + sizeof(struct rte_vlan_hdr)) | 3024 MLX5_ETH_WQE_DATA_INLINE); 3025 pdst = &dseg->inline_data[0]; 3026 } else { 3027 /* The entire legacy MPW session counter is stored on close. */ 3028 pdst = (uint8_t *)dseg; 3029 } 3030 memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); 3031 buf += MLX5_DSEG_MIN_INLINE_SIZE; 3032 pdst += MLX5_DSEG_MIN_INLINE_SIZE; 3033 len -= MLX5_DSEG_MIN_INLINE_SIZE; 3034 /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */ 3035 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 3036 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 3037 pdst = (uint8_t *)txq->wqes; 3038 *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) | 3039 loc->mbuf->vlan_tci); 3040 pdst += sizeof(struct rte_vlan_hdr); 3041 /* 3042 * The WQEBB space availability is checked by caller. 3043 * Here we should be aware of WQE ring buffer wraparound only. 3044 */ 3045 part = (uint8_t *)txq->wqes_end - pdst; 3046 part = RTE_MIN(part, len); 3047 do { 3048 rte_memcpy(pdst, buf, part); 3049 len -= part; 3050 if (likely(!len)) { 3051 pdst += part; 3052 if (!MLX5_TXOFF_CONFIG(MPW)) 3053 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 3054 /* Note: no final wraparound check here. */ 3055 return (struct mlx5_wqe_dseg *)pdst; 3056 } 3057 pdst = (uint8_t *)txq->wqes; 3058 buf += part; 3059 part = len; 3060 } while (true); 3061 } 3062 3063 /** 3064 * Build the Ethernet Segment with optionally inlined data with 3065 * VLAN insertion and following Data Segments (if any) from 3066 * multi-segment packet. Used by ordinary send and TSO. 3067 * 3068 * @param txq 3069 * Pointer to TX queue structure. 3070 * @param loc 3071 * Pointer to burst routine local context. 3072 * @param wqe 3073 * Pointer to WQE to fill with built Ethernet/Data Segments. 3074 * @param vlan 3075 * Length of VLAN header to insert, 0 means no VLAN insertion. 3076 * @param inlen 3077 * Data length to inline. For TSO this parameter specifies 3078 * exact value, for ordinary send routine can be aligned by 3079 * caller to provide better WQE space saving and data buffer 3080 * start address alignment. This length includes VLAN header 3081 * being inserted. 3082 * @param tso 3083 * Zero means ordinary send, inlined data can be extended, 3084 * otherwise this is TSO, inlined data length is fixed. 3085 * @param olx 3086 * Configured Tx offloads mask. It is fully defined at 3087 * compile time and may be used for optimization. 3088 * 3089 * @return 3090 * Actual size of built WQE in segments. 
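 *
 * The value counts 16-byte WQE words: as an illustration, if the final
 * Data Segment pointer ends up 64 bytes past the WQE start the routine
 * returns ds = 4, which the callers convert to (ds + 3) / 4 = 1 WQEBB.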
3091 */ 3092 static __rte_always_inline unsigned int 3093 mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, 3094 struct mlx5_txq_local *restrict loc, 3095 struct mlx5_wqe *restrict wqe, 3096 unsigned int vlan, 3097 unsigned int inlen, 3098 unsigned int tso, 3099 unsigned int olx __rte_unused) 3100 { 3101 struct mlx5_wqe_dseg *restrict dseg; 3102 unsigned int ds; 3103 3104 MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen); 3105 loc->mbuf_nseg = NB_SEGS(loc->mbuf); 3106 loc->mbuf_off = 0; 3107 3108 dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx); 3109 if (!loc->mbuf_nseg) 3110 goto dseg_done; 3111 /* 3112 * There are still some mbuf remaining, not inlined. 3113 * The first mbuf may be partially inlined and we 3114 * must process the possible non-zero data offset. 3115 */ 3116 if (loc->mbuf_off) { 3117 unsigned int dlen; 3118 uint8_t *dptr; 3119 3120 /* 3121 * Exhausted packets must be dropped before. 3122 * Non-zero offset means there are some data 3123 * remained in the packet. 3124 */ 3125 MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf)); 3126 MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf)); 3127 dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 3128 loc->mbuf_off); 3129 dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off; 3130 /* 3131 * Build the pointer/minimal data Data Segment. 3132 * Do ring buffer wrapping check in advance. 3133 */ 3134 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3135 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3136 mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx); 3137 /* Store the mbuf to be freed on completion. */ 3138 MLX5_ASSERT(loc->elts_free); 3139 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3140 --loc->elts_free; 3141 ++dseg; 3142 if (--loc->mbuf_nseg == 0) 3143 goto dseg_done; 3144 loc->mbuf = loc->mbuf->next; 3145 loc->mbuf_off = 0; 3146 } 3147 do { 3148 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3149 struct rte_mbuf *mbuf; 3150 3151 /* Zero length segment found, just skip. */ 3152 mbuf = loc->mbuf; 3153 loc->mbuf = loc->mbuf->next; 3154 rte_pktmbuf_free_seg(mbuf); 3155 if (--loc->mbuf_nseg == 0) 3156 break; 3157 } else { 3158 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3159 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3160 mlx5_tx_dseg_iptr 3161 (txq, loc, dseg, 3162 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3163 rte_pktmbuf_data_len(loc->mbuf), olx); 3164 MLX5_ASSERT(loc->elts_free); 3165 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3166 --loc->elts_free; 3167 ++dseg; 3168 if (--loc->mbuf_nseg == 0) 3169 break; 3170 loc->mbuf = loc->mbuf->next; 3171 } 3172 } while (true); 3173 3174 dseg_done: 3175 /* Calculate actual segments used from the dseg pointer. */ 3176 if ((uintptr_t)wqe < (uintptr_t)dseg) 3177 ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE; 3178 else 3179 ds = (((uintptr_t)dseg - (uintptr_t)wqe) + 3180 txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE; 3181 return ds; 3182 } 3183 3184 /** 3185 * Tx one packet function for multi-segment TSO. Supports all 3186 * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs, 3187 * sends one packet per WQE. 3188 * 3189 * This routine is responsible for storing processed mbuf 3190 * into elts ring buffer and update elts_head. 3191 * 3192 * @param txq 3193 * Pointer to TX queue structure. 3194 * @param loc 3195 * Pointer to burst routine local context. 3196 * @param olx 3197 * Configured Tx offloads mask. It is fully defined at 3198 * compile time and may be used for optimization. 
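 *
 * A hypothetical sizing example: a 3-segment TSO packet with 54 header
 * bytes to inline is estimated below as ds = 3 + 2 + 4 = 9 WQE words
 * (one pointer Data Segment per mbuf, the Control and Ethernet
 * Segments, and the inlined headers rounded up to 16-byte words), so
 * (9 + 3) / 4 = 3 free WQEBBs are required.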
3199 * 3200 * @return 3201 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3202 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3203 * Local context variables partially updated. 3204 */ 3205 static __rte_always_inline enum mlx5_txcmp_code 3206 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, 3207 struct mlx5_txq_local *restrict loc, 3208 unsigned int olx) 3209 { 3210 struct mlx5_wqe *restrict wqe; 3211 unsigned int ds, dlen, inlen, ntcp, vlan = 0; 3212 3213 /* 3214 * Calculate data length to be inlined to estimate 3215 * the required space in WQE ring buffer. 3216 */ 3217 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3218 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3219 vlan = sizeof(struct rte_vlan_hdr); 3220 inlen = loc->mbuf->l2_len + vlan + 3221 loc->mbuf->l3_len + loc->mbuf->l4_len; 3222 if (unlikely((!inlen || !loc->mbuf->tso_segsz))) 3223 return MLX5_TXCMP_CODE_ERROR; 3224 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3225 inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len; 3226 /* Packet must contain all TSO headers. */ 3227 if (unlikely(inlen > MLX5_MAX_TSO_HEADER || 3228 inlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3229 inlen > (dlen + vlan))) 3230 return MLX5_TXCMP_CODE_ERROR; 3231 MLX5_ASSERT(inlen >= txq->inlen_mode); 3232 /* 3233 * Check whether there are enough free WQEBBs: 3234 * - Control Segment 3235 * - Ethernet Segment 3236 * - First Segment of inlined Ethernet data 3237 * - ... data continued ... 3238 * - Data Segments of pointer/min inline type 3239 */ 3240 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3241 MLX5_ESEG_MIN_INLINE_SIZE + 3242 MLX5_WSEG_SIZE + 3243 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3244 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3245 return MLX5_TXCMP_CODE_EXIT; 3246 /* Check for maximal WQE size. */ 3247 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3248 return MLX5_TXCMP_CODE_ERROR; 3249 #ifdef MLX5_PMD_SOFT_COUNTERS 3250 /* Update sent data bytes/packets counters. */ 3251 ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) / 3252 loc->mbuf->tso_segsz; 3253 /* 3254 * One will be added for mbuf itself 3255 * at the end of the mlx5_tx_burst from 3256 * loc->pkts_sent field. 3257 */ 3258 --ntcp; 3259 txq->stats.opackets += ntcp; 3260 txq->stats.obytes += dlen + vlan + ntcp * inlen; 3261 #endif 3262 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3263 loc->wqe_last = wqe; 3264 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx); 3265 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx); 3266 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3267 txq->wqe_ci += (ds + 3) / 4; 3268 loc->wqe_free -= (ds + 3) / 4; 3269 return MLX5_TXCMP_CODE_MULTI; 3270 } 3271 3272 /** 3273 * Tx one packet function for multi-segment SEND. Supports all 3274 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3275 * sends one packet per WQE, without any data inlining in 3276 * Ethernet Segment. 3277 * 3278 * This routine is responsible for storing processed mbuf 3279 * into elts ring buffer and update elts_head. 3280 * 3281 * @param txq 3282 * Pointer to TX queue structure. 3283 * @param loc 3284 * Pointer to burst routine local context. 3285 * @param olx 3286 * Configured Tx offloads mask. It is fully defined at 3287 * compile time and may be used for optimization. 3288 * 3289 * @return 3290 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3291 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3292 * Local context variables partially updated. 
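 *
 * Sizing note with illustrative figures: without inlining the WQE is a
 * Control Segment, an Ethernet Segment and one pointer Data Segment
 * per mbuf segment, so a 4-segment chain gives ds = 2 + 4 = 6, i.e.
 * (6 + 3) / 4 = 2 WQEBBs.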
3293 */ 3294 static __rte_always_inline enum mlx5_txcmp_code 3295 mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, 3296 struct mlx5_txq_local *restrict loc, 3297 unsigned int olx) 3298 { 3299 struct mlx5_wqe_dseg *restrict dseg; 3300 struct mlx5_wqe *restrict wqe; 3301 unsigned int ds, nseg; 3302 3303 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); 3304 /* 3305 * No inline at all, it means the CPU cycles saving 3306 * is prioritized at configuration, we should not 3307 * copy any packet data to WQE. 3308 */ 3309 nseg = NB_SEGS(loc->mbuf); 3310 ds = 2 + nseg; 3311 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3312 return MLX5_TXCMP_CODE_EXIT; 3313 /* Check for maximal WQE size. */ 3314 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3315 return MLX5_TXCMP_CODE_ERROR; 3316 /* 3317 * Some Tx offloads may cause an error if 3318 * packet is not long enough, check against 3319 * assumed minimal length. 3320 */ 3321 if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE) 3322 return MLX5_TXCMP_CODE_ERROR; 3323 #ifdef MLX5_PMD_SOFT_COUNTERS 3324 /* Update sent data bytes counter. */ 3325 txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf); 3326 if (MLX5_TXOFF_CONFIG(VLAN) && 3327 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3328 txq->stats.obytes += sizeof(struct rte_vlan_hdr); 3329 #endif 3330 /* 3331 * SEND WQE, one WQEBB: 3332 * - Control Segment, SEND opcode 3333 * - Ethernet Segment, optional VLAN, no inline 3334 * - Data Segments, pointer only type 3335 */ 3336 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3337 loc->wqe_last = wqe; 3338 mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx); 3339 mlx5_tx_eseg_none(txq, loc, wqe, olx); 3340 dseg = &wqe->dseg[0]; 3341 do { 3342 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3343 struct rte_mbuf *mbuf; 3344 3345 /* 3346 * Zero length segment found, have to 3347 * correct total size of WQE in segments. 3348 * It is supposed to be rare occasion, so 3349 * in normal case (no zero length segments) 3350 * we avoid extra writing to the Control 3351 * Segment. 3352 */ 3353 --ds; 3354 wqe->cseg.sq_ds -= RTE_BE32(1); 3355 mbuf = loc->mbuf; 3356 loc->mbuf = mbuf->next; 3357 rte_pktmbuf_free_seg(mbuf); 3358 if (--nseg == 0) 3359 break; 3360 } else { 3361 mlx5_tx_dseg_ptr 3362 (txq, loc, dseg, 3363 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3364 rte_pktmbuf_data_len(loc->mbuf), olx); 3365 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3366 --loc->elts_free; 3367 if (--nseg == 0) 3368 break; 3369 ++dseg; 3370 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3371 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3372 loc->mbuf = loc->mbuf->next; 3373 } 3374 } while (true); 3375 txq->wqe_ci += (ds + 3) / 4; 3376 loc->wqe_free -= (ds + 3) / 4; 3377 return MLX5_TXCMP_CODE_MULTI; 3378 } 3379 3380 /** 3381 * Tx one packet function for multi-segment SEND. Supports all 3382 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3383 * sends one packet per WQE, with data inlining in 3384 * Ethernet Segment and minimal Data Segments. 3385 * 3386 * This routine is responsible for storing processed mbuf 3387 * into elts ring buffer and update elts_head. 3388 * 3389 * @param txq 3390 * Pointer to TX queue structure. 3391 * @param loc 3392 * Pointer to burst routine local context. 3393 * @param olx 3394 * Configured Tx offloads mask. It is fully defined at 3395 * compile time and may be used for optimization. 3396 * 3397 * @return 3398 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 
3399 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3400 * Local context variables partially updated. 3401 */ 3402 static __rte_always_inline enum mlx5_txcmp_code 3403 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq, 3404 struct mlx5_txq_local *restrict loc, 3405 unsigned int olx) 3406 { 3407 struct mlx5_wqe *restrict wqe; 3408 unsigned int ds, inlen, dlen, vlan = 0; 3409 3410 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 3411 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); 3412 /* 3413 * First calculate data length to be inlined 3414 * to estimate the required space for WQE. 3415 */ 3416 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3417 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3418 vlan = sizeof(struct rte_vlan_hdr); 3419 inlen = dlen + vlan; 3420 /* Check against minimal length. */ 3421 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 3422 return MLX5_TXCMP_CODE_ERROR; 3423 MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); 3424 if (inlen > txq->inlen_send || 3425 loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) { 3426 struct rte_mbuf *mbuf; 3427 unsigned int nxlen; 3428 uintptr_t start; 3429 3430 /* 3431 * Packet length exceeds the allowed inline 3432 * data length, check whether the minimal 3433 * inlining is required. 3434 */ 3435 if (txq->inlen_mode) { 3436 MLX5_ASSERT(txq->inlen_mode >= 3437 MLX5_ESEG_MIN_INLINE_SIZE); 3438 MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send); 3439 inlen = txq->inlen_mode; 3440 } else { 3441 if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE || 3442 !vlan || txq->vlan_en) { 3443 /* 3444 * VLAN insertion will be done inside by HW. 3445 * It is not utmost effective - VLAN flag is 3446 * checked twice, but we should proceed the 3447 * inlining length correctly and take into 3448 * account the VLAN header being inserted. 3449 */ 3450 return mlx5_tx_packet_multi_send 3451 (txq, loc, olx); 3452 } 3453 inlen = MLX5_ESEG_MIN_INLINE_SIZE; 3454 } 3455 /* 3456 * Now we know the minimal amount of data is requested 3457 * to inline. Check whether we should inline the buffers 3458 * from the chain beginning to eliminate some mbufs. 3459 */ 3460 mbuf = loc->mbuf; 3461 nxlen = rte_pktmbuf_data_len(mbuf); 3462 if (unlikely(nxlen <= txq->inlen_send)) { 3463 /* We can inline first mbuf at least. */ 3464 if (nxlen < inlen) { 3465 unsigned int smlen; 3466 3467 /* Scan mbufs till inlen filled. */ 3468 do { 3469 smlen = nxlen; 3470 mbuf = NEXT(mbuf); 3471 MLX5_ASSERT(mbuf); 3472 nxlen = rte_pktmbuf_data_len(mbuf); 3473 nxlen += smlen; 3474 } while (unlikely(nxlen < inlen)); 3475 if (unlikely(nxlen > txq->inlen_send)) { 3476 /* We cannot inline entire mbuf. */ 3477 smlen = inlen - smlen; 3478 start = rte_pktmbuf_mtod_offset 3479 (mbuf, uintptr_t, smlen); 3480 goto do_align; 3481 } 3482 } 3483 do { 3484 inlen = nxlen; 3485 mbuf = NEXT(mbuf); 3486 /* There should be not end of packet. */ 3487 MLX5_ASSERT(mbuf); 3488 nxlen = inlen + rte_pktmbuf_data_len(mbuf); 3489 } while (unlikely(nxlen < txq->inlen_send)); 3490 } 3491 start = rte_pktmbuf_mtod(mbuf, uintptr_t); 3492 /* 3493 * Check whether we can do inline to align start 3494 * address of data buffer to cacheline. 3495 */ 3496 do_align: 3497 start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1); 3498 if (unlikely(start)) { 3499 start += inlen; 3500 if (start <= txq->inlen_send) 3501 inlen = start; 3502 } 3503 } 3504 /* 3505 * Check whether there are enough free WQEBBs: 3506 * - Control Segment 3507 * - Ethernet Segment 3508 * - First Segment of inlined Ethernet data 3509 * - ... data continued ... 
3510 * - Data Segments of pointer/min inline type 3511 * 3512 * Estimate the number of Data Segments conservatively, 3513 * supposing no any mbufs is being freed during inlining. 3514 */ 3515 MLX5_ASSERT(inlen <= txq->inlen_send); 3516 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3517 MLX5_ESEG_MIN_INLINE_SIZE + 3518 MLX5_WSEG_SIZE + 3519 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3520 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3521 return MLX5_TXCMP_CODE_EXIT; 3522 /* Check for maximal WQE size. */ 3523 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3524 return MLX5_TXCMP_CODE_ERROR; 3525 #ifdef MLX5_PMD_SOFT_COUNTERS 3526 /* Update sent data bytes/packets counters. */ 3527 txq->stats.obytes += dlen + vlan; 3528 #endif 3529 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3530 loc->wqe_last = wqe; 3531 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx); 3532 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx); 3533 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3534 txq->wqe_ci += (ds + 3) / 4; 3535 loc->wqe_free -= (ds + 3) / 4; 3536 return MLX5_TXCMP_CODE_MULTI; 3537 } 3538 3539 /** 3540 * Tx burst function for multi-segment packets. Supports all 3541 * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs, 3542 * sends one packet per WQE. Function stops sending if it 3543 * encounters the single-segment packet. 3544 * 3545 * This routine is responsible for storing processed mbuf 3546 * into elts ring buffer and update elts_head. 3547 * 3548 * @param txq 3549 * Pointer to TX queue structure. 3550 * @param[in] pkts 3551 * Packets to transmit. 3552 * @param pkts_n 3553 * Number of packets in array. 3554 * @param loc 3555 * Pointer to burst routine local context. 3556 * @param olx 3557 * Configured Tx offloads mask. It is fully defined at 3558 * compile time and may be used for optimization. 3559 * 3560 * @return 3561 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3562 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3563 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3564 * MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered. 3565 * Local context variables updated. 3566 */ 3567 static __rte_always_inline enum mlx5_txcmp_code 3568 mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq, 3569 struct rte_mbuf **restrict pkts, 3570 unsigned int pkts_n, 3571 struct mlx5_txq_local *restrict loc, 3572 unsigned int olx) 3573 { 3574 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 3575 MLX5_ASSERT(pkts_n > loc->pkts_sent); 3576 pkts += loc->pkts_sent + 1; 3577 pkts_n -= loc->pkts_sent; 3578 for (;;) { 3579 enum mlx5_txcmp_code ret; 3580 3581 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); 3582 /* 3583 * Estimate the number of free elts quickly but 3584 * conservatively. Some segment may be fully inlined 3585 * and freed, ignore this here - precise estimation 3586 * is costly. 3587 */ 3588 if (loc->elts_free < NB_SEGS(loc->mbuf)) 3589 return MLX5_TXCMP_CODE_EXIT; 3590 if (MLX5_TXOFF_CONFIG(TSO) && 3591 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) { 3592 /* Proceed with multi-segment TSO. */ 3593 ret = mlx5_tx_packet_multi_tso(txq, loc, olx); 3594 } else if (MLX5_TXOFF_CONFIG(INLINE)) { 3595 /* Proceed with multi-segment SEND with inlining. */ 3596 ret = mlx5_tx_packet_multi_inline(txq, loc, olx); 3597 } else { 3598 /* Proceed with multi-segment SEND w/o inlining. 
*/ 3599 ret = mlx5_tx_packet_multi_send(txq, loc, olx); 3600 } 3601 if (ret == MLX5_TXCMP_CODE_EXIT) 3602 return MLX5_TXCMP_CODE_EXIT; 3603 if (ret == MLX5_TXCMP_CODE_ERROR) 3604 return MLX5_TXCMP_CODE_ERROR; 3605 /* WQE is built, go to the next packet. */ 3606 ++loc->pkts_sent; 3607 --pkts_n; 3608 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3609 return MLX5_TXCMP_CODE_EXIT; 3610 loc->mbuf = *pkts++; 3611 if (pkts_n > 1) 3612 rte_prefetch0(*pkts); 3613 if (likely(NB_SEGS(loc->mbuf) > 1)) 3614 continue; 3615 /* Here ends the series of multi-segment packets. */ 3616 if (MLX5_TXOFF_CONFIG(TSO) && 3617 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3618 return MLX5_TXCMP_CODE_TSO; 3619 return MLX5_TXCMP_CODE_SINGLE; 3620 } 3621 MLX5_ASSERT(false); 3622 } 3623 3624 /** 3625 * Tx burst function for single-segment packets with TSO. 3626 * Supports all types of Tx offloads, except multi-packets. 3627 * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE. 3628 * Function stops sending if it encounters the multi-segment 3629 * packet or packet without TSO requested. 3630 * 3631 * The routine is responsible for storing processed mbuf 3632 * into elts ring buffer and update elts_head if inline 3633 * offloads is requested due to possible early freeing 3634 * of the inlined mbufs (can not store pkts array in elts 3635 * as a batch). 3636 * 3637 * @param txq 3638 * Pointer to TX queue structure. 3639 * @param[in] pkts 3640 * Packets to transmit. 3641 * @param pkts_n 3642 * Number of packets in array. 3643 * @param loc 3644 * Pointer to burst routine local context. 3645 * @param olx 3646 * Configured Tx offloads mask. It is fully defined at 3647 * compile time and may be used for optimization. 3648 * 3649 * @return 3650 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3651 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3652 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3653 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3654 * Local context variables updated. 3655 */ 3656 static __rte_always_inline enum mlx5_txcmp_code 3657 mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq, 3658 struct rte_mbuf **restrict pkts, 3659 unsigned int pkts_n, 3660 struct mlx5_txq_local *restrict loc, 3661 unsigned int olx) 3662 { 3663 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 3664 MLX5_ASSERT(pkts_n > loc->pkts_sent); 3665 pkts += loc->pkts_sent + 1; 3666 pkts_n -= loc->pkts_sent; 3667 for (;;) { 3668 struct mlx5_wqe_dseg *restrict dseg; 3669 struct mlx5_wqe *restrict wqe; 3670 unsigned int ds, dlen, hlen, ntcp, vlan = 0; 3671 uint8_t *dptr; 3672 3673 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 3674 dlen = rte_pktmbuf_data_len(loc->mbuf); 3675 if (MLX5_TXOFF_CONFIG(VLAN) && 3676 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 3677 vlan = sizeof(struct rte_vlan_hdr); 3678 } 3679 /* 3680 * First calculate the WQE size to check 3681 * whether we have enough space in ring buffer. 3682 */ 3683 hlen = loc->mbuf->l2_len + vlan + 3684 loc->mbuf->l3_len + loc->mbuf->l4_len; 3685 if (unlikely((!hlen || !loc->mbuf->tso_segsz))) 3686 return MLX5_TXCMP_CODE_ERROR; 3687 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3688 hlen += loc->mbuf->outer_l2_len + 3689 loc->mbuf->outer_l3_len; 3690 /* Segment must contain all TSO headers. 
*/ 3691 if (unlikely(hlen > MLX5_MAX_TSO_HEADER || 3692 hlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3693 hlen > (dlen + vlan))) 3694 return MLX5_TXCMP_CODE_ERROR; 3695 /* 3696 * Check whether there are enough free WQEBBs: 3697 * - Control Segment 3698 * - Ethernet Segment 3699 * - First Segment of inlined Ethernet data 3700 * - ... data continued ... 3701 * - Finishing Data Segment of pointer type 3702 */ 3703 ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE + 3704 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3705 if (loc->wqe_free < ((ds + 3) / 4)) 3706 return MLX5_TXCMP_CODE_EXIT; 3707 #ifdef MLX5_PMD_SOFT_COUNTERS 3708 /* Update sent data bytes/packets counters. */ 3709 ntcp = (dlen + vlan - hlen + 3710 loc->mbuf->tso_segsz - 1) / 3711 loc->mbuf->tso_segsz; 3712 /* 3713 * One will be added for mbuf itself at the end 3714 * of the mlx5_tx_burst from loc->pkts_sent field. 3715 */ 3716 --ntcp; 3717 txq->stats.opackets += ntcp; 3718 txq->stats.obytes += dlen + vlan + ntcp * hlen; 3719 #endif 3720 /* 3721 * Build the TSO WQE: 3722 * - Control Segment 3723 * - Ethernet Segment with hlen bytes inlined 3724 * - Data Segment of pointer type 3725 */ 3726 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3727 loc->wqe_last = wqe; 3728 mlx5_tx_cseg_init(txq, loc, wqe, ds, 3729 MLX5_OPCODE_TSO, olx); 3730 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx); 3731 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan; 3732 dlen -= hlen - vlan; 3733 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 3734 /* 3735 * WQE is built, update the loop parameters 3736 * and go to the next packet. 3737 */ 3738 txq->wqe_ci += (ds + 3) / 4; 3739 loc->wqe_free -= (ds + 3) / 4; 3740 if (MLX5_TXOFF_CONFIG(INLINE)) 3741 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3742 --loc->elts_free; 3743 ++loc->pkts_sent; 3744 --pkts_n; 3745 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3746 return MLX5_TXCMP_CODE_EXIT; 3747 loc->mbuf = *pkts++; 3748 if (pkts_n > 1) 3749 rte_prefetch0(*pkts); 3750 if (MLX5_TXOFF_CONFIG(MULTI) && 3751 unlikely(NB_SEGS(loc->mbuf) > 1)) 3752 return MLX5_TXCMP_CODE_MULTI; 3753 if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) 3754 return MLX5_TXCMP_CODE_SINGLE; 3755 /* Continue with the next TSO packet. */ 3756 } 3757 MLX5_ASSERT(false); 3758 } 3759 3760 /** 3761 * Analyze the packet and select the best method to send. 3762 * 3763 * @param txq 3764 * Pointer to TX queue structure. 3765 * @param loc 3766 * Pointer to burst routine local context. 3767 * @param olx 3768 * Configured Tx offloads mask. It is fully defined at 3769 * compile time and may be used for optimization. 3770 * @param newp 3771 * The predefined flag whether do complete check for 3772 * multi-segment packets and TSO. 3773 * 3774 * @return 3775 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3776 * MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO. 3777 * MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND. 3778 * MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW. 3779 */ 3780 static __rte_always_inline enum mlx5_txcmp_code 3781 mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq, 3782 struct mlx5_txq_local *restrict loc, 3783 unsigned int olx, 3784 bool newp) 3785 { 3786 /* Check for multi-segment packet. */ 3787 if (newp && 3788 MLX5_TXOFF_CONFIG(MULTI) && 3789 unlikely(NB_SEGS(loc->mbuf) > 1)) 3790 return MLX5_TXCMP_CODE_MULTI; 3791 /* Check for TSO packet. 
*/
3792 if (newp &&
3793 MLX5_TXOFF_CONFIG(TSO) &&
3794 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
3795 return MLX5_TXCMP_CODE_TSO;
3796 /* Check if eMPW is enabled at all. */
3797 if (!MLX5_TXOFF_CONFIG(EMPW))
3798 return MLX5_TXCMP_CODE_SINGLE;
3799 /* Check if eMPW can be engaged. */
3800 if (MLX5_TXOFF_CONFIG(VLAN) &&
3801 unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) &&
3802 (!MLX5_TXOFF_CONFIG(INLINE) ||
3803 unlikely((rte_pktmbuf_data_len(loc->mbuf) +
3804 sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) {
3805 /*
3806 * eMPW does not support the VLAN insertion offload,
3807 * so we would have to inline the entire packet,
3808 * but the packet is too long for inlining.
3809 */
3810 return MLX5_TXCMP_CODE_SINGLE;
3811 }
3812 return MLX5_TXCMP_CODE_EMPW;
3813 }
3814 
3815 /**
3816 * Check whether the next packet attributes match the eMPW batch ones.
3817 * In addition, for legacy MPW the packet length is checked as well.
3818 *
3819 * @param txq
3820 * Pointer to TX queue structure.
3821 * @param es
3822 * Pointer to Ethernet Segment of eMPW batch.
3823 * @param loc
3824 * Pointer to burst routine local context.
3825 * @param dlen
3826 * Length of previous packet in MPW descriptor.
3827 * @param olx
3828 * Configured Tx offloads mask. It is fully defined at
3829 * compile time and may be used for optimization.
3830 *
3831 * @return
3832 * true - the packet matches the eMPW batch attributes.
3833 * false - no match, eMPW should be restarted.
3834 */
3835 static __rte_always_inline bool
3836 mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
3837 struct mlx5_wqe_eseg *restrict es,
3838 struct mlx5_txq_local *restrict loc,
3839 uint32_t dlen,
3840 unsigned int olx)
3841 {
3842 uint8_t swp_flags = 0;
3843 
3844 /* Compare the checksum flags, if any. */
3845 if (MLX5_TXOFF_CONFIG(CSUM) &&
3846 txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
3847 return false;
3848 /* Compare the Software Parser offsets and flags. */
3849 if (MLX5_TXOFF_CONFIG(SWP) &&
3850 (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
3851 es->swp_flags != swp_flags))
3852 return false;
3853 /* Fill metadata field if needed. */
3854 if (MLX5_TXOFF_CONFIG(METADATA) &&
3855 es->metadata != (loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
3856 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0))
3857 return false;
3858 /* Legacy MPW can send packets with the same length only. */
3859 if (MLX5_TXOFF_CONFIG(MPW) &&
3860 dlen != rte_pktmbuf_data_len(loc->mbuf))
3861 return false;
3862 /* There must be no VLAN packets in eMPW loop. */
3863 if (MLX5_TXOFF_CONFIG(VLAN))
3864 MLX5_ASSERT(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
3865 return true;
3866 }
3867 
3868 /*
3869 * Update send loop variables and WQE for eMPW loop
3870 * without data inlining. The number of Data Segments is
3871 * equal to the number of sent packets.
3872 *
3873 * @param txq
3874 * Pointer to TX queue structure.
3875 * @param loc
3876 * Pointer to burst routine local context.
3877 * @param ds
3878 * Number of packets/Data Segments sent.
3879 * @param slen
3880 * Accumulated statistics, bytes sent.
3881 * @param olx
3882 * Configured Tx offloads mask. It is fully defined at
3883 * compile time and may be used for optimization.
3884 *
3885 * @return
3886 * None. The Tx queue WQE indexes and the local context
3887 * counters are updated in place.
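 *
 * Accounting note: without inlining each packet contributes one
 * pointer Data Segment, so a batch of N packets occupies N + 2
 * segments (Control + Ethernet + N data) and (N + 2 + 3) / 4 WQEBBs;
 * e.g. a batch of 6 packets takes 8 segments, i.e. 2 WQEBBs.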
3888 */
3889 static __rte_always_inline void
3890 mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
3891 struct mlx5_txq_local *restrict loc,
3892 unsigned int ds,
3893 unsigned int slen,
3894 unsigned int olx __rte_unused)
3895 {
3896 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
3897 #ifdef MLX5_PMD_SOFT_COUNTERS
3898 /* Update sent data bytes counter. */
3899 txq->stats.obytes += slen;
3900 #else
3901 (void)slen;
3902 #endif
3903 loc->elts_free -= ds;
3904 loc->pkts_sent += ds;
3905 ds += 2;
3906 loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
3907 txq->wqe_ci += (ds + 3) / 4;
3908 loc->wqe_free -= (ds + 3) / 4;
3909 }
3910 
3911 /*
3912 * Update send loop variables and WQE for eMPW loop
3913 * with data inlining. Receives the total size of the
3914 * descriptors and data pushed to the WQE.
3915 *
3916 * @param txq
3917 * Pointer to TX queue structure.
3918 * @param loc
3919 * Pointer to burst routine local context.
3920 * @param len
3921 * Total size of descriptor/data in bytes.
3922 * @param slen
3923 * Accumulated statistics, data bytes sent.
3924 * @param olx
3925 * Configured Tx offloads mask. It is fully defined at
3926 * compile time and may be used for optimization.
3927 *
3928 * @return
3929 * None. The Tx queue WQE indexes and the local context
3930 * counters are updated in place.
3931 */
3932 static __rte_always_inline void
3933 mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
3934 struct mlx5_txq_local *restrict loc,
3935 unsigned int len,
3936 unsigned int slen,
3937 unsigned int olx __rte_unused)
3938 {
3939 struct mlx5_wqe_dseg *dseg = &loc->wqe_last->dseg[0];
3940 
3941 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
3942 #ifdef MLX5_PMD_SOFT_COUNTERS
3943 /* Update sent data bytes counter. */
3944 txq->stats.obytes += slen;
3945 #else
3946 (void)slen;
3947 #endif
3948 if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
3949 /*
3950 * If the legacy MPW session contains inline packets,
3951 * set the length of the single inline Data Segment
3952 * and align the total length to the segment size.
3953 */
3954 MLX5_ASSERT(len > sizeof(dseg->bcount));
3955 dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
3956 MLX5_ETH_WQE_DATA_INLINE);
3957 len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
3958 } else {
3959 /*
3960 * The session is not legacy MPW or contains
3961 * data buffer pointer segments.
3962 */
3963 MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
3964 len = len / MLX5_WSEG_SIZE + 2;
3965 }
3966 loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
3967 txq->wqe_ci += (len + 3) / 4;
3968 loc->wqe_free -= (len + 3) / 4;
3969 }
3970 
3971 /**
3972 * The set of Tx burst functions for single-segment packets
3973 * without TSO and with Multi-Packet Writing feature support.
3974 * Supports all types of Tx offloads, except multi-segment
3975 * packets and TSO.
3976 *
3977 * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
3978 * as many packets per WQE as it can. If eMPW is not configured
3979 * or the packet can not be sent with eMPW (VLAN insertion)
3980 * the ordinary SEND opcode is used and only one packet is
3981 * placed in the WQE.
3982 *
3983 * The functions stop sending if a multi-segment packet or a
3984 * packet with TSO requested is encountered.
3985 *
3986 * The routines are responsible for storing the processed mbuf
3987 * into the elts ring buffer and updating elts_head if the inlining
3988 * offload is requested. Otherwise copying mbufs to elts can be
3989 * postponed and completed at the end of the burst routine.
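 *
 * Two implementations of this set follow: mlx5_tx_burst_empw_simple()
 * for the no-inline case and mlx5_tx_burst_empw_inline() for the case
 * with data inlining (and optional VLAN insertion); both are invoked
 * from mlx5_tx_burst_single() depending on the configured offloads.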
3990 * 3991 * @param txq 3992 * Pointer to TX queue structure. 3993 * @param[in] pkts 3994 * Packets to transmit. 3995 * @param pkts_n 3996 * Number of packets in array. 3997 * @param loc 3998 * Pointer to burst routine local context. 3999 * @param olx 4000 * Configured Tx offloads mask. It is fully defined at 4001 * compile time and may be used for optimization. 4002 * 4003 * @return 4004 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 4005 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 4006 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 4007 * MLX5_TXCMP_CODE_TSO - TSO packet encountered. 4008 * MLX5_TXCMP_CODE_SINGLE - used inside functions set. 4009 * MLX5_TXCMP_CODE_EMPW - used inside functions set. 4010 * 4011 * Local context variables updated. 4012 * 4013 * 4014 * The routine sends packets with MLX5_OPCODE_EMPW 4015 * without inlining, this is dedicated optimized branch. 4016 * No VLAN insertion is supported. 4017 */ 4018 static __rte_always_inline enum mlx5_txcmp_code 4019 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq, 4020 struct rte_mbuf **restrict pkts, 4021 unsigned int pkts_n, 4022 struct mlx5_txq_local *restrict loc, 4023 unsigned int olx) 4024 { 4025 /* 4026 * Subroutine is the part of mlx5_tx_burst_single() 4027 * and sends single-segment packet with eMPW opcode 4028 * without data inlining. 4029 */ 4030 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); 4031 MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); 4032 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4033 MLX5_ASSERT(pkts_n > loc->pkts_sent); 4034 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 4035 pkts += loc->pkts_sent + 1; 4036 pkts_n -= loc->pkts_sent; 4037 for (;;) { 4038 struct mlx5_wqe_dseg *restrict dseg; 4039 struct mlx5_wqe_eseg *restrict eseg; 4040 enum mlx5_txcmp_code ret; 4041 unsigned int part, loop; 4042 unsigned int slen = 0; 4043 4044 next_empw: 4045 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4046 part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? 4047 MLX5_MPW_MAX_PACKETS : 4048 MLX5_EMPW_MAX_PACKETS); 4049 if (unlikely(loc->elts_free < part)) { 4050 /* We have no enough elts to save all mbufs. */ 4051 if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS)) 4052 return MLX5_TXCMP_CODE_EXIT; 4053 /* But we still able to send at least minimal eMPW. */ 4054 part = loc->elts_free; 4055 } 4056 /* Check whether we have enough WQEs */ 4057 if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) { 4058 if (unlikely(loc->wqe_free < 4059 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 4060 return MLX5_TXCMP_CODE_EXIT; 4061 part = (loc->wqe_free * 4) - 2; 4062 } 4063 if (likely(part > 1)) 4064 rte_prefetch0(*pkts); 4065 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4066 /* 4067 * Build eMPW title WQEBB: 4068 * - Control Segment, eMPW opcode 4069 * - Ethernet Segment, no inline 4070 */ 4071 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2, 4072 MLX5_OPCODE_ENHANCED_MPSW, olx); 4073 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 4074 olx & ~MLX5_TXOFF_CONFIG_VLAN); 4075 eseg = &loc->wqe_last->eseg; 4076 dseg = &loc->wqe_last->dseg[0]; 4077 loop = part; 4078 /* Store the packet length for legacy MPW. */ 4079 if (MLX5_TXOFF_CONFIG(MPW)) 4080 eseg->mss = rte_cpu_to_be_16 4081 (rte_pktmbuf_data_len(loc->mbuf)); 4082 for (;;) { 4083 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 4084 #ifdef MLX5_PMD_SOFT_COUNTERS 4085 /* Update sent data bytes counter. 
*/ 4086 slen += dlen; 4087 #endif 4088 mlx5_tx_dseg_ptr 4089 (txq, loc, dseg, 4090 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 4091 dlen, olx); 4092 if (unlikely(--loop == 0)) 4093 break; 4094 loc->mbuf = *pkts++; 4095 if (likely(loop > 1)) 4096 rte_prefetch0(*pkts); 4097 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4098 /* 4099 * Unroll the completion code to avoid 4100 * returning variable value - it results in 4101 * unoptimized sequent checking in caller. 4102 */ 4103 if (ret == MLX5_TXCMP_CODE_MULTI) { 4104 part -= loop; 4105 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4106 if (unlikely(!loc->elts_free || 4107 !loc->wqe_free)) 4108 return MLX5_TXCMP_CODE_EXIT; 4109 return MLX5_TXCMP_CODE_MULTI; 4110 } 4111 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4112 if (ret == MLX5_TXCMP_CODE_TSO) { 4113 part -= loop; 4114 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4115 if (unlikely(!loc->elts_free || 4116 !loc->wqe_free)) 4117 return MLX5_TXCMP_CODE_EXIT; 4118 return MLX5_TXCMP_CODE_TSO; 4119 } 4120 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4121 part -= loop; 4122 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4123 if (unlikely(!loc->elts_free || 4124 !loc->wqe_free)) 4125 return MLX5_TXCMP_CODE_EXIT; 4126 return MLX5_TXCMP_CODE_SINGLE; 4127 } 4128 if (ret != MLX5_TXCMP_CODE_EMPW) { 4129 MLX5_ASSERT(false); 4130 part -= loop; 4131 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4132 return MLX5_TXCMP_CODE_ERROR; 4133 } 4134 /* 4135 * Check whether packet parameters coincide 4136 * within assumed eMPW batch: 4137 * - check sum settings 4138 * - metadata value 4139 * - software parser settings 4140 * - packets length (legacy MPW only) 4141 */ 4142 if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) { 4143 MLX5_ASSERT(loop); 4144 part -= loop; 4145 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 4146 if (unlikely(!loc->elts_free || 4147 !loc->wqe_free)) 4148 return MLX5_TXCMP_CODE_EXIT; 4149 pkts_n -= part; 4150 goto next_empw; 4151 } 4152 /* Packet attributes match, continue the same eMPW. */ 4153 ++dseg; 4154 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 4155 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 4156 } 4157 /* eMPW is built successfully, update loop parameters. */ 4158 MLX5_ASSERT(!loop); 4159 MLX5_ASSERT(pkts_n >= part); 4160 #ifdef MLX5_PMD_SOFT_COUNTERS 4161 /* Update sent data bytes counter. */ 4162 txq->stats.obytes += slen; 4163 #endif 4164 loc->elts_free -= part; 4165 loc->pkts_sent += part; 4166 txq->wqe_ci += (2 + part + 3) / 4; 4167 loc->wqe_free -= (2 + part + 3) / 4; 4168 pkts_n -= part; 4169 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 4170 return MLX5_TXCMP_CODE_EXIT; 4171 loc->mbuf = *pkts++; 4172 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4173 if (unlikely(ret != MLX5_TXCMP_CODE_EMPW)) 4174 return ret; 4175 /* Continue sending eMPW batches. */ 4176 } 4177 MLX5_ASSERT(false); 4178 } 4179 4180 /** 4181 * The routine sends packets with MLX5_OPCODE_EMPW 4182 * with inlining, optionally supports VLAN insertion. 4183 */ 4184 static __rte_always_inline enum mlx5_txcmp_code 4185 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, 4186 struct rte_mbuf **restrict pkts, 4187 unsigned int pkts_n, 4188 struct mlx5_txq_local *restrict loc, 4189 unsigned int olx) 4190 { 4191 /* 4192 * Subroutine is the part of mlx5_tx_burst_single() 4193 * and sends single-segment packet with eMPW opcode 4194 * with data inlining. 
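 *
 * Packets longer than txq->inlen_empw or carrying the dynamic
 * no-inline hint are attached by pointer Data Segments instead and
 * their mbufs are stored in elts, while completely inlined mbufs are
 * freed immediately after copying.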
4195 */ 4196 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 4197 MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); 4198 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4199 MLX5_ASSERT(pkts_n > loc->pkts_sent); 4200 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 4201 pkts += loc->pkts_sent + 1; 4202 pkts_n -= loc->pkts_sent; 4203 for (;;) { 4204 struct mlx5_wqe_dseg *restrict dseg; 4205 struct mlx5_wqe_eseg *restrict eseg; 4206 enum mlx5_txcmp_code ret; 4207 unsigned int room, part, nlim; 4208 unsigned int slen = 0; 4209 4210 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4211 /* 4212 * Limits the amount of packets in one WQE 4213 * to improve CQE latency generation. 4214 */ 4215 nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? 4216 MLX5_MPW_INLINE_MAX_PACKETS : 4217 MLX5_EMPW_MAX_PACKETS); 4218 /* Check whether we have minimal amount WQEs */ 4219 if (unlikely(loc->wqe_free < 4220 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 4221 return MLX5_TXCMP_CODE_EXIT; 4222 if (likely(pkts_n > 1)) 4223 rte_prefetch0(*pkts); 4224 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4225 /* 4226 * Build eMPW title WQEBB: 4227 * - Control Segment, eMPW opcode, zero DS 4228 * - Ethernet Segment, no inline 4229 */ 4230 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0, 4231 MLX5_OPCODE_ENHANCED_MPSW, olx); 4232 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 4233 olx & ~MLX5_TXOFF_CONFIG_VLAN); 4234 eseg = &loc->wqe_last->eseg; 4235 dseg = &loc->wqe_last->dseg[0]; 4236 /* Store the packet length for legacy MPW. */ 4237 if (MLX5_TXOFF_CONFIG(MPW)) 4238 eseg->mss = rte_cpu_to_be_16 4239 (rte_pktmbuf_data_len(loc->mbuf)); 4240 room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE, 4241 loc->wqe_free) * MLX5_WQE_SIZE - 4242 MLX5_WQE_CSEG_SIZE - 4243 MLX5_WQE_ESEG_SIZE; 4244 /* Limit the room for legacy MPW sessions for performance. */ 4245 if (MLX5_TXOFF_CONFIG(MPW)) 4246 room = RTE_MIN(room, 4247 RTE_MAX(txq->inlen_empw + 4248 sizeof(dseg->bcount) + 4249 (MLX5_TXOFF_CONFIG(VLAN) ? 4250 sizeof(struct rte_vlan_hdr) : 0), 4251 MLX5_MPW_INLINE_MAX_PACKETS * 4252 MLX5_WQE_DSEG_SIZE)); 4253 /* Build WQE till we have space, packets and resources. */ 4254 part = room; 4255 for (;;) { 4256 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 4257 uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 4258 unsigned int tlen; 4259 4260 MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); 4261 MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0); 4262 MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end); 4263 /* 4264 * Some Tx offloads may cause an error if 4265 * packet is not long enough, check against 4266 * assumed minimal length. 4267 */ 4268 if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) { 4269 part -= room; 4270 if (unlikely(!part)) 4271 return MLX5_TXCMP_CODE_ERROR; 4272 /* 4273 * We have some successfully built 4274 * packet Data Segments to send. 4275 */ 4276 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4277 return MLX5_TXCMP_CODE_ERROR; 4278 } 4279 /* Inline or not inline - that's the Question. */ 4280 if (dlen > txq->inlen_empw || 4281 loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) 4282 goto pointer_empw; 4283 if (MLX5_TXOFF_CONFIG(MPW)) { 4284 tlen = dlen; 4285 if (part == room) { 4286 /* Open new inline MPW session. */ 4287 tlen += sizeof(dseg->bcount); 4288 dseg->bcount = RTE_BE32(0); 4289 dseg = RTE_PTR_ADD 4290 (dseg, sizeof(dseg->bcount)); 4291 } else { 4292 /* 4293 * No pointer and inline descriptor 4294 * intermix for legacy MPW sessions. 
4295 */ 4296 if (loc->wqe_last->dseg[0].bcount) 4297 break; 4298 } 4299 } else { 4300 tlen = sizeof(dseg->bcount) + dlen; 4301 } 4302 /* Inline entire packet, optional VLAN insertion. */ 4303 if (MLX5_TXOFF_CONFIG(VLAN) && 4304 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4305 /* 4306 * The packet length must be checked in 4307 * mlx5_tx_able_to_empw() and packet 4308 * fits into inline length guaranteed. 4309 */ 4310 MLX5_ASSERT((dlen + 4311 sizeof(struct rte_vlan_hdr)) <= 4312 txq->inlen_empw); 4313 tlen += sizeof(struct rte_vlan_hdr); 4314 if (room < tlen) 4315 break; 4316 dseg = mlx5_tx_dseg_vlan(txq, loc, dseg, 4317 dptr, dlen, olx); 4318 #ifdef MLX5_PMD_SOFT_COUNTERS 4319 /* Update sent data bytes counter. */ 4320 slen += sizeof(struct rte_vlan_hdr); 4321 #endif 4322 } else { 4323 if (room < tlen) 4324 break; 4325 dseg = mlx5_tx_dseg_empw(txq, loc, dseg, 4326 dptr, dlen, olx); 4327 } 4328 if (!MLX5_TXOFF_CONFIG(MPW)) 4329 tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE); 4330 MLX5_ASSERT(room >= tlen); 4331 room -= tlen; 4332 /* 4333 * Packet data are completely inlined, 4334 * free the packet immediately. 4335 */ 4336 rte_pktmbuf_free_seg(loc->mbuf); 4337 goto next_mbuf; 4338 pointer_empw: 4339 /* 4340 * No pointer and inline descriptor 4341 * intermix for legacy MPW sessions. 4342 */ 4343 if (MLX5_TXOFF_CONFIG(MPW) && 4344 part != room && 4345 loc->wqe_last->dseg[0].bcount == RTE_BE32(0)) 4346 break; 4347 /* 4348 * Not inlinable VLAN packets are 4349 * proceeded outside of this routine. 4350 */ 4351 MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); 4352 if (MLX5_TXOFF_CONFIG(VLAN)) 4353 MLX5_ASSERT(!(loc->mbuf->ol_flags & 4354 PKT_TX_VLAN_PKT)); 4355 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 4356 /* We have to store mbuf in elts.*/ 4357 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 4358 room -= MLX5_WQE_DSEG_SIZE; 4359 /* Ring buffer wraparound is checked at the loop end.*/ 4360 ++dseg; 4361 next_mbuf: 4362 #ifdef MLX5_PMD_SOFT_COUNTERS 4363 /* Update sent data bytes counter. */ 4364 slen += dlen; 4365 #endif 4366 loc->pkts_sent++; 4367 loc->elts_free--; 4368 pkts_n--; 4369 if (unlikely(!pkts_n || !loc->elts_free)) { 4370 /* 4371 * We have no resources/packets to 4372 * continue build descriptors. 4373 */ 4374 part -= room; 4375 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4376 return MLX5_TXCMP_CODE_EXIT; 4377 } 4378 loc->mbuf = *pkts++; 4379 if (likely(pkts_n > 1)) 4380 rte_prefetch0(*pkts); 4381 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4382 /* 4383 * Unroll the completion code to avoid 4384 * returning variable value - it results in 4385 * unoptimized sequent checking in caller. 
4386 */ 4387 if (ret == MLX5_TXCMP_CODE_MULTI) { 4388 part -= room; 4389 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4390 if (unlikely(!loc->elts_free || 4391 !loc->wqe_free)) 4392 return MLX5_TXCMP_CODE_EXIT; 4393 return MLX5_TXCMP_CODE_MULTI; 4394 } 4395 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4396 if (ret == MLX5_TXCMP_CODE_TSO) { 4397 part -= room; 4398 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4399 if (unlikely(!loc->elts_free || 4400 !loc->wqe_free)) 4401 return MLX5_TXCMP_CODE_EXIT; 4402 return MLX5_TXCMP_CODE_TSO; 4403 } 4404 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4405 part -= room; 4406 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4407 if (unlikely(!loc->elts_free || 4408 !loc->wqe_free)) 4409 return MLX5_TXCMP_CODE_EXIT; 4410 return MLX5_TXCMP_CODE_SINGLE; 4411 } 4412 if (ret != MLX5_TXCMP_CODE_EMPW) { 4413 MLX5_ASSERT(false); 4414 part -= room; 4415 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4416 return MLX5_TXCMP_CODE_ERROR; 4417 } 4418 /* Check if we have minimal room left. */ 4419 nlim--; 4420 if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE)) 4421 break; 4422 /* 4423 * Check whether packet parameters coincide 4424 * within assumed eMPW batch: 4425 * - check sum settings 4426 * - metadata value 4427 * - software parser settings 4428 * - packets length (legacy MPW only) 4429 */ 4430 if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) 4431 break; 4432 /* Packet attributes match, continue the same eMPW. */ 4433 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 4434 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 4435 } 4436 /* 4437 * We get here to close an existing eMPW 4438 * session and start the new one. 4439 */ 4440 MLX5_ASSERT(pkts_n); 4441 part -= room; 4442 if (unlikely(!part)) 4443 return MLX5_TXCMP_CODE_EXIT; 4444 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4445 if (unlikely(!loc->elts_free || 4446 !loc->wqe_free)) 4447 return MLX5_TXCMP_CODE_EXIT; 4448 /* Continue the loop with new eMPW session. */ 4449 } 4450 MLX5_ASSERT(false); 4451 } 4452 4453 /** 4454 * The routine sends packets with ordinary MLX5_OPCODE_SEND. 4455 * Data inlining and VLAN insertion are supported. 4456 */ 4457 static __rte_always_inline enum mlx5_txcmp_code 4458 mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, 4459 struct rte_mbuf **restrict pkts, 4460 unsigned int pkts_n, 4461 struct mlx5_txq_local *restrict loc, 4462 unsigned int olx) 4463 { 4464 /* 4465 * Subroutine is the part of mlx5_tx_burst_single() 4466 * and sends single-segment packet with SEND opcode. 4467 */ 4468 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4469 MLX5_ASSERT(pkts_n > loc->pkts_sent); 4470 pkts += loc->pkts_sent + 1; 4471 pkts_n -= loc->pkts_sent; 4472 for (;;) { 4473 struct mlx5_wqe *restrict wqe; 4474 enum mlx5_txcmp_code ret; 4475 4476 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); 4477 if (MLX5_TXOFF_CONFIG(INLINE)) { 4478 unsigned int inlen, vlan = 0; 4479 4480 inlen = rte_pktmbuf_data_len(loc->mbuf); 4481 if (MLX5_TXOFF_CONFIG(VLAN) && 4482 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4483 vlan = sizeof(struct rte_vlan_hdr); 4484 inlen += vlan; 4485 static_assert((sizeof(struct rte_vlan_hdr) + 4486 sizeof(struct rte_ether_hdr)) == 4487 MLX5_ESEG_MIN_INLINE_SIZE, 4488 "invalid min inline data size"); 4489 } 4490 /* 4491 * If inlining is enabled at configuration time 4492 * the limit must be not less than minimal size. 4493 * Otherwise we would do extra check for data 4494 * size to avoid crashes due to length overflow. 
4495 */ 4496 MLX5_ASSERT(txq->inlen_send >= 4497 MLX5_ESEG_MIN_INLINE_SIZE); 4498 if (inlen <= txq->inlen_send) { 4499 unsigned int seg_n, wqe_n; 4500 4501 rte_prefetch0(rte_pktmbuf_mtod 4502 (loc->mbuf, uint8_t *)); 4503 /* Check against minimal length. */ 4504 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 4505 return MLX5_TXCMP_CODE_ERROR; 4506 if (loc->mbuf->ol_flags & 4507 PKT_TX_DYNF_NOINLINE) { 4508 /* 4509 * The hint flag not to inline packet 4510 * data is set. Check whether we can 4511 * follow the hint. 4512 */ 4513 if ((!MLX5_TXOFF_CONFIG(EMPW) && 4514 txq->inlen_mode) || 4515 (MLX5_TXOFF_CONFIG(MPW) && 4516 txq->inlen_mode)) { 4517 /* 4518 * The hardware requires the 4519 * minimal inline data header. 4520 */ 4521 goto single_min_inline; 4522 } 4523 if (MLX5_TXOFF_CONFIG(VLAN) && 4524 vlan && !txq->vlan_en) { 4525 /* 4526 * We must insert VLAN tag 4527 * by software means. 4528 */ 4529 goto single_part_inline; 4530 } 4531 goto single_no_inline; 4532 } 4533 /* 4534 * Completely inlined packet data WQE: 4535 * - Control Segment, SEND opcode 4536 * - Ethernet Segment, no VLAN insertion 4537 * - Data inlined, VLAN optionally inserted 4538 * - Alignment to MLX5_WSEG_SIZE 4539 * Have to estimate amount of WQEBBs 4540 */ 4541 seg_n = (inlen + 3 * MLX5_WSEG_SIZE - 4542 MLX5_ESEG_MIN_INLINE_SIZE + 4543 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4544 /* Check if there are enough WQEBBs. */ 4545 wqe_n = (seg_n + 3) / 4; 4546 if (wqe_n > loc->wqe_free) 4547 return MLX5_TXCMP_CODE_EXIT; 4548 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4549 loc->wqe_last = wqe; 4550 mlx5_tx_cseg_init(txq, loc, wqe, seg_n, 4551 MLX5_OPCODE_SEND, olx); 4552 mlx5_tx_eseg_data(txq, loc, wqe, 4553 vlan, inlen, 0, olx); 4554 txq->wqe_ci += wqe_n; 4555 loc->wqe_free -= wqe_n; 4556 /* 4557 * Packet data are completely inlined, 4558 * free the packet immediately. 4559 */ 4560 rte_pktmbuf_free_seg(loc->mbuf); 4561 } else if ((!MLX5_TXOFF_CONFIG(EMPW) || 4562 MLX5_TXOFF_CONFIG(MPW)) && 4563 txq->inlen_mode) { 4564 /* 4565 * If minimal inlining is requested the eMPW 4566 * feature should be disabled due to data is 4567 * inlined into Ethernet Segment, which can 4568 * not contain inlined data for eMPW due to 4569 * segment shared for all packets. 4570 */ 4571 struct mlx5_wqe_dseg *restrict dseg; 4572 unsigned int ds; 4573 uint8_t *dptr; 4574 4575 /* 4576 * The inline-mode settings require 4577 * to inline the specified amount of 4578 * data bytes to the Ethernet Segment. 4579 * We should check the free space in 4580 * WQE ring buffer to inline partially. 4581 */ 4582 single_min_inline: 4583 MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode); 4584 MLX5_ASSERT(inlen > txq->inlen_mode); 4585 MLX5_ASSERT(txq->inlen_mode >= 4586 MLX5_ESEG_MIN_INLINE_SIZE); 4587 /* 4588 * Check whether there are enough free WQEBBs: 4589 * - Control Segment 4590 * - Ethernet Segment 4591 * - First Segment of inlined Ethernet data 4592 * - ... data continued ... 
4593 * - Finishing Data Segment of pointer type 4594 */ 4595 ds = (MLX5_WQE_CSEG_SIZE + 4596 MLX5_WQE_ESEG_SIZE + 4597 MLX5_WQE_DSEG_SIZE + 4598 txq->inlen_mode - 4599 MLX5_ESEG_MIN_INLINE_SIZE + 4600 MLX5_WQE_DSEG_SIZE + 4601 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4602 if (loc->wqe_free < ((ds + 3) / 4)) 4603 return MLX5_TXCMP_CODE_EXIT; 4604 /* 4605 * Build the ordinary SEND WQE: 4606 * - Control Segment 4607 * - Ethernet Segment, inline inlen_mode bytes 4608 * - Data Segment of pointer type 4609 */ 4610 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4611 loc->wqe_last = wqe; 4612 mlx5_tx_cseg_init(txq, loc, wqe, ds, 4613 MLX5_OPCODE_SEND, olx); 4614 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, 4615 txq->inlen_mode, 4616 0, olx); 4617 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4618 txq->inlen_mode - vlan; 4619 inlen -= txq->inlen_mode; 4620 mlx5_tx_dseg_ptr(txq, loc, dseg, 4621 dptr, inlen, olx); 4622 /* 4623 * WQE is built, update the loop parameters 4624 * and got to the next packet. 4625 */ 4626 txq->wqe_ci += (ds + 3) / 4; 4627 loc->wqe_free -= (ds + 3) / 4; 4628 /* We have to store mbuf in elts.*/ 4629 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 4630 txq->elts[txq->elts_head++ & txq->elts_m] = 4631 loc->mbuf; 4632 --loc->elts_free; 4633 } else { 4634 uint8_t *dptr; 4635 unsigned int dlen; 4636 4637 /* 4638 * Partially inlined packet data WQE, we have 4639 * some space in title WQEBB, we can fill it 4640 * with some packet data. It takes one WQEBB, 4641 * it is available, no extra space check: 4642 * - Control Segment, SEND opcode 4643 * - Ethernet Segment, no VLAN insertion 4644 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data 4645 * - Data Segment, pointer type 4646 * 4647 * We also get here if VLAN insertion is not 4648 * supported by HW, the inline is enabled. 4649 */ 4650 single_part_inline: 4651 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4652 loc->wqe_last = wqe; 4653 mlx5_tx_cseg_init(txq, loc, wqe, 4, 4654 MLX5_OPCODE_SEND, olx); 4655 mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx); 4656 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4657 MLX5_ESEG_MIN_INLINE_SIZE - vlan; 4658 /* 4659 * The length check is performed above, by 4660 * comparing with txq->inlen_send. We should 4661 * not get overflow here. 4662 */ 4663 MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE); 4664 dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE; 4665 mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1], 4666 dptr, dlen, olx); 4667 ++txq->wqe_ci; 4668 --loc->wqe_free; 4669 /* We have to store mbuf in elts.*/ 4670 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); 4671 txq->elts[txq->elts_head++ & txq->elts_m] = 4672 loc->mbuf; 4673 --loc->elts_free; 4674 } 4675 #ifdef MLX5_PMD_SOFT_COUNTERS 4676 /* Update sent data bytes counter. */ 4677 txq->stats.obytes += vlan + 4678 rte_pktmbuf_data_len(loc->mbuf); 4679 #endif 4680 } else { 4681 /* 4682 * No inline at all, it means the CPU cycles saving 4683 * is prioritized at configuration, we should not 4684 * copy any packet data to WQE. 
4685 * 4686 * SEND WQE, one WQEBB: 4687 * - Control Segment, SEND opcode 4688 * - Ethernet Segment, optional VLAN, no inline 4689 * - Data Segment, pointer type 4690 */ 4691 single_no_inline: 4692 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4693 loc->wqe_last = wqe; 4694 mlx5_tx_cseg_init(txq, loc, wqe, 3, 4695 MLX5_OPCODE_SEND, olx); 4696 mlx5_tx_eseg_none(txq, loc, wqe, olx); 4697 mlx5_tx_dseg_ptr 4698 (txq, loc, &wqe->dseg[0], 4699 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 4700 rte_pktmbuf_data_len(loc->mbuf), olx); 4701 ++txq->wqe_ci; 4702 --loc->wqe_free; 4703 /* 4704 * We should not store mbuf pointer in elts 4705 * if no inlining is configured, this is done 4706 * by calling routine in a batch copy. 4707 */ 4708 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); 4709 --loc->elts_free; 4710 #ifdef MLX5_PMD_SOFT_COUNTERS 4711 /* Update sent data bytes counter. */ 4712 txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf); 4713 if (MLX5_TXOFF_CONFIG(VLAN) && 4714 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 4715 txq->stats.obytes += 4716 sizeof(struct rte_vlan_hdr); 4717 #endif 4718 } 4719 ++loc->pkts_sent; 4720 --pkts_n; 4721 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 4722 return MLX5_TXCMP_CODE_EXIT; 4723 loc->mbuf = *pkts++; 4724 if (pkts_n > 1) 4725 rte_prefetch0(*pkts); 4726 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4727 if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE)) 4728 return ret; 4729 } 4730 MLX5_ASSERT(false); 4731 } 4732 4733 static __rte_always_inline enum mlx5_txcmp_code 4734 mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq, 4735 struct rte_mbuf **restrict pkts, 4736 unsigned int pkts_n, 4737 struct mlx5_txq_local *restrict loc, 4738 unsigned int olx) 4739 { 4740 enum mlx5_txcmp_code ret; 4741 4742 ret = mlx5_tx_able_to_empw(txq, loc, olx, false); 4743 if (ret == MLX5_TXCMP_CODE_SINGLE) 4744 goto ordinary_send; 4745 MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW); 4746 for (;;) { 4747 /* Optimize for inline/no inline eMPW send. */ 4748 ret = (MLX5_TXOFF_CONFIG(INLINE)) ? 4749 mlx5_tx_burst_empw_inline 4750 (txq, pkts, pkts_n, loc, olx) : 4751 mlx5_tx_burst_empw_simple 4752 (txq, pkts, pkts_n, loc, olx); 4753 if (ret != MLX5_TXCMP_CODE_SINGLE) 4754 return ret; 4755 /* The resources to send one packet should remain. */ 4756 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4757 ordinary_send: 4758 ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx); 4759 MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE); 4760 if (ret != MLX5_TXCMP_CODE_EMPW) 4761 return ret; 4762 /* The resources to send one packet should remain. */ 4763 MLX5_ASSERT(loc->elts_free && loc->wqe_free); 4764 } 4765 } 4766 4767 /** 4768 * DPDK Tx callback template. This is configured template 4769 * used to generate routines optimized for specified offload setup. 4770 * One of this generated functions is chosen at SQ configuration 4771 * time. 4772 * 4773 * @param txq 4774 * Generic pointer to TX queue structure. 4775 * @param[in] pkts 4776 * Packets to transmit. 4777 * @param pkts_n 4778 * Number of packets in array. 4779 * @param olx 4780 * Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx 4781 * values. Should be static to take compile time static configuration 4782 * advantages. 4783 * 4784 * @return 4785 * Number of packets successfully transmitted (<= pkts_n). 
4786 */ 4787 static __rte_always_inline uint16_t 4788 mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, 4789 struct rte_mbuf **restrict pkts, 4790 uint16_t pkts_n, 4791 unsigned int olx) 4792 { 4793 struct mlx5_txq_local loc; 4794 enum mlx5_txcmp_code ret; 4795 unsigned int part; 4796 4797 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4798 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4799 if (unlikely(!pkts_n)) 4800 return 0; 4801 loc.pkts_sent = 0; 4802 loc.pkts_copy = 0; 4803 loc.wqe_last = NULL; 4804 4805 send_loop: 4806 loc.pkts_loop = loc.pkts_sent; 4807 /* 4808 * Check if there are some CQEs, if any: 4809 * - process an encountered errors 4810 * - process the completed WQEs 4811 * - free related mbufs 4812 * - doorbell the NIC about processed CQEs 4813 */ 4814 rte_prefetch0(*(pkts + loc.pkts_sent)); 4815 mlx5_tx_handle_completion(txq, olx); 4816 /* 4817 * Calculate the number of available resources - elts and WQEs. 4818 * There are two possible different scenarios: 4819 * - no data inlining into WQEs, one WQEBB may contains upto 4820 * four packets, in this case elts become scarce resource 4821 * - data inlining into WQEs, one packet may require multiple 4822 * WQEBBs, the WQEs become the limiting factor. 4823 */ 4824 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4825 loc.elts_free = txq->elts_s - 4826 (uint16_t)(txq->elts_head - txq->elts_tail); 4827 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4828 loc.wqe_free = txq->wqe_s - 4829 (uint16_t)(txq->wqe_ci - txq->wqe_pi); 4830 if (unlikely(!loc.elts_free || !loc.wqe_free)) 4831 goto burst_exit; 4832 for (;;) { 4833 /* 4834 * Fetch the packet from array. Usually this is 4835 * the first packet in series of multi/single 4836 * segment packets. 4837 */ 4838 loc.mbuf = *(pkts + loc.pkts_sent); 4839 /* Dedicated branch for multi-segment packets. */ 4840 if (MLX5_TXOFF_CONFIG(MULTI) && 4841 unlikely(NB_SEGS(loc.mbuf) > 1)) { 4842 /* 4843 * Multi-segment packet encountered. 4844 * Hardware is able to process it only 4845 * with SEND/TSO opcodes, one packet 4846 * per WQE, do it in dedicated routine. 4847 */ 4848 enter_send_multi: 4849 MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy); 4850 part = loc.pkts_sent - loc.pkts_copy; 4851 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 4852 /* 4853 * There are some single-segment mbufs not 4854 * stored in elts. The mbufs must be in the 4855 * same order as WQEs, so we must copy the 4856 * mbufs to elts here, before the coming 4857 * multi-segment packet mbufs is appended. 4858 */ 4859 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, 4860 part, olx); 4861 loc.pkts_copy = loc.pkts_sent; 4862 } 4863 MLX5_ASSERT(pkts_n > loc.pkts_sent); 4864 ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx); 4865 if (!MLX5_TXOFF_CONFIG(INLINE)) 4866 loc.pkts_copy = loc.pkts_sent; 4867 /* 4868 * These returned code checks are supposed 4869 * to be optimized out due to routine inlining. 4870 */ 4871 if (ret == MLX5_TXCMP_CODE_EXIT) { 4872 /* 4873 * The routine returns this code when 4874 * all packets are sent or there is no 4875 * enough resources to complete request. 4876 */ 4877 break; 4878 } 4879 if (ret == MLX5_TXCMP_CODE_ERROR) { 4880 /* 4881 * The routine returns this code when 4882 * some error in the incoming packets 4883 * format occurred. 
4884 */ 4885 txq->stats.oerrors++; 4886 break; 4887 } 4888 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4889 /* 4890 * The single-segment packet was encountered 4891 * in the array, try to send it with the 4892 * best optimized way, possible engaging eMPW. 4893 */ 4894 goto enter_send_single; 4895 } 4896 if (MLX5_TXOFF_CONFIG(TSO) && 4897 ret == MLX5_TXCMP_CODE_TSO) { 4898 /* 4899 * The single-segment TSO packet was 4900 * encountered in the array. 4901 */ 4902 goto enter_send_tso; 4903 } 4904 /* We must not get here. Something is going wrong. */ 4905 MLX5_ASSERT(false); 4906 txq->stats.oerrors++; 4907 break; 4908 } 4909 /* Dedicated branch for single-segment TSO packets. */ 4910 if (MLX5_TXOFF_CONFIG(TSO) && 4911 unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) { 4912 /* 4913 * TSO might require special way for inlining 4914 * (dedicated parameters) and is sent with 4915 * MLX5_OPCODE_TSO opcode only, provide this 4916 * in dedicated branch. 4917 */ 4918 enter_send_tso: 4919 MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1); 4920 MLX5_ASSERT(pkts_n > loc.pkts_sent); 4921 ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx); 4922 /* 4923 * These returned code checks are supposed 4924 * to be optimized out due to routine inlining. 4925 */ 4926 if (ret == MLX5_TXCMP_CODE_EXIT) 4927 break; 4928 if (ret == MLX5_TXCMP_CODE_ERROR) { 4929 txq->stats.oerrors++; 4930 break; 4931 } 4932 if (ret == MLX5_TXCMP_CODE_SINGLE) 4933 goto enter_send_single; 4934 if (MLX5_TXOFF_CONFIG(MULTI) && 4935 ret == MLX5_TXCMP_CODE_MULTI) { 4936 /* 4937 * The multi-segment packet was 4938 * encountered in the array. 4939 */ 4940 goto enter_send_multi; 4941 } 4942 /* We must not get here. Something is going wrong. */ 4943 MLX5_ASSERT(false); 4944 txq->stats.oerrors++; 4945 break; 4946 } 4947 /* 4948 * The dedicated branch for the single-segment packets 4949 * without TSO. Often these ones can be sent using 4950 * MLX5_OPCODE_EMPW with multiple packets in one WQE. 4951 * The routine builds the WQEs till it encounters 4952 * the TSO or multi-segment packet (in case if these 4953 * offloads are requested at SQ configuration time). 4954 */ 4955 enter_send_single: 4956 MLX5_ASSERT(pkts_n > loc.pkts_sent); 4957 ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx); 4958 /* 4959 * These returned code checks are supposed 4960 * to be optimized out due to routine inlining. 4961 */ 4962 if (ret == MLX5_TXCMP_CODE_EXIT) 4963 break; 4964 if (ret == MLX5_TXCMP_CODE_ERROR) { 4965 txq->stats.oerrors++; 4966 break; 4967 } 4968 if (MLX5_TXOFF_CONFIG(MULTI) && 4969 ret == MLX5_TXCMP_CODE_MULTI) { 4970 /* 4971 * The multi-segment packet was 4972 * encountered in the array. 4973 */ 4974 goto enter_send_multi; 4975 } 4976 if (MLX5_TXOFF_CONFIG(TSO) && 4977 ret == MLX5_TXCMP_CODE_TSO) { 4978 /* 4979 * The single-segment TSO packet was 4980 * encountered in the array. 4981 */ 4982 goto enter_send_tso; 4983 } 4984 /* We must not get here. Something is going wrong. */ 4985 MLX5_ASSERT(false); 4986 txq->stats.oerrors++; 4987 break; 4988 } 4989 /* 4990 * Main Tx loop is completed, do the rest: 4991 * - set completion request if thresholds are reached 4992 * - doorbell the hardware 4993 * - copy the rest of mbufs to elts (if any) 4994 */ 4995 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) || 4996 loc.pkts_sent >= loc.pkts_copy); 4997 /* Take a shortcut if nothing is sent. */ 4998 if (unlikely(loc.pkts_sent == loc.pkts_loop)) 4999 goto burst_exit; 5000 /* Request CQE generation if limits are reached. 
*/ 5001 mlx5_tx_request_completion(txq, &loc, olx); 5002 /* 5003 * Ring QP doorbell immediately after WQE building completion 5004 * to improve latencies. The pure software related data treatment 5005 * can be completed after doorbell. Tx CQEs for this SQ are 5006 * processed in this thread only by the polling. 5007 * 5008 * The rdma core library can map doorbell register in two ways, 5009 * depending on the environment variable "MLX5_SHUT_UP_BF": 5010 * 5011 * - as regular cached memory, the variable is either missing or 5012 * set to zero. This type of mapping may cause the significant 5013 * doorbell register writing latency and requires explicit 5014 * memory write barrier to mitigate this issue and prevent 5015 * write combining. 5016 * 5017 * - as non-cached memory, the variable is present and set to 5018 * not "0" value. This type of mapping may cause performance 5019 * impact under heavy loading conditions but the explicit write 5020 * memory barrier is not required and it may improve core 5021 * performance. 5022 * 5023 * - the legacy behaviour (prior 19.08 release) was to use some 5024 * heuristics to decide whether write memory barrier should 5025 * be performed. This behavior is supported with specifying 5026 * tx_db_nc=2, write barrier is skipped if application 5027 * provides the full recommended burst of packets, it 5028 * supposes the next packets are coming and the write barrier 5029 * will be issued on the next burst (after descriptor writing, 5030 * at least). 5031 */ 5032 mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc && 5033 (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST)); 5034 /* Not all of the mbufs may be stored into elts yet. */ 5035 part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy; 5036 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 5037 /* 5038 * There are some single-segment mbufs not stored in elts. 5039 * It can be only if the last packet was single-segment. 5040 * The copying is gathered into one place due to it is 5041 * a good opportunity to optimize that with SIMD. 5042 * Unfortunately if inlining is enabled the gaps in 5043 * pointer array may happen due to early freeing of the 5044 * inlined mbufs. 5045 */ 5046 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx); 5047 loc.pkts_copy = loc.pkts_sent; 5048 } 5049 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 5050 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 5051 if (pkts_n > loc.pkts_sent) { 5052 /* 5053 * If burst size is large there might be no enough CQE 5054 * fetched from completion queue and no enough resources 5055 * freed to send all the packets. 5056 */ 5057 goto send_loop; 5058 } 5059 burst_exit: 5060 #ifdef MLX5_PMD_SOFT_COUNTERS 5061 /* Increment sent packets counter. */ 5062 txq->stats.opackets += loc.pkts_sent; 5063 #endif 5064 return loc.pkts_sent; 5065 } 5066 5067 /* Generate routines with Enhanced Multi-Packet Write support. 
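 * The routine name suffix encodes the enabled offloads: m - MULTI,
 * t - TSO, s - SWP, c - CSUM, i - INLINE, v - VLAN, md - METADATA;
 * _empw and _mpw mark the Multi-Packet Write flavour.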
*/ 5068 MLX5_TXOFF_DECL(full_empw, 5069 MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW) 5070 5071 MLX5_TXOFF_DECL(none_empw, 5072 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 5073 5074 MLX5_TXOFF_DECL(md_empw, 5075 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5076 5077 MLX5_TXOFF_DECL(mt_empw, 5078 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5079 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5080 5081 MLX5_TXOFF_DECL(mtsc_empw, 5082 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5083 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5084 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5085 5086 MLX5_TXOFF_DECL(mti_empw, 5087 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5088 MLX5_TXOFF_CONFIG_INLINE | 5089 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5090 5091 MLX5_TXOFF_DECL(mtv_empw, 5092 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5093 MLX5_TXOFF_CONFIG_VLAN | 5094 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5095 5096 MLX5_TXOFF_DECL(mtiv_empw, 5097 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5098 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5099 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5100 5101 MLX5_TXOFF_DECL(sc_empw, 5102 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5103 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5104 5105 MLX5_TXOFF_DECL(sci_empw, 5106 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5107 MLX5_TXOFF_CONFIG_INLINE | 5108 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5109 5110 MLX5_TXOFF_DECL(scv_empw, 5111 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5112 MLX5_TXOFF_CONFIG_VLAN | 5113 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5114 5115 MLX5_TXOFF_DECL(sciv_empw, 5116 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5117 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5118 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5119 5120 MLX5_TXOFF_DECL(i_empw, 5121 MLX5_TXOFF_CONFIG_INLINE | 5122 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5123 5124 MLX5_TXOFF_DECL(v_empw, 5125 MLX5_TXOFF_CONFIG_VLAN | 5126 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5127 5128 MLX5_TXOFF_DECL(iv_empw, 5129 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5130 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5131 5132 /* Generate routines without Enhanced Multi-Packet Write support. 
*/ 5133 MLX5_TXOFF_DECL(full, 5134 MLX5_TXOFF_CONFIG_FULL) 5135 5136 MLX5_TXOFF_DECL(none, 5137 MLX5_TXOFF_CONFIG_NONE) 5138 5139 MLX5_TXOFF_DECL(md, 5140 MLX5_TXOFF_CONFIG_METADATA) 5141 5142 MLX5_TXOFF_DECL(mt, 5143 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5144 MLX5_TXOFF_CONFIG_METADATA) 5145 5146 MLX5_TXOFF_DECL(mtsc, 5147 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5148 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5149 MLX5_TXOFF_CONFIG_METADATA) 5150 5151 MLX5_TXOFF_DECL(mti, 5152 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5153 MLX5_TXOFF_CONFIG_INLINE | 5154 MLX5_TXOFF_CONFIG_METADATA) 5155 5156 5157 MLX5_TXOFF_DECL(mtv, 5158 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5159 MLX5_TXOFF_CONFIG_VLAN | 5160 MLX5_TXOFF_CONFIG_METADATA) 5161 5162 5163 MLX5_TXOFF_DECL(mtiv, 5164 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5165 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5166 MLX5_TXOFF_CONFIG_METADATA) 5167 5168 MLX5_TXOFF_DECL(sc, 5169 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5170 MLX5_TXOFF_CONFIG_METADATA) 5171 5172 MLX5_TXOFF_DECL(sci, 5173 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5174 MLX5_TXOFF_CONFIG_INLINE | 5175 MLX5_TXOFF_CONFIG_METADATA) 5176 5177 5178 MLX5_TXOFF_DECL(scv, 5179 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5180 MLX5_TXOFF_CONFIG_VLAN | 5181 MLX5_TXOFF_CONFIG_METADATA) 5182 5183 5184 MLX5_TXOFF_DECL(sciv, 5185 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5186 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5187 MLX5_TXOFF_CONFIG_METADATA) 5188 5189 MLX5_TXOFF_DECL(i, 5190 MLX5_TXOFF_CONFIG_INLINE | 5191 MLX5_TXOFF_CONFIG_METADATA) 5192 5193 MLX5_TXOFF_DECL(v, 5194 MLX5_TXOFF_CONFIG_VLAN | 5195 MLX5_TXOFF_CONFIG_METADATA) 5196 5197 MLX5_TXOFF_DECL(iv, 5198 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5199 MLX5_TXOFF_CONFIG_METADATA) 5200 5201 /* 5202 * Generate routines with Legacy Multi-Packet Write support. 5203 * This mode is supported by ConnectX-4LX only and imposes 5204 * offload limitations, not supported: 5205 * - ACL/Flows (metadata are becoming meaningless) 5206 * - WQE Inline headers 5207 * - SRIOV (E-Switch offloads) 5208 * - VLAN insertion 5209 * - tunnel encapsulation/decapsulation 5210 * - TSO 5211 */ 5212 MLX5_TXOFF_DECL(none_mpw, 5213 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | 5214 MLX5_TXOFF_CONFIG_MPW) 5215 5216 MLX5_TXOFF_DECL(mci_mpw, 5217 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5218 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5219 MLX5_TXOFF_CONFIG_MPW) 5220 5221 MLX5_TXOFF_DECL(mc_mpw, 5222 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5223 MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) 5224 5225 MLX5_TXOFF_DECL(i_mpw, 5226 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5227 MLX5_TXOFF_CONFIG_MPW) 5228 5229 /* 5230 * Array of declared and compiled Tx burst function and corresponding 5231 * supported offloads set. The array is used to select the Tx burst 5232 * function for specified offloads set at Tx queue configuration time. 
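 *
 * mlx5_select_tx_function() below picks the entry matching the
 * requested offload set exactly, or else the one providing the
 * smallest superset of it; eMPW and inlining are never enabled
 * unless they are present in the requested set.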
5233 */ 5234 const struct { 5235 eth_tx_burst_t func; 5236 unsigned int olx; 5237 } txoff_func[] = { 5238 MLX5_TXOFF_INFO(full_empw, 5239 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5240 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5241 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5242 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5243 5244 MLX5_TXOFF_INFO(none_empw, 5245 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 5246 5247 MLX5_TXOFF_INFO(md_empw, 5248 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5249 5250 MLX5_TXOFF_INFO(mt_empw, 5251 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5252 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5253 5254 MLX5_TXOFF_INFO(mtsc_empw, 5255 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5256 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5257 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5258 5259 MLX5_TXOFF_INFO(mti_empw, 5260 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5261 MLX5_TXOFF_CONFIG_INLINE | 5262 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5263 5264 MLX5_TXOFF_INFO(mtv_empw, 5265 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5266 MLX5_TXOFF_CONFIG_VLAN | 5267 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5268 5269 MLX5_TXOFF_INFO(mtiv_empw, 5270 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5271 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5272 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5273 5274 MLX5_TXOFF_INFO(sc_empw, 5275 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5276 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5277 5278 MLX5_TXOFF_INFO(sci_empw, 5279 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5280 MLX5_TXOFF_CONFIG_INLINE | 5281 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5282 5283 MLX5_TXOFF_INFO(scv_empw, 5284 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5285 MLX5_TXOFF_CONFIG_VLAN | 5286 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5287 5288 MLX5_TXOFF_INFO(sciv_empw, 5289 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5290 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5291 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5292 5293 MLX5_TXOFF_INFO(i_empw, 5294 MLX5_TXOFF_CONFIG_INLINE | 5295 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5296 5297 MLX5_TXOFF_INFO(v_empw, 5298 MLX5_TXOFF_CONFIG_VLAN | 5299 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5300 5301 MLX5_TXOFF_INFO(iv_empw, 5302 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5303 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5304 5305 MLX5_TXOFF_INFO(full, 5306 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5307 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5308 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5309 MLX5_TXOFF_CONFIG_METADATA) 5310 5311 MLX5_TXOFF_INFO(none, 5312 MLX5_TXOFF_CONFIG_NONE) 5313 5314 MLX5_TXOFF_INFO(md, 5315 MLX5_TXOFF_CONFIG_METADATA) 5316 5317 MLX5_TXOFF_INFO(mt, 5318 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5319 MLX5_TXOFF_CONFIG_METADATA) 5320 5321 MLX5_TXOFF_INFO(mtsc, 5322 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5323 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5324 MLX5_TXOFF_CONFIG_METADATA) 5325 5326 MLX5_TXOFF_INFO(mti, 5327 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5328 MLX5_TXOFF_CONFIG_INLINE | 5329 MLX5_TXOFF_CONFIG_METADATA) 5330 5331 MLX5_TXOFF_INFO(mtv, 5332 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5333 MLX5_TXOFF_CONFIG_VLAN | 5334 MLX5_TXOFF_CONFIG_METADATA) 5335 5336 MLX5_TXOFF_INFO(mtiv, 5337 
MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5338 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5339 MLX5_TXOFF_CONFIG_METADATA) 5340 5341 MLX5_TXOFF_INFO(sc, 5342 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5343 MLX5_TXOFF_CONFIG_METADATA) 5344 5345 MLX5_TXOFF_INFO(sci, 5346 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5347 MLX5_TXOFF_CONFIG_INLINE | 5348 MLX5_TXOFF_CONFIG_METADATA) 5349 5350 MLX5_TXOFF_INFO(scv, 5351 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5352 MLX5_TXOFF_CONFIG_VLAN | 5353 MLX5_TXOFF_CONFIG_METADATA) 5354 5355 MLX5_TXOFF_INFO(sciv, 5356 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5357 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5358 MLX5_TXOFF_CONFIG_METADATA) 5359 5360 MLX5_TXOFF_INFO(i, 5361 MLX5_TXOFF_CONFIG_INLINE | 5362 MLX5_TXOFF_CONFIG_METADATA) 5363 5364 MLX5_TXOFF_INFO(v, 5365 MLX5_TXOFF_CONFIG_VLAN | 5366 MLX5_TXOFF_CONFIG_METADATA) 5367 5368 MLX5_TXOFF_INFO(iv, 5369 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5370 MLX5_TXOFF_CONFIG_METADATA) 5371 5372 MLX5_TXOFF_INFO(none_mpw, 5373 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | 5374 MLX5_TXOFF_CONFIG_MPW) 5375 5376 MLX5_TXOFF_INFO(mci_mpw, 5377 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5378 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5379 MLX5_TXOFF_CONFIG_MPW) 5380 5381 MLX5_TXOFF_INFO(mc_mpw, 5382 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5383 MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) 5384 5385 MLX5_TXOFF_INFO(i_mpw, 5386 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5387 MLX5_TXOFF_CONFIG_MPW) 5388 }; 5389 5390 /** 5391 * Configure the Tx function to use. The routine checks configured 5392 * Tx offloads for the device and selects appropriate Tx burst 5393 * routine. There are multiple Tx burst routines compiled from 5394 * the same template in the most optimal way for the dedicated 5395 * Tx offloads set. 5396 * 5397 * @param dev 5398 * Pointer to private data structure. 5399 * 5400 * @return 5401 * Pointer to selected Tx burst function. 5402 */ 5403 eth_tx_burst_t 5404 mlx5_select_tx_function(struct rte_eth_dev *dev) 5405 { 5406 struct mlx5_priv *priv = dev->data->dev_private; 5407 struct mlx5_dev_config *config = &priv->config; 5408 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 5409 unsigned int diff = 0, olx = 0, i, m; 5410 5411 static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <= 5412 MLX5_DSEG_MAX, "invalid WQE max size"); 5413 static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE, 5414 "invalid WQE Control Segment size"); 5415 static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE, 5416 "invalid WQE Ethernet Segment size"); 5417 static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE, 5418 "invalid WQE Data Segment size"); 5419 static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE, 5420 "invalid WQE size"); 5421 MLX5_ASSERT(priv); 5422 if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) { 5423 /* We should support Multi-Segment Packets. */ 5424 olx |= MLX5_TXOFF_CONFIG_MULTI; 5425 } 5426 if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | 5427 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 5428 DEV_TX_OFFLOAD_GRE_TNL_TSO | 5429 DEV_TX_OFFLOAD_IP_TNL_TSO | 5430 DEV_TX_OFFLOAD_UDP_TNL_TSO)) { 5431 /* We should support TCP Send Offload. */ 5432 olx |= MLX5_TXOFF_CONFIG_TSO; 5433 } 5434 if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | 5435 DEV_TX_OFFLOAD_UDP_TNL_TSO | 5436 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { 5437 /* We should support Software Parser for Tunnels. 
	/*
	 * Scan the routines table to find the minimal
	 * routine satisfying the requested offloads.
	 */
	m = RTE_DIM(txoff_func);
	for (i = 0; i < RTE_DIM(txoff_func); i++) {
		unsigned int tmp;

		tmp = txoff_func[i].olx;
		if (tmp == olx) {
			/* Meets the requested offloads exactly. */
			m = i;
			break;
		}
		if ((tmp & olx) != olx) {
			/* Does not meet the requested offloads at all. */
			continue;
		}
		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW)
			/* Do not enable eMPW if not configured. */
			continue;
		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
			/* Do not enable inlining if not configured. */
			continue;
		/*
		 * Some routine meets the requirements.
		 * Check whether it has the minimal amount
		 * of not-requested offloads.
		 */
		tmp = __builtin_popcountl(tmp & ~olx);
		if (m >= RTE_DIM(txoff_func) || tmp < diff) {
			/* First or better match, save and continue. */
			m = i;
			diff = tmp;
			continue;
		}
		if (tmp == diff) {
			tmp = txoff_func[i].olx ^ txoff_func[m].olx;
			if (__builtin_ffsl(txoff_func[i].olx & ~tmp) <
			    __builtin_ffsl(txoff_func[m].olx & ~tmp)) {
				/* Prefer the lighter not-requested offload. */
				m = i;
			}
		}
	}
	if (m >= RTE_DIM(txoff_func)) {
		DRV_LOG(DEBUG, "port %u has no selected Tx function"
			       " for requested offloads %04X",
			dev->data->port_id, olx);
		return NULL;
	}
	DRV_LOG(DEBUG, "port %u has selected Tx function"
		       " supporting offloads %04X/%04X",
		dev->data->port_id, olx, txoff_func[m].olx);
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI)
		DRV_LOG(DEBUG, "\tMULTI (multi segment)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO)
		DRV_LOG(DEBUG, "\tTSO (TCP send offload)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP)
		DRV_LOG(DEBUG, "\tSWP (software parser)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM)
		DRV_LOG(DEBUG, "\tCSUM (checksum offload)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE)
		DRV_LOG(DEBUG, "\tINLIN (inline data)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN)
		DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
		DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) {
		if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW)
			DRV_LOG(DEBUG, "\tMPW (Legacy MPW)");
		else
			DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)");
	}
	return txoff_func[m].func;
}
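
/*
 * Illustrative sketch only (kept out of the build, and an assumption
 * about the caller rather than code from this file): the pointer
 * returned above is expected to be installed as the device Tx burst
 * callback when the port is started, along these lines:
 */
#if 0
	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
	if (dev->tx_pkt_burst == NULL) {
		/* Hypothetical error handling: no routine matched. */
		rte_errno = ENOTSUP;
		return -rte_errno;
	}
#endif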

/**
 * DPDK callback to get the TX queue information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param tx_queue_id
 *   Tx queue identifier.
 *
 * @param qinfo
 *   Pointer to the TX queue information structure.
 *
 * @return
 *   None.
 */
void
mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
		  struct rte_eth_txq_info *qinfo)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_txq_data *txq = (*priv->txqs)[tx_queue_id];
	struct mlx5_txq_ctrl *txq_ctrl =
			container_of(txq, struct mlx5_txq_ctrl, txq);

	if (!txq)
		return;
	qinfo->nb_desc = txq->elts_s;
	qinfo->conf.tx_thresh.pthresh = 0;
	qinfo->conf.tx_thresh.hthresh = 0;
	qinfo->conf.tx_thresh.wthresh = 0;
	qinfo->conf.tx_rs_thresh = 0;
	qinfo->conf.tx_free_thresh = 0;
	qinfo->conf.tx_deferred_start = txq_ctrl ? 0 : 1;
	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
}
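
/*
 * Illustrative sketch only (kept out of the build): how an application
 * could read the fields filled in above through the generic ethdev API
 * (rte_eth_tx_queue_info_get() from <rte_ethdev.h>). The function name
 * below is hypothetical.
 */
#if 0
static void
example_dump_txq_info(uint16_t port_id, uint16_t queue_id)
{
	struct rte_eth_txq_info qinfo;

	memset(&qinfo, 0, sizeof(qinfo));
	/* Dispatches to mlx5_txq_info_get() for mlx5 ports. */
	if (rte_eth_tx_queue_info_get(port_id, queue_id, &qinfo) != 0)
		return;
	printf("port %u txq %u: %u descriptors, deferred start %u\n",
	       port_id, queue_id, qinfo.nb_desc,
	       qinfo.conf.tx_deferred_start);
}
#endif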

/**
 * DPDK callback to get the TX packet burst mode information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param tx_queue_id
 *   Tx queue identifier.
 *
 * @param mode
 *   Pointer to the burst mode information.
 *
 * @return
 *   0 on success, -EINVAL on failure.
 */
int
mlx5_tx_burst_mode_get(struct rte_eth_dev *dev,
		       uint16_t tx_queue_id __rte_unused,
		       struct rte_eth_burst_mode *mode)
{
	eth_tx_burst_t pkt_burst = dev->tx_pkt_burst;
	unsigned int i, olx;

	for (i = 0; i < RTE_DIM(txoff_func); i++) {
		if (pkt_burst == txoff_func[i].func) {
			olx = txoff_func[i].olx;
			snprintf(mode->info, sizeof(mode->info),
				 "%s%s%s%s%s%s%s%s",
				 (olx & MLX5_TXOFF_CONFIG_EMPW) ?
				 ((olx & MLX5_TXOFF_CONFIG_MPW) ?
				 "Legacy MPW" : "Enhanced MPW") : "No MPW",
				 (olx & MLX5_TXOFF_CONFIG_MULTI) ?
				 " + MULTI" : "",
				 (olx & MLX5_TXOFF_CONFIG_TSO) ?
				 " + TSO" : "",
				 (olx & MLX5_TXOFF_CONFIG_SWP) ?
				 " + SWP" : "",
				 (olx & MLX5_TXOFF_CONFIG_CSUM) ?
				 " + CSUM" : "",
				 (olx & MLX5_TXOFF_CONFIG_INLINE) ?
				 " + INLINE" : "",
				 (olx & MLX5_TXOFF_CONFIG_VLAN) ?
				 " + VLAN" : "",
				 (olx & MLX5_TXOFF_CONFIG_METADATA) ?
				 " + METADATA" : "");
			return 0;
		}
	}
	return -EINVAL;
}
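
/*
 * Illustrative sketch only (kept out of the build): querying the burst
 * mode string composed above through the generic ethdev API
 * (rte_eth_tx_burst_mode_get() from <rte_ethdev.h>). The function name
 * below is hypothetical.
 */
#if 0
static void
example_dump_tx_burst_mode(uint16_t port_id, uint16_t queue_id)
{
	struct rte_eth_burst_mode mode;

	/* Dispatches to mlx5_tx_burst_mode_get() for mlx5 ports. */
	if (rte_eth_tx_burst_mode_get(port_id, queue_id, &mode) == 0)
		printf("port %u txq %u burst mode: %s\n",
		       port_id, queue_id, mode.info);
}
#endif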