/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015-2019 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#include <rte_cycles.h>
#include <rte_flow.h>

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"

/* TX burst subroutines return codes. */
enum mlx5_txcmp_code {
	MLX5_TXCMP_CODE_EXIT = 0,
	MLX5_TXCMP_CODE_ERROR,
	MLX5_TXCMP_CODE_SINGLE,
	MLX5_TXCMP_CODE_MULTI,
	MLX5_TXCMP_CODE_TSO,
	MLX5_TXCMP_CODE_EMPW,
};

/*
 * These defines are used to configure the set of Tx burst routine options
 * supported at compile time. Options that are not specified are optimized
 * out because the related "if" conditions can be evaluated at compile time.
 * Offloads with a larger runtime check overhead (more CPU cycles needed to
 * skip them) should have a larger index - this is needed to select the
 * best matching routine when there is no exact match and some offloads
 * are not actually requested.
 */
#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets. */
#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported. */
#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads. */
#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Checksums offloaded. */
#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported. */
#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported. */
#define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported. */

/* The most common offloads groups.
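 *
 * Editorial illustration, not part of the original code: each specialized
 * burst routine produced by MLX5_TXOFF_DECL() passes its "olx" option set
 * as a compile-time constant into the template, so a check such as
 * "if (MLX5_TXOFF_CONFIG(TSO))" folds to a constant and the branches for
 * offloads outside the configured group are removed by the compiler. For
 * example, a hypothetical MLX5_TXOFF_DECL(full, MLX5_TXOFF_CONFIG_FULL)
 * would generate mlx5_tx_burst_full() supporting every offload in the
 * "full" group below and nothing else.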
*/ 68 #define MLX5_TXOFF_CONFIG_NONE 0 69 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \ 70 MLX5_TXOFF_CONFIG_TSO | \ 71 MLX5_TXOFF_CONFIG_SWP | \ 72 MLX5_TXOFF_CONFIG_CSUM | \ 73 MLX5_TXOFF_CONFIG_INLINE | \ 74 MLX5_TXOFF_CONFIG_VLAN | \ 75 MLX5_TXOFF_CONFIG_METADATA) 76 77 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask) 78 79 #define MLX5_TXOFF_DECL(func, olx) \ 80 static uint16_t mlx5_tx_burst_##func(void *txq, \ 81 struct rte_mbuf **pkts, \ 82 uint16_t pkts_n) \ 83 { \ 84 return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \ 85 pkts, pkts_n, (olx)); \ 86 } 87 88 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx}, 89 90 static __rte_always_inline uint32_t 91 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); 92 93 static __rte_always_inline int 94 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 95 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe); 96 97 static __rte_always_inline uint32_t 98 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); 99 100 static __rte_always_inline void 101 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 102 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); 103 104 static __rte_always_inline void 105 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 106 const unsigned int strd_n); 107 108 static int 109 mlx5_queue_state_modify(struct rte_eth_dev *dev, 110 struct mlx5_mp_arg_queue_state_modify *sm); 111 112 static inline void 113 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, 114 volatile struct mlx5_cqe *restrict cqe, 115 uint32_t phcsum); 116 117 static inline void 118 mlx5_lro_update_hdr(uint8_t *restrict padd, 119 volatile struct mlx5_cqe *restrict cqe, 120 uint32_t len); 121 122 uint32_t mlx5_ptype_table[] __rte_cache_aligned = { 123 [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ 124 }; 125 126 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned; 127 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned; 128 129 /** 130 * Build a table to translate Rx completion flags to packet type. 131 * 132 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 133 */ 134 void 135 mlx5_set_ptype_table(void) 136 { 137 unsigned int i; 138 uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table; 139 140 /* Last entry must not be overwritten, reserved for errored packet. 
*/ 141 for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) 142 (*p)[i] = RTE_PTYPE_UNKNOWN; 143 /* 144 * The index to the array should have: 145 * bit[1:0] = l3_hdr_type 146 * bit[4:2] = l4_hdr_type 147 * bit[5] = ip_frag 148 * bit[6] = tunneled 149 * bit[7] = outer_l3_type 150 */ 151 /* L2 */ 152 (*p)[0x00] = RTE_PTYPE_L2_ETHER; 153 /* L3 */ 154 (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 155 RTE_PTYPE_L4_NONFRAG; 156 (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 157 RTE_PTYPE_L4_NONFRAG; 158 /* Fragmented */ 159 (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 160 RTE_PTYPE_L4_FRAG; 161 (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 162 RTE_PTYPE_L4_FRAG; 163 /* TCP */ 164 (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 165 RTE_PTYPE_L4_TCP; 166 (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 167 RTE_PTYPE_L4_TCP; 168 (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 169 RTE_PTYPE_L4_TCP; 170 (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 171 RTE_PTYPE_L4_TCP; 172 (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 173 RTE_PTYPE_L4_TCP; 174 (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 175 RTE_PTYPE_L4_TCP; 176 /* UDP */ 177 (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 178 RTE_PTYPE_L4_UDP; 179 (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 180 RTE_PTYPE_L4_UDP; 181 /* Repeat with outer_l3_type being set. Just in case. */ 182 (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 183 RTE_PTYPE_L4_NONFRAG; 184 (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 185 RTE_PTYPE_L4_NONFRAG; 186 (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 187 RTE_PTYPE_L4_FRAG; 188 (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 189 RTE_PTYPE_L4_FRAG; 190 (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 191 RTE_PTYPE_L4_TCP; 192 (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 193 RTE_PTYPE_L4_TCP; 194 (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 195 RTE_PTYPE_L4_TCP; 196 (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 197 RTE_PTYPE_L4_TCP; 198 (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 199 RTE_PTYPE_L4_TCP; 200 (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 201 RTE_PTYPE_L4_TCP; 202 (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 203 RTE_PTYPE_L4_UDP; 204 (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 205 RTE_PTYPE_L4_UDP; 206 /* Tunneled - L3 */ 207 (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; 208 (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 209 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 210 RTE_PTYPE_INNER_L4_NONFRAG; 211 (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 212 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 213 RTE_PTYPE_INNER_L4_NONFRAG; 214 (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; 215 (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 216 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 217 RTE_PTYPE_INNER_L4_NONFRAG; 218 (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 219 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 220 RTE_PTYPE_INNER_L4_NONFRAG; 221 /* Tunneled - Fragmented */ 222 (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 223 
RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 224 RTE_PTYPE_INNER_L4_FRAG; 225 (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 226 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 227 RTE_PTYPE_INNER_L4_FRAG; 228 (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 229 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 230 RTE_PTYPE_INNER_L4_FRAG; 231 (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 232 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 233 RTE_PTYPE_INNER_L4_FRAG; 234 /* Tunneled - TCP */ 235 (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 236 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 237 RTE_PTYPE_INNER_L4_TCP; 238 (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 239 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 240 RTE_PTYPE_INNER_L4_TCP; 241 (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 242 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 243 RTE_PTYPE_INNER_L4_TCP; 244 (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 245 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 246 RTE_PTYPE_INNER_L4_TCP; 247 (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 248 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 249 RTE_PTYPE_INNER_L4_TCP; 250 (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 251 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 252 RTE_PTYPE_INNER_L4_TCP; 253 (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 254 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 255 RTE_PTYPE_INNER_L4_TCP; 256 (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 257 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 258 RTE_PTYPE_INNER_L4_TCP; 259 (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 260 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 261 RTE_PTYPE_INNER_L4_TCP; 262 (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 263 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 264 RTE_PTYPE_INNER_L4_TCP; 265 (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 266 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 267 RTE_PTYPE_INNER_L4_TCP; 268 (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 269 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 270 RTE_PTYPE_INNER_L4_TCP; 271 /* Tunneled - UDP */ 272 (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 273 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 274 RTE_PTYPE_INNER_L4_UDP; 275 (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 276 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 277 RTE_PTYPE_INNER_L4_UDP; 278 (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 279 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 280 RTE_PTYPE_INNER_L4_UDP; 281 (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 282 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 283 RTE_PTYPE_INNER_L4_UDP; 284 } 285 286 /** 287 * Build a table to translate packet to checksum type of Verbs. 288 */ 289 void 290 mlx5_set_cksum_table(void) 291 { 292 unsigned int i; 293 uint8_t v; 294 295 /* 296 * The index should have: 297 * bit[0] = PKT_TX_TCP_SEG 298 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM 299 * bit[4] = PKT_TX_IP_CKSUM 300 * bit[8] = PKT_TX_OUTER_IP_CKSUM 301 * bit[9] = tunnel 302 */ 303 for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) { 304 v = 0; 305 if (i & (1 << 9)) { 306 /* Tunneled packet. */ 307 if (i & (1 << 8)) /* Outer IP. */ 308 v |= MLX5_ETH_WQE_L3_CSUM; 309 if (i & (1 << 4)) /* Inner IP. */ 310 v |= MLX5_ETH_WQE_L3_INNER_CSUM; 311 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 312 v |= MLX5_ETH_WQE_L4_INNER_CSUM; 313 } else { 314 /* No tunnel. 
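		 *
		 * Editorial worked example, not part of the original code
		 * (assuming the standard PKT_TX_* bit layout implied by the
		 * index description above): a plain, non-tunneled packet
		 * requesting PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM maps to an
		 * index with bit 4 (IP checksum) and bit 2 (TCP checksum)
		 * set, so the branches below yield
		 * v = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM.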
*/ 315 if (i & (1 << 4)) /* IP. */ 316 v |= MLX5_ETH_WQE_L3_CSUM; 317 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 318 v |= MLX5_ETH_WQE_L4_CSUM; 319 } 320 mlx5_cksum_table[i] = v; 321 } 322 } 323 324 /** 325 * Build a table to translate packet type of mbuf to SWP type of Verbs. 326 */ 327 void 328 mlx5_set_swp_types_table(void) 329 { 330 unsigned int i; 331 uint8_t v; 332 333 /* 334 * The index should have: 335 * bit[0:1] = PKT_TX_L4_MASK 336 * bit[4] = PKT_TX_IPV6 337 * bit[8] = PKT_TX_OUTER_IPV6 338 * bit[9] = PKT_TX_OUTER_UDP 339 */ 340 for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) { 341 v = 0; 342 if (i & (1 << 8)) 343 v |= MLX5_ETH_WQE_L3_OUTER_IPV6; 344 if (i & (1 << 9)) 345 v |= MLX5_ETH_WQE_L4_OUTER_UDP; 346 if (i & (1 << 4)) 347 v |= MLX5_ETH_WQE_L3_INNER_IPV6; 348 if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52)) 349 v |= MLX5_ETH_WQE_L4_INNER_UDP; 350 mlx5_swp_types_table[i] = v; 351 } 352 } 353 354 /** 355 * Set Software Parser flags and offsets in Ethernet Segment of WQE. 356 * Flags must be preliminary initialized to zero. 357 * 358 * @param loc 359 * Pointer to burst routine local context. 360 * @param swp_flags 361 * Pointer to store Software Parser flags 362 * @param olx 363 * Configured Tx offloads mask. It is fully defined at 364 * compile time and may be used for optimization. 365 * 366 * @return 367 * Software Parser offsets packed in dword. 368 * Software Parser flags are set by pointer. 369 */ 370 static __rte_always_inline uint32_t 371 txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc, 372 uint8_t *swp_flags, 373 unsigned int olx) 374 { 375 uint64_t ol, tunnel; 376 unsigned int idx, off; 377 uint32_t set; 378 379 if (!MLX5_TXOFF_CONFIG(SWP)) 380 return 0; 381 ol = loc->mbuf->ol_flags; 382 tunnel = ol & PKT_TX_TUNNEL_MASK; 383 /* 384 * Check whether Software Parser is required. 385 * Only customized tunnels may ask for. 386 */ 387 if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP)) 388 return 0; 389 /* 390 * The index should have: 391 * bit[0:1] = PKT_TX_L4_MASK 392 * bit[4] = PKT_TX_IPV6 393 * bit[8] = PKT_TX_OUTER_IPV6 394 * bit[9] = PKT_TX_OUTER_UDP 395 */ 396 idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52; 397 idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0; 398 *swp_flags = mlx5_swp_types_table[idx]; 399 /* 400 * Set offsets for SW parser. Since ConnectX-5, SW parser just 401 * complements HW parser. SW parser starts to engage only if HW parser 402 * can't reach a header. For the older devices, HW parser will not kick 403 * in if any of SWP offsets is set. Therefore, all of the L3 offsets 404 * should be set regardless of HW offload. 405 */ 406 off = loc->mbuf->outer_l2_len; 407 if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT) 408 off += sizeof(struct rte_vlan_hdr); 409 set = (off >> 1) << 8; /* Outer L3 offset. */ 410 off += loc->mbuf->outer_l3_len; 411 if (tunnel == PKT_TX_TUNNEL_UDP) 412 set |= off >> 1; /* Outer L4 offset. */ 413 if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */ 414 const uint64_t csum = ol & PKT_TX_L4_MASK; 415 off += loc->mbuf->l2_len; 416 set |= (off >> 1) << 24; /* Inner L3 offset. */ 417 if (csum == PKT_TX_TCP_CKSUM || 418 csum == PKT_TX_UDP_CKSUM || 419 (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) { 420 off += loc->mbuf->l3_len; 421 set |= (off >> 1) << 16; /* Inner L4 offset. */ 422 } 423 } 424 set = rte_cpu_to_le_32(set); 425 return set; 426 } 427 428 /** 429 * Convert the Checksum offloads to Verbs. 430 * 431 * @param buf 432 * Pointer to the mbuf. 
 *
 * @return
 *   Converted checksum flags.
 */
static __rte_always_inline uint8_t
txq_ol_cksum_to_cs(struct rte_mbuf *buf)
{
	uint32_t idx;
	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;

	/*
	 * The index should have:
	 * bit[0] = PKT_TX_TCP_SEG
	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
	 * bit[4] = PKT_TX_IP_CKSUM
	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
	 * bit[9] = tunnel
	 */
	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
	return mlx5_cksum_table[idx];
}

/**
 * Internal function to compute the number of used descriptors in an Rx queue.
 *
 * @param rxq
 *   The Rx queue.
 *
 * @return
 *   The number of used Rx descriptors.
 */
static uint32_t
rx_queue_count(struct mlx5_rxq_data *rxq)
{
	struct rxq_zip *zip = &rxq->zip;
	volatile struct mlx5_cqe *cqe;
	const unsigned int cqe_n = (1 << rxq->cqe_n);
	const unsigned int cqe_cnt = cqe_n - 1;
	unsigned int cq_ci;
	unsigned int used;

	/* If we are processing a compressed CQE. */
	if (zip->ai) {
		used = zip->cqe_cnt - zip->ca;
		cq_ci = zip->cq_ci;
	} else {
		used = 0;
		cq_ci = rxq->cq_ci;
	}
	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
	while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
		int8_t op_own;
		unsigned int n;

		op_own = cqe->op_own;
		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
			n = rte_be_to_cpu_32(cqe->byte_cnt);
		else
			n = 1;
		cq_ci += n;
		used += n;
		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
	}
	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
	return used;
}

/**
 * DPDK callback to check the status of an Rx descriptor.
 *
 * @param rx_queue
 *   The Rx queue.
 * @param[in] offset
 *   The index of the descriptor in the ring.
 *
 * @return
 *   The status of the Rx descriptor.
 */
int
mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
{
	struct mlx5_rxq_data *rxq = rx_queue;
	struct mlx5_rxq_ctrl *rxq_ctrl =
			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
	struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);

	if (dev->rx_pkt_burst != mlx5_rx_burst) {
		rte_errno = ENOTSUP;
		return -rte_errno;
	}
	if (offset >= (1 << rxq->elts_n)) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	if (offset < rx_queue_count(rxq))
		return RTE_ETH_RX_DESC_DONE;
	return RTE_ETH_RX_DESC_AVAIL;
}

/**
 * DPDK callback to get the number of used descriptors in an Rx queue.
 *
 * @param dev
 *   Pointer to the device structure.
 * @param rx_queue_id
 *   The Rx queue.
 *
 * @return
 *   The number of used Rx descriptors.
 *   -EINVAL if the queue is invalid.
 */
uint32_t
mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_rxq_data *rxq;

	if (dev->rx_pkt_burst != mlx5_rx_burst) {
		rte_errno = ENOTSUP;
		return -rte_errno;
	}
	rxq = (*priv->rxqs)[rx_queue_id];
	if (!rxq) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	return rx_queue_count(rxq);
}

#define MLX5_SYSTEM_LOG_DIR "/var/log"
/**
 * Dump debug information to a log file.
 *
 * @param fname
 *   The file name.
 * @param hex_title
 *   If not NULL this string is printed as a header to the output
 *   and the output will be in hexadecimal view.
 * @param buf
 *   The buffer address to print out.
 * @param hex_len
 *   The number of bytes to dump out.
 */
void
mlx5_dump_debug_information(const char *fname, const char *hex_title,
			    const void *buf, unsigned int hex_len)
{
	FILE *fd;

	MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname);
	fd = fopen(path, "a+");
	if (!fd) {
		DRV_LOG(WARNING, "cannot open %s for debug dump", path);
		MKSTR(path2, "./%s", fname);
		fd = fopen(path2, "a+");
		if (!fd) {
			DRV_LOG(ERR, "cannot open %s for debug dump", path2);
			return;
		}
		DRV_LOG(INFO, "New debug dump in file %s", path2);
	} else {
		DRV_LOG(INFO, "New debug dump in file %s", path);
	}
	if (hex_title)
		rte_hexdump(fd, hex_title, buf, hex_len);
	else
		fprintf(fd, "%s", (const char *)buf);
	fprintf(fd, "\n\n\n");
	fclose(fd);
}

/**
 * Move the QP from the error state to the running state and initialize
 * the indexes.
 *
 * @param txq_ctrl
 *   Pointer to TX queue control structure.
 *
 * @return
 *   0 on success, else -1.
 */
static int
tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl)
{
	struct mlx5_mp_arg_queue_state_modify sm = {
		.is_wq = 0,
		.queue_id = txq_ctrl->txq.idx,
	};

	if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm))
		return -1;
	txq_ctrl->txq.wqe_ci = 0;
	txq_ctrl->txq.wqe_pi = 0;
	txq_ctrl->txq.elts_comp = 0;
	return 0;
}

/*
 * Return 1 if the error CQE has already been marked as seen,
 * otherwise mark it as seen and return 0.
 */
static int
check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
{
	static const uint8_t magic[] = "seen";
	int ret = 1;
	unsigned int i;

	for (i = 0; i < sizeof(magic); ++i)
		if (!ret || err_cqe->rsvd1[i] != magic[i]) {
			ret = 0;
			err_cqe->rsvd1[i] = magic[i];
		}
	return ret;
}

/**
 * Handle an error CQE.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param err_cqe
 *   Pointer to the error CQE.
 *
 * @return
 *   A negative value if queue recovery failed, otherwise
 *   the error completion entry was handled successfully.
659 */ 660 static int 661 mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, 662 volatile struct mlx5_err_cqe *err_cqe) 663 { 664 if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { 665 const uint16_t wqe_m = ((1 << txq->wqe_n) - 1); 666 struct mlx5_txq_ctrl *txq_ctrl = 667 container_of(txq, struct mlx5_txq_ctrl, txq); 668 uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter); 669 int seen = check_err_cqe_seen(err_cqe); 670 671 if (!seen && txq_ctrl->dump_file_n < 672 txq_ctrl->priv->config.max_dump_files_num) { 673 MKSTR(err_str, "Unexpected CQE error syndrome " 674 "0x%02x CQN = %u SQN = %u wqe_counter = %u " 675 "wq_ci = %u cq_ci = %u", err_cqe->syndrome, 676 txq->cqe_s, txq->qp_num_8s >> 8, 677 rte_be_to_cpu_16(err_cqe->wqe_counter), 678 txq->wqe_ci, txq->cq_ci); 679 MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u", 680 PORT_ID(txq_ctrl->priv), txq->idx, 681 txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc()); 682 mlx5_dump_debug_information(name, NULL, err_str, 0); 683 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 684 (const void *)((uintptr_t) 685 txq->cqes), 686 sizeof(*err_cqe) * 687 (1 << txq->cqe_n)); 688 mlx5_dump_debug_information(name, "MLX5 Error SQ:", 689 (const void *)((uintptr_t) 690 txq->wqes), 691 MLX5_WQE_SIZE * 692 (1 << txq->wqe_n)); 693 txq_ctrl->dump_file_n++; 694 } 695 if (!seen) 696 /* 697 * Count errors in WQEs units. 698 * Later it can be improved to count error packets, 699 * for example, by SQ parsing to find how much packets 700 * should be counted for each WQE. 701 */ 702 txq->stats.oerrors += ((txq->wqe_ci & wqe_m) - 703 new_wqe_pi) & wqe_m; 704 if (tx_recover_qp(txq_ctrl)) { 705 /* Recovering failed - retry later on the same WQE. */ 706 return -1; 707 } 708 /* Release all the remaining buffers. */ 709 txq_free_elts(txq_ctrl); 710 } 711 return 0; 712 } 713 714 /** 715 * Translate RX completion flags to packet type. 716 * 717 * @param[in] rxq 718 * Pointer to RX queue structure. 719 * @param[in] cqe 720 * Pointer to CQE. 721 * 722 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 723 * 724 * @return 725 * Packet type for struct rte_mbuf. 726 */ 727 static inline uint32_t 728 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) 729 { 730 uint8_t idx; 731 uint8_t pinfo = cqe->pkt_info; 732 uint16_t ptype = cqe->hdr_type_etc; 733 734 /* 735 * The index to the array should have: 736 * bit[1:0] = l3_hdr_type 737 * bit[4:2] = l4_hdr_type 738 * bit[5] = ip_frag 739 * bit[6] = tunneled 740 * bit[7] = outer_l3_type 741 */ 742 idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); 743 return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); 744 } 745 746 /** 747 * Initialize Rx WQ and indexes. 748 * 749 * @param[in] rxq 750 * Pointer to RX queue structure. 
751 */ 752 void 753 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) 754 { 755 const unsigned int wqe_n = 1 << rxq->elts_n; 756 unsigned int i; 757 758 for (i = 0; (i != wqe_n); ++i) { 759 volatile struct mlx5_wqe_data_seg *scat; 760 uintptr_t addr; 761 uint32_t byte_count; 762 763 if (mlx5_rxq_mprq_enabled(rxq)) { 764 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i]; 765 766 scat = &((volatile struct mlx5_wqe_mprq *) 767 rxq->wqes)[i].dseg; 768 addr = (uintptr_t)mlx5_mprq_buf_addr(buf, 769 1 << rxq->strd_num_n); 770 byte_count = (1 << rxq->strd_sz_n) * 771 (1 << rxq->strd_num_n); 772 } else { 773 struct rte_mbuf *buf = (*rxq->elts)[i]; 774 775 scat = &((volatile struct mlx5_wqe_data_seg *) 776 rxq->wqes)[i]; 777 addr = rte_pktmbuf_mtod(buf, uintptr_t); 778 byte_count = DATA_LEN(buf); 779 } 780 /* scat->addr must be able to store a pointer. */ 781 assert(sizeof(scat->addr) >= sizeof(uintptr_t)); 782 *scat = (struct mlx5_wqe_data_seg){ 783 .addr = rte_cpu_to_be_64(addr), 784 .byte_count = rte_cpu_to_be_32(byte_count), 785 .lkey = mlx5_rx_addr2mr(rxq, addr), 786 }; 787 } 788 rxq->consumed_strd = 0; 789 rxq->decompressed = 0; 790 rxq->rq_pi = 0; 791 rxq->zip = (struct rxq_zip){ 792 .ai = 0, 793 }; 794 /* Update doorbell counter. */ 795 rxq->rq_ci = wqe_n >> rxq->sges_n; 796 rte_cio_wmb(); 797 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 798 } 799 800 /** 801 * Modify a Verbs/DevX queue state. 802 * This must be called from the primary process. 803 * 804 * @param dev 805 * Pointer to Ethernet device. 806 * @param sm 807 * State modify request parameters. 808 * 809 * @return 810 * 0 in case of success else non-zero value and rte_errno is set. 811 */ 812 int 813 mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, 814 const struct mlx5_mp_arg_queue_state_modify *sm) 815 { 816 int ret; 817 struct mlx5_priv *priv = dev->data->dev_private; 818 819 if (sm->is_wq) { 820 struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id]; 821 struct mlx5_rxq_ctrl *rxq_ctrl = 822 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 823 824 if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { 825 struct ibv_wq_attr mod = { 826 .attr_mask = IBV_WQ_ATTR_STATE, 827 .wq_state = sm->state, 828 }; 829 830 ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); 831 } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. 
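		 * Editorial note, not part of the original code: the DevX
		 * modify RQ command below takes an explicit current -> target
		 * state pair, so the requested Verbs WQ state is translated
		 * as RESET: ERR -> RST, RDY: RST -> RDY, and ERR: RDY -> ERR.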
*/ 832 struct mlx5_devx_modify_rq_attr rq_attr; 833 834 memset(&rq_attr, 0, sizeof(rq_attr)); 835 if (sm->state == IBV_WQS_RESET) { 836 rq_attr.rq_state = MLX5_RQC_STATE_ERR; 837 rq_attr.state = MLX5_RQC_STATE_RST; 838 } else if (sm->state == IBV_WQS_RDY) { 839 rq_attr.rq_state = MLX5_RQC_STATE_RST; 840 rq_attr.state = MLX5_RQC_STATE_RDY; 841 } else if (sm->state == IBV_WQS_ERR) { 842 rq_attr.rq_state = MLX5_RQC_STATE_RDY; 843 rq_attr.state = MLX5_RQC_STATE_ERR; 844 } 845 ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, 846 &rq_attr); 847 } 848 if (ret) { 849 DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s", 850 sm->state, strerror(errno)); 851 rte_errno = errno; 852 return ret; 853 } 854 } else { 855 struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id]; 856 struct mlx5_txq_ctrl *txq_ctrl = 857 container_of(txq, struct mlx5_txq_ctrl, txq); 858 struct ibv_qp_attr mod = { 859 .qp_state = IBV_QPS_RESET, 860 .port_num = (uint8_t)priv->ibv_port, 861 }; 862 struct ibv_qp *qp = txq_ctrl->obj->qp; 863 864 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 865 if (ret) { 866 DRV_LOG(ERR, "Cannot change the Tx QP state to RESET " 867 "%s", strerror(errno)); 868 rte_errno = errno; 869 return ret; 870 } 871 mod.qp_state = IBV_QPS_INIT; 872 ret = mlx5_glue->modify_qp(qp, &mod, 873 (IBV_QP_STATE | IBV_QP_PORT)); 874 if (ret) { 875 DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s", 876 strerror(errno)); 877 rte_errno = errno; 878 return ret; 879 } 880 mod.qp_state = IBV_QPS_RTR; 881 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 882 if (ret) { 883 DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s", 884 strerror(errno)); 885 rte_errno = errno; 886 return ret; 887 } 888 mod.qp_state = IBV_QPS_RTS; 889 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 890 if (ret) { 891 DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s", 892 strerror(errno)); 893 rte_errno = errno; 894 return ret; 895 } 896 } 897 return 0; 898 } 899 900 /** 901 * Modify a Verbs queue state. 902 * 903 * @param dev 904 * Pointer to Ethernet device. 905 * @param sm 906 * State modify request parameters. 907 * 908 * @return 909 * 0 in case of success else non-zero value. 910 */ 911 static int 912 mlx5_queue_state_modify(struct rte_eth_dev *dev, 913 struct mlx5_mp_arg_queue_state_modify *sm) 914 { 915 int ret = 0; 916 917 switch (rte_eal_process_type()) { 918 case RTE_PROC_PRIMARY: 919 ret = mlx5_queue_state_modify_primary(dev, sm); 920 break; 921 case RTE_PROC_SECONDARY: 922 ret = mlx5_mp_req_queue_state_modify(dev, sm); 923 break; 924 default: 925 break; 926 } 927 return ret; 928 } 929 930 /** 931 * Handle a Rx error. 932 * The function inserts the RQ state to reset when the first error CQE is 933 * shown, then drains the CQ by the caller function loop. When the CQ is empty, 934 * it moves the RQ state to ready and initializes the RQ. 935 * Next CQE identification and error counting are in the caller responsibility. 936 * 937 * @param[in] rxq 938 * Pointer to RX queue structure. 939 * @param[in] vec 940 * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ. 941 * 0 when called from non-vectorized Rx burst. 942 * 943 * @return 944 * -1 in case of recovery error, otherwise the CQE status. 
945 */ 946 int 947 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) 948 { 949 const uint16_t cqe_n = 1 << rxq->cqe_n; 950 const uint16_t cqe_mask = cqe_n - 1; 951 const unsigned int wqe_n = 1 << rxq->elts_n; 952 struct mlx5_rxq_ctrl *rxq_ctrl = 953 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 954 union { 955 volatile struct mlx5_cqe *cqe; 956 volatile struct mlx5_err_cqe *err_cqe; 957 } u = { 958 .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask], 959 }; 960 struct mlx5_mp_arg_queue_state_modify sm; 961 int ret; 962 963 switch (rxq->err_state) { 964 case MLX5_RXQ_ERR_STATE_NO_ERROR: 965 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET; 966 /* Fall-through */ 967 case MLX5_RXQ_ERR_STATE_NEED_RESET: 968 sm.is_wq = 1; 969 sm.queue_id = rxq->idx; 970 sm.state = IBV_WQS_RESET; 971 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm)) 972 return -1; 973 if (rxq_ctrl->dump_file_n < 974 rxq_ctrl->priv->config.max_dump_files_num) { 975 MKSTR(err_str, "Unexpected CQE error syndrome " 976 "0x%02x CQN = %u RQN = %u wqe_counter = %u" 977 " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome, 978 rxq->cqn, rxq_ctrl->wqn, 979 rte_be_to_cpu_16(u.err_cqe->wqe_counter), 980 rxq->rq_ci << rxq->sges_n, rxq->cq_ci); 981 MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u", 982 rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc()); 983 mlx5_dump_debug_information(name, NULL, err_str, 0); 984 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 985 (const void *)((uintptr_t) 986 rxq->cqes), 987 sizeof(*u.cqe) * cqe_n); 988 mlx5_dump_debug_information(name, "MLX5 Error RQ:", 989 (const void *)((uintptr_t) 990 rxq->wqes), 991 16 * wqe_n); 992 rxq_ctrl->dump_file_n++; 993 } 994 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY; 995 /* Fall-through */ 996 case MLX5_RXQ_ERR_STATE_NEED_READY: 997 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); 998 if (ret == MLX5_CQE_STATUS_HW_OWN) { 999 rte_cio_wmb(); 1000 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1001 rte_cio_wmb(); 1002 /* 1003 * The RQ consumer index must be zeroed while moving 1004 * from RESET state to RDY state. 1005 */ 1006 *rxq->rq_db = rte_cpu_to_be_32(0); 1007 rte_cio_wmb(); 1008 sm.is_wq = 1; 1009 sm.queue_id = rxq->idx; 1010 sm.state = IBV_WQS_RDY; 1011 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), 1012 &sm)) 1013 return -1; 1014 if (vec) { 1015 const uint16_t q_mask = wqe_n - 1; 1016 uint16_t elt_idx; 1017 struct rte_mbuf **elt; 1018 int i; 1019 unsigned int n = wqe_n - (rxq->rq_ci - 1020 rxq->rq_pi); 1021 1022 for (i = 0; i < (int)n; ++i) { 1023 elt_idx = (rxq->rq_ci + i) & q_mask; 1024 elt = &(*rxq->elts)[elt_idx]; 1025 *elt = rte_mbuf_raw_alloc(rxq->mp); 1026 if (!*elt) { 1027 for (i--; i >= 0; --i) { 1028 elt_idx = (rxq->rq_ci + 1029 i) & q_mask; 1030 elt = &(*rxq->elts) 1031 [elt_idx]; 1032 rte_pktmbuf_free_seg 1033 (*elt); 1034 } 1035 return -1; 1036 } 1037 } 1038 for (i = 0; i < (int)wqe_n; ++i) { 1039 elt = &(*rxq->elts)[i]; 1040 DATA_LEN(*elt) = 1041 (uint16_t)((*elt)->buf_len - 1042 rte_pktmbuf_headroom(*elt)); 1043 } 1044 /* Padding with a fake mbuf for vec Rx. */ 1045 for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) 1046 (*rxq->elts)[wqe_n + i] = 1047 &rxq->fake_mbuf; 1048 } 1049 mlx5_rxq_initialize(rxq); 1050 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 1051 } 1052 return ret; 1053 default: 1054 return -1; 1055 } 1056 } 1057 1058 /** 1059 * Get size of the next packet for a given CQE. For compressed CQEs, the 1060 * consumer index is updated only once all packets of the current one have 1061 * been processed. 1062 * 1063 * @param rxq 1064 * Pointer to RX queue. 
1065 * @param cqe 1066 * CQE to process. 1067 * @param[out] mcqe 1068 * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not 1069 * written. 1070 * 1071 * @return 1072 * 0 in case of empty CQE, otherwise the packet size in bytes. 1073 */ 1074 static inline int 1075 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 1076 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe) 1077 { 1078 struct rxq_zip *zip = &rxq->zip; 1079 uint16_t cqe_n = cqe_cnt + 1; 1080 int len; 1081 uint16_t idx, end; 1082 1083 do { 1084 len = 0; 1085 /* Process compressed data in the CQE and mini arrays. */ 1086 if (zip->ai) { 1087 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1088 (volatile struct mlx5_mini_cqe8 (*)[8]) 1089 (uintptr_t)(&(*rxq->cqes)[zip->ca & 1090 cqe_cnt].pkt_info); 1091 1092 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); 1093 *mcqe = &(*mc)[zip->ai & 7]; 1094 if ((++zip->ai & 7) == 0) { 1095 /* Invalidate consumed CQEs */ 1096 idx = zip->ca; 1097 end = zip->na; 1098 while (idx != end) { 1099 (*rxq->cqes)[idx & cqe_cnt].op_own = 1100 MLX5_CQE_INVALIDATE; 1101 ++idx; 1102 } 1103 /* 1104 * Increment consumer index to skip the number 1105 * of CQEs consumed. Hardware leaves holes in 1106 * the CQ ring for software use. 1107 */ 1108 zip->ca = zip->na; 1109 zip->na += 8; 1110 } 1111 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 1112 /* Invalidate the rest */ 1113 idx = zip->ca; 1114 end = zip->cq_ci; 1115 1116 while (idx != end) { 1117 (*rxq->cqes)[idx & cqe_cnt].op_own = 1118 MLX5_CQE_INVALIDATE; 1119 ++idx; 1120 } 1121 rxq->cq_ci = zip->cq_ci; 1122 zip->ai = 0; 1123 } 1124 /* 1125 * No compressed data, get next CQE and verify if it is 1126 * compressed. 1127 */ 1128 } else { 1129 int ret; 1130 int8_t op_own; 1131 1132 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 1133 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 1134 if (unlikely(ret == MLX5_CQE_STATUS_ERR || 1135 rxq->err_state)) { 1136 ret = mlx5_rx_err_handle(rxq, 0); 1137 if (ret == MLX5_CQE_STATUS_HW_OWN || 1138 ret == -1) 1139 return 0; 1140 } else { 1141 return 0; 1142 } 1143 } 1144 ++rxq->cq_ci; 1145 op_own = cqe->op_own; 1146 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 1147 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1148 (volatile struct mlx5_mini_cqe8 (*)[8]) 1149 (uintptr_t)(&(*rxq->cqes) 1150 [rxq->cq_ci & 1151 cqe_cnt].pkt_info); 1152 1153 /* Fix endianness. */ 1154 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); 1155 /* 1156 * Current mini array position is the one 1157 * returned by check_cqe64(). 1158 * 1159 * If completion comprises several mini arrays, 1160 * as a special case the second one is located 1161 * 7 CQEs after the initial CQE instead of 8 1162 * for subsequent ones. 1163 */ 1164 zip->ca = rxq->cq_ci; 1165 zip->na = zip->ca + 7; 1166 /* Compute the next non compressed CQE. */ 1167 --rxq->cq_ci; 1168 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 1169 /* Get packet size to return. */ 1170 len = rte_be_to_cpu_32((*mc)[0].byte_cnt); 1171 *mcqe = &(*mc)[0]; 1172 zip->ai = 1; 1173 /* Prefetch all to be invalidated */ 1174 idx = zip->ca; 1175 end = zip->cq_ci; 1176 while (idx != end) { 1177 rte_prefetch0(&(*rxq->cqes)[(idx) & 1178 cqe_cnt]); 1179 ++idx; 1180 } 1181 } else { 1182 len = rte_be_to_cpu_32(cqe->byte_cnt); 1183 } 1184 } 1185 if (unlikely(rxq->err_state)) { 1186 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1187 ++rxq->stats.idropped; 1188 } else { 1189 return len; 1190 } 1191 } while (1); 1192 } 1193 1194 /** 1195 * Translate RX completion flags to offload flags. 
1196 * 1197 * @param[in] cqe 1198 * Pointer to CQE. 1199 * 1200 * @return 1201 * Offload flags (ol_flags) for struct rte_mbuf. 1202 */ 1203 static inline uint32_t 1204 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) 1205 { 1206 uint32_t ol_flags = 0; 1207 uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); 1208 1209 ol_flags = 1210 TRANSPOSE(flags, 1211 MLX5_CQE_RX_L3_HDR_VALID, 1212 PKT_RX_IP_CKSUM_GOOD) | 1213 TRANSPOSE(flags, 1214 MLX5_CQE_RX_L4_HDR_VALID, 1215 PKT_RX_L4_CKSUM_GOOD); 1216 return ol_flags; 1217 } 1218 1219 /** 1220 * Fill in mbuf fields from RX completion flags. 1221 * Note that pkt->ol_flags should be initialized outside of this function. 1222 * 1223 * @param rxq 1224 * Pointer to RX queue. 1225 * @param pkt 1226 * mbuf to fill. 1227 * @param cqe 1228 * CQE to process. 1229 * @param rss_hash_res 1230 * Packet RSS Hash result. 1231 */ 1232 static inline void 1233 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 1234 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) 1235 { 1236 /* Update packet information. */ 1237 pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); 1238 if (rss_hash_res && rxq->rss_hash) { 1239 pkt->hash.rss = rss_hash_res; 1240 pkt->ol_flags |= PKT_RX_RSS_HASH; 1241 } 1242 if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { 1243 pkt->ol_flags |= PKT_RX_FDIR; 1244 if (cqe->sop_drop_qpn != 1245 rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { 1246 uint32_t mark = cqe->sop_drop_qpn; 1247 1248 pkt->ol_flags |= PKT_RX_FDIR_ID; 1249 pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); 1250 } 1251 } 1252 if (rte_flow_dynf_metadata_avail() && cqe->flow_table_metadata) { 1253 pkt->ol_flags |= PKT_RX_DYNF_METADATA; 1254 *RTE_FLOW_DYNF_METADATA(pkt) = cqe->flow_table_metadata; 1255 } 1256 if (rxq->csum) 1257 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); 1258 if (rxq->vlan_strip && 1259 (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { 1260 pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; 1261 pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); 1262 } 1263 if (rxq->hw_timestamp) { 1264 pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); 1265 pkt->ol_flags |= PKT_RX_TIMESTAMP; 1266 } 1267 } 1268 1269 /** 1270 * DPDK callback for RX. 1271 * 1272 * @param dpdk_rxq 1273 * Generic pointer to RX queue structure. 1274 * @param[out] pkts 1275 * Array to store received packets. 1276 * @param pkts_n 1277 * Maximum number of packets in array. 1278 * 1279 * @return 1280 * Number of packets successfully received (<= pkts_n). 1281 */ 1282 uint16_t 1283 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1284 { 1285 struct mlx5_rxq_data *rxq = dpdk_rxq; 1286 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 1287 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 1288 const unsigned int sges_n = rxq->sges_n; 1289 struct rte_mbuf *pkt = NULL; 1290 struct rte_mbuf *seg = NULL; 1291 volatile struct mlx5_cqe *cqe = 1292 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1293 unsigned int i = 0; 1294 unsigned int rq_ci = rxq->rq_ci << sges_n; 1295 int len = 0; /* keep its value across iterations. 
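	 * Editorial note, not part of the original code: a packet larger
	 * than one mbuf data segment spans several SGEs, so the remaining
	 * byte count is carried in "len" from one loop iteration to the
	 * next while the segments of the same packet are being chained.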
*/ 1296 1297 while (pkts_n) { 1298 unsigned int idx = rq_ci & wqe_cnt; 1299 volatile struct mlx5_wqe_data_seg *wqe = 1300 &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; 1301 struct rte_mbuf *rep = (*rxq->elts)[idx]; 1302 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1303 uint32_t rss_hash_res; 1304 1305 if (pkt) 1306 NEXT(seg) = rep; 1307 seg = rep; 1308 rte_prefetch0(seg); 1309 rte_prefetch0(cqe); 1310 rte_prefetch0(wqe); 1311 rep = rte_mbuf_raw_alloc(rxq->mp); 1312 if (unlikely(rep == NULL)) { 1313 ++rxq->stats.rx_nombuf; 1314 if (!pkt) { 1315 /* 1316 * no buffers before we even started, 1317 * bail out silently. 1318 */ 1319 break; 1320 } 1321 while (pkt != seg) { 1322 assert(pkt != (*rxq->elts)[idx]); 1323 rep = NEXT(pkt); 1324 NEXT(pkt) = NULL; 1325 NB_SEGS(pkt) = 1; 1326 rte_mbuf_raw_free(pkt); 1327 pkt = rep; 1328 } 1329 break; 1330 } 1331 if (!pkt) { 1332 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1333 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe); 1334 if (!len) { 1335 rte_mbuf_raw_free(rep); 1336 break; 1337 } 1338 pkt = seg; 1339 assert(len >= (rxq->crc_present << 2)); 1340 pkt->ol_flags &= EXT_ATTACHED_MBUF; 1341 /* If compressed, take hash result from mini-CQE. */ 1342 rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? 1343 cqe->rx_hash_res : 1344 mcqe->rx_hash_result); 1345 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1346 if (rxq->crc_present) 1347 len -= RTE_ETHER_CRC_LEN; 1348 PKT_LEN(pkt) = len; 1349 if (cqe->lro_num_seg > 1) { 1350 mlx5_lro_update_hdr 1351 (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, 1352 len); 1353 pkt->ol_flags |= PKT_RX_LRO; 1354 pkt->tso_segsz = len / cqe->lro_num_seg; 1355 } 1356 } 1357 DATA_LEN(rep) = DATA_LEN(seg); 1358 PKT_LEN(rep) = PKT_LEN(seg); 1359 SET_DATA_OFF(rep, DATA_OFF(seg)); 1360 PORT(rep) = PORT(seg); 1361 (*rxq->elts)[idx] = rep; 1362 /* 1363 * Fill NIC descriptor with the new buffer. The lkey and size 1364 * of the buffers are already known, only the buffer address 1365 * changes. 1366 */ 1367 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); 1368 /* If there's only one MR, no need to replace LKey in WQE. */ 1369 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1370 wqe->lkey = mlx5_rx_mb2mr(rxq, rep); 1371 if (len > DATA_LEN(seg)) { 1372 len -= DATA_LEN(seg); 1373 ++NB_SEGS(pkt); 1374 ++rq_ci; 1375 continue; 1376 } 1377 DATA_LEN(seg) = len; 1378 #ifdef MLX5_PMD_SOFT_COUNTERS 1379 /* Increment bytes counter. */ 1380 rxq->stats.ibytes += PKT_LEN(pkt); 1381 #endif 1382 /* Return packet. */ 1383 *(pkts++) = pkt; 1384 pkt = NULL; 1385 --pkts_n; 1386 ++i; 1387 /* Align consumer index to the next stride. */ 1388 rq_ci >>= sges_n; 1389 ++rq_ci; 1390 rq_ci <<= sges_n; 1391 } 1392 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) 1393 return 0; 1394 /* Update the consumer index. */ 1395 rxq->rq_ci = rq_ci >> sges_n; 1396 rte_cio_wmb(); 1397 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1398 rte_cio_wmb(); 1399 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1400 #ifdef MLX5_PMD_SOFT_COUNTERS 1401 /* Increment packets counter. */ 1402 rxq->stats.ipackets += i; 1403 #endif 1404 return i; 1405 } 1406 1407 /** 1408 * Update LRO packet TCP header. 1409 * The HW LRO feature doesn't update the TCP header after coalescing the 1410 * TCP segments but supplies information in CQE to fill it by SW. 1411 * 1412 * @param tcp 1413 * Pointer to the TCP header. 1414 * @param cqe 1415 * Pointer to the completion entry.. 1416 * @param phcsum 1417 * The L3 pseudo-header checksum. 
1418 */ 1419 static inline void 1420 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, 1421 volatile struct mlx5_cqe *restrict cqe, 1422 uint32_t phcsum) 1423 { 1424 uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) & 1425 MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT; 1426 /* 1427 * The HW calculates only the TCP payload checksum, need to complete 1428 * the TCP header checksum and the L3 pseudo-header checksum. 1429 */ 1430 uint32_t csum = phcsum + cqe->csum; 1431 1432 if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK || 1433 l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) { 1434 tcp->tcp_flags |= RTE_TCP_ACK_FLAG; 1435 tcp->recv_ack = cqe->lro_ack_seq_num; 1436 tcp->rx_win = cqe->lro_tcp_win; 1437 } 1438 if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK) 1439 tcp->tcp_flags |= RTE_TCP_PSH_FLAG; 1440 tcp->cksum = 0; 1441 csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4); 1442 csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff); 1443 csum = (~csum) & 0xffff; 1444 if (csum == 0) 1445 csum = 0xffff; 1446 tcp->cksum = csum; 1447 } 1448 1449 /** 1450 * Update LRO packet headers. 1451 * The HW LRO feature doesn't update the L3/TCP headers after coalescing the 1452 * TCP segments but supply information in CQE to fill it by SW. 1453 * 1454 * @param padd 1455 * The packet address. 1456 * @param cqe 1457 * Pointer to the completion entry.. 1458 * @param len 1459 * The packet length. 1460 */ 1461 static inline void 1462 mlx5_lro_update_hdr(uint8_t *restrict padd, 1463 volatile struct mlx5_cqe *restrict cqe, 1464 uint32_t len) 1465 { 1466 union { 1467 struct rte_ether_hdr *eth; 1468 struct rte_vlan_hdr *vlan; 1469 struct rte_ipv4_hdr *ipv4; 1470 struct rte_ipv6_hdr *ipv6; 1471 struct rte_tcp_hdr *tcp; 1472 uint8_t *hdr; 1473 } h = { 1474 .hdr = padd, 1475 }; 1476 uint16_t proto = h.eth->ether_type; 1477 uint32_t phcsum; 1478 1479 h.eth++; 1480 while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) || 1481 proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) { 1482 proto = h.vlan->eth_proto; 1483 h.vlan++; 1484 } 1485 if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) { 1486 h.ipv4->time_to_live = cqe->lro_min_ttl; 1487 h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd)); 1488 h.ipv4->hdr_checksum = 0; 1489 h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4); 1490 phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0); 1491 h.ipv4++; 1492 } else { 1493 h.ipv6->hop_limits = cqe->lro_min_ttl; 1494 h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) - 1495 sizeof(*h.ipv6)); 1496 phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0); 1497 h.ipv6++; 1498 } 1499 mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum); 1500 } 1501 1502 void 1503 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) 1504 { 1505 struct mlx5_mprq_buf *buf = opaque; 1506 1507 if (rte_atomic16_read(&buf->refcnt) == 1) { 1508 rte_mempool_put(buf->mp, buf); 1509 } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { 1510 rte_atomic16_set(&buf->refcnt, 1); 1511 rte_mempool_put(buf->mp, buf); 1512 } 1513 } 1514 1515 void 1516 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) 1517 { 1518 mlx5_mprq_buf_free_cb(NULL, buf); 1519 } 1520 1521 static inline void 1522 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 1523 const unsigned int strd_n) 1524 { 1525 struct mlx5_mprq_buf *rep = rxq->mprq_repl; 1526 volatile struct mlx5_wqe_data_seg *wqe = 1527 &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; 1528 void *addr; 1529 1530 assert(rep != NULL); 1531 /* Replace MPRQ buf. */ 1532 (*rxq->mprq_bufs)[rq_idx] = rep; 1533 /* Replace WQE. 
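	 * Editorial note, not part of the original code: the consumed MPRQ
	 * buffer may still be referenced by mbufs carrying its strides as
	 * external buffers, so the WQE is repointed to the spare buffer
	 * taken from rxq->mprq_repl and a fresh spare is fetched from the
	 * mempool just below.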
*/ 1534 addr = mlx5_mprq_buf_addr(rep, strd_n); 1535 wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); 1536 /* If there's only one MR, no need to replace LKey in WQE. */ 1537 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1538 wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); 1539 /* Stash a mbuf for next replacement. */ 1540 if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) 1541 rxq->mprq_repl = rep; 1542 else 1543 rxq->mprq_repl = NULL; 1544 } 1545 1546 /** 1547 * DPDK callback for RX with Multi-Packet RQ support. 1548 * 1549 * @param dpdk_rxq 1550 * Generic pointer to RX queue structure. 1551 * @param[out] pkts 1552 * Array to store received packets. 1553 * @param pkts_n 1554 * Maximum number of packets in array. 1555 * 1556 * @return 1557 * Number of packets successfully received (<= pkts_n). 1558 */ 1559 uint16_t 1560 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1561 { 1562 struct mlx5_rxq_data *rxq = dpdk_rxq; 1563 const unsigned int strd_n = 1 << rxq->strd_num_n; 1564 const unsigned int strd_sz = 1 << rxq->strd_sz_n; 1565 const unsigned int strd_shift = 1566 MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; 1567 const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; 1568 const unsigned int wq_mask = (1 << rxq->elts_n) - 1; 1569 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1570 unsigned int i = 0; 1571 uint32_t rq_ci = rxq->rq_ci; 1572 uint16_t consumed_strd = rxq->consumed_strd; 1573 uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM; 1574 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1575 1576 while (i < pkts_n) { 1577 struct rte_mbuf *pkt; 1578 void *addr; 1579 int ret; 1580 unsigned int len; 1581 uint16_t strd_cnt; 1582 uint16_t strd_idx; 1583 uint32_t offset; 1584 uint32_t byte_cnt; 1585 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1586 uint32_t rss_hash_res = 0; 1587 uint8_t lro_num_seg; 1588 1589 if (consumed_strd == strd_n) { 1590 /* Replace WQE only if the buffer is still in use. */ 1591 if (rte_atomic16_read(&buf->refcnt) > 1) { 1592 mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n); 1593 /* Release the old buffer. */ 1594 mlx5_mprq_buf_free(buf); 1595 } else if (unlikely(rxq->mprq_repl == NULL)) { 1596 struct mlx5_mprq_buf *rep; 1597 1598 /* 1599 * Currently, the MPRQ mempool is out of buffer 1600 * and doing memcpy regardless of the size of Rx 1601 * packet. Retry allocation to get back to 1602 * normal. 1603 */ 1604 if (!rte_mempool_get(rxq->mprq_mp, 1605 (void **)&rep)) 1606 rxq->mprq_repl = rep; 1607 } 1608 /* Advance to the next WQE. */ 1609 consumed_strd = 0; 1610 ++rq_ci; 1611 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1612 } 1613 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1614 ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe); 1615 if (!ret) 1616 break; 1617 byte_cnt = ret; 1618 strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> 1619 MLX5_MPRQ_STRIDE_NUM_SHIFT; 1620 assert(strd_cnt); 1621 consumed_strd += strd_cnt; 1622 if (byte_cnt & MLX5_MPRQ_FILLER_MASK) 1623 continue; 1624 if (mcqe == NULL) { 1625 rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); 1626 strd_idx = rte_be_to_cpu_16(cqe->wqe_counter); 1627 } else { 1628 /* mini-CQE for MPRQ doesn't have hash result. */ 1629 strd_idx = rte_be_to_cpu_16(mcqe->stride_idx); 1630 } 1631 assert(strd_idx < strd_n); 1632 assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask)); 1633 lro_num_seg = cqe->lro_num_seg; 1634 /* 1635 * Currently configured to receive a packet per a stride. 
But if the MTU is adjusted through the kernel interface, the device
		 * could consume multiple strides without raising an error.
		 * In this case, the packet should be dropped because it is
		 * bigger than the max_rx_pkt_len.
		 */
		if (unlikely(!lro_num_seg && strd_cnt > 1)) {
			++rxq->stats.idropped;
			continue;
		}
		pkt = rte_pktmbuf_alloc(rxq->mp);
		if (unlikely(pkt == NULL)) {
			++rxq->stats.rx_nombuf;
			break;
		}
		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
		assert((int)len >= (rxq->crc_present << 2));
		if (rxq->crc_present)
			len -= RTE_ETHER_CRC_LEN;
		offset = strd_idx * strd_sz + strd_shift;
		addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
		/*
		 * Memcpy packets to the target mbuf if:
		 * - The size of the packet is smaller than
		 *   mprq_max_memcpy_len.
		 * - Out of buffer in the mempool for Multi-Packet RQ.
		 */
		if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) {
			/*
			 * When memcpy'ing a packet due to out-of-buffer, the
			 * packet must be smaller than the target mbuf.
			 */
			if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
				rte_pktmbuf_free_seg(pkt);
				++rxq->stats.idropped;
				continue;
			}
			rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len);
			DATA_LEN(pkt) = len;
		} else {
			rte_iova_t buf_iova;
			struct rte_mbuf_ext_shared_info *shinfo;
			uint16_t buf_len = strd_cnt * strd_sz;
			void *buf_addr;

			/* Increment the refcnt of the whole chunk. */
			rte_atomic16_add_return(&buf->refcnt, 1);
			assert((uint16_t)rte_atomic16_read(&buf->refcnt) <=
			       strd_n + 1);
			buf_addr = RTE_PTR_SUB(addr, headroom_sz);
			/*
			 * The MLX5 device doesn't use the iova itself, but it
			 * is necessary when the Rx packet is transmitted via
			 * a different PMD.
			 */
			buf_iova = rte_mempool_virt2iova(buf) +
				   RTE_PTR_DIFF(buf_addr, buf);
			shinfo = &buf->shinfos[strd_idx];
			rte_mbuf_ext_refcnt_set(shinfo, 1);
			/*
			 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
			 * attaching the stride to the mbuf; more offload
			 * flags will be added below by calling
			 * rxq_cq_to_mbuf(). Other fields will be overwritten.
			 */
			rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova,
						  buf_len, shinfo);
			/* Set mbuf head-room. */
			pkt->data_off = headroom_sz;
			assert(pkt->ol_flags == EXT_ATTACHED_MBUF);
			/*
			 * Prevent potential overflow due to MTU change
			 * through the kernel interface.
			 */
			if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
				rte_pktmbuf_free_seg(pkt);
				++rxq->stats.idropped;
				continue;
			}
			DATA_LEN(pkt) = len;
			/*
			 * An LRO packet may consume all the stride memory; in
			 * this case the packet head-room space is not
			 * guaranteed, so an empty mbuf must be added to
			 * provide the head-room.
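			 *
			 * Editorial note, not part of the original code: the
			 * code below allocates a regular mbuf from rxq->mp,
			 * makes it the (empty) packet head so the usual
			 * head-room is available there, and chains the
			 * external-buffer mbuf holding the stride data as
			 * the second segment (NB_SEGS becomes 2).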
1718 */ 1719 if (!rxq->strd_headroom_en) { 1720 struct rte_mbuf *headroom_mbuf = 1721 rte_pktmbuf_alloc(rxq->mp); 1722 1723 if (unlikely(headroom_mbuf == NULL)) { 1724 rte_pktmbuf_free_seg(pkt); 1725 ++rxq->stats.rx_nombuf; 1726 break; 1727 } 1728 PORT(pkt) = rxq->port_id; 1729 NEXT(headroom_mbuf) = pkt; 1730 pkt = headroom_mbuf; 1731 NB_SEGS(pkt) = 2; 1732 } 1733 } 1734 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1735 if (lro_num_seg > 1) { 1736 mlx5_lro_update_hdr(addr, cqe, len); 1737 pkt->ol_flags |= PKT_RX_LRO; 1738 pkt->tso_segsz = strd_sz; 1739 } 1740 PKT_LEN(pkt) = len; 1741 PORT(pkt) = rxq->port_id; 1742 #ifdef MLX5_PMD_SOFT_COUNTERS 1743 /* Increment bytes counter. */ 1744 rxq->stats.ibytes += PKT_LEN(pkt); 1745 #endif 1746 /* Return packet. */ 1747 *(pkts++) = pkt; 1748 ++i; 1749 } 1750 /* Update the consumer indexes. */ 1751 rxq->consumed_strd = consumed_strd; 1752 rte_cio_wmb(); 1753 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1754 if (rq_ci != rxq->rq_ci) { 1755 rxq->rq_ci = rq_ci; 1756 rte_cio_wmb(); 1757 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1758 } 1759 #ifdef MLX5_PMD_SOFT_COUNTERS 1760 /* Increment packets counter. */ 1761 rxq->stats.ipackets += i; 1762 #endif 1763 return i; 1764 } 1765 1766 /** 1767 * Dummy DPDK callback for TX. 1768 * 1769 * This function is used to temporarily replace the real callback during 1770 * unsafe control operations on the queue, or in case of error. 1771 * 1772 * @param dpdk_txq 1773 * Generic pointer to TX queue structure. 1774 * @param[in] pkts 1775 * Packets to transmit. 1776 * @param pkts_n 1777 * Number of packets in array. 1778 * 1779 * @return 1780 * Number of packets successfully transmitted (<= pkts_n). 1781 */ 1782 uint16_t 1783 removed_tx_burst(void *dpdk_txq __rte_unused, 1784 struct rte_mbuf **pkts __rte_unused, 1785 uint16_t pkts_n __rte_unused) 1786 { 1787 rte_mb(); 1788 return 0; 1789 } 1790 1791 /** 1792 * Dummy DPDK callback for RX. 1793 * 1794 * This function is used to temporarily replace the real callback during 1795 * unsafe control operations on the queue, or in case of error. 1796 * 1797 * @param dpdk_rxq 1798 * Generic pointer to RX queue structure. 1799 * @param[out] pkts 1800 * Array to store received packets. 1801 * @param pkts_n 1802 * Maximum number of packets in array. 1803 * 1804 * @return 1805 * Number of packets successfully received (<= pkts_n). 1806 */ 1807 uint16_t 1808 removed_rx_burst(void *dpdk_txq __rte_unused, 1809 struct rte_mbuf **pkts __rte_unused, 1810 uint16_t pkts_n __rte_unused) 1811 { 1812 rte_mb(); 1813 return 0; 1814 } 1815 1816 /* 1817 * Vectorized Rx/Tx routines are not compiled in when required vector 1818 * instructions are not supported on a target architecture. The following null 1819 * stubs are needed for linkage when those are not included outside of this file 1820 * (e.g. mlx5_rxtx_vec_sse.c for x86). 1821 */ 1822 1823 __rte_weak uint16_t 1824 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, 1825 struct rte_mbuf **pkts __rte_unused, 1826 uint16_t pkts_n __rte_unused) 1827 { 1828 return 0; 1829 } 1830 1831 __rte_weak int 1832 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) 1833 { 1834 return -ENOTSUP; 1835 } 1836 1837 __rte_weak int 1838 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) 1839 { 1840 return -ENOTSUP; 1841 } 1842 1843 /** 1844 * Free the mbufs from the linear array of pointers. 1845 * 1846 * @param pkts 1847 * Pointer to array of packets to be free. 1848 * @param pkts_n 1849 * Number of packets to be freed. 
1850 * @param olx 1851 * Configured Tx offloads mask. It is fully defined at 1852 * compile time and may be used for optimization. 1853 */ 1854 static __rte_always_inline void 1855 mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, 1856 unsigned int pkts_n, 1857 unsigned int olx __rte_unused) 1858 { 1859 struct rte_mempool *pool = NULL; 1860 struct rte_mbuf **p_free = NULL; 1861 struct rte_mbuf *mbuf; 1862 unsigned int n_free = 0; 1863 1864 /* 1865 * The implemented algorithm eliminates 1866 * copying pointers to temporary array 1867 * for rte_mempool_put_bulk() calls. 1868 */ 1869 assert(pkts); 1870 assert(pkts_n); 1871 for (;;) { 1872 for (;;) { 1873 /* 1874 * Decrement mbuf reference counter, detach 1875 * indirect and external buffers if needed. 1876 */ 1877 mbuf = rte_pktmbuf_prefree_seg(*pkts); 1878 if (likely(mbuf != NULL)) { 1879 assert(mbuf == *pkts); 1880 if (likely(n_free != 0)) { 1881 if (unlikely(pool != mbuf->pool)) 1882 /* From different pool. */ 1883 break; 1884 } else { 1885 /* Start new scan array. */ 1886 pool = mbuf->pool; 1887 p_free = pkts; 1888 } 1889 ++n_free; 1890 ++pkts; 1891 --pkts_n; 1892 if (unlikely(pkts_n == 0)) { 1893 mbuf = NULL; 1894 break; 1895 } 1896 } else { 1897 /* 1898 * This happens if mbuf is still referenced. 1899 * We can't put it back to the pool, skip. 1900 */ 1901 ++pkts; 1902 --pkts_n; 1903 if (unlikely(n_free != 0)) 1904 /* There is some array to free.*/ 1905 break; 1906 if (unlikely(pkts_n == 0)) 1907 /* Last mbuf, nothing to free. */ 1908 return; 1909 } 1910 } 1911 for (;;) { 1912 /* 1913 * This loop is implemented to avoid multiple 1914 * inlining of rte_mempool_put_bulk(). 1915 */ 1916 assert(pool); 1917 assert(p_free); 1918 assert(n_free); 1919 /* 1920 * Free the array of pre-freed mbufs 1921 * belonging to the same memory pool. 1922 */ 1923 rte_mempool_put_bulk(pool, (void *)p_free, n_free); 1924 if (unlikely(mbuf != NULL)) { 1925 /* There is the request to start new scan. */ 1926 pool = mbuf->pool; 1927 p_free = pkts++; 1928 n_free = 1; 1929 --pkts_n; 1930 if (likely(pkts_n != 0)) 1931 break; 1932 /* 1933 * This is the last mbuf to be freed. 1934 * Do one more loop iteration to complete. 1935 * This is rare case of the last unique mbuf. 1936 */ 1937 mbuf = NULL; 1938 continue; 1939 } 1940 if (likely(pkts_n == 0)) 1941 return; 1942 n_free = 0; 1943 break; 1944 } 1945 } 1946 } 1947 1948 /** 1949 * Free the mbuf from the elts ring buffer till new tail. 1950 * 1951 * @param txq 1952 * Pointer to Tx queue structure. 1953 * @param tail 1954 * Index in elts to free up to, becomes new elts tail. 1955 * @param olx 1956 * Configured Tx offloads mask. It is fully defined at 1957 * compile time and may be used for optimization. 1958 */ 1959 static __rte_always_inline void 1960 mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, 1961 uint16_t tail, 1962 unsigned int olx __rte_unused) 1963 { 1964 uint16_t n_elts = tail - txq->elts_tail; 1965 1966 assert(n_elts); 1967 assert(n_elts <= txq->elts_s); 1968 /* 1969 * Implement a loop to support ring buffer wraparound 1970 * with single inlining of mlx5_tx_free_mbuf(). 1971 */ 1972 do { 1973 unsigned int part; 1974 1975 part = txq->elts_s - (txq->elts_tail & txq->elts_m); 1976 part = RTE_MIN(part, n_elts); 1977 assert(part); 1978 assert(part <= txq->elts_s); 1979 mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m], 1980 part, olx); 1981 txq->elts_tail += part; 1982 n_elts -= part; 1983 } while (n_elts); 1984 } 1985 1986 /** 1987 * Store the mbuf being sent into elts ring buffer. 
1988 * On Tx completion these mbufs will be freed. 1989 * 1990 * @param txq 1991 * Pointer to Tx queue structure. 1992 * @param pkts 1993 * Pointer to array of packets to be stored. 1994 * @param pkts_n 1995 * Number of packets to be stored. 1996 * @param olx 1997 * Configured Tx offloads mask. It is fully defined at 1998 * compile time and may be used for optimization. 1999 */ 2000 static __rte_always_inline void 2001 mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, 2002 struct rte_mbuf **restrict pkts, 2003 unsigned int pkts_n, 2004 unsigned int olx __rte_unused) 2005 { 2006 unsigned int part; 2007 struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts; 2008 2009 assert(pkts); 2010 assert(pkts_n); 2011 part = txq->elts_s - (txq->elts_head & txq->elts_m); 2012 assert(part); 2013 assert(part <= txq->elts_s); 2014 /* This code is a good candidate for vectorizing with SIMD. */ 2015 rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)), 2016 (void *)pkts, 2017 RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *)); 2018 txq->elts_head += pkts_n; 2019 if (unlikely(part < pkts_n)) 2020 /* The copy is wrapping around the elts array. */ 2021 rte_memcpy((void *)elts, (void *)(pkts + part), 2022 (pkts_n - part) * sizeof(struct rte_mbuf *)); 2023 } 2024 2025 /** 2026 * Update completion queue consuming index via doorbell 2027 * and flush the completed data buffers. 2028 * 2029 * @param txq 2030 * Pointer to TX queue structure. 2031 * @param last_cqe 2032 * Pointer to the last valid CQE; if not NULL, update txq->wqe_pi and flush the completed buffers. 2033 * @param olx 2034 * Configured Tx offloads mask. It is fully defined at 2035 * compile time and may be used for optimization. 2036 */ 2037 static __rte_always_inline void 2038 mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, 2039 volatile struct mlx5_cqe *last_cqe, 2040 unsigned int olx __rte_unused) 2041 { 2042 if (likely(last_cqe != NULL)) { 2043 uint16_t tail; 2044 2045 txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter); 2046 tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m]; 2047 if (likely(tail != txq->elts_tail)) { 2048 mlx5_tx_free_elts(txq, tail, olx); 2049 assert(tail == txq->elts_tail); 2050 } 2051 } 2052 } 2053 2054 /** 2055 * Manage TX completions. This routine checks the CQ for 2056 * arrived CQEs, deduces the last accomplished WQE in SQ, 2057 * updates the SQ producer index and frees all completed mbufs. 2058 * 2059 * @param txq 2060 * Pointer to TX queue structure. 2061 * @param olx 2062 * Configured Tx offloads mask. It is fully defined at 2063 * compile time and may be used for optimization. 2064 * 2065 * NOTE: not inlined intentionally, it makes tx_burst 2066 * routine smaller, simpler and faster - from experiments. 2067 */ 2068 static void 2069 mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, 2070 unsigned int olx __rte_unused) 2071 { 2072 unsigned int count = MLX5_TX_COMP_MAX_CQE; 2073 volatile struct mlx5_cqe *last_cqe = NULL; 2074 uint16_t ci = txq->cq_ci; 2075 int ret; 2076 2077 static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); 2078 static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value"); 2079 do { 2080 volatile struct mlx5_cqe *cqe; 2081 2082 cqe = &txq->cqes[ci & txq->cqe_m]; 2083 ret = check_cqe(cqe, txq->cqe_s, ci); 2084 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 2085 if (likely(ret != MLX5_CQE_STATUS_ERR)) { 2086 /* No new CQEs in completion queue. */ 2087 assert(ret == MLX5_CQE_STATUS_HW_OWN); 2088 break; 2089 } 2090 /* 2091 * Some error occurred, try to restart.
2092 * We have no barrier after WQE related Doorbell 2093 * written, make sure all writes are completed 2094 * here, before we might perform SQ reset. 2095 */ 2096 rte_wmb(); 2097 txq->cq_ci = ci; 2098 ret = mlx5_tx_error_cqe_handle 2099 (txq, (volatile struct mlx5_err_cqe *)cqe); 2100 if (unlikely(ret < 0)) { 2101 /* 2102 * Some error occurred on queue error 2103 * handling, we do not advance the index 2104 * here, allowing to retry on next call. 2105 */ 2106 return; 2107 } 2108 /* 2109 * We are going to fetch all entries with 2110 * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status. 2111 * The send queue is supposed to be empty. 2112 */ 2113 ++ci; 2114 txq->cq_pi = ci; 2115 last_cqe = NULL; 2116 continue; 2117 } 2118 /* Normal transmit completion. */ 2119 assert(ci != txq->cq_pi); 2120 assert((txq->fcqs[ci & txq->cqe_m] >> 16) == cqe->wqe_counter); 2121 ++ci; 2122 last_cqe = cqe; 2123 /* 2124 * We have to restrict the amount of processed CQEs 2125 * in one tx_burst routine call. The CQ may be large 2126 * and many CQEs may be updated by the NIC in one 2127 * transaction. Buffers freeing is time consuming, 2128 * multiple iterations may introduce significant 2129 * latency. 2130 */ 2131 if (likely(--count == 0)) 2132 break; 2133 } while (true); 2134 if (likely(ci != txq->cq_ci)) { 2135 /* 2136 * Update completion queue consuming index 2137 * and ring doorbell to notify hardware. 2138 */ 2139 rte_compiler_barrier(); 2140 txq->cq_ci = ci; 2141 *txq->cq_db = rte_cpu_to_be_32(ci); 2142 mlx5_tx_comp_flush(txq, last_cqe, olx); 2143 } 2144 } 2145 2146 /** 2147 * Check if the completion request flag should be set in the last WQE. 2148 * Both pushed mbufs and WQEs are monitored and the completion request 2149 * flag is set if any of thresholds is reached. 2150 * 2151 * @param txq 2152 * Pointer to TX queue structure. 2153 * @param loc 2154 * Pointer to burst routine local context. 2155 * @param olx 2156 * Configured Tx offloads mask. It is fully defined at 2157 * compile time and may be used for optimization. 2158 */ 2159 static __rte_always_inline void 2160 mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, 2161 struct mlx5_txq_local *restrict loc, 2162 unsigned int olx) 2163 { 2164 uint16_t head = txq->elts_head; 2165 unsigned int part; 2166 2167 part = MLX5_TXOFF_CONFIG(INLINE) ? 2168 0 : loc->pkts_sent - loc->pkts_copy; 2169 head += part; 2170 if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH || 2171 (MLX5_TXOFF_CONFIG(INLINE) && 2172 (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) { 2173 volatile struct mlx5_wqe *last = loc->wqe_last; 2174 2175 txq->elts_comp = head; 2176 if (MLX5_TXOFF_CONFIG(INLINE)) 2177 txq->wqe_comp = txq->wqe_ci; 2178 /* Request unconditional completion on last WQE. */ 2179 last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS << 2180 MLX5_COMP_MODE_OFFSET); 2181 /* Save elts_head in dedicated free on completion queue. */ 2182 #ifdef NDEBUG 2183 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head; 2184 #else 2185 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head | 2186 (last->cseg.opcode >> 8) << 16; 2187 #endif 2188 /* A CQE slot must always be available. */ 2189 assert((txq->cq_pi - txq->cq_ci) <= txq->cqe_s); 2190 } 2191 } 2192 2193 /** 2194 * DPDK callback to check the status of a tx descriptor. 2195 * 2196 * @param tx_queue 2197 * The tx queue. 2198 * @param[in] offset 2199 * The index of the descriptor in the ring. 2200 * 2201 * @return 2202 * The status of the tx descriptor. 
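*
* For example (illustrative values only): if elts_head - elts_tail
* is 100 after completions are processed, offsets 0..99 report
* RTE_ETH_TX_DESC_FULL and offset 100 reports RTE_ETH_TX_DESC_DONE.
* A minimal caller sketch through the generic ethdev API, assuming
* port 0 and queue 0:
*
*   int st = rte_eth_tx_descriptor_status(0, 0, 100);
*
*   if (st == RTE_ETH_TX_DESC_DONE) {
*           // at most 100 descriptors are still in flight
*   }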
2203 */ 2204 int 2205 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) 2206 { 2207 struct mlx5_txq_data *restrict txq = tx_queue; 2208 uint16_t used; 2209 2210 mlx5_tx_handle_completion(txq, 0); 2211 used = txq->elts_head - txq->elts_tail; 2212 if (offset < used) 2213 return RTE_ETH_TX_DESC_FULL; 2214 return RTE_ETH_TX_DESC_DONE; 2215 } 2216 2217 /** 2218 * Build the Control Segment with specified opcode: 2219 * - MLX5_OPCODE_SEND 2220 * - MLX5_OPCODE_ENHANCED_MPSW 2221 * - MLX5_OPCODE_TSO 2222 * 2223 * @param txq 2224 * Pointer to TX queue structure. 2225 * @param loc 2226 * Pointer to burst routine local context. 2227 * @param wqe 2228 * Pointer to WQE to fill with built Control Segment. 2229 * @param ds 2230 * Supposed length of WQE in segments. 2231 * @param opcode 2232 * SQ WQE opcode to put into Control Segment. 2233 * @param olx 2234 * Configured Tx offloads mask. It is fully defined at 2235 * compile time and may be used for optimization. 2236 */ 2237 static __rte_always_inline void 2238 mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, 2239 struct mlx5_txq_local *restrict loc __rte_unused, 2240 struct mlx5_wqe *restrict wqe, 2241 unsigned int ds, 2242 unsigned int opcode, 2243 unsigned int olx __rte_unused) 2244 { 2245 struct mlx5_wqe_cseg *restrict cs = &wqe->cseg; 2246 2247 /* For legacy MPW replace the EMPW by TSO with modifier. */ 2248 if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW) 2249 opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24; 2250 cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode); 2251 cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 2252 cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << 2253 MLX5_COMP_MODE_OFFSET); 2254 cs->misc = RTE_BE32(0); 2255 } 2256 2257 /** 2258 * Build the Ethernet Segment without inlined data. 2259 * Supports Software Parser, Checksums and VLAN 2260 * insertion Tx offload features. 2261 * 2262 * @param txq 2263 * Pointer to TX queue structure. 2264 * @param loc 2265 * Pointer to burst routine local context. 2266 * @param wqe 2267 * Pointer to WQE to fill with built Ethernet Segment. 2268 * @param olx 2269 * Configured Tx offloads mask. It is fully defined at 2270 * compile time and may be used for optimization. 2271 */ 2272 static __rte_always_inline void 2273 mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused, 2274 struct mlx5_txq_local *restrict loc, 2275 struct mlx5_wqe *restrict wqe, 2276 unsigned int olx) 2277 { 2278 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2279 uint32_t csum; 2280 2281 /* 2282 * Calculate and set check sum flags first, dword field 2283 * in segment may be shared with Software Parser flags. 2284 */ 2285 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2286 es->flags = rte_cpu_to_le_32(csum); 2287 /* 2288 * Calculate and set Software Parser offsets and flags. 2289 * These flags a set for custom UDP and IP tunnel packets. 2290 */ 2291 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2292 /* Fill metadata field if needed. */ 2293 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2294 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2295 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2296 /* Engage VLAN tag insertion feature if requested. */ 2297 if (MLX5_TXOFF_CONFIG(VLAN) && 2298 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 2299 /* 2300 * We should get here only if device support 2301 * this feature correctly. 
2302 */ 2303 assert(txq->vlan_en); 2304 es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT | 2305 loc->mbuf->vlan_tci); 2306 } else { 2307 es->inline_hdr = RTE_BE32(0); 2308 } 2309 } 2310 2311 /** 2312 * Build the Ethernet Segment with minimal inlined data 2313 * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is 2314 * used to fill the gap in single WQEBB WQEs. 2315 * Supports Software Parser, Checksums and VLAN 2316 * insertion Tx offload features. 2317 * 2318 * @param txq 2319 * Pointer to TX queue structure. 2320 * @param loc 2321 * Pointer to burst routine local context. 2322 * @param wqe 2323 * Pointer to WQE to fill with built Ethernet Segment. 2324 * @param vlan 2325 * Length of VLAN tag insertion if any. 2326 * @param olx 2327 * Configured Tx offloads mask. It is fully defined at 2328 * compile time and may be used for optimization. 2329 */ 2330 static __rte_always_inline void 2331 mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused, 2332 struct mlx5_txq_local *restrict loc, 2333 struct mlx5_wqe *restrict wqe, 2334 unsigned int vlan, 2335 unsigned int olx) 2336 { 2337 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2338 uint32_t csum; 2339 uint8_t *psrc, *pdst; 2340 2341 /* 2342 * Calculate and set check sum flags first, dword field 2343 * in segment may be shared with Software Parser flags. 2344 */ 2345 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2346 es->flags = rte_cpu_to_le_32(csum); 2347 /* 2348 * Calculate and set Software Parser offsets and flags. 2349 * These flags a set for custom UDP and IP tunnel packets. 2350 */ 2351 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2352 /* Fill metadata field if needed. */ 2353 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2354 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2355 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2356 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2357 (sizeof(uint16_t) + 2358 sizeof(rte_v128u32_t)), 2359 "invalid Ethernet Segment data size"); 2360 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2361 (sizeof(uint16_t) + 2362 sizeof(struct rte_vlan_hdr) + 2363 2 * RTE_ETHER_ADDR_LEN), 2364 "invalid Ethernet Segment data size"); 2365 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2366 es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE); 2367 es->inline_data = *(unaligned_uint16_t *)psrc; 2368 psrc += sizeof(uint16_t); 2369 pdst = (uint8_t *)(es + 1); 2370 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2371 /* Implement VLAN tag insertion as part inline data. */ 2372 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2373 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2374 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2375 /* Insert VLAN ethertype + VLAN tag. */ 2376 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2377 ((RTE_ETHER_TYPE_VLAN << 16) | 2378 loc->mbuf->vlan_tci); 2379 pdst += sizeof(struct rte_vlan_hdr); 2380 /* Copy the rest two bytes from packet data. */ 2381 assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2382 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2383 } else { 2384 /* Fill the gap in the title WQEBB with inline data. */ 2385 rte_mov16(pdst, psrc); 2386 } 2387 } 2388 2389 /** 2390 * Build the Ethernet Segment with entire packet 2391 * data inlining. Checks the boundary of WQEBB and 2392 * ring buffer wrapping, supports Software Parser, 2393 * Checksums and VLAN insertion Tx offload features. 2394 * 2395 * @param txq 2396 * Pointer to TX queue structure. 2397 * @param loc 2398 * Pointer to burst routine local context. 
2399 * @param wqe 2400 * Pointer to WQE to fill with built Ethernet Segment. 2401 * @param vlan 2402 * Length of VLAN tag insertion if any. 2403 * @param inlen 2404 * Length of data to inline (VLAN included, if any). 2405 * @param tso 2406 * TSO flag, set mss field from the packet. 2407 * @param olx 2408 * Configured Tx offloads mask. It is fully defined at 2409 * compile time and may be used for optimization. 2410 * 2411 * @return 2412 * Pointer to the next Data Segment (aligned and wrapped around). 2413 */ 2414 static __rte_always_inline struct mlx5_wqe_dseg * 2415 mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, 2416 struct mlx5_txq_local *restrict loc, 2417 struct mlx5_wqe *restrict wqe, 2418 unsigned int vlan, 2419 unsigned int inlen, 2420 unsigned int tso, 2421 unsigned int olx) 2422 { 2423 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2424 uint32_t csum; 2425 uint8_t *psrc, *pdst; 2426 unsigned int part; 2427 2428 /* 2429 * Calculate and set check sum flags first, dword field 2430 * in segment may be shared with Software Parser flags. 2431 */ 2432 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2433 if (tso) { 2434 csum <<= 24; 2435 csum |= loc->mbuf->tso_segsz; 2436 es->flags = rte_cpu_to_be_32(csum); 2437 } else { 2438 es->flags = rte_cpu_to_le_32(csum); 2439 } 2440 /* 2441 * Calculate and set Software Parser offsets and flags. 2442 * These flags a set for custom UDP and IP tunnel packets. 2443 */ 2444 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2445 /* Fill metadata field if needed. */ 2446 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2447 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2448 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2449 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2450 (sizeof(uint16_t) + 2451 sizeof(rte_v128u32_t)), 2452 "invalid Ethernet Segment data size"); 2453 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2454 (sizeof(uint16_t) + 2455 sizeof(struct rte_vlan_hdr) + 2456 2 * RTE_ETHER_ADDR_LEN), 2457 "invalid Ethernet Segment data size"); 2458 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2459 es->inline_hdr_sz = rte_cpu_to_be_16(inlen); 2460 es->inline_data = *(unaligned_uint16_t *)psrc; 2461 psrc += sizeof(uint16_t); 2462 pdst = (uint8_t *)(es + 1); 2463 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2464 /* Implement VLAN tag insertion as part inline data. */ 2465 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2466 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2467 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2468 /* Insert VLAN ethertype + VLAN tag. */ 2469 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2470 ((RTE_ETHER_TYPE_VLAN << 16) | 2471 loc->mbuf->vlan_tci); 2472 pdst += sizeof(struct rte_vlan_hdr); 2473 /* Copy the rest two bytes from packet data. */ 2474 assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2475 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2476 psrc += sizeof(uint16_t); 2477 } else { 2478 /* Fill the gap in the title WQEBB with inline data. */ 2479 rte_mov16(pdst, psrc); 2480 psrc += sizeof(rte_v128u32_t); 2481 } 2482 pdst = (uint8_t *)(es + 2); 2483 assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2484 assert(pdst < (uint8_t *)txq->wqes_end); 2485 inlen -= MLX5_ESEG_MIN_INLINE_SIZE; 2486 if (!inlen) { 2487 assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 2488 return (struct mlx5_wqe_dseg *)pdst; 2489 } 2490 /* 2491 * The WQEBB space availability is checked by caller. 2492 * Here we should be aware of WQE ring buffer wraparound only. 
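* For example (hypothetical numbers): if 60 bytes remain to be
* inlined and only 16 bytes are left before wqes_end, the loop
* below copies 16 bytes, wraps pdst to the ring start (wqes) and
* copies the remaining 44 bytes on the next iteration.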
*/ 2494 part = (uint8_t *)txq->wqes_end - pdst; 2495 part = RTE_MIN(part, inlen); 2496 do { 2497 rte_memcpy(pdst, psrc, part); 2498 inlen -= part; 2499 if (likely(!inlen)) { 2500 /* 2501 * If return value is not used by the caller 2502 * the code below will be optimized out. 2503 */ 2504 pdst += part; 2505 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2506 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 2507 pdst = (uint8_t *)txq->wqes; 2508 return (struct mlx5_wqe_dseg *)pdst; 2509 } 2510 pdst = (uint8_t *)txq->wqes; 2511 psrc += part; 2512 part = inlen; 2513 } while (true); 2514 } 2515 2516 /** 2517 * Copy data from a chain of mbufs to the specified linear buffer. 2518 * If the data of an mbuf is copied out completely, that mbuf is 2519 * freed. The local context structure is used to keep the byte 2520 * stream state (current mbuf and data offset within it). 2521 * 2522 * @param pdst 2523 * Pointer to the destination linear buffer. 2524 * @param loc 2525 * Pointer to burst routine local context. 2526 * @param len 2527 * Length of data to be copied. 2528 * @param olx 2529 * Configured Tx offloads mask. It is fully defined at 2530 * compile time and may be used for optimization. 2531 */ 2532 static __rte_always_inline void 2533 mlx5_tx_mseg_memcpy(uint8_t *pdst, 2534 struct mlx5_txq_local *restrict loc, 2535 unsigned int len, 2536 unsigned int olx __rte_unused) 2537 { 2538 struct rte_mbuf *mbuf; 2539 unsigned int part, dlen; 2540 uint8_t *psrc; 2541 2542 assert(len); 2543 do { 2544 /* Allow zero length packets, must check first. */ 2545 dlen = rte_pktmbuf_data_len(loc->mbuf); 2546 if (dlen <= loc->mbuf_off) { 2547 /* Exhausted packet, just free. */ 2548 mbuf = loc->mbuf; 2549 loc->mbuf = mbuf->next; 2550 rte_pktmbuf_free_seg(mbuf); 2551 loc->mbuf_off = 0; 2552 assert(loc->mbuf_nseg > 1); 2553 assert(loc->mbuf); 2554 --loc->mbuf_nseg; 2555 continue; 2556 } 2557 dlen -= loc->mbuf_off; 2558 psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 2559 loc->mbuf_off); 2560 part = RTE_MIN(len, dlen); 2561 rte_memcpy(pdst, psrc, part); 2562 loc->mbuf_off += part; 2563 len -= part; 2564 if (!len) { 2565 if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) { 2566 loc->mbuf_off = 0; 2567 /* Exhausted packet, just free. */ 2568 mbuf = loc->mbuf; 2569 loc->mbuf = mbuf->next; 2570 rte_pktmbuf_free_seg(mbuf); 2571 loc->mbuf_off = 0; 2572 assert(loc->mbuf_nseg >= 1); 2573 --loc->mbuf_nseg; 2574 } 2575 return; 2576 } 2577 pdst += part; 2578 } while (true); 2579 } 2580 2581 /** 2582 * Build the Ethernet Segment with inlined data from 2583 * multi-segment packet. Checks the boundary of WQEBB 2584 * and ring buffer wrapping, supports Software Parser, 2585 * Checksums and VLAN insertion Tx offload features. 2586 * 2587 * @param txq 2588 * Pointer to TX queue structure. 2589 * @param loc 2590 * Pointer to burst routine local context. 2591 * @param wqe 2592 * Pointer to WQE to fill with built Ethernet Segment. 2593 * @param vlan 2594 * Length of VLAN tag insertion if any. 2595 * @param inlen 2596 * Length of data to inline (VLAN included, if any). 2597 * @param tso 2598 * TSO flag, set mss field from the packet. 2599 * @param olx 2600 * Configured Tx offloads mask. It is fully defined at 2601 * compile time and may be used for optimization. 2602 * 2603 * @return 2604 * Pointer to the next Data Segment (aligned and 2605 * possibly NOT wrapped around - caller should do 2606 * wrapping check on its own).
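*
* The data gathering itself is done by mlx5_tx_mseg_memcpy() above,
* which keeps a (current mbuf, byte offset) cursor in the local
* context. A simplified sketch of that walk (illustrative only,
* freeing of exhausted mbufs omitted, not the driver code):
*
*   while (len) {
*           unsigned int seg = rte_pktmbuf_data_len(loc->mbuf) -
*                              loc->mbuf_off;
*           unsigned int part = RTE_MIN(len, seg);
*
*           memcpy(pdst, rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
*                                                loc->mbuf_off), part);
*           pdst += part;
*           len -= part;
*           loc->mbuf_off += part;
*           if (loc->mbuf_off == rte_pktmbuf_data_len(loc->mbuf)) {
*                   loc->mbuf_off = 0;        // segment exhausted
*                   loc->mbuf = loc->mbuf->next;
*           }
*   }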
2607 */ 2608 static __rte_always_inline struct mlx5_wqe_dseg * 2609 mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, 2610 struct mlx5_txq_local *restrict loc, 2611 struct mlx5_wqe *restrict wqe, 2612 unsigned int vlan, 2613 unsigned int inlen, 2614 unsigned int tso, 2615 unsigned int olx) 2616 { 2617 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2618 uint32_t csum; 2619 uint8_t *pdst; 2620 unsigned int part; 2621 2622 /* 2623 * Calculate and set check sum flags first, uint32_t field 2624 * in segment may be shared with Software Parser flags. 2625 */ 2626 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2627 if (tso) { 2628 csum <<= 24; 2629 csum |= loc->mbuf->tso_segsz; 2630 es->flags = rte_cpu_to_be_32(csum); 2631 } else { 2632 es->flags = rte_cpu_to_le_32(csum); 2633 } 2634 /* 2635 * Calculate and set Software Parser offsets and flags. 2636 * These flags a set for custom UDP and IP tunnel packets. 2637 */ 2638 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2639 /* Fill metadata field if needed. */ 2640 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2641 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2642 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2643 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2644 (sizeof(uint16_t) + 2645 sizeof(rte_v128u32_t)), 2646 "invalid Ethernet Segment data size"); 2647 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2648 (sizeof(uint16_t) + 2649 sizeof(struct rte_vlan_hdr) + 2650 2 * RTE_ETHER_ADDR_LEN), 2651 "invalid Ethernet Segment data size"); 2652 assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2653 es->inline_hdr_sz = rte_cpu_to_be_16(inlen); 2654 pdst = (uint8_t *)&es->inline_data; 2655 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2656 /* Implement VLAN tag insertion as part inline data. */ 2657 mlx5_tx_mseg_memcpy(pdst, loc, 2 * RTE_ETHER_ADDR_LEN, olx); 2658 pdst += 2 * RTE_ETHER_ADDR_LEN; 2659 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2660 ((RTE_ETHER_TYPE_VLAN << 16) | 2661 loc->mbuf->vlan_tci); 2662 pdst += sizeof(struct rte_vlan_hdr); 2663 inlen -= 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr); 2664 } 2665 assert(pdst < (uint8_t *)txq->wqes_end); 2666 /* 2667 * The WQEBB space availability is checked by caller. 2668 * Here we should be aware of WQE ring buffer wraparound only. 2669 */ 2670 part = (uint8_t *)txq->wqes_end - pdst; 2671 part = RTE_MIN(part, inlen); 2672 assert(part); 2673 do { 2674 mlx5_tx_mseg_memcpy(pdst, loc, part, olx); 2675 inlen -= part; 2676 if (likely(!inlen)) { 2677 pdst += part; 2678 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2679 return (struct mlx5_wqe_dseg *)pdst; 2680 } 2681 pdst = (uint8_t *)txq->wqes; 2682 part = inlen; 2683 } while (true); 2684 } 2685 2686 /** 2687 * Build the Data Segment of pointer type. 2688 * 2689 * @param txq 2690 * Pointer to TX queue structure. 2691 * @param loc 2692 * Pointer to burst routine local context. 2693 * @param dseg 2694 * Pointer to WQE to fill with built Data Segment. 2695 * @param buf 2696 * Data buffer to point. 2697 * @param len 2698 * Data buffer length. 2699 * @param olx 2700 * Configured Tx offloads mask. It is fully defined at 2701 * compile time and may be used for optimization. 
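*
* A pointer-type Data Segment occupies one 16-byte WQE segment
* (MLX5_WSEG_SIZE). Sketch of the layout filled in below, field
* names as used by struct mlx5_wqe_dseg in this driver (shown for
* illustration, the PRM definition is authoritative):
*
*   bytes  0..3   bcount - data length, big-endian
*   bytes  4..7   lkey   - memory region key from mlx5_tx_mb2mr()
*   bytes  8..15  pbuf   - data buffer address, big-endian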
2702 */ 2703 static __rte_always_inline void 2704 mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq, 2705 struct mlx5_txq_local *restrict loc, 2706 struct mlx5_wqe_dseg *restrict dseg, 2707 uint8_t *buf, 2708 unsigned int len, 2709 unsigned int olx __rte_unused) 2710 2711 { 2712 assert(len); 2713 dseg->bcount = rte_cpu_to_be_32(len); 2714 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2715 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2716 } 2717 2718 /** 2719 * Build the Data Segment of pointer type or inline 2720 * if data length is less than buffer in minimal 2721 * Data Segment size. 2722 * 2723 * @param txq 2724 * Pointer to TX queue structure. 2725 * @param loc 2726 * Pointer to burst routine local context. 2727 * @param dseg 2728 * Pointer to WQE to fill with built Data Segment. 2729 * @param buf 2730 * Data buffer to point. 2731 * @param len 2732 * Data buffer length. 2733 * @param olx 2734 * Configured Tx offloads mask. It is fully defined at 2735 * compile time and may be used for optimization. 2736 */ 2737 static __rte_always_inline void 2738 mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, 2739 struct mlx5_txq_local *restrict loc, 2740 struct mlx5_wqe_dseg *restrict dseg, 2741 uint8_t *buf, 2742 unsigned int len, 2743 unsigned int olx __rte_unused) 2744 2745 { 2746 uintptr_t dst, src; 2747 2748 assert(len); 2749 if (len > MLX5_DSEG_MIN_INLINE_SIZE) { 2750 dseg->bcount = rte_cpu_to_be_32(len); 2751 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2752 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2753 2754 return; 2755 } 2756 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2757 /* Unrolled implementation of generic rte_memcpy. */ 2758 dst = (uintptr_t)&dseg->inline_data[0]; 2759 src = (uintptr_t)buf; 2760 if (len & 0x08) { 2761 #ifdef RTE_ARCH_STRICT_ALIGN 2762 assert(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t))); 2763 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2764 dst += sizeof(uint32_t); 2765 src += sizeof(uint32_t); 2766 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2767 dst += sizeof(uint32_t); 2768 src += sizeof(uint32_t); 2769 #else 2770 *(uint64_t *)dst = *(unaligned_uint64_t *)src; 2771 dst += sizeof(uint64_t); 2772 src += sizeof(uint64_t); 2773 #endif 2774 } 2775 if (len & 0x04) { 2776 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2777 dst += sizeof(uint32_t); 2778 src += sizeof(uint32_t); 2779 } 2780 if (len & 0x02) { 2781 *(uint16_t *)dst = *(unaligned_uint16_t *)src; 2782 dst += sizeof(uint16_t); 2783 src += sizeof(uint16_t); 2784 } 2785 if (len & 0x01) 2786 *(uint8_t *)dst = *(uint8_t *)src; 2787 } 2788 2789 /** 2790 * Build the Data Segment of inlined data from single 2791 * segment packet, no VLAN insertion. 2792 * 2793 * @param txq 2794 * Pointer to TX queue structure. 2795 * @param loc 2796 * Pointer to burst routine local context. 2797 * @param dseg 2798 * Pointer to WQE to fill with built Data Segment. 2799 * @param buf 2800 * Data buffer to point. 2801 * @param len 2802 * Data buffer length. 2803 * @param olx 2804 * Configured Tx offloads mask. It is fully defined at 2805 * compile time and may be used for optimization. 2806 * 2807 * @return 2808 * Pointer to the next Data Segment after inlined data. 2809 * Ring buffer wraparound check is needed. We do not 2810 * do it here because it may not be needed for the 2811 * last packet in the eMPW session. 
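*
* For example (hypothetical length): inlining a 60-byte packet
* consumes 4 bytes of bcount (with the MLX5_ETH_WQE_DATA_INLINE
* flag set) plus 60 bytes of data, and the returned pointer is
* aligned up to the next MLX5_WSEG_SIZE boundary, giving 64 bytes
* or four 16-byte WQE segments in total.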
2812 */ 2813 static __rte_always_inline struct mlx5_wqe_dseg * 2814 mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, 2815 struct mlx5_txq_local *restrict loc __rte_unused, 2816 struct mlx5_wqe_dseg *restrict dseg, 2817 uint8_t *buf, 2818 unsigned int len, 2819 unsigned int olx __rte_unused) 2820 { 2821 unsigned int part; 2822 uint8_t *pdst; 2823 2824 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2825 pdst = &dseg->inline_data[0]; 2826 /* 2827 * The WQEBB space availability is checked by caller. 2828 * Here we should be aware of WQE ring buffer wraparound only. 2829 */ 2830 part = (uint8_t *)txq->wqes_end - pdst; 2831 part = RTE_MIN(part, len); 2832 do { 2833 rte_memcpy(pdst, buf, part); 2834 len -= part; 2835 if (likely(!len)) { 2836 pdst += part; 2837 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2838 /* Note: no final wraparound check here. */ 2839 return (struct mlx5_wqe_dseg *)pdst; 2840 } 2841 pdst = (uint8_t *)txq->wqes; 2842 buf += part; 2843 part = len; 2844 } while (true); 2845 } 2846 2847 /** 2848 * Build the Data Segment of inlined data from single 2849 * segment packet with VLAN insertion. 2850 * 2851 * @param txq 2852 * Pointer to TX queue structure. 2853 * @param loc 2854 * Pointer to burst routine local context. 2855 * @param dseg 2856 * Pointer to the dseg fill with built Data Segment. 2857 * @param buf 2858 * Data buffer to point. 2859 * @param len 2860 * Data buffer length. 2861 * @param olx 2862 * Configured Tx offloads mask. It is fully defined at 2863 * compile time and may be used for optimization. 2864 * 2865 * @return 2866 * Pointer to the next Data Segment after inlined data. 2867 * Ring buffer wraparound check is needed. 2868 */ 2869 static __rte_always_inline struct mlx5_wqe_dseg * 2870 mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, 2871 struct mlx5_txq_local *restrict loc __rte_unused, 2872 struct mlx5_wqe_dseg *restrict dseg, 2873 uint8_t *buf, 2874 unsigned int len, 2875 unsigned int olx __rte_unused) 2876 2877 { 2878 unsigned int part; 2879 uint8_t *pdst; 2880 2881 assert(len > MLX5_ESEG_MIN_INLINE_SIZE); 2882 static_assert(MLX5_DSEG_MIN_INLINE_SIZE == 2883 (2 * RTE_ETHER_ADDR_LEN), 2884 "invalid Data Segment data size"); 2885 dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) | 2886 MLX5_ETH_WQE_DATA_INLINE); 2887 pdst = &dseg->inline_data[0]; 2888 memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); 2889 buf += MLX5_DSEG_MIN_INLINE_SIZE; 2890 pdst += MLX5_DSEG_MIN_INLINE_SIZE; 2891 len -= MLX5_DSEG_MIN_INLINE_SIZE; 2892 /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */ 2893 assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 2894 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 2895 pdst = (uint8_t *)txq->wqes; 2896 *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) | 2897 loc->mbuf->vlan_tci); 2898 pdst += sizeof(struct rte_vlan_hdr); 2899 /* 2900 * The WQEBB space availability is checked by caller. 2901 * Here we should be aware of WQE ring buffer wraparound only. 2902 */ 2903 part = (uint8_t *)txq->wqes_end - pdst; 2904 part = RTE_MIN(part, len); 2905 do { 2906 rte_memcpy(pdst, buf, part); 2907 len -= part; 2908 if (likely(!len)) { 2909 pdst += part; 2910 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2911 /* Note: no final wraparound check here. 
*/ 2912 return (struct mlx5_wqe_dseg *)pdst; 2913 } 2914 pdst = (uint8_t *)txq->wqes; 2915 buf += part; 2916 part = len; 2917 } while (true); 2918 } 2919 2920 /** 2921 * Build the Ethernet Segment with optionally inlined data with 2922 * VLAN insertion and following Data Segments (if any) from 2923 * multi-segment packet. Used by ordinary send and TSO. 2924 * 2925 * @param txq 2926 * Pointer to TX queue structure. 2927 * @param loc 2928 * Pointer to burst routine local context. 2929 * @param wqe 2930 * Pointer to WQE to fill with built Ethernet/Data Segments. 2931 * @param vlan 2932 * Length of VLAN header to insert, 0 means no VLAN insertion. 2933 * @param inlen 2934 * Data length to inline. For TSO this parameter specifies 2935 * exact value, for ordinary send routine can be aligned by 2936 * caller to provide better WQE space saving and data buffer 2937 * start address alignment. This length includes VLAN header 2938 * being inserted. 2939 * @param tso 2940 * Zero means ordinary send, inlined data can be extended, 2941 * otherwise this is TSO, inlined data length is fixed. 2942 * @param olx 2943 * Configured Tx offloads mask. It is fully defined at 2944 * compile time and may be used for optimization. 2945 * 2946 * @return 2947 * Actual size of built WQE in segments. 2948 */ 2949 static __rte_always_inline unsigned int 2950 mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, 2951 struct mlx5_txq_local *restrict loc, 2952 struct mlx5_wqe *restrict wqe, 2953 unsigned int vlan, 2954 unsigned int inlen, 2955 unsigned int tso, 2956 unsigned int olx __rte_unused) 2957 { 2958 struct mlx5_wqe_dseg *restrict dseg; 2959 unsigned int ds; 2960 2961 assert((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen); 2962 loc->mbuf_nseg = NB_SEGS(loc->mbuf); 2963 loc->mbuf_off = 0; 2964 2965 dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx); 2966 if (!loc->mbuf_nseg) 2967 goto dseg_done; 2968 /* 2969 * There are still some mbuf remaining, not inlined. 2970 * The first mbuf may be partially inlined and we 2971 * must process the possible non-zero data offset. 2972 */ 2973 if (loc->mbuf_off) { 2974 unsigned int dlen; 2975 uint8_t *dptr; 2976 2977 /* 2978 * Exhausted packets must be dropped before. 2979 * Non-zero offset means there are some data 2980 * remained in the packet. 2981 */ 2982 assert(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf)); 2983 assert(rte_pktmbuf_data_len(loc->mbuf)); 2984 dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 2985 loc->mbuf_off); 2986 dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off; 2987 /* 2988 * Build the pointer/minimal data Data Segment. 2989 * Do ring buffer wrapping check in advance. 2990 */ 2991 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 2992 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 2993 mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx); 2994 /* Store the mbuf to be freed on completion. */ 2995 assert(loc->elts_free); 2996 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 2997 --loc->elts_free; 2998 ++dseg; 2999 if (--loc->mbuf_nseg == 0) 3000 goto dseg_done; 3001 loc->mbuf = loc->mbuf->next; 3002 loc->mbuf_off = 0; 3003 } 3004 do { 3005 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3006 struct rte_mbuf *mbuf; 3007 3008 /* Zero length segment found, just skip. 
*/ 3009 mbuf = loc->mbuf; 3010 loc->mbuf = loc->mbuf->next; 3011 rte_pktmbuf_free_seg(mbuf); 3012 if (--loc->mbuf_nseg == 0) 3013 break; 3014 } else { 3015 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3016 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3017 mlx5_tx_dseg_iptr 3018 (txq, loc, dseg, 3019 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3020 rte_pktmbuf_data_len(loc->mbuf), olx); 3021 assert(loc->elts_free); 3022 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3023 --loc->elts_free; 3024 ++dseg; 3025 if (--loc->mbuf_nseg == 0) 3026 break; 3027 loc->mbuf = loc->mbuf->next; 3028 } 3029 } while (true); 3030 3031 dseg_done: 3032 /* Calculate actual segments used from the dseg pointer. */ 3033 if ((uintptr_t)wqe < (uintptr_t)dseg) 3034 ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE; 3035 else 3036 ds = (((uintptr_t)dseg - (uintptr_t)wqe) + 3037 txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE; 3038 return ds; 3039 } 3040 3041 /** 3042 * Tx one packet function for multi-segment TSO. Supports all 3043 * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs, 3044 * sends one packet per WQE. 3045 * 3046 * This routine is responsible for storing processed mbuf 3047 * into elts ring buffer and update elts_head. 3048 * 3049 * @param txq 3050 * Pointer to TX queue structure. 3051 * @param loc 3052 * Pointer to burst routine local context. 3053 * @param olx 3054 * Configured Tx offloads mask. It is fully defined at 3055 * compile time and may be used for optimization. 3056 * 3057 * @return 3058 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3059 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3060 * Local context variables partially updated. 3061 */ 3062 static __rte_always_inline enum mlx5_txcmp_code 3063 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, 3064 struct mlx5_txq_local *restrict loc, 3065 unsigned int olx) 3066 { 3067 struct mlx5_wqe *restrict wqe; 3068 unsigned int ds, dlen, inlen, ntcp, vlan = 0; 3069 3070 /* 3071 * Calculate data length to be inlined to estimate 3072 * the required space in WQE ring buffer. 3073 */ 3074 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3075 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3076 vlan = sizeof(struct rte_vlan_hdr); 3077 inlen = loc->mbuf->l2_len + vlan + 3078 loc->mbuf->l3_len + loc->mbuf->l4_len; 3079 if (unlikely((!inlen || !loc->mbuf->tso_segsz))) 3080 return MLX5_TXCMP_CODE_ERROR; 3081 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3082 inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len; 3083 /* Packet must contain all TSO headers. */ 3084 if (unlikely(inlen > MLX5_MAX_TSO_HEADER || 3085 inlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3086 inlen > (dlen + vlan))) 3087 return MLX5_TXCMP_CODE_ERROR; 3088 assert(inlen >= txq->inlen_mode); 3089 /* 3090 * Check whether there are enough free WQEBBs: 3091 * - Control Segment 3092 * - Ethernet Segment 3093 * - First Segment of inlined Ethernet data 3094 * - ... data continued ... 3095 * - Data Segments of pointer/min inline type 3096 */ 3097 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3098 MLX5_ESEG_MIN_INLINE_SIZE + 3099 MLX5_WSEG_SIZE + 3100 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3101 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3102 return MLX5_TXCMP_CODE_EXIT; 3103 /* Check for maximal WQE size. */ 3104 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3105 return MLX5_TXCMP_CODE_ERROR; 3106 #ifdef MLX5_PMD_SOFT_COUNTERS 3107 /* Update sent data bytes/packets counters. 
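* For example (hypothetical packet): dlen = 3000 bytes, inlined
* headers inlen - vlan = 54 bytes and tso_segsz = 1460 give
* ntcp = (3000 - 54 + 1459) / 1460 = 3 TCP segments on the wire;
* one of them is accounted at the end of mlx5_tx_burst from
* loc->pkts_sent, hence the decrement below.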
*/ 3108 ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) / 3109 loc->mbuf->tso_segsz; 3110 /* 3111 * One will be added for mbuf itself 3112 * at the end of the mlx5_tx_burst from 3113 * loc->pkts_sent field. 3114 */ 3115 --ntcp; 3116 txq->stats.opackets += ntcp; 3117 txq->stats.obytes += dlen + vlan + ntcp * inlen; 3118 #endif 3119 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3120 loc->wqe_last = wqe; 3121 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx); 3122 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx); 3123 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3124 txq->wqe_ci += (ds + 3) / 4; 3125 loc->wqe_free -= (ds + 3) / 4; 3126 return MLX5_TXCMP_CODE_MULTI; 3127 } 3128 3129 /** 3130 * Tx one packet function for multi-segment SEND. Supports all 3131 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3132 * sends one packet per WQE, without any data inlining in 3133 * Ethernet Segment. 3134 * 3135 * This routine is responsible for storing processed mbuf 3136 * into elts ring buffer and update elts_head. 3137 * 3138 * @param txq 3139 * Pointer to TX queue structure. 3140 * @param loc 3141 * Pointer to burst routine local context. 3142 * @param olx 3143 * Configured Tx offloads mask. It is fully defined at 3144 * compile time and may be used for optimization. 3145 * 3146 * @return 3147 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3148 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3149 * Local context variables partially updated. 3150 */ 3151 static __rte_always_inline enum mlx5_txcmp_code 3152 mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, 3153 struct mlx5_txq_local *restrict loc, 3154 unsigned int olx) 3155 { 3156 struct mlx5_wqe_dseg *restrict dseg; 3157 struct mlx5_wqe *restrict wqe; 3158 unsigned int ds, nseg; 3159 3160 assert(NB_SEGS(loc->mbuf) > 1); 3161 /* 3162 * No inline at all, it means the CPU cycles saving 3163 * is prioritized at configuration, we should not 3164 * copy any packet data to WQE. 3165 */ 3166 nseg = NB_SEGS(loc->mbuf); 3167 ds = 2 + nseg; 3168 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3169 return MLX5_TXCMP_CODE_EXIT; 3170 /* Check for maximal WQE size. */ 3171 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3172 return MLX5_TXCMP_CODE_ERROR; 3173 /* 3174 * Some Tx offloads may cause an error if 3175 * packet is not long enough, check against 3176 * assumed minimal length. 3177 */ 3178 if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE) 3179 return MLX5_TXCMP_CODE_ERROR; 3180 #ifdef MLX5_PMD_SOFT_COUNTERS 3181 /* Update sent data bytes counter. */ 3182 txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf); 3183 if (MLX5_TXOFF_CONFIG(VLAN) && 3184 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3185 txq->stats.obytes += sizeof(struct rte_vlan_hdr); 3186 #endif 3187 /* 3188 * SEND WQE, one WQEBB: 3189 * - Control Segment, SEND opcode 3190 * - Ethernet Segment, optional VLAN, no inline 3191 * - Data Segments, pointer only type 3192 */ 3193 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3194 loc->wqe_last = wqe; 3195 mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx); 3196 mlx5_tx_eseg_none(txq, loc, wqe, olx); 3197 dseg = &wqe->dseg[0]; 3198 do { 3199 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3200 struct rte_mbuf *mbuf; 3201 3202 /* 3203 * Zero length segment found, have to 3204 * correct total size of WQE in segments. 
3205 * It is supposed to be rare occasion, so 3206 * in normal case (no zero length segments) 3207 * we avoid extra writing to the Control 3208 * Segment. 3209 */ 3210 --ds; 3211 wqe->cseg.sq_ds -= RTE_BE32(1); 3212 mbuf = loc->mbuf; 3213 loc->mbuf = mbuf->next; 3214 rte_pktmbuf_free_seg(mbuf); 3215 if (--nseg == 0) 3216 break; 3217 } else { 3218 mlx5_tx_dseg_ptr 3219 (txq, loc, dseg, 3220 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3221 rte_pktmbuf_data_len(loc->mbuf), olx); 3222 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3223 --loc->elts_free; 3224 if (--nseg == 0) 3225 break; 3226 ++dseg; 3227 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3228 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3229 loc->mbuf = loc->mbuf->next; 3230 } 3231 } while (true); 3232 txq->wqe_ci += (ds + 3) / 4; 3233 loc->wqe_free -= (ds + 3) / 4; 3234 return MLX5_TXCMP_CODE_MULTI; 3235 } 3236 3237 /** 3238 * Tx one packet function for multi-segment SEND. Supports all 3239 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3240 * sends one packet per WQE, with data inlining in 3241 * Ethernet Segment and minimal Data Segments. 3242 * 3243 * This routine is responsible for storing processed mbuf 3244 * into elts ring buffer and update elts_head. 3245 * 3246 * @param txq 3247 * Pointer to TX queue structure. 3248 * @param loc 3249 * Pointer to burst routine local context. 3250 * @param olx 3251 * Configured Tx offloads mask. It is fully defined at 3252 * compile time and may be used for optimization. 3253 * 3254 * @return 3255 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3256 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3257 * Local context variables partially updated. 3258 */ 3259 static __rte_always_inline enum mlx5_txcmp_code 3260 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq, 3261 struct mlx5_txq_local *restrict loc, 3262 unsigned int olx) 3263 { 3264 struct mlx5_wqe *restrict wqe; 3265 unsigned int ds, inlen, dlen, vlan = 0; 3266 3267 assert(MLX5_TXOFF_CONFIG(INLINE)); 3268 assert(NB_SEGS(loc->mbuf) > 1); 3269 /* 3270 * First calculate data length to be inlined 3271 * to estimate the required space for WQE. 3272 */ 3273 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3274 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3275 vlan = sizeof(struct rte_vlan_hdr); 3276 inlen = dlen + vlan; 3277 /* Check against minimal length. */ 3278 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 3279 return MLX5_TXCMP_CODE_ERROR; 3280 assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); 3281 if (inlen > txq->inlen_send) { 3282 struct rte_mbuf *mbuf; 3283 unsigned int nxlen; 3284 uintptr_t start; 3285 3286 /* 3287 * Packet length exceeds the allowed inline 3288 * data length, check whether the minimal 3289 * inlining is required. 3290 */ 3291 if (txq->inlen_mode) { 3292 assert(txq->inlen_mode >= MLX5_ESEG_MIN_INLINE_SIZE); 3293 assert(txq->inlen_mode <= txq->inlen_send); 3294 inlen = txq->inlen_mode; 3295 } else { 3296 if (!vlan || txq->vlan_en) { 3297 /* 3298 * VLAN insertion will be done inside by HW. 3299 * It is not utmost effective - VLAN flag is 3300 * checked twice, but we should proceed the 3301 * inlining length correctly and take into 3302 * account the VLAN header being inserted. 3303 */ 3304 return mlx5_tx_packet_multi_send 3305 (txq, loc, olx); 3306 } 3307 inlen = MLX5_ESEG_MIN_INLINE_SIZE; 3308 } 3309 /* 3310 * Now we know the minimal amount of data is requested 3311 * to inline. 
Check whether we should inline the buffers 3312 * from the chain beginning to eliminate some mbufs. 3313 */ 3314 mbuf = loc->mbuf; 3315 nxlen = rte_pktmbuf_data_len(mbuf); 3316 if (unlikely(nxlen <= txq->inlen_send)) { 3317 /* We can inline first mbuf at least. */ 3318 if (nxlen < inlen) { 3319 unsigned int smlen; 3320 3321 /* Scan mbufs till inlen filled. */ 3322 do { 3323 smlen = nxlen; 3324 mbuf = NEXT(mbuf); 3325 assert(mbuf); 3326 nxlen = rte_pktmbuf_data_len(mbuf); 3327 nxlen += smlen; 3328 } while (unlikely(nxlen < inlen)); 3329 if (unlikely(nxlen > txq->inlen_send)) { 3330 /* We cannot inline entire mbuf. */ 3331 smlen = inlen - smlen; 3332 start = rte_pktmbuf_mtod_offset 3333 (mbuf, uintptr_t, smlen); 3334 goto do_align; 3335 } 3336 } 3337 do { 3338 inlen = nxlen; 3339 mbuf = NEXT(mbuf); 3340 /* There should be not end of packet. */ 3341 assert(mbuf); 3342 nxlen = inlen + rte_pktmbuf_data_len(mbuf); 3343 } while (unlikely(nxlen < txq->inlen_send)); 3344 } 3345 start = rte_pktmbuf_mtod(mbuf, uintptr_t); 3346 /* 3347 * Check whether we can do inline to align start 3348 * address of data buffer to cacheline. 3349 */ 3350 do_align: 3351 start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1); 3352 if (unlikely(start)) { 3353 start += inlen; 3354 if (start <= txq->inlen_send) 3355 inlen = start; 3356 } 3357 } 3358 /* 3359 * Check whether there are enough free WQEBBs: 3360 * - Control Segment 3361 * - Ethernet Segment 3362 * - First Segment of inlined Ethernet data 3363 * - ... data continued ... 3364 * - Data Segments of pointer/min inline type 3365 * 3366 * Estimate the number of Data Segments conservatively, 3367 * supposing no any mbufs is being freed during inlining. 3368 */ 3369 assert(inlen <= txq->inlen_send); 3370 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3371 MLX5_ESEG_MIN_INLINE_SIZE + 3372 MLX5_WSEG_SIZE + 3373 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3374 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3375 return MLX5_TXCMP_CODE_EXIT; 3376 /* Check for maximal WQE size. */ 3377 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3378 return MLX5_TXCMP_CODE_ERROR; 3379 #ifdef MLX5_PMD_SOFT_COUNTERS 3380 /* Update sent data bytes/packets counters. */ 3381 txq->stats.obytes += dlen + vlan; 3382 #endif 3383 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3384 loc->wqe_last = wqe; 3385 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx); 3386 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx); 3387 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3388 txq->wqe_ci += (ds + 3) / 4; 3389 loc->wqe_free -= (ds + 3) / 4; 3390 return MLX5_TXCMP_CODE_MULTI; 3391 } 3392 3393 /** 3394 * Tx burst function for multi-segment packets. Supports all 3395 * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs, 3396 * sends one packet per WQE. Function stops sending if it 3397 * encounters the single-segment packet. 3398 * 3399 * This routine is responsible for storing processed mbuf 3400 * into elts ring buffer and update elts_head. 3401 * 3402 * @param txq 3403 * Pointer to TX queue structure. 3404 * @param[in] pkts 3405 * Packets to transmit. 3406 * @param pkts_n 3407 * Number of packets in array. 3408 * @param loc 3409 * Pointer to burst routine local context. 3410 * @param olx 3411 * Configured Tx offloads mask. It is fully defined at 3412 * compile time and may be used for optimization. 3413 * 3414 * @return 3415 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3416 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 
3417 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3418 * MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered. 3419 * Local context variables updated. 3420 */ 3421 static __rte_always_inline enum mlx5_txcmp_code 3422 mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq, 3423 struct rte_mbuf **restrict pkts, 3424 unsigned int pkts_n, 3425 struct mlx5_txq_local *restrict loc, 3426 unsigned int olx) 3427 { 3428 assert(loc->elts_free && loc->wqe_free); 3429 assert(pkts_n > loc->pkts_sent); 3430 pkts += loc->pkts_sent + 1; 3431 pkts_n -= loc->pkts_sent; 3432 for (;;) { 3433 enum mlx5_txcmp_code ret; 3434 3435 assert(NB_SEGS(loc->mbuf) > 1); 3436 /* 3437 * Estimate the number of free elts quickly but 3438 * conservatively. Some segment may be fully inlined 3439 * and freed, ignore this here - precise estimation 3440 * is costly. 3441 */ 3442 if (loc->elts_free < NB_SEGS(loc->mbuf)) 3443 return MLX5_TXCMP_CODE_EXIT; 3444 if (MLX5_TXOFF_CONFIG(TSO) && 3445 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) { 3446 /* Proceed with multi-segment TSO. */ 3447 ret = mlx5_tx_packet_multi_tso(txq, loc, olx); 3448 } else if (MLX5_TXOFF_CONFIG(INLINE)) { 3449 /* Proceed with multi-segment SEND with inlining. */ 3450 ret = mlx5_tx_packet_multi_inline(txq, loc, olx); 3451 } else { 3452 /* Proceed with multi-segment SEND w/o inlining. */ 3453 ret = mlx5_tx_packet_multi_send(txq, loc, olx); 3454 } 3455 if (ret == MLX5_TXCMP_CODE_EXIT) 3456 return MLX5_TXCMP_CODE_EXIT; 3457 if (ret == MLX5_TXCMP_CODE_ERROR) 3458 return MLX5_TXCMP_CODE_ERROR; 3459 /* WQE is built, go to the next packet. */ 3460 ++loc->pkts_sent; 3461 --pkts_n; 3462 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3463 return MLX5_TXCMP_CODE_EXIT; 3464 loc->mbuf = *pkts++; 3465 if (pkts_n > 1) 3466 rte_prefetch0(*pkts); 3467 if (likely(NB_SEGS(loc->mbuf) > 1)) 3468 continue; 3469 /* Here ends the series of multi-segment packets. */ 3470 if (MLX5_TXOFF_CONFIG(TSO) && 3471 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3472 return MLX5_TXCMP_CODE_TSO; 3473 return MLX5_TXCMP_CODE_SINGLE; 3474 } 3475 assert(false); 3476 } 3477 3478 /** 3479 * Tx burst function for single-segment packets with TSO. 3480 * Supports all types of Tx offloads, except multi-packets. 3481 * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE. 3482 * Function stops sending if it encounters the multi-segment 3483 * packet or packet without TSO requested. 3484 * 3485 * The routine is responsible for storing processed mbuf 3486 * into elts ring buffer and update elts_head if inline 3487 * offloads is requested due to possible early freeing 3488 * of the inlined mbufs (can not store pkts array in elts 3489 * as a batch). 3490 * 3491 * @param txq 3492 * Pointer to TX queue structure. 3493 * @param[in] pkts 3494 * Packets to transmit. 3495 * @param pkts_n 3496 * Number of packets in array. 3497 * @param loc 3498 * Pointer to burst routine local context. 3499 * @param olx 3500 * Configured Tx offloads mask. It is fully defined at 3501 * compile time and may be used for optimization. 3502 * 3503 * @return 3504 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3505 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3506 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3507 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3508 * Local context variables updated. 
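*
* To illustrate the WQE sizing done inside the loop below
* (hypothetical packet, assuming MLX5_ESEG_MIN_INLINE_SIZE is 18
* and MLX5_WSEG_SIZE is 16 bytes): inlining 54 bytes of TSO
* headers gives
*
*   ds     = 4 + (54 - 18 + 15) / 16 = 7 WQE segments,
*   WQEBBs = (7 + 3) / 4             = 2 ring entries,
*
* so each such packet consumes two WQEBBs from loc->wqe_free.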
3509 */ 3510 static __rte_always_inline enum mlx5_txcmp_code 3511 mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq, 3512 struct rte_mbuf **restrict pkts, 3513 unsigned int pkts_n, 3514 struct mlx5_txq_local *restrict loc, 3515 unsigned int olx) 3516 { 3517 assert(loc->elts_free && loc->wqe_free); 3518 assert(pkts_n > loc->pkts_sent); 3519 pkts += loc->pkts_sent + 1; 3520 pkts_n -= loc->pkts_sent; 3521 for (;;) { 3522 struct mlx5_wqe_dseg *restrict dseg; 3523 struct mlx5_wqe *restrict wqe; 3524 unsigned int ds, dlen, hlen, ntcp, vlan = 0; 3525 uint8_t *dptr; 3526 3527 assert(NB_SEGS(loc->mbuf) == 1); 3528 dlen = rte_pktmbuf_data_len(loc->mbuf); 3529 if (MLX5_TXOFF_CONFIG(VLAN) && 3530 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 3531 vlan = sizeof(struct rte_vlan_hdr); 3532 } 3533 /* 3534 * First calculate the WQE size to check 3535 * whether we have enough space in ring buffer. 3536 */ 3537 hlen = loc->mbuf->l2_len + vlan + 3538 loc->mbuf->l3_len + loc->mbuf->l4_len; 3539 if (unlikely((!hlen || !loc->mbuf->tso_segsz))) 3540 return MLX5_TXCMP_CODE_ERROR; 3541 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3542 hlen += loc->mbuf->outer_l2_len + 3543 loc->mbuf->outer_l3_len; 3544 /* Segment must contain all TSO headers. */ 3545 if (unlikely(hlen > MLX5_MAX_TSO_HEADER || 3546 hlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3547 hlen > (dlen + vlan))) 3548 return MLX5_TXCMP_CODE_ERROR; 3549 /* 3550 * Check whether there are enough free WQEBBs: 3551 * - Control Segment 3552 * - Ethernet Segment 3553 * - First Segment of inlined Ethernet data 3554 * - ... data continued ... 3555 * - Finishing Data Segment of pointer type 3556 */ 3557 ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE + 3558 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3559 if (loc->wqe_free < ((ds + 3) / 4)) 3560 return MLX5_TXCMP_CODE_EXIT; 3561 #ifdef MLX5_PMD_SOFT_COUNTERS 3562 /* Update sent data bytes/packets counters. */ 3563 ntcp = (dlen + vlan - hlen + 3564 loc->mbuf->tso_segsz - 1) / 3565 loc->mbuf->tso_segsz; 3566 /* 3567 * One will be added for mbuf itself at the end 3568 * of the mlx5_tx_burst from loc->pkts_sent field. 3569 */ 3570 --ntcp; 3571 txq->stats.opackets += ntcp; 3572 txq->stats.obytes += dlen + vlan + ntcp * hlen; 3573 #endif 3574 /* 3575 * Build the TSO WQE: 3576 * - Control Segment 3577 * - Ethernet Segment with hlen bytes inlined 3578 * - Data Segment of pointer type 3579 */ 3580 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3581 loc->wqe_last = wqe; 3582 mlx5_tx_cseg_init(txq, loc, wqe, ds, 3583 MLX5_OPCODE_TSO, olx); 3584 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx); 3585 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan; 3586 dlen -= hlen - vlan; 3587 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 3588 /* 3589 * WQE is built, update the loop parameters 3590 * and go to the next packet. 3591 */ 3592 txq->wqe_ci += (ds + 3) / 4; 3593 loc->wqe_free -= (ds + 3) / 4; 3594 if (MLX5_TXOFF_CONFIG(INLINE)) 3595 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3596 --loc->elts_free; 3597 ++loc->pkts_sent; 3598 --pkts_n; 3599 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3600 return MLX5_TXCMP_CODE_EXIT; 3601 loc->mbuf = *pkts++; 3602 if (pkts_n > 1) 3603 rte_prefetch0(*pkts); 3604 if (MLX5_TXOFF_CONFIG(MULTI) && 3605 unlikely(NB_SEGS(loc->mbuf) > 1)) 3606 return MLX5_TXCMP_CODE_MULTI; 3607 if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) 3608 return MLX5_TXCMP_CODE_SINGLE; 3609 /* Continue with the next TSO packet. 
*/ 3610 } 3611 assert(false); 3612 } 3613 3614 /** 3615 * Analyze the packet and select the best method to send. 3616 * 3617 * @param txq 3618 * Pointer to TX queue structure. 3619 * @param loc 3620 * Pointer to burst routine local context. 3621 * @param olx 3622 * Configured Tx offloads mask. It is fully defined at 3623 * compile time and may be used for optimization. 3624 * @param newp 3625 * The predefined flag whether do complete check for 3626 * multi-segment packets and TSO. 3627 * 3628 * @return 3629 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3630 * MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO. 3631 * MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND. 3632 * MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW. 3633 */ 3634 static __rte_always_inline enum mlx5_txcmp_code 3635 mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq, 3636 struct mlx5_txq_local *restrict loc, 3637 unsigned int olx, 3638 bool newp) 3639 { 3640 /* Check for multi-segment packet. */ 3641 if (newp && 3642 MLX5_TXOFF_CONFIG(MULTI) && 3643 unlikely(NB_SEGS(loc->mbuf) > 1)) 3644 return MLX5_TXCMP_CODE_MULTI; 3645 /* Check for TSO packet. */ 3646 if (newp && 3647 MLX5_TXOFF_CONFIG(TSO) && 3648 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3649 return MLX5_TXCMP_CODE_TSO; 3650 /* Check if eMPW is enabled at all. */ 3651 if (!MLX5_TXOFF_CONFIG(EMPW)) 3652 return MLX5_TXCMP_CODE_SINGLE; 3653 /* Check if eMPW can be engaged. */ 3654 if (MLX5_TXOFF_CONFIG(VLAN) && 3655 unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) && 3656 (!MLX5_TXOFF_CONFIG(INLINE) || 3657 unlikely((rte_pktmbuf_data_len(loc->mbuf) + 3658 sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) { 3659 /* 3660 * eMPW does not support VLAN insertion offload, 3661 * we have to inline the entire packet but 3662 * packet is too long for inlining. 3663 */ 3664 return MLX5_TXCMP_CODE_SINGLE; 3665 } 3666 return MLX5_TXCMP_CODE_EMPW; 3667 } 3668 3669 /** 3670 * Check the next packet attributes to match with the eMPW batch ones. 3671 * In addition, for legacy MPW the packet length is checked either. 3672 * 3673 * @param txq 3674 * Pointer to TX queue structure. 3675 * @param es 3676 * Pointer to Ethernet Segment of eMPW batch. 3677 * @param loc 3678 * Pointer to burst routine local context. 3679 * @param dlen 3680 * Length of previous packet in MPW descriptor. 3681 * @param olx 3682 * Configured Tx offloads mask. It is fully defined at 3683 * compile time and may be used for optimization. 3684 * 3685 * @return 3686 * true - packet match with eMPW batch attributes. 3687 * false - no match, eMPW should be restarted. 3688 */ 3689 static __rte_always_inline bool 3690 mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused, 3691 struct mlx5_wqe_eseg *restrict es, 3692 struct mlx5_txq_local *restrict loc, 3693 uint32_t dlen, 3694 unsigned int olx) 3695 { 3696 uint8_t swp_flags = 0; 3697 3698 /* Compare the checksum flags, if any. */ 3699 if (MLX5_TXOFF_CONFIG(CSUM) && 3700 txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags) 3701 return false; 3702 /* Compare the Software Parser offsets and flags. */ 3703 if (MLX5_TXOFF_CONFIG(SWP) && 3704 (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) || 3705 es->swp_flags != swp_flags)) 3706 return false; 3707 /* Fill metadata field if needed. */ 3708 if (MLX5_TXOFF_CONFIG(METADATA) && 3709 es->metadata != (loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 3710 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0)) 3711 return false; 3712 /* Legacy MPW can send packets with the same lengt only. 
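* (All packets of a legacy MPW session are described by a single
* length value in the descriptor, so a packet of a different size
* must close the current session and start a new one; the check
* below enforces this.)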
*/
3713 if (MLX5_TXOFF_CONFIG(MPW) &&
3714 dlen != rte_pktmbuf_data_len(loc->mbuf))
3715 return false;
3716 /* There must be no VLAN packets in eMPW loop. */
3717 if (MLX5_TXOFF_CONFIG(VLAN))
3718 assert(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
3719 return true;
3720 }
3721
3722 /*
3723 * Update send loop variables and WQE for eMPW loop
3724 * without data inlining. Number of Data Segments is
3725 * equal to the number of sent packets.
3726 *
3727 * @param txq
3728 * Pointer to TX queue structure.
3729 * @param loc
3730 * Pointer to burst routine local context.
3731 * @param ds
3732 * Number of packets/Data Segments.
3733 * @param slen
3734 * Accumulated statistics, bytes sent.
3735 * @param olx
3736 * Configured Tx offloads mask. It is fully defined at
3737 * compile time and may be used for optimization.
3738 *
3742 */
3743 static __rte_always_inline void
3744 mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
3745 struct mlx5_txq_local *restrict loc,
3746 unsigned int ds,
3747 unsigned int slen,
3748 unsigned int olx __rte_unused)
3749 {
3750 assert(!MLX5_TXOFF_CONFIG(INLINE));
3751 #ifdef MLX5_PMD_SOFT_COUNTERS
3752 /* Update sent data bytes counter. */
3753 txq->stats.obytes += slen;
3754 #else
3755 (void)slen;
3756 #endif
3757 loc->elts_free -= ds;
3758 loc->pkts_sent += ds;
3759 ds += 2;
3760 loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
3761 txq->wqe_ci += (ds + 3) / 4;
3762 loc->wqe_free -= (ds + 3) / 4;
3763 }
3764
3765 /*
3766 * Update send loop variables and WQE for eMPW loop
3767 * with data inlining. Gets the size of pushed descriptors
3768 * and data inlined into the WQE.
3769 *
3770 * @param txq
3771 * Pointer to TX queue structure.
3772 * @param loc
3773 * Pointer to burst routine local context.
3774 * @param len
3775 * Total size of descriptor/data in bytes.
3776 * @param slen
3777 * Accumulated statistics, data bytes sent.
3778 * @param olx
3779 * Configured Tx offloads mask. It is fully defined at
3780 * compile time and may be used for optimization.
3781 *
3785 */
3786 static __rte_always_inline void
3787 mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
3788 struct mlx5_txq_local *restrict loc,
3789 unsigned int len,
3790 unsigned int slen,
3791 unsigned int olx __rte_unused)
3792 {
3793 assert(MLX5_TXOFF_CONFIG(INLINE));
3794 assert((len % MLX5_WSEG_SIZE) == 0);
3795 #ifdef MLX5_PMD_SOFT_COUNTERS
3796 /* Update sent data bytes counter. */
3797 txq->stats.obytes += slen;
3798 #else
3799 (void)slen;
3800 #endif
3801 len = len / MLX5_WSEG_SIZE + 2;
3802 loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
3803 txq->wqe_ci += (len + 3) / 4;
3804 loc->wqe_free -= (len + 3) / 4;
3805 }
3806
3807 /**
3808 * The set of Tx burst functions for single-segment packets
3809 * without TSO and with Multi-Packet Writing feature support.
3810 * Supports all types of Tx offloads, except multi-segment
3811 * packets and TSO.
3812 *
3813 * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
3814 * as many packets per WQE as it can. If eMPW is not configured
3815 * or the packet cannot be sent with eMPW (VLAN insertion) the
3816 * ordinary SEND opcode is used and only one packet is placed
3817 * in the WQE.
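*
* For a sizing illustration (hypothetical batch, no inlining): the eMPW
* WQE consists of one Control Segment, one Ethernet Segment and one
* pointer Data Segment per packet, so a batch of 14 packets takes
* 2 + 14 = 16 segments, i.e. (16 + 3) / 4 = 4 WQEBBs.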
3818 *
3819 * The functions stop sending if they encounter a multi-segment
3820 * packet or a packet with TSO requested.
3821 *
3822 * The routines are responsible for storing the processed mbuf
3823 * into the elts ring buffer and updating elts_head if the inlining
3824 * offload is requested. Otherwise the copying of mbufs to elts
3825 * can be postponed and completed at the end of the burst routine.
3826 *
3827 * @param txq
3828 * Pointer to TX queue structure.
3829 * @param[in] pkts
3830 * Packets to transmit.
3831 * @param pkts_n
3832 * Number of packets in array.
3833 * @param loc
3834 * Pointer to burst routine local context.
3835 * @param olx
3836 * Configured Tx offloads mask. It is fully defined at
3837 * compile time and may be used for optimization.
3838 *
3839 * @return
3840 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
3841 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
3842 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
3843 * MLX5_TXCMP_CODE_TSO - TSO packet encountered.
3844 * MLX5_TXCMP_CODE_SINGLE - used inside functions set.
3845 * MLX5_TXCMP_CODE_EMPW - used inside functions set.
3846 *
3847 * Local context variables updated.
3848 *
3849 *
3850 * The routine sends packets with MLX5_OPCODE_EMPW
3851 * without inlining, this is a dedicated optimized branch.
3852 * No VLAN insertion is supported.
3853 */
3854 static __rte_always_inline enum mlx5_txcmp_code
3855 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq,
3856 struct rte_mbuf **restrict pkts,
3857 unsigned int pkts_n,
3858 struct mlx5_txq_local *restrict loc,
3859 unsigned int olx)
3860 {
3861 /*
3862 * Subroutine is a part of mlx5_tx_burst_single()
3863 * and sends single-segment packets with the eMPW opcode
3864 * without data inlining.
3865 */
3866 assert(!MLX5_TXOFF_CONFIG(INLINE));
3867 assert(MLX5_TXOFF_CONFIG(EMPW));
3868 assert(loc->elts_free && loc->wqe_free);
3869 assert(pkts_n > loc->pkts_sent);
3870 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
3871 pkts += loc->pkts_sent + 1;
3872 pkts_n -= loc->pkts_sent;
3873 for (;;) {
3874 struct mlx5_wqe_dseg *restrict dseg;
3875 struct mlx5_wqe_eseg *restrict eseg;
3876 enum mlx5_txcmp_code ret;
3877 unsigned int part, loop;
3878 unsigned int slen = 0;
3879
3880 next_empw:
3881 assert(NB_SEGS(loc->mbuf) == 1);
3882 part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
3883 MLX5_MPW_MAX_PACKETS :
3884 MLX5_EMPW_MAX_PACKETS);
3885 if (unlikely(loc->elts_free < part)) {
3886 /* We have not enough elts to store all mbufs. */
3887 if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
3888 return MLX5_TXCMP_CODE_EXIT;
3889 /* But we are still able to send at least the minimal eMPW. */
3890 part = loc->elts_free;
3891 }
3892 /* Check whether we have enough WQEs. */
3893 if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) {
3894 if (unlikely(loc->wqe_free <
3895 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
3896 return MLX5_TXCMP_CODE_EXIT;
3897 part = (loc->wqe_free * 4) - 2;
3898 }
3899 if (likely(part > 1))
3900 rte_prefetch0(*pkts);
3901 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3902 /*
3903 * Build eMPW title WQEBB:
3904 * - Control Segment, eMPW opcode
3905 * - Ethernet Segment, no inline
3906 */
3907 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2,
3908 MLX5_OPCODE_ENHANCED_MPSW, olx);
3909 mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
3910 olx & ~MLX5_TXOFF_CONFIG_VLAN);
3911 eseg = &loc->wqe_last->eseg;
3912 dseg = &loc->wqe_last->dseg[0];
3913 loop = part;
3914 /* Store the packet length for legacy MPW.
*/ 3915 if (MLX5_TXOFF_CONFIG(MPW)) 3916 eseg->mss = rte_cpu_to_be_16 3917 (rte_pktmbuf_data_len(loc->mbuf)); 3918 for (;;) { 3919 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 3920 #ifdef MLX5_PMD_SOFT_COUNTERS 3921 /* Update sent data bytes counter. */ 3922 slen += dlen; 3923 #endif 3924 mlx5_tx_dseg_ptr 3925 (txq, loc, dseg, 3926 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3927 dlen, olx); 3928 if (unlikely(--loop == 0)) 3929 break; 3930 loc->mbuf = *pkts++; 3931 if (likely(loop > 1)) 3932 rte_prefetch0(*pkts); 3933 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 3934 /* 3935 * Unroll the completion code to avoid 3936 * returning variable value - it results in 3937 * unoptimized sequent checking in caller. 3938 */ 3939 if (ret == MLX5_TXCMP_CODE_MULTI) { 3940 part -= loop; 3941 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3942 if (unlikely(!loc->elts_free || 3943 !loc->wqe_free)) 3944 return MLX5_TXCMP_CODE_EXIT; 3945 return MLX5_TXCMP_CODE_MULTI; 3946 } 3947 assert(NB_SEGS(loc->mbuf) == 1); 3948 if (ret == MLX5_TXCMP_CODE_TSO) { 3949 part -= loop; 3950 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3951 if (unlikely(!loc->elts_free || 3952 !loc->wqe_free)) 3953 return MLX5_TXCMP_CODE_EXIT; 3954 return MLX5_TXCMP_CODE_TSO; 3955 } 3956 if (ret == MLX5_TXCMP_CODE_SINGLE) { 3957 part -= loop; 3958 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3959 if (unlikely(!loc->elts_free || 3960 !loc->wqe_free)) 3961 return MLX5_TXCMP_CODE_EXIT; 3962 return MLX5_TXCMP_CODE_SINGLE; 3963 } 3964 if (ret != MLX5_TXCMP_CODE_EMPW) { 3965 assert(false); 3966 part -= loop; 3967 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3968 return MLX5_TXCMP_CODE_ERROR; 3969 } 3970 /* 3971 * Check whether packet parameters coincide 3972 * within assumed eMPW batch: 3973 * - check sum settings 3974 * - metadata value 3975 * - software parser settings 3976 * - packets length (legacy MPW only) 3977 */ 3978 if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) { 3979 assert(loop); 3980 part -= loop; 3981 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3982 if (unlikely(!loc->elts_free || 3983 !loc->wqe_free)) 3984 return MLX5_TXCMP_CODE_EXIT; 3985 pkts_n -= part; 3986 goto next_empw; 3987 } 3988 /* Packet attributes match, continue the same eMPW. */ 3989 ++dseg; 3990 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3991 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3992 } 3993 /* eMPW is built successfully, update loop parameters. */ 3994 assert(!loop); 3995 assert(pkts_n >= part); 3996 #ifdef MLX5_PMD_SOFT_COUNTERS 3997 /* Update sent data bytes counter. */ 3998 txq->stats.obytes += slen; 3999 #endif 4000 loc->elts_free -= part; 4001 loc->pkts_sent += part; 4002 txq->wqe_ci += (2 + part + 3) / 4; 4003 loc->wqe_free -= (2 + part + 3) / 4; 4004 pkts_n -= part; 4005 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 4006 return MLX5_TXCMP_CODE_EXIT; 4007 loc->mbuf = *pkts++; 4008 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4009 if (unlikely(ret != MLX5_TXCMP_CODE_EMPW)) 4010 return ret; 4011 /* Continue sending eMPW batches. */ 4012 } 4013 assert(false); 4014 } 4015 4016 /** 4017 * The routine sends packets with MLX5_OPCODE_EMPW 4018 * with inlining, optionally supports VLAN insertion. 
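*
* For illustration (hypothetical numbers, assuming the usual 16-byte
* WSEG and a 4-byte bcount field): a fully inlined packet consumes
* sizeof(bcount) + data_len bytes rounded up to the WSEG boundary, so
* a 60-byte packet takes 64 bytes of the room computed from the capped
* free WQEBBs minus the Control and Ethernet Segments.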
4019 */ 4020 static __rte_always_inline enum mlx5_txcmp_code 4021 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, 4022 struct rte_mbuf **restrict pkts, 4023 unsigned int pkts_n, 4024 struct mlx5_txq_local *restrict loc, 4025 unsigned int olx) 4026 { 4027 /* 4028 * Subroutine is the part of mlx5_tx_burst_single() 4029 * and sends single-segment packet with eMPW opcode 4030 * with data inlining. 4031 */ 4032 assert(MLX5_TXOFF_CONFIG(INLINE)); 4033 assert(MLX5_TXOFF_CONFIG(EMPW)); 4034 assert(loc->elts_free && loc->wqe_free); 4035 assert(pkts_n > loc->pkts_sent); 4036 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 4037 pkts += loc->pkts_sent + 1; 4038 pkts_n -= loc->pkts_sent; 4039 for (;;) { 4040 struct mlx5_wqe_dseg *restrict dseg; 4041 struct mlx5_wqe_eseg *restrict eseg; 4042 enum mlx5_txcmp_code ret; 4043 unsigned int room, part, nlim; 4044 unsigned int slen = 0; 4045 4046 assert(NB_SEGS(loc->mbuf) == 1); 4047 /* 4048 * Limits the amount of packets in one WQE 4049 * to improve CQE latency generation. 4050 */ 4051 nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? 4052 MLX5_MPW_INLINE_MAX_PACKETS : 4053 MLX5_EMPW_MAX_PACKETS); 4054 /* Check whether we have minimal amount WQEs */ 4055 if (unlikely(loc->wqe_free < 4056 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 4057 return MLX5_TXCMP_CODE_EXIT; 4058 if (likely(pkts_n > 1)) 4059 rte_prefetch0(*pkts); 4060 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4061 /* 4062 * Build eMPW title WQEBB: 4063 * - Control Segment, eMPW opcode, zero DS 4064 * - Ethernet Segment, no inline 4065 */ 4066 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0, 4067 MLX5_OPCODE_ENHANCED_MPSW, olx); 4068 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 4069 olx & ~MLX5_TXOFF_CONFIG_VLAN); 4070 eseg = &loc->wqe_last->eseg; 4071 dseg = &loc->wqe_last->dseg[0]; 4072 /* Store the packet length for legacy MPW. */ 4073 if (MLX5_TXOFF_CONFIG(MPW)) 4074 eseg->mss = rte_cpu_to_be_16 4075 (rte_pktmbuf_data_len(loc->mbuf)); 4076 room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE, 4077 loc->wqe_free) * MLX5_WQE_SIZE - 4078 MLX5_WQE_CSEG_SIZE - 4079 MLX5_WQE_ESEG_SIZE; 4080 /* Build WQE till we have space, packets and resources. */ 4081 part = room; 4082 for (;;) { 4083 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 4084 uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 4085 unsigned int tlen; 4086 4087 assert(room >= MLX5_WQE_DSEG_SIZE); 4088 assert((room % MLX5_WQE_DSEG_SIZE) == 0); 4089 assert((uintptr_t)dseg < (uintptr_t)txq->wqes_end); 4090 /* 4091 * Some Tx offloads may cause an error if 4092 * packet is not long enough, check against 4093 * assumed minimal length. 4094 */ 4095 if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) { 4096 part -= room; 4097 if (unlikely(!part)) 4098 return MLX5_TXCMP_CODE_ERROR; 4099 /* 4100 * We have some successfully built 4101 * packet Data Segments to send. 4102 */ 4103 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4104 return MLX5_TXCMP_CODE_ERROR; 4105 } 4106 /* Inline or not inline - that's the Question. */ 4107 if (dlen > txq->inlen_empw) 4108 goto pointer_empw; 4109 /* Inline entire packet, optional VLAN insertion. */ 4110 tlen = sizeof(dseg->bcount) + dlen; 4111 if (MLX5_TXOFF_CONFIG(VLAN) && 4112 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4113 /* 4114 * The packet length must be checked in 4115 * mlx5_tx_able_to_empw() and packet 4116 * fits into inline length guaranteed. 
*/
4118 assert((dlen + sizeof(struct rte_vlan_hdr)) <=
4119 txq->inlen_empw);
4120 tlen += sizeof(struct rte_vlan_hdr);
4121 if (room < tlen)
4122 break;
4123 dseg = mlx5_tx_dseg_vlan(txq, loc, dseg,
4124 dptr, dlen, olx);
4125 #ifdef MLX5_PMD_SOFT_COUNTERS
4126 /* Update sent data bytes counter. */
4127 slen += sizeof(struct rte_vlan_hdr);
4128 #endif
4129 } else {
4130 if (room < tlen)
4131 break;
4132 dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
4133 dptr, dlen, olx);
4134 }
4135 tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
4136 assert(room >= tlen);
4137 room -= tlen;
4138 /*
4139 * Packet data are completely inlined,
4140 * free the packet immediately.
4141 */
4142 rte_pktmbuf_free_seg(loc->mbuf);
4143 goto next_mbuf;
4144 pointer_empw:
4145 /*
4146 * Non-inlinable VLAN packets are
4147 * processed outside of this routine.
4148 */
4149 assert(room >= MLX5_WQE_DSEG_SIZE);
4150 if (MLX5_TXOFF_CONFIG(VLAN))
4151 assert(!(loc->mbuf->ol_flags &
4152 PKT_TX_VLAN_PKT));
4153 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
4154 /* We have to store mbuf in elts.*/
4155 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
4156 room -= MLX5_WQE_DSEG_SIZE;
4157 /* Ring buffer wraparound is checked at the loop end.*/
4158 ++dseg;
4159 next_mbuf:
4160 #ifdef MLX5_PMD_SOFT_COUNTERS
4161 /* Update sent data bytes counter. */
4162 slen += dlen;
4163 #endif
4164 loc->pkts_sent++;
4165 loc->elts_free--;
4166 pkts_n--;
4167 if (unlikely(!pkts_n || !loc->elts_free)) {
4168 /*
4169 * We have no resources/packets to
4170 * continue building descriptors.
4171 */
4172 part -= room;
4173 mlx5_tx_idone_empw(txq, loc, part, slen, olx);
4174 return MLX5_TXCMP_CODE_EXIT;
4175 }
4176 loc->mbuf = *pkts++;
4177 if (likely(pkts_n > 1))
4178 rte_prefetch0(*pkts);
4179 ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
4180 /*
4181 * Unroll the completion code to avoid
4182 * returning a variable value - it results in
4183 * unoptimized subsequent checking in the caller.
4184 */
4185 if (ret == MLX5_TXCMP_CODE_MULTI) {
4186 part -= room;
4187 mlx5_tx_idone_empw(txq, loc, part, slen, olx);
4188 if (unlikely(!loc->elts_free ||
4189 !loc->wqe_free))
4190 return MLX5_TXCMP_CODE_EXIT;
4191 return MLX5_TXCMP_CODE_MULTI;
4192 }
4193 assert(NB_SEGS(loc->mbuf) == 1);
4194 if (ret == MLX5_TXCMP_CODE_TSO) {
4195 part -= room;
4196 mlx5_tx_idone_empw(txq, loc, part, slen, olx);
4197 if (unlikely(!loc->elts_free ||
4198 !loc->wqe_free))
4199 return MLX5_TXCMP_CODE_EXIT;
4200 return MLX5_TXCMP_CODE_TSO;
4201 }
4202 if (ret == MLX5_TXCMP_CODE_SINGLE) {
4203 part -= room;
4204 mlx5_tx_idone_empw(txq, loc, part, slen, olx);
4205 if (unlikely(!loc->elts_free ||
4206 !loc->wqe_free))
4207 return MLX5_TXCMP_CODE_EXIT;
4208 return MLX5_TXCMP_CODE_SINGLE;
4209 }
4210 if (ret != MLX5_TXCMP_CODE_EMPW) {
4211 assert(false);
4212 part -= room;
4213 mlx5_tx_idone_empw(txq, loc, part, slen, olx);
4214 return MLX5_TXCMP_CODE_ERROR;
4215 }
4216 /* Check if we have minimal room left. */
4217 nlim--;
4218 if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE))
4219 break;
4220 /*
4221 * Check whether packet parameters coincide
4222 * within assumed eMPW batch:
4223 * - checksum settings
4224 * - metadata value
4225 * - software parser settings
4226 * - packet length (legacy MPW only)
4227 */
4228 if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx))
4229 break;
4230 /* Packet attributes match, continue the same eMPW.
*/ 4231 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 4232 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 4233 } 4234 /* 4235 * We get here to close an existing eMPW 4236 * session and start the new one. 4237 */ 4238 assert(pkts_n); 4239 part -= room; 4240 if (unlikely(!part)) 4241 return MLX5_TXCMP_CODE_EXIT; 4242 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4243 if (unlikely(!loc->elts_free || 4244 !loc->wqe_free)) 4245 return MLX5_TXCMP_CODE_EXIT; 4246 /* Continue the loop with new eMPW session. */ 4247 } 4248 assert(false); 4249 } 4250 4251 /** 4252 * The routine sends packets with ordinary MLX5_OPCODE_SEND. 4253 * Data inlining and VLAN insertion are supported. 4254 */ 4255 static __rte_always_inline enum mlx5_txcmp_code 4256 mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, 4257 struct rte_mbuf **restrict pkts, 4258 unsigned int pkts_n, 4259 struct mlx5_txq_local *restrict loc, 4260 unsigned int olx) 4261 { 4262 /* 4263 * Subroutine is the part of mlx5_tx_burst_single() 4264 * and sends single-segment packet with SEND opcode. 4265 */ 4266 assert(loc->elts_free && loc->wqe_free); 4267 assert(pkts_n > loc->pkts_sent); 4268 pkts += loc->pkts_sent + 1; 4269 pkts_n -= loc->pkts_sent; 4270 for (;;) { 4271 struct mlx5_wqe *restrict wqe; 4272 enum mlx5_txcmp_code ret; 4273 4274 assert(NB_SEGS(loc->mbuf) == 1); 4275 if (MLX5_TXOFF_CONFIG(INLINE)) { 4276 unsigned int inlen, vlan = 0; 4277 4278 inlen = rte_pktmbuf_data_len(loc->mbuf); 4279 if (MLX5_TXOFF_CONFIG(VLAN) && 4280 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4281 vlan = sizeof(struct rte_vlan_hdr); 4282 inlen += vlan; 4283 static_assert((sizeof(struct rte_vlan_hdr) + 4284 sizeof(struct rte_ether_hdr)) == 4285 MLX5_ESEG_MIN_INLINE_SIZE, 4286 "invalid min inline data size"); 4287 } 4288 /* 4289 * If inlining is enabled at configuration time 4290 * the limit must be not less than minimal size. 4291 * Otherwise we would do extra check for data 4292 * size to avoid crashes due to length overflow. 4293 */ 4294 assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); 4295 if (inlen <= txq->inlen_send) { 4296 unsigned int seg_n, wqe_n; 4297 4298 rte_prefetch0(rte_pktmbuf_mtod 4299 (loc->mbuf, uint8_t *)); 4300 /* Check against minimal length. */ 4301 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 4302 return MLX5_TXCMP_CODE_ERROR; 4303 /* 4304 * Completely inlined packet data WQE: 4305 * - Control Segment, SEND opcode 4306 * - Ethernet Segment, no VLAN insertion 4307 * - Data inlined, VLAN optionally inserted 4308 * - Alignment to MLX5_WSEG_SIZE 4309 * Have to estimate amount of WQEBBs 4310 */ 4311 seg_n = (inlen + 3 * MLX5_WSEG_SIZE - 4312 MLX5_ESEG_MIN_INLINE_SIZE + 4313 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4314 /* Check if there are enough WQEBBs. */ 4315 wqe_n = (seg_n + 3) / 4; 4316 if (wqe_n > loc->wqe_free) 4317 return MLX5_TXCMP_CODE_EXIT; 4318 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4319 loc->wqe_last = wqe; 4320 mlx5_tx_cseg_init(txq, loc, wqe, seg_n, 4321 MLX5_OPCODE_SEND, olx); 4322 mlx5_tx_eseg_data(txq, loc, wqe, 4323 vlan, inlen, 0, olx); 4324 txq->wqe_ci += wqe_n; 4325 loc->wqe_free -= wqe_n; 4326 /* 4327 * Packet data are completely inlined, 4328 * free the packet immediately. 
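* (As a sizing illustration of the WQEBB estimate above, assuming the
* usual 16-byte WSEG and the 18-byte minimal inline ESEG: a completely
* inlined packet of inlen = 128 bytes gives seg_n = (128 + 48 - 18 + 15)
* / 16 = 10 segments, i.e. wqe_n = (10 + 3) / 4 = 3 WQEBBs.)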
*/
4330 rte_pktmbuf_free_seg(loc->mbuf);
4331 } else if ((!MLX5_TXOFF_CONFIG(EMPW) ||
4332 MLX5_TXOFF_CONFIG(MPW)) &&
4333 txq->inlen_mode) {
4334 /*
4335 * If minimal inlining is requested the eMPW
4336 * feature should be disabled because the data
4337 * is inlined into the Ethernet Segment, which
4338 * cannot contain inlined data for eMPW since
4339 * that segment is shared by all packets.
4340 */
4341 struct mlx5_wqe_dseg *restrict dseg;
4342 unsigned int ds;
4343 uint8_t *dptr;
4344
4345 /*
4346 * The inline-mode settings require inlining
4347 * the specified amount of data bytes into
4348 * the Ethernet Segment. We should check the
4349 * free space in the WQE ring buffer before
4350 * doing the partial inline.
4351 */
4352 assert(txq->inlen_send >= txq->inlen_mode);
4353 assert(inlen > txq->inlen_mode);
4354 assert(txq->inlen_mode >=
4355 MLX5_ESEG_MIN_INLINE_SIZE);
4356 /*
4357 * Check whether there are enough free WQEBBs:
4358 * - Control Segment
4359 * - Ethernet Segment
4360 * - First Segment of inlined Ethernet data
4361 * - ... data continued ...
4362 * - Finishing Data Segment of pointer type
4363 */
4364 ds = (MLX5_WQE_CSEG_SIZE +
4365 MLX5_WQE_ESEG_SIZE +
4366 MLX5_WQE_DSEG_SIZE +
4367 txq->inlen_mode -
4368 MLX5_ESEG_MIN_INLINE_SIZE +
4369 MLX5_WQE_DSEG_SIZE +
4370 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
4371 if (loc->wqe_free < ((ds + 3) / 4))
4372 return MLX5_TXCMP_CODE_EXIT;
4373 /*
4374 * Build the ordinary SEND WQE:
4375 * - Control Segment
4376 * - Ethernet Segment, inline inlen_mode bytes
4377 * - Data Segment of pointer type
4378 */
4379 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4380 loc->wqe_last = wqe;
4381 mlx5_tx_cseg_init(txq, loc, wqe, ds,
4382 MLX5_OPCODE_SEND, olx);
4383 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan,
4384 txq->inlen_mode,
4385 0, olx);
4386 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
4387 txq->inlen_mode - vlan;
4388 inlen -= txq->inlen_mode;
4389 mlx5_tx_dseg_ptr(txq, loc, dseg,
4390 dptr, inlen, olx);
4391 /*
4392 * WQE is built, update the loop parameters
4393 * and go to the next packet.
4394 */
4395 txq->wqe_ci += (ds + 3) / 4;
4396 loc->wqe_free -= (ds + 3) / 4;
4397 /* We have to store mbuf in elts.*/
4398 assert(MLX5_TXOFF_CONFIG(INLINE));
4399 txq->elts[txq->elts_head++ & txq->elts_m] =
4400 loc->mbuf;
4401 --loc->elts_free;
4402 } else {
4403 uint8_t *dptr;
4404 unsigned int dlen;
4405
4406 /*
4407 * Partially inlined packet data WQE: we have
4408 * some space in the title WQEBB and we can fill
4409 * it with some packet data. It takes one WQEBB,
4410 * which is available, so no extra space check:
4411 * - Control Segment, SEND opcode
4412 * - Ethernet Segment, no VLAN insertion
4413 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data
4414 * - Data Segment, pointer type
4415 *
4416 * We also get here if VLAN insertion is not
4417 * supported by HW but inlining is enabled.
4418 */
4419 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4420 loc->wqe_last = wqe;
4421 mlx5_tx_cseg_init(txq, loc, wqe, 4,
4422 MLX5_OPCODE_SEND, olx);
4423 mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx);
4424 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
4425 MLX5_ESEG_MIN_INLINE_SIZE - vlan;
4426 /*
4427 * The length check is performed above, by
4428 * comparing with txq->inlen_send. We should
4429 * not get overflow here.
*/
4431 assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
4432 dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE;
4433 mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1],
4434 dptr, dlen, olx);
4435 ++txq->wqe_ci;
4436 --loc->wqe_free;
4437 /* We have to store mbuf in elts.*/
4438 assert(MLX5_TXOFF_CONFIG(INLINE));
4439 txq->elts[txq->elts_head++ & txq->elts_m] =
4440 loc->mbuf;
4441 --loc->elts_free;
4442 }
4443 #ifdef MLX5_PMD_SOFT_COUNTERS
4444 /* Update sent data bytes counter. */
4445 txq->stats.obytes += vlan +
4446 rte_pktmbuf_data_len(loc->mbuf);
4447 #endif
4448 } else {
4449 /*
4450 * No inlining at all - it means that saving CPU
4451 * cycles is prioritized at configuration time, so
4452 * we should not copy any packet data to the WQE.
4453 *
4454 * SEND WQE, one WQEBB:
4455 * - Control Segment, SEND opcode
4456 * - Ethernet Segment, optional VLAN, no inline
4457 * - Data Segment, pointer type
4458 */
4459 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
4460 loc->wqe_last = wqe;
4461 mlx5_tx_cseg_init(txq, loc, wqe, 3,
4462 MLX5_OPCODE_SEND, olx);
4463 mlx5_tx_eseg_none(txq, loc, wqe, olx);
4464 mlx5_tx_dseg_ptr
4465 (txq, loc, &wqe->dseg[0],
4466 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
4467 rte_pktmbuf_data_len(loc->mbuf), olx);
4468 ++txq->wqe_ci;
4469 --loc->wqe_free;
4470 /*
4471 * We should not store the mbuf pointer in elts
4472 * if no inlining is configured; this is done
4473 * by the calling routine as a batch copy.
4474 */
4475 assert(!MLX5_TXOFF_CONFIG(INLINE));
4476 --loc->elts_free;
4477 #ifdef MLX5_PMD_SOFT_COUNTERS
4478 /* Update sent data bytes counter. */
4479 txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf);
4480 if (MLX5_TXOFF_CONFIG(VLAN) &&
4481 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
4482 txq->stats.obytes +=
4483 sizeof(struct rte_vlan_hdr);
4484 #endif
4485 }
4486 ++loc->pkts_sent;
4487 --pkts_n;
4488 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
4489 return MLX5_TXCMP_CODE_EXIT;
4490 loc->mbuf = *pkts++;
4491 if (pkts_n > 1)
4492 rte_prefetch0(*pkts);
4493 ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
4494 if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
4495 return ret;
4496 }
4497 assert(false);
4498 }
4499
4500 static __rte_always_inline enum mlx5_txcmp_code
4501 mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq,
4502 struct rte_mbuf **restrict pkts,
4503 unsigned int pkts_n,
4504 struct mlx5_txq_local *restrict loc,
4505 unsigned int olx)
4506 {
4507 enum mlx5_txcmp_code ret;
4508
4509 ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
4510 if (ret == MLX5_TXCMP_CODE_SINGLE)
4511 goto ordinary_send;
4512 assert(ret == MLX5_TXCMP_CODE_EMPW);
4513 for (;;) {
4514 /* Optimize for inline/no inline eMPW send. */
4515 ret = (MLX5_TXOFF_CONFIG(INLINE)) ?
4516 mlx5_tx_burst_empw_inline
4517 (txq, pkts, pkts_n, loc, olx) :
4518 mlx5_tx_burst_empw_simple
4519 (txq, pkts, pkts_n, loc, olx);
4520 if (ret != MLX5_TXCMP_CODE_SINGLE)
4521 return ret;
4522 /* The resources to send one packet should remain. */
4523 assert(loc->elts_free && loc->wqe_free);
4524 ordinary_send:
4525 ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
4526 assert(ret != MLX5_TXCMP_CODE_SINGLE);
4527 if (ret != MLX5_TXCMP_CODE_EMPW)
4528 return ret;
4529 /* The resources to send one packet should remain. */
4530 assert(loc->elts_free && loc->wqe_free);
4531 }
4532 }
4533
4534 /**
4535 * DPDK Tx callback template. This is a configured template
4536 * used to generate routines optimized for the specified offload setup.
4537 * One of these generated functions is chosen at SQ configuration
4538 * time.
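*
* For example, the full_empw variant declared below is generated with
* olx = MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW, so the checks
* against olx inside the template fold to compile-time constants for
* that routine and the branches for not requested offloads vanish.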
4539 *
4540 * @param txq
4541 * Generic pointer to TX queue structure.
4542 * @param[in] pkts
4543 * Packets to transmit.
4544 * @param pkts_n
4545 * Number of packets in array.
4546 * @param olx
4547 * Configured offloads mask presenting the bits of MLX5_TXOFF_CONFIG_xxx
4548 * values. Should be constant to take advantage of the compile-time
4549 * static configuration.
4550 *
4551 * @return
4552 * Number of packets successfully transmitted (<= pkts_n).
4553 */
4554 static __rte_always_inline uint16_t
4555 mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
4556 struct rte_mbuf **restrict pkts,
4557 uint16_t pkts_n,
4558 unsigned int olx)
4559 {
4560 struct mlx5_txq_local loc;
4561 enum mlx5_txcmp_code ret;
4562 unsigned int part;
4563
4564 assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
4565 assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
4566 if (unlikely(!pkts_n))
4567 return 0;
4568 loc.pkts_sent = 0;
4569 loc.pkts_copy = 0;
4570 loc.wqe_last = NULL;
4571
4572 send_loop:
4573 loc.pkts_loop = loc.pkts_sent;
4574 /*
4575 * Check if there are some CQEs, if any:
4576 * - process encountered errors
4577 * - process the completed WQEs
4578 * - free related mbufs
4579 * - doorbell the NIC about processed CQEs
4580 */
4581 rte_prefetch0(*(pkts + loc.pkts_sent));
4582 mlx5_tx_handle_completion(txq, olx);
4583 /*
4584 * Calculate the number of available resources - elts and WQEs.
4585 * There are two possible different scenarios:
4586 * - no data inlining into WQEs, one WQEBB may contain up to
4587 * four packets, in this case elts becomes the scarce resource
4588 * - data inlining into WQEs, one packet may require multiple
4589 * WQEBBs, the WQEs become the limiting factor.
4590 */
4591 assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
4592 loc.elts_free = txq->elts_s -
4593 (uint16_t)(txq->elts_head - txq->elts_tail);
4594 assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
4595 loc.wqe_free = txq->wqe_s -
4596 (uint16_t)(txq->wqe_ci - txq->wqe_pi);
4597 if (unlikely(!loc.elts_free || !loc.wqe_free))
4598 goto burst_exit;
4599 for (;;) {
4600 /*
4601 * Fetch the packet from the array. Usually this is
4602 * the first packet in a series of multi/single-
4603 * segment packets.
4604 */
4605 loc.mbuf = *(pkts + loc.pkts_sent);
4606 /* Dedicated branch for multi-segment packets. */
4607 if (MLX5_TXOFF_CONFIG(MULTI) &&
4608 unlikely(NB_SEGS(loc.mbuf) > 1)) {
4609 /*
4610 * Multi-segment packet encountered.
4611 * Hardware is able to process it only
4612 * with SEND/TSO opcodes, one packet
4613 * per WQE, do it in dedicated routine.
4614 */
4615 enter_send_multi:
4616 assert(loc.pkts_sent >= loc.pkts_copy);
4617 part = loc.pkts_sent - loc.pkts_copy;
4618 if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
4619 /*
4620 * There are some single-segment mbufs not
4621 * stored in elts. The mbufs must be in the
4622 * same order as WQEs, so we must copy the
4623 * mbufs to elts here, before the coming
4624 * multi-segment packet mbuf is appended.
4625 */
4626 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy,
4627 part, olx);
4628 loc.pkts_copy = loc.pkts_sent;
4629 }
4630 assert(pkts_n > loc.pkts_sent);
4631 ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
4632 if (!MLX5_TXOFF_CONFIG(INLINE))
4633 loc.pkts_copy = loc.pkts_sent;
4634 /*
4635 * These return code checks are supposed
4636 * to be optimized out due to routine inlining.
*/
4638 if (ret == MLX5_TXCMP_CODE_EXIT) {
4639 /*
4640 * The routine returns this code when
4641 * all packets are sent or there are not
4642 * enough resources to complete the request.
4643 */
4644 break;
4645 }
4646 if (ret == MLX5_TXCMP_CODE_ERROR) {
4647 /*
4648 * The routine returns this code when
4649 * some error in the incoming packet
4650 * format occurred.
4651 */
4652 txq->stats.oerrors++;
4653 break;
4654 }
4655 if (ret == MLX5_TXCMP_CODE_SINGLE) {
4656 /*
4657 * The single-segment packet was encountered
4658 * in the array, try to send it in the
4659 * best optimized way, possibly engaging eMPW.
4660 */
4661 goto enter_send_single;
4662 }
4663 if (MLX5_TXOFF_CONFIG(TSO) &&
4664 ret == MLX5_TXCMP_CODE_TSO) {
4665 /*
4666 * The single-segment TSO packet was
4667 * encountered in the array.
4668 */
4669 goto enter_send_tso;
4670 }
4671 /* We must not get here. Something is going wrong. */
4672 assert(false);
4673 txq->stats.oerrors++;
4674 break;
4675 }
4676 /* Dedicated branch for single-segment TSO packets. */
4677 if (MLX5_TXOFF_CONFIG(TSO) &&
4678 unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) {
4679 /*
4680 * TSO might require a special way of inlining
4681 * (dedicated parameters) and is sent with
4682 * the MLX5_OPCODE_TSO opcode only, so provide
4683 * this in a dedicated branch.
4684 */
4685 enter_send_tso:
4686 assert(NB_SEGS(loc.mbuf) == 1);
4687 assert(pkts_n > loc.pkts_sent);
4688 ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
4689 /*
4690 * These return code checks are supposed
4691 * to be optimized out due to routine inlining.
4692 */
4693 if (ret == MLX5_TXCMP_CODE_EXIT)
4694 break;
4695 if (ret == MLX5_TXCMP_CODE_ERROR) {
4696 txq->stats.oerrors++;
4697 break;
4698 }
4699 if (ret == MLX5_TXCMP_CODE_SINGLE)
4700 goto enter_send_single;
4701 if (MLX5_TXOFF_CONFIG(MULTI) &&
4702 ret == MLX5_TXCMP_CODE_MULTI) {
4703 /*
4704 * The multi-segment packet was
4705 * encountered in the array.
4706 */
4707 goto enter_send_multi;
4708 }
4709 /* We must not get here. Something is going wrong. */
4710 assert(false);
4711 txq->stats.oerrors++;
4712 break;
4713 }
4714 /*
4715 * The dedicated branch for the single-segment packets
4716 * without TSO. Often these can be sent using
4717 * MLX5_OPCODE_EMPW with multiple packets in one WQE.
4718 * The routine builds the WQEs till it encounters
4719 * a TSO or multi-segment packet (in case these
4720 * offloads are requested at SQ configuration time).
4721 */
4722 enter_send_single:
4723 assert(pkts_n > loc.pkts_sent);
4724 ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
4725 /*
4726 * These return code checks are supposed
4727 * to be optimized out due to routine inlining.
4728 */
4729 if (ret == MLX5_TXCMP_CODE_EXIT)
4730 break;
4731 if (ret == MLX5_TXCMP_CODE_ERROR) {
4732 txq->stats.oerrors++;
4733 break;
4734 }
4735 if (MLX5_TXOFF_CONFIG(MULTI) &&
4736 ret == MLX5_TXCMP_CODE_MULTI) {
4737 /*
4738 * The multi-segment packet was
4739 * encountered in the array.
4740 */
4741 goto enter_send_multi;
4742 }
4743 if (MLX5_TXOFF_CONFIG(TSO) &&
4744 ret == MLX5_TXCMP_CODE_TSO) {
4745 /*
4746 * The single-segment TSO packet was
4747 * encountered in the array.
4748 */
4749 goto enter_send_tso;
4750 }
4751 /* We must not get here. Something is going wrong.
*/
4752 assert(false);
4753 txq->stats.oerrors++;
4754 break;
4755 }
4756 /*
4757 * Main Tx loop is completed, do the rest:
4758 * - set completion request if thresholds are reached
4759 * - doorbell the hardware
4760 * - copy the rest of mbufs to elts (if any)
4761 */
4762 assert(MLX5_TXOFF_CONFIG(INLINE) || loc.pkts_sent >= loc.pkts_copy);
4763 /* Take a shortcut if nothing is sent. */
4764 if (unlikely(loc.pkts_sent == loc.pkts_loop))
4765 goto burst_exit;
4766 /* Request CQE generation if limits are reached. */
4767 mlx5_tx_request_completion(txq, &loc, olx);
4768 /*
4769 * Ring the QP doorbell immediately after WQE building completion
4770 * to improve latencies. The pure software related data treatment
4771 * can be completed after the doorbell. Tx CQEs for this SQ are
4772 * processed in this thread only by the polling.
4773 *
4774 * The rdma core library can map the doorbell register in two ways,
4775 * depending on the environment variable "MLX5_SHUT_UP_BF":
4776 *
4777 * - as regular cached memory, the variable is either missing or
4778 * set to zero. This type of mapping may cause significant
4779 * doorbell register writing latency and requires an explicit
4780 * memory write barrier to mitigate this issue and prevent
4781 * write combining.
4782 *
4783 * - as non-cached memory, the variable is present and set to
4784 * a non-zero value. This type of mapping may cause a performance
4785 * impact under heavy loading conditions but the explicit write
4786 * memory barrier is not required, which may improve core
4787 * performance.
4788 *
4789 * - the legacy behaviour (prior to the 19.08 release) was to use
4790 * some heuristics to decide whether the write memory barrier
4791 * should be performed. This behaviour is supported by specifying
4792 * tx_db_nc=2; the write barrier is skipped if the application
4793 * provides the full recommended burst of packets, it is
4794 * supposed that the next packets are coming and the write barrier
4795 * will be issued on the next burst (after descriptor writing,
4796 * at least).
4797 */
4798 mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc &&
4799 (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST));
4800 /* Not all of the mbufs may be stored into elts yet. */
4801 part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy;
4802 if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
4803 /*
4804 * There are some single-segment mbufs not stored in elts.
4805 * This can happen only if the last packet was single-segment.
4806 * The copying is gathered into one place because it is
4807 * a good opportunity to optimize that with SIMD.
4808 * Unfortunately, if inlining is enabled, gaps in the
4809 * pointer array may happen due to early freeing of the
4810 * inlined mbufs.
4811 */
4812 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
4813 loc.pkts_copy = loc.pkts_sent;
4814 }
4815 assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
4816 assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
4817 if (pkts_n > loc.pkts_sent) {
4818 /*
4819 * If the burst size is large there might be not enough
4820 * CQEs fetched from the completion queue and not enough
4821 * resources freed to send all the packets.
4822 */
4823 goto send_loop;
4824 }
4825 burst_exit:
4826 #ifdef MLX5_PMD_SOFT_COUNTERS
4827 /* Increment sent packets counter. */
4828 txq->stats.opackets += loc.pkts_sent;
4829 #endif
4830 return loc.pkts_sent;
4831 }
4832
4833 /* Generate routines with Enhanced Multi-Packet Write support.
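* The suffix letters in the generated names appear to encode the enabled
* offloads: m - MULTI, t - TSO, s - SWP, c - CSUM, i - INLINE, v - VLAN,
* md - METADATA; the exact set for every routine is spelled out in its
* declaration below.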
*/ 4834 MLX5_TXOFF_DECL(full_empw, 4835 MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW) 4836 4837 MLX5_TXOFF_DECL(none_empw, 4838 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 4839 4840 MLX5_TXOFF_DECL(md_empw, 4841 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4842 4843 MLX5_TXOFF_DECL(mt_empw, 4844 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4845 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4846 4847 MLX5_TXOFF_DECL(mtsc_empw, 4848 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4849 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4850 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4851 4852 MLX5_TXOFF_DECL(mti_empw, 4853 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4854 MLX5_TXOFF_CONFIG_INLINE | 4855 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4856 4857 MLX5_TXOFF_DECL(mtv_empw, 4858 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4859 MLX5_TXOFF_CONFIG_VLAN | 4860 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4861 4862 MLX5_TXOFF_DECL(mtiv_empw, 4863 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4864 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4865 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4866 4867 MLX5_TXOFF_DECL(sc_empw, 4868 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4869 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4870 4871 MLX5_TXOFF_DECL(sci_empw, 4872 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4873 MLX5_TXOFF_CONFIG_INLINE | 4874 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4875 4876 MLX5_TXOFF_DECL(scv_empw, 4877 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4878 MLX5_TXOFF_CONFIG_VLAN | 4879 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4880 4881 MLX5_TXOFF_DECL(sciv_empw, 4882 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4883 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4884 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4885 4886 MLX5_TXOFF_DECL(i_empw, 4887 MLX5_TXOFF_CONFIG_INLINE | 4888 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4889 4890 MLX5_TXOFF_DECL(v_empw, 4891 MLX5_TXOFF_CONFIG_VLAN | 4892 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4893 4894 MLX5_TXOFF_DECL(iv_empw, 4895 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4896 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4897 4898 /* Generate routines without Enhanced Multi-Packet Write support. 
*/ 4899 MLX5_TXOFF_DECL(full, 4900 MLX5_TXOFF_CONFIG_FULL) 4901 4902 MLX5_TXOFF_DECL(none, 4903 MLX5_TXOFF_CONFIG_NONE) 4904 4905 MLX5_TXOFF_DECL(md, 4906 MLX5_TXOFF_CONFIG_METADATA) 4907 4908 MLX5_TXOFF_DECL(mt, 4909 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4910 MLX5_TXOFF_CONFIG_METADATA) 4911 4912 MLX5_TXOFF_DECL(mtsc, 4913 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4914 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4915 MLX5_TXOFF_CONFIG_METADATA) 4916 4917 MLX5_TXOFF_DECL(mti, 4918 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4919 MLX5_TXOFF_CONFIG_INLINE | 4920 MLX5_TXOFF_CONFIG_METADATA) 4921 4922 4923 MLX5_TXOFF_DECL(mtv, 4924 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4925 MLX5_TXOFF_CONFIG_VLAN | 4926 MLX5_TXOFF_CONFIG_METADATA) 4927 4928 4929 MLX5_TXOFF_DECL(mtiv, 4930 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4931 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4932 MLX5_TXOFF_CONFIG_METADATA) 4933 4934 MLX5_TXOFF_DECL(sc, 4935 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4936 MLX5_TXOFF_CONFIG_METADATA) 4937 4938 MLX5_TXOFF_DECL(sci, 4939 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4940 MLX5_TXOFF_CONFIG_INLINE | 4941 MLX5_TXOFF_CONFIG_METADATA) 4942 4943 4944 MLX5_TXOFF_DECL(scv, 4945 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4946 MLX5_TXOFF_CONFIG_VLAN | 4947 MLX5_TXOFF_CONFIG_METADATA) 4948 4949 4950 MLX5_TXOFF_DECL(sciv, 4951 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4952 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4953 MLX5_TXOFF_CONFIG_METADATA) 4954 4955 MLX5_TXOFF_DECL(i, 4956 MLX5_TXOFF_CONFIG_INLINE | 4957 MLX5_TXOFF_CONFIG_METADATA) 4958 4959 MLX5_TXOFF_DECL(v, 4960 MLX5_TXOFF_CONFIG_VLAN | 4961 MLX5_TXOFF_CONFIG_METADATA) 4962 4963 MLX5_TXOFF_DECL(iv, 4964 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4965 MLX5_TXOFF_CONFIG_METADATA) 4966 4967 /* 4968 * Generate routines with Legacy Multi-Packet Write support. 4969 * This mode is supported by ConnectX-4LX only and imposes 4970 * offload limitations, not supported: 4971 * - ACL/Flows (metadata are becoming meaningless) 4972 * - WQE Inline headers 4973 * - SRIOV (E-Switch offloads) 4974 * - VLAN insertion 4975 * - tunnel encapsulation/decapsulation 4976 * - TSO 4977 */ 4978 MLX5_TXOFF_DECL(none_mpw, 4979 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | 4980 MLX5_TXOFF_CONFIG_MPW) 4981 4982 MLX5_TXOFF_DECL(mci_mpw, 4983 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 4984 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 4985 MLX5_TXOFF_CONFIG_MPW) 4986 4987 MLX5_TXOFF_DECL(mc_mpw, 4988 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 4989 MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) 4990 4991 MLX5_TXOFF_DECL(i_mpw, 4992 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 4993 MLX5_TXOFF_CONFIG_MPW) 4994 4995 /* 4996 * Array of declared and compiled Tx burst function and corresponding 4997 * supported offloads set. The array is used to select the Tx burst 4998 * function for specified offloads set at Tx queue configuration time. 
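* mlx5_select_tx_function() scans this array in order: the first entry
* matching the requested offload set exactly is taken, otherwise the
* candidate with the fewest not requested extra offloads is kept.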
4999 */ 5000 const struct { 5001 eth_tx_burst_t func; 5002 unsigned int olx; 5003 } txoff_func[] = { 5004 MLX5_TXOFF_INFO(full_empw, 5005 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5006 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5007 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5008 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5009 5010 MLX5_TXOFF_INFO(none_empw, 5011 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 5012 5013 MLX5_TXOFF_INFO(md_empw, 5014 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5015 5016 MLX5_TXOFF_INFO(mt_empw, 5017 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5018 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5019 5020 MLX5_TXOFF_INFO(mtsc_empw, 5021 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5022 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5023 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5024 5025 MLX5_TXOFF_INFO(mti_empw, 5026 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5027 MLX5_TXOFF_CONFIG_INLINE | 5028 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5029 5030 MLX5_TXOFF_INFO(mtv_empw, 5031 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5032 MLX5_TXOFF_CONFIG_VLAN | 5033 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5034 5035 MLX5_TXOFF_INFO(mtiv_empw, 5036 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5037 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5038 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5039 5040 MLX5_TXOFF_INFO(sc_empw, 5041 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5042 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5043 5044 MLX5_TXOFF_INFO(sci_empw, 5045 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5046 MLX5_TXOFF_CONFIG_INLINE | 5047 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5048 5049 MLX5_TXOFF_INFO(scv_empw, 5050 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5051 MLX5_TXOFF_CONFIG_VLAN | 5052 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5053 5054 MLX5_TXOFF_INFO(sciv_empw, 5055 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5056 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5057 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5058 5059 MLX5_TXOFF_INFO(i_empw, 5060 MLX5_TXOFF_CONFIG_INLINE | 5061 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5062 5063 MLX5_TXOFF_INFO(v_empw, 5064 MLX5_TXOFF_CONFIG_VLAN | 5065 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5066 5067 MLX5_TXOFF_INFO(iv_empw, 5068 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5069 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5070 5071 MLX5_TXOFF_INFO(full, 5072 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5073 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5074 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5075 MLX5_TXOFF_CONFIG_METADATA) 5076 5077 MLX5_TXOFF_INFO(none, 5078 MLX5_TXOFF_CONFIG_NONE) 5079 5080 MLX5_TXOFF_INFO(md, 5081 MLX5_TXOFF_CONFIG_METADATA) 5082 5083 MLX5_TXOFF_INFO(mt, 5084 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5085 MLX5_TXOFF_CONFIG_METADATA) 5086 5087 MLX5_TXOFF_INFO(mtsc, 5088 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5089 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5090 MLX5_TXOFF_CONFIG_METADATA) 5091 5092 MLX5_TXOFF_INFO(mti, 5093 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5094 MLX5_TXOFF_CONFIG_INLINE | 5095 MLX5_TXOFF_CONFIG_METADATA) 5096 5097 MLX5_TXOFF_INFO(mtv, 5098 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5099 MLX5_TXOFF_CONFIG_VLAN | 5100 MLX5_TXOFF_CONFIG_METADATA) 5101 5102 MLX5_TXOFF_INFO(mtiv, 5103 
MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5104 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5105 MLX5_TXOFF_CONFIG_METADATA) 5106 5107 MLX5_TXOFF_INFO(sc, 5108 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5109 MLX5_TXOFF_CONFIG_METADATA) 5110 5111 MLX5_TXOFF_INFO(sci, 5112 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5113 MLX5_TXOFF_CONFIG_INLINE | 5114 MLX5_TXOFF_CONFIG_METADATA) 5115 5116 MLX5_TXOFF_INFO(scv, 5117 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5118 MLX5_TXOFF_CONFIG_VLAN | 5119 MLX5_TXOFF_CONFIG_METADATA) 5120 5121 MLX5_TXOFF_INFO(sciv, 5122 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5123 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5124 MLX5_TXOFF_CONFIG_METADATA) 5125 5126 MLX5_TXOFF_INFO(i, 5127 MLX5_TXOFF_CONFIG_INLINE | 5128 MLX5_TXOFF_CONFIG_METADATA) 5129 5130 MLX5_TXOFF_INFO(v, 5131 MLX5_TXOFF_CONFIG_VLAN | 5132 MLX5_TXOFF_CONFIG_METADATA) 5133 5134 MLX5_TXOFF_INFO(iv, 5135 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5136 MLX5_TXOFF_CONFIG_METADATA) 5137 5138 MLX5_TXOFF_INFO(none_mpw, 5139 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | 5140 MLX5_TXOFF_CONFIG_MPW) 5141 5142 MLX5_TXOFF_INFO(mci_mpw, 5143 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5144 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5145 MLX5_TXOFF_CONFIG_MPW) 5146 5147 MLX5_TXOFF_INFO(mc_mpw, 5148 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | 5149 MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) 5150 5151 MLX5_TXOFF_INFO(i_mpw, 5152 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | 5153 MLX5_TXOFF_CONFIG_MPW) 5154 }; 5155 5156 /** 5157 * Configure the Tx function to use. The routine checks configured 5158 * Tx offloads for the device and selects appropriate Tx burst 5159 * routine. There are multiple Tx burst routines compiled from 5160 * the same template in the most optimal way for the dedicated 5161 * Tx offloads set. 5162 * 5163 * @param dev 5164 * Pointer to private data structure. 5165 * 5166 * @return 5167 * Pointer to selected Tx burst function. 5168 */ 5169 eth_tx_burst_t 5170 mlx5_select_tx_function(struct rte_eth_dev *dev) 5171 { 5172 struct mlx5_priv *priv = dev->data->dev_private; 5173 struct mlx5_dev_config *config = &priv->config; 5174 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 5175 unsigned int diff = 0, olx = 0, i, m; 5176 5177 static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <= 5178 MLX5_DSEG_MAX, "invalid WQE max size"); 5179 static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE, 5180 "invalid WQE Control Segment size"); 5181 static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE, 5182 "invalid WQE Ethernet Segment size"); 5183 static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE, 5184 "invalid WQE Data Segment size"); 5185 static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE, 5186 "invalid WQE size"); 5187 assert(priv); 5188 if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) { 5189 /* We should support Multi-Segment Packets. */ 5190 olx |= MLX5_TXOFF_CONFIG_MULTI; 5191 } 5192 if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | 5193 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 5194 DEV_TX_OFFLOAD_GRE_TNL_TSO | 5195 DEV_TX_OFFLOAD_IP_TNL_TSO | 5196 DEV_TX_OFFLOAD_UDP_TNL_TSO)) { 5197 /* We should support TCP Send Offload. */ 5198 olx |= MLX5_TXOFF_CONFIG_TSO; 5199 } 5200 if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | 5201 DEV_TX_OFFLOAD_UDP_TNL_TSO | 5202 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { 5203 /* We should support Software Parser for Tunnels. 
*/
5204 olx |= MLX5_TXOFF_CONFIG_SWP;
5205 }
5206 if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
5207 DEV_TX_OFFLOAD_UDP_CKSUM |
5208 DEV_TX_OFFLOAD_TCP_CKSUM |
5209 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
5210 /* We should support IP/TCP/UDP Checksums. */
5211 olx |= MLX5_TXOFF_CONFIG_CSUM;
5212 }
5213 if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) {
5214 /* We should support VLAN insertion. */
5215 olx |= MLX5_TXOFF_CONFIG_VLAN;
5216 }
5217 if (priv->txqs_n && (*priv->txqs)[0]) {
5218 struct mlx5_txq_data *txd = (*priv->txqs)[0];
5219
5220 if (txd->inlen_send) {
5221 /*
5222 * Check the data inline requirements. Data inlining
5223 * is enabled on a per-device basis, so we can check
5224 * the first Tx queue only.
5225 *
5226 * If the device does not support VLAN insertion in
5227 * the WQE and some queues are requested to perform
5228 * VLAN insertion offload then inlining must be enabled.
5229 */
5230 olx |= MLX5_TXOFF_CONFIG_INLINE;
5231 }
5232 }
5233 if (config->mps == MLX5_MPW_ENHANCED &&
5234 config->txq_inline_min <= 0) {
5235 /*
5236 * The NIC supports Enhanced Multi-Packet Write
5237 * and does not require minimal inline data.
5238 */
5239 olx |= MLX5_TXOFF_CONFIG_EMPW;
5240 }
5241 if (rte_flow_dynf_metadata_avail()) {
5242 /* We should support Flow metadata. */
5243 olx |= MLX5_TXOFF_CONFIG_METADATA;
5244 }
5245 if (config->mps == MLX5_MPW) {
5246 /*
5247 * The NIC supports Legacy Multi-Packet Write.
5248 * The MLX5_TXOFF_CONFIG_MPW controls the
5249 * descriptor building method in combination
5250 * with MLX5_TXOFF_CONFIG_EMPW.
5251 */
5252 if (!(olx & (MLX5_TXOFF_CONFIG_TSO |
5253 MLX5_TXOFF_CONFIG_SWP |
5254 MLX5_TXOFF_CONFIG_VLAN |
5255 MLX5_TXOFF_CONFIG_METADATA)))
5256 olx |= MLX5_TXOFF_CONFIG_EMPW |
5257 MLX5_TXOFF_CONFIG_MPW;
5258 }
5259 /*
5260 * Scan the routines table to find the minimal
5261 * routine satisfying the requested offloads.
5262 */
5263 m = RTE_DIM(txoff_func);
5264 for (i = 0; i < RTE_DIM(txoff_func); i++) {
5265 unsigned int tmp;
5266
5267 tmp = txoff_func[i].olx;
5268 if (tmp == olx) {
5269 /* Meets requested offloads exactly.*/
5270 m = i;
5271 break;
5272 }
5273 if ((tmp & olx) != olx) {
5274 /* Does not meet requested offloads at all. */
5275 continue;
5276 }
5277 if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW)
5278 /* Do not enable eMPW if not configured. */
5279 continue;
5280 if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
5281 /* Do not enable inlining if not configured. */
5282 continue;
5283 /*
5284 * Some routine meets the requirements.
5285 * Check whether it has the minimal amount
5286 * of not requested offloads.
5287 */
5288 tmp = __builtin_popcountl(tmp & ~olx);
5289 if (m >= RTE_DIM(txoff_func) || tmp < diff) {
5290 /* First or better match, save and continue. */
5291 m = i;
5292 diff = tmp;
5293 continue;
5294 }
5295 if (tmp == diff) {
5296 tmp = txoff_func[i].olx ^ txoff_func[m].olx;
5297 if (__builtin_ffsl(txoff_func[i].olx & tmp) <
5298 __builtin_ffsl(txoff_func[m].olx & tmp)) {
5299 /* Lighter not requested offload.
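* (among candidates adding the same number of extra offloads the one
* owning the lowest-indexed differing offload bit is preferred)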
*/ 5300 m = i; 5301 } 5302 } 5303 } 5304 if (m >= RTE_DIM(txoff_func)) { 5305 DRV_LOG(DEBUG, "port %u has no selected Tx function" 5306 " for requested offloads %04X", 5307 dev->data->port_id, olx); 5308 return NULL; 5309 } 5310 DRV_LOG(DEBUG, "port %u has selected Tx function" 5311 " supporting offloads %04X/%04X", 5312 dev->data->port_id, olx, txoff_func[m].olx); 5313 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI) 5314 DRV_LOG(DEBUG, "\tMULTI (multi segment)"); 5315 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO) 5316 DRV_LOG(DEBUG, "\tTSO (TCP send offload)"); 5317 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP) 5318 DRV_LOG(DEBUG, "\tSWP (software parser)"); 5319 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM) 5320 DRV_LOG(DEBUG, "\tCSUM (checksum offload)"); 5321 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE) 5322 DRV_LOG(DEBUG, "\tINLIN (inline data)"); 5323 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN) 5324 DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)"); 5325 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA) 5326 DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)"); 5327 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) { 5328 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW) 5329 DRV_LOG(DEBUG, "\tMPW (Legacy MPW)"); 5330 else 5331 DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)"); 5332 } 5333 return txoff_func[m].func; 5334 } 5335
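/*
 * Usage sketch (illustrative, the actual hook-up lives outside of this
 * file): the device start path is expected to install the selected
 * routine as the ethdev Tx burst callback, e.g.:
 *
 *     dev->tx_pkt_burst = mlx5_select_tx_function(dev);
 *
 * A NULL return means that no compiled routine matches the requested
 * offload set.
 */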