/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015-2019 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#include <rte_cycles.h>
#include <rte_flow.h>

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"

/* TX burst subroutines return codes. */
enum mlx5_txcmp_code {
	MLX5_TXCMP_CODE_EXIT = 0,
	MLX5_TXCMP_CODE_ERROR,
	MLX5_TXCMP_CODE_SINGLE,
	MLX5_TXCMP_CODE_MULTI,
	MLX5_TXCMP_CODE_TSO,
	MLX5_TXCMP_CODE_EMPW,
};

/*
 * These defines are used to configure the Tx burst routine option set
 * supported at compile time. Options that are not specified are optimized
 * out because the corresponding if conditions can be evaluated at compile
 * time. Offloads with a bigger runtime check overhead (more CPU cycles to
 * skip) should have the bigger index - this is needed to select the best
 * matching routine when there is no exact match and some offloads are not
 * actually requested.
 */
#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets. */
#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported. */
#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads. */
#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Checksums offloaded. */
#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported. */
#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported. */

/* The most common offloads groups.
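 * For example, a burst routine instantiated with only
 * (MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | MLX5_TXOFF_CONFIG_CSUM)
 * has the MLX5_TXOFF_CONFIG() checks for the remaining options evaluate to
 * compile-time zeros, so those code paths are optimized away entirely.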
*/ 67 #define MLX5_TXOFF_CONFIG_NONE 0 68 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \ 69 MLX5_TXOFF_CONFIG_TSO | \ 70 MLX5_TXOFF_CONFIG_SWP | \ 71 MLX5_TXOFF_CONFIG_CSUM | \ 72 MLX5_TXOFF_CONFIG_INLINE | \ 73 MLX5_TXOFF_CONFIG_VLAN | \ 74 MLX5_TXOFF_CONFIG_METADATA) 75 76 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask) 77 78 #define MLX5_TXOFF_DECL(func, olx) \ 79 static uint16_t mlx5_tx_burst_##func(void *txq, \ 80 struct rte_mbuf **pkts, \ 81 uint16_t pkts_n) \ 82 { \ 83 return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \ 84 pkts, pkts_n, (olx)); \ 85 } 86 87 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx}, 88 89 static __rte_always_inline uint32_t 90 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); 91 92 static __rte_always_inline int 93 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 94 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe); 95 96 static __rte_always_inline uint32_t 97 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); 98 99 static __rte_always_inline void 100 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 101 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); 102 103 static __rte_always_inline void 104 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 105 const unsigned int strd_n); 106 107 static int 108 mlx5_queue_state_modify(struct rte_eth_dev *dev, 109 struct mlx5_mp_arg_queue_state_modify *sm); 110 111 static inline void 112 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, 113 volatile struct mlx5_cqe *restrict cqe, 114 uint32_t phcsum); 115 116 static inline void 117 mlx5_lro_update_hdr(uint8_t *restrict padd, 118 volatile struct mlx5_cqe *restrict cqe, 119 uint32_t len); 120 121 uint32_t mlx5_ptype_table[] __rte_cache_aligned = { 122 [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ 123 }; 124 125 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned; 126 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned; 127 128 /** 129 * Build a table to translate Rx completion flags to packet type. 130 * 131 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 132 */ 133 void 134 mlx5_set_ptype_table(void) 135 { 136 unsigned int i; 137 uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table; 138 139 /* Last entry must not be overwritten, reserved for errored packet. 
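	 * The table is consumed by rxq_cq_to_pkt_type(), which builds the
	 * index from the CQE pkt_info and hdr_type_etc fields; e.g. index
	 * 0x06 resolves below to RTE_PTYPE_L2_ETHER |
	 * RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP.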
*/ 140 for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) 141 (*p)[i] = RTE_PTYPE_UNKNOWN; 142 /* 143 * The index to the array should have: 144 * bit[1:0] = l3_hdr_type 145 * bit[4:2] = l4_hdr_type 146 * bit[5] = ip_frag 147 * bit[6] = tunneled 148 * bit[7] = outer_l3_type 149 */ 150 /* L2 */ 151 (*p)[0x00] = RTE_PTYPE_L2_ETHER; 152 /* L3 */ 153 (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 154 RTE_PTYPE_L4_NONFRAG; 155 (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 156 RTE_PTYPE_L4_NONFRAG; 157 /* Fragmented */ 158 (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 159 RTE_PTYPE_L4_FRAG; 160 (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 161 RTE_PTYPE_L4_FRAG; 162 /* TCP */ 163 (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 164 RTE_PTYPE_L4_TCP; 165 (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 166 RTE_PTYPE_L4_TCP; 167 (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 168 RTE_PTYPE_L4_TCP; 169 (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 170 RTE_PTYPE_L4_TCP; 171 (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 172 RTE_PTYPE_L4_TCP; 173 (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 174 RTE_PTYPE_L4_TCP; 175 /* UDP */ 176 (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 177 RTE_PTYPE_L4_UDP; 178 (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 179 RTE_PTYPE_L4_UDP; 180 /* Repeat with outer_l3_type being set. Just in case. */ 181 (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 182 RTE_PTYPE_L4_NONFRAG; 183 (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 184 RTE_PTYPE_L4_NONFRAG; 185 (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 186 RTE_PTYPE_L4_FRAG; 187 (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 188 RTE_PTYPE_L4_FRAG; 189 (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 190 RTE_PTYPE_L4_TCP; 191 (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 192 RTE_PTYPE_L4_TCP; 193 (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 194 RTE_PTYPE_L4_TCP; 195 (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 196 RTE_PTYPE_L4_TCP; 197 (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 198 RTE_PTYPE_L4_TCP; 199 (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 200 RTE_PTYPE_L4_TCP; 201 (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 202 RTE_PTYPE_L4_UDP; 203 (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 204 RTE_PTYPE_L4_UDP; 205 /* Tunneled - L3 */ 206 (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; 207 (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 208 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 209 RTE_PTYPE_INNER_L4_NONFRAG; 210 (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 211 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 212 RTE_PTYPE_INNER_L4_NONFRAG; 213 (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; 214 (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 215 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 216 RTE_PTYPE_INNER_L4_NONFRAG; 217 (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 218 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 219 RTE_PTYPE_INNER_L4_NONFRAG; 220 /* Tunneled - Fragmented */ 221 (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 222 
RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 223 RTE_PTYPE_INNER_L4_FRAG; 224 (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 225 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 226 RTE_PTYPE_INNER_L4_FRAG; 227 (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 228 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 229 RTE_PTYPE_INNER_L4_FRAG; 230 (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 231 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 232 RTE_PTYPE_INNER_L4_FRAG; 233 /* Tunneled - TCP */ 234 (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 235 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 236 RTE_PTYPE_INNER_L4_TCP; 237 (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 238 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 239 RTE_PTYPE_INNER_L4_TCP; 240 (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 241 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 242 RTE_PTYPE_INNER_L4_TCP; 243 (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 244 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 245 RTE_PTYPE_INNER_L4_TCP; 246 (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 247 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 248 RTE_PTYPE_INNER_L4_TCP; 249 (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 250 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 251 RTE_PTYPE_INNER_L4_TCP; 252 (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 253 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 254 RTE_PTYPE_INNER_L4_TCP; 255 (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 256 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 257 RTE_PTYPE_INNER_L4_TCP; 258 (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 259 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 260 RTE_PTYPE_INNER_L4_TCP; 261 (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 262 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 263 RTE_PTYPE_INNER_L4_TCP; 264 (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 265 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 266 RTE_PTYPE_INNER_L4_TCP; 267 (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 268 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 269 RTE_PTYPE_INNER_L4_TCP; 270 /* Tunneled - UDP */ 271 (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 272 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 273 RTE_PTYPE_INNER_L4_UDP; 274 (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 275 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 276 RTE_PTYPE_INNER_L4_UDP; 277 (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 278 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 279 RTE_PTYPE_INNER_L4_UDP; 280 (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 281 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 282 RTE_PTYPE_INNER_L4_UDP; 283 } 284 285 /** 286 * Build a table to translate packet to checksum type of Verbs. 287 */ 288 void 289 mlx5_set_cksum_table(void) 290 { 291 unsigned int i; 292 uint8_t v; 293 294 /* 295 * The index should have: 296 * bit[0] = PKT_TX_TCP_SEG 297 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM 298 * bit[4] = PKT_TX_IP_CKSUM 299 * bit[8] = PKT_TX_OUTER_IP_CKSUM 300 * bit[9] = tunnel 301 */ 302 for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) { 303 v = 0; 304 if (i & (1 << 9)) { 305 /* Tunneled packet. */ 306 if (i & (1 << 8)) /* Outer IP. */ 307 v |= MLX5_ETH_WQE_L3_CSUM; 308 if (i & (1 << 4)) /* Inner IP. */ 309 v |= MLX5_ETH_WQE_L3_INNER_CSUM; 310 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 311 v |= MLX5_ETH_WQE_L4_INNER_CSUM; 312 } else { 313 /* No tunnel. 
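			 * For example, PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM
			 * sets index bit 4 and a non-zero bit[3:2], so the
			 * entry becomes MLX5_ETH_WQE_L3_CSUM |
			 * MLX5_ETH_WQE_L4_CSUM.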
*/ 314 if (i & (1 << 4)) /* IP. */ 315 v |= MLX5_ETH_WQE_L3_CSUM; 316 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 317 v |= MLX5_ETH_WQE_L4_CSUM; 318 } 319 mlx5_cksum_table[i] = v; 320 } 321 } 322 323 /** 324 * Build a table to translate packet type of mbuf to SWP type of Verbs. 325 */ 326 void 327 mlx5_set_swp_types_table(void) 328 { 329 unsigned int i; 330 uint8_t v; 331 332 /* 333 * The index should have: 334 * bit[0:1] = PKT_TX_L4_MASK 335 * bit[4] = PKT_TX_IPV6 336 * bit[8] = PKT_TX_OUTER_IPV6 337 * bit[9] = PKT_TX_OUTER_UDP 338 */ 339 for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) { 340 v = 0; 341 if (i & (1 << 8)) 342 v |= MLX5_ETH_WQE_L3_OUTER_IPV6; 343 if (i & (1 << 9)) 344 v |= MLX5_ETH_WQE_L4_OUTER_UDP; 345 if (i & (1 << 4)) 346 v |= MLX5_ETH_WQE_L3_INNER_IPV6; 347 if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52)) 348 v |= MLX5_ETH_WQE_L4_INNER_UDP; 349 mlx5_swp_types_table[i] = v; 350 } 351 } 352 353 /** 354 * Set Software Parser flags and offsets in Ethernet Segment of WQE. 355 * Flags must be preliminary initialized to zero. 356 * 357 * @param loc 358 * Pointer to burst routine local context. 359 * @param swp_flags 360 * Pointer to store Software Parser flags 361 * @param olx 362 * Configured Tx offloads mask. It is fully defined at 363 * compile time and may be used for optimization. 364 * 365 * @return 366 * Software Parser offsets packed in dword. 367 * Software Parser flags are set by pointer. 368 */ 369 static __rte_always_inline uint32_t 370 txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc, 371 uint8_t *swp_flags, 372 unsigned int olx) 373 { 374 uint64_t ol, tunnel; 375 unsigned int idx, off; 376 uint32_t set; 377 378 if (!MLX5_TXOFF_CONFIG(SWP)) 379 return 0; 380 ol = loc->mbuf->ol_flags; 381 tunnel = ol & PKT_TX_TUNNEL_MASK; 382 /* 383 * Check whether Software Parser is required. 384 * Only customized tunnels may ask for. 385 */ 386 if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP)) 387 return 0; 388 /* 389 * The index should have: 390 * bit[0:1] = PKT_TX_L4_MASK 391 * bit[4] = PKT_TX_IPV6 392 * bit[8] = PKT_TX_OUTER_IPV6 393 * bit[9] = PKT_TX_OUTER_UDP 394 */ 395 idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52; 396 idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0; 397 *swp_flags = mlx5_swp_types_table[idx]; 398 /* 399 * Set offsets for SW parser. Since ConnectX-5, SW parser just 400 * complements HW parser. SW parser starts to engage only if HW parser 401 * can't reach a header. For the older devices, HW parser will not kick 402 * in if any of SWP offsets is set. Therefore, all of the L3 offsets 403 * should be set regardless of HW offload. 404 */ 405 off = loc->mbuf->outer_l2_len; 406 if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT) 407 off += sizeof(struct rte_vlan_hdr); 408 set = (off >> 1) << 8; /* Outer L3 offset. */ 409 off += loc->mbuf->outer_l3_len; 410 if (tunnel == PKT_TX_TUNNEL_UDP) 411 set |= off >> 1; /* Outer L4 offset. */ 412 if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */ 413 const uint64_t csum = ol & PKT_TX_L4_MASK; 414 off += loc->mbuf->l2_len; 415 set |= (off >> 1) << 24; /* Inner L3 offset. */ 416 if (csum == PKT_TX_TCP_CKSUM || 417 csum == PKT_TX_UDP_CKSUM || 418 (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) { 419 off += loc->mbuf->l3_len; 420 set |= (off >> 1) << 16; /* Inner L4 offset. */ 421 } 422 } 423 set = rte_cpu_to_le_32(set); 424 return set; 425 } 426 427 /** 428 * Convert the Checksum offloads to Verbs. 429 * 430 * @param buf 431 * Pointer to the mbuf. 
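 *
 * The table index is built as (ol_flags & ol_flags_mask) >> 50, which lines
 * the PKT_TX_* bits up with the layout documented in mlx5_set_cksum_table(),
 * with bit 9 additionally set for tunneled packets.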
432 * 433 * @return 434 * Converted checksum flags. 435 */ 436 static __rte_always_inline uint8_t 437 txq_ol_cksum_to_cs(struct rte_mbuf *buf) 438 { 439 uint32_t idx; 440 uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK); 441 const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK | 442 PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM; 443 444 /* 445 * The index should have: 446 * bit[0] = PKT_TX_TCP_SEG 447 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM 448 * bit[4] = PKT_TX_IP_CKSUM 449 * bit[8] = PKT_TX_OUTER_IP_CKSUM 450 * bit[9] = tunnel 451 */ 452 idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9); 453 return mlx5_cksum_table[idx]; 454 } 455 456 /** 457 * Internal function to compute the number of used descriptors in an RX queue 458 * 459 * @param rxq 460 * The Rx queue. 461 * 462 * @return 463 * The number of used rx descriptor. 464 */ 465 static uint32_t 466 rx_queue_count(struct mlx5_rxq_data *rxq) 467 { 468 struct rxq_zip *zip = &rxq->zip; 469 volatile struct mlx5_cqe *cqe; 470 const unsigned int cqe_n = (1 << rxq->cqe_n); 471 const unsigned int cqe_cnt = cqe_n - 1; 472 unsigned int cq_ci; 473 unsigned int used; 474 475 /* if we are processing a compressed cqe */ 476 if (zip->ai) { 477 used = zip->cqe_cnt - zip->ca; 478 cq_ci = zip->cq_ci; 479 } else { 480 used = 0; 481 cq_ci = rxq->cq_ci; 482 } 483 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 484 while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) { 485 int8_t op_own; 486 unsigned int n; 487 488 op_own = cqe->op_own; 489 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) 490 n = rte_be_to_cpu_32(cqe->byte_cnt); 491 else 492 n = 1; 493 cq_ci += n; 494 used += n; 495 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 496 } 497 used = RTE_MIN(used, (1U << rxq->elts_n) - 1); 498 return used; 499 } 500 501 /** 502 * DPDK callback to check the status of a rx descriptor. 503 * 504 * @param rx_queue 505 * The Rx queue. 506 * @param[in] offset 507 * The index of the descriptor in the ring. 508 * 509 * @return 510 * The status of the tx descriptor. 511 */ 512 int 513 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) 514 { 515 struct mlx5_rxq_data *rxq = rx_queue; 516 struct mlx5_rxq_ctrl *rxq_ctrl = 517 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 518 struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv); 519 520 if (dev->rx_pkt_burst != mlx5_rx_burst) { 521 rte_errno = ENOTSUP; 522 return -rte_errno; 523 } 524 if (offset >= (1 << rxq->elts_n)) { 525 rte_errno = EINVAL; 526 return -rte_errno; 527 } 528 if (offset < rx_queue_count(rxq)) 529 return RTE_ETH_RX_DESC_DONE; 530 return RTE_ETH_RX_DESC_AVAIL; 531 } 532 533 /** 534 * DPDK callback to get the number of used descriptors in a RX queue 535 * 536 * @param dev 537 * Pointer to the device structure. 538 * 539 * @param rx_queue_id 540 * The Rx queue. 541 * 542 * @return 543 * The number of used rx descriptor. 544 * -EINVAL if the queue is invalid 545 */ 546 uint32_t 547 mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id) 548 { 549 struct mlx5_priv *priv = dev->data->dev_private; 550 struct mlx5_rxq_data *rxq; 551 552 if (dev->rx_pkt_burst != mlx5_rx_burst) { 553 rte_errno = ENOTSUP; 554 return -rte_errno; 555 } 556 rxq = (*priv->rxqs)[rx_queue_id]; 557 if (!rxq) { 558 rte_errno = EINVAL; 559 return -rte_errno; 560 } 561 return rx_queue_count(rxq); 562 } 563 564 #define MLX5_SYSTEM_LOG_DIR "/var/log" 565 /** 566 * Dump debug information to log file. 567 * 568 * @param fname 569 * The file name. 
570 * @param hex_title 571 * If not NULL this string is printed as a header to the output 572 * and the output will be in hexadecimal view. 573 * @param buf 574 * This is the buffer address to print out. 575 * @param len 576 * The number of bytes to dump out. 577 */ 578 void 579 mlx5_dump_debug_information(const char *fname, const char *hex_title, 580 const void *buf, unsigned int hex_len) 581 { 582 FILE *fd; 583 584 MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname); 585 fd = fopen(path, "a+"); 586 if (!fd) { 587 DRV_LOG(WARNING, "cannot open %s for debug dump", path); 588 MKSTR(path2, "./%s", fname); 589 fd = fopen(path2, "a+"); 590 if (!fd) { 591 DRV_LOG(ERR, "cannot open %s for debug dump", path2); 592 return; 593 } 594 DRV_LOG(INFO, "New debug dump in file %s", path2); 595 } else { 596 DRV_LOG(INFO, "New debug dump in file %s", path); 597 } 598 if (hex_title) 599 rte_hexdump(fd, hex_title, buf, hex_len); 600 else 601 fprintf(fd, "%s", (const char *)buf); 602 fprintf(fd, "\n\n\n"); 603 fclose(fd); 604 } 605 606 /** 607 * Move QP from error state to running state and initialize indexes. 608 * 609 * @param txq_ctrl 610 * Pointer to TX queue control structure. 611 * 612 * @return 613 * 0 on success, else -1. 614 */ 615 static int 616 tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl) 617 { 618 struct mlx5_mp_arg_queue_state_modify sm = { 619 .is_wq = 0, 620 .queue_id = txq_ctrl->txq.idx, 621 }; 622 623 if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm)) 624 return -1; 625 txq_ctrl->txq.wqe_ci = 0; 626 txq_ctrl->txq.wqe_pi = 0; 627 txq_ctrl->txq.elts_comp = 0; 628 return 0; 629 } 630 631 /* Return 1 if the error CQE is signed otherwise, sign it and return 0. */ 632 static int 633 check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe) 634 { 635 static const uint8_t magic[] = "seen"; 636 int ret = 1; 637 unsigned int i; 638 639 for (i = 0; i < sizeof(magic); ++i) 640 if (!ret || err_cqe->rsvd1[i] != magic[i]) { 641 ret = 0; 642 err_cqe->rsvd1[i] = magic[i]; 643 } 644 return ret; 645 } 646 647 /** 648 * Handle error CQE. 649 * 650 * @param txq 651 * Pointer to TX queue structure. 652 * @param error_cqe 653 * Pointer to the error CQE. 654 * 655 * @return 656 * Negative value if queue recovery failed, 657 * the last Tx buffer element to free otherwise. 
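 *   The caller is expected to free the elts ring up to the returned index
 *   (see mlx5_tx_comp_flush()); a negative value means recovery failed and
 *   the same errored WQE will be handled again later.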
658 */ 659 int 660 mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, 661 volatile struct mlx5_err_cqe *err_cqe) 662 { 663 if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { 664 const uint16_t wqe_m = ((1 << txq->wqe_n) - 1); 665 struct mlx5_txq_ctrl *txq_ctrl = 666 container_of(txq, struct mlx5_txq_ctrl, txq); 667 uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter); 668 int seen = check_err_cqe_seen(err_cqe); 669 670 if (!seen && txq_ctrl->dump_file_n < 671 txq_ctrl->priv->config.max_dump_files_num) { 672 MKSTR(err_str, "Unexpected CQE error syndrome " 673 "0x%02x CQN = %u SQN = %u wqe_counter = %u " 674 "wq_ci = %u cq_ci = %u", err_cqe->syndrome, 675 txq->cqe_s, txq->qp_num_8s >> 8, 676 rte_be_to_cpu_16(err_cqe->wqe_counter), 677 txq->wqe_ci, txq->cq_ci); 678 MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u", 679 PORT_ID(txq_ctrl->priv), txq->idx, 680 txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc()); 681 mlx5_dump_debug_information(name, NULL, err_str, 0); 682 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 683 (const void *)((uintptr_t) 684 txq->cqes), 685 sizeof(*err_cqe) * 686 (1 << txq->cqe_n)); 687 mlx5_dump_debug_information(name, "MLX5 Error SQ:", 688 (const void *)((uintptr_t) 689 txq->wqes), 690 MLX5_WQE_SIZE * 691 (1 << txq->wqe_n)); 692 txq_ctrl->dump_file_n++; 693 } 694 if (!seen) 695 /* 696 * Count errors in WQEs units. 697 * Later it can be improved to count error packets, 698 * for example, by SQ parsing to find how much packets 699 * should be counted for each WQE. 700 */ 701 txq->stats.oerrors += ((txq->wqe_ci & wqe_m) - 702 new_wqe_pi) & wqe_m; 703 if (tx_recover_qp(txq_ctrl) == 0) { 704 txq->cq_ci++; 705 /* Release all the remaining buffers. */ 706 return txq->elts_head; 707 } 708 /* Recovering failed - try again later on the same WQE. */ 709 return -1; 710 } else { 711 txq->cq_ci++; 712 } 713 /* Do not release buffers. */ 714 return txq->elts_tail; 715 } 716 717 /** 718 * Translate RX completion flags to packet type. 719 * 720 * @param[in] rxq 721 * Pointer to RX queue structure. 722 * @param[in] cqe 723 * Pointer to CQE. 724 * 725 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 726 * 727 * @return 728 * Packet type for struct rte_mbuf. 729 */ 730 static inline uint32_t 731 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) 732 { 733 uint8_t idx; 734 uint8_t pinfo = cqe->pkt_info; 735 uint16_t ptype = cqe->hdr_type_etc; 736 737 /* 738 * The index to the array should have: 739 * bit[1:0] = l3_hdr_type 740 * bit[4:2] = l4_hdr_type 741 * bit[5] = ip_frag 742 * bit[6] = tunneled 743 * bit[7] = outer_l3_type 744 */ 745 idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); 746 return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); 747 } 748 749 /** 750 * Initialize Rx WQ and indexes. 751 * 752 * @param[in] rxq 753 * Pointer to RX queue structure. 
754 */ 755 void 756 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) 757 { 758 const unsigned int wqe_n = 1 << rxq->elts_n; 759 unsigned int i; 760 761 for (i = 0; (i != wqe_n); ++i) { 762 volatile struct mlx5_wqe_data_seg *scat; 763 uintptr_t addr; 764 uint32_t byte_count; 765 766 if (mlx5_rxq_mprq_enabled(rxq)) { 767 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i]; 768 769 scat = &((volatile struct mlx5_wqe_mprq *) 770 rxq->wqes)[i].dseg; 771 addr = (uintptr_t)mlx5_mprq_buf_addr(buf, 772 1 << rxq->strd_num_n); 773 byte_count = (1 << rxq->strd_sz_n) * 774 (1 << rxq->strd_num_n); 775 } else { 776 struct rte_mbuf *buf = (*rxq->elts)[i]; 777 778 scat = &((volatile struct mlx5_wqe_data_seg *) 779 rxq->wqes)[i]; 780 addr = rte_pktmbuf_mtod(buf, uintptr_t); 781 byte_count = DATA_LEN(buf); 782 } 783 /* scat->addr must be able to store a pointer. */ 784 assert(sizeof(scat->addr) >= sizeof(uintptr_t)); 785 *scat = (struct mlx5_wqe_data_seg){ 786 .addr = rte_cpu_to_be_64(addr), 787 .byte_count = rte_cpu_to_be_32(byte_count), 788 .lkey = mlx5_rx_addr2mr(rxq, addr), 789 }; 790 } 791 rxq->consumed_strd = 0; 792 rxq->decompressed = 0; 793 rxq->rq_pi = 0; 794 rxq->zip = (struct rxq_zip){ 795 .ai = 0, 796 }; 797 /* Update doorbell counter. */ 798 rxq->rq_ci = wqe_n >> rxq->sges_n; 799 rte_cio_wmb(); 800 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 801 } 802 803 /** 804 * Modify a Verbs/DevX queue state. 805 * This must be called from the primary process. 806 * 807 * @param dev 808 * Pointer to Ethernet device. 809 * @param sm 810 * State modify request parameters. 811 * 812 * @return 813 * 0 in case of success else non-zero value and rte_errno is set. 814 */ 815 int 816 mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, 817 const struct mlx5_mp_arg_queue_state_modify *sm) 818 { 819 int ret; 820 struct mlx5_priv *priv = dev->data->dev_private; 821 822 if (sm->is_wq) { 823 struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id]; 824 struct mlx5_rxq_ctrl *rxq_ctrl = 825 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 826 827 if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { 828 struct ibv_wq_attr mod = { 829 .attr_mask = IBV_WQ_ATTR_STATE, 830 .wq_state = sm->state, 831 }; 832 833 ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); 834 } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. 
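		 * For DevX RQ objects the requested Verbs WQ states
		 * (IBV_WQS_RESET/RDY/ERR) are translated into the matching
		 * MLX5_RQC_STATE_* transition passed to
		 * mlx5_devx_cmd_modify_rq() below.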
*/ 835 struct mlx5_devx_modify_rq_attr rq_attr; 836 837 memset(&rq_attr, 0, sizeof(rq_attr)); 838 if (sm->state == IBV_WQS_RESET) { 839 rq_attr.rq_state = MLX5_RQC_STATE_ERR; 840 rq_attr.state = MLX5_RQC_STATE_RST; 841 } else if (sm->state == IBV_WQS_RDY) { 842 rq_attr.rq_state = MLX5_RQC_STATE_RST; 843 rq_attr.state = MLX5_RQC_STATE_RDY; 844 } else if (sm->state == IBV_WQS_ERR) { 845 rq_attr.rq_state = MLX5_RQC_STATE_RDY; 846 rq_attr.state = MLX5_RQC_STATE_ERR; 847 } 848 ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, 849 &rq_attr); 850 } 851 if (ret) { 852 DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s", 853 sm->state, strerror(errno)); 854 rte_errno = errno; 855 return ret; 856 } 857 } else { 858 struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id]; 859 struct mlx5_txq_ctrl *txq_ctrl = 860 container_of(txq, struct mlx5_txq_ctrl, txq); 861 struct ibv_qp_attr mod = { 862 .qp_state = IBV_QPS_RESET, 863 .port_num = (uint8_t)priv->ibv_port, 864 }; 865 struct ibv_qp *qp = txq_ctrl->obj->qp; 866 867 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 868 if (ret) { 869 DRV_LOG(ERR, "Cannot change the Tx QP state to RESET " 870 "%s", strerror(errno)); 871 rte_errno = errno; 872 return ret; 873 } 874 mod.qp_state = IBV_QPS_INIT; 875 ret = mlx5_glue->modify_qp(qp, &mod, 876 (IBV_QP_STATE | IBV_QP_PORT)); 877 if (ret) { 878 DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s", 879 strerror(errno)); 880 rte_errno = errno; 881 return ret; 882 } 883 mod.qp_state = IBV_QPS_RTR; 884 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 885 if (ret) { 886 DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s", 887 strerror(errno)); 888 rte_errno = errno; 889 return ret; 890 } 891 mod.qp_state = IBV_QPS_RTS; 892 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 893 if (ret) { 894 DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s", 895 strerror(errno)); 896 rte_errno = errno; 897 return ret; 898 } 899 } 900 return 0; 901 } 902 903 /** 904 * Modify a Verbs queue state. 905 * 906 * @param dev 907 * Pointer to Ethernet device. 908 * @param sm 909 * State modify request parameters. 910 * 911 * @return 912 * 0 in case of success else non-zero value. 913 */ 914 static int 915 mlx5_queue_state_modify(struct rte_eth_dev *dev, 916 struct mlx5_mp_arg_queue_state_modify *sm) 917 { 918 int ret = 0; 919 920 switch (rte_eal_process_type()) { 921 case RTE_PROC_PRIMARY: 922 ret = mlx5_queue_state_modify_primary(dev, sm); 923 break; 924 case RTE_PROC_SECONDARY: 925 ret = mlx5_mp_req_queue_state_modify(dev, sm); 926 break; 927 default: 928 break; 929 } 930 return ret; 931 } 932 933 /** 934 * Handle a Rx error. 935 * The function inserts the RQ state to reset when the first error CQE is 936 * shown, then drains the CQ by the caller function loop. When the CQ is empty, 937 * it moves the RQ state to ready and initializes the RQ. 938 * Next CQE identification and error counting are in the caller responsibility. 939 * 940 * @param[in] rxq 941 * Pointer to RX queue structure. 942 * @param[in] vec 943 * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ. 944 * 0 when called from non-vectorized Rx burst. 945 * 946 * @return 947 * -1 in case of recovery error, otherwise the CQE status. 
948 */ 949 int 950 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) 951 { 952 const uint16_t cqe_n = 1 << rxq->cqe_n; 953 const uint16_t cqe_mask = cqe_n - 1; 954 const unsigned int wqe_n = 1 << rxq->elts_n; 955 struct mlx5_rxq_ctrl *rxq_ctrl = 956 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 957 union { 958 volatile struct mlx5_cqe *cqe; 959 volatile struct mlx5_err_cqe *err_cqe; 960 } u = { 961 .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask], 962 }; 963 struct mlx5_mp_arg_queue_state_modify sm; 964 int ret; 965 966 switch (rxq->err_state) { 967 case MLX5_RXQ_ERR_STATE_NO_ERROR: 968 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET; 969 /* Fall-through */ 970 case MLX5_RXQ_ERR_STATE_NEED_RESET: 971 sm.is_wq = 1; 972 sm.queue_id = rxq->idx; 973 sm.state = IBV_WQS_RESET; 974 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm)) 975 return -1; 976 if (rxq_ctrl->dump_file_n < 977 rxq_ctrl->priv->config.max_dump_files_num) { 978 MKSTR(err_str, "Unexpected CQE error syndrome " 979 "0x%02x CQN = %u RQN = %u wqe_counter = %u" 980 " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome, 981 rxq->cqn, rxq_ctrl->wqn, 982 rte_be_to_cpu_16(u.err_cqe->wqe_counter), 983 rxq->rq_ci << rxq->sges_n, rxq->cq_ci); 984 MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u", 985 rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc()); 986 mlx5_dump_debug_information(name, NULL, err_str, 0); 987 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 988 (const void *)((uintptr_t) 989 rxq->cqes), 990 sizeof(*u.cqe) * cqe_n); 991 mlx5_dump_debug_information(name, "MLX5 Error RQ:", 992 (const void *)((uintptr_t) 993 rxq->wqes), 994 16 * wqe_n); 995 rxq_ctrl->dump_file_n++; 996 } 997 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY; 998 /* Fall-through */ 999 case MLX5_RXQ_ERR_STATE_NEED_READY: 1000 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); 1001 if (ret == MLX5_CQE_STATUS_HW_OWN) { 1002 rte_cio_wmb(); 1003 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1004 rte_cio_wmb(); 1005 /* 1006 * The RQ consumer index must be zeroed while moving 1007 * from RESET state to RDY state. 1008 */ 1009 *rxq->rq_db = rte_cpu_to_be_32(0); 1010 rte_cio_wmb(); 1011 sm.is_wq = 1; 1012 sm.queue_id = rxq->idx; 1013 sm.state = IBV_WQS_RDY; 1014 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), 1015 &sm)) 1016 return -1; 1017 if (vec) { 1018 const uint16_t q_mask = wqe_n - 1; 1019 uint16_t elt_idx; 1020 struct rte_mbuf **elt; 1021 int i; 1022 unsigned int n = wqe_n - (rxq->rq_ci - 1023 rxq->rq_pi); 1024 1025 for (i = 0; i < (int)n; ++i) { 1026 elt_idx = (rxq->rq_ci + i) & q_mask; 1027 elt = &(*rxq->elts)[elt_idx]; 1028 *elt = rte_mbuf_raw_alloc(rxq->mp); 1029 if (!*elt) { 1030 for (i--; i >= 0; --i) { 1031 elt_idx = (rxq->rq_ci + 1032 i) & q_mask; 1033 elt = &(*rxq->elts) 1034 [elt_idx]; 1035 rte_pktmbuf_free_seg 1036 (*elt); 1037 } 1038 return -1; 1039 } 1040 } 1041 for (i = 0; i < (int)wqe_n; ++i) { 1042 elt = &(*rxq->elts)[i]; 1043 DATA_LEN(*elt) = 1044 (uint16_t)((*elt)->buf_len - 1045 rte_pktmbuf_headroom(*elt)); 1046 } 1047 /* Padding with a fake mbuf for vec Rx. */ 1048 for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) 1049 (*rxq->elts)[wqe_n + i] = 1050 &rxq->fake_mbuf; 1051 } 1052 mlx5_rxq_initialize(rxq); 1053 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 1054 } 1055 return ret; 1056 default: 1057 return -1; 1058 } 1059 } 1060 1061 /** 1062 * Get size of the next packet for a given CQE. For compressed CQEs, the 1063 * consumer index is updated only once all packets of the current one have 1064 * been processed. 
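 * Compressed completions carry the packet sizes in an array of mini-CQEs;
 * rxq->zip keeps the walk state across calls: ai is the position inside the
 * current mini-CQE array, ca/na delimit the CQEs to invalidate once an array
 * is consumed, and cq_ci is the consumer index to restore when the whole
 * compressed session has been processed.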
1065 * 1066 * @param rxq 1067 * Pointer to RX queue. 1068 * @param cqe 1069 * CQE to process. 1070 * @param[out] mcqe 1071 * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not 1072 * written. 1073 * 1074 * @return 1075 * 0 in case of empty CQE, otherwise the packet size in bytes. 1076 */ 1077 static inline int 1078 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 1079 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe) 1080 { 1081 struct rxq_zip *zip = &rxq->zip; 1082 uint16_t cqe_n = cqe_cnt + 1; 1083 int len; 1084 uint16_t idx, end; 1085 1086 do { 1087 len = 0; 1088 /* Process compressed data in the CQE and mini arrays. */ 1089 if (zip->ai) { 1090 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1091 (volatile struct mlx5_mini_cqe8 (*)[8]) 1092 (uintptr_t)(&(*rxq->cqes)[zip->ca & 1093 cqe_cnt].pkt_info); 1094 1095 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); 1096 *mcqe = &(*mc)[zip->ai & 7]; 1097 if ((++zip->ai & 7) == 0) { 1098 /* Invalidate consumed CQEs */ 1099 idx = zip->ca; 1100 end = zip->na; 1101 while (idx != end) { 1102 (*rxq->cqes)[idx & cqe_cnt].op_own = 1103 MLX5_CQE_INVALIDATE; 1104 ++idx; 1105 } 1106 /* 1107 * Increment consumer index to skip the number 1108 * of CQEs consumed. Hardware leaves holes in 1109 * the CQ ring for software use. 1110 */ 1111 zip->ca = zip->na; 1112 zip->na += 8; 1113 } 1114 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 1115 /* Invalidate the rest */ 1116 idx = zip->ca; 1117 end = zip->cq_ci; 1118 1119 while (idx != end) { 1120 (*rxq->cqes)[idx & cqe_cnt].op_own = 1121 MLX5_CQE_INVALIDATE; 1122 ++idx; 1123 } 1124 rxq->cq_ci = zip->cq_ci; 1125 zip->ai = 0; 1126 } 1127 /* 1128 * No compressed data, get next CQE and verify if it is 1129 * compressed. 1130 */ 1131 } else { 1132 int ret; 1133 int8_t op_own; 1134 1135 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 1136 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 1137 if (unlikely(ret == MLX5_CQE_STATUS_ERR || 1138 rxq->err_state)) { 1139 ret = mlx5_rx_err_handle(rxq, 0); 1140 if (ret == MLX5_CQE_STATUS_HW_OWN || 1141 ret == -1) 1142 return 0; 1143 } else { 1144 return 0; 1145 } 1146 } 1147 ++rxq->cq_ci; 1148 op_own = cqe->op_own; 1149 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 1150 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1151 (volatile struct mlx5_mini_cqe8 (*)[8]) 1152 (uintptr_t)(&(*rxq->cqes) 1153 [rxq->cq_ci & 1154 cqe_cnt].pkt_info); 1155 1156 /* Fix endianness. */ 1157 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); 1158 /* 1159 * Current mini array position is the one 1160 * returned by check_cqe64(). 1161 * 1162 * If completion comprises several mini arrays, 1163 * as a special case the second one is located 1164 * 7 CQEs after the initial CQE instead of 8 1165 * for subsequent ones. 1166 */ 1167 zip->ca = rxq->cq_ci; 1168 zip->na = zip->ca + 7; 1169 /* Compute the next non compressed CQE. */ 1170 --rxq->cq_ci; 1171 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 1172 /* Get packet size to return. 
*/ 1173 len = rte_be_to_cpu_32((*mc)[0].byte_cnt); 1174 *mcqe = &(*mc)[0]; 1175 zip->ai = 1; 1176 /* Prefetch all to be invalidated */ 1177 idx = zip->ca; 1178 end = zip->cq_ci; 1179 while (idx != end) { 1180 rte_prefetch0(&(*rxq->cqes)[(idx) & 1181 cqe_cnt]); 1182 ++idx; 1183 } 1184 } else { 1185 len = rte_be_to_cpu_32(cqe->byte_cnt); 1186 } 1187 } 1188 if (unlikely(rxq->err_state)) { 1189 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1190 ++rxq->stats.idropped; 1191 } else { 1192 return len; 1193 } 1194 } while (1); 1195 } 1196 1197 /** 1198 * Translate RX completion flags to offload flags. 1199 * 1200 * @param[in] cqe 1201 * Pointer to CQE. 1202 * 1203 * @return 1204 * Offload flags (ol_flags) for struct rte_mbuf. 1205 */ 1206 static inline uint32_t 1207 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) 1208 { 1209 uint32_t ol_flags = 0; 1210 uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); 1211 1212 ol_flags = 1213 TRANSPOSE(flags, 1214 MLX5_CQE_RX_L3_HDR_VALID, 1215 PKT_RX_IP_CKSUM_GOOD) | 1216 TRANSPOSE(flags, 1217 MLX5_CQE_RX_L4_HDR_VALID, 1218 PKT_RX_L4_CKSUM_GOOD); 1219 return ol_flags; 1220 } 1221 1222 /** 1223 * Fill in mbuf fields from RX completion flags. 1224 * Note that pkt->ol_flags should be initialized outside of this function. 1225 * 1226 * @param rxq 1227 * Pointer to RX queue. 1228 * @param pkt 1229 * mbuf to fill. 1230 * @param cqe 1231 * CQE to process. 1232 * @param rss_hash_res 1233 * Packet RSS Hash result. 1234 */ 1235 static inline void 1236 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 1237 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) 1238 { 1239 /* Update packet information. */ 1240 pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); 1241 if (rss_hash_res && rxq->rss_hash) { 1242 pkt->hash.rss = rss_hash_res; 1243 pkt->ol_flags |= PKT_RX_RSS_HASH; 1244 } 1245 if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { 1246 pkt->ol_flags |= PKT_RX_FDIR; 1247 if (cqe->sop_drop_qpn != 1248 rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { 1249 uint32_t mark = cqe->sop_drop_qpn; 1250 1251 pkt->ol_flags |= PKT_RX_FDIR_ID; 1252 pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); 1253 } 1254 } 1255 if (rte_flow_dynf_metadata_avail() && cqe->flow_table_metadata) { 1256 pkt->ol_flags |= PKT_RX_DYNF_METADATA; 1257 *RTE_FLOW_DYNF_METADATA(pkt) = cqe->flow_table_metadata; 1258 } 1259 if (rxq->csum) 1260 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); 1261 if (rxq->vlan_strip && 1262 (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { 1263 pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; 1264 pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); 1265 } 1266 if (rxq->hw_timestamp) { 1267 pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); 1268 pkt->ol_flags |= PKT_RX_TIMESTAMP; 1269 } 1270 } 1271 1272 /** 1273 * DPDK callback for RX. 1274 * 1275 * @param dpdk_rxq 1276 * Generic pointer to RX queue structure. 1277 * @param[out] pkts 1278 * Array to store received packets. 1279 * @param pkts_n 1280 * Maximum number of packets in array. 1281 * 1282 * @return 1283 * Number of packets successfully received (<= pkts_n). 
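 *
 * A packet may span several descriptors when rxq->sges_n is non-zero; the
 * receive loop then links the segments into one mbuf chain and re-aligns the
 * stride counter rq_ci to the next (1 << sges_n) boundary per packet.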
1284 */ 1285 uint16_t 1286 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1287 { 1288 struct mlx5_rxq_data *rxq = dpdk_rxq; 1289 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 1290 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 1291 const unsigned int sges_n = rxq->sges_n; 1292 struct rte_mbuf *pkt = NULL; 1293 struct rte_mbuf *seg = NULL; 1294 volatile struct mlx5_cqe *cqe = 1295 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1296 unsigned int i = 0; 1297 unsigned int rq_ci = rxq->rq_ci << sges_n; 1298 int len = 0; /* keep its value across iterations. */ 1299 1300 while (pkts_n) { 1301 unsigned int idx = rq_ci & wqe_cnt; 1302 volatile struct mlx5_wqe_data_seg *wqe = 1303 &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; 1304 struct rte_mbuf *rep = (*rxq->elts)[idx]; 1305 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1306 uint32_t rss_hash_res; 1307 1308 if (pkt) 1309 NEXT(seg) = rep; 1310 seg = rep; 1311 rte_prefetch0(seg); 1312 rte_prefetch0(cqe); 1313 rte_prefetch0(wqe); 1314 rep = rte_mbuf_raw_alloc(rxq->mp); 1315 if (unlikely(rep == NULL)) { 1316 ++rxq->stats.rx_nombuf; 1317 if (!pkt) { 1318 /* 1319 * no buffers before we even started, 1320 * bail out silently. 1321 */ 1322 break; 1323 } 1324 while (pkt != seg) { 1325 assert(pkt != (*rxq->elts)[idx]); 1326 rep = NEXT(pkt); 1327 NEXT(pkt) = NULL; 1328 NB_SEGS(pkt) = 1; 1329 rte_mbuf_raw_free(pkt); 1330 pkt = rep; 1331 } 1332 break; 1333 } 1334 if (!pkt) { 1335 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1336 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe); 1337 if (!len) { 1338 rte_mbuf_raw_free(rep); 1339 break; 1340 } 1341 pkt = seg; 1342 assert(len >= (rxq->crc_present << 2)); 1343 pkt->ol_flags = 0; 1344 /* If compressed, take hash result from mini-CQE. */ 1345 rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? 1346 cqe->rx_hash_res : 1347 mcqe->rx_hash_result); 1348 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1349 if (rxq->crc_present) 1350 len -= RTE_ETHER_CRC_LEN; 1351 PKT_LEN(pkt) = len; 1352 if (cqe->lro_num_seg > 1) { 1353 mlx5_lro_update_hdr 1354 (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, 1355 len); 1356 pkt->ol_flags |= PKT_RX_LRO; 1357 pkt->tso_segsz = len / cqe->lro_num_seg; 1358 } 1359 } 1360 DATA_LEN(rep) = DATA_LEN(seg); 1361 PKT_LEN(rep) = PKT_LEN(seg); 1362 SET_DATA_OFF(rep, DATA_OFF(seg)); 1363 PORT(rep) = PORT(seg); 1364 (*rxq->elts)[idx] = rep; 1365 /* 1366 * Fill NIC descriptor with the new buffer. The lkey and size 1367 * of the buffers are already known, only the buffer address 1368 * changes. 1369 */ 1370 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); 1371 /* If there's only one MR, no need to replace LKey in WQE. */ 1372 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1373 wqe->lkey = mlx5_rx_mb2mr(rxq, rep); 1374 if (len > DATA_LEN(seg)) { 1375 len -= DATA_LEN(seg); 1376 ++NB_SEGS(pkt); 1377 ++rq_ci; 1378 continue; 1379 } 1380 DATA_LEN(seg) = len; 1381 #ifdef MLX5_PMD_SOFT_COUNTERS 1382 /* Increment bytes counter. */ 1383 rxq->stats.ibytes += PKT_LEN(pkt); 1384 #endif 1385 /* Return packet. */ 1386 *(pkts++) = pkt; 1387 pkt = NULL; 1388 --pkts_n; 1389 ++i; 1390 /* Align consumer index to the next stride. */ 1391 rq_ci >>= sges_n; 1392 ++rq_ci; 1393 rq_ci <<= sges_n; 1394 } 1395 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) 1396 return 0; 1397 /* Update the consumer index. 
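	 * The CQ doorbell record is written before the RQ doorbell record,
	 * each store being ordered by rte_cio_wmb().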
	 */
	rxq->rq_ci = rq_ci >> sges_n;
	rte_cio_wmb();
	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
	rte_cio_wmb();
	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += i;
#endif
	return i;
}

/**
 * Update LRO packet TCP header.
 * The HW LRO feature doesn't update the TCP header after coalescing the
 * TCP segments but supplies the information in the CQE so SW can fill it in.
 *
 * @param tcp
 *   Pointer to the TCP header.
 * @param cqe
 *   Pointer to the completion entry.
 * @param phcsum
 *   The L3 pseudo-header checksum.
 */
static inline void
mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
			volatile struct mlx5_cqe *restrict cqe,
			uint32_t phcsum)
{
	uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
	/*
	 * The HW calculates only the TCP payload checksum, need to complete
	 * the TCP header checksum and the L3 pseudo-header checksum.
	 */
	uint32_t csum = phcsum + cqe->csum;

	if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK ||
	    l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) {
		tcp->tcp_flags |= RTE_TCP_ACK_FLAG;
		tcp->recv_ack = cqe->lro_ack_seq_num;
		tcp->rx_win = cqe->lro_tcp_win;
	}
	if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
		tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
	tcp->cksum = 0;
	csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4);
	csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
	csum = (~csum) & 0xffff;
	if (csum == 0)
		csum = 0xffff;
	tcp->cksum = csum;
}

/**
 * Update LRO packet headers.
 * The HW LRO feature doesn't update the L3/TCP headers after coalescing the
 * TCP segments but supplies the information in the CQE so SW can fill them in.
 *
 * @param padd
 *   The packet address.
 * @param cqe
 *   Pointer to the completion entry.
 * @param len
 *   The packet length.
1463 */ 1464 static inline void 1465 mlx5_lro_update_hdr(uint8_t *restrict padd, 1466 volatile struct mlx5_cqe *restrict cqe, 1467 uint32_t len) 1468 { 1469 union { 1470 struct rte_ether_hdr *eth; 1471 struct rte_vlan_hdr *vlan; 1472 struct rte_ipv4_hdr *ipv4; 1473 struct rte_ipv6_hdr *ipv6; 1474 struct rte_tcp_hdr *tcp; 1475 uint8_t *hdr; 1476 } h = { 1477 .hdr = padd, 1478 }; 1479 uint16_t proto = h.eth->ether_type; 1480 uint32_t phcsum; 1481 1482 h.eth++; 1483 while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) || 1484 proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) { 1485 proto = h.vlan->eth_proto; 1486 h.vlan++; 1487 } 1488 if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) { 1489 h.ipv4->time_to_live = cqe->lro_min_ttl; 1490 h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd)); 1491 h.ipv4->hdr_checksum = 0; 1492 h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4); 1493 phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0); 1494 h.ipv4++; 1495 } else { 1496 h.ipv6->hop_limits = cqe->lro_min_ttl; 1497 h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) - 1498 sizeof(*h.ipv6)); 1499 phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0); 1500 h.ipv6++; 1501 } 1502 mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum); 1503 } 1504 1505 void 1506 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) 1507 { 1508 struct mlx5_mprq_buf *buf = opaque; 1509 1510 if (rte_atomic16_read(&buf->refcnt) == 1) { 1511 rte_mempool_put(buf->mp, buf); 1512 } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { 1513 rte_atomic16_set(&buf->refcnt, 1); 1514 rte_mempool_put(buf->mp, buf); 1515 } 1516 } 1517 1518 void 1519 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) 1520 { 1521 mlx5_mprq_buf_free_cb(NULL, buf); 1522 } 1523 1524 static inline void 1525 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 1526 const unsigned int strd_n) 1527 { 1528 struct mlx5_mprq_buf *rep = rxq->mprq_repl; 1529 volatile struct mlx5_wqe_data_seg *wqe = 1530 &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; 1531 void *addr; 1532 1533 assert(rep != NULL); 1534 /* Replace MPRQ buf. */ 1535 (*rxq->mprq_bufs)[rq_idx] = rep; 1536 /* Replace WQE. */ 1537 addr = mlx5_mprq_buf_addr(rep, strd_n); 1538 wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); 1539 /* If there's only one MR, no need to replace LKey in WQE. */ 1540 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1541 wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); 1542 /* Stash a mbuf for next replacement. */ 1543 if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) 1544 rxq->mprq_repl = rep; 1545 else 1546 rxq->mprq_repl = NULL; 1547 } 1548 1549 /** 1550 * DPDK callback for RX with Multi-Packet RQ support. 1551 * 1552 * @param dpdk_rxq 1553 * Generic pointer to RX queue structure. 1554 * @param[out] pkts 1555 * Array to store received packets. 1556 * @param pkts_n 1557 * Maximum number of packets in array. 1558 * 1559 * @return 1560 * Number of packets successfully received (<= pkts_n). 
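 *
 * Each MPRQ WQE describes a buffer split into (1 << strd_num_n) strides.
 * Short packets (or packets received while the replacement mempool is empty)
 * are copied into a regular mbuf, larger ones are attached to the stride
 * memory as an external buffer via rte_pktmbuf_attach_extbuf().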
1561 */ 1562 uint16_t 1563 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1564 { 1565 struct mlx5_rxq_data *rxq = dpdk_rxq; 1566 const unsigned int strd_n = 1 << rxq->strd_num_n; 1567 const unsigned int strd_sz = 1 << rxq->strd_sz_n; 1568 const unsigned int strd_shift = 1569 MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; 1570 const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; 1571 const unsigned int wq_mask = (1 << rxq->elts_n) - 1; 1572 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1573 unsigned int i = 0; 1574 uint32_t rq_ci = rxq->rq_ci; 1575 uint16_t consumed_strd = rxq->consumed_strd; 1576 uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM; 1577 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1578 1579 while (i < pkts_n) { 1580 struct rte_mbuf *pkt; 1581 void *addr; 1582 int ret; 1583 unsigned int len; 1584 uint16_t strd_cnt; 1585 uint16_t strd_idx; 1586 uint32_t offset; 1587 uint32_t byte_cnt; 1588 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1589 uint32_t rss_hash_res = 0; 1590 uint8_t lro_num_seg; 1591 1592 if (consumed_strd == strd_n) { 1593 /* Replace WQE only if the buffer is still in use. */ 1594 if (rte_atomic16_read(&buf->refcnt) > 1) { 1595 mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n); 1596 /* Release the old buffer. */ 1597 mlx5_mprq_buf_free(buf); 1598 } else if (unlikely(rxq->mprq_repl == NULL)) { 1599 struct mlx5_mprq_buf *rep; 1600 1601 /* 1602 * Currently, the MPRQ mempool is out of buffer 1603 * and doing memcpy regardless of the size of Rx 1604 * packet. Retry allocation to get back to 1605 * normal. 1606 */ 1607 if (!rte_mempool_get(rxq->mprq_mp, 1608 (void **)&rep)) 1609 rxq->mprq_repl = rep; 1610 } 1611 /* Advance to the next WQE. */ 1612 consumed_strd = 0; 1613 ++rq_ci; 1614 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1615 } 1616 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1617 ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe); 1618 if (!ret) 1619 break; 1620 byte_cnt = ret; 1621 strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> 1622 MLX5_MPRQ_STRIDE_NUM_SHIFT; 1623 assert(strd_cnt); 1624 consumed_strd += strd_cnt; 1625 if (byte_cnt & MLX5_MPRQ_FILLER_MASK) 1626 continue; 1627 if (mcqe == NULL) { 1628 rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); 1629 strd_idx = rte_be_to_cpu_16(cqe->wqe_counter); 1630 } else { 1631 /* mini-CQE for MPRQ doesn't have hash result. */ 1632 strd_idx = rte_be_to_cpu_16(mcqe->stride_idx); 1633 } 1634 assert(strd_idx < strd_n); 1635 assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask)); 1636 lro_num_seg = cqe->lro_num_seg; 1637 /* 1638 * Currently configured to receive a packet per a stride. But if 1639 * MTU is adjusted through kernel interface, device could 1640 * consume multiple strides without raising an error. In this 1641 * case, the packet should be dropped because it is bigger than 1642 * the max_rx_pkt_len. 
1643 */ 1644 if (unlikely(!lro_num_seg && strd_cnt > 1)) { 1645 ++rxq->stats.idropped; 1646 continue; 1647 } 1648 pkt = rte_pktmbuf_alloc(rxq->mp); 1649 if (unlikely(pkt == NULL)) { 1650 ++rxq->stats.rx_nombuf; 1651 break; 1652 } 1653 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; 1654 assert((int)len >= (rxq->crc_present << 2)); 1655 if (rxq->crc_present) 1656 len -= RTE_ETHER_CRC_LEN; 1657 offset = strd_idx * strd_sz + strd_shift; 1658 addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); 1659 /* 1660 * Memcpy packets to the target mbuf if: 1661 * - The size of packet is smaller than mprq_max_memcpy_len. 1662 * - Out of buffer in the Mempool for Multi-Packet RQ. 1663 */ 1664 if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) { 1665 /* 1666 * When memcpy'ing packet due to out-of-buffer, the 1667 * packet must be smaller than the target mbuf. 1668 */ 1669 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1670 rte_pktmbuf_free_seg(pkt); 1671 ++rxq->stats.idropped; 1672 continue; 1673 } 1674 rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len); 1675 DATA_LEN(pkt) = len; 1676 } else { 1677 rte_iova_t buf_iova; 1678 struct rte_mbuf_ext_shared_info *shinfo; 1679 uint16_t buf_len = strd_cnt * strd_sz; 1680 void *buf_addr; 1681 1682 /* Increment the refcnt of the whole chunk. */ 1683 rte_atomic16_add_return(&buf->refcnt, 1); 1684 assert((uint16_t)rte_atomic16_read(&buf->refcnt) <= 1685 strd_n + 1); 1686 buf_addr = RTE_PTR_SUB(addr, headroom_sz); 1687 /* 1688 * MLX5 device doesn't use iova but it is necessary in a 1689 * case where the Rx packet is transmitted via a 1690 * different PMD. 1691 */ 1692 buf_iova = rte_mempool_virt2iova(buf) + 1693 RTE_PTR_DIFF(buf_addr, buf); 1694 shinfo = &buf->shinfos[strd_idx]; 1695 rte_mbuf_ext_refcnt_set(shinfo, 1); 1696 /* 1697 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when 1698 * attaching the stride to mbuf and more offload flags 1699 * will be added below by calling rxq_cq_to_mbuf(). 1700 * Other fields will be overwritten. 1701 */ 1702 rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, 1703 buf_len, shinfo); 1704 /* Set mbuf head-room. */ 1705 pkt->data_off = headroom_sz; 1706 assert(pkt->ol_flags == EXT_ATTACHED_MBUF); 1707 /* 1708 * Prevent potential overflow due to MTU change through 1709 * kernel interface. 1710 */ 1711 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1712 rte_pktmbuf_free_seg(pkt); 1713 ++rxq->stats.idropped; 1714 continue; 1715 } 1716 DATA_LEN(pkt) = len; 1717 /* 1718 * LRO packet may consume all the stride memory, in this 1719 * case packet head-room space is not guaranteed so must 1720 * to add an empty mbuf for the head-room. 1721 */ 1722 if (!rxq->strd_headroom_en) { 1723 struct rte_mbuf *headroom_mbuf = 1724 rte_pktmbuf_alloc(rxq->mp); 1725 1726 if (unlikely(headroom_mbuf == NULL)) { 1727 rte_pktmbuf_free_seg(pkt); 1728 ++rxq->stats.rx_nombuf; 1729 break; 1730 } 1731 PORT(pkt) = rxq->port_id; 1732 NEXT(headroom_mbuf) = pkt; 1733 pkt = headroom_mbuf; 1734 NB_SEGS(pkt) = 2; 1735 } 1736 } 1737 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1738 if (lro_num_seg > 1) { 1739 mlx5_lro_update_hdr(addr, cqe, len); 1740 pkt->ol_flags |= PKT_RX_LRO; 1741 pkt->tso_segsz = strd_sz; 1742 } 1743 PKT_LEN(pkt) = len; 1744 PORT(pkt) = rxq->port_id; 1745 #ifdef MLX5_PMD_SOFT_COUNTERS 1746 /* Increment bytes counter. */ 1747 rxq->stats.ibytes += PKT_LEN(pkt); 1748 #endif 1749 /* Return packet. */ 1750 *(pkts++) = pkt; 1751 ++i; 1752 } 1753 /* Update the consumer indexes. 
*/ 1754 rxq->consumed_strd = consumed_strd; 1755 rte_cio_wmb(); 1756 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1757 if (rq_ci != rxq->rq_ci) { 1758 rxq->rq_ci = rq_ci; 1759 rte_cio_wmb(); 1760 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1761 } 1762 #ifdef MLX5_PMD_SOFT_COUNTERS 1763 /* Increment packets counter. */ 1764 rxq->stats.ipackets += i; 1765 #endif 1766 return i; 1767 } 1768 1769 /** 1770 * Dummy DPDK callback for TX. 1771 * 1772 * This function is used to temporarily replace the real callback during 1773 * unsafe control operations on the queue, or in case of error. 1774 * 1775 * @param dpdk_txq 1776 * Generic pointer to TX queue structure. 1777 * @param[in] pkts 1778 * Packets to transmit. 1779 * @param pkts_n 1780 * Number of packets in array. 1781 * 1782 * @return 1783 * Number of packets successfully transmitted (<= pkts_n). 1784 */ 1785 uint16_t 1786 removed_tx_burst(void *dpdk_txq __rte_unused, 1787 struct rte_mbuf **pkts __rte_unused, 1788 uint16_t pkts_n __rte_unused) 1789 { 1790 rte_mb(); 1791 return 0; 1792 } 1793 1794 /** 1795 * Dummy DPDK callback for RX. 1796 * 1797 * This function is used to temporarily replace the real callback during 1798 * unsafe control operations on the queue, or in case of error. 1799 * 1800 * @param dpdk_rxq 1801 * Generic pointer to RX queue structure. 1802 * @param[out] pkts 1803 * Array to store received packets. 1804 * @param pkts_n 1805 * Maximum number of packets in array. 1806 * 1807 * @return 1808 * Number of packets successfully received (<= pkts_n). 1809 */ 1810 uint16_t 1811 removed_rx_burst(void *dpdk_txq __rte_unused, 1812 struct rte_mbuf **pkts __rte_unused, 1813 uint16_t pkts_n __rte_unused) 1814 { 1815 rte_mb(); 1816 return 0; 1817 } 1818 1819 /* 1820 * Vectorized Rx/Tx routines are not compiled in when required vector 1821 * instructions are not supported on a target architecture. The following null 1822 * stubs are needed for linkage when those are not included outside of this file 1823 * (e.g. mlx5_rxtx_vec_sse.c for x86). 1824 */ 1825 1826 __rte_weak uint16_t 1827 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, 1828 struct rte_mbuf **pkts __rte_unused, 1829 uint16_t pkts_n __rte_unused) 1830 { 1831 return 0; 1832 } 1833 1834 __rte_weak int 1835 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) 1836 { 1837 return -ENOTSUP; 1838 } 1839 1840 __rte_weak int 1841 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) 1842 { 1843 return -ENOTSUP; 1844 } 1845 1846 /** 1847 * Free the mbufs from the linear array of pointers. 1848 * 1849 * @param pkts 1850 * Pointer to array of packets to be free. 1851 * @param pkts_n 1852 * Number of packets to be freed. 1853 * @param olx 1854 * Configured Tx offloads mask. It is fully defined at 1855 * compile time and may be used for optimization. 1856 */ 1857 static __rte_always_inline void 1858 mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, 1859 unsigned int pkts_n, 1860 unsigned int olx __rte_unused) 1861 { 1862 struct rte_mempool *pool = NULL; 1863 struct rte_mbuf **p_free = NULL; 1864 struct rte_mbuf *mbuf; 1865 unsigned int n_free = 0; 1866 1867 /* 1868 * The implemented algorithm eliminates 1869 * copying pointers to temporary array 1870 * for rte_mempool_put_bulk() calls. 1871 */ 1872 assert(pkts); 1873 assert(pkts_n); 1874 for (;;) { 1875 for (;;) { 1876 /* 1877 * Decrement mbuf reference counter, detach 1878 * indirect and external buffers if needed. 
1879 */ 1880 mbuf = rte_pktmbuf_prefree_seg(*pkts); 1881 if (likely(mbuf != NULL)) { 1882 assert(mbuf == *pkts); 1883 if (likely(n_free != 0)) { 1884 if (unlikely(pool != mbuf->pool)) 1885 /* From different pool. */ 1886 break; 1887 } else { 1888 /* Start new scan array. */ 1889 pool = mbuf->pool; 1890 p_free = pkts; 1891 } 1892 ++n_free; 1893 ++pkts; 1894 --pkts_n; 1895 if (unlikely(pkts_n == 0)) { 1896 mbuf = NULL; 1897 break; 1898 } 1899 } else { 1900 /* 1901 * This happens if mbuf is still referenced. 1902 * We can't put it back to the pool, skip. 1903 */ 1904 ++pkts; 1905 --pkts_n; 1906 if (unlikely(n_free != 0)) 1907 /* There is some array to free.*/ 1908 break; 1909 if (unlikely(pkts_n == 0)) 1910 /* Last mbuf, nothing to free. */ 1911 return; 1912 } 1913 } 1914 for (;;) { 1915 /* 1916 * This loop is implemented to avoid multiple 1917 * inlining of rte_mempool_put_bulk(). 1918 */ 1919 assert(pool); 1920 assert(p_free); 1921 assert(n_free); 1922 /* 1923 * Free the array of pre-freed mbufs 1924 * belonging to the same memory pool. 1925 */ 1926 rte_mempool_put_bulk(pool, (void *)p_free, n_free); 1927 if (unlikely(mbuf != NULL)) { 1928 /* There is the request to start new scan. */ 1929 pool = mbuf->pool; 1930 p_free = pkts++; 1931 n_free = 1; 1932 --pkts_n; 1933 if (likely(pkts_n != 0)) 1934 break; 1935 /* 1936 * This is the last mbuf to be freed. 1937 * Do one more loop iteration to complete. 1938 * This is rare case of the last unique mbuf. 1939 */ 1940 mbuf = NULL; 1941 continue; 1942 } 1943 if (likely(pkts_n == 0)) 1944 return; 1945 n_free = 0; 1946 break; 1947 } 1948 } 1949 } 1950 1951 /** 1952 * Free the mbuf from the elts ring buffer till new tail. 1953 * 1954 * @param txq 1955 * Pointer to Tx queue structure. 1956 * @param tail 1957 * Index in elts to free up to, becomes new elts tail. 1958 * @param olx 1959 * Configured Tx offloads mask. It is fully defined at 1960 * compile time and may be used for optimization. 1961 */ 1962 static __rte_always_inline void 1963 mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, 1964 uint16_t tail, 1965 unsigned int olx __rte_unused) 1966 { 1967 uint16_t n_elts = tail - txq->elts_tail; 1968 1969 assert(n_elts); 1970 assert(n_elts <= txq->elts_s); 1971 /* 1972 * Implement a loop to support ring buffer wraparound 1973 * with single inlining of mlx5_tx_free_mbuf(). 1974 */ 1975 do { 1976 unsigned int part; 1977 1978 part = txq->elts_s - (txq->elts_tail & txq->elts_m); 1979 part = RTE_MIN(part, n_elts); 1980 assert(part); 1981 assert(part <= txq->elts_s); 1982 mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m], 1983 part, olx); 1984 txq->elts_tail += part; 1985 n_elts -= part; 1986 } while (n_elts); 1987 } 1988 1989 /** 1990 * Store the mbuf being sent into elts ring buffer. 1991 * On Tx completion these mbufs will be freed. 1992 * 1993 * @param txq 1994 * Pointer to Tx queue structure. 1995 * @param pkts 1996 * Pointer to array of packets to be stored. 1997 * @param pkts_n 1998 * Number of packets to be stored. 1999 * @param olx 2000 * Configured Tx offloads mask. It is fully defined at 2001 * compile time and may be used for optimization. 
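 *
 * @note The free space in elts is not checked here; the caller is
 * expected to guarantee it. Only the ring buffer wraparound is
 * handled, by splitting the copy into at most two rte_memcpy() calls.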
2002 */ 2003 static __rte_always_inline void 2004 mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, 2005 struct rte_mbuf **restrict pkts, 2006 unsigned int pkts_n, 2007 unsigned int olx __rte_unused) 2008 { 2009 unsigned int part; 2010 struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts; 2011 2012 assert(pkts); 2013 assert(pkts_n); 2014 part = txq->elts_s - (txq->elts_head & txq->elts_m); 2015 assert(part); 2016 assert(part <= txq->elts_s); 2017 /* This code is a good candidate for vectorizing with SIMD. */ 2018 rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)), 2019 (void *)pkts, 2020 RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *)); 2021 txq->elts_head += pkts_n; 2022 if (unlikely(part < pkts_n)) 2023 /* The copy is wrapping around the elts array. */ 2024 rte_memcpy((void *)elts, (void *)(pkts + part), 2025 (pkts_n - part) * sizeof(struct rte_mbuf *)); 2026 } 2027 2028 /** 2029 * Update completion queue consuming index via doorbell 2030 * and flush the completed data buffers. 2031 * 2032 * @param txq 2033 * Pointer to TX queue structure. 2034 * @param valid CQE pointer 2035 * if not NULL update txq->wqe_pi and flush the buffers 2036 * @param itail 2037 * if not negative - flush the buffers till this index. 2038 * @param olx 2039 * Configured Tx offloads mask. It is fully defined at 2040 * compile time and may be used for optimization. 2041 */ 2042 static __rte_always_inline void 2043 mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, 2044 volatile struct mlx5_cqe *last_cqe, 2045 int itail, 2046 unsigned int olx __rte_unused) 2047 { 2048 uint16_t tail; 2049 2050 if (likely(last_cqe != NULL)) { 2051 txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter); 2052 tail = ((volatile struct mlx5_wqe_cseg *) 2053 (txq->wqes + (txq->wqe_pi & txq->wqe_m)))->misc; 2054 } else if (itail >= 0) { 2055 tail = (uint16_t)itail; 2056 } else { 2057 return; 2058 } 2059 rte_compiler_barrier(); 2060 *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci); 2061 if (likely(tail != txq->elts_tail)) { 2062 mlx5_tx_free_elts(txq, tail, olx); 2063 assert(tail == txq->elts_tail); 2064 } 2065 } 2066 2067 /** 2068 * Manage TX completions. This routine checks the CQ for 2069 * arrived CQEs, deduces the last accomplished WQE in SQ, 2070 * updates SQ producing index and frees all completed mbufs. 2071 * 2072 * @param txq 2073 * Pointer to TX queue structure. 2074 * @param olx 2075 * Configured Tx offloads mask. It is fully defined at 2076 * compile time and may be used for optimization. 2077 * 2078 * NOTE: not inlined intentionally, it makes tx_burst 2079 * routine smaller, simple and faster - from experiments. 2080 */ 2081 static void 2082 mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, 2083 unsigned int olx __rte_unused) 2084 { 2085 unsigned int count = MLX5_TX_COMP_MAX_CQE; 2086 volatile struct mlx5_cqe *last_cqe = NULL; 2087 int ret; 2088 2089 static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); 2090 static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value"); 2091 do { 2092 volatile struct mlx5_cqe *cqe; 2093 2094 cqe = &txq->cqes[txq->cq_ci & txq->cqe_m]; 2095 ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci); 2096 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 2097 if (likely(ret != MLX5_CQE_STATUS_ERR)) { 2098 /* No new CQEs in completion queue. */ 2099 assert(ret == MLX5_CQE_STATUS_HW_OWN); 2100 break; 2101 } 2102 /* 2103 * Some error occurred, try to restart. 
			 * There is no barrier after the WQE-related doorbell
			 * write, so make sure all writes are completed here
			 * before the SQ might be reset.
			 */
			rte_wmb();
			ret = mlx5_tx_error_cqe_handle
				(txq, (volatile struct mlx5_err_cqe *)cqe);
			/*
			 * Flush buffers and update the consuming index
			 * if recovery succeeded. Otherwise just try to
			 * recover later.
			 */
			last_cqe = NULL;
			break;
		}
		/* Normal transmit completion. */
		++txq->cq_ci;
		last_cqe = cqe;
#ifndef NDEBUG
		if (txq->cq_pi)
			--txq->cq_pi;
#endif
		/*
		 * We have to restrict the amount of processed CQEs
		 * in one tx_burst routine call. The CQ may be large
		 * and many CQEs may be updated by the NIC in one
		 * transaction. Freeing buffers is time-consuming,
		 * so multiple iterations may introduce significant
		 * latency.
		 */
	} while (--count);
	mlx5_tx_comp_flush(txq, last_cqe, ret, olx);
}

/**
 * Check if the completion request flag should be set in the last WQE.
 * Both pushed mbufs and WQEs are monitored and the completion request
 * flag is set if either threshold is reached.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param loc
 *   Pointer to burst routine local context.
 * @param multi
 *   True if the routine is called from the multi-segment sending loop;
 *   elts_head is then not corrected by pkts_copy.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 */
static __rte_always_inline void
mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq,
			   struct mlx5_txq_local *restrict loc,
			   bool multi,
			   unsigned int olx)
{
	uint16_t head = txq->elts_head;
	unsigned int part;

	part = (MLX5_TXOFF_CONFIG(INLINE) || multi) ?
	       0 : loc->pkts_sent - loc->pkts_copy;
	head += part;
	if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH ||
	     (MLX5_TXOFF_CONFIG(INLINE) &&
	     (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) {
		volatile struct mlx5_wqe *last = loc->wqe_last;

		txq->elts_comp = head;
		if (MLX5_TXOFF_CONFIG(INLINE))
			txq->wqe_comp = txq->wqe_ci;
		/* Request unconditional completion on last WQE. */
		last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
					    MLX5_COMP_MODE_OFFSET);
		/* Save elts_head in unused "immediate" field of WQE. */
		last->cseg.misc = head;
		/*
		 * A CQE slot must always be available. Count the
		 * issued CQE "always" requests instead of the
		 * production index, because CQEs with errors may
		 * occur and the difference with cq_ci could become
		 * inconsistent.
		 */
		assert(txq->cqe_s > ++txq->cq_pi);
	}
}

/**
 * DPDK callback to check the status of a Tx descriptor.
 *
 * @param tx_queue
 *   The Tx queue.
 * @param[in] offset
 *   The index of the descriptor in the ring.
 *
 * @return
 *   The status of the Tx descriptor
 *   (RTE_ETH_TX_DESC_FULL if it is still in use,
 *   RTE_ETH_TX_DESC_DONE otherwise).
2199 */ 2200 int 2201 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) 2202 { 2203 struct mlx5_txq_data *restrict txq = tx_queue; 2204 uint16_t used; 2205 2206 mlx5_tx_handle_completion(txq, 0); 2207 used = txq->elts_head - txq->elts_tail; 2208 if (offset < used) 2209 return RTE_ETH_TX_DESC_FULL; 2210 return RTE_ETH_TX_DESC_DONE; 2211 } 2212 2213 /** 2214 * Build the Control Segment with specified opcode: 2215 * - MLX5_OPCODE_SEND 2216 * - MLX5_OPCODE_ENHANCED_MPSW 2217 * - MLX5_OPCODE_TSO 2218 * 2219 * @param txq 2220 * Pointer to TX queue structure. 2221 * @param loc 2222 * Pointer to burst routine local context. 2223 * @param wqe 2224 * Pointer to WQE to fill with built Control Segment. 2225 * @param ds 2226 * Supposed length of WQE in segments. 2227 * @param opcode 2228 * SQ WQE opcode to put into Control Segment. 2229 * @param olx 2230 * Configured Tx offloads mask. It is fully defined at 2231 * compile time and may be used for optimization. 2232 */ 2233 static __rte_always_inline void 2234 mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, 2235 struct mlx5_txq_local *restrict loc __rte_unused, 2236 struct mlx5_wqe *restrict wqe, 2237 unsigned int ds, 2238 unsigned int opcode, 2239 unsigned int olx __rte_unused) 2240 { 2241 struct mlx5_wqe_cseg *restrict cs = &wqe->cseg; 2242 2243 cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode); 2244 cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 2245 cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << 2246 MLX5_COMP_MODE_OFFSET); 2247 cs->misc = RTE_BE32(0); 2248 } 2249 2250 /** 2251 * Build the Ethernet Segment without inlined data. 2252 * Supports Software Parser, Checksums and VLAN 2253 * insertion Tx offload features. 2254 * 2255 * @param txq 2256 * Pointer to TX queue structure. 2257 * @param loc 2258 * Pointer to burst routine local context. 2259 * @param wqe 2260 * Pointer to WQE to fill with built Ethernet Segment. 2261 * @param olx 2262 * Configured Tx offloads mask. It is fully defined at 2263 * compile time and may be used for optimization. 2264 */ 2265 static __rte_always_inline void 2266 mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused, 2267 struct mlx5_txq_local *restrict loc, 2268 struct mlx5_wqe *restrict wqe, 2269 unsigned int olx) 2270 { 2271 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2272 uint32_t csum; 2273 2274 /* 2275 * Calculate and set check sum flags first, dword field 2276 * in segment may be shared with Software Parser flags. 2277 */ 2278 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2279 es->flags = rte_cpu_to_le_32(csum); 2280 /* 2281 * Calculate and set Software Parser offsets and flags. 2282 * These flags a set for custom UDP and IP tunnel packets. 2283 */ 2284 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2285 /* Fill metadata field if needed. */ 2286 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2287 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2288 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2289 /* Engage VLAN tag insertion feature if requested. */ 2290 if (MLX5_TXOFF_CONFIG(VLAN) && 2291 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 2292 /* 2293 * We should get here only if device support 2294 * this feature correctly. 2295 */ 2296 assert(txq->vlan_en); 2297 es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT | 2298 loc->mbuf->vlan_tci); 2299 } else { 2300 es->inline_hdr = RTE_BE32(0); 2301 } 2302 } 2303 2304 /** 2305 * Build the Ethernet Segment with minimal inlined data 2306 * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. 
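 * (MLX5_ESEG_MIN_INLINE_SIZE is 18 bytes: a leading 16-bit chunk plus
 * a 16-byte vector, as the static asserts in the body check.)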
This is 2307 * used to fill the gap in single WQEBB WQEs. 2308 * Supports Software Parser, Checksums and VLAN 2309 * insertion Tx offload features. 2310 * 2311 * @param txq 2312 * Pointer to TX queue structure. 2313 * @param loc 2314 * Pointer to burst routine local context. 2315 * @param wqe 2316 * Pointer to WQE to fill with built Ethernet Segment. 2317 * @param vlan 2318 * Length of VLAN tag insertion if any. 2319 * @param olx 2320 * Configured Tx offloads mask. It is fully defined at 2321 * compile time and may be used for optimization. 2322 */ 2323 static __rte_always_inline void 2324 mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused, 2325 struct mlx5_txq_local *restrict loc, 2326 struct mlx5_wqe *restrict wqe, 2327 unsigned int vlan, 2328 unsigned int olx) 2329 { 2330 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2331 uint32_t csum; 2332 uint8_t *psrc, *pdst; 2333 2334 /* 2335 * Calculate and set check sum flags first, dword field 2336 * in segment may be shared with Software Parser flags. 2337 */ 2338 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2339 es->flags = rte_cpu_to_le_32(csum); 2340 /* 2341 * Calculate and set Software Parser offsets and flags. 2342 * These flags a set for custom UDP and IP tunnel packets. 2343 */ 2344 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2345 /* Fill metadata field if needed. */ 2346 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2347 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2348 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2349 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2350 (sizeof(uint16_t) + 2351 sizeof(rte_v128u32_t)), 2352 "invalid Ethernet Segment data size"); 2353 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2354 (sizeof(uint16_t) + 2355 sizeof(struct rte_vlan_hdr) + 2356 2 * RTE_ETHER_ADDR_LEN), 2357 "invalid Ethernet Segment data size"); 2358 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2359 es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE); 2360 es->inline_data = *(unaligned_uint16_t *)psrc; 2361 psrc += sizeof(uint16_t); 2362 pdst = (uint8_t *)(es + 1); 2363 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2364 /* Implement VLAN tag insertion as part inline data. */ 2365 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2366 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2367 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2368 /* Insert VLAN ethertype + VLAN tag. */ 2369 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2370 ((RTE_ETHER_TYPE_VLAN << 16) | 2371 loc->mbuf->vlan_tci); 2372 pdst += sizeof(struct rte_vlan_hdr); 2373 /* Copy the rest two bytes from packet data. */ 2374 assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2375 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2376 } else { 2377 /* Fill the gap in the title WQEBB with inline data. */ 2378 rte_mov16(pdst, psrc); 2379 } 2380 } 2381 2382 /** 2383 * Build the Ethernet Segment with entire packet 2384 * data inlining. Checks the boundary of WQEBB and 2385 * ring buffer wrapping, supports Software Parser, 2386 * Checksums and VLAN insertion Tx offload features. 2387 * 2388 * @param txq 2389 * Pointer to TX queue structure. 2390 * @param loc 2391 * Pointer to burst routine local context. 2392 * @param wqe 2393 * Pointer to WQE to fill with built Ethernet Segment. 2394 * @param vlan 2395 * Length of VLAN tag insertion if any. 2396 * @param inlen 2397 * Length of data to inline (VLAN included, if any). 2398 * @param tso 2399 * TSO flag, set mss field from the packet. 
2400 * @param olx 2401 * Configured Tx offloads mask. It is fully defined at 2402 * compile time and may be used for optimization. 2403 * 2404 * @return 2405 * Pointer to the next Data Segment (aligned and wrapped around). 2406 */ 2407 static __rte_always_inline struct mlx5_wqe_dseg * 2408 mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, 2409 struct mlx5_txq_local *restrict loc, 2410 struct mlx5_wqe *restrict wqe, 2411 unsigned int vlan, 2412 unsigned int inlen, 2413 unsigned int tso, 2414 unsigned int olx) 2415 { 2416 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2417 uint32_t csum; 2418 uint8_t *psrc, *pdst; 2419 unsigned int part; 2420 2421 /* 2422 * Calculate and set check sum flags first, dword field 2423 * in segment may be shared with Software Parser flags. 2424 */ 2425 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2426 if (tso) { 2427 csum <<= 24; 2428 csum |= loc->mbuf->tso_segsz; 2429 es->flags = rte_cpu_to_be_32(csum); 2430 } else { 2431 es->flags = rte_cpu_to_le_32(csum); 2432 } 2433 /* 2434 * Calculate and set Software Parser offsets and flags. 2435 * These flags a set for custom UDP and IP tunnel packets. 2436 */ 2437 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2438 /* Fill metadata field if needed. */ 2439 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2440 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2441 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2442 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2443 (sizeof(uint16_t) + 2444 sizeof(rte_v128u32_t)), 2445 "invalid Ethernet Segment data size"); 2446 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2447 (sizeof(uint16_t) + 2448 sizeof(struct rte_vlan_hdr) + 2449 2 * RTE_ETHER_ADDR_LEN), 2450 "invalid Ethernet Segment data size"); 2451 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2452 es->inline_hdr_sz = rte_cpu_to_be_16(inlen); 2453 es->inline_data = *(unaligned_uint16_t *)psrc; 2454 psrc += sizeof(uint16_t); 2455 pdst = (uint8_t *)(es + 1); 2456 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2457 /* Implement VLAN tag insertion as part inline data. */ 2458 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2459 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2460 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2461 /* Insert VLAN ethertype + VLAN tag. */ 2462 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2463 ((RTE_ETHER_TYPE_VLAN << 16) | 2464 loc->mbuf->vlan_tci); 2465 pdst += sizeof(struct rte_vlan_hdr); 2466 /* Copy the rest two bytes from packet data. */ 2467 assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2468 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2469 psrc += sizeof(uint16_t); 2470 } else { 2471 /* Fill the gap in the title WQEBB with inline data. */ 2472 rte_mov16(pdst, psrc); 2473 psrc += sizeof(rte_v128u32_t); 2474 } 2475 pdst = (uint8_t *)(es + 2); 2476 assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2477 assert(pdst < (uint8_t *)txq->wqes_end); 2478 inlen -= MLX5_ESEG_MIN_INLINE_SIZE; 2479 if (!inlen) { 2480 assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 2481 return (struct mlx5_wqe_dseg *)pdst; 2482 } 2483 /* 2484 * The WQEBB space availability is checked by caller. 2485 * Here we should be aware of WQE ring buffer wraparound only. 2486 */ 2487 part = (uint8_t *)txq->wqes_end - pdst; 2488 part = RTE_MIN(part, inlen); 2489 do { 2490 rte_memcpy(pdst, psrc, part); 2491 inlen -= part; 2492 if (likely(!inlen)) { 2493 /* 2494 * If return value is not used by the caller 2495 * the code below will be optimized out. 
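			 * Otherwise the destination pointer is advanced past
			 * the copied chunk, aligned up to MLX5_WSEG_SIZE and
			 * wrapped to the ring beginning if it reached
			 * txq->wqes_end.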
2496 */ 2497 pdst += part; 2498 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2499 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 2500 pdst = (uint8_t *)txq->wqes; 2501 return (struct mlx5_wqe_dseg *)pdst; 2502 } 2503 pdst = (uint8_t *)txq->wqes; 2504 psrc += part; 2505 part = inlen; 2506 } while (true); 2507 } 2508 2509 /** 2510 * Copy data from chain of mbuf to the specified linear buffer. 2511 * Checksums and VLAN insertion Tx offload features. If data 2512 * from some mbuf copied completely this mbuf is freed. Local 2513 * structure is used to keep the byte stream state. 2514 * 2515 * @param pdst 2516 * Pointer to the destination linear buffer. 2517 * @param loc 2518 * Pointer to burst routine local context. 2519 * @param len 2520 * Length of data to be copied. 2521 * @param olx 2522 * Configured Tx offloads mask. It is fully defined at 2523 * compile time and may be used for optimization. 2524 */ 2525 static __rte_always_inline void 2526 mlx5_tx_mseg_memcpy(uint8_t *pdst, 2527 struct mlx5_txq_local *restrict loc, 2528 unsigned int len, 2529 unsigned int olx __rte_unused) 2530 { 2531 struct rte_mbuf *mbuf; 2532 unsigned int part, dlen; 2533 uint8_t *psrc; 2534 2535 assert(len); 2536 do { 2537 /* Allow zero length packets, must check first. */ 2538 dlen = rte_pktmbuf_data_len(loc->mbuf); 2539 if (dlen <= loc->mbuf_off) { 2540 /* Exhausted packet, just free. */ 2541 mbuf = loc->mbuf; 2542 loc->mbuf = mbuf->next; 2543 rte_pktmbuf_free_seg(mbuf); 2544 loc->mbuf_off = 0; 2545 assert(loc->mbuf_nseg > 1); 2546 assert(loc->mbuf); 2547 --loc->mbuf_nseg; 2548 continue; 2549 } 2550 dlen -= loc->mbuf_off; 2551 psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 2552 loc->mbuf_off); 2553 part = RTE_MIN(len, dlen); 2554 rte_memcpy(pdst, psrc, part); 2555 loc->mbuf_off += part; 2556 len -= part; 2557 if (!len) { 2558 if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) { 2559 loc->mbuf_off = 0; 2560 /* Exhausted packet, just free. */ 2561 mbuf = loc->mbuf; 2562 loc->mbuf = mbuf->next; 2563 rte_pktmbuf_free_seg(mbuf); 2564 loc->mbuf_off = 0; 2565 assert(loc->mbuf_nseg >= 1); 2566 --loc->mbuf_nseg; 2567 } 2568 return; 2569 } 2570 pdst += part; 2571 } while (true); 2572 } 2573 2574 /** 2575 * Build the Ethernet Segment with inlined data from 2576 * multi-segment packet. Checks the boundary of WQEBB 2577 * and ring buffer wrapping, supports Software Parser, 2578 * Checksums and VLAN insertion Tx offload features. 2579 * 2580 * @param txq 2581 * Pointer to TX queue structure. 2582 * @param loc 2583 * Pointer to burst routine local context. 2584 * @param wqe 2585 * Pointer to WQE to fill with built Ethernet Segment. 2586 * @param vlan 2587 * Length of VLAN tag insertion if any. 2588 * @param inlen 2589 * Length of data to inline (VLAN included, if any). 2590 * @param tso 2591 * TSO flag, set mss field from the packet. 2592 * @param olx 2593 * Configured Tx offloads mask. It is fully defined at 2594 * compile time and may be used for optimization. 2595 * 2596 * @return 2597 * Pointer to the next Data Segment (aligned and 2598 * possible NOT wrapped around - caller should do 2599 * wrapping check on its own). 
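 *
 * A typical caller-side wraparound check, like the one used in
 * mlx5_tx_mseg_build() below:
 *
 *   if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
 *           dseg = (struct mlx5_wqe_dseg *)txq->wqes;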
2600 */ 2601 static __rte_always_inline struct mlx5_wqe_dseg * 2602 mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, 2603 struct mlx5_txq_local *restrict loc, 2604 struct mlx5_wqe *restrict wqe, 2605 unsigned int vlan, 2606 unsigned int inlen, 2607 unsigned int tso, 2608 unsigned int olx) 2609 { 2610 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2611 uint32_t csum; 2612 uint8_t *pdst; 2613 unsigned int part; 2614 2615 /* 2616 * Calculate and set check sum flags first, uint32_t field 2617 * in segment may be shared with Software Parser flags. 2618 */ 2619 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2620 if (tso) { 2621 csum <<= 24; 2622 csum |= loc->mbuf->tso_segsz; 2623 es->flags = rte_cpu_to_be_32(csum); 2624 } else { 2625 es->flags = rte_cpu_to_le_32(csum); 2626 } 2627 /* 2628 * Calculate and set Software Parser offsets and flags. 2629 * These flags a set for custom UDP and IP tunnel packets. 2630 */ 2631 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2632 /* Fill metadata field if needed. */ 2633 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2634 loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 2635 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; 2636 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2637 (sizeof(uint16_t) + 2638 sizeof(rte_v128u32_t)), 2639 "invalid Ethernet Segment data size"); 2640 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2641 (sizeof(uint16_t) + 2642 sizeof(struct rte_vlan_hdr) + 2643 2 * RTE_ETHER_ADDR_LEN), 2644 "invalid Ethernet Segment data size"); 2645 assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2646 es->inline_hdr_sz = rte_cpu_to_be_16(inlen); 2647 pdst = (uint8_t *)&es->inline_data; 2648 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2649 /* Implement VLAN tag insertion as part inline data. */ 2650 mlx5_tx_mseg_memcpy(pdst, loc, 2 * RTE_ETHER_ADDR_LEN, olx); 2651 pdst += 2 * RTE_ETHER_ADDR_LEN; 2652 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2653 ((RTE_ETHER_TYPE_VLAN << 16) | 2654 loc->mbuf->vlan_tci); 2655 pdst += sizeof(struct rte_vlan_hdr); 2656 inlen -= 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr); 2657 } 2658 assert(pdst < (uint8_t *)txq->wqes_end); 2659 /* 2660 * The WQEBB space availability is checked by caller. 2661 * Here we should be aware of WQE ring buffer wraparound only. 2662 */ 2663 part = (uint8_t *)txq->wqes_end - pdst; 2664 part = RTE_MIN(part, inlen); 2665 assert(part); 2666 do { 2667 mlx5_tx_mseg_memcpy(pdst, loc, part, olx); 2668 inlen -= part; 2669 if (likely(!inlen)) { 2670 pdst += part; 2671 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2672 return (struct mlx5_wqe_dseg *)pdst; 2673 } 2674 pdst = (uint8_t *)txq->wqes; 2675 part = inlen; 2676 } while (true); 2677 } 2678 2679 /** 2680 * Build the Data Segment of pointer type. 2681 * 2682 * @param txq 2683 * Pointer to TX queue structure. 2684 * @param loc 2685 * Pointer to burst routine local context. 2686 * @param dseg 2687 * Pointer to WQE to fill with built Data Segment. 2688 * @param buf 2689 * Data buffer to point. 2690 * @param len 2691 * Data buffer length. 2692 * @param olx 2693 * Configured Tx offloads mask. It is fully defined at 2694 * compile time and may be used for optimization. 
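 *
 * @note Only the byte count, the lkey obtained via mlx5_tx_mb2mr()
 * and the buffer address are written; no packet data is copied
 * into the WQE.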
2695 */ 2696 static __rte_always_inline void 2697 mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq, 2698 struct mlx5_txq_local *restrict loc, 2699 struct mlx5_wqe_dseg *restrict dseg, 2700 uint8_t *buf, 2701 unsigned int len, 2702 unsigned int olx __rte_unused) 2703 2704 { 2705 assert(len); 2706 dseg->bcount = rte_cpu_to_be_32(len); 2707 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2708 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2709 } 2710 2711 /** 2712 * Build the Data Segment of pointer type or inline 2713 * if data length is less than buffer in minimal 2714 * Data Segment size. 2715 * 2716 * @param txq 2717 * Pointer to TX queue structure. 2718 * @param loc 2719 * Pointer to burst routine local context. 2720 * @param dseg 2721 * Pointer to WQE to fill with built Data Segment. 2722 * @param buf 2723 * Data buffer to point. 2724 * @param len 2725 * Data buffer length. 2726 * @param olx 2727 * Configured Tx offloads mask. It is fully defined at 2728 * compile time and may be used for optimization. 2729 */ 2730 static __rte_always_inline void 2731 mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, 2732 struct mlx5_txq_local *restrict loc, 2733 struct mlx5_wqe_dseg *restrict dseg, 2734 uint8_t *buf, 2735 unsigned int len, 2736 unsigned int olx __rte_unused) 2737 2738 { 2739 uintptr_t dst, src; 2740 2741 assert(len); 2742 if (len > MLX5_DSEG_MIN_INLINE_SIZE) { 2743 dseg->bcount = rte_cpu_to_be_32(len); 2744 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2745 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2746 2747 return; 2748 } 2749 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2750 /* Unrolled implementation of generic rte_memcpy. */ 2751 dst = (uintptr_t)&dseg->inline_data[0]; 2752 src = (uintptr_t)buf; 2753 if (len & 0x08) { 2754 #ifdef RTE_ARCH_STRICT_ALIGN 2755 assert(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t))); 2756 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2757 dst += sizeof(uint32_t); 2758 src += sizeof(uint32_t); 2759 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2760 dst += sizeof(uint32_t); 2761 src += sizeof(uint32_t); 2762 #else 2763 *(uint64_t *)dst = *(unaligned_uint64_t *)src; 2764 dst += sizeof(uint64_t); 2765 src += sizeof(uint64_t); 2766 #endif 2767 } 2768 if (len & 0x04) { 2769 *(uint32_t *)dst = *(unaligned_uint32_t *)src; 2770 dst += sizeof(uint32_t); 2771 src += sizeof(uint32_t); 2772 } 2773 if (len & 0x02) { 2774 *(uint16_t *)dst = *(unaligned_uint16_t *)src; 2775 dst += sizeof(uint16_t); 2776 src += sizeof(uint16_t); 2777 } 2778 if (len & 0x01) 2779 *(uint8_t *)dst = *(uint8_t *)src; 2780 } 2781 2782 /** 2783 * Build the Data Segment of inlined data from single 2784 * segment packet, no VLAN insertion. 2785 * 2786 * @param txq 2787 * Pointer to TX queue structure. 2788 * @param loc 2789 * Pointer to burst routine local context. 2790 * @param dseg 2791 * Pointer to WQE to fill with built Data Segment. 2792 * @param buf 2793 * Data buffer to point. 2794 * @param len 2795 * Data buffer length. 2796 * @param olx 2797 * Configured Tx offloads mask. It is fully defined at 2798 * compile time and may be used for optimization. 2799 * 2800 * @return 2801 * Pointer to the next Data Segment after inlined data. 2802 * Ring buffer wraparound check is needed. We do not 2803 * do it here because it may not be needed for the 2804 * last packet in the eMPW session. 
2805 */ 2806 static __rte_always_inline struct mlx5_wqe_dseg * 2807 mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, 2808 struct mlx5_txq_local *restrict loc __rte_unused, 2809 struct mlx5_wqe_dseg *restrict dseg, 2810 uint8_t *buf, 2811 unsigned int len, 2812 unsigned int olx __rte_unused) 2813 { 2814 unsigned int part; 2815 uint8_t *pdst; 2816 2817 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2818 pdst = &dseg->inline_data[0]; 2819 /* 2820 * The WQEBB space availability is checked by caller. 2821 * Here we should be aware of WQE ring buffer wraparound only. 2822 */ 2823 part = (uint8_t *)txq->wqes_end - pdst; 2824 part = RTE_MIN(part, len); 2825 do { 2826 rte_memcpy(pdst, buf, part); 2827 len -= part; 2828 if (likely(!len)) { 2829 pdst += part; 2830 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2831 /* Note: no final wraparound check here. */ 2832 return (struct mlx5_wqe_dseg *)pdst; 2833 } 2834 pdst = (uint8_t *)txq->wqes; 2835 buf += part; 2836 part = len; 2837 } while (true); 2838 } 2839 2840 /** 2841 * Build the Data Segment of inlined data from single 2842 * segment packet with VLAN insertion. 2843 * 2844 * @param txq 2845 * Pointer to TX queue structure. 2846 * @param loc 2847 * Pointer to burst routine local context. 2848 * @param dseg 2849 * Pointer to the dseg fill with built Data Segment. 2850 * @param buf 2851 * Data buffer to point. 2852 * @param len 2853 * Data buffer length. 2854 * @param olx 2855 * Configured Tx offloads mask. It is fully defined at 2856 * compile time and may be used for optimization. 2857 * 2858 * @return 2859 * Pointer to the next Data Segment after inlined data. 2860 * Ring buffer wraparound check is needed. 2861 */ 2862 static __rte_always_inline struct mlx5_wqe_dseg * 2863 mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, 2864 struct mlx5_txq_local *restrict loc __rte_unused, 2865 struct mlx5_wqe_dseg *restrict dseg, 2866 uint8_t *buf, 2867 unsigned int len, 2868 unsigned int olx __rte_unused) 2869 2870 { 2871 unsigned int part; 2872 uint8_t *pdst; 2873 2874 assert(len > MLX5_ESEG_MIN_INLINE_SIZE); 2875 static_assert(MLX5_DSEG_MIN_INLINE_SIZE == 2876 (2 * RTE_ETHER_ADDR_LEN), 2877 "invalid Data Segment data size"); 2878 dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) | 2879 MLX5_ETH_WQE_DATA_INLINE); 2880 pdst = &dseg->inline_data[0]; 2881 memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); 2882 buf += MLX5_DSEG_MIN_INLINE_SIZE; 2883 pdst += MLX5_DSEG_MIN_INLINE_SIZE; 2884 len -= MLX5_DSEG_MIN_INLINE_SIZE; 2885 /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */ 2886 assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 2887 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 2888 pdst = (uint8_t *)txq->wqes; 2889 *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) | 2890 loc->mbuf->vlan_tci); 2891 pdst += sizeof(struct rte_vlan_hdr); 2892 /* 2893 * The WQEBB space availability is checked by caller. 2894 * Here we should be aware of WQE ring buffer wraparound only. 2895 */ 2896 part = (uint8_t *)txq->wqes_end - pdst; 2897 part = RTE_MIN(part, len); 2898 do { 2899 rte_memcpy(pdst, buf, part); 2900 len -= part; 2901 if (likely(!len)) { 2902 pdst += part; 2903 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2904 /* Note: no final wraparound check here. 
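		 * The caller must wrap the returned pointer back to
		 * txq->wqes if it reaches txq->wqes_end, as stated in
		 * the function description above.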
*/ 2905 return (struct mlx5_wqe_dseg *)pdst; 2906 } 2907 pdst = (uint8_t *)txq->wqes; 2908 buf += part; 2909 part = len; 2910 } while (true); 2911 } 2912 2913 /** 2914 * Build the Ethernet Segment with optionally inlined data with 2915 * VLAN insertion and following Data Segments (if any) from 2916 * multi-segment packet. Used by ordinary send and TSO. 2917 * 2918 * @param txq 2919 * Pointer to TX queue structure. 2920 * @param loc 2921 * Pointer to burst routine local context. 2922 * @param wqe 2923 * Pointer to WQE to fill with built Ethernet/Data Segments. 2924 * @param vlan 2925 * Length of VLAN header to insert, 0 means no VLAN insertion. 2926 * @param inlen 2927 * Data length to inline. For TSO this parameter specifies 2928 * exact value, for ordinary send routine can be aligned by 2929 * caller to provide better WQE space saving and data buffer 2930 * start address alignment. This length includes VLAN header 2931 * being inserted. 2932 * @param tso 2933 * Zero means ordinary send, inlined data can be extended, 2934 * otherwise this is TSO, inlined data length is fixed. 2935 * @param olx 2936 * Configured Tx offloads mask. It is fully defined at 2937 * compile time and may be used for optimization. 2938 * 2939 * @return 2940 * Actual size of built WQE in segments. 2941 */ 2942 static __rte_always_inline unsigned int 2943 mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, 2944 struct mlx5_txq_local *restrict loc, 2945 struct mlx5_wqe *restrict wqe, 2946 unsigned int vlan, 2947 unsigned int inlen, 2948 unsigned int tso, 2949 unsigned int olx __rte_unused) 2950 { 2951 struct mlx5_wqe_dseg *restrict dseg; 2952 unsigned int ds; 2953 2954 assert((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen); 2955 loc->mbuf_nseg = NB_SEGS(loc->mbuf); 2956 loc->mbuf_off = 0; 2957 2958 dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx); 2959 if (!loc->mbuf_nseg) 2960 goto dseg_done; 2961 /* 2962 * There are still some mbuf remaining, not inlined. 2963 * The first mbuf may be partially inlined and we 2964 * must process the possible non-zero data offset. 2965 */ 2966 if (loc->mbuf_off) { 2967 unsigned int dlen; 2968 uint8_t *dptr; 2969 2970 /* 2971 * Exhausted packets must be dropped before. 2972 * Non-zero offset means there are some data 2973 * remained in the packet. 2974 */ 2975 assert(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf)); 2976 assert(rte_pktmbuf_data_len(loc->mbuf)); 2977 dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 2978 loc->mbuf_off); 2979 dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off; 2980 /* 2981 * Build the pointer/minimal data Data Segment. 2982 * Do ring buffer wrapping check in advance. 2983 */ 2984 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 2985 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 2986 mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx); 2987 /* Store the mbuf to be freed on completion. */ 2988 assert(loc->elts_free); 2989 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 2990 --loc->elts_free; 2991 ++dseg; 2992 if (--loc->mbuf_nseg == 0) 2993 goto dseg_done; 2994 loc->mbuf = loc->mbuf->next; 2995 loc->mbuf_off = 0; 2996 } 2997 do { 2998 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 2999 struct rte_mbuf *mbuf; 3000 3001 /* Zero length segment found, just skip. 
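			 * The segment is unlinked and freed right away; it
			 * consumes neither a Data Segment nor an elts entry.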
*/ 3002 mbuf = loc->mbuf; 3003 loc->mbuf = loc->mbuf->next; 3004 rte_pktmbuf_free_seg(mbuf); 3005 if (--loc->mbuf_nseg == 0) 3006 break; 3007 } else { 3008 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3009 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3010 mlx5_tx_dseg_iptr 3011 (txq, loc, dseg, 3012 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3013 rte_pktmbuf_data_len(loc->mbuf), olx); 3014 assert(loc->elts_free); 3015 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3016 --loc->elts_free; 3017 ++dseg; 3018 if (--loc->mbuf_nseg == 0) 3019 break; 3020 loc->mbuf = loc->mbuf->next; 3021 } 3022 } while (true); 3023 3024 dseg_done: 3025 /* Calculate actual segments used from the dseg pointer. */ 3026 if ((uintptr_t)wqe < (uintptr_t)dseg) 3027 ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE; 3028 else 3029 ds = (((uintptr_t)dseg - (uintptr_t)wqe) + 3030 txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE; 3031 return ds; 3032 } 3033 3034 /** 3035 * Tx one packet function for multi-segment TSO. Supports all 3036 * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs, 3037 * sends one packet per WQE. 3038 * 3039 * This routine is responsible for storing processed mbuf 3040 * into elts ring buffer and update elts_head. 3041 * 3042 * @param txq 3043 * Pointer to TX queue structure. 3044 * @param loc 3045 * Pointer to burst routine local context. 3046 * @param olx 3047 * Configured Tx offloads mask. It is fully defined at 3048 * compile time and may be used for optimization. 3049 * 3050 * @return 3051 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3052 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3053 * Local context variables partially updated. 3054 */ 3055 static __rte_always_inline enum mlx5_txcmp_code 3056 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, 3057 struct mlx5_txq_local *restrict loc, 3058 unsigned int olx) 3059 { 3060 struct mlx5_wqe *restrict wqe; 3061 unsigned int ds, dlen, inlen, ntcp, vlan = 0; 3062 3063 /* 3064 * Calculate data length to be inlined to estimate 3065 * the required space in WQE ring buffer. 3066 */ 3067 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3068 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3069 vlan = sizeof(struct rte_vlan_hdr); 3070 inlen = loc->mbuf->l2_len + vlan + 3071 loc->mbuf->l3_len + loc->mbuf->l4_len; 3072 if (unlikely((!inlen || !loc->mbuf->tso_segsz))) 3073 return MLX5_TXCMP_CODE_ERROR; 3074 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3075 inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len; 3076 /* Packet must contain all TSO headers. */ 3077 if (unlikely(inlen > MLX5_MAX_TSO_HEADER || 3078 inlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3079 inlen > (dlen + vlan))) 3080 return MLX5_TXCMP_CODE_ERROR; 3081 assert(inlen >= txq->inlen_mode); 3082 /* 3083 * Check whether there are enough free WQEBBs: 3084 * - Control Segment 3085 * - Ethernet Segment 3086 * - First Segment of inlined Ethernet data 3087 * - ... data continued ... 3088 * - Data Segments of pointer/min inline type 3089 */ 3090 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3091 MLX5_ESEG_MIN_INLINE_SIZE + 3092 MLX5_WSEG_SIZE + 3093 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3094 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3095 return MLX5_TXCMP_CODE_EXIT; 3096 /* Check for maximal WQE size. */ 3097 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3098 return MLX5_TXCMP_CODE_ERROR; 3099 #ifdef MLX5_PMD_SOFT_COUNTERS 3100 /* Update sent data bytes/packets counters. 
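	 * Each produced TCP segment repeats the inlined headers, so the
	 * byte counter grows by the packet length plus the replicated
	 * header bytes. Illustrative example (hypothetical numbers):
	 * dlen = 8954, inlen = 54, vlan = 0, tso_segsz = 1400 gives
	 * ntcp = ceil(8900 / 1400) = 7 segments; one of them is counted
	 * later via pkts_sent, so opackets grows by 6 and obytes by
	 * 8954 + 6 * 54 = 9278 here.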
*/ 3101 ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) / 3102 loc->mbuf->tso_segsz; 3103 /* 3104 * One will be added for mbuf itself 3105 * at the end of the mlx5_tx_burst from 3106 * loc->pkts_sent field. 3107 */ 3108 --ntcp; 3109 txq->stats.opackets += ntcp; 3110 txq->stats.obytes += dlen + vlan + ntcp * inlen; 3111 #endif 3112 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3113 loc->wqe_last = wqe; 3114 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx); 3115 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx); 3116 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3117 txq->wqe_ci += (ds + 3) / 4; 3118 loc->wqe_free -= (ds + 3) / 4; 3119 /* Request CQE generation if limits are reached. */ 3120 mlx5_tx_request_completion(txq, loc, true, olx); 3121 return MLX5_TXCMP_CODE_MULTI; 3122 } 3123 3124 /** 3125 * Tx one packet function for multi-segment SEND. Supports all 3126 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3127 * sends one packet per WQE, without any data inlining in 3128 * Ethernet Segment. 3129 * 3130 * This routine is responsible for storing processed mbuf 3131 * into elts ring buffer and update elts_head. 3132 * 3133 * @param txq 3134 * Pointer to TX queue structure. 3135 * @param loc 3136 * Pointer to burst routine local context. 3137 * @param olx 3138 * Configured Tx offloads mask. It is fully defined at 3139 * compile time and may be used for optimization. 3140 * 3141 * @return 3142 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3143 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3144 * Local context variables partially updated. 3145 */ 3146 static __rte_always_inline enum mlx5_txcmp_code 3147 mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, 3148 struct mlx5_txq_local *restrict loc, 3149 unsigned int olx) 3150 { 3151 struct mlx5_wqe_dseg *restrict dseg; 3152 struct mlx5_wqe *restrict wqe; 3153 unsigned int ds, nseg; 3154 3155 assert(NB_SEGS(loc->mbuf) > 1); 3156 /* 3157 * No inline at all, it means the CPU cycles saving 3158 * is prioritized at configuration, we should not 3159 * copy any packet data to WQE. 3160 */ 3161 nseg = NB_SEGS(loc->mbuf); 3162 ds = 2 + nseg; 3163 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3164 return MLX5_TXCMP_CODE_EXIT; 3165 /* Check for maximal WQE size. */ 3166 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3167 return MLX5_TXCMP_CODE_ERROR; 3168 /* 3169 * Some Tx offloads may cause an error if 3170 * packet is not long enough, check against 3171 * assumed minimal length. 3172 */ 3173 if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE) 3174 return MLX5_TXCMP_CODE_ERROR; 3175 #ifdef MLX5_PMD_SOFT_COUNTERS 3176 /* Update sent data bytes counter. */ 3177 txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf); 3178 if (MLX5_TXOFF_CONFIG(VLAN) && 3179 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3180 txq->stats.obytes += sizeof(struct rte_vlan_hdr); 3181 #endif 3182 /* 3183 * SEND WQE, one WQEBB: 3184 * - Control Segment, SEND opcode 3185 * - Ethernet Segment, optional VLAN, no inline 3186 * - Data Segments, pointer only type 3187 */ 3188 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3189 loc->wqe_last = wqe; 3190 mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx); 3191 mlx5_tx_eseg_none(txq, loc, wqe, olx); 3192 dseg = &wqe->dseg[0]; 3193 do { 3194 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3195 struct rte_mbuf *mbuf; 3196 3197 /* 3198 * Zero length segment found, have to 3199 * correct total size of WQE in segments. 
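				 * Both the local ds counter and the already
				 * stored cseg.sq_ds field are decremented by
				 * one below.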
3200 * It is supposed to be rare occasion, so 3201 * in normal case (no zero length segments) 3202 * we avoid extra writing to the Control 3203 * Segment. 3204 */ 3205 --ds; 3206 wqe->cseg.sq_ds -= RTE_BE32(1); 3207 mbuf = loc->mbuf; 3208 loc->mbuf = mbuf->next; 3209 rte_pktmbuf_free_seg(mbuf); 3210 if (--nseg == 0) 3211 break; 3212 } else { 3213 mlx5_tx_dseg_ptr 3214 (txq, loc, dseg, 3215 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3216 rte_pktmbuf_data_len(loc->mbuf), olx); 3217 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3218 --loc->elts_free; 3219 if (--nseg == 0) 3220 break; 3221 ++dseg; 3222 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3223 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3224 loc->mbuf = loc->mbuf->next; 3225 } 3226 } while (true); 3227 txq->wqe_ci += (ds + 3) / 4; 3228 loc->wqe_free -= (ds + 3) / 4; 3229 /* Request CQE generation if limits are reached. */ 3230 mlx5_tx_request_completion(txq, loc, true, olx); 3231 return MLX5_TXCMP_CODE_MULTI; 3232 } 3233 3234 /** 3235 * Tx one packet function for multi-segment SEND. Supports all 3236 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3237 * sends one packet per WQE, with data inlining in 3238 * Ethernet Segment and minimal Data Segments. 3239 * 3240 * This routine is responsible for storing processed mbuf 3241 * into elts ring buffer and update elts_head. 3242 * 3243 * @param txq 3244 * Pointer to TX queue structure. 3245 * @param loc 3246 * Pointer to burst routine local context. 3247 * @param olx 3248 * Configured Tx offloads mask. It is fully defined at 3249 * compile time and may be used for optimization. 3250 * 3251 * @return 3252 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3253 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3254 * Local context variables partially updated. 3255 */ 3256 static __rte_always_inline enum mlx5_txcmp_code 3257 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq, 3258 struct mlx5_txq_local *restrict loc, 3259 unsigned int olx) 3260 { 3261 struct mlx5_wqe *restrict wqe; 3262 unsigned int ds, inlen, dlen, vlan = 0; 3263 3264 assert(MLX5_TXOFF_CONFIG(INLINE)); 3265 assert(NB_SEGS(loc->mbuf) > 1); 3266 /* 3267 * First calculate data length to be inlined 3268 * to estimate the required space for WQE. 3269 */ 3270 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3271 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3272 vlan = sizeof(struct rte_vlan_hdr); 3273 inlen = dlen + vlan; 3274 /* Check against minimal length. */ 3275 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 3276 return MLX5_TXCMP_CODE_ERROR; 3277 assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); 3278 if (inlen > txq->inlen_send) { 3279 struct rte_mbuf *mbuf; 3280 unsigned int nxlen; 3281 uintptr_t start; 3282 3283 /* 3284 * Packet length exceeds the allowed inline 3285 * data length, check whether the minimal 3286 * inlining is required. 3287 */ 3288 if (txq->inlen_mode) { 3289 assert(txq->inlen_mode >= MLX5_ESEG_MIN_INLINE_SIZE); 3290 assert(txq->inlen_mode <= txq->inlen_send); 3291 inlen = txq->inlen_mode; 3292 } else { 3293 if (!vlan || txq->vlan_en) { 3294 /* 3295 * VLAN insertion will be done inside by HW. 3296 * It is not utmost effective - VLAN flag is 3297 * checked twice, but we should proceed the 3298 * inlining length correctly and take into 3299 * account the VLAN header being inserted. 
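				 * Fall back to the ordinary multi-segment
				 * send without inlining in this case.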
3300 */ 3301 return mlx5_tx_packet_multi_send 3302 (txq, loc, olx); 3303 } 3304 inlen = MLX5_ESEG_MIN_INLINE_SIZE; 3305 } 3306 /* 3307 * Now we know the minimal amount of data is requested 3308 * to inline. Check whether we should inline the buffers 3309 * from the chain beginning to eliminate some mbufs. 3310 */ 3311 mbuf = loc->mbuf; 3312 nxlen = rte_pktmbuf_data_len(mbuf); 3313 if (unlikely(nxlen <= txq->inlen_send)) { 3314 /* We can inline first mbuf at least. */ 3315 if (nxlen < inlen) { 3316 unsigned int smlen; 3317 3318 /* Scan mbufs till inlen filled. */ 3319 do { 3320 smlen = nxlen; 3321 mbuf = NEXT(mbuf); 3322 assert(mbuf); 3323 nxlen = rte_pktmbuf_data_len(mbuf); 3324 nxlen += smlen; 3325 } while (unlikely(nxlen < inlen)); 3326 if (unlikely(nxlen > txq->inlen_send)) { 3327 /* We cannot inline entire mbuf. */ 3328 smlen = inlen - smlen; 3329 start = rte_pktmbuf_mtod_offset 3330 (mbuf, uintptr_t, smlen); 3331 goto do_align; 3332 } 3333 } 3334 do { 3335 inlen = nxlen; 3336 mbuf = NEXT(mbuf); 3337 /* There should be not end of packet. */ 3338 assert(mbuf); 3339 nxlen = inlen + rte_pktmbuf_data_len(mbuf); 3340 } while (unlikely(nxlen < txq->inlen_send)); 3341 } 3342 start = rte_pktmbuf_mtod(mbuf, uintptr_t); 3343 /* 3344 * Check whether we can do inline to align start 3345 * address of data buffer to cacheline. 3346 */ 3347 do_align: 3348 start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1); 3349 if (unlikely(start)) { 3350 start += inlen; 3351 if (start <= txq->inlen_send) 3352 inlen = start; 3353 } 3354 } 3355 /* 3356 * Check whether there are enough free WQEBBs: 3357 * - Control Segment 3358 * - Ethernet Segment 3359 * - First Segment of inlined Ethernet data 3360 * - ... data continued ... 3361 * - Data Segments of pointer/min inline type 3362 * 3363 * Estimate the number of Data Segments conservatively, 3364 * supposing no any mbufs is being freed during inlining. 3365 */ 3366 assert(inlen <= txq->inlen_send); 3367 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3368 MLX5_ESEG_MIN_INLINE_SIZE + 3369 MLX5_WSEG_SIZE + 3370 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3371 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3372 return MLX5_TXCMP_CODE_EXIT; 3373 /* Check for maximal WQE size. */ 3374 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3375 return MLX5_TXCMP_CODE_ERROR; 3376 #ifdef MLX5_PMD_SOFT_COUNTERS 3377 /* Update sent data bytes/packets counters. */ 3378 txq->stats.obytes += dlen + vlan; 3379 #endif 3380 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3381 loc->wqe_last = wqe; 3382 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx); 3383 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx); 3384 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3385 txq->wqe_ci += (ds + 3) / 4; 3386 loc->wqe_free -= (ds + 3) / 4; 3387 /* Request CQE generation if limits are reached. */ 3388 mlx5_tx_request_completion(txq, loc, true, olx); 3389 return MLX5_TXCMP_CODE_MULTI; 3390 } 3391 3392 /** 3393 * Tx burst function for multi-segment packets. Supports all 3394 * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs, 3395 * sends one packet per WQE. Function stops sending if it 3396 * encounters the single-segment packet. 3397 * 3398 * This routine is responsible for storing processed mbuf 3399 * into elts ring buffer and update elts_head. 3400 * 3401 * @param txq 3402 * Pointer to TX queue structure. 3403 * @param[in] pkts 3404 * Packets to transmit. 3405 * @param pkts_n 3406 * Number of packets in array. 
3407 * @param loc 3408 * Pointer to burst routine local context. 3409 * @param olx 3410 * Configured Tx offloads mask. It is fully defined at 3411 * compile time and may be used for optimization. 3412 * 3413 * @return 3414 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3415 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3416 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3417 * MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered. 3418 * Local context variables updated. 3419 */ 3420 static __rte_always_inline enum mlx5_txcmp_code 3421 mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq, 3422 struct rte_mbuf **restrict pkts, 3423 unsigned int pkts_n, 3424 struct mlx5_txq_local *restrict loc, 3425 unsigned int olx) 3426 { 3427 assert(loc->elts_free && loc->wqe_free); 3428 assert(pkts_n > loc->pkts_sent); 3429 pkts += loc->pkts_sent + 1; 3430 pkts_n -= loc->pkts_sent; 3431 for (;;) { 3432 enum mlx5_txcmp_code ret; 3433 3434 assert(NB_SEGS(loc->mbuf) > 1); 3435 /* 3436 * Estimate the number of free elts quickly but 3437 * conservatively. Some segment may be fully inlined 3438 * and freed, ignore this here - precise estimation 3439 * is costly. 3440 */ 3441 if (loc->elts_free < NB_SEGS(loc->mbuf)) 3442 return MLX5_TXCMP_CODE_EXIT; 3443 if (MLX5_TXOFF_CONFIG(TSO) && 3444 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) { 3445 /* Proceed with multi-segment TSO. */ 3446 ret = mlx5_tx_packet_multi_tso(txq, loc, olx); 3447 } else if (MLX5_TXOFF_CONFIG(INLINE)) { 3448 /* Proceed with multi-segment SEND with inlining. */ 3449 ret = mlx5_tx_packet_multi_inline(txq, loc, olx); 3450 } else { 3451 /* Proceed with multi-segment SEND w/o inlining. */ 3452 ret = mlx5_tx_packet_multi_send(txq, loc, olx); 3453 } 3454 if (ret == MLX5_TXCMP_CODE_EXIT) 3455 return MLX5_TXCMP_CODE_EXIT; 3456 if (ret == MLX5_TXCMP_CODE_ERROR) 3457 return MLX5_TXCMP_CODE_ERROR; 3458 /* WQE is built, go to the next packet. */ 3459 ++loc->pkts_sent; 3460 --pkts_n; 3461 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3462 return MLX5_TXCMP_CODE_EXIT; 3463 loc->mbuf = *pkts++; 3464 if (pkts_n > 1) 3465 rte_prefetch0(*pkts); 3466 if (likely(NB_SEGS(loc->mbuf) > 1)) 3467 continue; 3468 /* Here ends the series of multi-segment packets. */ 3469 if (MLX5_TXOFF_CONFIG(TSO) && 3470 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3471 return MLX5_TXCMP_CODE_TSO; 3472 return MLX5_TXCMP_CODE_SINGLE; 3473 } 3474 assert(false); 3475 } 3476 3477 /** 3478 * Tx burst function for single-segment packets with TSO. 3479 * Supports all types of Tx offloads, except multi-packets. 3480 * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE. 3481 * Function stops sending if it encounters the multi-segment 3482 * packet or packet without TSO requested. 3483 * 3484 * The routine is responsible for storing processed mbuf 3485 * into elts ring buffer and update elts_head if inline 3486 * offloads is requested due to possible early freeing 3487 * of the inlined mbufs (can not store pkts array in elts 3488 * as a batch). 3489 * 3490 * @param txq 3491 * Pointer to TX queue structure. 3492 * @param[in] pkts 3493 * Packets to transmit. 3494 * @param pkts_n 3495 * Number of packets in array. 3496 * @param loc 3497 * Pointer to burst routine local context. 3498 * @param olx 3499 * Configured Tx offloads mask. It is fully defined at 3500 * compile time and may be used for optimization. 3501 * 3502 * @return 3503 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 
3504 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3505 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3506 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3507 * Local context variables updated. 3508 */ 3509 static __rte_always_inline enum mlx5_txcmp_code 3510 mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq, 3511 struct rte_mbuf **restrict pkts, 3512 unsigned int pkts_n, 3513 struct mlx5_txq_local *restrict loc, 3514 unsigned int olx) 3515 { 3516 assert(loc->elts_free && loc->wqe_free); 3517 assert(pkts_n > loc->pkts_sent); 3518 pkts += loc->pkts_sent + 1; 3519 pkts_n -= loc->pkts_sent; 3520 for (;;) { 3521 struct mlx5_wqe_dseg *restrict dseg; 3522 struct mlx5_wqe *restrict wqe; 3523 unsigned int ds, dlen, hlen, ntcp, vlan = 0; 3524 uint8_t *dptr; 3525 3526 assert(NB_SEGS(loc->mbuf) == 1); 3527 dlen = rte_pktmbuf_data_len(loc->mbuf); 3528 if (MLX5_TXOFF_CONFIG(VLAN) && 3529 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 3530 vlan = sizeof(struct rte_vlan_hdr); 3531 } 3532 /* 3533 * First calculate the WQE size to check 3534 * whether we have enough space in ring buffer. 3535 */ 3536 hlen = loc->mbuf->l2_len + vlan + 3537 loc->mbuf->l3_len + loc->mbuf->l4_len; 3538 if (unlikely((!hlen || !loc->mbuf->tso_segsz))) 3539 return MLX5_TXCMP_CODE_ERROR; 3540 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3541 hlen += loc->mbuf->outer_l2_len + 3542 loc->mbuf->outer_l3_len; 3543 /* Segment must contain all TSO headers. */ 3544 if (unlikely(hlen > MLX5_MAX_TSO_HEADER || 3545 hlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3546 hlen > (dlen + vlan))) 3547 return MLX5_TXCMP_CODE_ERROR; 3548 /* 3549 * Check whether there are enough free WQEBBs: 3550 * - Control Segment 3551 * - Ethernet Segment 3552 * - First Segment of inlined Ethernet data 3553 * - ... data continued ... 3554 * - Finishing Data Segment of pointer type 3555 */ 3556 ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE + 3557 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3558 if (loc->wqe_free < ((ds + 3) / 4)) 3559 return MLX5_TXCMP_CODE_EXIT; 3560 #ifdef MLX5_PMD_SOFT_COUNTERS 3561 /* Update sent data bytes/packets counters. */ 3562 ntcp = (dlen + vlan - hlen + 3563 loc->mbuf->tso_segsz - 1) / 3564 loc->mbuf->tso_segsz; 3565 /* 3566 * One will be added for mbuf itself at the end 3567 * of the mlx5_tx_burst from loc->pkts_sent field. 3568 */ 3569 --ntcp; 3570 txq->stats.opackets += ntcp; 3571 txq->stats.obytes += dlen + vlan + ntcp * hlen; 3572 #endif 3573 /* 3574 * Build the TSO WQE: 3575 * - Control Segment 3576 * - Ethernet Segment with hlen bytes inlined 3577 * - Data Segment of pointer type 3578 */ 3579 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3580 loc->wqe_last = wqe; 3581 mlx5_tx_cseg_init(txq, loc, wqe, ds, 3582 MLX5_OPCODE_TSO, olx); 3583 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx); 3584 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan; 3585 dlen -= hlen - vlan; 3586 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 3587 /* 3588 * WQE is built, update the loop parameters 3589 * and go to the next packet. 3590 */ 3591 txq->wqe_ci += (ds + 3) / 4; 3592 loc->wqe_free -= (ds + 3) / 4; 3593 if (MLX5_TXOFF_CONFIG(INLINE)) 3594 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3595 --loc->elts_free; 3596 ++loc->pkts_sent; 3597 --pkts_n; 3598 /* Request CQE generation if limits are reached. 
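		 * The request is placed into the last built WQE once either
		 * the elts threshold (MLX5_TX_COMP_THRESH) or, with inlining
		 * enabled, the WQE threshold (txq->wqe_thres) is crossed;
		 * see mlx5_tx_request_completion().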
*/ 3599 mlx5_tx_request_completion(txq, loc, false, olx); 3600 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3601 return MLX5_TXCMP_CODE_EXIT; 3602 loc->mbuf = *pkts++; 3603 if (pkts_n > 1) 3604 rte_prefetch0(*pkts); 3605 if (MLX5_TXOFF_CONFIG(MULTI) && 3606 unlikely(NB_SEGS(loc->mbuf) > 1)) 3607 return MLX5_TXCMP_CODE_MULTI; 3608 if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) 3609 return MLX5_TXCMP_CODE_SINGLE; 3610 /* Continue with the next TSO packet. */ 3611 } 3612 assert(false); 3613 } 3614 3615 /** 3616 * Analyze the packet and select the best method to send. 3617 * 3618 * @param txq 3619 * Pointer to TX queue structure. 3620 * @param loc 3621 * Pointer to burst routine local context. 3622 * @param olx 3623 * Configured Tx offloads mask. It is fully defined at 3624 * compile time and may be used for optimization. 3625 * @param newp 3626 * The predefined flag whether do complete check for 3627 * multi-segment packets and TSO. 3628 * 3629 * @return 3630 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3631 * MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO. 3632 * MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND. 3633 * MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW. 3634 */ 3635 static __rte_always_inline enum mlx5_txcmp_code 3636 mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq, 3637 struct mlx5_txq_local *restrict loc, 3638 unsigned int olx, 3639 bool newp) 3640 { 3641 /* Check for multi-segment packet. */ 3642 if (newp && 3643 MLX5_TXOFF_CONFIG(MULTI) && 3644 unlikely(NB_SEGS(loc->mbuf) > 1)) 3645 return MLX5_TXCMP_CODE_MULTI; 3646 /* Check for TSO packet. */ 3647 if (newp && 3648 MLX5_TXOFF_CONFIG(TSO) && 3649 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3650 return MLX5_TXCMP_CODE_TSO; 3651 /* Check if eMPW is enabled at all. */ 3652 if (!MLX5_TXOFF_CONFIG(EMPW)) 3653 return MLX5_TXCMP_CODE_SINGLE; 3654 /* Check if eMPW can be engaged. */ 3655 if (MLX5_TXOFF_CONFIG(VLAN) && 3656 unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) && 3657 (!MLX5_TXOFF_CONFIG(INLINE) || 3658 unlikely((rte_pktmbuf_data_len(loc->mbuf) + 3659 sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) { 3660 /* 3661 * eMPW does not support VLAN insertion offload, 3662 * we have to inline the entire packet but 3663 * packet is too long for inlining. 3664 */ 3665 return MLX5_TXCMP_CODE_SINGLE; 3666 } 3667 return MLX5_TXCMP_CODE_EMPW; 3668 } 3669 3670 /** 3671 * Check the next packet attributes to match with the eMPW batch ones. 3672 * 3673 * @param txq 3674 * Pointer to TX queue structure. 3675 * @param es 3676 * Pointer to Ethernet Segment of eMPW batch. 3677 * @param loc 3678 * Pointer to burst routine local context. 3679 * @param olx 3680 * Configured Tx offloads mask. It is fully defined at 3681 * compile time and may be used for optimization. 3682 * 3683 * @return 3684 * true - packet match with eMPW batch attributes. 3685 * false - no match, eMPW should be restarted. 3686 */ 3687 static __rte_always_inline bool 3688 mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused, 3689 struct mlx5_wqe_eseg *restrict es, 3690 struct mlx5_txq_local *restrict loc, 3691 unsigned int olx) 3692 { 3693 uint8_t swp_flags = 0; 3694 3695 /* Compare the checksum flags, if any. */ 3696 if (MLX5_TXOFF_CONFIG(CSUM) && 3697 txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags) 3698 return false; 3699 /* Compare the Software Parser offsets and flags. 
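 * Any mismatch closes the current eMPW batch and a new one is started.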
*/ 3700 if (MLX5_TXOFF_CONFIG(SWP) && 3701 (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) || 3702 es->swp_flags != swp_flags)) 3703 return false; 3704 /* Fill metadata field if needed. */ 3705 if (MLX5_TXOFF_CONFIG(METADATA) && 3706 es->metadata != (loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 3707 *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0)) 3708 return false; 3709 /* There must be no VLAN packets in eMPW loop. */ 3710 if (MLX5_TXOFF_CONFIG(VLAN)) 3711 assert(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)); 3712 return true; 3713 } 3714 3715 /* 3716 * Update send loop variables and WQE for eMPW loop 3717 * without data inlining. Number of Data Segments is 3718 * equal to the number of sent packets. 3719 * 3720 * @param txq 3721 * Pointer to TX queue structure. 3722 * @param loc 3723 * Pointer to burst routine local context. 3724 * @param ds 3725 * Number of packets/Data Segments/Packets. 3726 * @param slen 3727 * Accumulated statistics, bytes sent 3728 * @param olx 3729 * Configured Tx offloads mask. It is fully defined at 3730 * compile time and may be used for optimization. 3731 * 3732 * @return 3733 * true - packet match with eMPW batch attributes. 3734 * false - no match, eMPW should be restarted. 3735 */ 3736 static __rte_always_inline void 3737 mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq, 3738 struct mlx5_txq_local *restrict loc, 3739 unsigned int ds, 3740 unsigned int slen, 3741 unsigned int olx) 3742 { 3743 assert(!MLX5_TXOFF_CONFIG(INLINE)); 3744 #ifdef MLX5_PMD_SOFT_COUNTERS 3745 /* Update sent data bytes counter. */ 3746 txq->stats.obytes += slen; 3747 #else 3748 (void)slen; 3749 #endif 3750 loc->elts_free -= ds; 3751 loc->pkts_sent += ds; 3752 ds += 2; 3753 loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3754 txq->wqe_ci += (ds + 3) / 4; 3755 loc->wqe_free -= (ds + 3) / 4; 3756 /* Request CQE generation if limits are reached. */ 3757 mlx5_tx_request_completion(txq, loc, false, olx); 3758 } 3759 3760 /* 3761 * Update send loop variables and WQE for eMPW loop 3762 * with data inlining. Gets the size of pushed descriptors 3763 * and data to the WQE. 3764 * 3765 * @param txq 3766 * Pointer to TX queue structure. 3767 * @param loc 3768 * Pointer to burst routine local context. 3769 * @param len 3770 * Total size of descriptor/data in bytes. 3771 * @param slen 3772 * Accumulated statistics, data bytes sent. 3773 * @param olx 3774 * Configured Tx offloads mask. It is fully defined at 3775 * compile time and may be used for optimization. 3776 * 3777 * @return 3778 * true - packet match with eMPW batch attributes. 3779 * false - no match, eMPW should be restarted. 3780 */ 3781 static __rte_always_inline void 3782 mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq, 3783 struct mlx5_txq_local *restrict loc, 3784 unsigned int len, 3785 unsigned int slen, 3786 unsigned int olx __rte_unused) 3787 { 3788 assert(MLX5_TXOFF_CONFIG(INLINE)); 3789 assert((len % MLX5_WSEG_SIZE) == 0); 3790 #ifdef MLX5_PMD_SOFT_COUNTERS 3791 /* Update sent data bytes counter. */ 3792 txq->stats.obytes += slen; 3793 #else 3794 (void)slen; 3795 #endif 3796 len = len / MLX5_WSEG_SIZE + 2; 3797 loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len); 3798 txq->wqe_ci += (len + 3) / 4; 3799 loc->wqe_free -= (len + 3) / 4; 3800 /* Request CQE generation if limits are reached. 
*/ 3801 mlx5_tx_request_completion(txq, loc, false, olx); 3802 } 3803 3804 /** 3805 * The set of Tx burst functions for single-segment packets 3806 * without TSO and with Multi-Packet Writing feature support. 3807 * Supports all types of Tx offloads, except multi-segment packets 3808 * and TSO. 3809 * 3810 * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends 3811 * as many packets per WQE as it can. If eMPW is not configured 3812 * or the packet can not be sent with eMPW (VLAN insertion), the 3813 * ordinary SEND opcode is used and only one packet is placed 3814 * in the WQE. 3815 * 3816 * The functions stop sending if they encounter a multi-segment 3817 * packet or a packet with TSO requested. 3818 * 3819 * The routines are responsible for storing the processed mbuf 3820 * into the elts ring buffer and updating elts_head if the inlining 3821 * offload is requested. Otherwise copying mbufs to elts 3822 * can be postponed and completed at the end of the burst routine. 3823 * 3824 * @param txq 3825 * Pointer to TX queue structure. 3826 * @param[in] pkts 3827 * Packets to transmit. 3828 * @param pkts_n 3829 * Number of packets in array. 3830 * @param loc 3831 * Pointer to burst routine local context. 3832 * @param olx 3833 * Configured Tx offloads mask. It is fully defined at 3834 * compile time and may be used for optimization. 3835 * 3836 * @return 3837 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3838 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3839 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3840 * MLX5_TXCMP_CODE_TSO - TSO packet encountered. 3841 * MLX5_TXCMP_CODE_SINGLE - used inside functions set. 3842 * MLX5_TXCMP_CODE_EMPW - used inside functions set. 3843 * 3844 * Local context variables updated. 3845 * 3846 * 3847 * The routine sends packets with MLX5_OPCODE_EMPW 3848 * without inlining, this is a dedicated optimized branch. 3849 * No VLAN insertion is supported. 3850 */ 3851 static __rte_always_inline enum mlx5_txcmp_code 3852 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq, 3853 struct rte_mbuf **restrict pkts, 3854 unsigned int pkts_n, 3855 struct mlx5_txq_local *restrict loc, 3856 unsigned int olx) 3857 { 3858 /* 3859 * The subroutine is a part of mlx5_tx_burst_single() 3860 * and sends single-segment packets with the eMPW opcode 3861 * without data inlining. 3862 */ 3863 assert(!MLX5_TXOFF_CONFIG(INLINE)); 3864 assert(MLX5_TXOFF_CONFIG(EMPW)); 3865 assert(loc->elts_free && loc->wqe_free); 3866 assert(pkts_n > loc->pkts_sent); 3867 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 3868 pkts += loc->pkts_sent + 1; 3869 pkts_n -= loc->pkts_sent; 3870 for (;;) { 3871 struct mlx5_wqe_dseg *restrict dseg; 3872 struct mlx5_wqe_eseg *restrict eseg; 3873 enum mlx5_txcmp_code ret; 3874 unsigned int part, loop; 3875 unsigned int slen = 0; 3876 3877 next_empw: 3878 assert(NB_SEGS(loc->mbuf) == 1); 3879 part = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS); 3880 if (unlikely(loc->elts_free < part)) { 3881 /* There are not enough elts to save all mbufs. */ 3882 if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS)) 3883 return MLX5_TXCMP_CODE_EXIT; 3884 /* But we are still able to send at least a minimal eMPW. 
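 * Clamp the batch size to the number of free elts.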
*/ 3885 part = loc->elts_free; 3886 } 3887 /* Check whether we have enough WQEs */ 3888 if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) { 3889 if (unlikely(loc->wqe_free < 3890 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 3891 return MLX5_TXCMP_CODE_EXIT; 3892 part = (loc->wqe_free * 4) - 2; 3893 } 3894 if (likely(part > 1)) 3895 rte_prefetch0(*pkts); 3896 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3897 /* 3898 * Build eMPW title WQEBB: 3899 * - Control Segment, eMPW opcode 3900 * - Ethernet Segment, no inline 3901 */ 3902 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2, 3903 MLX5_OPCODE_ENHANCED_MPSW, olx); 3904 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 3905 olx & ~MLX5_TXOFF_CONFIG_VLAN); 3906 eseg = &loc->wqe_last->eseg; 3907 dseg = &loc->wqe_last->dseg[0]; 3908 loop = part; 3909 for (;;) { 3910 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 3911 #ifdef MLX5_PMD_SOFT_COUNTERS 3912 /* Update sent data bytes counter. */ 3913 slen += dlen; 3914 #endif 3915 mlx5_tx_dseg_ptr 3916 (txq, loc, dseg, 3917 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3918 dlen, olx); 3919 if (unlikely(--loop == 0)) 3920 break; 3921 loc->mbuf = *pkts++; 3922 if (likely(loop > 1)) 3923 rte_prefetch0(*pkts); 3924 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 3925 /* 3926 * Unroll the completion code to avoid 3927 * returning variable value - it results in 3928 * unoptimized sequent checking in caller. 3929 */ 3930 if (ret == MLX5_TXCMP_CODE_MULTI) { 3931 part -= loop; 3932 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3933 if (unlikely(!loc->elts_free || 3934 !loc->wqe_free)) 3935 return MLX5_TXCMP_CODE_EXIT; 3936 return MLX5_TXCMP_CODE_MULTI; 3937 } 3938 assert(NB_SEGS(loc->mbuf) == 1); 3939 if (ret == MLX5_TXCMP_CODE_TSO) { 3940 part -= loop; 3941 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3942 if (unlikely(!loc->elts_free || 3943 !loc->wqe_free)) 3944 return MLX5_TXCMP_CODE_EXIT; 3945 return MLX5_TXCMP_CODE_TSO; 3946 } 3947 if (ret == MLX5_TXCMP_CODE_SINGLE) { 3948 part -= loop; 3949 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3950 if (unlikely(!loc->elts_free || 3951 !loc->wqe_free)) 3952 return MLX5_TXCMP_CODE_EXIT; 3953 return MLX5_TXCMP_CODE_SINGLE; 3954 } 3955 if (ret != MLX5_TXCMP_CODE_EMPW) { 3956 assert(false); 3957 part -= loop; 3958 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3959 return MLX5_TXCMP_CODE_ERROR; 3960 } 3961 /* 3962 * Check whether packet parameters coincide 3963 * within assumed eMPW batch: 3964 * - check sum settings 3965 * - metadata value 3966 * - software parser settings 3967 */ 3968 if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) { 3969 assert(loop); 3970 part -= loop; 3971 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3972 if (unlikely(!loc->elts_free || 3973 !loc->wqe_free)) 3974 return MLX5_TXCMP_CODE_EXIT; 3975 pkts_n -= part; 3976 goto next_empw; 3977 } 3978 /* Packet attributes match, continue the same eMPW. */ 3979 ++dseg; 3980 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3981 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3982 } 3983 /* eMPW is built successfully, update loop parameters. */ 3984 assert(!loop); 3985 assert(pkts_n >= part); 3986 #ifdef MLX5_PMD_SOFT_COUNTERS 3987 /* Update sent data bytes counter. */ 3988 txq->stats.obytes += slen; 3989 #endif 3990 loc->elts_free -= part; 3991 loc->pkts_sent += part; 3992 txq->wqe_ci += (2 + part + 3) / 4; 3993 loc->wqe_free -= (2 + part + 3) / 4; 3994 pkts_n -= part; 3995 /* Request CQE generation if limits are reached. 
*/ 3996 mlx5_tx_request_completion(txq, loc, false, olx); 3997 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3998 return MLX5_TXCMP_CODE_EXIT; 3999 loc->mbuf = *pkts++; 4000 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4001 if (unlikely(ret != MLX5_TXCMP_CODE_EMPW)) 4002 return ret; 4003 /* Continue sending eMPW batches. */ 4004 } 4005 assert(false); 4006 } 4007 4008 /** 4009 * The routine sends packets with MLX5_OPCODE_EMPW 4010 * with inlining, optionally supports VLAN insertion. 4011 */ 4012 static __rte_always_inline enum mlx5_txcmp_code 4013 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, 4014 struct rte_mbuf **restrict pkts, 4015 unsigned int pkts_n, 4016 struct mlx5_txq_local *restrict loc, 4017 unsigned int olx) 4018 { 4019 /* 4020 * Subroutine is the part of mlx5_tx_burst_single() 4021 * and sends single-segment packet with eMPW opcode 4022 * with data inlining. 4023 */ 4024 assert(MLX5_TXOFF_CONFIG(INLINE)); 4025 assert(MLX5_TXOFF_CONFIG(EMPW)); 4026 assert(loc->elts_free && loc->wqe_free); 4027 assert(pkts_n > loc->pkts_sent); 4028 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 4029 pkts += loc->pkts_sent + 1; 4030 pkts_n -= loc->pkts_sent; 4031 for (;;) { 4032 struct mlx5_wqe_dseg *restrict dseg; 4033 struct mlx5_wqe_eseg *restrict eseg; 4034 enum mlx5_txcmp_code ret; 4035 unsigned int room, part, nlim; 4036 unsigned int slen = 0; 4037 4038 assert(NB_SEGS(loc->mbuf) == 1); 4039 /* 4040 * Limits the amount of packets in one WQE 4041 * to improve CQE latency generation. 4042 */ 4043 nlim = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS); 4044 /* Check whether we have minimal amount WQEs */ 4045 if (unlikely(loc->wqe_free < 4046 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 4047 return MLX5_TXCMP_CODE_EXIT; 4048 if (likely(pkts_n > 1)) 4049 rte_prefetch0(*pkts); 4050 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4051 /* 4052 * Build eMPW title WQEBB: 4053 * - Control Segment, eMPW opcode, zero DS 4054 * - Ethernet Segment, no inline 4055 */ 4056 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0, 4057 MLX5_OPCODE_ENHANCED_MPSW, olx); 4058 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 4059 olx & ~MLX5_TXOFF_CONFIG_VLAN); 4060 eseg = &loc->wqe_last->eseg; 4061 dseg = &loc->wqe_last->dseg[0]; 4062 room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE, 4063 loc->wqe_free) * MLX5_WQE_SIZE - 4064 MLX5_WQE_CSEG_SIZE - 4065 MLX5_WQE_ESEG_SIZE; 4066 /* Build WQE till we have space, packets and resources. */ 4067 part = room; 4068 for (;;) { 4069 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 4070 uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 4071 unsigned int tlen; 4072 4073 assert(room >= MLX5_WQE_DSEG_SIZE); 4074 assert((room % MLX5_WQE_DSEG_SIZE) == 0); 4075 assert((uintptr_t)dseg < (uintptr_t)txq->wqes_end); 4076 /* 4077 * Some Tx offloads may cause an error if 4078 * packet is not long enough, check against 4079 * assumed minimal length. 4080 */ 4081 if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) { 4082 part -= room; 4083 if (unlikely(!part)) 4084 return MLX5_TXCMP_CODE_ERROR; 4085 /* 4086 * We have some successfully built 4087 * packet Data Segments to send. 4088 */ 4089 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4090 return MLX5_TXCMP_CODE_ERROR; 4091 } 4092 /* Inline or not inline - that's the Question. */ 4093 if (dlen > txq->inlen_empw) 4094 goto pointer_empw; 4095 /* Inline entire packet, optional VLAN insertion. 
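 * The inline Data Segment starts with its byte count field, so tlen
 * accounts for that word in addition to the packet data.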
*/ 4096 tlen = sizeof(dseg->bcount) + dlen; 4097 if (MLX5_TXOFF_CONFIG(VLAN) && 4098 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4099 /* 4100 * The packet length must be checked in 4101 * mlx5_tx_able_to_empw() and packet 4102 * fits into inline length guaranteed. 4103 */ 4104 assert((dlen + sizeof(struct rte_vlan_hdr)) <= 4105 txq->inlen_empw); 4106 tlen += sizeof(struct rte_vlan_hdr); 4107 if (room < tlen) 4108 break; 4109 dseg = mlx5_tx_dseg_vlan(txq, loc, dseg, 4110 dptr, dlen, olx); 4111 #ifdef MLX5_PMD_SOFT_COUNTERS 4112 /* Update sent data bytes counter. */ 4113 slen += sizeof(struct rte_vlan_hdr); 4114 #endif 4115 } else { 4116 if (room < tlen) 4117 break; 4118 dseg = mlx5_tx_dseg_empw(txq, loc, dseg, 4119 dptr, dlen, olx); 4120 } 4121 tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE); 4122 assert(room >= tlen); 4123 room -= tlen; 4124 /* 4125 * Packet data are completely inlined, 4126 * free the packet immediately. 4127 */ 4128 rte_pktmbuf_free_seg(loc->mbuf); 4129 goto next_mbuf; 4130 pointer_empw: 4131 /* 4132 * Not inlinable VLAN packets are 4133 * proceeded outside of this routine. 4134 */ 4135 assert(room >= MLX5_WQE_DSEG_SIZE); 4136 if (MLX5_TXOFF_CONFIG(VLAN)) 4137 assert(!(loc->mbuf->ol_flags & 4138 PKT_TX_VLAN_PKT)); 4139 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 4140 /* We have to store mbuf in elts.*/ 4141 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 4142 room -= MLX5_WQE_DSEG_SIZE; 4143 /* Ring buffer wraparound is checked at the loop end.*/ 4144 ++dseg; 4145 next_mbuf: 4146 #ifdef MLX5_PMD_SOFT_COUNTERS 4147 /* Update sent data bytes counter. */ 4148 slen += dlen; 4149 #endif 4150 loc->pkts_sent++; 4151 loc->elts_free--; 4152 pkts_n--; 4153 if (unlikely(!pkts_n || !loc->elts_free)) { 4154 /* 4155 * We have no resources/packets to 4156 * continue build descriptors. 4157 */ 4158 part -= room; 4159 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4160 return MLX5_TXCMP_CODE_EXIT; 4161 } 4162 loc->mbuf = *pkts++; 4163 if (likely(pkts_n > 1)) 4164 rte_prefetch0(*pkts); 4165 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4166 /* 4167 * Unroll the completion code to avoid 4168 * returning variable value - it results in 4169 * unoptimized sequent checking in caller. 4170 */ 4171 if (ret == MLX5_TXCMP_CODE_MULTI) { 4172 part -= room; 4173 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4174 if (unlikely(!loc->elts_free || 4175 !loc->wqe_free)) 4176 return MLX5_TXCMP_CODE_EXIT; 4177 return MLX5_TXCMP_CODE_MULTI; 4178 } 4179 assert(NB_SEGS(loc->mbuf) == 1); 4180 if (ret == MLX5_TXCMP_CODE_TSO) { 4181 part -= room; 4182 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4183 if (unlikely(!loc->elts_free || 4184 !loc->wqe_free)) 4185 return MLX5_TXCMP_CODE_EXIT; 4186 return MLX5_TXCMP_CODE_TSO; 4187 } 4188 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4189 part -= room; 4190 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4191 if (unlikely(!loc->elts_free || 4192 !loc->wqe_free)) 4193 return MLX5_TXCMP_CODE_EXIT; 4194 return MLX5_TXCMP_CODE_SINGLE; 4195 } 4196 if (ret != MLX5_TXCMP_CODE_EMPW) { 4197 assert(false); 4198 part -= room; 4199 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4200 return MLX5_TXCMP_CODE_ERROR; 4201 } 4202 /* Check if we have minimal room left. 
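 * The batch is closed when the per-WQE packet limit is exhausted or
 * no complete Data Segment fits into the remaining room.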
*/ 4203 nlim--; 4204 if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE)) 4205 break; 4206 /* 4207 * Check whether packet parameters coincide 4208 * within assumed eMPW batch: 4209 * - check sum settings 4210 * - metadata value 4211 * - software parser settings 4212 */ 4213 if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) 4214 break; 4215 /* Packet attributes match, continue the same eMPW. */ 4216 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 4217 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 4218 } 4219 /* 4220 * We get here to close an existing eMPW 4221 * session and start the new one. 4222 */ 4223 assert(pkts_n); 4224 part -= room; 4225 if (unlikely(!part)) 4226 return MLX5_TXCMP_CODE_EXIT; 4227 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4228 if (unlikely(!loc->elts_free || 4229 !loc->wqe_free)) 4230 return MLX5_TXCMP_CODE_EXIT; 4231 /* Continue the loop with new eMPW session. */ 4232 } 4233 assert(false); 4234 } 4235 4236 /** 4237 * The routine sends packets with ordinary MLX5_OPCODE_SEND. 4238 * Data inlining and VLAN insertion are supported. 4239 */ 4240 static __rte_always_inline enum mlx5_txcmp_code 4241 mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, 4242 struct rte_mbuf **restrict pkts, 4243 unsigned int pkts_n, 4244 struct mlx5_txq_local *restrict loc, 4245 unsigned int olx) 4246 { 4247 /* 4248 * Subroutine is the part of mlx5_tx_burst_single() 4249 * and sends single-segment packet with SEND opcode. 4250 */ 4251 assert(loc->elts_free && loc->wqe_free); 4252 assert(pkts_n > loc->pkts_sent); 4253 pkts += loc->pkts_sent + 1; 4254 pkts_n -= loc->pkts_sent; 4255 for (;;) { 4256 struct mlx5_wqe *restrict wqe; 4257 enum mlx5_txcmp_code ret; 4258 4259 assert(NB_SEGS(loc->mbuf) == 1); 4260 if (MLX5_TXOFF_CONFIG(INLINE)) { 4261 unsigned int inlen, vlan = 0; 4262 4263 inlen = rte_pktmbuf_data_len(loc->mbuf); 4264 if (MLX5_TXOFF_CONFIG(VLAN) && 4265 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4266 vlan = sizeof(struct rte_vlan_hdr); 4267 inlen += vlan; 4268 static_assert((sizeof(struct rte_vlan_hdr) + 4269 sizeof(struct rte_ether_hdr)) == 4270 MLX5_ESEG_MIN_INLINE_SIZE, 4271 "invalid min inline data size"); 4272 } 4273 /* 4274 * If inlining is enabled at configuration time 4275 * the limit must be not less than minimal size. 4276 * Otherwise we would do extra check for data 4277 * size to avoid crashes due to length overflow. 4278 */ 4279 assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); 4280 if (inlen <= txq->inlen_send) { 4281 unsigned int seg_n, wqe_n; 4282 4283 rte_prefetch0(rte_pktmbuf_mtod 4284 (loc->mbuf, uint8_t *)); 4285 /* Check against minimal length. */ 4286 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 4287 return MLX5_TXCMP_CODE_ERROR; 4288 /* 4289 * Completely inlined packet data WQE: 4290 * - Control Segment, SEND opcode 4291 * - Ethernet Segment, no VLAN insertion 4292 * - Data inlined, VLAN optionally inserted 4293 * - Alignment to MLX5_WSEG_SIZE 4294 * Have to estimate amount of WQEBBs 4295 */ 4296 seg_n = (inlen + 3 * MLX5_WSEG_SIZE - 4297 MLX5_ESEG_MIN_INLINE_SIZE + 4298 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4299 /* Check if there are enough WQEBBs. 
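 * For example, assuming the usual 16-byte WSEG and 18-byte minimal
 * ESEG inline sizes, a fully inlined 60-byte frame gives
 * seg_n = (60 + 48 - 18 + 15) / 16 = 6, i.e. two WQEBBs.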
*/ 4300 wqe_n = (seg_n + 3) / 4; 4301 if (wqe_n > loc->wqe_free) 4302 return MLX5_TXCMP_CODE_EXIT; 4303 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4304 loc->wqe_last = wqe; 4305 mlx5_tx_cseg_init(txq, loc, wqe, seg_n, 4306 MLX5_OPCODE_SEND, olx); 4307 mlx5_tx_eseg_data(txq, loc, wqe, 4308 vlan, inlen, 0, olx); 4309 txq->wqe_ci += wqe_n; 4310 loc->wqe_free -= wqe_n; 4311 /* 4312 * Packet data are completely inlined, 4313 * free the packet immediately. 4314 */ 4315 rte_pktmbuf_free_seg(loc->mbuf); 4316 } else if (!MLX5_TXOFF_CONFIG(EMPW) && 4317 txq->inlen_mode) { 4318 /* 4319 * If minimal inlining is requested the eMPW 4320 * feature should be disabled due to data is 4321 * inlined into Ethernet Segment, which can 4322 * not contain inlined data for eMPW due to 4323 * segment shared for all packets. 4324 */ 4325 struct mlx5_wqe_dseg *restrict dseg; 4326 unsigned int ds; 4327 uint8_t *dptr; 4328 4329 /* 4330 * The inline-mode settings require 4331 * to inline the specified amount of 4332 * data bytes to the Ethernet Segment. 4333 * We should check the free space in 4334 * WQE ring buffer to inline partially. 4335 */ 4336 assert(txq->inlen_send >= txq->inlen_mode); 4337 assert(inlen > txq->inlen_mode); 4338 assert(txq->inlen_mode >= 4339 MLX5_ESEG_MIN_INLINE_SIZE); 4340 /* 4341 * Check whether there are enough free WQEBBs: 4342 * - Control Segment 4343 * - Ethernet Segment 4344 * - First Segment of inlined Ethernet data 4345 * - ... data continued ... 4346 * - Finishing Data Segment of pointer type 4347 */ 4348 ds = (MLX5_WQE_CSEG_SIZE + 4349 MLX5_WQE_ESEG_SIZE + 4350 MLX5_WQE_DSEG_SIZE + 4351 txq->inlen_mode - 4352 MLX5_ESEG_MIN_INLINE_SIZE + 4353 MLX5_WQE_DSEG_SIZE + 4354 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4355 if (loc->wqe_free < ((ds + 3) / 4)) 4356 return MLX5_TXCMP_CODE_EXIT; 4357 /* 4358 * Build the ordinary SEND WQE: 4359 * - Control Segment 4360 * - Ethernet Segment, inline inlen_mode bytes 4361 * - Data Segment of pointer type 4362 */ 4363 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4364 loc->wqe_last = wqe; 4365 mlx5_tx_cseg_init(txq, loc, wqe, ds, 4366 MLX5_OPCODE_SEND, olx); 4367 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, 4368 txq->inlen_mode, 4369 0, olx); 4370 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4371 txq->inlen_mode - vlan; 4372 inlen -= txq->inlen_mode; 4373 mlx5_tx_dseg_ptr(txq, loc, dseg, 4374 dptr, inlen, olx); 4375 /* 4376 * WQE is built, update the loop parameters 4377 * and got to the next packet. 4378 */ 4379 txq->wqe_ci += (ds + 3) / 4; 4380 loc->wqe_free -= (ds + 3) / 4; 4381 /* We have to store mbuf in elts.*/ 4382 assert(MLX5_TXOFF_CONFIG(INLINE)); 4383 txq->elts[txq->elts_head++ & txq->elts_m] = 4384 loc->mbuf; 4385 --loc->elts_free; 4386 } else { 4387 uint8_t *dptr; 4388 unsigned int dlen; 4389 4390 /* 4391 * Partially inlined packet data WQE, we have 4392 * some space in title WQEBB, we can fill it 4393 * with some packet data. It takes one WQEBB, 4394 * it is available, no extra space check: 4395 * - Control Segment, SEND opcode 4396 * - Ethernet Segment, no VLAN insertion 4397 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data 4398 * - Data Segment, pointer type 4399 * 4400 * We also get here if VLAN insertion is not 4401 * supported by HW, the inline is enabled. 
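 * Inlining the first MLX5_ESEG_MIN_INLINE_SIZE bytes allows the VLAN
 * tag to be built in software inside the inlined Ethernet header.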
4402 */ 4403 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4404 loc->wqe_last = wqe; 4405 mlx5_tx_cseg_init(txq, loc, wqe, 4, 4406 MLX5_OPCODE_SEND, olx); 4407 mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx); 4408 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4409 MLX5_ESEG_MIN_INLINE_SIZE - vlan; 4410 /* 4411 * The length check is performed above, by 4412 * comparing with txq->inlen_send. We should 4413 * not get overflow here. 4414 */ 4415 assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE); 4416 dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE; 4417 mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1], 4418 dptr, dlen, olx); 4419 ++txq->wqe_ci; 4420 --loc->wqe_free; 4421 /* We have to store mbuf in elts.*/ 4422 assert(MLX5_TXOFF_CONFIG(INLINE)); 4423 txq->elts[txq->elts_head++ & txq->elts_m] = 4424 loc->mbuf; 4425 --loc->elts_free; 4426 } 4427 #ifdef MLX5_PMD_SOFT_COUNTERS 4428 /* Update sent data bytes counter. */ 4429 txq->stats.obytes += vlan + 4430 rte_pktmbuf_data_len(loc->mbuf); 4431 #endif 4432 } else { 4433 /* 4434 * No inline at all, it means the CPU cycles saving 4435 * is prioritized at configuration, we should not 4436 * copy any packet data to WQE. 4437 * 4438 * SEND WQE, one WQEBB: 4439 * - Control Segment, SEND opcode 4440 * - Ethernet Segment, optional VLAN, no inline 4441 * - Data Segment, pointer type 4442 */ 4443 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4444 loc->wqe_last = wqe; 4445 mlx5_tx_cseg_init(txq, loc, wqe, 3, 4446 MLX5_OPCODE_SEND, olx); 4447 mlx5_tx_eseg_none(txq, loc, wqe, olx); 4448 mlx5_tx_dseg_ptr 4449 (txq, loc, &wqe->dseg[0], 4450 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 4451 rte_pktmbuf_data_len(loc->mbuf), olx); 4452 ++txq->wqe_ci; 4453 --loc->wqe_free; 4454 /* 4455 * We should not store mbuf pointer in elts 4456 * if no inlining is configured, this is done 4457 * by calling routine in a batch copy. 4458 */ 4459 assert(!MLX5_TXOFF_CONFIG(INLINE)); 4460 --loc->elts_free; 4461 #ifdef MLX5_PMD_SOFT_COUNTERS 4462 /* Update sent data bytes counter. */ 4463 txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf); 4464 if (MLX5_TXOFF_CONFIG(VLAN) && 4465 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 4466 txq->stats.obytes += 4467 sizeof(struct rte_vlan_hdr); 4468 #endif 4469 } 4470 ++loc->pkts_sent; 4471 --pkts_n; 4472 /* Request CQE generation if limits are reached. */ 4473 mlx5_tx_request_completion(txq, loc, false, olx); 4474 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 4475 return MLX5_TXCMP_CODE_EXIT; 4476 loc->mbuf = *pkts++; 4477 if (pkts_n > 1) 4478 rte_prefetch0(*pkts); 4479 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4480 if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE)) 4481 return ret; 4482 } 4483 assert(false); 4484 } 4485 4486 static __rte_always_inline enum mlx5_txcmp_code 4487 mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq, 4488 struct rte_mbuf **restrict pkts, 4489 unsigned int pkts_n, 4490 struct mlx5_txq_local *restrict loc, 4491 unsigned int olx) 4492 { 4493 enum mlx5_txcmp_code ret; 4494 4495 ret = mlx5_tx_able_to_empw(txq, loc, olx, false); 4496 if (ret == MLX5_TXCMP_CODE_SINGLE) 4497 goto ordinary_send; 4498 assert(ret == MLX5_TXCMP_CODE_EMPW); 4499 for (;;) { 4500 /* Optimize for inline/no inline eMPW send. */ 4501 ret = (MLX5_TXOFF_CONFIG(INLINE)) ? 4502 mlx5_tx_burst_empw_inline 4503 (txq, pkts, pkts_n, loc, olx) : 4504 mlx5_tx_burst_empw_simple 4505 (txq, pkts, pkts_n, loc, olx); 4506 if (ret != MLX5_TXCMP_CODE_SINGLE) 4507 return ret; 4508 /* The resources to send one packet should remain. 
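 * The eMPW subroutines return MLX5_TXCMP_CODE_SINGLE only while at
 * least one free elt and one free WQE are left.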
*/ 4509 assert(loc->elts_free && loc->wqe_free); 4510 ordinary_send: 4511 ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx); 4512 assert(ret != MLX5_TXCMP_CODE_SINGLE); 4513 if (ret != MLX5_TXCMP_CODE_EMPW) 4514 return ret; 4515 /* The resources to send one packet should remain. */ 4516 assert(loc->elts_free && loc->wqe_free); 4517 } 4518 } 4519 4520 /** 4521 * DPDK Tx callback template. This is configured template 4522 * used to generate routines optimized for specified offload setup. 4523 * One of this generated functions is chosen at SQ configuration 4524 * time. 4525 * 4526 * @param txq 4527 * Generic pointer to TX queue structure. 4528 * @param[in] pkts 4529 * Packets to transmit. 4530 * @param pkts_n 4531 * Number of packets in array. 4532 * @param olx 4533 * Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx 4534 * values. Should be static to take compile time static configuration 4535 * advantages. 4536 * 4537 * @return 4538 * Number of packets successfully transmitted (<= pkts_n). 4539 */ 4540 static __rte_always_inline uint16_t 4541 mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, 4542 struct rte_mbuf **restrict pkts, 4543 uint16_t pkts_n, 4544 unsigned int olx) 4545 { 4546 struct mlx5_txq_local loc; 4547 enum mlx5_txcmp_code ret; 4548 unsigned int part; 4549 4550 assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4551 assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4552 if (unlikely(!pkts_n)) 4553 return 0; 4554 loc.pkts_sent = 0; 4555 loc.pkts_copy = 0; 4556 loc.wqe_last = NULL; 4557 4558 send_loop: 4559 loc.pkts_loop = loc.pkts_sent; 4560 /* 4561 * Check if there are some CQEs, if any: 4562 * - process an encountered errors 4563 * - process the completed WQEs 4564 * - free related mbufs 4565 * - doorbell the NIC about processed CQEs 4566 */ 4567 rte_prefetch0(*(pkts + loc.pkts_sent)); 4568 mlx5_tx_handle_completion(txq, olx); 4569 /* 4570 * Calculate the number of available resources - elts and WQEs. 4571 * There are two possible different scenarios: 4572 * - no data inlining into WQEs, one WQEBB may contains upto 4573 * four packets, in this case elts become scarce resource 4574 * - data inlining into WQEs, one packet may require multiple 4575 * WQEBBs, the WQEs become the limiting factor. 4576 */ 4577 assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4578 loc.elts_free = txq->elts_s - 4579 (uint16_t)(txq->elts_head - txq->elts_tail); 4580 assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4581 loc.wqe_free = txq->wqe_s - 4582 (uint16_t)(txq->wqe_ci - txq->wqe_pi); 4583 if (unlikely(!loc.elts_free || !loc.wqe_free)) 4584 goto burst_exit; 4585 for (;;) { 4586 /* 4587 * Fetch the packet from array. Usually this is 4588 * the first packet in series of multi/single 4589 * segment packets. 4590 */ 4591 loc.mbuf = *(pkts + loc.pkts_sent); 4592 /* Dedicated branch for multi-segment packets. */ 4593 if (MLX5_TXOFF_CONFIG(MULTI) && 4594 unlikely(NB_SEGS(loc.mbuf) > 1)) { 4595 /* 4596 * Multi-segment packet encountered. 4597 * Hardware is able to process it only 4598 * with SEND/TSO opcodes, one packet 4599 * per WQE, do it in dedicated routine. 4600 */ 4601 enter_send_multi: 4602 assert(loc.pkts_sent >= loc.pkts_copy); 4603 part = loc.pkts_sent - loc.pkts_copy; 4604 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 4605 /* 4606 * There are some single-segment mbufs not 4607 * stored in elts. 
The mbufs must be in the 4608 * same order as WQEs, so we must copy the 4609 * mbufs to elts here, before the coming 4610 * multi-segment packet mbufs is appended. 4611 */ 4612 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, 4613 part, olx); 4614 loc.pkts_copy = loc.pkts_sent; 4615 } 4616 assert(pkts_n > loc.pkts_sent); 4617 ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx); 4618 if (!MLX5_TXOFF_CONFIG(INLINE)) 4619 loc.pkts_copy = loc.pkts_sent; 4620 /* 4621 * These returned code checks are supposed 4622 * to be optimized out due to routine inlining. 4623 */ 4624 if (ret == MLX5_TXCMP_CODE_EXIT) { 4625 /* 4626 * The routine returns this code when 4627 * all packets are sent or there is no 4628 * enough resources to complete request. 4629 */ 4630 break; 4631 } 4632 if (ret == MLX5_TXCMP_CODE_ERROR) { 4633 /* 4634 * The routine returns this code when 4635 * some error in the incoming packets 4636 * format occurred. 4637 */ 4638 txq->stats.oerrors++; 4639 break; 4640 } 4641 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4642 /* 4643 * The single-segment packet was encountered 4644 * in the array, try to send it with the 4645 * best optimized way, possible engaging eMPW. 4646 */ 4647 goto enter_send_single; 4648 } 4649 if (MLX5_TXOFF_CONFIG(TSO) && 4650 ret == MLX5_TXCMP_CODE_TSO) { 4651 /* 4652 * The single-segment TSO packet was 4653 * encountered in the array. 4654 */ 4655 goto enter_send_tso; 4656 } 4657 /* We must not get here. Something is going wrong. */ 4658 assert(false); 4659 txq->stats.oerrors++; 4660 break; 4661 } 4662 /* Dedicated branch for single-segment TSO packets. */ 4663 if (MLX5_TXOFF_CONFIG(TSO) && 4664 unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) { 4665 /* 4666 * TSO might require special way for inlining 4667 * (dedicated parameters) and is sent with 4668 * MLX5_OPCODE_TSO opcode only, provide this 4669 * in dedicated branch. 4670 */ 4671 enter_send_tso: 4672 assert(NB_SEGS(loc.mbuf) == 1); 4673 assert(pkts_n > loc.pkts_sent); 4674 ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx); 4675 /* 4676 * These returned code checks are supposed 4677 * to be optimized out due to routine inlining. 4678 */ 4679 if (ret == MLX5_TXCMP_CODE_EXIT) 4680 break; 4681 if (ret == MLX5_TXCMP_CODE_ERROR) { 4682 txq->stats.oerrors++; 4683 break; 4684 } 4685 if (ret == MLX5_TXCMP_CODE_SINGLE) 4686 goto enter_send_single; 4687 if (MLX5_TXOFF_CONFIG(MULTI) && 4688 ret == MLX5_TXCMP_CODE_MULTI) { 4689 /* 4690 * The multi-segment packet was 4691 * encountered in the array. 4692 */ 4693 goto enter_send_multi; 4694 } 4695 /* We must not get here. Something is going wrong. */ 4696 assert(false); 4697 txq->stats.oerrors++; 4698 break; 4699 } 4700 /* 4701 * The dedicated branch for the single-segment packets 4702 * without TSO. Often these ones can be sent using 4703 * MLX5_OPCODE_EMPW with multiple packets in one WQE. 4704 * The routine builds the WQEs till it encounters 4705 * the TSO or multi-segment packet (in case if these 4706 * offloads are requested at SQ configuration time). 4707 */ 4708 enter_send_single: 4709 assert(pkts_n > loc.pkts_sent); 4710 ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx); 4711 /* 4712 * These returned code checks are supposed 4713 * to be optimized out due to routine inlining. 
4714 */ 4715 if (ret == MLX5_TXCMP_CODE_EXIT) 4716 break; 4717 if (ret == MLX5_TXCMP_CODE_ERROR) { 4718 txq->stats.oerrors++; 4719 break; 4720 } 4721 if (MLX5_TXOFF_CONFIG(MULTI) && 4722 ret == MLX5_TXCMP_CODE_MULTI) { 4723 /* 4724 * The multi-segment packet was 4725 * encountered in the array. 4726 */ 4727 goto enter_send_multi; 4728 } 4729 if (MLX5_TXOFF_CONFIG(TSO) && 4730 ret == MLX5_TXCMP_CODE_TSO) { 4731 /* 4732 * The single-segment TSO packet was 4733 * encountered in the array. 4734 */ 4735 goto enter_send_tso; 4736 } 4737 /* We must not get here. Something is going wrong. */ 4738 assert(false); 4739 txq->stats.oerrors++; 4740 break; 4741 } 4742 /* 4743 * Main Tx loop is completed, do the rest: 4744 * - set completion request if thresholds are reached 4745 * - doorbell the hardware 4746 * - copy the rest of mbufs to elts (if any) 4747 */ 4748 assert(MLX5_TXOFF_CONFIG(INLINE) || loc.pkts_sent >= loc.pkts_copy); 4749 /* Take a shortcut if nothing is sent. */ 4750 if (unlikely(loc.pkts_sent == loc.pkts_loop)) 4751 goto burst_exit; 4752 /* 4753 * Ring QP doorbell immediately after WQE building completion 4754 * to improve latencies. The pure software related data treatment 4755 * can be completed after doorbell. Tx CQEs for this SQ are 4756 * processed in this thread only by the polling. 4757 * 4758 * The rdma core library can map doorbell register in two ways, 4759 * depending on the environment variable "MLX5_SHUT_UP_BF": 4760 * 4761 * - as regular cached memory, the variable is either missing or 4762 * set to zero. This type of mapping may cause the significant 4763 * doorbell register writing latency and requires explicit 4764 * memory write barrier to mitigate this issue and prevent 4765 * write combining. 4766 * 4767 * - as non-cached memory, the variable is present and set to 4768 * not "0" value. This type of mapping may cause performance 4769 * impact under heavy loading conditions but the explicit write 4770 * memory barrier is not required and it may improve core 4771 * performance. 4772 * 4773 * - the legacy behaviour (prior 19.08 release) was to use some 4774 * heuristics to decide whether write memory barrier should 4775 * be performed. This behavior is supported with specifying 4776 * tx_db_nc=2, write barrier is skipped if application 4777 * provides the full recommended burst of packets, it 4778 * supposes the next packets are coming and the write barrier 4779 * will be issued on the next burst (after descriptor writing, 4780 * at least). 4781 */ 4782 mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc && 4783 (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST)); 4784 /* Not all of the mbufs may be stored into elts yet. */ 4785 part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy; 4786 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 4787 /* 4788 * There are some single-segment mbufs not stored in elts. 4789 * It can be only if the last packet was single-segment. 4790 * The copying is gathered into one place due to it is 4791 * a good opportunity to optimize that with SIMD. 4792 * Unfortunately if inlining is enabled the gaps in 4793 * pointer array may happen due to early freeing of the 4794 * inlined mbufs. 
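 * With inlining configured the mbufs are therefore stored into elts
 * one by one at WQE build time instead.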
4795 */ 4796 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx); 4797 loc.pkts_copy = loc.pkts_sent; 4798 } 4799 assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4800 assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4801 if (pkts_n > loc.pkts_sent) { 4802 /* 4803 * If burst size is large there might be no enough CQE 4804 * fetched from completion queue and no enough resources 4805 * freed to send all the packets. 4806 */ 4807 goto send_loop; 4808 } 4809 burst_exit: 4810 #ifdef MLX5_PMD_SOFT_COUNTERS 4811 /* Increment sent packets counter. */ 4812 txq->stats.opackets += loc.pkts_sent; 4813 #endif 4814 return loc.pkts_sent; 4815 } 4816 4817 /* Generate routines with Enhanced Multi-Packet Write support. */ 4818 MLX5_TXOFF_DECL(full_empw, 4819 MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW) 4820 4821 MLX5_TXOFF_DECL(none_empw, 4822 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 4823 4824 MLX5_TXOFF_DECL(md_empw, 4825 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4826 4827 MLX5_TXOFF_DECL(mt_empw, 4828 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4829 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4830 4831 MLX5_TXOFF_DECL(mtsc_empw, 4832 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4833 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4834 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4835 4836 MLX5_TXOFF_DECL(mti_empw, 4837 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4838 MLX5_TXOFF_CONFIG_INLINE | 4839 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4840 4841 MLX5_TXOFF_DECL(mtv_empw, 4842 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4843 MLX5_TXOFF_CONFIG_VLAN | 4844 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4845 4846 MLX5_TXOFF_DECL(mtiv_empw, 4847 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4848 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4849 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4850 4851 MLX5_TXOFF_DECL(sc_empw, 4852 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4853 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4854 4855 MLX5_TXOFF_DECL(sci_empw, 4856 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4857 MLX5_TXOFF_CONFIG_INLINE | 4858 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4859 4860 MLX5_TXOFF_DECL(scv_empw, 4861 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4862 MLX5_TXOFF_CONFIG_VLAN | 4863 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4864 4865 MLX5_TXOFF_DECL(sciv_empw, 4866 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4867 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4868 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4869 4870 MLX5_TXOFF_DECL(i_empw, 4871 MLX5_TXOFF_CONFIG_INLINE | 4872 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4873 4874 MLX5_TXOFF_DECL(v_empw, 4875 MLX5_TXOFF_CONFIG_VLAN | 4876 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4877 4878 MLX5_TXOFF_DECL(iv_empw, 4879 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4880 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4881 4882 /* Generate routines without Enhanced Multi-Packet Write support. 
*/ 4883 MLX5_TXOFF_DECL(full, 4884 MLX5_TXOFF_CONFIG_FULL) 4885 4886 MLX5_TXOFF_DECL(none, 4887 MLX5_TXOFF_CONFIG_NONE) 4888 4889 MLX5_TXOFF_DECL(md, 4890 MLX5_TXOFF_CONFIG_METADATA) 4891 4892 MLX5_TXOFF_DECL(mt, 4893 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4894 MLX5_TXOFF_CONFIG_METADATA) 4895 4896 MLX5_TXOFF_DECL(mtsc, 4897 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4898 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4899 MLX5_TXOFF_CONFIG_METADATA) 4900 4901 MLX5_TXOFF_DECL(mti, 4902 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4903 MLX5_TXOFF_CONFIG_INLINE | 4904 MLX5_TXOFF_CONFIG_METADATA) 4905 4906 4907 MLX5_TXOFF_DECL(mtv, 4908 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4909 MLX5_TXOFF_CONFIG_VLAN | 4910 MLX5_TXOFF_CONFIG_METADATA) 4911 4912 4913 MLX5_TXOFF_DECL(mtiv, 4914 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4915 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4916 MLX5_TXOFF_CONFIG_METADATA) 4917 4918 MLX5_TXOFF_DECL(sc, 4919 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4920 MLX5_TXOFF_CONFIG_METADATA) 4921 4922 MLX5_TXOFF_DECL(sci, 4923 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4924 MLX5_TXOFF_CONFIG_INLINE | 4925 MLX5_TXOFF_CONFIG_METADATA) 4926 4927 4928 MLX5_TXOFF_DECL(scv, 4929 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4930 MLX5_TXOFF_CONFIG_VLAN | 4931 MLX5_TXOFF_CONFIG_METADATA) 4932 4933 4934 MLX5_TXOFF_DECL(sciv, 4935 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4936 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4937 MLX5_TXOFF_CONFIG_METADATA) 4938 4939 MLX5_TXOFF_DECL(i, 4940 MLX5_TXOFF_CONFIG_INLINE | 4941 MLX5_TXOFF_CONFIG_METADATA) 4942 4943 MLX5_TXOFF_DECL(v, 4944 MLX5_TXOFF_CONFIG_VLAN | 4945 MLX5_TXOFF_CONFIG_METADATA) 4946 4947 MLX5_TXOFF_DECL(iv, 4948 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4949 MLX5_TXOFF_CONFIG_METADATA) 4950 4951 /* 4952 * Array of declared and compiled Tx burst function and corresponding 4953 * supported offloads set. The array is used to select the Tx burst 4954 * function for specified offloads set at Tx queue configuration time. 
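 * mlx5_select_tx_function() prefers an exact match and otherwise picks,
 * among the superset entries, the one with the fewest offloads beyond
 * the requested set (never enabling eMPW or inlining unless requested).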
4955 */ 4956 const struct { 4957 eth_tx_burst_t func; 4958 unsigned int olx; 4959 } txoff_func[] = { 4960 MLX5_TXOFF_INFO(full_empw, 4961 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4962 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4963 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4964 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4965 4966 MLX5_TXOFF_INFO(none_empw, 4967 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 4968 4969 MLX5_TXOFF_INFO(md_empw, 4970 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4971 4972 MLX5_TXOFF_INFO(mt_empw, 4973 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4974 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4975 4976 MLX5_TXOFF_INFO(mtsc_empw, 4977 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4978 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4979 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4980 4981 MLX5_TXOFF_INFO(mti_empw, 4982 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4983 MLX5_TXOFF_CONFIG_INLINE | 4984 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4985 4986 MLX5_TXOFF_INFO(mtv_empw, 4987 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4988 MLX5_TXOFF_CONFIG_VLAN | 4989 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4990 4991 MLX5_TXOFF_INFO(mtiv_empw, 4992 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4993 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4994 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4995 4996 MLX5_TXOFF_INFO(sc_empw, 4997 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4998 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4999 5000 MLX5_TXOFF_INFO(sci_empw, 5001 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5002 MLX5_TXOFF_CONFIG_INLINE | 5003 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5004 5005 MLX5_TXOFF_INFO(scv_empw, 5006 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5007 MLX5_TXOFF_CONFIG_VLAN | 5008 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5009 5010 MLX5_TXOFF_INFO(sciv_empw, 5011 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5012 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5013 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5014 5015 MLX5_TXOFF_INFO(i_empw, 5016 MLX5_TXOFF_CONFIG_INLINE | 5017 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5018 5019 MLX5_TXOFF_INFO(v_empw, 5020 MLX5_TXOFF_CONFIG_VLAN | 5021 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5022 5023 MLX5_TXOFF_INFO(iv_empw, 5024 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5025 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 5026 5027 MLX5_TXOFF_INFO(full, 5028 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5029 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5030 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5031 MLX5_TXOFF_CONFIG_METADATA) 5032 5033 MLX5_TXOFF_INFO(none, 5034 MLX5_TXOFF_CONFIG_NONE) 5035 5036 MLX5_TXOFF_INFO(md, 5037 MLX5_TXOFF_CONFIG_METADATA) 5038 5039 MLX5_TXOFF_INFO(mt, 5040 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5041 MLX5_TXOFF_CONFIG_METADATA) 5042 5043 MLX5_TXOFF_INFO(mtsc, 5044 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5045 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5046 MLX5_TXOFF_CONFIG_METADATA) 5047 5048 MLX5_TXOFF_INFO(mti, 5049 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5050 MLX5_TXOFF_CONFIG_INLINE | 5051 MLX5_TXOFF_CONFIG_METADATA) 5052 5053 5054 MLX5_TXOFF_INFO(mtv, 5055 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5056 MLX5_TXOFF_CONFIG_VLAN | 5057 MLX5_TXOFF_CONFIG_METADATA) 5058 5059 MLX5_TXOFF_INFO(mtiv, 
5060 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5061 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5062 MLX5_TXOFF_CONFIG_METADATA) 5063 5064 MLX5_TXOFF_INFO(sc, 5065 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5066 MLX5_TXOFF_CONFIG_METADATA) 5067 5068 MLX5_TXOFF_INFO(sci, 5069 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5070 MLX5_TXOFF_CONFIG_INLINE | 5071 MLX5_TXOFF_CONFIG_METADATA) 5072 5073 MLX5_TXOFF_INFO(scv, 5074 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5075 MLX5_TXOFF_CONFIG_VLAN | 5076 MLX5_TXOFF_CONFIG_METADATA) 5077 5078 MLX5_TXOFF_INFO(sciv, 5079 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5080 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5081 MLX5_TXOFF_CONFIG_METADATA) 5082 5083 MLX5_TXOFF_INFO(i, 5084 MLX5_TXOFF_CONFIG_INLINE | 5085 MLX5_TXOFF_CONFIG_METADATA) 5086 5087 MLX5_TXOFF_INFO(v, 5088 MLX5_TXOFF_CONFIG_VLAN | 5089 MLX5_TXOFF_CONFIG_METADATA) 5090 5091 MLX5_TXOFF_INFO(iv, 5092 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5093 MLX5_TXOFF_CONFIG_METADATA) 5094 }; 5095 5096 /** 5097 * Configure the Tx function to use. The routine checks configured 5098 * Tx offloads for the device and selects appropriate Tx burst 5099 * routine. There are multiple Tx burst routines compiled from 5100 * the same template in the most optimal way for the dedicated 5101 * Tx offloads set. 5102 * 5103 * @param dev 5104 * Pointer to private data structure. 5105 * 5106 * @return 5107 * Pointer to selected Tx burst function. 5108 */ 5109 eth_tx_burst_t 5110 mlx5_select_tx_function(struct rte_eth_dev *dev) 5111 { 5112 struct mlx5_priv *priv = dev->data->dev_private; 5113 struct mlx5_dev_config *config = &priv->config; 5114 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 5115 unsigned int diff = 0, olx = 0, i, m; 5116 5117 static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <= 5118 MLX5_DSEG_MAX, "invalid WQE max size"); 5119 static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE, 5120 "invalid WQE Control Segment size"); 5121 static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE, 5122 "invalid WQE Ethernet Segment size"); 5123 static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE, 5124 "invalid WQE Data Segment size"); 5125 static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE, 5126 "invalid WQE size"); 5127 assert(priv); 5128 if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) { 5129 /* We should support Multi-Segment Packets. */ 5130 olx |= MLX5_TXOFF_CONFIG_MULTI; 5131 } 5132 if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | 5133 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 5134 DEV_TX_OFFLOAD_GRE_TNL_TSO | 5135 DEV_TX_OFFLOAD_IP_TNL_TSO | 5136 DEV_TX_OFFLOAD_UDP_TNL_TSO)) { 5137 /* We should support TCP Send Offload. */ 5138 olx |= MLX5_TXOFF_CONFIG_TSO; 5139 } 5140 if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | 5141 DEV_TX_OFFLOAD_UDP_TNL_TSO | 5142 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { 5143 /* We should support Software Parser for Tunnels. */ 5144 olx |= MLX5_TXOFF_CONFIG_SWP; 5145 } 5146 if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM | 5147 DEV_TX_OFFLOAD_UDP_CKSUM | 5148 DEV_TX_OFFLOAD_TCP_CKSUM | 5149 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { 5150 /* We should support IP/TCP/UDP Checksums. */ 5151 olx |= MLX5_TXOFF_CONFIG_CSUM; 5152 } 5153 if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) { 5154 /* We should support VLAN insertion. */ 5155 olx |= MLX5_TXOFF_CONFIG_VLAN; 5156 } 5157 if (priv->txqs_n && (*priv->txqs)[0]) { 5158 struct mlx5_txq_data *txd = (*priv->txqs)[0]; 5159 5160 if (txd->inlen_send) { 5161 /* 5162 * Check the data inline requirements. 
Data inline 5163 * is enabled on per device basis, we can check 5164 * the first Tx queue only. 5165 * 5166 * If device does not support VLAN insertion in WQE 5167 * and some queues are requested to perform VLAN 5168 * insertion offload than inline must be enabled. 5169 */ 5170 olx |= MLX5_TXOFF_CONFIG_INLINE; 5171 } 5172 } 5173 if (config->mps == MLX5_MPW_ENHANCED && 5174 config->txq_inline_min <= 0) { 5175 /* 5176 * The NIC supports Enhanced Multi-Packet Write. 5177 * We do not support legacy MPW due to its 5178 * hardware related problems, so we just ignore 5179 * legacy MLX5_MPW settings. There should be no 5180 * minimal required inline data. 5181 */ 5182 olx |= MLX5_TXOFF_CONFIG_EMPW; 5183 } 5184 if (rte_flow_dynf_metadata_avail()) { 5185 /* We should support Flow metadata. */ 5186 olx |= MLX5_TXOFF_CONFIG_METADATA; 5187 } 5188 /* 5189 * Scan the routines table to find the minimal 5190 * satisfying routine with requested offloads. 5191 */ 5192 m = RTE_DIM(txoff_func); 5193 for (i = 0; i < RTE_DIM(txoff_func); i++) { 5194 unsigned int tmp; 5195 5196 tmp = txoff_func[i].olx; 5197 if (tmp == olx) { 5198 /* Meets requested offloads exactly.*/ 5199 m = i; 5200 break; 5201 } 5202 if ((tmp & olx) != olx) { 5203 /* Does not meet requested offloads at all. */ 5204 continue; 5205 } 5206 if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW) 5207 /* Do not enable eMPW if not configured. */ 5208 continue; 5209 if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE) 5210 /* Do not enable inlining if not configured. */ 5211 continue; 5212 /* 5213 * Some routine meets the requirements. 5214 * Check whether it has minimal amount 5215 * of not requested offloads. 5216 */ 5217 tmp = __builtin_popcountl(tmp & ~olx); 5218 if (m >= RTE_DIM(txoff_func) || tmp < diff) { 5219 /* First or better match, save and continue. */ 5220 m = i; 5221 diff = tmp; 5222 continue; 5223 } 5224 if (tmp == diff) { 5225 tmp = txoff_func[i].olx ^ txoff_func[m].olx; 5226 if (__builtin_ffsl(txoff_func[i].olx & ~tmp) < 5227 __builtin_ffsl(txoff_func[m].olx & ~tmp)) { 5228 /* Lighter not requested offload. */ 5229 m = i; 5230 } 5231 } 5232 } 5233 if (m >= RTE_DIM(txoff_func)) { 5234 DRV_LOG(DEBUG, "port %u has no selected Tx function" 5235 " for requested offloads %04X", 5236 dev->data->port_id, olx); 5237 return NULL; 5238 } 5239 DRV_LOG(DEBUG, "port %u has selected Tx function" 5240 " supporting offloads %04X/%04X", 5241 dev->data->port_id, olx, txoff_func[m].olx); 5242 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI) 5243 DRV_LOG(DEBUG, "\tMULTI (multi segment)"); 5244 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO) 5245 DRV_LOG(DEBUG, "\tTSO (TCP send offload)"); 5246 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP) 5247 DRV_LOG(DEBUG, "\tSWP (software parser)"); 5248 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM) 5249 DRV_LOG(DEBUG, "\tCSUM (checksum offload)"); 5250 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE) 5251 DRV_LOG(DEBUG, "\tINLIN (inline data)"); 5252 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN) 5253 DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)"); 5254 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA) 5255 DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)"); 5256 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) 5257 DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)"); 5258 return txoff_func[m].func; 5259 } 5260
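
/*
 * Illustrative usage sketch (not part of the driver build, hence the guard
 * below): the routine returned by mlx5_select_tx_function() is installed by
 * the PMD as dev->tx_pkt_burst at device start, so an application reaches one
 * of the generated mlx5_tx_burst_* routines above through rte_eth_tx_burst().
 * The helper name and the port_id/queue_id/mbufs parameters are hypothetical
 * placeholders.
 */
#ifdef MLX5_TX_BURST_USAGE_SKETCH /* intentionally never defined */
#include <rte_ethdev.h>
#include <rte_mbuf.h>

static void
mlx5_tx_burst_usage_sketch(uint16_t port_id, uint16_t queue_id,
			   struct rte_mbuf **mbufs, uint16_t nb)
{
	uint16_t sent = 0;

	/* Retry on partial sends; the PMD may run out of elts/WQEs. */
	while (sent < nb) {
		uint16_t done = rte_eth_tx_burst(port_id, queue_id,
						 mbufs + sent, nb - sent);

		if (done == 0)
			break;
		sent += done;
	}
	/* Drop whatever could not be queued. */
	while (sent < nb)
		rte_pktmbuf_free(mbufs[sent++]);
}
#endif /* MLX5_TX_BURST_USAGE_SKETCH */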