1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2015-2019 Mellanox Technologies, Ltd 4 */ 5 6 #include <assert.h> 7 #include <stdint.h> 8 #include <string.h> 9 #include <stdlib.h> 10 11 /* Verbs header. */ 12 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ 13 #ifdef PEDANTIC 14 #pragma GCC diagnostic ignored "-Wpedantic" 15 #endif 16 #include <infiniband/verbs.h> 17 #include <infiniband/mlx5dv.h> 18 #ifdef PEDANTIC 19 #pragma GCC diagnostic error "-Wpedantic" 20 #endif 21 22 #include <rte_mbuf.h> 23 #include <rte_mempool.h> 24 #include <rte_prefetch.h> 25 #include <rte_common.h> 26 #include <rte_branch_prediction.h> 27 #include <rte_ether.h> 28 #include <rte_cycles.h> 29 30 #include "mlx5.h" 31 #include "mlx5_utils.h" 32 #include "mlx5_rxtx.h" 33 #include "mlx5_autoconf.h" 34 #include "mlx5_defs.h" 35 #include "mlx5_prm.h" 36 37 /* TX burst subroutines return codes. */ 38 enum mlx5_txcmp_code { 39 MLX5_TXCMP_CODE_EXIT = 0, 40 MLX5_TXCMP_CODE_ERROR, 41 MLX5_TXCMP_CODE_SINGLE, 42 MLX5_TXCMP_CODE_MULTI, 43 MLX5_TXCMP_CODE_TSO, 44 MLX5_TXCMP_CODE_EMPW, 45 }; 46 47 /* 48 * These defines are used to configure the Tx burst routine option set 49 * supported at compile time. Options that are not specified are optimized 50 * out because the corresponding 'if' conditions can be evaluated at compile time. 51 * Offloads with a bigger runtime check overhead (requiring more CPU cycles to 52 * skip) should have a bigger index - this is needed to 53 * select the best matching routine when there is no exact match and 54 * some offloads are not actually requested. 55 */ 56 #define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/ 57 #define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/ 58 #define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/ 59 #define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Checksums offloaded. */ 60 #define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */ 61 #define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/ 62 #define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */ 63 #define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/ 64 65 /* The most common offloads groups.
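 * For example, a Tx routine generated with olx = MLX5_TXOFF_CONFIG_FULL
 * (defined just below) keeps every offload branch, while one generated with
 * only (MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM) lets the compiler
 * drop the TSO, SWP, inline and VLAN paths, because the corresponding
 * MLX5_TXOFF_CONFIG() checks fold to constant zero.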
*/ 66 #define MLX5_TXOFF_CONFIG_NONE 0 67 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \ 68 MLX5_TXOFF_CONFIG_TSO | \ 69 MLX5_TXOFF_CONFIG_SWP | \ 70 MLX5_TXOFF_CONFIG_CSUM | \ 71 MLX5_TXOFF_CONFIG_INLINE | \ 72 MLX5_TXOFF_CONFIG_VLAN | \ 73 MLX5_TXOFF_CONFIG_METADATA) 74 75 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask) 76 77 #define MLX5_TXOFF_DECL(func, olx) \ 78 static uint16_t mlx5_tx_burst_##func(void *txq, \ 79 struct rte_mbuf **pkts, \ 80 uint16_t pkts_n) \ 81 { \ 82 return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \ 83 pkts, pkts_n, (olx)); \ 84 } 85 86 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx}, 87 88 static __rte_always_inline uint32_t 89 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); 90 91 static __rte_always_inline int 92 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 93 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe); 94 95 static __rte_always_inline uint32_t 96 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); 97 98 static __rte_always_inline void 99 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 100 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); 101 102 static __rte_always_inline void 103 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 104 const unsigned int strd_n); 105 106 static int 107 mlx5_queue_state_modify(struct rte_eth_dev *dev, 108 struct mlx5_mp_arg_queue_state_modify *sm); 109 110 static inline void 111 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, 112 volatile struct mlx5_cqe *restrict cqe, 113 uint32_t phcsum); 114 115 static inline void 116 mlx5_lro_update_hdr(uint8_t *restrict padd, 117 volatile struct mlx5_cqe *restrict cqe, 118 uint32_t len); 119 120 uint32_t mlx5_ptype_table[] __rte_cache_aligned = { 121 [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ 122 }; 123 124 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned; 125 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned; 126 127 /** 128 * Build a table to translate Rx completion flags to packet type. 129 * 130 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 131 */ 132 void 133 mlx5_set_ptype_table(void) 134 { 135 unsigned int i; 136 uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table; 137 138 /* Last entry must not be overwritten, reserved for errored packet. 
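 * The remaining entries are filled in below. For example, index 0x06
 * (IPv4, TCP, no fragment, no tunnel) resolves to RTE_PTYPE_L2_ETHER |
 * RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP, which is what
 * rxq_cq_to_pkt_type() will report for such a completion.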
*/ 139 for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) 140 (*p)[i] = RTE_PTYPE_UNKNOWN; 141 /* 142 * The index to the array should have: 143 * bit[1:0] = l3_hdr_type 144 * bit[4:2] = l4_hdr_type 145 * bit[5] = ip_frag 146 * bit[6] = tunneled 147 * bit[7] = outer_l3_type 148 */ 149 /* L2 */ 150 (*p)[0x00] = RTE_PTYPE_L2_ETHER; 151 /* L3 */ 152 (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 153 RTE_PTYPE_L4_NONFRAG; 154 (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 155 RTE_PTYPE_L4_NONFRAG; 156 /* Fragmented */ 157 (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 158 RTE_PTYPE_L4_FRAG; 159 (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 160 RTE_PTYPE_L4_FRAG; 161 /* TCP */ 162 (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 163 RTE_PTYPE_L4_TCP; 164 (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 165 RTE_PTYPE_L4_TCP; 166 (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 167 RTE_PTYPE_L4_TCP; 168 (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 169 RTE_PTYPE_L4_TCP; 170 (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 171 RTE_PTYPE_L4_TCP; 172 (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 173 RTE_PTYPE_L4_TCP; 174 /* UDP */ 175 (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 176 RTE_PTYPE_L4_UDP; 177 (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 178 RTE_PTYPE_L4_UDP; 179 /* Repeat with outer_l3_type being set. Just in case. */ 180 (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 181 RTE_PTYPE_L4_NONFRAG; 182 (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 183 RTE_PTYPE_L4_NONFRAG; 184 (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 185 RTE_PTYPE_L4_FRAG; 186 (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 187 RTE_PTYPE_L4_FRAG; 188 (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 189 RTE_PTYPE_L4_TCP; 190 (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 191 RTE_PTYPE_L4_TCP; 192 (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 193 RTE_PTYPE_L4_TCP; 194 (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 195 RTE_PTYPE_L4_TCP; 196 (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 197 RTE_PTYPE_L4_TCP; 198 (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 199 RTE_PTYPE_L4_TCP; 200 (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 201 RTE_PTYPE_L4_UDP; 202 (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 203 RTE_PTYPE_L4_UDP; 204 /* Tunneled - L3 */ 205 (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; 206 (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 207 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 208 RTE_PTYPE_INNER_L4_NONFRAG; 209 (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 210 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 211 RTE_PTYPE_INNER_L4_NONFRAG; 212 (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; 213 (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 214 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 215 RTE_PTYPE_INNER_L4_NONFRAG; 216 (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 217 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 218 RTE_PTYPE_INNER_L4_NONFRAG; 219 /* Tunneled - Fragmented */ 220 (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 221 
RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 222 RTE_PTYPE_INNER_L4_FRAG; 223 (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 224 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 225 RTE_PTYPE_INNER_L4_FRAG; 226 (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 227 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 228 RTE_PTYPE_INNER_L4_FRAG; 229 (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 230 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 231 RTE_PTYPE_INNER_L4_FRAG; 232 /* Tunneled - TCP */ 233 (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 234 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 235 RTE_PTYPE_INNER_L4_TCP; 236 (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 237 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 238 RTE_PTYPE_INNER_L4_TCP; 239 (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 240 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 241 RTE_PTYPE_INNER_L4_TCP; 242 (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 243 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 244 RTE_PTYPE_INNER_L4_TCP; 245 (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 246 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 247 RTE_PTYPE_INNER_L4_TCP; 248 (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 249 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 250 RTE_PTYPE_INNER_L4_TCP; 251 (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 252 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 253 RTE_PTYPE_INNER_L4_TCP; 254 (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 255 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 256 RTE_PTYPE_INNER_L4_TCP; 257 (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 258 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 259 RTE_PTYPE_INNER_L4_TCP; 260 (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 261 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 262 RTE_PTYPE_INNER_L4_TCP; 263 (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 264 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 265 RTE_PTYPE_INNER_L4_TCP; 266 (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 267 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 268 RTE_PTYPE_INNER_L4_TCP; 269 /* Tunneled - UDP */ 270 (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 271 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 272 RTE_PTYPE_INNER_L4_UDP; 273 (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 274 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 275 RTE_PTYPE_INNER_L4_UDP; 276 (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 277 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 278 RTE_PTYPE_INNER_L4_UDP; 279 (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 280 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 281 RTE_PTYPE_INNER_L4_UDP; 282 } 283 284 /** 285 * Build a table to translate packet to checksum type of Verbs. 286 */ 287 void 288 mlx5_set_cksum_table(void) 289 { 290 unsigned int i; 291 uint8_t v; 292 293 /* 294 * The index should have: 295 * bit[0] = PKT_TX_TCP_SEG 296 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM 297 * bit[4] = PKT_TX_IP_CKSUM 298 * bit[8] = PKT_TX_OUTER_IP_CKSUM 299 * bit[9] = tunnel 300 */ 301 for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) { 302 v = 0; 303 if (i & (1 << 9)) { 304 /* Tunneled packet. */ 305 if (i & (1 << 8)) /* Outer IP. */ 306 v |= MLX5_ETH_WQE_L3_CSUM; 307 if (i & (1 << 4)) /* Inner IP. */ 308 v |= MLX5_ETH_WQE_L3_INNER_CSUM; 309 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 310 v |= MLX5_ETH_WQE_L4_INNER_CSUM; 311 } else { 312 /* No tunnel. 
*/ 313 if (i & (1 << 4)) /* IP. */ 314 v |= MLX5_ETH_WQE_L3_CSUM; 315 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 316 v |= MLX5_ETH_WQE_L4_CSUM; 317 } 318 mlx5_cksum_table[i] = v; 319 } 320 } 321 322 /** 323 * Build a table to translate packet type of mbuf to SWP type of Verbs. 324 */ 325 void 326 mlx5_set_swp_types_table(void) 327 { 328 unsigned int i; 329 uint8_t v; 330 331 /* 332 * The index should have: 333 * bit[0:1] = PKT_TX_L4_MASK 334 * bit[4] = PKT_TX_IPV6 335 * bit[8] = PKT_TX_OUTER_IPV6 336 * bit[9] = PKT_TX_OUTER_UDP 337 */ 338 for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) { 339 v = 0; 340 if (i & (1 << 8)) 341 v |= MLX5_ETH_WQE_L3_OUTER_IPV6; 342 if (i & (1 << 9)) 343 v |= MLX5_ETH_WQE_L4_OUTER_UDP; 344 if (i & (1 << 4)) 345 v |= MLX5_ETH_WQE_L3_INNER_IPV6; 346 if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52)) 347 v |= MLX5_ETH_WQE_L4_INNER_UDP; 348 mlx5_swp_types_table[i] = v; 349 } 350 } 351 352 /** 353 * Set Software Parser flags and offsets in Ethernet Segment of WQE. 354 * Flags must be preliminary initialized to zero. 355 * 356 * @param loc 357 * Pointer to burst routine local context. 358 * @param swp_flags 359 * Pointer to store Software Parser flags 360 * @param olx 361 * Configured Tx offloads mask. It is fully defined at 362 * compile time and may be used for optimization. 363 * 364 * @return 365 * Software Parser offsets packed in dword. 366 * Software Parser flags are set by pointer. 367 */ 368 static __rte_always_inline uint32_t 369 txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc, 370 uint8_t *swp_flags, 371 unsigned int olx) 372 { 373 uint64_t ol, tunnel; 374 unsigned int idx, off; 375 uint32_t set; 376 377 if (!MLX5_TXOFF_CONFIG(SWP)) 378 return 0; 379 ol = loc->mbuf->ol_flags; 380 tunnel = ol & PKT_TX_TUNNEL_MASK; 381 /* 382 * Check whether Software Parser is required. 383 * Only customized tunnels may ask for. 384 */ 385 if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP)) 386 return 0; 387 /* 388 * The index should have: 389 * bit[0:1] = PKT_TX_L4_MASK 390 * bit[4] = PKT_TX_IPV6 391 * bit[8] = PKT_TX_OUTER_IPV6 392 * bit[9] = PKT_TX_OUTER_UDP 393 */ 394 idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52; 395 idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0; 396 *swp_flags = mlx5_swp_types_table[idx]; 397 /* 398 * Set offsets for SW parser. Since ConnectX-5, SW parser just 399 * complements HW parser. SW parser starts to engage only if HW parser 400 * can't reach a header. For the older devices, HW parser will not kick 401 * in if any of SWP offsets is set. Therefore, all of the L3 offsets 402 * should be set regardless of HW offload. 403 */ 404 off = loc->mbuf->outer_l2_len; 405 if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT) 406 off += sizeof(struct rte_vlan_hdr); 407 set = (off >> 1) << 8; /* Outer L3 offset. */ 408 off += loc->mbuf->outer_l3_len; 409 if (tunnel == PKT_TX_TUNNEL_UDP) 410 set |= off >> 1; /* Outer L4 offset. */ 411 if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */ 412 const uint64_t csum = ol & PKT_TX_L4_MASK; 413 off += loc->mbuf->l2_len; 414 set |= (off >> 1) << 24; /* Inner L3 offset. */ 415 if (csum == PKT_TX_TCP_CKSUM || 416 csum == PKT_TX_UDP_CKSUM || 417 (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) { 418 off += loc->mbuf->l3_len; 419 set |= (off >> 1) << 16; /* Inner L4 offset. */ 420 } 421 } 422 set = rte_cpu_to_le_32(set); 423 return set; 424 } 425 426 /** 427 * Convert the Checksum offloads to Verbs. 428 * 429 * @param buf 430 * Pointer to the mbuf. 
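 * For example (illustrative, assuming the standard rte_mbuf offload flag
 * layout where PKT_TX_TCP_CKSUM is bit 52 and PKT_TX_IP_CKSUM is bit 54):
 * an untunneled mbuf with ol_flags = PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM
 * yields idx = ((1ULL << 54 | 1ULL << 52) >> 50) = 0x14, and
 * mlx5_cksum_table[0x14] is MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM.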
431 * 432 * @return 433 * Converted checksum flags. 434 */ 435 static __rte_always_inline uint8_t 436 txq_ol_cksum_to_cs(struct rte_mbuf *buf) 437 { 438 uint32_t idx; 439 uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK); 440 const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK | 441 PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM; 442 443 /* 444 * The index should have: 445 * bit[0] = PKT_TX_TCP_SEG 446 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM 447 * bit[4] = PKT_TX_IP_CKSUM 448 * bit[8] = PKT_TX_OUTER_IP_CKSUM 449 * bit[9] = tunnel 450 */ 451 idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9); 452 return mlx5_cksum_table[idx]; 453 } 454 455 /** 456 * Internal function to compute the number of used descriptors in an RX queue. 457 * 458 * @param rxq 459 * The Rx queue. 460 * 461 * @return 462 * The number of used Rx descriptors. 463 */ 464 static uint32_t 465 rx_queue_count(struct mlx5_rxq_data *rxq) 466 { 467 struct rxq_zip *zip = &rxq->zip; 468 volatile struct mlx5_cqe *cqe; 469 const unsigned int cqe_n = (1 << rxq->cqe_n); 470 const unsigned int cqe_cnt = cqe_n - 1; 471 unsigned int cq_ci; 472 unsigned int used; 473 474 /* If we are processing a compressed CQE. */ 475 if (zip->ai) { 476 used = zip->cqe_cnt - zip->ca; 477 cq_ci = zip->cq_ci; 478 } else { 479 used = 0; 480 cq_ci = rxq->cq_ci; 481 } 482 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 483 while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) { 484 int8_t op_own; 485 unsigned int n; 486 487 op_own = cqe->op_own; 488 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) 489 n = rte_be_to_cpu_32(cqe->byte_cnt); 490 else 491 n = 1; 492 cq_ci += n; 493 used += n; 494 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 495 } 496 used = RTE_MIN(used, (1U << rxq->elts_n) - 1); 497 return used; 498 } 499 500 /** 501 * DPDK callback to check the status of an Rx descriptor. 502 * 503 * @param rx_queue 504 * The Rx queue. 505 * @param[in] offset 506 * The index of the descriptor in the ring. 507 * 508 * @return 509 * The status of the Rx descriptor. 510 */ 511 int 512 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) 513 { 514 struct mlx5_rxq_data *rxq = rx_queue; 515 struct mlx5_rxq_ctrl *rxq_ctrl = 516 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 517 struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv); 518 519 if (dev->rx_pkt_burst != mlx5_rx_burst) { 520 rte_errno = ENOTSUP; 521 return -rte_errno; 522 } 523 if (offset >= (1 << rxq->elts_n)) { 524 rte_errno = EINVAL; 525 return -rte_errno; 526 } 527 if (offset < rx_queue_count(rxq)) 528 return RTE_ETH_RX_DESC_DONE; 529 return RTE_ETH_RX_DESC_AVAIL; 530 } 531 532 /** 533 * DPDK callback to get the number of used descriptors in an RX queue. 534 * 535 * @param dev 536 * Pointer to the device structure. 537 * 538 * @param rx_queue_id 539 * The Rx queue. 540 * 541 * @return 542 * The number of used Rx descriptors. 543 * -EINVAL if the queue is invalid 544 */ 545 uint32_t 546 mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id) 547 { 548 struct mlx5_priv *priv = dev->data->dev_private; 549 struct mlx5_rxq_data *rxq; 550 551 if (dev->rx_pkt_burst != mlx5_rx_burst) { 552 rte_errno = ENOTSUP; 553 return -rte_errno; 554 } 555 rxq = (*priv->rxqs)[rx_queue_id]; 556 if (!rxq) { 557 rte_errno = EINVAL; 558 return -rte_errno; 559 } 560 return rx_queue_count(rxq); 561 } 562 563 #define MLX5_SYSTEM_LOG_DIR "/var/log" 564 /** 565 * Dump debug information to log file. 566 * 567 * @param fname 568 * The file name.
569 * @param hex_title 570 * If not NULL this string is printed as a header to the output 571 * and the output will be in hexadecimal view. 572 * @param buf 573 * This is the buffer address to print out. 574 * @param len 575 * The number of bytes to dump out. 576 */ 577 void 578 mlx5_dump_debug_information(const char *fname, const char *hex_title, 579 const void *buf, unsigned int hex_len) 580 { 581 FILE *fd; 582 583 MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname); 584 fd = fopen(path, "a+"); 585 if (!fd) { 586 DRV_LOG(WARNING, "cannot open %s for debug dump\n", 587 path); 588 MKSTR(path2, "./%s", fname); 589 fd = fopen(path2, "a+"); 590 if (!fd) { 591 DRV_LOG(ERR, "cannot open %s for debug dump\n", 592 path2); 593 return; 594 } 595 DRV_LOG(INFO, "New debug dump in file %s\n", path2); 596 } else { 597 DRV_LOG(INFO, "New debug dump in file %s\n", path); 598 } 599 if (hex_title) 600 rte_hexdump(fd, hex_title, buf, hex_len); 601 else 602 fprintf(fd, "%s", (const char *)buf); 603 fprintf(fd, "\n\n\n"); 604 fclose(fd); 605 } 606 607 /** 608 * Move QP from error state to running state and initialize indexes. 609 * 610 * @param txq_ctrl 611 * Pointer to TX queue control structure. 612 * 613 * @return 614 * 0 on success, else -1. 615 */ 616 static int 617 tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl) 618 { 619 struct mlx5_mp_arg_queue_state_modify sm = { 620 .is_wq = 0, 621 .queue_id = txq_ctrl->txq.idx, 622 }; 623 624 if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm)) 625 return -1; 626 txq_ctrl->txq.wqe_ci = 0; 627 txq_ctrl->txq.wqe_pi = 0; 628 txq_ctrl->txq.elts_comp = 0; 629 return 0; 630 } 631 632 /* Return 1 if the error CQE is signed otherwise, sign it and return 0. */ 633 static int 634 check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe) 635 { 636 static const uint8_t magic[] = "seen"; 637 int ret = 1; 638 unsigned int i; 639 640 for (i = 0; i < sizeof(magic); ++i) 641 if (!ret || err_cqe->rsvd1[i] != magic[i]) { 642 ret = 0; 643 err_cqe->rsvd1[i] = magic[i]; 644 } 645 return ret; 646 } 647 648 /** 649 * Handle error CQE. 650 * 651 * @param txq 652 * Pointer to TX queue structure. 653 * @param error_cqe 654 * Pointer to the error CQE. 655 * 656 * @return 657 * Negative value if queue recovery failed, 658 * the last Tx buffer element to free otherwise. 
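 * Illustrative usage (a simplified sketch of what
 * mlx5_tx_handle_completion() does further below):
 *
 *   int tail = mlx5_tx_error_cqe_handle(txq, err_cqe);
 *   if (tail >= 0)
 *           mlx5_tx_free_elts(txq, (uint16_t)tail, olx);
 *   // tail < 0: recovery failed, keep the queue as is and retry later.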
659 */ 660 int 661 mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, 662 volatile struct mlx5_err_cqe *err_cqe) 663 { 664 if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { 665 const uint16_t wqe_m = ((1 << txq->wqe_n) - 1); 666 struct mlx5_txq_ctrl *txq_ctrl = 667 container_of(txq, struct mlx5_txq_ctrl, txq); 668 uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter); 669 int seen = check_err_cqe_seen(err_cqe); 670 671 if (!seen && txq_ctrl->dump_file_n < 672 txq_ctrl->priv->config.max_dump_files_num) { 673 MKSTR(err_str, "Unexpected CQE error syndrome " 674 "0x%02x CQN = %u SQN = %u wqe_counter = %u " 675 "wq_ci = %u cq_ci = %u", err_cqe->syndrome, 676 txq->cqe_s, txq->qp_num_8s >> 8, 677 rte_be_to_cpu_16(err_cqe->wqe_counter), 678 txq->wqe_ci, txq->cq_ci); 679 MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u", 680 PORT_ID(txq_ctrl->priv), txq->idx, 681 txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc()); 682 mlx5_dump_debug_information(name, NULL, err_str, 0); 683 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 684 (const void *)((uintptr_t) 685 txq->cqes), 686 sizeof(*err_cqe) * 687 (1 << txq->cqe_n)); 688 mlx5_dump_debug_information(name, "MLX5 Error SQ:", 689 (const void *)((uintptr_t) 690 txq->wqes), 691 MLX5_WQE_SIZE * 692 (1 << txq->wqe_n)); 693 txq_ctrl->dump_file_n++; 694 } 695 if (!seen) 696 /* 697 * Count errors in WQEs units. 698 * Later it can be improved to count error packets, 699 * for example, by SQ parsing to find how much packets 700 * should be counted for each WQE. 701 */ 702 txq->stats.oerrors += ((txq->wqe_ci & wqe_m) - 703 new_wqe_pi) & wqe_m; 704 if (tx_recover_qp(txq_ctrl) == 0) { 705 txq->cq_ci++; 706 /* Release all the remaining buffers. */ 707 return txq->elts_head; 708 } 709 /* Recovering failed - try again later on the same WQE. */ 710 return -1; 711 } else { 712 txq->cq_ci++; 713 } 714 /* Do not release buffers. */ 715 return txq->elts_tail; 716 } 717 718 /** 719 * Translate RX completion flags to packet type. 720 * 721 * @param[in] rxq 722 * Pointer to RX queue structure. 723 * @param[in] cqe 724 * Pointer to CQE. 725 * 726 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 727 * 728 * @return 729 * Packet type for struct rte_mbuf. 730 */ 731 static inline uint32_t 732 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) 733 { 734 uint8_t idx; 735 uint8_t pinfo = cqe->pkt_info; 736 uint16_t ptype = cqe->hdr_type_etc; 737 738 /* 739 * The index to the array should have: 740 * bit[1:0] = l3_hdr_type 741 * bit[4:2] = l4_hdr_type 742 * bit[5] = ip_frag 743 * bit[6] = tunneled 744 * bit[7] = outer_l3_type 745 */ 746 idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); 747 return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); 748 } 749 750 /** 751 * Initialize Rx WQ and indexes. 752 * 753 * @param[in] rxq 754 * Pointer to RX queue structure. 
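 * @note
 *   The elts/mprq_bufs arrays must already be populated with buffers;
 *   this routine only rebuilds the WQEs and resets the queue indexes.
 *   It is invoked, for example, by mlx5_rx_err_handle() once the WQ has
 *   been moved back to the ready state.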
755 */ 756 void 757 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) 758 { 759 const unsigned int wqe_n = 1 << rxq->elts_n; 760 unsigned int i; 761 762 for (i = 0; (i != wqe_n); ++i) { 763 volatile struct mlx5_wqe_data_seg *scat; 764 uintptr_t addr; 765 uint32_t byte_count; 766 767 if (mlx5_rxq_mprq_enabled(rxq)) { 768 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i]; 769 770 scat = &((volatile struct mlx5_wqe_mprq *) 771 rxq->wqes)[i].dseg; 772 addr = (uintptr_t)mlx5_mprq_buf_addr(buf, 773 1 << rxq->strd_num_n); 774 byte_count = (1 << rxq->strd_sz_n) * 775 (1 << rxq->strd_num_n); 776 } else { 777 struct rte_mbuf *buf = (*rxq->elts)[i]; 778 779 scat = &((volatile struct mlx5_wqe_data_seg *) 780 rxq->wqes)[i]; 781 addr = rte_pktmbuf_mtod(buf, uintptr_t); 782 byte_count = DATA_LEN(buf); 783 } 784 /* scat->addr must be able to store a pointer. */ 785 assert(sizeof(scat->addr) >= sizeof(uintptr_t)); 786 *scat = (struct mlx5_wqe_data_seg){ 787 .addr = rte_cpu_to_be_64(addr), 788 .byte_count = rte_cpu_to_be_32(byte_count), 789 .lkey = mlx5_rx_addr2mr(rxq, addr), 790 }; 791 } 792 rxq->consumed_strd = 0; 793 rxq->decompressed = 0; 794 rxq->rq_pi = 0; 795 rxq->zip = (struct rxq_zip){ 796 .ai = 0, 797 }; 798 /* Update doorbell counter. */ 799 rxq->rq_ci = wqe_n >> rxq->sges_n; 800 rte_cio_wmb(); 801 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 802 } 803 804 /** 805 * Modify a Verbs/DevX queue state. 806 * This must be called from the primary process. 807 * 808 * @param dev 809 * Pointer to Ethernet device. 810 * @param sm 811 * State modify request parameters. 812 * 813 * @return 814 * 0 in case of success else non-zero value and rte_errno is set. 815 */ 816 int 817 mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, 818 const struct mlx5_mp_arg_queue_state_modify *sm) 819 { 820 int ret; 821 struct mlx5_priv *priv = dev->data->dev_private; 822 823 if (sm->is_wq) { 824 struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id]; 825 struct mlx5_rxq_ctrl *rxq_ctrl = 826 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 827 828 if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { 829 struct ibv_wq_attr mod = { 830 .attr_mask = IBV_WQ_ATTR_STATE, 831 .wq_state = sm->state, 832 }; 833 834 ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); 835 } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. 
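 * The block below translates the requested Verbs WQ state into a DevX
 * RQ state transition: rq_attr.rq_state carries the expected current
 * state and rq_attr.state the target state.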
*/ 836 struct mlx5_devx_modify_rq_attr rq_attr; 837 838 memset(&rq_attr, 0, sizeof(rq_attr)); 839 if (sm->state == IBV_WQS_RESET) { 840 rq_attr.rq_state = MLX5_RQC_STATE_ERR; 841 rq_attr.state = MLX5_RQC_STATE_RST; 842 } else if (sm->state == IBV_WQS_RDY) { 843 rq_attr.rq_state = MLX5_RQC_STATE_RST; 844 rq_attr.state = MLX5_RQC_STATE_RDY; 845 } else if (sm->state == IBV_WQS_ERR) { 846 rq_attr.rq_state = MLX5_RQC_STATE_RDY; 847 rq_attr.state = MLX5_RQC_STATE_ERR; 848 } 849 ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, 850 &rq_attr); 851 } 852 if (ret) { 853 DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s\n", 854 sm->state, strerror(errno)); 855 rte_errno = errno; 856 return ret; 857 } 858 } else { 859 struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id]; 860 struct mlx5_txq_ctrl *txq_ctrl = 861 container_of(txq, struct mlx5_txq_ctrl, txq); 862 struct ibv_qp_attr mod = { 863 .qp_state = IBV_QPS_RESET, 864 .port_num = (uint8_t)priv->ibv_port, 865 }; 866 struct ibv_qp *qp = txq_ctrl->ibv->qp; 867 868 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 869 if (ret) { 870 DRV_LOG(ERR, "Cannot change the Tx QP state to RESET " 871 "%s\n", strerror(errno)); 872 rte_errno = errno; 873 return ret; 874 } 875 mod.qp_state = IBV_QPS_INIT; 876 ret = mlx5_glue->modify_qp(qp, &mod, 877 (IBV_QP_STATE | IBV_QP_PORT)); 878 if (ret) { 879 DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s\n", 880 strerror(errno)); 881 rte_errno = errno; 882 return ret; 883 } 884 mod.qp_state = IBV_QPS_RTR; 885 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 886 if (ret) { 887 DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s\n", 888 strerror(errno)); 889 rte_errno = errno; 890 return ret; 891 } 892 mod.qp_state = IBV_QPS_RTS; 893 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 894 if (ret) { 895 DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s\n", 896 strerror(errno)); 897 rte_errno = errno; 898 return ret; 899 } 900 } 901 return 0; 902 } 903 904 /** 905 * Modify a Verbs queue state. 906 * 907 * @param dev 908 * Pointer to Ethernet device. 909 * @param sm 910 * State modify request parameters. 911 * 912 * @return 913 * 0 in case of success else non-zero value. 914 */ 915 static int 916 mlx5_queue_state_modify(struct rte_eth_dev *dev, 917 struct mlx5_mp_arg_queue_state_modify *sm) 918 { 919 int ret = 0; 920 921 switch (rte_eal_process_type()) { 922 case RTE_PROC_PRIMARY: 923 ret = mlx5_queue_state_modify_primary(dev, sm); 924 break; 925 case RTE_PROC_SECONDARY: 926 ret = mlx5_mp_req_queue_state_modify(dev, sm); 927 break; 928 default: 929 break; 930 } 931 return ret; 932 } 933 934 /** 935 * Handle a Rx error. 936 * The function inserts the RQ state to reset when the first error CQE is 937 * shown, then drains the CQ by the caller function loop. When the CQ is empty, 938 * it moves the RQ state to ready and initializes the RQ. 939 * Next CQE identification and error counting are in the caller responsibility. 940 * 941 * @param[in] rxq 942 * Pointer to RX queue structure. 943 * @param[in] vec 944 * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ. 945 * 0 when called from non-vectorized Rx burst. 946 * 947 * @return 948 * -1 in case of recovery error, otherwise the CQE status. 
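 * Illustrative caller pattern (this is how mlx5_rx_poll_len() below
 * consumes the return value):
 *
 *   ret = mlx5_rx_err_handle(rxq, 0);
 *   if (ret == MLX5_CQE_STATUS_HW_OWN || ret == -1)
 *           return 0; // nothing can be received right now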
949 */ 950 int 951 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) 952 { 953 const uint16_t cqe_n = 1 << rxq->cqe_n; 954 const uint16_t cqe_mask = cqe_n - 1; 955 const unsigned int wqe_n = 1 << rxq->elts_n; 956 struct mlx5_rxq_ctrl *rxq_ctrl = 957 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 958 union { 959 volatile struct mlx5_cqe *cqe; 960 volatile struct mlx5_err_cqe *err_cqe; 961 } u = { 962 .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask], 963 }; 964 struct mlx5_mp_arg_queue_state_modify sm; 965 int ret; 966 967 switch (rxq->err_state) { 968 case MLX5_RXQ_ERR_STATE_NO_ERROR: 969 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET; 970 /* Fall-through */ 971 case MLX5_RXQ_ERR_STATE_NEED_RESET: 972 sm.is_wq = 1; 973 sm.queue_id = rxq->idx; 974 sm.state = IBV_WQS_RESET; 975 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm)) 976 return -1; 977 if (rxq_ctrl->dump_file_n < 978 rxq_ctrl->priv->config.max_dump_files_num) { 979 MKSTR(err_str, "Unexpected CQE error syndrome " 980 "0x%02x CQN = %u RQN = %u wqe_counter = %u" 981 " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome, 982 rxq->cqn, rxq_ctrl->wqn, 983 rte_be_to_cpu_16(u.err_cqe->wqe_counter), 984 rxq->rq_ci << rxq->sges_n, rxq->cq_ci); 985 MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u", 986 rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc()); 987 mlx5_dump_debug_information(name, NULL, err_str, 0); 988 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 989 (const void *)((uintptr_t) 990 rxq->cqes), 991 sizeof(*u.cqe) * cqe_n); 992 mlx5_dump_debug_information(name, "MLX5 Error RQ:", 993 (const void *)((uintptr_t) 994 rxq->wqes), 995 16 * wqe_n); 996 rxq_ctrl->dump_file_n++; 997 } 998 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY; 999 /* Fall-through */ 1000 case MLX5_RXQ_ERR_STATE_NEED_READY: 1001 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); 1002 if (ret == MLX5_CQE_STATUS_HW_OWN) { 1003 rte_cio_wmb(); 1004 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1005 rte_cio_wmb(); 1006 /* 1007 * The RQ consumer index must be zeroed while moving 1008 * from RESET state to RDY state. 1009 */ 1010 *rxq->rq_db = rte_cpu_to_be_32(0); 1011 rte_cio_wmb(); 1012 sm.is_wq = 1; 1013 sm.queue_id = rxq->idx; 1014 sm.state = IBV_WQS_RDY; 1015 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), 1016 &sm)) 1017 return -1; 1018 if (vec) { 1019 const uint16_t q_mask = wqe_n - 1; 1020 uint16_t elt_idx; 1021 struct rte_mbuf **elt; 1022 int i; 1023 unsigned int n = wqe_n - (rxq->rq_ci - 1024 rxq->rq_pi); 1025 1026 for (i = 0; i < (int)n; ++i) { 1027 elt_idx = (rxq->rq_ci + i) & q_mask; 1028 elt = &(*rxq->elts)[elt_idx]; 1029 *elt = rte_mbuf_raw_alloc(rxq->mp); 1030 if (!*elt) { 1031 for (i--; i >= 0; --i) { 1032 elt_idx = (rxq->rq_ci + 1033 i) & q_mask; 1034 elt = &(*rxq->elts) 1035 [elt_idx]; 1036 rte_pktmbuf_free_seg 1037 (*elt); 1038 } 1039 return -1; 1040 } 1041 } 1042 for (i = 0; i < (int)wqe_n; ++i) { 1043 elt = &(*rxq->elts)[i]; 1044 DATA_LEN(*elt) = 1045 (uint16_t)((*elt)->buf_len - 1046 rte_pktmbuf_headroom(*elt)); 1047 } 1048 /* Padding with a fake mbuf for vec Rx. */ 1049 for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) 1050 (*rxq->elts)[wqe_n + i] = 1051 &rxq->fake_mbuf; 1052 } 1053 mlx5_rxq_initialize(rxq); 1054 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 1055 } 1056 return ret; 1057 default: 1058 return -1; 1059 } 1060 } 1061 1062 /** 1063 * Get size of the next packet for a given CQE. For compressed CQEs, the 1064 * consumer index is updated only once all packets of the current one have 1065 * been processed. 
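 * A compressed CQE opens a mini-CQE session: cqe->byte_cnt holds the
 * number of mini-CQEs, the first call returns the size taken from
 * mini-CQE[0], and subsequent calls walk zip->ai through the mini
 * arrays. CQEs covered by the session are invalidated as they are
 * consumed, and rxq->cq_ci only jumps to zip->cq_ci once the whole
 * session has been processed.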
1066 * 1067 * @param rxq 1068 * Pointer to RX queue. 1069 * @param cqe 1070 * CQE to process. 1071 * @param[out] mcqe 1072 * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not 1073 * written. 1074 * 1075 * @return 1076 * 0 in case of empty CQE, otherwise the packet size in bytes. 1077 */ 1078 static inline int 1079 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 1080 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe) 1081 { 1082 struct rxq_zip *zip = &rxq->zip; 1083 uint16_t cqe_n = cqe_cnt + 1; 1084 int len; 1085 uint16_t idx, end; 1086 1087 do { 1088 len = 0; 1089 /* Process compressed data in the CQE and mini arrays. */ 1090 if (zip->ai) { 1091 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1092 (volatile struct mlx5_mini_cqe8 (*)[8]) 1093 (uintptr_t)(&(*rxq->cqes)[zip->ca & 1094 cqe_cnt].pkt_info); 1095 1096 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); 1097 *mcqe = &(*mc)[zip->ai & 7]; 1098 if ((++zip->ai & 7) == 0) { 1099 /* Invalidate consumed CQEs */ 1100 idx = zip->ca; 1101 end = zip->na; 1102 while (idx != end) { 1103 (*rxq->cqes)[idx & cqe_cnt].op_own = 1104 MLX5_CQE_INVALIDATE; 1105 ++idx; 1106 } 1107 /* 1108 * Increment consumer index to skip the number 1109 * of CQEs consumed. Hardware leaves holes in 1110 * the CQ ring for software use. 1111 */ 1112 zip->ca = zip->na; 1113 zip->na += 8; 1114 } 1115 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 1116 /* Invalidate the rest */ 1117 idx = zip->ca; 1118 end = zip->cq_ci; 1119 1120 while (idx != end) { 1121 (*rxq->cqes)[idx & cqe_cnt].op_own = 1122 MLX5_CQE_INVALIDATE; 1123 ++idx; 1124 } 1125 rxq->cq_ci = zip->cq_ci; 1126 zip->ai = 0; 1127 } 1128 /* 1129 * No compressed data, get next CQE and verify if it is 1130 * compressed. 1131 */ 1132 } else { 1133 int ret; 1134 int8_t op_own; 1135 1136 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 1137 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 1138 if (unlikely(ret == MLX5_CQE_STATUS_ERR || 1139 rxq->err_state)) { 1140 ret = mlx5_rx_err_handle(rxq, 0); 1141 if (ret == MLX5_CQE_STATUS_HW_OWN || 1142 ret == -1) 1143 return 0; 1144 } else { 1145 return 0; 1146 } 1147 } 1148 ++rxq->cq_ci; 1149 op_own = cqe->op_own; 1150 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 1151 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1152 (volatile struct mlx5_mini_cqe8 (*)[8]) 1153 (uintptr_t)(&(*rxq->cqes) 1154 [rxq->cq_ci & 1155 cqe_cnt].pkt_info); 1156 1157 /* Fix endianness. */ 1158 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); 1159 /* 1160 * Current mini array position is the one 1161 * returned by check_cqe64(). 1162 * 1163 * If completion comprises several mini arrays, 1164 * as a special case the second one is located 1165 * 7 CQEs after the initial CQE instead of 8 1166 * for subsequent ones. 1167 */ 1168 zip->ca = rxq->cq_ci; 1169 zip->na = zip->ca + 7; 1170 /* Compute the next non compressed CQE. */ 1171 --rxq->cq_ci; 1172 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 1173 /* Get packet size to return. 
*/ 1174 len = rte_be_to_cpu_32((*mc)[0].byte_cnt); 1175 *mcqe = &(*mc)[0]; 1176 zip->ai = 1; 1177 /* Prefetch all to be invalidated */ 1178 idx = zip->ca; 1179 end = zip->cq_ci; 1180 while (idx != end) { 1181 rte_prefetch0(&(*rxq->cqes)[(idx) & 1182 cqe_cnt]); 1183 ++idx; 1184 } 1185 } else { 1186 len = rte_be_to_cpu_32(cqe->byte_cnt); 1187 } 1188 } 1189 if (unlikely(rxq->err_state)) { 1190 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1191 ++rxq->stats.idropped; 1192 } else { 1193 return len; 1194 } 1195 } while (1); 1196 } 1197 1198 /** 1199 * Translate RX completion flags to offload flags. 1200 * 1201 * @param[in] cqe 1202 * Pointer to CQE. 1203 * 1204 * @return 1205 * Offload flags (ol_flags) for struct rte_mbuf. 1206 */ 1207 static inline uint32_t 1208 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) 1209 { 1210 uint32_t ol_flags = 0; 1211 uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); 1212 1213 ol_flags = 1214 TRANSPOSE(flags, 1215 MLX5_CQE_RX_L3_HDR_VALID, 1216 PKT_RX_IP_CKSUM_GOOD) | 1217 TRANSPOSE(flags, 1218 MLX5_CQE_RX_L4_HDR_VALID, 1219 PKT_RX_L4_CKSUM_GOOD); 1220 return ol_flags; 1221 } 1222 1223 /** 1224 * Fill in mbuf fields from RX completion flags. 1225 * Note that pkt->ol_flags should be initialized outside of this function. 1226 * 1227 * @param rxq 1228 * Pointer to RX queue. 1229 * @param pkt 1230 * mbuf to fill. 1231 * @param cqe 1232 * CQE to process. 1233 * @param rss_hash_res 1234 * Packet RSS Hash result. 1235 */ 1236 static inline void 1237 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 1238 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) 1239 { 1240 /* Update packet information. */ 1241 pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); 1242 if (rss_hash_res && rxq->rss_hash) { 1243 pkt->hash.rss = rss_hash_res; 1244 pkt->ol_flags |= PKT_RX_RSS_HASH; 1245 } 1246 if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { 1247 pkt->ol_flags |= PKT_RX_FDIR; 1248 if (cqe->sop_drop_qpn != 1249 rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { 1250 uint32_t mark = cqe->sop_drop_qpn; 1251 1252 pkt->ol_flags |= PKT_RX_FDIR_ID; 1253 pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); 1254 } 1255 } 1256 if (rxq->csum) 1257 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); 1258 if (rxq->vlan_strip && 1259 (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { 1260 pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; 1261 pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); 1262 } 1263 if (rxq->hw_timestamp) { 1264 pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); 1265 pkt->ol_flags |= PKT_RX_TIMESTAMP; 1266 } 1267 } 1268 1269 /** 1270 * DPDK callback for RX. 1271 * 1272 * @param dpdk_rxq 1273 * Generic pointer to RX queue structure. 1274 * @param[out] pkts 1275 * Array to store received packets. 1276 * @param pkts_n 1277 * Maximum number of packets in array. 1278 * 1279 * @return 1280 * Number of packets successfully received (<= pkts_n). 1281 */ 1282 uint16_t 1283 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1284 { 1285 struct mlx5_rxq_data *rxq = dpdk_rxq; 1286 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 1287 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 1288 const unsigned int sges_n = rxq->sges_n; 1289 struct rte_mbuf *pkt = NULL; 1290 struct rte_mbuf *seg = NULL; 1291 volatile struct mlx5_cqe *cqe = 1292 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1293 unsigned int i = 0; 1294 unsigned int rq_ci = rxq->rq_ci << sges_n; 1295 int len = 0; /* keep its value across iterations. 
*/ 1296 1297 while (pkts_n) { 1298 unsigned int idx = rq_ci & wqe_cnt; 1299 volatile struct mlx5_wqe_data_seg *wqe = 1300 &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; 1301 struct rte_mbuf *rep = (*rxq->elts)[idx]; 1302 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1303 uint32_t rss_hash_res; 1304 1305 if (pkt) 1306 NEXT(seg) = rep; 1307 seg = rep; 1308 rte_prefetch0(seg); 1309 rte_prefetch0(cqe); 1310 rte_prefetch0(wqe); 1311 rep = rte_mbuf_raw_alloc(rxq->mp); 1312 if (unlikely(rep == NULL)) { 1313 ++rxq->stats.rx_nombuf; 1314 if (!pkt) { 1315 /* 1316 * no buffers before we even started, 1317 * bail out silently. 1318 */ 1319 break; 1320 } 1321 while (pkt != seg) { 1322 assert(pkt != (*rxq->elts)[idx]); 1323 rep = NEXT(pkt); 1324 NEXT(pkt) = NULL; 1325 NB_SEGS(pkt) = 1; 1326 rte_mbuf_raw_free(pkt); 1327 pkt = rep; 1328 } 1329 break; 1330 } 1331 if (!pkt) { 1332 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1333 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe); 1334 if (!len) { 1335 rte_mbuf_raw_free(rep); 1336 break; 1337 } 1338 pkt = seg; 1339 assert(len >= (rxq->crc_present << 2)); 1340 pkt->ol_flags = 0; 1341 /* If compressed, take hash result from mini-CQE. */ 1342 rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? 1343 cqe->rx_hash_res : 1344 mcqe->rx_hash_result); 1345 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1346 if (rxq->crc_present) 1347 len -= RTE_ETHER_CRC_LEN; 1348 PKT_LEN(pkt) = len; 1349 if (cqe->lro_num_seg > 1) { 1350 mlx5_lro_update_hdr 1351 (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, 1352 len); 1353 pkt->ol_flags |= PKT_RX_LRO; 1354 pkt->tso_segsz = len / cqe->lro_num_seg; 1355 } 1356 } 1357 DATA_LEN(rep) = DATA_LEN(seg); 1358 PKT_LEN(rep) = PKT_LEN(seg); 1359 SET_DATA_OFF(rep, DATA_OFF(seg)); 1360 PORT(rep) = PORT(seg); 1361 (*rxq->elts)[idx] = rep; 1362 /* 1363 * Fill NIC descriptor with the new buffer. The lkey and size 1364 * of the buffers are already known, only the buffer address 1365 * changes. 1366 */ 1367 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); 1368 /* If there's only one MR, no need to replace LKey in WQE. */ 1369 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1370 wqe->lkey = mlx5_rx_mb2mr(rxq, rep); 1371 if (len > DATA_LEN(seg)) { 1372 len -= DATA_LEN(seg); 1373 ++NB_SEGS(pkt); 1374 ++rq_ci; 1375 continue; 1376 } 1377 DATA_LEN(seg) = len; 1378 #ifdef MLX5_PMD_SOFT_COUNTERS 1379 /* Increment bytes counter. */ 1380 rxq->stats.ibytes += PKT_LEN(pkt); 1381 #endif 1382 /* Return packet. */ 1383 *(pkts++) = pkt; 1384 pkt = NULL; 1385 --pkts_n; 1386 ++i; 1387 /* Align consumer index to the next stride. */ 1388 rq_ci >>= sges_n; 1389 ++rq_ci; 1390 rq_ci <<= sges_n; 1391 } 1392 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) 1393 return 0; 1394 /* Update the consumer index. */ 1395 rxq->rq_ci = rq_ci >> sges_n; 1396 rte_cio_wmb(); 1397 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1398 rte_cio_wmb(); 1399 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1400 #ifdef MLX5_PMD_SOFT_COUNTERS 1401 /* Increment packets counter. */ 1402 rxq->stats.ipackets += i; 1403 #endif 1404 return i; 1405 } 1406 1407 /** 1408 * Update LRO packet TCP header. 1409 * The HW LRO feature doesn't update the TCP header after coalescing the 1410 * TCP segments but supplies information in CQE to fill it by SW. 1411 * 1412 * @param tcp 1413 * Pointer to the TCP header. 1414 * @param cqe 1415 * Pointer to the completion entry.. 1416 * @param phcsum 1417 * The L3 pseudo-header checksum. 
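 * The resulting checksum is the ones-complement fold of
 * (phcsum + cqe->csum + raw sum of the TCP header). Illustrative
 * numbers: a 32-bit sum of 0x1a2b3 folds to 0x0001 + 0xa2b3 = 0xa2b4
 * and is complemented to 0x5d4b; a final value of 0x0000 is replaced
 * by 0xffff.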
1418 */ 1419 static inline void 1420 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, 1421 volatile struct mlx5_cqe *restrict cqe, 1422 uint32_t phcsum) 1423 { 1424 uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) & 1425 MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT; 1426 /* 1427 * The HW calculates only the TCP payload checksum, need to complete 1428 * the TCP header checksum and the L3 pseudo-header checksum. 1429 */ 1430 uint32_t csum = phcsum + cqe->csum; 1431 1432 if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK || 1433 l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) { 1434 tcp->tcp_flags |= RTE_TCP_ACK_FLAG; 1435 tcp->recv_ack = cqe->lro_ack_seq_num; 1436 tcp->rx_win = cqe->lro_tcp_win; 1437 } 1438 if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK) 1439 tcp->tcp_flags |= RTE_TCP_PSH_FLAG; 1440 tcp->cksum = 0; 1441 csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4); 1442 csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff); 1443 csum = (~csum) & 0xffff; 1444 if (csum == 0) 1445 csum = 0xffff; 1446 tcp->cksum = csum; 1447 } 1448 1449 /** 1450 * Update LRO packet headers. 1451 * The HW LRO feature doesn't update the L3/TCP headers after coalescing the 1452 * TCP segments but supply information in CQE to fill it by SW. 1453 * 1454 * @param padd 1455 * The packet address. 1456 * @param cqe 1457 * Pointer to the completion entry.. 1458 * @param len 1459 * The packet length. 1460 */ 1461 static inline void 1462 mlx5_lro_update_hdr(uint8_t *restrict padd, 1463 volatile struct mlx5_cqe *restrict cqe, 1464 uint32_t len) 1465 { 1466 union { 1467 struct rte_ether_hdr *eth; 1468 struct rte_vlan_hdr *vlan; 1469 struct rte_ipv4_hdr *ipv4; 1470 struct rte_ipv6_hdr *ipv6; 1471 struct rte_tcp_hdr *tcp; 1472 uint8_t *hdr; 1473 } h = { 1474 .hdr = padd, 1475 }; 1476 uint16_t proto = h.eth->ether_type; 1477 uint32_t phcsum; 1478 1479 h.eth++; 1480 while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) || 1481 proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) { 1482 proto = h.vlan->eth_proto; 1483 h.vlan++; 1484 } 1485 if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) { 1486 h.ipv4->time_to_live = cqe->lro_min_ttl; 1487 h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd)); 1488 h.ipv4->hdr_checksum = 0; 1489 h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4); 1490 phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0); 1491 h.ipv4++; 1492 } else { 1493 h.ipv6->hop_limits = cqe->lro_min_ttl; 1494 h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) - 1495 sizeof(*h.ipv6)); 1496 phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0); 1497 h.ipv6++; 1498 } 1499 mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum); 1500 } 1501 1502 void 1503 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) 1504 { 1505 struct mlx5_mprq_buf *buf = opaque; 1506 1507 if (rte_atomic16_read(&buf->refcnt) == 1) { 1508 rte_mempool_put(buf->mp, buf); 1509 } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { 1510 rte_atomic16_set(&buf->refcnt, 1); 1511 rte_mempool_put(buf->mp, buf); 1512 } 1513 } 1514 1515 void 1516 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) 1517 { 1518 mlx5_mprq_buf_free_cb(NULL, buf); 1519 } 1520 1521 static inline void 1522 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 1523 const unsigned int strd_n) 1524 { 1525 struct mlx5_mprq_buf *rep = rxq->mprq_repl; 1526 volatile struct mlx5_wqe_data_seg *wqe = 1527 &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; 1528 void *addr; 1529 1530 assert(rep != NULL); 1531 /* Replace MPRQ buf. */ 1532 (*rxq->mprq_bufs)[rq_idx] = rep; 1533 /* Replace WQE. 
*/ 1534 addr = mlx5_mprq_buf_addr(rep, strd_n); 1535 wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); 1536 /* If there's only one MR, no need to replace LKey in WQE. */ 1537 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1538 wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); 1539 /* Stash a mbuf for next replacement. */ 1540 if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) 1541 rxq->mprq_repl = rep; 1542 else 1543 rxq->mprq_repl = NULL; 1544 } 1545 1546 /** 1547 * DPDK callback for RX with Multi-Packet RQ support. 1548 * 1549 * @param dpdk_rxq 1550 * Generic pointer to RX queue structure. 1551 * @param[out] pkts 1552 * Array to store received packets. 1553 * @param pkts_n 1554 * Maximum number of packets in array. 1555 * 1556 * @return 1557 * Number of packets successfully received (<= pkts_n). 1558 */ 1559 uint16_t 1560 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1561 { 1562 struct mlx5_rxq_data *rxq = dpdk_rxq; 1563 const unsigned int strd_n = 1 << rxq->strd_num_n; 1564 const unsigned int strd_sz = 1 << rxq->strd_sz_n; 1565 const unsigned int strd_shift = 1566 MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; 1567 const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; 1568 const unsigned int wq_mask = (1 << rxq->elts_n) - 1; 1569 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1570 unsigned int i = 0; 1571 uint32_t rq_ci = rxq->rq_ci; 1572 uint16_t consumed_strd = rxq->consumed_strd; 1573 uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM; 1574 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1575 1576 while (i < pkts_n) { 1577 struct rte_mbuf *pkt; 1578 void *addr; 1579 int ret; 1580 unsigned int len; 1581 uint16_t strd_cnt; 1582 uint16_t strd_idx; 1583 uint32_t offset; 1584 uint32_t byte_cnt; 1585 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1586 uint32_t rss_hash_res = 0; 1587 uint8_t lro_num_seg; 1588 1589 if (consumed_strd == strd_n) { 1590 /* Replace WQE only if the buffer is still in use. */ 1591 if (rte_atomic16_read(&buf->refcnt) > 1) { 1592 mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n); 1593 /* Release the old buffer. */ 1594 mlx5_mprq_buf_free(buf); 1595 } else if (unlikely(rxq->mprq_repl == NULL)) { 1596 struct mlx5_mprq_buf *rep; 1597 1598 /* 1599 * Currently, the MPRQ mempool is out of buffer 1600 * and doing memcpy regardless of the size of Rx 1601 * packet. Retry allocation to get back to 1602 * normal. 1603 */ 1604 if (!rte_mempool_get(rxq->mprq_mp, 1605 (void **)&rep)) 1606 rxq->mprq_repl = rep; 1607 } 1608 /* Advance to the next WQE. */ 1609 consumed_strd = 0; 1610 ++rq_ci; 1611 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1612 } 1613 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1614 ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe); 1615 if (!ret) 1616 break; 1617 byte_cnt = ret; 1618 strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> 1619 MLX5_MPRQ_STRIDE_NUM_SHIFT; 1620 assert(strd_cnt); 1621 consumed_strd += strd_cnt; 1622 if (byte_cnt & MLX5_MPRQ_FILLER_MASK) 1623 continue; 1624 if (mcqe == NULL) { 1625 rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); 1626 strd_idx = rte_be_to_cpu_16(cqe->wqe_counter); 1627 } else { 1628 /* mini-CQE for MPRQ doesn't have hash result. */ 1629 strd_idx = rte_be_to_cpu_16(mcqe->stride_idx); 1630 } 1631 assert(strd_idx < strd_n); 1632 assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask)); 1633 lro_num_seg = cqe->lro_num_seg; 1634 /* 1635 * Currently configured to receive a packet per a stride. 
But if 1636 * MTU is adjusted through kernel interface, device could 1637 * consume multiple strides without raising an error. In this 1638 * case, the packet should be dropped because it is bigger than 1639 * the max_rx_pkt_len. 1640 */ 1641 if (unlikely(!lro_num_seg && strd_cnt > 1)) { 1642 ++rxq->stats.idropped; 1643 continue; 1644 } 1645 pkt = rte_pktmbuf_alloc(rxq->mp); 1646 if (unlikely(pkt == NULL)) { 1647 ++rxq->stats.rx_nombuf; 1648 break; 1649 } 1650 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; 1651 assert((int)len >= (rxq->crc_present << 2)); 1652 if (rxq->crc_present) 1653 len -= RTE_ETHER_CRC_LEN; 1654 offset = strd_idx * strd_sz + strd_shift; 1655 addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); 1656 /* 1657 * Memcpy packets to the target mbuf if: 1658 * - The size of packet is smaller than mprq_max_memcpy_len. 1659 * - Out of buffer in the Mempool for Multi-Packet RQ. 1660 */ 1661 if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) { 1662 /* 1663 * When memcpy'ing packet due to out-of-buffer, the 1664 * packet must be smaller than the target mbuf. 1665 */ 1666 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1667 rte_pktmbuf_free_seg(pkt); 1668 ++rxq->stats.idropped; 1669 continue; 1670 } 1671 rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len); 1672 DATA_LEN(pkt) = len; 1673 } else { 1674 rte_iova_t buf_iova; 1675 struct rte_mbuf_ext_shared_info *shinfo; 1676 uint16_t buf_len = strd_cnt * strd_sz; 1677 void *buf_addr; 1678 1679 /* Increment the refcnt of the whole chunk. */ 1680 rte_atomic16_add_return(&buf->refcnt, 1); 1681 assert((uint16_t)rte_atomic16_read(&buf->refcnt) <= 1682 strd_n + 1); 1683 buf_addr = RTE_PTR_SUB(addr, headroom_sz); 1684 /* 1685 * MLX5 device doesn't use iova but it is necessary in a 1686 * case where the Rx packet is transmitted via a 1687 * different PMD. 1688 */ 1689 buf_iova = rte_mempool_virt2iova(buf) + 1690 RTE_PTR_DIFF(buf_addr, buf); 1691 shinfo = &buf->shinfos[strd_idx]; 1692 rte_mbuf_ext_refcnt_set(shinfo, 1); 1693 /* 1694 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when 1695 * attaching the stride to mbuf and more offload flags 1696 * will be added below by calling rxq_cq_to_mbuf(). 1697 * Other fields will be overwritten. 1698 */ 1699 rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, 1700 buf_len, shinfo); 1701 /* Set mbuf head-room. */ 1702 pkt->data_off = headroom_sz; 1703 assert(pkt->ol_flags == EXT_ATTACHED_MBUF); 1704 /* 1705 * Prevent potential overflow due to MTU change through 1706 * kernel interface. 1707 */ 1708 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1709 rte_pktmbuf_free_seg(pkt); 1710 ++rxq->stats.idropped; 1711 continue; 1712 } 1713 DATA_LEN(pkt) = len; 1714 /* 1715 * LRO packet may consume all the stride memory, in this 1716 * case packet head-room space is not guaranteed so must 1717 * to add an empty mbuf for the head-room. 
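 * The extra mbuf only provides the missing head-room; the stride data
 * stays attached to the original mbuf, which becomes the second
 * segment (NB_SEGS is set to 2 below).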
1718 */ 1719 if (!rxq->strd_headroom_en) { 1720 struct rte_mbuf *headroom_mbuf = 1721 rte_pktmbuf_alloc(rxq->mp); 1722 1723 if (unlikely(headroom_mbuf == NULL)) { 1724 rte_pktmbuf_free_seg(pkt); 1725 ++rxq->stats.rx_nombuf; 1726 break; 1727 } 1728 PORT(pkt) = rxq->port_id; 1729 NEXT(headroom_mbuf) = pkt; 1730 pkt = headroom_mbuf; 1731 NB_SEGS(pkt) = 2; 1732 } 1733 } 1734 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1735 if (lro_num_seg > 1) { 1736 mlx5_lro_update_hdr(addr, cqe, len); 1737 pkt->ol_flags |= PKT_RX_LRO; 1738 pkt->tso_segsz = strd_sz; 1739 } 1740 PKT_LEN(pkt) = len; 1741 PORT(pkt) = rxq->port_id; 1742 #ifdef MLX5_PMD_SOFT_COUNTERS 1743 /* Increment bytes counter. */ 1744 rxq->stats.ibytes += PKT_LEN(pkt); 1745 #endif 1746 /* Return packet. */ 1747 *(pkts++) = pkt; 1748 ++i; 1749 } 1750 /* Update the consumer indexes. */ 1751 rxq->consumed_strd = consumed_strd; 1752 rte_cio_wmb(); 1753 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1754 if (rq_ci != rxq->rq_ci) { 1755 rxq->rq_ci = rq_ci; 1756 rte_cio_wmb(); 1757 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1758 } 1759 #ifdef MLX5_PMD_SOFT_COUNTERS 1760 /* Increment packets counter. */ 1761 rxq->stats.ipackets += i; 1762 #endif 1763 return i; 1764 } 1765 1766 /** 1767 * Dummy DPDK callback for TX. 1768 * 1769 * This function is used to temporarily replace the real callback during 1770 * unsafe control operations on the queue, or in case of error. 1771 * 1772 * @param dpdk_txq 1773 * Generic pointer to TX queue structure. 1774 * @param[in] pkts 1775 * Packets to transmit. 1776 * @param pkts_n 1777 * Number of packets in array. 1778 * 1779 * @return 1780 * Number of packets successfully transmitted (<= pkts_n). 1781 */ 1782 uint16_t 1783 removed_tx_burst(void *dpdk_txq __rte_unused, 1784 struct rte_mbuf **pkts __rte_unused, 1785 uint16_t pkts_n __rte_unused) 1786 { 1787 rte_mb(); 1788 return 0; 1789 } 1790 1791 /** 1792 * Dummy DPDK callback for RX. 1793 * 1794 * This function is used to temporarily replace the real callback during 1795 * unsafe control operations on the queue, or in case of error. 1796 * 1797 * @param dpdk_rxq 1798 * Generic pointer to RX queue structure. 1799 * @param[out] pkts 1800 * Array to store received packets. 1801 * @param pkts_n 1802 * Maximum number of packets in array. 1803 * 1804 * @return 1805 * Number of packets successfully received (<= pkts_n). 1806 */ 1807 uint16_t 1808 removed_rx_burst(void *dpdk_txq __rte_unused, 1809 struct rte_mbuf **pkts __rte_unused, 1810 uint16_t pkts_n __rte_unused) 1811 { 1812 rte_mb(); 1813 return 0; 1814 } 1815 1816 /* 1817 * Vectorized Rx/Tx routines are not compiled in when required vector 1818 * instructions are not supported on a target architecture. The following null 1819 * stubs are needed for linkage when those are not included outside of this file 1820 * (e.g. mlx5_rxtx_vec_sse.c for x86). 1821 */ 1822 1823 __rte_weak uint16_t 1824 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, 1825 struct rte_mbuf **pkts __rte_unused, 1826 uint16_t pkts_n __rte_unused) 1827 { 1828 return 0; 1829 } 1830 1831 __rte_weak int 1832 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) 1833 { 1834 return -ENOTSUP; 1835 } 1836 1837 __rte_weak int 1838 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) 1839 { 1840 return -ENOTSUP; 1841 } 1842 1843 /** 1844 * Free the mbufs from the linear array of pointers. 1845 * 1846 * @param pkts 1847 * Pointer to array of packets to be free. 1848 * @param pkts_n 1849 * Number of packets to be freed. 
1850 * @param olx 1851 * Configured Tx offloads mask. It is fully defined at 1852 * compile time and may be used for optimization. 1853 */ 1854 static __rte_always_inline void 1855 mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, 1856 unsigned int pkts_n, 1857 unsigned int olx __rte_unused) 1858 { 1859 struct rte_mempool *pool = NULL; 1860 struct rte_mbuf **p_free = NULL; 1861 struct rte_mbuf *mbuf; 1862 unsigned int n_free = 0; 1863 1864 /* 1865 * The implemented algorithm eliminates 1866 * copying pointers to temporary array 1867 * for rte_mempool_put_bulk() calls. 1868 */ 1869 assert(pkts); 1870 assert(pkts_n); 1871 for (;;) { 1872 for (;;) { 1873 /* 1874 * Decrement mbuf reference counter, detach 1875 * indirect and external buffers if needed. 1876 */ 1877 mbuf = rte_pktmbuf_prefree_seg(*pkts); 1878 if (likely(mbuf != NULL)) { 1879 assert(mbuf == *pkts); 1880 if (likely(n_free != 0)) { 1881 if (unlikely(pool != mbuf->pool)) 1882 /* From different pool. */ 1883 break; 1884 } else { 1885 /* Start new scan array. */ 1886 pool = mbuf->pool; 1887 p_free = pkts; 1888 } 1889 ++n_free; 1890 ++pkts; 1891 --pkts_n; 1892 if (unlikely(pkts_n == 0)) { 1893 mbuf = NULL; 1894 break; 1895 } 1896 } else { 1897 /* 1898 * This happens if mbuf is still referenced. 1899 * We can't put it back to the pool, skip. 1900 */ 1901 ++pkts; 1902 --pkts_n; 1903 if (unlikely(n_free != 0)) 1904 /* There is some array to free.*/ 1905 break; 1906 if (unlikely(pkts_n == 0)) 1907 /* Last mbuf, nothing to free. */ 1908 return; 1909 } 1910 } 1911 for (;;) { 1912 /* 1913 * This loop is implemented to avoid multiple 1914 * inlining of rte_mempool_put_bulk(). 1915 */ 1916 assert(pool); 1917 assert(p_free); 1918 assert(n_free); 1919 /* 1920 * Free the array of pre-freed mbufs 1921 * belonging to the same memory pool. 1922 */ 1923 rte_mempool_put_bulk(pool, (void *)p_free, n_free); 1924 if (unlikely(mbuf != NULL)) { 1925 /* There is the request to start new scan. */ 1926 pool = mbuf->pool; 1927 p_free = pkts++; 1928 n_free = 1; 1929 --pkts_n; 1930 if (likely(pkts_n != 0)) 1931 break; 1932 /* 1933 * This is the last mbuf to be freed. 1934 * Do one more loop iteration to complete. 1935 * This is rare case of the last unique mbuf. 1936 */ 1937 mbuf = NULL; 1938 continue; 1939 } 1940 if (likely(pkts_n == 0)) 1941 return; 1942 n_free = 0; 1943 break; 1944 } 1945 } 1946 } 1947 1948 /** 1949 * Free the mbuf from the elts ring buffer till new tail. 1950 * 1951 * @param txq 1952 * Pointer to Tx queue structure. 1953 * @param tail 1954 * Index in elts to free up to, becomes new elts tail. 1955 * @param olx 1956 * Configured Tx offloads mask. It is fully defined at 1957 * compile time and may be used for optimization. 1958 */ 1959 static __rte_always_inline void 1960 mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, 1961 uint16_t tail, 1962 unsigned int olx __rte_unused) 1963 { 1964 uint16_t n_elts = tail - txq->elts_tail; 1965 1966 assert(n_elts); 1967 assert(n_elts <= txq->elts_s); 1968 /* 1969 * Implement a loop to support ring buffer wraparound 1970 * with single inlining of mlx5_tx_free_mbuf(). 1971 */ 1972 do { 1973 unsigned int part; 1974 1975 part = txq->elts_s - (txq->elts_tail & txq->elts_m); 1976 part = RTE_MIN(part, n_elts); 1977 assert(part); 1978 assert(part <= txq->elts_s); 1979 mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m], 1980 part, olx); 1981 txq->elts_tail += part; 1982 n_elts -= part; 1983 } while (n_elts); 1984 } 1985 1986 /** 1987 * Store the mbuf being sent into elts ring buffer. 
 * On Tx completion these mbufs will be freed.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 * @param pkts
 *   Pointer to array of packets to be stored.
 * @param pkts_n
 *   Number of packets to be stored.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 */
static __rte_always_inline void
mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq,
		  struct rte_mbuf **restrict pkts,
		  unsigned int pkts_n,
		  unsigned int olx __rte_unused)
{
	unsigned int part;
	struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts;

	assert(pkts);
	assert(pkts_n);
	part = txq->elts_s - (txq->elts_head & txq->elts_m);
	assert(part);
	assert(part <= txq->elts_s);
	/* This code is a good candidate for vectorizing with SIMD. */
	rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)),
		   (void *)pkts,
		   RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *));
	txq->elts_head += pkts_n;
	if (unlikely(part < pkts_n))
		/* The copy is wrapping around the elts array. */
		rte_memcpy((void *)elts, (void *)(pkts + part),
			   (pkts_n - part) * sizeof(struct rte_mbuf *));
}

/**
 * Update completion queue consuming index via doorbell
 * and flush the completed data buffers.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param last_cqe
 *   Pointer to the last valid CQE. If not NULL, update
 *   txq->wqe_pi and flush the buffers.
 * @param itail
 *   If not negative, flush the buffers up to this index.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 */
static __rte_always_inline void
mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq,
		   volatile struct mlx5_cqe *last_cqe,
		   int itail,
		   unsigned int olx __rte_unused)
{
	uint16_t tail;

	if (likely(last_cqe != NULL)) {
		txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter);
		tail = ((volatile struct mlx5_wqe_cseg *)
			(txq->wqes + (txq->wqe_pi & txq->wqe_m)))->misc;
	} else if (itail >= 0) {
		tail = (uint16_t)itail;
	} else {
		return;
	}
	rte_compiler_barrier();
	*txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
	if (likely(tail != txq->elts_tail)) {
		mlx5_tx_free_elts(txq, tail, olx);
		assert(tail == txq->elts_tail);
	}
}

/**
 * Manage TX completions. This routine checks the CQ for
 * arrived CQEs, deduces the last accomplished WQE in SQ,
 * updates SQ producing index and frees all completed mbufs.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 *
 * NOTE: not inlined intentionally, this makes the tx_burst
 * routine smaller, simpler and faster - from experiments.
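 *
 * A minimal usage sketch (illustrative only; it mirrors the call in
 * mlx5_tx_descriptor_status() below, a real burst routine passes its
 * compile-time olx value instead of 0):
 *
 * @code
 *	mlx5_tx_handle_completion(txq, 0);
 *	used = txq->elts_head - txq->elts_tail;
 * @endcode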
2077 */ 2078 static void 2079 mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, 2080 unsigned int olx __rte_unused) 2081 { 2082 unsigned int count = MLX5_TX_COMP_MAX_CQE; 2083 volatile struct mlx5_cqe *last_cqe = NULL; 2084 int ret; 2085 2086 static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); 2087 static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value"); 2088 do { 2089 volatile struct mlx5_cqe *cqe; 2090 2091 cqe = &txq->cqes[txq->cq_ci & txq->cqe_m]; 2092 ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci); 2093 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 2094 if (likely(ret != MLX5_CQE_STATUS_ERR)) { 2095 /* No new CQEs in completion queue. */ 2096 assert(ret == MLX5_CQE_STATUS_HW_OWN); 2097 break; 2098 } 2099 /* 2100 * Some error occurred, try to restart. 2101 * We have no barrier after WQE related Doorbell 2102 * written, make sure all writes are completed 2103 * here, before we might perform SQ reset. 2104 */ 2105 rte_wmb(); 2106 ret = mlx5_tx_error_cqe_handle 2107 (txq, (volatile struct mlx5_err_cqe *)cqe); 2108 /* 2109 * Flush buffers, update consuming index 2110 * if recovery succeeded. Otherwise 2111 * just try to recover later. 2112 */ 2113 last_cqe = NULL; 2114 break; 2115 } 2116 /* Normal transmit completion. */ 2117 ++txq->cq_ci; 2118 last_cqe = cqe; 2119 #ifndef NDEBUG 2120 if (txq->cq_pi) 2121 --txq->cq_pi; 2122 #endif 2123 /* 2124 * We have to restrict the amount of processed CQEs 2125 * in one tx_burst routine call. The CQ may be large 2126 * and many CQEs may be updated by the NIC in one 2127 * transaction. Buffers freeing is time consuming, 2128 * multiple iterations may introduce significant 2129 * latency. 2130 */ 2131 } while (--count); 2132 mlx5_tx_comp_flush(txq, last_cqe, ret, olx); 2133 } 2134 2135 /** 2136 * Check if the completion request flag should be set in the last WQE. 2137 * Both pushed mbufs and WQEs are monitored and the completion request 2138 * flag is set if any of thresholds is reached. 2139 * 2140 * @param txq 2141 * Pointer to TX queue structure. 2142 * @param loc 2143 * Pointer to burst routine local context. 2144 * @param multi, 2145 * Routine is called from multi-segment sending loop, 2146 * do not correct the elts_head according to the pkts_copy. 2147 * @param olx 2148 * Configured Tx offloads mask. It is fully defined at 2149 * compile time and may be used for optimization. 2150 */ 2151 static __rte_always_inline void 2152 mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, 2153 struct mlx5_txq_local *restrict loc, 2154 bool multi, 2155 unsigned int olx) 2156 { 2157 uint16_t head = txq->elts_head; 2158 unsigned int part; 2159 2160 part = (MLX5_TXOFF_CONFIG(INLINE) || multi) ? 2161 0 : loc->pkts_sent - loc->pkts_copy; 2162 head += part; 2163 if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH || 2164 (MLX5_TXOFF_CONFIG(INLINE) && 2165 (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) { 2166 volatile struct mlx5_wqe *last = loc->wqe_last; 2167 2168 txq->elts_comp = head; 2169 if (MLX5_TXOFF_CONFIG(INLINE)) 2170 txq->wqe_comp = txq->wqe_ci; 2171 /* Request unconditional completion on last WQE. */ 2172 last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS << 2173 MLX5_COMP_MODE_OFFSET); 2174 /* Save elts_head in unused "immediate" field of WQE. */ 2175 last->cseg.misc = head; 2176 /* 2177 * A CQE slot must always be available. 
Count the 2178 * issued CEQ "always" request instead of production 2179 * index due to here can be CQE with errors and 2180 * difference with ci may become inconsistent. 2181 */ 2182 assert(txq->cqe_s > ++txq->cq_pi); 2183 } 2184 } 2185 2186 /** 2187 * DPDK callback to check the status of a tx descriptor. 2188 * 2189 * @param tx_queue 2190 * The tx queue. 2191 * @param[in] offset 2192 * The index of the descriptor in the ring. 2193 * 2194 * @return 2195 * The status of the tx descriptor. 2196 */ 2197 int 2198 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) 2199 { 2200 struct mlx5_txq_data *restrict txq = tx_queue; 2201 uint16_t used; 2202 2203 mlx5_tx_handle_completion(txq, 0); 2204 used = txq->elts_head - txq->elts_tail; 2205 if (offset < used) 2206 return RTE_ETH_TX_DESC_FULL; 2207 return RTE_ETH_TX_DESC_DONE; 2208 } 2209 2210 /** 2211 * Build the Control Segment with specified opcode: 2212 * - MLX5_OPCODE_SEND 2213 * - MLX5_OPCODE_ENHANCED_MPSW 2214 * - MLX5_OPCODE_TSO 2215 * 2216 * @param txq 2217 * Pointer to TX queue structure. 2218 * @param loc 2219 * Pointer to burst routine local context. 2220 * @param wqe 2221 * Pointer to WQE to fill with built Control Segment. 2222 * @param ds 2223 * Supposed length of WQE in segments. 2224 * @param opcode 2225 * SQ WQE opcode to put into Control Segment. 2226 * @param olx 2227 * Configured Tx offloads mask. It is fully defined at 2228 * compile time and may be used for optimization. 2229 */ 2230 static __rte_always_inline void 2231 mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, 2232 struct mlx5_txq_local *restrict loc __rte_unused, 2233 struct mlx5_wqe *restrict wqe, 2234 unsigned int ds, 2235 unsigned int opcode, 2236 unsigned int olx __rte_unused) 2237 { 2238 struct mlx5_wqe_cseg *restrict cs = &wqe->cseg; 2239 2240 cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode); 2241 cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 2242 cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << 2243 MLX5_COMP_MODE_OFFSET); 2244 cs->misc = RTE_BE32(0); 2245 } 2246 2247 /** 2248 * Build the Ethernet Segment without inlined data. 2249 * Supports Software Parser, Checksums and VLAN 2250 * insertion Tx offload features. 2251 * 2252 * @param txq 2253 * Pointer to TX queue structure. 2254 * @param loc 2255 * Pointer to burst routine local context. 2256 * @param wqe 2257 * Pointer to WQE to fill with built Ethernet Segment. 2258 * @param olx 2259 * Configured Tx offloads mask. It is fully defined at 2260 * compile time and may be used for optimization. 2261 */ 2262 static __rte_always_inline void 2263 mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused, 2264 struct mlx5_txq_local *restrict loc, 2265 struct mlx5_wqe *restrict wqe, 2266 unsigned int olx) 2267 { 2268 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2269 uint32_t csum; 2270 2271 /* 2272 * Calculate and set check sum flags first, dword field 2273 * in segment may be shared with Software Parser flags. 2274 */ 2275 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2276 es->flags = rte_cpu_to_le_32(csum); 2277 /* 2278 * Calculate and set Software Parser offsets and flags. 2279 * These flags a set for custom UDP and IP tunnel packets. 2280 */ 2281 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2282 /* Fill metadata field if needed. */ 2283 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2284 loc->mbuf->ol_flags & PKT_TX_METADATA ? 2285 loc->mbuf->tx_metadata : 0 : 0; 2286 /* Engage VLAN tag insertion feature if requested. 
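	 * As an illustration (the TCI value is an example only): for
	 * vlan_tci == 0x0064 (VLAN ID 100) the field below becomes
	 *
	 *	es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
	 *					  0x0064);
	 *
	 * i.e. the insertion flag and the TCI share one 32-bit word.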
*/ 2287 if (MLX5_TXOFF_CONFIG(VLAN) && 2288 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 2289 /* 2290 * We should get here only if device support 2291 * this feature correctly. 2292 */ 2293 assert(txq->vlan_en); 2294 es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT | 2295 loc->mbuf->vlan_tci); 2296 } else { 2297 es->inline_hdr = RTE_BE32(0); 2298 } 2299 } 2300 2301 /** 2302 * Build the Ethernet Segment with minimal inlined data 2303 * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is 2304 * used to fill the gap in single WQEBB WQEs. 2305 * Supports Software Parser, Checksums and VLAN 2306 * insertion Tx offload features. 2307 * 2308 * @param txq 2309 * Pointer to TX queue structure. 2310 * @param loc 2311 * Pointer to burst routine local context. 2312 * @param wqe 2313 * Pointer to WQE to fill with built Ethernet Segment. 2314 * @param vlan 2315 * Length of VLAN tag insertion if any. 2316 * @param olx 2317 * Configured Tx offloads mask. It is fully defined at 2318 * compile time and may be used for optimization. 2319 */ 2320 static __rte_always_inline void 2321 mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused, 2322 struct mlx5_txq_local *restrict loc, 2323 struct mlx5_wqe *restrict wqe, 2324 unsigned int vlan, 2325 unsigned int olx) 2326 { 2327 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2328 uint32_t csum; 2329 uint8_t *psrc, *pdst; 2330 2331 /* 2332 * Calculate and set check sum flags first, dword field 2333 * in segment may be shared with Software Parser flags. 2334 */ 2335 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2336 es->flags = rte_cpu_to_le_32(csum); 2337 /* 2338 * Calculate and set Software Parser offsets and flags. 2339 * These flags a set for custom UDP and IP tunnel packets. 2340 */ 2341 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2342 /* Fill metadata field if needed. */ 2343 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2344 loc->mbuf->ol_flags & PKT_TX_METADATA ? 2345 loc->mbuf->tx_metadata : 0 : 0; 2346 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2347 (sizeof(uint16_t) + 2348 sizeof(rte_v128u32_t)), 2349 "invalid Ethernet Segment data size"); 2350 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2351 (sizeof(uint16_t) + 2352 sizeof(struct rte_vlan_hdr) + 2353 2 * RTE_ETHER_ADDR_LEN), 2354 "invalid Ethernet Segment data size"); 2355 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2356 es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE); 2357 es->inline_data = *(unaligned_uint16_t *)psrc; 2358 psrc += sizeof(uint16_t); 2359 pdst = (uint8_t *)(es + 1); 2360 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2361 /* Implement VLAN tag insertion as part inline data. */ 2362 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2363 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2364 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2365 /* Insert VLAN ethertype + VLAN tag. */ 2366 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2367 ((RTE_ETHER_TYPE_VLAN << 16) | 2368 loc->mbuf->vlan_tci); 2369 pdst += sizeof(struct rte_vlan_hdr); 2370 /* Copy the rest two bytes from packet data. */ 2371 assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2372 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2373 } else { 2374 /* Fill the gap in the title WQEBB with inline data. */ 2375 rte_mov16(pdst, psrc); 2376 } 2377 } 2378 2379 /** 2380 * Build the Ethernet Segment with entire packet 2381 * data inlining. 
Checks the boundary of WQEBB and 2382 * ring buffer wrapping, supports Software Parser, 2383 * Checksums and VLAN insertion Tx offload features. 2384 * 2385 * @param txq 2386 * Pointer to TX queue structure. 2387 * @param loc 2388 * Pointer to burst routine local context. 2389 * @param wqe 2390 * Pointer to WQE to fill with built Ethernet Segment. 2391 * @param vlan 2392 * Length of VLAN tag insertion if any. 2393 * @param inlen 2394 * Length of data to inline (VLAN included, if any). 2395 * @param tso 2396 * TSO flag, set mss field from the packet. 2397 * @param olx 2398 * Configured Tx offloads mask. It is fully defined at 2399 * compile time and may be used for optimization. 2400 * 2401 * @return 2402 * Pointer to the next Data Segment (aligned and wrapped around). 2403 */ 2404 static __rte_always_inline struct mlx5_wqe_dseg * 2405 mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, 2406 struct mlx5_txq_local *restrict loc, 2407 struct mlx5_wqe *restrict wqe, 2408 unsigned int vlan, 2409 unsigned int inlen, 2410 unsigned int tso, 2411 unsigned int olx) 2412 { 2413 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2414 uint32_t csum; 2415 uint8_t *psrc, *pdst; 2416 unsigned int part; 2417 2418 /* 2419 * Calculate and set check sum flags first, dword field 2420 * in segment may be shared with Software Parser flags. 2421 */ 2422 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2423 if (tso) { 2424 csum <<= 24; 2425 csum |= loc->mbuf->tso_segsz; 2426 es->flags = rte_cpu_to_be_32(csum); 2427 } else { 2428 es->flags = rte_cpu_to_le_32(csum); 2429 } 2430 /* 2431 * Calculate and set Software Parser offsets and flags. 2432 * These flags a set for custom UDP and IP tunnel packets. 2433 */ 2434 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2435 /* Fill metadata field if needed. */ 2436 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2437 loc->mbuf->ol_flags & PKT_TX_METADATA ? 2438 loc->mbuf->tx_metadata : 0 : 0; 2439 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2440 (sizeof(uint16_t) + 2441 sizeof(rte_v128u32_t)), 2442 "invalid Ethernet Segment data size"); 2443 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2444 (sizeof(uint16_t) + 2445 sizeof(struct rte_vlan_hdr) + 2446 2 * RTE_ETHER_ADDR_LEN), 2447 "invalid Ethernet Segment data size"); 2448 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2449 es->inline_hdr_sz = rte_cpu_to_be_16(inlen); 2450 es->inline_data = *(unaligned_uint16_t *)psrc; 2451 psrc += sizeof(uint16_t); 2452 pdst = (uint8_t *)(es + 1); 2453 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2454 /* Implement VLAN tag insertion as part inline data. */ 2455 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2456 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2457 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2458 /* Insert VLAN ethertype + VLAN tag. */ 2459 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2460 ((RTE_ETHER_TYPE_VLAN << 16) | 2461 loc->mbuf->vlan_tci); 2462 pdst += sizeof(struct rte_vlan_hdr); 2463 /* Copy the rest two bytes from packet data. */ 2464 assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2465 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2466 psrc += sizeof(uint16_t); 2467 } else { 2468 /* Fill the gap in the title WQEBB with inline data. 
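		 * rte_mov16() below copies the remaining
		 * MLX5_ESEG_MIN_INLINE_SIZE - sizeof(uint16_t) == 16 bytes
		 * of the packet start; the first two bytes were already
		 * placed into es->inline_data above.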
*/ 2469 rte_mov16(pdst, psrc); 2470 psrc += sizeof(rte_v128u32_t); 2471 } 2472 pdst = (uint8_t *)(es + 2); 2473 assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2474 assert(pdst < (uint8_t *)txq->wqes_end); 2475 inlen -= MLX5_ESEG_MIN_INLINE_SIZE; 2476 if (!inlen) { 2477 assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 2478 return (struct mlx5_wqe_dseg *)pdst; 2479 } 2480 /* 2481 * The WQEBB space availability is checked by caller. 2482 * Here we should be aware of WQE ring buffer wraparound only. 2483 */ 2484 part = (uint8_t *)txq->wqes_end - pdst; 2485 part = RTE_MIN(part, inlen); 2486 do { 2487 rte_memcpy(pdst, psrc, part); 2488 inlen -= part; 2489 if (likely(!inlen)) { 2490 /* 2491 * If return value is not used by the caller 2492 * the code below will be optimized out. 2493 */ 2494 pdst += part; 2495 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2496 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 2497 pdst = (uint8_t *)txq->wqes; 2498 return (struct mlx5_wqe_dseg *)pdst; 2499 } 2500 pdst = (uint8_t *)txq->wqes; 2501 psrc += part; 2502 part = inlen; 2503 } while (true); 2504 } 2505 2506 /** 2507 * Copy data from chain of mbuf to the specified linear buffer. 2508 * Checksums and VLAN insertion Tx offload features. If data 2509 * from some mbuf copied completely this mbuf is freed. Local 2510 * structure is used to keep the byte stream state. 2511 * 2512 * @param pdst 2513 * Pointer to the destination linear buffer. 2514 * @param loc 2515 * Pointer to burst routine local context. 2516 * @param len 2517 * Length of data to be copied. 2518 * @param olx 2519 * Configured Tx offloads mask. It is fully defined at 2520 * compile time and may be used for optimization. 2521 */ 2522 static __rte_always_inline void 2523 mlx5_tx_mseg_memcpy(uint8_t *pdst, 2524 struct mlx5_txq_local *restrict loc, 2525 unsigned int len, 2526 unsigned int olx __rte_unused) 2527 { 2528 struct rte_mbuf *mbuf; 2529 unsigned int part, dlen; 2530 uint8_t *psrc; 2531 2532 assert(len); 2533 do { 2534 /* Allow zero length packets, must check first. */ 2535 dlen = rte_pktmbuf_data_len(loc->mbuf); 2536 if (dlen <= loc->mbuf_off) { 2537 /* Exhausted packet, just free. */ 2538 mbuf = loc->mbuf; 2539 loc->mbuf = mbuf->next; 2540 rte_pktmbuf_free_seg(mbuf); 2541 loc->mbuf_off = 0; 2542 assert(loc->mbuf_nseg > 1); 2543 assert(loc->mbuf); 2544 --loc->mbuf_nseg; 2545 continue; 2546 } 2547 dlen -= loc->mbuf_off; 2548 psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 2549 loc->mbuf_off); 2550 part = RTE_MIN(len, dlen); 2551 rte_memcpy(pdst, psrc, part); 2552 loc->mbuf_off += part; 2553 len -= part; 2554 if (!len) { 2555 if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) { 2556 loc->mbuf_off = 0; 2557 /* Exhausted packet, just free. */ 2558 mbuf = loc->mbuf; 2559 loc->mbuf = mbuf->next; 2560 rte_pktmbuf_free_seg(mbuf); 2561 loc->mbuf_off = 0; 2562 assert(loc->mbuf_nseg >= 1); 2563 --loc->mbuf_nseg; 2564 } 2565 return; 2566 } 2567 pdst += part; 2568 } while (true); 2569 } 2570 2571 /** 2572 * Build the Ethernet Segment with inlined data from 2573 * multi-segment packet. Checks the boundary of WQEBB 2574 * and ring buffer wrapping, supports Software Parser, 2575 * Checksums and VLAN insertion Tx offload features. 2576 * 2577 * @param txq 2578 * Pointer to TX queue structure. 2579 * @param loc 2580 * Pointer to burst routine local context. 2581 * @param wqe 2582 * Pointer to WQE to fill with built Ethernet Segment. 2583 * @param vlan 2584 * Length of VLAN tag insertion if any. 
2585 * @param inlen 2586 * Length of data to inline (VLAN included, if any). 2587 * @param tso 2588 * TSO flag, set mss field from the packet. 2589 * @param olx 2590 * Configured Tx offloads mask. It is fully defined at 2591 * compile time and may be used for optimization. 2592 * 2593 * @return 2594 * Pointer to the next Data Segment (aligned and 2595 * possible NOT wrapped around - caller should do 2596 * wrapping check on its own). 2597 */ 2598 static __rte_always_inline struct mlx5_wqe_dseg * 2599 mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, 2600 struct mlx5_txq_local *restrict loc, 2601 struct mlx5_wqe *restrict wqe, 2602 unsigned int vlan, 2603 unsigned int inlen, 2604 unsigned int tso, 2605 unsigned int olx) 2606 { 2607 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2608 uint32_t csum; 2609 uint8_t *pdst; 2610 unsigned int part; 2611 2612 /* 2613 * Calculate and set check sum flags first, uint32_t field 2614 * in segment may be shared with Software Parser flags. 2615 */ 2616 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2617 if (tso) { 2618 csum <<= 24; 2619 csum |= loc->mbuf->tso_segsz; 2620 es->flags = rte_cpu_to_be_32(csum); 2621 } else { 2622 es->flags = rte_cpu_to_le_32(csum); 2623 } 2624 /* 2625 * Calculate and set Software Parser offsets and flags. 2626 * These flags a set for custom UDP and IP tunnel packets. 2627 */ 2628 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2629 /* Fill metadata field if needed. */ 2630 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2631 loc->mbuf->ol_flags & PKT_TX_METADATA ? 2632 loc->mbuf->tx_metadata : 0 : 0; 2633 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2634 (sizeof(uint16_t) + 2635 sizeof(rte_v128u32_t)), 2636 "invalid Ethernet Segment data size"); 2637 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2638 (sizeof(uint16_t) + 2639 sizeof(struct rte_vlan_hdr) + 2640 2 * RTE_ETHER_ADDR_LEN), 2641 "invalid Ethernet Segment data size"); 2642 assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2643 es->inline_hdr_sz = rte_cpu_to_be_16(inlen); 2644 pdst = (uint8_t *)&es->inline_data; 2645 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2646 /* Implement VLAN tag insertion as part inline data. */ 2647 mlx5_tx_mseg_memcpy(pdst, loc, 2 * RTE_ETHER_ADDR_LEN, olx); 2648 pdst += 2 * RTE_ETHER_ADDR_LEN; 2649 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2650 ((RTE_ETHER_TYPE_VLAN << 16) | 2651 loc->mbuf->vlan_tci); 2652 pdst += sizeof(struct rte_vlan_hdr); 2653 inlen -= 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr); 2654 } 2655 assert(pdst < (uint8_t *)txq->wqes_end); 2656 /* 2657 * The WQEBB space availability is checked by caller. 2658 * Here we should be aware of WQE ring buffer wraparound only. 2659 */ 2660 part = (uint8_t *)txq->wqes_end - pdst; 2661 part = RTE_MIN(part, inlen); 2662 assert(part); 2663 do { 2664 mlx5_tx_mseg_memcpy(pdst, loc, part, olx); 2665 inlen -= part; 2666 if (likely(!inlen)) { 2667 pdst += part; 2668 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2669 return (struct mlx5_wqe_dseg *)pdst; 2670 } 2671 pdst = (uint8_t *)txq->wqes; 2672 part = inlen; 2673 } while (true); 2674 } 2675 2676 /** 2677 * Build the Data Segment of pointer type. 2678 * 2679 * @param txq 2680 * Pointer to TX queue structure. 2681 * @param loc 2682 * Pointer to burst routine local context. 2683 * @param dseg 2684 * Pointer to WQE to fill with built Data Segment. 2685 * @param buf 2686 * Data buffer to point. 2687 * @param len 2688 * Data buffer length. 2689 * @param olx 2690 * Configured Tx offloads mask. 
It is fully defined at 2691 * compile time and may be used for optimization. 2692 */ 2693 static __rte_always_inline void 2694 mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq, 2695 struct mlx5_txq_local *restrict loc, 2696 struct mlx5_wqe_dseg *restrict dseg, 2697 uint8_t *buf, 2698 unsigned int len, 2699 unsigned int olx __rte_unused) 2700 2701 { 2702 assert(len); 2703 dseg->bcount = rte_cpu_to_be_32(len); 2704 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2705 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2706 } 2707 2708 /** 2709 * Build the Data Segment of pointer type or inline 2710 * if data length is less than buffer in minimal 2711 * Data Segment size. 2712 * 2713 * @param txq 2714 * Pointer to TX queue structure. 2715 * @param loc 2716 * Pointer to burst routine local context. 2717 * @param dseg 2718 * Pointer to WQE to fill with built Data Segment. 2719 * @param buf 2720 * Data buffer to point. 2721 * @param len 2722 * Data buffer length. 2723 * @param olx 2724 * Configured Tx offloads mask. It is fully defined at 2725 * compile time and may be used for optimization. 2726 */ 2727 static __rte_always_inline void 2728 mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, 2729 struct mlx5_txq_local *restrict loc, 2730 struct mlx5_wqe_dseg *restrict dseg, 2731 uint8_t *buf, 2732 unsigned int len, 2733 unsigned int olx __rte_unused) 2734 2735 { 2736 uintptr_t dst, src; 2737 2738 assert(len); 2739 if (len > MLX5_DSEG_MIN_INLINE_SIZE) { 2740 dseg->bcount = rte_cpu_to_be_32(len); 2741 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2742 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2743 2744 return; 2745 } 2746 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2747 /* Unrolled implementation of generic rte_memcpy. */ 2748 dst = (uintptr_t)&dseg->inline_data[0]; 2749 src = (uintptr_t)buf; 2750 #ifdef RTE_ARCH_STRICT_ALIGN 2751 memcpy(dst, src, len); 2752 #else 2753 if (len & 0x08) { 2754 *(uint64_t *)dst = *(uint64_t *)src; 2755 dst += sizeof(uint64_t); 2756 src += sizeof(uint64_t); 2757 } 2758 if (len & 0x04) { 2759 *(uint32_t *)dst = *(uint32_t *)src; 2760 dst += sizeof(uint32_t); 2761 src += sizeof(uint32_t); 2762 } 2763 if (len & 0x02) { 2764 *(uint16_t *)dst = *(uint16_t *)src; 2765 dst += sizeof(uint16_t); 2766 src += sizeof(uint16_t); 2767 } 2768 if (len & 0x01) 2769 *(uint8_t *)dst = *(uint8_t *)src; 2770 #endif 2771 } 2772 2773 /** 2774 * Build the Data Segment of inlined data from single 2775 * segment packet, no VLAN insertion. 2776 * 2777 * @param txq 2778 * Pointer to TX queue structure. 2779 * @param loc 2780 * Pointer to burst routine local context. 2781 * @param dseg 2782 * Pointer to WQE to fill with built Data Segment. 2783 * @param buf 2784 * Data buffer to point. 2785 * @param len 2786 * Data buffer length. 2787 * @param olx 2788 * Configured Tx offloads mask. It is fully defined at 2789 * compile time and may be used for optimization. 2790 * 2791 * @return 2792 * Pointer to the next Data Segment after inlined data. 2793 * Ring buffer wraparound check is needed. We do not 2794 * do it here because it may not be needed for the 2795 * last packet in the eMPW session. 
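 *
 *   A sketch of the wraparound handling expected on the caller side
 *   (illustrative only, it uses the same idiom as the callers in this
 *   file):
 *
 * @code
 *	dseg = mlx5_tx_dseg_empw(txq, loc, dseg, buf, len, olx);
 *	if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
 *		dseg = (struct mlx5_wqe_dseg *)txq->wqes;
 * @endcode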
2796 */ 2797 static __rte_always_inline struct mlx5_wqe_dseg * 2798 mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, 2799 struct mlx5_txq_local *restrict loc __rte_unused, 2800 struct mlx5_wqe_dseg *restrict dseg, 2801 uint8_t *buf, 2802 unsigned int len, 2803 unsigned int olx __rte_unused) 2804 { 2805 unsigned int part; 2806 uint8_t *pdst; 2807 2808 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2809 pdst = &dseg->inline_data[0]; 2810 /* 2811 * The WQEBB space availability is checked by caller. 2812 * Here we should be aware of WQE ring buffer wraparound only. 2813 */ 2814 part = (uint8_t *)txq->wqes_end - pdst; 2815 part = RTE_MIN(part, len); 2816 do { 2817 rte_memcpy(pdst, buf, part); 2818 len -= part; 2819 if (likely(!len)) { 2820 pdst += part; 2821 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2822 /* Note: no final wraparound check here. */ 2823 return (struct mlx5_wqe_dseg *)pdst; 2824 } 2825 pdst = (uint8_t *)txq->wqes; 2826 buf += part; 2827 part = len; 2828 } while (true); 2829 } 2830 2831 /** 2832 * Build the Data Segment of inlined data from single 2833 * segment packet with VLAN insertion. 2834 * 2835 * @param txq 2836 * Pointer to TX queue structure. 2837 * @param loc 2838 * Pointer to burst routine local context. 2839 * @param dseg 2840 * Pointer to the dseg fill with built Data Segment. 2841 * @param buf 2842 * Data buffer to point. 2843 * @param len 2844 * Data buffer length. 2845 * @param olx 2846 * Configured Tx offloads mask. It is fully defined at 2847 * compile time and may be used for optimization. 2848 * 2849 * @return 2850 * Pointer to the next Data Segment after inlined data. 2851 * Ring buffer wraparound check is needed. 2852 */ 2853 static __rte_always_inline struct mlx5_wqe_dseg * 2854 mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, 2855 struct mlx5_txq_local *restrict loc __rte_unused, 2856 struct mlx5_wqe_dseg *restrict dseg, 2857 uint8_t *buf, 2858 unsigned int len, 2859 unsigned int olx __rte_unused) 2860 2861 { 2862 unsigned int part; 2863 uint8_t *pdst; 2864 2865 assert(len > MLX5_ESEG_MIN_INLINE_SIZE); 2866 static_assert(MLX5_DSEG_MIN_INLINE_SIZE == 2867 (2 * RTE_ETHER_ADDR_LEN), 2868 "invalid Data Segment data size"); 2869 dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) | 2870 MLX5_ETH_WQE_DATA_INLINE); 2871 pdst = &dseg->inline_data[0]; 2872 memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); 2873 buf += MLX5_DSEG_MIN_INLINE_SIZE; 2874 pdst += MLX5_DSEG_MIN_INLINE_SIZE; 2875 len -= MLX5_DSEG_MIN_INLINE_SIZE; 2876 /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */ 2877 assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 2878 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 2879 pdst = (uint8_t *)txq->wqes; 2880 *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) | 2881 loc->mbuf->vlan_tci); 2882 pdst += sizeof(struct rte_vlan_hdr); 2883 /* 2884 * The WQEBB space availability is checked by caller. 2885 * Here we should be aware of WQE ring buffer wraparound only. 2886 */ 2887 part = (uint8_t *)txq->wqes_end - pdst; 2888 part = RTE_MIN(part, len); 2889 do { 2890 rte_memcpy(pdst, buf, part); 2891 len -= part; 2892 if (likely(!len)) { 2893 pdst += part; 2894 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2895 /* Note: no final wraparound check here. 
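			 * As with mlx5_tx_dseg_empw() above, the caller is
			 * expected to do the ring wraparound check on the
			 * returned pointer before using it.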
*/ 2896 return (struct mlx5_wqe_dseg *)pdst; 2897 } 2898 pdst = (uint8_t *)txq->wqes; 2899 buf += part; 2900 part = len; 2901 } while (true); 2902 } 2903 2904 /** 2905 * Build the Ethernet Segment with optionally inlined data with 2906 * VLAN insertion and following Data Segments (if any) from 2907 * multi-segment packet. Used by ordinary send and TSO. 2908 * 2909 * @param txq 2910 * Pointer to TX queue structure. 2911 * @param loc 2912 * Pointer to burst routine local context. 2913 * @param wqe 2914 * Pointer to WQE to fill with built Ethernet/Data Segments. 2915 * @param vlan 2916 * Length of VLAN header to insert, 0 means no VLAN insertion. 2917 * @param inlen 2918 * Data length to inline. For TSO this parameter specifies 2919 * exact value, for ordinary send routine can be aligned by 2920 * caller to provide better WQE space saving and data buffer 2921 * start address alignment. This length includes VLAN header 2922 * being inserted. 2923 * @param tso 2924 * Zero means ordinary send, inlined data can be extended, 2925 * otherwise this is TSO, inlined data length is fixed. 2926 * @param olx 2927 * Configured Tx offloads mask. It is fully defined at 2928 * compile time and may be used for optimization. 2929 * 2930 * @return 2931 * Actual size of built WQE in segments. 2932 */ 2933 static __rte_always_inline unsigned int 2934 mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, 2935 struct mlx5_txq_local *restrict loc, 2936 struct mlx5_wqe *restrict wqe, 2937 unsigned int vlan, 2938 unsigned int inlen, 2939 unsigned int tso, 2940 unsigned int olx __rte_unused) 2941 { 2942 struct mlx5_wqe_dseg *restrict dseg; 2943 unsigned int ds; 2944 2945 assert((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen); 2946 loc->mbuf_nseg = NB_SEGS(loc->mbuf); 2947 loc->mbuf_off = 0; 2948 2949 dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx); 2950 if (!loc->mbuf_nseg) 2951 goto dseg_done; 2952 /* 2953 * There are still some mbuf remaining, not inlined. 2954 * The first mbuf may be partially inlined and we 2955 * must process the possible non-zero data offset. 2956 */ 2957 if (loc->mbuf_off) { 2958 unsigned int dlen; 2959 uint8_t *dptr; 2960 2961 /* 2962 * Exhausted packets must be dropped before. 2963 * Non-zero offset means there are some data 2964 * remained in the packet. 2965 */ 2966 assert(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf)); 2967 assert(rte_pktmbuf_data_len(loc->mbuf)); 2968 dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 2969 loc->mbuf_off); 2970 dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off; 2971 /* 2972 * Build the pointer/minimal data Data Segment. 2973 * Do ring buffer wrapping check in advance. 2974 */ 2975 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 2976 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 2977 mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx); 2978 /* Store the mbuf to be freed on completion. */ 2979 assert(loc->elts_free); 2980 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 2981 --loc->elts_free; 2982 ++dseg; 2983 if (--loc->mbuf_nseg == 0) 2984 goto dseg_done; 2985 loc->mbuf = loc->mbuf->next; 2986 loc->mbuf_off = 0; 2987 } 2988 do { 2989 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 2990 struct rte_mbuf *mbuf; 2991 2992 /* Zero length segment found, just skip. 
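			 * The empty segment contributes no Data Segment to
			 * the WQE, so it is freed immediately and is not
			 * stored into elts; only the segment counter and
			 * the chain pointer are advanced.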
*/ 2993 mbuf = loc->mbuf; 2994 loc->mbuf = loc->mbuf->next; 2995 rte_pktmbuf_free_seg(mbuf); 2996 if (--loc->mbuf_nseg == 0) 2997 break; 2998 } else { 2999 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3000 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3001 mlx5_tx_dseg_iptr 3002 (txq, loc, dseg, 3003 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3004 rte_pktmbuf_data_len(loc->mbuf), olx); 3005 assert(loc->elts_free); 3006 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3007 --loc->elts_free; 3008 ++dseg; 3009 if (--loc->mbuf_nseg == 0) 3010 break; 3011 loc->mbuf = loc->mbuf->next; 3012 } 3013 } while (true); 3014 3015 dseg_done: 3016 /* Calculate actual segments used from the dseg pointer. */ 3017 if ((uintptr_t)wqe < (uintptr_t)dseg) 3018 ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE; 3019 else 3020 ds = (((uintptr_t)dseg - (uintptr_t)wqe) + 3021 txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE; 3022 return ds; 3023 } 3024 3025 /** 3026 * Tx one packet function for multi-segment TSO. Supports all 3027 * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs, 3028 * sends one packet per WQE. 3029 * 3030 * This routine is responsible for storing processed mbuf 3031 * into elts ring buffer and update elts_head. 3032 * 3033 * @param txq 3034 * Pointer to TX queue structure. 3035 * @param loc 3036 * Pointer to burst routine local context. 3037 * @param olx 3038 * Configured Tx offloads mask. It is fully defined at 3039 * compile time and may be used for optimization. 3040 * 3041 * @return 3042 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3043 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3044 * Local context variables partially updated. 3045 */ 3046 static __rte_always_inline enum mlx5_txcmp_code 3047 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, 3048 struct mlx5_txq_local *restrict loc, 3049 unsigned int olx) 3050 { 3051 struct mlx5_wqe *restrict wqe; 3052 unsigned int ds, dlen, inlen, ntcp, vlan = 0; 3053 3054 /* 3055 * Calculate data length to be inlined to estimate 3056 * the required space in WQE ring buffer. 3057 */ 3058 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3059 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3060 vlan = sizeof(struct rte_vlan_hdr); 3061 inlen = loc->mbuf->l2_len + vlan + 3062 loc->mbuf->l3_len + loc->mbuf->l4_len; 3063 if (unlikely((!inlen || !loc->mbuf->tso_segsz))) 3064 return MLX5_TXCMP_CODE_ERROR; 3065 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3066 inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len; 3067 /* Packet must contain all TSO headers. */ 3068 if (unlikely(inlen > MLX5_MAX_TSO_HEADER || 3069 inlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3070 inlen > (dlen + vlan))) 3071 return MLX5_TXCMP_CODE_ERROR; 3072 assert(inlen >= txq->inlen_mode); 3073 /* 3074 * Check whether there are enough free WQEBBs: 3075 * - Control Segment 3076 * - Ethernet Segment 3077 * - First Segment of inlined Ethernet data 3078 * - ... data continued ... 3079 * - Data Segments of pointer/min inline type 3080 */ 3081 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3082 MLX5_ESEG_MIN_INLINE_SIZE + 3083 MLX5_WSEG_SIZE + 3084 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3085 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3086 return MLX5_TXCMP_CODE_EXIT; 3087 /* Check for maximal WQE size. */ 3088 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3089 return MLX5_TXCMP_CODE_ERROR; 3090 #ifdef MLX5_PMD_SOFT_COUNTERS 3091 /* Update sent data bytes/packets counters. 
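	 * Worked example (numbers are hypothetical): with dlen == 9000,
	 * TSO headers inlen - vlan == 54 and tso_segsz == 1460 the TCP
	 * payload is 8946 bytes, so ntcp == (8946 + 1459) / 1460 == 7
	 * segments; after the decrement below 6 are counted here and the
	 * 7th is accounted via loc->pkts_sent at the end of the burst.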
*/ 3092 ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) / 3093 loc->mbuf->tso_segsz; 3094 /* 3095 * One will be added for mbuf itself 3096 * at the end of the mlx5_tx_burst from 3097 * loc->pkts_sent field. 3098 */ 3099 --ntcp; 3100 txq->stats.opackets += ntcp; 3101 txq->stats.obytes += dlen + vlan + ntcp * inlen; 3102 #endif 3103 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3104 loc->wqe_last = wqe; 3105 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx); 3106 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx); 3107 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3108 txq->wqe_ci += (ds + 3) / 4; 3109 loc->wqe_free -= (ds + 3) / 4; 3110 /* Request CQE generation if limits are reached. */ 3111 mlx5_tx_request_completion(txq, loc, true, olx); 3112 return MLX5_TXCMP_CODE_MULTI; 3113 } 3114 3115 /** 3116 * Tx one packet function for multi-segment SEND. Supports all 3117 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3118 * sends one packet per WQE, without any data inlining in 3119 * Ethernet Segment. 3120 * 3121 * This routine is responsible for storing processed mbuf 3122 * into elts ring buffer and update elts_head. 3123 * 3124 * @param txq 3125 * Pointer to TX queue structure. 3126 * @param loc 3127 * Pointer to burst routine local context. 3128 * @param olx 3129 * Configured Tx offloads mask. It is fully defined at 3130 * compile time and may be used for optimization. 3131 * 3132 * @return 3133 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3134 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3135 * Local context variables partially updated. 3136 */ 3137 static __rte_always_inline enum mlx5_txcmp_code 3138 mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, 3139 struct mlx5_txq_local *restrict loc, 3140 unsigned int olx) 3141 { 3142 struct mlx5_wqe_dseg *restrict dseg; 3143 struct mlx5_wqe *restrict wqe; 3144 unsigned int ds, nseg; 3145 3146 assert(NB_SEGS(loc->mbuf) > 1); 3147 /* 3148 * No inline at all, it means the CPU cycles saving 3149 * is prioritized at configuration, we should not 3150 * copy any packet data to WQE. 3151 */ 3152 nseg = NB_SEGS(loc->mbuf); 3153 ds = 2 + nseg; 3154 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3155 return MLX5_TXCMP_CODE_EXIT; 3156 /* Check for maximal WQE size. */ 3157 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3158 return MLX5_TXCMP_CODE_ERROR; 3159 /* 3160 * Some Tx offloads may cause an error if 3161 * packet is not long enough, check against 3162 * assumed minimal length. 3163 */ 3164 if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE) 3165 return MLX5_TXCMP_CODE_ERROR; 3166 #ifdef MLX5_PMD_SOFT_COUNTERS 3167 /* Update sent data bytes counter. */ 3168 txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf); 3169 if (MLX5_TXOFF_CONFIG(VLAN) && 3170 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3171 txq->stats.obytes += sizeof(struct rte_vlan_hdr); 3172 #endif 3173 /* 3174 * SEND WQE, one WQEBB: 3175 * - Control Segment, SEND opcode 3176 * - Ethernet Segment, optional VLAN, no inline 3177 * - Data Segments, pointer only type 3178 */ 3179 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3180 loc->wqe_last = wqe; 3181 mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx); 3182 mlx5_tx_eseg_none(txq, loc, wqe, olx); 3183 dseg = &wqe->dseg[0]; 3184 do { 3185 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3186 struct rte_mbuf *mbuf; 3187 3188 /* 3189 * Zero length segment found, have to 3190 * correct total size of WQE in segments. 
3191 * It is supposed to be rare occasion, so 3192 * in normal case (no zero length segments) 3193 * we avoid extra writing to the Control 3194 * Segment. 3195 */ 3196 --ds; 3197 wqe->cseg.sq_ds -= RTE_BE32(1); 3198 mbuf = loc->mbuf; 3199 loc->mbuf = mbuf->next; 3200 rte_pktmbuf_free_seg(mbuf); 3201 if (--nseg == 0) 3202 break; 3203 } else { 3204 mlx5_tx_dseg_ptr 3205 (txq, loc, dseg, 3206 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3207 rte_pktmbuf_data_len(loc->mbuf), olx); 3208 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3209 --loc->elts_free; 3210 if (--nseg == 0) 3211 break; 3212 ++dseg; 3213 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3214 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3215 loc->mbuf = loc->mbuf->next; 3216 } 3217 } while (true); 3218 txq->wqe_ci += (ds + 3) / 4; 3219 loc->wqe_free -= (ds + 3) / 4; 3220 /* Request CQE generation if limits are reached. */ 3221 mlx5_tx_request_completion(txq, loc, true, olx); 3222 return MLX5_TXCMP_CODE_MULTI; 3223 } 3224 3225 /** 3226 * Tx one packet function for multi-segment SEND. Supports all 3227 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3228 * sends one packet per WQE, with data inlining in 3229 * Ethernet Segment and minimal Data Segments. 3230 * 3231 * This routine is responsible for storing processed mbuf 3232 * into elts ring buffer and update elts_head. 3233 * 3234 * @param txq 3235 * Pointer to TX queue structure. 3236 * @param loc 3237 * Pointer to burst routine local context. 3238 * @param olx 3239 * Configured Tx offloads mask. It is fully defined at 3240 * compile time and may be used for optimization. 3241 * 3242 * @return 3243 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3244 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3245 * Local context variables partially updated. 3246 */ 3247 static __rte_always_inline enum mlx5_txcmp_code 3248 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq, 3249 struct mlx5_txq_local *restrict loc, 3250 unsigned int olx) 3251 { 3252 struct mlx5_wqe *restrict wqe; 3253 unsigned int ds, inlen, dlen, vlan = 0; 3254 3255 assert(MLX5_TXOFF_CONFIG(INLINE)); 3256 assert(NB_SEGS(loc->mbuf) > 1); 3257 /* 3258 * First calculate data length to be inlined 3259 * to estimate the required space for WQE. 3260 */ 3261 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3262 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3263 vlan = sizeof(struct rte_vlan_hdr); 3264 inlen = dlen + vlan; 3265 /* Check against minimal length. */ 3266 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 3267 return MLX5_TXCMP_CODE_ERROR; 3268 assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); 3269 if (inlen > txq->inlen_send) { 3270 struct rte_mbuf *mbuf; 3271 unsigned int nxlen; 3272 uintptr_t start; 3273 3274 /* 3275 * Packet length exceeds the allowed inline 3276 * data length, check whether the minimal 3277 * inlining is required. 3278 */ 3279 if (txq->inlen_mode) { 3280 assert(txq->inlen_mode >= MLX5_ESEG_MIN_INLINE_SIZE); 3281 assert(txq->inlen_mode <= txq->inlen_send); 3282 inlen = txq->inlen_mode; 3283 } else { 3284 if (!vlan || txq->vlan_en) { 3285 /* 3286 * VLAN insertion will be done inside by HW. 3287 * It is not utmost effective - VLAN flag is 3288 * checked twice, but we should proceed the 3289 * inlining length correctly and take into 3290 * account the VLAN header being inserted. 
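				 * Fall back to the pointer-only multi-
				 * segment SEND below: no packet data is
				 * copied into the WQE and, when a VLAN tag
				 * is requested, the NIC inserts it from the
				 * Ethernet Segment.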
3291 */ 3292 return mlx5_tx_packet_multi_send 3293 (txq, loc, olx); 3294 } 3295 inlen = MLX5_ESEG_MIN_INLINE_SIZE; 3296 } 3297 /* 3298 * Now we know the minimal amount of data is requested 3299 * to inline. Check whether we should inline the buffers 3300 * from the chain beginning to eliminate some mbufs. 3301 */ 3302 mbuf = loc->mbuf; 3303 nxlen = rte_pktmbuf_data_len(mbuf); 3304 if (unlikely(nxlen <= txq->inlen_send)) { 3305 /* We can inline first mbuf at least. */ 3306 if (nxlen < inlen) { 3307 unsigned int smlen; 3308 3309 /* Scan mbufs till inlen filled. */ 3310 do { 3311 smlen = nxlen; 3312 mbuf = NEXT(mbuf); 3313 assert(mbuf); 3314 nxlen = rte_pktmbuf_data_len(mbuf); 3315 nxlen += smlen; 3316 } while (unlikely(nxlen < inlen)); 3317 if (unlikely(nxlen > txq->inlen_send)) { 3318 /* We cannot inline entire mbuf. */ 3319 smlen = inlen - smlen; 3320 start = rte_pktmbuf_mtod_offset 3321 (mbuf, uintptr_t, smlen); 3322 goto do_align; 3323 } 3324 } 3325 do { 3326 inlen = nxlen; 3327 mbuf = NEXT(mbuf); 3328 /* There should be not end of packet. */ 3329 assert(mbuf); 3330 nxlen = inlen + rte_pktmbuf_data_len(mbuf); 3331 } while (unlikely(nxlen < txq->inlen_send)); 3332 } 3333 start = rte_pktmbuf_mtod(mbuf, uintptr_t); 3334 /* 3335 * Check whether we can do inline to align start 3336 * address of data buffer to cacheline. 3337 */ 3338 do_align: 3339 start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1); 3340 if (unlikely(start)) { 3341 start += inlen; 3342 if (start <= txq->inlen_send) 3343 inlen = start; 3344 } 3345 } 3346 /* 3347 * Check whether there are enough free WQEBBs: 3348 * - Control Segment 3349 * - Ethernet Segment 3350 * - First Segment of inlined Ethernet data 3351 * - ... data continued ... 3352 * - Data Segments of pointer/min inline type 3353 * 3354 * Estimate the number of Data Segments conservatively, 3355 * supposing no any mbufs is being freed during inlining. 3356 */ 3357 assert(inlen <= txq->inlen_send); 3358 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3359 MLX5_ESEG_MIN_INLINE_SIZE + 3360 MLX5_WSEG_SIZE + 3361 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3362 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3363 return MLX5_TXCMP_CODE_EXIT; 3364 /* Check for maximal WQE size. */ 3365 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3366 return MLX5_TXCMP_CODE_ERROR; 3367 #ifdef MLX5_PMD_SOFT_COUNTERS 3368 /* Update sent data bytes/packets counters. */ 3369 txq->stats.obytes += dlen + vlan; 3370 #endif 3371 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3372 loc->wqe_last = wqe; 3373 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx); 3374 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx); 3375 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3376 txq->wqe_ci += (ds + 3) / 4; 3377 loc->wqe_free -= (ds + 3) / 4; 3378 /* Request CQE generation if limits are reached. */ 3379 mlx5_tx_request_completion(txq, loc, true, olx); 3380 return MLX5_TXCMP_CODE_MULTI; 3381 } 3382 3383 /** 3384 * Tx burst function for multi-segment packets. Supports all 3385 * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs, 3386 * sends one packet per WQE. Function stops sending if it 3387 * encounters the single-segment packet. 3388 * 3389 * This routine is responsible for storing processed mbuf 3390 * into elts ring buffer and update elts_head. 3391 * 3392 * @param txq 3393 * Pointer to TX queue structure. 3394 * @param[in] pkts 3395 * Packets to transmit. 3396 * @param pkts_n 3397 * Number of packets in array. 
3398 * @param loc 3399 * Pointer to burst routine local context. 3400 * @param olx 3401 * Configured Tx offloads mask. It is fully defined at 3402 * compile time and may be used for optimization. 3403 * 3404 * @return 3405 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3406 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3407 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3408 * MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered. 3409 * Local context variables updated. 3410 */ 3411 static __rte_always_inline enum mlx5_txcmp_code 3412 mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq, 3413 struct rte_mbuf **restrict pkts, 3414 unsigned int pkts_n, 3415 struct mlx5_txq_local *restrict loc, 3416 unsigned int olx) 3417 { 3418 assert(loc->elts_free && loc->wqe_free); 3419 assert(pkts_n > loc->pkts_sent); 3420 pkts += loc->pkts_sent + 1; 3421 pkts_n -= loc->pkts_sent; 3422 for (;;) { 3423 enum mlx5_txcmp_code ret; 3424 3425 assert(NB_SEGS(loc->mbuf) > 1); 3426 /* 3427 * Estimate the number of free elts quickly but 3428 * conservatively. Some segment may be fully inlined 3429 * and freed, ignore this here - precise estimation 3430 * is costly. 3431 */ 3432 if (loc->elts_free < NB_SEGS(loc->mbuf)) 3433 return MLX5_TXCMP_CODE_EXIT; 3434 if (MLX5_TXOFF_CONFIG(TSO) && 3435 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) { 3436 /* Proceed with multi-segment TSO. */ 3437 ret = mlx5_tx_packet_multi_tso(txq, loc, olx); 3438 } else if (MLX5_TXOFF_CONFIG(INLINE)) { 3439 /* Proceed with multi-segment SEND with inlining. */ 3440 ret = mlx5_tx_packet_multi_inline(txq, loc, olx); 3441 } else { 3442 /* Proceed with multi-segment SEND w/o inlining. */ 3443 ret = mlx5_tx_packet_multi_send(txq, loc, olx); 3444 } 3445 if (ret == MLX5_TXCMP_CODE_EXIT) 3446 return MLX5_TXCMP_CODE_EXIT; 3447 if (ret == MLX5_TXCMP_CODE_ERROR) 3448 return MLX5_TXCMP_CODE_ERROR; 3449 /* WQE is built, go to the next packet. */ 3450 ++loc->pkts_sent; 3451 --pkts_n; 3452 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3453 return MLX5_TXCMP_CODE_EXIT; 3454 loc->mbuf = *pkts++; 3455 if (pkts_n > 1) 3456 rte_prefetch0(*pkts); 3457 if (likely(NB_SEGS(loc->mbuf) > 1)) 3458 continue; 3459 /* Here ends the series of multi-segment packets. */ 3460 if (MLX5_TXOFF_CONFIG(TSO) && 3461 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3462 return MLX5_TXCMP_CODE_TSO; 3463 return MLX5_TXCMP_CODE_SINGLE; 3464 } 3465 assert(false); 3466 } 3467 3468 /** 3469 * Tx burst function for single-segment packets with TSO. 3470 * Supports all types of Tx offloads, except multi-packets. 3471 * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE. 3472 * Function stops sending if it encounters the multi-segment 3473 * packet or packet without TSO requested. 3474 * 3475 * The routine is responsible for storing processed mbuf 3476 * into elts ring buffer and update elts_head if inline 3477 * offloads is requested due to possible early freeing 3478 * of the inlined mbufs (can not store pkts array in elts 3479 * as a batch). 3480 * 3481 * @param txq 3482 * Pointer to TX queue structure. 3483 * @param[in] pkts 3484 * Packets to transmit. 3485 * @param pkts_n 3486 * Number of packets in array. 3487 * @param loc 3488 * Pointer to burst routine local context. 3489 * @param olx 3490 * Configured Tx offloads mask. It is fully defined at 3491 * compile time and may be used for optimization. 3492 * 3493 * @return 3494 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 
3495 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3496 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3497 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3498 * Local context variables updated. 3499 */ 3500 static __rte_always_inline enum mlx5_txcmp_code 3501 mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq, 3502 struct rte_mbuf **restrict pkts, 3503 unsigned int pkts_n, 3504 struct mlx5_txq_local *restrict loc, 3505 unsigned int olx) 3506 { 3507 assert(loc->elts_free && loc->wqe_free); 3508 assert(pkts_n > loc->pkts_sent); 3509 pkts += loc->pkts_sent + 1; 3510 pkts_n -= loc->pkts_sent; 3511 for (;;) { 3512 struct mlx5_wqe_dseg *restrict dseg; 3513 struct mlx5_wqe *restrict wqe; 3514 unsigned int ds, dlen, hlen, ntcp, vlan = 0; 3515 uint8_t *dptr; 3516 3517 assert(NB_SEGS(loc->mbuf) == 1); 3518 dlen = rte_pktmbuf_data_len(loc->mbuf); 3519 if (MLX5_TXOFF_CONFIG(VLAN) && 3520 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 3521 vlan = sizeof(struct rte_vlan_hdr); 3522 } 3523 /* 3524 * First calculate the WQE size to check 3525 * whether we have enough space in ring buffer. 3526 */ 3527 hlen = loc->mbuf->l2_len + vlan + 3528 loc->mbuf->l3_len + loc->mbuf->l4_len; 3529 if (unlikely((!hlen || !loc->mbuf->tso_segsz))) 3530 return MLX5_TXCMP_CODE_ERROR; 3531 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3532 hlen += loc->mbuf->outer_l2_len + 3533 loc->mbuf->outer_l3_len; 3534 /* Segment must contain all TSO headers. */ 3535 if (unlikely(hlen > MLX5_MAX_TSO_HEADER || 3536 hlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3537 hlen > (dlen + vlan))) 3538 return MLX5_TXCMP_CODE_ERROR; 3539 /* 3540 * Check whether there are enough free WQEBBs: 3541 * - Control Segment 3542 * - Ethernet Segment 3543 * - First Segment of inlined Ethernet data 3544 * - ... data continued ... 3545 * - Finishing Data Segment of pointer type 3546 */ 3547 ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE + 3548 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3549 if (loc->wqe_free < ((ds + 3) / 4)) 3550 return MLX5_TXCMP_CODE_EXIT; 3551 #ifdef MLX5_PMD_SOFT_COUNTERS 3552 /* Update sent data bytes/packets counters. */ 3553 ntcp = (dlen + vlan - hlen + 3554 loc->mbuf->tso_segsz - 1) / 3555 loc->mbuf->tso_segsz; 3556 /* 3557 * One will be added for mbuf itself at the end 3558 * of the mlx5_tx_burst from loc->pkts_sent field. 3559 */ 3560 --ntcp; 3561 txq->stats.opackets += ntcp; 3562 txq->stats.obytes += dlen + vlan + ntcp * hlen; 3563 #endif 3564 /* 3565 * Build the TSO WQE: 3566 * - Control Segment 3567 * - Ethernet Segment with hlen bytes inlined 3568 * - Data Segment of pointer type 3569 */ 3570 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3571 loc->wqe_last = wqe; 3572 mlx5_tx_cseg_init(txq, loc, wqe, ds, 3573 MLX5_OPCODE_TSO, olx); 3574 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx); 3575 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan; 3576 dlen -= hlen - vlan; 3577 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 3578 /* 3579 * WQE is built, update the loop parameters 3580 * and go to the next packet. 3581 */ 3582 txq->wqe_ci += (ds + 3) / 4; 3583 loc->wqe_free -= (ds + 3) / 4; 3584 if (MLX5_TXOFF_CONFIG(INLINE)) 3585 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3586 --loc->elts_free; 3587 ++loc->pkts_sent; 3588 --pkts_n; 3589 /* Request CQE generation if limits are reached. 
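		 * The 'multi' argument is false below: this is the
		 * single-segment TSO path, so the pkts_copy based
		 * elts_head adjustment in mlx5_tx_request_completion()
		 * stays in effect when inlining is not configured.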
*/ 3590 mlx5_tx_request_completion(txq, loc, false, olx); 3591 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3592 return MLX5_TXCMP_CODE_EXIT; 3593 loc->mbuf = *pkts++; 3594 if (pkts_n > 1) 3595 rte_prefetch0(*pkts); 3596 if (MLX5_TXOFF_CONFIG(MULTI) && 3597 unlikely(NB_SEGS(loc->mbuf) > 1)) 3598 return MLX5_TXCMP_CODE_MULTI; 3599 if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) 3600 return MLX5_TXCMP_CODE_SINGLE; 3601 /* Continue with the next TSO packet. */ 3602 } 3603 assert(false); 3604 } 3605 3606 /** 3607 * Analyze the packet and select the best method to send. 3608 * 3609 * @param txq 3610 * Pointer to TX queue structure. 3611 * @param loc 3612 * Pointer to burst routine local context. 3613 * @param olx 3614 * Configured Tx offloads mask. It is fully defined at 3615 * compile time and may be used for optimization. 3616 * @param newp 3617 * The predefined flag whether do complete check for 3618 * multi-segment packets and TSO. 3619 * 3620 * @return 3621 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3622 * MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO. 3623 * MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND. 3624 * MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW. 3625 */ 3626 static __rte_always_inline enum mlx5_txcmp_code 3627 mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq, 3628 struct mlx5_txq_local *restrict loc, 3629 unsigned int olx, 3630 bool newp) 3631 { 3632 /* Check for multi-segment packet. */ 3633 if (newp && 3634 MLX5_TXOFF_CONFIG(MULTI) && 3635 unlikely(NB_SEGS(loc->mbuf) > 1)) 3636 return MLX5_TXCMP_CODE_MULTI; 3637 /* Check for TSO packet. */ 3638 if (newp && 3639 MLX5_TXOFF_CONFIG(TSO) && 3640 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3641 return MLX5_TXCMP_CODE_TSO; 3642 /* Check if eMPW is enabled at all. */ 3643 if (!MLX5_TXOFF_CONFIG(EMPW)) 3644 return MLX5_TXCMP_CODE_SINGLE; 3645 /* Check if eMPW can be engaged. */ 3646 if (MLX5_TXOFF_CONFIG(VLAN) && 3647 unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) && 3648 (!MLX5_TXOFF_CONFIG(INLINE) || 3649 unlikely((rte_pktmbuf_data_len(loc->mbuf) + 3650 sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) { 3651 /* 3652 * eMPW does not support VLAN insertion offload, 3653 * we have to inline the entire packet but 3654 * packet is too long for inlining. 3655 */ 3656 return MLX5_TXCMP_CODE_SINGLE; 3657 } 3658 return MLX5_TXCMP_CODE_EMPW; 3659 } 3660 3661 /** 3662 * Check the next packet attributes to match with the eMPW batch ones. 3663 * 3664 * @param txq 3665 * Pointer to TX queue structure. 3666 * @param es 3667 * Pointer to Ethernet Segment of eMPW batch. 3668 * @param loc 3669 * Pointer to burst routine local context. 3670 * @param olx 3671 * Configured Tx offloads mask. It is fully defined at 3672 * compile time and may be used for optimization. 3673 * 3674 * @return 3675 * true - packet match with eMPW batch attributes. 3676 * false - no match, eMPW should be restarted. 3677 */ 3678 static __rte_always_inline bool 3679 mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused, 3680 struct mlx5_wqe_eseg *restrict es, 3681 struct mlx5_txq_local *restrict loc, 3682 unsigned int olx) 3683 { 3684 uint8_t swp_flags = 0; 3685 3686 /* Compare the checksum flags, if any. */ 3687 if (MLX5_TXOFF_CONFIG(CSUM) && 3688 txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags) 3689 return false; 3690 /* Compare the Software Parser offsets and flags. 
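	 * The offsets and flags are recomputed for the candidate mbuf by
	 * txq_mbuf_to_swp() and must match the batch Ethernet Segment
	 * exactly, otherwise the eMPW session has to be restarted.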
*/ 3691 if (MLX5_TXOFF_CONFIG(SWP) && 3692 (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) || 3693 es->swp_flags != swp_flags)) 3694 return false; 3695 /* Fill metadata field if needed. */ 3696 if (MLX5_TXOFF_CONFIG(METADATA) && 3697 es->metadata != (loc->mbuf->ol_flags & PKT_TX_METADATA ? 3698 loc->mbuf->tx_metadata : 0)) 3699 return false; 3700 /* There must be no VLAN packets in eMPW loop. */ 3701 if (MLX5_TXOFF_CONFIG(VLAN)) 3702 assert(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)); 3703 return true; 3704 } 3705 3706 /* 3707 * Update send loop variables and WQE for eMPW loop 3708 * without data inlining. Number of Data Segments is 3709 * equal to the number of sent packets. 3710 * 3711 * @param txq 3712 * Pointer to TX queue structure. 3713 * @param loc 3714 * Pointer to burst routine local context. 3715 * @param ds 3716 * Number of packets/Data Segments/Packets. 3717 * @param slen 3718 * Accumulated statistics, bytes sent 3719 * @param olx 3720 * Configured Tx offloads mask. It is fully defined at 3721 * compile time and may be used for optimization. 3722 * 3723 * @return 3724 * true - packet match with eMPW batch attributes. 3725 * false - no match, eMPW should be restarted. 3726 */ 3727 static __rte_always_inline void 3728 mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq, 3729 struct mlx5_txq_local *restrict loc, 3730 unsigned int ds, 3731 unsigned int slen, 3732 unsigned int olx) 3733 { 3734 assert(!MLX5_TXOFF_CONFIG(INLINE)); 3735 #ifdef MLX5_PMD_SOFT_COUNTERS 3736 /* Update sent data bytes counter. */ 3737 txq->stats.obytes += slen; 3738 #else 3739 (void)slen; 3740 #endif 3741 loc->elts_free -= ds; 3742 loc->pkts_sent += ds; 3743 ds += 2; 3744 loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3745 txq->wqe_ci += (ds + 3) / 4; 3746 loc->wqe_free -= (ds + 3) / 4; 3747 /* Request CQE generation if limits are reached. */ 3748 mlx5_tx_request_completion(txq, loc, false, olx); 3749 } 3750 3751 /* 3752 * Update send loop variables and WQE for eMPW loop 3753 * with data inlining. Gets the size of pushed descriptors 3754 * and data to the WQE. 3755 * 3756 * @param txq 3757 * Pointer to TX queue structure. 3758 * @param loc 3759 * Pointer to burst routine local context. 3760 * @param len 3761 * Total size of descriptor/data in bytes. 3762 * @param slen 3763 * Accumulated statistics, data bytes sent. 3764 * @param olx 3765 * Configured Tx offloads mask. It is fully defined at 3766 * compile time and may be used for optimization. 3767 * 3768 * @return 3769 * true - packet match with eMPW batch attributes. 3770 * false - no match, eMPW should be restarted. 3771 */ 3772 static __rte_always_inline void 3773 mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq, 3774 struct mlx5_txq_local *restrict loc, 3775 unsigned int len, 3776 unsigned int slen, 3777 unsigned int olx __rte_unused) 3778 { 3779 assert(MLX5_TXOFF_CONFIG(INLINE)); 3780 assert((len % MLX5_WSEG_SIZE) == 0); 3781 #ifdef MLX5_PMD_SOFT_COUNTERS 3782 /* Update sent data bytes counter. */ 3783 txq->stats.obytes += slen; 3784 #else 3785 (void)slen; 3786 #endif 3787 len = len / MLX5_WSEG_SIZE + 2; 3788 loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len); 3789 txq->wqe_ci += (len + 3) / 4; 3790 loc->wqe_free -= (len + 3) / 4; 3791 /* Request CQE generation if limits are reached. */ 3792 mlx5_tx_request_completion(txq, loc, false, olx); 3793 } 3794 3795 /** 3796 * The set of Tx burst functions for single-segment packets 3797 * without TSO and with Multi-Packet Writing feature support. 
3798 * Supports all types of Tx offloads, except multi-packets 3799 * and TSO. 3800 * 3801 * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends 3802 * as many packet per WQE as it can. If eMPW is not configured 3803 * or packet can not be sent with eMPW (VLAN insertion) the 3804 * ordinary SEND opcode is used and only one packet placed 3805 * in WQE. 3806 * 3807 * Functions stop sending if it encounters the multi-segment 3808 * packet or packet with TSO requested. 3809 * 3810 * The routines are responsible for storing processed mbuf 3811 * into elts ring buffer and update elts_head if inlining 3812 * offload is requested. Otherwise the copying mbufs to elts 3813 * can be postponed and completed at the end of burst routine. 3814 * 3815 * @param txq 3816 * Pointer to TX queue structure. 3817 * @param[in] pkts 3818 * Packets to transmit. 3819 * @param pkts_n 3820 * Number of packets in array. 3821 * @param loc 3822 * Pointer to burst routine local context. 3823 * @param olx 3824 * Configured Tx offloads mask. It is fully defined at 3825 * compile time and may be used for optimization. 3826 * 3827 * @return 3828 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3829 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3830 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3831 * MLX5_TXCMP_CODE_TSO - TSO packet encountered. 3832 * MLX5_TXCMP_CODE_SINGLE - used inside functions set. 3833 * MLX5_TXCMP_CODE_EMPW - used inside functions set. 3834 * 3835 * Local context variables updated. 3836 * 3837 * 3838 * The routine sends packets with MLX5_OPCODE_EMPW 3839 * without inlining, this is dedicated optimized branch. 3840 * No VLAN insertion is supported. 3841 */ 3842 static __rte_always_inline enum mlx5_txcmp_code 3843 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq, 3844 struct rte_mbuf **restrict pkts, 3845 unsigned int pkts_n, 3846 struct mlx5_txq_local *restrict loc, 3847 unsigned int olx) 3848 { 3849 /* 3850 * Subroutine is the part of mlx5_tx_burst_single() 3851 * and sends single-segment packet with eMPW opcode 3852 * without data inlining. 3853 */ 3854 assert(!MLX5_TXOFF_CONFIG(INLINE)); 3855 assert(MLX5_TXOFF_CONFIG(EMPW)); 3856 assert(loc->elts_free && loc->wqe_free); 3857 assert(pkts_n > loc->pkts_sent); 3858 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 3859 pkts += loc->pkts_sent + 1; 3860 pkts_n -= loc->pkts_sent; 3861 for (;;) { 3862 struct mlx5_wqe_dseg *restrict dseg; 3863 struct mlx5_wqe_eseg *restrict eseg; 3864 enum mlx5_txcmp_code ret; 3865 unsigned int part, loop; 3866 unsigned int slen = 0; 3867 3868 next_empw: 3869 part = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS); 3870 if (unlikely(loc->elts_free < part)) { 3871 /* We have no enough elts to save all mbufs. */ 3872 if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS)) 3873 return MLX5_TXCMP_CODE_EXIT; 3874 /* But we still able to send at least minimal eMPW. 
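 *
 * The batch size is clamped here and just below by three resources:
 * packets left in the burst, free elts entries and free WQEBBs. The same
 * arithmetic as a standalone sketch (hypothetical helper; the minimal and
 * maximal batch limits are configuration constants, see mlx5_defs.h):
 *
 *   // Returns 0 when even the minimal eMPW cannot be built.
 *   static unsigned int empw_batch_size(unsigned int pkts_n,
 *                                       unsigned int elts_free,
 *                                       unsigned int wqe_free,
 *                                       unsigned int min_pkts,
 *                                       unsigned int max_pkts)
 *   {
 *       unsigned int part = pkts_n < max_pkts ? pkts_n : max_pkts;
 *
 *       if (elts_free < part) {
 *           if (elts_free < min_pkts)
 *               return 0;                    // not enough mbuf slots
 *           part = elts_free;
 *       }
 *       // One eMPW takes the shared Control and Ethernet Segments
 *       // (2 x 16B) plus one 16-byte Data Segment per packet, rounded
 *       // up to whole 64-byte WQEBBs.
 *       if (wqe_free < (2 + part + 3) / 4) {
 *           if (wqe_free < (2 + min_pkts + 3) / 4)
 *               return 0;                    // not enough WQEBBs
 *           part = wqe_free * 4 - 2;
 *       }
 *       return part;
 *   }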
*/ 3875 part = loc->elts_free; 3876 } 3877 /* Check whether we have enough WQEs */ 3878 if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) { 3879 if (unlikely(loc->wqe_free < 3880 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 3881 return MLX5_TXCMP_CODE_EXIT; 3882 part = (loc->wqe_free * 4) - 2; 3883 } 3884 if (likely(part > 1)) 3885 rte_prefetch0(*pkts); 3886 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3887 /* 3888 * Build eMPW title WQEBB: 3889 * - Control Segment, eMPW opcode 3890 * - Ethernet Segment, no inline 3891 */ 3892 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2, 3893 MLX5_OPCODE_ENHANCED_MPSW, olx); 3894 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 3895 olx & ~MLX5_TXOFF_CONFIG_VLAN); 3896 eseg = &loc->wqe_last->eseg; 3897 dseg = &loc->wqe_last->dseg[0]; 3898 loop = part; 3899 for (;;) { 3900 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 3901 #ifdef MLX5_PMD_SOFT_COUNTERS 3902 /* Update sent data bytes counter. */ 3903 slen += dlen; 3904 #endif 3905 mlx5_tx_dseg_ptr 3906 (txq, loc, dseg, 3907 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3908 dlen, olx); 3909 if (unlikely(--loop == 0)) 3910 break; 3911 loc->mbuf = *pkts++; 3912 if (likely(loop > 1)) 3913 rte_prefetch0(*pkts); 3914 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 3915 /* 3916 * Unroll the completion code to avoid 3917 * returning variable value - it results in 3918 * unoptimized sequent checking in caller. 3919 */ 3920 if (ret == MLX5_TXCMP_CODE_MULTI) { 3921 part -= loop; 3922 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3923 if (unlikely(!loc->elts_free || 3924 !loc->wqe_free)) 3925 return MLX5_TXCMP_CODE_EXIT; 3926 return MLX5_TXCMP_CODE_MULTI; 3927 } 3928 if (ret == MLX5_TXCMP_CODE_TSO) { 3929 part -= loop; 3930 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3931 if (unlikely(!loc->elts_free || 3932 !loc->wqe_free)) 3933 return MLX5_TXCMP_CODE_EXIT; 3934 return MLX5_TXCMP_CODE_TSO; 3935 } 3936 if (ret == MLX5_TXCMP_CODE_SINGLE) { 3937 part -= loop; 3938 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3939 if (unlikely(!loc->elts_free || 3940 !loc->wqe_free)) 3941 return MLX5_TXCMP_CODE_EXIT; 3942 return MLX5_TXCMP_CODE_SINGLE; 3943 } 3944 if (ret != MLX5_TXCMP_CODE_EMPW) { 3945 assert(false); 3946 part -= loop; 3947 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3948 return MLX5_TXCMP_CODE_ERROR; 3949 } 3950 /* 3951 * Check whether packet parameters coincide 3952 * within assumed eMPW batch: 3953 * - check sum settings 3954 * - metadata value 3955 * - software parser settings 3956 */ 3957 if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) { 3958 assert(loop); 3959 part -= loop; 3960 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3961 if (unlikely(!loc->elts_free || 3962 !loc->wqe_free)) 3963 return MLX5_TXCMP_CODE_EXIT; 3964 pkts_n -= part; 3965 goto next_empw; 3966 } 3967 /* Packet attributes match, continue the same eMPW. */ 3968 ++dseg; 3969 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3970 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3971 } 3972 /* eMPW is built successfully, update loop parameters. */ 3973 assert(!loop); 3974 assert(pkts_n >= part); 3975 #ifdef MLX5_PMD_SOFT_COUNTERS 3976 /* Update sent data bytes counter. */ 3977 txq->stats.obytes += slen; 3978 #endif 3979 loc->elts_free -= part; 3980 loc->pkts_sent += part; 3981 txq->wqe_ci += (2 + part + 3) / 4; 3982 loc->wqe_free -= (2 + part + 3) / 4; 3983 pkts_n -= part; 3984 /* Request CQE generation if limits are reached. 
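 *
 * For reference, with the assumed 64-byte WQEBB built from four 16-byte
 * segments, a hypothetical full batch of part = 64 packets consumes
 * (2 + 64 + 3) / 4 = 17 WQEBBs: the shared Control and Ethernet Segments
 * plus one pointer-type Data Segment per packet. When one of the checks
 * above cuts the batch short, mlx5_tx_sdone_empw() stores the reduced
 * Data Segment count into the Control Segment and advances
 * wqe_ci/wqe_free by the same rounded amount.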
*/ 3985 mlx5_tx_request_completion(txq, loc, false, olx); 3986 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3987 return MLX5_TXCMP_CODE_EXIT; 3988 loc->mbuf = *pkts++; 3989 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 3990 if (unlikely(ret != MLX5_TXCMP_CODE_EMPW)) 3991 return ret; 3992 /* Continue sending eMPW batches. */ 3993 } 3994 assert(false); 3995 } 3996 3997 /** 3998 * The routine sends packets with MLX5_OPCODE_EMPW 3999 * with inlining, optionally supports VLAN insertion. 4000 */ 4001 static __rte_always_inline enum mlx5_txcmp_code 4002 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, 4003 struct rte_mbuf **restrict pkts, 4004 unsigned int pkts_n, 4005 struct mlx5_txq_local *restrict loc, 4006 unsigned int olx) 4007 { 4008 /* 4009 * Subroutine is the part of mlx5_tx_burst_single() 4010 * and sends single-segment packet with eMPW opcode 4011 * with data inlining. 4012 */ 4013 assert(MLX5_TXOFF_CONFIG(INLINE)); 4014 assert(MLX5_TXOFF_CONFIG(EMPW)); 4015 assert(loc->elts_free && loc->wqe_free); 4016 assert(pkts_n > loc->pkts_sent); 4017 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 4018 pkts += loc->pkts_sent + 1; 4019 pkts_n -= loc->pkts_sent; 4020 for (;;) { 4021 struct mlx5_wqe_dseg *restrict dseg; 4022 struct mlx5_wqe_eseg *restrict eseg; 4023 enum mlx5_txcmp_code ret; 4024 unsigned int room, part, nlim; 4025 unsigned int slen = 0; 4026 4027 /* 4028 * Limits the amount of packets in one WQE 4029 * to improve CQE latency generation. 4030 */ 4031 nlim = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS); 4032 /* Check whether we have minimal amount WQEs */ 4033 if (unlikely(loc->wqe_free < 4034 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 4035 return MLX5_TXCMP_CODE_EXIT; 4036 if (likely(pkts_n > 1)) 4037 rte_prefetch0(*pkts); 4038 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4039 /* 4040 * Build eMPW title WQEBB: 4041 * - Control Segment, eMPW opcode, zero DS 4042 * - Ethernet Segment, no inline 4043 */ 4044 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0, 4045 MLX5_OPCODE_ENHANCED_MPSW, olx); 4046 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 4047 olx & ~MLX5_TXOFF_CONFIG_VLAN); 4048 eseg = &loc->wqe_last->eseg; 4049 dseg = &loc->wqe_last->dseg[0]; 4050 room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE, 4051 loc->wqe_free) * MLX5_WQE_SIZE - 4052 MLX5_WQE_CSEG_SIZE - 4053 MLX5_WQE_ESEG_SIZE; 4054 /* Build WQE till we have space, packets and resources. */ 4055 part = room; 4056 for (;;) { 4057 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 4058 uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 4059 unsigned int tlen; 4060 4061 assert(room >= MLX5_WQE_DSEG_SIZE); 4062 assert((room % MLX5_WQE_DSEG_SIZE) == 0); 4063 assert((uintptr_t)dseg < (uintptr_t)txq->wqes_end); 4064 /* 4065 * Some Tx offloads may cause an error if 4066 * packet is not long enough, check against 4067 * assumed minimal length. 4068 */ 4069 if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) { 4070 part -= room; 4071 if (unlikely(!part)) 4072 return MLX5_TXCMP_CODE_ERROR; 4073 /* 4074 * We have some successfully built 4075 * packet Data Segments to send. 4076 */ 4077 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4078 return MLX5_TXCMP_CODE_ERROR; 4079 } 4080 /* Inline or not inline - that's the Question. */ 4081 if (dlen > txq->inlen_empw) 4082 goto pointer_empw; 4083 /* Inline entire packet, optional VLAN insertion. 
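 *
 * The WQE space taken by one inlined packet descriptor is the 4-byte
 * byte-count field plus the packet data (plus the 4-byte VLAN header when
 * it is inserted by software), rounded up to the 16-byte segment
 * granularity. A minimal sketch of that accounting (the field sizes are
 * assumed here, the code below uses the real structure definitions):
 *
 *   static unsigned int empw_inline_wqe_bytes(unsigned int dlen, bool vlan)
 *   {
 *       unsigned int tlen = 4 + dlen + (vlan ? 4 : 0);
 *
 *       return (tlen + 15) & ~15u;   // round up to 16-byte segments
 *   }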
*/ 4084 tlen = sizeof(dseg->bcount) + dlen; 4085 if (MLX5_TXOFF_CONFIG(VLAN) && 4086 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4087 /* 4088 * The packet length must be checked in 4089 * mlx5_tx_able_to_empw() and packet 4090 * fits into inline length guaranteed. 4091 */ 4092 assert((dlen + sizeof(struct rte_vlan_hdr)) <= 4093 txq->inlen_empw); 4094 tlen += sizeof(struct rte_vlan_hdr); 4095 if (room < tlen) 4096 break; 4097 dseg = mlx5_tx_dseg_vlan(txq, loc, dseg, 4098 dptr, dlen, olx); 4099 #ifdef MLX5_PMD_SOFT_COUNTERS 4100 /* Update sent data bytes counter. */ 4101 slen += sizeof(struct rte_vlan_hdr); 4102 #endif 4103 } else { 4104 if (room < tlen) 4105 break; 4106 dseg = mlx5_tx_dseg_empw(txq, loc, dseg, 4107 dptr, dlen, olx); 4108 } 4109 tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE); 4110 assert(room >= tlen); 4111 room -= tlen; 4112 /* 4113 * Packet data are completely inlined, 4114 * free the packet immediately. 4115 */ 4116 rte_pktmbuf_free_seg(loc->mbuf); 4117 goto next_mbuf; 4118 pointer_empw: 4119 /* 4120 * Not inlinable VLAN packets are 4121 * proceeded outside of this routine. 4122 */ 4123 assert(room >= MLX5_WQE_DSEG_SIZE); 4124 if (MLX5_TXOFF_CONFIG(VLAN)) 4125 assert(!(loc->mbuf->ol_flags & 4126 PKT_TX_VLAN_PKT)); 4127 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 4128 /* We have to store mbuf in elts.*/ 4129 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 4130 room -= MLX5_WQE_DSEG_SIZE; 4131 /* Ring buffer wraparound is checked at the loop end.*/ 4132 ++dseg; 4133 next_mbuf: 4134 #ifdef MLX5_PMD_SOFT_COUNTERS 4135 /* Update sent data bytes counter. */ 4136 slen += dlen; 4137 #endif 4138 loc->pkts_sent++; 4139 loc->elts_free--; 4140 pkts_n--; 4141 if (unlikely(!pkts_n || !loc->elts_free)) { 4142 /* 4143 * We have no resources/packets to 4144 * continue build descriptors. 4145 */ 4146 part -= room; 4147 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4148 return MLX5_TXCMP_CODE_EXIT; 4149 } 4150 loc->mbuf = *pkts++; 4151 if (likely(pkts_n > 1)) 4152 rte_prefetch0(*pkts); 4153 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4154 /* 4155 * Unroll the completion code to avoid 4156 * returning variable value - it results in 4157 * unoptimized sequent checking in caller. 4158 */ 4159 if (ret == MLX5_TXCMP_CODE_MULTI) { 4160 part -= room; 4161 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4162 if (unlikely(!loc->elts_free || 4163 !loc->wqe_free)) 4164 return MLX5_TXCMP_CODE_EXIT; 4165 return MLX5_TXCMP_CODE_MULTI; 4166 } 4167 if (ret == MLX5_TXCMP_CODE_TSO) { 4168 part -= room; 4169 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4170 if (unlikely(!loc->elts_free || 4171 !loc->wqe_free)) 4172 return MLX5_TXCMP_CODE_EXIT; 4173 return MLX5_TXCMP_CODE_TSO; 4174 } 4175 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4176 part -= room; 4177 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4178 if (unlikely(!loc->elts_free || 4179 !loc->wqe_free)) 4180 return MLX5_TXCMP_CODE_EXIT; 4181 return MLX5_TXCMP_CODE_SINGLE; 4182 } 4183 if (ret != MLX5_TXCMP_CODE_EMPW) { 4184 assert(false); 4185 part -= room; 4186 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4187 return MLX5_TXCMP_CODE_ERROR; 4188 } 4189 /* Check if we have minimal room left. 
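 *
 * Note on the bookkeeping: "part" starts as the whole byte budget of this
 * eMPW ("room") and "room" is decremented as Data Segments are written,
 * so at any close-out point part - room is the number of bytes actually
 * placed into the WQE (always a whole number of 16-byte segments here).
 * That byte count is what mlx5_tx_idone_empw() converts into the final
 * descriptor count, roughly (assuming 16-byte WQE segments):
 *
 *   static uint32_t empw_inline_ds(unsigned int budget, unsigned int left)
 *   {
 *       unsigned int used = budget - left;  // bytes written, 16B aligned
 *
 *       return used / 16 + 2;   // plus Control and Ethernet Segments
 *   }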
*/ 4190 nlim--; 4191 if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE)) 4192 break; 4193 /* 4194 * Check whether packet parameters coincide 4195 * within assumed eMPW batch: 4196 * - check sum settings 4197 * - metadata value 4198 * - software parser settings 4199 */ 4200 if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) 4201 break; 4202 /* Packet attributes match, continue the same eMPW. */ 4203 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 4204 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 4205 } 4206 /* 4207 * We get here to close an existing eMPW 4208 * session and start the new one. 4209 */ 4210 assert(pkts_n); 4211 part -= room; 4212 if (unlikely(!part)) 4213 return MLX5_TXCMP_CODE_EXIT; 4214 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4215 if (unlikely(!loc->elts_free || 4216 !loc->wqe_free)) 4217 return MLX5_TXCMP_CODE_EXIT; 4218 /* Continue the loop with new eMPW session. */ 4219 } 4220 assert(false); 4221 } 4222 4223 /** 4224 * The routine sends packets with ordinary MLX5_OPCODE_SEND. 4225 * Data inlining and VLAN insertion are supported. 4226 */ 4227 static __rte_always_inline enum mlx5_txcmp_code 4228 mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, 4229 struct rte_mbuf **restrict pkts, 4230 unsigned int pkts_n, 4231 struct mlx5_txq_local *restrict loc, 4232 unsigned int olx) 4233 { 4234 /* 4235 * Subroutine is the part of mlx5_tx_burst_single() 4236 * and sends single-segment packet with SEND opcode. 4237 */ 4238 assert(loc->elts_free && loc->wqe_free); 4239 assert(pkts_n > loc->pkts_sent); 4240 pkts += loc->pkts_sent + 1; 4241 pkts_n -= loc->pkts_sent; 4242 for (;;) { 4243 struct mlx5_wqe *restrict wqe; 4244 enum mlx5_txcmp_code ret; 4245 4246 assert(NB_SEGS(loc->mbuf) == 1); 4247 if (MLX5_TXOFF_CONFIG(INLINE)) { 4248 unsigned int inlen, vlan = 0; 4249 4250 inlen = rte_pktmbuf_data_len(loc->mbuf); 4251 if (MLX5_TXOFF_CONFIG(VLAN) && 4252 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4253 vlan = sizeof(struct rte_vlan_hdr); 4254 inlen += vlan; 4255 static_assert((sizeof(struct rte_vlan_hdr) + 4256 sizeof(struct rte_ether_hdr)) == 4257 MLX5_ESEG_MIN_INLINE_SIZE, 4258 "invalid min inline data size"); 4259 } 4260 /* 4261 * If inlining is enabled at configuration time 4262 * the limit must be not less than minimal size. 4263 * Otherwise we would do extra check for data 4264 * size to avoid crashes due to length overflow. 4265 */ 4266 assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); 4267 if (inlen <= txq->inlen_send) { 4268 unsigned int seg_n, wqe_n; 4269 4270 rte_prefetch0(rte_pktmbuf_mtod 4271 (loc->mbuf, uint8_t *)); 4272 /* Check against minimal length. */ 4273 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 4274 return MLX5_TXCMP_CODE_ERROR; 4275 /* 4276 * Completely inlined packet data WQE: 4277 * - Control Segment, SEND opcode 4278 * - Ethernet Segment, no VLAN insertion 4279 * - Data inlined, VLAN optionally inserted 4280 * - Alignment to MLX5_WSEG_SIZE 4281 * Have to estimate amount of WQEBBs 4282 */ 4283 seg_n = (inlen + 3 * MLX5_WSEG_SIZE - 4284 MLX5_ESEG_MIN_INLINE_SIZE + 4285 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4286 /* Check if there are enough WQEBBs. 
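 *
 * As a worked example, assuming MLX5_WSEG_SIZE = 16 and
 * MLX5_ESEG_MIN_INLINE_SIZE = 18 (see mlx5_prm.h for the real values),
 * a fully inlined packet with inlen = 128 gives
 *
 *   seg_n = (128 + 3 * 16 - 18 + 16 - 1) / 16 = 173 / 16 = 10
 *
 * sixteen-byte segments, and the rounding below turns that into
 * (10 + 3) / 4 = 3 WQEBBs, so this packet is postponed with
 * MLX5_TXCMP_CODE_EXIT unless at least three WQEBBs are free.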
*/ 4287 wqe_n = (seg_n + 3) / 4; 4288 if (wqe_n > loc->wqe_free) 4289 return MLX5_TXCMP_CODE_EXIT; 4290 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4291 loc->wqe_last = wqe; 4292 mlx5_tx_cseg_init(txq, loc, wqe, seg_n, 4293 MLX5_OPCODE_SEND, olx); 4294 mlx5_tx_eseg_data(txq, loc, wqe, 4295 vlan, inlen, 0, olx); 4296 txq->wqe_ci += wqe_n; 4297 loc->wqe_free -= wqe_n; 4298 /* 4299 * Packet data are completely inlined, 4300 * free the packet immediately. 4301 */ 4302 rte_pktmbuf_free_seg(loc->mbuf); 4303 } else if (!MLX5_TXOFF_CONFIG(EMPW) && 4304 txq->inlen_mode) { 4305 /* 4306 * If minimal inlining is requested the eMPW 4307 * feature should be disabled due to data is 4308 * inlined into Ethernet Segment, which can 4309 * not contain inlined data for eMPW due to 4310 * segment shared for all packets. 4311 */ 4312 struct mlx5_wqe_dseg *restrict dseg; 4313 unsigned int ds; 4314 uint8_t *dptr; 4315 4316 /* 4317 * The inline-mode settings require 4318 * to inline the specified amount of 4319 * data bytes to the Ethernet Segment. 4320 * We should check the free space in 4321 * WQE ring buffer to inline partially. 4322 */ 4323 assert(txq->inlen_send >= txq->inlen_mode); 4324 assert(inlen > txq->inlen_mode); 4325 assert(txq->inlen_mode >= 4326 MLX5_ESEG_MIN_INLINE_SIZE); 4327 /* 4328 * Check whether there are enough free WQEBBs: 4329 * - Control Segment 4330 * - Ethernet Segment 4331 * - First Segment of inlined Ethernet data 4332 * - ... data continued ... 4333 * - Finishing Data Segment of pointer type 4334 */ 4335 ds = (MLX5_WQE_CSEG_SIZE + 4336 MLX5_WQE_ESEG_SIZE + 4337 MLX5_WQE_DSEG_SIZE + 4338 txq->inlen_mode - 4339 MLX5_ESEG_MIN_INLINE_SIZE + 4340 MLX5_WQE_DSEG_SIZE + 4341 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4342 if (loc->wqe_free < ((ds + 3) / 4)) 4343 return MLX5_TXCMP_CODE_EXIT; 4344 /* 4345 * Build the ordinary SEND WQE: 4346 * - Control Segment 4347 * - Ethernet Segment, inline inlen_mode bytes 4348 * - Data Segment of pointer type 4349 */ 4350 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4351 loc->wqe_last = wqe; 4352 mlx5_tx_cseg_init(txq, loc, wqe, ds, 4353 MLX5_OPCODE_SEND, olx); 4354 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, 4355 txq->inlen_mode, 4356 0, olx); 4357 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4358 txq->inlen_mode - vlan; 4359 inlen -= txq->inlen_mode; 4360 mlx5_tx_dseg_ptr(txq, loc, dseg, 4361 dptr, inlen, olx); 4362 /* 4363 * WQE is built, update the loop parameters 4364 * and got to the next packet. 4365 */ 4366 txq->wqe_ci += (ds + 3) / 4; 4367 loc->wqe_free -= (ds + 3) / 4; 4368 /* We have to store mbuf in elts.*/ 4369 assert(MLX5_TXOFF_CONFIG(INLINE)); 4370 txq->elts[txq->elts_head++ & txq->elts_m] = 4371 loc->mbuf; 4372 --loc->elts_free; 4373 } else { 4374 uint8_t *dptr; 4375 unsigned int dlen; 4376 4377 /* 4378 * Partially inlined packet data WQE, we have 4379 * some space in title WQEBB, we can fill it 4380 * with some packet data. It takes one WQEBB, 4381 * it is available, no extra space check: 4382 * - Control Segment, SEND opcode 4383 * - Ethernet Segment, no VLAN insertion 4384 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data 4385 * - Data Segment, pointer type 4386 * 4387 * We also get here if VLAN insertion is not 4388 * supported by HW, the inline is enabled. 
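 *
 * As a size estimate: ds is hard-coded to 4 below, i.e. four 16-byte WQE
 * segments (Control Segment, Ethernet Segment extended with the
 * MLX5_ESEG_MIN_INLINE_SIZE bytes of packet data, and the trailing
 * pointer-type Data Segment), which is exactly one 64-byte WQEBB. That is
 * why a single wqe_ci/wqe_free step follows and no free-space estimate is
 * needed (assuming the usual 16-byte segment and 64-byte WQEBB sizes).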
4389 */ 4390 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4391 loc->wqe_last = wqe; 4392 mlx5_tx_cseg_init(txq, loc, wqe, 4, 4393 MLX5_OPCODE_SEND, olx); 4394 mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx); 4395 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4396 MLX5_ESEG_MIN_INLINE_SIZE - vlan; 4397 /* 4398 * The length check is performed above, by 4399 * comparing with txq->inlen_send. We should 4400 * not get overflow here. 4401 */ 4402 assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE); 4403 dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE; 4404 mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1], 4405 dptr, dlen, olx); 4406 ++txq->wqe_ci; 4407 --loc->wqe_free; 4408 /* We have to store mbuf in elts.*/ 4409 assert(MLX5_TXOFF_CONFIG(INLINE)); 4410 txq->elts[txq->elts_head++ & txq->elts_m] = 4411 loc->mbuf; 4412 --loc->elts_free; 4413 } 4414 #ifdef MLX5_PMD_SOFT_COUNTERS 4415 /* Update sent data bytes counter. */ 4416 txq->stats.obytes += vlan + 4417 rte_pktmbuf_data_len(loc->mbuf); 4418 #endif 4419 } else { 4420 /* 4421 * No inline at all, it means the CPU cycles saving 4422 * is prioritized at configuration, we should not 4423 * copy any packet data to WQE. 4424 * 4425 * SEND WQE, one WQEBB: 4426 * - Control Segment, SEND opcode 4427 * - Ethernet Segment, optional VLAN, no inline 4428 * - Data Segment, pointer type 4429 */ 4430 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4431 loc->wqe_last = wqe; 4432 mlx5_tx_cseg_init(txq, loc, wqe, 3, 4433 MLX5_OPCODE_SEND, olx); 4434 mlx5_tx_eseg_none(txq, loc, wqe, olx); 4435 mlx5_tx_dseg_ptr 4436 (txq, loc, &wqe->dseg[0], 4437 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 4438 rte_pktmbuf_data_len(loc->mbuf), olx); 4439 ++txq->wqe_ci; 4440 --loc->wqe_free; 4441 /* 4442 * We should not store mbuf pointer in elts 4443 * if no inlining is configured, this is done 4444 * by calling routine in a batch copy. 4445 */ 4446 assert(!MLX5_TXOFF_CONFIG(INLINE)); 4447 --loc->elts_free; 4448 #ifdef MLX5_PMD_SOFT_COUNTERS 4449 /* Update sent data bytes counter. */ 4450 txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf); 4451 if (MLX5_TXOFF_CONFIG(VLAN) && 4452 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 4453 txq->stats.obytes += 4454 sizeof(struct rte_vlan_hdr); 4455 #endif 4456 } 4457 ++loc->pkts_sent; 4458 --pkts_n; 4459 /* Request CQE generation if limits are reached. */ 4460 mlx5_tx_request_completion(txq, loc, false, olx); 4461 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 4462 return MLX5_TXCMP_CODE_EXIT; 4463 loc->mbuf = *pkts++; 4464 if (pkts_n > 1) 4465 rte_prefetch0(*pkts); 4466 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4467 if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE)) 4468 return ret; 4469 } 4470 assert(false); 4471 } 4472 4473 static __rte_always_inline enum mlx5_txcmp_code 4474 mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq, 4475 struct rte_mbuf **restrict pkts, 4476 unsigned int pkts_n, 4477 struct mlx5_txq_local *restrict loc, 4478 unsigned int olx) 4479 { 4480 enum mlx5_txcmp_code ret; 4481 4482 ret = mlx5_tx_able_to_empw(txq, loc, olx, false); 4483 if (ret == MLX5_TXCMP_CODE_SINGLE) 4484 goto ordinary_send; 4485 assert(ret == MLX5_TXCMP_CODE_EMPW); 4486 for (;;) { 4487 /* Optimize for inline/no inline eMPW send. */ 4488 ret = (MLX5_TXOFF_CONFIG(INLINE)) ? 4489 mlx5_tx_burst_empw_inline 4490 (txq, pkts, pkts_n, loc, olx) : 4491 mlx5_tx_burst_empw_simple 4492 (txq, pkts, pkts_n, loc, olx); 4493 if (ret != MLX5_TXCMP_CODE_SINGLE) 4494 return ret; 4495 /* The resources to send one packet should remain. 
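 *
 * The surrounding loop simply alternates between the eMPW and the plain
 * SEND sub-routines until one of them returns anything other than a
 * request to hand over to the other. Ignoring the initial probe that
 * chooses the starting routine, the control flow is roughly (hypothetical
 * callback sketch, not the driver's types):
 *
 *   enum code { EXIT, ERROR, SINGLE, EMPW };
 *
 *   static enum code dispatch(enum code (*send_empw)(void *),
 *                             enum code (*send_single)(void *), void *ctx)
 *   {
 *       enum code ret = EMPW;
 *
 *       for (;;) {
 *           ret = (ret == EMPW) ? send_empw(ctx) : send_single(ctx);
 *           if (ret != EMPW && ret != SINGLE)
 *               return ret;   // EXIT or ERROR terminates the burst
 *       }
 *   }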
*/ 4496 assert(loc->elts_free && loc->wqe_free); 4497 ordinary_send: 4498 ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx); 4499 assert(ret != MLX5_TXCMP_CODE_SINGLE); 4500 if (ret != MLX5_TXCMP_CODE_EMPW) 4501 return ret; 4502 /* The resources to send one packet should remain. */ 4503 assert(loc->elts_free && loc->wqe_free); 4504 } 4505 } 4506 4507 /** 4508 * DPDK Tx callback template. This is configured template 4509 * used to generate routines optimized for specified offload setup. 4510 * One of this generated functions is chosen at SQ configuration 4511 * time. 4512 * 4513 * @param txq 4514 * Generic pointer to TX queue structure. 4515 * @param[in] pkts 4516 * Packets to transmit. 4517 * @param pkts_n 4518 * Number of packets in array. 4519 * @param olx 4520 * Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx 4521 * values. Should be static to take compile time static configuration 4522 * advantages. 4523 * 4524 * @return 4525 * Number of packets successfully transmitted (<= pkts_n). 4526 */ 4527 static __rte_always_inline uint16_t 4528 mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, 4529 struct rte_mbuf **restrict pkts, 4530 uint16_t pkts_n, 4531 unsigned int olx) 4532 { 4533 struct mlx5_txq_local loc; 4534 enum mlx5_txcmp_code ret; 4535 unsigned int part; 4536 4537 assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4538 assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4539 if (unlikely(!pkts_n)) 4540 return 0; 4541 loc.pkts_sent = 0; 4542 loc.pkts_copy = 0; 4543 loc.wqe_last = NULL; 4544 4545 send_loop: 4546 loc.pkts_loop = loc.pkts_sent; 4547 /* 4548 * Check if there are some CQEs, if any: 4549 * - process an encountered errors 4550 * - process the completed WQEs 4551 * - free related mbufs 4552 * - doorbell the NIC about processed CQEs 4553 */ 4554 rte_prefetch0(*(pkts + loc.pkts_sent)); 4555 mlx5_tx_handle_completion(txq, olx); 4556 /* 4557 * Calculate the number of available resources - elts and WQEs. 4558 * There are two possible different scenarios: 4559 * - no data inlining into WQEs, one WQEBB may contains upto 4560 * four packets, in this case elts become scarce resource 4561 * - data inlining into WQEs, one packet may require multiple 4562 * WQEBBs, the WQEs become the limiting factor. 4563 */ 4564 assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4565 loc.elts_free = txq->elts_s - 4566 (uint16_t)(txq->elts_head - txq->elts_tail); 4567 assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4568 loc.wqe_free = txq->wqe_s - 4569 (uint16_t)(txq->wqe_ci - txq->wqe_pi); 4570 if (unlikely(!loc.elts_free || !loc.wqe_free)) 4571 return loc.pkts_sent; 4572 for (;;) { 4573 /* 4574 * Fetch the packet from array. Usually this is 4575 * the first packet in series of multi/single 4576 * segment packets. 4577 */ 4578 loc.mbuf = *(pkts + loc.pkts_sent); 4579 /* Dedicated branch for multi-segment packets. */ 4580 if (MLX5_TXOFF_CONFIG(MULTI) && 4581 unlikely(NB_SEGS(loc.mbuf) > 1)) { 4582 /* 4583 * Multi-segment packet encountered. 4584 * Hardware is able to process it only 4585 * with SEND/TSO opcodes, one packet 4586 * per WQE, do it in dedicated routine. 4587 */ 4588 enter_send_multi: 4589 assert(loc.pkts_sent >= loc.pkts_copy); 4590 part = loc.pkts_sent - loc.pkts_copy; 4591 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 4592 /* 4593 * There are some single-segment mbufs not 4594 * stored in elts. 
The mbufs must be in the 4595 * same order as WQEs, so we must copy the 4596 * mbufs to elts here, before the coming 4597 * multi-segment packet mbufs is appended. 4598 */ 4599 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, 4600 part, olx); 4601 loc.pkts_copy = loc.pkts_sent; 4602 } 4603 assert(pkts_n > loc.pkts_sent); 4604 ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx); 4605 if (!MLX5_TXOFF_CONFIG(INLINE)) 4606 loc.pkts_copy = loc.pkts_sent; 4607 /* 4608 * These returned code checks are supposed 4609 * to be optimized out due to routine inlining. 4610 */ 4611 if (ret == MLX5_TXCMP_CODE_EXIT) { 4612 /* 4613 * The routine returns this code when 4614 * all packets are sent or there is no 4615 * enough resources to complete request. 4616 */ 4617 break; 4618 } 4619 if (ret == MLX5_TXCMP_CODE_ERROR) { 4620 /* 4621 * The routine returns this code when 4622 * some error in the incoming packets 4623 * format occurred. 4624 */ 4625 txq->stats.oerrors++; 4626 break; 4627 } 4628 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4629 /* 4630 * The single-segment packet was encountered 4631 * in the array, try to send it with the 4632 * best optimized way, possible engaging eMPW. 4633 */ 4634 goto enter_send_single; 4635 } 4636 if (MLX5_TXOFF_CONFIG(TSO) && 4637 ret == MLX5_TXCMP_CODE_TSO) { 4638 /* 4639 * The single-segment TSO packet was 4640 * encountered in the array. 4641 */ 4642 goto enter_send_tso; 4643 } 4644 /* We must not get here. Something is going wrong. */ 4645 assert(false); 4646 txq->stats.oerrors++; 4647 break; 4648 } 4649 /* Dedicated branch for single-segment TSO packets. */ 4650 if (MLX5_TXOFF_CONFIG(TSO) && 4651 unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) { 4652 /* 4653 * TSO might require special way for inlining 4654 * (dedicated parameters) and is sent with 4655 * MLX5_OPCODE_TSO opcode only, provide this 4656 * in dedicated branch. 4657 */ 4658 enter_send_tso: 4659 assert(NB_SEGS(loc.mbuf) == 1); 4660 assert(pkts_n > loc.pkts_sent); 4661 ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx); 4662 /* 4663 * These returned code checks are supposed 4664 * to be optimized out due to routine inlining. 4665 */ 4666 if (ret == MLX5_TXCMP_CODE_EXIT) 4667 break; 4668 if (ret == MLX5_TXCMP_CODE_ERROR) { 4669 txq->stats.oerrors++; 4670 break; 4671 } 4672 if (ret == MLX5_TXCMP_CODE_SINGLE) 4673 goto enter_send_single; 4674 if (MLX5_TXOFF_CONFIG(MULTI) && 4675 ret == MLX5_TXCMP_CODE_MULTI) { 4676 /* 4677 * The multi-segment packet was 4678 * encountered in the array. 4679 */ 4680 goto enter_send_multi; 4681 } 4682 /* We must not get here. Something is going wrong. */ 4683 assert(false); 4684 txq->stats.oerrors++; 4685 break; 4686 } 4687 /* 4688 * The dedicated branch for the single-segment packets 4689 * without TSO. Often these ones can be sent using 4690 * MLX5_OPCODE_EMPW with multiple packets in one WQE. 4691 * The routine builds the WQEs till it encounters 4692 * the TSO or multi-segment packet (in case if these 4693 * offloads are requested at SQ configuration time). 4694 */ 4695 enter_send_single: 4696 assert(pkts_n > loc.pkts_sent); 4697 ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx); 4698 /* 4699 * These returned code checks are supposed 4700 * to be optimized out due to routine inlining. 
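 *
 * This works because every generated burst routine passes a compile-time
 * constant offload mask into this always-inlined template, so conditions
 * on that mask fold to constants and the untaken branches are removed. A
 * minimal standalone illustration of the pattern (hypothetical names, not
 * the driver's macros):
 *
 *   #define OPT_A (1u << 0)
 *   #define OPT_B (1u << 1)
 *
 *   static inline unsigned int tmpl(unsigned int n, unsigned int olx)
 *   {
 *       if (olx & OPT_A)
 *           n *= 2;        // kept only in variants requesting OPT_A
 *       if (olx & OPT_B)
 *           n += 1;        // kept only in variants requesting OPT_B
 *       return n;
 *   }
 *
 *   static unsigned int tmpl_a(unsigned int n)
 *   {
 *       return tmpl(n, OPT_A);   // typically compiles to just "n * 2"
 *   }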
4701 */ 4702 if (ret == MLX5_TXCMP_CODE_EXIT) 4703 break; 4704 if (ret == MLX5_TXCMP_CODE_ERROR) { 4705 txq->stats.oerrors++; 4706 break; 4707 } 4708 if (MLX5_TXOFF_CONFIG(MULTI) && 4709 ret == MLX5_TXCMP_CODE_MULTI) { 4710 /* 4711 * The multi-segment packet was 4712 * encountered in the array. 4713 */ 4714 goto enter_send_multi; 4715 } 4716 if (MLX5_TXOFF_CONFIG(TSO) && 4717 ret == MLX5_TXCMP_CODE_TSO) { 4718 /* 4719 * The single-segment TSO packet was 4720 * encountered in the array. 4721 */ 4722 goto enter_send_tso; 4723 } 4724 /* We must not get here. Something is going wrong. */ 4725 assert(false); 4726 txq->stats.oerrors++; 4727 break; 4728 } 4729 /* 4730 * Main Tx loop is completed, do the rest: 4731 * - set completion request if thresholds are reached 4732 * - doorbell the hardware 4733 * - copy the rest of mbufs to elts (if any) 4734 */ 4735 assert(MLX5_TXOFF_CONFIG(INLINE) || loc.pkts_sent >= loc.pkts_copy); 4736 /* Take a shortcut if nothing is sent. */ 4737 if (unlikely(loc.pkts_sent == loc.pkts_loop)) 4738 return loc.pkts_sent; 4739 /* 4740 * Ring QP doorbell immediately after WQE building completion 4741 * to improve latencies. The pure software related data treatment 4742 * can be completed after doorbell. Tx CQEs for this SQ are 4743 * processed in this thread only by the polling. 4744 */ 4745 mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, 0); 4746 /* Not all of the mbufs may be stored into elts yet. */ 4747 part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy; 4748 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 4749 /* 4750 * There are some single-segment mbufs not stored in elts. 4751 * It can be only if the last packet was single-segment. 4752 * The copying is gathered into one place due to it is 4753 * a good opportunity to optimize that with SIMD. 4754 * Unfortunately if inlining is enabled the gaps in 4755 * pointer array may happen due to early freeing of the 4756 * inlined mbufs. 4757 */ 4758 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx); 4759 loc.pkts_copy = loc.pkts_sent; 4760 } 4761 #ifdef MLX5_PMD_SOFT_COUNTERS 4762 /* Increment sent packets counter. */ 4763 txq->stats.opackets += loc.pkts_sent; 4764 #endif 4765 assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4766 assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4767 if (pkts_n > loc.pkts_sent) { 4768 /* 4769 * If burst size is large there might be no enough CQE 4770 * fetched from completion queue and no enough resources 4771 * freed to send all the packets. 4772 */ 4773 goto send_loop; 4774 } 4775 return loc.pkts_sent; 4776 } 4777 4778 /* Generate routines with Enhanced Multi-Packet Write support. 
*/ 4779 MLX5_TXOFF_DECL(full_empw, 4780 MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW) 4781 4782 MLX5_TXOFF_DECL(none_empw, 4783 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 4784 4785 MLX5_TXOFF_DECL(md_empw, 4786 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4787 4788 MLX5_TXOFF_DECL(mt_empw, 4789 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4790 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4791 4792 MLX5_TXOFF_DECL(mtsc_empw, 4793 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4794 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4795 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4796 4797 MLX5_TXOFF_DECL(mti_empw, 4798 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4799 MLX5_TXOFF_CONFIG_INLINE | 4800 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4801 4802 MLX5_TXOFF_DECL(mtv_empw, 4803 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4804 MLX5_TXOFF_CONFIG_VLAN | 4805 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4806 4807 MLX5_TXOFF_DECL(mtiv_empw, 4808 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4809 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4810 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4811 4812 MLX5_TXOFF_DECL(sc_empw, 4813 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4814 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4815 4816 MLX5_TXOFF_DECL(sci_empw, 4817 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4818 MLX5_TXOFF_CONFIG_INLINE | 4819 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4820 4821 MLX5_TXOFF_DECL(scv_empw, 4822 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4823 MLX5_TXOFF_CONFIG_VLAN | 4824 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4825 4826 MLX5_TXOFF_DECL(sciv_empw, 4827 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4828 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4829 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4830 4831 MLX5_TXOFF_DECL(i_empw, 4832 MLX5_TXOFF_CONFIG_INLINE | 4833 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4834 4835 MLX5_TXOFF_DECL(v_empw, 4836 MLX5_TXOFF_CONFIG_VLAN | 4837 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4838 4839 MLX5_TXOFF_DECL(iv_empw, 4840 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4841 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4842 4843 /* Generate routines without Enhanced Multi-Packet Write support. 
*/ 4844 MLX5_TXOFF_DECL(full, 4845 MLX5_TXOFF_CONFIG_FULL) 4846 4847 MLX5_TXOFF_DECL(none, 4848 MLX5_TXOFF_CONFIG_NONE) 4849 4850 MLX5_TXOFF_DECL(md, 4851 MLX5_TXOFF_CONFIG_METADATA) 4852 4853 MLX5_TXOFF_DECL(mt, 4854 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4855 MLX5_TXOFF_CONFIG_METADATA) 4856 4857 MLX5_TXOFF_DECL(mtsc, 4858 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4859 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4860 MLX5_TXOFF_CONFIG_METADATA) 4861 4862 MLX5_TXOFF_DECL(mti, 4863 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4864 MLX5_TXOFF_CONFIG_INLINE | 4865 MLX5_TXOFF_CONFIG_METADATA) 4866 4867 4868 MLX5_TXOFF_DECL(mtv, 4869 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4870 MLX5_TXOFF_CONFIG_VLAN | 4871 MLX5_TXOFF_CONFIG_METADATA) 4872 4873 4874 MLX5_TXOFF_DECL(mtiv, 4875 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4876 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4877 MLX5_TXOFF_CONFIG_METADATA) 4878 4879 MLX5_TXOFF_DECL(sc, 4880 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4881 MLX5_TXOFF_CONFIG_METADATA) 4882 4883 MLX5_TXOFF_DECL(sci, 4884 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4885 MLX5_TXOFF_CONFIG_INLINE | 4886 MLX5_TXOFF_CONFIG_METADATA) 4887 4888 4889 MLX5_TXOFF_DECL(scv, 4890 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4891 MLX5_TXOFF_CONFIG_VLAN | 4892 MLX5_TXOFF_CONFIG_METADATA) 4893 4894 4895 MLX5_TXOFF_DECL(sciv, 4896 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4897 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4898 MLX5_TXOFF_CONFIG_METADATA) 4899 4900 MLX5_TXOFF_DECL(i, 4901 MLX5_TXOFF_CONFIG_INLINE | 4902 MLX5_TXOFF_CONFIG_METADATA) 4903 4904 MLX5_TXOFF_DECL(v, 4905 MLX5_TXOFF_CONFIG_VLAN | 4906 MLX5_TXOFF_CONFIG_METADATA) 4907 4908 MLX5_TXOFF_DECL(iv, 4909 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4910 MLX5_TXOFF_CONFIG_METADATA) 4911 4912 /* 4913 * Array of declared and compiled Tx burst function and corresponding 4914 * supported offloads set. The array is used to select the Tx burst 4915 * function for specified offloads set at Tx queue configuration time. 
4916 */ 4917 const struct { 4918 eth_tx_burst_t func; 4919 unsigned int olx; 4920 } txoff_func[] = { 4921 MLX5_TXOFF_INFO(full_empw, 4922 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4923 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4924 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4925 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4926 4927 MLX5_TXOFF_INFO(none_empw, 4928 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 4929 4930 MLX5_TXOFF_INFO(md_empw, 4931 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4932 4933 MLX5_TXOFF_INFO(mt_empw, 4934 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4935 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4936 4937 MLX5_TXOFF_INFO(mtsc_empw, 4938 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4939 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4940 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4941 4942 MLX5_TXOFF_INFO(mti_empw, 4943 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4944 MLX5_TXOFF_CONFIG_INLINE | 4945 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4946 4947 MLX5_TXOFF_INFO(mtv_empw, 4948 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4949 MLX5_TXOFF_CONFIG_VLAN | 4950 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4951 4952 MLX5_TXOFF_INFO(mtiv_empw, 4953 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4954 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4955 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4956 4957 MLX5_TXOFF_INFO(sc_empw, 4958 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4959 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4960 4961 MLX5_TXOFF_INFO(sci_empw, 4962 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4963 MLX5_TXOFF_CONFIG_INLINE | 4964 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4965 4966 MLX5_TXOFF_INFO(scv_empw, 4967 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4968 MLX5_TXOFF_CONFIG_VLAN | 4969 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4970 4971 MLX5_TXOFF_INFO(sciv_empw, 4972 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4973 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4974 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4975 4976 MLX5_TXOFF_INFO(i_empw, 4977 MLX5_TXOFF_CONFIG_INLINE | 4978 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4979 4980 MLX5_TXOFF_INFO(v_empw, 4981 MLX5_TXOFF_CONFIG_VLAN | 4982 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4983 4984 MLX5_TXOFF_INFO(iv_empw, 4985 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4986 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4987 4988 MLX5_TXOFF_INFO(full, 4989 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4990 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4991 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4992 MLX5_TXOFF_CONFIG_METADATA) 4993 4994 MLX5_TXOFF_INFO(none, 4995 MLX5_TXOFF_CONFIG_NONE) 4996 4997 MLX5_TXOFF_INFO(md, 4998 MLX5_TXOFF_CONFIG_METADATA) 4999 5000 MLX5_TXOFF_INFO(mt, 5001 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5002 MLX5_TXOFF_CONFIG_METADATA) 5003 5004 MLX5_TXOFF_INFO(mtsc, 5005 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5006 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5007 MLX5_TXOFF_CONFIG_METADATA) 5008 5009 MLX5_TXOFF_INFO(mti, 5010 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5011 MLX5_TXOFF_CONFIG_INLINE | 5012 MLX5_TXOFF_CONFIG_METADATA) 5013 5014 5015 MLX5_TXOFF_INFO(mtv, 5016 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5017 MLX5_TXOFF_CONFIG_VLAN | 5018 MLX5_TXOFF_CONFIG_METADATA) 5019 5020 MLX5_TXOFF_INFO(mtiv, 
5021 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5022 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5023 MLX5_TXOFF_CONFIG_METADATA) 5024 5025 MLX5_TXOFF_INFO(sc, 5026 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5027 MLX5_TXOFF_CONFIG_METADATA) 5028 5029 MLX5_TXOFF_INFO(sci, 5030 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5031 MLX5_TXOFF_CONFIG_INLINE | 5032 MLX5_TXOFF_CONFIG_METADATA) 5033 5034 MLX5_TXOFF_INFO(scv, 5035 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5036 MLX5_TXOFF_CONFIG_VLAN | 5037 MLX5_TXOFF_CONFIG_METADATA) 5038 5039 MLX5_TXOFF_INFO(sciv, 5040 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5041 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5042 MLX5_TXOFF_CONFIG_METADATA) 5043 5044 MLX5_TXOFF_INFO(i, 5045 MLX5_TXOFF_CONFIG_INLINE | 5046 MLX5_TXOFF_CONFIG_METADATA) 5047 5048 MLX5_TXOFF_INFO(v, 5049 MLX5_TXOFF_CONFIG_VLAN | 5050 MLX5_TXOFF_CONFIG_METADATA) 5051 5052 MLX5_TXOFF_INFO(iv, 5053 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5054 MLX5_TXOFF_CONFIG_METADATA) 5055 }; 5056 5057 /** 5058 * Configure the Tx function to use. The routine checks configured 5059 * Tx offloads for the device and selects appropriate Tx burst 5060 * routine. There are multiple Tx burst routines compiled from 5061 * the same template in the most optimal way for the dedicated 5062 * Tx offloads set. 5063 * 5064 * @param dev 5065 * Pointer to private data structure. 5066 * 5067 * @return 5068 * Pointer to selected Tx burst function. 5069 */ 5070 eth_tx_burst_t 5071 mlx5_select_tx_function(struct rte_eth_dev *dev) 5072 { 5073 struct mlx5_priv *priv = dev->data->dev_private; 5074 struct mlx5_dev_config *config = &priv->config; 5075 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 5076 unsigned int diff = 0, olx = 0, i, m; 5077 5078 static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <= 5079 MLX5_DSEG_MAX, "invalid WQE max size"); 5080 static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE, 5081 "invalid WQE Control Segment size"); 5082 static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE, 5083 "invalid WQE Ethernet Segment size"); 5084 static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE, 5085 "invalid WQE Data Segment size"); 5086 static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE, 5087 "invalid WQE size"); 5088 assert(priv); 5089 if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) { 5090 /* We should support Multi-Segment Packets. */ 5091 olx |= MLX5_TXOFF_CONFIG_MULTI; 5092 } 5093 if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | 5094 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 5095 DEV_TX_OFFLOAD_GRE_TNL_TSO | 5096 DEV_TX_OFFLOAD_IP_TNL_TSO | 5097 DEV_TX_OFFLOAD_UDP_TNL_TSO)) { 5098 /* We should support TCP Send Offload. */ 5099 olx |= MLX5_TXOFF_CONFIG_TSO; 5100 } 5101 if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | 5102 DEV_TX_OFFLOAD_UDP_TNL_TSO | 5103 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { 5104 /* We should support Software Parser for Tunnels. */ 5105 olx |= MLX5_TXOFF_CONFIG_SWP; 5106 } 5107 if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM | 5108 DEV_TX_OFFLOAD_UDP_CKSUM | 5109 DEV_TX_OFFLOAD_TCP_CKSUM | 5110 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { 5111 /* We should support IP/TCP/UDP Checksums. */ 5112 olx |= MLX5_TXOFF_CONFIG_CSUM; 5113 } 5114 if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) { 5115 /* We should support VLAN insertion. */ 5116 olx |= MLX5_TXOFF_CONFIG_VLAN; 5117 } 5118 if (priv->txqs_n && (*priv->txqs)[0]) { 5119 struct mlx5_txq_data *txd = (*priv->txqs)[0]; 5120 5121 if (txd->inlen_send) { 5122 /* 5123 * Check the data inline requirements. 
Data inline 5124 * is enabled on per device basis, we can check 5125 * the first Tx queue only. 5126 * 5127 * If device does not support VLAN insertion in WQE 5128 * and some queues are requested to perform VLAN 5129 * insertion offload than inline must be enabled. 5130 */ 5131 olx |= MLX5_TXOFF_CONFIG_INLINE; 5132 } 5133 } 5134 if (config->mps == MLX5_MPW_ENHANCED && 5135 config->txq_inline_min <= 0) { 5136 /* 5137 * The NIC supports Enhanced Multi-Packet Write. 5138 * We do not support legacy MPW due to its 5139 * hardware related problems, so we just ignore 5140 * legacy MLX5_MPW settings. There should be no 5141 * minimal required inline data. 5142 */ 5143 olx |= MLX5_TXOFF_CONFIG_EMPW; 5144 } 5145 if (tx_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) { 5146 /* We should support Flow metadata. */ 5147 olx |= MLX5_TXOFF_CONFIG_METADATA; 5148 } 5149 /* 5150 * Scan the routines table to find the minimal 5151 * satisfying routine with requested offloads. 5152 */ 5153 m = RTE_DIM(txoff_func); 5154 for (i = 0; i < RTE_DIM(txoff_func); i++) { 5155 unsigned int tmp; 5156 5157 tmp = txoff_func[i].olx; 5158 if (tmp == olx) { 5159 /* Meets requested offloads exactly.*/ 5160 m = i; 5161 break; 5162 } 5163 if ((tmp & olx) != olx) { 5164 /* Does not meet requested offloads at all. */ 5165 continue; 5166 } 5167 if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW) 5168 /* Do not enable eMPW if not configured. */ 5169 continue; 5170 if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE) 5171 /* Do not enable inlining if not configured. */ 5172 continue; 5173 /* 5174 * Some routine meets the requirements. 5175 * Check whether it has minimal amount 5176 * of not requested offloads. 5177 */ 5178 tmp = __builtin_popcountl(tmp & ~olx); 5179 if (m >= RTE_DIM(txoff_func) || tmp < diff) { 5180 /* First or better match, save and continue. */ 5181 m = i; 5182 diff = tmp; 5183 continue; 5184 } 5185 if (tmp == diff) { 5186 tmp = txoff_func[i].olx ^ txoff_func[m].olx; 5187 if (__builtin_ffsl(txoff_func[i].olx & ~tmp) < 5188 __builtin_ffsl(txoff_func[m].olx & ~tmp)) { 5189 /* Lighter not requested offload. */ 5190 m = i; 5191 } 5192 } 5193 } 5194 if (m >= RTE_DIM(txoff_func)) { 5195 DRV_LOG(DEBUG, "port %u has no selected Tx function" 5196 " for requested offloads %04X", 5197 dev->data->port_id, olx); 5198 return NULL; 5199 } 5200 DRV_LOG(DEBUG, "port %u has selected Tx function" 5201 " supporting offloads %04X/%04X", 5202 dev->data->port_id, olx, txoff_func[m].olx); 5203 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI) 5204 DRV_LOG(DEBUG, "\tMULTI (multi segment)"); 5205 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO) 5206 DRV_LOG(DEBUG, "\tTSO (TCP send offload)"); 5207 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP) 5208 DRV_LOG(DEBUG, "\tSWP (software parser)"); 5209 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM) 5210 DRV_LOG(DEBUG, "\tCSUM (checksum offload)"); 5211 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE) 5212 DRV_LOG(DEBUG, "\tINLIN (inline data)"); 5213 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN) 5214 DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)"); 5215 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA) 5216 DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)"); 5217 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) 5218 DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)"); 5219 return txoff_func[m].func; 5220 } 5221
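/*
 * A simplified, self-contained sketch of the selection policy implemented
 * by mlx5_select_tx_function() above: prefer an exact offload match,
 * otherwise take a candidate supporting every requested offload with the
 * fewest extra ones, and never silently enable data inlining or eMPW.
 * The helper below is illustrative only (the real routine also tie-breaks
 * equally good candidates); the unused attribute merely silences the
 * compiler warning for this example.
 */
static unsigned int __rte_unused
mlx5_txoff_select_example(const unsigned int *olx_tab, unsigned int n,
			  unsigned int olx)
{
	unsigned int best = n, diff = 0, i;

	for (i = 0; i < n; i++) {
		unsigned int tmp = olx_tab[i];

		if (tmp == olx)
			return i; /* Exact match wins immediately. */
		if ((tmp & olx) != olx)
			continue; /* Lacks a requested offload. */
		if ((olx ^ tmp) & (MLX5_TXOFF_CONFIG_EMPW |
				   MLX5_TXOFF_CONFIG_INLINE))
			continue; /* Do not add eMPW/inline implicitly. */
		tmp = __builtin_popcountl(tmp & ~olx);
		if (best >= n || tmp < diff) {
			best = i; /* Fewer not requested offloads. */
			diff = tmp;
		}
	}
	return best; /* Equals n when nothing is suitable. */
}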