/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015-2019 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#include <rte_cycles.h>

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"

/* TX burst subroutines return codes. */
enum mlx5_txcmp_code {
	MLX5_TXCMP_CODE_EXIT = 0,
	MLX5_TXCMP_CODE_ERROR,
	MLX5_TXCMP_CODE_SINGLE,
	MLX5_TXCMP_CODE_MULTI,
	MLX5_TXCMP_CODE_TSO,
	MLX5_TXCMP_CODE_EMPW,
};

/*
 * These defines are used to configure the set of Tx burst routine options
 * supported at compile time. Options that are not specified are optimized
 * out, because the related "if" conditions can be evaluated at compile time.
 * Offloads with a bigger runtime check overhead (i.e. more CPU cycles needed
 * to skip them) should have a bigger index - this is needed to select the
 * best matching routine when there is no exact match and some offloads are
 * not actually requested.
 */
#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/
#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/
#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/
#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/
#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/

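/*
 * Illustrative sketch (editor's note, not part of the original sources):
 * a burst routine instantiated with a constant option set, e.g. the
 * hypothetical
 *
 *   MLX5_TXOFF_DECL(csum_only, MLX5_TXOFF_CONFIG_CSUM)
 *
 * expands to a wrapper that calls mlx5_tx_burst_tmpl() with
 * olx == MLX5_TXOFF_CONFIG_CSUM. Inside the template every test such as
 * "if (MLX5_TXOFF_CONFIG(TSO))" is then a compile-time constant, so the
 * branches of the offloads that were not configured are removed by the
 * compiler. The "csum_only" name above is only an example of how the
 * macro is meant to be used.
 */
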
/* The most common offloads groups. */
#define MLX5_TXOFF_CONFIG_NONE 0
#define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \
				MLX5_TXOFF_CONFIG_TSO | \
				MLX5_TXOFF_CONFIG_SWP | \
				MLX5_TXOFF_CONFIG_CSUM | \
				MLX5_TXOFF_CONFIG_INLINE | \
				MLX5_TXOFF_CONFIG_VLAN | \
				MLX5_TXOFF_CONFIG_METADATA)

#define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask)

#define MLX5_TXOFF_DECL(func, olx) \
static uint16_t mlx5_tx_burst_##func(void *txq, \
				     struct rte_mbuf **pkts, \
				     uint16_t pkts_n) \
{ \
	return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \
				  pkts, pkts_n, (olx)); \
}

#define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx},

static __rte_always_inline uint32_t
rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);

static __rte_always_inline int
mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
		 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe);

static __rte_always_inline uint32_t
rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);

static __rte_always_inline void
rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);

static __rte_always_inline void
mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
		 const unsigned int strd_n);

static int
mlx5_queue_state_modify(struct rte_eth_dev *dev,
			struct mlx5_mp_arg_queue_state_modify *sm);

static inline void
mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
			volatile struct mlx5_cqe *restrict cqe,
			uint32_t phcsum);

static inline void
mlx5_lro_update_hdr(uint8_t *restrict padd,
		    volatile struct mlx5_cqe *restrict cqe,
		    uint32_t len);

uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
};

uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned;
uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;

/**
 * Build a table to translate Rx completion flags to packet type.
 *
 * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
 */
void
mlx5_set_ptype_table(void)
{
	unsigned int i;
	uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;

	/* Last entry must not be overwritten, reserved for errored packet.
*/ 139 for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) 140 (*p)[i] = RTE_PTYPE_UNKNOWN; 141 /* 142 * The index to the array should have: 143 * bit[1:0] = l3_hdr_type 144 * bit[4:2] = l4_hdr_type 145 * bit[5] = ip_frag 146 * bit[6] = tunneled 147 * bit[7] = outer_l3_type 148 */ 149 /* L2 */ 150 (*p)[0x00] = RTE_PTYPE_L2_ETHER; 151 /* L3 */ 152 (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 153 RTE_PTYPE_L4_NONFRAG; 154 (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 155 RTE_PTYPE_L4_NONFRAG; 156 /* Fragmented */ 157 (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 158 RTE_PTYPE_L4_FRAG; 159 (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 160 RTE_PTYPE_L4_FRAG; 161 /* TCP */ 162 (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 163 RTE_PTYPE_L4_TCP; 164 (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 165 RTE_PTYPE_L4_TCP; 166 (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 167 RTE_PTYPE_L4_TCP; 168 (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 169 RTE_PTYPE_L4_TCP; 170 (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 171 RTE_PTYPE_L4_TCP; 172 (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 173 RTE_PTYPE_L4_TCP; 174 /* UDP */ 175 (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 176 RTE_PTYPE_L4_UDP; 177 (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 178 RTE_PTYPE_L4_UDP; 179 /* Repeat with outer_l3_type being set. Just in case. */ 180 (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 181 RTE_PTYPE_L4_NONFRAG; 182 (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 183 RTE_PTYPE_L4_NONFRAG; 184 (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 185 RTE_PTYPE_L4_FRAG; 186 (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 187 RTE_PTYPE_L4_FRAG; 188 (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 189 RTE_PTYPE_L4_TCP; 190 (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 191 RTE_PTYPE_L4_TCP; 192 (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 193 RTE_PTYPE_L4_TCP; 194 (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 195 RTE_PTYPE_L4_TCP; 196 (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 197 RTE_PTYPE_L4_TCP; 198 (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 199 RTE_PTYPE_L4_TCP; 200 (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 201 RTE_PTYPE_L4_UDP; 202 (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 203 RTE_PTYPE_L4_UDP; 204 /* Tunneled - L3 */ 205 (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; 206 (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 207 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 208 RTE_PTYPE_INNER_L4_NONFRAG; 209 (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 210 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 211 RTE_PTYPE_INNER_L4_NONFRAG; 212 (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; 213 (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 214 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 215 RTE_PTYPE_INNER_L4_NONFRAG; 216 (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 217 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 218 RTE_PTYPE_INNER_L4_NONFRAG; 219 /* Tunneled - Fragmented */ 220 (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 221 
RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 222 RTE_PTYPE_INNER_L4_FRAG; 223 (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 224 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 225 RTE_PTYPE_INNER_L4_FRAG; 226 (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 227 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 228 RTE_PTYPE_INNER_L4_FRAG; 229 (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 230 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 231 RTE_PTYPE_INNER_L4_FRAG; 232 /* Tunneled - TCP */ 233 (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 234 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 235 RTE_PTYPE_INNER_L4_TCP; 236 (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 237 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 238 RTE_PTYPE_INNER_L4_TCP; 239 (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 240 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 241 RTE_PTYPE_INNER_L4_TCP; 242 (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 243 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 244 RTE_PTYPE_INNER_L4_TCP; 245 (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 246 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 247 RTE_PTYPE_INNER_L4_TCP; 248 (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 249 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 250 RTE_PTYPE_INNER_L4_TCP; 251 (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 252 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 253 RTE_PTYPE_INNER_L4_TCP; 254 (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 255 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 256 RTE_PTYPE_INNER_L4_TCP; 257 (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 258 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 259 RTE_PTYPE_INNER_L4_TCP; 260 (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 261 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 262 RTE_PTYPE_INNER_L4_TCP; 263 (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 264 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 265 RTE_PTYPE_INNER_L4_TCP; 266 (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 267 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 268 RTE_PTYPE_INNER_L4_TCP; 269 /* Tunneled - UDP */ 270 (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 271 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 272 RTE_PTYPE_INNER_L4_UDP; 273 (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | 274 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 275 RTE_PTYPE_INNER_L4_UDP; 276 (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 277 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | 278 RTE_PTYPE_INNER_L4_UDP; 279 (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 280 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | 281 RTE_PTYPE_INNER_L4_UDP; 282 } 283 284 /** 285 * Build a table to translate packet to checksum type of Verbs. 286 */ 287 void 288 mlx5_set_cksum_table(void) 289 { 290 unsigned int i; 291 uint8_t v; 292 293 /* 294 * The index should have: 295 * bit[0] = PKT_TX_TCP_SEG 296 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM 297 * bit[4] = PKT_TX_IP_CKSUM 298 * bit[8] = PKT_TX_OUTER_IP_CKSUM 299 * bit[9] = tunnel 300 */ 301 for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) { 302 v = 0; 303 if (i & (1 << 9)) { 304 /* Tunneled packet. */ 305 if (i & (1 << 8)) /* Outer IP. */ 306 v |= MLX5_ETH_WQE_L3_CSUM; 307 if (i & (1 << 4)) /* Inner IP. */ 308 v |= MLX5_ETH_WQE_L3_INNER_CSUM; 309 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ 310 v |= MLX5_ETH_WQE_L4_INNER_CSUM; 311 } else { 312 /* No tunnel. 
			 */
			if (i & (1 << 4)) /* IP. */
				v |= MLX5_ETH_WQE_L3_CSUM;
			if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
				v |= MLX5_ETH_WQE_L4_CSUM;
		}
		mlx5_cksum_table[i] = v;
	}
}

/**
 * Build a table to translate packet type of mbuf to SWP type of Verbs.
 */
void
mlx5_set_swp_types_table(void)
{
	unsigned int i;
	uint8_t v;

	/*
	 * The index should have:
	 * bit[0:1] = PKT_TX_L4_MASK
	 * bit[4] = PKT_TX_IPV6
	 * bit[8] = PKT_TX_OUTER_IPV6
	 * bit[9] = PKT_TX_OUTER_UDP
	 */
	for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) {
		v = 0;
		if (i & (1 << 8))
			v |= MLX5_ETH_WQE_L3_OUTER_IPV6;
		if (i & (1 << 9))
			v |= MLX5_ETH_WQE_L4_OUTER_UDP;
		if (i & (1 << 4))
			v |= MLX5_ETH_WQE_L3_INNER_IPV6;
		if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52))
			v |= MLX5_ETH_WQE_L4_INNER_UDP;
		mlx5_swp_types_table[i] = v;
	}
}

/**
 * Set Software Parser flags and offsets in Ethernet Segment of WQE.
 * Flags must be preliminarily initialized to zero.
 *
 * @param loc
 *   Pointer to burst routine local context.
 * @param swp_flags
 *   Pointer to store Software Parser flags.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 *
 * @return
 *   Software Parser offsets packed in dword.
 *   Software Parser flags are set by pointer.
 */
static __rte_always_inline uint32_t
txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc,
		uint8_t *swp_flags,
		unsigned int olx)
{
	uint64_t ol, tunnel;
	unsigned int idx, off;
	uint32_t set;

	if (!MLX5_TXOFF_CONFIG(SWP))
		return 0;
	ol = loc->mbuf->ol_flags;
	tunnel = ol & PKT_TX_TUNNEL_MASK;
	/*
	 * Check whether Software Parser is required.
	 * Only the custom (generic UDP and IP) tunnels request it.
	 */
	if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP))
		return 0;
	/*
	 * The index should have:
	 * bit[0:1] = PKT_TX_L4_MASK
	 * bit[4] = PKT_TX_IPV6
	 * bit[8] = PKT_TX_OUTER_IPV6
	 * bit[9] = PKT_TX_OUTER_UDP
	 */
	idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52;
	idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0;
	*swp_flags = mlx5_swp_types_table[idx];
	/*
	 * Set offsets for the SW parser. Since ConnectX-5, the SW parser just
	 * complements the HW parser and starts to engage only if the HW
	 * parser cannot reach a header. On older devices, the HW parser does
	 * not kick in at all if any of the SWP offsets is set. Therefore, all
	 * of the L3 offsets should be set regardless of HW offload.
	 */
	off = loc->mbuf->outer_l2_len;
	if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT)
		off += sizeof(struct rte_vlan_hdr);
	set = (off >> 1) << 8; /* Outer L3 offset. */
	off += loc->mbuf->outer_l3_len;
	if (tunnel == PKT_TX_TUNNEL_UDP)
		set |= off >> 1; /* Outer L4 offset. */
	if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */
		const uint64_t csum = ol & PKT_TX_L4_MASK;
		off += loc->mbuf->l2_len;
		set |= (off >> 1) << 24; /* Inner L3 offset. */
		if (csum == PKT_TX_TCP_CKSUM ||
		    csum == PKT_TX_UDP_CKSUM ||
		    (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) {
			off += loc->mbuf->l3_len;
			set |= (off >> 1) << 16; /* Inner L4 offset. */
		}
	}
	set = rte_cpu_to_le_32(set);
	return set;
}

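/*
 * Illustrative example (editor's note, values assumed, not from the original
 * code): the dword returned by txq_mbuf_to_swp() packs the parser offsets in
 * 2-byte units - bits 15:8 outer L3, bits 7:0 outer L4, bits 31:24 inner L3,
 * bits 23:16 inner L4. Assuming a generic UDP tunnel with outer_l2_len = 14,
 * outer_l3_len = 20, l2_len = 30 (UDP + tunnel + inner Ethernet) and
 * l3_len = 20, the stored offsets would be 7, 17, 32 and 42 respectively,
 * i.e. the byte offsets 14, 34, 64 and 84 divided by two.
 */
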
/**
 * Convert the Checksum offloads to Verbs.
 *
 * @param buf
 *   Pointer to the mbuf.
 *
 * @return
 *   Converted checksum flags.
 */
static __rte_always_inline uint8_t
txq_ol_cksum_to_cs(struct rte_mbuf *buf)
{
	uint32_t idx;
	uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
	const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
				       PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;

	/*
	 * The index should have:
	 * bit[0] = PKT_TX_TCP_SEG
	 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
	 * bit[4] = PKT_TX_IP_CKSUM
	 * bit[8] = PKT_TX_OUTER_IP_CKSUM
	 * bit[9] = tunnel
	 */
	idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
	return mlx5_cksum_table[idx];
}

/**
 * Internal function to compute the number of used descriptors in an Rx queue.
 *
 * @param rxq
 *   The Rx queue.
 *
 * @return
 *   The number of used Rx descriptors.
 */
static uint32_t
rx_queue_count(struct mlx5_rxq_data *rxq)
{
	struct rxq_zip *zip = &rxq->zip;
	volatile struct mlx5_cqe *cqe;
	const unsigned int cqe_n = (1 << rxq->cqe_n);
	const unsigned int cqe_cnt = cqe_n - 1;
	unsigned int cq_ci;
	unsigned int used;

	/* If we are processing a compressed CQE. */
	if (zip->ai) {
		used = zip->cqe_cnt - zip->ca;
		cq_ci = zip->cq_ci;
	} else {
		used = 0;
		cq_ci = rxq->cq_ci;
	}
	cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
	while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
		int8_t op_own;
		unsigned int n;

		op_own = cqe->op_own;
		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
			n = rte_be_to_cpu_32(cqe->byte_cnt);
		else
			n = 1;
		cq_ci += n;
		used += n;
		cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
	}
	used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
	return used;
}

/**
 * DPDK callback to check the status of an Rx descriptor.
 *
 * @param rx_queue
 *   The Rx queue.
 * @param[in] offset
 *   The index of the descriptor in the ring.
 *
 * @return
 *   The status of the Rx descriptor.
 */
int
mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
{
	struct mlx5_rxq_data *rxq = rx_queue;
	struct mlx5_rxq_ctrl *rxq_ctrl =
			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
	struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);

	if (dev->rx_pkt_burst != mlx5_rx_burst) {
		rte_errno = ENOTSUP;
		return -rte_errno;
	}
	if (offset >= (1 << rxq->elts_n)) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	if (offset < rx_queue_count(rxq))
		return RTE_ETH_RX_DESC_DONE;
	return RTE_ETH_RX_DESC_AVAIL;
}

/**
 * DPDK callback to get the number of used descriptors in an Rx queue.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param rx_queue_id
 *   The Rx queue.
 *
 * @return
 *   The number of used Rx descriptors.
 *   -EINVAL if the queue is invalid.
 */
uint32_t
mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_rxq_data *rxq;

	if (dev->rx_pkt_burst != mlx5_rx_burst) {
		rte_errno = ENOTSUP;
		return -rte_errno;
	}
	rxq = (*priv->rxqs)[rx_queue_id];
	if (!rxq) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	return rx_queue_count(rxq);
}

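/*
 * Illustrative usage sketch (editor's note, not from the original code): the
 * two callbacks above back the generic ethdev queries, e.g. an application
 * could poll
 *
 *   int st = rte_eth_rx_descriptor_status(port_id, queue_id, offset);
 *   int used = rte_eth_rx_queue_count(port_id, queue_id);
 *
 * where port_id, queue_id and offset are the application's own values. Both
 * calls refuse with ENOTSUP here when an Rx burst function other than
 * mlx5_rx_burst (e.g. the vectorized or MPRQ variant) is installed.
 */
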
#define MLX5_SYSTEM_LOG_DIR "/var/log"
/**
 * Dump debug information to a log file.
 *
 * @param fname
 *   The file name.
 * @param hex_title
 *   If not NULL this string is printed as a header to the output
 *   and the output will be in hexadecimal view.
 * @param buf
 *   This is the buffer address to print out.
 * @param hex_len
 *   The number of bytes to dump out.
 */
void
mlx5_dump_debug_information(const char *fname, const char *hex_title,
			    const void *buf, unsigned int hex_len)
{
	FILE *fd;

	MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname);
	fd = fopen(path, "a+");
	if (!fd) {
		DRV_LOG(WARNING, "cannot open %s for debug dump\n", path);
		MKSTR(path2, "./%s", fname);
		fd = fopen(path2, "a+");
		if (!fd) {
			DRV_LOG(ERR, "cannot open %s for debug dump\n", path2);
			return;
		}
		DRV_LOG(INFO, "New debug dump in file %s\n", path2);
	} else {
		DRV_LOG(INFO, "New debug dump in file %s\n", path);
	}
	if (hex_title)
		rte_hexdump(fd, hex_title, buf, hex_len);
	else
		fprintf(fd, "%s", (const char *)buf);
	fprintf(fd, "\n\n\n");
	fclose(fd);
}

/**
 * Move QP from error state to running state and initialize indexes.
 *
 * @param txq_ctrl
 *   Pointer to TX queue control structure.
 *
 * @return
 *   0 on success, else -1.
 */
static int
tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl)
{
	struct mlx5_mp_arg_queue_state_modify sm = {
			.is_wq = 0,
			.queue_id = txq_ctrl->txq.idx,
	};

	if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm))
		return -1;
	txq_ctrl->txq.wqe_ci = 0;
	txq_ctrl->txq.wqe_pi = 0;
	txq_ctrl->txq.elts_comp = 0;
	return 0;
}

/* Return 1 if the error CQE is already marked as seen, otherwise mark it
 * as seen and return 0.
 */
static int
check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
{
	static const uint8_t magic[] = "seen";
	int ret = 1;
	unsigned int i;

	for (i = 0; i < sizeof(magic); ++i)
		if (!ret || err_cqe->rsvd1[i] != magic[i]) {
			ret = 0;
			err_cqe->rsvd1[i] = magic[i];
		}
	return ret;
}

/**
 * Handle error CQE.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param err_cqe
 *   Pointer to the error CQE.
 *
 * @return
 *   Negative value if queue recovery failed,
 *   the last Tx buffer element to free otherwise.
659 */ 660 int 661 mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, 662 volatile struct mlx5_err_cqe *err_cqe) 663 { 664 if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { 665 const uint16_t wqe_m = ((1 << txq->wqe_n) - 1); 666 struct mlx5_txq_ctrl *txq_ctrl = 667 container_of(txq, struct mlx5_txq_ctrl, txq); 668 uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter); 669 int seen = check_err_cqe_seen(err_cqe); 670 671 if (!seen && txq_ctrl->dump_file_n < 672 txq_ctrl->priv->config.max_dump_files_num) { 673 MKSTR(err_str, "Unexpected CQE error syndrome " 674 "0x%02x CQN = %u SQN = %u wqe_counter = %u " 675 "wq_ci = %u cq_ci = %u", err_cqe->syndrome, 676 txq->cqe_s, txq->qp_num_8s >> 8, 677 rte_be_to_cpu_16(err_cqe->wqe_counter), 678 txq->wqe_ci, txq->cq_ci); 679 MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u", 680 PORT_ID(txq_ctrl->priv), txq->idx, 681 txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc()); 682 mlx5_dump_debug_information(name, NULL, err_str, 0); 683 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 684 (const void *)((uintptr_t) 685 txq->cqes), 686 sizeof(*err_cqe) * 687 (1 << txq->cqe_n)); 688 mlx5_dump_debug_information(name, "MLX5 Error SQ:", 689 (const void *)((uintptr_t) 690 txq->wqes), 691 MLX5_WQE_SIZE * 692 (1 << txq->wqe_n)); 693 txq_ctrl->dump_file_n++; 694 } 695 if (!seen) 696 /* 697 * Count errors in WQEs units. 698 * Later it can be improved to count error packets, 699 * for example, by SQ parsing to find how much packets 700 * should be counted for each WQE. 701 */ 702 txq->stats.oerrors += ((txq->wqe_ci & wqe_m) - 703 new_wqe_pi) & wqe_m; 704 if (tx_recover_qp(txq_ctrl) == 0) { 705 txq->cq_ci++; 706 /* Release all the remaining buffers. */ 707 return txq->elts_head; 708 } 709 /* Recovering failed - try again later on the same WQE. */ 710 return -1; 711 } else { 712 txq->cq_ci++; 713 } 714 /* Do not release buffers. */ 715 return txq->elts_tail; 716 } 717 718 /** 719 * Translate RX completion flags to packet type. 720 * 721 * @param[in] rxq 722 * Pointer to RX queue structure. 723 * @param[in] cqe 724 * Pointer to CQE. 725 * 726 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 727 * 728 * @return 729 * Packet type for struct rte_mbuf. 730 */ 731 static inline uint32_t 732 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) 733 { 734 uint8_t idx; 735 uint8_t pinfo = cqe->pkt_info; 736 uint16_t ptype = cqe->hdr_type_etc; 737 738 /* 739 * The index to the array should have: 740 * bit[1:0] = l3_hdr_type 741 * bit[4:2] = l4_hdr_type 742 * bit[5] = ip_frag 743 * bit[6] = tunneled 744 * bit[7] = outer_l3_type 745 */ 746 idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); 747 return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); 748 } 749 750 /** 751 * Initialize Rx WQ and indexes. 752 * 753 * @param[in] rxq 754 * Pointer to RX queue structure. 
755 */ 756 void 757 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) 758 { 759 const unsigned int wqe_n = 1 << rxq->elts_n; 760 unsigned int i; 761 762 for (i = 0; (i != wqe_n); ++i) { 763 volatile struct mlx5_wqe_data_seg *scat; 764 uintptr_t addr; 765 uint32_t byte_count; 766 767 if (mlx5_rxq_mprq_enabled(rxq)) { 768 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i]; 769 770 scat = &((volatile struct mlx5_wqe_mprq *) 771 rxq->wqes)[i].dseg; 772 addr = (uintptr_t)mlx5_mprq_buf_addr(buf, 773 1 << rxq->strd_num_n); 774 byte_count = (1 << rxq->strd_sz_n) * 775 (1 << rxq->strd_num_n); 776 } else { 777 struct rte_mbuf *buf = (*rxq->elts)[i]; 778 779 scat = &((volatile struct mlx5_wqe_data_seg *) 780 rxq->wqes)[i]; 781 addr = rte_pktmbuf_mtod(buf, uintptr_t); 782 byte_count = DATA_LEN(buf); 783 } 784 /* scat->addr must be able to store a pointer. */ 785 assert(sizeof(scat->addr) >= sizeof(uintptr_t)); 786 *scat = (struct mlx5_wqe_data_seg){ 787 .addr = rte_cpu_to_be_64(addr), 788 .byte_count = rte_cpu_to_be_32(byte_count), 789 .lkey = mlx5_rx_addr2mr(rxq, addr), 790 }; 791 } 792 rxq->consumed_strd = 0; 793 rxq->decompressed = 0; 794 rxq->rq_pi = 0; 795 rxq->zip = (struct rxq_zip){ 796 .ai = 0, 797 }; 798 /* Update doorbell counter. */ 799 rxq->rq_ci = wqe_n >> rxq->sges_n; 800 rte_cio_wmb(); 801 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 802 } 803 804 /** 805 * Modify a Verbs/DevX queue state. 806 * This must be called from the primary process. 807 * 808 * @param dev 809 * Pointer to Ethernet device. 810 * @param sm 811 * State modify request parameters. 812 * 813 * @return 814 * 0 in case of success else non-zero value and rte_errno is set. 815 */ 816 int 817 mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, 818 const struct mlx5_mp_arg_queue_state_modify *sm) 819 { 820 int ret; 821 struct mlx5_priv *priv = dev->data->dev_private; 822 823 if (sm->is_wq) { 824 struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id]; 825 struct mlx5_rxq_ctrl *rxq_ctrl = 826 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 827 828 if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { 829 struct ibv_wq_attr mod = { 830 .attr_mask = IBV_WQ_ATTR_STATE, 831 .wq_state = sm->state, 832 }; 833 834 ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); 835 } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. 
*/ 836 struct mlx5_devx_modify_rq_attr rq_attr; 837 838 memset(&rq_attr, 0, sizeof(rq_attr)); 839 if (sm->state == IBV_WQS_RESET) { 840 rq_attr.rq_state = MLX5_RQC_STATE_ERR; 841 rq_attr.state = MLX5_RQC_STATE_RST; 842 } else if (sm->state == IBV_WQS_RDY) { 843 rq_attr.rq_state = MLX5_RQC_STATE_RST; 844 rq_attr.state = MLX5_RQC_STATE_RDY; 845 } else if (sm->state == IBV_WQS_ERR) { 846 rq_attr.rq_state = MLX5_RQC_STATE_RDY; 847 rq_attr.state = MLX5_RQC_STATE_ERR; 848 } 849 ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, 850 &rq_attr); 851 } 852 if (ret) { 853 DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s\n", 854 sm->state, strerror(errno)); 855 rte_errno = errno; 856 return ret; 857 } 858 } else { 859 struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id]; 860 struct mlx5_txq_ctrl *txq_ctrl = 861 container_of(txq, struct mlx5_txq_ctrl, txq); 862 struct ibv_qp_attr mod = { 863 .qp_state = IBV_QPS_RESET, 864 .port_num = (uint8_t)priv->ibv_port, 865 }; 866 struct ibv_qp *qp = txq_ctrl->ibv->qp; 867 868 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 869 if (ret) { 870 DRV_LOG(ERR, "Cannot change the Tx QP state to RESET " 871 "%s\n", strerror(errno)); 872 rte_errno = errno; 873 return ret; 874 } 875 mod.qp_state = IBV_QPS_INIT; 876 ret = mlx5_glue->modify_qp(qp, &mod, 877 (IBV_QP_STATE | IBV_QP_PORT)); 878 if (ret) { 879 DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s\n", 880 strerror(errno)); 881 rte_errno = errno; 882 return ret; 883 } 884 mod.qp_state = IBV_QPS_RTR; 885 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 886 if (ret) { 887 DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s\n", 888 strerror(errno)); 889 rte_errno = errno; 890 return ret; 891 } 892 mod.qp_state = IBV_QPS_RTS; 893 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); 894 if (ret) { 895 DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s\n", 896 strerror(errno)); 897 rte_errno = errno; 898 return ret; 899 } 900 } 901 return 0; 902 } 903 904 /** 905 * Modify a Verbs queue state. 906 * 907 * @param dev 908 * Pointer to Ethernet device. 909 * @param sm 910 * State modify request parameters. 911 * 912 * @return 913 * 0 in case of success else non-zero value. 914 */ 915 static int 916 mlx5_queue_state_modify(struct rte_eth_dev *dev, 917 struct mlx5_mp_arg_queue_state_modify *sm) 918 { 919 int ret = 0; 920 921 switch (rte_eal_process_type()) { 922 case RTE_PROC_PRIMARY: 923 ret = mlx5_queue_state_modify_primary(dev, sm); 924 break; 925 case RTE_PROC_SECONDARY: 926 ret = mlx5_mp_req_queue_state_modify(dev, sm); 927 break; 928 default: 929 break; 930 } 931 return ret; 932 } 933 934 /** 935 * Handle a Rx error. 936 * The function inserts the RQ state to reset when the first error CQE is 937 * shown, then drains the CQ by the caller function loop. When the CQ is empty, 938 * it moves the RQ state to ready and initializes the RQ. 939 * Next CQE identification and error counting are in the caller responsibility. 940 * 941 * @param[in] rxq 942 * Pointer to RX queue structure. 943 * @param[in] mbuf_prepare 944 * Whether to prepare mbufs for the RQ. 945 * 946 * @return 947 * -1 in case of recovery error, otherwise the CQE status. 
948 */ 949 int 950 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t mbuf_prepare) 951 { 952 const uint16_t cqe_n = 1 << rxq->cqe_n; 953 const uint16_t cqe_mask = cqe_n - 1; 954 const unsigned int wqe_n = 1 << rxq->elts_n; 955 struct mlx5_rxq_ctrl *rxq_ctrl = 956 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 957 union { 958 volatile struct mlx5_cqe *cqe; 959 volatile struct mlx5_err_cqe *err_cqe; 960 } u = { 961 .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask], 962 }; 963 struct mlx5_mp_arg_queue_state_modify sm; 964 int ret; 965 966 switch (rxq->err_state) { 967 case MLX5_RXQ_ERR_STATE_NO_ERROR: 968 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET; 969 /* Fall-through */ 970 case MLX5_RXQ_ERR_STATE_NEED_RESET: 971 sm.is_wq = 1; 972 sm.queue_id = rxq->idx; 973 sm.state = IBV_WQS_RESET; 974 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm)) 975 return -1; 976 if (rxq_ctrl->dump_file_n < 977 rxq_ctrl->priv->config.max_dump_files_num) { 978 MKSTR(err_str, "Unexpected CQE error syndrome " 979 "0x%02x CQN = %u RQN = %u wqe_counter = %u" 980 " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome, 981 rxq->cqn, rxq_ctrl->wqn, 982 rte_be_to_cpu_16(u.err_cqe->wqe_counter), 983 rxq->rq_ci << rxq->sges_n, rxq->cq_ci); 984 MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u", 985 rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc()); 986 mlx5_dump_debug_information(name, NULL, err_str, 0); 987 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 988 (const void *)((uintptr_t) 989 rxq->cqes), 990 sizeof(*u.cqe) * cqe_n); 991 mlx5_dump_debug_information(name, "MLX5 Error RQ:", 992 (const void *)((uintptr_t) 993 rxq->wqes), 994 16 * wqe_n); 995 rxq_ctrl->dump_file_n++; 996 } 997 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY; 998 /* Fall-through */ 999 case MLX5_RXQ_ERR_STATE_NEED_READY: 1000 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); 1001 if (ret == MLX5_CQE_STATUS_HW_OWN) { 1002 rte_cio_wmb(); 1003 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1004 rte_cio_wmb(); 1005 /* 1006 * The RQ consumer index must be zeroed while moving 1007 * from RESET state to RDY state. 1008 */ 1009 *rxq->rq_db = rte_cpu_to_be_32(0); 1010 rte_cio_wmb(); 1011 sm.is_wq = 1; 1012 sm.queue_id = rxq->idx; 1013 sm.state = IBV_WQS_RDY; 1014 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), 1015 &sm)) 1016 return -1; 1017 if (mbuf_prepare) { 1018 const uint16_t q_mask = wqe_n - 1; 1019 uint16_t elt_idx; 1020 struct rte_mbuf **elt; 1021 int i; 1022 unsigned int n = wqe_n - (rxq->rq_ci - 1023 rxq->rq_pi); 1024 1025 for (i = 0; i < (int)n; ++i) { 1026 elt_idx = (rxq->rq_ci + i) & q_mask; 1027 elt = &(*rxq->elts)[elt_idx]; 1028 *elt = rte_mbuf_raw_alloc(rxq->mp); 1029 if (!*elt) { 1030 for (i--; i >= 0; --i) { 1031 elt_idx = (rxq->rq_ci + 1032 i) & q_mask; 1033 elt = &(*rxq->elts) 1034 [elt_idx]; 1035 rte_pktmbuf_free_seg 1036 (*elt); 1037 } 1038 return -1; 1039 } 1040 } 1041 } 1042 mlx5_rxq_initialize(rxq); 1043 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 1044 } 1045 return ret; 1046 default: 1047 return -1; 1048 } 1049 } 1050 1051 /** 1052 * Get size of the next packet for a given CQE. For compressed CQEs, the 1053 * consumer index is updated only once all packets of the current one have 1054 * been processed. 1055 * 1056 * @param rxq 1057 * Pointer to RX queue. 1058 * @param cqe 1059 * CQE to process. 1060 * @param[out] mcqe 1061 * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not 1062 * written. 1063 * 1064 * @return 1065 * 0 in case of empty CQE, otherwise the packet size in bytes. 
1066 */ 1067 static inline int 1068 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 1069 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe) 1070 { 1071 struct rxq_zip *zip = &rxq->zip; 1072 uint16_t cqe_n = cqe_cnt + 1; 1073 int len; 1074 uint16_t idx, end; 1075 1076 do { 1077 len = 0; 1078 /* Process compressed data in the CQE and mini arrays. */ 1079 if (zip->ai) { 1080 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1081 (volatile struct mlx5_mini_cqe8 (*)[8]) 1082 (uintptr_t)(&(*rxq->cqes)[zip->ca & 1083 cqe_cnt].pkt_info); 1084 1085 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); 1086 *mcqe = &(*mc)[zip->ai & 7]; 1087 if ((++zip->ai & 7) == 0) { 1088 /* Invalidate consumed CQEs */ 1089 idx = zip->ca; 1090 end = zip->na; 1091 while (idx != end) { 1092 (*rxq->cqes)[idx & cqe_cnt].op_own = 1093 MLX5_CQE_INVALIDATE; 1094 ++idx; 1095 } 1096 /* 1097 * Increment consumer index to skip the number 1098 * of CQEs consumed. Hardware leaves holes in 1099 * the CQ ring for software use. 1100 */ 1101 zip->ca = zip->na; 1102 zip->na += 8; 1103 } 1104 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 1105 /* Invalidate the rest */ 1106 idx = zip->ca; 1107 end = zip->cq_ci; 1108 1109 while (idx != end) { 1110 (*rxq->cqes)[idx & cqe_cnt].op_own = 1111 MLX5_CQE_INVALIDATE; 1112 ++idx; 1113 } 1114 rxq->cq_ci = zip->cq_ci; 1115 zip->ai = 0; 1116 } 1117 /* 1118 * No compressed data, get next CQE and verify if it is 1119 * compressed. 1120 */ 1121 } else { 1122 int ret; 1123 int8_t op_own; 1124 1125 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 1126 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 1127 if (unlikely(ret == MLX5_CQE_STATUS_ERR || 1128 rxq->err_state)) { 1129 ret = mlx5_rx_err_handle(rxq, 0); 1130 if (ret == MLX5_CQE_STATUS_HW_OWN || 1131 ret == -1) 1132 return 0; 1133 } else { 1134 return 0; 1135 } 1136 } 1137 ++rxq->cq_ci; 1138 op_own = cqe->op_own; 1139 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 1140 volatile struct mlx5_mini_cqe8 (*mc)[8] = 1141 (volatile struct mlx5_mini_cqe8 (*)[8]) 1142 (uintptr_t)(&(*rxq->cqes) 1143 [rxq->cq_ci & 1144 cqe_cnt].pkt_info); 1145 1146 /* Fix endianness. */ 1147 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); 1148 /* 1149 * Current mini array position is the one 1150 * returned by check_cqe64(). 1151 * 1152 * If completion comprises several mini arrays, 1153 * as a special case the second one is located 1154 * 7 CQEs after the initial CQE instead of 8 1155 * for subsequent ones. 1156 */ 1157 zip->ca = rxq->cq_ci; 1158 zip->na = zip->ca + 7; 1159 /* Compute the next non compressed CQE. */ 1160 --rxq->cq_ci; 1161 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 1162 /* Get packet size to return. */ 1163 len = rte_be_to_cpu_32((*mc)[0].byte_cnt); 1164 *mcqe = &(*mc)[0]; 1165 zip->ai = 1; 1166 /* Prefetch all to be invalidated */ 1167 idx = zip->ca; 1168 end = zip->cq_ci; 1169 while (idx != end) { 1170 rte_prefetch0(&(*rxq->cqes)[(idx) & 1171 cqe_cnt]); 1172 ++idx; 1173 } 1174 } else { 1175 len = rte_be_to_cpu_32(cqe->byte_cnt); 1176 } 1177 } 1178 if (unlikely(rxq->err_state)) { 1179 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1180 ++rxq->stats.idropped; 1181 } else { 1182 return len; 1183 } 1184 } while (1); 1185 } 1186 1187 /** 1188 * Translate RX completion flags to offload flags. 1189 * 1190 * @param[in] cqe 1191 * Pointer to CQE. 1192 * 1193 * @return 1194 * Offload flags (ol_flags) for struct rte_mbuf. 
1195 */ 1196 static inline uint32_t 1197 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) 1198 { 1199 uint32_t ol_flags = 0; 1200 uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); 1201 1202 ol_flags = 1203 TRANSPOSE(flags, 1204 MLX5_CQE_RX_L3_HDR_VALID, 1205 PKT_RX_IP_CKSUM_GOOD) | 1206 TRANSPOSE(flags, 1207 MLX5_CQE_RX_L4_HDR_VALID, 1208 PKT_RX_L4_CKSUM_GOOD); 1209 return ol_flags; 1210 } 1211 1212 /** 1213 * Fill in mbuf fields from RX completion flags. 1214 * Note that pkt->ol_flags should be initialized outside of this function. 1215 * 1216 * @param rxq 1217 * Pointer to RX queue. 1218 * @param pkt 1219 * mbuf to fill. 1220 * @param cqe 1221 * CQE to process. 1222 * @param rss_hash_res 1223 * Packet RSS Hash result. 1224 */ 1225 static inline void 1226 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 1227 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) 1228 { 1229 /* Update packet information. */ 1230 pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); 1231 if (rss_hash_res && rxq->rss_hash) { 1232 pkt->hash.rss = rss_hash_res; 1233 pkt->ol_flags |= PKT_RX_RSS_HASH; 1234 } 1235 if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { 1236 pkt->ol_flags |= PKT_RX_FDIR; 1237 if (cqe->sop_drop_qpn != 1238 rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { 1239 uint32_t mark = cqe->sop_drop_qpn; 1240 1241 pkt->ol_flags |= PKT_RX_FDIR_ID; 1242 pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); 1243 } 1244 } 1245 if (rxq->csum) 1246 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); 1247 if (rxq->vlan_strip && 1248 (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { 1249 pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; 1250 pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); 1251 } 1252 if (rxq->hw_timestamp) { 1253 pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); 1254 pkt->ol_flags |= PKT_RX_TIMESTAMP; 1255 } 1256 } 1257 1258 /** 1259 * DPDK callback for RX. 1260 * 1261 * @param dpdk_rxq 1262 * Generic pointer to RX queue structure. 1263 * @param[out] pkts 1264 * Array to store received packets. 1265 * @param pkts_n 1266 * Maximum number of packets in array. 1267 * 1268 * @return 1269 * Number of packets successfully received (<= pkts_n). 1270 */ 1271 uint16_t 1272 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1273 { 1274 struct mlx5_rxq_data *rxq = dpdk_rxq; 1275 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 1276 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 1277 const unsigned int sges_n = rxq->sges_n; 1278 struct rte_mbuf *pkt = NULL; 1279 struct rte_mbuf *seg = NULL; 1280 volatile struct mlx5_cqe *cqe = 1281 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1282 unsigned int i = 0; 1283 unsigned int rq_ci = rxq->rq_ci << sges_n; 1284 int len = 0; /* keep its value across iterations. */ 1285 1286 while (pkts_n) { 1287 unsigned int idx = rq_ci & wqe_cnt; 1288 volatile struct mlx5_wqe_data_seg *wqe = 1289 &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; 1290 struct rte_mbuf *rep = (*rxq->elts)[idx]; 1291 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1292 uint32_t rss_hash_res; 1293 1294 if (pkt) 1295 NEXT(seg) = rep; 1296 seg = rep; 1297 rte_prefetch0(seg); 1298 rte_prefetch0(cqe); 1299 rte_prefetch0(wqe); 1300 rep = rte_mbuf_raw_alloc(rxq->mp); 1301 if (unlikely(rep == NULL)) { 1302 ++rxq->stats.rx_nombuf; 1303 if (!pkt) { 1304 /* 1305 * no buffers before we even started, 1306 * bail out silently. 
1307 */ 1308 break; 1309 } 1310 while (pkt != seg) { 1311 assert(pkt != (*rxq->elts)[idx]); 1312 rep = NEXT(pkt); 1313 NEXT(pkt) = NULL; 1314 NB_SEGS(pkt) = 1; 1315 rte_mbuf_raw_free(pkt); 1316 pkt = rep; 1317 } 1318 break; 1319 } 1320 if (!pkt) { 1321 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 1322 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe); 1323 if (!len) { 1324 rte_mbuf_raw_free(rep); 1325 break; 1326 } 1327 pkt = seg; 1328 assert(len >= (rxq->crc_present << 2)); 1329 pkt->ol_flags = 0; 1330 /* If compressed, take hash result from mini-CQE. */ 1331 rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? 1332 cqe->rx_hash_res : 1333 mcqe->rx_hash_result); 1334 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1335 if (rxq->crc_present) 1336 len -= RTE_ETHER_CRC_LEN; 1337 PKT_LEN(pkt) = len; 1338 if (cqe->lro_num_seg > 1) { 1339 mlx5_lro_update_hdr 1340 (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, 1341 len); 1342 pkt->ol_flags |= PKT_RX_LRO; 1343 pkt->tso_segsz = len / cqe->lro_num_seg; 1344 } 1345 } 1346 DATA_LEN(rep) = DATA_LEN(seg); 1347 PKT_LEN(rep) = PKT_LEN(seg); 1348 SET_DATA_OFF(rep, DATA_OFF(seg)); 1349 PORT(rep) = PORT(seg); 1350 (*rxq->elts)[idx] = rep; 1351 /* 1352 * Fill NIC descriptor with the new buffer. The lkey and size 1353 * of the buffers are already known, only the buffer address 1354 * changes. 1355 */ 1356 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); 1357 /* If there's only one MR, no need to replace LKey in WQE. */ 1358 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1359 wqe->lkey = mlx5_rx_mb2mr(rxq, rep); 1360 if (len > DATA_LEN(seg)) { 1361 len -= DATA_LEN(seg); 1362 ++NB_SEGS(pkt); 1363 ++rq_ci; 1364 continue; 1365 } 1366 DATA_LEN(seg) = len; 1367 #ifdef MLX5_PMD_SOFT_COUNTERS 1368 /* Increment bytes counter. */ 1369 rxq->stats.ibytes += PKT_LEN(pkt); 1370 #endif 1371 /* Return packet. */ 1372 *(pkts++) = pkt; 1373 pkt = NULL; 1374 --pkts_n; 1375 ++i; 1376 /* Align consumer index to the next stride. */ 1377 rq_ci >>= sges_n; 1378 ++rq_ci; 1379 rq_ci <<= sges_n; 1380 } 1381 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) 1382 return 0; 1383 /* Update the consumer index. */ 1384 rxq->rq_ci = rq_ci >> sges_n; 1385 rte_cio_wmb(); 1386 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1387 rte_cio_wmb(); 1388 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1389 #ifdef MLX5_PMD_SOFT_COUNTERS 1390 /* Increment packets counter. */ 1391 rxq->stats.ipackets += i; 1392 #endif 1393 return i; 1394 } 1395 1396 /** 1397 * Update LRO packet TCP header. 1398 * The HW LRO feature doesn't update the TCP header after coalescing the 1399 * TCP segments but supplies information in CQE to fill it by SW. 1400 * 1401 * @param tcp 1402 * Pointer to the TCP header. 1403 * @param cqe 1404 * Pointer to the completion entry.. 1405 * @param phcsum 1406 * The L3 pseudo-header checksum. 1407 */ 1408 static inline void 1409 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, 1410 volatile struct mlx5_cqe *restrict cqe, 1411 uint32_t phcsum) 1412 { 1413 uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) & 1414 MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT; 1415 /* 1416 * The HW calculates only the TCP payload checksum, need to complete 1417 * the TCP header checksum and the L3 pseudo-header checksum. 
1418 */ 1419 uint32_t csum = phcsum + cqe->csum; 1420 1421 if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK || 1422 l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) { 1423 tcp->tcp_flags |= RTE_TCP_ACK_FLAG; 1424 tcp->recv_ack = cqe->lro_ack_seq_num; 1425 tcp->rx_win = cqe->lro_tcp_win; 1426 } 1427 if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK) 1428 tcp->tcp_flags |= RTE_TCP_PSH_FLAG; 1429 tcp->cksum = 0; 1430 csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4); 1431 csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff); 1432 csum = (~csum) & 0xffff; 1433 if (csum == 0) 1434 csum = 0xffff; 1435 tcp->cksum = csum; 1436 } 1437 1438 /** 1439 * Update LRO packet headers. 1440 * The HW LRO feature doesn't update the L3/TCP headers after coalescing the 1441 * TCP segments but supply information in CQE to fill it by SW. 1442 * 1443 * @param padd 1444 * The packet address. 1445 * @param cqe 1446 * Pointer to the completion entry.. 1447 * @param len 1448 * The packet length. 1449 */ 1450 static inline void 1451 mlx5_lro_update_hdr(uint8_t *restrict padd, 1452 volatile struct mlx5_cqe *restrict cqe, 1453 uint32_t len) 1454 { 1455 union { 1456 struct rte_ether_hdr *eth; 1457 struct rte_vlan_hdr *vlan; 1458 struct rte_ipv4_hdr *ipv4; 1459 struct rte_ipv6_hdr *ipv6; 1460 struct rte_tcp_hdr *tcp; 1461 uint8_t *hdr; 1462 } h = { 1463 .hdr = padd, 1464 }; 1465 uint16_t proto = h.eth->ether_type; 1466 uint32_t phcsum; 1467 1468 h.eth++; 1469 while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) || 1470 proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) { 1471 proto = h.vlan->eth_proto; 1472 h.vlan++; 1473 } 1474 if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) { 1475 h.ipv4->time_to_live = cqe->lro_min_ttl; 1476 h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd)); 1477 h.ipv4->hdr_checksum = 0; 1478 h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4); 1479 phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0); 1480 h.ipv4++; 1481 } else { 1482 h.ipv6->hop_limits = cqe->lro_min_ttl; 1483 h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) - 1484 sizeof(*h.ipv6)); 1485 phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0); 1486 h.ipv6++; 1487 } 1488 mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum); 1489 } 1490 1491 void 1492 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) 1493 { 1494 struct mlx5_mprq_buf *buf = opaque; 1495 1496 if (rte_atomic16_read(&buf->refcnt) == 1) { 1497 rte_mempool_put(buf->mp, buf); 1498 } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { 1499 rte_atomic16_set(&buf->refcnt, 1); 1500 rte_mempool_put(buf->mp, buf); 1501 } 1502 } 1503 1504 void 1505 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) 1506 { 1507 mlx5_mprq_buf_free_cb(NULL, buf); 1508 } 1509 1510 static inline void 1511 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, 1512 const unsigned int strd_n) 1513 { 1514 struct mlx5_mprq_buf *rep = rxq->mprq_repl; 1515 volatile struct mlx5_wqe_data_seg *wqe = 1516 &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; 1517 void *addr; 1518 1519 assert(rep != NULL); 1520 /* Replace MPRQ buf. */ 1521 (*rxq->mprq_bufs)[rq_idx] = rep; 1522 /* Replace WQE. */ 1523 addr = mlx5_mprq_buf_addr(rep, strd_n); 1524 wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); 1525 /* If there's only one MR, no need to replace LKey in WQE. */ 1526 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 1527 wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); 1528 /* Stash a mbuf for next replacement. 
*/ 1529 if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) 1530 rxq->mprq_repl = rep; 1531 else 1532 rxq->mprq_repl = NULL; 1533 } 1534 1535 /** 1536 * DPDK callback for RX with Multi-Packet RQ support. 1537 * 1538 * @param dpdk_rxq 1539 * Generic pointer to RX queue structure. 1540 * @param[out] pkts 1541 * Array to store received packets. 1542 * @param pkts_n 1543 * Maximum number of packets in array. 1544 * 1545 * @return 1546 * Number of packets successfully received (<= pkts_n). 1547 */ 1548 uint16_t 1549 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1550 { 1551 struct mlx5_rxq_data *rxq = dpdk_rxq; 1552 const unsigned int strd_n = 1 << rxq->strd_num_n; 1553 const unsigned int strd_sz = 1 << rxq->strd_sz_n; 1554 const unsigned int strd_shift = 1555 MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; 1556 const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; 1557 const unsigned int wq_mask = (1 << rxq->elts_n) - 1; 1558 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1559 unsigned int i = 0; 1560 uint32_t rq_ci = rxq->rq_ci; 1561 uint16_t consumed_strd = rxq->consumed_strd; 1562 uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM; 1563 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1564 1565 while (i < pkts_n) { 1566 struct rte_mbuf *pkt; 1567 void *addr; 1568 int ret; 1569 unsigned int len; 1570 uint16_t strd_cnt; 1571 uint16_t strd_idx; 1572 uint32_t offset; 1573 uint32_t byte_cnt; 1574 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1575 uint32_t rss_hash_res = 0; 1576 uint8_t lro_num_seg; 1577 1578 if (consumed_strd == strd_n) { 1579 /* Replace WQE only if the buffer is still in use. */ 1580 if (rte_atomic16_read(&buf->refcnt) > 1) { 1581 mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n); 1582 /* Release the old buffer. */ 1583 mlx5_mprq_buf_free(buf); 1584 } else if (unlikely(rxq->mprq_repl == NULL)) { 1585 struct mlx5_mprq_buf *rep; 1586 1587 /* 1588 * Currently, the MPRQ mempool is out of buffer 1589 * and doing memcpy regardless of the size of Rx 1590 * packet. Retry allocation to get back to 1591 * normal. 1592 */ 1593 if (!rte_mempool_get(rxq->mprq_mp, 1594 (void **)&rep)) 1595 rxq->mprq_repl = rep; 1596 } 1597 /* Advance to the next WQE. */ 1598 consumed_strd = 0; 1599 ++rq_ci; 1600 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1601 } 1602 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1603 ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe); 1604 if (!ret) 1605 break; 1606 byte_cnt = ret; 1607 strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> 1608 MLX5_MPRQ_STRIDE_NUM_SHIFT; 1609 assert(strd_cnt); 1610 consumed_strd += strd_cnt; 1611 if (byte_cnt & MLX5_MPRQ_FILLER_MASK) 1612 continue; 1613 if (mcqe == NULL) { 1614 rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); 1615 strd_idx = rte_be_to_cpu_16(cqe->wqe_counter); 1616 } else { 1617 /* mini-CQE for MPRQ doesn't have hash result. */ 1618 strd_idx = rte_be_to_cpu_16(mcqe->stride_idx); 1619 } 1620 assert(strd_idx < strd_n); 1621 assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask)); 1622 lro_num_seg = cqe->lro_num_seg; 1623 /* 1624 * Currently configured to receive a packet per a stride. But if 1625 * MTU is adjusted through kernel interface, device could 1626 * consume multiple strides without raising an error. In this 1627 * case, the packet should be dropped because it is bigger than 1628 * the max_rx_pkt_len. 
1629 */ 1630 if (unlikely(!lro_num_seg && strd_cnt > 1)) { 1631 ++rxq->stats.idropped; 1632 continue; 1633 } 1634 pkt = rte_pktmbuf_alloc(rxq->mp); 1635 if (unlikely(pkt == NULL)) { 1636 ++rxq->stats.rx_nombuf; 1637 break; 1638 } 1639 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; 1640 assert((int)len >= (rxq->crc_present << 2)); 1641 if (rxq->crc_present) 1642 len -= RTE_ETHER_CRC_LEN; 1643 offset = strd_idx * strd_sz + strd_shift; 1644 addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); 1645 /* 1646 * Memcpy packets to the target mbuf if: 1647 * - The size of packet is smaller than mprq_max_memcpy_len. 1648 * - Out of buffer in the Mempool for Multi-Packet RQ. 1649 */ 1650 if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) { 1651 /* 1652 * When memcpy'ing packet due to out-of-buffer, the 1653 * packet must be smaller than the target mbuf. 1654 */ 1655 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1656 rte_pktmbuf_free_seg(pkt); 1657 ++rxq->stats.idropped; 1658 continue; 1659 } 1660 rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len); 1661 DATA_LEN(pkt) = len; 1662 } else { 1663 rte_iova_t buf_iova; 1664 struct rte_mbuf_ext_shared_info *shinfo; 1665 uint16_t buf_len = strd_cnt * strd_sz; 1666 void *buf_addr; 1667 1668 /* Increment the refcnt of the whole chunk. */ 1669 rte_atomic16_add_return(&buf->refcnt, 1); 1670 assert((uint16_t)rte_atomic16_read(&buf->refcnt) <= 1671 strd_n + 1); 1672 buf_addr = RTE_PTR_SUB(addr, headroom_sz); 1673 /* 1674 * MLX5 device doesn't use iova but it is necessary in a 1675 * case where the Rx packet is transmitted via a 1676 * different PMD. 1677 */ 1678 buf_iova = rte_mempool_virt2iova(buf) + 1679 RTE_PTR_DIFF(buf_addr, buf); 1680 shinfo = &buf->shinfos[strd_idx]; 1681 rte_mbuf_ext_refcnt_set(shinfo, 1); 1682 /* 1683 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when 1684 * attaching the stride to mbuf and more offload flags 1685 * will be added below by calling rxq_cq_to_mbuf(). 1686 * Other fields will be overwritten. 1687 */ 1688 rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, 1689 buf_len, shinfo); 1690 /* Set mbuf head-room. */ 1691 pkt->data_off = headroom_sz; 1692 assert(pkt->ol_flags == EXT_ATTACHED_MBUF); 1693 /* 1694 * Prevent potential overflow due to MTU change through 1695 * kernel interface. 1696 */ 1697 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { 1698 rte_pktmbuf_free_seg(pkt); 1699 ++rxq->stats.idropped; 1700 continue; 1701 } 1702 DATA_LEN(pkt) = len; 1703 /* 1704 * LRO packet may consume all the stride memory, in this 1705 * case packet head-room space is not guaranteed so must 1706 * to add an empty mbuf for the head-room. 1707 */ 1708 if (!rxq->strd_headroom_en) { 1709 struct rte_mbuf *headroom_mbuf = 1710 rte_pktmbuf_alloc(rxq->mp); 1711 1712 if (unlikely(headroom_mbuf == NULL)) { 1713 rte_pktmbuf_free_seg(pkt); 1714 ++rxq->stats.rx_nombuf; 1715 break; 1716 } 1717 PORT(pkt) = rxq->port_id; 1718 NEXT(headroom_mbuf) = pkt; 1719 pkt = headroom_mbuf; 1720 NB_SEGS(pkt) = 2; 1721 } 1722 } 1723 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); 1724 if (lro_num_seg > 1) { 1725 mlx5_lro_update_hdr(addr, cqe, len); 1726 pkt->ol_flags |= PKT_RX_LRO; 1727 pkt->tso_segsz = strd_sz; 1728 } 1729 PKT_LEN(pkt) = len; 1730 PORT(pkt) = rxq->port_id; 1731 #ifdef MLX5_PMD_SOFT_COUNTERS 1732 /* Increment bytes counter. */ 1733 rxq->stats.ibytes += PKT_LEN(pkt); 1734 #endif 1735 /* Return packet. */ 1736 *(pkts++) = pkt; 1737 ++i; 1738 } 1739 /* Update the consumer indexes. 
*/ 1740 rxq->consumed_strd = consumed_strd; 1741 rte_cio_wmb(); 1742 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1743 if (rq_ci != rxq->rq_ci) { 1744 rxq->rq_ci = rq_ci; 1745 rte_cio_wmb(); 1746 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1747 } 1748 #ifdef MLX5_PMD_SOFT_COUNTERS 1749 /* Increment packets counter. */ 1750 rxq->stats.ipackets += i; 1751 #endif 1752 return i; 1753 } 1754 1755 /** 1756 * Dummy DPDK callback for TX. 1757 * 1758 * This function is used to temporarily replace the real callback during 1759 * unsafe control operations on the queue, or in case of error. 1760 * 1761 * @param dpdk_txq 1762 * Generic pointer to TX queue structure. 1763 * @param[in] pkts 1764 * Packets to transmit. 1765 * @param pkts_n 1766 * Number of packets in array. 1767 * 1768 * @return 1769 * Number of packets successfully transmitted (<= pkts_n). 1770 */ 1771 uint16_t 1772 removed_tx_burst(void *dpdk_txq __rte_unused, 1773 struct rte_mbuf **pkts __rte_unused, 1774 uint16_t pkts_n __rte_unused) 1775 { 1776 rte_mb(); 1777 return 0; 1778 } 1779 1780 /** 1781 * Dummy DPDK callback for RX. 1782 * 1783 * This function is used to temporarily replace the real callback during 1784 * unsafe control operations on the queue, or in case of error. 1785 * 1786 * @param dpdk_rxq 1787 * Generic pointer to RX queue structure. 1788 * @param[out] pkts 1789 * Array to store received packets. 1790 * @param pkts_n 1791 * Maximum number of packets in array. 1792 * 1793 * @return 1794 * Number of packets successfully received (<= pkts_n). 1795 */ 1796 uint16_t 1797 removed_rx_burst(void *dpdk_txq __rte_unused, 1798 struct rte_mbuf **pkts __rte_unused, 1799 uint16_t pkts_n __rte_unused) 1800 { 1801 rte_mb(); 1802 return 0; 1803 } 1804 1805 /* 1806 * Vectorized Rx/Tx routines are not compiled in when required vector 1807 * instructions are not supported on a target architecture. The following null 1808 * stubs are needed for linkage when those are not included outside of this file 1809 * (e.g. mlx5_rxtx_vec_sse.c for x86). 1810 */ 1811 1812 __rte_weak uint16_t 1813 mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, 1814 struct rte_mbuf **pkts __rte_unused, 1815 uint16_t pkts_n __rte_unused) 1816 { 1817 return 0; 1818 } 1819 1820 __rte_weak int 1821 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) 1822 { 1823 return -ENOTSUP; 1824 } 1825 1826 __rte_weak int 1827 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) 1828 { 1829 return -ENOTSUP; 1830 } 1831 1832 /** 1833 * Free the mbufs from the linear array of pointers. 1834 * 1835 * @param pkts 1836 * Pointer to array of packets to be free. 1837 * @param pkts_n 1838 * Number of packets to be freed. 1839 * @param olx 1840 * Configured Tx offloads mask. It is fully defined at 1841 * compile time and may be used for optimization. 1842 */ 1843 static __rte_always_inline void 1844 mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, 1845 unsigned int pkts_n, 1846 unsigned int olx __rte_unused) 1847 { 1848 struct rte_mempool *pool = NULL; 1849 struct rte_mbuf **p_free = NULL; 1850 struct rte_mbuf *mbuf; 1851 unsigned int n_free = 0; 1852 1853 /* 1854 * The implemented algorithm eliminates 1855 * copying pointers to temporary array 1856 * for rte_mempool_put_bulk() calls. 1857 */ 1858 assert(pkts); 1859 assert(pkts_n); 1860 for (;;) { 1861 for (;;) { 1862 /* 1863 * Decrement mbuf reference counter, detach 1864 * indirect and external buffers if needed. 
1865 */ 1866 mbuf = rte_pktmbuf_prefree_seg(*pkts); 1867 if (likely(mbuf != NULL)) { 1868 assert(mbuf == *pkts); 1869 if (likely(n_free != 0)) { 1870 if (unlikely(pool != mbuf->pool)) 1871 /* From different pool. */ 1872 break; 1873 } else { 1874 /* Start new scan array. */ 1875 pool = mbuf->pool; 1876 p_free = pkts; 1877 } 1878 ++n_free; 1879 ++pkts; 1880 --pkts_n; 1881 if (unlikely(pkts_n == 0)) { 1882 mbuf = NULL; 1883 break; 1884 } 1885 } else { 1886 /* 1887 * This happens if mbuf is still referenced. 1888 * We can't put it back to the pool, skip. 1889 */ 1890 ++pkts; 1891 --pkts_n; 1892 if (unlikely(n_free != 0)) 1893 /* There is some array to free.*/ 1894 break; 1895 if (unlikely(pkts_n == 0)) 1896 /* Last mbuf, nothing to free. */ 1897 return; 1898 } 1899 } 1900 for (;;) { 1901 /* 1902 * This loop is implemented to avoid multiple 1903 * inlining of rte_mempool_put_bulk(). 1904 */ 1905 assert(pool); 1906 assert(p_free); 1907 assert(n_free); 1908 /* 1909 * Free the array of pre-freed mbufs 1910 * belonging to the same memory pool. 1911 */ 1912 rte_mempool_put_bulk(pool, (void *)p_free, n_free); 1913 if (unlikely(mbuf != NULL)) { 1914 /* There is the request to start new scan. */ 1915 pool = mbuf->pool; 1916 p_free = pkts++; 1917 n_free = 1; 1918 --pkts_n; 1919 if (likely(pkts_n != 0)) 1920 break; 1921 /* 1922 * This is the last mbuf to be freed. 1923 * Do one more loop iteration to complete. 1924 * This is rare case of the last unique mbuf. 1925 */ 1926 mbuf = NULL; 1927 continue; 1928 } 1929 if (likely(pkts_n == 0)) 1930 return; 1931 n_free = 0; 1932 break; 1933 } 1934 } 1935 } 1936 1937 /** 1938 * Free the mbuf from the elts ring buffer till new tail. 1939 * 1940 * @param txq 1941 * Pointer to Tx queue structure. 1942 * @param tail 1943 * Index in elts to free up to, becomes new elts tail. 1944 * @param olx 1945 * Configured Tx offloads mask. It is fully defined at 1946 * compile time and may be used for optimization. 1947 */ 1948 static __rte_always_inline void 1949 mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, 1950 uint16_t tail, 1951 unsigned int olx __rte_unused) 1952 { 1953 uint16_t n_elts = tail - txq->elts_tail; 1954 1955 assert(n_elts); 1956 assert(n_elts <= txq->elts_s); 1957 /* 1958 * Implement a loop to support ring buffer wraparound 1959 * with single inlining of mlx5_tx_free_mbuf(). 1960 */ 1961 do { 1962 unsigned int part; 1963 1964 part = txq->elts_s - (txq->elts_tail & txq->elts_m); 1965 part = RTE_MIN(part, n_elts); 1966 assert(part); 1967 assert(part <= txq->elts_s); 1968 mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m], 1969 part, olx); 1970 txq->elts_tail += part; 1971 n_elts -= part; 1972 } while (n_elts); 1973 } 1974 1975 /** 1976 * Store the mbuf being sent into elts ring buffer. 1977 * On Tx completion these mbufs will be freed. 1978 * 1979 * @param txq 1980 * Pointer to Tx queue structure. 1981 * @param pkts 1982 * Pointer to array of packets to be stored. 1983 * @param pkts_n 1984 * Number of packets to be stored. 1985 * @param olx 1986 * Configured Tx offloads mask. It is fully defined at 1987 * compile time and may be used for optimization. 
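 *
 * Worked example of the wraparound handling below (values are
 * illustrative): with elts_s = 256, elts_m = 255 and elts_head = 250,
 * storing pkts_n = 10 mbufs copies part = 6 pointers to
 * elts[250..255], copies the remaining 4 pointers to elts[0..3] and
 * advances elts_head to 260 (the mask is applied only when the array
 * is indexed).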
1988 */ 1989 static __rte_always_inline void 1990 mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, 1991 struct rte_mbuf **restrict pkts, 1992 unsigned int pkts_n, 1993 unsigned int olx __rte_unused) 1994 { 1995 unsigned int part; 1996 struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts; 1997 1998 assert(pkts); 1999 assert(pkts_n); 2000 part = txq->elts_s - (txq->elts_head & txq->elts_m); 2001 assert(part); 2002 assert(part <= txq->elts_s); 2003 /* This code is a good candidate for vectorizing with SIMD. */ 2004 rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)), 2005 (void *)pkts, 2006 RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *)); 2007 txq->elts_head += pkts_n; 2008 if (unlikely(part < pkts_n)) 2009 /* The copy is wrapping around the elts array. */ 2010 rte_memcpy((void *)elts, (void *)(pkts + part), 2011 (pkts_n - part) * sizeof(struct rte_mbuf *)); 2012 } 2013 2014 /** 2015 * Update completion queue consuming index via doorbell 2016 * and flush the completed data buffers. 2017 * 2018 * @param txq 2019 * Pointer to TX queue structure. 2020 * @param valid CQE pointer 2021 * if not NULL update txq->wqe_pi and flush the buffers 2022 * @param itail 2023 * if not negative - flush the buffers till this index. 2024 * @param olx 2025 * Configured Tx offloads mask. It is fully defined at 2026 * compile time and may be used for optimization. 2027 */ 2028 static __rte_always_inline void 2029 mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, 2030 volatile struct mlx5_cqe *last_cqe, 2031 int itail, 2032 unsigned int olx __rte_unused) 2033 { 2034 uint16_t tail; 2035 2036 if (likely(last_cqe != NULL)) { 2037 txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter); 2038 tail = ((volatile struct mlx5_wqe_cseg *) 2039 (txq->wqes + (txq->wqe_pi & txq->wqe_m)))->misc; 2040 } else if (itail >= 0) { 2041 tail = (uint16_t)itail; 2042 } else { 2043 return; 2044 } 2045 rte_compiler_barrier(); 2046 *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci); 2047 if (likely(tail != txq->elts_tail)) { 2048 mlx5_tx_free_elts(txq, tail, olx); 2049 assert(tail == txq->elts_tail); 2050 } 2051 } 2052 2053 /** 2054 * Manage TX completions. This routine checks the CQ for 2055 * arrived CQEs, deduces the last accomplished WQE in SQ, 2056 * updates SQ producing index and frees all completed mbufs. 2057 * 2058 * @param txq 2059 * Pointer to TX queue structure. 2060 * @param olx 2061 * Configured Tx offloads mask. It is fully defined at 2062 * compile time and may be used for optimization. 2063 * 2064 * NOTE: not inlined intentionally, it makes tx_burst 2065 * routine smaller, simple and faster - from experiments. 2066 */ 2067 static void 2068 mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, 2069 unsigned int olx __rte_unused) 2070 { 2071 unsigned int count = MLX5_TX_COMP_MAX_CQE; 2072 volatile struct mlx5_cqe *last_cqe = NULL; 2073 int ret; 2074 2075 static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); 2076 static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value"); 2077 do { 2078 volatile struct mlx5_cqe *cqe; 2079 2080 cqe = &txq->cqes[txq->cq_ci & txq->cqe_m]; 2081 ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci); 2082 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 2083 if (likely(ret != MLX5_CQE_STATUS_ERR)) { 2084 /* No new CQEs in completion queue. */ 2085 assert(ret == MLX5_CQE_STATUS_HW_OWN); 2086 break; 2087 } 2088 /* 2089 * Some error occurred, try to restart. 
2090 * We have no barrier after WQE related Doorbell 2091 * written, make sure all writes are completed 2092 * here, before we might perform SQ reset. 2093 */ 2094 rte_wmb(); 2095 ret = mlx5_tx_error_cqe_handle 2096 (txq, (volatile struct mlx5_err_cqe *)cqe); 2097 /* 2098 * Flush buffers, update consuming index 2099 * if recovery succeeded. Otherwise 2100 * just try to recover later. 2101 */ 2102 last_cqe = NULL; 2103 break; 2104 } 2105 /* Normal transmit completion. */ 2106 ++txq->cq_ci; 2107 last_cqe = cqe; 2108 #ifndef NDEBUG 2109 if (txq->cq_pi) 2110 --txq->cq_pi; 2111 #endif 2112 /* 2113 * We have to restrict the amount of processed CQEs 2114 * in one tx_burst routine call. The CQ may be large 2115 * and many CQEs may be updated by the NIC in one 2116 * transaction. Buffers freeing is time consuming, 2117 * multiple iterations may introduce significant 2118 * latency. 2119 */ 2120 } while (--count); 2121 mlx5_tx_comp_flush(txq, last_cqe, ret, olx); 2122 } 2123 2124 /** 2125 * Check if the completion request flag should be set in the last WQE. 2126 * Both pushed mbufs and WQEs are monitored and the completion request 2127 * flag is set if any of thresholds is reached. 2128 * 2129 * @param txq 2130 * Pointer to TX queue structure. 2131 * @param loc 2132 * Pointer to burst routine local context. 2133 * @param multi, 2134 * Routine is called from multi-segment sending loop, 2135 * do not correct the elts_head according to the pkts_copy. 2136 * @param olx 2137 * Configured Tx offloads mask. It is fully defined at 2138 * compile time and may be used for optimization. 2139 */ 2140 static __rte_always_inline void 2141 mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, 2142 struct mlx5_txq_local *restrict loc, 2143 bool multi, 2144 unsigned int olx) 2145 { 2146 uint16_t head = txq->elts_head; 2147 unsigned int part; 2148 2149 part = (MLX5_TXOFF_CONFIG(INLINE) || multi) ? 2150 0 : loc->pkts_sent - loc->pkts_copy; 2151 head += part; 2152 if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH || 2153 (MLX5_TXOFF_CONFIG(INLINE) && 2154 (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) { 2155 volatile struct mlx5_wqe *last = loc->wqe_last; 2156 2157 txq->elts_comp = head; 2158 if (MLX5_TXOFF_CONFIG(INLINE)) 2159 txq->wqe_comp = txq->wqe_ci; 2160 /* Request unconditional completion on last WQE. */ 2161 last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS << 2162 MLX5_COMP_MODE_OFFSET); 2163 /* Save elts_head in unused "immediate" field of WQE. */ 2164 last->cseg.misc = head; 2165 /* 2166 * A CQE slot must always be available. Count the 2167 * issued CEQ "always" request instead of production 2168 * index due to here can be CQE with errors and 2169 * difference with ci may become inconsistent. 2170 */ 2171 assert(txq->cqe_s > ++txq->cq_pi); 2172 } 2173 } 2174 2175 /** 2176 * DPDK callback to check the status of a tx descriptor. 2177 * 2178 * @param tx_queue 2179 * The tx queue. 2180 * @param[in] offset 2181 * The index of the descriptor in the ring. 2182 * 2183 * @return 2184 * The status of the tx descriptor. 
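 *
 * Applications normally reach this handler through the generic ethdev
 * API, for example (port_id and queue_id are illustrative):
 *
 *   int st = rte_eth_tx_descriptor_status(port_id, queue_id, offset);
 *   if (st == RTE_ETH_TX_DESC_DONE)
 *           ... the slot at this offset has been completed ...
 *   else if (st == RTE_ETH_TX_DESC_FULL)
 *           ... the slot still holds a packet awaiting completion ...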
2185 */ 2186 int 2187 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) 2188 { 2189 struct mlx5_txq_data *restrict txq = tx_queue; 2190 uint16_t used; 2191 2192 mlx5_tx_handle_completion(txq, 0); 2193 used = txq->elts_head - txq->elts_tail; 2194 if (offset < used) 2195 return RTE_ETH_TX_DESC_FULL; 2196 return RTE_ETH_TX_DESC_DONE; 2197 } 2198 2199 /** 2200 * Build the Control Segment with specified opcode: 2201 * - MLX5_OPCODE_SEND 2202 * - MLX5_OPCODE_ENHANCED_MPSW 2203 * - MLX5_OPCODE_TSO 2204 * 2205 * @param txq 2206 * Pointer to TX queue structure. 2207 * @param loc 2208 * Pointer to burst routine local context. 2209 * @param wqe 2210 * Pointer to WQE to fill with built Control Segment. 2211 * @param ds 2212 * Supposed length of WQE in segments. 2213 * @param opcode 2214 * SQ WQE opcode to put into Control Segment. 2215 * @param olx 2216 * Configured Tx offloads mask. It is fully defined at 2217 * compile time and may be used for optimization. 2218 */ 2219 static __rte_always_inline void 2220 mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, 2221 struct mlx5_txq_local *restrict loc __rte_unused, 2222 struct mlx5_wqe *restrict wqe, 2223 unsigned int ds, 2224 unsigned int opcode, 2225 unsigned int olx __rte_unused) 2226 { 2227 struct mlx5_wqe_cseg *restrict cs = &wqe->cseg; 2228 2229 cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode); 2230 cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 2231 cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << 2232 MLX5_COMP_MODE_OFFSET); 2233 cs->misc = RTE_BE32(0); 2234 } 2235 2236 /** 2237 * Build the Ethernet Segment without inlined data. 2238 * Supports Software Parser, Checksums and VLAN 2239 * insertion Tx offload features. 2240 * 2241 * @param txq 2242 * Pointer to TX queue structure. 2243 * @param loc 2244 * Pointer to burst routine local context. 2245 * @param wqe 2246 * Pointer to WQE to fill with built Ethernet Segment. 2247 * @param olx 2248 * Configured Tx offloads mask. It is fully defined at 2249 * compile time and may be used for optimization. 2250 */ 2251 static __rte_always_inline void 2252 mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused, 2253 struct mlx5_txq_local *restrict loc, 2254 struct mlx5_wqe *restrict wqe, 2255 unsigned int olx) 2256 { 2257 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2258 uint32_t csum; 2259 2260 /* 2261 * Calculate and set check sum flags first, dword field 2262 * in segment may be shared with Software Parser flags. 2263 */ 2264 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2265 es->flags = rte_cpu_to_le_32(csum); 2266 /* 2267 * Calculate and set Software Parser offsets and flags. 2268 * These flags a set for custom UDP and IP tunnel packets. 2269 */ 2270 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2271 /* Fill metadata field if needed. */ 2272 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2273 loc->mbuf->ol_flags & PKT_TX_METADATA ? 2274 loc->mbuf->tx_metadata : 0 : 0; 2275 /* Engage VLAN tag insertion feature if requested. */ 2276 if (MLX5_TXOFF_CONFIG(VLAN) && 2277 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 2278 /* 2279 * We should get here only if device support 2280 * this feature correctly. 2281 */ 2282 assert(txq->vlan_en); 2283 es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT | 2284 loc->mbuf->vlan_tci); 2285 } else { 2286 es->inline_hdr = RTE_BE32(0); 2287 } 2288 } 2289 2290 /** 2291 * Build the Ethernet Segment with minimal inlined data 2292 * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. 
This is 2293 * used to fill the gap in single WQEBB WQEs. 2294 * Supports Software Parser, Checksums and VLAN 2295 * insertion Tx offload features. 2296 * 2297 * @param txq 2298 * Pointer to TX queue structure. 2299 * @param loc 2300 * Pointer to burst routine local context. 2301 * @param wqe 2302 * Pointer to WQE to fill with built Ethernet Segment. 2303 * @param vlan 2304 * Length of VLAN tag insertion if any. 2305 * @param olx 2306 * Configured Tx offloads mask. It is fully defined at 2307 * compile time and may be used for optimization. 2308 */ 2309 static __rte_always_inline void 2310 mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused, 2311 struct mlx5_txq_local *restrict loc, 2312 struct mlx5_wqe *restrict wqe, 2313 unsigned int vlan, 2314 unsigned int olx) 2315 { 2316 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2317 uint32_t csum; 2318 uint8_t *psrc, *pdst; 2319 2320 /* 2321 * Calculate and set check sum flags first, dword field 2322 * in segment may be shared with Software Parser flags. 2323 */ 2324 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2325 es->flags = rte_cpu_to_le_32(csum); 2326 /* 2327 * Calculate and set Software Parser offsets and flags. 2328 * These flags a set for custom UDP and IP tunnel packets. 2329 */ 2330 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2331 /* Fill metadata field if needed. */ 2332 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2333 loc->mbuf->ol_flags & PKT_TX_METADATA ? 2334 loc->mbuf->tx_metadata : 0 : 0; 2335 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2336 (sizeof(uint16_t) + 2337 sizeof(rte_v128u32_t)), 2338 "invalid Ethernet Segment data size"); 2339 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2340 (sizeof(uint16_t) + 2341 sizeof(struct rte_vlan_hdr) + 2342 2 * RTE_ETHER_ADDR_LEN), 2343 "invalid Ethernet Segment data size"); 2344 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2345 es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE); 2346 es->inline_data = *(unaligned_uint16_t *)psrc; 2347 psrc += sizeof(uint16_t); 2348 pdst = (uint8_t *)(es + 1); 2349 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2350 /* Implement VLAN tag insertion as part inline data. */ 2351 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2352 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2353 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2354 /* Insert VLAN ethertype + VLAN tag. */ 2355 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2356 ((RTE_ETHER_TYPE_VLAN << 16) | 2357 loc->mbuf->vlan_tci); 2358 pdst += sizeof(struct rte_vlan_hdr); 2359 /* Copy the rest two bytes from packet data. */ 2360 assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2361 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2362 } else { 2363 /* Fill the gap in the title WQEBB with inline data. */ 2364 rte_mov16(pdst, psrc); 2365 } 2366 } 2367 2368 /** 2369 * Build the Ethernet Segment with entire packet 2370 * data inlining. Checks the boundary of WQEBB and 2371 * ring buffer wrapping, supports Software Parser, 2372 * Checksums and VLAN insertion Tx offload features. 2373 * 2374 * @param txq 2375 * Pointer to TX queue structure. 2376 * @param loc 2377 * Pointer to burst routine local context. 2378 * @param wqe 2379 * Pointer to WQE to fill with built Ethernet Segment. 2380 * @param vlan 2381 * Length of VLAN tag insertion if any. 2382 * @param inlen 2383 * Length of data to inline (VLAN included, if any). 2384 * @param tso 2385 * TSO flag, set mss field from the packet. 2386 * @param olx 2387 * Configured Tx offloads mask. 
It is fully defined at 2388 * compile time and may be used for optimization. 2389 * 2390 * @return 2391 * Pointer to the next Data Segment (aligned and wrapped around). 2392 */ 2393 static __rte_always_inline struct mlx5_wqe_dseg * 2394 mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, 2395 struct mlx5_txq_local *restrict loc, 2396 struct mlx5_wqe *restrict wqe, 2397 unsigned int vlan, 2398 unsigned int inlen, 2399 unsigned int tso, 2400 unsigned int olx) 2401 { 2402 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2403 uint32_t csum; 2404 uint8_t *psrc, *pdst; 2405 unsigned int part; 2406 2407 /* 2408 * Calculate and set check sum flags first, dword field 2409 * in segment may be shared with Software Parser flags. 2410 */ 2411 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2412 if (tso) { 2413 csum <<= 24; 2414 csum |= loc->mbuf->tso_segsz; 2415 es->flags = rte_cpu_to_be_32(csum); 2416 } else { 2417 es->flags = rte_cpu_to_le_32(csum); 2418 } 2419 /* 2420 * Calculate and set Software Parser offsets and flags. 2421 * These flags a set for custom UDP and IP tunnel packets. 2422 */ 2423 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2424 /* Fill metadata field if needed. */ 2425 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2426 loc->mbuf->ol_flags & PKT_TX_METADATA ? 2427 loc->mbuf->tx_metadata : 0 : 0; 2428 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2429 (sizeof(uint16_t) + 2430 sizeof(rte_v128u32_t)), 2431 "invalid Ethernet Segment data size"); 2432 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2433 (sizeof(uint16_t) + 2434 sizeof(struct rte_vlan_hdr) + 2435 2 * RTE_ETHER_ADDR_LEN), 2436 "invalid Ethernet Segment data size"); 2437 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 2438 es->inline_hdr_sz = rte_cpu_to_be_16(inlen); 2439 es->inline_data = *(unaligned_uint16_t *)psrc; 2440 psrc += sizeof(uint16_t); 2441 pdst = (uint8_t *)(es + 1); 2442 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2443 /* Implement VLAN tag insertion as part inline data. */ 2444 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t)); 2445 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2446 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t); 2447 /* Insert VLAN ethertype + VLAN tag. */ 2448 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2449 ((RTE_ETHER_TYPE_VLAN << 16) | 2450 loc->mbuf->vlan_tci); 2451 pdst += sizeof(struct rte_vlan_hdr); 2452 /* Copy the rest two bytes from packet data. */ 2453 assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); 2454 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; 2455 psrc += sizeof(uint16_t); 2456 } else { 2457 /* Fill the gap in the title WQEBB with inline data. */ 2458 rte_mov16(pdst, psrc); 2459 psrc += sizeof(rte_v128u32_t); 2460 } 2461 pdst = (uint8_t *)(es + 2); 2462 assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2463 assert(pdst < (uint8_t *)txq->wqes_end); 2464 inlen -= MLX5_ESEG_MIN_INLINE_SIZE; 2465 if (!inlen) { 2466 assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 2467 return (struct mlx5_wqe_dseg *)pdst; 2468 } 2469 /* 2470 * The WQEBB space availability is checked by caller. 2471 * Here we should be aware of WQE ring buffer wraparound only. 2472 */ 2473 part = (uint8_t *)txq->wqes_end - pdst; 2474 part = RTE_MIN(part, inlen); 2475 do { 2476 rte_memcpy(pdst, psrc, part); 2477 inlen -= part; 2478 if (likely(!inlen)) { 2479 /* 2480 * If return value is not used by the caller 2481 * the code below will be optimized out. 
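 * The returned pointer is first aligned up to the next MLX5_WSEG_SIZE
 * boundary so it always addresses a fresh Data Segment slot, and it is
 * wrapped back to the start of the WQE ring if it reached wqes_end.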
2482 */ 2483 pdst += part; 2484 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2485 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 2486 pdst = (uint8_t *)txq->wqes; 2487 return (struct mlx5_wqe_dseg *)pdst; 2488 } 2489 pdst = (uint8_t *)txq->wqes; 2490 psrc += part; 2491 part = inlen; 2492 } while (true); 2493 } 2494 2495 /** 2496 * Copy data from chain of mbuf to the specified linear buffer. 2497 * Checksums and VLAN insertion Tx offload features. If data 2498 * from some mbuf copied completely this mbuf is freed. Local 2499 * structure is used to keep the byte stream state. 2500 * 2501 * @param pdst 2502 * Pointer to the destination linear buffer. 2503 * @param loc 2504 * Pointer to burst routine local context. 2505 * @param len 2506 * Length of data to be copied. 2507 * @param olx 2508 * Configured Tx offloads mask. It is fully defined at 2509 * compile time and may be used for optimization. 2510 */ 2511 static __rte_always_inline void 2512 mlx5_tx_mseg_memcpy(uint8_t *pdst, 2513 struct mlx5_txq_local *restrict loc, 2514 unsigned int len, 2515 unsigned int olx __rte_unused) 2516 { 2517 struct rte_mbuf *mbuf; 2518 unsigned int part, dlen; 2519 uint8_t *psrc; 2520 2521 assert(len); 2522 do { 2523 /* Allow zero length packets, must check first. */ 2524 dlen = rte_pktmbuf_data_len(loc->mbuf); 2525 if (dlen <= loc->mbuf_off) { 2526 /* Exhausted packet, just free. */ 2527 mbuf = loc->mbuf; 2528 loc->mbuf = mbuf->next; 2529 rte_pktmbuf_free_seg(mbuf); 2530 loc->mbuf_off = 0; 2531 assert(loc->mbuf_nseg > 1); 2532 assert(loc->mbuf); 2533 --loc->mbuf_nseg; 2534 continue; 2535 } 2536 dlen -= loc->mbuf_off; 2537 psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 2538 loc->mbuf_off); 2539 part = RTE_MIN(len, dlen); 2540 rte_memcpy(pdst, psrc, part); 2541 loc->mbuf_off += part; 2542 len -= part; 2543 if (!len) { 2544 if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) { 2545 loc->mbuf_off = 0; 2546 /* Exhausted packet, just free. */ 2547 mbuf = loc->mbuf; 2548 loc->mbuf = mbuf->next; 2549 rte_pktmbuf_free_seg(mbuf); 2550 loc->mbuf_off = 0; 2551 assert(loc->mbuf_nseg >= 1); 2552 --loc->mbuf_nseg; 2553 } 2554 return; 2555 } 2556 pdst += part; 2557 } while (true); 2558 } 2559 2560 /** 2561 * Build the Ethernet Segment with inlined data from 2562 * multi-segment packet. Checks the boundary of WQEBB 2563 * and ring buffer wrapping, supports Software Parser, 2564 * Checksums and VLAN insertion Tx offload features. 2565 * 2566 * @param txq 2567 * Pointer to TX queue structure. 2568 * @param loc 2569 * Pointer to burst routine local context. 2570 * @param wqe 2571 * Pointer to WQE to fill with built Ethernet Segment. 2572 * @param vlan 2573 * Length of VLAN tag insertion if any. 2574 * @param inlen 2575 * Length of data to inline (VLAN included, if any). 2576 * @param tso 2577 * TSO flag, set mss field from the packet. 2578 * @param olx 2579 * Configured Tx offloads mask. It is fully defined at 2580 * compile time and may be used for optimization. 2581 * 2582 * @return 2583 * Pointer to the next Data Segment (aligned and 2584 * possible NOT wrapped around - caller should do 2585 * wrapping check on its own). 
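 * The caller-side wrapping check typically looks like (as done in
 * mlx5_tx_mseg_build() below):
 *
 *   if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
 *           dseg = (struct mlx5_wqe_dseg *)txq->wqes;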
2586 */ 2587 static __rte_always_inline struct mlx5_wqe_dseg * 2588 mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, 2589 struct mlx5_txq_local *restrict loc, 2590 struct mlx5_wqe *restrict wqe, 2591 unsigned int vlan, 2592 unsigned int inlen, 2593 unsigned int tso, 2594 unsigned int olx) 2595 { 2596 struct mlx5_wqe_eseg *restrict es = &wqe->eseg; 2597 uint32_t csum; 2598 uint8_t *pdst; 2599 unsigned int part; 2600 2601 /* 2602 * Calculate and set check sum flags first, uint32_t field 2603 * in segment may be shared with Software Parser flags. 2604 */ 2605 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; 2606 if (tso) { 2607 csum <<= 24; 2608 csum |= loc->mbuf->tso_segsz; 2609 es->flags = rte_cpu_to_be_32(csum); 2610 } else { 2611 es->flags = rte_cpu_to_le_32(csum); 2612 } 2613 /* 2614 * Calculate and set Software Parser offsets and flags. 2615 * These flags a set for custom UDP and IP tunnel packets. 2616 */ 2617 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); 2618 /* Fill metadata field if needed. */ 2619 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? 2620 loc->mbuf->ol_flags & PKT_TX_METADATA ? 2621 loc->mbuf->tx_metadata : 0 : 0; 2622 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2623 (sizeof(uint16_t) + 2624 sizeof(rte_v128u32_t)), 2625 "invalid Ethernet Segment data size"); 2626 static_assert(MLX5_ESEG_MIN_INLINE_SIZE == 2627 (sizeof(uint16_t) + 2628 sizeof(struct rte_vlan_hdr) + 2629 2 * RTE_ETHER_ADDR_LEN), 2630 "invalid Ethernet Segment data size"); 2631 assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); 2632 es->inline_hdr_sz = rte_cpu_to_be_16(inlen); 2633 pdst = (uint8_t *)&es->inline_data; 2634 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { 2635 /* Implement VLAN tag insertion as part inline data. */ 2636 mlx5_tx_mseg_memcpy(pdst, loc, 2 * RTE_ETHER_ADDR_LEN, olx); 2637 pdst += 2 * RTE_ETHER_ADDR_LEN; 2638 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 2639 ((RTE_ETHER_TYPE_VLAN << 16) | 2640 loc->mbuf->vlan_tci); 2641 pdst += sizeof(struct rte_vlan_hdr); 2642 inlen -= 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr); 2643 } 2644 assert(pdst < (uint8_t *)txq->wqes_end); 2645 /* 2646 * The WQEBB space availability is checked by caller. 2647 * Here we should be aware of WQE ring buffer wraparound only. 2648 */ 2649 part = (uint8_t *)txq->wqes_end - pdst; 2650 part = RTE_MIN(part, inlen); 2651 assert(part); 2652 do { 2653 mlx5_tx_mseg_memcpy(pdst, loc, part, olx); 2654 inlen -= part; 2655 if (likely(!inlen)) { 2656 pdst += part; 2657 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2658 return (struct mlx5_wqe_dseg *)pdst; 2659 } 2660 pdst = (uint8_t *)txq->wqes; 2661 part = inlen; 2662 } while (true); 2663 } 2664 2665 /** 2666 * Build the Data Segment of pointer type. 2667 * 2668 * @param txq 2669 * Pointer to TX queue structure. 2670 * @param loc 2671 * Pointer to burst routine local context. 2672 * @param dseg 2673 * Pointer to WQE to fill with built Data Segment. 2674 * @param buf 2675 * Data buffer to point. 2676 * @param len 2677 * Data buffer length. 2678 * @param olx 2679 * Configured Tx offloads mask. It is fully defined at 2680 * compile time and may be used for optimization. 
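 *
 * The pointer-type Data Segment carries three fields: the byte count,
 * the lkey of the memory region covering the buffer (resolved for the
 * mbuf by mlx5_tx_mb2mr()) and the 64-bit buffer address, all stored
 * in big-endian byte order.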
2681 */ 2682 static __rte_always_inline void 2683 mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq, 2684 struct mlx5_txq_local *restrict loc, 2685 struct mlx5_wqe_dseg *restrict dseg, 2686 uint8_t *buf, 2687 unsigned int len, 2688 unsigned int olx __rte_unused) 2689 2690 { 2691 assert(len); 2692 dseg->bcount = rte_cpu_to_be_32(len); 2693 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2694 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2695 } 2696 2697 /** 2698 * Build the Data Segment of pointer type or inline 2699 * if data length is less than buffer in minimal 2700 * Data Segment size. 2701 * 2702 * @param txq 2703 * Pointer to TX queue structure. 2704 * @param loc 2705 * Pointer to burst routine local context. 2706 * @param dseg 2707 * Pointer to WQE to fill with built Data Segment. 2708 * @param buf 2709 * Data buffer to point. 2710 * @param len 2711 * Data buffer length. 2712 * @param olx 2713 * Configured Tx offloads mask. It is fully defined at 2714 * compile time and may be used for optimization. 2715 */ 2716 static __rte_always_inline void 2717 mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, 2718 struct mlx5_txq_local *restrict loc, 2719 struct mlx5_wqe_dseg *restrict dseg, 2720 uint8_t *buf, 2721 unsigned int len, 2722 unsigned int olx __rte_unused) 2723 2724 { 2725 uintptr_t dst, src; 2726 2727 assert(len); 2728 if (len > MLX5_DSEG_MIN_INLINE_SIZE) { 2729 dseg->bcount = rte_cpu_to_be_32(len); 2730 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); 2731 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); 2732 2733 return; 2734 } 2735 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2736 /* Unrolled implementation of generic rte_memcpy. */ 2737 dst = (uintptr_t)&dseg->inline_data[0]; 2738 src = (uintptr_t)buf; 2739 #ifdef RTE_ARCH_STRICT_ALIGN 2740 memcpy(dst, src, len); 2741 #else 2742 if (len & 0x08) { 2743 *(uint64_t *)dst = *(uint64_t *)src; 2744 dst += sizeof(uint64_t); 2745 src += sizeof(uint64_t); 2746 } 2747 if (len & 0x04) { 2748 *(uint32_t *)dst = *(uint32_t *)src; 2749 dst += sizeof(uint32_t); 2750 src += sizeof(uint32_t); 2751 } 2752 if (len & 0x02) { 2753 *(uint16_t *)dst = *(uint16_t *)src; 2754 dst += sizeof(uint16_t); 2755 src += sizeof(uint16_t); 2756 } 2757 if (len & 0x01) 2758 *(uint8_t *)dst = *(uint8_t *)src; 2759 #endif 2760 } 2761 2762 /** 2763 * Build the Data Segment of inlined data from single 2764 * segment packet, no VLAN insertion. 2765 * 2766 * @param txq 2767 * Pointer to TX queue structure. 2768 * @param loc 2769 * Pointer to burst routine local context. 2770 * @param dseg 2771 * Pointer to WQE to fill with built Data Segment. 2772 * @param buf 2773 * Data buffer to point. 2774 * @param len 2775 * Data buffer length. 2776 * @param olx 2777 * Configured Tx offloads mask. It is fully defined at 2778 * compile time and may be used for optimization. 2779 * 2780 * @return 2781 * Pointer to the next Data Segment after inlined data. 2782 * Ring buffer wraparound check is needed. We do not 2783 * do it here because it may not be needed for the 2784 * last packet in the eMPW session. 
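 * The byte count written below has the MLX5_ETH_WQE_DATA_INLINE flag
 * set, telling the hardware that the packet data follows inline inside
 * the WQE instead of being referenced by address and lkey.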
2785 */ 2786 static __rte_always_inline struct mlx5_wqe_dseg * 2787 mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, 2788 struct mlx5_txq_local *restrict loc __rte_unused, 2789 struct mlx5_wqe_dseg *restrict dseg, 2790 uint8_t *buf, 2791 unsigned int len, 2792 unsigned int olx __rte_unused) 2793 { 2794 unsigned int part; 2795 uint8_t *pdst; 2796 2797 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); 2798 pdst = &dseg->inline_data[0]; 2799 /* 2800 * The WQEBB space availability is checked by caller. 2801 * Here we should be aware of WQE ring buffer wraparound only. 2802 */ 2803 part = (uint8_t *)txq->wqes_end - pdst; 2804 part = RTE_MIN(part, len); 2805 do { 2806 rte_memcpy(pdst, buf, part); 2807 len -= part; 2808 if (likely(!len)) { 2809 pdst += part; 2810 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2811 /* Note: no final wraparound check here. */ 2812 return (struct mlx5_wqe_dseg *)pdst; 2813 } 2814 pdst = (uint8_t *)txq->wqes; 2815 buf += part; 2816 part = len; 2817 } while (true); 2818 } 2819 2820 /** 2821 * Build the Data Segment of inlined data from single 2822 * segment packet with VLAN insertion. 2823 * 2824 * @param txq 2825 * Pointer to TX queue structure. 2826 * @param loc 2827 * Pointer to burst routine local context. 2828 * @param dseg 2829 * Pointer to the dseg fill with built Data Segment. 2830 * @param buf 2831 * Data buffer to point. 2832 * @param len 2833 * Data buffer length. 2834 * @param olx 2835 * Configured Tx offloads mask. It is fully defined at 2836 * compile time and may be used for optimization. 2837 * 2838 * @return 2839 * Pointer to the next Data Segment after inlined data. 2840 * Ring buffer wraparound check is needed. 2841 */ 2842 static __rte_always_inline struct mlx5_wqe_dseg * 2843 mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, 2844 struct mlx5_txq_local *restrict loc __rte_unused, 2845 struct mlx5_wqe_dseg *restrict dseg, 2846 uint8_t *buf, 2847 unsigned int len, 2848 unsigned int olx __rte_unused) 2849 2850 { 2851 unsigned int part; 2852 uint8_t *pdst; 2853 2854 assert(len > MLX5_ESEG_MIN_INLINE_SIZE); 2855 static_assert(MLX5_DSEG_MIN_INLINE_SIZE == 2856 (2 * RTE_ETHER_ADDR_LEN), 2857 "invalid Data Segment data size"); 2858 dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) | 2859 MLX5_ETH_WQE_DATA_INLINE); 2860 pdst = &dseg->inline_data[0]; 2861 memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); 2862 buf += MLX5_DSEG_MIN_INLINE_SIZE; 2863 pdst += MLX5_DSEG_MIN_INLINE_SIZE; 2864 len -= MLX5_DSEG_MIN_INLINE_SIZE; 2865 /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */ 2866 assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); 2867 if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) 2868 pdst = (uint8_t *)txq->wqes; 2869 *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) | 2870 loc->mbuf->vlan_tci); 2871 pdst += sizeof(struct rte_vlan_hdr); 2872 /* 2873 * The WQEBB space availability is checked by caller. 2874 * Here we should be aware of WQE ring buffer wraparound only. 2875 */ 2876 part = (uint8_t *)txq->wqes_end - pdst; 2877 part = RTE_MIN(part, len); 2878 do { 2879 rte_memcpy(pdst, buf, part); 2880 len -= part; 2881 if (likely(!len)) { 2882 pdst += part; 2883 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); 2884 /* Note: no final wraparound check here. 
*/ 2885 return (struct mlx5_wqe_dseg *)pdst; 2886 } 2887 pdst = (uint8_t *)txq->wqes; 2888 buf += part; 2889 part = len; 2890 } while (true); 2891 } 2892 2893 /** 2894 * Build the Ethernet Segment with optionally inlined data with 2895 * VLAN insertion and following Data Segments (if any) from 2896 * multi-segment packet. Used by ordinary send and TSO. 2897 * 2898 * @param txq 2899 * Pointer to TX queue structure. 2900 * @param loc 2901 * Pointer to burst routine local context. 2902 * @param wqe 2903 * Pointer to WQE to fill with built Ethernet/Data Segments. 2904 * @param vlan 2905 * Length of VLAN header to insert, 0 means no VLAN insertion. 2906 * @param inlen 2907 * Data length to inline. For TSO this parameter specifies 2908 * exact value, for ordinary send routine can be aligned by 2909 * caller to provide better WQE space saving and data buffer 2910 * start address alignment. This length includes VLAN header 2911 * being inserted. 2912 * @param tso 2913 * Zero means ordinary send, inlined data can be extended, 2914 * otherwise this is TSO, inlined data length is fixed. 2915 * @param olx 2916 * Configured Tx offloads mask. It is fully defined at 2917 * compile time and may be used for optimization. 2918 * 2919 * @return 2920 * Actual size of built WQE in segments. 2921 */ 2922 static __rte_always_inline unsigned int 2923 mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, 2924 struct mlx5_txq_local *restrict loc, 2925 struct mlx5_wqe *restrict wqe, 2926 unsigned int vlan, 2927 unsigned int inlen, 2928 unsigned int tso, 2929 unsigned int olx __rte_unused) 2930 { 2931 struct mlx5_wqe_dseg *restrict dseg; 2932 unsigned int ds; 2933 2934 assert((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen); 2935 loc->mbuf_nseg = NB_SEGS(loc->mbuf); 2936 loc->mbuf_off = 0; 2937 2938 dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx); 2939 if (!loc->mbuf_nseg) 2940 goto dseg_done; 2941 /* 2942 * There are still some mbuf remaining, not inlined. 2943 * The first mbuf may be partially inlined and we 2944 * must process the possible non-zero data offset. 2945 */ 2946 if (loc->mbuf_off) { 2947 unsigned int dlen; 2948 uint8_t *dptr; 2949 2950 /* 2951 * Exhausted packets must be dropped before. 2952 * Non-zero offset means there are some data 2953 * remained in the packet. 2954 */ 2955 assert(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf)); 2956 assert(rte_pktmbuf_data_len(loc->mbuf)); 2957 dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, 2958 loc->mbuf_off); 2959 dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off; 2960 /* 2961 * Build the pointer/minimal data Data Segment. 2962 * Do ring buffer wrapping check in advance. 2963 */ 2964 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 2965 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 2966 mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx); 2967 /* Store the mbuf to be freed on completion. */ 2968 assert(loc->elts_free); 2969 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 2970 --loc->elts_free; 2971 ++dseg; 2972 if (--loc->mbuf_nseg == 0) 2973 goto dseg_done; 2974 loc->mbuf = loc->mbuf->next; 2975 loc->mbuf_off = 0; 2976 } 2977 do { 2978 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 2979 struct rte_mbuf *mbuf; 2980 2981 /* Zero length segment found, just skip. 
*/ 2982 mbuf = loc->mbuf; 2983 loc->mbuf = loc->mbuf->next; 2984 rte_pktmbuf_free_seg(mbuf); 2985 if (--loc->mbuf_nseg == 0) 2986 break; 2987 } else { 2988 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 2989 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 2990 mlx5_tx_dseg_iptr 2991 (txq, loc, dseg, 2992 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 2993 rte_pktmbuf_data_len(loc->mbuf), olx); 2994 assert(loc->elts_free); 2995 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 2996 --loc->elts_free; 2997 ++dseg; 2998 if (--loc->mbuf_nseg == 0) 2999 break; 3000 loc->mbuf = loc->mbuf->next; 3001 } 3002 } while (true); 3003 3004 dseg_done: 3005 /* Calculate actual segments used from the dseg pointer. */ 3006 if ((uintptr_t)wqe < (uintptr_t)dseg) 3007 ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE; 3008 else 3009 ds = (((uintptr_t)dseg - (uintptr_t)wqe) + 3010 txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE; 3011 return ds; 3012 } 3013 3014 /** 3015 * Tx one packet function for multi-segment TSO. Supports all 3016 * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs, 3017 * sends one packet per WQE. 3018 * 3019 * This routine is responsible for storing processed mbuf 3020 * into elts ring buffer and update elts_head. 3021 * 3022 * @param txq 3023 * Pointer to TX queue structure. 3024 * @param loc 3025 * Pointer to burst routine local context. 3026 * @param olx 3027 * Configured Tx offloads mask. It is fully defined at 3028 * compile time and may be used for optimization. 3029 * 3030 * @return 3031 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3032 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3033 * Local context variables partially updated. 3034 */ 3035 static __rte_always_inline enum mlx5_txcmp_code 3036 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, 3037 struct mlx5_txq_local *restrict loc, 3038 unsigned int olx) 3039 { 3040 struct mlx5_wqe *restrict wqe; 3041 unsigned int ds, dlen, inlen, ntcp, vlan = 0; 3042 3043 /* 3044 * Calculate data length to be inlined to estimate 3045 * the required space in WQE ring buffer. 3046 */ 3047 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3048 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3049 vlan = sizeof(struct rte_vlan_hdr); 3050 inlen = loc->mbuf->l2_len + vlan + 3051 loc->mbuf->l3_len + loc->mbuf->l4_len; 3052 if (unlikely((!inlen || !loc->mbuf->tso_segsz))) 3053 return MLX5_TXCMP_CODE_ERROR; 3054 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3055 inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len; 3056 /* Packet must contain all TSO headers. */ 3057 if (unlikely(inlen > MLX5_MAX_TSO_HEADER || 3058 inlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3059 inlen > (dlen + vlan))) 3060 return MLX5_TXCMP_CODE_ERROR; 3061 assert(inlen >= txq->inlen_mode); 3062 /* 3063 * Check whether there are enough free WQEBBs: 3064 * - Control Segment 3065 * - Ethernet Segment 3066 * - First Segment of inlined Ethernet data 3067 * - ... data continued ... 3068 * - Data Segments of pointer/min inline type 3069 */ 3070 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3071 MLX5_ESEG_MIN_INLINE_SIZE + 3072 MLX5_WSEG_SIZE + 3073 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3074 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3075 return MLX5_TXCMP_CODE_EXIT; 3076 /* Check for maximal WQE size. */ 3077 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3078 return MLX5_TXCMP_CODE_ERROR; 3079 #ifdef MLX5_PMD_SOFT_COUNTERS 3080 /* Update sent data bytes/packets counters. 
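 * Worked example (illustrative numbers): for a 3000-byte packet with
 * 54 bytes of TSO headers (inlen), no VLAN and tso_segsz = 1460, the
 * TCP payload is 2946 bytes and ntcp evaluates to 3 wire segments;
 * one of them is accounted later via loc->pkts_sent, the remaining
 * two are added to opackets here together with two replicated 54-byte
 * headers in obytes.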
*/ 3081 ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) / 3082 loc->mbuf->tso_segsz; 3083 /* 3084 * One will be added for mbuf itself 3085 * at the end of the mlx5_tx_burst from 3086 * loc->pkts_sent field. 3087 */ 3088 --ntcp; 3089 txq->stats.opackets += ntcp; 3090 txq->stats.obytes += dlen + vlan + ntcp * inlen; 3091 #endif 3092 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3093 loc->wqe_last = wqe; 3094 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx); 3095 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx); 3096 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3097 txq->wqe_ci += (ds + 3) / 4; 3098 loc->wqe_free -= (ds + 3) / 4; 3099 /* Request CQE generation if limits are reached. */ 3100 mlx5_tx_request_completion(txq, loc, true, olx); 3101 return MLX5_TXCMP_CODE_MULTI; 3102 } 3103 3104 /** 3105 * Tx one packet function for multi-segment SEND. Supports all 3106 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3107 * sends one packet per WQE, without any data inlining in 3108 * Ethernet Segment. 3109 * 3110 * This routine is responsible for storing processed mbuf 3111 * into elts ring buffer and update elts_head. 3112 * 3113 * @param txq 3114 * Pointer to TX queue structure. 3115 * @param loc 3116 * Pointer to burst routine local context. 3117 * @param olx 3118 * Configured Tx offloads mask. It is fully defined at 3119 * compile time and may be used for optimization. 3120 * 3121 * @return 3122 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3123 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3124 * Local context variables partially updated. 3125 */ 3126 static __rte_always_inline enum mlx5_txcmp_code 3127 mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, 3128 struct mlx5_txq_local *restrict loc, 3129 unsigned int olx) 3130 { 3131 struct mlx5_wqe_dseg *restrict dseg; 3132 struct mlx5_wqe *restrict wqe; 3133 unsigned int ds, nseg; 3134 3135 assert(NB_SEGS(loc->mbuf) > 1); 3136 /* 3137 * No inline at all, it means the CPU cycles saving 3138 * is prioritized at configuration, we should not 3139 * copy any packet data to WQE. 3140 */ 3141 nseg = NB_SEGS(loc->mbuf); 3142 ds = 2 + nseg; 3143 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3144 return MLX5_TXCMP_CODE_EXIT; 3145 /* Check for maximal WQE size. */ 3146 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3147 return MLX5_TXCMP_CODE_ERROR; 3148 /* 3149 * Some Tx offloads may cause an error if 3150 * packet is not long enough, check against 3151 * assumed minimal length. 3152 */ 3153 if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE) 3154 return MLX5_TXCMP_CODE_ERROR; 3155 #ifdef MLX5_PMD_SOFT_COUNTERS 3156 /* Update sent data bytes counter. */ 3157 txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf); 3158 if (MLX5_TXOFF_CONFIG(VLAN) && 3159 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3160 txq->stats.obytes += sizeof(struct rte_vlan_hdr); 3161 #endif 3162 /* 3163 * SEND WQE, one WQEBB: 3164 * - Control Segment, SEND opcode 3165 * - Ethernet Segment, optional VLAN, no inline 3166 * - Data Segments, pointer only type 3167 */ 3168 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3169 loc->wqe_last = wqe; 3170 mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx); 3171 mlx5_tx_eseg_none(txq, loc, wqe, olx); 3172 dseg = &wqe->dseg[0]; 3173 do { 3174 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { 3175 struct rte_mbuf *mbuf; 3176 3177 /* 3178 * Zero length segment found, have to 3179 * correct total size of WQE in segments. 
3180 * It is supposed to be rare occasion, so 3181 * in normal case (no zero length segments) 3182 * we avoid extra writing to the Control 3183 * Segment. 3184 */ 3185 --ds; 3186 wqe->cseg.sq_ds -= RTE_BE32(1); 3187 mbuf = loc->mbuf; 3188 loc->mbuf = mbuf->next; 3189 rte_pktmbuf_free_seg(mbuf); 3190 if (--nseg == 0) 3191 break; 3192 } else { 3193 mlx5_tx_dseg_ptr 3194 (txq, loc, dseg, 3195 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3196 rte_pktmbuf_data_len(loc->mbuf), olx); 3197 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3198 --loc->elts_free; 3199 if (--nseg == 0) 3200 break; 3201 ++dseg; 3202 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3203 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3204 loc->mbuf = loc->mbuf->next; 3205 } 3206 } while (true); 3207 txq->wqe_ci += (ds + 3) / 4; 3208 loc->wqe_free -= (ds + 3) / 4; 3209 /* Request CQE generation if limits are reached. */ 3210 mlx5_tx_request_completion(txq, loc, true, olx); 3211 return MLX5_TXCMP_CODE_MULTI; 3212 } 3213 3214 /** 3215 * Tx one packet function for multi-segment SEND. Supports all 3216 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, 3217 * sends one packet per WQE, with data inlining in 3218 * Ethernet Segment and minimal Data Segments. 3219 * 3220 * This routine is responsible for storing processed mbuf 3221 * into elts ring buffer and update elts_head. 3222 * 3223 * @param txq 3224 * Pointer to TX queue structure. 3225 * @param loc 3226 * Pointer to burst routine local context. 3227 * @param olx 3228 * Configured Tx offloads mask. It is fully defined at 3229 * compile time and may be used for optimization. 3230 * 3231 * @return 3232 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3233 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3234 * Local context variables partially updated. 3235 */ 3236 static __rte_always_inline enum mlx5_txcmp_code 3237 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq, 3238 struct mlx5_txq_local *restrict loc, 3239 unsigned int olx) 3240 { 3241 struct mlx5_wqe *restrict wqe; 3242 unsigned int ds, inlen, dlen, vlan = 0; 3243 3244 assert(MLX5_TXOFF_CONFIG(INLINE)); 3245 assert(NB_SEGS(loc->mbuf) > 1); 3246 /* 3247 * First calculate data length to be inlined 3248 * to estimate the required space for WQE. 3249 */ 3250 dlen = rte_pktmbuf_pkt_len(loc->mbuf); 3251 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 3252 vlan = sizeof(struct rte_vlan_hdr); 3253 inlen = dlen + vlan; 3254 /* Check against minimal length. */ 3255 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 3256 return MLX5_TXCMP_CODE_ERROR; 3257 assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); 3258 if (inlen > txq->inlen_send) { 3259 struct rte_mbuf *mbuf; 3260 unsigned int nxlen; 3261 uintptr_t start; 3262 3263 /* 3264 * Packet length exceeds the allowed inline 3265 * data length, check whether the minimal 3266 * inlining is required. 3267 */ 3268 if (txq->inlen_mode) { 3269 assert(txq->inlen_mode >= MLX5_ESEG_MIN_INLINE_SIZE); 3270 assert(txq->inlen_mode <= txq->inlen_send); 3271 inlen = txq->inlen_mode; 3272 } else { 3273 if (!vlan || txq->vlan_en) { 3274 /* 3275 * VLAN insertion will be done inside by HW. 3276 * It is not utmost effective - VLAN flag is 3277 * checked twice, but we should proceed the 3278 * inlining length correctly and take into 3279 * account the VLAN header being inserted. 
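 * Falling back to mlx5_tx_packet_multi_send() means nothing is
 * inlined for this packet at all and, when a VLAN tag is requested,
 * it is inserted by the hardware from the Ethernet Segment.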
3280 */ 3281 return mlx5_tx_packet_multi_send 3282 (txq, loc, olx); 3283 } 3284 inlen = MLX5_ESEG_MIN_INLINE_SIZE; 3285 } 3286 /* 3287 * Now we know the minimal amount of data is requested 3288 * to inline. Check whether we should inline the buffers 3289 * from the chain beginning to eliminate some mbufs. 3290 */ 3291 mbuf = loc->mbuf; 3292 nxlen = rte_pktmbuf_data_len(mbuf); 3293 if (unlikely(nxlen <= txq->inlen_send)) { 3294 /* We can inline first mbuf at least. */ 3295 if (nxlen < inlen) { 3296 unsigned int smlen; 3297 3298 /* Scan mbufs till inlen filled. */ 3299 do { 3300 smlen = nxlen; 3301 mbuf = NEXT(mbuf); 3302 assert(mbuf); 3303 nxlen = rte_pktmbuf_data_len(mbuf); 3304 nxlen += smlen; 3305 } while (unlikely(nxlen < inlen)); 3306 if (unlikely(nxlen > txq->inlen_send)) { 3307 /* We cannot inline entire mbuf. */ 3308 smlen = inlen - smlen; 3309 start = rte_pktmbuf_mtod_offset 3310 (mbuf, uintptr_t, smlen); 3311 goto do_align; 3312 } 3313 } 3314 do { 3315 inlen = nxlen; 3316 mbuf = NEXT(mbuf); 3317 /* There should be not end of packet. */ 3318 assert(mbuf); 3319 nxlen = inlen + rte_pktmbuf_data_len(mbuf); 3320 } while (unlikely(nxlen < txq->inlen_send)); 3321 } 3322 start = rte_pktmbuf_mtod(mbuf, uintptr_t); 3323 /* 3324 * Check whether we can do inline to align start 3325 * address of data buffer to cacheline. 3326 */ 3327 do_align: 3328 start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1); 3329 if (unlikely(start)) { 3330 start += inlen; 3331 if (start <= txq->inlen_send) 3332 inlen = start; 3333 } 3334 } 3335 /* 3336 * Check whether there are enough free WQEBBs: 3337 * - Control Segment 3338 * - Ethernet Segment 3339 * - First Segment of inlined Ethernet data 3340 * - ... data continued ... 3341 * - Data Segments of pointer/min inline type 3342 * 3343 * Estimate the number of Data Segments conservatively, 3344 * supposing no any mbufs is being freed during inlining. 3345 */ 3346 assert(inlen <= txq->inlen_send); 3347 ds = NB_SEGS(loc->mbuf) + 2 + (inlen - 3348 MLX5_ESEG_MIN_INLINE_SIZE + 3349 MLX5_WSEG_SIZE + 3350 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3351 if (unlikely(loc->wqe_free < ((ds + 3) / 4))) 3352 return MLX5_TXCMP_CODE_EXIT; 3353 /* Check for maximal WQE size. */ 3354 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) 3355 return MLX5_TXCMP_CODE_ERROR; 3356 #ifdef MLX5_PMD_SOFT_COUNTERS 3357 /* Update sent data bytes/packets counters. */ 3358 txq->stats.obytes += dlen + vlan; 3359 #endif 3360 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3361 loc->wqe_last = wqe; 3362 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx); 3363 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx); 3364 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); 3365 txq->wqe_ci += (ds + 3) / 4; 3366 loc->wqe_free -= (ds + 3) / 4; 3367 /* Request CQE generation if limits are reached. */ 3368 mlx5_tx_request_completion(txq, loc, true, olx); 3369 return MLX5_TXCMP_CODE_MULTI; 3370 } 3371 3372 /** 3373 * Tx burst function for multi-segment packets. Supports all 3374 * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs, 3375 * sends one packet per WQE. Function stops sending if it 3376 * encounters the single-segment packet. 3377 * 3378 * This routine is responsible for storing processed mbuf 3379 * into elts ring buffer and update elts_head. 3380 * 3381 * @param txq 3382 * Pointer to TX queue structure. 3383 * @param[in] pkts 3384 * Packets to transmit. 3385 * @param pkts_n 3386 * Number of packets in array. 
3387 * @param loc 3388 * Pointer to burst routine local context. 3389 * @param olx 3390 * Configured Tx offloads mask. It is fully defined at 3391 * compile time and may be used for optimization. 3392 * 3393 * @return 3394 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 3395 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3396 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3397 * MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered. 3398 * Local context variables updated. 3399 */ 3400 static __rte_always_inline enum mlx5_txcmp_code 3401 mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq, 3402 struct rte_mbuf **restrict pkts, 3403 unsigned int pkts_n, 3404 struct mlx5_txq_local *restrict loc, 3405 unsigned int olx) 3406 { 3407 assert(loc->elts_free && loc->wqe_free); 3408 assert(pkts_n > loc->pkts_sent); 3409 pkts += loc->pkts_sent + 1; 3410 pkts_n -= loc->pkts_sent; 3411 for (;;) { 3412 enum mlx5_txcmp_code ret; 3413 3414 assert(NB_SEGS(loc->mbuf) > 1); 3415 /* 3416 * Estimate the number of free elts quickly but 3417 * conservatively. Some segment may be fully inlined 3418 * and freed, ignore this here - precise estimation 3419 * is costly. 3420 */ 3421 if (loc->elts_free < NB_SEGS(loc->mbuf)) 3422 return MLX5_TXCMP_CODE_EXIT; 3423 if (MLX5_TXOFF_CONFIG(TSO) && 3424 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) { 3425 /* Proceed with multi-segment TSO. */ 3426 ret = mlx5_tx_packet_multi_tso(txq, loc, olx); 3427 } else if (MLX5_TXOFF_CONFIG(INLINE)) { 3428 /* Proceed with multi-segment SEND with inlining. */ 3429 ret = mlx5_tx_packet_multi_inline(txq, loc, olx); 3430 } else { 3431 /* Proceed with multi-segment SEND w/o inlining. */ 3432 ret = mlx5_tx_packet_multi_send(txq, loc, olx); 3433 } 3434 if (ret == MLX5_TXCMP_CODE_EXIT) 3435 return MLX5_TXCMP_CODE_EXIT; 3436 if (ret == MLX5_TXCMP_CODE_ERROR) 3437 return MLX5_TXCMP_CODE_ERROR; 3438 /* WQE is built, go to the next packet. */ 3439 ++loc->pkts_sent; 3440 --pkts_n; 3441 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3442 return MLX5_TXCMP_CODE_EXIT; 3443 loc->mbuf = *pkts++; 3444 if (pkts_n > 1) 3445 rte_prefetch0(*pkts); 3446 if (likely(NB_SEGS(loc->mbuf) > 1)) 3447 continue; 3448 /* Here ends the series of multi-segment packets. */ 3449 if (MLX5_TXOFF_CONFIG(TSO) && 3450 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3451 return MLX5_TXCMP_CODE_TSO; 3452 return MLX5_TXCMP_CODE_SINGLE; 3453 } 3454 assert(false); 3455 } 3456 3457 /** 3458 * Tx burst function for single-segment packets with TSO. 3459 * Supports all types of Tx offloads, except multi-packets. 3460 * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE. 3461 * Function stops sending if it encounters the multi-segment 3462 * packet or packet without TSO requested. 3463 * 3464 * The routine is responsible for storing processed mbuf 3465 * into elts ring buffer and update elts_head if inline 3466 * offloads is requested due to possible early freeing 3467 * of the inlined mbufs (can not store pkts array in elts 3468 * as a batch). 3469 * 3470 * @param txq 3471 * Pointer to TX queue structure. 3472 * @param[in] pkts 3473 * Packets to transmit. 3474 * @param pkts_n 3475 * Number of packets in array. 3476 * @param loc 3477 * Pointer to burst routine local context. 3478 * @param olx 3479 * Configured Tx offloads mask. It is fully defined at 3480 * compile time and may be used for optimization. 3481 * 3482 * @return 3483 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. 
3484 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. 3485 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. 3486 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3487 * Local context variables updated. 3488 */ 3489 static __rte_always_inline enum mlx5_txcmp_code 3490 mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq, 3491 struct rte_mbuf **restrict pkts, 3492 unsigned int pkts_n, 3493 struct mlx5_txq_local *restrict loc, 3494 unsigned int olx) 3495 { 3496 assert(loc->elts_free && loc->wqe_free); 3497 assert(pkts_n > loc->pkts_sent); 3498 pkts += loc->pkts_sent + 1; 3499 pkts_n -= loc->pkts_sent; 3500 for (;;) { 3501 struct mlx5_wqe_dseg *restrict dseg; 3502 struct mlx5_wqe *restrict wqe; 3503 unsigned int ds, dlen, hlen, ntcp, vlan = 0; 3504 uint8_t *dptr; 3505 3506 assert(NB_SEGS(loc->mbuf) == 1); 3507 dlen = rte_pktmbuf_data_len(loc->mbuf); 3508 if (MLX5_TXOFF_CONFIG(VLAN) && 3509 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 3510 vlan = sizeof(struct rte_vlan_hdr); 3511 } 3512 /* 3513 * First calculate the WQE size to check 3514 * whether we have enough space in ring buffer. 3515 */ 3516 hlen = loc->mbuf->l2_len + vlan + 3517 loc->mbuf->l3_len + loc->mbuf->l4_len; 3518 if (unlikely((!hlen || !loc->mbuf->tso_segsz))) 3519 return MLX5_TXCMP_CODE_ERROR; 3520 if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) 3521 hlen += loc->mbuf->outer_l2_len + 3522 loc->mbuf->outer_l3_len; 3523 /* Segment must contain all TSO headers. */ 3524 if (unlikely(hlen > MLX5_MAX_TSO_HEADER || 3525 hlen <= MLX5_ESEG_MIN_INLINE_SIZE || 3526 hlen > (dlen + vlan))) 3527 return MLX5_TXCMP_CODE_ERROR; 3528 /* 3529 * Check whether there are enough free WQEBBs: 3530 * - Control Segment 3531 * - Ethernet Segment 3532 * - First Segment of inlined Ethernet data 3533 * - ... data continued ... 3534 * - Finishing Data Segment of pointer type 3535 */ 3536 ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE + 3537 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 3538 if (loc->wqe_free < ((ds + 3) / 4)) 3539 return MLX5_TXCMP_CODE_EXIT; 3540 #ifdef MLX5_PMD_SOFT_COUNTERS 3541 /* Update sent data bytes/packets counters. */ 3542 ntcp = (dlen + vlan - hlen + 3543 loc->mbuf->tso_segsz - 1) / 3544 loc->mbuf->tso_segsz; 3545 /* 3546 * One will be added for mbuf itself at the end 3547 * of the mlx5_tx_burst from loc->pkts_sent field. 3548 */ 3549 --ntcp; 3550 txq->stats.opackets += ntcp; 3551 txq->stats.obytes += dlen + vlan + ntcp * hlen; 3552 #endif 3553 /* 3554 * Build the TSO WQE: 3555 * - Control Segment 3556 * - Ethernet Segment with hlen bytes inlined 3557 * - Data Segment of pointer type 3558 */ 3559 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3560 loc->wqe_last = wqe; 3561 mlx5_tx_cseg_init(txq, loc, wqe, ds, 3562 MLX5_OPCODE_TSO, olx); 3563 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx); 3564 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan; 3565 dlen -= hlen - vlan; 3566 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 3567 /* 3568 * WQE is built, update the loop parameters 3569 * and go to the next packet. 3570 */ 3571 txq->wqe_ci += (ds + 3) / 4; 3572 loc->wqe_free -= (ds + 3) / 4; 3573 if (MLX5_TXOFF_CONFIG(INLINE)) 3574 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 3575 --loc->elts_free; 3576 ++loc->pkts_sent; 3577 --pkts_n; 3578 /* Request CQE generation if limits are reached. 
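 * "Limits" means either MLX5_TX_COMP_THRESH packets or, when inlining
 * is configured, txq->wqe_thres WQEs have been pushed since the last
 * completion request, see mlx5_tx_request_completion().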
*/ 3579 mlx5_tx_request_completion(txq, loc, false, olx); 3580 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3581 return MLX5_TXCMP_CODE_EXIT; 3582 loc->mbuf = *pkts++; 3583 if (pkts_n > 1) 3584 rte_prefetch0(*pkts); 3585 if (MLX5_TXOFF_CONFIG(MULTI) && 3586 unlikely(NB_SEGS(loc->mbuf) > 1)) 3587 return MLX5_TXCMP_CODE_MULTI; 3588 if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) 3589 return MLX5_TXCMP_CODE_SINGLE; 3590 /* Continue with the next TSO packet. */ 3591 } 3592 assert(false); 3593 } 3594 3595 /** 3596 * Analyze the packet and select the best method to send. 3597 * 3598 * @param txq 3599 * Pointer to TX queue structure. 3600 * @param loc 3601 * Pointer to burst routine local context. 3602 * @param olx 3603 * Configured Tx offloads mask. It is fully defined at 3604 * compile time and may be used for optimization. 3605 * @param newp 3606 * The predefined flag whether do complete check for 3607 * multi-segment packets and TSO. 3608 * 3609 * @return 3610 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. 3611 * MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO. 3612 * MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND. 3613 * MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW. 3614 */ 3615 static __rte_always_inline enum mlx5_txcmp_code 3616 mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq, 3617 struct mlx5_txq_local *restrict loc, 3618 unsigned int olx, 3619 bool newp) 3620 { 3621 /* Check for multi-segment packet. */ 3622 if (newp && 3623 MLX5_TXOFF_CONFIG(MULTI) && 3624 unlikely(NB_SEGS(loc->mbuf) > 1)) 3625 return MLX5_TXCMP_CODE_MULTI; 3626 /* Check for TSO packet. */ 3627 if (newp && 3628 MLX5_TXOFF_CONFIG(TSO) && 3629 unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) 3630 return MLX5_TXCMP_CODE_TSO; 3631 /* Check if eMPW is enabled at all. */ 3632 if (!MLX5_TXOFF_CONFIG(EMPW)) 3633 return MLX5_TXCMP_CODE_SINGLE; 3634 /* Check if eMPW can be engaged. */ 3635 if (MLX5_TXOFF_CONFIG(VLAN) && 3636 unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) && 3637 (!MLX5_TXOFF_CONFIG(INLINE) || 3638 unlikely((rte_pktmbuf_data_len(loc->mbuf) + 3639 sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) { 3640 /* 3641 * eMPW does not support VLAN insertion offload, 3642 * we have to inline the entire packet but 3643 * packet is too long for inlining. 3644 */ 3645 return MLX5_TXCMP_CODE_SINGLE; 3646 } 3647 return MLX5_TXCMP_CODE_EMPW; 3648 } 3649 3650 /** 3651 * Check the next packet attributes to match with the eMPW batch ones. 3652 * 3653 * @param txq 3654 * Pointer to TX queue structure. 3655 * @param es 3656 * Pointer to Ethernet Segment of eMPW batch. 3657 * @param loc 3658 * Pointer to burst routine local context. 3659 * @param olx 3660 * Configured Tx offloads mask. It is fully defined at 3661 * compile time and may be used for optimization. 3662 * 3663 * @return 3664 * true - packet match with eMPW batch attributes. 3665 * false - no match, eMPW should be restarted. 3666 */ 3667 static __rte_always_inline bool 3668 mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused, 3669 struct mlx5_wqe_eseg *restrict es, 3670 struct mlx5_txq_local *restrict loc, 3671 unsigned int olx) 3672 { 3673 uint8_t swp_flags = 0; 3674 3675 /* Compare the checksum flags, if any. */ 3676 if (MLX5_TXOFF_CONFIG(CSUM) && 3677 txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags) 3678 return false; 3679 /* Compare the Software Parser offsets and flags. 
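 * All packets of one eMPW share a single Ethernet Segment, so the
 * Software Parser settings (like the checksum flags above and the
 * metadata below) must be identical for every packet in the batch.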
 */
	if (MLX5_TXOFF_CONFIG(SWP) &&
	    (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
	     es->swp_flags != swp_flags))
		return false;
	/* Fill metadata field if needed. */
	if (MLX5_TXOFF_CONFIG(METADATA) &&
		es->metadata != (loc->mbuf->ol_flags & PKT_TX_METADATA ?
				 loc->mbuf->tx_metadata : 0))
		return false;
	/* There must be no VLAN packets in eMPW loop. */
	if (MLX5_TXOFF_CONFIG(VLAN))
		assert(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
	return true;
}

/**
 * Update send loop variables and WQE for eMPW loop
 * without data inlining. Number of Data Segments is
 * equal to the number of sent packets.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param loc
 *   Pointer to burst routine local context.
 * @param ds
 *   Number of packets/Data Segments (one Data Segment per packet).
 * @param slen
 *   Accumulated statistics, data bytes sent.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 */
static __rte_always_inline void
mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
		   struct mlx5_txq_local *restrict loc,
		   unsigned int ds,
		   unsigned int slen,
		   unsigned int olx)
{
	assert(!MLX5_TXOFF_CONFIG(INLINE));
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Update sent data bytes counter. */
	txq->stats.obytes += slen;
#else
	(void)slen;
#endif
	loc->elts_free -= ds;
	loc->pkts_sent += ds;
	ds += 2;
	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
	txq->wqe_ci += (ds + 3) / 4;
	loc->wqe_free -= (ds + 3) / 4;
	/* Request CQE generation if limits are reached. */
	mlx5_tx_request_completion(txq, loc, false, olx);
}

/**
 * Update send loop variables and WQE for eMPW loop
 * with data inlining. The total size of the pushed
 * descriptors and inlined data is provided via len.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param loc
 *   Pointer to burst routine local context.
 * @param len
 *   Total size of descriptors/data in bytes.
 * @param slen
 *   Accumulated statistics, data bytes sent.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 */
static __rte_always_inline void
mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
		   struct mlx5_txq_local *restrict loc,
		   unsigned int len,
		   unsigned int slen,
		   unsigned int olx __rte_unused)
{
	assert(MLX5_TXOFF_CONFIG(INLINE));
	assert((len % MLX5_WSEG_SIZE) == 0);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Update sent data bytes counter. */
	txq->stats.obytes += slen;
#else
	(void)slen;
#endif
	len = len / MLX5_WSEG_SIZE + 2;
	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
	txq->wqe_ci += (len + 3) / 4;
	loc->wqe_free -= (len + 3) / 4;
	/* Request CQE generation if limits are reached. */
	mlx5_tx_request_completion(txq, loc, false, olx);
}

/**
 * The set of Tx burst functions for single-segment packets
 * without TSO and with Multi-Packet Writing feature support.
 * Supports all types of Tx offloads, except multi-segment
 * packets and TSO.
 *
 * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
 * as many packets per WQE as it can. If eMPW is not configured
 * or the packet cannot be sent with eMPW (VLAN insertion
 * requested), the ordinary SEND opcode is used and only one
 * packet is placed in the WQE.
 *
 * The functions stop sending if they encounter a multi-segment
 * packet or a packet with TSO requested.
 *
 * The routines are responsible for storing the processed mbufs
 * into the elts ring buffer and updating elts_head if the
 * inlining offload is requested. Otherwise copying mbufs to
 * elts can be postponed and completed at the end of the burst
 * routine.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 * @param loc
 *   Pointer to burst routine local context.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
 *
 * @return
 *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
 *   MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
 *   MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
 *   MLX5_TXCMP_CODE_TSO - TSO packet encountered.
 *   MLX5_TXCMP_CODE_SINGLE - used inside functions set.
 *   MLX5_TXCMP_CODE_EMPW - used inside functions set.
 *
 * Local context variables updated.
 *
 *
 * The routine below sends packets with MLX5_OPCODE_EMPW
 * without inlining, this is a dedicated optimized branch.
 * No VLAN insertion is supported.
 */
static __rte_always_inline enum mlx5_txcmp_code
mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq,
			  struct rte_mbuf **restrict pkts,
			  unsigned int pkts_n,
			  struct mlx5_txq_local *restrict loc,
			  unsigned int olx)
{
	/*
	 * Subroutine is the part of mlx5_tx_burst_single()
	 * and sends single-segment packet with eMPW opcode
	 * without data inlining.
	 */
	assert(!MLX5_TXOFF_CONFIG(INLINE));
	assert(MLX5_TXOFF_CONFIG(EMPW));
	assert(loc->elts_free && loc->wqe_free);
	assert(pkts_n > loc->pkts_sent);
	static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
	pkts += loc->pkts_sent + 1;
	pkts_n -= loc->pkts_sent;
	for (;;) {
		struct mlx5_wqe_dseg *restrict dseg;
		struct mlx5_wqe_eseg *restrict eseg;
		enum mlx5_txcmp_code ret;
		unsigned int part, loop;
		unsigned int slen = 0;

next_empw:
		part = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS);
		if (unlikely(loc->elts_free < part)) {
			/* We have not enough elts to store all mbufs. */
			if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
				return MLX5_TXCMP_CODE_EXIT;
			/* But we are still able to send at least a minimal eMPW.
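 * The batch size (part) is clamped to the number of free elts here;
 * the check below clamps it to the free WQEBBs in the same way.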
*/ 3864 part = loc->elts_free; 3865 } 3866 /* Check whether we have enough WQEs */ 3867 if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) { 3868 if (unlikely(loc->wqe_free < 3869 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 3870 return MLX5_TXCMP_CODE_EXIT; 3871 part = (loc->wqe_free * 4) - 2; 3872 } 3873 if (likely(part > 1)) 3874 rte_prefetch0(*pkts); 3875 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 3876 /* 3877 * Build eMPW title WQEBB: 3878 * - Control Segment, eMPW opcode 3879 * - Ethernet Segment, no inline 3880 */ 3881 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2, 3882 MLX5_OPCODE_ENHANCED_MPSW, olx); 3883 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 3884 olx & ~MLX5_TXOFF_CONFIG_VLAN); 3885 eseg = &loc->wqe_last->eseg; 3886 dseg = &loc->wqe_last->dseg[0]; 3887 loop = part; 3888 for (;;) { 3889 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 3890 #ifdef MLX5_PMD_SOFT_COUNTERS 3891 /* Update sent data bytes counter. */ 3892 slen += dlen; 3893 #endif 3894 mlx5_tx_dseg_ptr 3895 (txq, loc, dseg, 3896 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 3897 dlen, olx); 3898 if (unlikely(--loop == 0)) 3899 break; 3900 loc->mbuf = *pkts++; 3901 if (likely(loop > 1)) 3902 rte_prefetch0(*pkts); 3903 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 3904 /* 3905 * Unroll the completion code to avoid 3906 * returning variable value - it results in 3907 * unoptimized sequent checking in caller. 3908 */ 3909 if (ret == MLX5_TXCMP_CODE_MULTI) { 3910 part -= loop; 3911 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3912 if (unlikely(!loc->elts_free || 3913 !loc->wqe_free)) 3914 return MLX5_TXCMP_CODE_EXIT; 3915 return MLX5_TXCMP_CODE_MULTI; 3916 } 3917 if (ret == MLX5_TXCMP_CODE_TSO) { 3918 part -= loop; 3919 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3920 if (unlikely(!loc->elts_free || 3921 !loc->wqe_free)) 3922 return MLX5_TXCMP_CODE_EXIT; 3923 return MLX5_TXCMP_CODE_TSO; 3924 } 3925 if (ret == MLX5_TXCMP_CODE_SINGLE) { 3926 part -= loop; 3927 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3928 if (unlikely(!loc->elts_free || 3929 !loc->wqe_free)) 3930 return MLX5_TXCMP_CODE_EXIT; 3931 return MLX5_TXCMP_CODE_SINGLE; 3932 } 3933 if (ret != MLX5_TXCMP_CODE_EMPW) { 3934 assert(false); 3935 part -= loop; 3936 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3937 return MLX5_TXCMP_CODE_ERROR; 3938 } 3939 /* 3940 * Check whether packet parameters coincide 3941 * within assumed eMPW batch: 3942 * - check sum settings 3943 * - metadata value 3944 * - software parser settings 3945 */ 3946 if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) { 3947 assert(loop); 3948 part -= loop; 3949 mlx5_tx_sdone_empw(txq, loc, part, slen, olx); 3950 if (unlikely(!loc->elts_free || 3951 !loc->wqe_free)) 3952 return MLX5_TXCMP_CODE_EXIT; 3953 pkts_n -= part; 3954 goto next_empw; 3955 } 3956 /* Packet attributes match, continue the same eMPW. */ 3957 ++dseg; 3958 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 3959 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 3960 } 3961 /* eMPW is built successfully, update loop parameters. */ 3962 assert(!loop); 3963 assert(pkts_n >= part); 3964 #ifdef MLX5_PMD_SOFT_COUNTERS 3965 /* Update sent data bytes counter. */ 3966 txq->stats.obytes += slen; 3967 #endif 3968 loc->elts_free -= part; 3969 loc->pkts_sent += part; 3970 txq->wqe_ci += (2 + part + 3) / 4; 3971 loc->wqe_free -= (2 + part + 3) / 4; 3972 pkts_n -= part; 3973 /* Request CQE generation if limits are reached. 
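 * Note the completion is requested once per built eMPW WQE, i.e.
 * per batch of 'part' packets, not per packet as in the SEND/TSO
 * paths.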
*/ 3974 mlx5_tx_request_completion(txq, loc, false, olx); 3975 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 3976 return MLX5_TXCMP_CODE_EXIT; 3977 loc->mbuf = *pkts++; 3978 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 3979 if (unlikely(ret != MLX5_TXCMP_CODE_EMPW)) 3980 return ret; 3981 /* Continue sending eMPW batches. */ 3982 } 3983 assert(false); 3984 } 3985 3986 /** 3987 * The routine sends packets with MLX5_OPCODE_EMPW 3988 * with inlining, optionally supports VLAN insertion. 3989 */ 3990 static __rte_always_inline enum mlx5_txcmp_code 3991 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, 3992 struct rte_mbuf **restrict pkts, 3993 unsigned int pkts_n, 3994 struct mlx5_txq_local *restrict loc, 3995 unsigned int olx) 3996 { 3997 /* 3998 * Subroutine is the part of mlx5_tx_burst_single() 3999 * and sends single-segment packet with eMPW opcode 4000 * with data inlining. 4001 */ 4002 assert(MLX5_TXOFF_CONFIG(INLINE)); 4003 assert(MLX5_TXOFF_CONFIG(EMPW)); 4004 assert(loc->elts_free && loc->wqe_free); 4005 assert(pkts_n > loc->pkts_sent); 4006 static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); 4007 pkts += loc->pkts_sent + 1; 4008 pkts_n -= loc->pkts_sent; 4009 for (;;) { 4010 struct mlx5_wqe_dseg *restrict dseg; 4011 struct mlx5_wqe_eseg *restrict eseg; 4012 enum mlx5_txcmp_code ret; 4013 unsigned int room, part, nlim; 4014 unsigned int slen = 0; 4015 4016 /* 4017 * Limits the amount of packets in one WQE 4018 * to improve CQE latency generation. 4019 */ 4020 nlim = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS); 4021 /* Check whether we have minimal amount WQEs */ 4022 if (unlikely(loc->wqe_free < 4023 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) 4024 return MLX5_TXCMP_CODE_EXIT; 4025 if (likely(pkts_n > 1)) 4026 rte_prefetch0(*pkts); 4027 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4028 /* 4029 * Build eMPW title WQEBB: 4030 * - Control Segment, eMPW opcode, zero DS 4031 * - Ethernet Segment, no inline 4032 */ 4033 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0, 4034 MLX5_OPCODE_ENHANCED_MPSW, olx); 4035 mlx5_tx_eseg_none(txq, loc, loc->wqe_last, 4036 olx & ~MLX5_TXOFF_CONFIG_VLAN); 4037 eseg = &loc->wqe_last->eseg; 4038 dseg = &loc->wqe_last->dseg[0]; 4039 room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE, 4040 loc->wqe_free) * MLX5_WQE_SIZE - 4041 MLX5_WQE_CSEG_SIZE - 4042 MLX5_WQE_ESEG_SIZE; 4043 /* Build WQE till we have space, packets and resources. */ 4044 part = room; 4045 for (;;) { 4046 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); 4047 uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); 4048 unsigned int tlen; 4049 4050 assert(room >= MLX5_WQE_DSEG_SIZE); 4051 assert((room % MLX5_WQE_DSEG_SIZE) == 0); 4052 assert((uintptr_t)dseg < (uintptr_t)txq->wqes_end); 4053 /* 4054 * Some Tx offloads may cause an error if 4055 * packet is not long enough, check against 4056 * assumed minimal length. 4057 */ 4058 if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) { 4059 part -= room; 4060 if (unlikely(!part)) 4061 return MLX5_TXCMP_CODE_ERROR; 4062 /* 4063 * We have some successfully built 4064 * packet Data Segments to send. 4065 */ 4066 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4067 return MLX5_TXCMP_CODE_ERROR; 4068 } 4069 /* Inline or not inline - that's the Question. */ 4070 if (dlen > txq->inlen_empw) 4071 goto pointer_empw; 4072 /* Inline entire packet, optional VLAN insertion. 
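 * The inlined descriptor occupies sizeof(dseg->bcount) bytes for
 * the byte count, the packet data and, if requested, the 4-byte
 * VLAN header, all rounded up to MLX5_WSEG_SIZE below. For example,
 * with a 4-byte byte-count field and 16-byte WSEGs, a 60-byte packet
 * with VLAN insertion takes RTE_ALIGN(4 + 60 + 4, 16) = 80 bytes of
 * WQE room.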
*/ 4073 tlen = sizeof(dseg->bcount) + dlen; 4074 if (MLX5_TXOFF_CONFIG(VLAN) && 4075 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4076 /* 4077 * The packet length must be checked in 4078 * mlx5_tx_able_to_empw() and packet 4079 * fits into inline length guaranteed. 4080 */ 4081 assert((dlen + sizeof(struct rte_vlan_hdr)) <= 4082 txq->inlen_empw); 4083 tlen += sizeof(struct rte_vlan_hdr); 4084 if (room < tlen) 4085 break; 4086 dseg = mlx5_tx_dseg_vlan(txq, loc, dseg, 4087 dptr, dlen, olx); 4088 #ifdef MLX5_PMD_SOFT_COUNTERS 4089 /* Update sent data bytes counter. */ 4090 slen += sizeof(struct rte_vlan_hdr); 4091 #endif 4092 } else { 4093 if (room < tlen) 4094 break; 4095 dseg = mlx5_tx_dseg_empw(txq, loc, dseg, 4096 dptr, dlen, olx); 4097 } 4098 tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE); 4099 assert(room >= tlen); 4100 room -= tlen; 4101 /* 4102 * Packet data are completely inlined, 4103 * free the packet immediately. 4104 */ 4105 rte_pktmbuf_free_seg(loc->mbuf); 4106 goto next_mbuf; 4107 pointer_empw: 4108 /* 4109 * Not inlinable VLAN packets are 4110 * proceeded outside of this routine. 4111 */ 4112 assert(room >= MLX5_WQE_DSEG_SIZE); 4113 if (MLX5_TXOFF_CONFIG(VLAN)) 4114 assert(!(loc->mbuf->ol_flags & 4115 PKT_TX_VLAN_PKT)); 4116 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); 4117 /* We have to store mbuf in elts.*/ 4118 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; 4119 room -= MLX5_WQE_DSEG_SIZE; 4120 /* Ring buffer wraparound is checked at the loop end.*/ 4121 ++dseg; 4122 next_mbuf: 4123 #ifdef MLX5_PMD_SOFT_COUNTERS 4124 /* Update sent data bytes counter. */ 4125 slen += dlen; 4126 #endif 4127 loc->pkts_sent++; 4128 loc->elts_free--; 4129 pkts_n--; 4130 if (unlikely(!pkts_n || !loc->elts_free)) { 4131 /* 4132 * We have no resources/packets to 4133 * continue build descriptors. 4134 */ 4135 part -= room; 4136 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4137 return MLX5_TXCMP_CODE_EXIT; 4138 } 4139 loc->mbuf = *pkts++; 4140 if (likely(pkts_n > 1)) 4141 rte_prefetch0(*pkts); 4142 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4143 /* 4144 * Unroll the completion code to avoid 4145 * returning variable value - it results in 4146 * unoptimized sequent checking in caller. 4147 */ 4148 if (ret == MLX5_TXCMP_CODE_MULTI) { 4149 part -= room; 4150 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4151 if (unlikely(!loc->elts_free || 4152 !loc->wqe_free)) 4153 return MLX5_TXCMP_CODE_EXIT; 4154 return MLX5_TXCMP_CODE_MULTI; 4155 } 4156 if (ret == MLX5_TXCMP_CODE_TSO) { 4157 part -= room; 4158 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4159 if (unlikely(!loc->elts_free || 4160 !loc->wqe_free)) 4161 return MLX5_TXCMP_CODE_EXIT; 4162 return MLX5_TXCMP_CODE_TSO; 4163 } 4164 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4165 part -= room; 4166 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4167 if (unlikely(!loc->elts_free || 4168 !loc->wqe_free)) 4169 return MLX5_TXCMP_CODE_EXIT; 4170 return MLX5_TXCMP_CODE_SINGLE; 4171 } 4172 if (ret != MLX5_TXCMP_CODE_EMPW) { 4173 assert(false); 4174 part -= room; 4175 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4176 return MLX5_TXCMP_CODE_ERROR; 4177 } 4178 /* Check if we have minimal room left. 
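 * At least one more pointer Data Segment (MLX5_WQE_DSEG_SIZE) must
 * fit and the per-WQE packet limit (nlim) must not be exhausted,
 * otherwise the current eMPW session is closed and a new one is
 * started.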
*/ 4179 nlim--; 4180 if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE)) 4181 break; 4182 /* 4183 * Check whether packet parameters coincide 4184 * within assumed eMPW batch: 4185 * - check sum settings 4186 * - metadata value 4187 * - software parser settings 4188 */ 4189 if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) 4190 break; 4191 /* Packet attributes match, continue the same eMPW. */ 4192 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) 4193 dseg = (struct mlx5_wqe_dseg *)txq->wqes; 4194 } 4195 /* 4196 * We get here to close an existing eMPW 4197 * session and start the new one. 4198 */ 4199 assert(pkts_n); 4200 part -= room; 4201 if (unlikely(!part)) 4202 return MLX5_TXCMP_CODE_EXIT; 4203 mlx5_tx_idone_empw(txq, loc, part, slen, olx); 4204 if (unlikely(!loc->elts_free || 4205 !loc->wqe_free)) 4206 return MLX5_TXCMP_CODE_EXIT; 4207 /* Continue the loop with new eMPW session. */ 4208 } 4209 assert(false); 4210 } 4211 4212 /** 4213 * The routine sends packets with ordinary MLX5_OPCODE_SEND. 4214 * Data inlining and VLAN insertion are supported. 4215 */ 4216 static __rte_always_inline enum mlx5_txcmp_code 4217 mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, 4218 struct rte_mbuf **restrict pkts, 4219 unsigned int pkts_n, 4220 struct mlx5_txq_local *restrict loc, 4221 unsigned int olx) 4222 { 4223 /* 4224 * Subroutine is the part of mlx5_tx_burst_single() 4225 * and sends single-segment packet with SEND opcode. 4226 */ 4227 assert(loc->elts_free && loc->wqe_free); 4228 assert(pkts_n > loc->pkts_sent); 4229 pkts += loc->pkts_sent + 1; 4230 pkts_n -= loc->pkts_sent; 4231 for (;;) { 4232 struct mlx5_wqe *restrict wqe; 4233 enum mlx5_txcmp_code ret; 4234 4235 assert(NB_SEGS(loc->mbuf) == 1); 4236 if (MLX5_TXOFF_CONFIG(INLINE)) { 4237 unsigned int inlen, vlan = 0; 4238 4239 inlen = rte_pktmbuf_data_len(loc->mbuf); 4240 if (MLX5_TXOFF_CONFIG(VLAN) && 4241 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { 4242 vlan = sizeof(struct rte_vlan_hdr); 4243 inlen += vlan; 4244 static_assert((sizeof(struct rte_vlan_hdr) + 4245 sizeof(struct rte_ether_hdr)) == 4246 MLX5_ESEG_MIN_INLINE_SIZE, 4247 "invalid min inline data size"); 4248 } 4249 /* 4250 * If inlining is enabled at configuration time 4251 * the limit must be not less than minimal size. 4252 * Otherwise we would do extra check for data 4253 * size to avoid crashes due to length overflow. 4254 */ 4255 assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); 4256 if (inlen <= txq->inlen_send) { 4257 unsigned int seg_n, wqe_n; 4258 4259 rte_prefetch0(rte_pktmbuf_mtod 4260 (loc->mbuf, uint8_t *)); 4261 /* Check against minimal length. */ 4262 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) 4263 return MLX5_TXCMP_CODE_ERROR; 4264 /* 4265 * Completely inlined packet data WQE: 4266 * - Control Segment, SEND opcode 4267 * - Ethernet Segment, no VLAN insertion 4268 * - Data inlined, VLAN optionally inserted 4269 * - Alignment to MLX5_WSEG_SIZE 4270 * Have to estimate amount of WQEBBs 4271 */ 4272 seg_n = (inlen + 3 * MLX5_WSEG_SIZE - 4273 MLX5_ESEG_MIN_INLINE_SIZE + 4274 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4275 /* Check if there are enough WQEBBs. 
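 * seg_n above is the estimated WQE size in WSEG units; the
 * (seg_n + 3) / 4 below rounds it up to whole WQEBBs of four
 * WSEGs each, e.g. seg_n = 9 requires 3 WQEBBs.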
*/ 4276 wqe_n = (seg_n + 3) / 4; 4277 if (wqe_n > loc->wqe_free) 4278 return MLX5_TXCMP_CODE_EXIT; 4279 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4280 loc->wqe_last = wqe; 4281 mlx5_tx_cseg_init(txq, loc, wqe, seg_n, 4282 MLX5_OPCODE_SEND, olx); 4283 mlx5_tx_eseg_data(txq, loc, wqe, 4284 vlan, inlen, 0, olx); 4285 txq->wqe_ci += wqe_n; 4286 loc->wqe_free -= wqe_n; 4287 /* 4288 * Packet data are completely inlined, 4289 * free the packet immediately. 4290 */ 4291 rte_pktmbuf_free_seg(loc->mbuf); 4292 } else if (!MLX5_TXOFF_CONFIG(EMPW) && 4293 txq->inlen_mode) { 4294 /* 4295 * If minimal inlining is requested the eMPW 4296 * feature should be disabled due to data is 4297 * inlined into Ethernet Segment, which can 4298 * not contain inlined data for eMPW due to 4299 * segment shared for all packets. 4300 */ 4301 struct mlx5_wqe_dseg *restrict dseg; 4302 unsigned int ds; 4303 uint8_t *dptr; 4304 4305 /* 4306 * The inline-mode settings require 4307 * to inline the specified amount of 4308 * data bytes to the Ethernet Segment. 4309 * We should check the free space in 4310 * WQE ring buffer to inline partially. 4311 */ 4312 assert(txq->inlen_send >= txq->inlen_mode); 4313 assert(inlen > txq->inlen_mode); 4314 assert(txq->inlen_mode >= 4315 MLX5_ESEG_MIN_INLINE_SIZE); 4316 /* 4317 * Check whether there are enough free WQEBBs: 4318 * - Control Segment 4319 * - Ethernet Segment 4320 * - First Segment of inlined Ethernet data 4321 * - ... data continued ... 4322 * - Finishing Data Segment of pointer type 4323 */ 4324 ds = (MLX5_WQE_CSEG_SIZE + 4325 MLX5_WQE_ESEG_SIZE + 4326 MLX5_WQE_DSEG_SIZE + 4327 txq->inlen_mode - 4328 MLX5_ESEG_MIN_INLINE_SIZE + 4329 MLX5_WQE_DSEG_SIZE + 4330 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; 4331 if (loc->wqe_free < ((ds + 3) / 4)) 4332 return MLX5_TXCMP_CODE_EXIT; 4333 /* 4334 * Build the ordinary SEND WQE: 4335 * - Control Segment 4336 * - Ethernet Segment, inline inlen_mode bytes 4337 * - Data Segment of pointer type 4338 */ 4339 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4340 loc->wqe_last = wqe; 4341 mlx5_tx_cseg_init(txq, loc, wqe, ds, 4342 MLX5_OPCODE_SEND, olx); 4343 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, 4344 txq->inlen_mode, 4345 0, olx); 4346 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4347 txq->inlen_mode - vlan; 4348 inlen -= txq->inlen_mode; 4349 mlx5_tx_dseg_ptr(txq, loc, dseg, 4350 dptr, inlen, olx); 4351 /* 4352 * WQE is built, update the loop parameters 4353 * and got to the next packet. 4354 */ 4355 txq->wqe_ci += (ds + 3) / 4; 4356 loc->wqe_free -= (ds + 3) / 4; 4357 /* We have to store mbuf in elts.*/ 4358 assert(MLX5_TXOFF_CONFIG(INLINE)); 4359 txq->elts[txq->elts_head++ & txq->elts_m] = 4360 loc->mbuf; 4361 --loc->elts_free; 4362 } else { 4363 uint8_t *dptr; 4364 unsigned int dlen; 4365 4366 /* 4367 * Partially inlined packet data WQE, we have 4368 * some space in title WQEBB, we can fill it 4369 * with some packet data. It takes one WQEBB, 4370 * it is available, no extra space check: 4371 * - Control Segment, SEND opcode 4372 * - Ethernet Segment, no VLAN insertion 4373 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data 4374 * - Data Segment, pointer type 4375 * 4376 * We also get here if VLAN insertion is not 4377 * supported by HW, the inline is enabled. 
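 * With ds = 4 the whole WQE (Control Segment, Ethernet Segment
 * carrying the minimal inline data and the trailing pointer Data
 * Segment) fits into a single WQEBB, hence the unconditional
 * ++txq->wqe_ci below.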
4378 */ 4379 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4380 loc->wqe_last = wqe; 4381 mlx5_tx_cseg_init(txq, loc, wqe, 4, 4382 MLX5_OPCODE_SEND, olx); 4383 mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx); 4384 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + 4385 MLX5_ESEG_MIN_INLINE_SIZE - vlan; 4386 /* 4387 * The length check is performed above, by 4388 * comparing with txq->inlen_send. We should 4389 * not get overflow here. 4390 */ 4391 assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE); 4392 dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE; 4393 mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1], 4394 dptr, dlen, olx); 4395 ++txq->wqe_ci; 4396 --loc->wqe_free; 4397 /* We have to store mbuf in elts.*/ 4398 assert(MLX5_TXOFF_CONFIG(INLINE)); 4399 txq->elts[txq->elts_head++ & txq->elts_m] = 4400 loc->mbuf; 4401 --loc->elts_free; 4402 } 4403 #ifdef MLX5_PMD_SOFT_COUNTERS 4404 /* Update sent data bytes counter. */ 4405 txq->stats.obytes += vlan + 4406 rte_pktmbuf_data_len(loc->mbuf); 4407 #endif 4408 } else { 4409 /* 4410 * No inline at all, it means the CPU cycles saving 4411 * is prioritized at configuration, we should not 4412 * copy any packet data to WQE. 4413 * 4414 * SEND WQE, one WQEBB: 4415 * - Control Segment, SEND opcode 4416 * - Ethernet Segment, optional VLAN, no inline 4417 * - Data Segment, pointer type 4418 */ 4419 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); 4420 loc->wqe_last = wqe; 4421 mlx5_tx_cseg_init(txq, loc, wqe, 3, 4422 MLX5_OPCODE_SEND, olx); 4423 mlx5_tx_eseg_none(txq, loc, wqe, olx); 4424 mlx5_tx_dseg_ptr 4425 (txq, loc, &wqe->dseg[0], 4426 rte_pktmbuf_mtod(loc->mbuf, uint8_t *), 4427 rte_pktmbuf_data_len(loc->mbuf), olx); 4428 ++txq->wqe_ci; 4429 --loc->wqe_free; 4430 /* 4431 * We should not store mbuf pointer in elts 4432 * if no inlining is configured, this is done 4433 * by calling routine in a batch copy. 4434 */ 4435 assert(!MLX5_TXOFF_CONFIG(INLINE)); 4436 --loc->elts_free; 4437 #ifdef MLX5_PMD_SOFT_COUNTERS 4438 /* Update sent data bytes counter. */ 4439 txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf); 4440 if (MLX5_TXOFF_CONFIG(VLAN) && 4441 loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) 4442 txq->stats.obytes += 4443 sizeof(struct rte_vlan_hdr); 4444 #endif 4445 } 4446 ++loc->pkts_sent; 4447 --pkts_n; 4448 /* Request CQE generation if limits are reached. */ 4449 mlx5_tx_request_completion(txq, loc, false, olx); 4450 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) 4451 return MLX5_TXCMP_CODE_EXIT; 4452 loc->mbuf = *pkts++; 4453 if (pkts_n > 1) 4454 rte_prefetch0(*pkts); 4455 ret = mlx5_tx_able_to_empw(txq, loc, olx, true); 4456 if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE)) 4457 return ret; 4458 } 4459 assert(false); 4460 } 4461 4462 static __rte_always_inline enum mlx5_txcmp_code 4463 mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq, 4464 struct rte_mbuf **restrict pkts, 4465 unsigned int pkts_n, 4466 struct mlx5_txq_local *restrict loc, 4467 unsigned int olx) 4468 { 4469 enum mlx5_txcmp_code ret; 4470 4471 ret = mlx5_tx_able_to_empw(txq, loc, olx, false); 4472 if (ret == MLX5_TXCMP_CODE_SINGLE) 4473 goto ordinary_send; 4474 assert(ret == MLX5_TXCMP_CODE_EMPW); 4475 for (;;) { 4476 /* Optimize for inline/no inline eMPW send. */ 4477 ret = (MLX5_TXOFF_CONFIG(INLINE)) ? 4478 mlx5_tx_burst_empw_inline 4479 (txq, pkts, pkts_n, loc, olx) : 4480 mlx5_tx_burst_empw_simple 4481 (txq, pkts, pkts_n, loc, olx); 4482 if (ret != MLX5_TXCMP_CODE_SINGLE) 4483 return ret; 4484 /* The resources to send one packet should remain. 
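 * The eMPW subroutines return MLX5_TXCMP_CODE_SINGLE only after
 * verifying that at least one free elt and one free WQE remain,
 * otherwise they return MLX5_TXCMP_CODE_EXIT.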
 */
		assert(loc->elts_free && loc->wqe_free);
ordinary_send:
		ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
		assert(ret != MLX5_TXCMP_CODE_SINGLE);
		if (ret != MLX5_TXCMP_CODE_EMPW)
			return ret;
		/* The resources to send one packet should remain. */
		assert(loc->elts_free && loc->wqe_free);
	}
}

/**
 * DPDK Tx callback template. This is a configured template
 * used to generate routines optimized for the specified
 * offload setup. One of these generated functions is chosen
 * at SQ configuration time.
 *
 * @param txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 * @param olx
 *   Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
 *   values. Should be a compile-time constant to take advantage of the
 *   static configuration.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
static __rte_always_inline uint16_t
mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
		   struct rte_mbuf **restrict pkts,
		   uint16_t pkts_n,
		   unsigned int olx)
{
	struct mlx5_txq_local loc;
	enum mlx5_txcmp_code ret;
	unsigned int part;

	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
	if (unlikely(!pkts_n))
		return 0;
	loc.pkts_sent = 0;
	loc.pkts_copy = 0;
	loc.wqe_last = NULL;

send_loop:
	loc.pkts_loop = loc.pkts_sent;
	/*
	 * Check if there are some CQEs, if any:
	 * - process encountered errors
	 * - process the completed WQEs
	 * - free related mbufs
	 * - doorbell the NIC about processed CQEs
	 */
	rte_prefetch0(*(pkts + loc.pkts_sent));
	mlx5_tx_handle_completion(txq, olx);
	/*
	 * Calculate the number of available resources - elts and WQEs.
	 * There are two possible different scenarios:
	 * - no data inlining into WQEs, one WQEBB may contain up to
	 *   four packets, in this case elts become the scarce resource
	 * - data inlining into WQEs, one packet may require multiple
	 *   WQEBBs, the WQEs become the limiting factor.
	 */
	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
	loc.elts_free = txq->elts_s -
			(uint16_t)(txq->elts_head - txq->elts_tail);
	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
	loc.wqe_free = txq->wqe_s -
			(uint16_t)(txq->wqe_ci - txq->wqe_pi);
	if (unlikely(!loc.elts_free || !loc.wqe_free))
		return loc.pkts_sent;
	for (;;) {
		/*
		 * Fetch the packet from the array. Usually this is
		 * the first packet in a series of multi/single
		 * segment packets.
		 */
		loc.mbuf = *(pkts + loc.pkts_sent);
		/* Dedicated branch for multi-segment packets. */
		if (MLX5_TXOFF_CONFIG(MULTI) &&
		    unlikely(NB_SEGS(loc.mbuf) > 1)) {
			/*
			 * Multi-segment packet encountered.
			 * Hardware is able to process it only
			 * with SEND/TSO opcodes, one packet
			 * per WQE, do it in a dedicated routine.
			 */
enter_send_multi:
			assert(loc.pkts_sent >= loc.pkts_copy);
			part = loc.pkts_sent - loc.pkts_copy;
			if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
				/*
				 * There are some single-segment mbufs not
				 * stored in elts.
The mbufs must be in the 4584 * same order as WQEs, so we must copy the 4585 * mbufs to elts here, before the coming 4586 * multi-segment packet mbufs is appended. 4587 */ 4588 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, 4589 part, olx); 4590 loc.pkts_copy = loc.pkts_sent; 4591 } 4592 assert(pkts_n > loc.pkts_sent); 4593 ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx); 4594 if (!MLX5_TXOFF_CONFIG(INLINE)) 4595 loc.pkts_copy = loc.pkts_sent; 4596 /* 4597 * These returned code checks are supposed 4598 * to be optimized out due to routine inlining. 4599 */ 4600 if (ret == MLX5_TXCMP_CODE_EXIT) { 4601 /* 4602 * The routine returns this code when 4603 * all packets are sent or there is no 4604 * enough resources to complete request. 4605 */ 4606 break; 4607 } 4608 if (ret == MLX5_TXCMP_CODE_ERROR) { 4609 /* 4610 * The routine returns this code when 4611 * some error in the incoming packets 4612 * format occurred. 4613 */ 4614 txq->stats.oerrors++; 4615 break; 4616 } 4617 if (ret == MLX5_TXCMP_CODE_SINGLE) { 4618 /* 4619 * The single-segment packet was encountered 4620 * in the array, try to send it with the 4621 * best optimized way, possible engaging eMPW. 4622 */ 4623 goto enter_send_single; 4624 } 4625 if (MLX5_TXOFF_CONFIG(TSO) && 4626 ret == MLX5_TXCMP_CODE_TSO) { 4627 /* 4628 * The single-segment TSO packet was 4629 * encountered in the array. 4630 */ 4631 goto enter_send_tso; 4632 } 4633 /* We must not get here. Something is going wrong. */ 4634 assert(false); 4635 txq->stats.oerrors++; 4636 break; 4637 } 4638 /* Dedicated branch for single-segment TSO packets. */ 4639 if (MLX5_TXOFF_CONFIG(TSO) && 4640 unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) { 4641 /* 4642 * TSO might require special way for inlining 4643 * (dedicated parameters) and is sent with 4644 * MLX5_OPCODE_TSO opcode only, provide this 4645 * in dedicated branch. 4646 */ 4647 enter_send_tso: 4648 assert(NB_SEGS(loc.mbuf) == 1); 4649 assert(pkts_n > loc.pkts_sent); 4650 ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx); 4651 /* 4652 * These returned code checks are supposed 4653 * to be optimized out due to routine inlining. 4654 */ 4655 if (ret == MLX5_TXCMP_CODE_EXIT) 4656 break; 4657 if (ret == MLX5_TXCMP_CODE_ERROR) { 4658 txq->stats.oerrors++; 4659 break; 4660 } 4661 if (ret == MLX5_TXCMP_CODE_SINGLE) 4662 goto enter_send_single; 4663 if (MLX5_TXOFF_CONFIG(MULTI) && 4664 ret == MLX5_TXCMP_CODE_MULTI) { 4665 /* 4666 * The multi-segment packet was 4667 * encountered in the array. 4668 */ 4669 goto enter_send_multi; 4670 } 4671 /* We must not get here. Something is going wrong. */ 4672 assert(false); 4673 txq->stats.oerrors++; 4674 break; 4675 } 4676 /* 4677 * The dedicated branch for the single-segment packets 4678 * without TSO. Often these ones can be sent using 4679 * MLX5_OPCODE_EMPW with multiple packets in one WQE. 4680 * The routine builds the WQEs till it encounters 4681 * the TSO or multi-segment packet (in case if these 4682 * offloads are requested at SQ configuration time). 4683 */ 4684 enter_send_single: 4685 assert(pkts_n > loc.pkts_sent); 4686 ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx); 4687 /* 4688 * These returned code checks are supposed 4689 * to be optimized out due to routine inlining. 
4690 */ 4691 if (ret == MLX5_TXCMP_CODE_EXIT) 4692 break; 4693 if (ret == MLX5_TXCMP_CODE_ERROR) { 4694 txq->stats.oerrors++; 4695 break; 4696 } 4697 if (MLX5_TXOFF_CONFIG(MULTI) && 4698 ret == MLX5_TXCMP_CODE_MULTI) { 4699 /* 4700 * The multi-segment packet was 4701 * encountered in the array. 4702 */ 4703 goto enter_send_multi; 4704 } 4705 if (MLX5_TXOFF_CONFIG(TSO) && 4706 ret == MLX5_TXCMP_CODE_TSO) { 4707 /* 4708 * The single-segment TSO packet was 4709 * encountered in the array. 4710 */ 4711 goto enter_send_tso; 4712 } 4713 /* We must not get here. Something is going wrong. */ 4714 assert(false); 4715 txq->stats.oerrors++; 4716 break; 4717 } 4718 /* 4719 * Main Tx loop is completed, do the rest: 4720 * - set completion request if thresholds are reached 4721 * - doorbell the hardware 4722 * - copy the rest of mbufs to elts (if any) 4723 */ 4724 assert(MLX5_TXOFF_CONFIG(INLINE) || loc.pkts_sent >= loc.pkts_copy); 4725 /* Take a shortcut if nothing is sent. */ 4726 if (unlikely(loc.pkts_sent == loc.pkts_loop)) 4727 return loc.pkts_sent; 4728 /* 4729 * Ring QP doorbell immediately after WQE building completion 4730 * to improve latencies. The pure software related data treatment 4731 * can be completed after doorbell. Tx CQEs for this SQ are 4732 * processed in this thread only by the polling. 4733 */ 4734 mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, 0); 4735 /* Not all of the mbufs may be stored into elts yet. */ 4736 part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy; 4737 if (!MLX5_TXOFF_CONFIG(INLINE) && part) { 4738 /* 4739 * There are some single-segment mbufs not stored in elts. 4740 * It can be only if the last packet was single-segment. 4741 * The copying is gathered into one place due to it is 4742 * a good opportunity to optimize that with SIMD. 4743 * Unfortunately if inlining is enabled the gaps in 4744 * pointer array may happen due to early freeing of the 4745 * inlined mbufs. 4746 */ 4747 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx); 4748 loc.pkts_copy = loc.pkts_sent; 4749 } 4750 #ifdef MLX5_PMD_SOFT_COUNTERS 4751 /* Increment sent packets counter. */ 4752 txq->stats.opackets += loc.pkts_sent; 4753 #endif 4754 assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); 4755 assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); 4756 if (pkts_n > loc.pkts_sent) { 4757 /* 4758 * If burst size is large there might be no enough CQE 4759 * fetched from completion queue and no enough resources 4760 * freed to send all the packets. 4761 */ 4762 goto send_loop; 4763 } 4764 return loc.pkts_sent; 4765 } 4766 4767 /* Generate routines with Enhanced Multi-Packet Write support. 
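 * Each MLX5_TXOFF_DECL() below instantiates mlx5_tx_burst_tmpl()
 * with a compile-time constant offload mask, so the template
 * branches not needed for that mask are optimized away in the
 * generated routine.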
*/ 4768 MLX5_TXOFF_DECL(full_empw, 4769 MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW) 4770 4771 MLX5_TXOFF_DECL(none_empw, 4772 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 4773 4774 MLX5_TXOFF_DECL(md_empw, 4775 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4776 4777 MLX5_TXOFF_DECL(mt_empw, 4778 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4779 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4780 4781 MLX5_TXOFF_DECL(mtsc_empw, 4782 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4783 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4784 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4785 4786 MLX5_TXOFF_DECL(mti_empw, 4787 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4788 MLX5_TXOFF_CONFIG_INLINE | 4789 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4790 4791 MLX5_TXOFF_DECL(mtv_empw, 4792 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4793 MLX5_TXOFF_CONFIG_VLAN | 4794 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4795 4796 MLX5_TXOFF_DECL(mtiv_empw, 4797 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4798 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4799 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4800 4801 MLX5_TXOFF_DECL(sc_empw, 4802 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4803 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4804 4805 MLX5_TXOFF_DECL(sci_empw, 4806 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4807 MLX5_TXOFF_CONFIG_INLINE | 4808 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4809 4810 MLX5_TXOFF_DECL(scv_empw, 4811 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4812 MLX5_TXOFF_CONFIG_VLAN | 4813 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4814 4815 MLX5_TXOFF_DECL(sciv_empw, 4816 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4817 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4818 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4819 4820 MLX5_TXOFF_DECL(i_empw, 4821 MLX5_TXOFF_CONFIG_INLINE | 4822 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4823 4824 MLX5_TXOFF_DECL(v_empw, 4825 MLX5_TXOFF_CONFIG_VLAN | 4826 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4827 4828 MLX5_TXOFF_DECL(iv_empw, 4829 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4830 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4831 4832 /* Generate routines without Enhanced Multi-Packet Write support. 
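 * These variants are selected when the device is not configured
 * for Enhanced MPW or when a minimal inline data requirement
 * prevents it, see the MLX5_TXOFF_CONFIG_EMPW handling in
 * mlx5_select_tx_function() below.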
*/ 4833 MLX5_TXOFF_DECL(full, 4834 MLX5_TXOFF_CONFIG_FULL) 4835 4836 MLX5_TXOFF_DECL(none, 4837 MLX5_TXOFF_CONFIG_NONE) 4838 4839 MLX5_TXOFF_DECL(md, 4840 MLX5_TXOFF_CONFIG_METADATA) 4841 4842 MLX5_TXOFF_DECL(mt, 4843 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4844 MLX5_TXOFF_CONFIG_METADATA) 4845 4846 MLX5_TXOFF_DECL(mtsc, 4847 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4848 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4849 MLX5_TXOFF_CONFIG_METADATA) 4850 4851 MLX5_TXOFF_DECL(mti, 4852 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4853 MLX5_TXOFF_CONFIG_INLINE | 4854 MLX5_TXOFF_CONFIG_METADATA) 4855 4856 4857 MLX5_TXOFF_DECL(mtv, 4858 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4859 MLX5_TXOFF_CONFIG_VLAN | 4860 MLX5_TXOFF_CONFIG_METADATA) 4861 4862 4863 MLX5_TXOFF_DECL(mtiv, 4864 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4865 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4866 MLX5_TXOFF_CONFIG_METADATA) 4867 4868 MLX5_TXOFF_DECL(sc, 4869 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4870 MLX5_TXOFF_CONFIG_METADATA) 4871 4872 MLX5_TXOFF_DECL(sci, 4873 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4874 MLX5_TXOFF_CONFIG_INLINE | 4875 MLX5_TXOFF_CONFIG_METADATA) 4876 4877 4878 MLX5_TXOFF_DECL(scv, 4879 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4880 MLX5_TXOFF_CONFIG_VLAN | 4881 MLX5_TXOFF_CONFIG_METADATA) 4882 4883 4884 MLX5_TXOFF_DECL(sciv, 4885 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4886 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4887 MLX5_TXOFF_CONFIG_METADATA) 4888 4889 MLX5_TXOFF_DECL(i, 4890 MLX5_TXOFF_CONFIG_INLINE | 4891 MLX5_TXOFF_CONFIG_METADATA) 4892 4893 MLX5_TXOFF_DECL(v, 4894 MLX5_TXOFF_CONFIG_VLAN | 4895 MLX5_TXOFF_CONFIG_METADATA) 4896 4897 MLX5_TXOFF_DECL(iv, 4898 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4899 MLX5_TXOFF_CONFIG_METADATA) 4900 4901 /* 4902 * Array of declared and compiled Tx burst function and corresponding 4903 * supported offloads set. The array is used to select the Tx burst 4904 * function for specified offloads set at Tx queue configuration time. 
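 * Each entry pairs a generated routine with the exact offload mask
 * it was compiled for; mlx5_select_tx_function() scans this table
 * for the entry covering the requested offloads with the least
 * amount of extra, not requested ones.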
4905 */ 4906 const struct { 4907 eth_tx_burst_t func; 4908 unsigned int olx; 4909 } txoff_func[] = { 4910 MLX5_TXOFF_INFO(full_empw, 4911 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4912 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4913 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4914 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4915 4916 MLX5_TXOFF_INFO(none_empw, 4917 MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) 4918 4919 MLX5_TXOFF_INFO(md_empw, 4920 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4921 4922 MLX5_TXOFF_INFO(mt_empw, 4923 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4924 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4925 4926 MLX5_TXOFF_INFO(mtsc_empw, 4927 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4928 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4929 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4930 4931 MLX5_TXOFF_INFO(mti_empw, 4932 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4933 MLX5_TXOFF_CONFIG_INLINE | 4934 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4935 4936 MLX5_TXOFF_INFO(mtv_empw, 4937 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4938 MLX5_TXOFF_CONFIG_VLAN | 4939 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4940 4941 MLX5_TXOFF_INFO(mtiv_empw, 4942 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4943 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4944 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4945 4946 MLX5_TXOFF_INFO(sc_empw, 4947 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4948 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4949 4950 MLX5_TXOFF_INFO(sci_empw, 4951 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4952 MLX5_TXOFF_CONFIG_INLINE | 4953 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4954 4955 MLX5_TXOFF_INFO(scv_empw, 4956 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4957 MLX5_TXOFF_CONFIG_VLAN | 4958 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4959 4960 MLX5_TXOFF_INFO(sciv_empw, 4961 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4962 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4963 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4964 4965 MLX5_TXOFF_INFO(i_empw, 4966 MLX5_TXOFF_CONFIG_INLINE | 4967 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4968 4969 MLX5_TXOFF_INFO(v_empw, 4970 MLX5_TXOFF_CONFIG_VLAN | 4971 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4972 4973 MLX5_TXOFF_INFO(iv_empw, 4974 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4975 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) 4976 4977 MLX5_TXOFF_INFO(full, 4978 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4979 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4980 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 4981 MLX5_TXOFF_CONFIG_METADATA) 4982 4983 MLX5_TXOFF_INFO(none, 4984 MLX5_TXOFF_CONFIG_NONE) 4985 4986 MLX5_TXOFF_INFO(md, 4987 MLX5_TXOFF_CONFIG_METADATA) 4988 4989 MLX5_TXOFF_INFO(mt, 4990 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4991 MLX5_TXOFF_CONFIG_METADATA) 4992 4993 MLX5_TXOFF_INFO(mtsc, 4994 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 4995 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 4996 MLX5_TXOFF_CONFIG_METADATA) 4997 4998 MLX5_TXOFF_INFO(mti, 4999 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5000 MLX5_TXOFF_CONFIG_INLINE | 5001 MLX5_TXOFF_CONFIG_METADATA) 5002 5003 5004 MLX5_TXOFF_INFO(mtv, 5005 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5006 MLX5_TXOFF_CONFIG_VLAN | 5007 MLX5_TXOFF_CONFIG_METADATA) 5008 5009 MLX5_TXOFF_INFO(mtiv, 
5010 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | 5011 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5012 MLX5_TXOFF_CONFIG_METADATA) 5013 5014 MLX5_TXOFF_INFO(sc, 5015 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5016 MLX5_TXOFF_CONFIG_METADATA) 5017 5018 MLX5_TXOFF_INFO(sci, 5019 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5020 MLX5_TXOFF_CONFIG_INLINE | 5021 MLX5_TXOFF_CONFIG_METADATA) 5022 5023 MLX5_TXOFF_INFO(scv, 5024 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5025 MLX5_TXOFF_CONFIG_VLAN | 5026 MLX5_TXOFF_CONFIG_METADATA) 5027 5028 MLX5_TXOFF_INFO(sciv, 5029 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | 5030 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5031 MLX5_TXOFF_CONFIG_METADATA) 5032 5033 MLX5_TXOFF_INFO(i, 5034 MLX5_TXOFF_CONFIG_INLINE | 5035 MLX5_TXOFF_CONFIG_METADATA) 5036 5037 MLX5_TXOFF_INFO(v, 5038 MLX5_TXOFF_CONFIG_VLAN | 5039 MLX5_TXOFF_CONFIG_METADATA) 5040 5041 MLX5_TXOFF_INFO(iv, 5042 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | 5043 MLX5_TXOFF_CONFIG_METADATA) 5044 }; 5045 5046 /** 5047 * Configure the Tx function to use. The routine checks configured 5048 * Tx offloads for the device and selects appropriate Tx burst 5049 * routine. There are multiple Tx burst routines compiled from 5050 * the same template in the most optimal way for the dedicated 5051 * Tx offloads set. 5052 * 5053 * @param dev 5054 * Pointer to private data structure. 5055 * 5056 * @return 5057 * Pointer to selected Tx burst function. 5058 */ 5059 eth_tx_burst_t 5060 mlx5_select_tx_function(struct rte_eth_dev *dev) 5061 { 5062 struct mlx5_priv *priv = dev->data->dev_private; 5063 struct mlx5_dev_config *config = &priv->config; 5064 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 5065 unsigned int diff = 0, olx = 0, i, m; 5066 5067 static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <= 5068 MLX5_DSEG_MAX, "invalid WQE max size"); 5069 static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE, 5070 "invalid WQE Control Segment size"); 5071 static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE, 5072 "invalid WQE Ethernet Segment size"); 5073 static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE, 5074 "invalid WQE Data Segment size"); 5075 static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE, 5076 "invalid WQE size"); 5077 assert(priv); 5078 if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) { 5079 /* We should support Multi-Segment Packets. */ 5080 olx |= MLX5_TXOFF_CONFIG_MULTI; 5081 } 5082 if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | 5083 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 5084 DEV_TX_OFFLOAD_GRE_TNL_TSO | 5085 DEV_TX_OFFLOAD_IP_TNL_TSO | 5086 DEV_TX_OFFLOAD_UDP_TNL_TSO)) { 5087 /* We should support TCP Send Offload. */ 5088 olx |= MLX5_TXOFF_CONFIG_TSO; 5089 } 5090 if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | 5091 DEV_TX_OFFLOAD_UDP_TNL_TSO | 5092 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { 5093 /* We should support Software Parser for Tunnels. */ 5094 olx |= MLX5_TXOFF_CONFIG_SWP; 5095 } 5096 if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM | 5097 DEV_TX_OFFLOAD_UDP_CKSUM | 5098 DEV_TX_OFFLOAD_TCP_CKSUM | 5099 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { 5100 /* We should support IP/TCP/UDP Checksums. */ 5101 olx |= MLX5_TXOFF_CONFIG_CSUM; 5102 } 5103 if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) { 5104 /* We should support VLAN insertion. */ 5105 olx |= MLX5_TXOFF_CONFIG_VLAN; 5106 } 5107 if (priv->txqs_n && (*priv->txqs)[0]) { 5108 struct mlx5_txq_data *txd = (*priv->txqs)[0]; 5109 5110 if (txd->inlen_send) { 5111 /* 5112 * Check the data inline requirements. 
Data inline 5113 * is enabled on per device basis, we can check 5114 * the first Tx queue only. 5115 * 5116 * If device does not support VLAN insertion in WQE 5117 * and some queues are requested to perform VLAN 5118 * insertion offload than inline must be enabled. 5119 */ 5120 olx |= MLX5_TXOFF_CONFIG_INLINE; 5121 } 5122 } 5123 if (config->mps == MLX5_MPW_ENHANCED && 5124 config->txq_inline_min <= 0) { 5125 /* 5126 * The NIC supports Enhanced Multi-Packet Write. 5127 * We do not support legacy MPW due to its 5128 * hardware related problems, so we just ignore 5129 * legacy MLX5_MPW settings. There should be no 5130 * minimal required inline data. 5131 */ 5132 olx |= MLX5_TXOFF_CONFIG_EMPW; 5133 } 5134 if (tx_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) { 5135 /* We should support Flow metadata. */ 5136 olx |= MLX5_TXOFF_CONFIG_METADATA; 5137 } 5138 /* 5139 * Scan the routines table to find the minimal 5140 * satisfying routine with requested offloads. 5141 */ 5142 m = RTE_DIM(txoff_func); 5143 for (i = 0; i < RTE_DIM(txoff_func); i++) { 5144 unsigned int tmp; 5145 5146 tmp = txoff_func[i].olx; 5147 if (tmp == olx) { 5148 /* Meets requested offloads exactly.*/ 5149 m = i; 5150 break; 5151 } 5152 if ((tmp & olx) != olx) { 5153 /* Does not meet requested offloads at all. */ 5154 continue; 5155 } 5156 if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW) 5157 /* Do not enable eMPW if not configured. */ 5158 continue; 5159 if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE) 5160 /* Do not enable inlining if not configured. */ 5161 continue; 5162 /* 5163 * Some routine meets the requirements. 5164 * Check whether it has minimal amount 5165 * of not requested offloads. 5166 */ 5167 tmp = __builtin_popcountl(tmp & ~olx); 5168 if (m >= RTE_DIM(txoff_func) || tmp < diff) { 5169 /* First or better match, save and continue. */ 5170 m = i; 5171 diff = tmp; 5172 continue; 5173 } 5174 if (tmp == diff) { 5175 tmp = txoff_func[i].olx ^ txoff_func[m].olx; 5176 if (__builtin_ffsl(txoff_func[i].olx & ~tmp) < 5177 __builtin_ffsl(txoff_func[m].olx & ~tmp)) { 5178 /* Lighter not requested offload. */ 5179 m = i; 5180 } 5181 } 5182 } 5183 if (m >= RTE_DIM(txoff_func)) { 5184 DRV_LOG(DEBUG, "port %u has no selected Tx function" 5185 " for requested offloads %04X", 5186 dev->data->port_id, olx); 5187 return NULL; 5188 } 5189 DRV_LOG(DEBUG, "port %u has selected Tx function" 5190 " supporting offloads %04X/%04X", 5191 dev->data->port_id, olx, txoff_func[m].olx); 5192 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI) 5193 DRV_LOG(DEBUG, "\tMULTI (multi segment)"); 5194 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO) 5195 DRV_LOG(DEBUG, "\tTSO (TCP send offload)"); 5196 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP) 5197 DRV_LOG(DEBUG, "\tSWP (software parser)"); 5198 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM) 5199 DRV_LOG(DEBUG, "\tCSUM (checksum offload)"); 5200 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE) 5201 DRV_LOG(DEBUG, "\tINLIN (inline data)"); 5202 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN) 5203 DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)"); 5204 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA) 5205 DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)"); 5206 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) 5207 DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)"); 5208 return txoff_func[m].func; 5209 } 5210
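/*
 * Illustrative example of the selection above (a walk-through of the
 * logic, not a definitive reference): if the application requests
 * DEV_TX_OFFLOAD_MULTI_SEGS, DEV_TX_OFFLOAD_TCP_TSO and
 * DEV_TX_OFFLOAD_MATCH_METADATA on a device configured for Enhanced
 * MPW with no minimal inline requirement and no data inlining, the
 * routine builds olx = MULTI | TSO | METADATA | EMPW and finds the
 * exact match "mt_empw" in txoff_func[]. The returned pointer is then
 * installed by the caller as the device Tx burst callback
 * (e.g. dev->tx_pkt_burst).
 */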