/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2017 6WIND S.A.
 * Copyright 2017 Mellanox Technologies, Ltd
 */

/**
 * @file
 * Data plane functions for mlx4 driver.
 */

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Verbs headers do not support -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_branch_prediction.h>
#include <rte_common.h>
#include <rte_io.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

#include "mlx4.h"
#include "mlx4_prm.h"
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"

/**
 * Pointer-value pair structure used in tx_post_send for saving the first
 * DWORD (32-bit) of a TXBB.
 */
struct pv {
	union {
		volatile struct mlx4_wqe_data_seg *dseg;
		volatile uint32_t *dst;
	};
	uint32_t val;
};

/** A helper structure for TSO packet handling. */
struct tso_info {
	/** Pointer to the array of saved first DWORD (32-bit) of a TXBB. */
	struct pv *pv;
	/** Current entry in the pv array. */
	int pv_counter;
	/** Total size of the WQE including padding. */
	uint32_t wqe_size;
	/** Size of TSO header to prepend to each packet to send. */
	uint16_t tso_header_size;
	/** Total size of the TSO segment in the WQE. */
	uint16_t wqe_tso_seg_size;
	/** Raw WQE size in units of 16 Bytes and without padding. */
	uint8_t fence_size;
};

/** A table to translate Rx completion flags to packet type. */
uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
	/*
	 * The index to the array should have:
	 *  bit[7] - MLX4_CQE_L2_TUNNEL
	 *  bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
	 *  bit[5] - MLX4_CQE_STATUS_UDP
	 *  bit[4] - MLX4_CQE_STATUS_TCP
	 *  bit[3] - MLX4_CQE_STATUS_IPV4OPT
	 *  bit[2] - MLX4_CQE_STATUS_IPV6
	 *  bit[1] - MLX4_CQE_STATUS_IPF
	 *  bit[0] - MLX4_CQE_STATUS_IPV4
	 * giving a total of up to 256 entries.
	 */
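	/*
	 * For example, index 0x11 = MLX4_CQE_STATUS_TCP | MLX4_CQE_STATUS_IPV4
	 * resolves below to RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
	 * RTE_PTYPE_L4_TCP.
	 */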
	/* L2 */
	[0x00] = RTE_PTYPE_L2_ETHER,
	/* L3 */
	[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_L4_NONFRAG,
	[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_L4_FRAG,
	[0x03] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_L4_FRAG,
	[0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_L4_NONFRAG,
	[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_L4_FRAG,
	[0x08] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_NONFRAG,
	[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_NONFRAG,
	[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_FRAG,
	[0x0b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_FRAG,
	/* TCP */
	[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_L4_TCP,
	[0x14] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_L4_TCP,
	[0x16] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_L4_FRAG,
	[0x18] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_TCP,
	[0x19] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_TCP,
	/* UDP */
	[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_L4_UDP,
	[0x24] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_L4_UDP,
	[0x26] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_L4_FRAG,
	[0x28] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_UDP,
	[0x29] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_UDP,
	/* Tunneled - L3 IPV6 */
	[0x80] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
	[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0x83] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0x84] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0x88] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0x8b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_FRAG,
	/* Tunneled - L3 IPV6, TCP */
	[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_TCP,
	[0x94] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_TCP,
	[0x96] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0x98] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_TCP,
	[0x99] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP,
	/* Tunneled - L3 IPV6, UDP */
	[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_UDP,
	[0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_UDP,
	[0xa6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_UDP,
	[0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_UDP,
	/* Tunneled - L3 IPV4 */
	[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
	[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xc3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xc4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xc8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xcb] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_FRAG,
	/* Tunneled - L3 IPV4, TCP */
	[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_TCP,
	[0xd4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_TCP,
	[0xd6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xd8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_TCP,
	[0xd9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_TCP,
	/* Tunneled - L3 IPV4, UDP */
	[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_UDP,
	[0xe4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_UDP,
	[0xe6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xe8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_UDP,
	[0xe9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_UDP,
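	/* Indexes not listed above are left at 0, i.e. RTE_PTYPE_UNKNOWN. */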
};

/**
 * Stamp TXBB burst so it won't be reused by the HW.
 *
 * Routine is used when freeing a WQE used by the chip, or when building
 * a WQ entry has failed, leaving partial information on the queue.
 *
 * @param sq
 *   Pointer to the SQ structure.
 * @param start
 *   Pointer to the first TXBB to stamp.
 * @param end
 *   Pointer to the TXBB following the last one to stamp (exclusive end).
 *
 * @return
 *   Stamping burst size in byte units.
 */
static uint32_t
mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, volatile uint32_t *start,
			 volatile uint32_t *end)
{
	uint32_t stamp = sq->stamp;
	int32_t size = (intptr_t)end - (intptr_t)start;

	MLX4_ASSERT(start != end);
	/* Handle SQ ring wraparound. */
	if (size < 0) {
		size = (int32_t)sq->size + size;
		do {
			*start = stamp;
			start += MLX4_SQ_STAMP_DWORDS;
		} while (start != (volatile uint32_t *)sq->eob);
		start = (volatile uint32_t *)sq->buf;
		/* Flip invalid stamping ownership. */
		stamp ^= RTE_BE32(1u << MLX4_SQ_OWNER_BIT);
		sq->stamp = stamp;
		if (start == end)
			return size;
	}
	do {
		*start = stamp;
		start += MLX4_SQ_STAMP_DWORDS;
	} while (start != end);
	return (uint32_t)size;
}

/**
 * Manage Tx completions.
 *
 * When sending a burst, mlx4_tx_burst() posts several WRs.
 * To improve performance, a completion event is only required once every
 * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
 * for other WRs, but this information would not be used anyway.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 * @param elts_m
 *   Tx elements number mask.
 * @param sq
 *   Pointer to the SQ structure.
 */
static void
mlx4_txq_complete(struct txq *txq, const unsigned int elts_m,
		  struct mlx4_sq *sq)
{
	unsigned int elts_tail = txq->elts_tail;
	struct mlx4_cq *cq = &txq->mcq;
	volatile struct mlx4_cqe *cqe;
	uint32_t completed;
	uint32_t cons_index = cq->cons_index;
	volatile uint32_t *first_txbb;

	/*
	 * Traverse over all CQ entries reported and handle each WQ entry
	 * reported by them.
	 */
	do {
		cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
		if (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
			     !!(cons_index & cq->cqe_cnt)))
			break;
#ifdef RTE_LIBRTE_MLX4_DEBUG
		/*
		 * Make sure we read the CQE after we read the ownership bit.
		 */
		rte_io_rmb();
		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
			     MLX4_CQE_OPCODE_ERROR)) {
			volatile struct mlx4_err_cqe *cqe_err =
				(volatile struct mlx4_err_cqe *)cqe;
			ERROR("%p CQE error - vendor syndrome: 0x%x"
			      " syndrome: 0x%x\n",
			      (void *)txq, cqe_err->vendor_err,
			      cqe_err->syndrome);
			break;
		}
#endif /* RTE_LIBRTE_MLX4_DEBUG */
		cons_index++;
	} while (1);
	completed = (cons_index - cq->cons_index) * txq->elts_comp_cd_init;
	if (unlikely(!completed))
		return;
	/* First stamping address is the end of the last one. */
	first_txbb = (&(*txq->elts)[elts_tail & elts_m])->eocb;
	elts_tail += completed;
	/* The new tail element holds the end address. */
	sq->remain_size += mlx4_txq_stamp_freed_wqe(sq, first_txbb,
			(&(*txq->elts)[elts_tail & elts_m])->eocb);
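	/*
	 * At this point all TXBBs of the WQEs completed by the collected
	 * CQEs have been stamped in one pass, from the end of the previously
	 * completed burst up to the new tail, and their size returned to
	 * the send queue.
	 */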
	/* Update CQ consumer index. */
	cq->cons_index = cons_index;
	*cq->set_ci_db = rte_cpu_to_be_32(cons_index & MLX4_CQ_DB_CI_MASK);
	txq->elts_tail = elts_tail;
}

/**
 * Write Tx data segment to the SQ.
 *
 * @param dseg
 *   Pointer to data segment in SQ.
 * @param lkey
 *   Memory region lkey.
 * @param addr
 *   Data address.
 * @param byte_count
 *   Big-endian byte count of the data to send.
 */
static inline void
mlx4_fill_tx_data_seg(volatile struct mlx4_wqe_data_seg *dseg,
		      uint32_t lkey, uintptr_t addr, rte_be32_t byte_count)
{
	dseg->addr = rte_cpu_to_be_64(addr);
	dseg->lkey = lkey;
#if RTE_CACHE_LINE_SIZE < 64
	/*
	 * Need a barrier here before writing the byte_count
	 * fields to make sure that all the data is visible
	 * before the byte_count field is set.
	 * Otherwise, if the segment begins a new cacheline,
	 * the HCA prefetcher could grab the 64-byte chunk and
	 * get a valid (!= 0xffffffff) byte count but stale
	 * data, and end up sending the wrong data.
	 */
	rte_io_wmb();
#endif /* RTE_CACHE_LINE_SIZE */
	dseg->byte_count = byte_count;
}

/**
 * Obtain and calculate TSO information needed for assembling a TSO WQE.
 *
 * @param buf
 *   Pointer to the first packet mbuf.
 * @param txq
 *   Pointer to Tx queue structure.
 * @param tinfo
 *   Pointer to a structure to fill the info with.
 *
 * @return
 *   0 on success, negative value upon error.
 */
static inline int
mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
			     struct txq *txq,
			     struct tso_info *tinfo)
{
	struct mlx4_sq *sq = &txq->msq;
	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);

	tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
	if (tunneled)
		tinfo->tso_header_size +=
				buf->outer_l2_len + buf->outer_l3_len;
	if (unlikely(buf->tso_segsz == 0 ||
		     tinfo->tso_header_size == 0 ||
		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
		     tinfo->tso_header_size > buf->data_len))
		return -EINVAL;
	/*
	 * Calculate the WQE TSO segment size
	 * Note:
	 * 1. An LSO segment must be padded such that the subsequent data
	 *    segment is 16-byte aligned.
	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
	 */
	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
					    tinfo->tso_header_size,
					    sizeof(struct mlx4_wqe_data_seg));
	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
			     buf->nb_segs;
	tinfo->wqe_size =
		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
			  MLX4_TXBB_SIZE);
	/* Validate WQE size and WQE space in the send queue. */
	if (sq->remain_size < tinfo->wqe_size ||
	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
		return -ENOMEM;
	/* Init pv. */
	tinfo->pv = (struct pv *)txq->bounce_buf;
	tinfo->pv_counter = 0;
	return 0;
}
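
/*
 * Rough layout of the TSO WQE built from the parameters computed above
 * (one TXBB is 64 bytes, each box is a multiple of 16 bytes):
 *
 *   | ctrl seg (16 B) | LSO seg: mss_hdr_size + TSO header,  | data segs, |
 *   |                 | padded to 16 B (wqe_tso_seg_size)    | 16 B each  |
 *
 * fence_size counts these 16-byte units; wqe_size is the same length
 * padded up to a whole number of TXBBs.
 */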

/**
 * Fill the TSO WQE data segments with info on buffers to transmit.
 *
 * @param buf
 *   Pointer to the first packet mbuf.
 * @param txq
 *   Pointer to Tx queue structure.
 * @param tinfo
 *   Pointer to TSO info to use.
 * @param dseg
 *   Pointer to the first data segment in the TSO WQE.
 * @param ctrl
 *   Pointer to the control segment in the TSO WQE.
 *
 * @return
 *   Pointer to the next WQE control segment on success, NULL otherwise.
 */
static inline volatile struct mlx4_wqe_ctrl_seg *
mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
			     struct txq *txq,
			     struct tso_info *tinfo,
			     volatile struct mlx4_wqe_data_seg *dseg,
			     volatile struct mlx4_wqe_ctrl_seg *ctrl)
{
	uint32_t lkey;
	int nb_segs = buf->nb_segs;
	int nb_segs_txbb;
	struct mlx4_sq *sq = &txq->msq;
	struct rte_mbuf *sbuf = buf;
	struct pv *pv = tinfo->pv;
	int *pv_counter = &tinfo->pv_counter;
	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
			(volatile struct mlx4_wqe_ctrl_seg *)
				((volatile uint8_t *)ctrl + tinfo->wqe_size);
	uint16_t data_len = sbuf->data_len - tinfo->tso_header_size;
	uintptr_t data_addr = rte_pktmbuf_mtod_offset(sbuf, uintptr_t,
						      tinfo->tso_header_size);

	do {
		/* How many dseg entries do we have in the current TXBB? */
		nb_segs_txbb = (MLX4_TXBB_SIZE -
				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
			       MLX4_SEG_SHIFT;
		switch (nb_segs_txbb) {
#ifdef RTE_LIBRTE_MLX4_DEBUG
		default:
			/* Should never happen. */
			rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
				  (void *)txq, nb_segs_txbb);
			/* rte_panic never returns. */
			break;
#endif /* RTE_LIBRTE_MLX4_DEBUG */
		case 4:
			/* Memory region key for this memory pool. */
			lkey = mlx4_tx_mb2mr(txq, sbuf);
			if (unlikely(lkey == (uint32_t)-1))
				goto err;
			dseg->addr = rte_cpu_to_be_64(data_addr);
			dseg->lkey = lkey;
			/*
			 * This data segment starts at the beginning of a new
			 * TXBB, so we need to postpone its byte_count writing
			 * for later.
			 */
			pv[*pv_counter].dseg = dseg;
			/*
			 * Zero length segment is treated as inline segment
			 * with zero data.
			 */
			pv[(*pv_counter)++].val =
				rte_cpu_to_be_32(data_len ?
						 data_len :
						 0x80000000);
			if (--nb_segs == 0)
				return ctrl_next;
			/* Prepare next buf info */
			sbuf = sbuf->next;
			dseg++;
			data_len = sbuf->data_len;
			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
			/* fallthrough */
		case 3:
			lkey = mlx4_tx_mb2mr(txq, sbuf);
			if (unlikely(lkey == (uint32_t)-1))
				goto err;
			mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
					rte_cpu_to_be_32(data_len ?
							 data_len :
							 0x80000000));
			if (--nb_segs == 0)
				return ctrl_next;
			/* Prepare next buf info */
			sbuf = sbuf->next;
			dseg++;
			data_len = sbuf->data_len;
			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
			/* fallthrough */
		case 2:
			lkey = mlx4_tx_mb2mr(txq, sbuf);
			if (unlikely(lkey == (uint32_t)-1))
				goto err;
			mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
					rte_cpu_to_be_32(data_len ?
							 data_len :
							 0x80000000));
			if (--nb_segs == 0)
				return ctrl_next;
			/* Prepare next buf info */
			sbuf = sbuf->next;
			dseg++;
			data_len = sbuf->data_len;
			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
			/* fallthrough */
		case 1:
			lkey = mlx4_tx_mb2mr(txq, sbuf);
			if (unlikely(lkey == (uint32_t)-1))
				goto err;
			mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
					rte_cpu_to_be_32(data_len ?
							 data_len :
							 0x80000000));
			if (--nb_segs == 0)
				return ctrl_next;
			/* Prepare next buf info */
			sbuf = sbuf->next;
			dseg++;
			data_len = sbuf->data_len;
			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
			/* fallthrough */
		}
		/* Wrap dseg if it points at the end of the queue. */
		if ((volatile uint8_t *)dseg >= sq->eob)
			dseg = (volatile struct mlx4_wqe_data_seg *)
					((volatile uint8_t *)dseg - sq->size);
	} while (true);
err:
	return NULL;
}

/**
 * Fill the packet's L2, L3 and L4 headers into the WQE.
 *
 * This will be used as the header for each TSO segment that is transmitted.
 *
 * @param buf
 *   Pointer to the first packet mbuf.
 * @param txq
 *   Pointer to Tx queue structure.
 * @param tinfo
 *   Pointer to TSO info to use.
 * @param ctrl
 *   Pointer to the control segment in the TSO WQE.
 *
 * @return
 *   Pointer to the first data segment past the TSO header.
 */
static inline volatile struct mlx4_wqe_data_seg *
mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
			   struct txq *txq,
			   struct tso_info *tinfo,
			   volatile struct mlx4_wqe_ctrl_seg *ctrl)
{
	volatile struct mlx4_wqe_lso_seg *tseg =
		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
	struct mlx4_sq *sq = &txq->msq;
	struct pv *pv = tinfo->pv;
	int *pv_counter = &tinfo->pv_counter;
	int remain_size = tinfo->tso_header_size;
	char *from = rte_pktmbuf_mtod(buf, char *);
	uint16_t txbb_avail_space;
	/* Union to overcome volatile constraints when copying TSO header. */
	union {
		volatile uint8_t *vto;
		uint8_t *to;
	} thdr = { .vto = (volatile uint8_t *)tseg->header, };

	/*
	 * TSO data always starts at offset 20 from the beginning of the TXBB
	 * (16-byte ctrl + 4-byte TSO desc). Since each TXBB is 64-byte
	 * aligned, we can write the first 44 TSO header bytes without
	 * worrying about TxQ wrapping or overwriting the first 32-bit word
	 * of a TXBB.
	 */
	txbb_avail_space = MLX4_TXBB_SIZE -
			   (sizeof(struct mlx4_wqe_ctrl_seg) +
			    sizeof(struct mlx4_wqe_lso_seg));
	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
		/* Copy to end of txbb. */
		rte_memcpy(thdr.to, from, txbb_avail_space);
		from += txbb_avail_space;
		thdr.to += txbb_avail_space;
		/* New TXBB, Check for TxQ wrap. */
		if (thdr.to >= sq->eob)
			thdr.vto = sq->buf;
		/* New TXBB, stash the first 32 bits for later use. */
		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
		pv[(*pv_counter)++].val = *(uint32_t *)from;
		from += sizeof(uint32_t);
		thdr.to += sizeof(uint32_t);
		remain_size -= txbb_avail_space + sizeof(uint32_t);
		/* Avail space in new TXBB is TXBB size - 4 */
		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
	}
	if (remain_size > txbb_avail_space) {
		rte_memcpy(thdr.to, from, txbb_avail_space);
		from += txbb_avail_space;
		thdr.to += txbb_avail_space;
		remain_size -= txbb_avail_space;
		/* New TXBB, Check for TxQ wrap. */
		if (thdr.to >= sq->eob)
			thdr.vto = sq->buf;
		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
		(*pv_counter)++;
	} else if (remain_size) {
		rte_memcpy(thdr.to, from, remain_size);
	}
	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
					      tinfo->tso_header_size);
	/* Calculate data segment location */
	return (volatile struct mlx4_wqe_data_seg *)
				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
}
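
/*
 * Both TSO helpers above defer writing the first 32-bit word of every TXBB
 * they start (stashed in tinfo->pv[]); mlx4_tx_burst_tso() below writes
 * those words only after a barrier, so the HW should never treat a
 * partially built TXBB as valid.
 */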

/**
 * Write data segments and header for TSO uni/multi segment packet.
 *
 * @param buf
 *   Pointer to the first packet mbuf.
 * @param txq
 *   Pointer to Tx queue structure.
 * @param ctrl
 *   Pointer to the WQE control segment.
 *
 * @return
 *   Pointer to the next WQE control segment on success, NULL otherwise.
 */
static volatile struct mlx4_wqe_ctrl_seg *
mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
{
	volatile struct mlx4_wqe_data_seg *dseg;
	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
	struct mlx4_sq *sq = &txq->msq;
	struct tso_info tinfo;
	struct pv *pv;
	int pv_counter;
	int ret;

	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
	if (unlikely(ret))
		goto error;
	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
	if (unlikely(dseg == NULL))
		goto error;
	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
		dseg = (volatile struct mlx4_wqe_data_seg *)
					((uintptr_t)dseg - sq->size);
	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
	if (unlikely(ctrl_next == NULL))
		goto error;
	/* Write the first DWORD of each TXBB saved earlier. */
	if (likely(tinfo.pv_counter)) {
		pv = tinfo.pv;
		pv_counter = tinfo.pv_counter;
		/* Need a barrier here before writing the first TXBB word. */
		rte_io_wmb();
		do {
			--pv_counter;
			*pv[pv_counter].dst = pv[pv_counter].val;
		} while (pv_counter > 0);
	}
	ctrl->fence_size = tinfo.fence_size;
	sq->remain_size -= tinfo.wqe_size;
	return ctrl_next;
error:
	txq->stats.odropped++;
	return NULL;
}

/**
 * Write data segments of multi-segment packet.
 *
 * @param buf
 *   Pointer to the first packet mbuf.
 * @param txq
 *   Pointer to Tx queue structure.
 * @param ctrl
 *   Pointer to the WQE control segment.
 *
 * @return
 *   Pointer to the next WQE control segment on success, NULL otherwise.
 */
static volatile struct mlx4_wqe_ctrl_seg *
mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
		   volatile struct mlx4_wqe_ctrl_seg *ctrl)
{
	struct pv *pv = (struct pv *)txq->bounce_buf;
	struct mlx4_sq *sq = &txq->msq;
	struct rte_mbuf *sbuf = buf;
	uint32_t lkey;
	int pv_counter = 0;
	int nb_segs = buf->nb_segs;
	uint32_t wqe_size;
	volatile struct mlx4_wqe_data_seg *dseg =
		(volatile struct mlx4_wqe_data_seg *)(ctrl + 1);

	ctrl->fence_size = 1 + nb_segs;
	wqe_size = RTE_ALIGN((uint32_t)(ctrl->fence_size << MLX4_SEG_SHIFT),
			     MLX4_TXBB_SIZE);
	/* Validate WQE size and WQE space in the send queue. */
	if (sq->remain_size < wqe_size ||
	    wqe_size > MLX4_MAX_WQE_SIZE)
		return NULL;
	/*
	 * Fill the data segments with buffer information.
	 * First WQE TXBB head segment is always control segment,
	 * so jump to tail TXBB data segments code for the first
	 * WQE data segments filling.
	 */
	goto txbb_tail_segs;
txbb_head_seg:
	/* Memory region key (big endian) for this memory pool. */
	lkey = mlx4_tx_mb2mr(txq, sbuf);
	if (unlikely(lkey == (uint32_t)-1)) {
		DEBUG("%p: unable to get MP <-> MR association",
		      (void *)txq);
		return NULL;
	}
	/* Handle WQE wraparound. */
	if (dseg >=
		(volatile struct mlx4_wqe_data_seg *)sq->eob)
		dseg = (volatile struct mlx4_wqe_data_seg *)
				sq->buf;
	dseg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(sbuf, uintptr_t));
	dseg->lkey = lkey;
	/*
	 * This data segment starts at the beginning of a new
	 * TXBB, so we need to postpone its byte_count writing
	 * for later.
	 */
	pv[pv_counter].dseg = dseg;
	/*
	 * Zero length segment is treated as inline segment
	 * with zero data.
	 */
	pv[pv_counter++].val = rte_cpu_to_be_32(sbuf->data_len ?
						sbuf->data_len : 0x80000000);
	sbuf = sbuf->next;
	dseg++;
	nb_segs--;
txbb_tail_segs:
	/* Jump to default if there are more than two segments remaining. */
	switch (nb_segs) {
	default:
		lkey = mlx4_tx_mb2mr(txq, sbuf);
		if (unlikely(lkey == (uint32_t)-1)) {
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			return NULL;
		}
		mlx4_fill_tx_data_seg(dseg, lkey,
				      rte_pktmbuf_mtod(sbuf, uintptr_t),
				      rte_cpu_to_be_32(sbuf->data_len ?
						       sbuf->data_len :
						       0x80000000));
		sbuf = sbuf->next;
		dseg++;
		nb_segs--;
		/* fallthrough */
	case 2:
		lkey = mlx4_tx_mb2mr(txq, sbuf);
		if (unlikely(lkey == (uint32_t)-1)) {
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			return NULL;
		}
		mlx4_fill_tx_data_seg(dseg, lkey,
				      rte_pktmbuf_mtod(sbuf, uintptr_t),
				      rte_cpu_to_be_32(sbuf->data_len ?
						       sbuf->data_len :
						       0x80000000));
		sbuf = sbuf->next;
		dseg++;
		nb_segs--;
		/* fallthrough */
	case 1:
		lkey = mlx4_tx_mb2mr(txq, sbuf);
		if (unlikely(lkey == (uint32_t)-1)) {
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			return NULL;
		}
		mlx4_fill_tx_data_seg(dseg, lkey,
				      rte_pktmbuf_mtod(sbuf, uintptr_t),
				      rte_cpu_to_be_32(sbuf->data_len ?
						       sbuf->data_len :
						       0x80000000));
		nb_segs--;
		if (nb_segs) {
			sbuf = sbuf->next;
			dseg++;
			goto txbb_head_seg;
		}
		/* fallthrough */
	case 0:
		break;
	}
	/* Write the first DWORD of each TXBB saved earlier. */
	if (pv_counter) {
		/* Need a barrier here before writing the byte_count. */
		rte_io_wmb();
		for (--pv_counter; pv_counter >= 0; pv_counter--)
			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
	}
	sq->remain_size -= wqe_size;
	/* Align next WQE address to the next TXBB. */
	return (volatile struct mlx4_wqe_ctrl_seg *)
		((volatile uint8_t *)ctrl + wqe_size);
}

/**
 * DPDK callback for Tx.
 *
 * @param dpdk_txq
 *   Generic pointer to Tx queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	unsigned int elts_head = txq->elts_head;
	const unsigned int elts_n = txq->elts_n;
	const unsigned int elts_m = elts_n - 1;
	unsigned int bytes_sent = 0;
	unsigned int i;
	unsigned int max = elts_head - txq->elts_tail;
	struct mlx4_sq *sq = &txq->msq;
	volatile struct mlx4_wqe_ctrl_seg *ctrl;
	struct txq_elt *elt;

	MLX4_ASSERT(txq->elts_comp_cd != 0);
	if (likely(max >= txq->elts_comp_cd_init))
		mlx4_txq_complete(txq, elts_m, sq);
	max = elts_n - max;
	MLX4_ASSERT(max >= 1);
	MLX4_ASSERT(max <= elts_n);
	/* Always leave one free entry in the ring. */
	--max;
	if (max > pkts_n)
		max = pkts_n;
	elt = &(*txq->elts)[elts_head & elts_m];
	/* First Tx burst element saves the next WQE control segment. */
	ctrl = elt->wqe;
	for (i = 0; (i != max); ++i) {
		struct rte_mbuf *buf = pkts[i];
		struct txq_elt *elt_next = &(*txq->elts)[++elts_head & elts_m];
		uint32_t owner_opcode = sq->owner_opcode;
		volatile struct mlx4_wqe_data_seg *dseg =
				(volatile struct mlx4_wqe_data_seg *)(ctrl + 1);
		volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
		union {
			uint32_t flags;
			uint16_t flags16[2];
		} srcrb;
		uint32_t lkey;
		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);

		/* Clean up old buffer. */
		if (likely(elt->buf != NULL)) {
			struct rte_mbuf *tmp = elt->buf;

#ifdef RTE_LIBRTE_MLX4_DEBUG
			/* Poisoning. */
			memset(&elt->buf, 0x66, sizeof(struct rte_mbuf *));
#endif
			/* Faster than rte_pktmbuf_free(). */
			do {
				struct rte_mbuf *next = tmp->next;

				rte_pktmbuf_free_seg(tmp);
				tmp = next;
			} while (tmp != NULL);
		}
		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
		if (tso) {
			/* Change opcode to TSO */
			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
			if (!ctrl_next) {
				elt->buf = NULL;
				break;
			}
		} else if (buf->nb_segs == 1) {
			/* Validate WQE space in the send queue. */
			if (sq->remain_size < MLX4_TXBB_SIZE) {
				elt->buf = NULL;
				break;
			}
			lkey = mlx4_tx_mb2mr(txq, buf);
			if (unlikely(lkey == (uint32_t)-1)) {
				/* MR does not exist. */
				DEBUG("%p: unable to get MP <-> MR association",
				      (void *)txq);
				elt->buf = NULL;
				break;
			}
			mlx4_fill_tx_data_seg(dseg++, lkey,
					      rte_pktmbuf_mtod(buf, uintptr_t),
					      rte_cpu_to_be_32(buf->data_len));
			/* Set WQE size in 16-byte units. */
			ctrl->fence_size = 0x2;
			sq->remain_size -= MLX4_TXBB_SIZE;
			/* Align next WQE address to the next TXBB. */
			ctrl_next = ctrl + 0x4;
		} else {
			ctrl_next = mlx4_tx_burst_segs(buf, txq, ctrl);
			if (!ctrl_next) {
				elt->buf = NULL;
				break;
			}
		}
		/* Handle SQ ring wraparound. */
		if ((volatile uint8_t *)ctrl_next >= sq->eob) {
			ctrl_next = (volatile struct mlx4_wqe_ctrl_seg *)
				((volatile uint8_t *)ctrl_next - sq->size);
			/* Flip HW valid ownership. */
			sq->owner_opcode ^= 1u << MLX4_SQ_OWNER_BIT;
		}
		/*
		 * For raw Ethernet, the SOLICIT flag is used to indicate
		 * that no ICRC should be calculated.
		 */
		if (--txq->elts_comp_cd == 0) {
			/* Save the completion burst end address. */
			elt_next->eocb = (volatile uint32_t *)ctrl_next;
			txq->elts_comp_cd = txq->elts_comp_cd_init;
			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
					       MLX4_WQE_CTRL_CQ_UPDATE);
		} else {
			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
		}
		/* Enable HW checksum offload if requested */
		if (txq->csum &&
		    (buf->ol_flags &
		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
			const uint64_t is_tunneled = (buf->ol_flags &
						      (PKT_TX_TUNNEL_GRE |
						       PKT_TX_TUNNEL_VXLAN));

			if (is_tunneled && txq->csum_l2tun) {
				owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
						MLX4_WQE_CTRL_IL4_HDR_CSUM;
				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
					srcrb.flags |=
					    RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
			} else {
				srcrb.flags |=
					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
						 MLX4_WQE_CTRL_TCP_UDP_CSUM);
			}
		}
		if (txq->lb) {
			/*
			 * Copy destination MAC address to the WQE, this allows
			 * loopback in eSwitch, so that VFs and PF can
			 * communicate with each other.
			 */
			srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *));
			ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *,
							      sizeof(uint16_t)));
		} else {
			ctrl->imm = 0;
		}
		ctrl->srcrb_flags = srcrb.flags;
		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		rte_io_wmb();
		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode);
		elt->buf = buf;
		bytes_sent += buf->pkt_len;
		ctrl = ctrl_next;
		elt = elt_next;
	}
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Save WQE address of the next Tx burst element. */
	elt->wqe = ctrl;
	/* Increment send statistics counters. */
	txq->stats.opackets += i;
	txq->stats.obytes += bytes_sent;
	/* Make sure that descriptors are written before doorbell record. */
	rte_wmb();
	/* Ring QP doorbell. */
	rte_write32(txq->msq.doorbell_qpn, MLX4_TX_BFREG(txq));
	txq->elts_head += i;
	return i;
}

/**
 * Translate Rx completion flags to packet type.
 *
 * @param[in] cqe
 *   Pointer to CQE.
 * @param l2tun_offload
 *   Nonzero when L2 tunnel offload is enabled.
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(volatile struct mlx4_cqe *cqe,
		   uint32_t l2tun_offload)
{
	uint8_t idx = 0;
	uint32_t pinfo = rte_be_to_cpu_32(cqe->vlan_my_qpn);
	uint32_t status = rte_be_to_cpu_32(cqe->status);

	/*
	 * The index to the array should have:
	 *  bit[7] - MLX4_CQE_L2_TUNNEL
	 *  bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
	 */
	if (l2tun_offload && (pinfo & MLX4_CQE_L2_TUNNEL))
		idx |= ((pinfo & MLX4_CQE_L2_TUNNEL) >> 20) |
		       ((pinfo & MLX4_CQE_L2_TUNNEL_IPV4) >> 19);
	/*
	 * The index to the array should have:
	 *  bit[5] - MLX4_CQE_STATUS_UDP
	 *  bit[4] - MLX4_CQE_STATUS_TCP
	 *  bit[3] - MLX4_CQE_STATUS_IPV4OPT
	 *  bit[2] - MLX4_CQE_STATUS_IPV6
	 *  bit[1] - MLX4_CQE_STATUS_IPF
	 *  bit[0] - MLX4_CQE_STATUS_IPV4
	 * giving a total of up to 256 entries.
	 */
	idx |= ((status & MLX4_CQE_STATUS_PTYPE_MASK) >> 22);
	if (status & MLX4_CQE_STATUS_IPV6)
		idx |= ((status & MLX4_CQE_STATUS_IPV6F) >> 11);
	return mlx4_ptype_table[idx];
}

/**
 * Translate Rx completion flags to offload flags.
 *
 * @param flags
 *   Rx completion flags returned by mlx4_cqe_flags().
 * @param csum
 *   Whether Rx checksums are enabled.
 * @param csum_l2tun
 *   Whether Rx L2 tunnel checksums are enabled.
 *
 * @return
 *   Offload flags (ol_flags) in mbuf format.
 */
static inline uint32_t
rxq_cq_to_ol_flags(uint32_t flags, int csum, int csum_l2tun)
{
	uint32_t ol_flags = 0;

	if (csum)
		ol_flags |=
			mlx4_transpose(flags,
				       MLX4_CQE_STATUS_IP_HDR_CSUM_OK,
				       PKT_RX_IP_CKSUM_GOOD) |
			mlx4_transpose(flags,
				       MLX4_CQE_STATUS_TCP_UDP_CSUM_OK,
				       PKT_RX_L4_CKSUM_GOOD);
	if ((flags & MLX4_CQE_L2_TUNNEL) && csum_l2tun)
		ol_flags |=
			mlx4_transpose(flags,
				       MLX4_CQE_L2_TUNNEL_IPOK,
				       PKT_RX_IP_CKSUM_GOOD) |
			mlx4_transpose(flags,
				       MLX4_CQE_L2_TUNNEL_L4_CSUM,
				       PKT_RX_L4_CKSUM_GOOD);
	return ol_flags;
}

/**
 * Extract checksum information from CQE flags.
 *
 * @param cqe
 *   Pointer to CQE structure.
 * @param csum
 *   Whether Rx checksums are enabled.
 * @param csum_l2tun
 *   Whether Rx L2 tunnel checksums are enabled.
 *
 * @return
 *   CQE checksum information.
 */
static inline uint32_t
mlx4_cqe_flags(volatile struct mlx4_cqe *cqe, int csum, int csum_l2tun)
{
	uint32_t flags = 0;

	/*
	 * The relevant bits are at different locations in the CQE fields,
	 * therefore they can be joined in one 32-bit variable.
	 */
	if (csum)
		flags = (rte_be_to_cpu_32(cqe->status) &
			 MLX4_CQE_STATUS_IPV4_CSUM_OK);
	if (csum_l2tun)
		flags |= (rte_be_to_cpu_32(cqe->vlan_my_qpn) &
			  (MLX4_CQE_L2_TUNNEL |
			   MLX4_CQE_L2_TUNNEL_IPOK |
			   MLX4_CQE_L2_TUNNEL_L4_CSUM |
			   MLX4_CQE_L2_TUNNEL_IPV4));
	return flags;
}

/**
 * Poll one CQE from CQ.
 *
 * @param rxq
 *   Pointer to the receive queue structure.
 * @param[out] out
 *   Just polled CQE.
 *
 * @return
 *   Number of bytes of the completed packet, 0 in case there is no
 *   completion.
 */
static unsigned int
mlx4_cq_poll_one(struct rxq *rxq, volatile struct mlx4_cqe **out)
{
	int ret = 0;
	volatile struct mlx4_cqe *cqe = NULL;
	struct mlx4_cq *cq = &rxq->mcq;

	cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
	if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
	    !!(cq->cons_index & cq->cqe_cnt))
		goto out;
	/*
	 * Make sure we read CQ entry contents after we've checked the
	 * ownership bit.
	 */
	rte_rmb();
	MLX4_ASSERT(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
	MLX4_ASSERT((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
		    MLX4_CQE_OPCODE_ERROR);
	ret = rte_be_to_cpu_32(cqe->byte_cnt);
	++cq->cons_index;
out:
	*out = cqe;
	return ret;
}
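
/*
 * A note on Rx indexing in mlx4_rx_burst() below: each packet may span up
 * to 1 << rxq->sges_n scatter entries. Inside the loop rq_ci counts
 * individual SGEs (rxq->rq_ci << sges_n); it is converted back to stride
 * units when skipping to the next packet and when updating rxq->rq_ci for
 * the doorbell.
 */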

/**
 * DPDK callback for Rx with scattered packets support.
 *
 * @param dpdk_rxq
 *   Generic pointer to Rx queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = dpdk_rxq;
	const uint32_t wr_cnt = (1 << rxq->elts_n) - 1;
	const uint16_t sges_n = rxq->sges_n;
	struct rte_mbuf *pkt = NULL;
	struct rte_mbuf *seg = NULL;
	unsigned int i = 0;
	uint32_t rq_ci = rxq->rq_ci << sges_n;
	int len = 0;

	while (pkts_n) {
		volatile struct mlx4_cqe *cqe;
		uint32_t idx = rq_ci & wr_cnt;
		struct rte_mbuf *rep = (*rxq->elts)[idx];
		volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx];

		/* Update the 'next' pointer of the previous segment. */
		if (pkt)
			seg->next = rep;
		seg = rep;
		rte_prefetch0(seg);
		rte_prefetch0(scat);
		rep = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			++rxq->stats.rx_nombuf;
			if (!pkt) {
				/*
				 * No buffers before we even started,
				 * bail out silently.
				 */
				break;
			}
			while (pkt != seg) {
				MLX4_ASSERT(pkt != (*rxq->elts)[idx]);
				rep = pkt->next;
				pkt->next = NULL;
				pkt->nb_segs = 1;
				rte_mbuf_raw_free(pkt);
				pkt = rep;
			}
			break;
		}
		if (!pkt) {
			/* Looking for the new packet. */
			len = mlx4_cq_poll_one(rxq, &cqe);
			if (!len) {
				rte_mbuf_raw_free(rep);
				break;
			}
			if (unlikely(len < 0)) {
				/* Rx error, packet is likely too large. */
				rte_mbuf_raw_free(rep);
				++rxq->stats.idropped;
				goto skip;
			}
			pkt = seg;
			MLX4_ASSERT(len >= (rxq->crc_present << 2));
			/* Update packet information. */
			pkt->packet_type =
				rxq_cq_to_pkt_type(cqe, rxq->l2tun_offload);
			pkt->ol_flags = PKT_RX_RSS_HASH;
			pkt->hash.rss = cqe->immed_rss_invalid;
			if (rxq->crc_present)
				len -= RTE_ETHER_CRC_LEN;
			pkt->pkt_len = len;
			if (rxq->csum | rxq->csum_l2tun) {
				uint32_t flags =
					mlx4_cqe_flags(cqe,
						       rxq->csum,
						       rxq->csum_l2tun);

				pkt->ol_flags =
					rxq_cq_to_ol_flags(flags,
							   rxq->csum,
							   rxq->csum_l2tun);
			}
		}
		rep->nb_segs = 1;
		rep->port = rxq->port_id;
		rep->data_len = seg->data_len;
		rep->data_off = seg->data_off;
		(*rxq->elts)[idx] = rep;
		/*
		 * Fill NIC descriptor with the new buffer. The lkey and size
		 * of the buffers are already known, only the buffer address
		 * changes.
		 */
		scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
		/* If there's only one MR, no need to replace LKey in WQE. */
		if (unlikely(mlx4_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
			scat->lkey = mlx4_rx_mb2mr(rxq, rep);
		if (len > seg->data_len) {
			len -= seg->data_len;
			++pkt->nb_segs;
			++rq_ci;
			continue;
		}
		/* The last segment. */
		seg->data_len = len;
		/* Increment bytes counter. */
		rxq->stats.ibytes += pkt->pkt_len;
		/* Return packet. */
		*(pkts++) = pkt;
		pkt = NULL;
		--pkts_n;
		++i;
skip:
		/* Align consumer index to the next stride. */
		rq_ci >>= sges_n;
		++rq_ci;
		rq_ci <<= sges_n;
	}
	if (unlikely(i == 0 && (rq_ci >> sges_n) == rxq->rq_ci))
		return 0;
	/* Update the consumer index. */
	rxq->rq_ci = rq_ci >> sges_n;
	rte_wmb();
	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
	*rxq->mcq.set_ci_db =
		rte_cpu_to_be_32(rxq->mcq.cons_index & MLX4_CQ_DB_CI_MASK);
	/* Increment packets counter. */
	rxq->stats.ipackets += i;
	return i;
}

/**
 * Dummy DPDK callback for Tx.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to Tx queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	rte_mb();
	return 0;
}

/**
 * Dummy DPDK callback for Rx.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to Rx queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	rte_mb();
	return 0;
}