/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2017 6WIND S.A.
 * Copyright 2017 Mellanox Technologies, Ltd
 */

/**
 * @file
 * Data plane functions for mlx4 driver.
 */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Verbs headers do not support -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_branch_prediction.h>
#include <rte_common.h>
#include <rte_io.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

#include "mlx4.h"
#include "mlx4_prm.h"
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"

/**
 * Pointer-value pair structure used in tx_post_send for saving the first
 * DWORD (32 bits) of a TXBB.
 */
struct pv {
	union {
		volatile struct mlx4_wqe_data_seg *dseg;
		volatile uint32_t *dst;
	};
	uint32_t val;
};

/** A helper structure for TSO packet handling. */
struct tso_info {
	/** Pointer to the array of saved first DWORDs (32 bits) of TXBBs. */
	struct pv *pv;
	/** Current entry in the pv array. */
	int pv_counter;
	/** Total size of the WQE including padding. */
	uint32_t wqe_size;
	/** Size of TSO header to prepend to each packet to send. */
	uint16_t tso_header_size;
	/** Total size of the TSO segment in the WQE. */
	uint16_t wqe_tso_seg_size;
	/** Raw WQE size in units of 16 bytes and without padding. */
	uint8_t fence_size;
};

/** A table to translate Rx completion flags to packet type. */
uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
	/*
	 * The index to the array should have:
	 * bit[7] - MLX4_CQE_L2_TUNNEL
	 * bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
	 * bit[5] - MLX4_CQE_STATUS_UDP
	 * bit[4] - MLX4_CQE_STATUS_TCP
	 * bit[3] - MLX4_CQE_STATUS_IPV4OPT
	 * bit[2] - MLX4_CQE_STATUS_IPV6
	 * bit[1] - MLX4_CQE_STATUS_IPF
	 * bit[0] - MLX4_CQE_STATUS_IPV4
	 * giving a total of up to 256 entries.
	 */
	/* L2 */
	[0x00] = RTE_PTYPE_L2_ETHER,
	/* L3 */
	[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_L4_NONFRAG,
	[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_L4_FRAG,
	[0x03] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_L4_FRAG,
	[0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_L4_NONFRAG,
	[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_L4_FRAG,
	[0x08] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_NONFRAG,
	[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_NONFRAG,
	[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_FRAG,
	[0x0b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_FRAG,
	/* TCP */
	[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_L4_TCP,
	[0x14] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_L4_TCP,
	[0x16] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_L4_FRAG,
	[0x18] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_TCP,
	[0x19] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_TCP,
	/* UDP */
	[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_L4_UDP,
	[0x24] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_L4_UDP,
	[0x26] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_L4_FRAG,
	[0x28] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_UDP,
	[0x29] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
		 RTE_PTYPE_L4_UDP,
	/* Tunneled - L3 IPV6 */
	[0x80] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
	[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0x83] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0x84] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0x88] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0x8b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_FRAG,
	/* Tunneled - L3 IPV6, TCP */
	[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_TCP,
	[0x94] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_TCP,
	[0x96] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0x98] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_TCP,
	[0x99] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_TCP,
	/* Tunneled - L3 IPV6, UDP */
	[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_UDP,
	[0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_UDP,
	[0xa6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_UDP,
	[0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_UDP,
	/* Tunneled - L3 IPV4 */
	[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
	[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xc3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xc4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xc8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_NONFRAG,
	[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xcb] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_FRAG,
	/* Tunneled - L3 IPV4, TCP */
	[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_TCP,
	[0xd4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_TCP,
	[0xd6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xd8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_TCP,
	[0xd9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_TCP,
	/* Tunneled - L3 IPV4, UDP */
	[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_UDP,
	[0xe4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_UDP,
	[0xe6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L4_FRAG,
	[0xe8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_UDP,
	[0xe9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
		 RTE_PTYPE_INNER_L3_IPV4_EXT |
		 RTE_PTYPE_INNER_L4_UDP,
};
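
/*
 * Illustrative example of the index layout described above: a plain,
 * non-tunneled IPv4/TCP completion sets MLX4_CQE_STATUS_IPV4 (bit 0) and
 * MLX4_CQE_STATUS_TCP (bit 4), so the table index is 0x11 and the lookup
 * yields RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 * RTE_PTYPE_L4_TCP. Indexes without an explicit initializer above remain
 * zero, i.e. RTE_PTYPE_UNKNOWN.
 */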

/**
 * Stamp a TXBB burst so it won't be reused by the HW.
 *
 * Routine is used when freeing WQEs used by the chip or when a WQ entry
 * build has failed, leaving partial information on the queue.
 *
 * @param sq
 *   Pointer to the SQ structure.
 * @param start
 *   Pointer to the first TXBB to stamp.
 * @param end
 *   Pointer to the TXBB following the last one to stamp (exclusive end).
 *
 * @return
 *   Stamping burst size in byte units.
 */
static uint32_t
mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, volatile uint32_t *start,
			 volatile uint32_t *end)
{
	uint32_t stamp = sq->stamp;
	int32_t size = (intptr_t)end - (intptr_t)start;

	assert(start != end);
	/* Hold SQ ring wrap around. */
	if (size < 0) {
		size = (int32_t)sq->size + size;
		do {
			*start = stamp;
			start += MLX4_SQ_STAMP_DWORDS;
		} while (start != (volatile uint32_t *)sq->eob);
		start = (volatile uint32_t *)sq->buf;
		/* Flip invalid stamping ownership. */
		stamp ^= RTE_BE32(1u << MLX4_SQ_OWNER_BIT);
		sq->stamp = stamp;
		if (start == end)
			return size;
	}
	do {
		*start = stamp;
		start += MLX4_SQ_STAMP_DWORDS;
	} while (start != end);
	return (uint32_t)size;
}

/**
 * Manage Tx completions.
 *
 * When sending a burst, mlx4_tx_burst() posts several WRs.
 * To improve performance, a completion event is only required once every
 * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
 * for other WRs, but this information would not be used anyway.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 * @param elts_m
 *   Tx elements number mask.
 * @param sq
 *   Pointer to the SQ structure.
 */
static void
mlx4_txq_complete(struct txq *txq, const unsigned int elts_m,
		  struct mlx4_sq *sq)
{
	unsigned int elts_tail = txq->elts_tail;
	struct mlx4_cq *cq = &txq->mcq;
	volatile struct mlx4_cqe *cqe;
	uint32_t completed;
	uint32_t cons_index = cq->cons_index;
	volatile uint32_t *first_txbb;

	/*
	 * Traverse over all CQ entries reported and handle each WQ entry
	 * reported by them.
	 */
	do {
		cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
		if (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
		    !!(cons_index & cq->cqe_cnt)))
			break;
#ifndef NDEBUG
		/*
		 * Make sure we read the CQE after we read the ownership bit.
		 */
		rte_io_rmb();
		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
			     MLX4_CQE_OPCODE_ERROR)) {
			volatile struct mlx4_err_cqe *cqe_err =
				(volatile struct mlx4_err_cqe *)cqe;
			ERROR("%p CQE error - vendor syndrome: 0x%x"
			      " syndrome: 0x%x\n",
			      (void *)txq, cqe_err->vendor_err,
			      cqe_err->syndrome);
			break;
		}
#endif /* NDEBUG */
		cons_index++;
	} while (1);
	completed = (cons_index - cq->cons_index) * txq->elts_comp_cd_init;
	if (unlikely(!completed))
		return;
	/* First stamping address is the end of the last one. */
	first_txbb = (&(*txq->elts)[elts_tail & elts_m])->eocb;
	elts_tail += completed;
	/* The new tail element holds the end address. */
	sq->remain_size += mlx4_txq_stamp_freed_wqe(sq, first_txbb,
		(&(*txq->elts)[elts_tail & elts_m])->eocb);
	/* Update CQ consumer index. */
	cq->cons_index = cons_index;
	*cq->set_ci_db = rte_cpu_to_be_32(cons_index & MLX4_CQ_DB_CI_MASK);
	txq->elts_tail = elts_tail;
}
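
/*
 * Illustrative example of the completion accounting above (figures are
 * hypothetical): a CQ_UPDATE completion is only requested once every
 * elts_comp_cd_init posted packets, so each CQE consumed by
 * mlx4_txq_complete() stands for elts_comp_cd_init Tx elements. With
 * elts_comp_cd_init == 64, consuming two CQEs advances elts_tail by
 * 2 * 64 == 128 elements, and the corresponding TXBB range is stamped in
 * a single mlx4_txq_stamp_freed_wqe() call.
 */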

/**
 * Write Tx data segment to the SQ.
 *
 * @param dseg
 *   Pointer to data segment in SQ.
 * @param lkey
 *   Memory region lkey.
 * @param addr
 *   Data address.
 * @param byte_count
 *   Big endian bytes count of the data to send.
 */
static inline void
mlx4_fill_tx_data_seg(volatile struct mlx4_wqe_data_seg *dseg,
		      uint32_t lkey, uintptr_t addr, rte_be32_t byte_count)
{
	dseg->addr = rte_cpu_to_be_64(addr);
	dseg->lkey = lkey;
#if RTE_CACHE_LINE_SIZE < 64
	/*
	 * Need a barrier here before writing the byte_count
	 * fields to make sure that all the data is visible
	 * before the byte_count field is set.
	 * Otherwise, if the segment begins a new cacheline,
	 * the HCA prefetcher could grab the 64-byte chunk and
	 * get a valid (!= 0xffffffff) byte count but stale
	 * data, and end up sending the wrong data.
	 */
	rte_io_wmb();
#endif /* RTE_CACHE_LINE_SIZE */
	dseg->byte_count = byte_count;
}

/**
 * Obtain and calculate TSO information needed for assembling a TSO WQE.
 *
 * @param buf
 *   Pointer to the first packet mbuf.
 * @param txq
 *   Pointer to Tx queue structure.
 * @param tinfo
 *   Pointer to a structure to fill the info with.
 *
 * @return
 *   0 on success, negative value upon error.
 */
static inline int
mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
			     struct txq *txq,
			     struct tso_info *tinfo)
{
	struct mlx4_sq *sq = &txq->msq;
	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);

	tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
	if (tunneled)
		tinfo->tso_header_size +=
				buf->outer_l2_len + buf->outer_l3_len;
	if (unlikely(buf->tso_segsz == 0 ||
		     tinfo->tso_header_size == 0 ||
		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
		     tinfo->tso_header_size > buf->data_len))
		return -EINVAL;
	/*
	 * Calculate the WQE TSO segment size
	 * Note:
	 * 1. An LSO segment must be padded such that the subsequent data
	 *    segment is 16-byte aligned.
	 * 2. The start address of the TSO segment is always 16 bytes aligned.
	 */
	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
					    tinfo->tso_header_size,
					    sizeof(struct mlx4_wqe_data_seg));
	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
			     buf->nb_segs;
	tinfo->wqe_size =
		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
			  MLX4_TXBB_SIZE);
	/* Validate WQE size and WQE space in the send queue. */
	if (sq->remain_size < tinfo->wqe_size ||
	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
		return -ENOMEM;
	/* Init pv. */
	tinfo->pv = (struct pv *)txq->bounce_buf;
	tinfo->pv_counter = 0;
	return 0;
}
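
/*
 * Worked sizing example for the calculation above (illustrative figures,
 * assuming sizeof(struct mlx4_wqe_lso_seg) == 4, i.e. just the
 * mss_hdr_size word, 16-byte control/data segments and 64-byte TXBBs):
 * a single-segment packet with a 54-byte Ethernet/IPv4/TCP header gives
 * wqe_tso_seg_size = RTE_ALIGN(4 + 54, 16) = 64,
 * fence_size = ((16 + 64) >> 4) + 1 = 6, i.e. 96 bytes of raw WQE, and
 * wqe_size = RTE_ALIGN(96, 64) = 128, i.e. two TXBBs.
 */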

/**
 * Fill the TSO WQE data segments with info on buffers to transmit.
 *
 * @param buf
 *   Pointer to the first packet mbuf.
 * @param txq
 *   Pointer to Tx queue structure.
 * @param tinfo
 *   Pointer to TSO info to use.
 * @param dseg
 *   Pointer to the first data segment in the TSO WQE.
 * @param ctrl
 *   Pointer to the control segment in the TSO WQE.
 *
 * @return
 *   Pointer to the next WQE control segment on success, NULL otherwise.
 */
static inline volatile struct mlx4_wqe_ctrl_seg *
mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
			     struct txq *txq,
			     struct tso_info *tinfo,
			     volatile struct mlx4_wqe_data_seg *dseg,
			     volatile struct mlx4_wqe_ctrl_seg *ctrl)
{
	uint32_t lkey;
	int nb_segs = buf->nb_segs;
	int nb_segs_txbb;
	struct mlx4_sq *sq = &txq->msq;
	struct rte_mbuf *sbuf = buf;
	struct pv *pv = tinfo->pv;
	int *pv_counter = &tinfo->pv_counter;
	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
			(volatile struct mlx4_wqe_ctrl_seg *)
				((volatile uint8_t *)ctrl + tinfo->wqe_size);
	uint16_t data_len = sbuf->data_len - tinfo->tso_header_size;
	uintptr_t data_addr = rte_pktmbuf_mtod_offset(sbuf, uintptr_t,
						      tinfo->tso_header_size);

	do {
		/* How many dseg entries do we have in the current TXBB? */
		nb_segs_txbb = (MLX4_TXBB_SIZE -
				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
			       MLX4_SEG_SHIFT;
		switch (nb_segs_txbb) {
#ifndef NDEBUG
		default:
			/* Should never happen. */
			rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
				  (void *)txq, nb_segs_txbb);
			/* rte_panic never returns. */
			break;
#endif /* NDEBUG */
		case 4:
			/* Memory region key for this memory pool. */
			lkey = mlx4_tx_mb2mr(txq, sbuf);
			if (unlikely(lkey == (uint32_t)-1))
				goto err;
			dseg->addr = rte_cpu_to_be_64(data_addr);
			dseg->lkey = lkey;
			/*
			 * This data segment starts at the beginning of a new
			 * TXBB, so we need to postpone its byte_count writing
			 * for later.
			 */
			pv[*pv_counter].dseg = dseg;
			/*
			 * Zero length segment is treated as inline segment
			 * with zero data.
			 */
			pv[(*pv_counter)++].val =
				rte_cpu_to_be_32(data_len ?
						 data_len :
						 0x80000000);
			if (--nb_segs == 0)
				return ctrl_next;
			/* Prepare next buf info */
			sbuf = sbuf->next;
			dseg++;
			data_len = sbuf->data_len;
			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
			/* fallthrough */
		case 3:
			lkey = mlx4_tx_mb2mr(txq, sbuf);
			if (unlikely(lkey == (uint32_t)-1))
				goto err;
			mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
					      rte_cpu_to_be_32(data_len ?
							       data_len :
							       0x80000000));
			if (--nb_segs == 0)
				return ctrl_next;
			/* Prepare next buf info */
			sbuf = sbuf->next;
			dseg++;
			data_len = sbuf->data_len;
			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
			/* fallthrough */
		case 2:
			lkey = mlx4_tx_mb2mr(txq, sbuf);
			if (unlikely(lkey == (uint32_t)-1))
				goto err;
			mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
					      rte_cpu_to_be_32(data_len ?
							       data_len :
							       0x80000000));
			if (--nb_segs == 0)
				return ctrl_next;
			/* Prepare next buf info */
			sbuf = sbuf->next;
			dseg++;
			data_len = sbuf->data_len;
			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
			/* fallthrough */
		case 1:
			lkey = mlx4_tx_mb2mr(txq, sbuf);
			if (unlikely(lkey == (uint32_t)-1))
				goto err;
			mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
					      rte_cpu_to_be_32(data_len ?
							       data_len :
							       0x80000000));
			if (--nb_segs == 0)
				return ctrl_next;
			/* Prepare next buf info */
			sbuf = sbuf->next;
			dseg++;
			data_len = sbuf->data_len;
			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
			/* fallthrough */
		}
		/* Wrap dseg if it points at the end of the queue. */
		if ((volatile uint8_t *)dseg >= sq->eob)
			dseg = (volatile struct mlx4_wqe_data_seg *)
					((volatile uint8_t *)dseg - sq->size);
	} while (true);
err:
	return NULL;
}

/**
 * Fill the packet's L2, L3 and L4 headers to the WQE.
 *
 * This will be used as the header for each TSO segment that is transmitted.
 *
 * @param buf
 *   Pointer to the first packet mbuf.
 * @param txq
 *   Pointer to Tx queue structure.
 * @param tinfo
 *   Pointer to TSO info to use.
 * @param ctrl
 *   Pointer to the control segment in the TSO WQE.
 *
 * @return
 *   Pointer to the data segment following the TSO header on success,
 *   NULL otherwise.
 */
static inline volatile struct mlx4_wqe_data_seg *
mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
			   struct txq *txq,
			   struct tso_info *tinfo,
			   volatile struct mlx4_wqe_ctrl_seg *ctrl)
{
	volatile struct mlx4_wqe_lso_seg *tseg =
		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
	struct mlx4_sq *sq = &txq->msq;
	struct pv *pv = tinfo->pv;
	int *pv_counter = &tinfo->pv_counter;
	int remain_size = tinfo->tso_header_size;
	char *from = rte_pktmbuf_mtod(buf, char *);
	uint16_t txbb_avail_space;
	/* Union to overcome volatile constraints when copying TSO header. */
	union {
		volatile uint8_t *vto;
		uint8_t *to;
	} thdr = { .vto = (volatile uint8_t *)tseg->header, };

	/*
	 * TSO data always starts at offset 20 from the beginning of the TXBB
	 * (16-byte ctrl + 4-byte TSO descriptor). Since each TXBB is 64-byte
	 * aligned, the first 44 TSO header bytes can be written without
	 * worrying about TxQ wrapping or overwriting a TXBB's first 32-bit
	 * word.
	 */
	txbb_avail_space = MLX4_TXBB_SIZE -
			   (sizeof(struct mlx4_wqe_ctrl_seg) +
			    sizeof(struct mlx4_wqe_lso_seg));
	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
		/* Copy to end of txbb. */
		rte_memcpy(thdr.to, from, txbb_avail_space);
		from += txbb_avail_space;
		thdr.to += txbb_avail_space;
		/* New TXBB, check for TxQ wrap. */
		if (thdr.to >= sq->eob)
			thdr.vto = sq->buf;
		/* New TXBB, stash the first 32 bits for later use. */
		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
		pv[(*pv_counter)++].val = *(uint32_t *)from;
		from += sizeof(uint32_t);
		thdr.to += sizeof(uint32_t);
		remain_size -= txbb_avail_space + sizeof(uint32_t);
		/* Avail space in new TXBB is TXBB size - 4 */
		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
	}
	if (remain_size > txbb_avail_space) {
		rte_memcpy(thdr.to, from, txbb_avail_space);
		from += txbb_avail_space;
		thdr.to += txbb_avail_space;
		remain_size -= txbb_avail_space;
		/* New TXBB, check for TxQ wrap. */
		if (thdr.to >= sq->eob)
			thdr.vto = sq->buf;
		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
		(*pv_counter)++;
	} else if (remain_size) {
		rte_memcpy(thdr.to, from, remain_size);
	}
	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
					      tinfo->tso_header_size);
	/* Calculate data segment location */
	return (volatile struct mlx4_wqe_data_seg *)
				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
}
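
/*
 * Note on the pv[] bookkeeping used above and in
 * mlx4_tx_burst_fill_tso_dsegs(): as the surrounding comments indicate,
 * the first 32-bit word of a TXBB must not look valid before the rest of
 * the TXBB has been written, otherwise the HCA prefetcher may consume
 * stale data (see mlx4_fill_tx_data_seg()). Those words are therefore
 * stashed in the pv array and flushed by mlx4_tx_burst_tso() only after
 * an rte_io_wmb(), once the whole WQE has been built.
 */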

/**
 * Write data segments and header for TSO uni/multi segment packet.
 *
 * @param buf
 *   Pointer to the first packet mbuf.
 * @param txq
 *   Pointer to Tx queue structure.
 * @param ctrl
 *   Pointer to the WQE control segment.
 *
 * @return
 *   Pointer to the next WQE control segment on success, NULL otherwise.
 */
static volatile struct mlx4_wqe_ctrl_seg *
mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
{
	volatile struct mlx4_wqe_data_seg *dseg;
	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
	struct mlx4_sq *sq = &txq->msq;
	struct tso_info tinfo;
	struct pv *pv;
	int pv_counter;
	int ret;

	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
	if (unlikely(ret))
		goto error;
	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
	if (unlikely(dseg == NULL))
		goto error;
	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
		dseg = (volatile struct mlx4_wqe_data_seg *)
					((uintptr_t)dseg - sq->size);
	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
	if (unlikely(ctrl_next == NULL))
		goto error;
	/* Write the first DWORD of each TXBB saved earlier. */
	if (likely(tinfo.pv_counter)) {
		pv = tinfo.pv;
		pv_counter = tinfo.pv_counter;
		/* Need a barrier here before writing the first TXBB word. */
		rte_io_wmb();
		do {
			--pv_counter;
			*pv[pv_counter].dst = pv[pv_counter].val;
		} while (pv_counter > 0);
	}
	ctrl->fence_size = tinfo.fence_size;
	sq->remain_size -= tinfo.wqe_size;
	return ctrl_next;
error:
	txq->stats.odropped++;
	return NULL;
}

/**
 * Write data segments of multi-segment packet.
 *
 * @param buf
 *   Pointer to the first packet mbuf.
 * @param txq
 *   Pointer to Tx queue structure.
 * @param ctrl
 *   Pointer to the WQE control segment.
 *
 * @return
 *   Pointer to the next WQE control segment on success, NULL otherwise.
 */
static volatile struct mlx4_wqe_ctrl_seg *
mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
		   volatile struct mlx4_wqe_ctrl_seg *ctrl)
{
	struct pv *pv = (struct pv *)txq->bounce_buf;
	struct mlx4_sq *sq = &txq->msq;
	struct rte_mbuf *sbuf = buf;
	uint32_t lkey;
	int pv_counter = 0;
	int nb_segs = buf->nb_segs;
	uint32_t wqe_size;
	volatile struct mlx4_wqe_data_seg *dseg =
		(volatile struct mlx4_wqe_data_seg *)(ctrl + 1);

	ctrl->fence_size = 1 + nb_segs;
	wqe_size = RTE_ALIGN((uint32_t)(ctrl->fence_size << MLX4_SEG_SHIFT),
			     MLX4_TXBB_SIZE);
	/* Validate WQE size and WQE space in the send queue. */
	if (sq->remain_size < wqe_size ||
	    wqe_size > MLX4_MAX_WQE_SIZE)
		return NULL;
	/*
	 * Fill the data segments with buffer information.
	 * First WQE TXBB head segment is always control segment,
	 * so jump to tail TXBB data segments code for the first
	 * WQE data segments filling.
	 */
	goto txbb_tail_segs;
txbb_head_seg:
	/* Memory region key (big endian) for this memory pool. */
	lkey = mlx4_tx_mb2mr(txq, sbuf);
	if (unlikely(lkey == (uint32_t)-1)) {
		DEBUG("%p: unable to get MP <-> MR association",
		      (void *)txq);
		return NULL;
	}
	/* Handle WQE wraparound. */
	if (dseg >=
		(volatile struct mlx4_wqe_data_seg *)sq->eob)
		dseg = (volatile struct mlx4_wqe_data_seg *)
			sq->buf;
	dseg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(sbuf, uintptr_t));
	dseg->lkey = lkey;
	/*
	 * This data segment starts at the beginning of a new
	 * TXBB, so we need to postpone its byte_count writing
	 * for later.
	 */
	pv[pv_counter].dseg = dseg;
	/*
	 * Zero length segment is treated as inline segment
	 * with zero data.
	 */
	pv[pv_counter++].val = rte_cpu_to_be_32(sbuf->data_len ?
						sbuf->data_len : 0x80000000);
	sbuf = sbuf->next;
	dseg++;
	nb_segs--;
txbb_tail_segs:
	/* Jump to default if there are more than two segments remaining. */
	switch (nb_segs) {
	default:
		lkey = mlx4_tx_mb2mr(txq, sbuf);
		if (unlikely(lkey == (uint32_t)-1)) {
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			return NULL;
		}
		mlx4_fill_tx_data_seg(dseg, lkey,
				      rte_pktmbuf_mtod(sbuf, uintptr_t),
				      rte_cpu_to_be_32(sbuf->data_len ?
						       sbuf->data_len :
						       0x80000000));
		sbuf = sbuf->next;
		dseg++;
		nb_segs--;
		/* fallthrough */
	case 2:
		lkey = mlx4_tx_mb2mr(txq, sbuf);
		if (unlikely(lkey == (uint32_t)-1)) {
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			return NULL;
		}
		mlx4_fill_tx_data_seg(dseg, lkey,
				      rte_pktmbuf_mtod(sbuf, uintptr_t),
				      rte_cpu_to_be_32(sbuf->data_len ?
						       sbuf->data_len :
						       0x80000000));
		sbuf = sbuf->next;
		dseg++;
		nb_segs--;
		/* fallthrough */
	case 1:
		lkey = mlx4_tx_mb2mr(txq, sbuf);
		if (unlikely(lkey == (uint32_t)-1)) {
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			return NULL;
		}
		mlx4_fill_tx_data_seg(dseg, lkey,
				      rte_pktmbuf_mtod(sbuf, uintptr_t),
				      rte_cpu_to_be_32(sbuf->data_len ?
						       sbuf->data_len :
						       0x80000000));
		nb_segs--;
		if (nb_segs) {
			sbuf = sbuf->next;
			dseg++;
			goto txbb_head_seg;
		}
		/* fallthrough */
	case 0:
		break;
	}
	/* Write the first DWORD of each TXBB saved earlier. */
	if (pv_counter) {
		/* Need a barrier here before writing the byte_count. */
		rte_io_wmb();
		for (--pv_counter; pv_counter >= 0; pv_counter--)
			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
	}
	sq->remain_size -= wqe_size;
	/* Align next WQE address to the next TXBB. */
	return (volatile struct mlx4_wqe_ctrl_seg *)
		((volatile uint8_t *)ctrl + wqe_size);
}

/**
 * DPDK callback for Tx.
 *
 * @param dpdk_txq
 *   Generic pointer to Tx queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	unsigned int elts_head = txq->elts_head;
	const unsigned int elts_n = txq->elts_n;
	const unsigned int elts_m = elts_n - 1;
	unsigned int bytes_sent = 0;
	unsigned int i;
	unsigned int max = elts_head - txq->elts_tail;
	struct mlx4_sq *sq = &txq->msq;
	volatile struct mlx4_wqe_ctrl_seg *ctrl;
	struct txq_elt *elt;

	assert(txq->elts_comp_cd != 0);
	if (likely(max >= txq->elts_comp_cd_init))
		mlx4_txq_complete(txq, elts_m, sq);
	max = elts_n - max;
	assert(max >= 1);
	assert(max <= elts_n);
	/* Always leave one free entry in the ring. */
	--max;
	if (max > pkts_n)
		max = pkts_n;
	elt = &(*txq->elts)[elts_head & elts_m];
	/* First Tx burst element saves the next WQE control segment. */
	ctrl = elt->wqe;
	for (i = 0; (i != max); ++i) {
		struct rte_mbuf *buf = pkts[i];
		struct txq_elt *elt_next = &(*txq->elts)[++elts_head & elts_m];
		uint32_t owner_opcode = sq->owner_opcode;
		volatile struct mlx4_wqe_data_seg *dseg =
				(volatile struct mlx4_wqe_data_seg *)(ctrl + 1);
		volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
		union {
			uint32_t flags;
			uint16_t flags16[2];
		} srcrb;
		uint32_t lkey;
		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);

		/* Clean up old buffer. */
		if (likely(elt->buf != NULL)) {
			struct rte_mbuf *tmp = elt->buf;

#ifndef NDEBUG
			/* Poisoning. */
			memset(&elt->buf, 0x66, sizeof(struct rte_mbuf *));
#endif
			/* Faster than rte_pktmbuf_free(). */
			do {
				struct rte_mbuf *next = tmp->next;

				rte_pktmbuf_free_seg(tmp);
				tmp = next;
			} while (tmp != NULL);
		}
		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
		if (tso) {
			/* Change opcode to TSO */
			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
			if (!ctrl_next) {
				elt->buf = NULL;
				break;
			}
		} else if (buf->nb_segs == 1) {
			/* Validate WQE space in the send queue. */
			if (sq->remain_size < MLX4_TXBB_SIZE) {
				elt->buf = NULL;
				break;
			}
			lkey = mlx4_tx_mb2mr(txq, buf);
			if (unlikely(lkey == (uint32_t)-1)) {
				/* MR does not exist. */
				DEBUG("%p: unable to get MP <-> MR association",
				      (void *)txq);
				elt->buf = NULL;
				break;
			}
			mlx4_fill_tx_data_seg(dseg++, lkey,
					      rte_pktmbuf_mtod(buf, uintptr_t),
					      rte_cpu_to_be_32(buf->data_len));
			/* Set WQE size in 16-byte units. */
			ctrl->fence_size = 0x2;
			sq->remain_size -= MLX4_TXBB_SIZE;
			/* Align next WQE address to the next TXBB. */
			ctrl_next = ctrl + 0x4;
		} else {
			ctrl_next = mlx4_tx_burst_segs(buf, txq, ctrl);
			if (!ctrl_next) {
				elt->buf = NULL;
				break;
			}
		}
		/* Hold SQ ring wrap around. */
		if ((volatile uint8_t *)ctrl_next >= sq->eob) {
			ctrl_next = (volatile struct mlx4_wqe_ctrl_seg *)
				((volatile uint8_t *)ctrl_next - sq->size);
			/* Flip HW valid ownership. */
			sq->owner_opcode ^= 1u << MLX4_SQ_OWNER_BIT;
		}
		/*
		 * For raw Ethernet, the SOLICIT flag is used to indicate
		 * that no ICRC should be calculated.
		 */
		if (--txq->elts_comp_cd == 0) {
			/* Save the completion burst end address. */
			elt_next->eocb = (volatile uint32_t *)ctrl_next;
			txq->elts_comp_cd = txq->elts_comp_cd_init;
			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
					       MLX4_WQE_CTRL_CQ_UPDATE);
		} else {
			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
		}
		/* Enable HW checksum offload if requested */
		if (txq->csum &&
		    (buf->ol_flags &
		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
			const uint64_t is_tunneled = (buf->ol_flags &
						      (PKT_TX_TUNNEL_GRE |
						       PKT_TX_TUNNEL_VXLAN));

			if (is_tunneled && txq->csum_l2tun) {
				owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
						MLX4_WQE_CTRL_IL4_HDR_CSUM;
				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
					srcrb.flags |=
					    RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
			} else {
				srcrb.flags |=
					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
						 MLX4_WQE_CTRL_TCP_UDP_CSUM);
			}
		}
		if (txq->lb) {
			/*
			 * Copy destination MAC address to the WQE, this allows
			 * loopback in eSwitch, so that VFs and PF can
			 * communicate with each other.
			 */
			srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *));
			ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *,
							      sizeof(uint16_t)));
		} else {
			ctrl->imm = 0;
		}
		ctrl->srcrb_flags = srcrb.flags;
		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		rte_io_wmb();
		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode);
		elt->buf = buf;
		bytes_sent += buf->pkt_len;
		ctrl = ctrl_next;
		elt = elt_next;
	}
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Save WQE address of the next Tx burst element. */
	elt->wqe = ctrl;
	/* Increment send statistics counters. */
	txq->stats.opackets += i;
	txq->stats.obytes += bytes_sent;
	/* Make sure that descriptors are written before doorbell record. */
	rte_wmb();
	/* Ring QP doorbell. */
	rte_write32(txq->msq.doorbell_qpn, MLX4_TX_BFREG(txq));
	txq->elts_head += i;
	return i;
}

/**
 * Translate Rx completion flags to packet type.
 *
 * @param[in] cqe
 *   Pointer to CQE.
 * @param l2tun_offload
 *   Nonzero when L2 tunnel offload is enabled.
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(volatile struct mlx4_cqe *cqe,
		   uint32_t l2tun_offload)
{
	uint8_t idx = 0;
	uint32_t pinfo = rte_be_to_cpu_32(cqe->vlan_my_qpn);
	uint32_t status = rte_be_to_cpu_32(cqe->status);

	/*
	 * The index to the array should have:
	 * bit[7] - MLX4_CQE_L2_TUNNEL
	 * bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
	 */
	if (l2tun_offload && (pinfo & MLX4_CQE_L2_TUNNEL))
		idx |= ((pinfo & MLX4_CQE_L2_TUNNEL) >> 20) |
		       ((pinfo & MLX4_CQE_L2_TUNNEL_IPV4) >> 19);
	/*
	 * The index to the array should have:
	 * bit[5] - MLX4_CQE_STATUS_UDP
	 * bit[4] - MLX4_CQE_STATUS_TCP
	 * bit[3] - MLX4_CQE_STATUS_IPV4OPT
	 * bit[2] - MLX4_CQE_STATUS_IPV6
	 * bit[1] - MLX4_CQE_STATUS_IPF
	 * bit[0] - MLX4_CQE_STATUS_IPV4
	 * giving a total of up to 256 entries.
	 */
	idx |= ((status & MLX4_CQE_STATUS_PTYPE_MASK) >> 22);
	if (status & MLX4_CQE_STATUS_IPV6)
		idx |= ((status & MLX4_CQE_STATUS_IPV6F) >> 11);
	return mlx4_ptype_table[idx];
}
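
/*
 * Illustrative tunneled example of the index math above: a tunneled
 * completion whose outer header is IPv4 and whose inner packet is
 * IPv4/UDP sets MLX4_CQE_L2_TUNNEL (bit 7), MLX4_CQE_L2_TUNNEL_IPV4
 * (bit 6), MLX4_CQE_STATUS_UDP (bit 5) and MLX4_CQE_STATUS_IPV4 (bit 0),
 * giving index 0xe1, i.e. RTE_PTYPE_L2_ETHER |
 * RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
 * RTE_PTYPE_INNER_L4_UDP in mlx4_ptype_table[].
 */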

/**
 * Translate Rx completion flags to offload flags.
 *
 * @param flags
 *   Rx completion flags returned by mlx4_cqe_flags().
 * @param csum
 *   Whether Rx checksums are enabled.
 * @param csum_l2tun
 *   Whether Rx L2 tunnel checksums are enabled.
 *
 * @return
 *   Offload flags (ol_flags) in mbuf format.
 */
static inline uint32_t
rxq_cq_to_ol_flags(uint32_t flags, int csum, int csum_l2tun)
{
	uint32_t ol_flags = 0;

	if (csum)
		ol_flags |=
			mlx4_transpose(flags,
				       MLX4_CQE_STATUS_IP_HDR_CSUM_OK,
				       PKT_RX_IP_CKSUM_GOOD) |
			mlx4_transpose(flags,
				       MLX4_CQE_STATUS_TCP_UDP_CSUM_OK,
				       PKT_RX_L4_CKSUM_GOOD);
	if ((flags & MLX4_CQE_L2_TUNNEL) && csum_l2tun)
		ol_flags |=
			mlx4_transpose(flags,
				       MLX4_CQE_L2_TUNNEL_IPOK,
				       PKT_RX_IP_CKSUM_GOOD) |
			mlx4_transpose(flags,
				       MLX4_CQE_L2_TUNNEL_L4_CSUM,
				       PKT_RX_L4_CKSUM_GOOD);
	return ol_flags;
}

/**
 * Extract checksum information from CQE flags.
 *
 * @param cqe
 *   Pointer to CQE structure.
 * @param csum
 *   Whether Rx checksums are enabled.
 * @param csum_l2tun
 *   Whether Rx L2 tunnel checksums are enabled.
 *
 * @return
 *   CQE checksum information.
 */
static inline uint32_t
mlx4_cqe_flags(volatile struct mlx4_cqe *cqe, int csum, int csum_l2tun)
{
	uint32_t flags = 0;

	/*
	 * The relevant bits are in different locations in their respective
	 * CQE fields, therefore they can be joined in one 32-bit variable.
	 */
	if (csum)
		flags = (rte_be_to_cpu_32(cqe->status) &
			 MLX4_CQE_STATUS_IPV4_CSUM_OK);
	if (csum_l2tun)
		flags |= (rte_be_to_cpu_32(cqe->vlan_my_qpn) &
			  (MLX4_CQE_L2_TUNNEL |
			   MLX4_CQE_L2_TUNNEL_IPOK |
			   MLX4_CQE_L2_TUNNEL_L4_CSUM |
			   MLX4_CQE_L2_TUNNEL_IPV4));
	return flags;
}

/**
 * Poll one CQE from CQ.
 *
 * @param rxq
 *   Pointer to the receive queue structure.
 * @param[out] out
 *   Just polled CQE.
 *
 * @return
 *   Number of bytes of the CQE, 0 in case there is no completion.
 */
static unsigned int
mlx4_cq_poll_one(struct rxq *rxq, volatile struct mlx4_cqe **out)
{
	int ret = 0;
	volatile struct mlx4_cqe *cqe = NULL;
	struct mlx4_cq *cq = &rxq->mcq;

	cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
	if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
	    !!(cq->cons_index & cq->cqe_cnt))
		goto out;
	/*
	 * Make sure we read CQ entry contents after we've checked the
	 * ownership bit.
	 */
	rte_rmb();
	assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
	assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
	       MLX4_CQE_OPCODE_ERROR);
	ret = rte_be_to_cpu_32(cqe->byte_cnt);
	++cq->cons_index;
out:
	*out = cqe;
	return ret;
}
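
/*
 * Note on the ownership test used in mlx4_cq_poll_one() and
 * mlx4_txq_complete() (the usual mlx4 CQ convention): the HW toggles the
 * CQE owner bit on every pass over the CQ ring, so a CQE belongs to
 * software only when !!(owner_sr_opcode & MLX4_CQE_OWNER_MASK) equals
 * !!(cons_index & cqe_cnt), i.e. when the owner bit matches the parity of
 * the consumer index wrap count. A mismatch means the entry has not been
 * (re)written by the HW yet and polling stops there.
 */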

/**
 * DPDK callback for Rx with scattered packets support.
 *
 * @param dpdk_rxq
 *   Generic pointer to Rx queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = dpdk_rxq;
	const uint32_t wr_cnt = (1 << rxq->elts_n) - 1;
	const uint16_t sges_n = rxq->sges_n;
	struct rte_mbuf *pkt = NULL;
	struct rte_mbuf *seg = NULL;
	unsigned int i = 0;
	uint32_t rq_ci = rxq->rq_ci << sges_n;
	int len = 0;

	while (pkts_n) {
		volatile struct mlx4_cqe *cqe;
		uint32_t idx = rq_ci & wr_cnt;
		struct rte_mbuf *rep = (*rxq->elts)[idx];
		volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx];

		/* Update the 'next' pointer of the previous segment. */
		if (pkt)
			seg->next = rep;
		seg = rep;
		rte_prefetch0(seg);
		rte_prefetch0(scat);
		rep = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			++rxq->stats.rx_nombuf;
			if (!pkt) {
				/*
				 * No buffers before we even started,
				 * bail out silently.
				 */
				break;
			}
			while (pkt != seg) {
				assert(pkt != (*rxq->elts)[idx]);
				rep = pkt->next;
				pkt->next = NULL;
				pkt->nb_segs = 1;
				rte_mbuf_raw_free(pkt);
				pkt = rep;
			}
			break;
		}
		if (!pkt) {
			/* Looking for the new packet. */
			len = mlx4_cq_poll_one(rxq, &cqe);
			if (!len) {
				rte_mbuf_raw_free(rep);
				break;
			}
			if (unlikely(len < 0)) {
				/* Rx error, packet is likely too large. */
				rte_mbuf_raw_free(rep);
				++rxq->stats.idropped;
				goto skip;
			}
			pkt = seg;
			assert(len >= (rxq->crc_present << 2));
			/* Update packet information. */
			pkt->packet_type =
				rxq_cq_to_pkt_type(cqe, rxq->l2tun_offload);
			pkt->ol_flags = PKT_RX_RSS_HASH;
			pkt->hash.rss = cqe->immed_rss_invalid;
			if (rxq->crc_present)
				len -= RTE_ETHER_CRC_LEN;
			pkt->pkt_len = len;
			if (rxq->csum | rxq->csum_l2tun) {
				uint32_t flags =
					mlx4_cqe_flags(cqe,
						       rxq->csum,
						       rxq->csum_l2tun);

				pkt->ol_flags =
					rxq_cq_to_ol_flags(flags,
							   rxq->csum,
							   rxq->csum_l2tun);
			}
		}
		rep->nb_segs = 1;
		rep->port = rxq->port_id;
		rep->data_len = seg->data_len;
		rep->data_off = seg->data_off;
		(*rxq->elts)[idx] = rep;
		/*
		 * Fill NIC descriptor with the new buffer. The lkey and size
		 * of the buffers are already known, only the buffer address
		 * changes.
		 */
		scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
		/* If there's only one MR, no need to replace LKey in WQE. */
		if (unlikely(mlx4_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
			scat->lkey = mlx4_rx_mb2mr(rxq, rep);
		if (len > seg->data_len) {
			len -= seg->data_len;
			++pkt->nb_segs;
			++rq_ci;
			continue;
		}
		/* The last segment. */
		seg->data_len = len;
		/* Increment bytes counter. */
		rxq->stats.ibytes += pkt->pkt_len;
		/* Return packet. */
		*(pkts++) = pkt;
		pkt = NULL;
		--pkts_n;
		++i;
skip:
		/* Align consumer index to the next stride. */
		rq_ci >>= sges_n;
		++rq_ci;
		rq_ci <<= sges_n;
	}
	if (unlikely(i == 0 && (rq_ci >> sges_n) == rxq->rq_ci))
		return 0;
	/* Update the consumer index. */
	rxq->rq_ci = rq_ci >> sges_n;
	rte_wmb();
	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
	*rxq->mcq.set_ci_db =
		rte_cpu_to_be_32(rxq->mcq.cons_index & MLX4_CQ_DB_CI_MASK);
	/* Increment packets counter. */
	rxq->stats.ipackets += i;
	return i;
}
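
/*
 * Note on the consumer index arithmetic in mlx4_rx_burst() (illustrative
 * figures): every Rx WQE provides 2^sges_n scatter entries, so rq_ci is
 * kept shifted left by sges_n while individual entries are walked. With
 * sges_n == 2, a packet spanning three entries advances rq_ci by 3 during
 * reception, and the "align to the next stride" step at the skip label
 * then moves it to the start of the next 4-entry stride before the next
 * WQE is consumed. The doorbell value written back is the unshifted WQE
 * index, rq_ci >> sges_n.
 */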

/**
 * Dummy DPDK callback for Tx.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to Tx queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	rte_mb();
	return 0;
}

/**
 * Dummy DPDK callback for Rx.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to Rx queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	rte_mb();
	return 0;
}