1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2021 6WIND S.A. 3 * Copyright 2021 Mellanox Technologies, Ltd 4 */ 5 6 #include <stdint.h> 7 #include <string.h> 8 #include <stdlib.h> 9 10 #include <rte_mbuf.h> 11 #include <rte_mempool.h> 12 #include <rte_prefetch.h> 13 #include <rte_common.h> 14 #include <rte_branch_prediction.h> 15 #include <rte_ether.h> 16 #include <rte_cycles.h> 17 #include <rte_flow.h> 18 19 #include <mlx5_prm.h> 20 #include <mlx5_common.h> 21 #include <mlx5_common_mr.h> 22 #include <rte_pmd_mlx5.h> 23 24 #include "mlx5_autoconf.h" 25 #include "mlx5_defs.h" 26 #include "mlx5.h" 27 #include "mlx5_utils.h" 28 #include "mlx5_rxtx.h" 29 #include "mlx5_devx.h" 30 #include "mlx5_rx.h" 31 #ifdef HAVE_MLX5_MSTFLINT 32 #include <mstflint/mtcr.h> 33 #endif 34 35 36 static __rte_always_inline uint32_t 37 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 38 volatile struct mlx5_mini_cqe8 *mcqe); 39 40 static __rte_always_inline int 41 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 42 uint16_t cqe_n, uint16_t cqe_mask, 43 volatile struct mlx5_mini_cqe8 **mcqe, 44 uint16_t *skip_cnt, bool mprq); 45 46 static __rte_always_inline uint32_t 47 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); 48 49 static __rte_always_inline void 50 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 51 volatile struct mlx5_cqe *cqe, 52 volatile struct mlx5_mini_cqe8 *mcqe); 53 54 static inline void 55 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp, 56 volatile struct mlx5_cqe *__rte_restrict cqe, 57 uint32_t phcsum, uint8_t l4_type); 58 59 static inline void 60 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd, 61 volatile struct mlx5_cqe *__rte_restrict cqe, 62 volatile struct mlx5_mini_cqe8 *mcqe, 63 struct mlx5_rxq_data *rxq, uint32_t len); 64 65 66 /** 67 * Internal function to compute the number of used descriptors in an RX queue. 68 * 69 * @param rxq 70 * The Rx queue. 71 * 72 * @return 73 * The number of used Rx descriptor. 74 */ 75 static uint32_t 76 rx_queue_count(struct mlx5_rxq_data *rxq) 77 { 78 struct rxq_zip *zip = &rxq->zip; 79 volatile struct mlx5_cqe *cqe; 80 const unsigned int cqe_n = (1 << rxq->cqe_n); 81 const unsigned int sges_n = (1 << rxq->sges_n); 82 const unsigned int elts_n = (1 << rxq->elts_n); 83 const unsigned int strd_n = RTE_BIT32(rxq->log_strd_num); 84 const unsigned int cqe_cnt = cqe_n - 1; 85 unsigned int cq_ci, used; 86 87 /* if we are processing a compressed cqe */ 88 if (zip->ai) { 89 used = zip->cqe_cnt - zip->ai; 90 cq_ci = zip->cq_ci; 91 } else { 92 used = 0; 93 cq_ci = rxq->cq_ci; 94 } 95 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 96 while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) { 97 int8_t op_own; 98 unsigned int n; 99 100 op_own = cqe->op_own; 101 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) 102 n = rte_be_to_cpu_32(cqe->byte_cnt); 103 else 104 n = 1; 105 cq_ci += n; 106 used += n; 107 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 108 } 109 used = RTE_MIN(used * sges_n, elts_n * strd_n); 110 return used; 111 } 112 113 /** 114 * DPDK callback to check the status of a Rx descriptor. 115 * 116 * @param rx_queue 117 * The Rx queue. 118 * @param[in] offset 119 * The index of the descriptor in the ring. 120 * 121 * @return 122 * The status of the Rx descriptor. 
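 *
 * This is the handler behind rte_eth_rx_descriptor_status() for mlx5
 * Rx queues.  A minimal application-side sketch (port_id, queue_id and
 * the offset value are illustrative, not taken from this file):
 *
 *   int st = rte_eth_rx_descriptor_status(port_id, queue_id, 0);
 *
 *   if (st == RTE_ETH_RX_DESC_DONE)
 *           handle_pending_packet();   // hypothetical application hook
 *   else if (st == RTE_ETH_RX_DESC_AVAIL)
 *           wait_some_more();          // descriptor not yet filled by HW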
 */
int
mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
{
	struct mlx5_rxq_data *rxq = rx_queue;

	if (offset >= (1 << rxq->cqe_n)) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	if (offset < rx_queue_count(rxq))
		return RTE_ETH_RX_DESC_DONE;
	return RTE_ETH_RX_DESC_AVAIL;
}

/* Convert the rxq LWM from a WQE count to a percentage of the queue size. */
static uint8_t
mlx5_rxq_lwm_to_percentage(struct mlx5_rxq_priv *rxq)
{
	struct mlx5_rxq_data *rxq_data = &rxq->ctrl->rxq;
	uint32_t wqe_cnt = 1 << (rxq_data->elts_n - rxq_data->sges_n);

	return rxq->lwm * 100 / wqe_cnt;
}

/**
 * DPDK callback to get the RX queue information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param rx_queue_id
 *   Rx queue identifier.
 *
 * @param qinfo
 *   Pointer to the RX queue information structure.
 *
 * @return
 *   None.
 */
void
mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
		  struct rte_eth_rxq_info *qinfo)
{
	struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_ctrl_get(dev, rx_queue_id);
	struct mlx5_rxq_data *rxq = mlx5_rxq_data_get(dev, rx_queue_id);
	struct mlx5_rxq_priv *rxq_priv = mlx5_rxq_get(dev, rx_queue_id);

	if (!rxq)
		return;
	qinfo->mp = mlx5_rxq_mprq_enabled(rxq) ?
					rxq->mprq_mp : rxq->mp;
	qinfo->conf.rx_thresh.pthresh = 0;
	qinfo->conf.rx_thresh.hthresh = 0;
	qinfo->conf.rx_thresh.wthresh = 0;
	qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
	qinfo->conf.rx_drop_en = 1;
	if (rxq_ctrl == NULL || rxq_ctrl->obj == NULL)
		qinfo->conf.rx_deferred_start = 0;
	else
		qinfo->conf.rx_deferred_start = 1;
	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
	qinfo->scattered_rx = dev->data->scattered_rx;
	qinfo->nb_desc = mlx5_rxq_mprq_enabled(rxq) ?
		RTE_BIT32(rxq->elts_n) * RTE_BIT32(rxq->log_strd_num) :
		RTE_BIT32(rxq->elts_n);
	qinfo->avail_thresh = rxq_priv ?
		mlx5_rxq_lwm_to_percentage(rxq_priv) : 0;
}

/**
 * DPDK callback to get the RX packet burst mode information.
 *
 * @param dev
 *   Pointer to the device structure.
 *
 * @param rx_queue_id
 *   Rx queue identifier.
 *
 * @param mode
 *   Pointer to the burst mode information.
 *
 * @return
 *   0 as success, -EINVAL as failure.
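 *
 * Applications reach this handler through rte_eth_rx_burst_mode_get();
 * a minimal sketch with illustrative port/queue ids:
 *
 *   struct rte_eth_burst_mode mode;
 *
 *   if (rte_eth_rx_burst_mode_get(port_id, queue_id, &mode) == 0)
 *           printf("Rx burst mode: %s\n", mode.info);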
208 */ 209 int 210 mlx5_rx_burst_mode_get(struct rte_eth_dev *dev, 211 uint16_t rx_queue_id __rte_unused, 212 struct rte_eth_burst_mode *mode) 213 { 214 eth_rx_burst_t pkt_burst = dev->rx_pkt_burst; 215 struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, rx_queue_id); 216 217 if (!rxq) { 218 rte_errno = EINVAL; 219 return -rte_errno; 220 } 221 if (pkt_burst == mlx5_rx_burst) { 222 snprintf(mode->info, sizeof(mode->info), "%s", "Scalar"); 223 } else if (pkt_burst == mlx5_rx_burst_mprq) { 224 snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ"); 225 } else if (pkt_burst == mlx5_rx_burst_vec) { 226 #if defined RTE_ARCH_X86_64 227 snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE"); 228 #elif defined RTE_ARCH_ARM64 229 snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon"); 230 #elif defined RTE_ARCH_PPC_64 231 snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec"); 232 #else 233 return -EINVAL; 234 #endif 235 } else if (pkt_burst == mlx5_rx_burst_mprq_vec) { 236 #if defined RTE_ARCH_X86_64 237 snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector SSE"); 238 #elif defined RTE_ARCH_ARM64 239 snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon"); 240 #elif defined RTE_ARCH_PPC_64 241 snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec"); 242 #else 243 return -EINVAL; 244 #endif 245 } else { 246 return -EINVAL; 247 } 248 return 0; 249 } 250 251 /** 252 * DPDK callback to get the number of used descriptors in a RX queue. 253 * 254 * @param rx_queue 255 * The Rx queue pointer. 256 * 257 * @return 258 * The number of used rx descriptor. 259 * -EINVAL if the queue is invalid 260 */ 261 uint32_t 262 mlx5_rx_queue_count(void *rx_queue) 263 { 264 struct mlx5_rxq_data *rxq = rx_queue; 265 struct rte_eth_dev *dev; 266 267 if (!rxq) { 268 rte_errno = EINVAL; 269 return -rte_errno; 270 } 271 272 dev = &rte_eth_devices[rxq->port_id]; 273 274 if (dev->rx_pkt_burst == NULL || 275 dev->rx_pkt_burst == rte_eth_pkt_burst_dummy) { 276 rte_errno = ENOTSUP; 277 return -rte_errno; 278 } 279 280 return rx_queue_count(rxq); 281 } 282 283 #define CLB_VAL_IDX 0 284 #define CLB_MSK_IDX 1 285 static int 286 mlx5_monitor_callback(const uint64_t value, 287 const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]) 288 { 289 const uint64_t m = opaque[CLB_MSK_IDX]; 290 const uint64_t v = opaque[CLB_VAL_IDX]; 291 292 return (value & m) == v ? -1 : 0; 293 } 294 295 int mlx5_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) 296 { 297 struct mlx5_rxq_data *rxq = rx_queue; 298 const unsigned int cqe_num = 1 << rxq->cqe_n; 299 const unsigned int cqe_mask = cqe_num - 1; 300 const uint16_t idx = rxq->cq_ci & cqe_num; 301 const uint8_t vic = rxq->cq_ci >> rxq->cqe_n; 302 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask]; 303 304 if (unlikely(rxq->cqes == NULL)) { 305 rte_errno = EINVAL; 306 return -rte_errno; 307 } 308 if (rxq->cqe_comp_layout) { 309 pmc->addr = &cqe->validity_iteration_count; 310 pmc->opaque[CLB_VAL_IDX] = vic; 311 pmc->opaque[CLB_MSK_IDX] = MLX5_CQE_VIC_INIT; 312 } else { 313 pmc->addr = &cqe->op_own; 314 pmc->opaque[CLB_VAL_IDX] = !!idx; 315 pmc->opaque[CLB_MSK_IDX] = MLX5_CQE_OWNER_MASK; 316 } 317 pmc->fn = mlx5_monitor_callback; 318 pmc->size = sizeof(uint8_t); 319 return 0; 320 } 321 322 /** 323 * Translate RX completion flags to packet type. 324 * 325 * @param[in] rxq 326 * Pointer to RX queue structure. 327 * @param[in] cqe 328 * Pointer to CQE. 
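 * @param[in] mcqe
 *   Pointer to the mini-CQE when the completion is compressed, NULL
 *   otherwise.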
329 * 330 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 331 * 332 * @return 333 * Packet type for struct rte_mbuf. 334 */ 335 static inline uint32_t 336 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 337 volatile struct mlx5_mini_cqe8 *mcqe) 338 { 339 uint8_t idx; 340 uint8_t ptype; 341 uint8_t pinfo = (cqe->pkt_info & 0x3) << 6; 342 343 /* Get l3/l4 header from mini-CQE in case L3/L4 format*/ 344 if (mcqe == NULL || 345 rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX) 346 ptype = (cqe->hdr_type_etc & 0xfc00) >> 10; 347 else 348 ptype = mcqe->hdr_type >> 2; 349 /* 350 * The index to the array should have: 351 * bit[1:0] = l3_hdr_type 352 * bit[4:2] = l4_hdr_type 353 * bit[5] = ip_frag 354 * bit[6] = tunneled 355 * bit[7] = outer_l3_type 356 */ 357 idx = pinfo | ptype; 358 return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); 359 } 360 361 /** 362 * Initialize Rx WQ and indexes. 363 * 364 * @param[in] rxq 365 * Pointer to RX queue structure. 366 */ 367 void 368 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) 369 { 370 const unsigned int wqe_n = 1 << rxq->elts_n; 371 unsigned int i; 372 373 for (i = 0; (i != wqe_n); ++i) { 374 volatile struct mlx5_wqe_data_seg *scat; 375 uintptr_t addr; 376 uint32_t byte_count; 377 uint32_t lkey; 378 379 if (mlx5_rxq_mprq_enabled(rxq)) { 380 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i]; 381 382 scat = &((volatile struct mlx5_wqe_mprq *) 383 rxq->wqes)[i].dseg; 384 addr = (uintptr_t)mlx5_mprq_buf_addr 385 (buf, RTE_BIT32(rxq->log_strd_num)); 386 byte_count = RTE_BIT32(rxq->log_strd_sz) * 387 RTE_BIT32(rxq->log_strd_num); 388 lkey = mlx5_rx_addr2mr(rxq, addr); 389 } else { 390 struct rte_mbuf *buf = (*rxq->elts)[i]; 391 392 scat = &((volatile struct mlx5_wqe_data_seg *) 393 rxq->wqes)[i]; 394 addr = rte_pktmbuf_mtod(buf, uintptr_t); 395 byte_count = DATA_LEN(buf); 396 lkey = mlx5_rx_mb2mr(rxq, buf); 397 } 398 /* scat->addr must be able to store a pointer. */ 399 MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t)); 400 *scat = (struct mlx5_wqe_data_seg){ 401 .addr = rte_cpu_to_be_64(addr), 402 .byte_count = rte_cpu_to_be_32(byte_count), 403 .lkey = lkey, 404 }; 405 } 406 rxq->consumed_strd = 0; 407 rxq->decompressed = 0; 408 rxq->rq_pi = 0; 409 rxq->zip = (struct rxq_zip){ 410 .ai = 0, 411 }; 412 rxq->elts_ci = mlx5_rxq_mprq_enabled(rxq) ? 413 (wqe_n >> rxq->sges_n) * RTE_BIT32(rxq->log_strd_num) : 0; 414 /* Update doorbell counter. */ 415 rxq->rq_ci = wqe_n >> rxq->sges_n; 416 rte_io_wmb(); 417 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 418 } 419 420 #define MLX5_ERROR_CQE_MASK 0x40000000 421 /* Must be negative. */ 422 #define MLX5_REGULAR_ERROR_CQE_RET (-5) 423 #define MLX5_CRITICAL_ERROR_CQE_RET (-4) 424 /* Must not be negative. */ 425 #define MLX5_RECOVERY_ERROR_RET 0 426 #define MLX5_RECOVERY_IGNORE_RET 1 427 #define MLX5_RECOVERY_COMPLETED_RET 2 428 429 /** 430 * Handle a Rx error. 431 * The function inserts the RQ state to reset when the first error CQE is 432 * shown, then drains the CQ by the caller function loop. When the CQ is empty, 433 * it moves the RQ state to ready and initializes the RQ. 434 * Next CQE identification and error counting are in the caller responsibility. 435 * 436 * @param[in] rxq 437 * Pointer to RX queue structure. 438 * @param[in] vec 439 * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ. 440 * 0 when called from non-vectorized Rx burst. 441 * @param[in] err_n 442 * Number of CQEs to check for an error. 
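 * @param[out] skip_cnt
 *   Number of CQEs or strides the caller should skip; it is only cleared
 *   here when a non-critical error syndrome is going to be ignored.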
443 * 444 * @return 445 * MLX5_RECOVERY_ERROR_RET in case of recovery error, 446 * MLX5_RECOVERY_IGNORE_RET in case of non-critical error syndrome, 447 * MLX5_RECOVERY_COMPLETED_RET in case of recovery is completed, 448 * otherwise the CQE status after ignored error syndrome or queue reset. 449 */ 450 int 451 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec, 452 uint16_t err_n, uint16_t *skip_cnt) 453 { 454 const uint16_t cqe_n = 1 << rxq->cqe_n; 455 const uint16_t cqe_mask = cqe_n - 1; 456 const uint16_t wqe_n = 1 << rxq->elts_n; 457 const uint16_t strd_n = RTE_BIT32(rxq->log_strd_num); 458 struct mlx5_rxq_ctrl *rxq_ctrl = 459 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 460 union { 461 volatile struct mlx5_cqe *cqe; 462 volatile struct mlx5_err_cqe *err_cqe; 463 } u = { 464 .cqe = &(*rxq->cqes)[(rxq->cq_ci - vec) & cqe_mask], 465 }; 466 struct mlx5_mp_arg_queue_state_modify sm; 467 bool critical_syndrome = false; 468 int ret, i; 469 470 switch (rxq->err_state) { 471 case MLX5_RXQ_ERR_STATE_IGNORE: 472 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci - vec); 473 if (ret != MLX5_CQE_STATUS_ERR) { 474 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 475 return ret; 476 } 477 /* Fall-through */ 478 case MLX5_RXQ_ERR_STATE_NO_ERROR: 479 for (i = 0; i < (int)err_n; i++) { 480 u.cqe = &(*rxq->cqes)[(rxq->cq_ci - vec - i) & cqe_mask]; 481 if (MLX5_CQE_OPCODE(u.cqe->op_own) == MLX5_CQE_RESP_ERR) { 482 if (u.err_cqe->syndrome == MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR || 483 u.err_cqe->syndrome == MLX5_CQE_SYNDROME_LOCAL_PROT_ERR || 484 u.err_cqe->syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR) 485 critical_syndrome = true; 486 break; 487 } 488 } 489 if (!critical_syndrome) { 490 if (rxq->err_state == MLX5_RXQ_ERR_STATE_NO_ERROR) { 491 *skip_cnt = 0; 492 if (i == err_n) 493 rxq->err_state = MLX5_RXQ_ERR_STATE_IGNORE; 494 } 495 return MLX5_RECOVERY_IGNORE_RET; 496 } 497 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET; 498 /* Fall-through */ 499 case MLX5_RXQ_ERR_STATE_NEED_RESET: 500 sm.is_wq = 1; 501 sm.queue_id = rxq->idx; 502 sm.state = IBV_WQS_RESET; 503 if (mlx5_queue_state_modify(RXQ_DEV(rxq_ctrl), &sm)) 504 return MLX5_RECOVERY_ERROR_RET; 505 if (rxq_ctrl->dump_file_n < 506 RXQ_PORT(rxq_ctrl)->config.max_dump_files_num) { 507 MKSTR(err_str, "Unexpected CQE error syndrome " 508 "0x%02x CQN = %u RQN = %u wqe_counter = %u" 509 " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome, 510 rxq->cqn, rxq_ctrl->wqn, 511 rte_be_to_cpu_16(u.err_cqe->wqe_counter), 512 rxq->rq_ci << rxq->sges_n, rxq->cq_ci); 513 MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u", 514 rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc()); 515 mlx5_dump_debug_information(name, NULL, err_str, 0); 516 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 517 (const void *)((uintptr_t) 518 rxq->cqes), 519 sizeof(*u.cqe) * cqe_n); 520 mlx5_dump_debug_information(name, "MLX5 Error RQ:", 521 (const void *)((uintptr_t) 522 rxq->wqes), 523 16 * wqe_n); 524 rxq_ctrl->dump_file_n++; 525 } 526 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY; 527 /* Fall-through */ 528 case MLX5_RXQ_ERR_STATE_NEED_READY: 529 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); 530 if (ret == MLX5_CQE_STATUS_HW_OWN) { 531 rte_io_wmb(); 532 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 533 rte_io_wmb(); 534 /* 535 * The RQ consumer index must be zeroed while moving 536 * from RESET state to RDY state. 
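			 * The doorbell record is zeroed before the
			 * RESET->RDY transition is requested below;
			 * mlx5_rxq_initialize() re-posts the WQEs and
			 * restores the doorbell value afterwards.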
537 */ 538 *rxq->rq_db = rte_cpu_to_be_32(0); 539 rte_io_wmb(); 540 sm.is_wq = 1; 541 sm.queue_id = rxq->idx; 542 sm.state = IBV_WQS_RDY; 543 if (mlx5_queue_state_modify(RXQ_DEV(rxq_ctrl), &sm)) 544 return MLX5_RECOVERY_ERROR_RET; 545 if (vec) { 546 const uint32_t elts_n = 547 mlx5_rxq_mprq_enabled(rxq) ? 548 wqe_n * strd_n : wqe_n; 549 const uint32_t e_mask = elts_n - 1; 550 uint32_t elts_ci = 551 mlx5_rxq_mprq_enabled(rxq) ? 552 rxq->elts_ci : rxq->rq_ci; 553 uint32_t elt_idx; 554 struct rte_mbuf **elt; 555 unsigned int n = elts_n - (elts_ci - 556 rxq->rq_pi); 557 558 for (i = 0; i < (int)n; ++i) { 559 elt_idx = (elts_ci + i) & e_mask; 560 elt = &(*rxq->elts)[elt_idx]; 561 *elt = rte_mbuf_raw_alloc(rxq->mp); 562 if (!*elt) { 563 for (i--; i >= 0; --i) { 564 elt_idx = (elts_ci + 565 i) & elts_n; 566 elt = &(*rxq->elts) 567 [elt_idx]; 568 rte_pktmbuf_free_seg 569 (*elt); 570 } 571 return MLX5_RECOVERY_ERROR_RET; 572 } 573 } 574 for (i = 0; i < (int)elts_n; ++i) { 575 elt = &(*rxq->elts)[i]; 576 DATA_LEN(*elt) = 577 (uint16_t)((*elt)->buf_len - 578 rte_pktmbuf_headroom(*elt)); 579 } 580 /* Padding with a fake mbuf for vec Rx. */ 581 for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) 582 (*rxq->elts)[elts_n + i] = 583 &rxq->fake_mbuf; 584 } 585 mlx5_rxq_initialize(rxq); 586 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 587 return MLX5_RECOVERY_COMPLETED_RET; 588 } 589 return ret; 590 default: 591 return MLX5_RECOVERY_ERROR_RET; 592 } 593 } 594 595 /** 596 * Get size of the next packet for a given CQE. For compressed CQEs, the 597 * consumer index is updated only once all packets of the current one have 598 * been processed. 599 * 600 * @param rxq 601 * Pointer to RX queue. 602 * @param cqe 603 * CQE to process. 604 * @param cqe_n 605 * Completion queue count. 606 * @param cqe_mask 607 * Completion queue mask. 608 * @param[out] mcqe 609 * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not 610 * written. 611 * @param[out] skip_cnt 612 * Number of packets skipped due to recoverable errors. 613 * @param mprq 614 * Indication if it is called from MPRQ. 615 * @return 616 * 0 in case of empty CQE, MLX5_REGULAR_ERROR_CQE_RET in case of error CQE, 617 * MLX5_CRITICAL_ERROR_CQE_RET in case of error CQE lead to Rx queue reset, 618 * otherwise the packet size in regular RxQ, 619 * and striding byte count format in mprq case. 620 */ 621 static inline int 622 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 623 uint16_t cqe_n, uint16_t cqe_mask, 624 volatile struct mlx5_mini_cqe8 **mcqe, 625 uint16_t *skip_cnt, bool mprq) 626 { 627 struct rxq_zip *zip = &rxq->zip; 628 int len = 0, ret = 0; 629 uint32_t idx, end; 630 631 do { 632 len = 0; 633 /* Process compressed data in the CQE and mini arrays. */ 634 if (zip->ai) { 635 volatile struct mlx5_mini_cqe8 (*mc)[8] = 636 (volatile struct mlx5_mini_cqe8 (*)[8]) 637 (uintptr_t)(&(*rxq->cqes)[zip->ca & 638 cqe_mask].pkt_info); 639 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt & 640 rxq->byte_mask); 641 *mcqe = &(*mc)[zip->ai & 7]; 642 if (rxq->cqe_comp_layout) { 643 zip->ai++; 644 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 645 rxq->cq_ci = zip->cq_ci; 646 zip->ai = 0; 647 } 648 } else { 649 if ((++zip->ai & 7) == 0) { 650 /* Invalidate consumed CQEs */ 651 idx = zip->ca; 652 end = zip->na; 653 while (idx != end) { 654 (*rxq->cqes)[idx & cqe_mask].op_own = 655 MLX5_CQE_INVALIDATE; 656 ++idx; 657 } 658 /* 659 * Increment consumer index to skip the number 660 * of CQEs consumed. 
Hardware leaves holes in 661 * the CQ ring for software use. 662 */ 663 zip->ca = zip->na; 664 zip->na += 8; 665 } 666 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 667 /* Invalidate the rest */ 668 idx = zip->ca; 669 end = zip->cq_ci; 670 671 while (idx != end) { 672 (*rxq->cqes)[idx & cqe_mask].op_own = 673 MLX5_CQE_INVALIDATE; 674 ++idx; 675 } 676 rxq->cq_ci = zip->cq_ci; 677 zip->ai = 0; 678 } 679 } 680 /* 681 * No compressed data, get next CQE and verify if it is 682 * compressed. 683 */ 684 } else { 685 int8_t op_own; 686 uint32_t cq_ci; 687 688 ret = (rxq->cqe_comp_layout) ? 689 check_cqe_iteration(cqe, rxq->cqe_n, rxq->cq_ci) : 690 check_cqe(cqe, cqe_n, rxq->cq_ci); 691 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 692 if (unlikely(ret == MLX5_CQE_STATUS_ERR || 693 rxq->err_state)) { 694 ret = mlx5_rx_err_handle(rxq, 0, 1, skip_cnt); 695 if (ret == MLX5_CQE_STATUS_HW_OWN) 696 return MLX5_ERROR_CQE_MASK; 697 if (ret == MLX5_RECOVERY_ERROR_RET || 698 ret == MLX5_RECOVERY_COMPLETED_RET) 699 return MLX5_CRITICAL_ERROR_CQE_RET; 700 } else { 701 return 0; 702 } 703 } 704 /* 705 * Introduce the local variable to have queue cq_ci 706 * index in queue structure always consistent with 707 * actual CQE boundary (not pointing to the middle 708 * of compressed CQE session). 709 */ 710 cq_ci = rxq->cq_ci + !rxq->cqe_comp_layout; 711 op_own = cqe->op_own; 712 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 713 volatile struct mlx5_mini_cqe8 (*mc)[8] = 714 (volatile struct mlx5_mini_cqe8 (*)[8]) 715 (uintptr_t)(&(*rxq->cqes) 716 [cq_ci & cqe_mask].pkt_info); 717 718 /* Fix endianness. */ 719 zip->cqe_cnt = rxq->cqe_comp_layout ? 720 (MLX5_CQE_NUM_MINIS(op_own) + 1U) : 721 rte_be_to_cpu_32(cqe->byte_cnt); 722 /* 723 * Current mini array position is the one 724 * returned by check_cqe64(). 725 * 726 * If completion comprises several mini arrays, 727 * as a special case the second one is located 728 * 7 CQEs after the initial CQE instead of 8 729 * for subsequent ones. 730 */ 731 zip->ca = cq_ci; 732 zip->na = zip->ca + 7; 733 /* Compute the next non compressed CQE. */ 734 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 735 /* Get packet size to return. */ 736 len = rte_be_to_cpu_32((*mc)[0].byte_cnt & 737 rxq->byte_mask); 738 *mcqe = &(*mc)[0]; 739 if (rxq->cqe_comp_layout) { 740 if (MLX5_CQE_NUM_MINIS(op_own)) 741 zip->ai = 1; 742 else 743 rxq->cq_ci = zip->cq_ci; 744 } else { 745 zip->ai = 1; 746 /* Prefetch all to be invalidated */ 747 idx = zip->ca; 748 end = zip->cq_ci; 749 while (idx != end) { 750 rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_mask]); 751 ++idx; 752 } 753 } 754 } else { 755 ++rxq->cq_ci; 756 len = rte_be_to_cpu_32(cqe->byte_cnt); 757 if (rxq->cqe_comp_layout) { 758 volatile struct mlx5_cqe *next; 759 760 next = &(*rxq->cqes)[rxq->cq_ci & cqe_mask]; 761 ret = check_cqe_iteration(next, rxq->cqe_n, rxq->cq_ci); 762 if (ret != MLX5_CQE_STATUS_SW_OWN || 763 MLX5_CQE_FORMAT(next->op_own) == MLX5_COMPRESSED) 764 rte_memcpy(&rxq->title_cqe, 765 (const void *)(uintptr_t)cqe, 766 sizeof(struct mlx5_cqe)); 767 } 768 } 769 } 770 if (unlikely(rxq->err_state)) { 771 if (rxq->err_state == MLX5_RXQ_ERR_STATE_IGNORE && 772 ret == MLX5_CQE_STATUS_SW_OWN) { 773 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 774 return len & MLX5_ERROR_CQE_MASK; 775 } 776 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask]; 777 ++rxq->stats.idropped; 778 (*skip_cnt) += mprq ? 
(len & MLX5_MPRQ_STRIDE_NUM_MASK) >> 779 MLX5_MPRQ_STRIDE_NUM_SHIFT : 1; 780 } else { 781 return len; 782 } 783 } while (1); 784 } 785 786 /** 787 * Translate RX completion flags to offload flags. 788 * 789 * @param[in] cqe 790 * Pointer to CQE. 791 * 792 * @return 793 * Offload flags (ol_flags) for struct rte_mbuf. 794 */ 795 static inline uint32_t 796 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) 797 { 798 uint32_t ol_flags = 0; 799 uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); 800 801 ol_flags = 802 TRANSPOSE(flags, 803 MLX5_CQE_RX_L3_HDR_VALID, 804 RTE_MBUF_F_RX_IP_CKSUM_GOOD) | 805 TRANSPOSE(flags, 806 MLX5_CQE_RX_L4_HDR_VALID, 807 RTE_MBUF_F_RX_L4_CKSUM_GOOD); 808 return ol_flags; 809 } 810 811 /** 812 * Fill in mbuf fields from RX completion flags. 813 * Note that pkt->ol_flags should be initialized outside of this function. 814 * 815 * @param rxq 816 * Pointer to RX queue. 817 * @param pkt 818 * mbuf to fill. 819 * @param cqe 820 * CQE to process. 821 * @param rss_hash_res 822 * Packet RSS Hash result. 823 */ 824 static inline void 825 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 826 volatile struct mlx5_cqe *cqe, 827 volatile struct mlx5_mini_cqe8 *mcqe) 828 { 829 /* Update packet information. */ 830 pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe, mcqe); 831 pkt->port = unlikely(rxq->shared) ? cqe->user_index_low : rxq->port_id; 832 833 if (rxq->rss_hash) { 834 uint32_t rss_hash_res = 0; 835 836 /* If compressed, take hash result from mini-CQE. */ 837 if (mcqe == NULL || 838 rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH) 839 rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); 840 else 841 rss_hash_res = rte_be_to_cpu_32(mcqe->rx_hash_result); 842 if (rss_hash_res) { 843 pkt->hash.rss = rss_hash_res; 844 pkt->ol_flags |= RTE_MBUF_F_RX_RSS_HASH; 845 } 846 } 847 if (rxq->mark) { 848 uint32_t mark = 0; 849 850 /* If compressed, take flow tag from mini-CQE. */ 851 if (mcqe == NULL || 852 rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) 853 mark = cqe->sop_drop_qpn; 854 else 855 mark = ((mcqe->byte_cnt_flow & 0xff) << 8) | 856 (mcqe->flow_tag_high << 16); 857 if (MLX5_FLOW_MARK_IS_VALID(mark)) { 858 pkt->ol_flags |= RTE_MBUF_F_RX_FDIR; 859 if (mark != RTE_BE32(MLX5_FLOW_MARK_DEFAULT)) { 860 pkt->ol_flags |= rxq->mark_flag; 861 pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); 862 } 863 } 864 } 865 if (rxq->dynf_meta) { 866 uint32_t meta = rte_be_to_cpu_32(cqe->flow_table_metadata) & 867 rxq->flow_meta_port_mask; 868 869 if (meta) { 870 pkt->ol_flags |= rxq->flow_meta_mask; 871 *RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset, 872 uint32_t *) = meta; 873 } 874 } 875 if (rxq->csum) 876 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); 877 if (rxq->vlan_strip) { 878 bool vlan_strip; 879 880 if (mcqe == NULL || 881 rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX) 882 vlan_strip = cqe->hdr_type_etc & 883 RTE_BE16(MLX5_CQE_VLAN_STRIPPED); 884 else 885 vlan_strip = mcqe->hdr_type & 886 RTE_BE16(MLX5_CQE_VLAN_STRIPPED); 887 if (vlan_strip) { 888 pkt->ol_flags |= RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED; 889 pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); 890 } 891 } 892 if (rxq->hw_timestamp) { 893 uint64_t ts = rte_be_to_cpu_64(cqe->timestamp); 894 895 if (rxq->rt_timestamp) 896 ts = mlx5_txpp_convert_rx_ts(rxq->sh, ts); 897 mlx5_timestamp_set(pkt, rxq->timestamp_offset, ts); 898 pkt->ol_flags |= rxq->timestamp_rx_flag; 899 } 900 } 901 902 /** 903 * DPDK callback for RX. 904 * 905 * @param dpdk_rxq 906 * Generic pointer to RX queue structure. 
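 *
 * This is the scalar Rx path installed as dev->rx_pkt_burst and driven by
 * rte_eth_rx_burst().  A minimal polling sketch with illustrative names
 * (process_packet() is a hypothetical consumer):
 *
 *   struct rte_mbuf *burst[32];
 *   uint16_t nb = rte_eth_rx_burst(port_id, queue_id, burst, 32);
 *
 *   for (uint16_t k = 0; k < nb; k++)
 *           process_packet(burst[k]);
 *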
907 * @param[out] pkts 908 * Array to store received packets. 909 * @param pkts_n 910 * Maximum number of packets in array. 911 * 912 * @return 913 * Number of packets successfully received (<= pkts_n). 914 */ 915 uint16_t 916 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 917 { 918 struct mlx5_rxq_data *rxq = dpdk_rxq; 919 const uint32_t wqe_n = 1 << rxq->elts_n; 920 const uint32_t wqe_mask = wqe_n - 1; 921 const uint32_t cqe_n = 1 << rxq->cqe_n; 922 const uint32_t cqe_mask = cqe_n - 1; 923 const unsigned int sges_n = rxq->sges_n; 924 struct rte_mbuf *pkt = NULL; 925 struct rte_mbuf *seg = NULL; 926 volatile struct mlx5_cqe *cqe = 927 &(*rxq->cqes)[rxq->cq_ci & cqe_mask]; 928 unsigned int i = 0; 929 unsigned int rq_ci = rxq->rq_ci << sges_n; 930 int len = 0; /* keep its value across iterations. */ 931 932 while (pkts_n) { 933 uint16_t skip_cnt; 934 unsigned int idx = rq_ci & wqe_mask; 935 volatile struct mlx5_wqe_data_seg *wqe = 936 &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; 937 struct rte_mbuf *rep = (*rxq->elts)[idx]; 938 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 939 940 if (pkt) 941 NEXT(seg) = rep; 942 seg = rep; 943 rte_prefetch0(seg); 944 rte_prefetch0(cqe); 945 rte_prefetch0(wqe); 946 /* Allocate the buf from the same pool. */ 947 rep = rte_mbuf_raw_alloc(seg->pool); 948 if (unlikely(rep == NULL)) { 949 ++rxq->stats.rx_nombuf; 950 if (!pkt) { 951 /* 952 * no buffers before we even started, 953 * bail out silently. 954 */ 955 break; 956 } 957 while (pkt != seg) { 958 MLX5_ASSERT(pkt != (*rxq->elts)[idx]); 959 rep = NEXT(pkt); 960 NEXT(pkt) = NULL; 961 NB_SEGS(pkt) = 1; 962 rte_mbuf_raw_free(pkt); 963 pkt = rep; 964 } 965 rq_ci >>= sges_n; 966 ++rq_ci; 967 rq_ci <<= sges_n; 968 break; 969 } 970 if (!pkt) { 971 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask]; 972 len = mlx5_rx_poll_len(rxq, cqe, cqe_n, cqe_mask, &mcqe, &skip_cnt, false); 973 if (unlikely(len & MLX5_ERROR_CQE_MASK)) { 974 if (len == MLX5_CRITICAL_ERROR_CQE_RET) { 975 rte_mbuf_raw_free(rep); 976 rq_ci = rxq->rq_ci << sges_n; 977 break; 978 } 979 rq_ci >>= sges_n; 980 rq_ci += skip_cnt; 981 rq_ci <<= sges_n; 982 idx = rq_ci & wqe_mask; 983 wqe = &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; 984 seg = (*rxq->elts)[idx]; 985 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask]; 986 len = len & ~MLX5_ERROR_CQE_MASK; 987 } 988 if (len == 0) { 989 rte_mbuf_raw_free(rep); 990 break; 991 } 992 pkt = seg; 993 MLX5_ASSERT(len >= (rxq->crc_present << 2)); 994 pkt->ol_flags &= RTE_MBUF_F_EXTERNAL; 995 if (rxq->cqe_comp_layout && mcqe) 996 cqe = &rxq->title_cqe; 997 rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe); 998 if (rxq->crc_present) 999 len -= RTE_ETHER_CRC_LEN; 1000 PKT_LEN(pkt) = len; 1001 if (cqe->lro_num_seg > 1) { 1002 mlx5_lro_update_hdr 1003 (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, 1004 mcqe, rxq, len); 1005 pkt->ol_flags |= RTE_MBUF_F_RX_LRO; 1006 pkt->tso_segsz = len / cqe->lro_num_seg; 1007 } 1008 } 1009 DATA_LEN(rep) = DATA_LEN(seg); 1010 PKT_LEN(rep) = PKT_LEN(seg); 1011 SET_DATA_OFF(rep, DATA_OFF(seg)); 1012 PORT(rep) = PORT(seg); 1013 (*rxq->elts)[idx] = rep; 1014 /* 1015 * Fill NIC descriptor with the new buffer. The lkey and size 1016 * of the buffers are already known, only the buffer address 1017 * changes. 1018 */ 1019 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); 1020 /* If there's only one MR, no need to replace LKey in WQE. 
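		 * Otherwise resolve the LKey through the per-queue MR cache;
		 * mlx5_rx_mb2mr() falls back to the slower shared lookup only
		 * on a cache miss.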
		 */
		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
			wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
		if (len > DATA_LEN(seg)) {
			len -= DATA_LEN(seg);
			++NB_SEGS(pkt);
			++rq_ci;
			continue;
		}
		DATA_LEN(seg) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += PKT_LEN(pkt);
#endif
		/* Return packet. */
		*(pkts++) = pkt;
		pkt = NULL;
		--pkts_n;
		++i;
		/* Align consumer index to the next stride. */
		rq_ci >>= sges_n;
		++rq_ci;
		rq_ci <<= sges_n;
	}
	if (unlikely(i == 0 && ((rq_ci >> sges_n) == rxq->rq_ci)))
		return 0;
	/* Update the consumer index. */
	rxq->rq_ci = rq_ci >> sges_n;
	rte_io_wmb();
	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
	rte_io_wmb();
	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += i;
#endif
	return i;
}

/**
 * Update LRO packet TCP header.
 * The HW LRO feature doesn't update the TCP header after coalescing the
 * TCP segments but supplies information in CQE to fill it by SW.
 *
 * @param tcp
 *   Pointer to the TCP header.
 * @param cqe
 *   Pointer to the completion entry.
 * @param phcsum
 *   The L3 pseudo-header checksum.
 * @param l4_type
 *   The L4 header type extracted from the CQE.
 */
static inline void
mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
			volatile struct mlx5_cqe *__rte_restrict cqe,
			uint32_t phcsum, uint8_t l4_type)
{
	/*
	 * The HW calculates only the TCP payload checksum, need to complete
	 * the TCP header checksum and the L3 pseudo-header checksum.
	 */
	uint32_t csum = phcsum + cqe->csum;

	if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK ||
	    l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) {
		tcp->tcp_flags |= RTE_TCP_ACK_FLAG;
		tcp->recv_ack = cqe->lro_ack_seq_num;
		tcp->rx_win = cqe->lro_tcp_win;
	}
	if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
		tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
	tcp->cksum = 0;
	csum += rte_raw_cksum(tcp, (tcp->data_off >> 4) * 4);
	csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
	csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
	csum = (~csum) & 0xffff;
	if (csum == 0)
		csum = 0xffff;
	tcp->cksum = csum;
}

/**
 * Update LRO packet headers.
 * The HW LRO feature doesn't update the L3/TCP headers after coalescing the
 * TCP segments but supplies information in CQE to fill them by SW.
 *
 * @param padd
 *   The packet address.
 * @param cqe
 *   Pointer to the completion entry.
 * @param mcqe
 *   Pointer to the mini-CQE when the completion is compressed, NULL
 *   otherwise.
 * @param rxq
 *   Pointer to the Rx queue.
 * @param len
 *   The packet length.
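 *
 * The function walks the Ethernet and VLAN headers, rewrites the IPv4/IPv6
 * length and TTL/hop-limit fields from the CQE, and then fixes the TCP
 * header through mlx5_lro_update_tcp_hdr().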
1111 */ 1112 static inline void 1113 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd, 1114 volatile struct mlx5_cqe *__rte_restrict cqe, 1115 volatile struct mlx5_mini_cqe8 *mcqe, 1116 struct mlx5_rxq_data *rxq, uint32_t len) 1117 { 1118 union { 1119 struct rte_ether_hdr *eth; 1120 struct rte_vlan_hdr *vlan; 1121 struct rte_ipv4_hdr *ipv4; 1122 struct rte_ipv6_hdr *ipv6; 1123 struct rte_tcp_hdr *tcp; 1124 uint8_t *hdr; 1125 } h = { 1126 .hdr = padd, 1127 }; 1128 uint16_t proto = h.eth->ether_type; 1129 uint32_t phcsum; 1130 uint8_t l4_type; 1131 1132 h.eth++; 1133 while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) || 1134 proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) { 1135 proto = h.vlan->eth_proto; 1136 h.vlan++; 1137 } 1138 if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) { 1139 h.ipv4->time_to_live = cqe->lro_min_ttl; 1140 h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd)); 1141 h.ipv4->hdr_checksum = 0; 1142 h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4); 1143 phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0); 1144 h.ipv4++; 1145 } else { 1146 h.ipv6->hop_limits = cqe->lro_min_ttl; 1147 h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) - 1148 sizeof(*h.ipv6)); 1149 phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0); 1150 h.ipv6++; 1151 } 1152 if (mcqe == NULL || 1153 rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX) 1154 l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) & 1155 MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT; 1156 else 1157 l4_type = (rte_be_to_cpu_16(mcqe->hdr_type) & 1158 MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT; 1159 mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum, l4_type); 1160 } 1161 1162 void 1163 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) 1164 { 1165 mlx5_mprq_buf_free_cb(NULL, buf); 1166 } 1167 1168 /** 1169 * DPDK callback for RX with Multi-Packet RQ support. 1170 * 1171 * @param dpdk_rxq 1172 * Generic pointer to RX queue structure. 1173 * @param[out] pkts 1174 * Array to store received packets. 1175 * @param pkts_n 1176 * Maximum number of packets in array. 1177 * 1178 * @return 1179 * Number of packets successfully received (<= pkts_n). 1180 */ 1181 uint16_t 1182 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1183 { 1184 struct mlx5_rxq_data *rxq = dpdk_rxq; 1185 const uint32_t strd_n = RTE_BIT32(rxq->log_strd_num); 1186 const uint32_t strd_sz = RTE_BIT32(rxq->log_strd_sz); 1187 const uint32_t cqe_n = 1 << rxq->cqe_n; 1188 const uint32_t cq_mask = cqe_n - 1; 1189 const uint32_t wqe_n = 1 << rxq->elts_n; 1190 const uint32_t wq_mask = wqe_n - 1; 1191 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1192 unsigned int i = 0; 1193 uint32_t rq_ci = rxq->rq_ci; 1194 uint16_t consumed_strd = rxq->consumed_strd; 1195 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1196 1197 while (i < pkts_n) { 1198 struct rte_mbuf *pkt; 1199 int ret; 1200 uint32_t len; 1201 uint16_t strd_cnt; 1202 uint16_t strd_idx; 1203 uint32_t byte_cnt; 1204 uint16_t skip_cnt; 1205 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1206 enum mlx5_rqx_code rxq_code; 1207 1208 if (consumed_strd == strd_n) { 1209 /* Replace WQE if the buffer is still in use. */ 1210 mprq_buf_replace(rxq, rq_ci & wq_mask); 1211 /* Advance to the next WQE. 
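			 * All the strides of the current buffer have been
			 * consumed: restart the stride counter and pick up
			 * the MPRQ buffer attached to the new WQE.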
*/ 1212 consumed_strd = 0; 1213 ++rq_ci; 1214 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1215 } 1216 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1217 ret = mlx5_rx_poll_len(rxq, cqe, cqe_n, cq_mask, &mcqe, &skip_cnt, true); 1218 if (unlikely(ret & MLX5_ERROR_CQE_MASK)) { 1219 if (ret == MLX5_CRITICAL_ERROR_CQE_RET) { 1220 rq_ci = rxq->rq_ci; 1221 consumed_strd = rxq->consumed_strd; 1222 break; 1223 } 1224 consumed_strd += skip_cnt; 1225 while (consumed_strd >= strd_n) { 1226 /* Replace WQE if the buffer is still in use. */ 1227 mprq_buf_replace(rxq, rq_ci & wq_mask); 1228 /* Advance to the next WQE. */ 1229 consumed_strd -= strd_n; 1230 ++rq_ci; 1231 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1232 } 1233 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1234 } 1235 if (ret == 0) 1236 break; 1237 byte_cnt = ret; 1238 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; 1239 MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); 1240 if (rxq->crc_present) 1241 len -= RTE_ETHER_CRC_LEN; 1242 if (mcqe && 1243 rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) 1244 strd_cnt = (len / strd_sz) + !!(len % strd_sz); 1245 else 1246 strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> 1247 MLX5_MPRQ_STRIDE_NUM_SHIFT; 1248 MLX5_ASSERT(strd_cnt); 1249 consumed_strd += strd_cnt; 1250 if (byte_cnt & MLX5_MPRQ_FILLER_MASK) 1251 continue; 1252 if (rxq->cqe_comp_layout && mcqe) 1253 cqe = &rxq->title_cqe; 1254 strd_idx = rte_be_to_cpu_16(mcqe == NULL ? 1255 cqe->wqe_counter : 1256 mcqe->stride_idx); 1257 MLX5_ASSERT(strd_idx < strd_n); 1258 MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & 1259 wq_mask)); 1260 pkt = rte_pktmbuf_alloc(rxq->mp); 1261 if (unlikely(pkt == NULL)) { 1262 ++rxq->stats.rx_nombuf; 1263 break; 1264 } 1265 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; 1266 MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); 1267 if (rxq->crc_present) 1268 len -= RTE_ETHER_CRC_LEN; 1269 rxq_code = mprq_buf_to_pkt(rxq, pkt, len, buf, 1270 strd_idx, strd_cnt); 1271 if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) { 1272 rte_pktmbuf_free_seg(pkt); 1273 if (rxq_code == MLX5_RXQ_CODE_DROPPED) { 1274 ++rxq->stats.idropped; 1275 continue; 1276 } 1277 if (rxq_code == MLX5_RXQ_CODE_NOMBUF) { 1278 ++rxq->stats.rx_nombuf; 1279 break; 1280 } 1281 } 1282 rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe); 1283 if (cqe->lro_num_seg > 1) { 1284 mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *), 1285 cqe, mcqe, rxq, len); 1286 pkt->ol_flags |= RTE_MBUF_F_RX_LRO; 1287 pkt->tso_segsz = len / cqe->lro_num_seg; 1288 } 1289 PKT_LEN(pkt) = len; 1290 PORT(pkt) = rxq->port_id; 1291 #ifdef MLX5_PMD_SOFT_COUNTERS 1292 /* Increment bytes counter. */ 1293 rxq->stats.ibytes += PKT_LEN(pkt); 1294 #endif 1295 /* Return packet. */ 1296 *(pkts++) = pkt; 1297 ++i; 1298 } 1299 /* Update the consumer indexes. */ 1300 rxq->consumed_strd = consumed_strd; 1301 rte_io_wmb(); 1302 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1303 if (rq_ci != rxq->rq_ci) { 1304 rxq->rq_ci = rq_ci; 1305 rte_io_wmb(); 1306 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1307 } 1308 #ifdef MLX5_PMD_SOFT_COUNTERS 1309 /* Increment packets counter. */ 1310 rxq->stats.ipackets += i; 1311 #endif 1312 return i; 1313 } 1314 1315 int 1316 mlx5_rx_queue_lwm_query(struct rte_eth_dev *dev, 1317 uint16_t *queue_id, uint8_t *lwm) 1318 { 1319 struct mlx5_priv *priv = dev->data->dev_private; 1320 unsigned int rxq_id, found = 0, n; 1321 struct mlx5_rxq_priv *rxq; 1322 1323 if (!queue_id) 1324 return -EINVAL; 1325 /* Query all the Rx queues of the port in a circular way. 
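	 * The scan starts from *queue_id and wraps at priv->rxqs_n, so a
	 * pending event on any queue can be reported.  Applications normally
	 * reach this through rte_eth_rx_avail_thresh_query(); a minimal
	 * polling sketch with illustrative names (handle_filled_queue() is a
	 * hypothetical application handler):
	 *
	 *   uint16_t qid = 0;
	 *   uint8_t thresh;
	 *
	 *   while (rte_eth_rx_avail_thresh_query(port_id, &qid, &thresh) > 0)
	 *           handle_filled_queue(port_id, qid);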
	 */
	for (rxq_id = *queue_id, n = 0; n < priv->rxqs_n; n++) {
		rxq = mlx5_rxq_get(dev, rxq_id);
		if (rxq && rxq->lwm_event_pending) {
			pthread_mutex_lock(&priv->sh->lwm_config_lock);
			rxq->lwm_event_pending = 0;
			pthread_mutex_unlock(&priv->sh->lwm_config_lock);
			*queue_id = rxq_id;
			found = 1;
			if (lwm)
				*lwm = mlx5_rxq_lwm_to_percentage(rxq);
			break;
		}
		rxq_id = (rxq_id + 1) % priv->rxqs_n;
	}
	return found;
}

/**
 * RTE interrupt handler for the LWM event.
 * It first retrieves the LWM event context; if an event has arrived, it
 * marks the Rx queue as pending and processes the
 * RTE_ETH_EVENT_RX_AVAIL_THRESH callback.
 *
 * @param args
 *   Generic pointer to mlx5_priv.
 */
void
mlx5_dev_interrupt_handler_lwm(void *args)
{
	struct mlx5_priv *priv = args;
	struct mlx5_rxq_priv *rxq;
	struct rte_eth_dev *dev;
	int ret, rxq_idx = 0, port_id = 0;

	ret = priv->obj_ops.rxq_event_get_lwm(priv, &rxq_idx, &port_id);
	if (unlikely(ret < 0)) {
		DRV_LOG(WARNING, "Cannot get LWM event context.");
		return;
	}
	DRV_LOG(INFO, "%s get LWM event, port_id:%d rxq_id:%d.", __func__,
		port_id, rxq_idx);
	dev = &rte_eth_devices[port_id];
	rxq = mlx5_rxq_get(dev, rxq_idx);
	if (rxq) {
		pthread_mutex_lock(&priv->sh->lwm_config_lock);
		rxq->lwm_event_pending = 1;
		pthread_mutex_unlock(&priv->sh->lwm_config_lock);
	}
	rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_RX_AVAIL_THRESH, NULL);
}

/**
 * DPDK callback to arm an Rx queue LWM (limit watermark) event.
 * When the Rx queue fullness reaches the LWM limit, the driver catches
 * an HW event and invokes the user event callback.
 * After the last event handling, the user needs to call this API again
 * to arm an additional event.
 *
 * @param dev
 *   Pointer to the device structure.
 * @param[in] rx_queue_id
 *   Rx queue identifier.
 * @param[in] lwm
 *   The LWM value, defined as a percentage of the Rx queue size.
 *   [1-99] to set a new LWM (update the old value).
 *   0 to unarm the event.
 *
 * @return
 *   0 : operation success.
 *   Otherwise:
 *     - ENOMEM - not enough memory to create the LWM event channel.
 *     - EINVAL - the input Rxq is not created by DevX.
 *     - E2BIG  - lwm is bigger than 99.
 */
int
mlx5_rx_queue_lwm_set(struct rte_eth_dev *dev, uint16_t rx_queue_id,
		      uint8_t lwm)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	uint16_t port_id = PORT_ID(priv);
	struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, rx_queue_id);
	uint16_t event_nums[1] = {MLX5_EVENT_TYPE_SRQ_LIMIT_REACHED};
	struct mlx5_rxq_data *rxq_data;
	uint32_t wqe_cnt;
	uint64_t cookie;
	int ret = 0;

	if (!rxq) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	rxq_data = &rxq->ctrl->rxq;
	/* Ensure the RQ is created by DevX. */
	if (priv->obj_ops.rxq_obj_new != devx_obj_ops.rxq_obj_new) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	if (lwm > 99) {
		DRV_LOG(WARNING, "Too big LWM configuration.");
		rte_errno = E2BIG;
		return -rte_errno;
	}
	/* Start config LWM. */
	pthread_mutex_lock(&priv->sh->lwm_config_lock);
	if (rxq->lwm == 0 && lwm == 0) {
		/* Both old/new values are 0, do nothing.
*/ 1431 ret = 0; 1432 goto end; 1433 } 1434 wqe_cnt = 1 << (rxq_data->elts_n - rxq_data->sges_n); 1435 if (lwm) { 1436 if (!priv->sh->devx_channel_lwm) { 1437 ret = mlx5_lwm_setup(priv); 1438 if (ret) { 1439 DRV_LOG(WARNING, 1440 "Failed to create shared_lwm."); 1441 rte_errno = ENOMEM; 1442 ret = -rte_errno; 1443 goto end; 1444 } 1445 } 1446 if (!rxq->lwm_devx_subscribed) { 1447 cookie = ((uint32_t) 1448 (port_id << LWM_COOKIE_PORTID_OFFSET)) | 1449 (rx_queue_id << LWM_COOKIE_RXQID_OFFSET); 1450 ret = mlx5_os_devx_subscribe_devx_event 1451 (priv->sh->devx_channel_lwm, 1452 rxq->devx_rq.rq->obj, 1453 sizeof(event_nums), 1454 event_nums, 1455 cookie); 1456 if (ret) { 1457 rte_errno = rte_errno ? rte_errno : EINVAL; 1458 ret = -rte_errno; 1459 goto end; 1460 } 1461 rxq->lwm_devx_subscribed = 1; 1462 } 1463 } 1464 /* Save LWM to rxq and send modify_rq devx command. */ 1465 rxq->lwm = lwm * wqe_cnt / 100; 1466 /* Prevent integer division loss when switch lwm number to percentage. */ 1467 if (lwm && (lwm * wqe_cnt % 100)) { 1468 rxq->lwm = ((uint32_t)(rxq->lwm + 1) >= wqe_cnt) ? 1469 rxq->lwm : (rxq->lwm + 1); 1470 } 1471 if (lwm && !rxq->lwm) { 1472 /* With mprq, wqe_cnt may be < 100. */ 1473 DRV_LOG(WARNING, "Too small LWM configuration."); 1474 rte_errno = EINVAL; 1475 ret = -rte_errno; 1476 goto end; 1477 } 1478 ret = mlx5_devx_modify_rq(rxq, MLX5_RXQ_MOD_RDY2RDY); 1479 end: 1480 pthread_mutex_unlock(&priv->sh->lwm_config_lock); 1481 return ret; 1482 } 1483 1484 /** 1485 * Mlx5 access register function to configure host shaper. 1486 * It calls API in libmtcr_ul to access QSHR(Qos Shaper Host Register) 1487 * in firmware. 1488 * 1489 * @param dev 1490 * Pointer to rte_eth_dev. 1491 * @param lwm_triggered 1492 * Flag to enable/disable lwm_triggered bit in QSHR. 1493 * @param rate 1494 * Host shaper rate, unit is 100Mbps, set to 0 means disable the shaper. 1495 * @return 1496 * 0 : operation success. 1497 * Otherwise: 1498 * - ENOENT - no ibdev interface. 1499 * - EBUSY - the register access unit is busy. 1500 * - EIO - the register access command meets IO error. 1501 */ 1502 static int 1503 mlxreg_host_shaper_config(struct rte_eth_dev *dev, 1504 bool lwm_triggered, uint8_t rate) 1505 { 1506 #ifdef HAVE_MLX5_MSTFLINT 1507 struct mlx5_priv *priv = dev->data->dev_private; 1508 uint32_t data[MLX5_ST_SZ_DW(register_qshr)] = {0}; 1509 int rc, retry_count = 3; 1510 mfile *mf = NULL; 1511 int status; 1512 void *ptr; 1513 1514 mf = mopen(priv->sh->ibdev_name); 1515 if (!mf) { 1516 DRV_LOG(WARNING, "mopen failed\n"); 1517 rte_errno = ENOENT; 1518 return -rte_errno; 1519 } 1520 MLX5_SET(register_qshr, data, connected_host, 1); 1521 MLX5_SET(register_qshr, data, fast_response, lwm_triggered ? 1 : 0); 1522 MLX5_SET(register_qshr, data, local_port, 1); 1523 ptr = MLX5_ADDR_OF(register_qshr, data, global_config); 1524 MLX5_SET(ets_global_config_register, ptr, rate_limit_update, 1); 1525 MLX5_SET(ets_global_config_register, ptr, max_bw_units, 1526 rate ? 
ETS_GLOBAL_CONFIG_BW_UNIT_HUNDREDS_MBPS : 1527 ETS_GLOBAL_CONFIG_BW_UNIT_DISABLED); 1528 MLX5_SET(ets_global_config_register, ptr, max_bw_value, rate); 1529 do { 1530 rc = maccess_reg(mf, 1531 MLX5_QSHR_REGISTER_ID, 1532 MACCESS_REG_METHOD_SET, 1533 (u_int32_t *)&data[0], 1534 sizeof(data), 1535 sizeof(data), 1536 sizeof(data), 1537 &status); 1538 if ((rc != ME_ICMD_STATUS_IFC_BUSY && 1539 status != ME_REG_ACCESS_BAD_PARAM) || 1540 !(mf->flags & MDEVS_REM)) { 1541 break; 1542 } 1543 DRV_LOG(WARNING, "%s retry.", __func__); 1544 usleep(10000); 1545 } while (retry_count-- > 0); 1546 mclose(mf); 1547 rte_errno = (rc == ME_REG_ACCESS_DEV_BUSY) ? EBUSY : EIO; 1548 return rc ? -rte_errno : 0; 1549 #else 1550 (void)dev; 1551 (void)lwm_triggered; 1552 (void)rate; 1553 return -1; 1554 #endif 1555 } 1556 1557 int rte_pmd_mlx5_host_shaper_config(int port_id, uint8_t rate, 1558 uint32_t flags) 1559 { 1560 struct rte_eth_dev *dev = &rte_eth_devices[port_id]; 1561 struct mlx5_priv *priv = dev->data->dev_private; 1562 bool lwm_triggered = 1563 !!(flags & RTE_BIT32(RTE_PMD_MLX5_HOST_SHAPER_FLAG_AVAIL_THRESH_TRIGGERED)); 1564 1565 if (!lwm_triggered) { 1566 priv->sh->host_shaper_rate = rate; 1567 } else { 1568 switch (rate) { 1569 case 0: 1570 /* Rate 0 means disable lwm_triggered. */ 1571 priv->sh->lwm_triggered = 0; 1572 break; 1573 case 1: 1574 /* Rate 1 means enable lwm_triggered. */ 1575 priv->sh->lwm_triggered = 1; 1576 break; 1577 default: 1578 return -ENOTSUP; 1579 } 1580 } 1581 return mlxreg_host_shaper_config(dev, priv->sh->lwm_triggered, 1582 priv->sh->host_shaper_rate); 1583 } 1584
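
/*
 * Example: arming the available-descriptor threshold together with the
 * LWM-triggered host shaper exposed above.  A minimal sketch only: port_id,
 * the queue id and the 70% threshold are illustrative values and error
 * handling is reduced to a single check.
 *
 *   int ret;
 *
 *   ret = rte_pmd_mlx5_host_shaper_config(port_id, 1,
 *           RTE_BIT32(RTE_PMD_MLX5_HOST_SHAPER_FLAG_AVAIL_THRESH_TRIGGERED));
 *   if (ret == 0)
 *           ret = rte_eth_rx_avail_thresh_set(port_id, 0, 70);
 */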