1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2021 6WIND S.A. 3 * Copyright 2021 Mellanox Technologies, Ltd 4 */ 5 6 #include <stdint.h> 7 #include <string.h> 8 #include <stdlib.h> 9 10 #include <rte_mbuf.h> 11 #include <rte_mempool.h> 12 #include <rte_prefetch.h> 13 #include <rte_common.h> 14 #include <rte_branch_prediction.h> 15 #include <rte_ether.h> 16 #include <rte_cycles.h> 17 #include <rte_flow.h> 18 19 #include <mlx5_prm.h> 20 #include <mlx5_common.h> 21 #include <mlx5_common_mr.h> 22 #include <rte_pmd_mlx5.h> 23 24 #include "mlx5_autoconf.h" 25 #include "mlx5_defs.h" 26 #include "mlx5.h" 27 #include "mlx5_utils.h" 28 #include "mlx5_rxtx.h" 29 #include "mlx5_devx.h" 30 #include "mlx5_rx.h" 31 #ifdef HAVE_MLX5_MSTFLINT 32 #include <mstflint/mtcr.h> 33 #endif 34 35 36 static __rte_always_inline uint32_t 37 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 38 volatile struct mlx5_mini_cqe8 *mcqe); 39 40 static __rte_always_inline int 41 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 42 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe, 43 uint16_t *skip_cnt, bool mprq); 44 45 static __rte_always_inline uint32_t 46 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); 47 48 static __rte_always_inline void 49 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 50 volatile struct mlx5_cqe *cqe, 51 volatile struct mlx5_mini_cqe8 *mcqe); 52 53 static inline void 54 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp, 55 volatile struct mlx5_cqe *__rte_restrict cqe, 56 uint32_t phcsum, uint8_t l4_type); 57 58 static inline void 59 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd, 60 volatile struct mlx5_cqe *__rte_restrict cqe, 61 volatile struct mlx5_mini_cqe8 *mcqe, 62 struct mlx5_rxq_data *rxq, uint32_t len); 63 64 65 /** 66 * Internal function to compute the number of used descriptors in an RX queue. 67 * 68 * @param rxq 69 * The Rx queue. 70 * 71 * @return 72 * The number of used Rx descriptor. 73 */ 74 static uint32_t 75 rx_queue_count(struct mlx5_rxq_data *rxq) 76 { 77 struct rxq_zip *zip = &rxq->zip; 78 volatile struct mlx5_cqe *cqe; 79 const unsigned int cqe_n = (1 << rxq->cqe_n); 80 const unsigned int sges_n = (1 << rxq->sges_n); 81 const unsigned int elts_n = (1 << rxq->elts_n); 82 const unsigned int strd_n = RTE_BIT32(rxq->log_strd_num); 83 const unsigned int cqe_cnt = cqe_n - 1; 84 unsigned int cq_ci, used; 85 86 /* if we are processing a compressed cqe */ 87 if (zip->ai) { 88 used = zip->cqe_cnt - zip->ai; 89 cq_ci = zip->cq_ci; 90 } else { 91 used = 0; 92 cq_ci = rxq->cq_ci; 93 } 94 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 95 while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) { 96 int8_t op_own; 97 unsigned int n; 98 99 op_own = cqe->op_own; 100 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) 101 n = rte_be_to_cpu_32(cqe->byte_cnt); 102 else 103 n = 1; 104 cq_ci += n; 105 used += n; 106 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; 107 } 108 used = RTE_MIN(used * sges_n, elts_n * strd_n); 109 return used; 110 } 111 112 /** 113 * DPDK callback to check the status of a Rx descriptor. 114 * 115 * @param rx_queue 116 * The Rx queue. 117 * @param[in] offset 118 * The index of the descriptor in the ring. 119 * 120 * @return 121 * The status of the Rx descriptor. 
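 * RTE_ETH_RX_DESC_DONE when the descriptor has already been filled by the
 * device, RTE_ETH_RX_DESC_AVAIL when it is still available to the device,
 * or a negative errno value (rte_errno is set) when the offset is invalid.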
122 */
123 int
124 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
125 {
126 struct mlx5_rxq_data *rxq = rx_queue;
127
128 if (offset >= (1 << rxq->cqe_n)) {
129 rte_errno = EINVAL;
130 return -rte_errno;
131 }
132 if (offset < rx_queue_count(rxq))
133 return RTE_ETH_RX_DESC_DONE;
134 return RTE_ETH_RX_DESC_AVAIL;
135 }
136
137 /* Get the rxq LWM percentage according to the LWM number. */
138 static uint8_t
139 mlx5_rxq_lwm_to_percentage(struct mlx5_rxq_priv *rxq)
140 {
141 struct mlx5_rxq_data *rxq_data = &rxq->ctrl->rxq;
142 uint32_t wqe_cnt = 1 << (rxq_data->elts_n - rxq_data->sges_n);
143
144 return rxq->lwm * 100 / wqe_cnt;
145 }
146
147 /**
148 * DPDK callback to get the RX queue information.
149 *
150 * @param dev
151 * Pointer to the device structure.
152 *
153 * @param rx_queue_id
154 * Rx queue identifier.
155 *
156 * @param qinfo
157 * Pointer to the RX queue information structure.
158 *
159 * @return
160 * None.
161 */
162
163 void
164 mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
165 struct rte_eth_rxq_info *qinfo)
166 {
167 struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_ctrl_get(dev, rx_queue_id);
168 struct mlx5_rxq_data *rxq = mlx5_rxq_data_get(dev, rx_queue_id);
169 struct mlx5_rxq_priv *rxq_priv = mlx5_rxq_get(dev, rx_queue_id);
170
171 if (!rxq)
172 return;
173 qinfo->mp = mlx5_rxq_mprq_enabled(rxq) ?
174 rxq->mprq_mp : rxq->mp;
175 qinfo->conf.rx_thresh.pthresh = 0;
176 qinfo->conf.rx_thresh.hthresh = 0;
177 qinfo->conf.rx_thresh.wthresh = 0;
178 qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
179 qinfo->conf.rx_drop_en = 1;
180 if (rxq_ctrl == NULL || rxq_ctrl->obj == NULL)
181 qinfo->conf.rx_deferred_start = 0;
182 else
183 qinfo->conf.rx_deferred_start = 1;
184 qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
185 qinfo->scattered_rx = dev->data->scattered_rx;
186 qinfo->nb_desc = mlx5_rxq_mprq_enabled(rxq) ?
187 RTE_BIT32(rxq->elts_n) * RTE_BIT32(rxq->log_strd_num) :
188 RTE_BIT32(rxq->elts_n);
189 qinfo->avail_thresh = rxq_priv ?
190 mlx5_rxq_lwm_to_percentage(rxq_priv) : 0;
191 }
192
193 /**
194 * DPDK callback to get the RX packet burst mode information.
195 *
196 * @param dev
197 * Pointer to the device structure.
198 *
199 * @param rx_queue_id
200 * Rx queue identifier.
201 *
202 * @param mode
203 * Pointer to the burst mode information.
204 *
205 * @return
206 * 0 on success, -EINVAL on failure.
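 * On success, mode->info is filled with the name of the selected Rx burst
 * routine (Scalar, Multi-Packet RQ, or one of the vector variants).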
207 */ 208 int 209 mlx5_rx_burst_mode_get(struct rte_eth_dev *dev, 210 uint16_t rx_queue_id __rte_unused, 211 struct rte_eth_burst_mode *mode) 212 { 213 eth_rx_burst_t pkt_burst = dev->rx_pkt_burst; 214 struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, rx_queue_id); 215 216 if (!rxq) { 217 rte_errno = EINVAL; 218 return -rte_errno; 219 } 220 if (pkt_burst == mlx5_rx_burst) { 221 snprintf(mode->info, sizeof(mode->info), "%s", "Scalar"); 222 } else if (pkt_burst == mlx5_rx_burst_mprq) { 223 snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ"); 224 } else if (pkt_burst == mlx5_rx_burst_vec) { 225 #if defined RTE_ARCH_X86_64 226 snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE"); 227 #elif defined RTE_ARCH_ARM64 228 snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon"); 229 #elif defined RTE_ARCH_PPC_64 230 snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec"); 231 #else 232 return -EINVAL; 233 #endif 234 } else if (pkt_burst == mlx5_rx_burst_mprq_vec) { 235 #if defined RTE_ARCH_X86_64 236 snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector SSE"); 237 #elif defined RTE_ARCH_ARM64 238 snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon"); 239 #elif defined RTE_ARCH_PPC_64 240 snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec"); 241 #else 242 return -EINVAL; 243 #endif 244 } else { 245 return -EINVAL; 246 } 247 return 0; 248 } 249 250 /** 251 * DPDK callback to get the number of used descriptors in a RX queue. 252 * 253 * @param rx_queue 254 * The Rx queue pointer. 255 * 256 * @return 257 * The number of used rx descriptor. 258 * -EINVAL if the queue is invalid 259 */ 260 uint32_t 261 mlx5_rx_queue_count(void *rx_queue) 262 { 263 struct mlx5_rxq_data *rxq = rx_queue; 264 struct rte_eth_dev *dev; 265 266 if (!rxq) { 267 rte_errno = EINVAL; 268 return -rte_errno; 269 } 270 271 dev = &rte_eth_devices[rxq->port_id]; 272 273 if (dev->rx_pkt_burst == NULL || 274 dev->rx_pkt_burst == rte_eth_pkt_burst_dummy) { 275 rte_errno = ENOTSUP; 276 return -rte_errno; 277 } 278 279 return rx_queue_count(rxq); 280 } 281 282 #define CLB_VAL_IDX 0 283 #define CLB_MSK_IDX 1 284 static int 285 mlx5_monitor_callback(const uint64_t value, 286 const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]) 287 { 288 const uint64_t m = opaque[CLB_MSK_IDX]; 289 const uint64_t v = opaque[CLB_VAL_IDX]; 290 291 return (value & m) == v ? -1 : 0; 292 } 293 294 int mlx5_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) 295 { 296 struct mlx5_rxq_data *rxq = rx_queue; 297 const unsigned int cqe_num = 1 << rxq->cqe_n; 298 const unsigned int cqe_mask = cqe_num - 1; 299 const uint16_t idx = rxq->cq_ci & cqe_num; 300 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask]; 301 302 if (unlikely(rxq->cqes == NULL)) { 303 rte_errno = EINVAL; 304 return -rte_errno; 305 } 306 pmc->addr = &cqe->op_own; 307 pmc->opaque[CLB_VAL_IDX] = !!idx; 308 pmc->opaque[CLB_MSK_IDX] = MLX5_CQE_OWNER_MASK; 309 pmc->fn = mlx5_monitor_callback; 310 pmc->size = sizeof(uint8_t); 311 return 0; 312 } 313 314 /** 315 * Translate RX completion flags to packet type. 316 * 317 * @param[in] rxq 318 * Pointer to RX queue structure. 319 * @param[in] cqe 320 * Pointer to CQE. 321 * 322 * @note: fix mlx5_dev_supported_ptypes_get() if any change here. 323 * 324 * @return 325 * Packet type for struct rte_mbuf. 
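 * The value is looked up in mlx5_ptype_table[] with an index built from the
 * L3/L4 header types, IP fragment, tunnel and outer L3 type bits of the CQE
 * (or of the mini-CQE when the L3/L4 mini-CQE format is enabled).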
326 */ 327 static inline uint32_t 328 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 329 volatile struct mlx5_mini_cqe8 *mcqe) 330 { 331 uint8_t idx; 332 uint8_t ptype; 333 uint8_t pinfo = (cqe->pkt_info & 0x3) << 6; 334 335 /* Get l3/l4 header from mini-CQE in case L3/L4 format*/ 336 if (mcqe == NULL || 337 rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX) 338 ptype = (cqe->hdr_type_etc & 0xfc00) >> 10; 339 else 340 ptype = mcqe->hdr_type >> 2; 341 /* 342 * The index to the array should have: 343 * bit[1:0] = l3_hdr_type 344 * bit[4:2] = l4_hdr_type 345 * bit[5] = ip_frag 346 * bit[6] = tunneled 347 * bit[7] = outer_l3_type 348 */ 349 idx = pinfo | ptype; 350 return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); 351 } 352 353 /** 354 * Initialize Rx WQ and indexes. 355 * 356 * @param[in] rxq 357 * Pointer to RX queue structure. 358 */ 359 void 360 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) 361 { 362 const unsigned int wqe_n = 1 << rxq->elts_n; 363 unsigned int i; 364 365 for (i = 0; (i != wqe_n); ++i) { 366 volatile struct mlx5_wqe_data_seg *scat; 367 uintptr_t addr; 368 uint32_t byte_count; 369 uint32_t lkey; 370 371 if (mlx5_rxq_mprq_enabled(rxq)) { 372 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i]; 373 374 scat = &((volatile struct mlx5_wqe_mprq *) 375 rxq->wqes)[i].dseg; 376 addr = (uintptr_t)mlx5_mprq_buf_addr 377 (buf, RTE_BIT32(rxq->log_strd_num)); 378 byte_count = RTE_BIT32(rxq->log_strd_sz) * 379 RTE_BIT32(rxq->log_strd_num); 380 lkey = mlx5_rx_addr2mr(rxq, addr); 381 } else { 382 struct rte_mbuf *buf = (*rxq->elts)[i]; 383 384 scat = &((volatile struct mlx5_wqe_data_seg *) 385 rxq->wqes)[i]; 386 addr = rte_pktmbuf_mtod(buf, uintptr_t); 387 byte_count = DATA_LEN(buf); 388 lkey = mlx5_rx_mb2mr(rxq, buf); 389 } 390 /* scat->addr must be able to store a pointer. */ 391 MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t)); 392 *scat = (struct mlx5_wqe_data_seg){ 393 .addr = rte_cpu_to_be_64(addr), 394 .byte_count = rte_cpu_to_be_32(byte_count), 395 .lkey = lkey, 396 }; 397 } 398 rxq->consumed_strd = 0; 399 rxq->decompressed = 0; 400 rxq->rq_pi = 0; 401 rxq->zip = (struct rxq_zip){ 402 .ai = 0, 403 }; 404 rxq->elts_ci = mlx5_rxq_mprq_enabled(rxq) ? 405 (wqe_n >> rxq->sges_n) * RTE_BIT32(rxq->log_strd_num) : 0; 406 /* Update doorbell counter. */ 407 rxq->rq_ci = wqe_n >> rxq->sges_n; 408 rte_io_wmb(); 409 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 410 } 411 412 #define MLX5_ERROR_CQE_MASK 0x40000000 413 /* Must be negative. */ 414 #define MLX5_REGULAR_ERROR_CQE_RET (-5) 415 #define MLX5_CRITICAL_ERROR_CQE_RET (-4) 416 /* Must not be negative. */ 417 #define MLX5_RECOVERY_ERROR_RET 0 418 #define MLX5_RECOVERY_IGNORE_RET 1 419 #define MLX5_RECOVERY_COMPLETED_RET 2 420 421 /** 422 * Handle a Rx error. 423 * The function inserts the RQ state to reset when the first error CQE is 424 * shown, then drains the CQ by the caller function loop. When the CQ is empty, 425 * it moves the RQ state to ready and initializes the RQ. 426 * Next CQE identification and error counting are in the caller responsibility. 427 * 428 * @param[in] rxq 429 * Pointer to RX queue structure. 430 * @param[in] vec 431 * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ. 432 * 0 when called from non-vectorized Rx burst. 433 * @param[in] err_n 434 * Number of CQEs to check for an error. 
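 * @param[out] skip_cnt
 * Number of CQEs skipped due to recoverable errors, reported to the caller.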
435 * 436 * @return 437 * MLX5_RECOVERY_ERROR_RET in case of recovery error, 438 * MLX5_RECOVERY_IGNORE_RET in case of non-critical error syndrome, 439 * MLX5_RECOVERY_COMPLETED_RET in case of recovery is completed, 440 * otherwise the CQE status after ignored error syndrome or queue reset. 441 */ 442 int 443 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec, 444 uint16_t err_n, uint16_t *skip_cnt) 445 { 446 const uint16_t cqe_n = 1 << rxq->cqe_n; 447 const uint16_t cqe_mask = cqe_n - 1; 448 const uint16_t wqe_n = 1 << rxq->elts_n; 449 const uint16_t strd_n = RTE_BIT32(rxq->log_strd_num); 450 struct mlx5_rxq_ctrl *rxq_ctrl = 451 container_of(rxq, struct mlx5_rxq_ctrl, rxq); 452 union { 453 volatile struct mlx5_cqe *cqe; 454 volatile struct mlx5_err_cqe *err_cqe; 455 } u = { 456 .cqe = &(*rxq->cqes)[(rxq->cq_ci - vec) & cqe_mask], 457 }; 458 struct mlx5_mp_arg_queue_state_modify sm; 459 bool critical_syndrome = false; 460 int ret, i; 461 462 switch (rxq->err_state) { 463 case MLX5_RXQ_ERR_STATE_IGNORE: 464 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci - vec); 465 if (ret != MLX5_CQE_STATUS_ERR) { 466 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 467 return ret; 468 } 469 /* Fall-through */ 470 case MLX5_RXQ_ERR_STATE_NO_ERROR: 471 for (i = 0; i < (int)err_n; i++) { 472 u.cqe = &(*rxq->cqes)[(rxq->cq_ci - vec - i) & cqe_mask]; 473 if (MLX5_CQE_OPCODE(u.cqe->op_own) == MLX5_CQE_RESP_ERR) { 474 if (u.err_cqe->syndrome == MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR || 475 u.err_cqe->syndrome == MLX5_CQE_SYNDROME_LOCAL_PROT_ERR || 476 u.err_cqe->syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR) 477 critical_syndrome = true; 478 break; 479 } 480 } 481 if (!critical_syndrome) { 482 if (rxq->err_state == MLX5_RXQ_ERR_STATE_NO_ERROR) { 483 *skip_cnt = 0; 484 if (i == err_n) 485 rxq->err_state = MLX5_RXQ_ERR_STATE_IGNORE; 486 } 487 return MLX5_RECOVERY_IGNORE_RET; 488 } 489 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET; 490 /* Fall-through */ 491 case MLX5_RXQ_ERR_STATE_NEED_RESET: 492 sm.is_wq = 1; 493 sm.queue_id = rxq->idx; 494 sm.state = IBV_WQS_RESET; 495 if (mlx5_queue_state_modify(RXQ_DEV(rxq_ctrl), &sm)) 496 return MLX5_RECOVERY_ERROR_RET; 497 if (rxq_ctrl->dump_file_n < 498 RXQ_PORT(rxq_ctrl)->config.max_dump_files_num) { 499 MKSTR(err_str, "Unexpected CQE error syndrome " 500 "0x%02x CQN = %u RQN = %u wqe_counter = %u" 501 " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome, 502 rxq->cqn, rxq_ctrl->wqn, 503 rte_be_to_cpu_16(u.err_cqe->wqe_counter), 504 rxq->rq_ci << rxq->sges_n, rxq->cq_ci); 505 MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u", 506 rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc()); 507 mlx5_dump_debug_information(name, NULL, err_str, 0); 508 mlx5_dump_debug_information(name, "MLX5 Error CQ:", 509 (const void *)((uintptr_t) 510 rxq->cqes), 511 sizeof(*u.cqe) * cqe_n); 512 mlx5_dump_debug_information(name, "MLX5 Error RQ:", 513 (const void *)((uintptr_t) 514 rxq->wqes), 515 16 * wqe_n); 516 rxq_ctrl->dump_file_n++; 517 } 518 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY; 519 /* Fall-through */ 520 case MLX5_RXQ_ERR_STATE_NEED_READY: 521 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); 522 if (ret == MLX5_CQE_STATUS_HW_OWN) { 523 rte_io_wmb(); 524 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 525 rte_io_wmb(); 526 /* 527 * The RQ consumer index must be zeroed while moving 528 * from RESET state to RDY state. 
529 */ 530 *rxq->rq_db = rte_cpu_to_be_32(0); 531 rte_io_wmb(); 532 sm.is_wq = 1; 533 sm.queue_id = rxq->idx; 534 sm.state = IBV_WQS_RDY; 535 if (mlx5_queue_state_modify(RXQ_DEV(rxq_ctrl), &sm)) 536 return MLX5_RECOVERY_ERROR_RET; 537 if (vec) { 538 const uint32_t elts_n = 539 mlx5_rxq_mprq_enabled(rxq) ? 540 wqe_n * strd_n : wqe_n; 541 const uint32_t e_mask = elts_n - 1; 542 uint32_t elts_ci = 543 mlx5_rxq_mprq_enabled(rxq) ? 544 rxq->elts_ci : rxq->rq_ci; 545 uint32_t elt_idx; 546 struct rte_mbuf **elt; 547 unsigned int n = elts_n - (elts_ci - 548 rxq->rq_pi); 549 550 for (i = 0; i < (int)n; ++i) { 551 elt_idx = (elts_ci + i) & e_mask; 552 elt = &(*rxq->elts)[elt_idx]; 553 *elt = rte_mbuf_raw_alloc(rxq->mp); 554 if (!*elt) { 555 for (i--; i >= 0; --i) { 556 elt_idx = (elts_ci + 557 i) & elts_n; 558 elt = &(*rxq->elts) 559 [elt_idx]; 560 rte_pktmbuf_free_seg 561 (*elt); 562 } 563 return MLX5_RECOVERY_ERROR_RET; 564 } 565 } 566 for (i = 0; i < (int)elts_n; ++i) { 567 elt = &(*rxq->elts)[i]; 568 DATA_LEN(*elt) = 569 (uint16_t)((*elt)->buf_len - 570 rte_pktmbuf_headroom(*elt)); 571 } 572 /* Padding with a fake mbuf for vec Rx. */ 573 for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) 574 (*rxq->elts)[elts_n + i] = 575 &rxq->fake_mbuf; 576 } 577 mlx5_rxq_initialize(rxq); 578 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 579 return MLX5_RECOVERY_COMPLETED_RET; 580 } 581 return ret; 582 default: 583 return MLX5_RECOVERY_ERROR_RET; 584 } 585 } 586 587 /** 588 * Get size of the next packet for a given CQE. For compressed CQEs, the 589 * consumer index is updated only once all packets of the current one have 590 * been processed. 591 * 592 * @param rxq 593 * Pointer to RX queue. 594 * @param cqe 595 * CQE to process. 596 * @param[out] mcqe 597 * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not 598 * written. 599 * @param[out] skip_cnt 600 * Number of packets skipped due to recoverable errors. 601 * @param mprq 602 * Indication if it is called from MPRQ. 603 * @return 604 * 0 in case of empty CQE, MLX5_REGULAR_ERROR_CQE_RET in case of error CQE, 605 * MLX5_CRITICAL_ERROR_CQE_RET in case of error CQE lead to Rx queue reset, 606 * otherwise the packet size in regular RxQ, 607 * and striding byte count format in mprq case. 608 */ 609 static inline int 610 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, 611 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe, 612 uint16_t *skip_cnt, bool mprq) 613 { 614 struct rxq_zip *zip = &rxq->zip; 615 uint16_t cqe_n = cqe_cnt + 1; 616 int len = 0, ret = 0; 617 uint16_t idx, end; 618 619 do { 620 len = 0; 621 /* Process compressed data in the CQE and mini arrays. */ 622 if (zip->ai) { 623 volatile struct mlx5_mini_cqe8 (*mc)[8] = 624 (volatile struct mlx5_mini_cqe8 (*)[8]) 625 (uintptr_t)(&(*rxq->cqes)[zip->ca & 626 cqe_cnt].pkt_info); 627 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt & 628 rxq->byte_mask); 629 *mcqe = &(*mc)[zip->ai & 7]; 630 if ((++zip->ai & 7) == 0) { 631 /* Invalidate consumed CQEs */ 632 idx = zip->ca; 633 end = zip->na; 634 while (idx != end) { 635 (*rxq->cqes)[idx & cqe_cnt].op_own = 636 MLX5_CQE_INVALIDATE; 637 ++idx; 638 } 639 /* 640 * Increment consumer index to skip the number 641 * of CQEs consumed. Hardware leaves holes in 642 * the CQ ring for software use. 
643 */ 644 zip->ca = zip->na; 645 zip->na += 8; 646 } 647 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { 648 /* Invalidate the rest */ 649 idx = zip->ca; 650 end = zip->cq_ci; 651 652 while (idx != end) { 653 (*rxq->cqes)[idx & cqe_cnt].op_own = 654 MLX5_CQE_INVALIDATE; 655 ++idx; 656 } 657 rxq->cq_ci = zip->cq_ci; 658 zip->ai = 0; 659 } 660 /* 661 * No compressed data, get next CQE and verify if it is 662 * compressed. 663 */ 664 } else { 665 int8_t op_own; 666 uint32_t cq_ci; 667 668 ret = check_cqe(cqe, cqe_n, rxq->cq_ci); 669 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { 670 if (unlikely(ret == MLX5_CQE_STATUS_ERR || 671 rxq->err_state)) { 672 ret = mlx5_rx_err_handle(rxq, 0, 1, skip_cnt); 673 if (ret == MLX5_CQE_STATUS_HW_OWN) 674 return MLX5_ERROR_CQE_MASK; 675 if (ret == MLX5_RECOVERY_ERROR_RET || 676 ret == MLX5_RECOVERY_COMPLETED_RET) 677 return MLX5_CRITICAL_ERROR_CQE_RET; 678 } else { 679 return 0; 680 } 681 } 682 /* 683 * Introduce the local variable to have queue cq_ci 684 * index in queue structure always consistent with 685 * actual CQE boundary (not pointing to the middle 686 * of compressed CQE session). 687 */ 688 cq_ci = rxq->cq_ci + 1; 689 op_own = cqe->op_own; 690 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { 691 volatile struct mlx5_mini_cqe8 (*mc)[8] = 692 (volatile struct mlx5_mini_cqe8 (*)[8]) 693 (uintptr_t)(&(*rxq->cqes) 694 [cq_ci & cqe_cnt].pkt_info); 695 696 /* Fix endianness. */ 697 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); 698 /* 699 * Current mini array position is the one 700 * returned by check_cqe64(). 701 * 702 * If completion comprises several mini arrays, 703 * as a special case the second one is located 704 * 7 CQEs after the initial CQE instead of 8 705 * for subsequent ones. 706 */ 707 zip->ca = cq_ci; 708 zip->na = zip->ca + 7; 709 /* Compute the next non compressed CQE. */ 710 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; 711 /* Get packet size to return. */ 712 len = rte_be_to_cpu_32((*mc)[0].byte_cnt & 713 rxq->byte_mask); 714 *mcqe = &(*mc)[0]; 715 zip->ai = 1; 716 /* Prefetch all to be invalidated */ 717 idx = zip->ca; 718 end = zip->cq_ci; 719 while (idx != end) { 720 rte_prefetch0(&(*rxq->cqes)[(idx) & 721 cqe_cnt]); 722 ++idx; 723 } 724 } else { 725 rxq->cq_ci = cq_ci; 726 len = rte_be_to_cpu_32(cqe->byte_cnt); 727 } 728 } 729 if (unlikely(rxq->err_state)) { 730 if (rxq->err_state == MLX5_RXQ_ERR_STATE_IGNORE && 731 ret == MLX5_CQE_STATUS_SW_OWN) { 732 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; 733 return len & MLX5_ERROR_CQE_MASK; 734 } 735 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 736 ++rxq->stats.idropped; 737 (*skip_cnt) += mprq ? (len & MLX5_MPRQ_STRIDE_NUM_MASK) >> 738 MLX5_MPRQ_STRIDE_NUM_SHIFT : 1; 739 } else { 740 return len; 741 } 742 } while (1); 743 } 744 745 /** 746 * Translate RX completion flags to offload flags. 747 * 748 * @param[in] cqe 749 * Pointer to CQE. 750 * 751 * @return 752 * Offload flags (ol_flags) for struct rte_mbuf. 753 */ 754 static inline uint32_t 755 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) 756 { 757 uint32_t ol_flags = 0; 758 uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); 759 760 ol_flags = 761 TRANSPOSE(flags, 762 MLX5_CQE_RX_L3_HDR_VALID, 763 RTE_MBUF_F_RX_IP_CKSUM_GOOD) | 764 TRANSPOSE(flags, 765 MLX5_CQE_RX_L4_HDR_VALID, 766 RTE_MBUF_F_RX_L4_CKSUM_GOOD); 767 return ol_flags; 768 } 769 770 /** 771 * Fill in mbuf fields from RX completion flags. 772 * Note that pkt->ol_flags should be initialized outside of this function. 773 * 774 * @param rxq 775 * Pointer to RX queue. 
776 * @param pkt 777 * mbuf to fill. 778 * @param cqe 779 * CQE to process. 780 * @param rss_hash_res 781 * Packet RSS Hash result. 782 */ 783 static inline void 784 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, 785 volatile struct mlx5_cqe *cqe, 786 volatile struct mlx5_mini_cqe8 *mcqe) 787 { 788 /* Update packet information. */ 789 pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe, mcqe); 790 pkt->port = unlikely(rxq->shared) ? cqe->user_index_low : rxq->port_id; 791 792 if (rxq->rss_hash) { 793 uint32_t rss_hash_res = 0; 794 795 /* If compressed, take hash result from mini-CQE. */ 796 if (mcqe == NULL || 797 rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH) 798 rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); 799 else 800 rss_hash_res = rte_be_to_cpu_32(mcqe->rx_hash_result); 801 if (rss_hash_res) { 802 pkt->hash.rss = rss_hash_res; 803 pkt->ol_flags |= RTE_MBUF_F_RX_RSS_HASH; 804 } 805 } 806 if (rxq->mark) { 807 uint32_t mark = 0; 808 809 /* If compressed, take flow tag from mini-CQE. */ 810 if (mcqe == NULL || 811 rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) 812 mark = cqe->sop_drop_qpn; 813 else 814 mark = ((mcqe->byte_cnt_flow & 0xff) << 8) | 815 (mcqe->flow_tag_high << 16); 816 if (MLX5_FLOW_MARK_IS_VALID(mark)) { 817 pkt->ol_flags |= RTE_MBUF_F_RX_FDIR; 818 if (mark != RTE_BE32(MLX5_FLOW_MARK_DEFAULT)) { 819 pkt->ol_flags |= RTE_MBUF_F_RX_FDIR_ID; 820 pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); 821 } 822 } 823 } 824 if (rxq->dynf_meta) { 825 uint32_t meta = rte_be_to_cpu_32(cqe->flow_table_metadata) & 826 rxq->flow_meta_port_mask; 827 828 if (meta) { 829 pkt->ol_flags |= rxq->flow_meta_mask; 830 *RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset, 831 uint32_t *) = meta; 832 } 833 } 834 if (rxq->csum) 835 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); 836 if (rxq->vlan_strip) { 837 bool vlan_strip; 838 839 if (mcqe == NULL || 840 rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX) 841 vlan_strip = cqe->hdr_type_etc & 842 RTE_BE16(MLX5_CQE_VLAN_STRIPPED); 843 else 844 vlan_strip = mcqe->hdr_type & 845 RTE_BE16(MLX5_CQE_VLAN_STRIPPED); 846 if (vlan_strip) { 847 pkt->ol_flags |= RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED; 848 pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); 849 } 850 } 851 if (rxq->hw_timestamp) { 852 uint64_t ts = rte_be_to_cpu_64(cqe->timestamp); 853 854 if (rxq->rt_timestamp) 855 ts = mlx5_txpp_convert_rx_ts(rxq->sh, ts); 856 mlx5_timestamp_set(pkt, rxq->timestamp_offset, ts); 857 pkt->ol_flags |= rxq->timestamp_rx_flag; 858 } 859 } 860 861 /** 862 * DPDK callback for RX. 863 * 864 * @param dpdk_rxq 865 * Generic pointer to RX queue structure. 866 * @param[out] pkts 867 * Array to store received packets. 868 * @param pkts_n 869 * Maximum number of packets in array. 870 * 871 * @return 872 * Number of packets successfully received (<= pkts_n). 873 */ 874 uint16_t 875 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 876 { 877 struct mlx5_rxq_data *rxq = dpdk_rxq; 878 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; 879 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; 880 const unsigned int sges_n = rxq->sges_n; 881 struct rte_mbuf *pkt = NULL; 882 struct rte_mbuf *seg = NULL; 883 volatile struct mlx5_cqe *cqe = 884 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 885 unsigned int i = 0; 886 unsigned int rq_ci = rxq->rq_ci << sges_n; 887 int len = 0; /* keep its value across iterations. 
*/ 888 889 while (pkts_n) { 890 uint16_t skip_cnt; 891 unsigned int idx = rq_ci & wqe_cnt; 892 volatile struct mlx5_wqe_data_seg *wqe = 893 &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; 894 struct rte_mbuf *rep = (*rxq->elts)[idx]; 895 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 896 897 if (pkt) 898 NEXT(seg) = rep; 899 seg = rep; 900 rte_prefetch0(seg); 901 rte_prefetch0(cqe); 902 rte_prefetch0(wqe); 903 /* Allocate the buf from the same pool. */ 904 rep = rte_mbuf_raw_alloc(seg->pool); 905 if (unlikely(rep == NULL)) { 906 ++rxq->stats.rx_nombuf; 907 if (!pkt) { 908 /* 909 * no buffers before we even started, 910 * bail out silently. 911 */ 912 break; 913 } 914 while (pkt != seg) { 915 MLX5_ASSERT(pkt != (*rxq->elts)[idx]); 916 rep = NEXT(pkt); 917 NEXT(pkt) = NULL; 918 NB_SEGS(pkt) = 1; 919 rte_mbuf_raw_free(pkt); 920 pkt = rep; 921 } 922 rq_ci >>= sges_n; 923 ++rq_ci; 924 rq_ci <<= sges_n; 925 break; 926 } 927 if (!pkt) { 928 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 929 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe, &skip_cnt, false); 930 if (unlikely(len & MLX5_ERROR_CQE_MASK)) { 931 if (len == MLX5_CRITICAL_ERROR_CQE_RET) { 932 rte_mbuf_raw_free(rep); 933 rq_ci = rxq->rq_ci << sges_n; 934 break; 935 } 936 rq_ci >>= sges_n; 937 rq_ci += skip_cnt; 938 rq_ci <<= sges_n; 939 idx = rq_ci & wqe_cnt; 940 wqe = &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; 941 seg = (*rxq->elts)[idx]; 942 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; 943 len = len & ~MLX5_ERROR_CQE_MASK; 944 } 945 if (len == 0) { 946 rte_mbuf_raw_free(rep); 947 break; 948 } 949 pkt = seg; 950 MLX5_ASSERT(len >= (rxq->crc_present << 2)); 951 pkt->ol_flags &= RTE_MBUF_F_EXTERNAL; 952 rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe); 953 if (rxq->crc_present) 954 len -= RTE_ETHER_CRC_LEN; 955 PKT_LEN(pkt) = len; 956 if (cqe->lro_num_seg > 1) { 957 mlx5_lro_update_hdr 958 (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, 959 mcqe, rxq, len); 960 pkt->ol_flags |= RTE_MBUF_F_RX_LRO; 961 pkt->tso_segsz = len / cqe->lro_num_seg; 962 } 963 } 964 DATA_LEN(rep) = DATA_LEN(seg); 965 PKT_LEN(rep) = PKT_LEN(seg); 966 SET_DATA_OFF(rep, DATA_OFF(seg)); 967 PORT(rep) = PORT(seg); 968 (*rxq->elts)[idx] = rep; 969 /* 970 * Fill NIC descriptor with the new buffer. The lkey and size 971 * of the buffers are already known, only the buffer address 972 * changes. 973 */ 974 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); 975 /* If there's only one MR, no need to replace LKey in WQE. */ 976 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) 977 wqe->lkey = mlx5_rx_mb2mr(rxq, rep); 978 if (len > DATA_LEN(seg)) { 979 len -= DATA_LEN(seg); 980 ++NB_SEGS(pkt); 981 ++rq_ci; 982 continue; 983 } 984 DATA_LEN(seg) = len; 985 #ifdef MLX5_PMD_SOFT_COUNTERS 986 /* Increment bytes counter. */ 987 rxq->stats.ibytes += PKT_LEN(pkt); 988 #endif 989 /* Return packet. */ 990 *(pkts++) = pkt; 991 pkt = NULL; 992 --pkts_n; 993 ++i; 994 /* Align consumer index to the next stride. */ 995 rq_ci >>= sges_n; 996 ++rq_ci; 997 rq_ci <<= sges_n; 998 } 999 if (unlikely(i == 0 && ((rq_ci >> sges_n) == rxq->rq_ci))) 1000 return 0; 1001 /* Update the consumer index. */ 1002 rxq->rq_ci = rq_ci >> sges_n; 1003 rte_io_wmb(); 1004 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1005 rte_io_wmb(); 1006 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1007 #ifdef MLX5_PMD_SOFT_COUNTERS 1008 /* Increment packets counter. */ 1009 rxq->stats.ipackets += i; 1010 #endif 1011 return i; 1012 } 1013 1014 /** 1015 * Update LRO packet TCP header. 
1016 * The HW LRO feature doesn't update the TCP header after coalescing the
1017 * TCP segments but supplies information in CQE to fill it by SW.
1018 *
1019 * @param tcp
1020 * Pointer to the TCP header.
1021 * @param cqe
1022 * Pointer to the completion entry.
1023 * @param phcsum
1024 * The L3 pseudo-header checksum.
 * @param l4_type
 * The L4 header type taken from the CQE.
1025 */
1026 static inline void
1027 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
1028 volatile struct mlx5_cqe *__rte_restrict cqe,
1029 uint32_t phcsum, uint8_t l4_type)
1030 {
1031 /*
1032 * The HW calculates only the TCP payload checksum, need to complete
1033 * the TCP header checksum and the L3 pseudo-header checksum.
1034 */
1035 uint32_t csum = phcsum + cqe->csum;
1036
1037 if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK ||
1038 l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACK) {
1039 tcp->tcp_flags |= RTE_TCP_ACK_FLAG;
1040 tcp->recv_ack = cqe->lro_ack_seq_num;
1041 tcp->rx_win = cqe->lro_tcp_win;
1042 }
1043 if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
1044 tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
1045 tcp->cksum = 0;
1046 csum += rte_raw_cksum(tcp, (tcp->data_off >> 4) * 4);
1047 csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
1048 csum = (~csum) & 0xffff;
1049 if (csum == 0)
1050 csum = 0xffff;
1051 tcp->cksum = csum;
1052 }
1053
1054 /**
1055 * Update LRO packet headers.
1056 * The HW LRO feature doesn't update the L3/TCP headers after coalescing the
1057 * TCP segments but supplies information in CQE to fill them by SW.
1058 *
1059 * @param padd
1060 * The packet address.
1061 * @param cqe
1062 * Pointer to the completion entry.
 * @param mcqe
 * Pointer to the mini-CQE, NULL if the CQE is not compressed.
 * @param rxq
 * Pointer to the Rx queue.
1063 * @param len
1064 * The packet length.
1065 */
1066 static inline void
1067 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
1068 volatile struct mlx5_cqe *__rte_restrict cqe,
1069 volatile struct mlx5_mini_cqe8 *mcqe,
1070 struct mlx5_rxq_data *rxq, uint32_t len)
1071 {
1072 union {
1073 struct rte_ether_hdr *eth;
1074 struct rte_vlan_hdr *vlan;
1075 struct rte_ipv4_hdr *ipv4;
1076 struct rte_ipv6_hdr *ipv6;
1077 struct rte_tcp_hdr *tcp;
1078 uint8_t *hdr;
1079 } h = {
1080 .hdr = padd,
1081 };
1082 uint16_t proto = h.eth->ether_type;
1083 uint32_t phcsum;
1084 uint8_t l4_type;
1085
1086 h.eth++;
1087 while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
1088 proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) {
1089 proto = h.vlan->eth_proto;
1090 h.vlan++;
1091 }
1092 if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) {
1093 h.ipv4->time_to_live = cqe->lro_min_ttl;
1094 h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd));
1095 h.ipv4->hdr_checksum = 0;
1096 h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4);
1097 phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0);
1098 h.ipv4++;
1099 } else {
1100 h.ipv6->hop_limits = cqe->lro_min_ttl;
1101 h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) -
1102 sizeof(*h.ipv6));
1103 phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
1104 h.ipv6++;
1105 }
1106 if (mcqe == NULL ||
1107 rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
1108 l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
1109 MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
1110 else
1111 l4_type = (rte_be_to_cpu_16(mcqe->hdr_type) &
1112 MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
1113 mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum, l4_type);
1114 }
1115
1116 void
1117 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
1118 {
1119 mlx5_mprq_buf_free_cb(NULL, buf);
1120 }
1121
1122 /**
1123 * DPDK callback for RX with Multi-Packet RQ support.
1124 *
1125 * @param dpdk_rxq
1126 * Generic pointer to RX queue structure.
1127 * @param[out] pkts 1128 * Array to store received packets. 1129 * @param pkts_n 1130 * Maximum number of packets in array. 1131 * 1132 * @return 1133 * Number of packets successfully received (<= pkts_n). 1134 */ 1135 uint16_t 1136 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 1137 { 1138 struct mlx5_rxq_data *rxq = dpdk_rxq; 1139 const uint32_t strd_n = RTE_BIT32(rxq->log_strd_num); 1140 const uint32_t strd_sz = RTE_BIT32(rxq->log_strd_sz); 1141 const uint32_t cq_mask = (1 << rxq->cqe_n) - 1; 1142 const uint32_t wq_mask = (1 << rxq->elts_n) - 1; 1143 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1144 unsigned int i = 0; 1145 uint32_t rq_ci = rxq->rq_ci; 1146 uint16_t consumed_strd = rxq->consumed_strd; 1147 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1148 1149 while (i < pkts_n) { 1150 struct rte_mbuf *pkt; 1151 int ret; 1152 uint32_t len; 1153 uint16_t strd_cnt; 1154 uint16_t strd_idx; 1155 uint32_t byte_cnt; 1156 uint16_t skip_cnt; 1157 volatile struct mlx5_mini_cqe8 *mcqe = NULL; 1158 enum mlx5_rqx_code rxq_code; 1159 1160 if (consumed_strd == strd_n) { 1161 /* Replace WQE if the buffer is still in use. */ 1162 mprq_buf_replace(rxq, rq_ci & wq_mask); 1163 /* Advance to the next WQE. */ 1164 consumed_strd = 0; 1165 ++rq_ci; 1166 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1167 } 1168 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1169 ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe, &skip_cnt, true); 1170 if (unlikely(ret & MLX5_ERROR_CQE_MASK)) { 1171 if (ret == MLX5_CRITICAL_ERROR_CQE_RET) { 1172 rq_ci = rxq->rq_ci; 1173 consumed_strd = rxq->consumed_strd; 1174 break; 1175 } 1176 consumed_strd += skip_cnt; 1177 while (consumed_strd >= strd_n) { 1178 /* Replace WQE if the buffer is still in use. */ 1179 mprq_buf_replace(rxq, rq_ci & wq_mask); 1180 /* Advance to the next WQE. */ 1181 consumed_strd -= strd_n; 1182 ++rq_ci; 1183 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; 1184 } 1185 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; 1186 } 1187 if (ret == 0) 1188 break; 1189 byte_cnt = ret; 1190 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; 1191 MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); 1192 if (rxq->crc_present) 1193 len -= RTE_ETHER_CRC_LEN; 1194 if (mcqe && 1195 rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) 1196 strd_cnt = (len / strd_sz) + !!(len % strd_sz); 1197 else 1198 strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> 1199 MLX5_MPRQ_STRIDE_NUM_SHIFT; 1200 MLX5_ASSERT(strd_cnt); 1201 consumed_strd += strd_cnt; 1202 if (byte_cnt & MLX5_MPRQ_FILLER_MASK) 1203 continue; 1204 strd_idx = rte_be_to_cpu_16(mcqe == NULL ? 
1205 cqe->wqe_counter : 1206 mcqe->stride_idx); 1207 MLX5_ASSERT(strd_idx < strd_n); 1208 MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & 1209 wq_mask)); 1210 pkt = rte_pktmbuf_alloc(rxq->mp); 1211 if (unlikely(pkt == NULL)) { 1212 ++rxq->stats.rx_nombuf; 1213 break; 1214 } 1215 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; 1216 MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); 1217 if (rxq->crc_present) 1218 len -= RTE_ETHER_CRC_LEN; 1219 rxq_code = mprq_buf_to_pkt(rxq, pkt, len, buf, 1220 strd_idx, strd_cnt); 1221 if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) { 1222 rte_pktmbuf_free_seg(pkt); 1223 if (rxq_code == MLX5_RXQ_CODE_DROPPED) { 1224 ++rxq->stats.idropped; 1225 continue; 1226 } 1227 if (rxq_code == MLX5_RXQ_CODE_NOMBUF) { 1228 ++rxq->stats.rx_nombuf; 1229 break; 1230 } 1231 } 1232 rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe); 1233 if (cqe->lro_num_seg > 1) { 1234 mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *), 1235 cqe, mcqe, rxq, len); 1236 pkt->ol_flags |= RTE_MBUF_F_RX_LRO; 1237 pkt->tso_segsz = len / cqe->lro_num_seg; 1238 } 1239 PKT_LEN(pkt) = len; 1240 PORT(pkt) = rxq->port_id; 1241 #ifdef MLX5_PMD_SOFT_COUNTERS 1242 /* Increment bytes counter. */ 1243 rxq->stats.ibytes += PKT_LEN(pkt); 1244 #endif 1245 /* Return packet. */ 1246 *(pkts++) = pkt; 1247 ++i; 1248 } 1249 /* Update the consumer indexes. */ 1250 rxq->consumed_strd = consumed_strd; 1251 rte_io_wmb(); 1252 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); 1253 if (rq_ci != rxq->rq_ci) { 1254 rxq->rq_ci = rq_ci; 1255 rte_io_wmb(); 1256 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); 1257 } 1258 #ifdef MLX5_PMD_SOFT_COUNTERS 1259 /* Increment packets counter. */ 1260 rxq->stats.ipackets += i; 1261 #endif 1262 return i; 1263 } 1264 1265 /* 1266 * Vectorized Rx routines are not compiled in when required vector instructions 1267 * are not supported on a target architecture. 1268 * The following null stubs are needed for linkage when those are not included 1269 * outside of this file (e.g. mlx5_rxtx_vec_sse.c for x86). 1270 */ 1271 1272 __rte_weak uint16_t 1273 mlx5_rx_burst_vec(void *dpdk_rxq __rte_unused, 1274 struct rte_mbuf **pkts __rte_unused, 1275 uint16_t pkts_n __rte_unused) 1276 { 1277 return 0; 1278 } 1279 1280 __rte_weak uint16_t 1281 mlx5_rx_burst_mprq_vec(void *dpdk_rxq __rte_unused, 1282 struct rte_mbuf **pkts __rte_unused, 1283 uint16_t pkts_n __rte_unused) 1284 { 1285 return 0; 1286 } 1287 1288 __rte_weak int 1289 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) 1290 { 1291 return -ENOTSUP; 1292 } 1293 1294 __rte_weak int 1295 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) 1296 { 1297 return -ENOTSUP; 1298 } 1299 1300 int 1301 mlx5_rx_queue_lwm_query(struct rte_eth_dev *dev, 1302 uint16_t *queue_id, uint8_t *lwm) 1303 { 1304 struct mlx5_priv *priv = dev->data->dev_private; 1305 unsigned int rxq_id, found = 0, n; 1306 struct mlx5_rxq_priv *rxq; 1307 1308 if (!queue_id) 1309 return -EINVAL; 1310 /* Query all the Rx queues of the port in a circular way. 
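 * The scan starts from *queue_id and wraps around until a queue with a
 * pending LWM event is found, if any.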
*/
1311 for (rxq_id = *queue_id, n = 0; n < priv->rxqs_n; n++) {
1312 rxq = mlx5_rxq_get(dev, rxq_id);
1313 if (rxq && rxq->lwm_event_pending) {
1314 pthread_mutex_lock(&priv->sh->lwm_config_lock);
1315 rxq->lwm_event_pending = 0;
1316 pthread_mutex_unlock(&priv->sh->lwm_config_lock);
1317 *queue_id = rxq_id;
1318 found = 1;
1319 if (lwm)
1320 *lwm = mlx5_rxq_lwm_to_percentage(rxq);
1321 break;
1322 }
1323 rxq_id = (rxq_id + 1) % priv->rxqs_n;
1324 }
1325 return found;
1326 }
1327
1328 /**
1329 * RTE interrupt handler for the LWM event.
1330 * It first checks if an event has arrived and, if so, processes the callback
1331 * for RTE_ETH_EVENT_RX_AVAIL_THRESH.
1332 *
1333 * @param args
1334 * Generic pointer to mlx5_priv.
1335 */
1336 void
1337 mlx5_dev_interrupt_handler_lwm(void *args)
1338 {
1339 struct mlx5_priv *priv = args;
1340 struct mlx5_rxq_priv *rxq;
1341 struct rte_eth_dev *dev;
1342 int ret, rxq_idx = 0, port_id = 0;
1343
1344 ret = priv->obj_ops.rxq_event_get_lwm(priv, &rxq_idx, &port_id);
1345 if (unlikely(ret < 0)) {
1346 DRV_LOG(WARNING, "Cannot get LWM event context.");
1347 return;
1348 }
1349 DRV_LOG(INFO, "%s get LWM event, port_id:%d rxq_id:%d.", __func__,
1350 port_id, rxq_idx);
1351 dev = &rte_eth_devices[port_id];
1352 rxq = mlx5_rxq_get(dev, rxq_idx);
1353 if (rxq) {
1354 pthread_mutex_lock(&priv->sh->lwm_config_lock);
1355 rxq->lwm_event_pending = 1;
1356 pthread_mutex_unlock(&priv->sh->lwm_config_lock);
1357 }
1358 rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_RX_AVAIL_THRESH, NULL);
1359 }
1360
1361 /**
1362 * DPDK callback to arm an Rx queue LWM (limit watermark) event.
1363 * When the Rx queue fullness reaches the LWM limit, the driver catches
1364 * an HW event and invokes the user event callback.
1365 * After the last event handling, the user needs to call this API again
1366 * to arm an additional event.
1367 *
1368 * @param dev
1369 * Pointer to the device structure.
1370 * @param[in] rx_queue_id
1371 * Rx queue identifier.
1372 * @param[in] lwm
1373 * The LWM value, defined as a percentage of the Rx queue size.
1374 * [1-99] to set a new LWM (update the old value).
1375 * 0 to unarm the event.
1376 *
1377 * @return
1378 * 0 : operation success.
1379 * Otherwise:
1380 * - ENOMEM - not enough memory to create LWM event channel.
1381 * - EINVAL - the input Rxq is not created by devx.
1382 * - E2BIG - lwm is bigger than 99.
1383 */
1384 int
1385 mlx5_rx_queue_lwm_set(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1386 uint8_t lwm)
1387 {
1388 struct mlx5_priv *priv = dev->data->dev_private;
1389 uint16_t port_id = PORT_ID(priv);
1390 struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, rx_queue_id);
1391 uint16_t event_nums[1] = {MLX5_EVENT_TYPE_SRQ_LIMIT_REACHED};
1392 struct mlx5_rxq_data *rxq_data;
1393 uint32_t wqe_cnt;
1394 uint64_t cookie;
1395 int ret = 0;
1396
1397 if (!rxq) {
1398 rte_errno = EINVAL;
1399 return -rte_errno;
1400 }
1401 rxq_data = &rxq->ctrl->rxq;
1402 /* Ensure the RQ is created by DevX. */
1403 if (priv->obj_ops.rxq_obj_new != devx_obj_ops.rxq_obj_new) {
1404 rte_errno = EINVAL;
1405 return -rte_errno;
1406 }
1407 if (lwm > 99) {
1408 DRV_LOG(WARNING, "Too big LWM configuration.");
1409 rte_errno = E2BIG;
1410 return -rte_errno;
1411 }
1412 /* Start config LWM. */
1413 pthread_mutex_lock(&priv->sh->lwm_config_lock);
1414 if (rxq->lwm == 0 && lwm == 0) {
1415 /* Both old/new values are 0, do nothing.
*/ 1416 ret = 0; 1417 goto end; 1418 } 1419 wqe_cnt = 1 << (rxq_data->elts_n - rxq_data->sges_n); 1420 if (lwm) { 1421 if (!priv->sh->devx_channel_lwm) { 1422 ret = mlx5_lwm_setup(priv); 1423 if (ret) { 1424 DRV_LOG(WARNING, 1425 "Failed to create shared_lwm."); 1426 rte_errno = ENOMEM; 1427 ret = -rte_errno; 1428 goto end; 1429 } 1430 } 1431 if (!rxq->lwm_devx_subscribed) { 1432 cookie = ((uint32_t) 1433 (port_id << LWM_COOKIE_PORTID_OFFSET)) | 1434 (rx_queue_id << LWM_COOKIE_RXQID_OFFSET); 1435 ret = mlx5_os_devx_subscribe_devx_event 1436 (priv->sh->devx_channel_lwm, 1437 rxq->devx_rq.rq->obj, 1438 sizeof(event_nums), 1439 event_nums, 1440 cookie); 1441 if (ret) { 1442 rte_errno = rte_errno ? rte_errno : EINVAL; 1443 ret = -rte_errno; 1444 goto end; 1445 } 1446 rxq->lwm_devx_subscribed = 1; 1447 } 1448 } 1449 /* Save LWM to rxq and send modify_rq devx command. */ 1450 rxq->lwm = lwm * wqe_cnt / 100; 1451 /* Prevent integer division loss when switch lwm number to percentage. */ 1452 if (lwm && (lwm * wqe_cnt % 100)) { 1453 rxq->lwm = ((uint32_t)(rxq->lwm + 1) >= wqe_cnt) ? 1454 rxq->lwm : (rxq->lwm + 1); 1455 } 1456 if (lwm && !rxq->lwm) { 1457 /* With mprq, wqe_cnt may be < 100. */ 1458 DRV_LOG(WARNING, "Too small LWM configuration."); 1459 rte_errno = EINVAL; 1460 ret = -rte_errno; 1461 goto end; 1462 } 1463 ret = mlx5_devx_modify_rq(rxq, MLX5_RXQ_MOD_RDY2RDY); 1464 end: 1465 pthread_mutex_unlock(&priv->sh->lwm_config_lock); 1466 return ret; 1467 } 1468 1469 /** 1470 * Mlx5 access register function to configure host shaper. 1471 * It calls API in libmtcr_ul to access QSHR(Qos Shaper Host Register) 1472 * in firmware. 1473 * 1474 * @param dev 1475 * Pointer to rte_eth_dev. 1476 * @param lwm_triggered 1477 * Flag to enable/disable lwm_triggered bit in QSHR. 1478 * @param rate 1479 * Host shaper rate, unit is 100Mbps, set to 0 means disable the shaper. 1480 * @return 1481 * 0 : operation success. 1482 * Otherwise: 1483 * - ENOENT - no ibdev interface. 1484 * - EBUSY - the register access unit is busy. 1485 * - EIO - the register access command meets IO error. 1486 */ 1487 static int 1488 mlxreg_host_shaper_config(struct rte_eth_dev *dev, 1489 bool lwm_triggered, uint8_t rate) 1490 { 1491 #ifdef HAVE_MLX5_MSTFLINT 1492 struct mlx5_priv *priv = dev->data->dev_private; 1493 uint32_t data[MLX5_ST_SZ_DW(register_qshr)] = {0}; 1494 int rc, retry_count = 3; 1495 mfile *mf = NULL; 1496 int status; 1497 void *ptr; 1498 1499 mf = mopen(priv->sh->ibdev_name); 1500 if (!mf) { 1501 DRV_LOG(WARNING, "mopen failed\n"); 1502 rte_errno = ENOENT; 1503 return -rte_errno; 1504 } 1505 MLX5_SET(register_qshr, data, connected_host, 1); 1506 MLX5_SET(register_qshr, data, fast_response, lwm_triggered ? 1 : 0); 1507 MLX5_SET(register_qshr, data, local_port, 1); 1508 ptr = MLX5_ADDR_OF(register_qshr, data, global_config); 1509 MLX5_SET(ets_global_config_register, ptr, rate_limit_update, 1); 1510 MLX5_SET(ets_global_config_register, ptr, max_bw_units, 1511 rate ? 
ETS_GLOBAL_CONFIG_BW_UNIT_HUNDREDS_MBPS : 1512 ETS_GLOBAL_CONFIG_BW_UNIT_DISABLED); 1513 MLX5_SET(ets_global_config_register, ptr, max_bw_value, rate); 1514 do { 1515 rc = maccess_reg(mf, 1516 MLX5_QSHR_REGISTER_ID, 1517 MACCESS_REG_METHOD_SET, 1518 (u_int32_t *)&data[0], 1519 sizeof(data), 1520 sizeof(data), 1521 sizeof(data), 1522 &status); 1523 if ((rc != ME_ICMD_STATUS_IFC_BUSY && 1524 status != ME_REG_ACCESS_BAD_PARAM) || 1525 !(mf->flags & MDEVS_REM)) { 1526 break; 1527 } 1528 DRV_LOG(WARNING, "%s retry.", __func__); 1529 usleep(10000); 1530 } while (retry_count-- > 0); 1531 mclose(mf); 1532 rte_errno = (rc == ME_REG_ACCESS_DEV_BUSY) ? EBUSY : EIO; 1533 return rc ? -rte_errno : 0; 1534 #else 1535 (void)dev; 1536 (void)lwm_triggered; 1537 (void)rate; 1538 return -1; 1539 #endif 1540 } 1541 1542 int rte_pmd_mlx5_host_shaper_config(int port_id, uint8_t rate, 1543 uint32_t flags) 1544 { 1545 struct rte_eth_dev *dev = &rte_eth_devices[port_id]; 1546 struct mlx5_priv *priv = dev->data->dev_private; 1547 bool lwm_triggered = 1548 !!(flags & RTE_BIT32(MLX5_HOST_SHAPER_FLAG_AVAIL_THRESH_TRIGGERED)); 1549 1550 if (!lwm_triggered) { 1551 priv->sh->host_shaper_rate = rate; 1552 } else { 1553 switch (rate) { 1554 case 0: 1555 /* Rate 0 means disable lwm_triggered. */ 1556 priv->sh->lwm_triggered = 0; 1557 break; 1558 case 1: 1559 /* Rate 1 means enable lwm_triggered. */ 1560 priv->sh->lwm_triggered = 1; 1561 break; 1562 default: 1563 return -ENOTSUP; 1564 } 1565 } 1566 return mlxreg_host_shaper_config(dev, priv->sh->lwm_triggered, 1567 priv->sh->host_shaper_rate); 1568 } 1569
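/*
 * A minimal application-side sketch of how the LWM (available threshold) and
 * host shaper hooks implemented above are expected to be used. It is
 * illustrative only and not compiled as part of the driver; the callback
 * name and the port/queue/threshold values are placeholders.
 *
 *	#include <rte_ethdev.h>
 *	#include <rte_pmd_mlx5.h>
 *
 *	static int
 *	avail_thresh_event_cb(uint16_t port_id, enum rte_eth_event_type type,
 *			      void *cb_arg, void *ret_param)
 *	{
 *		uint16_t queue_id = 0;
 *		uint8_t thresh = 0;
 *
 *		RTE_SET_USED(type);
 *		RTE_SET_USED(cb_arg);
 *		RTE_SET_USED(ret_param);
 *		// Find queues with a pending event (mlx5_rx_queue_lwm_query)
 *		// and re-arm them (mlx5_rx_queue_lwm_set) after servicing.
 *		while (rte_eth_rx_avail_thresh_query(port_id, &queue_id,
 *						      &thresh) > 0) {
 *			rte_eth_rx_avail_thresh_set(port_id, queue_id, thresh);
 *			queue_id++;
 *		}
 *		return 0;
 *	}
 *
 *	// Setup: arm a 70% threshold on queue 0 and enable the firmware host
 *	// shaper in threshold-triggered mode (with the triggered flag set,
 *	// the rate argument acts as an on/off switch, see above).
 *	rte_eth_dev_callback_register(port_id, RTE_ETH_EVENT_RX_AVAIL_THRESH,
 *				      avail_thresh_event_cb, NULL);
 *	rte_eth_rx_avail_thresh_set(port_id, 0, 70);
 *	rte_pmd_mlx5_host_shaper_config(port_id, 1,
 *		RTE_BIT32(MLX5_HOST_SHAPER_FLAG_AVAIL_THRESH_TRIGGERED));
 */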