/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stddef.h>
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include "mlx5_utils.h"
#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"

/**
 * Allocate TX queue elements.
 *
 * @param txq_ctrl
 *   Pointer to TX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 */
static void
txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
{
	unsigned int i;

	for (i = 0; (i != elts_n); ++i)
		(*txq_ctrl->txq.elts)[i] = NULL;
	for (i = 0; (i != (1u << txq_ctrl->txq.wqe_n)); ++i) {
		volatile struct mlx5_wqe64 *wqe =
			(volatile struct mlx5_wqe64 *)
			txq_ctrl->txq.wqes + i;

		memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
	}
	DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
	txq_ctrl->txq.elts_head = 0;
	txq_ctrl->txq.elts_tail = 0;
	txq_ctrl->txq.elts_comp = 0;
}
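
/*
 * Illustrative note (not part of the driver logic): elts[] behaves as a
 * ring of 1 << elts_n mbuf pointers. elts_head is advanced when a packet
 * is posted, elts_tail when its completion allows the mbuf to be freed,
 * and both wrap at the ring size, as in txq_free_elts() below:
 *
 *	unsigned int elts_head = txq->elts_head;
 *
 *	(*txq->elts)[elts_head] = mbuf;
 *	if (++elts_head == (1u << txq->elts_n))
 *		elts_head = 0;
 *	txq->elts_head = elts_head;
 */
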
/**
 * Free TX queue elements.
 *
 * @param txq_ctrl
 *   Pointer to TX queue structure.
 */
static void
txq_free_elts(struct txq_ctrl *txq_ctrl)
{
	unsigned int elts_n = 1 << txq_ctrl->txq.elts_n;
	unsigned int elts_head = txq_ctrl->txq.elts_head;
	unsigned int elts_tail = txq_ctrl->txq.elts_tail;
	struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;

	DEBUG("%p: freeing WRs", (void *)txq_ctrl);
	txq_ctrl->txq.elts_head = 0;
	txq_ctrl->txq.elts_tail = 0;
	txq_ctrl->txq.elts_comp = 0;

	while (elts_tail != elts_head) {
		struct rte_mbuf *elt = (*elts)[elts_tail];

		assert(elt != NULL);
		rte_pktmbuf_free(elt);
#ifndef NDEBUG
		/* Poisoning. */
		memset(&(*elts)[elts_tail],
		       0x77,
		       sizeof((*elts)[elts_tail]));
#endif
		if (++elts_tail == elts_n)
			elts_tail = 0;
	}
}

/**
 * Clean up a TX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param txq_ctrl
 *   Pointer to TX queue structure.
 */
void
txq_cleanup(struct txq_ctrl *txq_ctrl)
{
	size_t i;

	DEBUG("cleaning up %p", (void *)txq_ctrl);
	txq_free_elts(txq_ctrl);
	if (txq_ctrl->qp != NULL)
		claim_zero(ibv_destroy_qp(txq_ctrl->qp));
	if (txq_ctrl->cq != NULL)
		claim_zero(ibv_destroy_cq(txq_ctrl->cq));
	for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) {
		if (txq_ctrl->txq.mp2mr[i].mp == NULL)
			break;
		assert(txq_ctrl->txq.mp2mr[i].mr != NULL);
		claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[i].mr));
	}
	memset(txq_ctrl, 0, sizeof(*txq_ctrl));
}

/**
 * Initialize TX queue.
 *
 * @param tmpl
 *   Pointer to TX queue control template.
 * @param txq_ctrl
 *   Pointer to TX queue control.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static inline int
txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
{
	struct mlx5_qp *qp = to_mqp(tmpl->qp);
	struct ibv_cq *ibcq = tmpl->cq;
	struct mlx5_cq *cq = to_mxxx(cq, cq);

	if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
		return EINVAL;
	}
	tmpl->txq.cqe_n = log2above(ibcq->cqe);
	tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
	tmpl->txq.wqes = qp->gen_data.sqstart;
	tmpl->txq.wqe_n = log2above(qp->sq.wqe_cnt);
	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
	tmpl->txq.bf_reg = qp->gen_data.bf->reg;
	tmpl->txq.cq_db = cq->dbrec;
	tmpl->txq.cqes =
		(volatile struct mlx5_cqe (*)[])
		(uintptr_t)cq->active_buf->buf;
	tmpl->txq.elts =
		(struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])
		((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
	return 0;
}
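
/*
 * Note on the encoding above (illustrative only, field names as declared
 * in mlx5_rxtx.h): cqe_n, wqe_n and elts_n store log2 of the ring sizes,
 * so the datapath can derive sizes and wrap-around masks cheaply:
 *
 *	unsigned int wqe_cnt = 1 << txq->wqe_n;
 *	unsigned int wqe_mask = (1 << txq->wqe_n) - 1;
 *	unsigned int cqe_cnt = 1 << txq->cqe_n;
 */
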
/**
 * Configure a TX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param txq_ctrl
 *   Pointer to TX queue structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
	       uint16_t desc, unsigned int socket,
	       const struct rte_eth_txconf *conf)
{
	struct priv *priv = mlx5_get_priv(dev);
	struct txq_ctrl tmpl = {
		.priv = priv,
		.socket = socket,
	};
	union {
		struct ibv_exp_qp_init_attr init;
		struct ibv_exp_cq_init_attr cq;
		struct ibv_exp_qp_attr mod;
		struct ibv_exp_cq_attr cq_attr;
	} attr;
	unsigned int cqe_n;
	int ret = 0;

	if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
		ret = ENOTSUP;
		ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set");
		goto error;
	}
	(void)conf; /* Thresholds configuration (ignored). */
	assert(desc > MLX5_TX_COMP_THRESH);
	tmpl.txq.elts_n = log2above(desc);
	if (priv->mps == MLX5_MPW_ENHANCED)
		tmpl.txq.mpw_hdr_dseg = priv->mpw_hdr_dseg;
	/* MRs will be registered in mp2mr[] later. */
	attr.cq = (struct ibv_exp_cq_init_attr){
		.comp_mask = 0,
	};
	cqe_n = ((desc / MLX5_TX_COMP_THRESH) - 1) ?
		((desc / MLX5_TX_COMP_THRESH) - 1) : 1;
	if (priv->mps == MLX5_MPW_ENHANCED)
		cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV;
	tmpl.cq = ibv_exp_create_cq(priv->ctx,
				    cqe_n,
				    NULL, NULL, 0, &attr.cq);
	if (tmpl.cq == NULL) {
		ret = ENOMEM;
		ERROR("%p: CQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	DEBUG("priv->device_attr.max_qp_wr is %d",
	      priv->device_attr.max_qp_wr);
	DEBUG("priv->device_attr.max_sge is %d",
	      priv->device_attr.max_sge);
	attr.init = (struct ibv_exp_qp_init_attr){
		/* CQ to be associated with the send queue. */
		.send_cq = tmpl.cq,
		/* CQ to be associated with the receive queue. */
		.recv_cq = tmpl.cq,
		.cap = {
			/* Max number of outstanding WRs. */
			.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
					priv->device_attr.max_qp_wr :
					desc),
			/*
			 * Max number of scatter/gather elements in a WR,
			 * must be 1 to prevent libmlx5 from trying to affect
			 * too much memory. TX gather is not impacted by the
			 * priv->device_attr.max_sge limit and will still work
			 * properly.
			 */
			.max_send_sge = 1,
		},
		.qp_type = IBV_QPT_RAW_PACKET,
		/* Do *NOT* enable this, completion events are managed per
		 * TX burst. */
		.sq_sig_all = 0,
		.pd = priv->pd,
		.comp_mask = IBV_EXP_QP_INIT_ATTR_PD,
	};
	if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
		tmpl.txq.max_inline =
			((priv->txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
			 RTE_CACHE_LINE_SIZE);
		tmpl.txq.inline_en = 1;
		/* TSO and MPS can't be enabled concurrently. */
		assert(!priv->tso || !priv->mps);
		if (priv->mps == MLX5_MPW_ENHANCED) {
			tmpl.txq.inline_max_packet_sz =
				priv->inline_max_packet_sz;
			/* To keep the data set small, avoid requesting a WQ
			 * that is too large. */
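			/*
			 * Worked example (hypothetical values): with
			 * txq_inline = 200, inline_max_packet_sz = 256 and
			 * 64-byte cache lines, RTE_MIN() yields 200, which
			 * rounds up to ((200 + 63) / 64) * 64 = 256 bytes
			 * of inline data.
			 */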
			attr.init.cap.max_inline_data =
				((RTE_MIN(priv->txq_inline,
					  priv->inline_max_packet_sz) +
				  (RTE_CACHE_LINE_SIZE - 1)) /
				 RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE;
		} else {
			attr.init.cap.max_inline_data =
				tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
		}
	}
	if (priv->tso) {
		uint16_t max_tso_inline = ((MLX5_MAX_TSO_HEADER +
					    (RTE_CACHE_LINE_SIZE - 1)) /
					   RTE_CACHE_LINE_SIZE);

		attr.init.max_tso_header =
			max_tso_inline * RTE_CACHE_LINE_SIZE;
		attr.init.comp_mask |= IBV_EXP_QP_INIT_ATTR_MAX_TSO_HEADER;
		tmpl.txq.max_inline = RTE_MAX(tmpl.txq.max_inline,
					      max_tso_inline);
		tmpl.txq.tso_en = 1;
	}
	if (priv->tunnel_en)
		tmpl.txq.tunnel_en = 1;
	tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
	if (tmpl.qp == NULL) {
		ret = (errno ? errno : EINVAL);
		ERROR("%p: QP creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	DEBUG("TX queue capabilities: max_send_wr=%u, max_send_sge=%u,"
	      " max_inline_data=%u",
	      attr.init.cap.max_send_wr,
	      attr.init.cap.max_send_sge,
	      attr.init.cap.max_inline_data);
	attr.mod = (struct ibv_exp_qp_attr){
		/* Move the QP to this state. */
		.qp_state = IBV_QPS_INIT,
		/* Primary port number. */
		.port_num = priv->port
	};
	ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
				(IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
	if (ret) {
		ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	ret = txq_setup(&tmpl, txq_ctrl);
	if (ret) {
		ERROR("%p: cannot initialize TX queue structure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	txq_alloc_elts(&tmpl, desc);
	attr.mod = (struct ibv_exp_qp_attr){
		.qp_state = IBV_QPS_RTR
	};
	ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
	if (ret) {
		ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.mod.qp_state = IBV_QPS_RTS;
	ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
	if (ret) {
		ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Clean up txq in case we're reinitializing it. */
	DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl);
	txq_cleanup(txq_ctrl);
	*txq_ctrl = tmpl;
	DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl);
	/* Pre-register known mempools. */
	rte_mempool_walk(txq_mp2mr_iter, txq_ctrl);
	assert(ret == 0);
	return 0;
error:
	txq_cleanup(&tmpl);
	assert(ret > 0);
	return ret;
}
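
/*
 * Usage sketch (application side, illustrative only): the callback below
 * is reached through the generic DPDK API, e.g.:
 *
 *	ret = rte_eth_tx_queue_setup(port_id, 0, 512, rte_socket_id(), NULL);
 *	if (ret < 0)
 *		rte_exit(EXIT_FAILURE, "TX queue setup failed: %s\n",
 *			 strerror(-ret));
 *
 * 512 descriptors is an arbitrary example value; the count must exceed
 * MLX5_TX_COMP_THRESH and is rounded up to the next power of two below.
 */
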
/**
 * DPDK callback to configure a TX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param idx
 *   TX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
		    unsigned int socket, const struct rte_eth_txconf *conf)
{
	struct priv *priv = dev->data->dev_private;
	struct txq *txq = (*priv->txqs)[idx];
	struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
	int ret;

	if (mlx5_is_secondary())
		return -E_RTE_SECONDARY;

	priv_lock(priv);
	if (desc <= MLX5_TX_COMP_THRESH) {
		WARN("%p: number of descriptors requested for TX queue %u"
		     " must be higher than MLX5_TX_COMP_THRESH, using"
		     " %u instead of %u",
		     (void *)dev, idx, MLX5_TX_COMP_THRESH + 1, desc);
		desc = MLX5_TX_COMP_THRESH + 1;
	}
	if (!rte_is_power_of_2(desc)) {
		desc = 1 << log2above(desc);
		WARN("%p: increased number of descriptors in TX queue %u"
		     " to the next power of two (%d)",
		     (void *)dev, idx, desc);
	}
	DEBUG("%p: configuring queue %u for %u descriptors",
	      (void *)dev, idx, desc);
	if (idx >= priv->txqs_n) {
		ERROR("%p: queue index out of range (%u >= %u)",
		      (void *)dev, idx, priv->txqs_n);
		priv_unlock(priv);
		return -EOVERFLOW;
	}
	if (txq != NULL) {
		DEBUG("%p: reusing already allocated queue index %u (%p)",
		      (void *)dev, idx, (void *)txq);
		if (priv->started) {
			priv_unlock(priv);
			return -EEXIST;
		}
		(*priv->txqs)[idx] = NULL;
		txq_cleanup(txq_ctrl);
		/* Resize if txq size is changed. */
		if (txq_ctrl->txq.elts_n != log2above(desc)) {
			txq_ctrl = rte_realloc(txq_ctrl,
					       sizeof(*txq_ctrl) +
					       desc * sizeof(struct rte_mbuf *),
					       RTE_CACHE_LINE_SIZE);
			if (!txq_ctrl) {
				ERROR("%p: unable to reallocate queue index %u",
				      (void *)dev, idx);
				priv_unlock(priv);
				return -ENOMEM;
			}
		}
	} else {
		txq_ctrl =
			rte_calloc_socket("TXQ", 1,
					  sizeof(*txq_ctrl) +
					  desc * sizeof(struct rte_mbuf *),
					  0, socket);
		if (txq_ctrl == NULL) {
			ERROR("%p: unable to allocate queue index %u",
			      (void *)dev, idx);
			priv_unlock(priv);
			return -ENOMEM;
		}
	}
	ret = txq_ctrl_setup(dev, txq_ctrl, desc, socket, conf);
	if (ret)
		rte_free(txq_ctrl);
	else {
		txq_ctrl->txq.stats.idx = idx;
		DEBUG("%p: adding TX queue %p to list",
		      (void *)dev, (void *)txq_ctrl);
		(*priv->txqs)[idx] = &txq_ctrl->txq;
		/* Update send callback. */
		priv_select_tx_function(priv);
	}
	priv_unlock(priv);
	return -ret;
}

/**
 * DPDK callback to release a TX queue.
 *
 * @param dpdk_txq
 *   Generic TX queue pointer.
 */
void
mlx5_tx_queue_release(void *dpdk_txq)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	struct txq_ctrl *txq_ctrl;
	struct priv *priv;
	unsigned int i;

	if (mlx5_is_secondary())
		return;

	if (txq == NULL)
		return;
	txq_ctrl = container_of(txq, struct txq_ctrl, txq);
	priv = txq_ctrl->priv;
	priv_lock(priv);
	for (i = 0; (i != priv->txqs_n); ++i)
		if ((*priv->txqs)[i] == txq) {
			DEBUG("%p: removing TX queue %p from list",
			      (void *)priv->dev, (void *)txq_ctrl);
			(*priv->txqs)[i] = NULL;
			break;
		}
	txq_cleanup(txq_ctrl);
	rte_free(txq_ctrl);
	priv_unlock(priv);
}
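
/*
 * Memory layout note (illustrative only): mlx5_tx_queue_setup() above
 * allocates struct txq_ctrl and the elts[] ring in a single block, which
 * is why txq_setup() derives the ring address from the control structure:
 *
 *	struct rte_mbuf *(*elts)[1 << elts_n] =
 *		(void *)((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
 *
 * and why the datapath can recover the control structure from a
 * struct txq pointer with container_of(txq, struct txq_ctrl, txq), as in
 * the release callback above.
 */
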
/**
 * DPDK callback for TX in secondary processes.
 *
 * This function configures all queues from primary process information
 * if necessary before reverting to the normal TX burst callback.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts,
			      uint16_t pkts_n)
{
	struct txq *txq = dpdk_txq;
	struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
	struct priv *priv = mlx5_secondary_data_setup(txq_ctrl->priv);
	struct priv *primary_priv;
	unsigned int index;

	if (priv == NULL)
		return 0;
	primary_priv =
		mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
	/* Look for queue index in both private structures. */
	for (index = 0; index != priv->txqs_n; ++index)
		if (((*primary_priv->txqs)[index] == txq) ||
		    ((*priv->txqs)[index] == txq))
			break;
	if (index == priv->txqs_n)
		return 0;
	txq = (*priv->txqs)[index];
	return priv->dev->tx_pkt_burst(txq, pkts, pkts_n);
}