1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <stddef.h>
7 #include <errno.h>
8 #include <string.h>
9 #include <stdint.h>
10 #include <unistd.h>
11 #include <inttypes.h>
12 
13 #include <rte_mbuf.h>
14 #include <rte_malloc.h>
15 #include <ethdev_driver.h>
16 #include <bus_pci_driver.h>
17 #include <rte_common.h>
18 #include <rte_eal_paging.h>
19 
20 #include <mlx5_common.h>
21 #include <mlx5_common_mr.h>
22 #include <mlx5_malloc.h>
23 
24 #include "mlx5_defs.h"
25 #include "mlx5_utils.h"
26 #include "mlx5.h"
27 #include "mlx5_tx.h"
28 #include "mlx5_rxtx.h"
29 #include "mlx5_autoconf.h"
30 #include "mlx5_devx.h"
31 #include "rte_pmd_mlx5.h"
32 #include "mlx5_flow.h"
33 
34 /**
35  * Allocate TX queue elements.
36  *
37  * @param txq_ctrl
38  *   Pointer to TX queue structure.
39  */
40 void
41 txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl)
42 {
43 	const unsigned int elts_n = 1 << txq_ctrl->txq.elts_n;
44 	unsigned int i;
45 
46 	for (i = 0; (i != elts_n); ++i)
47 		txq_ctrl->txq.elts[i] = NULL;
48 	DRV_LOG(DEBUG, "port %u Tx queue %u allocated and configured %u WRs",
49 		PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx, elts_n);
50 	txq_ctrl->txq.elts_head = 0;
51 	txq_ctrl->txq.elts_tail = 0;
52 	txq_ctrl->txq.elts_comp = 0;
53 }
54 
55 /**
56  * Free TX queue elements.
57  *
58  * @param txq_ctrl
59  *   Pointer to TX queue structure.
60  */
61 void
62 txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl)
63 {
64 	const uint16_t elts_n = 1 << txq_ctrl->txq.elts_n;
65 	const uint16_t elts_m = elts_n - 1;
66 	uint16_t elts_head = txq_ctrl->txq.elts_head;
67 	uint16_t elts_tail = txq_ctrl->txq.elts_tail;
68 	struct rte_mbuf *(*elts)[] = &txq_ctrl->txq.elts;
69 
70 	DRV_LOG(DEBUG, "port %u Tx queue %u freeing WRs",
71 		PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx);
72 	txq_ctrl->txq.elts_head = 0;
73 	txq_ctrl->txq.elts_tail = 0;
74 	txq_ctrl->txq.elts_comp = 0;
75 
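	/* Free every mbuf still queued between the saved tail and head. */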
76 	while (elts_tail != elts_head) {
77 		struct rte_mbuf *elt = (*elts)[elts_tail & elts_m];
78 
79 		MLX5_ASSERT(elt != NULL);
80 		rte_pktmbuf_free_seg(elt);
81 #ifdef RTE_LIBRTE_MLX5_DEBUG
82 		/* Poisoning. */
83 		memset(&(*elts)[elts_tail & elts_m],
84 		       0x77,
85 		       sizeof((*elts)[elts_tail & elts_m]));
86 #endif
87 		++elts_tail;
88 	}
89 }
90 
91 /**
92  * Returns the per-port supported offloads.
93  *
94  * @param dev
95  *   Pointer to Ethernet device.
96  *
97  * @return
98  *   Supported Tx offloads.
99  */
100 uint64_t
101 mlx5_get_tx_port_offloads(struct rte_eth_dev *dev)
102 {
103 	struct mlx5_priv *priv = dev->data->dev_private;
104 	uint64_t offloads = (RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
105 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT);
106 	struct mlx5_port_config *config = &priv->config;
107 	struct mlx5_dev_cap *dev_cap = &priv->sh->dev_cap;
108 
109 	if (dev_cap->hw_csum)
110 		offloads |= (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
111 			     RTE_ETH_TX_OFFLOAD_UDP_CKSUM |
112 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM);
113 	if (dev_cap->tso)
114 		offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO;
115 	if (priv->sh->config.tx_pp ||
116 	    priv->sh->cdev->config.hca_attr.wait_on_time)
117 		offloads |= RTE_ETH_TX_OFFLOAD_SEND_ON_TIMESTAMP;
118 	if (dev_cap->swp) {
119 		if (dev_cap->swp & MLX5_SW_PARSING_CSUM_CAP)
120 			offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM;
121 		if (dev_cap->swp & MLX5_SW_PARSING_TSO_CAP)
122 			offloads |= (RTE_ETH_TX_OFFLOAD_IP_TNL_TSO |
123 				     RTE_ETH_TX_OFFLOAD_UDP_TNL_TSO);
124 	}
125 	if (dev_cap->tunnel_en) {
126 		if (dev_cap->hw_csum)
127 			offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM;
128 		if (dev_cap->tso) {
129 			if (dev_cap->tunnel_en &
130 				MLX5_TUNNELED_OFFLOADS_VXLAN_CAP)
131 				offloads |= RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
132 			if (dev_cap->tunnel_en &
133 				MLX5_TUNNELED_OFFLOADS_GRE_CAP)
134 				offloads |= RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO;
135 			if (dev_cap->tunnel_en &
136 				MLX5_TUNNELED_OFFLOADS_GENEVE_CAP)
137 				offloads |= RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
138 		}
139 	}
140 	if (!config->mprq.enabled)
141 		offloads |= RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
142 	return offloads;
143 }
144 
145 /* Fetches and drops all SW-owned and error CQEs to synchronize CQ. */
146 static void
147 txq_sync_cq(struct mlx5_txq_data *txq)
148 {
149 	volatile struct mlx5_cqe *cqe;
150 	int ret, i;
151 
152 	i = txq->cqe_s;
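	/* Scan at most one full CQ round, consuming SW-owned and error CQEs. */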
153 	do {
154 		cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
155 		ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
156 		if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
157 			if (likely(ret != MLX5_CQE_STATUS_ERR)) {
158 				/* No new CQEs in completion queue. */
159 				MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN);
160 				break;
161 			}
162 		}
163 		++txq->cq_ci;
164 	} while (--i);
165 	/* Move all CQEs to HW ownership. */
166 	for (i = 0; i < txq->cqe_s; i++) {
167 		cqe = &txq->cqes[i];
168 		cqe->op_own = MLX5_CQE_INVALIDATE;
169 	}
170 	/* Resync CQE and WQE (WQ in reset state). */
171 	rte_io_wmb();
172 	*txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
173 	txq->cq_pi = txq->cq_ci;
174 	rte_io_wmb();
175 }
176 
177 /**
178  * Tx queue stop. Device queue goes to the idle state,
179  * all involved mbufs are freed from elts/WQ.
180  *
181  * @param dev
182  *   Pointer to Ethernet device structure.
183  * @param idx
184  *   Tx queue index.
185  *
186  * @return
187  *   0 on success, a negative errno value otherwise and rte_errno is set.
188  */
189 int
190 mlx5_tx_queue_stop_primary(struct rte_eth_dev *dev, uint16_t idx)
191 {
192 	struct mlx5_priv *priv = dev->data->dev_private;
193 	struct mlx5_txq_data *txq = (*priv->txqs)[idx];
194 	struct mlx5_txq_ctrl *txq_ctrl =
195 			container_of(txq, struct mlx5_txq_ctrl, txq);
196 	int ret;
197 
198 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
199 	/* Move QP to RESET state. */
200 	ret = priv->obj_ops.txq_obj_modify(txq_ctrl->obj, MLX5_TXQ_MOD_RDY2RST,
201 					   (uint8_t)priv->dev_port);
202 	if (ret)
203 		return ret;
204 	/* Handle all send completions. */
205 	txq_sync_cq(txq);
206 	/* Free elts stored in the SQ. */
207 	txq_free_elts(txq_ctrl);
208 	/* Prevent writing new packets to the SQ by leaving no free WQEs. */
209 	txq->wqe_ci = txq->wqe_s;
210 	txq->wqe_pi = 0;
211 	txq->elts_comp = 0;
212 	/* Set the actual queue state. */
213 	dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STOPPED;
214 	return 0;
215 }
216 
217 /**
218  * Tx queue stop. Device queue goes to the idle state,
219  * all involved mbufs are freed from elts/WQ.
220  *
221  * @param dev
222  *   Pointer to Ethernet device structure.
223  * @param idx
224  *   Tx queue index.
225  *
226  * @return
227  *   0 on success, a negative errno value otherwise and rte_errno is set.
228  */
229 int
230 mlx5_tx_queue_stop(struct rte_eth_dev *dev, uint16_t idx)
231 {
232 	int ret;
233 
234 	if (rte_eth_dev_is_tx_hairpin_queue(dev, idx)) {
235 		DRV_LOG(ERR, "Hairpin queue can't be stopped");
236 		rte_errno = EINVAL;
237 		return -EINVAL;
238 	}
239 	if (dev->data->tx_queue_state[idx] == RTE_ETH_QUEUE_STATE_STOPPED)
240 		return 0;
241 	if (rte_eal_process_type() ==  RTE_PROC_SECONDARY) {
242 		ret = mlx5_mp_os_req_queue_control(dev, idx,
243 						   MLX5_MP_REQ_QUEUE_TX_STOP);
244 	} else {
245 		ret = mlx5_tx_queue_stop_primary(dev, idx);
246 	}
247 	return ret;
248 }
249 
250 /**
251  * Tx queue start. Device queue goes to the ready state,
252  * the SQ indexes are reset so transmission can resume.
253  *
254  * @param dev
255  *   Pointer to Ethernet device structure.
256  * @param idx
257  *   Tx queue index.
258  *
259  * @return
260  *   0 on success, a negative errno value otherwise and rte_errno is set.
261  */
262 int
263 mlx5_tx_queue_start_primary(struct rte_eth_dev *dev, uint16_t idx)
264 {
265 	struct mlx5_priv *priv = dev->data->dev_private;
266 	struct mlx5_txq_data *txq = (*priv->txqs)[idx];
267 	struct mlx5_txq_ctrl *txq_ctrl =
268 			container_of(txq, struct mlx5_txq_ctrl, txq);
269 	int ret;
270 
271 	MLX5_ASSERT(rte_eal_process_type() ==  RTE_PROC_PRIMARY);
272 	ret = priv->obj_ops.txq_obj_modify(txq_ctrl->obj,
273 					   MLX5_TXQ_MOD_RST2RDY,
274 					   (uint8_t)priv->dev_port);
275 	if (ret)
276 		return ret;
277 	txq_ctrl->txq.wqe_ci = 0;
278 	txq_ctrl->txq.wqe_pi = 0;
279 	txq_ctrl->txq.elts_comp = 0;
280 	/* Set the actual queue state. */
281 	dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STARTED;
282 	return 0;
283 }
284 
285 /**
286  * Tx queue start. Device queue goes to the ready state,
287  * the SQ indexes are reset so transmission can resume.
288  *
289  * @param dev
290  *   Pointer to Ethernet device structure.
291  * @param idx
292  *   Tx queue index.
293  *
294  * @return
295  *   0 on success, a negative errno value otherwise and rte_errno is set.
296  */
297 int
298 mlx5_tx_queue_start(struct rte_eth_dev *dev, uint16_t idx)
299 {
300 	int ret;
301 
302 	if (rte_eth_dev_is_tx_hairpin_queue(dev, idx)) {
303 		DRV_LOG(ERR, "Hairpin queue can't be started");
304 		rte_errno = EINVAL;
305 		return -EINVAL;
306 	}
307 	if (dev->data->tx_queue_state[idx] == RTE_ETH_QUEUE_STATE_STARTED)
308 		return 0;
309 	if (rte_eal_process_type() ==  RTE_PROC_SECONDARY) {
310 		ret = mlx5_mp_os_req_queue_control(dev, idx,
311 						   MLX5_MP_REQ_QUEUE_TX_START);
312 	} else {
313 		ret = mlx5_tx_queue_start_primary(dev, idx);
314 	}
315 	return ret;
316 }
317 
318 /**
319  * Tx queue presetup checks.
320  *
321  * @param dev
322  *   Pointer to Ethernet device structure.
323  * @param idx
324  *   Tx queue index.
325  * @param desc
326  *   Number of descriptors to configure in queue.
327  *
328  * @return
329  *   0 on success, a negative errno value otherwise and rte_errno is set.
330  */
331 static int
332 mlx5_tx_queue_pre_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t *desc)
333 {
334 	struct mlx5_priv *priv = dev->data->dev_private;
335 
336 	if (*desc > 1 << priv->sh->cdev->config.hca_attr.log_max_wq_sz) {
337 		DRV_LOG(ERR,
338 			"port %u number of descriptors requested for Tx queue"
339 			" %u is more than supported",
340 			dev->data->port_id, idx);
341 		rte_errno = EINVAL;
342 		return -EINVAL;
343 	}
344 	if (*desc <= MLX5_TX_COMP_THRESH) {
345 		DRV_LOG(WARNING,
346 			"port %u number of descriptors requested for Tx queue"
347 			" %u must be higher than MLX5_TX_COMP_THRESH, using %u"
348 			" instead of %u", dev->data->port_id, idx,
349 			MLX5_TX_COMP_THRESH + 1, *desc);
350 		*desc = MLX5_TX_COMP_THRESH + 1;
351 	}
352 	if (!rte_is_power_of_2(*desc)) {
353 		*desc = 1 << log2above(*desc);
354 		DRV_LOG(WARNING,
355 			"port %u increased number of descriptors in Tx queue"
356 			" %u to the next power of two (%d)",
357 			dev->data->port_id, idx, *desc);
358 	}
359 	DRV_LOG(DEBUG, "port %u configuring queue %u for %u descriptors",
360 		dev->data->port_id, idx, *desc);
361 	if (idx >= priv->txqs_n) {
362 		DRV_LOG(ERR, "port %u Tx queue index out of range (%u >= %u)",
363 			dev->data->port_id, idx, priv->txqs_n);
364 		rte_errno = EOVERFLOW;
365 		return -rte_errno;
366 	}
367 	if (!mlx5_txq_releasable(dev, idx)) {
368 		rte_errno = EBUSY;
369 		DRV_LOG(ERR, "port %u unable to release queue index %u",
370 			dev->data->port_id, idx);
371 		return -rte_errno;
372 	}
373 	mlx5_txq_release(dev, idx);
374 	return 0;
375 }
376 
377 /**
378  * DPDK callback to configure a TX queue.
379  *
380  * @param dev
381  *   Pointer to Ethernet device structure.
382  * @param idx
383  *   TX queue index.
384  * @param desc
385  *   Number of descriptors to configure in queue.
386  * @param socket
387  *   NUMA socket on which memory must be allocated.
388  * @param[in] conf
389  *   Thresholds parameters.
390  *
391  * @return
392  *   0 on success, a negative errno value otherwise and rte_errno is set.
393  */
394 int
395 mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
396 		    unsigned int socket, const struct rte_eth_txconf *conf)
397 {
398 	struct mlx5_priv *priv = dev->data->dev_private;
399 	struct mlx5_txq_data *txq = (*priv->txqs)[idx];
400 	struct mlx5_txq_ctrl *txq_ctrl =
401 		container_of(txq, struct mlx5_txq_ctrl, txq);
402 	int res;
403 
404 	res = mlx5_tx_queue_pre_setup(dev, idx, &desc);
405 	if (res)
406 		return res;
407 	txq_ctrl = mlx5_txq_new(dev, idx, desc, socket, conf);
408 	if (!txq_ctrl) {
409 		DRV_LOG(ERR, "port %u unable to allocate queue index %u",
410 			dev->data->port_id, idx);
411 		return -rte_errno;
412 	}
413 	DRV_LOG(DEBUG, "port %u adding Tx queue %u to list",
414 		dev->data->port_id, idx);
415 	(*priv->txqs)[idx] = &txq_ctrl->txq;
416 	return 0;
417 }
418 
419 /**
420  * DPDK callback to configure a TX hairpin queue.
421  *
422  * @param dev
423  *   Pointer to Ethernet device structure.
424  * @param idx
425  *   TX queue index.
426  * @param desc
427  *   Number of descriptors to configure in queue.
428  * @param[in] hairpin_conf
429  *   The hairpin binding configuration.
430  *
431  * @return
432  *   0 on success, a negative errno value otherwise and rte_errno is set.
433  */
434 int
435 mlx5_tx_hairpin_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
436 			    uint16_t desc,
437 			    const struct rte_eth_hairpin_conf *hairpin_conf)
438 {
439 	struct mlx5_priv *priv = dev->data->dev_private;
440 	struct mlx5_txq_data *txq = (*priv->txqs)[idx];
441 	struct mlx5_txq_ctrl *txq_ctrl =
442 		container_of(txq, struct mlx5_txq_ctrl, txq);
443 	int res;
444 
445 	res = mlx5_tx_queue_pre_setup(dev, idx, &desc);
446 	if (res)
447 		return res;
448 	if (hairpin_conf->peer_count != 1) {
449 		rte_errno = EINVAL;
450 		DRV_LOG(ERR, "port %u unable to setup Tx hairpin queue index %u"
451 			" peer count is %u", dev->data->port_id,
452 			idx, hairpin_conf->peer_count);
453 		return -rte_errno;
454 	}
455 	if (hairpin_conf->peers[0].port == dev->data->port_id) {
456 		if (hairpin_conf->peers[0].queue >= priv->rxqs_n) {
457 			rte_errno = EINVAL;
458 			DRV_LOG(ERR, "port %u unable to setup Tx hairpin queue"
459 				" index %u, Rx queue %u is not less than %u",
460 				dev->data->port_id, idx,
461 				hairpin_conf->peers[0].queue, priv->rxqs_n);
462 			return -rte_errno;
463 		}
464 	} else {
465 		if (hairpin_conf->manual_bind == 0 ||
466 		    hairpin_conf->tx_explicit == 0) {
467 			rte_errno = EINVAL;
468 			DRV_LOG(ERR, "port %u unable to setup Tx hairpin queue"
469 				" index %u peer port %u with attributes %u %u",
470 				dev->data->port_id, idx,
471 				hairpin_conf->peers[0].port,
472 				hairpin_conf->manual_bind,
473 				hairpin_conf->tx_explicit);
474 			return -rte_errno;
475 		}
476 	}
477 	txq_ctrl = mlx5_txq_hairpin_new(dev, idx, desc, hairpin_conf);
478 	if (!txq_ctrl) {
479 		DRV_LOG(ERR, "port %u unable to allocate queue index %u",
480 			dev->data->port_id, idx);
481 		return -rte_errno;
482 	}
483 	DRV_LOG(DEBUG, "port %u adding Tx queue %u to list",
484 		dev->data->port_id, idx);
485 	(*priv->txqs)[idx] = &txq_ctrl->txq;
486 	dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_HAIRPIN;
487 	return 0;
488 }
489 
490 /**
491  * DPDK callback to release a TX queue.
492  *
493  * @param dev
494  *   Pointer to Ethernet device structure.
495  * @param qid
496  *   Transmit queue index.
497  */
498 void
499 mlx5_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
500 {
501 	struct mlx5_txq_data *txq = dev->data->tx_queues[qid];
502 
503 	if (txq == NULL)
504 		return;
505 	DRV_LOG(DEBUG, "port %u removing Tx queue %u from list",
506 		dev->data->port_id, qid);
507 	mlx5_txq_release(dev, qid);
508 }
509 
510 /**
511  * Remap UAR register of a Tx queue for secondary process.
512  *
513  * Remapped address is stored at the table in the process private structure of
514  * the device, indexed by queue index.
515  *
516  * @param txq_ctrl
517  *   Pointer to Tx queue control structure.
518  * @param fd
519  *   Verbs file descriptor to map UAR pages.
520  *
521  * @return
522  *   0 on success, a negative errno value otherwise and rte_errno is set.
523  */
524 static int
525 txq_uar_init_secondary(struct mlx5_txq_ctrl *txq_ctrl, int fd)
526 {
527 	struct mlx5_priv *priv = txq_ctrl->priv;
528 	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv));
529 	struct mlx5_proc_priv *primary_ppriv = priv->sh->pppriv;
530 	struct mlx5_txq_data *txq = &txq_ctrl->txq;
531 	void *addr;
532 	uintptr_t uar_va;
533 	uintptr_t offset;
534 	const size_t page_size = rte_mem_page_size();
535 	if (page_size == (size_t)-1) {
536 		DRV_LOG(ERR, "Failed to get mem page size");
537 		rte_errno = ENOMEM;
538 		return -rte_errno;
539 	}
540 
541 	if (txq_ctrl->is_hairpin)
542 		return 0;
543 	MLX5_ASSERT(ppriv);
544 	/*
545 	 * As in rdma-core, UARs are mapped with OS page size
546 	 * granularity. Refer to libmlx5 function mlx5_init_context().
547 	 */
548 	uar_va = (uintptr_t)primary_ppriv->uar_table[txq->idx].db;
549 	offset = uar_va & (page_size - 1); /* Offset in page. */
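	/*
	 * Map the whole UAR page into this process and re-apply the in-page
	 * offset of the doorbell recorded by the primary process.
	 */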
550 	addr = rte_mem_map(NULL, page_size, RTE_PROT_WRITE, RTE_MAP_SHARED,
551 			   fd, txq_ctrl->uar_mmap_offset);
552 	if (!addr) {
553 		DRV_LOG(ERR, "Port %u mmap failed for BF reg of txq %u.",
554 			txq->port_id, txq->idx);
555 		rte_errno = ENXIO;
556 		return -rte_errno;
557 	}
558 	addr = RTE_PTR_ADD(addr, offset);
559 	ppriv->uar_table[txq->idx].db = addr;
560 #ifndef RTE_ARCH_64
561 	ppriv->uar_table[txq->idx].sl_p =
562 			primary_ppriv->uar_table[txq->idx].sl_p;
563 #endif
564 	return 0;
565 }
566 
567 /**
568  * Unmap UAR register of a Tx queue for secondary process.
569  *
570  * @param txq_ctrl
571  *   Pointer to Tx queue control structure.
572  */
573 static void
574 txq_uar_uninit_secondary(struct mlx5_txq_ctrl *txq_ctrl)
575 {
576 	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(txq_ctrl->priv));
577 	void *addr;
578 	const size_t page_size = rte_mem_page_size();
579 	if (page_size == (size_t)-1) {
580 		DRV_LOG(ERR, "Failed to get mem page size");
581 		rte_errno = ENOMEM;
582 	}
583 
584 	if (txq_ctrl->is_hairpin)
585 		return;
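	/* Unmap the whole OS page containing the remapped doorbell address. */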
586 	addr = ppriv->uar_table[txq_ctrl->txq.idx].db;
587 	rte_mem_unmap(RTE_PTR_ALIGN_FLOOR(addr, page_size), page_size);
588 }
589 
590 /**
591  * Deinitialize Tx UAR registers for secondary process.
592  *
593  * @param dev
594  *   Pointer to Ethernet device.
595  */
596 void
597 mlx5_tx_uar_uninit_secondary(struct rte_eth_dev *dev)
598 {
599 	struct mlx5_proc_priv *ppriv = (struct mlx5_proc_priv *)
600 					dev->process_private;
601 	const size_t page_size = rte_mem_page_size();
602 	void *addr;
603 	unsigned int i;
604 
605 	if (page_size == (size_t)-1) {
606 		DRV_LOG(ERR, "Failed to get mem page size");
607 		return;
608 	}
609 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY);
610 	for (i = 0; i != ppriv->uar_table_sz; ++i) {
611 		if (!ppriv->uar_table[i].db)
612 			continue;
613 		addr = ppriv->uar_table[i].db;
614 		rte_mem_unmap(RTE_PTR_ALIGN_FLOOR(addr, page_size), page_size);
615 
616 	}
617 }
618 
619 /**
620  * Initialize Tx UAR registers for secondary process.
621  *
622  * @param dev
623  *   Pointer to Ethernet device.
624  * @param fd
625  *   Verbs file descriptor to map UAR pages.
626  *
627  * @return
628  *   0 on success, a negative errno value otherwise and rte_errno is set.
629  */
630 int
631 mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd)
632 {
633 	struct mlx5_priv *priv = dev->data->dev_private;
634 	struct mlx5_txq_data *txq;
635 	struct mlx5_txq_ctrl *txq_ctrl;
636 	unsigned int i;
637 	int ret;
638 
639 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY);
640 	for (i = 0; i != priv->txqs_n; ++i) {
641 		if (!(*priv->txqs)[i])
642 			continue;
643 		txq = (*priv->txqs)[i];
644 		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
645 		if (txq_ctrl->is_hairpin)
646 			continue;
647 		MLX5_ASSERT(txq->idx == (uint16_t)i);
648 		ret = txq_uar_init_secondary(txq_ctrl, fd);
649 		if (ret)
650 			goto error;
651 	}
652 	return 0;
653 error:
654 	/* Rollback. */
655 	do {
656 		if (!(*priv->txqs)[i])
657 			continue;
658 		txq = (*priv->txqs)[i];
659 		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
660 		txq_uar_uninit_secondary(txq_ctrl);
661 	} while (i--);
662 	return -rte_errno;
663 }
664 
665 /**
666  * Verify the Verbs Tx queue list is empty
667  *
668  * @param dev
669  *   Pointer to Ethernet device.
670  *
671  * @return
672  *   The number of objects not released.
673  */
674 int
675 mlx5_txq_obj_verify(struct rte_eth_dev *dev)
676 {
677 	struct mlx5_priv *priv = dev->data->dev_private;
678 	int ret = 0;
679 	struct mlx5_txq_obj *txq_obj;
680 
681 	LIST_FOREACH(txq_obj, &priv->txqsobj, next) {
682 		DRV_LOG(DEBUG, "port %u Verbs Tx queue %u still referenced",
683 			dev->data->port_id, txq_obj->txq_ctrl->txq.idx);
684 		++ret;
685 	}
686 	return ret;
687 }
688 
689 /**
690  * Calculate the total number of WQEBB for Tx queue.
691  *
692  * Simplified version of calc_sq_size() in rdma-core.
693  *
694  * @param txq_ctrl
695  *   Pointer to Tx queue control structure.
696  *
697  * @return
698  *   The number of WQEBB.
699  */
700 static int
701 txq_calc_wqebb_cnt(struct mlx5_txq_ctrl *txq_ctrl)
702 {
703 	unsigned int wqe_size;
704 	const unsigned int desc = 1 << txq_ctrl->txq.elts_n;
705 
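	/*
	 * Worst-case WQE size per descriptor: Control Segment, Ethernet
	 * Segment plus one extra segment, and the maximal inline data
	 * (minus the minimal inline part already counted within the
	 * Ethernet Segment).
	 */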
706 	wqe_size = MLX5_WQE_CSEG_SIZE +
707 		   MLX5_WQE_ESEG_SIZE +
708 		   MLX5_WSEG_SIZE -
709 		   MLX5_ESEG_MIN_INLINE_SIZE +
710 		   txq_ctrl->max_inline_data;
711 	return rte_align32pow2(wqe_size * desc) / MLX5_WQE_SIZE;
712 }
713 
714 /**
715  * Calculate the maximal inline data size for Tx queue.
716  *
717  * @param txq_ctrl
718  *   Pointer to Tx queue control structure.
719  *
720  * @return
721  *   The maximal inline data size.
722  */
723 static unsigned int
724 txq_calc_inline_max(struct mlx5_txq_ctrl *txq_ctrl)
725 {
726 	const unsigned int desc = 1 << txq_ctrl->txq.elts_n;
727 	struct mlx5_priv *priv = txq_ctrl->priv;
728 	unsigned int wqe_size;
729 
730 	wqe_size = priv->sh->dev_cap.max_qp_wr / desc;
731 	if (!wqe_size)
732 		return 0;
733 	/*
734 	 * This calculation is derived from the source of
735 	 * mlx5_calc_send_wqe() in the rdma-core library.
736 	 */
737 	wqe_size = wqe_size * MLX5_WQE_SIZE -
738 		   MLX5_WQE_CSEG_SIZE -
739 		   MLX5_WQE_ESEG_SIZE -
740 		   MLX5_WSEG_SIZE -
741 		   MLX5_WSEG_SIZE +
742 		   MLX5_DSEG_MIN_INLINE_SIZE;
743 	return wqe_size;
744 }
745 
746 /**
747  * Set Tx queue parameters from device configuration.
748  *
749  * @param txq_ctrl
750  *   Pointer to Tx queue control structure.
751  */
752 static void
753 txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
754 {
755 	struct mlx5_priv *priv = txq_ctrl->priv;
756 	struct mlx5_port_config *config = &priv->config;
757 	struct mlx5_dev_cap *dev_cap = &priv->sh->dev_cap;
758 	unsigned int inlen_send; /* Inline data for ordinary SEND.*/
759 	unsigned int inlen_empw; /* Inline data for enhanced MPW. */
760 	unsigned int inlen_mode; /* Minimal required Inline data. */
761 	unsigned int txqs_inline; /* Min Tx queues to enable inline. */
762 	uint64_t dev_txoff = priv->dev_data->dev_conf.txmode.offloads;
763 	bool tso = txq_ctrl->txq.offloads & (RTE_ETH_TX_OFFLOAD_TCP_TSO |
764 					    RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO |
765 					    RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO |
766 					    RTE_ETH_TX_OFFLOAD_IP_TNL_TSO |
767 					    RTE_ETH_TX_OFFLOAD_UDP_TNL_TSO);
768 	bool vlan_inline;
769 	unsigned int temp;
770 
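	/*
	 * Enable fast mbuf free only when the MBUF_FAST_FREE offload is
	 * requested while multi-segment Tx and MPRQ (whose mbufs may carry
	 * attached buffers) are disabled.
	 */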
771 	txq_ctrl->txq.fast_free =
772 		!!((txq_ctrl->txq.offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) &&
773 		   !(txq_ctrl->txq.offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS) &&
774 		   !config->mprq.enabled);
775 	if (config->txqs_inline == MLX5_ARG_UNSET)
776 		txqs_inline =
777 #if defined(RTE_ARCH_ARM64)
778 		(priv->pci_dev && priv->pci_dev->id.device_id ==
779 			PCI_DEVICE_ID_MELLANOX_BLUEFIELD) ?
780 			MLX5_INLINE_MAX_TXQS_BLUEFIELD :
781 #endif
782 			MLX5_INLINE_MAX_TXQS;
783 	else
784 		txqs_inline = (unsigned int)config->txqs_inline;
785 	inlen_send = (config->txq_inline_max == MLX5_ARG_UNSET) ?
786 		     MLX5_SEND_DEF_INLINE_LEN :
787 		     (unsigned int)config->txq_inline_max;
788 	inlen_empw = (config->txq_inline_mpw == MLX5_ARG_UNSET) ?
789 		     MLX5_EMPW_DEF_INLINE_LEN :
790 		     (unsigned int)config->txq_inline_mpw;
791 	inlen_mode = (config->txq_inline_min == MLX5_ARG_UNSET) ?
792 		     0 : (unsigned int)config->txq_inline_min;
793 	if (config->mps != MLX5_MPW_ENHANCED && config->mps != MLX5_MPW)
794 		inlen_empw = 0;
795 	/*
796 	 * If a minimal amount of data to inline is requested
797 	 * we MUST enable inlining. This is the case for ConnectX-4,
798 	 * which usually requires L2 to be inlined for correct operation,
799 	 * and ConnectX-4 Lx, which requires L2-L4 to be inlined to
800 	 * support E-Switch Flows.
801 	 */
802 	if (inlen_mode) {
803 		if (inlen_mode <= MLX5_ESEG_MIN_INLINE_SIZE) {
804 			/*
805 			 * Optimize minimal inlining for single
806 			 * segment packets to fill one WQEBB
807 			 * without gaps.
808 			 */
809 			temp = MLX5_ESEG_MIN_INLINE_SIZE;
810 		} else {
811 			temp = inlen_mode - MLX5_ESEG_MIN_INLINE_SIZE;
812 			temp = RTE_ALIGN(temp, MLX5_WSEG_SIZE) +
813 			       MLX5_ESEG_MIN_INLINE_SIZE;
814 			temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN);
815 		}
816 		if (temp != inlen_mode) {
817 			DRV_LOG(INFO,
818 				"port %u minimal required inline setting"
819 				" aligned from %u to %u",
820 				PORT_ID(priv), inlen_mode, temp);
821 			inlen_mode = temp;
822 		}
823 	}
824 	/*
825 	 * If the port is configured to support VLAN insertion and the
826 	 * device does not support this feature in HW (NICs before ConnectX-5,
827 	 * or the wqe_vlan_insert flag is not set), we must enable data
828 	 * inline on all queues because the feature is implemented in
829 	 * software by the single tx_burst routine.
830 	 */
831 	txq_ctrl->txq.vlan_en = config->hw_vlan_insert;
832 	vlan_inline = (dev_txoff & RTE_ETH_TX_OFFLOAD_VLAN_INSERT) &&
833 		      !config->hw_vlan_insert;
834 	/*
835 	 * If there are only a few Tx queues, saving CPU cycles is
836 	 * prioritized and data inlining is disabled entirely.
837 	 */
838 	if (inlen_send && priv->txqs_n >= txqs_inline) {
839 		/*
840 		 * The data sent with an ordinary MLX5_OPCODE_SEND
841 		 * may be inlined in the Ethernet Segment, align the
842 		 * length to fill entire WQEBBs.
843 		 */
844 		temp = RTE_MAX(inlen_send,
845 			       MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE);
846 		temp -= MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE;
847 		temp = RTE_ALIGN(temp, MLX5_WQE_SIZE);
848 		temp += MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE;
849 		temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX +
850 				     MLX5_ESEG_MIN_INLINE_SIZE -
851 				     MLX5_WQE_CSEG_SIZE -
852 				     MLX5_WQE_ESEG_SIZE -
853 				     MLX5_WQE_DSEG_SIZE * 2);
854 		temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN);
855 		temp = RTE_MAX(temp, inlen_mode);
856 		if (temp != inlen_send) {
857 			DRV_LOG(INFO,
858 				"port %u ordinary send inline setting"
859 				" aligned from %u to %u",
860 				PORT_ID(priv), inlen_send, temp);
861 			inlen_send = temp;
862 		}
863 		/*
864 		 * Not aligned to cache lines, but to WQEs.
865 		 * The first bytes of data (initial alignment)
866 		 * are going to be copied explicitly at the
867 		 * beginning of the inlining buffer in the
868 		 * Ethernet Segment.
869 		 */
870 		MLX5_ASSERT(inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
871 		MLX5_ASSERT(inlen_send <= MLX5_WQE_SIZE_MAX +
872 					  MLX5_ESEG_MIN_INLINE_SIZE -
873 					  MLX5_WQE_CSEG_SIZE -
874 					  MLX5_WQE_ESEG_SIZE -
875 					  MLX5_WQE_DSEG_SIZE * 2);
876 	} else if (inlen_mode) {
877 		/*
878 		 * If minimal inlining is requested, we must
879 		 * enable inlining in general, regardless of the
880 		 * number of configured queues. Ignore the
881 		 * txq_inline_max devarg; this is not
882 		 * full-featured inlining.
883 		 */
884 		inlen_send = inlen_mode;
885 		inlen_empw = 0;
886 	} else if (vlan_inline) {
887 		/*
888 		 * Hardware does not report the VLAN insertion
889 		 * offload, so we must enable data inline to
890 		 * implement the feature in software.
891 		 */
892 		inlen_send = MLX5_ESEG_MIN_INLINE_SIZE;
893 		inlen_empw = 0;
894 	} else {
895 		inlen_send = 0;
896 		inlen_empw = 0;
897 	}
898 	txq_ctrl->txq.inlen_send = inlen_send;
899 	txq_ctrl->txq.inlen_mode = inlen_mode;
900 	txq_ctrl->txq.inlen_empw = 0;
901 	if (inlen_send && inlen_empw && priv->txqs_n >= txqs_inline) {
902 		/*
903 		 * The data sent with MLX5_OPCODE_ENHANCED_MPSW
904 		 * may be inlined in the Data Segment, align the
905 		 * length to fill entire WQEBBs.
906 		 */
907 		temp = RTE_MAX(inlen_empw,
908 			       MLX5_WQE_SIZE + MLX5_DSEG_MIN_INLINE_SIZE);
909 		temp -= MLX5_DSEG_MIN_INLINE_SIZE;
910 		temp = RTE_ALIGN(temp, MLX5_WQE_SIZE);
911 		temp += MLX5_DSEG_MIN_INLINE_SIZE;
912 		temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX +
913 				     MLX5_DSEG_MIN_INLINE_SIZE -
914 				     MLX5_WQE_CSEG_SIZE -
915 				     MLX5_WQE_ESEG_SIZE -
916 				     MLX5_WQE_DSEG_SIZE);
917 		temp = RTE_MIN(temp, MLX5_EMPW_MAX_INLINE_LEN);
918 		if (temp != inlen_empw) {
919 			DRV_LOG(INFO,
920 				"port %u enhanced empw inline setting"
921 				" aligned from %u to %u",
922 				PORT_ID(priv), inlen_empw, temp);
923 			inlen_empw = temp;
924 		}
925 		MLX5_ASSERT(inlen_empw >= MLX5_ESEG_MIN_INLINE_SIZE);
926 		MLX5_ASSERT(inlen_empw <= MLX5_WQE_SIZE_MAX +
927 					  MLX5_DSEG_MIN_INLINE_SIZE -
928 					  MLX5_WQE_CSEG_SIZE -
929 					  MLX5_WQE_ESEG_SIZE -
930 					  MLX5_WQE_DSEG_SIZE);
931 		txq_ctrl->txq.inlen_empw = inlen_empw;
932 	}
933 	txq_ctrl->max_inline_data = RTE_MAX(inlen_send, inlen_empw);
934 	if (tso) {
935 		txq_ctrl->max_tso_header = MLX5_MAX_TSO_HEADER;
936 		txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->max_inline_data,
937 						    MLX5_MAX_TSO_HEADER);
938 		txq_ctrl->txq.tso_en = 1;
939 	}
940 	if (((RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO & txq_ctrl->txq.offloads) &&
941 	    (dev_cap->tunnel_en & MLX5_TUNNELED_OFFLOADS_VXLAN_CAP)) |
942 	   ((RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO & txq_ctrl->txq.offloads) &&
943 	    (dev_cap->tunnel_en & MLX5_TUNNELED_OFFLOADS_GRE_CAP)) |
944 	   ((RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO & txq_ctrl->txq.offloads) &&
945 	    (dev_cap->tunnel_en & MLX5_TUNNELED_OFFLOADS_GENEVE_CAP)) |
946 	   (dev_cap->swp  & MLX5_SW_PARSING_TSO_CAP))
947 		txq_ctrl->txq.tunnel_en = 1;
948 	txq_ctrl->txq.swp_en = (((RTE_ETH_TX_OFFLOAD_IP_TNL_TSO |
949 				  RTE_ETH_TX_OFFLOAD_UDP_TNL_TSO) &
950 				  txq_ctrl->txq.offloads) && (dev_cap->swp &
951 				  MLX5_SW_PARSING_TSO_CAP)) |
952 				((RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM &
953 				 txq_ctrl->txq.offloads) && (dev_cap->swp &
954 				 MLX5_SW_PARSING_CSUM_CAP));
955 }
956 
957 /**
958  * Adjust Tx queue data inline parameters for large queue sizes.
959  * The data inline feature requires multiple WQEs to fit the packets,
960  * and if a large number of Tx descriptors is requested by the application
961  * the total WQE amount may exceed the hardware capabilities. If the
962  * default inline settings are used, we can try to adjust them to
963  * meet the hardware requirements without exceeding the queue size.
964  *
965  * @param txq_ctrl
966  *   Pointer to Tx queue control structure.
967  *
968  * @return
969  *   Zero on success, otherwise the parameters can not be adjusted.
970  */
971 static int
972 txq_adjust_params(struct mlx5_txq_ctrl *txq_ctrl)
973 {
974 	struct mlx5_priv *priv = txq_ctrl->priv;
975 	struct mlx5_port_config *config = &priv->config;
976 	unsigned int max_inline;
977 
978 	max_inline = txq_calc_inline_max(txq_ctrl);
979 	if (!txq_ctrl->txq.inlen_send) {
980 		/*
981 		 * Inline data feature is not engaged at all.
982 		 * There is nothing to adjust.
983 		 */
984 		return 0;
985 	}
986 	if (txq_ctrl->max_inline_data <= max_inline) {
987 		/*
988 		 * The requested inline data length does not
989 		 * exceed queue capabilities.
990 		 */
991 		return 0;
992 	}
993 	if (txq_ctrl->txq.inlen_mode > max_inline) {
994 		DRV_LOG(ERR,
995 			"minimal data inline requirements (%u) are not"
996 			" satisfied (%u) on port %u, try the smaller"
997 			" Tx queue size (%d)",
998 			txq_ctrl->txq.inlen_mode, max_inline,
999 			priv->dev_data->port_id, priv->sh->dev_cap.max_qp_wr);
1000 		goto error;
1001 	}
1002 	if (txq_ctrl->txq.inlen_send > max_inline &&
1003 	    config->txq_inline_max != MLX5_ARG_UNSET &&
1004 	    config->txq_inline_max > (int)max_inline) {
1005 		DRV_LOG(ERR,
1006 			"txq_inline_max requirements (%u) are not"
1007 			" satisfied (%u) on port %u, try the smaller"
1008 			" Tx queue size (%d)",
1009 			txq_ctrl->txq.inlen_send, max_inline,
1010 			priv->dev_data->port_id, priv->sh->dev_cap.max_qp_wr);
1011 		goto error;
1012 	}
1013 	if (txq_ctrl->txq.inlen_empw > max_inline &&
1014 	    config->txq_inline_mpw != MLX5_ARG_UNSET &&
1015 	    config->txq_inline_mpw > (int)max_inline) {
1016 		DRV_LOG(ERR,
1017 			"txq_inline_mpw requirements (%u) are not"
1018 			" satisfied (%u) on port %u, try the smaller"
1019 			" Tx queue size (%d)",
1020 			txq_ctrl->txq.inlen_empw, max_inline,
1021 			priv->dev_data->port_id, priv->sh->dev_cap.max_qp_wr);
1022 		goto error;
1023 	}
1024 	if (txq_ctrl->txq.tso_en && max_inline < MLX5_MAX_TSO_HEADER) {
1025 		DRV_LOG(ERR,
1026 			"tso header inline requirements (%u) are not"
1027 			" satisfied (%u) on port %u, try the smaller"
1028 			" Tx queue size (%d)",
1029 			MLX5_MAX_TSO_HEADER, max_inline,
1030 			priv->dev_data->port_id, priv->sh->dev_cap.max_qp_wr);
1031 		goto error;
1032 	}
1033 	if (txq_ctrl->txq.inlen_send > max_inline) {
1034 		DRV_LOG(WARNING,
1035 			"adjust txq_inline_max (%u->%u)"
1036 			" due to large Tx queue on port %u",
1037 			txq_ctrl->txq.inlen_send, max_inline,
1038 			priv->dev_data->port_id);
1039 		txq_ctrl->txq.inlen_send = max_inline;
1040 	}
1041 	if (txq_ctrl->txq.inlen_empw > max_inline) {
1042 		DRV_LOG(WARNING,
1043 			"adjust txq_inline_mpw (%u->%u)"
1044 			" due to large Tx queue on port %u",
1045 			txq_ctrl->txq.inlen_empw, max_inline,
1046 			priv->dev_data->port_id);
1047 		txq_ctrl->txq.inlen_empw = max_inline;
1048 	}
1049 	txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->txq.inlen_send,
1050 					    txq_ctrl->txq.inlen_empw);
1051 	MLX5_ASSERT(txq_ctrl->max_inline_data <= max_inline);
1052 	MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= max_inline);
1053 	MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= txq_ctrl->txq.inlen_send);
1054 	MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= txq_ctrl->txq.inlen_empw ||
1055 		    !txq_ctrl->txq.inlen_empw);
1056 	return 0;
1057 error:
1058 	rte_errno = ENOMEM;
1059 	return -ENOMEM;
1060 }
1061 
1062 /**
1063  * Create a DPDK Tx queue.
1064  *
1065  * @param dev
1066  *   Pointer to Ethernet device.
1067  * @param idx
1068  *   TX queue index.
1069  * @param desc
1070  *   Number of descriptors to configure in queue.
1071  * @param socket
1072  *   NUMA socket on which memory must be allocated.
1073  * @param[in] conf
1074  *  Thresholds parameters.
1075  *
1076  * @return
1077  *   A DPDK queue object on success, NULL otherwise and rte_errno is set.
1078  */
1079 struct mlx5_txq_ctrl *
1080 mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1081 	     unsigned int socket, const struct rte_eth_txconf *conf)
1082 {
1083 	struct mlx5_priv *priv = dev->data->dev_private;
1084 	struct mlx5_txq_ctrl *tmpl;
1085 
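	/*
	 * The elts[] array of mbuf pointers is laid out right after the
	 * control structure, hence the extra desc * sizeof(struct rte_mbuf *)
	 * bytes.
	 */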
1086 	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) +
1087 			   desc * sizeof(struct rte_mbuf *), 0, socket);
1088 	if (!tmpl) {
1089 		rte_errno = ENOMEM;
1090 		return NULL;
1091 	}
1092 	if (mlx5_mr_ctrl_init(&tmpl->txq.mr_ctrl,
1093 			      &priv->sh->cdev->mr_scache.dev_gen, socket)) {
1094 		/* rte_errno is already set. */
1095 		goto error;
1096 	}
1097 	MLX5_ASSERT(desc > MLX5_TX_COMP_THRESH);
1098 	tmpl->txq.offloads = conf->offloads |
1099 			     dev->data->dev_conf.txmode.offloads;
1100 	tmpl->priv = priv;
1101 	tmpl->socket = socket;
1102 	tmpl->txq.elts_n = log2above(desc);
1103 	tmpl->txq.elts_s = desc;
1104 	tmpl->txq.elts_m = desc - 1;
1105 	tmpl->txq.port_id = dev->data->port_id;
1106 	tmpl->txq.idx = idx;
1107 	txq_set_params(tmpl);
1108 	if (txq_adjust_params(tmpl))
1109 		goto error;
1110 	if (txq_calc_wqebb_cnt(tmpl) >
1111 	    priv->sh->dev_cap.max_qp_wr) {
1112 		DRV_LOG(ERR,
1113 			"port %u Tx WQEBB count (%d) exceeds the limit (%d),"
1114 			" try smaller queue size",
1115 			dev->data->port_id, txq_calc_wqebb_cnt(tmpl),
1116 			priv->sh->dev_cap.max_qp_wr);
1117 		rte_errno = ENOMEM;
1118 		goto error;
1119 	}
1120 	rte_atomic_fetch_add_explicit(&tmpl->refcnt, 1, rte_memory_order_relaxed);
1121 	tmpl->is_hairpin = false;
1122 	LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next);
1123 	return tmpl;
1124 error:
1125 	mlx5_mr_btree_free(&tmpl->txq.mr_ctrl.cache_bh);
1126 	mlx5_free(tmpl);
1127 	return NULL;
1128 }
1129 
1130 /**
1131  * Create a DPDK Tx hairpin queue.
1132  *
1133  * @param dev
1134  *   Pointer to Ethernet device.
1135  * @param idx
1136  *   TX queue index.
1137  * @param desc
1138  *   Number of descriptors to configure in queue.
1139  * @param hairpin_conf
1140  *  The hairpin configuration.
1141  *
1142  * @return
1143  *   A DPDK queue object on success, NULL otherwise and rte_errno is set.
1144  */
1145 struct mlx5_txq_ctrl *
1146 mlx5_txq_hairpin_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1147 		     const struct rte_eth_hairpin_conf *hairpin_conf)
1148 {
1149 	struct mlx5_priv *priv = dev->data->dev_private;
1150 	struct mlx5_txq_ctrl *tmpl;
1151 
1152 	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl), 0,
1153 			   SOCKET_ID_ANY);
1154 	if (!tmpl) {
1155 		rte_errno = ENOMEM;
1156 		return NULL;
1157 	}
1158 	tmpl->priv = priv;
1159 	tmpl->socket = SOCKET_ID_ANY;
1160 	tmpl->txq.elts_n = log2above(desc);
1161 	tmpl->txq.port_id = dev->data->port_id;
1162 	tmpl->txq.idx = idx;
1163 	tmpl->hairpin_conf = *hairpin_conf;
1164 	tmpl->is_hairpin = true;
1165 	rte_atomic_fetch_add_explicit(&tmpl->refcnt, 1, rte_memory_order_relaxed);
1166 	LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next);
1167 	return tmpl;
1168 }
1169 
1170 /**
1171  * Get a Tx queue.
1172  *
1173  * @param dev
1174  *   Pointer to Ethernet device.
1175  * @param idx
1176  *   TX queue index.
1177  *
1178  * @return
1179  *   A pointer to the queue if it exists.
1180  */
1181 struct mlx5_txq_ctrl *
1182 mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx)
1183 {
1184 	struct mlx5_priv *priv = dev->data->dev_private;
1185 	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
1186 	struct mlx5_txq_ctrl *ctrl = NULL;
1187 
1188 	if (txq_data) {
1189 		ctrl = container_of(txq_data, struct mlx5_txq_ctrl, txq);
1190 		rte_atomic_fetch_add_explicit(&ctrl->refcnt, 1, rte_memory_order_relaxed);
1191 	}
1192 	return ctrl;
1193 }
1194 
1195 /**
1196  * Get an external Tx queue.
1197  *
1198  * @param dev
1199  *   Pointer to Ethernet device.
1200  * @param idx
1201  *   External Tx queue index.
1202  *
1203  * @return
1204  *   A pointer to the queue if it exists, NULL otherwise.
1205  */
1206 struct mlx5_external_q *
1207 mlx5_ext_txq_get(struct rte_eth_dev *dev, uint16_t idx)
1208 {
1209 	struct mlx5_priv *priv = dev->data->dev_private;
1210 
1211 	MLX5_ASSERT(mlx5_is_external_txq(dev, idx));
1212 	return &priv->ext_txqs[idx - MLX5_EXTERNAL_TX_QUEUE_ID_MIN];
1213 }
1214 
1215 /**
1216  * Verify the external Tx Queue list is empty.
1217  *
1218  * @param dev
1219  *   Pointer to Ethernet device.
1220  *
1221  * @return
1222  *   The number of objects not released.
1223  */
1224 int
1225 mlx5_ext_txq_verify(struct rte_eth_dev *dev)
1226 {
1227 	struct mlx5_priv *priv = dev->data->dev_private;
1228 	struct mlx5_external_q *txq;
1229 	uint32_t i;
1230 	int ret = 0;
1231 
1232 	if (priv->ext_txqs == NULL)
1233 		return 0;
1234 
1235 	for (i = MLX5_EXTERNAL_TX_QUEUE_ID_MIN; i <= UINT16_MAX ; ++i) {
1236 		txq = mlx5_ext_txq_get(dev, i);
1237 		if (txq->refcnt < 2)
1238 			continue;
1239 		DRV_LOG(DEBUG, "Port %u external TxQ %u still referenced.",
1240 			dev->data->port_id, i);
1241 		++ret;
1242 	}
1243 	return ret;
1244 }
1245 
1246 /**
1247  * Release a Tx queue.
1248  *
1249  * @param dev
1250  *   Pointer to Ethernet device.
1251  * @param idx
1252  *   TX queue index.
1253  *
1254  * @return
1255  *   1 while a reference on it exists, 0 when freed.
1256  */
1257 int
1258 mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx)
1259 {
1260 	struct mlx5_priv *priv = dev->data->dev_private;
1261 	struct mlx5_txq_ctrl *txq_ctrl;
1262 
1263 	if (priv->txqs == NULL || (*priv->txqs)[idx] == NULL)
1264 		return 0;
1265 	txq_ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
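	/*
	 * Drop one reference; if more than one reference is still held the
	 * queue remains in use and nothing can be released yet.
	 */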
1266 	if (rte_atomic_fetch_sub_explicit(&txq_ctrl->refcnt, 1, rte_memory_order_relaxed) - 1 > 1)
1267 		return 1;
1268 	if (txq_ctrl->obj) {
1269 		priv->obj_ops.txq_obj_release(txq_ctrl->obj);
1270 		LIST_REMOVE(txq_ctrl->obj, next);
1271 		mlx5_free(txq_ctrl->obj);
1272 		txq_ctrl->obj = NULL;
1273 	}
1274 	if (!txq_ctrl->is_hairpin) {
1275 		if (txq_ctrl->txq.fcqs) {
1276 			mlx5_free(txq_ctrl->txq.fcqs);
1277 			txq_ctrl->txq.fcqs = NULL;
1278 		}
1279 		txq_free_elts(txq_ctrl);
1280 		dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STOPPED;
1281 	}
1282 	if (!rte_atomic_load_explicit(&txq_ctrl->refcnt, rte_memory_order_relaxed)) {
1283 		if (!txq_ctrl->is_hairpin)
1284 			mlx5_mr_btree_free(&txq_ctrl->txq.mr_ctrl.cache_bh);
1285 		LIST_REMOVE(txq_ctrl, next);
1286 		mlx5_free(txq_ctrl);
1287 		(*priv->txqs)[idx] = NULL;
1288 	}
1289 	return 0;
1290 }
1291 
1292 /**
1293  * Verify if the queue can be released.
1294  *
1295  * @param dev
1296  *   Pointer to Ethernet device.
1297  * @param idx
1298  *   TX queue index.
1299  *
1300  * @return
1301  *   1 if the queue can be released.
1302  */
1303 int
1304 mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx)
1305 {
1306 	struct mlx5_priv *priv = dev->data->dev_private;
1307 	struct mlx5_txq_ctrl *txq;
1308 
1309 	if (!(*priv->txqs)[idx])
1310 		return -1;
1311 	txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
1312 	return (rte_atomic_load_explicit(&txq->refcnt, rte_memory_order_relaxed) == 1);
1313 }
1314 
1315 /**
1316  * Verify the Tx Queue list is empty
1317  *
1318  * @param dev
1319  *   Pointer to Ethernet device.
1320  *
1321  * @return
1322  *   The number of objects not released.
1323  */
1324 int
1325 mlx5_txq_verify(struct rte_eth_dev *dev)
1326 {
1327 	struct mlx5_priv *priv = dev->data->dev_private;
1328 	struct mlx5_txq_ctrl *txq_ctrl;
1329 	int ret = 0;
1330 
1331 	LIST_FOREACH(txq_ctrl, &priv->txqsctrl, next) {
1332 		DRV_LOG(DEBUG, "port %u Tx queue %u still referenced",
1333 			dev->data->port_id, txq_ctrl->txq.idx);
1334 		++ret;
1335 	}
1336 	return ret;
1337 }
1338 
1339 int
1340 mlx5_txq_get_sqn(struct mlx5_txq_ctrl *txq)
1341 {
1342 	return txq->is_hairpin ? txq->obj->sq->id : txq->obj->sq_obj.sq->id;
1343 }
1344 
1345 int
1346 rte_pmd_mlx5_external_sq_enable(uint16_t port_id, uint32_t sq_num)
1347 {
1348 	struct rte_eth_dev *dev;
1349 	struct mlx5_priv *priv;
1350 	uint32_t flow;
1351 
1352 	if (!rte_eth_dev_is_valid_port(port_id)) {
1353 		DRV_LOG(ERR, "There is no Ethernet device for port %u.",
1354 			port_id);
1355 		rte_errno = ENODEV;
1356 		return -rte_errno;
1357 	}
1358 	dev = &rte_eth_devices[port_id];
1359 	priv = dev->data->dev_private;
1360 	if ((!priv->representor && !priv->master) ||
1361 	    !priv->sh->config.dv_esw_en) {
1362 		DRV_LOG(ERR, "Port %u must be a representor or master port in E-Switch mode.",
1363 			port_id);
1364 		rte_errno = EINVAL;
1365 		return -rte_errno;
1366 	}
1367 	if (sq_num == 0) {
1368 		DRV_LOG(ERR, "Invalid SQ number.");
1369 		rte_errno = EINVAL;
1370 		return -rte_errno;
1371 	}
1372 #ifdef HAVE_MLX5_HWS_SUPPORT
1373 	if (priv->sh->config.dv_flow_en == 2) {
1374 		bool sq_miss_created = false;
1375 
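		/*
		 * With HW steering, create the FDB SQ miss rule (if the
		 * default FDB rules are enabled) and the representor matching
		 * rule for this SQ; roll back the former if the latter fails.
		 */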
1376 		if (priv->sh->config.fdb_def_rule) {
1377 			if (mlx5_flow_hw_esw_create_sq_miss_flow(dev, sq_num, true))
1378 				return -rte_errno;
1379 			sq_miss_created = true;
1380 		}
1381 
1382 		if (priv->sh->config.repr_matching &&
1383 		    mlx5_flow_hw_tx_repr_matching_flow(dev, sq_num, true)) {
1384 			if (sq_miss_created)
1385 				mlx5_flow_hw_esw_destroy_sq_miss_flow(dev, sq_num);
1386 			return -rte_errno;
1387 		}
1388 		return 0;
1389 	}
1390 #endif
1391 	flow = mlx5_flow_create_devx_sq_miss_flow(dev, sq_num);
1392 	if (flow > 0)
1393 		return 0;
1394 	DRV_LOG(ERR, "Port %u failed to create default miss flow for SQ %u.",
1395 		port_id, sq_num);
1396 	return -rte_errno;
1397 }
1398 
1399 /**
1400  * Set the Tx queue dynamic timestamp (mask and offset)
1401  *
1402  * @param[in] dev
1403  *   Pointer to the Ethernet device structure.
1404  */
1405 void
1406 mlx5_txq_dynf_timestamp_set(struct rte_eth_dev *dev)
1407 {
1408 	struct mlx5_priv *priv = dev->data->dev_private;
1409 	struct mlx5_dev_ctx_shared *sh = priv->sh;
1410 	struct mlx5_txq_data *data;
1411 	int off, nbit;
1412 	unsigned int i;
1413 	uint64_t mask = 0;
1414 	uint64_t ts_mask;
1415 
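	/*
	 * Build a timestamp mask spanning roughly MLX5_TS_MASK_SECS seconds:
	 * shifted into the upper 32 bits when the real-time clock is used (or
	 * the frequency is unknown), otherwise converted to free-running
	 * clock ticks using the device frequency.
	 */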
1416 	if (sh->dev_cap.rt_timestamp ||
1417 	    !sh->cdev->config.hca_attr.dev_freq_khz)
1418 		ts_mask = MLX5_TS_MASK_SECS << 32;
1419 	else
1420 		ts_mask = rte_align64pow2(MLX5_TS_MASK_SECS * 1000ull *
1421 				sh->cdev->config.hca_attr.dev_freq_khz);
1422 	ts_mask = rte_cpu_to_be_64(ts_mask - 1ull);
1423 	nbit = rte_mbuf_dynflag_lookup
1424 				(RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME, NULL);
1425 	off = rte_mbuf_dynfield_lookup
1426 				(RTE_MBUF_DYNFIELD_TIMESTAMP_NAME, NULL);
1427 	if (nbit >= 0 && off >= 0 &&
1428 	    (sh->txpp.refcnt || priv->sh->cdev->config.hca_attr.wait_on_time))
1429 		mask = 1ULL << nbit;
1430 	for (i = 0; i != priv->txqs_n; ++i) {
1431 		data = (*priv->txqs)[i];
1432 		if (!data)
1433 			continue;
1434 		data->sh = sh;
1435 		data->ts_mask = mask;
1436 		data->ts_offset = off;
1437 		data->rt_timestamp = sh->dev_cap.rt_timestamp;
1438 		data->rt_timemask = (data->offloads &
1439 				     RTE_ETH_TX_OFFLOAD_SEND_ON_TIMESTAMP) ?
1440 				     ts_mask : 0;
1441 	}
1442 }
1443 
1444 int mlx5_count_aggr_ports(struct rte_eth_dev *dev)
1445 {
1446 	struct mlx5_priv *priv = dev->data->dev_private;
1447 
1448 	return priv->sh->bond.n_port;
1449 }
1450 
1451 int mlx5_map_aggr_tx_affinity(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1452 			      uint8_t affinity)
1453 {
1454 	struct mlx5_txq_ctrl *txq_ctrl;
1455 	struct mlx5_txq_data *txq;
1456 	struct mlx5_priv *priv;
1457 
1458 	priv = dev->data->dev_private;
1459 	if (!mlx5_devx_obj_ops_en(priv->sh)) {
1460 		DRV_LOG(ERR, "Tx affinity mapping isn't supported by Verbs API.");
1461 		rte_errno = ENOTSUP;
1462 		return -rte_errno;
1463 	}
1464 	if (tx_queue_id >= priv->txqs_n) {
1465 		DRV_LOG(ERR, "port %u Tx queue index out of range (%u >= %u)",
1466 			dev->data->port_id, tx_queue_id, priv->txqs_n);
1467 		rte_errno = EOVERFLOW;
1468 		return -rte_errno;
1469 	}
1470 	txq = (*priv->txqs)[tx_queue_id];
1471 	if (!txq)
1472 		return -1;
1473 	txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
1474 	if (affinity > priv->num_lag_ports) {
1475 		DRV_LOG(ERR, "port %u unable to setup Tx queue index %u"
1476 			" affinity %u exceeds the maximum %u", dev->data->port_id,
1477 			tx_queue_id, affinity, priv->num_lag_ports);
1478 		rte_errno = EINVAL;
1479 		return -rte_errno;
1480 	}
1481 	DRV_LOG(DEBUG, "port %u configuring queue %u for aggregated affinity %u",
1482 		dev->data->port_id, tx_queue_id, affinity);
1483 	txq_ctrl->txq.tx_aggr_affinity = affinity;
1484 	return 0;
1485 }
1486 
1487 /**
1488  * Validate given external TxQ rte_flow index, and get pointer to concurrent
1489  * external TxQ object to map/unmap.
1490  *
1491  * @param[in] port_id
1492  *   The port identifier of the Ethernet device.
1493  * @param[in] dpdk_idx
1494  *   Tx Queue index in rte_flow.
1495  *
1496  * @return
1497  *   Pointer to concurrent external TxQ on success,
1498  *   NULL otherwise and rte_errno is set.
1499  */
1500 static struct mlx5_external_q *
1501 mlx5_external_tx_queue_get_validate(uint16_t port_id, uint16_t dpdk_idx)
1502 {
1503 	struct rte_eth_dev *dev;
1504 	struct mlx5_priv *priv;
1505 	int ret;
1506 
1507 	if (dpdk_idx < MLX5_EXTERNAL_TX_QUEUE_ID_MIN) {
1508 		DRV_LOG(ERR, "Queue index %u should be in range: [%u, %u].",
1509 			dpdk_idx, MLX5_EXTERNAL_TX_QUEUE_ID_MIN, UINT16_MAX);
1510 		rte_errno = EINVAL;
1511 		return NULL;
1512 	}
1513 	ret = mlx5_devx_extq_port_validate(port_id);
1514 	if (unlikely(ret))
1515 		return NULL;
1516 	dev = &rte_eth_devices[port_id];
1517 	priv = dev->data->dev_private;
1518 	/*
1519 	 * When the user configures a remote PD and CTX and the device creates
1520 	 * TxQs through DevX, the external TxQs array is allocated.
1521 	 */
1522 	MLX5_ASSERT(priv->ext_txqs != NULL);
1523 	return &priv->ext_txqs[dpdk_idx - MLX5_EXTERNAL_TX_QUEUE_ID_MIN];
1524 }
1525 
1526 int
1527 rte_pmd_mlx5_external_tx_queue_id_map(uint16_t port_id, uint16_t dpdk_idx,
1528 				      uint32_t hw_idx)
1529 {
1530 	struct mlx5_external_q *ext_txq;
1531 	uint32_t unmapped = 0;
1532 
1533 	ext_txq = mlx5_external_tx_queue_get_validate(port_id, dpdk_idx);
1534 	if (ext_txq == NULL)
1535 		return -rte_errno;
1536 	if (!rte_atomic_compare_exchange_strong_explicit(&ext_txq->refcnt, &unmapped, 1,
1537 					 rte_memory_order_relaxed, rte_memory_order_relaxed)) {
1538 		if (ext_txq->hw_id != hw_idx) {
1539 			DRV_LOG(ERR, "Port %u external TxQ index %u "
1540 				"is already mapped to a different HW index "
1541 				"(requested %u, existing %u).",
1542 				port_id, dpdk_idx, hw_idx, ext_txq->hw_id);
1543 			rte_errno = EEXIST;
1544 			return -rte_errno;
1545 		}
1546 		DRV_LOG(WARNING, "Port %u external TxQ index %u "
1547 			"is already mapped to the requested HW index (%u)",
1548 			port_id, dpdk_idx, hw_idx);
1549 
1550 	} else {
1551 		ext_txq->hw_id = hw_idx;
1552 		DRV_LOG(DEBUG, "Port %u external TxQ index %u "
1553 			"is successfully mapped to the requested HW index (%u)",
1554 			port_id, dpdk_idx, hw_idx);
1555 	}
1556 	return 0;
1557 }
1558 
1559 int
1560 rte_pmd_mlx5_external_tx_queue_id_unmap(uint16_t port_id, uint16_t dpdk_idx)
1561 {
1562 	struct mlx5_external_q *ext_txq;
1563 	uint32_t mapped = 1;
1564 
1565 	ext_txq = mlx5_external_tx_queue_get_validate(port_id, dpdk_idx);
1566 	if (ext_txq == NULL)
1567 		return -rte_errno;
1568 	if (ext_txq->refcnt > 1) {
1569 		DRV_LOG(ERR, "Port %u external TxQ index %u still referenced.",
1570 			port_id, dpdk_idx);
1571 		rte_errno = EINVAL;
1572 		return -rte_errno;
1573 	}
1574 	if (!rte_atomic_compare_exchange_strong_explicit(&ext_txq->refcnt, &mapped, 0,
1575 					 rte_memory_order_relaxed, rte_memory_order_relaxed)) {
1576 		DRV_LOG(ERR, "Port %u external TxQ index %u doesn't exist.",
1577 			port_id, dpdk_idx);
1578 		rte_errno = EINVAL;
1579 		return -rte_errno;
1580 	}
1581 	DRV_LOG(DEBUG,
1582 		"Port %u external TxQ index %u is successfully unmapped.",
1583 		port_id, dpdk_idx);
1584 	return 0;
1585 }
1586