xref: /dpdk/drivers/net/mlx5/mlx5_txq.c (revision 3da59f30a23f2e795d2315f3d949e1b3e0ce0c3d)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <stddef.h>
7 #include <errno.h>
8 #include <string.h>
9 #include <stdint.h>
10 #include <unistd.h>
11 #include <inttypes.h>
12 
13 #include <rte_mbuf.h>
14 #include <rte_malloc.h>
15 #include <ethdev_driver.h>
16 #include <bus_pci_driver.h>
17 #include <rte_common.h>
18 #include <rte_eal_paging.h>
19 
20 #include <mlx5_common.h>
21 #include <mlx5_common_mr.h>
22 #include <mlx5_malloc.h>
23 
24 #include "mlx5_defs.h"
25 #include "mlx5_utils.h"
26 #include "mlx5.h"
27 #include "mlx5_tx.h"
28 #include "mlx5_rxtx.h"
29 #include "mlx5_autoconf.h"
30 #include "rte_pmd_mlx5.h"
31 #include "mlx5_flow.h"
32 
33 /**
34  * Allocate TX queue elements.
35  *
36  * @param txq_ctrl
37  *   Pointer to TX queue structure.
38  */
39 void
40 txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl)
41 {
42 	const unsigned int elts_n = 1 << txq_ctrl->txq.elts_n;
43 	unsigned int i;
44 
45 	for (i = 0; (i != elts_n); ++i)
46 		txq_ctrl->txq.elts[i] = NULL;
47 	DRV_LOG(DEBUG, "port %u Tx queue %u allocated and configured %u WRs",
48 		PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx, elts_n);
49 	txq_ctrl->txq.elts_head = 0;
50 	txq_ctrl->txq.elts_tail = 0;
51 	txq_ctrl->txq.elts_comp = 0;
52 }
53 
54 /**
55  * Free TX queue elements.
56  *
57  * @param txq_ctrl
58  *   Pointer to TX queue structure.
59  */
60 void
61 txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl)
62 {
63 	const uint16_t elts_n = 1 << txq_ctrl->txq.elts_n;
64 	const uint16_t elts_m = elts_n - 1;
65 	uint16_t elts_head = txq_ctrl->txq.elts_head;
66 	uint16_t elts_tail = txq_ctrl->txq.elts_tail;
67 	struct rte_mbuf *(*elts)[elts_n] = &txq_ctrl->txq.elts;
68 
69 	DRV_LOG(DEBUG, "port %u Tx queue %u freeing WRs",
70 		PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx);
71 	txq_ctrl->txq.elts_head = 0;
72 	txq_ctrl->txq.elts_tail = 0;
73 	txq_ctrl->txq.elts_comp = 0;
74 
75 	while (elts_tail != elts_head) {
76 		struct rte_mbuf *elt = (*elts)[elts_tail & elts_m];
77 
78 		MLX5_ASSERT(elt != NULL);
79 		rte_pktmbuf_free_seg(elt);
80 #ifdef RTE_LIBRTE_MLX5_DEBUG
81 		/* Poisoning. */
82 		memset(&(*elts)[elts_tail & elts_m],
83 		       0x77,
84 		       sizeof((*elts)[elts_tail & elts_m]));
85 #endif
86 		++elts_tail;
87 	}
88 }
89 
90 /**
91  * Returns the per-port supported offloads.
92  *
93  * @param dev
94  *   Pointer to Ethernet device.
95  *
96  * @return
97  *   Supported Tx offloads.
98  */
99 uint64_t
100 mlx5_get_tx_port_offloads(struct rte_eth_dev *dev)
101 {
102 	struct mlx5_priv *priv = dev->data->dev_private;
103 	uint64_t offloads = (RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
104 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT);
105 	struct mlx5_port_config *config = &priv->config;
106 	struct mlx5_dev_cap *dev_cap = &priv->sh->dev_cap;
107 
108 	if (dev_cap->hw_csum)
109 		offloads |= (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
110 			     RTE_ETH_TX_OFFLOAD_UDP_CKSUM |
111 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM);
112 	if (dev_cap->tso)
113 		offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO;
114 	if (priv->sh->config.tx_pp ||
115 	    priv->sh->cdev->config.hca_attr.wait_on_time)
116 		offloads |= RTE_ETH_TX_OFFLOAD_SEND_ON_TIMESTAMP;
117 	if (dev_cap->swp) {
118 		if (dev_cap->swp & MLX5_SW_PARSING_CSUM_CAP)
119 			offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM;
120 		if (dev_cap->swp & MLX5_SW_PARSING_TSO_CAP)
121 			offloads |= (RTE_ETH_TX_OFFLOAD_IP_TNL_TSO |
122 				     RTE_ETH_TX_OFFLOAD_UDP_TNL_TSO);
123 	}
124 	if (dev_cap->tunnel_en) {
125 		if (dev_cap->hw_csum)
126 			offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM;
127 		if (dev_cap->tso) {
128 			if (dev_cap->tunnel_en &
129 				MLX5_TUNNELED_OFFLOADS_VXLAN_CAP)
130 				offloads |= RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
131 			if (dev_cap->tunnel_en &
132 				MLX5_TUNNELED_OFFLOADS_GRE_CAP)
133 				offloads |= RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO;
134 			if (dev_cap->tunnel_en &
135 				MLX5_TUNNELED_OFFLOADS_GENEVE_CAP)
136 				offloads |= RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
137 		}
138 	}
139 	if (!config->mprq.enabled)
140 		offloads |= RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
141 	return offloads;
142 }
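/*
 * Usage sketch (illustrative only, port_id and queue counts are placeholders):
 * the mask built above is reported to applications as the port Tx offload
 * capabilities, which are typically queried and narrowed down before
 * configuring the port:
 *
 *	struct rte_eth_dev_info dev_info;
 *	struct rte_eth_conf conf = { 0 };
 *
 *	rte_eth_dev_info_get(port_id, &dev_info);
 *	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_CKSUM)
 *		conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
 *	rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &conf);
 */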
143 
144 /* Fetches and drops all SW-owned and error CQEs to synchronize CQ. */
145 static void
146 txq_sync_cq(struct mlx5_txq_data *txq)
147 {
148 	volatile struct mlx5_cqe *cqe;
149 	int ret, i;
150 
151 	i = txq->cqe_s;
152 	do {
153 		cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
154 		ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
155 		if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
156 			if (likely(ret != MLX5_CQE_STATUS_ERR)) {
157 				/* No new CQEs in completion queue. */
158 				MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN);
159 				break;
160 			}
161 		}
162 		++txq->cq_ci;
163 	} while (--i);
164 	/* Move all CQEs to HW ownership. */
165 	for (i = 0; i < txq->cqe_s; i++) {
166 		cqe = &txq->cqes[i];
167 		cqe->op_own = MLX5_CQE_INVALIDATE;
168 	}
169 	/* Resync CQE and WQE (WQ in reset state). */
170 	rte_io_wmb();
171 	*txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
172 	txq->cq_pi = txq->cq_ci;
173 	rte_io_wmb();
174 }
175 
176 /**
177  * Tx queue stop. Device queue goes to the idle state,
178  * all involved mbufs are freed from elts/WQ.
179  *
180  * @param dev
181  *   Pointer to Ethernet device structure.
182  * @param idx
183  *   Tx queue index.
184  *
185  * @return
186  *   0 on success, a negative errno value otherwise and rte_errno is set.
187  */
188 int
189 mlx5_tx_queue_stop_primary(struct rte_eth_dev *dev, uint16_t idx)
190 {
191 	struct mlx5_priv *priv = dev->data->dev_private;
192 	struct mlx5_txq_data *txq = (*priv->txqs)[idx];
193 	struct mlx5_txq_ctrl *txq_ctrl =
194 			container_of(txq, struct mlx5_txq_ctrl, txq);
195 	int ret;
196 
197 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
198 	/* Move QP to RESET state. */
199 	ret = priv->obj_ops.txq_obj_modify(txq_ctrl->obj, MLX5_TXQ_MOD_RDY2RST,
200 					   (uint8_t)priv->dev_port);
201 	if (ret)
202 		return ret;
203 	/* Handle all send completions. */
204 	txq_sync_cq(txq);
205 	/* Free elts stored in the SQ. */
206 	txq_free_elts(txq_ctrl);
207 	/* Prevent writing new packets to the SQ by leaving no free WQEs. */
208 	txq->wqe_ci = txq->wqe_s;
209 	txq->wqe_pi = 0;
210 	txq->elts_comp = 0;
211 	/* Set the actual queue state. */
212 	dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STOPPED;
213 	return 0;
214 }
215 
216 /**
217  * Tx queue stop. Device queue goes to the idle state,
218  * all involved mbufs are freed from elts/WQ.
219  *
220  * @param dev
221  *   Pointer to Ethernet device structure.
222  * @param idx
223  *   Tx queue index.
224  *
225  * @return
226  *   0 on success, a negative errno value otherwise and rte_errno is set.
227  */
228 int
229 mlx5_tx_queue_stop(struct rte_eth_dev *dev, uint16_t idx)
230 {
231 	int ret;
232 
233 	if (rte_eth_dev_is_tx_hairpin_queue(dev, idx)) {
234 		DRV_LOG(ERR, "Hairpin queue can't be stopped");
235 		rte_errno = EINVAL;
236 		return -EINVAL;
237 	}
238 	if (dev->data->tx_queue_state[idx] == RTE_ETH_QUEUE_STATE_STOPPED)
239 		return 0;
240 	if (rte_eal_process_type() ==  RTE_PROC_SECONDARY) {
241 		ret = mlx5_mp_os_req_queue_control(dev, idx,
242 						   MLX5_MP_REQ_QUEUE_TX_STOP);
243 	} else {
244 		ret = mlx5_tx_queue_stop_primary(dev, idx);
245 	}
246 	return ret;
247 }
248 
249 /**
250  * Tx queue start. Device queue goes to the ready state,
251  * the queue indices are reset and transmission can resume.
252  *
253  * @param dev
254  *   Pointer to Ethernet device structure.
255  * @param idx
256  *   Tx queue index.
257  *
258  * @return
259  *   0 on success, a negative errno value otherwise and rte_errno is set.
260  */
261 int
262 mlx5_tx_queue_start_primary(struct rte_eth_dev *dev, uint16_t idx)
263 {
264 	struct mlx5_priv *priv = dev->data->dev_private;
265 	struct mlx5_txq_data *txq = (*priv->txqs)[idx];
266 	struct mlx5_txq_ctrl *txq_ctrl =
267 			container_of(txq, struct mlx5_txq_ctrl, txq);
268 	int ret;
269 
270 	MLX5_ASSERT(rte_eal_process_type() ==  RTE_PROC_PRIMARY);
271 	ret = priv->obj_ops.txq_obj_modify(txq_ctrl->obj,
272 					   MLX5_TXQ_MOD_RST2RDY,
273 					   (uint8_t)priv->dev_port);
274 	if (ret)
275 		return ret;
276 	txq_ctrl->txq.wqe_ci = 0;
277 	txq_ctrl->txq.wqe_pi = 0;
278 	txq_ctrl->txq.elts_comp = 0;
279 	/* Set the actual queue state. */
280 	dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STARTED;
281 	return 0;
282 }
283 
284 /**
285  * Tx queue start. Device queue goes to the ready state,
286  * the queue indices are reset and transmission can resume.
287  *
288  * @param dev
289  *   Pointer to Ethernet device structure.
290  * @param idx
291  *   Tx queue index.
292  *
293  * @return
294  *   0 on success, a negative errno value otherwise and rte_errno is set.
295  */
296 int
297 mlx5_tx_queue_start(struct rte_eth_dev *dev, uint16_t idx)
298 {
299 	int ret;
300 
301 	if (rte_eth_dev_is_tx_hairpin_queue(dev, idx)) {
302 		DRV_LOG(ERR, "Hairpin queue can't be started");
303 		rte_errno = EINVAL;
304 		return -EINVAL;
305 	}
306 	if (dev->data->tx_queue_state[idx] == RTE_ETH_QUEUE_STATE_STARTED)
307 		return 0;
308 	if (rte_eal_process_type() ==  RTE_PROC_SECONDARY) {
309 		ret = mlx5_mp_os_req_queue_control(dev, idx,
310 						   MLX5_MP_REQ_QUEUE_TX_START);
311 	} else {
312 		ret = mlx5_tx_queue_start_primary(dev, idx);
313 	}
314 	return ret;
315 }
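/*
 * Usage sketch (illustrative only, port_id/queue_id are placeholders): the two
 * callbacks above implement the ethdev runtime queue control for Tx; an
 * application stops and later restarts an individual queue on a running port
 * with:
 *
 *	ret = rte_eth_dev_tx_queue_stop(port_id, queue_id);
 *	...
 *	ret = rte_eth_dev_tx_queue_start(port_id, queue_id);
 *
 * In a secondary process the request is forwarded to the primary over the
 * multi-process channel, as handled above.
 */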
316 
317 /**
318  * Tx queue presetup checks.
319  *
320  * @param dev
321  *   Pointer to Ethernet device structure.
322  * @param idx
323  *   Tx queue index.
324  * @param desc
325  *   Number of descriptors to configure in queue.
326  *
327  * @return
328  *   0 on success, a negative errno value otherwise and rte_errno is set.
329  */
330 static int
331 mlx5_tx_queue_pre_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t *desc)
332 {
333 	struct mlx5_priv *priv = dev->data->dev_private;
334 
335 	if (*desc <= MLX5_TX_COMP_THRESH) {
336 		DRV_LOG(WARNING,
337 			"port %u number of descriptors requested for Tx queue"
338 			" %u must be higher than MLX5_TX_COMP_THRESH, using %u"
339 			" instead of %u", dev->data->port_id, idx,
340 			MLX5_TX_COMP_THRESH + 1, *desc);
341 		*desc = MLX5_TX_COMP_THRESH + 1;
342 	}
343 	if (!rte_is_power_of_2(*desc)) {
344 		*desc = 1 << log2above(*desc);
345 		DRV_LOG(WARNING,
346 			"port %u increased number of descriptors in Tx queue"
347 			" %u to the next power of two (%d)",
348 			dev->data->port_id, idx, *desc);
349 	}
350 	DRV_LOG(DEBUG, "port %u configuring queue %u for %u descriptors",
351 		dev->data->port_id, idx, *desc);
352 	if (idx >= priv->txqs_n) {
353 		DRV_LOG(ERR, "port %u Tx queue index out of range (%u >= %u)",
354 			dev->data->port_id, idx, priv->txqs_n);
355 		rte_errno = EOVERFLOW;
356 		return -rte_errno;
357 	}
358 	if (!mlx5_txq_releasable(dev, idx)) {
359 		rte_errno = EBUSY;
360 		DRV_LOG(ERR, "port %u unable to release queue index %u",
361 			dev->data->port_id, idx);
362 		return -rte_errno;
363 	}
364 	mlx5_txq_release(dev, idx);
365 	return 0;
366 }
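/*
 * Worked example (assuming the usual MLX5_TX_COMP_THRESH value of 32): a
 * request for 20 descriptors is first raised to 33 (threshold + 1) and then
 * rounded up to the next power of two, 64; a request for 1000 descriptors
 * already exceeds the threshold and is only rounded up to 1024. The adjusted
 * value is passed back through the *desc pointer.
 */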
367 
368 /**
369  * DPDK callback to configure a TX queue.
370  *
371  * @param dev
372  *   Pointer to Ethernet device structure.
373  * @param idx
374  *   TX queue index.
375  * @param desc
376  *   Number of descriptors to configure in queue.
377  * @param socket
378  *   NUMA socket on which memory must be allocated.
379  * @param[in] conf
380  *   Thresholds parameters.
381  *
382  * @return
383  *   0 on success, a negative errno value otherwise and rte_errno is set.
384  */
385 int
386 mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
387 		    unsigned int socket, const struct rte_eth_txconf *conf)
388 {
389 	struct mlx5_priv *priv = dev->data->dev_private;
390 	struct mlx5_txq_data *txq = (*priv->txqs)[idx];
391 	struct mlx5_txq_ctrl *txq_ctrl =
392 		container_of(txq, struct mlx5_txq_ctrl, txq);
393 	int res;
394 
395 	res = mlx5_tx_queue_pre_setup(dev, idx, &desc);
396 	if (res)
397 		return res;
398 	txq_ctrl = mlx5_txq_new(dev, idx, desc, socket, conf);
399 	if (!txq_ctrl) {
400 		DRV_LOG(ERR, "port %u unable to allocate queue index %u",
401 			dev->data->port_id, idx);
402 		return -rte_errno;
403 	}
404 	DRV_LOG(DEBUG, "port %u adding Tx queue %u to list",
405 		dev->data->port_id, idx);
406 	(*priv->txqs)[idx] = &txq_ctrl->txq;
407 	return 0;
408 }
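/*
 * Usage sketch (illustrative only, port_id/queue_id/tx_offloads are
 * placeholders): this function is the tx_queue_setup dev_ops entry, reached
 * from the generic ethdev API:
 *
 *	struct rte_eth_dev_info dev_info;
 *	struct rte_eth_txconf txconf;
 *
 *	rte_eth_dev_info_get(port_id, &dev_info);
 *	txconf = dev_info.default_txconf;
 *	txconf.offloads = tx_offloads;
 *	ret = rte_eth_tx_queue_setup(port_id, queue_id, 1024,
 *				     rte_eth_dev_socket_id(port_id), &txconf);
 */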
409 
410 /**
411  * DPDK callback to configure a TX hairpin queue.
412  *
413  * @param dev
414  *   Pointer to Ethernet device structure.
415  * @param idx
416  *   TX queue index.
417  * @param desc
418  *   Number of descriptors to configure in queue.
419  * @param[in] hairpin_conf
420  *   The hairpin binding configuration.
421  *
422  * @return
423  *   0 on success, a negative errno value otherwise and rte_errno is set.
424  */
425 int
426 mlx5_tx_hairpin_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
427 			    uint16_t desc,
428 			    const struct rte_eth_hairpin_conf *hairpin_conf)
429 {
430 	struct mlx5_priv *priv = dev->data->dev_private;
431 	struct mlx5_txq_data *txq = (*priv->txqs)[idx];
432 	struct mlx5_txq_ctrl *txq_ctrl =
433 		container_of(txq, struct mlx5_txq_ctrl, txq);
434 	int res;
435 
436 	res = mlx5_tx_queue_pre_setup(dev, idx, &desc);
437 	if (res)
438 		return res;
439 	if (hairpin_conf->peer_count != 1) {
440 		rte_errno = EINVAL;
441 		DRV_LOG(ERR, "port %u unable to setup Tx hairpin queue index %u"
442 			" peer count is %u", dev->data->port_id,
443 			idx, hairpin_conf->peer_count);
444 		return -rte_errno;
445 	}
446 	if (hairpin_conf->peers[0].port == dev->data->port_id) {
447 		if (hairpin_conf->peers[0].queue >= priv->rxqs_n) {
448 			rte_errno = EINVAL;
449 			DRV_LOG(ERR, "port %u unable to setup Tx hairpin queue"
450 				" index %u, Rx %u is larger than %u",
451 				dev->data->port_id, idx,
452 				hairpin_conf->peers[0].queue, priv->rxqs_n);
453 			return -rte_errno;
454 		}
455 	} else {
456 		if (hairpin_conf->manual_bind == 0 ||
457 		    hairpin_conf->tx_explicit == 0) {
458 			rte_errno = EINVAL;
459 			DRV_LOG(ERR, "port %u unable to setup Tx hairpin queue"
460 				" index %u peer port %u with attributes %u %u",
461 				dev->data->port_id, idx,
462 				hairpin_conf->peers[0].port,
463 				hairpin_conf->manual_bind,
464 				hairpin_conf->tx_explicit);
465 			return -rte_errno;
466 		}
467 	}
468 	txq_ctrl = mlx5_txq_hairpin_new(dev, idx, desc,	hairpin_conf);
469 	if (!txq_ctrl) {
470 		DRV_LOG(ERR, "port %u unable to allocate queue index %u",
471 			dev->data->port_id, idx);
472 		return -rte_errno;
473 	}
474 	DRV_LOG(DEBUG, "port %u adding Tx queue %u to list",
475 		dev->data->port_id, idx);
476 	(*priv->txqs)[idx] = &txq_ctrl->txq;
477 	dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_HAIRPIN;
478 	return 0;
479 }
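/*
 * Usage sketch (illustrative only, port_id/tx_queue_id/rx_queue_id are
 * placeholders): a single-peer hairpin Tx queue bound to a local Rx queue,
 * which is the layout validated above, could be requested as:
 *
 *	struct rte_eth_hairpin_conf hairpin_conf = {
 *		.peer_count = 1,
 *		.peers[0] = { .port = port_id, .queue = rx_queue_id },
 *	};
 *
 *	ret = rte_eth_tx_hairpin_queue_setup(port_id, tx_queue_id, 512,
 *					     &hairpin_conf);
 *
 * For a two-port hairpin (peer port different from this port), manual_bind
 * and tx_explicit must both be set, as checked above.
 */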
480 
481 /**
482  * DPDK callback to release a TX queue.
483  *
484  * @param dev
485  *   Pointer to Ethernet device structure.
486  * @param qid
487  *   Transmit queue index.
488  */
489 void
490 mlx5_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
491 {
492 	struct mlx5_txq_data *txq = dev->data->tx_queues[qid];
493 
494 	if (txq == NULL)
495 		return;
496 	DRV_LOG(DEBUG, "port %u removing Tx queue %u from list",
497 		dev->data->port_id, qid);
498 	mlx5_txq_release(dev, qid);
499 }
500 
501 /**
502  * Remap UAR register of a Tx queue for secondary process.
503  *
504  * The remapped address is stored in a table in the process-private
505  * structure of the device, indexed by queue index.
506  *
507  * @param txq_ctrl
508  *   Pointer to Tx queue control structure.
509  * @param fd
510  *   Verbs file descriptor to map UAR pages.
511  *
512  * @return
513  *   0 on success, a negative errno value otherwise and rte_errno is set.
514  */
515 static int
516 txq_uar_init_secondary(struct mlx5_txq_ctrl *txq_ctrl, int fd)
517 {
518 	struct mlx5_priv *priv = txq_ctrl->priv;
519 	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv));
520 	struct mlx5_proc_priv *primary_ppriv = priv->sh->pppriv;
521 	struct mlx5_txq_data *txq = &txq_ctrl->txq;
522 	void *addr;
523 	uintptr_t uar_va;
524 	uintptr_t offset;
525 	const size_t page_size = rte_mem_page_size();
526 	if (page_size == (size_t)-1) {
527 		DRV_LOG(ERR, "Failed to get mem page size");
528 		rte_errno = ENOMEM;
529 		return -rte_errno;
530 	}
531 
532 	if (txq_ctrl->is_hairpin)
533 		return 0;
534 	MLX5_ASSERT(ppriv);
535 	/*
536 	 * As in rdma-core, UARs are mapped with OS page size
537 	 * granularity. See the libmlx5 function mlx5_init_context().
538 	 */
539 	uar_va = (uintptr_t)primary_ppriv->uar_table[txq->idx].db;
540 	offset = uar_va & (page_size - 1); /* Offset in page. */
541 	addr = rte_mem_map(NULL, page_size, RTE_PROT_WRITE, RTE_MAP_SHARED,
542 			   fd, txq_ctrl->uar_mmap_offset);
543 	if (!addr) {
544 		DRV_LOG(ERR, "Port %u mmap failed for BF reg of txq %u.",
545 			txq->port_id, txq->idx);
546 		rte_errno = ENXIO;
547 		return -rte_errno;
548 	}
549 	addr = RTE_PTR_ADD(addr, offset);
550 	ppriv->uar_table[txq->idx].db = addr;
551 #ifndef RTE_ARCH_64
552 	ppriv->uar_table[txq->idx].sl_p =
553 			primary_ppriv->uar_table[txq->idx].sl_p;
554 #endif
555 	return 0;
556 }
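/*
 * Worked example (hypothetical addresses, 4096-byte pages): if the primary
 * process doorbell VA is 0x7f34a2501c80, the in-page offset is
 * 0x1c80 & 0xfff = 0xc80. The secondary maps the page backing the UAR at its
 * own address and stores mapping + 0xc80 as its doorbell pointer, so both
 * processes reach the same hardware register through different VAs.
 */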
557 
558 /**
559  * Unmap UAR register of a Tx queue for secondary process.
560  *
561  * @param txq_ctrl
562  *   Pointer to Tx queue control structure.
563  */
564 static void
565 txq_uar_uninit_secondary(struct mlx5_txq_ctrl *txq_ctrl)
566 {
567 	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(txq_ctrl->priv));
568 	void *addr;
569 	const size_t page_size = rte_mem_page_size();
570 	if (page_size == (size_t)-1) {
571 		DRV_LOG(ERR, "Failed to get mem page size");
572 		rte_errno = ENOMEM;
		return;
573 	}
574 
575 	if (txq_ctrl->is_hairpin)
576 		return;
577 	addr = ppriv->uar_table[txq_ctrl->txq.idx].db;
578 	rte_mem_unmap(RTE_PTR_ALIGN_FLOOR(addr, page_size), page_size);
579 }
580 
581 /**
582  * Deinitialize Tx UAR registers for secondary process.
583  *
584  * @param dev
585  *   Pointer to Ethernet device.
586  */
587 void
588 mlx5_tx_uar_uninit_secondary(struct rte_eth_dev *dev)
589 {
590 	struct mlx5_proc_priv *ppriv = (struct mlx5_proc_priv *)
591 					dev->process_private;
592 	const size_t page_size = rte_mem_page_size();
593 	void *addr;
594 	unsigned int i;
595 
596 	if (page_size == (size_t)-1) {
597 		DRV_LOG(ERR, "Failed to get mem page size");
598 		return;
599 	}
600 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY);
601 	for (i = 0; i != ppriv->uar_table_sz; ++i) {
602 		if (!ppriv->uar_table[i].db)
603 			continue;
604 		addr = ppriv->uar_table[i].db;
605 		rte_mem_unmap(RTE_PTR_ALIGN_FLOOR(addr, page_size), page_size);
606 
607 	}
608 }
609 
610 /**
611  * Initialize Tx UAR registers for secondary process.
612  *
613  * @param dev
614  *   Pointer to Ethernet device.
615  * @param fd
616  *   Verbs file descriptor to map UAR pages.
617  *
618  * @return
619  *   0 on success, a negative errno value otherwise and rte_errno is set.
620  */
621 int
622 mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd)
623 {
624 	struct mlx5_priv *priv = dev->data->dev_private;
625 	struct mlx5_txq_data *txq;
626 	struct mlx5_txq_ctrl *txq_ctrl;
627 	unsigned int i;
628 	int ret;
629 
630 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY);
631 	for (i = 0; i != priv->txqs_n; ++i) {
632 		if (!(*priv->txqs)[i])
633 			continue;
634 		txq = (*priv->txqs)[i];
635 		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
636 		if (txq_ctrl->is_hairpin)
637 			continue;
638 		MLX5_ASSERT(txq->idx == (uint16_t)i);
639 		ret = txq_uar_init_secondary(txq_ctrl, fd);
640 		if (ret)
641 			goto error;
642 	}
643 	return 0;
644 error:
645 	/* Rollback. */
646 	do {
647 		if (!(*priv->txqs)[i])
648 			continue;
649 		txq = (*priv->txqs)[i];
650 		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
651 		txq_uar_uninit_secondary(txq_ctrl);
652 	} while (i--);
653 	return -rte_errno;
654 }
655 
656 /**
657  * Verify that the Verbs Tx queue list is empty.
658  *
659  * @param dev
660  *   Pointer to Ethernet device.
661  *
662  * @return
663  *   The number of objects not released.
664  */
665 int
666 mlx5_txq_obj_verify(struct rte_eth_dev *dev)
667 {
668 	struct mlx5_priv *priv = dev->data->dev_private;
669 	int ret = 0;
670 	struct mlx5_txq_obj *txq_obj;
671 
672 	LIST_FOREACH(txq_obj, &priv->txqsobj, next) {
673 		DRV_LOG(DEBUG, "port %u Verbs Tx queue %u still referenced",
674 			dev->data->port_id, txq_obj->txq_ctrl->txq.idx);
675 		++ret;
676 	}
677 	return ret;
678 }
679 
680 /**
681  * Calculate the total number of WQEBB for Tx queue.
682  *
683  * Simplified version of calc_sq_size() in rdma-core.
684  *
685  * @param txq_ctrl
686  *   Pointer to Tx queue control structure.
687  *
688  * @return
689  *   The number of WQEBB.
690  */
691 static int
692 txq_calc_wqebb_cnt(struct mlx5_txq_ctrl *txq_ctrl)
693 {
694 	unsigned int wqe_size;
695 	const unsigned int desc = 1 << txq_ctrl->txq.elts_n;
696 
697 	wqe_size = MLX5_WQE_CSEG_SIZE +
698 		   MLX5_WQE_ESEG_SIZE +
699 		   MLX5_WSEG_SIZE -
700 		   MLX5_ESEG_MIN_INLINE_SIZE +
701 		   txq_ctrl->max_inline_data;
702 	return rte_align32pow2(wqe_size * desc) / MLX5_WQE_SIZE;
703 }
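/*
 * Worked example (assuming 16-byte control/Ethernet/data segments, an 18-byte
 * minimal Ethernet segment inline and 64-byte WQEBBs): with no data inlining
 * wqe_size = 16 + 16 + 16 - 18 = 30 bytes per descriptor, so 512 descriptors
 * need rte_align32pow2(30 * 512) / 64 = 16384 / 64 = 256 WQEBBs. Enabling
 * inlining grows wqe_size by max_inline_data and the WQEBB count with it.
 */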
704 
705 /**
706  * Calculate the maximal inline data size for Tx queue.
707  *
708  * @param txq_ctrl
709  *   Pointer to Tx queue control structure.
710  *
711  * @return
712  *   The maximal inline data size.
713  */
714 static unsigned int
715 txq_calc_inline_max(struct mlx5_txq_ctrl *txq_ctrl)
716 {
717 	const unsigned int desc = 1 << txq_ctrl->txq.elts_n;
718 	struct mlx5_priv *priv = txq_ctrl->priv;
719 	unsigned int wqe_size;
720 
721 	wqe_size = priv->sh->dev_cap.max_qp_wr / desc;
722 	if (!wqe_size)
723 		return 0;
724 	/*
725 	 * This calculation is derived from the source of
726 	 * mlx5_calc_send_wqe() in the rdma-core library.
727 	 */
728 	wqe_size = wqe_size * MLX5_WQE_SIZE -
729 		   MLX5_WQE_CSEG_SIZE -
730 		   MLX5_WQE_ESEG_SIZE -
731 		   MLX5_WSEG_SIZE -
732 		   MLX5_WSEG_SIZE +
733 		   MLX5_DSEG_MIN_INLINE_SIZE;
734 	return wqe_size;
735 }
736 
737 /**
738  * Set Tx queue parameters from device configuration.
739  *
740  * @param txq_ctrl
741  *   Pointer to Tx queue control structure.
742  */
743 static void
744 txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
745 {
746 	struct mlx5_priv *priv = txq_ctrl->priv;
747 	struct mlx5_port_config *config = &priv->config;
748 	struct mlx5_dev_cap *dev_cap = &priv->sh->dev_cap;
749 	unsigned int inlen_send; /* Inline data for ordinary SEND.*/
750 	unsigned int inlen_empw; /* Inline data for enhanced MPW. */
751 	unsigned int inlen_mode; /* Minimal required Inline data. */
752 	unsigned int txqs_inline; /* Min Tx queues to enable inline. */
753 	uint64_t dev_txoff = priv->dev_data->dev_conf.txmode.offloads;
754 	bool tso = txq_ctrl->txq.offloads & (RTE_ETH_TX_OFFLOAD_TCP_TSO |
755 					    RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO |
756 					    RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO |
757 					    RTE_ETH_TX_OFFLOAD_IP_TNL_TSO |
758 					    RTE_ETH_TX_OFFLOAD_UDP_TNL_TSO);
759 	bool vlan_inline;
760 	unsigned int temp;
761 
762 	txq_ctrl->txq.fast_free =
763 		!!((txq_ctrl->txq.offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) &&
764 		   !(txq_ctrl->txq.offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS) &&
765 		   !config->mprq.enabled);
766 	if (config->txqs_inline == MLX5_ARG_UNSET)
767 		txqs_inline =
768 #if defined(RTE_ARCH_ARM64)
769 		(priv->pci_dev && priv->pci_dev->id.device_id ==
770 			PCI_DEVICE_ID_MELLANOX_BLUEFIELD) ?
771 			MLX5_INLINE_MAX_TXQS_BLUEFIELD :
772 #endif
773 			MLX5_INLINE_MAX_TXQS;
774 	else
775 		txqs_inline = (unsigned int)config->txqs_inline;
776 	inlen_send = (config->txq_inline_max == MLX5_ARG_UNSET) ?
777 		     MLX5_SEND_DEF_INLINE_LEN :
778 		     (unsigned int)config->txq_inline_max;
779 	inlen_empw = (config->txq_inline_mpw == MLX5_ARG_UNSET) ?
780 		     MLX5_EMPW_DEF_INLINE_LEN :
781 		     (unsigned int)config->txq_inline_mpw;
782 	inlen_mode = (config->txq_inline_min == MLX5_ARG_UNSET) ?
783 		     0 : (unsigned int)config->txq_inline_min;
784 	if (config->mps != MLX5_MPW_ENHANCED && config->mps != MLX5_MPW)
785 		inlen_empw = 0;
786 	/*
787 	 * If a minimal amount of data to inline is requested,
788 	 * we MUST enable inlining. This is the case for ConnectX-4,
789 	 * which usually requires L2 headers to be inlined for correct
790 	 * operation, and for ConnectX-4 Lx, which requires L2-L4
791 	 * headers to be inlined to support E-Switch flows.
792 	 */
793 	if (inlen_mode) {
794 		if (inlen_mode <= MLX5_ESEG_MIN_INLINE_SIZE) {
795 			/*
796 			 * Optimize minimal inlining for single
797 			 * segment packets to fill one WQEBB
798 			 * without gaps.
799 			 */
800 			temp = MLX5_ESEG_MIN_INLINE_SIZE;
801 		} else {
802 			temp = inlen_mode - MLX5_ESEG_MIN_INLINE_SIZE;
803 			temp = RTE_ALIGN(temp, MLX5_WSEG_SIZE) +
804 			       MLX5_ESEG_MIN_INLINE_SIZE;
805 			temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN);
806 		}
807 		if (temp != inlen_mode) {
808 			DRV_LOG(INFO,
809 				"port %u minimal required inline setting"
810 				" aligned from %u to %u",
811 				PORT_ID(priv), inlen_mode, temp);
812 			inlen_mode = temp;
813 		}
814 	}
815 	/*
816 	 * If the port is configured to support VLAN insertion and the
817 	 * device does not support it in HW (NICs before ConnectX-5, or
818 	 * the wqe_vlan_insert flag is not set), we must enable data
819 	 * inline on all queues because all queues share a single
820 	 * tx_burst routine that implements the insertion in software.
821 	 */
822 	txq_ctrl->txq.vlan_en = config->hw_vlan_insert;
823 	vlan_inline = (dev_txoff & RTE_ETH_TX_OFFLOAD_VLAN_INSERT) &&
824 		      !config->hw_vlan_insert;
825 	/*
826 	 * If there are only a few Tx queues, saving CPU cycles is
827 	 * prioritized and data inlining is disabled entirely.
828 	 */
829 	if (inlen_send && priv->txqs_n >= txqs_inline) {
830 		/*
831 		 * The data sent with ordinary MLX5_OPCODE_SEND
832 		 * may be inlined in Ethernet Segment, align the
833 		 * length accordingly to fit entire WQEBBs.
834 		 */
835 		temp = RTE_MAX(inlen_send,
836 			       MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE);
837 		temp -= MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE;
838 		temp = RTE_ALIGN(temp, MLX5_WQE_SIZE);
839 		temp += MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE;
840 		temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX +
841 				     MLX5_ESEG_MIN_INLINE_SIZE -
842 				     MLX5_WQE_CSEG_SIZE -
843 				     MLX5_WQE_ESEG_SIZE -
844 				     MLX5_WQE_DSEG_SIZE * 2);
845 		temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN);
846 		temp = RTE_MAX(temp, inlen_mode);
847 		if (temp != inlen_send) {
848 			DRV_LOG(INFO,
849 				"port %u ordinary send inline setting"
850 				" aligned from %u to %u",
851 				PORT_ID(priv), inlen_send, temp);
852 			inlen_send = temp;
853 		}
854 		/*
855 		 * Not aligned to cache lines, but to WQEs.
856 		 * The first bytes of data (initial alignment)
857 		 * are going to be copied explicitly at the
858 		 * beginning of the inlining buffer in the Ethernet
859 		 * Segment.
860 		 */
861 		MLX5_ASSERT(inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
862 		MLX5_ASSERT(inlen_send <= MLX5_WQE_SIZE_MAX +
863 					  MLX5_ESEG_MIN_INLINE_SIZE -
864 					  MLX5_WQE_CSEG_SIZE -
865 					  MLX5_WQE_ESEG_SIZE -
866 					  MLX5_WQE_DSEG_SIZE * 2);
867 	} else if (inlen_mode) {
868 		/*
869 		 * If minimal inlining is requested we must
870 		 * enable inlining in general, regardless of the
871 		 * number of configured queues. Ignore the
872 		 * txq_inline_max devarg, this is not
873 		 * full-featured inline.
874 		 */
875 		inlen_send = inlen_mode;
876 		inlen_empw = 0;
877 	} else if (vlan_inline) {
878 		/*
879 		 * Hardware does not report the offload for
880 		 * VLAN insertion, so we must enable data inline
881 		 * to implement the feature in software.
882 		 */
883 		inlen_send = MLX5_ESEG_MIN_INLINE_SIZE;
884 		inlen_empw = 0;
885 	} else {
886 		inlen_send = 0;
887 		inlen_empw = 0;
888 	}
889 	txq_ctrl->txq.inlen_send = inlen_send;
890 	txq_ctrl->txq.inlen_mode = inlen_mode;
891 	txq_ctrl->txq.inlen_empw = 0;
892 	if (inlen_send && inlen_empw && priv->txqs_n >= txqs_inline) {
893 		/*
894 		 * The data sent with MLX5_OPCODE_ENHANCED_MPSW
895 		 * may be inlined in Data Segment, align the
896 		 * length accordingly to fit entire WQEBBs.
897 		 */
898 		temp = RTE_MAX(inlen_empw,
899 			       MLX5_WQE_SIZE + MLX5_DSEG_MIN_INLINE_SIZE);
900 		temp -= MLX5_DSEG_MIN_INLINE_SIZE;
901 		temp = RTE_ALIGN(temp, MLX5_WQE_SIZE);
902 		temp += MLX5_DSEG_MIN_INLINE_SIZE;
903 		temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX +
904 				     MLX5_DSEG_MIN_INLINE_SIZE -
905 				     MLX5_WQE_CSEG_SIZE -
906 				     MLX5_WQE_ESEG_SIZE -
907 				     MLX5_WQE_DSEG_SIZE);
908 		temp = RTE_MIN(temp, MLX5_EMPW_MAX_INLINE_LEN);
909 		if (temp != inlen_empw) {
910 			DRV_LOG(INFO,
911 				"port %u enhanced empw inline setting"
912 				" aligned from %u to %u",
913 				PORT_ID(priv), inlen_empw, temp);
914 			inlen_empw = temp;
915 		}
916 		MLX5_ASSERT(inlen_empw >= MLX5_ESEG_MIN_INLINE_SIZE);
917 		MLX5_ASSERT(inlen_empw <= MLX5_WQE_SIZE_MAX +
918 					  MLX5_DSEG_MIN_INLINE_SIZE -
919 					  MLX5_WQE_CSEG_SIZE -
920 					  MLX5_WQE_ESEG_SIZE -
921 					  MLX5_WQE_DSEG_SIZE);
922 		txq_ctrl->txq.inlen_empw = inlen_empw;
923 	}
924 	txq_ctrl->max_inline_data = RTE_MAX(inlen_send, inlen_empw);
925 	if (tso) {
926 		txq_ctrl->max_tso_header = MLX5_MAX_TSO_HEADER;
927 		txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->max_inline_data,
928 						    MLX5_MAX_TSO_HEADER);
929 		txq_ctrl->txq.tso_en = 1;
930 	}
931 	if (((RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO & txq_ctrl->txq.offloads) &&
932 	    (dev_cap->tunnel_en & MLX5_TUNNELED_OFFLOADS_VXLAN_CAP)) |
933 	   ((RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO & txq_ctrl->txq.offloads) &&
934 	    (dev_cap->tunnel_en & MLX5_TUNNELED_OFFLOADS_GRE_CAP)) |
935 	   ((RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO & txq_ctrl->txq.offloads) &&
936 	    (dev_cap->tunnel_en & MLX5_TUNNELED_OFFLOADS_GENEVE_CAP)) |
937 	   (dev_cap->swp  & MLX5_SW_PARSING_TSO_CAP))
938 		txq_ctrl->txq.tunnel_en = 1;
939 	txq_ctrl->txq.swp_en = (((RTE_ETH_TX_OFFLOAD_IP_TNL_TSO |
940 				  RTE_ETH_TX_OFFLOAD_UDP_TNL_TSO) &
941 				  txq_ctrl->txq.offloads) && (dev_cap->swp &
942 				  MLX5_SW_PARSING_TSO_CAP)) |
943 				((RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM &
944 				 txq_ctrl->txq.offloads) && (dev_cap->swp &
945 				 MLX5_SW_PARSING_CSUM_CAP));
946 }
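/*
 * Configuration sketch (illustrative only, hypothetical PCI address): the
 * values consumed above come from the mlx5 device arguments, e.g.
 *
 *	dpdk-testpmd -a 0000:03:00.0,txq_inline_max=128,txqs_min_inline=4 ...
 *
 * requests roughly 128 bytes of inline data for ordinary SEND (subject to the
 * WQE alignment above) once at least 4 Tx queues are configured, while
 * leaving txq_inline_min and txq_inline_mpw at their defaults.
 */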
947 
948 /**
949  * Adjust Tx queue data inline parameters for large queue sizes.
950  * The data inline feature requires multiple WQEs to fit the packets,
951  * and if a large number of Tx descriptors is requested by the application
952  * the total WQE amount may exceed the hardware capabilities. If the
953  * default inline settings are used, we can try to adjust them to meet
954  * the hardware requirements without exceeding the queue size.
955  *
956  * @param txq_ctrl
957  *   Pointer to Tx queue control structure.
958  *
959  * @return
960  *   Zero on success, otherwise the parameters cannot be adjusted.
961  */
962 static int
963 txq_adjust_params(struct mlx5_txq_ctrl *txq_ctrl)
964 {
965 	struct mlx5_priv *priv = txq_ctrl->priv;
966 	struct mlx5_port_config *config = &priv->config;
967 	unsigned int max_inline;
968 
969 	max_inline = txq_calc_inline_max(txq_ctrl);
970 	if (!txq_ctrl->txq.inlen_send) {
971 		/*
972 		 * Inline data feature is not engaged at all.
973 		 * There is nothing to adjust.
974 		 */
975 		return 0;
976 	}
977 	if (txq_ctrl->max_inline_data <= max_inline) {
978 		/*
979 		 * The requested inline data length does not
980 		 * exceed queue capabilities.
981 		 */
982 		return 0;
983 	}
984 	if (txq_ctrl->txq.inlen_mode > max_inline) {
985 		DRV_LOG(ERR,
986 			"minimal data inline requirements (%u) are not"
987 			" satisfied (%u) on port %u, try the smaller"
988 			" Tx queue size (%d)",
989 			txq_ctrl->txq.inlen_mode, max_inline,
990 			priv->dev_data->port_id, priv->sh->dev_cap.max_qp_wr);
991 		goto error;
992 	}
993 	if (txq_ctrl->txq.inlen_send > max_inline &&
994 	    config->txq_inline_max != MLX5_ARG_UNSET &&
995 	    config->txq_inline_max > (int)max_inline) {
996 		DRV_LOG(ERR,
997 			"txq_inline_max requirements (%u) are not"
998 			" satisfied (%u) on port %u, try the smaller"
999 			" Tx queue size (%d)",
1000 			txq_ctrl->txq.inlen_send, max_inline,
1001 			priv->dev_data->port_id, priv->sh->dev_cap.max_qp_wr);
1002 		goto error;
1003 	}
1004 	if (txq_ctrl->txq.inlen_empw > max_inline &&
1005 	    config->txq_inline_mpw != MLX5_ARG_UNSET &&
1006 	    config->txq_inline_mpw > (int)max_inline) {
1007 		DRV_LOG(ERR,
1008 			"txq_inline_mpw requirements (%u) are not"
1009 			" satisfied (%u) on port %u, try the smaller"
1010 			" Tx queue size (%d)",
1011 			txq_ctrl->txq.inlen_empw, max_inline,
1012 			priv->dev_data->port_id, priv->sh->dev_cap.max_qp_wr);
1013 		goto error;
1014 	}
1015 	if (txq_ctrl->txq.tso_en && max_inline < MLX5_MAX_TSO_HEADER) {
1016 		DRV_LOG(ERR,
1017 			"tso header inline requirements (%u) are not"
1018 			" satisfied (%u) on port %u, try the smaller"
1019 			" Tx queue size (%d)",
1020 			MLX5_MAX_TSO_HEADER, max_inline,
1021 			priv->dev_data->port_id, priv->sh->dev_cap.max_qp_wr);
1022 		goto error;
1023 	}
1024 	if (txq_ctrl->txq.inlen_send > max_inline) {
1025 		DRV_LOG(WARNING,
1026 			"adjust txq_inline_max (%u->%u)"
1027 			" due to large Tx queue on port %u",
1028 			txq_ctrl->txq.inlen_send, max_inline,
1029 			priv->dev_data->port_id);
1030 		txq_ctrl->txq.inlen_send = max_inline;
1031 	}
1032 	if (txq_ctrl->txq.inlen_empw > max_inline) {
1033 		DRV_LOG(WARNING,
1034 			"adjust txq_inline_mpw (%u->%u)"
1035 			" due to large Tx queue on port %u",
1036 			txq_ctrl->txq.inlen_empw, max_inline,
1037 			priv->dev_data->port_id);
1038 		txq_ctrl->txq.inlen_empw = max_inline;
1039 	}
1040 	txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->txq.inlen_send,
1041 					    txq_ctrl->txq.inlen_empw);
1042 	MLX5_ASSERT(txq_ctrl->max_inline_data <= max_inline);
1043 	MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= max_inline);
1044 	MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= txq_ctrl->txq.inlen_send);
1045 	MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= txq_ctrl->txq.inlen_empw ||
1046 		    !txq_ctrl->txq.inlen_empw);
1047 	return 0;
1048 error:
1049 	rte_errno = ENOMEM;
1050 	return -ENOMEM;
1051 }
1052 
1053 /**
1054  * Create a DPDK Tx queue.
1055  *
1056  * @param dev
1057  *   Pointer to Ethernet device.
1058  * @param idx
1059  *   TX queue index.
1060  * @param desc
1061  *   Number of descriptors to configure in queue.
1062  * @param socket
1063  *   NUMA socket on which memory must be allocated.
1064  * @param[in] conf
1065  *  Thresholds parameters.
1066  *
1067  * @return
1068  *   A DPDK queue object on success, NULL otherwise and rte_errno is set.
1069  */
1070 struct mlx5_txq_ctrl *
1071 mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1072 	     unsigned int socket, const struct rte_eth_txconf *conf)
1073 {
1074 	struct mlx5_priv *priv = dev->data->dev_private;
1075 	struct mlx5_txq_ctrl *tmpl;
1076 
1077 	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) +
1078 			   desc * sizeof(struct rte_mbuf *), 0, socket);
1079 	if (!tmpl) {
1080 		rte_errno = ENOMEM;
1081 		return NULL;
1082 	}
1083 	if (mlx5_mr_ctrl_init(&tmpl->txq.mr_ctrl,
1084 			      &priv->sh->cdev->mr_scache.dev_gen, socket)) {
1085 		/* rte_errno is already set. */
1086 		goto error;
1087 	}
1088 	MLX5_ASSERT(desc > MLX5_TX_COMP_THRESH);
1089 	tmpl->txq.offloads = conf->offloads |
1090 			     dev->data->dev_conf.txmode.offloads;
1091 	tmpl->priv = priv;
1092 	tmpl->socket = socket;
1093 	tmpl->txq.elts_n = log2above(desc);
1094 	tmpl->txq.elts_s = desc;
1095 	tmpl->txq.elts_m = desc - 1;
1096 	tmpl->txq.port_id = dev->data->port_id;
1097 	tmpl->txq.idx = idx;
1098 	txq_set_params(tmpl);
1099 	if (txq_adjust_params(tmpl))
1100 		goto error;
1101 	if (txq_calc_wqebb_cnt(tmpl) >
1102 	    priv->sh->dev_cap.max_qp_wr) {
1103 		DRV_LOG(ERR,
1104 			"port %u Tx WQEBB count (%d) exceeds the limit (%d),"
1105 			" try smaller queue size",
1106 			dev->data->port_id, txq_calc_wqebb_cnt(tmpl),
1107 			priv->sh->dev_cap.max_qp_wr);
1108 		rte_errno = ENOMEM;
1109 		goto error;
1110 	}
1111 	__atomic_fetch_add(&tmpl->refcnt, 1, __ATOMIC_RELAXED);
1112 	tmpl->is_hairpin = false;
1113 	LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next);
1114 	return tmpl;
1115 error:
1116 	mlx5_mr_btree_free(&tmpl->txq.mr_ctrl.cache_bh);
1117 	mlx5_free(tmpl);
1118 	return NULL;
1119 }
1120 
1121 /**
1122  * Create a DPDK Tx hairpin queue.
1123  *
1124  * @param dev
1125  *   Pointer to Ethernet device.
1126  * @param idx
1127  *   TX queue index.
1128  * @param desc
1129  *   Number of descriptors to configure in queue.
1130  * @param hairpin_conf
1131  *  The hairpin configuration.
1132  *
1133  * @return
1134  *   A DPDK queue object on success, NULL otherwise and rte_errno is set.
1135  */
1136 struct mlx5_txq_ctrl *
1137 mlx5_txq_hairpin_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1138 		     const struct rte_eth_hairpin_conf *hairpin_conf)
1139 {
1140 	struct mlx5_priv *priv = dev->data->dev_private;
1141 	struct mlx5_txq_ctrl *tmpl;
1142 
1143 	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl), 0,
1144 			   SOCKET_ID_ANY);
1145 	if (!tmpl) {
1146 		rte_errno = ENOMEM;
1147 		return NULL;
1148 	}
1149 	tmpl->priv = priv;
1150 	tmpl->socket = SOCKET_ID_ANY;
1151 	tmpl->txq.elts_n = log2above(desc);
1152 	tmpl->txq.port_id = dev->data->port_id;
1153 	tmpl->txq.idx = idx;
1154 	tmpl->hairpin_conf = *hairpin_conf;
1155 	tmpl->is_hairpin = true;
1156 	__atomic_fetch_add(&tmpl->refcnt, 1, __ATOMIC_RELAXED);
1157 	LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next);
1158 	return tmpl;
1159 }
1160 
1161 /**
1162  * Get a Tx queue.
1163  *
1164  * @param dev
1165  *   Pointer to Ethernet device.
1166  * @param idx
1167  *   TX queue index.
1168  *
1169  * @return
1170  *   A pointer to the queue if it exists.
1171  */
1172 struct mlx5_txq_ctrl *
1173 mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx)
1174 {
1175 	struct mlx5_priv *priv = dev->data->dev_private;
1176 	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
1177 	struct mlx5_txq_ctrl *ctrl = NULL;
1178 
1179 	if (txq_data) {
1180 		ctrl = container_of(txq_data, struct mlx5_txq_ctrl, txq);
1181 		__atomic_fetch_add(&ctrl->refcnt, 1, __ATOMIC_RELAXED);
1182 	}
1183 	return ctrl;
1184 }
1185 
1186 /**
1187  * Release a Tx queue.
1188  *
1189  * @param dev
1190  *   Pointer to Ethernet device.
1191  * @param idx
1192  *   TX queue index.
1193  *
1194  * @return
1195  *   1 while a reference on it exists, 0 when freed.
1196  */
1197 int
1198 mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx)
1199 {
1200 	struct mlx5_priv *priv = dev->data->dev_private;
1201 	struct mlx5_txq_ctrl *txq_ctrl;
1202 
1203 	if (priv->txqs == NULL || (*priv->txqs)[idx] == NULL)
1204 		return 0;
1205 	txq_ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
1206 	if (__atomic_fetch_sub(&txq_ctrl->refcnt, 1, __ATOMIC_RELAXED) - 1 > 1)
1207 		return 1;
1208 	if (txq_ctrl->obj) {
1209 		priv->obj_ops.txq_obj_release(txq_ctrl->obj);
1210 		LIST_REMOVE(txq_ctrl->obj, next);
1211 		mlx5_free(txq_ctrl->obj);
1212 		txq_ctrl->obj = NULL;
1213 	}
1214 	if (!txq_ctrl->is_hairpin) {
1215 		if (txq_ctrl->txq.fcqs) {
1216 			mlx5_free(txq_ctrl->txq.fcqs);
1217 			txq_ctrl->txq.fcqs = NULL;
1218 		}
1219 		txq_free_elts(txq_ctrl);
1220 		dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STOPPED;
1221 	}
1222 	if (!__atomic_load_n(&txq_ctrl->refcnt, __ATOMIC_RELAXED)) {
1223 		if (!txq_ctrl->is_hairpin)
1224 			mlx5_mr_btree_free(&txq_ctrl->txq.mr_ctrl.cache_bh);
1225 		LIST_REMOVE(txq_ctrl, next);
1226 		mlx5_free(txq_ctrl);
1227 		(*priv->txqs)[idx] = NULL;
1228 	}
1229 	return 0;
1230 }
1231 
1232 /**
1233  * Verify if the queue can be released.
1234  *
1235  * @param dev
1236  *   Pointer to Ethernet device.
1237  * @param idx
1238  *   TX queue index.
1239  *
1240  * @return
1241  *   1 if the queue can be released.
1242  */
1243 int
1244 mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx)
1245 {
1246 	struct mlx5_priv *priv = dev->data->dev_private;
1247 	struct mlx5_txq_ctrl *txq;
1248 
1249 	if (!(*priv->txqs)[idx])
1250 		return -1;
1251 	txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
1252 	return (__atomic_load_n(&txq->refcnt, __ATOMIC_RELAXED) == 1);
1253 }
1254 
1255 /**
1256  * Verify that the Tx queue list is empty.
1257  *
1258  * @param dev
1259  *   Pointer to Ethernet device.
1260  *
1261  * @return
1262  *   The number of objects not released.
1263  */
1264 int
1265 mlx5_txq_verify(struct rte_eth_dev *dev)
1266 {
1267 	struct mlx5_priv *priv = dev->data->dev_private;
1268 	struct mlx5_txq_ctrl *txq_ctrl;
1269 	int ret = 0;
1270 
1271 	LIST_FOREACH(txq_ctrl, &priv->txqsctrl, next) {
1272 		DRV_LOG(DEBUG, "port %u Tx queue %u still referenced",
1273 			dev->data->port_id, txq_ctrl->txq.idx);
1274 		++ret;
1275 	}
1276 	return ret;
1277 }
1278 
1279 int
1280 mlx5_txq_get_sqn(struct mlx5_txq_ctrl *txq)
1281 {
1282 	return txq->is_hairpin ? txq->obj->sq->id : txq->obj->sq_obj.sq->id;
1283 }
1284 
1285 int
1286 rte_pmd_mlx5_external_sq_enable(uint16_t port_id, uint32_t sq_num)
1287 {
1288 	struct rte_eth_dev *dev;
1289 	struct mlx5_priv *priv;
1290 	uint32_t flow;
1291 
1292 	if (!rte_eth_dev_is_valid_port(port_id)) {
1293 		DRV_LOG(ERR, "There is no Ethernet device for port %u.",
1294 			port_id);
1295 		rte_errno = ENODEV;
1296 		return -rte_errno;
1297 	}
1298 	dev = &rte_eth_devices[port_id];
1299 	priv = dev->data->dev_private;
1300 	if ((!priv->representor && !priv->master) ||
1301 	    !priv->sh->config.dv_esw_en) {
1302 		DRV_LOG(ERR, "Port %u must be representor or master port in E-Switch mode.",
1303 			port_id);
1304 		rte_errno = EINVAL;
1305 		return -rte_errno;
1306 	}
1307 	if (sq_num == 0) {
1308 		DRV_LOG(ERR, "Invalid SQ number.");
1309 		rte_errno = EINVAL;
1310 		return -rte_errno;
1311 	}
1312 #ifdef HAVE_MLX5_HWS_SUPPORT
1313 	if (priv->sh->config.dv_flow_en == 2) {
1314 		if (mlx5_flow_hw_esw_create_sq_miss_flow(dev, sq_num, true))
1315 			return -rte_errno;
1316 		if (priv->sh->config.repr_matching &&
1317 		    mlx5_flow_hw_tx_repr_matching_flow(dev, sq_num, true)) {
1318 			mlx5_flow_hw_esw_destroy_sq_miss_flow(dev, sq_num);
1319 			return -rte_errno;
1320 		}
1321 		return 0;
1322 	}
1323 #endif
1324 	flow = mlx5_flow_create_devx_sq_miss_flow(dev, sq_num);
1325 	if (flow > 0)
1326 		return 0;
1327 	DRV_LOG(ERR, "Port %u failed to create default miss flow for SQ %u.",
1328 		port_id, sq_num);
1329 	return -rte_errno;
1330 }
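/*
 * Usage sketch (illustrative only, port_id/sq_number are placeholders): an
 * application that created its own SQ outside of this PMD (e.g. through DevX)
 * on a representor or master port can install the default miss/representor
 * matching flows for it with the API exported in rte_pmd_mlx5.h:
 *
 *	ret = rte_pmd_mlx5_external_sq_enable(port_id, sq_number);
 *	if (ret)
 *		printf("failed to enable SQ %u: %s\n", sq_number,
 *		       rte_strerror(rte_errno));
 */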
1331 
1332 /**
1333  * Set the Tx queue dynamic timestamp (mask and offset)
1334  *
1335  * @param[in] dev
1336  *   Pointer to the Ethernet device structure.
1337  */
1338 void
1339 mlx5_txq_dynf_timestamp_set(struct rte_eth_dev *dev)
1340 {
1341 	struct mlx5_priv *priv = dev->data->dev_private;
1342 	struct mlx5_dev_ctx_shared *sh = priv->sh;
1343 	struct mlx5_txq_data *data;
1344 	int off, nbit;
1345 	unsigned int i;
1346 	uint64_t mask = 0;
1347 	uint64_t ts_mask;
1348 
1349 	if (sh->dev_cap.rt_timestamp ||
1350 	    !sh->cdev->config.hca_attr.dev_freq_khz)
1351 		ts_mask = MLX5_TS_MASK_SECS << 32;
1352 	else
1353 		ts_mask = rte_align64pow2(MLX5_TS_MASK_SECS * 1000ull *
1354 				sh->cdev->config.hca_attr.dev_freq_khz);
1355 	ts_mask = rte_cpu_to_be_64(ts_mask - 1ull);
1356 	nbit = rte_mbuf_dynflag_lookup
1357 				(RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME, NULL);
1358 	off = rte_mbuf_dynfield_lookup
1359 				(RTE_MBUF_DYNFIELD_TIMESTAMP_NAME, NULL);
1360 	if (nbit >= 0 && off >= 0 &&
1361 	    (sh->txpp.refcnt || priv->sh->cdev->config.hca_attr.wait_on_time))
1362 		mask = 1ULL << nbit;
1363 	for (i = 0; i != priv->txqs_n; ++i) {
1364 		data = (*priv->txqs)[i];
1365 		if (!data)
1366 			continue;
1367 		data->sh = sh;
1368 		data->ts_mask = mask;
1369 		data->ts_offset = off;
1370 		data->rt_timestamp = sh->dev_cap.rt_timestamp;
1371 		data->rt_timemask = (data->offloads &
1372 				     RTE_ETH_TX_OFFLOAD_SEND_ON_TIMESTAMP) ?
1373 				     ts_mask : 0;
1374 	}
1375 }
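/*
 * Usage sketch (illustrative only): the dynamic flag and field resolved above
 * are the ones an application registers to request scheduled sending, e.g.:
 *
 *	static int ts_off;
 *	static uint64_t ts_flag;
 *
 *	rte_mbuf_dyn_tx_timestamp_register(&ts_off, &ts_flag);
 *	...
 *	*RTE_MBUF_DYNFIELD(mbuf, ts_off, uint64_t *) = tx_time;
 *	mbuf->ol_flags |= ts_flag;
 *
 * Packets carrying the flag are scheduled by the PMD when Tx packet pacing
 * (tx_pp) or the wait-on-time capability is available.
 */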
1376 
1377 int mlx5_count_aggr_ports(struct rte_eth_dev *dev)
1378 {
1379 	struct mlx5_priv *priv = dev->data->dev_private;
1380 
1381 	return priv->sh->bond.n_port;
1382 }
1383 
1384 int mlx5_map_aggr_tx_affinity(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1385 			      uint8_t affinity)
1386 {
1387 	struct mlx5_txq_ctrl *txq_ctrl;
1388 	struct mlx5_txq_data *txq;
1389 	struct mlx5_priv *priv;
1390 
1391 	priv = dev->data->dev_private;
1392 	if (!mlx5_devx_obj_ops_en(priv->sh)) {
1393 		DRV_LOG(ERR, "Tx affinity mapping isn't supported by Verbs API.");
1394 		rte_errno = ENOTSUP;
1395 		return -rte_errno;
1396 	}
1397 	if (tx_queue_id >= priv->txqs_n) {
1398 		DRV_LOG(ERR, "port %u Tx queue index out of range (%u >= %u)",
1399 			dev->data->port_id, tx_queue_id, priv->txqs_n);
1400 		rte_errno = EOVERFLOW;
1401 		return -rte_errno;
1402 	}
1403 	txq = (*priv->txqs)[tx_queue_id];
1404 	if (!txq)
1405 		return -1;
1406 	txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
1407 	if (affinity > priv->num_lag_ports) {
1408 		DRV_LOG(ERR, "port %u unable to setup Tx queue index %u:"
1409 			" affinity %u exceeds the maximum %u", dev->data->port_id,
1410 			tx_queue_id, affinity, priv->num_lag_ports);
1411 		rte_errno = EINVAL;
1412 		return -rte_errno;
1413 	}
1414 	DRV_LOG(DEBUG, "port %u configuring queue %u for aggregated affinity %u",
1415 		dev->data->port_id, tx_queue_id, affinity);
1416 	txq_ctrl->txq.tx_aggr_affinity = affinity;
1417 	return 0;
1418 }
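/*
 * Usage sketch (illustrative only, port_id/tx_queue_id are placeholders): the
 * two callbacks above back the generic ethdev Tx aggregated-port affinity
 * API; an application pins a Tx queue to one member of a bonding/LAG device
 * with:
 *
 *	int n = rte_eth_dev_count_aggr_ports(port_id);
 *
 *	if (n > 0)
 *		ret = rte_eth_dev_map_aggr_tx_affinity(port_id, tx_queue_id, 1);
 *
 * where affinity 1..n selects an aggregated port and 0 restores the default
 * (unspecified) mapping.
 */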
1419