1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <unistd.h>
7 
8 #include <rte_ether.h>
9 #include <ethdev_driver.h>
10 #include <rte_interrupts.h>
11 #include <rte_alarm.h>
12 #include <rte_cycles.h>
13 
14 #include <mlx5_malloc.h>
15 
16 #include "mlx5.h"
17 #include "mlx5_flow.h"
18 #include "mlx5_rx.h"
19 #include "mlx5_tx.h"
20 #include "mlx5_utils.h"
21 #include "rte_pmd_mlx5.h"
22 
23 /**
24  * Stop traffic on Tx queues.
25  *
26  * @param dev
27  *   Pointer to Ethernet device structure.
28  */
29 static void
30 mlx5_txq_stop(struct rte_eth_dev *dev)
31 {
32 	struct mlx5_priv *priv = dev->data->dev_private;
33 	unsigned int i;
34 
35 	for (i = 0; i != priv->txqs_n; ++i)
36 		mlx5_txq_release(dev, i);
37 }
38 
39 /**
40  * Start traffic on Tx queues.
41  *
42  * @param dev
43  *   Pointer to Ethernet device structure.
44  *
45  * @return
46  *   0 on success, a negative errno value otherwise and rte_errno is set.
47  */
48 static int
49 mlx5_txq_start(struct rte_eth_dev *dev)
50 {
51 	struct mlx5_priv *priv = dev->data->dev_private;
52 	unsigned int i;
53 	int ret;
54 
55 	for (i = 0; i != priv->txqs_n; ++i) {
56 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
57 		struct mlx5_txq_data *txq_data;
58 		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
59 		if (!txq_ctrl)
60 			continue;
61 		txq_data = &txq_ctrl->txq;
62 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD)
63 			txq_alloc_elts(txq_ctrl);
64 		MLX5_ASSERT(!txq_ctrl->obj);
65 		txq_ctrl->obj = mlx5_malloc(flags, sizeof(struct mlx5_txq_obj),
66 					    0, txq_ctrl->socket);
67 		if (!txq_ctrl->obj) {
68 			DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
69 				"memory resources.", dev->data->port_id,
70 				txq_data->idx);
71 			rte_errno = ENOMEM;
72 			goto error;
73 		}
74 		ret = priv->obj_ops.txq_obj_new(dev, i);
75 		if (ret < 0) {
76 			mlx5_free(txq_ctrl->obj);
77 			txq_ctrl->obj = NULL;
78 			goto error;
79 		}
80 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD) {
81 			size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
82 
83 			txq_data->fcqs = mlx5_malloc(flags, size,
84 						     RTE_CACHE_LINE_SIZE,
85 						     txq_ctrl->socket);
86 			if (!txq_data->fcqs) {
87 				DRV_LOG(ERR, "Port %u Tx queue %u cannot "
88 					"allocate memory (FCQ).",
89 					dev->data->port_id, i);
90 				rte_errno = ENOMEM;
91 				goto error;
92 			}
93 		}
94 		DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
95 			dev->data->port_id, i, (void *)&txq_ctrl->obj);
96 		LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
97 	}
98 	return 0;
99 error:
100 	ret = rte_errno; /* Save rte_errno before cleanup. */
101 	do {
102 		mlx5_txq_release(dev, i);
103 	} while (i-- != 0);
104 	rte_errno = ret; /* Restore rte_errno. */
105 	return -rte_errno;
106 }
107 
108 /**
109  * Translate the chunk address to MR key in order to put it into the cache.
110  */
111 static void
112 mlx5_rxq_mempool_register_cb(struct rte_mempool *mp, void *opaque,
113 			     struct rte_mempool_memhdr *memhdr,
114 			     unsigned int idx)
115 {
116 	struct mlx5_rxq_data *rxq = opaque;
117 
118 	RTE_SET_USED(mp);
119 	RTE_SET_USED(idx);
120 	mlx5_rx_addr2mr(rxq, (uintptr_t)memhdr->addr);
121 }
122 
123 /**
124  * Register Rx queue mempools and fill the Rx queue cache.
125  * This function tolerates repeated mempool registration.
126  *
127  * @param[in] rxq_ctrl
128  *   Rx queue control data.
129  *
130  * @return
131  *   0 on success, (-1) on failure and rte_errno is set.
132  */
133 static int
134 mlx5_rxq_mempool_register(struct mlx5_rxq_ctrl *rxq_ctrl)
135 {
136 	struct rte_mempool *mp;
137 	uint32_t s;
138 	int ret = 0;
139 
140 	mlx5_mr_flush_local_cache(&rxq_ctrl->rxq.mr_ctrl);
141 	/* MPRQ mempool is registered on creation, just fill the cache. */
142 	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) {
143 		rte_mempool_mem_iter(rxq_ctrl->rxq.mprq_mp,
144 				     mlx5_rxq_mempool_register_cb,
145 				     &rxq_ctrl->rxq);
146 		return 0;
147 	}
148 	for (s = 0; s < rxq_ctrl->rxq.rxseg_n; s++) {
149 		uint32_t flags;
150 
151 		mp = rxq_ctrl->rxq.rxseg[s].mp;
152 		flags = mp != rxq_ctrl->rxq.mprq_mp ?
153 			rte_pktmbuf_priv_flags(mp) : 0;
154 		ret = mlx5_mr_mempool_register(rxq_ctrl->sh->cdev, mp);
155 		if (ret < 0 && rte_errno != EEXIST)
156 			return ret;
157 		if ((flags & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) == 0)
158 			rte_mempool_mem_iter(mp, mlx5_rxq_mempool_register_cb,
159 					     &rxq_ctrl->rxq);
160 	}
161 	return 0;
162 }
163 
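/*
 * Note (illustrative, not part of the driver): RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF
 * is set for mempools created with pinned external buffers, e.g.:
 *
 *	struct rte_pktmbuf_extmem ext_mem = { ... };
 *	mp = rte_pktmbuf_pool_create_extbuf("rx_pool", 8192, 256, 0, 2048,
 *					    socket_id, &ext_mem, 1);
 *
 * For such a pool the mbuf chunks hold no packet data, so only the per-chunk
 * MR cache warm-up above is skipped; the mempool itself is still registered.
 */
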
164 /**
165  * Stop traffic on Rx queues.
166  *
167  * @param dev
168  *   Pointer to Ethernet device structure.
169  */
170 static void
171 mlx5_rxq_stop(struct rte_eth_dev *dev)
172 {
173 	struct mlx5_priv *priv = dev->data->dev_private;
174 	unsigned int i;
175 
176 	for (i = 0; i != priv->rxqs_n; ++i)
177 		mlx5_rxq_release(dev, i);
178 }
179 
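/**
 * Prepare a single Rx queue control structure before starting traffic:
 * register its mempools and allocate the queue elements for a standard
 * queue, then allocate the Rx queue object.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq_ctrl
 *   Pointer to the Rx queue control structure.
 * @param idx
 *   Queue index in the port, used for logging only.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */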
180 static int
181 mlx5_rxq_ctrl_prepare(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
182 		      unsigned int idx)
183 {
184 	int ret = 0;
185 
186 	if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) {
187 		/*
188 		 * Pre-register the mempools. Regardless of whether
189 		 * the implicit registration is enabled or not,
190 		 * Rx mempool destruction is tracked to free MRs.
191 		 */
192 		if (mlx5_rxq_mempool_register(rxq_ctrl) < 0)
193 			return -rte_errno;
194 		ret = rxq_alloc_elts(rxq_ctrl);
195 		if (ret)
196 			return ret;
197 	}
198 	MLX5_ASSERT(!rxq_ctrl->obj);
199 	rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
200 				    sizeof(*rxq_ctrl->obj), 0,
201 				    rxq_ctrl->socket);
202 	if (!rxq_ctrl->obj) {
203 		DRV_LOG(ERR, "Port %u Rx queue %u can't allocate resources.",
204 			dev->data->port_id, idx);
205 		rte_errno = ENOMEM;
206 		return -rte_errno;
207 	}
208 	DRV_LOG(DEBUG, "Port %u rxq %u updated with %p.", dev->data->port_id,
209 		idx, (void *)&rxq_ctrl->obj);
210 	return 0;
211 }
212 
213 /**
214  * Start traffic on Rx queues.
215  *
216  * @param dev
217  *   Pointer to Ethernet device structure.
218  *
219  * @return
220  *   0 on success, a negative errno value otherwise and rte_errno is set.
221  */
222 static int
223 mlx5_rxq_start(struct rte_eth_dev *dev)
224 {
225 	struct mlx5_priv *priv = dev->data->dev_private;
226 	unsigned int i;
227 	int ret = 0;
228 
229 	/* Allocate/reuse/resize mempool for Multi-Packet RQ. */
230 	if (mlx5_mprq_alloc_mp(dev)) {
231 		/* Should not release Rx queues but return immediately. */
232 		return -rte_errno;
233 	}
234 	DRV_LOG(DEBUG, "Port %u device_attr.max_qp_wr is %d.",
235 		dev->data->port_id, priv->sh->device_attr.max_qp_wr);
236 	DRV_LOG(DEBUG, "Port %u device_attr.max_sge is %d.",
237 		dev->data->port_id, priv->sh->device_attr.max_sge);
238 	for (i = 0; i != priv->rxqs_n; ++i) {
239 		struct mlx5_rxq_priv *rxq = mlx5_rxq_ref(dev, i);
240 		struct mlx5_rxq_ctrl *rxq_ctrl;
241 
242 		if (rxq == NULL)
243 			continue;
244 		rxq_ctrl = rxq->ctrl;
245 		if (!rxq_ctrl->started) {
246 			if (mlx5_rxq_ctrl_prepare(dev, rxq_ctrl, i) < 0)
247 				goto error;
248 			LIST_INSERT_HEAD(&priv->rxqsobj, rxq_ctrl->obj, next);
249 		}
250 		ret = priv->obj_ops.rxq_obj_new(rxq);
251 		if (ret) {
252 			mlx5_free(rxq_ctrl->obj);
253 			rxq_ctrl->obj = NULL;
254 			goto error;
255 		}
256 		rxq_ctrl->started = true;
257 	}
258 	return 0;
259 error:
260 	ret = rte_errno; /* Save rte_errno before cleanup. */
261 	do {
262 		mlx5_rxq_release(dev, i);
263 	} while (i-- != 0);
264 	rte_errno = ret; /* Restore rte_errno. */
265 	return -rte_errno;
266 }
267 
268 /**
269  * Binds Tx queues to Rx queues for hairpin.
270  *
271  * Only hairpin Tx queues whose peer Rx queue belongs to the same device are bound here.
272  *
273  * @param dev
274  *   Pointer to Ethernet device structure.
275  *
276  * @return
277  *   0 on success, a negative errno value otherwise and rte_errno is set.
278  */
279 static int
280 mlx5_hairpin_auto_bind(struct rte_eth_dev *dev)
281 {
282 	struct mlx5_priv *priv = dev->data->dev_private;
283 	struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
284 	struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
285 	struct mlx5_txq_ctrl *txq_ctrl;
286 	struct mlx5_rxq_priv *rxq;
287 	struct mlx5_rxq_ctrl *rxq_ctrl;
288 	struct mlx5_devx_obj *sq;
289 	struct mlx5_devx_obj *rq;
290 	unsigned int i;
291 	int ret = 0;
292 	bool need_auto = false;
293 	uint16_t self_port = dev->data->port_id;
294 
295 	for (i = 0; i != priv->txqs_n; ++i) {
296 		txq_ctrl = mlx5_txq_get(dev, i);
297 		if (!txq_ctrl)
298 			continue;
299 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
300 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
301 			mlx5_txq_release(dev, i);
302 			continue;
303 		}
304 		if (txq_ctrl->hairpin_conf.manual_bind) {
305 			mlx5_txq_release(dev, i);
306 			return 0;
307 		}
308 		need_auto = true;
309 		mlx5_txq_release(dev, i);
310 	}
311 	if (!need_auto)
312 		return 0;
313 	for (i = 0; i != priv->txqs_n; ++i) {
314 		txq_ctrl = mlx5_txq_get(dev, i);
315 		if (!txq_ctrl)
316 			continue;
317 		/* Skip non-hairpin queues and those with other peer ports. */
318 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
319 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
320 			mlx5_txq_release(dev, i);
321 			continue;
322 		}
323 		if (!txq_ctrl->obj) {
324 			rte_errno = ENOMEM;
325 			DRV_LOG(ERR, "port %u no txq object found: %d",
326 				dev->data->port_id, i);
327 			mlx5_txq_release(dev, i);
328 			return -rte_errno;
329 		}
330 		sq = txq_ctrl->obj->sq;
331 		rxq = mlx5_rxq_get(dev, txq_ctrl->hairpin_conf.peers[0].queue);
332 		if (rxq == NULL) {
333 			mlx5_txq_release(dev, i);
334 			rte_errno = EINVAL;
335 			DRV_LOG(ERR, "port %u no rxq object found: %d",
336 				dev->data->port_id,
337 				txq_ctrl->hairpin_conf.peers[0].queue);
338 			return -rte_errno;
339 		}
340 		rxq_ctrl = rxq->ctrl;
341 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN ||
342 		    rxq->hairpin_conf.peers[0].queue != i) {
343 			rte_errno = ENOMEM;
344 			DRV_LOG(ERR, "port %u Tx queue %d can't be bound to "
345 				"Rx queue %d", dev->data->port_id,
346 				i, txq_ctrl->hairpin_conf.peers[0].queue);
347 			goto error;
348 		}
349 		rq = rxq_ctrl->obj->rq;
350 		if (!rq) {
351 			rte_errno = ENOMEM;
352 			DRV_LOG(ERR, "port %u hairpin no matching rxq: %d",
353 				dev->data->port_id,
354 				txq_ctrl->hairpin_conf.peers[0].queue);
355 			goto error;
356 		}
357 		sq_attr.state = MLX5_SQC_STATE_RDY;
358 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
359 		sq_attr.hairpin_peer_rq = rq->id;
360 		sq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
361 		ret = mlx5_devx_cmd_modify_sq(sq, &sq_attr);
362 		if (ret)
363 			goto error;
364 		rq_attr.state = MLX5_SQC_STATE_RDY;
365 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
366 		rq_attr.hairpin_peer_sq = sq->id;
367 		rq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
368 		ret = mlx5_devx_cmd_modify_rq(rq, &rq_attr);
369 		if (ret)
370 			goto error;
371 		/* Qs with auto-bind will be destroyed directly. */
372 		rxq->hairpin_status = 1;
373 		txq_ctrl->hairpin_status = 1;
374 		mlx5_txq_release(dev, i);
375 	}
376 	return 0;
377 error:
378 	mlx5_txq_release(dev, i);
379 	return -rte_errno;
380 }
381 
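/*
 * Application-side sketch (illustrative): a same-port hairpin pair configured
 * with manual_bind = 0 is what makes the auto-bind path above run during
 * device start. Queue indexes and descriptor numbers are example values.
 *
 *	struct rte_eth_hairpin_conf hairpin_conf = {
 *		.peer_count = 1,
 *		.manual_bind = 0,	// let mlx5_hairpin_auto_bind() bind it
 *		.tx_explicit = 0,	// implicit Tx flow mode
 *		.peers[0] = { .port = port_id, .queue = hairpin_txq_id },
 *	};
 *	ret = rte_eth_rx_hairpin_queue_setup(port_id, hairpin_rxq_id,
 *					     512, &hairpin_conf);
 *	hairpin_conf.peers[0].queue = hairpin_rxq_id;
 *	ret = rte_eth_tx_hairpin_queue_setup(port_id, hairpin_txq_id,
 *					     512, &hairpin_conf);
 */
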
382 /*
383  * Fetch the peer queue's SW & HW information.
384  *
385  * @param dev
386  *   Pointer to Ethernet device structure.
387  * @param peer_queue
388  *   Index of the queue to fetch the information from.
389  * @param current_info
390  *   Pointer to the input peer information, not used currently.
391  * @param peer_info
392  *   Pointer to the structure to store the information, output.
393  * @param direction
394  *   Positive to get the RxQ information, zero to get the TxQ information.
395  *
396  * @return
397  *   0 on success, a negative errno value otherwise and rte_errno is set.
398  */
399 int
400 mlx5_hairpin_queue_peer_update(struct rte_eth_dev *dev, uint16_t peer_queue,
401 			       struct rte_hairpin_peer_info *current_info,
402 			       struct rte_hairpin_peer_info *peer_info,
403 			       uint32_t direction)
404 {
405 	struct mlx5_priv *priv = dev->data->dev_private;
406 	RTE_SET_USED(current_info);
407 
408 	if (dev->data->dev_started == 0) {
409 		rte_errno = EBUSY;
410 		DRV_LOG(ERR, "peer port %u is not started",
411 			dev->data->port_id);
412 		return -rte_errno;
413 	}
414 	/*
415 	 * Peer port used as egress. In the current design, hairpin Tx queue
416 	 * will be bound to the peer Rx queue. Indeed, only the information of
417 	 * peer Rx queue needs to be fetched.
418 	 */
419 	if (direction == 0) {
420 		struct mlx5_txq_ctrl *txq_ctrl;
421 
422 		txq_ctrl = mlx5_txq_get(dev, peer_queue);
423 		if (txq_ctrl == NULL) {
424 			rte_errno = EINVAL;
425 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
426 				dev->data->port_id, peer_queue);
427 			return -rte_errno;
428 		}
429 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
430 			rte_errno = EINVAL;
431 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Txq",
432 				dev->data->port_id, peer_queue);
433 			mlx5_txq_release(dev, peer_queue);
434 			return -rte_errno;
435 		}
436 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
437 			rte_errno = ENOMEM;
438 			DRV_LOG(ERR, "port %u no Txq object found: %d",
439 				dev->data->port_id, peer_queue);
440 			mlx5_txq_release(dev, peer_queue);
441 			return -rte_errno;
442 		}
443 		peer_info->qp_id = txq_ctrl->obj->sq->id;
444 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
445 		/* 1-to-1 mapping, only the first one is used. */
446 		peer_info->peer_q = txq_ctrl->hairpin_conf.peers[0].queue;
447 		peer_info->tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
448 		peer_info->manual_bind = txq_ctrl->hairpin_conf.manual_bind;
449 		mlx5_txq_release(dev, peer_queue);
450 	} else { /* Peer port used as ingress. */
451 		struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, peer_queue);
452 		struct mlx5_rxq_ctrl *rxq_ctrl;
453 
454 		if (rxq == NULL) {
455 			rte_errno = EINVAL;
456 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
457 				dev->data->port_id, peer_queue);
458 			return -rte_errno;
459 		}
460 		rxq_ctrl = rxq->ctrl;
461 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
462 			rte_errno = EINVAL;
463 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Rxq",
464 				dev->data->port_id, peer_queue);
465 			return -rte_errno;
466 		}
467 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
468 			rte_errno = ENOMEM;
469 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
470 				dev->data->port_id, peer_queue);
471 			return -rte_errno;
472 		}
473 		peer_info->qp_id = rxq_ctrl->obj->rq->id;
474 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
475 		peer_info->peer_q = rxq->hairpin_conf.peers[0].queue;
476 		peer_info->tx_explicit = rxq->hairpin_conf.tx_explicit;
477 		peer_info->manual_bind = rxq->hairpin_conf.manual_bind;
478 	}
479 	return 0;
480 }
481 
482 /*
483  * Bind the hairpin queue with the peer HW information.
484  * This needs to be called twice, once for the Tx and once for the Rx queue of a pair.
485  * If the queue is already bound, it is considered successful.
486  *
487  * @param dev
488  *   Pointer to Ethernet device structure.
489  * @param cur_queue
490  *   Index of the queue to change the HW configuration to bind.
491  * @param peer_info
492  *   Pointer to information of the peer queue.
493  * @param direction
494  *   Positive to configure the TxQ, zero to configure the RxQ.
495  *
496  * @return
497  *   0 on success, a negative errno value otherwise and rte_errno is set.
498  */
499 int
500 mlx5_hairpin_queue_peer_bind(struct rte_eth_dev *dev, uint16_t cur_queue,
501 			     struct rte_hairpin_peer_info *peer_info,
502 			     uint32_t direction)
503 {
504 	int ret = 0;
505 
506 	/*
507 	 * Consistency checking of the peer queue: opposite direction is used
508 	 * to get the peer queue info with ethdev port ID, no need to check.
509 	 */
510 	if (peer_info->peer_q != cur_queue) {
511 		rte_errno = EINVAL;
512 		DRV_LOG(ERR, "port %u queue %d and peer queue %d mismatch",
513 			dev->data->port_id, cur_queue, peer_info->peer_q);
514 		return -rte_errno;
515 	}
516 	if (direction != 0) {
517 		struct mlx5_txq_ctrl *txq_ctrl;
518 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
519 
520 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
521 		if (txq_ctrl == NULL) {
522 			rte_errno = EINVAL;
523 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
524 				dev->data->port_id, cur_queue);
525 			return -rte_errno;
526 		}
527 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
528 			rte_errno = EINVAL;
529 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
530 				dev->data->port_id, cur_queue);
531 			mlx5_txq_release(dev, cur_queue);
532 			return -rte_errno;
533 		}
534 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
535 			rte_errno = ENOMEM;
536 			DRV_LOG(ERR, "port %u no Txq object found: %d",
537 				dev->data->port_id, cur_queue);
538 			mlx5_txq_release(dev, cur_queue);
539 			return -rte_errno;
540 		}
541 		if (txq_ctrl->hairpin_status != 0) {
542 			DRV_LOG(DEBUG, "port %u Tx queue %d is already bound",
543 				dev->data->port_id, cur_queue);
544 			mlx5_txq_release(dev, cur_queue);
545 			return 0;
546 		}
547 		/*
548 		 * Consistency checking of all the queues of one port is done in
549 		 * the bind() function, and that check is optional.
550 		 */
551 		if (peer_info->tx_explicit !=
552 		    txq_ctrl->hairpin_conf.tx_explicit) {
553 			rte_errno = EINVAL;
554 			DRV_LOG(ERR, "port %u Tx queue %d and peer Tx rule mode"
555 				" mismatch", dev->data->port_id, cur_queue);
556 			mlx5_txq_release(dev, cur_queue);
557 			return -rte_errno;
558 		}
559 		if (peer_info->manual_bind !=
560 		    txq_ctrl->hairpin_conf.manual_bind) {
561 			rte_errno = EINVAL;
562 			DRV_LOG(ERR, "port %u Tx queue %d and peer binding mode"
563 				" mismatch", dev->data->port_id, cur_queue);
564 			mlx5_txq_release(dev, cur_queue);
565 			return -rte_errno;
566 		}
567 		sq_attr.state = MLX5_SQC_STATE_RDY;
568 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
569 		sq_attr.hairpin_peer_rq = peer_info->qp_id;
570 		sq_attr.hairpin_peer_vhca = peer_info->vhca_id;
571 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
572 		if (ret == 0)
573 			txq_ctrl->hairpin_status = 1;
574 		mlx5_txq_release(dev, cur_queue);
575 	} else {
576 		struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, cur_queue);
577 		struct mlx5_rxq_ctrl *rxq_ctrl;
578 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
579 
580 		if (rxq == NULL) {
581 			rte_errno = EINVAL;
582 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
583 				dev->data->port_id, cur_queue);
584 			return -rte_errno;
585 		}
586 		rxq_ctrl = rxq->ctrl;
587 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
588 			rte_errno = EINVAL;
589 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
590 				dev->data->port_id, cur_queue);
591 			return -rte_errno;
592 		}
593 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
594 			rte_errno = ENOMEM;
595 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
596 				dev->data->port_id, cur_queue);
597 			return -rte_errno;
598 		}
599 		if (rxq->hairpin_status != 0) {
600 			DRV_LOG(DEBUG, "port %u Rx queue %d is already bound",
601 				dev->data->port_id, cur_queue);
602 			return 0;
603 		}
604 		if (peer_info->tx_explicit !=
605 		    rxq->hairpin_conf.tx_explicit) {
606 			rte_errno = EINVAL;
607 			DRV_LOG(ERR, "port %u Rx queue %d and peer Tx rule mode"
608 				" mismatch", dev->data->port_id, cur_queue);
609 			return -rte_errno;
610 		}
611 		if (peer_info->manual_bind !=
612 		    rxq->hairpin_conf.manual_bind) {
613 			rte_errno = EINVAL;
614 			DRV_LOG(ERR, "port %u Rx queue %d and peer binding mode"
615 				" mismatch", dev->data->port_id, cur_queue);
616 			return -rte_errno;
617 		}
618 		rq_attr.state = MLX5_SQC_STATE_RDY;
619 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
620 		rq_attr.hairpin_peer_sq = peer_info->qp_id;
621 		rq_attr.hairpin_peer_vhca = peer_info->vhca_id;
622 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
623 		if (ret == 0)
624 			rxq->hairpin_status = 1;
625 	}
626 	return ret;
627 }
628 
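/*
 * Usage sketch (illustrative only): binding one Tx queue to a peer Rx queue
 * takes one information-fetch call and two bind calls, exactly as
 * mlx5_hairpin_bind_single_port() does below. 'tx_queue', 'rx_port' and
 * 'rx_queue' are placeholder names.
 *
 *	struct rte_hairpin_peer_info peer, cur;
 *
 *	// Ask the Rx port (direction = 1) for its hairpin RQ information.
 *	ret = rte_eth_hairpin_queue_peer_update(rx_port, rx_queue,
 *						 NULL, &peer, 1);
 *	// Move the local SQ to the ready state (direction = 1 selects a Txq).
 *	ret = mlx5_hairpin_queue_peer_bind(dev, tx_queue, &peer, 1);
 *	// Fill 'cur' with the local SQ information, then bind the remote RQ.
 *	ret = rte_eth_hairpin_queue_peer_bind(rx_port, rx_queue, &cur, 0);
 */
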
629 /*
630  * Unbind the hairpin queue and reset its HW configuration.
631  * This needs to be called twice, once for the Tx and once for the Rx queue of a pair.
632  * If the queue is already unbound, it is considered successful.
633  *
634  * @param dev
635  *   Pointer to Ethernet device structure.
636  * @param cur_queue
637  *   Index of the queue to change the HW configuration to unbind.
638  * @param direction
639  *   Positive to reset the TxQ, zero to reset the RxQ.
640  *
641  * @return
642  *   0 on success, a negative errno value otherwise and rte_errno is set.
643  */
644 int
645 mlx5_hairpin_queue_peer_unbind(struct rte_eth_dev *dev, uint16_t cur_queue,
646 			       uint32_t direction)
647 {
648 	int ret = 0;
649 
650 	if (direction != 0) {
651 		struct mlx5_txq_ctrl *txq_ctrl;
652 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
653 
654 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
655 		if (txq_ctrl == NULL) {
656 			rte_errno = EINVAL;
657 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
658 				dev->data->port_id, cur_queue);
659 			return -rte_errno;
660 		}
661 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
662 			rte_errno = EINVAL;
663 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
664 				dev->data->port_id, cur_queue);
665 			mlx5_txq_release(dev, cur_queue);
666 			return -rte_errno;
667 		}
668 		/* Already unbound, return success before obj checking. */
669 		if (txq_ctrl->hairpin_status == 0) {
670 			DRV_LOG(DEBUG, "port %u Tx queue %d is already unbound",
671 				dev->data->port_id, cur_queue);
672 			mlx5_txq_release(dev, cur_queue);
673 			return 0;
674 		}
675 		if (!txq_ctrl->obj || !txq_ctrl->obj->sq) {
676 			rte_errno = ENOMEM;
677 			DRV_LOG(ERR, "port %u no Txq object found: %d",
678 				dev->data->port_id, cur_queue);
679 			mlx5_txq_release(dev, cur_queue);
680 			return -rte_errno;
681 		}
682 		sq_attr.state = MLX5_SQC_STATE_RST;
683 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
684 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
685 		if (ret == 0)
686 			txq_ctrl->hairpin_status = 0;
687 		mlx5_txq_release(dev, cur_queue);
688 	} else {
689 		struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, cur_queue);
690 		struct mlx5_rxq_ctrl *rxq_ctrl;
691 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
692 
693 		if (rxq == NULL) {
694 			rte_errno = EINVAL;
695 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
696 				dev->data->port_id, cur_queue);
697 			return -rte_errno;
698 		}
699 		rxq_ctrl = rxq->ctrl;
700 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
701 			rte_errno = EINVAL;
702 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
703 				dev->data->port_id, cur_queue);
704 			return -rte_errno;
705 		}
706 		if (rxq->hairpin_status == 0) {
707 			DRV_LOG(DEBUG, "port %u Rx queue %d is already unbound",
708 				dev->data->port_id, cur_queue);
709 			return 0;
710 		}
711 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
712 			rte_errno = ENOMEM;
713 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
714 				dev->data->port_id, cur_queue);
715 			return -rte_errno;
716 		}
717 		rq_attr.state = MLX5_SQC_STATE_RST;
718 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
719 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
720 		if (ret == 0)
721 			rxq->hairpin_status = 0;
722 	}
723 	return ret;
724 }
725 
726 /*
727  * Bind the hairpin port pairs, from the Tx to the peer Rx.
728  * This function only supports binding the Tx side to a single Rx port.
729  *
730  * @param dev
731  *   Pointer to Ethernet device structure.
732  * @param rx_port
733  *   Port identifier of the Rx port.
734  *
735  * @return
736  *   0 on success, a negative errno value otherwise and rte_errno is set.
737  */
738 static int
739 mlx5_hairpin_bind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
740 {
741 	struct mlx5_priv *priv = dev->data->dev_private;
742 	int ret = 0;
743 	struct mlx5_txq_ctrl *txq_ctrl;
744 	uint32_t i;
745 	struct rte_hairpin_peer_info peer = {0xffffff};
746 	struct rte_hairpin_peer_info cur;
747 	const struct rte_eth_hairpin_conf *conf;
748 	uint16_t num_q = 0;
749 	uint16_t local_port = priv->dev_data->port_id;
750 	uint32_t manual;
751 	uint32_t explicit;
752 	uint16_t rx_queue;
753 
754 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
755 		rte_errno = ENODEV;
756 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
757 		return -rte_errno;
758 	}
759 	/*
760 	 * Before binding a TxQ to its peer RxQ, a first pass over the queues
761 	 * checks their configuration consistency. This takes a little time but
762 	 * is better than having to roll back afterwards.
763 	 */
764 	for (i = 0; i != priv->txqs_n; i++) {
765 		txq_ctrl = mlx5_txq_get(dev, i);
766 		if (txq_ctrl == NULL)
767 			continue;
768 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
769 			mlx5_txq_release(dev, i);
770 			continue;
771 		}
772 		/*
773 		 * All hairpin Tx queues of a single port that are connected to the
774 		 * same peer Rx port should have the same "auto binding" and
775 		 * "implicit Tx flow" modes.
776 		 * Peer consistency checking will be done in per queue binding.
777 		 */
778 		conf = &txq_ctrl->hairpin_conf;
779 		if (conf->peers[0].port == rx_port) {
780 			if (num_q == 0) {
781 				manual = conf->manual_bind;
782 				explicit = conf->tx_explicit;
783 			} else {
784 				if (manual != conf->manual_bind ||
785 				    explicit != conf->tx_explicit) {
786 					rte_errno = EINVAL;
787 					DRV_LOG(ERR, "port %u queue %d mode"
788 						" mismatch: %u %u, %u %u",
789 						local_port, i, manual,
790 						conf->manual_bind, explicit,
791 						conf->tx_explicit);
792 					mlx5_txq_release(dev, i);
793 					return -rte_errno;
794 				}
795 			}
796 			num_q++;
797 		}
798 		mlx5_txq_release(dev, i);
799 	}
800 	/* If no queue is configured, return success directly. */
801 	if (num_q == 0)
802 		return ret;
803 	/* All the hairpin TX queues need to be traversed again. */
804 	for (i = 0; i != priv->txqs_n; i++) {
805 		txq_ctrl = mlx5_txq_get(dev, i);
806 		if (txq_ctrl == NULL)
807 			continue;
808 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
809 			mlx5_txq_release(dev, i);
810 			continue;
811 		}
812 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
813 			mlx5_txq_release(dev, i);
814 			continue;
815 		}
816 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
817 		/*
818 		 * Fetch peer RxQ's information.
819 		 * No need to pass the information of the current queue.
820 		 */
821 		ret = rte_eth_hairpin_queue_peer_update(rx_port, rx_queue,
822 							NULL, &peer, 1);
823 		if (ret != 0) {
824 			mlx5_txq_release(dev, i);
825 			goto error;
826 		}
827 		/* Accessing its own device, inside mlx5 PMD. */
828 		ret = mlx5_hairpin_queue_peer_bind(dev, i, &peer, 1);
829 		if (ret != 0) {
830 			mlx5_txq_release(dev, i);
831 			goto error;
832 		}
833 		/* Pass TxQ's information to peer RxQ and try binding. */
834 		cur.peer_q = rx_queue;
835 		cur.qp_id = txq_ctrl->obj->sq->id;
836 		cur.vhca_id = priv->config.hca_attr.vhca_id;
837 		cur.tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
838 		cur.manual_bind = txq_ctrl->hairpin_conf.manual_bind;
839 		/*
840 		 * Accessing another device has to go through the RTE-level
841 		 * private (driver-internal) function.
842 		 */
843 		ret = rte_eth_hairpin_queue_peer_bind(rx_port, rx_queue,
844 						      &cur, 0);
845 		if (ret != 0) {
846 			mlx5_txq_release(dev, i);
847 			goto error;
848 		}
849 		mlx5_txq_release(dev, i);
850 	}
851 	return 0;
852 error:
853 	/*
854 	 * Roll back the queues that were already bound.
855 	 * No need to check the return value of the queue unbind function.
856 	 */
857 	do {
858 		/* No validation is needed here. */
859 		txq_ctrl = mlx5_txq_get(dev, i);
860 		if (txq_ctrl == NULL)
861 			continue;
862 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
863 		rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
864 		mlx5_hairpin_queue_peer_unbind(dev, i, 1);
865 		mlx5_txq_release(dev, i);
866 	} while (i--);
867 	return ret;
868 }
869 
870 /*
871  * Unbind the hairpin port pair. The HW configuration of both devices will be
872  * cleared and the status will be reset for all the queues used between them.
873  * This function only supports unbinding the Tx side from a single Rx port.
874  *
875  * @param dev
876  *   Pointer to Ethernet device structure.
877  * @param rx_port
878  *   Port identifier of the Rx port.
879  *
880  * @return
881  *   0 on success, a negative errno value otherwise and rte_errno is set.
882  */
883 static int
884 mlx5_hairpin_unbind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
885 {
886 	struct mlx5_priv *priv = dev->data->dev_private;
887 	struct mlx5_txq_ctrl *txq_ctrl;
888 	uint32_t i;
889 	int ret;
890 	uint16_t cur_port = priv->dev_data->port_id;
891 
892 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
893 		rte_errno = ENODEV;
894 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
895 		return -rte_errno;
896 	}
897 	for (i = 0; i != priv->txqs_n; i++) {
898 		uint16_t rx_queue;
899 
900 		txq_ctrl = mlx5_txq_get(dev, i);
901 		if (txq_ctrl == NULL)
902 			continue;
903 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
904 			mlx5_txq_release(dev, i);
905 			continue;
906 		}
907 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
908 			mlx5_txq_release(dev, i);
909 			continue;
910 		}
911 		/* Indeed, only the first used queue needs to be checked. */
912 		if (txq_ctrl->hairpin_conf.manual_bind == 0) {
913 			if (cur_port != rx_port) {
914 				rte_errno = EINVAL;
915 				DRV_LOG(ERR, "port %u and port %u are in"
916 					" auto-bind mode", cur_port, rx_port);
917 				mlx5_txq_release(dev, i);
918 				return -rte_errno;
919 			} else {
920 				return 0;
921 			}
922 		}
923 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
924 		mlx5_txq_release(dev, i);
925 		ret = rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
926 		if (ret) {
927 			DRV_LOG(ERR, "port %u Rx queue %d unbind - failure",
928 				rx_port, rx_queue);
929 			return ret;
930 		}
931 		ret = mlx5_hairpin_queue_peer_unbind(dev, i, 1);
932 		if (ret) {
933 			DRV_LOG(ERR, "port %u Tx queue %d unbind - failure",
934 				cur_port, i);
935 			return ret;
936 		}
937 	}
938 	return 0;
939 }
940 
941 /*
942  * Bind hairpin ports; the Rx side can be all ports when RTE_MAX_ETHPORTS is used.
943  * @see mlx5_hairpin_bind_single_port()
944  */
945 int
946 mlx5_hairpin_bind(struct rte_eth_dev *dev, uint16_t rx_port)
947 {
948 	int ret = 0;
949 	uint16_t p, pp;
950 
951 	/*
952 	 * If the Rx port has no hairpin configuration with the current port,
953 	 * the binding will be skipped inside the single-port bind function.
954 	 * The device started status is checked only right before the queue
955 	 * information is updated.
956 	 */
957 	if (rx_port == RTE_MAX_ETHPORTS) {
958 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
959 			ret = mlx5_hairpin_bind_single_port(dev, p);
960 			if (ret != 0)
961 				goto unbind;
962 		}
963 		return ret;
964 	} else {
965 		return mlx5_hairpin_bind_single_port(dev, rx_port);
966 	}
967 unbind:
968 	MLX5_ETH_FOREACH_DEV(pp, dev->device)
969 		if (pp < p)
970 			mlx5_hairpin_unbind_single_port(dev, pp);
971 	return ret;
972 }
973 
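/*
 * Application-side sketch (illustrative): manual binding is reached through
 * the generic ethdev API once both ports are started. Passing RTE_MAX_ETHPORTS
 * as the Rx port binds the Tx port to all of its configured peers.
 *
 *	ret = rte_eth_dev_start(tx_port);
 *	ret = rte_eth_dev_start(rx_port);
 *	ret = rte_eth_hairpin_bind(tx_port, rx_port);
 *	...
 *	ret = rte_eth_hairpin_unbind(tx_port, rx_port);
 */
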
974 /*
975  * Unbind hairpin ports; the Rx side can be all ports when RTE_MAX_ETHPORTS is used.
976  * @see mlx5_hairpin_unbind_single_port()
977  */
978 int
979 mlx5_hairpin_unbind(struct rte_eth_dev *dev, uint16_t rx_port)
980 {
981 	int ret = 0;
982 	uint16_t p;
983 
984 	if (rx_port == RTE_MAX_ETHPORTS)
985 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
986 			ret = mlx5_hairpin_unbind_single_port(dev, p);
987 			if (ret != 0)
988 				return ret;
989 		}
990 	else
991 		ret = mlx5_hairpin_unbind_single_port(dev, rx_port);
992 	return ret;
993 }
994 
995 /*
996  * DPDK callback to get the hairpin peer ports list.
997  * This will return the actual number of peer ports and save the identifiers
998  * into the array (sorted, and possibly in a different order from the one used
999  * when setting up the hairpin peer queues).
1000  * The peer port ID could be the same as the port ID of the current device.
1001  *
1002  * @param dev
1003  *   Pointer to Ethernet device structure.
1004  * @param peer_ports
1005  *   Pointer to array to save the port identifiers.
1006  * @param len
1007  *   The length of the array.
1008  * @param direction
1009  *   Current port to peer port direction.
1010  *   positive - current used as Tx to get all peer Rx ports.
1011  *   zero - current used as Rx to get all peer Tx ports.
1012  *
1013  * @return
1014  *   0 or a positive value on success (the actual number of peer ports),
1015  *   a negative errno value otherwise and rte_errno is set.
1016  */
1017 int
1018 mlx5_hairpin_get_peer_ports(struct rte_eth_dev *dev, uint16_t *peer_ports,
1019 			    size_t len, uint32_t direction)
1020 {
1021 	struct mlx5_priv *priv = dev->data->dev_private;
1022 	struct mlx5_txq_ctrl *txq_ctrl;
1023 	uint32_t i;
1024 	uint16_t pp;
1025 	uint32_t bits[(RTE_MAX_ETHPORTS + 31) / 32] = {0};
1026 	int ret = 0;
1027 
1028 	if (direction) {
1029 		for (i = 0; i < priv->txqs_n; i++) {
1030 			txq_ctrl = mlx5_txq_get(dev, i);
1031 			if (!txq_ctrl)
1032 				continue;
1033 			if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
1034 				mlx5_txq_release(dev, i);
1035 				continue;
1036 			}
1037 			pp = txq_ctrl->hairpin_conf.peers[0].port;
1038 			if (pp >= RTE_MAX_ETHPORTS) {
1039 				rte_errno = ERANGE;
1040 				mlx5_txq_release(dev, i);
1041 				DRV_LOG(ERR, "port %hu queue %u peer port "
1042 					"out of range %hu",
1043 					priv->dev_data->port_id, i, pp);
1044 				return -rte_errno;
1045 			}
1046 			bits[pp / 32] |= 1 << (pp % 32);
1047 			mlx5_txq_release(dev, i);
1048 		}
1049 	} else {
1050 		for (i = 0; i < priv->rxqs_n; i++) {
1051 			struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, i);
1052 			struct mlx5_rxq_ctrl *rxq_ctrl;
1053 
1054 			if (rxq == NULL)
1055 				continue;
1056 			rxq_ctrl = rxq->ctrl;
1057 			if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN)
1058 				continue;
1059 			pp = rxq->hairpin_conf.peers[0].port;
1060 			if (pp >= RTE_MAX_ETHPORTS) {
1061 				rte_errno = ERANGE;
1062 				DRV_LOG(ERR, "port %hu queue %u peer port "
1063 					"out of range %hu",
1064 					priv->dev_data->port_id, i, pp);
1065 				return -rte_errno;
1066 			}
1067 			bits[pp / 32] |= 1 << (pp % 32);
1068 		}
1069 	}
1070 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1071 		if (bits[i / 32] & (1 << (i % 32))) {
1072 			if ((size_t)ret >= len) {
1073 				rte_errno = E2BIG;
1074 				return -rte_errno;
1075 			}
1076 			peer_ports[ret++] = i;
1077 		}
1078 	}
1079 	return ret;
1080 }
1081 
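/*
 * Application-side sketch (illustrative): this list is reached through the
 * generic ethdev wrapper; direction = 1 lists the peer Rx ports of a Tx port,
 * direction = 0 lists the peer Tx ports of an Rx port.
 *
 *	uint16_t peers[RTE_MAX_ETHPORTS];
 *	int n = rte_eth_hairpin_get_peer_ports(tx_port, peers,
 *					       RTE_DIM(peers), 1);
 *	for (int k = 0; k < n; k++)
 *		ret = rte_eth_hairpin_bind(tx_port, peers[k]);
 */
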
1082 /**
1083  * DPDK callback to start the device.
1084  *
1085  * Simulate device start by attaching all configured flows.
1086  *
1087  * @param dev
1088  *   Pointer to Ethernet device structure.
1089  *
1090  * @return
1091  *   0 on success, a negative errno value otherwise and rte_errno is set.
1092  */
1093 int
1094 mlx5_dev_start(struct rte_eth_dev *dev)
1095 {
1096 	struct mlx5_priv *priv = dev->data->dev_private;
1097 	int ret;
1098 	int fine_inline;
1099 
1100 	DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id);
1101 	fine_inline = rte_mbuf_dynflag_lookup
1102 		(RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, NULL);
1103 	if (fine_inline >= 0)
1104 		rte_net_mlx5_dynf_inline_mask = 1UL << fine_inline;
1105 	else
1106 		rte_net_mlx5_dynf_inline_mask = 0;
1107 	if (dev->data->nb_rx_queues > 0) {
1108 		ret = mlx5_dev_configure_rss_reta(dev);
1109 		if (ret) {
1110 			DRV_LOG(ERR, "port %u reta config failed: %s",
1111 				dev->data->port_id, strerror(rte_errno));
1112 			return -rte_errno;
1113 		}
1114 	}
1115 	ret = mlx5_txpp_start(dev);
1116 	if (ret) {
1117 		DRV_LOG(ERR, "port %u Tx packet pacing init failed: %s",
1118 			dev->data->port_id, strerror(rte_errno));
1119 		goto error;
1120 	}
1121 	if ((priv->sh->devx && priv->config.dv_flow_en &&
1122 	    priv->config.dest_tir) && priv->obj_ops.lb_dummy_queue_create) {
1123 		ret = priv->obj_ops.lb_dummy_queue_create(dev);
1124 		if (ret)
1125 			goto error;
1126 	}
1127 	ret = mlx5_txq_start(dev);
1128 	if (ret) {
1129 		DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
1130 			dev->data->port_id, strerror(rte_errno));
1131 		goto error;
1132 	}
1133 	if (priv->config.std_delay_drop || priv->config.hp_delay_drop) {
1134 		if (!priv->config.vf && !priv->config.sf &&
1135 		    !priv->representor) {
1136 			ret = mlx5_get_flag_dropless_rq(dev);
1137 			if (ret < 0)
1138 				DRV_LOG(WARNING,
1139 					"port %u cannot query dropless flag",
1140 					dev->data->port_id);
1141 			else if (!ret)
1142 				DRV_LOG(WARNING,
1143 					"port %u dropless_rq OFF, no rearming",
1144 					dev->data->port_id);
1145 		} else {
1146 			DRV_LOG(DEBUG,
1147 				"port %u doesn't support dropless_rq flag",
1148 				dev->data->port_id);
1149 		}
1150 	}
1151 	ret = mlx5_rxq_start(dev);
1152 	if (ret) {
1153 		DRV_LOG(ERR, "port %u Rx queue allocation failed: %s",
1154 			dev->data->port_id, strerror(rte_errno));
1155 		goto error;
1156 	}
1157 	/*
1158 	 * This step will be skipped if there is no hairpin Tx queue configured
1159 	 * with an Rx peer queue from the same device.
1160 	 */
1161 	ret = mlx5_hairpin_auto_bind(dev);
1162 	if (ret) {
1163 		DRV_LOG(ERR, "port %u hairpin auto binding failed: %s",
1164 			dev->data->port_id, strerror(rte_errno));
1165 		goto error;
1166 	}
1167 	/* Set started flag here for the following steps like control flow. */
1168 	dev->data->dev_started = 1;
1169 	ret = mlx5_rx_intr_vec_enable(dev);
1170 	if (ret) {
1171 		DRV_LOG(ERR, "port %u Rx interrupt vector creation failed",
1172 			dev->data->port_id);
1173 		goto error;
1174 	}
1175 	mlx5_os_stats_init(dev);
1176 	/*
1177 	 * Attach indirection table objects detached on port stop.
1178 	 * They may be needed to create RSS in non-isolated mode.
1179 	 */
1180 	ret = mlx5_action_handle_attach(dev);
1181 	if (ret) {
1182 		DRV_LOG(ERR,
1183 			"port %u failed to attach indirect actions: %s",
1184 			dev->data->port_id, rte_strerror(rte_errno));
1185 		goto error;
1186 	}
1187 	ret = mlx5_traffic_enable(dev);
1188 	if (ret) {
1189 		DRV_LOG(ERR, "port %u failed to set defaults flows",
1190 			dev->data->port_id);
1191 		goto error;
1192 	}
1193 	/* Set a mask and offset of dynamic metadata flows into Rx queues. */
1194 	mlx5_flow_rxq_dynf_metadata_set(dev);
1195 	/* Set flags and context to convert Rx timestamps. */
1196 	mlx5_rxq_timestamp_set(dev);
1197 	/* Set a mask and offset of scheduling on timestamp into Tx queues. */
1198 	mlx5_txq_dynf_timestamp_set(dev);
1199 	/*
1200 	 * In non-cached mode, only the default mreg copy action needs to be
1201 	 * started, since no application-created flow exists anymore at this
1202 	 * point. It is still worth wrapping this in an interface for further usage.
1203 	 */
1204 	ret = mlx5_flow_start_default(dev);
1205 	if (ret) {
1206 		DRV_LOG(DEBUG, "port %u failed to start default actions: %s",
1207 			dev->data->port_id, strerror(rte_errno));
1208 		goto error;
1209 	}
1210 	if (mlx5_dev_ctx_shared_mempool_subscribe(dev) != 0) {
1211 		DRV_LOG(ERR, "port %u failed to subscribe for mempool life cycle: %s",
1212 			dev->data->port_id, rte_strerror(rte_errno));
1213 		goto error;
1214 	}
1215 	rte_wmb();
1216 	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
1217 	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
1218 	/* Enable datapath on secondary process. */
1219 	mlx5_mp_os_req_start_rxtx(dev);
1220 	if (rte_intr_fd_get(priv->sh->intr_handle) >= 0) {
1221 		priv->sh->port[priv->dev_port - 1].ih_port_id =
1222 					(uint32_t)dev->data->port_id;
1223 	} else {
1224 		DRV_LOG(INFO, "port %u starts without LSC and RMV interrupts.",
1225 			dev->data->port_id);
1226 		dev->data->dev_conf.intr_conf.lsc = 0;
1227 		dev->data->dev_conf.intr_conf.rmv = 0;
1228 	}
1229 	if (rte_intr_fd_get(priv->sh->intr_handle_devx) >= 0)
1230 		priv->sh->port[priv->dev_port - 1].devx_ih_port_id =
1231 					(uint32_t)dev->data->port_id;
1232 	return 0;
1233 error:
1234 	ret = rte_errno; /* Save rte_errno before cleanup. */
1235 	/* Rollback. */
1236 	dev->data->dev_started = 0;
1237 	mlx5_flow_stop_default(dev);
1238 	mlx5_traffic_disable(dev);
1239 	mlx5_txq_stop(dev);
1240 	mlx5_rxq_stop(dev);
1241 	if (priv->obj_ops.lb_dummy_queue_release)
1242 		priv->obj_ops.lb_dummy_queue_release(dev);
1243 	mlx5_txpp_stop(dev); /* Stop last. */
1244 	rte_errno = ret; /* Restore rte_errno. */
1245 	return -rte_errno;
1246 }
1247 
1248 /**
1249  * DPDK callback to stop the device.
1250  *
1251  * Simulate device stop by detaching all configured flows.
1252  *
1253  * @param dev
1254  *   Pointer to Ethernet device structure.
1255  */
1256 int
1257 mlx5_dev_stop(struct rte_eth_dev *dev)
1258 {
1259 	struct mlx5_priv *priv = dev->data->dev_private;
1260 
1261 	dev->data->dev_started = 0;
1262 	/* Prevent crashes when queues are still in use. */
1263 	dev->rx_pkt_burst = removed_rx_burst;
1264 	dev->tx_pkt_burst = removed_tx_burst;
1265 	rte_wmb();
1266 	/* Disable datapath on secondary process. */
1267 	mlx5_mp_os_req_stop_rxtx(dev);
1268 	rte_delay_us_sleep(1000 * priv->rxqs_n);
1269 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
1270 	mlx5_flow_stop_default(dev);
1271 	/* Control flows for default traffic can be removed first. */
1272 	mlx5_traffic_disable(dev);
1273 	/* All RX queue flags will be cleared in the flush interface. */
1274 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_GEN, true);
1275 	mlx5_flow_meter_rxq_flush(dev);
1276 	mlx5_action_handle_detach(dev);
1277 	mlx5_rx_intr_vec_disable(dev);
1278 	priv->sh->port[priv->dev_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1279 	priv->sh->port[priv->dev_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
1280 	mlx5_txq_stop(dev);
1281 	mlx5_rxq_stop(dev);
1282 	if (priv->obj_ops.lb_dummy_queue_release)
1283 		priv->obj_ops.lb_dummy_queue_release(dev);
1284 	mlx5_txpp_stop(dev);
1285 
1286 	return 0;
1287 }
1288 
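/*
 * Lifecycle sketch (illustrative): mlx5_dev_start()/mlx5_dev_stop() are the
 * dev_start/dev_stop callbacks reached through the generic ethdev API; a
 * typical application sequence is roughly:
 *
 *	ret = rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &port_conf);
 *	ret = rte_eth_rx_queue_setup(port_id, 0, 512, socket_id, NULL, mb_pool);
 *	ret = rte_eth_tx_queue_setup(port_id, 0, 512, socket_id, NULL);
 *	ret = rte_eth_dev_start(port_id);	// ends up in mlx5_dev_start()
 *	...
 *	ret = rte_eth_dev_stop(port_id);	// ends up in mlx5_dev_stop()
 */
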
1289 /**
1290  * Enable traffic flows configured by control plane
1291  *
1292  * @param dev
1293  * @param dev
1294  *   Pointer to Ethernet device structure.
1297  * @return
1298  *   0 on success, a negative errno value otherwise and rte_errno is set.
1299  */
1300 int
1301 mlx5_traffic_enable(struct rte_eth_dev *dev)
1302 {
1303 	struct mlx5_priv *priv = dev->data->dev_private;
1304 	struct rte_flow_item_eth bcast = {
1305 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1306 	};
1307 	struct rte_flow_item_eth ipv6_multi_spec = {
1308 		.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
1309 	};
1310 	struct rte_flow_item_eth ipv6_multi_mask = {
1311 		.dst.addr_bytes = "\xff\xff\x00\x00\x00\x00",
1312 	};
1313 	struct rte_flow_item_eth unicast = {
1314 		.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1315 	};
1316 	struct rte_flow_item_eth unicast_mask = {
1317 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1318 	};
1319 	const unsigned int vlan_filter_n = priv->vlan_filter_n;
1320 	const struct rte_ether_addr cmp = {
1321 		.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1322 	};
1323 	unsigned int i;
1324 	unsigned int j;
1325 	int ret;
1326 
1327 	/*
1328 	 * The hairpin Tx queue default flow should be created regardless of
1329 	 * isolation mode. Otherwise, all the packets to be sent will go out
1330 	 * directly without the Tx flow actions, e.g. encapsulation.
1331 	 */
1332 	for (i = 0; i != priv->txqs_n; ++i) {
1333 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
1334 		if (!txq_ctrl)
1335 			continue;
1336 		/* Only Tx implicit mode requires the default Tx flow. */
1337 		if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN &&
1338 		    txq_ctrl->hairpin_conf.tx_explicit == 0 &&
1339 		    txq_ctrl->hairpin_conf.peers[0].port ==
1340 		    priv->dev_data->port_id) {
1341 			ret = mlx5_ctrl_flow_source_queue(dev, i);
1342 			if (ret) {
1343 				mlx5_txq_release(dev, i);
1344 				goto error;
1345 			}
1346 		}
1347 		if ((priv->representor || priv->master) &&
1348 		    priv->config.dv_esw_en) {
1349 			if (mlx5_flow_create_devx_sq_miss_flow(dev, i) == 0) {
1350 				DRV_LOG(ERR,
1351 					"Port %u Tx queue %u SQ create representor devx default miss rule failed.",
1352 					dev->data->port_id, i);
1353 				goto error;
1354 			}
1355 		}
1356 		mlx5_txq_release(dev, i);
1357 	}
1358 	if ((priv->master || priv->representor) && priv->config.dv_esw_en) {
1359 		if (mlx5_flow_create_esw_table_zero_flow(dev))
1360 			priv->fdb_def_rule = 1;
1361 		else
1362 			DRV_LOG(INFO, "port %u FDB default rule cannot be"
1363 				" configured - only Eswitch group 0 flows are"
1364 				" supported.", dev->data->port_id);
1365 	}
1366 	if (!priv->config.lacp_by_user && priv->pf_bond >= 0) {
1367 		ret = mlx5_flow_lacp_miss(dev);
1368 		if (ret)
1369 			DRV_LOG(INFO, "port %u LACP rule cannot be created - "
1370 				"forward LACP to kernel.", dev->data->port_id);
1371 		else
1372 			DRV_LOG(INFO, "LACP traffic will be missed in port %u."
1373 				, dev->data->port_id);
1374 	}
1375 	if (priv->isolated)
1376 		return 0;
1377 	if (dev->data->promiscuous) {
1378 		struct rte_flow_item_eth promisc = {
1379 			.dst.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1380 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1381 			.type = 0,
1382 		};
1383 
1384 		ret = mlx5_ctrl_flow(dev, &promisc, &promisc);
1385 		if (ret)
1386 			goto error;
1387 	}
1388 	if (dev->data->all_multicast) {
1389 		struct rte_flow_item_eth multicast = {
1390 			.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
1391 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1392 			.type = 0,
1393 		};
1394 
1395 		ret = mlx5_ctrl_flow(dev, &multicast, &multicast);
1396 		if (ret)
1397 			goto error;
1398 	} else {
1399 		/* Add broadcast/multicast flows. */
1400 		for (i = 0; i != vlan_filter_n; ++i) {
1401 			uint16_t vlan = priv->vlan_filter[i];
1402 
1403 			struct rte_flow_item_vlan vlan_spec = {
1404 				.tci = rte_cpu_to_be_16(vlan),
1405 			};
1406 			struct rte_flow_item_vlan vlan_mask =
1407 				rte_flow_item_vlan_mask;
1408 
1409 			ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast,
1410 						  &vlan_spec, &vlan_mask);
1411 			if (ret)
1412 				goto error;
1413 			ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec,
1414 						  &ipv6_multi_mask,
1415 						  &vlan_spec, &vlan_mask);
1416 			if (ret)
1417 				goto error;
1418 		}
1419 		if (!vlan_filter_n) {
1420 			ret = mlx5_ctrl_flow(dev, &bcast, &bcast);
1421 			if (ret)
1422 				goto error;
1423 			ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec,
1424 					     &ipv6_multi_mask);
1425 			if (ret) {
1426 				/* Do not fail on IPv6 broadcast creation failure. */
1427 				DRV_LOG(WARNING,
1428 					"IPv6 broadcast is not supported");
1429 				ret = 0;
1430 			}
1431 		}
1432 	}
1433 	/* Add MAC address flows. */
1434 	for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
1435 		struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
1436 
1437 		if (!memcmp(mac, &cmp, sizeof(*mac)))
1438 			continue;
1439 		memcpy(&unicast.dst.addr_bytes,
1440 		       mac->addr_bytes,
1441 		       RTE_ETHER_ADDR_LEN);
1442 		for (j = 0; j != vlan_filter_n; ++j) {
1443 			uint16_t vlan = priv->vlan_filter[j];
1444 
1445 			struct rte_flow_item_vlan vlan_spec = {
1446 				.tci = rte_cpu_to_be_16(vlan),
1447 			};
1448 			struct rte_flow_item_vlan vlan_mask =
1449 				rte_flow_item_vlan_mask;
1450 
1451 			ret = mlx5_ctrl_flow_vlan(dev, &unicast,
1452 						  &unicast_mask,
1453 						  &vlan_spec,
1454 						  &vlan_mask);
1455 			if (ret)
1456 				goto error;
1457 		}
1458 		if (!vlan_filter_n) {
1459 			ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask);
1460 			if (ret)
1461 				goto error;
1462 		}
1463 	}
1464 	return 0;
1465 error:
1466 	ret = rte_errno; /* Save rte_errno before cleanup. */
1467 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1468 	rte_errno = ret; /* Restore rte_errno. */
1469 	return -rte_errno;
1470 }
1471 
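/*
 * Note (illustrative): the early return on priv->isolated above corresponds
 * to flow isolated mode requested by the application, e.g.:
 *
 *	struct rte_flow_error error;
 *	ret = rte_flow_isolate(port_id, 1, &error);
 *
 * In that mode none of the control flows (promiscuous, all-multicast,
 * broadcast, MAC and VLAN) are installed and only explicit rte_flow rules
 * receive traffic.
 */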
1472 
1473 /**
1474  * Disable traffic flows configured by control plane
1475  *
1476  * @param dev
1477  *   Pointer to Ethernet device private data.
1478  *   Pointer to Ethernet device structure.
1479 void
1480 mlx5_traffic_disable(struct rte_eth_dev *dev)
1481 {
1482 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1483 }
1484 
1485 /**
1486  * Restart traffic flows configured by control plane
1487  *
1488  * @param dev
1489  *   Pointer to Ethernet device private data.
1490  *   Pointer to Ethernet device structure.
1491  * @return
1492  *   0 on success, a negative errno value otherwise and rte_errno is set.
1493  */
1494 int
1495 mlx5_traffic_restart(struct rte_eth_dev *dev)
1496 {
1497 	if (dev->data->dev_started) {
1498 		mlx5_traffic_disable(dev);
1499 		return mlx5_traffic_enable(dev);
1500 	}
1501 	return 0;
1502 }
1503