xref: /dpdk/drivers/net/mlx5/mlx5_trigger.c (revision 69f9d8aa357d2299e057b7e335f340e20a0c5e7e)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <unistd.h>
7 
8 #include <rte_ether.h>
9 #include <ethdev_driver.h>
10 #include <rte_interrupts.h>
11 #include <rte_alarm.h>
12 #include <rte_cycles.h>
13 
14 #include <mlx5_malloc.h>
15 
16 #include "mlx5.h"
17 #include "mlx5_flow.h"
18 #include "mlx5_rx.h"
19 #include "mlx5_tx.h"
20 #include "mlx5_utils.h"
21 #include "rte_pmd_mlx5.h"
22 
23 /**
24  * Stop traffic on Tx queues.
25  *
26  * @param dev
27  *   Pointer to Ethernet device structure.
28  */
29 static void
30 mlx5_txq_stop(struct rte_eth_dev *dev)
31 {
32 	struct mlx5_priv *priv = dev->data->dev_private;
33 	unsigned int i;
34 
35 	for (i = 0; i != priv->txqs_n; ++i)
36 		mlx5_txq_release(dev, i);
37 }
38 
39 /**
40  * Start traffic on Tx queues.
41  *
42  * @param dev
43  *   Pointer to Ethernet device structure.
44  *
45  * @return
46  *   0 on success, a negative errno value otherwise and rte_errno is set.
47  */
48 static int
49 mlx5_txq_start(struct rte_eth_dev *dev)
50 {
51 	struct mlx5_priv *priv = dev->data->dev_private;
52 	unsigned int i;
53 	int ret;
54 
55 	for (i = 0; i != priv->txqs_n; ++i) {
56 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
57 		struct mlx5_txq_data *txq_data;
58 		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
59 		if (!txq_ctrl)
60 			continue;
61 		txq_data = &txq_ctrl->txq;
62 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD)
63 			txq_alloc_elts(txq_ctrl);
64 		MLX5_ASSERT(!txq_ctrl->obj);
65 		txq_ctrl->obj = mlx5_malloc(flags, sizeof(struct mlx5_txq_obj),
66 					    0, txq_ctrl->socket);
67 		if (!txq_ctrl->obj) {
68 			DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
69 				"memory resources.", dev->data->port_id,
70 				txq_data->idx);
71 			rte_errno = ENOMEM;
72 			goto error;
73 		}
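		/*
		 * Create the Tx queue HW object; obj_ops dispatches to the
		 * Verbs or DevX implementation selected at device probe.
		 */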
74 		ret = priv->obj_ops.txq_obj_new(dev, i);
75 		if (ret < 0) {
76 			mlx5_free(txq_ctrl->obj);
77 			txq_ctrl->obj = NULL;
78 			goto error;
79 		}
80 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD) {
81 			size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
82 
83 			txq_data->fcqs = mlx5_malloc(flags, size,
84 						     RTE_CACHE_LINE_SIZE,
85 						     txq_ctrl->socket);
86 			if (!txq_data->fcqs) {
87 				DRV_LOG(ERR, "Port %u Tx queue %u cannot "
88 					"allocate memory (FCQ).",
89 					dev->data->port_id, i);
90 				rte_errno = ENOMEM;
91 				goto error;
92 			}
93 		}
94 		DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
95 			dev->data->port_id, i, (void *)&txq_ctrl->obj);
96 		LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
97 	}
98 	return 0;
99 error:
100 	ret = rte_errno; /* Save rte_errno before cleanup. */
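	/* Release the queue that failed and every queue started before it. */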
101 	do {
102 		mlx5_txq_release(dev, i);
103 	} while (i-- != 0);
104 	rte_errno = ret; /* Restore rte_errno. */
105 	return -rte_errno;
106 }
107 
108 /**
109  * Translate the chunk address to MR key in order to put it into the cache.
110  */
111 static void
112 mlx5_rxq_mempool_register_cb(struct rte_mempool *mp, void *opaque,
113 			     struct rte_mempool_memhdr *memhdr,
114 			     unsigned int idx)
115 {
116 	struct mlx5_rxq_data *rxq = opaque;
117 
118 	RTE_SET_USED(mp);
119 	RTE_SET_USED(idx);
120 	mlx5_rx_addr2mr(rxq, (uintptr_t)memhdr->addr);
121 }
122 
123 /**
124  * Register Rx queue mempools and fill the Rx queue cache.
125  * This function tolerates repeated mempool registration.
126  *
127  * @param[in] rxq_ctrl
128  *   Rx queue control data.
129  *
130  * @return
131  *   0 on success, (-1) on failure and rte_errno is set.
132  */
133 static int
134 mlx5_rxq_mempool_register(struct mlx5_rxq_ctrl *rxq_ctrl)
135 {
136 	struct rte_mempool *mp;
137 	uint32_t s;
138 	int ret = 0;
139 
140 	mlx5_mr_flush_local_cache(&rxq_ctrl->rxq.mr_ctrl);
141 	/* MPRQ mempool is registered on creation, just fill the cache. */
142 	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) {
143 		rte_mempool_mem_iter(rxq_ctrl->rxq.mprq_mp,
144 				     mlx5_rxq_mempool_register_cb,
145 				     &rxq_ctrl->rxq);
146 		return 0;
147 	}
148 	for (s = 0; s < rxq_ctrl->rxq.rxseg_n; s++) {
149 		uint32_t flags;
150 
151 		mp = rxq_ctrl->rxq.rxseg[s].mp;
152 		flags = rte_pktmbuf_priv_flags(mp);
153 		ret = mlx5_mr_mempool_register(rxq_ctrl->sh->cdev, mp);
154 		if (ret < 0 && rte_errno != EEXIST)
155 			return ret;
156 		if ((flags & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) == 0)
157 			rte_mempool_mem_iter(mp, mlx5_rxq_mempool_register_cb,
158 					&rxq_ctrl->rxq);
159 	}
160 	return 0;
161 }
162 
163 /**
164  * Stop traffic on Rx queues.
165  *
166  * @param dev
167  *   Pointer to Ethernet device structure.
168  */
169 static void
170 mlx5_rxq_stop(struct rte_eth_dev *dev)
171 {
172 	struct mlx5_priv *priv = dev->data->dev_private;
173 	unsigned int i;
174 
175 	for (i = 0; i != priv->rxqs_n; ++i)
176 		mlx5_rxq_release(dev, i);
177 }
178 
179 static int
180 mlx5_rxq_ctrl_prepare(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
181 		      unsigned int idx)
182 {
183 	int ret = 0;
184 
185 	if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) {
186 		/*
187 		 * Pre-register the mempools. Regardless of whether
188 		 * the implicit registration is enabled or not,
189 		 * Rx mempool destruction is tracked to free MRs.
190 		 */
191 		if (mlx5_rxq_mempool_register(rxq_ctrl) < 0)
192 			return -rte_errno;
193 		ret = rxq_alloc_elts(rxq_ctrl);
194 		if (ret)
195 			return ret;
196 	}
197 	MLX5_ASSERT(!rxq_ctrl->obj);
198 	rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
199 				    sizeof(*rxq_ctrl->obj), 0,
200 				    rxq_ctrl->socket);
201 	if (!rxq_ctrl->obj) {
202 		DRV_LOG(ERR, "Port %u Rx queue %u can't allocate resources.",
203 			dev->data->port_id, idx);
204 		rte_errno = ENOMEM;
205 		return -rte_errno;
206 	}
207 	DRV_LOG(DEBUG, "Port %u rxq %u updated with %p.", dev->data->port_id,
208 		idx, (void *)&rxq_ctrl->obj);
209 	return 0;
210 }
211 
212 /**
213  * Start traffic on Rx queues.
214  *
215  * @param dev
216  *   Pointer to Ethernet device structure.
217  *
218  * @return
219  *   0 on success, a negative errno value otherwise and rte_errno is set.
220  */
221 static int
222 mlx5_rxq_start(struct rte_eth_dev *dev)
223 {
224 	struct mlx5_priv *priv = dev->data->dev_private;
225 	unsigned int i;
226 	int ret = 0;
227 
228 	/* Allocate/reuse/resize mempool for Multi-Packet RQ. */
229 	if (mlx5_mprq_alloc_mp(dev)) {
230 		/* Should not release Rx queues but return immediately. */
231 		return -rte_errno;
232 	}
233 	DRV_LOG(DEBUG, "Port %u device_attr.max_qp_wr is %d.",
234 		dev->data->port_id, priv->sh->device_attr.max_qp_wr);
235 	DRV_LOG(DEBUG, "Port %u device_attr.max_sge is %d.",
236 		dev->data->port_id, priv->sh->device_attr.max_sge);
237 	for (i = 0; i != priv->rxqs_n; ++i) {
238 		struct mlx5_rxq_priv *rxq = mlx5_rxq_ref(dev, i);
239 		struct mlx5_rxq_ctrl *rxq_ctrl;
240 
241 		if (rxq == NULL)
242 			continue;
243 		rxq_ctrl = rxq->ctrl;
244 		if (!rxq_ctrl->started) {
245 			if (mlx5_rxq_ctrl_prepare(dev, rxq_ctrl, i) < 0)
246 				goto error;
247 			LIST_INSERT_HEAD(&priv->rxqsobj, rxq_ctrl->obj, next);
248 		}
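		/* Create the Rx queue HW object (Verbs or DevX, depending on obj_ops). */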
249 		ret = priv->obj_ops.rxq_obj_new(rxq);
250 		if (ret) {
251 			mlx5_free(rxq_ctrl->obj);
252 			rxq_ctrl->obj = NULL;
253 			goto error;
254 		}
255 		rxq_ctrl->started = true;
256 	}
257 	return 0;
258 error:
259 	ret = rte_errno; /* Save rte_errno before cleanup. */
260 	do {
261 		mlx5_rxq_release(dev, i);
262 	} while (i-- != 0);
263 	rte_errno = ret; /* Restore rte_errno. */
264 	return -rte_errno;
265 }
266 
267 /**
268  * Bind Tx queues to their peer Rx queues for hairpin.
269  *
270  * Only hairpin queues peering with the same port in auto-bind mode are handled.
271  *
272  * @param dev
273  *   Pointer to Ethernet device structure.
274  *
275  * @return
276  *   0 on success, a negative errno value otherwise and rte_errno is set.
277  */
278 static int
279 mlx5_hairpin_auto_bind(struct rte_eth_dev *dev)
280 {
281 	struct mlx5_priv *priv = dev->data->dev_private;
282 	struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
283 	struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
284 	struct mlx5_txq_ctrl *txq_ctrl;
285 	struct mlx5_rxq_priv *rxq;
286 	struct mlx5_rxq_ctrl *rxq_ctrl;
287 	struct mlx5_devx_obj *sq;
288 	struct mlx5_devx_obj *rq;
289 	unsigned int i;
290 	int ret = 0;
291 	bool need_auto = false;
292 	uint16_t self_port = dev->data->port_id;
293 
294 	for (i = 0; i != priv->txqs_n; ++i) {
295 		txq_ctrl = mlx5_txq_get(dev, i);
296 		if (!txq_ctrl)
297 			continue;
298 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
299 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
300 			mlx5_txq_release(dev, i);
301 			continue;
302 		}
303 		if (txq_ctrl->hairpin_conf.manual_bind) {
304 			mlx5_txq_release(dev, i);
305 			return 0;
306 		}
307 		need_auto = true;
308 		mlx5_txq_release(dev, i);
309 	}
310 	if (!need_auto)
311 		return 0;
312 	for (i = 0; i != priv->txqs_n; ++i) {
313 		txq_ctrl = mlx5_txq_get(dev, i);
314 		if (!txq_ctrl)
315 			continue;
316 		/* Skip hairpin queues with other peer ports. */
317 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
318 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
319 			mlx5_txq_release(dev, i);
320 			continue;
321 		}
322 		if (!txq_ctrl->obj) {
323 			rte_errno = ENOMEM;
324 			DRV_LOG(ERR, "port %u no txq object found: %d",
325 				dev->data->port_id, i);
326 			mlx5_txq_release(dev, i);
327 			return -rte_errno;
328 		}
329 		sq = txq_ctrl->obj->sq;
330 		rxq = mlx5_rxq_get(dev, txq_ctrl->hairpin_conf.peers[0].queue);
331 		if (rxq == NULL) {
332 			mlx5_txq_release(dev, i);
333 			rte_errno = EINVAL;
334 			DRV_LOG(ERR, "port %u no rxq object found: %d",
335 				dev->data->port_id,
336 				txq_ctrl->hairpin_conf.peers[0].queue);
337 			return -rte_errno;
338 		}
339 		rxq_ctrl = rxq->ctrl;
340 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN ||
341 		    rxq->hairpin_conf.peers[0].queue != i) {
342 			rte_errno = ENOMEM;
343 			DRV_LOG(ERR, "port %u Tx queue %d cannot be bound to "
344 				"Rx queue %d", dev->data->port_id,
345 				i, txq_ctrl->hairpin_conf.peers[0].queue);
346 			goto error;
347 		}
348 		rq = rxq_ctrl->obj->rq;
349 		if (!rq) {
350 			rte_errno = ENOMEM;
351 			DRV_LOG(ERR, "port %u hairpin no matching rxq: %d",
352 				dev->data->port_id,
353 				txq_ctrl->hairpin_conf.peers[0].queue);
354 			goto error;
355 		}
356 		sq_attr.state = MLX5_SQC_STATE_RDY;
357 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
358 		sq_attr.hairpin_peer_rq = rq->id;
359 		sq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
360 		ret = mlx5_devx_cmd_modify_sq(sq, &sq_attr);
361 		if (ret)
362 			goto error;
363 		rq_attr.state = MLX5_SQC_STATE_RDY;
364 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
365 		rq_attr.hairpin_peer_sq = sq->id;
366 		rq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
367 		ret = mlx5_devx_cmd_modify_rq(rq, &rq_attr);
368 		if (ret)
369 			goto error;
370 		/* Qs with auto-bind will be destroyed directly. */
371 		rxq->hairpin_status = 1;
372 		txq_ctrl->hairpin_status = 1;
373 		mlx5_txq_release(dev, i);
374 	}
375 	return 0;
376 error:
377 	mlx5_txq_release(dev, i);
378 	return -rte_errno;
379 }
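
/*
 * Illustrative sketch (not part of the driver): a single-port hairpin pair
 * that this auto-bind path serves. Variable names, queue indexes and
 * descriptor counts are placeholders; with manual_bind and tx_explicit left
 * at 0, the SQ/RQ pair is bound automatically during rte_eth_dev_start().
 *
 *	struct rte_eth_hairpin_conf conf = {
 *		.peer_count = 1,
 *		.peers[0] = { .port = port_id, .queue = hairpin_txq },
 *	};
 *	ret = rte_eth_rx_hairpin_queue_setup(port_id, hairpin_rxq, 512, &conf);
 *	conf.peers[0].queue = hairpin_rxq;
 *	ret = rte_eth_tx_hairpin_queue_setup(port_id, hairpin_txq, 512, &conf);
 */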
380 
381 /*
382  * Fetch the peer queue's SW & HW information.
383  *
384  * @param dev
385  *   Pointer to Ethernet device structure.
386  * @param peer_queue
387  *   Index of the queue to fetch the information from.
388  * @param current_info
389  *   Pointer to the input peer information, not used currently.
390  * @param peer_info
391  *   Pointer to the structure to store the information, output.
392  * @param direction
393  *   Positive to get the RxQ information, zero to get the TxQ information.
394  *
395  * @return
396  *   0 on success, a negative errno value otherwise and rte_errno is set.
397  */
398 int
399 mlx5_hairpin_queue_peer_update(struct rte_eth_dev *dev, uint16_t peer_queue,
400 			       struct rte_hairpin_peer_info *current_info,
401 			       struct rte_hairpin_peer_info *peer_info,
402 			       uint32_t direction)
403 {
404 	struct mlx5_priv *priv = dev->data->dev_private;
405 	RTE_SET_USED(current_info);
406 
407 	if (dev->data->dev_started == 0) {
408 		rte_errno = EBUSY;
409 		DRV_LOG(ERR, "peer port %u is not started",
410 			dev->data->port_id);
411 		return -rte_errno;
412 	}
413 	/*
414 	 * Peer port used as egress. In the current design, the hairpin Tx
415 	 * queue will be bound to the peer Rx queue, so only the peer Rx
416 	 * queue's information needs to be fetched.
417 	 */
418 	if (direction == 0) {
419 		struct mlx5_txq_ctrl *txq_ctrl;
420 
421 		txq_ctrl = mlx5_txq_get(dev, peer_queue);
422 		if (txq_ctrl == NULL) {
423 			rte_errno = EINVAL;
424 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
425 				dev->data->port_id, peer_queue);
426 			return -rte_errno;
427 		}
428 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
429 			rte_errno = EINVAL;
430 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Txq",
431 				dev->data->port_id, peer_queue);
432 			mlx5_txq_release(dev, peer_queue);
433 			return -rte_errno;
434 		}
435 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
436 			rte_errno = ENOMEM;
437 			DRV_LOG(ERR, "port %u no Txq object found: %d",
438 				dev->data->port_id, peer_queue);
439 			mlx5_txq_release(dev, peer_queue);
440 			return -rte_errno;
441 		}
442 		peer_info->qp_id = txq_ctrl->obj->sq->id;
443 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
444 		/* 1-to-1 mapping, only the first one is used. */
445 		peer_info->peer_q = txq_ctrl->hairpin_conf.peers[0].queue;
446 		peer_info->tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
447 		peer_info->manual_bind = txq_ctrl->hairpin_conf.manual_bind;
448 		mlx5_txq_release(dev, peer_queue);
449 	} else { /* Peer port used as ingress. */
450 		struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, peer_queue);
451 		struct mlx5_rxq_ctrl *rxq_ctrl;
452 
453 		if (rxq == NULL) {
454 			rte_errno = EINVAL;
455 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
456 				dev->data->port_id, peer_queue);
457 			return -rte_errno;
458 		}
459 		rxq_ctrl = rxq->ctrl;
460 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
461 			rte_errno = EINVAL;
462 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Rxq",
463 				dev->data->port_id, peer_queue);
464 			return -rte_errno;
465 		}
466 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
467 			rte_errno = ENOMEM;
468 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
469 				dev->data->port_id, peer_queue);
470 			return -rte_errno;
471 		}
472 		peer_info->qp_id = rxq_ctrl->obj->rq->id;
473 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
474 		peer_info->peer_q = rxq->hairpin_conf.peers[0].queue;
475 		peer_info->tx_explicit = rxq->hairpin_conf.tx_explicit;
476 		peer_info->manual_bind = rxq->hairpin_conf.manual_bind;
477 	}
478 	return 0;
479 }
480 
481 /*
482  * Bind the hairpin queue with the peer HW information.
483  * This needs to be called twice, once for the Tx and once for the Rx queue of a pair.
484  * If the queue is already bound, it is considered successful.
485  *
486  * @param dev
487  *   Pointer to Ethernet device structure.
488  * @param cur_queue
489  *   Index of the queue to change the HW configuration to bind.
490  * @param peer_info
491  *   Pointer to information of the peer queue.
492  * @param direction
493  *   Positive to configure the TxQ, zero to configure the RxQ.
494  *
495  * @return
496  *   0 on success, a negative errno value otherwise and rte_errno is set.
497  */
498 int
499 mlx5_hairpin_queue_peer_bind(struct rte_eth_dev *dev, uint16_t cur_queue,
500 			     struct rte_hairpin_peer_info *peer_info,
501 			     uint32_t direction)
502 {
503 	int ret = 0;
504 
505 	/*
506 	 * Consistency check of the peer queue: the opposite direction was used
507 	 * to get the peer queue info by ethdev port ID, no further check is needed.
508 	 */
509 	if (peer_info->peer_q != cur_queue) {
510 		rte_errno = EINVAL;
511 		DRV_LOG(ERR, "port %u queue %d and peer queue %d mismatch",
512 			dev->data->port_id, cur_queue, peer_info->peer_q);
513 		return -rte_errno;
514 	}
515 	if (direction != 0) {
516 		struct mlx5_txq_ctrl *txq_ctrl;
517 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
518 
519 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
520 		if (txq_ctrl == NULL) {
521 			rte_errno = EINVAL;
522 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
523 				dev->data->port_id, cur_queue);
524 			return -rte_errno;
525 		}
526 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
527 			rte_errno = EINVAL;
528 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
529 				dev->data->port_id, cur_queue);
530 			mlx5_txq_release(dev, cur_queue);
531 			return -rte_errno;
532 		}
533 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
534 			rte_errno = ENOMEM;
535 			DRV_LOG(ERR, "port %u no Txq object found: %d",
536 				dev->data->port_id, cur_queue);
537 			mlx5_txq_release(dev, cur_queue);
538 			return -rte_errno;
539 		}
540 		if (txq_ctrl->hairpin_status != 0) {
541 			DRV_LOG(DEBUG, "port %u Tx queue %d is already bound",
542 				dev->data->port_id, cur_queue);
543 			mlx5_txq_release(dev, cur_queue);
544 			return 0;
545 		}
546 		/*
547 		 * Consistency checking of all queues of one port is done in
548 		 * the bind() function, and that is optional.
549 		 */
550 		if (peer_info->tx_explicit !=
551 		    txq_ctrl->hairpin_conf.tx_explicit) {
552 			rte_errno = EINVAL;
553 			DRV_LOG(ERR, "port %u Tx queue %d and peer Tx rule mode"
554 				" mismatch", dev->data->port_id, cur_queue);
555 			mlx5_txq_release(dev, cur_queue);
556 			return -rte_errno;
557 		}
558 		if (peer_info->manual_bind !=
559 		    txq_ctrl->hairpin_conf.manual_bind) {
560 			rte_errno = EINVAL;
561 			DRV_LOG(ERR, "port %u Tx queue %d and peer binding mode"
562 				" mismatch", dev->data->port_id, cur_queue);
563 			mlx5_txq_release(dev, cur_queue);
564 			return -rte_errno;
565 		}
566 		sq_attr.state = MLX5_SQC_STATE_RDY;
567 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
568 		sq_attr.hairpin_peer_rq = peer_info->qp_id;
569 		sq_attr.hairpin_peer_vhca = peer_info->vhca_id;
570 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
571 		if (ret == 0)
572 			txq_ctrl->hairpin_status = 1;
573 		mlx5_txq_release(dev, cur_queue);
574 	} else {
575 		struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, cur_queue);
576 		struct mlx5_rxq_ctrl *rxq_ctrl;
577 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
578 
579 		if (rxq == NULL) {
580 			rte_errno = EINVAL;
581 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
582 				dev->data->port_id, cur_queue);
583 			return -rte_errno;
584 		}
585 		rxq_ctrl = rxq->ctrl;
586 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
587 			rte_errno = EINVAL;
588 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
589 				dev->data->port_id, cur_queue);
590 			return -rte_errno;
591 		}
592 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
593 			rte_errno = ENOMEM;
594 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
595 				dev->data->port_id, cur_queue);
596 			return -rte_errno;
597 		}
598 		if (rxq->hairpin_status != 0) {
599 			DRV_LOG(DEBUG, "port %u Rx queue %d is already bound",
600 				dev->data->port_id, cur_queue);
601 			return 0;
602 		}
603 		if (peer_info->tx_explicit !=
604 		    rxq->hairpin_conf.tx_explicit) {
605 			rte_errno = EINVAL;
606 			DRV_LOG(ERR, "port %u Rx queue %d and peer Tx rule mode"
607 				" mismatch", dev->data->port_id, cur_queue);
608 			return -rte_errno;
609 		}
610 		if (peer_info->manual_bind !=
611 		    rxq->hairpin_conf.manual_bind) {
612 			rte_errno = EINVAL;
613 			DRV_LOG(ERR, "port %u Rx queue %d and peer binding mode"
614 				" mismatch", dev->data->port_id, cur_queue);
615 			return -rte_errno;
616 		}
617 		rq_attr.state = MLX5_SQC_STATE_RDY;
618 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
619 		rq_attr.hairpin_peer_sq = peer_info->qp_id;
620 		rq_attr.hairpin_peer_vhca = peer_info->vhca_id;
621 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
622 		if (ret == 0)
623 			rxq->hairpin_status = 1;
624 	}
625 	return ret;
626 }
627 
628 /*
629  * Unbind the hairpin queue and reset its HW configuration.
630  * This needs to be called twice both for Tx and Rx queues of a pair.
631  * This needs to be called twice, once for the Tx and once for the Rx queue of a pair.
632  *
633  * @param dev
634  *   Pointer to Ethernet device structure.
635  * @param cur_queue
636  *   Index of the queue to change the HW configuration to unbind.
637  * @param direction
638  *   Positive to reset the TxQ, zero to reset the RxQ.
639  *
640  * @return
641  *   0 on success, a negative errno value otherwise and rte_errno is set.
642  */
643 int
644 mlx5_hairpin_queue_peer_unbind(struct rte_eth_dev *dev, uint16_t cur_queue,
645 			       uint32_t direction)
646 {
647 	int ret = 0;
648 
649 	if (direction != 0) {
650 		struct mlx5_txq_ctrl *txq_ctrl;
651 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
652 
653 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
654 		if (txq_ctrl == NULL) {
655 			rte_errno = EINVAL;
656 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
657 				dev->data->port_id, cur_queue);
658 			return -rte_errno;
659 		}
660 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
661 			rte_errno = EINVAL;
662 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
663 				dev->data->port_id, cur_queue);
664 			mlx5_txq_release(dev, cur_queue);
665 			return -rte_errno;
666 		}
667 		/* Already unbound, return success before obj checking. */
668 		if (txq_ctrl->hairpin_status == 0) {
669 			DRV_LOG(DEBUG, "port %u Tx queue %d is already unbound",
670 				dev->data->port_id, cur_queue);
671 			mlx5_txq_release(dev, cur_queue);
672 			return 0;
673 		}
674 		if (!txq_ctrl->obj || !txq_ctrl->obj->sq) {
675 			rte_errno = ENOMEM;
676 			DRV_LOG(ERR, "port %u no Txq object found: %d",
677 				dev->data->port_id, cur_queue);
678 			mlx5_txq_release(dev, cur_queue);
679 			return -rte_errno;
680 		}
681 		sq_attr.state = MLX5_SQC_STATE_RST;
682 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
683 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
684 		if (ret == 0)
685 			txq_ctrl->hairpin_status = 0;
686 		mlx5_txq_release(dev, cur_queue);
687 	} else {
688 		struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, cur_queue);
689 		struct mlx5_rxq_ctrl *rxq_ctrl;
690 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
691 
692 		if (rxq == NULL) {
693 			rte_errno = EINVAL;
694 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
695 				dev->data->port_id, cur_queue);
696 			return -rte_errno;
697 		}
698 		rxq_ctrl = rxq->ctrl;
699 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
700 			rte_errno = EINVAL;
701 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
702 				dev->data->port_id, cur_queue);
703 			return -rte_errno;
704 		}
705 		if (rxq->hairpin_status == 0) {
706 			DRV_LOG(DEBUG, "port %u Rx queue %d is already unbound",
707 				dev->data->port_id, cur_queue);
708 			return 0;
709 		}
710 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
711 			rte_errno = ENOMEM;
712 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
713 				dev->data->port_id, cur_queue);
714 			return -rte_errno;
715 		}
716 		rq_attr.state = MLX5_SQC_STATE_RST;
717 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
718 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
719 		if (ret == 0)
720 			rxq->hairpin_status = 0;
721 	}
722 	return ret;
723 }
724 
725 /*
726  * Bind the hairpin port pairs, from the Tx to the peer Rx.
727  * This function only supports binding the Tx to one Rx.
728  *
729  * @param dev
730  *   Pointer to Ethernet device structure.
731  * @param rx_port
732  *   Port identifier of the Rx port.
733  *
734  * @return
735  *   0 on success, a negative errno value otherwise and rte_errno is set.
736  */
737 static int
738 mlx5_hairpin_bind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
739 {
740 	struct mlx5_priv *priv = dev->data->dev_private;
741 	int ret = 0;
742 	struct mlx5_txq_ctrl *txq_ctrl;
743 	uint32_t i;
744 	struct rte_hairpin_peer_info peer = {0xffffff};
745 	struct rte_hairpin_peer_info cur;
746 	const struct rte_eth_hairpin_conf *conf;
747 	uint16_t num_q = 0;
748 	uint16_t local_port = priv->dev_data->port_id;
749 	uint32_t manual;
750 	uint32_t explicit;
751 	uint16_t rx_queue;
752 
753 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
754 		rte_errno = ENODEV;
755 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
756 		return -rte_errno;
757 	}
758 	/*
759 	 * Before binding a TxQ to its peer RxQ, a first pass over the queues
760 	 * checks the configuration consistency. This is a little time
761 	 * consuming but better than having to roll back afterwards.
762 	 */
763 	for (i = 0; i != priv->txqs_n; i++) {
764 		txq_ctrl = mlx5_txq_get(dev, i);
765 		if (txq_ctrl == NULL)
766 			continue;
767 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
768 			mlx5_txq_release(dev, i);
769 			continue;
770 		}
771 		/*
772 		 * All hairpin Tx queues of a single port connected to the
773 		 * same peer Rx port should have the same "auto binding" and
774 		 * "implicit Tx flow" modes.
775 		 * Peer consistency checking will be done in per queue binding.
776 		 */
777 		conf = &txq_ctrl->hairpin_conf;
778 		if (conf->peers[0].port == rx_port) {
779 			if (num_q == 0) {
780 				manual = conf->manual_bind;
781 				explicit = conf->tx_explicit;
782 			} else {
783 				if (manual != conf->manual_bind ||
784 				    explicit != conf->tx_explicit) {
785 					rte_errno = EINVAL;
786 					DRV_LOG(ERR, "port %u queue %d mode"
787 						" mismatch: %u %u, %u %u",
788 						local_port, i, manual,
789 						conf->manual_bind, explicit,
790 						conf->tx_explicit);
791 					mlx5_txq_release(dev, i);
792 					return -rte_errno;
793 				}
794 			}
795 			num_q++;
796 		}
797 		mlx5_txq_release(dev, i);
798 	}
799 	/* If no queue is configured, success is returned directly. */
800 	if (num_q == 0)
801 		return ret;
802 	/* All the hairpin TX queues need to be traversed again. */
803 	for (i = 0; i != priv->txqs_n; i++) {
804 		txq_ctrl = mlx5_txq_get(dev, i);
805 		if (txq_ctrl == NULL)
806 			continue;
807 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
808 			mlx5_txq_release(dev, i);
809 			continue;
810 		}
811 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
812 			mlx5_txq_release(dev, i);
813 			continue;
814 		}
815 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
816 		/*
817 		 * Fetch peer RxQ's information.
818 		 * No need to pass the information of the current queue.
819 		 */
820 		ret = rte_eth_hairpin_queue_peer_update(rx_port, rx_queue,
821 							NULL, &peer, 1);
822 		if (ret != 0) {
823 			mlx5_txq_release(dev, i);
824 			goto error;
825 		}
826 		/* Accessing its own device, inside mlx5 PMD. */
827 		ret = mlx5_hairpin_queue_peer_bind(dev, i, &peer, 1);
828 		if (ret != 0) {
829 			mlx5_txq_release(dev, i);
830 			goto error;
831 		}
832 		/* Pass TxQ's information to peer RxQ and try binding. */
833 		cur.peer_q = rx_queue;
834 		cur.qp_id = txq_ctrl->obj->sq->id;
835 		cur.vhca_id = priv->config.hca_attr.vhca_id;
836 		cur.tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
837 		cur.manual_bind = txq_ctrl->hairpin_conf.manual_bind;
838 		/*
839 		 * In order to access another device in a proper way, RTE level
840 		 * private function is needed.
841 		 */
842 		ret = rte_eth_hairpin_queue_peer_bind(rx_port, rx_queue,
843 						      &cur, 0);
844 		if (ret != 0) {
845 			mlx5_txq_release(dev, i);
846 			goto error;
847 		}
848 		mlx5_txq_release(dev, i);
849 	}
850 	return 0;
851 error:
852 	/*
853 	 * Roll back the queues that were already bound.
854 	 * No need to check the return value of the queue unbind function.
855 	 */
856 	do {
857 		/* No validation is needed here. */
858 		txq_ctrl = mlx5_txq_get(dev, i);
859 		if (txq_ctrl == NULL)
860 			continue;
861 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
862 		rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
863 		mlx5_hairpin_queue_peer_unbind(dev, i, 1);
864 		mlx5_txq_release(dev, i);
865 	} while (i--);
866 	return ret;
867 }
868 
869 /*
870  * Unbind the hairpin port pair. The HW configuration of both devices will be
871  * cleared and the status will be reset for all the queues used between them.
872  * This function only supports unbinding the Tx from one Rx.
873  *
874  * @param dev
875  *   Pointer to Ethernet device structure.
876  * @param rx_port
877  *   Port identifier of the Rx port.
878  *
879  * @return
880  *   0 on success, a negative errno value otherwise and rte_errno is set.
881  */
882 static int
883 mlx5_hairpin_unbind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
884 {
885 	struct mlx5_priv *priv = dev->data->dev_private;
886 	struct mlx5_txq_ctrl *txq_ctrl;
887 	uint32_t i;
888 	int ret;
889 	uint16_t cur_port = priv->dev_data->port_id;
890 
891 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
892 		rte_errno = ENODEV;
893 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
894 		return -rte_errno;
895 	}
896 	for (i = 0; i != priv->txqs_n; i++) {
897 		uint16_t rx_queue;
898 
899 		txq_ctrl = mlx5_txq_get(dev, i);
900 		if (txq_ctrl == NULL)
901 			continue;
902 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
903 			mlx5_txq_release(dev, i);
904 			continue;
905 		}
906 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
907 			mlx5_txq_release(dev, i);
908 			continue;
909 		}
910 		/* Only the first used queue needs to be checked. */
911 		if (txq_ctrl->hairpin_conf.manual_bind == 0) {
912 			if (cur_port != rx_port) {
913 				rte_errno = EINVAL;
914 				DRV_LOG(ERR, "port %u and port %u are in"
915 					" auto-bind mode", cur_port, rx_port);
916 				mlx5_txq_release(dev, i);
917 				return -rte_errno;
918 			} else {
919 				return 0;
920 			}
921 		}
922 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
923 		mlx5_txq_release(dev, i);
924 		ret = rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
925 		if (ret) {
926 			DRV_LOG(ERR, "port %u Rx queue %d unbind - failure",
927 				rx_port, rx_queue);
928 			return ret;
929 		}
930 		ret = mlx5_hairpin_queue_peer_unbind(dev, i, 1);
931 		if (ret) {
932 			DRV_LOG(ERR, "port %u Tx queue %d unbind - failure",
933 				cur_port, i);
934 			return ret;
935 		}
936 	}
937 	return 0;
938 }
939 
940 /*
941  * Bind hairpin ports; the Rx port can be all ports when RTE_MAX_ETHPORTS is used.
942  * @see mlx5_hairpin_bind_single_port()
943  */
944 int
945 mlx5_hairpin_bind(struct rte_eth_dev *dev, uint16_t rx_port)
946 {
947 	int ret = 0;
948 	uint16_t p, pp;
949 
950 	/*
951 	 * If the Rx port has no hairpin configuration with the current port,
952 	 * the binding will be skipped inside the single-port bind function.
953 	 * The device started status will be checked only before updating the
954 	 * queue information.
955 	 */
956 	if (rx_port == RTE_MAX_ETHPORTS) {
957 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
958 			ret = mlx5_hairpin_bind_single_port(dev, p);
959 			if (ret != 0)
960 				goto unbind;
961 		}
962 		return ret;
963 	} else {
964 		return mlx5_hairpin_bind_single_port(dev, rx_port);
965 	}
966 unbind:
967 	MLX5_ETH_FOREACH_DEV(pp, dev->device)
968 		if (pp < p)
969 			mlx5_hairpin_unbind_single_port(dev, pp);
970 	return ret;
971 }
972 
973 /*
974  * Unbind hairpin ports; the Rx port can be all ports when RTE_MAX_ETHPORTS is used.
975  * @see mlx5_hairpin_unbind_single_port()
976  */
977 int
978 mlx5_hairpin_unbind(struct rte_eth_dev *dev, uint16_t rx_port)
979 {
980 	int ret = 0;
981 	uint16_t p;
982 
983 	if (rx_port == RTE_MAX_ETHPORTS)
984 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
985 			ret = mlx5_hairpin_unbind_single_port(dev, p);
986 			if (ret != 0)
987 				return ret;
988 		}
989 	else
990 		ret = mlx5_hairpin_unbind_single_port(dev, rx_port);
991 	return ret;
992 }
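
/*
 * Illustrative sketch (not part of the driver): explicit two-port hairpin
 * binding driven by the application, assuming both ports were configured
 * with hairpin queues in manual_bind mode and have been started. Passing
 * RTE_MAX_ETHPORTS as the Rx port binds/unbinds all peer ports, as
 * implemented above.
 *
 *	ret = rte_eth_hairpin_bind(tx_port, rx_port);
 *	...
 *	ret = rte_eth_hairpin_unbind(tx_port, rx_port);
 */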
993 
994 /*
995  * DPDK callback to get the hairpin peer ports list.
996  * This will return the actual number of peer ports and save the identifiers
997  * into the array (sorted, so the order may differ from the one used when
998  * setting up the hairpin peer queues).
999  * The peer port ID could be the same as the port ID of the current device.
1000  *
1001  * @param dev
1002  *   Pointer to Ethernet device structure.
1003  * @param peer_ports
1004  *   Pointer to array to save the port identifiers.
1005  * @param len
1006  *   The length of the array.
1007  * @param direction
1008  *   Current port to peer port direction.
1009  *   positive - current used as Tx to get all peer Rx ports.
1010  *   zero - current used as Rx to get all peer Tx ports.
1011  *
1012  * @return
1013  *   0 or positive value on success, actual number of peer ports.
1014  *   a negative errno value otherwise and rte_errno is set.
1015  */
1016 int
1017 mlx5_hairpin_get_peer_ports(struct rte_eth_dev *dev, uint16_t *peer_ports,
1018 			    size_t len, uint32_t direction)
1019 {
1020 	struct mlx5_priv *priv = dev->data->dev_private;
1021 	struct mlx5_txq_ctrl *txq_ctrl;
1022 	uint32_t i;
1023 	uint16_t pp;
1024 	uint32_t bits[(RTE_MAX_ETHPORTS + 31) / 32] = {0};
1025 	int ret = 0;
1026 
1027 	if (direction) {
1028 		for (i = 0; i < priv->txqs_n; i++) {
1029 			txq_ctrl = mlx5_txq_get(dev, i);
1030 			if (!txq_ctrl)
1031 				continue;
1032 			if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
1033 				mlx5_txq_release(dev, i);
1034 				continue;
1035 			}
1036 			pp = txq_ctrl->hairpin_conf.peers[0].port;
1037 			if (pp >= RTE_MAX_ETHPORTS) {
1038 				rte_errno = ERANGE;
1039 				mlx5_txq_release(dev, i);
1040 				DRV_LOG(ERR, "port %hu queue %u peer port "
1041 					"out of range %hu",
1042 					priv->dev_data->port_id, i, pp);
1043 				return -rte_errno;
1044 			}
1045 			bits[pp / 32] |= 1 << (pp % 32);
1046 			mlx5_txq_release(dev, i);
1047 		}
1048 	} else {
1049 		for (i = 0; i < priv->rxqs_n; i++) {
1050 			struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, i);
1051 			struct mlx5_rxq_ctrl *rxq_ctrl;
1052 
1053 			if (rxq == NULL)
1054 				continue;
1055 			rxq_ctrl = rxq->ctrl;
1056 			if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN)
1057 				continue;
1058 			pp = rxq->hairpin_conf.peers[0].port;
1059 			if (pp >= RTE_MAX_ETHPORTS) {
1060 				rte_errno = ERANGE;
1061 				DRV_LOG(ERR, "port %hu queue %u peer port "
1062 					"out of range %hu",
1063 					priv->dev_data->port_id, i, pp);
1064 				return -rte_errno;
1065 			}
1066 			bits[pp / 32] |= 1 << (pp % 32);
1067 		}
1068 	}
1069 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1070 		if (bits[i / 32] & (1 << (i % 32))) {
1071 			if ((size_t)ret >= len) {
1072 				rte_errno = E2BIG;
1073 				return -rte_errno;
1074 			}
1075 			peer_ports[ret++] = i;
1076 		}
1077 	}
1078 	return ret;
1079 }
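
/*
 * Illustrative sketch (not part of the driver): querying the hairpin peer
 * Rx ports of a Tx port from the application; names and sizes are placeholders.
 *
 *	uint16_t peers[RTE_MAX_ETHPORTS];
 *	int n = rte_eth_hairpin_get_peer_ports(tx_port, peers, RTE_DIM(peers), 1);
 *
 *	for (int k = 0; k < n; k++)
 *		printf("peer Rx port %u\n", peers[k]);
 */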
1080 
1081 /**
1082  * DPDK callback to start the device.
1083  *
1084  * Simulate device start by attaching all configured flows.
1085  *
1086  * @param dev
1087  *   Pointer to Ethernet device structure.
1088  *
1089  * @return
1090  *   0 on success, a negative errno value otherwise and rte_errno is set.
1091  */
1092 int
1093 mlx5_dev_start(struct rte_eth_dev *dev)
1094 {
1095 	struct mlx5_priv *priv = dev->data->dev_private;
1096 	int ret;
1097 	int fine_inline;
1098 
1099 	DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id);
1100 	fine_inline = rte_mbuf_dynflag_lookup
1101 		(RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, NULL);
1102 	if (fine_inline >= 0)
1103 		rte_net_mlx5_dynf_inline_mask = 1UL << fine_inline;
1104 	else
1105 		rte_net_mlx5_dynf_inline_mask = 0;
1106 	if (dev->data->nb_rx_queues > 0) {
1107 		ret = mlx5_dev_configure_rss_reta(dev);
1108 		if (ret) {
1109 			DRV_LOG(ERR, "port %u reta config failed: %s",
1110 				dev->data->port_id, strerror(rte_errno));
1111 			return -rte_errno;
1112 		}
1113 	}
1114 	ret = mlx5_txpp_start(dev);
1115 	if (ret) {
1116 		DRV_LOG(ERR, "port %u Tx packet pacing init failed: %s",
1117 			dev->data->port_id, strerror(rte_errno));
1118 		goto error;
1119 	}
1120 	if ((priv->sh->devx && priv->config.dv_flow_en &&
1121 	    priv->config.dest_tir) && priv->obj_ops.lb_dummy_queue_create) {
1122 		ret = priv->obj_ops.lb_dummy_queue_create(dev);
1123 		if (ret)
1124 			goto error;
1125 	}
1126 	ret = mlx5_txq_start(dev);
1127 	if (ret) {
1128 		DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
1129 			dev->data->port_id, strerror(rte_errno));
1130 		goto error;
1131 	}
1132 	if (priv->config.std_delay_drop || priv->config.hp_delay_drop) {
1133 		if (!priv->config.vf && !priv->config.sf &&
1134 		    !priv->representor) {
1135 			ret = mlx5_get_flag_dropless_rq(dev);
1136 			if (ret < 0)
1137 				DRV_LOG(WARNING,
1138 					"port %u cannot query dropless flag",
1139 					dev->data->port_id);
1140 			else if (!ret)
1141 				DRV_LOG(WARNING,
1142 					"port %u dropless_rq OFF, no rearming",
1143 					dev->data->port_id);
1144 		} else {
1145 			DRV_LOG(DEBUG,
1146 				"port %u doesn't support dropless_rq flag",
1147 				dev->data->port_id);
1148 		}
1149 	}
1150 	ret = mlx5_rxq_start(dev);
1151 	if (ret) {
1152 		DRV_LOG(ERR, "port %u Rx queue allocation failed: %s",
1153 			dev->data->port_id, strerror(rte_errno));
1154 		goto error;
1155 	}
1156 	/*
1157 	 * This step is skipped if there is no hairpin Tx queue configured
1158 	 * with an Rx peer queue from the same device.
1159 	 */
1160 	ret = mlx5_hairpin_auto_bind(dev);
1161 	if (ret) {
1162 		DRV_LOG(ERR, "port %u hairpin auto binding failed: %s",
1163 			dev->data->port_id, strerror(rte_errno));
1164 		goto error;
1165 	}
1166 	/* Set started flag here for the following steps like control flow. */
1167 	dev->data->dev_started = 1;
1168 	ret = mlx5_rx_intr_vec_enable(dev);
1169 	if (ret) {
1170 		DRV_LOG(ERR, "port %u Rx interrupt vector creation failed",
1171 			dev->data->port_id);
1172 		goto error;
1173 	}
1174 	mlx5_os_stats_init(dev);
1175 	ret = mlx5_traffic_enable(dev);
1176 	if (ret) {
1177 		DRV_LOG(ERR, "port %u failed to set defaults flows",
1178 			dev->data->port_id);
1179 		goto error;
1180 	}
1181 	/* Set a mask and offset of dynamic metadata flows into Rx queues. */
1182 	mlx5_flow_rxq_dynf_metadata_set(dev);
1183 	/* Set flags and context to convert Rx timestamps. */
1184 	mlx5_rxq_timestamp_set(dev);
1185 	/* Set a mask and offset of scheduling on timestamp into Tx queues. */
1186 	mlx5_txq_dynf_timestamp_set(dev);
1187 	/* Attach indirection table objects detached on port stop. */
1188 	ret = mlx5_action_handle_attach(dev);
1189 	if (ret) {
1190 		DRV_LOG(ERR,
1191 			"port %u failed to attach indirect actions: %s",
1192 			dev->data->port_id, rte_strerror(rte_errno));
1193 		goto error;
1194 	}
1195 	/*
1196 	 * In non-cached mode, only the default mreg copy action needs to be
1197 	 * started, since no flow created by the application exists anymore.
1198 	 * But it is worth wrapping the interface for further usage.
1199 	 */
1200 	ret = mlx5_flow_start_default(dev);
1201 	if (ret) {
1202 		DRV_LOG(DEBUG, "port %u failed to start default actions: %s",
1203 			dev->data->port_id, strerror(rte_errno));
1204 		goto error;
1205 	}
1206 	if (mlx5_dev_ctx_shared_mempool_subscribe(dev) != 0) {
1207 		DRV_LOG(ERR, "port %u failed to subscribe for mempool life cycle: %s",
1208 			dev->data->port_id, rte_strerror(rte_errno));
1209 		goto error;
1210 	}
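	/*
	 * Make sure all the queue and flow configuration above is visible
	 * before the datapath burst functions are installed.
	 */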
1211 	rte_wmb();
1212 	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
1213 	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
1214 	/* Enable datapath on secondary process. */
1215 	mlx5_mp_os_req_start_rxtx(dev);
1216 	if (rte_intr_fd_get(priv->sh->intr_handle) >= 0) {
1217 		priv->sh->port[priv->dev_port - 1].ih_port_id =
1218 					(uint32_t)dev->data->port_id;
1219 	} else {
1220 		DRV_LOG(INFO, "port %u starts without LSC and RMV interrupts.",
1221 			dev->data->port_id);
1222 		dev->data->dev_conf.intr_conf.lsc = 0;
1223 		dev->data->dev_conf.intr_conf.rmv = 0;
1224 	}
1225 	if (rte_intr_fd_get(priv->sh->intr_handle_devx) >= 0)
1226 		priv->sh->port[priv->dev_port - 1].devx_ih_port_id =
1227 					(uint32_t)dev->data->port_id;
1228 	return 0;
1229 error:
1230 	ret = rte_errno; /* Save rte_errno before cleanup. */
1231 	/* Rollback. */
1232 	dev->data->dev_started = 0;
1233 	mlx5_flow_stop_default(dev);
1234 	mlx5_traffic_disable(dev);
1235 	mlx5_txq_stop(dev);
1236 	mlx5_rxq_stop(dev);
1237 	if (priv->obj_ops.lb_dummy_queue_release)
1238 		priv->obj_ops.lb_dummy_queue_release(dev);
1239 	mlx5_txpp_stop(dev); /* Stop last. */
1240 	rte_errno = ret; /* Restore rte_errno. */
1241 	return -rte_errno;
1242 }
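
/*
 * Illustrative sketch (not part of the driver): the usual ethdev sequence
 * that reaches mlx5_dev_start() through the dev_start callback. Variable
 * names, queue and descriptor counts are placeholders.
 *
 *	ret = rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &port_conf);
 *	for (q = 0; q < nb_rxq; q++)
 *		ret = rte_eth_rx_queue_setup(port_id, q, 1024, socket, NULL, mb_pool);
 *	for (q = 0; q < nb_txq; q++)
 *		ret = rte_eth_tx_queue_setup(port_id, q, 1024, socket, NULL);
 *	ret = rte_eth_dev_start(port_id);
 */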
1243 
1244 /**
1245  * DPDK callback to stop the device.
1246  *
1247  * Simulate device stop by detaching all configured flows.
1248  *
1249  * @param dev
1250  *   Pointer to Ethernet device structure.
1251  */
1252 int
1253 mlx5_dev_stop(struct rte_eth_dev *dev)
1254 {
1255 	struct mlx5_priv *priv = dev->data->dev_private;
1256 
1257 	dev->data->dev_started = 0;
1258 	/* Prevent crashes when queues are still in use. */
1259 	dev->rx_pkt_burst = removed_rx_burst;
1260 	dev->tx_pkt_burst = removed_tx_burst;
1261 	rte_wmb();
1262 	/* Disable datapath on secondary process. */
1263 	mlx5_mp_os_req_stop_rxtx(dev);
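	/*
	 * Give the datapath threads time to observe the removed_rx/tx_burst
	 * stubs and finish in-flight bursts (roughly 1 ms per Rx queue).
	 */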
1264 	rte_delay_us_sleep(1000 * priv->rxqs_n);
1265 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
1266 	mlx5_flow_stop_default(dev);
1267 	/* Control flows for default traffic can be removed first. */
1268 	mlx5_traffic_disable(dev);
1269 	/* All RX queue flags will be cleared in the flush interface. */
1270 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_GEN, true);
1271 	mlx5_flow_meter_rxq_flush(dev);
1272 	mlx5_action_handle_detach(dev);
1273 	mlx5_rx_intr_vec_disable(dev);
1274 	priv->sh->port[priv->dev_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1275 	priv->sh->port[priv->dev_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
1276 	mlx5_txq_stop(dev);
1277 	mlx5_rxq_stop(dev);
1278 	if (priv->obj_ops.lb_dummy_queue_release)
1279 		priv->obj_ops.lb_dummy_queue_release(dev);
1280 	mlx5_txpp_stop(dev);
1281 
1282 	return 0;
1283 }
1284 
1285 /**
1286  * Enable traffic flows configured by control plane
1287  *
1288  * @param dev
1289  *   Pointer to Ethernet device structure.
1292  *
1293  * @return
1294  *   0 on success, a negative errno value otherwise and rte_errno is set.
1295  */
1296 int
1297 mlx5_traffic_enable(struct rte_eth_dev *dev)
1298 {
1299 	struct mlx5_priv *priv = dev->data->dev_private;
1300 	struct rte_flow_item_eth bcast = {
1301 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1302 	};
1303 	struct rte_flow_item_eth ipv6_multi_spec = {
1304 		.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
1305 	};
1306 	struct rte_flow_item_eth ipv6_multi_mask = {
1307 		.dst.addr_bytes = "\xff\xff\x00\x00\x00\x00",
1308 	};
1309 	struct rte_flow_item_eth unicast = {
1310 		.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1311 	};
1312 	struct rte_flow_item_eth unicast_mask = {
1313 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1314 	};
1315 	const unsigned int vlan_filter_n = priv->vlan_filter_n;
1316 	const struct rte_ether_addr cmp = {
1317 		.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1318 	};
1319 	unsigned int i;
1320 	unsigned int j;
1321 	int ret;
1322 
1323 	/*
1324 	 * The hairpin Tx queue default flow should be created regardless of
1325 	 * isolation mode. Otherwise all the packets to be sent would go out
1326 	 * directly without the Tx flow actions, e.g. encapsulation.
1327 	 */
1328 	for (i = 0; i != priv->txqs_n; ++i) {
1329 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
1330 		if (!txq_ctrl)
1331 			continue;
1332 		/* Only Tx implicit mode requires the default Tx flow. */
1333 		if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN &&
1334 		    txq_ctrl->hairpin_conf.tx_explicit == 0 &&
1335 		    txq_ctrl->hairpin_conf.peers[0].port ==
1336 		    priv->dev_data->port_id) {
1337 			ret = mlx5_ctrl_flow_source_queue(dev, i);
1338 			if (ret) {
1339 				mlx5_txq_release(dev, i);
1340 				goto error;
1341 			}
1342 		}
1343 		if ((priv->representor || priv->master) &&
1344 		    priv->config.dv_esw_en) {
1345 			if (mlx5_flow_create_devx_sq_miss_flow(dev, i) == 0) {
1346 				DRV_LOG(ERR,
1347 					"Port %u Tx queue %u SQ create representor devx default miss rule failed.",
1348 					dev->data->port_id, i);
1349 				goto error;
1350 			}
1351 		}
1352 		mlx5_txq_release(dev, i);
1353 	}
1354 	if ((priv->master || priv->representor) && priv->config.dv_esw_en) {
1355 		if (mlx5_flow_create_esw_table_zero_flow(dev))
1356 			priv->fdb_def_rule = 1;
1357 		else
1358 			DRV_LOG(INFO, "port %u FDB default rule cannot be"
1359 				" configured - only Eswitch group 0 flows are"
1360 				" supported.", dev->data->port_id);
1361 	}
1362 	if (!priv->config.lacp_by_user && priv->pf_bond >= 0) {
1363 		ret = mlx5_flow_lacp_miss(dev);
1364 		if (ret)
1365 			DRV_LOG(INFO, "port %u LACP rule cannot be created - "
1366 				"forward LACP to kernel.", dev->data->port_id);
1367 		else
1368 			DRV_LOG(INFO, "LACP traffic will be missed in port %u."
1369 				, dev->data->port_id);
1370 	}
1371 	if (priv->isolated)
1372 		return 0;
1373 	if (dev->data->promiscuous) {
1374 		struct rte_flow_item_eth promisc = {
1375 			.dst.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1376 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1377 			.type = 0,
1378 		};
1379 
1380 		ret = mlx5_ctrl_flow(dev, &promisc, &promisc);
1381 		if (ret)
1382 			goto error;
1383 	}
1384 	if (dev->data->all_multicast) {
1385 		struct rte_flow_item_eth multicast = {
1386 			.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
1387 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1388 			.type = 0,
1389 		};
1390 
1391 		ret = mlx5_ctrl_flow(dev, &multicast, &multicast);
1392 		if (ret)
1393 			goto error;
1394 	} else {
1395 		/* Add broadcast/multicast flows. */
1396 		for (i = 0; i != vlan_filter_n; ++i) {
1397 			uint16_t vlan = priv->vlan_filter[i];
1398 
1399 			struct rte_flow_item_vlan vlan_spec = {
1400 				.tci = rte_cpu_to_be_16(vlan),
1401 			};
1402 			struct rte_flow_item_vlan vlan_mask =
1403 				rte_flow_item_vlan_mask;
1404 
1405 			ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast,
1406 						  &vlan_spec, &vlan_mask);
1407 			if (ret)
1408 				goto error;
1409 			ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec,
1410 						  &ipv6_multi_mask,
1411 						  &vlan_spec, &vlan_mask);
1412 			if (ret)
1413 				goto error;
1414 		}
1415 		if (!vlan_filter_n) {
1416 			ret = mlx5_ctrl_flow(dev, &bcast, &bcast);
1417 			if (ret)
1418 				goto error;
1419 			ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec,
1420 					     &ipv6_multi_mask);
1421 			if (ret) {
1422 				/* Do not fail on IPv6 broadcast creation failure. */
1423 				DRV_LOG(WARNING,
1424 					"IPv6 broadcast is not supported");
1425 				ret = 0;
1426 			}
1427 		}
1428 	}
1429 	/* Add MAC address flows. */
1430 	for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
1431 		struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
1432 
1433 		if (!memcmp(mac, &cmp, sizeof(*mac)))
1434 			continue;
1435 		memcpy(&unicast.dst.addr_bytes,
1436 		       mac->addr_bytes,
1437 		       RTE_ETHER_ADDR_LEN);
1438 		for (j = 0; j != vlan_filter_n; ++j) {
1439 			uint16_t vlan = priv->vlan_filter[j];
1440 
1441 			struct rte_flow_item_vlan vlan_spec = {
1442 				.tci = rte_cpu_to_be_16(vlan),
1443 			};
1444 			struct rte_flow_item_vlan vlan_mask =
1445 				rte_flow_item_vlan_mask;
1446 
1447 			ret = mlx5_ctrl_flow_vlan(dev, &unicast,
1448 						  &unicast_mask,
1449 						  &vlan_spec,
1450 						  &vlan_mask);
1451 			if (ret)
1452 				goto error;
1453 		}
1454 		if (!vlan_filter_n) {
1455 			ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask);
1456 			if (ret)
1457 				goto error;
1458 		}
1459 	}
1460 	return 0;
1461 error:
1462 	ret = rte_errno; /* Save rte_errno before cleanup. */
1463 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1464 	rte_errno = ret; /* Restore rte_errno. */
1465 	return -rte_errno;
1466 }
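
/*
 * Illustrative sketch (not part of the driver): with flow isolation enabled
 * by the application, the broadcast/multicast/MAC control flows above are
 * skipped and only explicitly created rte_flow rules receive traffic.
 *
 *	struct rte_flow_error err;
 *	ret = rte_flow_isolate(port_id, 1, &err);
 *	...
 *	ret = rte_eth_dev_start(port_id);
 */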
1467 
1468 
1469 /**
1470  * Disable traffic flows configured by control plane
1471  *
1472  * @param dev
1473  *   Pointer to Ethernet device private data.
1474  */
1475 void
1476 mlx5_traffic_disable(struct rte_eth_dev *dev)
1477 {
1478 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1479 }
1480 
1481 /**
1482  * Restart traffic flows configured by control plane
1483  *
1484  * @param dev
1485  *   Pointer to Ethernet device private data.
1486  *
1487  * @return
1488  *   0 on success, a negative errno value otherwise and rte_errno is set.
1489  */
1490 int
1491 mlx5_traffic_restart(struct rte_eth_dev *dev)
1492 {
1493 	if (dev->data->dev_started) {
1494 		mlx5_traffic_disable(dev);
1495 		return mlx5_traffic_enable(dev);
1496 	}
1497 	return 0;
1498 }
1499