xref: /dpdk/drivers/net/mlx5/mlx5_trigger.c (revision a7db3afce75346832059d8bfe54a8f81945fb213)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <unistd.h>
7 
8 #include <rte_ether.h>
9 #include <ethdev_driver.h>
10 #include <rte_interrupts.h>
11 #include <rte_alarm.h>
12 #include <rte_cycles.h>
13 
14 #include <mlx5_malloc.h>
15 
16 #include "mlx5.h"
17 #include "mlx5_mr.h"
18 #include "mlx5_rx.h"
19 #include "mlx5_tx.h"
20 #include "mlx5_utils.h"
21 #include "rte_pmd_mlx5.h"
22 
23 /**
24  * Stop traffic on Tx queues.
25  *
26  * @param dev
27  *   Pointer to Ethernet device structure.
28  */
29 static void
30 mlx5_txq_stop(struct rte_eth_dev *dev)
31 {
32 	struct mlx5_priv *priv = dev->data->dev_private;
33 	unsigned int i;
34 
35 	for (i = 0; i != priv->txqs_n; ++i)
36 		mlx5_txq_release(dev, i);
37 }
38 
39 /**
40  * Start traffic on Tx queues.
41  *
42  * @param dev
43  *   Pointer to Ethernet device structure.
44  *
45  * @return
46  *   0 on success, a negative errno value otherwise and rte_errno is set.
47  */
48 static int
49 mlx5_txq_start(struct rte_eth_dev *dev)
50 {
51 	struct mlx5_priv *priv = dev->data->dev_private;
52 	unsigned int i;
53 	int ret;
54 
55 	for (i = 0; i != priv->txqs_n; ++i) {
56 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
57 		struct mlx5_txq_data *txq_data = txq_ctrl ? &txq_ctrl->txq : NULL;
58 		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
59 
60 		if (!txq_ctrl)
61 			continue;
62 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD)
63 			txq_alloc_elts(txq_ctrl);
64 		MLX5_ASSERT(!txq_ctrl->obj);
65 		txq_ctrl->obj = mlx5_malloc(flags, sizeof(struct mlx5_txq_obj),
66 					    0, txq_ctrl->socket);
67 		if (!txq_ctrl->obj) {
68 			DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
69 				"memory resources.", dev->data->port_id,
70 				txq_data->idx);
71 			rte_errno = ENOMEM;
72 			goto error;
73 		}
74 		ret = priv->obj_ops.txq_obj_new(dev, i);
75 		if (ret < 0) {
76 			mlx5_free(txq_ctrl->obj);
77 			txq_ctrl->obj = NULL;
78 			goto error;
79 		}
80 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD) {
81 			size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
82 
83 			txq_data->fcqs = mlx5_malloc(flags, size,
84 						     RTE_CACHE_LINE_SIZE,
85 						     txq_ctrl->socket);
86 			if (!txq_data->fcqs) {
87 				DRV_LOG(ERR, "Port %u Tx queue %u cannot "
88 					"allocate memory (FCQ).",
89 					dev->data->port_id, i);
90 				rte_errno = ENOMEM;
91 				goto error;
92 			}
93 		}
94 		DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
95 			dev->data->port_id, i, (void *)&txq_ctrl->obj);
96 		LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
97 	}
98 	return 0;
99 error:
100 	ret = rte_errno; /* Save rte_errno before cleanup. */
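	/* Release the queues from the failing index down to 0. */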
101 	do {
102 		mlx5_txq_release(dev, i);
103 	} while (i-- != 0);
104 	rte_errno = ret; /* Restore rte_errno. */
105 	return -rte_errno;
106 }
107 
108 /**
109  * Stop traffic on Rx queues.
110  *
111  * @param dev
112  *   Pointer to Ethernet device structure.
113  */
114 static void
115 mlx5_rxq_stop(struct rte_eth_dev *dev)
116 {
117 	struct mlx5_priv *priv = dev->data->dev_private;
118 	unsigned int i;
119 
120 	for (i = 0; i != priv->rxqs_n; ++i)
121 		mlx5_rxq_release(dev, i);
122 }
123 
124 /**
125  * Start traffic on Rx queues.
126  *
127  * @param dev
128  *   Pointer to Ethernet device structure.
129  *
130  * @return
131  *   0 on success, a negative errno value otherwise and rte_errno is set.
132  */
133 static int
134 mlx5_rxq_start(struct rte_eth_dev *dev)
135 {
136 	struct mlx5_priv *priv = dev->data->dev_private;
137 	unsigned int i;
138 	int ret = 0;
139 
140 	/* Allocate/reuse/resize mempool for Multi-Packet RQ. */
141 	if (mlx5_mprq_alloc_mp(dev)) {
142 		/* Should not release Rx queues but return immediately. */
143 		return -rte_errno;
144 	}
145 	DRV_LOG(DEBUG, "Port %u device_attr.max_qp_wr is %d.",
146 		dev->data->port_id, priv->sh->device_attr.max_qp_wr);
147 	DRV_LOG(DEBUG, "Port %u device_attr.max_sge is %d.",
148 		dev->data->port_id, priv->sh->device_attr.max_sge);
149 	for (i = 0; i != priv->rxqs_n; ++i) {
150 		struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i);
151 
152 		if (!rxq_ctrl)
153 			continue;
154 		if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) {
155 			/* Pre-register Rx mempools. */
156 			if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) {
157 				mlx5_mr_update_mp(dev, &rxq_ctrl->rxq.mr_ctrl,
158 						  rxq_ctrl->rxq.mprq_mp);
159 			} else {
160 				uint32_t s;
161 
162 				for (s = 0; s < rxq_ctrl->rxq.rxseg_n; s++)
163 					mlx5_mr_update_mp
164 						(dev, &rxq_ctrl->rxq.mr_ctrl,
165 						rxq_ctrl->rxq.rxseg[s].mp);
166 			}
167 			ret = rxq_alloc_elts(rxq_ctrl);
168 			if (ret)
169 				goto error;
170 		}
171 		MLX5_ASSERT(!rxq_ctrl->obj);
172 		rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
173 					    sizeof(*rxq_ctrl->obj), 0,
174 					    rxq_ctrl->socket);
175 		if (!rxq_ctrl->obj) {
176 			DRV_LOG(ERR,
177 				"Port %u Rx queue %u can't allocate resources.",
178 				dev->data->port_id, (*priv->rxqs)[i]->idx);
179 			rte_errno = ENOMEM;
180 			goto error;
181 		}
182 		ret = priv->obj_ops.rxq_obj_new(dev, i);
183 		if (ret) {
184 			mlx5_free(rxq_ctrl->obj);
185 			goto error;
186 		}
187 		DRV_LOG(DEBUG, "Port %u rxq %u updated with %p.",
188 			dev->data->port_id, i, (void *)&rxq_ctrl->obj);
189 		LIST_INSERT_HEAD(&priv->rxqsobj, rxq_ctrl->obj, next);
190 	}
191 	return 0;
192 error:
193 	ret = rte_errno; /* Save rte_errno before cleanup. */
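	/* Release the Rx queues from the failing index down to 0. */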
194 	do {
195 		mlx5_rxq_release(dev, i);
196 	} while (i-- != 0);
197 	rte_errno = ret; /* Restore rte_errno. */
198 	return -rte_errno;
199 }
200 
201 /**
202  * Binds Tx queues to Rx queues for hairpin.
203  *
204  * Binds Tx queues to the target Rx queues.
205  *
206  * @param dev
207  *   Pointer to Ethernet device structure.
208  *
209  * @return
210  *   0 on success, a negative errno value otherwise and rte_errno is set.
211  */
212 static int
213 mlx5_hairpin_auto_bind(struct rte_eth_dev *dev)
214 {
215 	struct mlx5_priv *priv = dev->data->dev_private;
216 	struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
217 	struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
218 	struct mlx5_txq_ctrl *txq_ctrl;
219 	struct mlx5_rxq_ctrl *rxq_ctrl;
220 	struct mlx5_devx_obj *sq;
221 	struct mlx5_devx_obj *rq;
222 	unsigned int i;
223 	int ret = 0;
224 	bool need_auto = false;
225 	uint16_t self_port = dev->data->port_id;
226 
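	/*
	 * First pass: check whether any hairpin Tx queue with the local port
	 * as its peer still requires automatic binding.
	 */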
227 	for (i = 0; i != priv->txqs_n; ++i) {
228 		txq_ctrl = mlx5_txq_get(dev, i);
229 		if (!txq_ctrl)
230 			continue;
231 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
232 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
233 			mlx5_txq_release(dev, i);
234 			continue;
235 		}
236 		if (txq_ctrl->hairpin_conf.manual_bind) {
237 			mlx5_txq_release(dev, i);
238 			return 0;
239 		}
240 		need_auto = true;
241 		mlx5_txq_release(dev, i);
242 	}
243 	if (!need_auto)
244 		return 0;
245 	for (i = 0; i != priv->txqs_n; ++i) {
246 		txq_ctrl = mlx5_txq_get(dev, i);
247 		if (!txq_ctrl)
248 			continue;
249 		/* Skip hairpin queues with other peer ports. */
250 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
251 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
252 			mlx5_txq_release(dev, i);
253 			continue;
254 		}
255 		if (!txq_ctrl->obj) {
256 			rte_errno = ENOMEM;
257 			DRV_LOG(ERR, "port %u no txq object found: %d",
258 				dev->data->port_id, i);
259 			mlx5_txq_release(dev, i);
260 			return -rte_errno;
261 		}
262 		sq = txq_ctrl->obj->sq;
263 		rxq_ctrl = mlx5_rxq_get(dev,
264 					txq_ctrl->hairpin_conf.peers[0].queue);
265 		if (!rxq_ctrl) {
266 			mlx5_txq_release(dev, i);
267 			rte_errno = EINVAL;
268 			DRV_LOG(ERR, "port %u no rxq object found: %d",
269 				dev->data->port_id,
270 				txq_ctrl->hairpin_conf.peers[0].queue);
271 			return -rte_errno;
272 		}
273 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN ||
274 		    rxq_ctrl->hairpin_conf.peers[0].queue != i) {
275 			rte_errno = ENOMEM;
276 			DRV_LOG(ERR, "port %u Tx queue %d cannot be bound to "
277 				"Rx queue %d", dev->data->port_id,
278 				i, txq_ctrl->hairpin_conf.peers[0].queue);
279 			goto error;
280 		}
281 		rq = rxq_ctrl->obj->rq;
282 		if (!rq) {
283 			rte_errno = ENOMEM;
284 			DRV_LOG(ERR, "port %u hairpin no matching rxq: %d",
285 				dev->data->port_id,
286 				txq_ctrl->hairpin_conf.peers[0].queue);
287 			goto error;
288 		}
289 		sq_attr.state = MLX5_SQC_STATE_RDY;
290 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
291 		sq_attr.hairpin_peer_rq = rq->id;
292 		sq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
293 		ret = mlx5_devx_cmd_modify_sq(sq, &sq_attr);
294 		if (ret)
295 			goto error;
296 		rq_attr.state = MLX5_SQC_STATE_RDY;
297 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
298 		rq_attr.hairpin_peer_sq = sq->id;
299 		rq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
300 		ret = mlx5_devx_cmd_modify_rq(rq, &rq_attr);
301 		if (ret)
302 			goto error;
303 		/* Queues with auto-bind will be destroyed directly. */
304 		rxq_ctrl->hairpin_status = 1;
305 		txq_ctrl->hairpin_status = 1;
306 		mlx5_txq_release(dev, i);
307 		mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue);
308 	}
309 	return 0;
310 error:
311 	mlx5_txq_release(dev, i);
312 	mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue);
313 	return -rte_errno;
314 }
315 
316 /*
317  * Fetch the peer queue's SW & HW information.
318  *
319  * @param dev
320  *   Pointer to Ethernet device structure.
321  * @param peer_queue
322  *   Index of the queue to fetch the information.
323  * @param current_info
324  *   Pointer to the input peer information, not used currently.
325  * @param peer_info
326  *   Pointer to the structure to store the information, output.
327  * @param direction
328  *   Positive to get the RxQ information, zero to get the TxQ information.
329  *
330  * @return
331  *   0 on success, a negative errno value otherwise and rte_errno is set.
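 *
 * This callback is expected to be reached on the peer port through the
 * internal helper rte_eth_hairpin_queue_peer_update() (see its use below).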
332  */
333 int
334 mlx5_hairpin_queue_peer_update(struct rte_eth_dev *dev, uint16_t peer_queue,
335 			       struct rte_hairpin_peer_info *current_info,
336 			       struct rte_hairpin_peer_info *peer_info,
337 			       uint32_t direction)
338 {
339 	struct mlx5_priv *priv = dev->data->dev_private;
340 	RTE_SET_USED(current_info);
341 
342 	if (dev->data->dev_started == 0) {
343 		rte_errno = EBUSY;
344 		DRV_LOG(ERR, "peer port %u is not started",
345 			dev->data->port_id);
346 		return -rte_errno;
347 	}
348 	/*
349 	 * Peer port used as egress. In the current design, the hairpin Tx
350 	 * queue will be bound to the peer Rx queue. Hence, only the
351 	 * information of the peer Rx queue needs to be fetched.
352 	 */
353 	if (direction == 0) {
354 		struct mlx5_txq_ctrl *txq_ctrl;
355 
356 		txq_ctrl = mlx5_txq_get(dev, peer_queue);
357 		if (txq_ctrl == NULL) {
358 			rte_errno = EINVAL;
359 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
360 				dev->data->port_id, peer_queue);
361 			return -rte_errno;
362 		}
363 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
364 			rte_errno = EINVAL;
365 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Txq",
366 				dev->data->port_id, peer_queue);
367 			mlx5_txq_release(dev, peer_queue);
368 			return -rte_errno;
369 		}
370 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
371 			rte_errno = ENOMEM;
372 			DRV_LOG(ERR, "port %u no Txq object found: %d",
373 				dev->data->port_id, peer_queue);
374 			mlx5_txq_release(dev, peer_queue);
375 			return -rte_errno;
376 		}
377 		peer_info->qp_id = txq_ctrl->obj->sq->id;
378 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
379 		/* 1-to-1 mapping, only the first one is used. */
380 		peer_info->peer_q = txq_ctrl->hairpin_conf.peers[0].queue;
381 		peer_info->tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
382 		peer_info->manual_bind = txq_ctrl->hairpin_conf.manual_bind;
383 		mlx5_txq_release(dev, peer_queue);
384 	} else { /* Peer port used as ingress. */
385 		struct mlx5_rxq_ctrl *rxq_ctrl;
386 
387 		rxq_ctrl = mlx5_rxq_get(dev, peer_queue);
388 		if (rxq_ctrl == NULL) {
389 			rte_errno = EINVAL;
390 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
391 				dev->data->port_id, peer_queue);
392 			return -rte_errno;
393 		}
394 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
395 			rte_errno = EINVAL;
396 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Rxq",
397 				dev->data->port_id, peer_queue);
398 			mlx5_rxq_release(dev, peer_queue);
399 			return -rte_errno;
400 		}
401 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
402 			rte_errno = ENOMEM;
403 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
404 				dev->data->port_id, peer_queue);
405 			mlx5_rxq_release(dev, peer_queue);
406 			return -rte_errno;
407 		}
408 		peer_info->qp_id = rxq_ctrl->obj->rq->id;
409 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
410 		peer_info->peer_q = rxq_ctrl->hairpin_conf.peers[0].queue;
411 		peer_info->tx_explicit = rxq_ctrl->hairpin_conf.tx_explicit;
412 		peer_info->manual_bind = rxq_ctrl->hairpin_conf.manual_bind;
413 		mlx5_rxq_release(dev, peer_queue);
414 	}
415 	return 0;
416 }
417 
418 /*
419  * Bind the hairpin queue with the peer HW information.
420  * This needs to be called twice both for Tx and Rx queues of a pair.
421  * If the queue is already bound, it is considered successful.
422  *
423  * @param dev
424  *   Pointer to Ethernet device structure.
425  * @param cur_queue
426  *   Index of the queue to change the HW configuration to bind.
427  * @param peer_info
428  *   Pointer to information of the peer queue.
429  * @param direction
430  *   Positive to configure the TxQ, zero to configure the RxQ.
431  *
432  * @return
433  *   0 on success, a negative errno value otherwise and rte_errno is set.
434  */
435 int
436 mlx5_hairpin_queue_peer_bind(struct rte_eth_dev *dev, uint16_t cur_queue,
437 			     struct rte_hairpin_peer_info *peer_info,
438 			     uint32_t direction)
439 {
440 	int ret = 0;
441 
442 	/*
443 	 * Consistency check of the peer queue: the opposite direction is used
444 	 * to get the peer queue info via the ethdev port ID; no extra check needed.
445 	 */
446 	if (peer_info->peer_q != cur_queue) {
447 		rte_errno = EINVAL;
448 		DRV_LOG(ERR, "port %u queue %d and peer queue %d mismatch",
449 			dev->data->port_id, cur_queue, peer_info->peer_q);
450 		return -rte_errno;
451 	}
452 	if (direction != 0) {
453 		struct mlx5_txq_ctrl *txq_ctrl;
454 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
455 
456 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
457 		if (txq_ctrl == NULL) {
458 			rte_errno = EINVAL;
459 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
460 				dev->data->port_id, cur_queue);
461 			return -rte_errno;
462 		}
463 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
464 			rte_errno = EINVAL;
465 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
466 				dev->data->port_id, cur_queue);
467 			mlx5_txq_release(dev, cur_queue);
468 			return -rte_errno;
469 		}
470 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
471 			rte_errno = ENOMEM;
472 			DRV_LOG(ERR, "port %u no Txq object found: %d",
473 				dev->data->port_id, cur_queue);
474 			mlx5_txq_release(dev, cur_queue);
475 			return -rte_errno;
476 		}
477 		if (txq_ctrl->hairpin_status != 0) {
478 			DRV_LOG(DEBUG, "port %u Tx queue %d is already bound",
479 				dev->data->port_id, cur_queue);
480 			mlx5_txq_release(dev, cur_queue);
481 			return 0;
482 		}
483 		/*
484 		 * Consistency checking of all the queues of one port is done in
485 		 * the bind() function, and that is optional.
486 		 */
487 		if (peer_info->tx_explicit !=
488 		    txq_ctrl->hairpin_conf.tx_explicit) {
489 			rte_errno = EINVAL;
490 			DRV_LOG(ERR, "port %u Tx queue %d and peer Tx rule mode"
491 				" mismatch", dev->data->port_id, cur_queue);
492 			mlx5_txq_release(dev, cur_queue);
493 			return -rte_errno;
494 		}
495 		if (peer_info->manual_bind !=
496 		    txq_ctrl->hairpin_conf.manual_bind) {
497 			rte_errno = EINVAL;
498 			DRV_LOG(ERR, "port %u Tx queue %d and peer binding mode"
499 				" mismatch", dev->data->port_id, cur_queue);
500 			mlx5_txq_release(dev, cur_queue);
501 			return -rte_errno;
502 		}
503 		sq_attr.state = MLX5_SQC_STATE_RDY;
504 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
505 		sq_attr.hairpin_peer_rq = peer_info->qp_id;
506 		sq_attr.hairpin_peer_vhca = peer_info->vhca_id;
507 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
508 		if (ret == 0)
509 			txq_ctrl->hairpin_status = 1;
510 		mlx5_txq_release(dev, cur_queue);
511 	} else {
512 		struct mlx5_rxq_ctrl *rxq_ctrl;
513 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
514 
515 		rxq_ctrl = mlx5_rxq_get(dev, cur_queue);
516 		if (rxq_ctrl == NULL) {
517 			rte_errno = EINVAL;
518 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
519 				dev->data->port_id, cur_queue);
520 			return -rte_errno;
521 		}
522 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
523 			rte_errno = EINVAL;
524 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
525 				dev->data->port_id, cur_queue);
526 			mlx5_rxq_release(dev, cur_queue);
527 			return -rte_errno;
528 		}
529 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
530 			rte_errno = ENOMEM;
531 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
532 				dev->data->port_id, cur_queue);
533 			mlx5_rxq_release(dev, cur_queue);
534 			return -rte_errno;
535 		}
536 		if (rxq_ctrl->hairpin_status != 0) {
537 			DRV_LOG(DEBUG, "port %u Rx queue %d is already bound",
538 				dev->data->port_id, cur_queue);
539 			mlx5_rxq_release(dev, cur_queue);
540 			return 0;
541 		}
542 		if (peer_info->tx_explicit !=
543 		    rxq_ctrl->hairpin_conf.tx_explicit) {
544 			rte_errno = EINVAL;
545 			DRV_LOG(ERR, "port %u Rx queue %d and peer Tx rule mode"
546 				" mismatch", dev->data->port_id, cur_queue);
547 			mlx5_rxq_release(dev, cur_queue);
548 			return -rte_errno;
549 		}
550 		if (peer_info->manual_bind !=
551 		    rxq_ctrl->hairpin_conf.manual_bind) {
552 			rte_errno = EINVAL;
553 			DRV_LOG(ERR, "port %u Rx queue %d and peer binding mode"
554 				" mismatch", dev->data->port_id, cur_queue);
555 			mlx5_rxq_release(dev, cur_queue);
556 			return -rte_errno;
557 		}
558 		rq_attr.state = MLX5_SQC_STATE_RDY;
559 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
560 		rq_attr.hairpin_peer_sq = peer_info->qp_id;
561 		rq_attr.hairpin_peer_vhca = peer_info->vhca_id;
562 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
563 		if (ret == 0)
564 			rxq_ctrl->hairpin_status = 1;
565 		mlx5_rxq_release(dev, cur_queue);
566 	}
567 	return ret;
568 }
569 
570 /*
571  * Unbind the hairpin queue and reset its HW configuration.
572  * This needs to be called twice both for Tx and Rx queues of a pair.
573  * If the queue is already unbound, it is considered successful.
574  *
575  * @param dev
576  *   Pointer to Ethernet device structure.
577  * @param cur_queue
578  *   Index of the queue to change the HW configuration to unbind.
579  * @param direction
580  *   Positive to reset the TxQ, zero to reset the RxQ.
581  *
582  * @return
583  *   0 on success, a negative errno value otherwise and rte_errno is set.
584  */
585 int
586 mlx5_hairpin_queue_peer_unbind(struct rte_eth_dev *dev, uint16_t cur_queue,
587 			       uint32_t direction)
588 {
589 	int ret = 0;
590 
591 	if (direction != 0) {
592 		struct mlx5_txq_ctrl *txq_ctrl;
593 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
594 
595 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
596 		if (txq_ctrl == NULL) {
597 			rte_errno = EINVAL;
598 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
599 				dev->data->port_id, cur_queue);
600 			return -rte_errno;
601 		}
602 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
603 			rte_errno = EINVAL;
604 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
605 				dev->data->port_id, cur_queue);
606 			mlx5_txq_release(dev, cur_queue);
607 			return -rte_errno;
608 		}
609 		/* Already unbound, return success before obj checking. */
610 		if (txq_ctrl->hairpin_status == 0) {
611 			DRV_LOG(DEBUG, "port %u Tx queue %d is already unbound",
612 				dev->data->port_id, cur_queue);
613 			mlx5_txq_release(dev, cur_queue);
614 			return 0;
615 		}
616 		if (!txq_ctrl->obj || !txq_ctrl->obj->sq) {
617 			rte_errno = ENOMEM;
618 			DRV_LOG(ERR, "port %u no Txq object found: %d",
619 				dev->data->port_id, cur_queue);
620 			mlx5_txq_release(dev, cur_queue);
621 			return -rte_errno;
622 		}
623 		sq_attr.state = MLX5_SQC_STATE_RST;
624 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
625 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
626 		if (ret == 0)
627 			txq_ctrl->hairpin_status = 0;
628 		mlx5_txq_release(dev, cur_queue);
629 	} else {
630 		struct mlx5_rxq_ctrl *rxq_ctrl;
631 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
632 
633 		rxq_ctrl = mlx5_rxq_get(dev, cur_queue);
634 		if (rxq_ctrl == NULL) {
635 			rte_errno = EINVAL;
636 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
637 				dev->data->port_id, cur_queue);
638 			return -rte_errno;
639 		}
640 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
641 			rte_errno = EINVAL;
642 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
643 				dev->data->port_id, cur_queue);
644 			mlx5_rxq_release(dev, cur_queue);
645 			return -rte_errno;
646 		}
647 		if (rxq_ctrl->hairpin_status == 0) {
648 			DRV_LOG(DEBUG, "port %u Rx queue %d is already unbound",
649 				dev->data->port_id, cur_queue);
650 			mlx5_rxq_release(dev, cur_queue);
651 			return 0;
652 		}
653 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
654 			rte_errno = ENOMEM;
655 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
656 				dev->data->port_id, cur_queue);
657 			mlx5_rxq_release(dev, cur_queue);
658 			return -rte_errno;
659 		}
660 		rq_attr.state = MLX5_SQC_STATE_RST;
661 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
662 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
663 		if (ret == 0)
664 			rxq_ctrl->hairpin_status = 0;
665 		mlx5_rxq_release(dev, cur_queue);
666 	}
667 	return ret;
668 }
669 
670 /*
671  * Bind the hairpin port pairs, from the Tx to the peer Rx.
672  * This function only supports binding the Tx port to one Rx port.
673  *
674  * @param dev
675  *   Pointer to Ethernet device structure.
676  * @param rx_port
677  *   Port identifier of the Rx port.
678  *
679  * @return
680  *   0 on success, a negative errno value otherwise and rte_errno is set.
681  */
682 static int
683 mlx5_hairpin_bind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
684 {
685 	struct mlx5_priv *priv = dev->data->dev_private;
686 	int ret = 0;
687 	struct mlx5_txq_ctrl *txq_ctrl;
688 	uint32_t i;
689 	struct rte_hairpin_peer_info peer = {0xffffff};
690 	struct rte_hairpin_peer_info cur;
691 	const struct rte_eth_hairpin_conf *conf;
692 	uint16_t num_q = 0;
693 	uint16_t local_port = priv->dev_data->port_id;
694 	uint32_t manual;
695 	uint32_t explicit;
696 	uint16_t rx_queue;
697 
698 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
699 		rte_errno = ENODEV;
700 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
701 		return -rte_errno;
702 	}
703 	/*
704 	 * Before binding a TxQ to its peer RxQ, a first pass over the queues
705 	 * checks their configuration consistency. This is a little time
706 	 * consuming but better than having to roll back afterwards.
707 	 */
708 	for (i = 0; i != priv->txqs_n; i++) {
709 		txq_ctrl = mlx5_txq_get(dev, i);
710 		if (txq_ctrl == NULL)
711 			continue;
712 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
713 			mlx5_txq_release(dev, i);
714 			continue;
715 		}
716 		/*
717 		 * All hairpin Tx queues of a single port that are connected to the
718 		 * same peer Rx port should have the same "auto binding" and
719 		 * "implicit Tx flow" modes.
720 		 * Peer consistency checking will be done in per queue binding.
721 		 */
722 		conf = &txq_ctrl->hairpin_conf;
723 		if (conf->peers[0].port == rx_port) {
724 			if (num_q == 0) {
725 				manual = conf->manual_bind;
726 				explicit = conf->tx_explicit;
727 			} else {
728 				if (manual != conf->manual_bind ||
729 				    explicit != conf->tx_explicit) {
730 					rte_errno = EINVAL;
731 					DRV_LOG(ERR, "port %u queue %d mode"
732 						" mismatch: %u %u, %u %u",
733 						local_port, i, manual,
734 						conf->manual_bind, explicit,
735 						conf->tx_explicit);
736 					mlx5_txq_release(dev, i);
737 					return -rte_errno;
738 				}
739 			}
740 			num_q++;
741 		}
742 		mlx5_txq_release(dev, i);
743 	}
744 	/* If no queue is configured, success is returned directly. */
745 	if (num_q == 0)
746 		return ret;
747 	/* All the hairpin TX queues need to be traversed again. */
748 	for (i = 0; i != priv->txqs_n; i++) {
749 		txq_ctrl = mlx5_txq_get(dev, i);
750 		if (txq_ctrl == NULL)
751 			continue;
752 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
753 			mlx5_txq_release(dev, i);
754 			continue;
755 		}
756 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
757 			mlx5_txq_release(dev, i);
758 			continue;
759 		}
760 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
761 		/*
762 		 * Fetch peer RxQ's information.
763 		 * No need to pass the information of the current queue.
764 		 */
765 		ret = rte_eth_hairpin_queue_peer_update(rx_port, rx_queue,
766 							NULL, &peer, 1);
767 		if (ret != 0) {
768 			mlx5_txq_release(dev, i);
769 			goto error;
770 		}
771 		/* Accessing its own device, inside mlx5 PMD. */
772 		ret = mlx5_hairpin_queue_peer_bind(dev, i, &peer, 1);
773 		if (ret != 0) {
774 			mlx5_txq_release(dev, i);
775 			goto error;
776 		}
777 		/* Pass TxQ's information to peer RxQ and try binding. */
778 		cur.peer_q = rx_queue;
779 		cur.qp_id = txq_ctrl->obj->sq->id;
780 		cur.vhca_id = priv->config.hca_attr.vhca_id;
781 		cur.tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
782 		cur.manual_bind = txq_ctrl->hairpin_conf.manual_bind;
783 		/*
784 		 * To access another device in a proper way, the RTE-level
785 		 * private function is needed.
786 		 */
787 		ret = rte_eth_hairpin_queue_peer_bind(rx_port, rx_queue,
788 						      &cur, 0);
789 		if (ret != 0) {
790 			mlx5_txq_release(dev, i);
791 			goto error;
792 		}
793 		mlx5_txq_release(dev, i);
794 	}
795 	return 0;
796 error:
797 	/*
798 	 * Roll back the queues that were already bound.
799 	 * No need to check the return value of the queue unbind function.
800 	 */
801 	do {
802 		/* No validation is needed here. */
803 		txq_ctrl = mlx5_txq_get(dev, i);
804 		if (txq_ctrl == NULL)
805 			continue;
806 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
807 		rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
808 		mlx5_hairpin_queue_peer_unbind(dev, i, 1);
809 		mlx5_txq_release(dev, i);
810 	} while (i--);
811 	return ret;
812 }
813 
814 /*
815  * Unbind the hairpin port pair; the HW configuration of both devices will be
816  * cleared and the status will be reset for all the queues used between them.
817  * This function only supports unbinding the Tx port from one Rx port.
818  *
819  * @param dev
820  *   Pointer to Ethernet device structure.
821  * @param rx_port
822  *   Port identifier of the Rx port.
823  *
824  * @return
825  *   0 on success, a negative errno value otherwise and rte_errno is set.
826  */
827 static int
828 mlx5_hairpin_unbind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
829 {
830 	struct mlx5_priv *priv = dev->data->dev_private;
831 	struct mlx5_txq_ctrl *txq_ctrl;
832 	uint32_t i;
833 	int ret;
834 	uint16_t cur_port = priv->dev_data->port_id;
835 
836 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
837 		rte_errno = ENODEV;
838 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
839 		return -rte_errno;
840 	}
841 	for (i = 0; i != priv->txqs_n; i++) {
842 		uint16_t rx_queue;
843 
844 		txq_ctrl = mlx5_txq_get(dev, i);
845 		if (txq_ctrl == NULL)
846 			continue;
847 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
848 			mlx5_txq_release(dev, i);
849 			continue;
850 		}
851 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
852 			mlx5_txq_release(dev, i);
853 			continue;
854 		}
855 		/* Indeed, only the first used queue needs to be checked. */
856 		if (txq_ctrl->hairpin_conf.manual_bind == 0) {
857 			if (cur_port != rx_port) {
858 				rte_errno = EINVAL;
859 				DRV_LOG(ERR, "port %u and port %u are in"
860 					" auto-bind mode", cur_port, rx_port);
861 				mlx5_txq_release(dev, i);
862 				return -rte_errno;
863 			} else {
864 				return 0;
865 			}
866 		}
867 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
868 		mlx5_txq_release(dev, i);
869 		ret = rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
870 		if (ret) {
871 			DRV_LOG(ERR, "port %u Rx queue %d unbind - failure",
872 				rx_port, rx_queue);
873 			return ret;
874 		}
875 		ret = mlx5_hairpin_queue_peer_unbind(dev, i, 1);
876 		if (ret) {
877 			DRV_LOG(ERR, "port %u Tx queue %d unbind - failure",
878 				cur_port, i);
879 			return ret;
880 		}
881 	}
882 	return 0;
883 }
884 
885 /*
886  * Bind hairpin ports; Rx can be all ports when rx_port is RTE_MAX_ETHPORTS.
887  * @see mlx5_hairpin_bind_single_port()
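 *
 * Application-side sketch (hypothetical port IDs), assuming this function is
 * wired up as the hairpin_bind dev op and reached via rte_eth_hairpin_bind():
 *   ret = rte_eth_hairpin_bind(tx_port_id, rx_port_id);
 * Passing RTE_MAX_ETHPORTS as the Rx port requests binding to all peer ports.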
888  */
889 int
890 mlx5_hairpin_bind(struct rte_eth_dev *dev, uint16_t rx_port)
891 {
892 	int ret = 0;
893 	uint16_t p, pp;
894 
895 	/*
896 	 * If the Rx port has no hairpin configuration with the current port,
897 	 * the binding will be skipped inside the single-port function called
898 	 * below. The device-started status is checked only before the queue
899 	 * information is updated.
900 	 */
901 	if (rx_port == RTE_MAX_ETHPORTS) {
902 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
903 			ret = mlx5_hairpin_bind_single_port(dev, p);
904 			if (ret != 0)
905 				goto unbind;
906 		}
907 		return ret;
908 	} else {
909 		return mlx5_hairpin_bind_single_port(dev, rx_port);
910 	}
911 unbind:
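	/*
	 * Roll back: unbind the ports that were already processed, i.e. those
	 * with a port ID lower than the failing one.
	 */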
912 	MLX5_ETH_FOREACH_DEV(pp, dev->device)
913 		if (pp < p)
914 			mlx5_hairpin_unbind_single_port(dev, pp);
915 	return ret;
916 }
917 
918 /*
919  * Unbind hairpin ports; Rx can be all ports when rx_port is RTE_MAX_ETHPORTS.
920  * @see mlx5_hairpin_unbind_single_port()
921  */
922 int
923 mlx5_hairpin_unbind(struct rte_eth_dev *dev, uint16_t rx_port)
924 {
925 	int ret = 0;
926 	uint16_t p;
927 
928 	if (rx_port == RTE_MAX_ETHPORTS)
929 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
930 			ret = mlx5_hairpin_unbind_single_port(dev, p);
931 			if (ret != 0)
932 				return ret;
933 		}
934 	else
935 		ret = mlx5_hairpin_unbind_single_port(dev, rx_port);
936 	return ret;
937 }
938 
939 /*
940  * DPDK callback to get the hairpin peer ports list.
941  * This will return the actual number of peer ports and save the identifiers
942  * into the array (sorted, may be different from that when setting up the
943  * hairpin peer queues).
944  * The peer port ID could be the same as the port ID of the current device.
945  *
946  * @param dev
947  *   Pointer to Ethernet device structure.
948  * @param peer_ports
949  *   Pointer to array to save the port identifiers.
950  * @param len
951  *   The length of the array.
952  * @param direction
953  *   Current port to peer port direction.
954  *   positive - current used as Tx to get all peer Rx ports.
955  *   zero - current used as Rx to get all peer Tx ports.
956  *
957  * @return
958  *   0 or positive value on success, actual number of peer ports.
959  *   a negative errno value otherwise and rte_errno is set.
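 *
 * Application-side sketch (hypothetical names), assuming this callback is
 * reached via rte_eth_hairpin_get_peer_ports():
 *   uint16_t peers[RTE_MAX_ETHPORTS];
 *   int n = rte_eth_hairpin_get_peer_ports(port_id, peers, RTE_DIM(peers), 1);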
960  */
961 int
962 mlx5_hairpin_get_peer_ports(struct rte_eth_dev *dev, uint16_t *peer_ports,
963 			    size_t len, uint32_t direction)
964 {
965 	struct mlx5_priv *priv = dev->data->dev_private;
966 	struct mlx5_txq_ctrl *txq_ctrl;
967 	struct mlx5_rxq_ctrl *rxq_ctrl;
968 	uint32_t i;
969 	uint16_t pp;
970 	uint32_t bits[(RTE_MAX_ETHPORTS + 31) / 32] = {0};
971 	int ret = 0;
972 
973 	if (direction) {
974 		for (i = 0; i < priv->txqs_n; i++) {
975 			txq_ctrl = mlx5_txq_get(dev, i);
976 			if (!txq_ctrl)
977 				continue;
978 			if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
979 				mlx5_txq_release(dev, i);
980 				continue;
981 			}
982 			pp = txq_ctrl->hairpin_conf.peers[0].port;
983 			if (pp >= RTE_MAX_ETHPORTS) {
984 				rte_errno = ERANGE;
985 				mlx5_txq_release(dev, i);
986 				DRV_LOG(ERR, "port %hu queue %u peer port "
987 					"out of range %hu",
988 					priv->dev_data->port_id, i, pp);
989 				return -rte_errno;
990 			}
991 			bits[pp / 32] |= 1 << (pp % 32);
992 			mlx5_txq_release(dev, i);
993 		}
994 	} else {
995 		for (i = 0; i < priv->rxqs_n; i++) {
996 			rxq_ctrl = mlx5_rxq_get(dev, i);
997 			if (!rxq_ctrl)
998 				continue;
999 			if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
1000 				mlx5_rxq_release(dev, i);
1001 				continue;
1002 			}
1003 			pp = rxq_ctrl->hairpin_conf.peers[0].port;
1004 			if (pp >= RTE_MAX_ETHPORTS) {
1005 				rte_errno = ERANGE;
1006 				mlx5_rxq_release(dev, i);
1007 				DRV_LOG(ERR, "port %hu queue %u peer port "
1008 					"out of range %hu",
1009 					priv->dev_data->port_id, i, pp);
1010 				return -rte_errno;
1011 			}
1012 			bits[pp / 32] |= 1 << (pp % 32);
1013 			mlx5_rxq_release(dev, i);
1014 		}
1015 	}
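	/* Convert the bitmap into a sorted array of unique peer port IDs. */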
1016 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1017 		if (bits[i / 32] & (1 << (i % 32))) {
1018 			if ((size_t)ret >= len) {
1019 				rte_errno = E2BIG;
1020 				return -rte_errno;
1021 			}
1022 			peer_ports[ret++] = i;
1023 		}
1024 	}
1025 	return ret;
1026 }
1027 
1028 /**
1029  * DPDK callback to start the device.
1030  *
1031  * Simulate device start by attaching all configured flows.
1032  *
1033  * @param dev
1034  *   Pointer to Ethernet device structure.
1035  *
1036  * @return
1037  *   0 on success, a negative errno value otherwise and rte_errno is set.
1038  */
1039 int
1040 mlx5_dev_start(struct rte_eth_dev *dev)
1041 {
1042 	struct mlx5_priv *priv = dev->data->dev_private;
1043 	int ret;
1044 	int fine_inline;
1045 
1046 	DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id);
1047 	fine_inline = rte_mbuf_dynflag_lookup
1048 		(RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, NULL);
1049 	if (fine_inline >= 0)
1050 		rte_net_mlx5_dynf_inline_mask = 1UL << fine_inline;
1051 	else
1052 		rte_net_mlx5_dynf_inline_mask = 0;
1053 	if (dev->data->nb_rx_queues > 0) {
1054 		ret = mlx5_dev_configure_rss_reta(dev);
1055 		if (ret) {
1056 			DRV_LOG(ERR, "port %u reta config failed: %s",
1057 				dev->data->port_id, strerror(rte_errno));
1058 			return -rte_errno;
1059 		}
1060 	}
1061 	ret = mlx5_txpp_start(dev);
1062 	if (ret) {
1063 		DRV_LOG(ERR, "port %u Tx packet pacing init failed: %s",
1064 			dev->data->port_id, strerror(rte_errno));
1065 		goto error;
1066 	}
1067 	if ((priv->config.devx && priv->config.dv_flow_en &&
1068 	    priv->config.dest_tir) && priv->obj_ops.lb_dummy_queue_create) {
1069 		ret = priv->obj_ops.lb_dummy_queue_create(dev);
1070 		if (ret)
1071 			goto error;
1072 	}
1073 	ret = mlx5_txq_start(dev);
1074 	if (ret) {
1075 		DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
1076 			dev->data->port_id, strerror(rte_errno));
1077 		goto error;
1078 	}
1079 	ret = mlx5_rxq_start(dev);
1080 	if (ret) {
1081 		DRV_LOG(ERR, "port %u Rx queue allocation failed: %s",
1082 			dev->data->port_id, strerror(rte_errno));
1083 		goto error;
1084 	}
1085 	/*
1086 	 * This step will be skipped if there is no hairpin Tx queue configured
1087 	 * with a Rx peer queue from the same device.
1088 	 */
1089 	ret = mlx5_hairpin_auto_bind(dev);
1090 	if (ret) {
1091 		DRV_LOG(ERR, "port %u hairpin auto binding failed: %s",
1092 			dev->data->port_id, strerror(rte_errno));
1093 		goto error;
1094 	}
1095 	/* Set started flag here for the following steps like control flow. */
1096 	dev->data->dev_started = 1;
1097 	ret = mlx5_rx_intr_vec_enable(dev);
1098 	if (ret) {
1099 		DRV_LOG(ERR, "port %u Rx interrupt vector creation failed",
1100 			dev->data->port_id);
1101 		goto error;
1102 	}
1103 	mlx5_os_stats_init(dev);
1104 	ret = mlx5_traffic_enable(dev);
1105 	if (ret) {
1106 		DRV_LOG(ERR, "port %u failed to set defaults flows",
1107 			dev->data->port_id);
1108 		goto error;
1109 	}
1110 	/* Set a mask and offset of dynamic metadata flows into Rx queues. */
1111 	mlx5_flow_rxq_dynf_metadata_set(dev);
1112 	/* Set flags and context to convert Rx timestamps. */
1113 	mlx5_rxq_timestamp_set(dev);
1114 	/* Set a mask and offset of scheduling on timestamp into Tx queues. */
1115 	mlx5_txq_dynf_timestamp_set(dev);
1116 	/*
1117 	 * In non-cached mode, only the default mreg copy action needs to be
1118 	 * started, since no flow created by the application exists anymore.
1119 	 * Still, wrapping the interface is worthwhile for further usage.
1120 	 */
1121 	ret = mlx5_flow_start_default(dev);
1122 	if (ret) {
1123 		DRV_LOG(DEBUG, "port %u failed to start default actions: %s",
1124 			dev->data->port_id, strerror(rte_errno));
1125 		goto error;
1126 	}
1127 	rte_wmb();
1128 	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
1129 	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
1130 	/* Enable datapath on secondary process. */
1131 	mlx5_mp_os_req_start_rxtx(dev);
1132 	if (priv->sh->intr_handle.fd >= 0) {
1133 		priv->sh->port[priv->dev_port - 1].ih_port_id =
1134 					(uint32_t)dev->data->port_id;
1135 	} else {
1136 		DRV_LOG(INFO, "port %u starts without LSC and RMV interrupts.",
1137 			dev->data->port_id);
1138 		dev->data->dev_conf.intr_conf.lsc = 0;
1139 		dev->data->dev_conf.intr_conf.rmv = 0;
1140 	}
1141 	if (priv->sh->intr_handle_devx.fd >= 0)
1142 		priv->sh->port[priv->dev_port - 1].devx_ih_port_id =
1143 					(uint32_t)dev->data->port_id;
1144 	return 0;
1145 error:
1146 	ret = rte_errno; /* Save rte_errno before cleanup. */
1147 	/* Rollback. */
1148 	dev->data->dev_started = 0;
1149 	mlx5_flow_stop_default(dev);
1150 	mlx5_traffic_disable(dev);
1151 	mlx5_txq_stop(dev);
1152 	mlx5_rxq_stop(dev);
1153 	if (priv->obj_ops.lb_dummy_queue_release)
1154 		priv->obj_ops.lb_dummy_queue_release(dev);
1155 	mlx5_txpp_stop(dev); /* Stop last. */
1156 	rte_errno = ret; /* Restore rte_errno. */
1157 	return -rte_errno;
1158 }
1159 
1160 /**
1161  * DPDK callback to stop the device.
1162  *
1163  * Simulate device stop by detaching all configured flows.
1164  *
1165  * @param dev
1166  *   Pointer to Ethernet device structure.
1167  */
1168 int
1169 mlx5_dev_stop(struct rte_eth_dev *dev)
1170 {
1171 	struct mlx5_priv *priv = dev->data->dev_private;
1172 
1173 	dev->data->dev_started = 0;
1174 	/* Prevent crashes when queues are still in use. */
1175 	dev->rx_pkt_burst = removed_rx_burst;
1176 	dev->tx_pkt_burst = removed_tx_burst;
1177 	rte_wmb();
1178 	/* Disable datapath on secondary process. */
1179 	mlx5_mp_os_req_stop_rxtx(dev);
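	/*
	 * Allow time (about 1 ms per Rx queue) for in-flight Rx/Tx bursts on
	 * other lcores to complete after the burst functions were replaced.
	 */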
1180 	rte_delay_us_sleep(1000 * priv->rxqs_n);
1181 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
1182 	mlx5_flow_stop_default(dev);
1183 	/* Control flows for default traffic can be removed first. */
1184 	mlx5_traffic_disable(dev);
1185 	/* All RX queue flags will be cleared in the flush interface. */
1186 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_GEN, true);
1187 	mlx5_flow_meter_rxq_flush(dev);
1188 	mlx5_rx_intr_vec_disable(dev);
1189 	priv->sh->port[priv->dev_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1190 	priv->sh->port[priv->dev_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
1191 	mlx5_txq_stop(dev);
1192 	mlx5_rxq_stop(dev);
1193 	if (priv->obj_ops.lb_dummy_queue_release)
1194 		priv->obj_ops.lb_dummy_queue_release(dev);
1195 	mlx5_txpp_stop(dev);
1196 
1197 	return 0;
1198 }
1199 
1200 /**
1201  * Enable traffic flows configured by control plane
1202  *
1203  * @param dev
1204  *   Pointer to Ethernet device structure.
1207  *
1208  * @return
1209  *   0 on success, a negative errno value otherwise and rte_errno is set.
1210  */
1211 int
1212 mlx5_traffic_enable(struct rte_eth_dev *dev)
1213 {
1214 	struct mlx5_priv *priv = dev->data->dev_private;
1215 	struct rte_flow_item_eth bcast = {
1216 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1217 	};
1218 	struct rte_flow_item_eth ipv6_multi_spec = {
1219 		.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
1220 	};
1221 	struct rte_flow_item_eth ipv6_multi_mask = {
1222 		.dst.addr_bytes = "\xff\xff\x00\x00\x00\x00",
1223 	};
1224 	struct rte_flow_item_eth unicast = {
1225 		.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1226 	};
1227 	struct rte_flow_item_eth unicast_mask = {
1228 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1229 	};
1230 	const unsigned int vlan_filter_n = priv->vlan_filter_n;
1231 	const struct rte_ether_addr cmp = {
1232 		.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1233 	};
1234 	unsigned int i;
1235 	unsigned int j;
1236 	int ret;
1237 
1238 	/*
1239 	 * The hairpin Tx queue default flow should be created regardless of
1240 	 * isolation mode. Otherwise, all the packets to be sent will go out
1241 	 * directly without the Tx flow actions, e.g. encapsulation.
1242 	 */
1243 	for (i = 0; i != priv->txqs_n; ++i) {
1244 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
1245 		if (!txq_ctrl)
1246 			continue;
1247 		/* Only Tx implicit mode requires the default Tx flow. */
1248 		if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN &&
1249 		    txq_ctrl->hairpin_conf.tx_explicit == 0 &&
1250 		    txq_ctrl->hairpin_conf.peers[0].port ==
1251 		    priv->dev_data->port_id) {
1252 			ret = mlx5_ctrl_flow_source_queue(dev, i);
1253 			if (ret) {
1254 				mlx5_txq_release(dev, i);
1255 				goto error;
1256 			}
1257 		}
1258 		mlx5_txq_release(dev, i);
1259 	}
1260 	if (priv->config.dv_esw_en && !priv->config.vf && !priv->config.sf) {
1261 		if (mlx5_flow_create_esw_table_zero_flow(dev))
1262 			priv->fdb_def_rule = 1;
1263 		else
1264 			DRV_LOG(INFO, "port %u FDB default rule cannot be"
1265 				" configured - only Eswitch group 0 flows are"
1266 				" supported.", dev->data->port_id);
1267 	}
1268 	if (!priv->config.lacp_by_user && priv->pf_bond >= 0) {
1269 		ret = mlx5_flow_lacp_miss(dev);
1270 		if (ret)
1271 			DRV_LOG(INFO, "port %u LACP rule cannot be created - "
1272 				"forward LACP to kernel.", dev->data->port_id);
1273 		else
1274 			DRV_LOG(INFO, "LACP traffic will be missed in port %u."
1275 				, dev->data->port_id);
1276 	}
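	/*
	 * In isolated mode the application manages all the flows itself,
	 * so no default control flows are created below.
	 */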
1277 	if (priv->isolated)
1278 		return 0;
1279 	if (dev->data->promiscuous) {
1280 		struct rte_flow_item_eth promisc = {
1281 			.dst.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1282 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1283 			.type = 0,
1284 		};
1285 
1286 		ret = mlx5_ctrl_flow(dev, &promisc, &promisc);
1287 		if (ret)
1288 			goto error;
1289 	}
1290 	if (dev->data->all_multicast) {
1291 		struct rte_flow_item_eth multicast = {
1292 			.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
1293 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1294 			.type = 0,
1295 		};
1296 
1297 		ret = mlx5_ctrl_flow(dev, &multicast, &multicast);
1298 		if (ret)
1299 			goto error;
1300 	} else {
1301 		/* Add broadcast/multicast flows. */
1302 		for (i = 0; i != vlan_filter_n; ++i) {
1303 			uint16_t vlan = priv->vlan_filter[i];
1304 
1305 			struct rte_flow_item_vlan vlan_spec = {
1306 				.tci = rte_cpu_to_be_16(vlan),
1307 			};
1308 			struct rte_flow_item_vlan vlan_mask =
1309 				rte_flow_item_vlan_mask;
1310 
1311 			ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast,
1312 						  &vlan_spec, &vlan_mask);
1313 			if (ret)
1314 				goto error;
1315 			ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec,
1316 						  &ipv6_multi_mask,
1317 						  &vlan_spec, &vlan_mask);
1318 			if (ret)
1319 				goto error;
1320 		}
1321 		if (!vlan_filter_n) {
1322 			ret = mlx5_ctrl_flow(dev, &bcast, &bcast);
1323 			if (ret)
1324 				goto error;
1325 			ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec,
1326 					     &ipv6_multi_mask);
1327 			if (ret) {
1328 				/* Do not fail on IPv6 broadcast creation failure. */
1329 				DRV_LOG(WARNING,
1330 					"IPv6 broadcast is not supported");
1331 				ret = 0;
1332 			}
1333 		}
1334 	}
1335 	/* Add MAC address flows. */
1336 	for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
1337 		struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
1338 
1339 		if (!memcmp(mac, &cmp, sizeof(*mac)))
1340 			continue;
1341 		memcpy(&unicast.dst.addr_bytes,
1342 		       mac->addr_bytes,
1343 		       RTE_ETHER_ADDR_LEN);
1344 		for (j = 0; j != vlan_filter_n; ++j) {
1345 			uint16_t vlan = priv->vlan_filter[j];
1346 
1347 			struct rte_flow_item_vlan vlan_spec = {
1348 				.tci = rte_cpu_to_be_16(vlan),
1349 			};
1350 			struct rte_flow_item_vlan vlan_mask =
1351 				rte_flow_item_vlan_mask;
1352 
1353 			ret = mlx5_ctrl_flow_vlan(dev, &unicast,
1354 						  &unicast_mask,
1355 						  &vlan_spec,
1356 						  &vlan_mask);
1357 			if (ret)
1358 				goto error;
1359 		}
1360 		if (!vlan_filter_n) {
1361 			ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask);
1362 			if (ret)
1363 				goto error;
1364 		}
1365 	}
1366 	return 0;
1367 error:
1368 	ret = rte_errno; /* Save rte_errno before cleanup. */
1369 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1370 	rte_errno = ret; /* Restore rte_errno. */
1371 	return -rte_errno;
1372 }
1373 
1375 /**
1376  * Disable traffic flows configured by control plane
1377  *
1378  * @param dev
1379  *   Pointer to Ethernet device structure.
1380  */
1381 void
1382 mlx5_traffic_disable(struct rte_eth_dev *dev)
1383 {
1384 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1385 }
1386 
1387 /**
1388  * Restart traffic flows configured by control plane
1389  *
1390  * @param dev
1391  *   Pointer to Ethernet device structure.
1392  *
1393  * @return
1394  *   0 on success, a negative errno value otherwise and rte_errno is set.
1395  */
1396 int
1397 mlx5_traffic_restart(struct rte_eth_dev *dev)
1398 {
1399 	if (dev->data->dev_started) {
1400 		mlx5_traffic_disable(dev);
1401 		return mlx5_traffic_enable(dev);
1402 	}
1403 	return 0;
1404 }
1405