xref: /dpdk/drivers/net/mlx5/mlx5_trigger.c (revision 9f3b3a96dec2f4c01cc92a132d763b8887d29e6a)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <unistd.h>
7 
8 #include <rte_ether.h>
9 #include <rte_ethdev_driver.h>
10 #include <rte_interrupts.h>
11 #include <rte_alarm.h>
12 
13 #include <mlx5_malloc.h>
14 
15 #include "mlx5.h"
16 #include "mlx5_mr.h"
17 #include "mlx5_rxtx.h"
18 #include "mlx5_utils.h"
19 #include "rte_pmd_mlx5.h"
20 
21 /**
22  * Stop traffic on Tx queues.
23  *
24  * @param dev
25  *   Pointer to Ethernet device structure.
26  */
27 static void
28 mlx5_txq_stop(struct rte_eth_dev *dev)
29 {
30 	struct mlx5_priv *priv = dev->data->dev_private;
31 	unsigned int i;
32 
33 	for (i = 0; i != priv->txqs_n; ++i)
34 		mlx5_txq_release(dev, i);
35 }
36 
37 /**
38  * Start traffic on Tx queues.
39  *
40  * @param dev
41  *   Pointer to Ethernet device structure.
42  *
43  * @return
44  *   0 on success, a negative errno value otherwise and rte_errno is set.
45  */
46 static int
47 mlx5_txq_start(struct rte_eth_dev *dev)
48 {
49 	struct mlx5_priv *priv = dev->data->dev_private;
50 	unsigned int i;
51 	int ret;
52 
53 	for (i = 0; i != priv->txqs_n; ++i) {
54 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
55 		struct mlx5_txq_data *txq_data;
56 		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
57 		if (!txq_ctrl)
58 			continue;
59 		txq_data = &txq_ctrl->txq;
60 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD)
61 			txq_alloc_elts(txq_ctrl);
62 		MLX5_ASSERT(!txq_ctrl->obj);
63 		txq_ctrl->obj = mlx5_malloc(flags, sizeof(struct mlx5_txq_obj),
64 					    0, txq_ctrl->socket);
65 		if (!txq_ctrl->obj) {
66 			DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
67 				"memory resources.", dev->data->port_id,
68 				txq_data->idx);
69 			rte_errno = ENOMEM;
70 			goto error;
71 		}
72 		ret = priv->obj_ops.txq_obj_new(dev, i);
73 		if (ret < 0) {
74 			mlx5_free(txq_ctrl->obj);
75 			txq_ctrl->obj = NULL;
76 			goto error;
77 		}
78 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD) {
79 			size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
80 			txq_data->fcqs = mlx5_malloc(flags, size,
81 						     RTE_CACHE_LINE_SIZE,
82 						     txq_ctrl->socket);
83 			if (!txq_data->fcqs) {
84 				DRV_LOG(ERR, "Port %u Tx queue %u cannot "
85 					"allocate memory (FCQ).",
86 					dev->data->port_id, i);
87 				rte_errno = ENOMEM;
88 				goto error;
89 			}
90 		}
91 		DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
92 			dev->data->port_id, i, (void *)&txq_ctrl->obj);
93 		LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
94 	}
95 	return 0;
96 error:
97 	ret = rte_errno; /* Save rte_errno before cleanup. */
98 	do {
99 		mlx5_txq_release(dev, i);
100 	} while (i-- != 0);
101 	rte_errno = ret; /* Restore rte_errno. */
102 	return -rte_errno;
103 }
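/*
 * Editor's illustrative sketch (not part of the driver): the cleanup loop
 * above uses the "do { ... } while (i-- != 0)" idiom to release queue
 * indexes i, i - 1, ..., 0 starting from the index that failed. A minimal
 * standalone demo of that iteration order, assuming nothing from mlx5:
 */
#include <stdio.h>

static void
rollback_order_demo(unsigned int failed_idx)
{
	unsigned int i = failed_idx;

	do {
		/* Visits failed_idx, failed_idx - 1, ..., 0, then stops. */
		printf("release queue %u\n", i);
	} while (i-- != 0);
}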
104 
105 /**
106  * Stop traffic on Rx queues.
107  *
108  * @param dev
109  *   Pointer to Ethernet device structure.
110  */
111 static void
112 mlx5_rxq_stop(struct rte_eth_dev *dev)
113 {
114 	struct mlx5_priv *priv = dev->data->dev_private;
115 	unsigned int i;
116 
117 	for (i = 0; i != priv->rxqs_n; ++i)
118 		mlx5_rxq_release(dev, i);
119 }
120 
121 /**
122  * Start traffic on Rx queues.
123  *
124  * @param dev
125  *   Pointer to Ethernet device structure.
126  *
127  * @return
128  *   0 on success, a negative errno value otherwise and rte_errno is set.
129  */
130 static int
131 mlx5_rxq_start(struct rte_eth_dev *dev)
132 {
133 	struct mlx5_priv *priv = dev->data->dev_private;
134 	unsigned int i;
135 	int ret = 0;
136 
137 	/* Allocate/reuse/resize mempool for Multi-Packet RQ. */
138 	if (mlx5_mprq_alloc_mp(dev)) {
139 		/* Should not release Rx queues but return immediately. */
140 		return -rte_errno;
141 	}
142 	DRV_LOG(DEBUG, "Port %u device_attr.max_qp_wr is %d.",
143 		dev->data->port_id, priv->sh->device_attr.max_qp_wr);
144 	DRV_LOG(DEBUG, "Port %u device_attr.max_sge is %d.",
145 		dev->data->port_id, priv->sh->device_attr.max_sge);
146 	for (i = 0; i != priv->rxqs_n; ++i) {
147 		struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i);
148 
149 		if (!rxq_ctrl)
150 			continue;
151 		if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) {
152 			/* Pre-register Rx mempools. */
153 			if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) {
154 				mlx5_mr_update_mp(dev, &rxq_ctrl->rxq.mr_ctrl,
155 						  rxq_ctrl->rxq.mprq_mp);
156 			} else {
157 				uint32_t s;
158 
159 				for (s = 0; s < rxq_ctrl->rxq.rxseg_n; s++)
160 					mlx5_mr_update_mp
161 						(dev, &rxq_ctrl->rxq.mr_ctrl,
162 						rxq_ctrl->rxq.rxseg[s].mp);
163 			}
164 			ret = rxq_alloc_elts(rxq_ctrl);
165 			if (ret)
166 				goto error;
167 		}
168 		MLX5_ASSERT(!rxq_ctrl->obj);
169 		rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
170 					    sizeof(*rxq_ctrl->obj), 0,
171 					    rxq_ctrl->socket);
172 		if (!rxq_ctrl->obj) {
173 			DRV_LOG(ERR,
174 				"Port %u Rx queue %u can't allocate resources.",
175 				dev->data->port_id, (*priv->rxqs)[i]->idx);
176 			rte_errno = ENOMEM;
177 			goto error;
178 		}
179 		ret = priv->obj_ops.rxq_obj_new(dev, i);
180 		if (ret) {
181 			mlx5_free(rxq_ctrl->obj);
182 			goto error;
183 		}
184 		DRV_LOG(DEBUG, "Port %u rxq %u updated with %p.",
185 			dev->data->port_id, i, (void *)&rxq_ctrl->obj);
186 		LIST_INSERT_HEAD(&priv->rxqsobj, rxq_ctrl->obj, next);
187 	}
188 	return 0;
189 error:
190 	ret = rte_errno; /* Save rte_errno before cleanup. */
191 	do {
192 		mlx5_rxq_release(dev, i);
193 	} while (i-- != 0);
194 	rte_errno = ret; /* Restore rte_errno. */
195 	return -rte_errno;
196 }
197 
198 /**
199  * Binds Tx queues to Rx queues for hairpin.
200  *
201  * Auto-binds Tx queues to their target Rx queues on the same device.
202  *
203  * @param dev
204  *   Pointer to Ethernet device structure.
205  *
206  * @return
207  *   0 on success, a negative errno value otherwise and rte_errno is set.
208  */
209 static int
210 mlx5_hairpin_auto_bind(struct rte_eth_dev *dev)
211 {
212 	struct mlx5_priv *priv = dev->data->dev_private;
213 	struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
214 	struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
215 	struct mlx5_txq_ctrl *txq_ctrl;
216 	struct mlx5_rxq_ctrl *rxq_ctrl;
217 	struct mlx5_devx_obj *sq;
218 	struct mlx5_devx_obj *rq;
219 	unsigned int i;
220 	int ret = 0;
221 	bool need_auto = false;
222 	uint16_t self_port = dev->data->port_id;
223 
224 	for (i = 0; i != priv->txqs_n; ++i) {
225 		txq_ctrl = mlx5_txq_get(dev, i);
226 		if (!txq_ctrl)
227 			continue;
228 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
229 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
230 			/* Drop the reference taken by mlx5_txq_get(). */
231 			mlx5_txq_release(dev, i);
232 			continue;
233 		}
234 		if (txq_ctrl->hairpin_conf.manual_bind) {
235 			mlx5_txq_release(dev, i);
236 			return 0;
237 		}
238 		need_auto = true;
239 		mlx5_txq_release(dev, i);
240 	}
241 	if (!need_auto)
242 		return 0;
243 	for (i = 0; i != priv->txqs_n; ++i) {
244 		txq_ctrl = mlx5_txq_get(dev, i);
245 		if (!txq_ctrl)
246 			continue;
247 		/* Skip non-hairpin queues and those peering with other ports. */
248 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
249 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
250 			/* Drop the reference taken by mlx5_txq_get(). */
251 			mlx5_txq_release(dev, i);
252 			continue;
253 		}
254 		if (!txq_ctrl->obj) {
255 			rte_errno = ENOMEM;
256 			DRV_LOG(ERR, "port %u no txq object found: %d",
257 				dev->data->port_id, i);
258 			mlx5_txq_release(dev, i);
259 			return -rte_errno;
260 		}
261 		sq = txq_ctrl->obj->sq;
262 		rxq_ctrl = mlx5_rxq_get(dev,
263 					txq_ctrl->hairpin_conf.peers[0].queue);
264 		if (!rxq_ctrl) {
265 			mlx5_txq_release(dev, i);
266 			rte_errno = EINVAL;
267 			DRV_LOG(ERR, "port %u no rxq object found: %d",
268 				dev->data->port_id,
269 				txq_ctrl->hairpin_conf.peers[0].queue);
270 			return -rte_errno;
271 		}
272 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN ||
273 		    rxq_ctrl->hairpin_conf.peers[0].queue != i) {
274 			rte_errno = ENOMEM;
275 			DRV_LOG(ERR, "port %u Tx queue %d can't be bound to "
276 				"Rx queue %d", dev->data->port_id,
277 				i, txq_ctrl->hairpin_conf.peers[0].queue);
278 			goto error;
279 		}
280 		rq = rxq_ctrl->obj->rq;
281 		if (!rq) {
282 			rte_errno = ENOMEM;
283 			DRV_LOG(ERR, "port %u hairpin no matching rxq: %d",
284 				dev->data->port_id,
285 				txq_ctrl->hairpin_conf.peers[0].queue);
286 			goto error;
287 		}
288 		sq_attr.state = MLX5_SQC_STATE_RDY;
289 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
290 		sq_attr.hairpin_peer_rq = rq->id;
291 		sq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
292 		ret = mlx5_devx_cmd_modify_sq(sq, &sq_attr);
293 		if (ret)
294 			goto error;
295 		rq_attr.state = MLX5_SQC_STATE_RDY;
296 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
297 		rq_attr.hairpin_peer_sq = sq->id;
298 		rq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
299 		ret = mlx5_devx_cmd_modify_rq(rq, &rq_attr);
300 		if (ret)
301 			goto error;
302 		/* Qs with auto-bind will be destroyed directly. */
303 		rxq_ctrl->hairpin_status = 1;
304 		txq_ctrl->hairpin_status = 1;
305 		mlx5_txq_release(dev, i);
306 		mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue);
307 	}
308 	return 0;
309 error:
310 	mlx5_txq_release(dev, i);
311 	mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue);
312 	return -rte_errno;
313 }
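/*
 * Editor's illustrative sketch (not part of the driver): the auto-bind
 * above only runs for hairpin Tx queues whose peer is an Rx queue on the
 * same port and which did not request manual binding. A hedged
 * application-side sketch of such a configuration; queue indexes and the
 * descriptor count are illustrative assumptions.
 */
#include <rte_ethdev.h>

static int
setup_same_port_hairpin(uint16_t port_id, uint16_t rxq, uint16_t txq)
{
	struct rte_eth_hairpin_conf conf = {
		.peer_count = 1,
		.manual_bind = 0,	/* Let the PMD auto-bind at start. */
		.tx_explicit = 0,	/* Implicit Tx flow mode. */
	};
	int ret;

	conf.peers[0].port = port_id;	/* Peer is the same port. */
	conf.peers[0].queue = txq;
	ret = rte_eth_rx_hairpin_queue_setup(port_id, rxq, 256, &conf);
	if (ret != 0)
		return ret;
	conf.peers[0].queue = rxq;
	return rte_eth_tx_hairpin_queue_setup(port_id, txq, 256, &conf);
}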
314 
315 /*
316  * Fetch the peer queue's SW & HW information.
317  *
318  * @param dev
319  *   Pointer to Ethernet device structure.
320  * @param peer_queue
321  *   Index of the queue to fetch the information.
322  * @param current_info
323  *   Pointer to the input peer information, not used currently.
324  * @param peer_info
325  *   Pointer to the structure to store the information, output.
326  * @param direction
327  *   Positive to get the RxQ information, zero to get the TxQ information.
328  *
329  * @return
330  *   0 on success, a negative errno value otherwise and rte_errno is set.
331  */
332 int
333 mlx5_hairpin_queue_peer_update(struct rte_eth_dev *dev, uint16_t peer_queue,
334 			       struct rte_hairpin_peer_info *current_info,
335 			       struct rte_hairpin_peer_info *peer_info,
336 			       uint32_t direction)
337 {
338 	struct mlx5_priv *priv = dev->data->dev_private;
339 	RTE_SET_USED(current_info);
340 
341 	if (dev->data->dev_started == 0) {
342 		rte_errno = EBUSY;
343 		DRV_LOG(ERR, "peer port %u is not started",
344 			dev->data->port_id);
345 		return -rte_errno;
346 	}
347 	/*
348 	 * Peer port used as egress. In the current design, the hairpin Tx
349 	 * queue will be bound to the peer Rx queue, so in practice only the
350 	 * peer Rx queue information needs to be fetched.
351 	 */
352 	if (direction == 0) {
353 		struct mlx5_txq_ctrl *txq_ctrl;
354 
355 		txq_ctrl = mlx5_txq_get(dev, peer_queue);
356 		if (txq_ctrl == NULL) {
357 			rte_errno = EINVAL;
358 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
359 				dev->data->port_id, peer_queue);
360 			return -rte_errno;
361 		}
362 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
363 			rte_errno = EINVAL;
364 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Txq",
365 				dev->data->port_id, peer_queue);
366 			mlx5_txq_release(dev, peer_queue);
367 			return -rte_errno;
368 		}
369 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
370 			rte_errno = ENOMEM;
371 			DRV_LOG(ERR, "port %u no Txq object found: %d",
372 				dev->data->port_id, peer_queue);
373 			mlx5_txq_release(dev, peer_queue);
374 			return -rte_errno;
375 		}
376 		peer_info->qp_id = txq_ctrl->obj->sq->id;
377 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
378 		/* 1-to-1 mapping, only the first one is used. */
379 		peer_info->peer_q = txq_ctrl->hairpin_conf.peers[0].queue;
380 		peer_info->tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
381 		peer_info->manual_bind = txq_ctrl->hairpin_conf.manual_bind;
382 		mlx5_txq_release(dev, peer_queue);
383 	} else { /* Peer port used as ingress. */
384 		struct mlx5_rxq_ctrl *rxq_ctrl;
385 
386 		rxq_ctrl = mlx5_rxq_get(dev, peer_queue);
387 		if (rxq_ctrl == NULL) {
388 			rte_errno = EINVAL;
389 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
390 				dev->data->port_id, peer_queue);
391 			return -rte_errno;
392 		}
393 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
394 			rte_errno = EINVAL;
395 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Rxq",
396 				dev->data->port_id, peer_queue);
397 			mlx5_rxq_release(dev, peer_queue);
398 			return -rte_errno;
399 		}
400 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
401 			rte_errno = ENOMEM;
402 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
403 				dev->data->port_id, peer_queue);
404 			mlx5_rxq_release(dev, peer_queue);
405 			return -rte_errno;
406 		}
407 		peer_info->qp_id = rxq_ctrl->obj->rq->id;
408 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
409 		peer_info->peer_q = rxq_ctrl->hairpin_conf.peers[0].queue;
410 		peer_info->tx_explicit = rxq_ctrl->hairpin_conf.tx_explicit;
411 		peer_info->manual_bind = rxq_ctrl->hairpin_conf.manual_bind;
412 		mlx5_rxq_release(dev, peer_queue);
413 	}
414 	return 0;
415 }
416 
417 /*
418  * Bind the hairpin queue with the peer HW information.
419  * This needs to be called twice, once for each (Tx and Rx) queue of a pair.
420  * If the queue is already bound, it is considered successful.
421  *
422  * @param dev
423  *   Pointer to Ethernet device structure.
424  * @param cur_queue
425  *   Index of the queue to change the HW configuration to bind.
426  * @param peer_info
427  *   Pointer to information of the peer queue.
428  * @param direction
429  *   Positive to configure the TxQ, zero to configure the RxQ.
430  *
431  * @return
432  *   0 on success, a negative errno value otherwise and rte_errno is set.
433  */
434 int
435 mlx5_hairpin_queue_peer_bind(struct rte_eth_dev *dev, uint16_t cur_queue,
436 			     struct rte_hairpin_peer_info *peer_info,
437 			     uint32_t direction)
438 {
439 	int ret = 0;
440 
441 	/*
442 	 * Peer queue consistency check: the opposite direction was used to get
443 	 * the peer info via the ethdev port ID, so only the queue is checked.
444 	 */
445 	if (peer_info->peer_q != cur_queue) {
446 		rte_errno = EINVAL;
447 		DRV_LOG(ERR, "port %u queue %d and peer queue %d mismatch",
448 			dev->data->port_id, cur_queue, peer_info->peer_q);
449 		return -rte_errno;
450 	}
451 	if (direction != 0) {
452 		struct mlx5_txq_ctrl *txq_ctrl;
453 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
454 
455 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
456 		if (txq_ctrl == NULL) {
457 			rte_errno = EINVAL;
458 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
459 				dev->data->port_id, cur_queue);
460 			return -rte_errno;
461 		}
462 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
463 			rte_errno = EINVAL;
464 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
465 				dev->data->port_id, cur_queue);
466 			mlx5_txq_release(dev, cur_queue);
467 			return -rte_errno;
468 		}
469 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
470 			rte_errno = ENOMEM;
471 			DRV_LOG(ERR, "port %u no Txq object found: %d",
472 				dev->data->port_id, cur_queue);
473 			mlx5_txq_release(dev, cur_queue);
474 			return -rte_errno;
475 		}
476 		if (txq_ctrl->hairpin_status != 0) {
477 			DRV_LOG(DEBUG, "port %u Tx queue %d is already bound",
478 				dev->data->port_id, cur_queue);
479 			mlx5_txq_release(dev, cur_queue);
480 			return 0;
481 		}
482 		/*
483 		 * Consistency checking across all queues of one port is done
484 		 * in the bind() function, and that is optional.
485 		 */
486 		if (peer_info->tx_explicit !=
487 		    txq_ctrl->hairpin_conf.tx_explicit) {
488 			rte_errno = EINVAL;
489 			DRV_LOG(ERR, "port %u Tx queue %d and peer Tx rule mode"
490 				" mismatch", dev->data->port_id, cur_queue);
491 			mlx5_txq_release(dev, cur_queue);
492 			return -rte_errno;
493 		}
494 		if (peer_info->manual_bind !=
495 		    txq_ctrl->hairpin_conf.manual_bind) {
496 			rte_errno = EINVAL;
497 			DRV_LOG(ERR, "port %u Tx queue %d and peer binding mode"
498 				" mismatch", dev->data->port_id, cur_queue);
499 			mlx5_txq_release(dev, cur_queue);
500 			return -rte_errno;
501 		}
502 		sq_attr.state = MLX5_SQC_STATE_RDY;
503 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
504 		sq_attr.hairpin_peer_rq = peer_info->qp_id;
505 		sq_attr.hairpin_peer_vhca = peer_info->vhca_id;
506 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
507 		if (ret == 0)
508 			txq_ctrl->hairpin_status = 1;
509 		mlx5_txq_release(dev, cur_queue);
510 	} else {
511 		struct mlx5_rxq_ctrl *rxq_ctrl;
512 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
513 
514 		rxq_ctrl = mlx5_rxq_get(dev, cur_queue);
515 		if (rxq_ctrl == NULL) {
516 			rte_errno = EINVAL;
517 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
518 				dev->data->port_id, cur_queue);
519 			return -rte_errno;
520 		}
521 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
522 			rte_errno = EINVAL;
523 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
524 				dev->data->port_id, cur_queue);
525 			mlx5_rxq_release(dev, cur_queue);
526 			return -rte_errno;
527 		}
528 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
529 			rte_errno = ENOMEM;
530 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
531 				dev->data->port_id, cur_queue);
532 			mlx5_rxq_release(dev, cur_queue);
533 			return -rte_errno;
534 		}
535 		if (rxq_ctrl->hairpin_status != 0) {
536 			DRV_LOG(DEBUG, "port %u Rx queue %d is already bound",
537 				dev->data->port_id, cur_queue);
538 			mlx5_rxq_release(dev, cur_queue);
539 			return 0;
540 		}
541 		if (peer_info->tx_explicit !=
542 		    rxq_ctrl->hairpin_conf.tx_explicit) {
543 			rte_errno = EINVAL;
544 			DRV_LOG(ERR, "port %u Rx queue %d and peer Tx rule mode"
545 				" mismatch", dev->data->port_id, cur_queue);
546 			mlx5_rxq_release(dev, cur_queue);
547 			return -rte_errno;
548 		}
549 		if (peer_info->manual_bind !=
550 		    rxq_ctrl->hairpin_conf.manual_bind) {
551 			rte_errno = EINVAL;
552 			DRV_LOG(ERR, "port %u Rx queue %d and peer binding mode"
553 				" mismatch", dev->data->port_id, cur_queue);
554 			mlx5_rxq_release(dev, cur_queue);
555 			return -rte_errno;
556 		}
557 		rq_attr.state = MLX5_SQC_STATE_RDY;
558 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
559 		rq_attr.hairpin_peer_sq = peer_info->qp_id;
560 		rq_attr.hairpin_peer_vhca = peer_info->vhca_id;
561 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
562 		if (ret == 0)
563 			rxq_ctrl->hairpin_status = 1;
564 		mlx5_rxq_release(dev, cur_queue);
565 	}
566 	return ret;
567 }
568 
569 /*
570  * Unbind the hairpin queue and reset its HW configuration.
571  * This needs to be called twice, once for each (Tx and Rx) queue of a pair.
572  * If the queue is already unbound, it is considered successful.
573  *
574  * @param dev
575  *   Pointer to Ethernet device structure.
576  * @param cur_queue
577  *   Index of the queue to change the HW configuration to unbind.
578  * @param direction
579  *   Positive to reset the TxQ, zero to reset the RxQ.
580  *
581  * @return
582  *   0 on success, a negative errno value otherwise and rte_errno is set.
583  */
584 int
585 mlx5_hairpin_queue_peer_unbind(struct rte_eth_dev *dev, uint16_t cur_queue,
586 			       uint32_t direction)
587 {
588 	int ret = 0;
589 
590 	if (direction != 0) {
591 		struct mlx5_txq_ctrl *txq_ctrl;
592 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
593 
594 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
595 		if (txq_ctrl == NULL) {
596 			rte_errno = EINVAL;
597 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
598 				dev->data->port_id, cur_queue);
599 			return -rte_errno;
600 		}
601 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
602 			rte_errno = EINVAL;
603 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
604 				dev->data->port_id, cur_queue);
605 			mlx5_txq_release(dev, cur_queue);
606 			return -rte_errno;
607 		}
608 		/* Already unbound, return success before obj checking. */
609 		if (txq_ctrl->hairpin_status == 0) {
610 			DRV_LOG(DEBUG, "port %u Tx queue %d is already unbound",
611 				dev->data->port_id, cur_queue);
612 			mlx5_txq_release(dev, cur_queue);
613 			return 0;
614 		}
615 		if (!txq_ctrl->obj || !txq_ctrl->obj->sq) {
616 			rte_errno = ENOMEM;
617 			DRV_LOG(ERR, "port %u no Txq object found: %d",
618 				dev->data->port_id, cur_queue);
619 			mlx5_txq_release(dev, cur_queue);
620 			return -rte_errno;
621 		}
622 		sq_attr.state = MLX5_SQC_STATE_RST;
623 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
624 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
625 		if (ret == 0)
626 			txq_ctrl->hairpin_status = 0;
627 		mlx5_txq_release(dev, cur_queue);
628 	} else {
629 		struct mlx5_rxq_ctrl *rxq_ctrl;
630 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
631 
632 		rxq_ctrl = mlx5_rxq_get(dev, cur_queue);
633 		if (rxq_ctrl == NULL) {
634 			rte_errno = EINVAL;
635 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
636 				dev->data->port_id, cur_queue);
637 			return -rte_errno;
638 		}
639 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
640 			rte_errno = EINVAL;
641 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
642 				dev->data->port_id, cur_queue);
643 			mlx5_rxq_release(dev, cur_queue);
644 			return -rte_errno;
645 		}
646 		if (rxq_ctrl->hairpin_status == 0) {
647 			DRV_LOG(DEBUG, "port %u Rx queue %d is already unbound",
648 				dev->data->port_id, cur_queue);
649 			mlx5_rxq_release(dev, cur_queue);
650 			return 0;
651 		}
652 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
653 			rte_errno = ENOMEM;
654 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
655 				dev->data->port_id, cur_queue);
656 			mlx5_rxq_release(dev, cur_queue);
657 			return -rte_errno;
658 		}
659 		rq_attr.state = MLX5_SQC_STATE_RST;
660 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
661 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
662 		if (ret == 0)
663 			rxq_ctrl->hairpin_status = 0;
664 		mlx5_rxq_release(dev, cur_queue);
665 	}
666 	return ret;
667 }
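/*
 * Editor's illustrative sketch (not part of the driver): the three helpers
 * above are driven per queue pair from the Tx side (see
 * mlx5_hairpin_bind_single_port() below). A simplified sketch of that
 * sequence; rollback and the Tx-side qp_id/vhca_id fill-in (taken from the
 * SQ object in the real code) are omitted.
 */
static int
bind_one_pair_sketch(struct rte_eth_dev *tx_dev, uint16_t tx_queue,
		     uint16_t rx_port, uint16_t rx_queue)
{
	struct rte_hairpin_peer_info rx_info = { 0 };
	struct rte_hairpin_peer_info tx_info = { 0 };
	int ret;

	/* 1. Fetch the peer Rx queue HW info (direction 1 = Rx side). */
	ret = rte_eth_hairpin_queue_peer_update(rx_port, rx_queue,
						NULL, &rx_info, 1);
	if (ret != 0)
		return ret;
	/* 2. Bind the local Tx queue to the fetched Rx info. */
	ret = mlx5_hairpin_queue_peer_bind(tx_dev, tx_queue, &rx_info, 1);
	if (ret != 0)
		return ret;
	/* 3. Pass the Tx queue info to the Rx port and bind its queue. */
	tx_info.peer_q = rx_queue;
	return rte_eth_hairpin_queue_peer_bind(rx_port, rx_queue, &tx_info, 0);
}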
668 
669 /*
670  * Bind the hairpin port pairs, from the Tx to the peer Rx.
671  * This function only supports binding the Tx port to one Rx port.
672  *
673  * @param dev
674  *   Pointer to Ethernet device structure.
675  * @param rx_port
676  *   Port identifier of the Rx port.
677  *
678  * @return
679  *   0 on success, a negative errno value otherwise and rte_errno is set.
680  */
681 static int
682 mlx5_hairpin_bind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
683 {
684 	struct mlx5_priv *priv = dev->data->dev_private;
685 	int ret = 0;
686 	struct mlx5_txq_ctrl *txq_ctrl;
687 	uint32_t i;
688 	struct rte_hairpin_peer_info peer = {0xffffff};
689 	struct rte_hairpin_peer_info cur;
690 	const struct rte_eth_hairpin_conf *conf;
691 	uint16_t num_q = 0;
692 	uint16_t local_port = priv->dev_data->port_id;
693 	uint32_t manual;
694 	uint32_t explicit;
695 	uint16_t rx_queue;
696 
697 	if (mlx5_eth_find_next(rx_port, priv->pci_dev) != rx_port) {
698 		rte_errno = ENODEV;
699 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
700 		return -rte_errno;
701 	}
702 	/*
703 	 * Before binding TxQ to peer RxQ, a first-round loop is used to check
704 	 * the queues' configuration consistency. This is a little time
705 	 * consuming but better than having to roll back afterwards.
706 	 */
707 	for (i = 0; i != priv->txqs_n; i++) {
708 		txq_ctrl = mlx5_txq_get(dev, i);
709 		if (txq_ctrl == NULL)
710 			continue;
711 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
712 			mlx5_txq_release(dev, i);
713 			continue;
714 		}
715 		/*
716 		 * All hairpin Tx queues of a single port connected to the same
717 		 * peer Rx port should have the same "auto binding" and
718 		 * "implicit Tx flow" modes.
719 		 * Peer consistency checking will be done in the per-queue binding.
720 		 */
721 		conf = &txq_ctrl->hairpin_conf;
722 		if (conf->peers[0].port == rx_port) {
723 			if (num_q == 0) {
724 				manual = conf->manual_bind;
725 				explicit = conf->tx_explicit;
726 			} else {
727 				if (manual != conf->manual_bind ||
728 				    explicit != conf->tx_explicit) {
729 					rte_errno = EINVAL;
730 					DRV_LOG(ERR, "port %u queue %d mode"
731 						" mismatch: %u %u, %u %u",
732 						local_port, i, manual,
733 						conf->manual_bind, explicit,
734 						conf->tx_explicit);
735 					mlx5_txq_release(dev, i);
736 					return -rte_errno;
737 				}
738 			}
739 			num_q++;
740 		}
741 		mlx5_txq_release(dev, i);
742 	}
743 	/* If no hairpin queue is configured, return success directly. */
744 	if (num_q == 0)
745 		return ret;
746 	/* All the hairpin TX queues need to be traversed again. */
747 	for (i = 0; i != priv->txqs_n; i++) {
748 		txq_ctrl = mlx5_txq_get(dev, i);
749 		if (txq_ctrl == NULL)
750 			continue;
751 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
752 			mlx5_txq_release(dev, i);
753 			continue;
754 		}
755 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
756 			mlx5_txq_release(dev, i);
757 			continue;
758 		}
759 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
760 		/*
761 		 * Fetch peer RxQ's information.
762 		 * No need to pass the information of the current queue.
763 		 */
764 		ret = rte_eth_hairpin_queue_peer_update(rx_port, rx_queue,
765 							NULL, &peer, 1);
766 		if (ret != 0) {
767 			mlx5_txq_release(dev, i);
768 			goto error;
769 		}
770 		/* Accessing its own device, inside mlx5 PMD. */
771 		ret = mlx5_hairpin_queue_peer_bind(dev, i, &peer, 1);
772 		if (ret != 0) {
773 			mlx5_txq_release(dev, i);
774 			goto error;
775 		}
776 		/* Pass TxQ's information to peer RxQ and try binding. */
777 		cur.peer_q = rx_queue;
778 		cur.qp_id = txq_ctrl->obj->sq->id;
779 		cur.vhca_id = priv->config.hca_attr.vhca_id;
780 		cur.tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
781 		cur.manual_bind = txq_ctrl->hairpin_conf.manual_bind;
782 		/*
783 		 * In order to access another device in a proper way, an RTE
784 		 * level private function is needed.
785 		 */
786 		ret = rte_eth_hairpin_queue_peer_bind(rx_port, rx_queue,
787 						      &cur, 0);
788 		if (ret != 0) {
789 			mlx5_txq_release(dev, i);
790 			goto error;
791 		}
792 		mlx5_txq_release(dev, i);
793 	}
794 	return 0;
795 error:
796 	/*
797 	 * Roll back the queues that were already bound.
798 	 * No need to check the return value of the queue unbind function.
799 	 */
800 	do {
801 		/* No validation is needed here. */
802 		txq_ctrl = mlx5_txq_get(dev, i);
803 		if (txq_ctrl == NULL)
804 			continue;
805 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
806 		rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
807 		mlx5_hairpin_queue_peer_unbind(dev, i, 1);
808 		mlx5_txq_release(dev, i);
809 	} while (i--);
810 	return ret;
811 }
812 
813 /*
814  * Unbind the hairpin port pair; the HW configuration of both devices will be
815  * cleared and the status reset for all the queues used between them.
816  * This function only supports unbinding the Tx port from one Rx port.
817  *
818  * @param dev
819  *   Pointer to Ethernet device structure.
820  * @param rx_port
821  *   Port identifier of the Rx port.
822  *
823  * @return
824  *   0 on success, a negative errno value otherwise and rte_errno is set.
825  */
826 static int
827 mlx5_hairpin_unbind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
828 {
829 	struct mlx5_priv *priv = dev->data->dev_private;
830 	struct mlx5_txq_ctrl *txq_ctrl;
831 	uint32_t i;
832 	int ret;
833 	uint16_t cur_port = priv->dev_data->port_id;
834 
835 	if (mlx5_eth_find_next(rx_port, priv->pci_dev) != rx_port) {
836 		rte_errno = ENODEV;
837 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
838 		return -rte_errno;
839 	}
840 	for (i = 0; i != priv->txqs_n; i++) {
841 		uint16_t rx_queue;
842 
843 		txq_ctrl = mlx5_txq_get(dev, i);
844 		if (txq_ctrl == NULL)
845 			continue;
846 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
847 			mlx5_txq_release(dev, i);
848 			continue;
849 		}
850 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
851 			mlx5_txq_release(dev, i);
852 			continue;
853 		}
854 		/* Indeed, only the first used queue needs to be checked. */
855 		if (txq_ctrl->hairpin_conf.manual_bind == 0) {
856 			/* Drop the reference taken by mlx5_txq_get(). */
857 			mlx5_txq_release(dev, i);
858 			if (cur_port != rx_port) {
859 				rte_errno = EINVAL;
860 				DRV_LOG(ERR, "port %u and port %u are in"
861 					" auto-bind mode", cur_port, rx_port);
862 				return -rte_errno;
863 			}
864 			return 0;
865 		}
866 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
867 		mlx5_txq_release(dev, i);
868 		ret = rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
869 		if (ret) {
870 			DRV_LOG(ERR, "port %u Rx queue %d unbind - failure",
871 				rx_port, rx_queue);
872 			return ret;
873 		}
874 		ret = mlx5_hairpin_queue_peer_unbind(dev, i, 1);
875 		if (ret) {
876 			DRV_LOG(ERR, "port %u Tx queue %d unbind - failure",
877 				cur_port, i);
878 			return ret;
879 		}
880 	}
881 	return 0;
882 }
883 
884 /*
885  * Bind hairpin ports; rx_port may be RTE_MAX_ETHPORTS, meaning all Rx ports.
886  * @see mlx5_hairpin_bind_single_port()
887  */
888 int
889 mlx5_hairpin_bind(struct rte_eth_dev *dev, uint16_t rx_port)
890 {
891 	int ret = 0;
892 	uint16_t p, pp;
893 	struct mlx5_priv *priv = dev->data->dev_private;
894 
895 	/*
896 	 * If the Rx port has no hairpin configuration with the current port,
897 	 * the binding will be skipped in the called function of single port.
898 	 * Device started status will be checked only before the queue
899 	 * information updating.
900 	 */
901 	if (rx_port == RTE_MAX_ETHPORTS) {
902 		MLX5_ETH_FOREACH_DEV(p, priv->pci_dev) {
903 			ret = mlx5_hairpin_bind_single_port(dev, p);
904 			if (ret != 0)
905 				goto unbind;
906 		}
907 		return ret;
908 	} else {
909 		return mlx5_hairpin_bind_single_port(dev, rx_port);
910 	}
911 unbind:
912 	MLX5_ETH_FOREACH_DEV(pp, priv->pci_dev)
913 		if (pp < p)
914 			mlx5_hairpin_unbind_single_port(dev, pp);
915 	return ret;
916 }
917 
918 /*
919  * Unbind hairpin ports; rx_port may be RTE_MAX_ETHPORTS, meaning all Rx ports.
920  * @see mlx5_hairpin_unbind_single_port()
921  */
922 int
923 mlx5_hairpin_unbind(struct rte_eth_dev *dev, uint16_t rx_port)
924 {
925 	int ret = 0;
926 	uint16_t p;
927 	struct mlx5_priv *priv = dev->data->dev_private;
928 
929 	if (rx_port == RTE_MAX_ETHPORTS)
930 		MLX5_ETH_FOREACH_DEV(p, priv->pci_dev) {
931 			ret = mlx5_hairpin_unbind_single_port(dev, p);
932 			if (ret != 0)
933 				return ret;
934 		}
935 	else
936 		ret = mlx5_hairpin_unbind_single_port(dev, rx_port);
937 	return ret;
938 }
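/*
 * Editor's illustrative sketch (not part of the driver): the two entry
 * points above back the rte_eth_hairpin_bind()/rte_eth_hairpin_unbind()
 * API. A hedged application-level usage sketch for a two-port hairpin in
 * manual-bind mode; port identifiers are illustrative assumptions.
 */
#include <rte_ethdev.h>

static int
manual_hairpin_bind_example(uint16_t tx_port, uint16_t rx_port)
{
	int ret;

	/* Bind all hairpin queues going from tx_port to rx_port. */
	ret = rte_eth_hairpin_bind(tx_port, rx_port);
	if (ret != 0)
		return ret;
	/* ... traffic runs through the hairpin here ... */
	return rte_eth_hairpin_unbind(tx_port, rx_port);
}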
939 
940 /*
941  * DPDK callback to get the hairpin peer ports list.
942  * This will return the actual number of peer ports and save the identifiers
943  * into the array (sorted, may be different from that when setting up the
944  * hairpin peer queues).
945  * The peer port ID could be the same as the port ID of the current device.
946  *
947  * @param dev
948  *   Pointer to Ethernet device structure.
949  * @param peer_ports
950  *   Pointer to array to save the port identifiers.
951  * @param len
952  *   The length of the array.
953  * @param direction
954  *   Current port to peer port direction.
955  *   positive - current used as Tx to get all peer Rx ports.
956  *   zero - current used as Rx to get all peer Tx ports.
957  *
958  * @return
959  *   0 or a positive value on success (the actual number of peer ports),
960  *   a negative errno value otherwise and rte_errno is set.
961  */
962 int
963 mlx5_hairpin_get_peer_ports(struct rte_eth_dev *dev, uint16_t *peer_ports,
964 			    size_t len, uint32_t direction)
965 {
966 	struct mlx5_priv *priv = dev->data->dev_private;
967 	struct mlx5_txq_ctrl *txq_ctrl;
968 	struct mlx5_rxq_ctrl *rxq_ctrl;
969 	uint32_t i;
970 	uint16_t pp;
971 	uint32_t bits[(RTE_MAX_ETHPORTS + 31) / 32] = {0};
972 	int ret = 0;
973 
974 	if (direction) {
975 		for (i = 0; i < priv->txqs_n; i++) {
976 			txq_ctrl = mlx5_txq_get(dev, i);
977 			if (!txq_ctrl)
978 				continue;
979 			if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
980 				mlx5_txq_release(dev, i);
981 				continue;
982 			}
983 			pp = txq_ctrl->hairpin_conf.peers[0].port;
984 			if (pp >= RTE_MAX_ETHPORTS) {
985 				rte_errno = ERANGE;
986 				mlx5_txq_release(dev, i);
987 				DRV_LOG(ERR, "port %hu queue %u peer port "
988 					"out of range %hu",
989 					priv->dev_data->port_id, i, pp);
990 				return -rte_errno;
991 			}
992 			bits[pp / 32] |= 1 << (pp % 32);
993 			mlx5_txq_release(dev, i);
994 		}
995 	} else {
996 		for (i = 0; i < priv->rxqs_n; i++) {
997 			rxq_ctrl = mlx5_rxq_get(dev, i);
998 			if (!rxq_ctrl)
999 				continue;
1000 			if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
1001 				mlx5_rxq_release(dev, i);
1002 				continue;
1003 			}
1004 			pp = rxq_ctrl->hairpin_conf.peers[0].port;
1005 			if (pp >= RTE_MAX_ETHPORTS) {
1006 				rte_errno = ERANGE;
1007 				mlx5_rxq_release(dev, i);
1008 				DRV_LOG(ERR, "port %hu queue %u peer port "
1009 					"out of range %hu",
1010 					priv->dev_data->port_id, i, pp);
1011 				return -rte_errno;
1012 			}
1013 			bits[pp / 32] |= 1 << (pp % 32);
1014 			mlx5_rxq_release(dev, i);
1015 		}
1016 	}
1017 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1018 		if (bits[i / 32] & (1 << (i % 32))) {
1019 			if ((size_t)ret >= len) {
1020 				rte_errno = E2BIG;
1021 				return -rte_errno;
1022 			}
1023 			peer_ports[ret++] = i;
1024 		}
1025 	}
1026 	return ret;
1027 }
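/*
 * Editor's illustrative sketch (not part of the driver): hedged
 * application-level use of the peer port query implemented above,
 * e.g. to find the Rx ports to unbind before stopping a Tx port.
 */
#include <stdio.h>
#include <rte_ethdev.h>

static void
dump_hairpin_rx_peers(uint16_t tx_port)
{
	uint16_t peers[RTE_MAX_ETHPORTS];
	int n, i;

	/* Direction 1: the given port acts as Tx, list its peer Rx ports. */
	n = rte_eth_hairpin_get_peer_ports(tx_port, peers,
					   RTE_MAX_ETHPORTS, 1);
	for (i = 0; i < n; i++)
		printf("Tx port %u is paired with Rx port %u\n",
		       tx_port, peers[i]);
}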
1028 
1029 /**
1030  * DPDK callback to start the device.
1031  *
1032  * Allocate queue resources, enable default flows and start the datapath.
1033  *
1034  * @param dev
1035  *   Pointer to Ethernet device structure.
1036  *
1037  * @return
1038  *   0 on success, a negative errno value otherwise and rte_errno is set.
1039  */
1040 int
1041 mlx5_dev_start(struct rte_eth_dev *dev)
1042 {
1043 	struct mlx5_priv *priv = dev->data->dev_private;
1044 	int ret;
1045 	int fine_inline;
1046 
1047 	DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id);
1048 	fine_inline = rte_mbuf_dynflag_lookup
1049 		(RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, NULL);
1050 	if (fine_inline >= 0)
1051 		rte_net_mlx5_dynf_inline_mask = 1UL << fine_inline;
1052 	else
1053 		rte_net_mlx5_dynf_inline_mask = 0;
1054 	if (dev->data->nb_rx_queues > 0) {
1055 		ret = mlx5_dev_configure_rss_reta(dev);
1056 		if (ret) {
1057 			DRV_LOG(ERR, "port %u reta config failed: %s",
1058 				dev->data->port_id, strerror(rte_errno));
1059 			return -rte_errno;
1060 		}
1061 	}
1062 	ret = mlx5_txpp_start(dev);
1063 	if (ret) {
1064 		DRV_LOG(ERR, "port %u Tx packet pacing init failed: %s",
1065 			dev->data->port_id, strerror(rte_errno));
1066 		goto error;
1067 	}
1068 	ret = mlx5_txq_start(dev);
1069 	if (ret) {
1070 		DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
1071 			dev->data->port_id, strerror(rte_errno));
1072 		goto error;
1073 	}
1074 	ret = mlx5_rxq_start(dev);
1075 	if (ret) {
1076 		DRV_LOG(ERR, "port %u Rx queue allocation failed: %s",
1077 			dev->data->port_id, strerror(rte_errno));
1078 		goto error;
1079 	}
1080 	/*
1081 	 * Such step will be skipped if there is no hairpin TX queue configured
1082 	 * This step will be skipped if there is no hairpin Tx queue configured
1083 	 * with an Rx peer queue from the same device.
1084 	ret = mlx5_hairpin_auto_bind(dev);
1085 	if (ret) {
1086 		DRV_LOG(ERR, "port %u hairpin auto binding failed: %s",
1087 			dev->data->port_id, strerror(rte_errno));
1088 		goto error;
1089 	}
1090 	/* Set started flag here for the following steps like control flow. */
1091 	dev->data->dev_started = 1;
1092 	ret = mlx5_rx_intr_vec_enable(dev);
1093 	if (ret) {
1094 		DRV_LOG(ERR, "port %u Rx interrupt vector creation failed",
1095 			dev->data->port_id);
1096 		goto error;
1097 	}
1098 	mlx5_os_stats_init(dev);
1099 	ret = mlx5_traffic_enable(dev);
1100 	if (ret) {
1101 		DRV_LOG(ERR, "port %u failed to set default flows",
1102 			dev->data->port_id);
1103 		goto error;
1104 	}
1105 	/* Set a mask and offset of dynamic metadata flows into Rx queues. */
1106 	mlx5_flow_rxq_dynf_metadata_set(dev);
1107 	/* Set flags and context to convert Rx timestamps. */
1108 	mlx5_rxq_timestamp_set(dev);
1109 	/* Set a mask and offset of scheduling on timestamp into Tx queues. */
1110 	mlx5_txq_dynf_timestamp_set(dev);
1111 	/*
1112 	 * In non-cached mode, only the default mreg copy action needs to be
1113 	 * started, since no flow created by an application exists anymore.
1114 	 * But it is worth wrapping the interface for further usage.
1115 	 */
1116 	ret = mlx5_flow_start_default(dev);
1117 	if (ret) {
1118 		DRV_LOG(DEBUG, "port %u failed to start default actions: %s",
1119 			dev->data->port_id, strerror(rte_errno));
1120 		goto error;
1121 	}
1122 	rte_wmb();
1123 	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
1124 	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
1125 	/* Enable datapath on secondary process. */
1126 	mlx5_mp_os_req_start_rxtx(dev);
1127 	if (priv->sh->intr_handle.fd >= 0) {
1128 		priv->sh->port[priv->dev_port - 1].ih_port_id =
1129 					(uint32_t)dev->data->port_id;
1130 	} else {
1131 		DRV_LOG(INFO, "port %u starts without LSC and RMV interrupts.",
1132 			dev->data->port_id);
1133 		dev->data->dev_conf.intr_conf.lsc = 0;
1134 		dev->data->dev_conf.intr_conf.rmv = 0;
1135 	}
1136 	if (priv->sh->intr_handle_devx.fd >= 0)
1137 		priv->sh->port[priv->dev_port - 1].devx_ih_port_id =
1138 					(uint32_t)dev->data->port_id;
1139 	return 0;
1140 error:
1141 	ret = rte_errno; /* Save rte_errno before cleanup. */
1142 	/* Rollback. */
1143 	dev->data->dev_started = 0;
1144 	mlx5_flow_stop_default(dev);
1145 	mlx5_traffic_disable(dev);
1146 	mlx5_txq_stop(dev);
1147 	mlx5_rxq_stop(dev);
1148 	mlx5_txpp_stop(dev); /* Stop last. */
1149 	rte_errno = ret; /* Restore rte_errno. */
1150 	return -rte_errno;
1151 }
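/*
 * Editor's illustrative sketch (not part of the driver): the usual
 * application sequence that reaches mlx5_dev_start() through the
 * .dev_start ops; queue and descriptor counts are illustrative assumptions.
 */
#include <rte_ethdev.h>

static int
start_port_example(uint16_t port_id, struct rte_mempool *mp)
{
	struct rte_eth_conf conf = { 0 };
	int ret;

	ret = rte_eth_dev_configure(port_id, 1, 1, &conf);
	if (ret != 0)
		return ret;
	ret = rte_eth_rx_queue_setup(port_id, 0, 512,
				     rte_eth_dev_socket_id(port_id), NULL, mp);
	if (ret != 0)
		return ret;
	ret = rte_eth_tx_queue_setup(port_id, 0, 512,
				     rte_eth_dev_socket_id(port_id), NULL);
	if (ret != 0)
		return ret;
	return rte_eth_dev_start(port_id);
}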
1152 
1153 /**
1154  * DPDK callback to stop the device.
1155  *
1156  * Stop the datapath, remove default flows and release queue resources.
1157  *
1158  * @param dev
1159  *   Pointer to Ethernet device structure.
1160  */
1161 int
1162 mlx5_dev_stop(struct rte_eth_dev *dev)
1163 {
1164 	struct mlx5_priv *priv = dev->data->dev_private;
1165 
1166 	dev->data->dev_started = 0;
1167 	/* Prevent crashes when queues are still in use. */
1168 	dev->rx_pkt_burst = removed_rx_burst;
1169 	dev->tx_pkt_burst = removed_tx_burst;
1170 	rte_wmb();
1171 	/* Disable datapath on secondary process. */
1172 	mlx5_mp_os_req_stop_rxtx(dev);
1173 	usleep(1000 * priv->rxqs_n);
1174 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
1175 	mlx5_flow_stop_default(dev);
1176 	/* Control flows for default traffic can be removed first. */
1177 	mlx5_traffic_disable(dev);
1178 	/* All RX queue flags will be cleared in the flush interface. */
1179 	mlx5_flow_list_flush(dev, &priv->flows, true);
1180 	mlx5_rx_intr_vec_disable(dev);
1181 	priv->sh->port[priv->dev_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1182 	priv->sh->port[priv->dev_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
1183 	mlx5_txq_stop(dev);
1184 	mlx5_rxq_stop(dev);
1185 	mlx5_txpp_stop(dev);
1186 
1187 	return 0;
1188 }
1189 
1190 /**
1191  * Enable traffic flows configured by the control plane.
1192  *
1193  * This covers hairpin default Tx flows and the broadcast, multicast and
1194  * unicast MAC control flows, depending on the device configuration.
1195  * @param dev
1196  *   Pointer to Ethernet device structure.
1197  *
1198  * @return
1199  *   0 on success, a negative errno value otherwise and rte_errno is set.
1200  */
1201 int
1202 mlx5_traffic_enable(struct rte_eth_dev *dev)
1203 {
1204 	struct mlx5_priv *priv = dev->data->dev_private;
1205 	struct rte_flow_item_eth bcast = {
1206 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1207 	};
1208 	struct rte_flow_item_eth ipv6_multi_spec = {
1209 		.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
1210 	};
1211 	struct rte_flow_item_eth ipv6_multi_mask = {
1212 		.dst.addr_bytes = "\xff\xff\x00\x00\x00\x00",
1213 	};
1214 	struct rte_flow_item_eth unicast = {
1215 		.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1216 	};
1217 	struct rte_flow_item_eth unicast_mask = {
1218 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1219 	};
1220 	const unsigned int vlan_filter_n = priv->vlan_filter_n;
1221 	const struct rte_ether_addr cmp = {
1222 		.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1223 	};
1224 	unsigned int i;
1225 	unsigned int j;
1226 	int ret;
1227 
1228 	/*
1229 	 * The hairpin Tx queue default flow should be created regardless of
1230 	 * the isolation mode. Otherwise all the packets to be sent will go
1231 	 * out directly without the Tx flow actions, e.g. encapsulation.
1232 	 */
1233 	for (i = 0; i != priv->txqs_n; ++i) {
1234 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
1235 		if (!txq_ctrl)
1236 			continue;
1237 		/* Only Tx implicit mode requires the default Tx flow. */
1238 		if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN &&
1239 		    txq_ctrl->hairpin_conf.tx_explicit == 0 &&
1240 		    txq_ctrl->hairpin_conf.peers[0].port ==
1241 		    priv->dev_data->port_id) {
1242 			ret = mlx5_ctrl_flow_source_queue(dev, i);
1243 			if (ret) {
1244 				mlx5_txq_release(dev, i);
1245 				goto error;
1246 			}
1247 		}
1248 		mlx5_txq_release(dev, i);
1249 	}
1250 	if (priv->config.dv_esw_en && !priv->config.vf) {
1251 		if (mlx5_flow_create_esw_table_zero_flow(dev))
1252 			priv->fdb_def_rule = 1;
1253 		else
1254 			DRV_LOG(INFO, "port %u FDB default rule cannot be"
1255 				" configured - only Eswitch group 0 flows are"
1256 				" supported.", dev->data->port_id);
1257 	}
1258 	if (!priv->config.lacp_by_user && priv->pf_bond >= 0) {
1259 		ret = mlx5_flow_lacp_miss(dev);
1260 		if (ret)
1261 			DRV_LOG(INFO, "port %u LACP rule cannot be created - "
1262 				"forward LACP to kernel.", dev->data->port_id);
1263 		else
1264 			DRV_LOG(INFO, "LACP traffic will be missed in port %u."
1265 				, dev->data->port_id);
1266 	}
1267 	if (priv->isolated)
1268 		return 0;
1269 	if (dev->data->promiscuous) {
1270 		struct rte_flow_item_eth promisc = {
1271 			.dst.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1272 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1273 			.type = 0,
1274 		};
1275 
1276 		ret = mlx5_ctrl_flow(dev, &promisc, &promisc);
1277 		if (ret)
1278 			goto error;
1279 	}
1280 	if (dev->data->all_multicast) {
1281 		struct rte_flow_item_eth multicast = {
1282 			.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
1283 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1284 			.type = 0,
1285 		};
1286 
1287 		ret = mlx5_ctrl_flow(dev, &multicast, &multicast);
1288 		if (ret)
1289 			goto error;
1290 	} else {
1291 		/* Add broadcast/multicast flows. */
1292 		for (i = 0; i != vlan_filter_n; ++i) {
1293 			uint16_t vlan = priv->vlan_filter[i];
1294 
1295 			struct rte_flow_item_vlan vlan_spec = {
1296 				.tci = rte_cpu_to_be_16(vlan),
1297 			};
1298 			struct rte_flow_item_vlan vlan_mask =
1299 				rte_flow_item_vlan_mask;
1300 
1301 			ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast,
1302 						  &vlan_spec, &vlan_mask);
1303 			if (ret)
1304 				goto error;
1305 			ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec,
1306 						  &ipv6_multi_mask,
1307 						  &vlan_spec, &vlan_mask);
1308 			if (ret)
1309 				goto error;
1310 		}
1311 		if (!vlan_filter_n) {
1312 			ret = mlx5_ctrl_flow(dev, &bcast, &bcast);
1313 			if (ret)
1314 				goto error;
1315 			ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec,
1316 					     &ipv6_multi_mask);
1317 			if (ret)
1318 				goto error;
1319 		}
1320 	}
1321 	/* Add MAC address flows. */
1322 	for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
1323 		struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
1324 
1325 		if (!memcmp(mac, &cmp, sizeof(*mac)))
1326 			continue;
1327 		memcpy(&unicast.dst.addr_bytes,
1328 		       mac->addr_bytes,
1329 		       RTE_ETHER_ADDR_LEN);
1330 		for (j = 0; j != vlan_filter_n; ++j) {
1331 			uint16_t vlan = priv->vlan_filter[j];
1332 
1333 			struct rte_flow_item_vlan vlan_spec = {
1334 				.tci = rte_cpu_to_be_16(vlan),
1335 			};
1336 			struct rte_flow_item_vlan vlan_mask =
1337 				rte_flow_item_vlan_mask;
1338 
1339 			ret = mlx5_ctrl_flow_vlan(dev, &unicast,
1340 						  &unicast_mask,
1341 						  &vlan_spec,
1342 						  &vlan_mask);
1343 			if (ret)
1344 				goto error;
1345 		}
1346 		if (!vlan_filter_n) {
1347 			ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask);
1348 			if (ret)
1349 				goto error;
1350 		}
1351 	}
1352 	return 0;
1353 error:
1354 	ret = rte_errno; /* Save rte_errno before cleanup. */
1355 	mlx5_flow_list_flush(dev, &priv->ctrl_flows, false);
1356 	rte_errno = ret; /* Restore rte_errno. */
1357 	return -rte_errno;
1358 }
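/*
 * Editor's illustrative sketch (not part of the driver): an application
 * level rte_flow rule roughly equivalent to the broadcast control flow
 * installed above. The target queue index is an illustrative assumption;
 * the internal mlx5_ctrl_flow() helper steers to the device Rx queues
 * rather than a single queue.
 */
#include <rte_flow.h>

static struct rte_flow *
create_broadcast_rule(uint16_t port_id, uint16_t queue_id,
		      struct rte_flow_error *error)
{
	struct rte_flow_attr attr = { .ingress = 1 };
	struct rte_flow_item_eth bcast_spec = {
		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
	};
	struct rte_flow_item_eth bcast_mask = {
		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
	};
	struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH,
		  .spec = &bcast_spec, .mask = &bcast_mask },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	struct rte_flow_action_queue queue = { .index = queue_id };
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};

	return rte_flow_create(port_id, &attr, pattern, actions, error);
}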
1359 
1360 
1361 /**
1362  * Disable traffic flows configured by control plane
1363  *
1364  * @param dev
1365  *   Pointer to Ethernet device private data.
1366  */
1367 void
1368 mlx5_traffic_disable(struct rte_eth_dev *dev)
1369 {
1370 	struct mlx5_priv *priv = dev->data->dev_private;
1371 
1372 	mlx5_flow_list_flush(dev, &priv->ctrl_flows, false);
1373 }
1374 
1375 /**
1376  * Restart traffic flows configured by control plane
1377  *
1378  * @param dev
1379  *   Pointer to Ethernet device private data.
1380  *
1381  * @return
1382  *   0 on success, a negative errno value otherwise and rte_errno is set.
1383  */
1384 int
1385 mlx5_traffic_restart(struct rte_eth_dev *dev)
1386 {
1387 	if (dev->data->dev_started) {
1388 		mlx5_traffic_disable(dev);
1389 		return mlx5_traffic_enable(dev);
1390 	}
1391 	return 0;
1392 }
1393