xref: /dpdk/drivers/net/mlx5/mlx5_trigger.c (revision 5db77fef78b4bf3559d1f5e83f905f9be729c78b)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <unistd.h>
7 
8 #include <rte_ether.h>
9 #include <ethdev_driver.h>
10 #include <rte_interrupts.h>
11 #include <rte_alarm.h>
12 #include <rte_cycles.h>
13 
14 #include <mlx5_malloc.h>
15 
16 #include "mlx5.h"
17 #include "mlx5_flow.h"
18 #include "mlx5_rx.h"
19 #include "mlx5_tx.h"
20 #include "mlx5_utils.h"
21 #include "rte_pmd_mlx5.h"
22 
23 /**
24  * Stop traffic on Tx queues.
25  *
26  * @param dev
27  *   Pointer to Ethernet device structure.
28  */
29 static void
30 mlx5_txq_stop(struct rte_eth_dev *dev)
31 {
32 	struct mlx5_priv *priv = dev->data->dev_private;
33 	unsigned int i;
34 
35 	for (i = 0; i != priv->txqs_n; ++i)
36 		mlx5_txq_release(dev, i);
37 }
38 
39 /**
40  * Start traffic on Tx queues.
41  *
42  * @param dev
43  *   Pointer to Ethernet device structure.
44  *
45  * @return
46  *   0 on success, a negative errno value otherwise and rte_errno is set.
47  */
48 static int
49 mlx5_txq_start(struct rte_eth_dev *dev)
50 {
51 	struct mlx5_priv *priv = dev->data->dev_private;
52 	unsigned int i;
53 	int ret;
54 
55 	for (i = 0; i != priv->txqs_n; ++i) {
56 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
57 		struct mlx5_txq_data *txq_data;
58 		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
59 		if (!txq_ctrl)
60 			continue;
61 		txq_data = &txq_ctrl->txq;
62 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD)
63 			txq_alloc_elts(txq_ctrl);
64 		MLX5_ASSERT(!txq_ctrl->obj);
65 		txq_ctrl->obj = mlx5_malloc(flags, sizeof(struct mlx5_txq_obj),
66 					    0, txq_ctrl->socket);
67 		if (!txq_ctrl->obj) {
68 			DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
69 				"memory resources.", dev->data->port_id,
70 				txq_data->idx);
71 			rte_errno = ENOMEM;
72 			goto error;
73 		}
74 		ret = priv->obj_ops.txq_obj_new(dev, i);
75 		if (ret < 0) {
76 			mlx5_free(txq_ctrl->obj);
77 			txq_ctrl->obj = NULL;
78 			goto error;
79 		}
80 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD) {
81 			size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
82 
83 			txq_data->fcqs = mlx5_malloc(flags, size,
84 						     RTE_CACHE_LINE_SIZE,
85 						     txq_ctrl->socket);
86 			if (!txq_data->fcqs) {
87 				DRV_LOG(ERR, "Port %u Tx queue %u cannot "
88 					"allocate memory (FCQ).",
89 					dev->data->port_id, i);
90 				rte_errno = ENOMEM;
91 				goto error;
92 			}
93 		}
94 		DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
95 			dev->data->port_id, i, (void *)&txq_ctrl->obj);
96 		LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
97 	}
98 	return 0;
99 error:
100 	ret = rte_errno; /* Save rte_errno before cleanup. */
101 	do {
102 		mlx5_txq_release(dev, i);
103 	} while (i-- != 0);
104 	rte_errno = ret; /* Restore rte_errno. */
105 	return -rte_errno;
106 }
107 
108 /**
109  * Translate the chunk address to an MR key in order to put it into the cache.
110  */
111 static void
112 mlx5_rxq_mempool_register_cb(struct rte_mempool *mp, void *opaque,
113 			     struct rte_mempool_memhdr *memhdr,
114 			     unsigned int idx)
115 {
116 	struct mlx5_rxq_data *rxq = opaque;
117 
118 	RTE_SET_USED(mp);
119 	RTE_SET_USED(idx);
120 	mlx5_rx_addr2mr(rxq, (uintptr_t)memhdr->addr);
121 }
122 
123 /**
124  * Register Rx queue mempools and fill the Rx queue cache.
125  * This function tolerates repeated mempool registration.
126  *
127  * @param[in] rxq_ctrl
128  *   Rx queue control data.
129  *
130  * @return
131  *   0 on success, (-1) on failure and rte_errno is set.
132  */
133 static int
134 mlx5_rxq_mempool_register(struct rte_eth_dev *dev,
135 			  struct mlx5_rxq_ctrl *rxq_ctrl)
136 {
137 	struct mlx5_priv *priv = dev->data->dev_private;
138 	struct mlx5_dev_ctx_shared *sh = rxq_ctrl->sh;
139 	struct rte_mempool *mp;
140 	uint32_t s;
141 	int ret = 0;
142 
143 	mlx5_mr_flush_local_cache(&rxq_ctrl->rxq.mr_ctrl);
144 	/* MPRQ mempool is registered on creation, just fill the cache. */
145 	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) {
146 		rte_mempool_mem_iter(rxq_ctrl->rxq.mprq_mp,
147 				     mlx5_rxq_mempool_register_cb,
148 				     &rxq_ctrl->rxq);
149 		return 0;
150 	}
151 	for (s = 0; s < rxq_ctrl->rxq.rxseg_n; s++) {
152 		mp = rxq_ctrl->rxq.rxseg[s].mp;
153 		ret = mlx5_mr_mempool_register(&sh->cdev->mr_scache,
154 					       sh->cdev->pd, mp, &priv->mp_id);
155 		if (ret < 0 && rte_errno != EEXIST)
156 			return ret;
157 		rte_mempool_mem_iter(mp, mlx5_rxq_mempool_register_cb,
158 				     &rxq_ctrl->rxq);
159 	}
160 	return 0;
161 }
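/*
 * Note on the loop above: it tolerates shared mempools. When several Rx
 * segments or queues reference the same mempool, mlx5_mr_mempool_register()
 * fails with rte_errno == EEXIST for every registration after the first one,
 * and that case is deliberately treated as success so that only real
 * registration errors propagate to the caller. The rte_mempool_mem_iter()
 * call then walks every memory chunk and warms the per-queue MR cache via
 * mlx5_rxq_mempool_register_cb(), so the first received packets do not take
 * the slow MR lookup path.
 */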
162 
163 /**
164  * Stop traffic on Rx queues.
165  *
166  * @param dev
167  *   Pointer to Ethernet device structure.
168  */
169 static void
170 mlx5_rxq_stop(struct rte_eth_dev *dev)
171 {
172 	struct mlx5_priv *priv = dev->data->dev_private;
173 	unsigned int i;
174 
175 	for (i = 0; i != priv->rxqs_n; ++i)
176 		mlx5_rxq_release(dev, i);
177 }
178 
179 /**
180  * Start traffic on Rx queues.
181  *
182  * @param dev
183  *   Pointer to Ethernet device structure.
184  *
185  * @return
186  *   0 on success, a negative errno value otherwise and rte_errno is set.
187  */
188 static int
189 mlx5_rxq_start(struct rte_eth_dev *dev)
190 {
191 	struct mlx5_priv *priv = dev->data->dev_private;
192 	unsigned int i;
193 	int ret = 0;
194 
195 	/* Allocate/reuse/resize mempool for Multi-Packet RQ. */
196 	if (mlx5_mprq_alloc_mp(dev)) {
197 		/* Should not release Rx queues but return immediately. */
198 		return -rte_errno;
199 	}
200 	DRV_LOG(DEBUG, "Port %u device_attr.max_qp_wr is %d.",
201 		dev->data->port_id, priv->sh->device_attr.max_qp_wr);
202 	DRV_LOG(DEBUG, "Port %u device_attr.max_sge is %d.",
203 		dev->data->port_id, priv->sh->device_attr.max_sge);
204 	for (i = 0; i != priv->rxqs_n; ++i) {
205 		struct mlx5_rxq_priv *rxq = mlx5_rxq_ref(dev, i);
206 		struct mlx5_rxq_ctrl *rxq_ctrl;
207 
208 		if (rxq == NULL)
209 			continue;
210 		rxq_ctrl = rxq->ctrl;
211 		if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) {
212 			/*
213 			 * Pre-register the mempools. Regardless of whether
214 			 * the implicit registration is enabled or not,
215 			 * Rx mempool destruction is tracked to free MRs.
216 			 */
217 			if (mlx5_rxq_mempool_register(dev, rxq_ctrl) < 0)
218 				goto error;
219 			ret = rxq_alloc_elts(rxq_ctrl);
220 			if (ret)
221 				goto error;
222 		}
223 		MLX5_ASSERT(!rxq_ctrl->obj);
224 		rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
225 					    sizeof(*rxq_ctrl->obj), 0,
226 					    rxq_ctrl->socket);
227 		if (!rxq_ctrl->obj) {
228 			DRV_LOG(ERR,
229 				"Port %u Rx queue %u can't allocate resources.",
230 				dev->data->port_id, (*priv->rxqs)[i]->idx);
231 			rte_errno = ENOMEM;
232 			goto error;
233 		}
234 		ret = priv->obj_ops.rxq_obj_new(dev, i);
235 		if (ret) {
236 			mlx5_free(rxq_ctrl->obj);
237 			rxq_ctrl->obj = NULL;
238 			goto error;
239 		}
240 		DRV_LOG(DEBUG, "Port %u rxq %u updated with %p.",
241 			dev->data->port_id, i, (void *)&rxq_ctrl->obj);
242 		LIST_INSERT_HEAD(&priv->rxqsobj, rxq_ctrl->obj, next);
243 	}
244 	return 0;
245 error:
246 	ret = rte_errno; /* Save rte_errno before cleanup. */
247 	do {
248 		mlx5_rxq_release(dev, i);
249 	} while (i-- != 0);
250 	rte_errno = ret; /* Restore rte_errno. */
251 	return -rte_errno;
252 }
253 
254 /**
255  * Binds Tx queues to Rx queues for hairpin.
256  *
257  * Binds each hairpin Tx queue to its target Rx queue on the same port (auto binding).
258  *
259  * @param dev
260  *   Pointer to Ethernet device structure.
261  *
262  * @return
263  *   0 on success, a negative errno value otherwise and rte_errno is set.
264  */
265 static int
266 mlx5_hairpin_auto_bind(struct rte_eth_dev *dev)
267 {
268 	struct mlx5_priv *priv = dev->data->dev_private;
269 	struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
270 	struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
271 	struct mlx5_txq_ctrl *txq_ctrl;
272 	struct mlx5_rxq_priv *rxq;
273 	struct mlx5_rxq_ctrl *rxq_ctrl;
274 	struct mlx5_devx_obj *sq;
275 	struct mlx5_devx_obj *rq;
276 	unsigned int i;
277 	int ret = 0;
278 	bool need_auto = false;
279 	uint16_t self_port = dev->data->port_id;
280 
281 	for (i = 0; i != priv->txqs_n; ++i) {
282 		txq_ctrl = mlx5_txq_get(dev, i);
283 		if (!txq_ctrl)
284 			continue;
285 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
286 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
287 			mlx5_txq_release(dev, i);
288 			continue;
289 		}
290 		if (txq_ctrl->hairpin_conf.manual_bind) {
291 			mlx5_txq_release(dev, i);
292 			return 0;
293 		}
294 		need_auto = true;
295 		mlx5_txq_release(dev, i);
296 	}
297 	if (!need_auto)
298 		return 0;
299 	for (i = 0; i != priv->txqs_n; ++i) {
300 		txq_ctrl = mlx5_txq_get(dev, i);
301 		if (!txq_ctrl)
302 			continue;
303 		/* Skip hairpin queues with other peer ports. */
304 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
305 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
306 			mlx5_txq_release(dev, i);
307 			continue;
308 		}
309 		if (!txq_ctrl->obj) {
310 			rte_errno = ENOMEM;
311 			DRV_LOG(ERR, "port %u no txq object found: %d",
312 				dev->data->port_id, i);
313 			mlx5_txq_release(dev, i);
314 			return -rte_errno;
315 		}
316 		sq = txq_ctrl->obj->sq;
317 		rxq = mlx5_rxq_get(dev, txq_ctrl->hairpin_conf.peers[0].queue);
318 		if (rxq == NULL) {
319 			mlx5_txq_release(dev, i);
320 			rte_errno = EINVAL;
321 			DRV_LOG(ERR, "port %u no rxq object found: %d",
322 				dev->data->port_id,
323 				txq_ctrl->hairpin_conf.peers[0].queue);
324 			return -rte_errno;
325 		}
326 		rxq_ctrl = rxq->ctrl;
327 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN ||
328 		    rxq->hairpin_conf.peers[0].queue != i) {
329 			rte_errno = ENOMEM;
330 			DRV_LOG(ERR, "port %u Tx queue %d can't be bound to "
331 				"Rx queue %d", dev->data->port_id,
332 				i, txq_ctrl->hairpin_conf.peers[0].queue);
333 			goto error;
334 		}
335 		rq = rxq_ctrl->obj->rq;
336 		if (!rq) {
337 			rte_errno = ENOMEM;
338 			DRV_LOG(ERR, "port %u hairpin no matching rxq: %d",
339 				dev->data->port_id,
340 				txq_ctrl->hairpin_conf.peers[0].queue);
341 			goto error;
342 		}
343 		sq_attr.state = MLX5_SQC_STATE_RDY;
344 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
345 		sq_attr.hairpin_peer_rq = rq->id;
346 		sq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
347 		ret = mlx5_devx_cmd_modify_sq(sq, &sq_attr);
348 		if (ret)
349 			goto error;
350 		rq_attr.state = MLX5_SQC_STATE_RDY;
351 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
352 		rq_attr.hairpin_peer_sq = sq->id;
353 		rq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
354 		ret = mlx5_devx_cmd_modify_rq(rq, &rq_attr);
355 		if (ret)
356 			goto error;
357 		/* Qs with auto-bind will be destroyed directly. */
358 		rxq->hairpin_status = 1;
359 		txq_ctrl->hairpin_status = 1;
360 		mlx5_txq_release(dev, i);
361 	}
362 	return 0;
363 error:
364 	mlx5_txq_release(dev, i);
365 	return -rte_errno;
366 }
367 
368 /*
369  * Fetch the peer queue's SW & HW information.
370  *
371  * @param dev
372  *   Pointer to Ethernet device structure.
373  * @param peer_queue
374  *   Index of the queue to fetch the information.
375  * @param current_info
376  *   Pointer to the input peer information, not used currently.
377  * @param peer_info
378  *   Pointer to the structure to store the information, output.
379  * @param direction
380  *   Positive to get the RxQ information, zero to get the TxQ information.
381  *
382  * @return
383  *   0 on success, a negative errno value otherwise and rte_errno is set.
384  */
385 int
386 mlx5_hairpin_queue_peer_update(struct rte_eth_dev *dev, uint16_t peer_queue,
387 			       struct rte_hairpin_peer_info *current_info,
388 			       struct rte_hairpin_peer_info *peer_info,
389 			       uint32_t direction)
390 {
391 	struct mlx5_priv *priv = dev->data->dev_private;
392 	RTE_SET_USED(current_info);
393 
394 	if (dev->data->dev_started == 0) {
395 		rte_errno = EBUSY;
396 		DRV_LOG(ERR, "peer port %u is not started",
397 			dev->data->port_id);
398 		return -rte_errno;
399 	}
400 	/*
401 	 * Peer port used as egress. In the current design, hairpin Tx queue
402 	 * will be bound to the peer Rx queue. Indeed, only the information of
403 	 * peer Rx queue needs to be fetched.
404 	 */
405 	if (direction == 0) {
406 		struct mlx5_txq_ctrl *txq_ctrl;
407 
408 		txq_ctrl = mlx5_txq_get(dev, peer_queue);
409 		if (txq_ctrl == NULL) {
410 			rte_errno = EINVAL;
411 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
412 				dev->data->port_id, peer_queue);
413 			return -rte_errno;
414 		}
415 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
416 			rte_errno = EINVAL;
417 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Txq",
418 				dev->data->port_id, peer_queue);
419 			mlx5_txq_release(dev, peer_queue);
420 			return -rte_errno;
421 		}
422 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
423 			rte_errno = ENOMEM;
424 			DRV_LOG(ERR, "port %u no Txq object found: %d",
425 				dev->data->port_id, peer_queue);
426 			mlx5_txq_release(dev, peer_queue);
427 			return -rte_errno;
428 		}
429 		peer_info->qp_id = txq_ctrl->obj->sq->id;
430 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
431 		/* 1-to-1 mapping, only the first one is used. */
432 		peer_info->peer_q = txq_ctrl->hairpin_conf.peers[0].queue;
433 		peer_info->tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
434 		peer_info->manual_bind = txq_ctrl->hairpin_conf.manual_bind;
435 		mlx5_txq_release(dev, peer_queue);
436 	} else { /* Peer port used as ingress. */
437 		struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, peer_queue);
438 		struct mlx5_rxq_ctrl *rxq_ctrl;
439 
440 		if (rxq == NULL) {
441 			rte_errno = EINVAL;
442 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
443 				dev->data->port_id, peer_queue);
444 			return -rte_errno;
445 		}
446 		rxq_ctrl = rxq->ctrl;
447 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
448 			rte_errno = EINVAL;
449 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Rxq",
450 				dev->data->port_id, peer_queue);
451 			return -rte_errno;
452 		}
453 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
454 			rte_errno = ENOMEM;
455 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
456 				dev->data->port_id, peer_queue);
457 			return -rte_errno;
458 		}
459 		peer_info->qp_id = rxq_ctrl->obj->rq->id;
460 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
461 		peer_info->peer_q = rxq->hairpin_conf.peers[0].queue;
462 		peer_info->tx_explicit = rxq->hairpin_conf.tx_explicit;
463 		peer_info->manual_bind = rxq->hairpin_conf.manual_bind;
464 	}
465 	return 0;
466 }
467 
468 /*
469  * Bind the hairpin queue with the peer HW information.
470  * This needs to be called twice both for Tx and Rx queues of a pair.
471  * If the queue is already bound, it is considered successful.
472  *
473  * @param dev
474  *   Pointer to Ethernet device structure.
475  * @param cur_queue
476  *   Index of the queue to change the HW configuration to bind.
477  * @param peer_info
478  *   Pointer to information of the peer queue.
479  * @param direction
480  *   Positive to configure the TxQ, zero to configure the RxQ.
481  *
482  * @return
483  *   0 on success, a negative errno value otherwise and rte_errno is set.
484  */
485 int
486 mlx5_hairpin_queue_peer_bind(struct rte_eth_dev *dev, uint16_t cur_queue,
487 			     struct rte_hairpin_peer_info *peer_info,
488 			     uint32_t direction)
489 {
490 	int ret = 0;
491 
492 	/*
493 	 * Consistency check of the peer queue: the opposite direction was used
494 	 * to fetch the peer queue info by ethdev port ID, so no port check is needed.
495 	 */
496 	if (peer_info->peer_q != cur_queue) {
497 		rte_errno = EINVAL;
498 		DRV_LOG(ERR, "port %u queue %d and peer queue %d mismatch",
499 			dev->data->port_id, cur_queue, peer_info->peer_q);
500 		return -rte_errno;
501 	}
502 	if (direction != 0) {
503 		struct mlx5_txq_ctrl *txq_ctrl;
504 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
505 
506 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
507 		if (txq_ctrl == NULL) {
508 			rte_errno = EINVAL;
509 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
510 				dev->data->port_id, cur_queue);
511 			return -rte_errno;
512 		}
513 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
514 			rte_errno = EINVAL;
515 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
516 				dev->data->port_id, cur_queue);
517 			mlx5_txq_release(dev, cur_queue);
518 			return -rte_errno;
519 		}
520 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
521 			rte_errno = ENOMEM;
522 			DRV_LOG(ERR, "port %u no Txq object found: %d",
523 				dev->data->port_id, cur_queue);
524 			mlx5_txq_release(dev, cur_queue);
525 			return -rte_errno;
526 		}
527 		if (txq_ctrl->hairpin_status != 0) {
528 			DRV_LOG(DEBUG, "port %u Tx queue %d is already bound",
529 				dev->data->port_id, cur_queue);
530 			mlx5_txq_release(dev, cur_queue);
531 			return 0;
532 		}
533 		/*
534 		 * Consistency checking across all the queues of one port is done
535 		 * in the bind() function, and that is optional.
536 		 */
537 		if (peer_info->tx_explicit !=
538 		    txq_ctrl->hairpin_conf.tx_explicit) {
539 			rte_errno = EINVAL;
540 			DRV_LOG(ERR, "port %u Tx queue %d and peer Tx rule mode"
541 				" mismatch", dev->data->port_id, cur_queue);
542 			mlx5_txq_release(dev, cur_queue);
543 			return -rte_errno;
544 		}
545 		if (peer_info->manual_bind !=
546 		    txq_ctrl->hairpin_conf.manual_bind) {
547 			rte_errno = EINVAL;
548 			DRV_LOG(ERR, "port %u Tx queue %d and peer binding mode"
549 				" mismatch", dev->data->port_id, cur_queue);
550 			mlx5_txq_release(dev, cur_queue);
551 			return -rte_errno;
552 		}
553 		sq_attr.state = MLX5_SQC_STATE_RDY;
554 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
555 		sq_attr.hairpin_peer_rq = peer_info->qp_id;
556 		sq_attr.hairpin_peer_vhca = peer_info->vhca_id;
557 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
558 		if (ret == 0)
559 			txq_ctrl->hairpin_status = 1;
560 		mlx5_txq_release(dev, cur_queue);
561 	} else {
562 		struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, cur_queue);
563 		struct mlx5_rxq_ctrl *rxq_ctrl;
564 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
565 
566 		if (rxq == NULL) {
567 			rte_errno = EINVAL;
568 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
569 				dev->data->port_id, cur_queue);
570 			return -rte_errno;
571 		}
572 		rxq_ctrl = rxq->ctrl;
573 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
574 			rte_errno = EINVAL;
575 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
576 				dev->data->port_id, cur_queue);
577 			return -rte_errno;
578 		}
579 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
580 			rte_errno = ENOMEM;
581 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
582 				dev->data->port_id, cur_queue);
583 			return -rte_errno;
584 		}
585 		if (rxq->hairpin_status != 0) {
586 			DRV_LOG(DEBUG, "port %u Rx queue %d is already bound",
587 				dev->data->port_id, cur_queue);
588 			return 0;
589 		}
590 		if (peer_info->tx_explicit !=
591 		    rxq->hairpin_conf.tx_explicit) {
592 			rte_errno = EINVAL;
593 			DRV_LOG(ERR, "port %u Rx queue %d and peer Tx rule mode"
594 				" mismatch", dev->data->port_id, cur_queue);
595 			return -rte_errno;
596 		}
597 		if (peer_info->manual_bind !=
598 		    rxq->hairpin_conf.manual_bind) {
599 			rte_errno = EINVAL;
600 			DRV_LOG(ERR, "port %u Rx queue %d and peer binding mode"
601 				" mismatch", dev->data->port_id, cur_queue);
602 			return -rte_errno;
603 		}
604 		rq_attr.state = MLX5_SQC_STATE_RDY;
605 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
606 		rq_attr.hairpin_peer_sq = peer_info->qp_id;
607 		rq_attr.hairpin_peer_vhca = peer_info->vhca_id;
608 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
609 		if (ret == 0)
610 			rxq->hairpin_status = 1;
611 	}
612 	return ret;
613 }
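/*
 * Illustrative call sequence (a sketch of what mlx5_hairpin_bind_single_port()
 * below does for one local Tx queue "i" peered with "rx_queue" on "rx_port";
 * the variable names are only for illustration):
 *
 *     // 1. Fetch the peer Rx queue HW info (direction == 1 -> Rx side).
 *     rte_eth_hairpin_queue_peer_update(rx_port, rx_queue, NULL, &peer, 1);
 *     // 2. Bind the local Tx queue to that peer (direction == 1 -> Tx side).
 *     mlx5_hairpin_queue_peer_bind(dev, i, &peer, 1);
 *     // 3. Pass the Tx queue info back and bind the peer Rx queue.
 *     rte_eth_hairpin_queue_peer_bind(rx_port, rx_queue, &cur, 0);
 */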
614 
615 /*
616  * Unbind the hairpin queue and reset its HW configuration.
617  * This needs to be called twice both for Tx and Rx queues of a pair.
618  * If the queue is already unbound, it is considered successful.
619  *
620  * @param dev
621  *   Pointer to Ethernet device structure.
622  * @param cur_queue
623  *   Index of the queue to change the HW configuration to unbind.
624  * @param direction
625  *   Positive to reset the TxQ, zero to reset the RxQ.
626  *
627  * @return
628  *   0 on success, a negative errno value otherwise and rte_errno is set.
629  */
630 int
631 mlx5_hairpin_queue_peer_unbind(struct rte_eth_dev *dev, uint16_t cur_queue,
632 			       uint32_t direction)
633 {
634 	int ret = 0;
635 
636 	if (direction != 0) {
637 		struct mlx5_txq_ctrl *txq_ctrl;
638 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
639 
640 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
641 		if (txq_ctrl == NULL) {
642 			rte_errno = EINVAL;
643 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
644 				dev->data->port_id, cur_queue);
645 			return -rte_errno;
646 		}
647 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
648 			rte_errno = EINVAL;
649 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
650 				dev->data->port_id, cur_queue);
651 			mlx5_txq_release(dev, cur_queue);
652 			return -rte_errno;
653 		}
654 		/* Already unbound, return success before obj checking. */
655 		if (txq_ctrl->hairpin_status == 0) {
656 			DRV_LOG(DEBUG, "port %u Tx queue %d is already unbound",
657 				dev->data->port_id, cur_queue);
658 			mlx5_txq_release(dev, cur_queue);
659 			return 0;
660 		}
661 		if (!txq_ctrl->obj || !txq_ctrl->obj->sq) {
662 			rte_errno = ENOMEM;
663 			DRV_LOG(ERR, "port %u no Txq object found: %d",
664 				dev->data->port_id, cur_queue);
665 			mlx5_txq_release(dev, cur_queue);
666 			return -rte_errno;
667 		}
668 		sq_attr.state = MLX5_SQC_STATE_RST;
669 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
670 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
671 		if (ret == 0)
672 			txq_ctrl->hairpin_status = 0;
673 		mlx5_txq_release(dev, cur_queue);
674 	} else {
675 		struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, cur_queue);
676 		struct mlx5_rxq_ctrl *rxq_ctrl;
677 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
678 
679 		if (rxq == NULL) {
680 			rte_errno = EINVAL;
681 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
682 				dev->data->port_id, cur_queue);
683 			return -rte_errno;
684 		}
685 		rxq_ctrl = rxq->ctrl;
686 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
687 			rte_errno = EINVAL;
688 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
689 				dev->data->port_id, cur_queue);
690 			return -rte_errno;
691 		}
692 		if (rxq->hairpin_status == 0) {
693 			DRV_LOG(DEBUG, "port %u Rx queue %d is already unbound",
694 				dev->data->port_id, cur_queue);
695 			return 0;
696 		}
697 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
698 			rte_errno = ENOMEM;
699 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
700 				dev->data->port_id, cur_queue);
701 			return -rte_errno;
702 		}
703 		rq_attr.state = MLX5_SQC_STATE_RST;
704 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
705 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
706 		if (ret == 0)
707 			rxq->hairpin_status = 0;
708 	}
709 	return ret;
710 }
711 
712 /*
713  * Bind the hairpin port pairs, from the Tx to the peer Rx.
714  * This function only supports binding the Tx port to one Rx port.
715  *
716  * @param dev
717  *   Pointer to Ethernet device structure.
718  * @param rx_port
719  *   Port identifier of the Rx port.
720  *
721  * @return
722  *   0 on success, a negative errno value otherwise and rte_errno is set.
723  */
724 static int
725 mlx5_hairpin_bind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
726 {
727 	struct mlx5_priv *priv = dev->data->dev_private;
728 	int ret = 0;
729 	struct mlx5_txq_ctrl *txq_ctrl;
730 	uint32_t i;
731 	struct rte_hairpin_peer_info peer = {0xffffff};
732 	struct rte_hairpin_peer_info cur;
733 	const struct rte_eth_hairpin_conf *conf;
734 	uint16_t num_q = 0;
735 	uint16_t local_port = priv->dev_data->port_id;
736 	uint32_t manual;
737 	uint32_t explicit;
738 	uint16_t rx_queue;
739 
740 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
741 		rte_errno = ENODEV;
742 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
743 		return -rte_errno;
744 	}
745 	/*
746 	 * Before binding TxQ to peer RxQ, a first pass over the queues checks
747 	 * their configuration consistency. This is a little time consuming but
748 	 * better than having to roll back afterwards.
749 	 */
750 	for (i = 0; i != priv->txqs_n; i++) {
751 		txq_ctrl = mlx5_txq_get(dev, i);
752 		if (txq_ctrl == NULL)
753 			continue;
754 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
755 			mlx5_txq_release(dev, i);
756 			continue;
757 		}
758 		/*
759 		 * All hairpin Tx queues of a single port connected to the same
760 		 * peer Rx port should have the same "auto binding" and
761 		 * "implicit Tx flow" modes.
762 		 * Peer consistency checking is done during per-queue binding.
763 		 */
764 		conf = &txq_ctrl->hairpin_conf;
765 		if (conf->peers[0].port == rx_port) {
766 			if (num_q == 0) {
767 				manual = conf->manual_bind;
768 				explicit = conf->tx_explicit;
769 			} else {
770 				if (manual != conf->manual_bind ||
771 				    explicit != conf->tx_explicit) {
772 					rte_errno = EINVAL;
773 					DRV_LOG(ERR, "port %u queue %d mode"
774 						" mismatch: %u %u, %u %u",
775 						local_port, i, manual,
776 						conf->manual_bind, explicit,
777 						conf->tx_explicit);
778 					mlx5_txq_release(dev, i);
779 					return -rte_errno;
780 				}
781 			}
782 			num_q++;
783 		}
784 		mlx5_txq_release(dev, i);
785 	}
786 	/* If no queue is configured, return success directly. */
787 	if (num_q == 0)
788 		return ret;
789 	/* All the hairpin TX queues need to be traversed again. */
790 	for (i = 0; i != priv->txqs_n; i++) {
791 		txq_ctrl = mlx5_txq_get(dev, i);
792 		if (txq_ctrl == NULL)
793 			continue;
794 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
795 			mlx5_txq_release(dev, i);
796 			continue;
797 		}
798 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
799 			mlx5_txq_release(dev, i);
800 			continue;
801 		}
802 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
803 		/*
804 		 * Fetch peer RxQ's information.
805 		 * No need to pass the information of the current queue.
806 		 */
807 		ret = rte_eth_hairpin_queue_peer_update(rx_port, rx_queue,
808 							NULL, &peer, 1);
809 		if (ret != 0) {
810 			mlx5_txq_release(dev, i);
811 			goto error;
812 		}
813 		/* Accessing its own device, inside mlx5 PMD. */
814 		ret = mlx5_hairpin_queue_peer_bind(dev, i, &peer, 1);
815 		if (ret != 0) {
816 			mlx5_txq_release(dev, i);
817 			goto error;
818 		}
819 		/* Pass TxQ's information to peer RxQ and try binding. */
820 		cur.peer_q = rx_queue;
821 		cur.qp_id = txq_ctrl->obj->sq->id;
822 		cur.vhca_id = priv->config.hca_attr.vhca_id;
823 		cur.tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
824 		cur.manual_bind = txq_ctrl->hairpin_conf.manual_bind;
825 		/*
826 		 * In order to access another device in a proper way, the RTE
827 		 * level private function is needed.
828 		 */
829 		ret = rte_eth_hairpin_queue_peer_bind(rx_port, rx_queue,
830 						      &cur, 0);
831 		if (ret != 0) {
832 			mlx5_txq_release(dev, i);
833 			goto error;
834 		}
835 		mlx5_txq_release(dev, i);
836 	}
837 	return 0;
838 error:
839 	/*
840 	 * Roll back the queues that were already bound.
841 	 * No need to check the return value of the queue unbind function.
842 	 */
843 	do {
844 		/* No validation is needed here. */
845 		txq_ctrl = mlx5_txq_get(dev, i);
846 		if (txq_ctrl == NULL)
847 			continue;
848 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
849 		rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
850 		mlx5_hairpin_queue_peer_unbind(dev, i, 1);
851 		mlx5_txq_release(dev, i);
852 	} while (i--);
853 	return ret;
854 }
855 
856 /*
857  * Unbind the hairpin port pair. The HW configuration of both devices will be
858  * cleared and the status will be reset for all the queues used between them.
859  * This function only supports unbinding the Tx port from one Rx port.
860  *
861  * @param dev
862  *   Pointer to Ethernet device structure.
863  * @param rx_port
864  *   Port identifier of the Rx port.
865  *
866  * @return
867  *   0 on success, a negative errno value otherwise and rte_errno is set.
868  */
869 static int
870 mlx5_hairpin_unbind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
871 {
872 	struct mlx5_priv *priv = dev->data->dev_private;
873 	struct mlx5_txq_ctrl *txq_ctrl;
874 	uint32_t i;
875 	int ret;
876 	uint16_t cur_port = priv->dev_data->port_id;
877 
878 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
879 		rte_errno = ENODEV;
880 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
881 		return -rte_errno;
882 	}
883 	for (i = 0; i != priv->txqs_n; i++) {
884 		uint16_t rx_queue;
885 
886 		txq_ctrl = mlx5_txq_get(dev, i);
887 		if (txq_ctrl == NULL)
888 			continue;
889 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
890 			mlx5_txq_release(dev, i);
891 			continue;
892 		}
893 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
894 			mlx5_txq_release(dev, i);
895 			continue;
896 		}
897 		/* Indeed, only the first used queue needs to be checked. */
898 		if (txq_ctrl->hairpin_conf.manual_bind == 0) {
899 			if (cur_port != rx_port) {
900 				rte_errno = EINVAL;
901 				DRV_LOG(ERR, "port %u and port %u are in"
902 					" auto-bind mode", cur_port, rx_port);
903 				mlx5_txq_release(dev, i);
904 				return -rte_errno;
905 			} else {
906 				return 0;
907 			}
908 		}
909 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
910 		mlx5_txq_release(dev, i);
911 		ret = rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
912 		if (ret) {
913 			DRV_LOG(ERR, "port %u Rx queue %d unbind - failure",
914 				rx_port, rx_queue);
915 			return ret;
916 		}
917 		ret = mlx5_hairpin_queue_peer_unbind(dev, i, 1);
918 		if (ret) {
919 			DRV_LOG(ERR, "port %u Tx queue %d unbind - failure",
920 				cur_port, i);
921 			return ret;
922 		}
923 	}
924 	return 0;
925 }
926 
927 /*
928  * Bind hairpin ports; the Rx port may be all ports when RTE_MAX_ETHPORTS is used.
929  * @see mlx5_hairpin_bind_single_port()
930  */
931 int
932 mlx5_hairpin_bind(struct rte_eth_dev *dev, uint16_t rx_port)
933 {
934 	int ret = 0;
935 	uint16_t p, pp;
936 
937 	/*
938 	 * If the Rx port has no hairpin configuration with the current port,
939 	 * the binding is skipped inside the single-port bind function.
940 	 * The device started status is checked only before updating the
941 	 * queue information.
942 	 */
943 	if (rx_port == RTE_MAX_ETHPORTS) {
944 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
945 			ret = mlx5_hairpin_bind_single_port(dev, p);
946 			if (ret != 0)
947 				goto unbind;
948 		}
949 		return ret;
950 	} else {
951 		return mlx5_hairpin_bind_single_port(dev, rx_port);
952 	}
953 unbind:
954 	MLX5_ETH_FOREACH_DEV(pp, dev->device)
955 		if (pp < p)
956 			mlx5_hairpin_unbind_single_port(dev, pp);
957 	return ret;
958 }
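/*
 * Minimal application-level sketch (an assumption about typical usage, not
 * code from this driver): after both ports are started with manually bound
 * hairpin queues configured towards each other, the pair is bound and later
 * unbound through the generic ethdev API, which lands in the functions above:
 *
 *     ret = rte_eth_hairpin_bind(tx_port_id, rx_port_id);
 *     if (ret != 0)
 *         printf("hairpin bind failed: %s\n", rte_strerror(-ret));
 *     ...
 *     rte_eth_hairpin_unbind(tx_port_id, rx_port_id);
 */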
959 
960 /*
961  * Unbind hairpin ports; the Rx port may be all ports when RTE_MAX_ETHPORTS is used.
962  * @see mlx5_hairpin_unbind_single_port()
963  */
964 int
965 mlx5_hairpin_unbind(struct rte_eth_dev *dev, uint16_t rx_port)
966 {
967 	int ret = 0;
968 	uint16_t p;
969 
970 	if (rx_port == RTE_MAX_ETHPORTS)
971 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
972 			ret = mlx5_hairpin_unbind_single_port(dev, p);
973 			if (ret != 0)
974 				return ret;
975 		}
976 	else
977 		ret = mlx5_hairpin_unbind_single_port(dev, rx_port);
978 	return ret;
979 }
980 
981 /*
982  * DPDK callback to get the hairpin peer ports list.
983  * This will return the actual number of peer ports and save the identifiers
984  * into the array (sorted, possibly in a different order than when the
985  * hairpin peer queues were set up).
986  * The peer port ID could be the same as the port ID of the current device.
987  *
988  * @param dev
989  *   Pointer to Ethernet device structure.
990  * @param peer_ports
991  *   Pointer to array to save the port identifiers.
992  * @param len
993  *   The length of the array.
994  * @param direction
995  *   Current port to peer port direction.
996  *   positive - current used as Tx to get all peer Rx ports.
997  *   zero - current used as Rx to get all peer Tx ports.
998  *
999  * @return
1000  *   0 or positive value on success, actual number of peer ports.
1001  *   a negative errno value otherwise and rte_errno is set.
1002  */
1003 int
1004 mlx5_hairpin_get_peer_ports(struct rte_eth_dev *dev, uint16_t *peer_ports,
1005 			    size_t len, uint32_t direction)
1006 {
1007 	struct mlx5_priv *priv = dev->data->dev_private;
1008 	struct mlx5_txq_ctrl *txq_ctrl;
1009 	uint32_t i;
1010 	uint16_t pp;
1011 	uint32_t bits[(RTE_MAX_ETHPORTS + 31) / 32] = {0};
1012 	int ret = 0;
1013 
1014 	if (direction) {
1015 		for (i = 0; i < priv->txqs_n; i++) {
1016 			txq_ctrl = mlx5_txq_get(dev, i);
1017 			if (!txq_ctrl)
1018 				continue;
1019 			if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
1020 				mlx5_txq_release(dev, i);
1021 				continue;
1022 			}
1023 			pp = txq_ctrl->hairpin_conf.peers[0].port;
1024 			if (pp >= RTE_MAX_ETHPORTS) {
1025 				rte_errno = ERANGE;
1026 				mlx5_txq_release(dev, i);
1027 				DRV_LOG(ERR, "port %hu queue %u peer port "
1028 					"out of range %hu",
1029 					priv->dev_data->port_id, i, pp);
1030 				return -rte_errno;
1031 			}
1032 			bits[pp / 32] |= 1 << (pp % 32);
1033 			mlx5_txq_release(dev, i);
1034 		}
1035 	} else {
1036 		for (i = 0; i < priv->rxqs_n; i++) {
1037 			struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, i);
1038 			struct mlx5_rxq_ctrl *rxq_ctrl;
1039 
1040 			if (rxq == NULL)
1041 				continue;
1042 			rxq_ctrl = rxq->ctrl;
1043 			if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN)
1044 				continue;
1045 			pp = rxq->hairpin_conf.peers[0].port;
1046 			if (pp >= RTE_MAX_ETHPORTS) {
1047 				rte_errno = ERANGE;
1048 				DRV_LOG(ERR, "port %hu queue %u peer port "
1049 					"out of range %hu",
1050 					priv->dev_data->port_id, i, pp);
1051 				return -rte_errno;
1052 			}
1053 			bits[pp / 32] |= 1 << (pp % 32);
1054 		}
1055 	}
1056 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1057 		if (bits[i / 32] & (1 << (i % 32))) {
1058 			if ((size_t)ret >= len) {
1059 				rte_errno = E2BIG;
1060 				return -rte_errno;
1061 			}
1062 			peer_ports[ret++] = i;
1063 		}
1064 	}
1065 	return ret;
1066 }
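/*
 * Minimal usage sketch (illustrative, the variable names are not from this
 * file): an application can list the Rx peers of a Tx port before binding
 * them one by one; the ethdev wrapper ends up in the callback above with
 * direction == 1:
 *
 *     uint16_t peers[RTE_MAX_ETHPORTS];
 *     int n = rte_eth_hairpin_get_peer_ports(tx_port_id, peers,
 *                                            RTE_DIM(peers), 1);
 *     for (int i = 0; i < n; i++)
 *         rte_eth_hairpin_bind(tx_port_id, peers[i]);
 */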
1067 
1068 /**
1069  * DPDK callback to start the device.
1070  *
1071  * Simulate device start by attaching all configured flows.
1072  *
1073  * @param dev
1074  *   Pointer to Ethernet device structure.
1075  *
1076  * @return
1077  *   0 on success, a negative errno value otherwise and rte_errno is set.
1078  */
1079 int
1080 mlx5_dev_start(struct rte_eth_dev *dev)
1081 {
1082 	struct mlx5_priv *priv = dev->data->dev_private;
1083 	int ret;
1084 	int fine_inline;
1085 
1086 	DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id);
1087 	fine_inline = rte_mbuf_dynflag_lookup
1088 		(RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, NULL);
1089 	if (fine_inline >= 0)
1090 		rte_net_mlx5_dynf_inline_mask = 1UL << fine_inline;
1091 	else
1092 		rte_net_mlx5_dynf_inline_mask = 0;
1093 	if (dev->data->nb_rx_queues > 0) {
1094 		ret = mlx5_dev_configure_rss_reta(dev);
1095 		if (ret) {
1096 			DRV_LOG(ERR, "port %u reta config failed: %s",
1097 				dev->data->port_id, strerror(rte_errno));
1098 			return -rte_errno;
1099 		}
1100 	}
1101 	ret = mlx5_txpp_start(dev);
1102 	if (ret) {
1103 		DRV_LOG(ERR, "port %u Tx packet pacing init failed: %s",
1104 			dev->data->port_id, strerror(rte_errno));
1105 		goto error;
1106 	}
1107 	if ((priv->sh->devx && priv->config.dv_flow_en &&
1108 	    priv->config.dest_tir) && priv->obj_ops.lb_dummy_queue_create) {
1109 		ret = priv->obj_ops.lb_dummy_queue_create(dev);
1110 		if (ret)
1111 			goto error;
1112 	}
1113 	ret = mlx5_txq_start(dev);
1114 	if (ret) {
1115 		DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
1116 			dev->data->port_id, strerror(rte_errno));
1117 		goto error;
1118 	}
1119 	ret = mlx5_rxq_start(dev);
1120 	if (ret) {
1121 		DRV_LOG(ERR, "port %u Rx queue allocation failed: %s",
1122 			dev->data->port_id, strerror(rte_errno));
1123 		goto error;
1124 	}
1125 	/*
1126 	 * This step is skipped if there is no hairpin Tx queue configured
1127 	 * with an Rx peer queue from the same device.
1128 	 */
1129 	ret = mlx5_hairpin_auto_bind(dev);
1130 	if (ret) {
1131 		DRV_LOG(ERR, "port %u hairpin auto binding failed: %s",
1132 			dev->data->port_id, strerror(rte_errno));
1133 		goto error;
1134 	}
1135 	/* Set started flag here for the following steps like control flow. */
1136 	dev->data->dev_started = 1;
1137 	ret = mlx5_rx_intr_vec_enable(dev);
1138 	if (ret) {
1139 		DRV_LOG(ERR, "port %u Rx interrupt vector creation failed",
1140 			dev->data->port_id);
1141 		goto error;
1142 	}
1143 	mlx5_os_stats_init(dev);
1144 	ret = mlx5_traffic_enable(dev);
1145 	if (ret) {
1146 		DRV_LOG(ERR, "port %u failed to set default flows",
1147 			dev->data->port_id);
1148 		goto error;
1149 	}
1150 	/* Set a mask and offset of dynamic metadata flows into Rx queues. */
1151 	mlx5_flow_rxq_dynf_metadata_set(dev);
1152 	/* Set flags and context to convert Rx timestamps. */
1153 	mlx5_rxq_timestamp_set(dev);
1154 	/* Set a mask and offset of scheduling on timestamp into Tx queues. */
1155 	mlx5_txq_dynf_timestamp_set(dev);
1156 	/* Attach indirection table objects detached on port stop. */
1157 	ret = mlx5_action_handle_attach(dev);
1158 	if (ret) {
1159 		DRV_LOG(ERR,
1160 			"port %u failed to attach indirect actions: %s",
1161 			dev->data->port_id, rte_strerror(rte_errno));
1162 		goto error;
1163 	}
1164 	/*
1165 	 * In non-cached mode, only the default mreg copy action needs to be
1166 	 * started, since no flow created by the application exists anymore.
1167 	 * But it is worth keeping the wrapper interface for further usage.
1168 	 */
1169 	ret = mlx5_flow_start_default(dev);
1170 	if (ret) {
1171 		DRV_LOG(DEBUG, "port %u failed to start default actions: %s",
1172 			dev->data->port_id, strerror(rte_errno));
1173 		goto error;
1174 	}
1175 	if (mlx5_dev_ctx_shared_mempool_subscribe(dev) != 0) {
1176 		DRV_LOG(ERR, "port %u failed to subscribe for mempool life cycle: %s",
1177 			dev->data->port_id, rte_strerror(rte_errno));
1178 		goto error;
1179 	}
1180 	rte_wmb();
1181 	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
1182 	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
1183 	/* Enable datapath on secondary process. */
1184 	mlx5_mp_os_req_start_rxtx(dev);
1185 	if (rte_intr_fd_get(priv->sh->intr_handle) >= 0) {
1186 		priv->sh->port[priv->dev_port - 1].ih_port_id =
1187 					(uint32_t)dev->data->port_id;
1188 	} else {
1189 		DRV_LOG(INFO, "port %u starts without LSC and RMV interrupts.",
1190 			dev->data->port_id);
1191 		dev->data->dev_conf.intr_conf.lsc = 0;
1192 		dev->data->dev_conf.intr_conf.rmv = 0;
1193 	}
1194 	if (rte_intr_fd_get(priv->sh->intr_handle_devx) >= 0)
1195 		priv->sh->port[priv->dev_port - 1].devx_ih_port_id =
1196 					(uint32_t)dev->data->port_id;
1197 	return 0;
1198 error:
1199 	ret = rte_errno; /* Save rte_errno before cleanup. */
1200 	/* Rollback. */
1201 	dev->data->dev_started = 0;
1202 	mlx5_flow_stop_default(dev);
1203 	mlx5_traffic_disable(dev);
1204 	mlx5_txq_stop(dev);
1205 	mlx5_rxq_stop(dev);
1206 	if (priv->obj_ops.lb_dummy_queue_release)
1207 		priv->obj_ops.lb_dummy_queue_release(dev);
1208 	mlx5_txpp_stop(dev); /* Stop last. */
1209 	rte_errno = ret; /* Restore rte_errno. */
1210 	return -rte_errno;
1211 }
1212 
1213 /**
1214  * DPDK callback to stop the device.
1215  *
1216  * Simulate device stop by detaching all configured flows.
1217  *
1218  * @param dev
1219  *   Pointer to Ethernet device structure.
1220  */
1221 int
1222 mlx5_dev_stop(struct rte_eth_dev *dev)
1223 {
1224 	struct mlx5_priv *priv = dev->data->dev_private;
1225 
1226 	dev->data->dev_started = 0;
1227 	/* Prevent crashes when queues are still in use. */
1228 	dev->rx_pkt_burst = removed_rx_burst;
1229 	dev->tx_pkt_burst = removed_tx_burst;
1230 	rte_wmb();
1231 	/* Disable datapath on secondary process. */
1232 	mlx5_mp_os_req_stop_rxtx(dev);
1233 	rte_delay_us_sleep(1000 * priv->rxqs_n);
1234 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
1235 	mlx5_flow_stop_default(dev);
1236 	/* Control flows for default traffic can be removed first. */
1237 	mlx5_traffic_disable(dev);
1238 	/* All RX queue flags will be cleared in the flush interface. */
1239 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_GEN, true);
1240 	mlx5_flow_meter_rxq_flush(dev);
1241 	mlx5_action_handle_detach(dev);
1242 	mlx5_rx_intr_vec_disable(dev);
1243 	priv->sh->port[priv->dev_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1244 	priv->sh->port[priv->dev_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
1245 	mlx5_txq_stop(dev);
1246 	mlx5_rxq_stop(dev);
1247 	if (priv->obj_ops.lb_dummy_queue_release)
1248 		priv->obj_ops.lb_dummy_queue_release(dev);
1249 	mlx5_txpp_stop(dev);
1250 
1251 	return 0;
1252 }
1253 
1254 /**
1255  * Enable traffic flows configured by control plane
1256  *
1257  * @param dev
1258  *   Pointer to Ethernet device structure.
1261  *
1262  * @return
1263  *   0 on success, a negative errno value otherwise and rte_errno is set.
1264  */
1265 int
1266 mlx5_traffic_enable(struct rte_eth_dev *dev)
1267 {
1268 	struct mlx5_priv *priv = dev->data->dev_private;
1269 	struct rte_flow_item_eth bcast = {
1270 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1271 	};
1272 	struct rte_flow_item_eth ipv6_multi_spec = {
1273 		.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
1274 	};
1275 	struct rte_flow_item_eth ipv6_multi_mask = {
1276 		.dst.addr_bytes = "\xff\xff\x00\x00\x00\x00",
1277 	};
1278 	struct rte_flow_item_eth unicast = {
1279 		.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1280 	};
1281 	struct rte_flow_item_eth unicast_mask = {
1282 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1283 	};
1284 	const unsigned int vlan_filter_n = priv->vlan_filter_n;
1285 	const struct rte_ether_addr cmp = {
1286 		.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1287 	};
1288 	unsigned int i;
1289 	unsigned int j;
1290 	int ret;
1291 
1292 	/*
1293 	 * The hairpin Txq default flow should be created regardless of the
1294 	 * isolation mode. Otherwise all the packets to be sent would be sent
1295 	 * out directly without the Tx flow actions, e.g. encapsulation.
1296 	 */
1297 	for (i = 0; i != priv->txqs_n; ++i) {
1298 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
1299 		if (!txq_ctrl)
1300 			continue;
1301 		/* Only Tx implicit mode requires the default Tx flow. */
1302 		if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN &&
1303 		    txq_ctrl->hairpin_conf.tx_explicit == 0 &&
1304 		    txq_ctrl->hairpin_conf.peers[0].port ==
1305 		    priv->dev_data->port_id) {
1306 			ret = mlx5_ctrl_flow_source_queue(dev, i);
1307 			if (ret) {
1308 				mlx5_txq_release(dev, i);
1309 				goto error;
1310 			}
1311 		}
1312 		if ((priv->representor || priv->master) &&
1313 		    priv->config.dv_esw_en) {
1314 			if (mlx5_flow_create_devx_sq_miss_flow(dev, i) == 0) {
1315 				DRV_LOG(ERR,
1316 				DRV_LOG(ERR, "Port %u Tx queue %u SQ create representor devx default miss rule failed.",
1317 					dev->data->port_id, i);
1318 				mlx5_txq_release(dev, i);
1319 				goto error;
1320 		}
1321 		mlx5_txq_release(dev, i);
1322 	}
1323 	if ((priv->master || priv->representor) && priv->config.dv_esw_en) {
1324 		if (mlx5_flow_create_esw_table_zero_flow(dev))
1325 			priv->fdb_def_rule = 1;
1326 		else
1327 			DRV_LOG(INFO, "port %u FDB default rule cannot be"
1328 				" configured - only Eswitch group 0 flows are"
1329 				" supported.", dev->data->port_id);
1330 	}
1331 	if (!priv->config.lacp_by_user && priv->pf_bond >= 0) {
1332 		ret = mlx5_flow_lacp_miss(dev);
1333 		if (ret)
1334 			DRV_LOG(INFO, "port %u LACP rule cannot be created - "
1335 				"forward LACP to kernel.", dev->data->port_id);
1336 		else
1337 			DRV_LOG(INFO, "LACP traffic will be missed in port %u."
1338 				, dev->data->port_id);
1339 	}
1340 	if (priv->isolated)
1341 		return 0;
1342 	if (dev->data->promiscuous) {
1343 		struct rte_flow_item_eth promisc = {
1344 			.dst.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1345 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1346 			.type = 0,
1347 		};
1348 
1349 		ret = mlx5_ctrl_flow(dev, &promisc, &promisc);
1350 		if (ret)
1351 			goto error;
1352 	}
1353 	if (dev->data->all_multicast) {
1354 		struct rte_flow_item_eth multicast = {
1355 			.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
1356 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1357 			.type = 0,
1358 		};
1359 
1360 		ret = mlx5_ctrl_flow(dev, &multicast, &multicast);
1361 		if (ret)
1362 			goto error;
1363 	} else {
1364 		/* Add broadcast/multicast flows. */
1365 		for (i = 0; i != vlan_filter_n; ++i) {
1366 			uint16_t vlan = priv->vlan_filter[i];
1367 
1368 			struct rte_flow_item_vlan vlan_spec = {
1369 				.tci = rte_cpu_to_be_16(vlan),
1370 			};
1371 			struct rte_flow_item_vlan vlan_mask =
1372 				rte_flow_item_vlan_mask;
1373 
1374 			ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast,
1375 						  &vlan_spec, &vlan_mask);
1376 			if (ret)
1377 				goto error;
1378 			ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec,
1379 						  &ipv6_multi_mask,
1380 						  &vlan_spec, &vlan_mask);
1381 			if (ret)
1382 				goto error;
1383 		}
1384 		if (!vlan_filter_n) {
1385 			ret = mlx5_ctrl_flow(dev, &bcast, &bcast);
1386 			if (ret)
1387 				goto error;
1388 			ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec,
1389 					     &ipv6_multi_mask);
1390 			if (ret) {
1391 				/* Do not fail on IPv6 multicast creation failure. */
1392 				DRV_LOG(WARNING,
1393 					"IPv6 multicast is not supported");
1394 				ret = 0;
1395 			}
1396 		}
1397 	}
1398 	/* Add MAC address flows. */
1399 	for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
1400 		struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
1401 
1402 		if (!memcmp(mac, &cmp, sizeof(*mac)))
1403 			continue;
1404 		memcpy(&unicast.dst.addr_bytes,
1405 		       mac->addr_bytes,
1406 		       RTE_ETHER_ADDR_LEN);
1407 		for (j = 0; j != vlan_filter_n; ++j) {
1408 			uint16_t vlan = priv->vlan_filter[j];
1409 
1410 			struct rte_flow_item_vlan vlan_spec = {
1411 				.tci = rte_cpu_to_be_16(vlan),
1412 			};
1413 			struct rte_flow_item_vlan vlan_mask =
1414 				rte_flow_item_vlan_mask;
1415 
1416 			ret = mlx5_ctrl_flow_vlan(dev, &unicast,
1417 						  &unicast_mask,
1418 						  &vlan_spec,
1419 						  &vlan_mask);
1420 			if (ret)
1421 				goto error;
1422 		}
1423 		if (!vlan_filter_n) {
1424 			ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask);
1425 			if (ret)
1426 				goto error;
1427 		}
1428 	}
1429 	return 0;
1430 error:
1431 	ret = rte_errno; /* Save rte_errno before cleanup. */
1432 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1433 	rte_errno = ret; /* Restore rte_errno. */
1434 	return -rte_errno;
1435 }
1436 
1437 
1438 /**
1439  * Disable traffic flows configured by control plane
1440  *
1441  * @param dev
1442  *   Pointer to Ethernet device structure.
1443  */
1444 void
1445 mlx5_traffic_disable(struct rte_eth_dev *dev)
1446 {
1447 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1448 }
1449 
1450 /**
1451  * Restart traffic flows configured by control plane
1452  *
1453  * @param dev
1454  *   Pointer to Ethernet device structure.
1455  *
1456  * @return
1457  *   0 on success, a negative errno value otherwise and rte_errno is set.
1458  */
1459 int
1460 mlx5_traffic_restart(struct rte_eth_dev *dev)
1461 {
1462 	if (dev->data->dev_started) {
1463 		mlx5_traffic_disable(dev);
1464 		return mlx5_traffic_enable(dev);
1465 	}
1466 	return 0;
1467 }
1468