xref: /dpdk/drivers/net/mlx5/mlx5_trigger.c (revision daa02b5cddbb8e11b31d41e2bf7bb1ae64dcae2f)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <unistd.h>
7 
8 #include <rte_ether.h>
9 #include <ethdev_driver.h>
10 #include <rte_interrupts.h>
11 #include <rte_alarm.h>
12 #include <rte_cycles.h>
13 
14 #include <mlx5_malloc.h>
15 
16 #include "mlx5.h"
17 #include "mlx5_rx.h"
18 #include "mlx5_tx.h"
19 #include "mlx5_utils.h"
20 #include "rte_pmd_mlx5.h"
21 
22 /**
23  * Stop traffic on Tx queues.
24  *
25  * @param dev
26  *   Pointer to Ethernet device structure.
27  */
28 static void
29 mlx5_txq_stop(struct rte_eth_dev *dev)
30 {
31 	struct mlx5_priv *priv = dev->data->dev_private;
32 	unsigned int i;
33 
34 	for (i = 0; i != priv->txqs_n; ++i)
35 		mlx5_txq_release(dev, i);
36 }
37 
38 /**
39  * Start traffic on Tx queues.
40  *
41  * @param dev
42  *   Pointer to Ethernet device structure.
43  *
44  * @return
45  *   0 on success, a negative errno value otherwise and rte_errno is set.
46  */
47 static int
48 mlx5_txq_start(struct rte_eth_dev *dev)
49 {
50 	struct mlx5_priv *priv = dev->data->dev_private;
51 	unsigned int i;
52 	int ret;
53 
54 	for (i = 0; i != priv->txqs_n; ++i) {
55 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
56 		struct mlx5_txq_data *txq_data;
57 		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
58 
59 		if (!txq_ctrl)
60 			continue;
		txq_data = &txq_ctrl->txq;
61 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD)
62 			txq_alloc_elts(txq_ctrl);
63 		MLX5_ASSERT(!txq_ctrl->obj);
64 		txq_ctrl->obj = mlx5_malloc(flags, sizeof(struct mlx5_txq_obj),
65 					    0, txq_ctrl->socket);
66 		if (!txq_ctrl->obj) {
67 			DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
68 				"memory resources.", dev->data->port_id,
69 				txq_data->idx);
70 			rte_errno = ENOMEM;
71 			goto error;
72 		}
73 		ret = priv->obj_ops.txq_obj_new(dev, i);
74 		if (ret < 0) {
75 			mlx5_free(txq_ctrl->obj);
76 			txq_ctrl->obj = NULL;
77 			goto error;
78 		}
79 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD) {
80 			size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
81 
82 			txq_data->fcqs = mlx5_malloc(flags, size,
83 						     RTE_CACHE_LINE_SIZE,
84 						     txq_ctrl->socket);
85 			if (!txq_data->fcqs) {
86 				DRV_LOG(ERR, "Port %u Tx queue %u cannot "
87 					"allocate memory (FCQ).",
88 					dev->data->port_id, i);
89 				rte_errno = ENOMEM;
90 				goto error;
91 			}
92 		}
93 		DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
94 			dev->data->port_id, i, (void *)&txq_ctrl->obj);
95 		LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
96 	}
97 	return 0;
98 error:
99 	ret = rte_errno; /* Save rte_errno before cleanup. */
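	/*
	 * Roll back: drop the references taken by mlx5_txq_get() for queues
	 * from the failing index down to 0.
	 */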
100 	do {
101 		mlx5_txq_release(dev, i);
102 	} while (i-- != 0);
103 	rte_errno = ret; /* Restore rte_errno. */
104 	return -rte_errno;
105 }
106 
107 /**
108  * Translate the chunk address to MR key in order to put it into the cache.
109  */
110 static void
111 mlx5_rxq_mempool_register_cb(struct rte_mempool *mp, void *opaque,
112 			     struct rte_mempool_memhdr *memhdr,
113 			     unsigned int idx)
114 {
115 	struct mlx5_rxq_data *rxq = opaque;
116 
117 	RTE_SET_USED(mp);
118 	RTE_SET_USED(idx);
119 	mlx5_rx_addr2mr(rxq, (uintptr_t)memhdr->addr);
120 }
121 
122 /**
123  * Register Rx queue mempools and fill the Rx queue cache.
124  * This function tolerates repeated mempool registration.
125  *
126  * @param[in] rxq_ctrl
127  *   Rx queue control data.
128  *
129  * @return
130  *   0 on success, (-1) on failure and rte_errno is set.
131  */
132 static int
133 mlx5_rxq_mempool_register(struct mlx5_rxq_ctrl *rxq_ctrl)
134 {
135 	struct mlx5_priv *priv = rxq_ctrl->priv;
136 	struct rte_mempool *mp;
137 	uint32_t s;
138 	int ret = 0;
139 
140 	mlx5_mr_flush_local_cache(&rxq_ctrl->rxq.mr_ctrl);
141 	/* MPRQ mempool is registered on creation, just fill the cache. */
142 	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) {
143 		rte_mempool_mem_iter(rxq_ctrl->rxq.mprq_mp,
144 				     mlx5_rxq_mempool_register_cb,
145 				     &rxq_ctrl->rxq);
146 		return 0;
147 	}
148 	for (s = 0; s < rxq_ctrl->rxq.rxseg_n; s++) {
149 		mp = rxq_ctrl->rxq.rxseg[s].mp;
150 		ret = mlx5_mr_mempool_register(&priv->sh->cdev->mr_scache,
151 					       priv->sh->cdev->pd, mp,
152 					       &priv->mp_id);
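		/*
		 * A negative return with rte_errno == EEXIST means the mempool
		 * is already registered; this is tolerated here.
		 */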
153 		if (ret < 0 && rte_errno != EEXIST)
154 			return ret;
155 		rte_mempool_mem_iter(mp, mlx5_rxq_mempool_register_cb,
156 				     &rxq_ctrl->rxq);
157 	}
158 	return 0;
159 }
160 
161 /**
162  * Stop traffic on Rx queues.
163  *
164  * @param dev
165  *   Pointer to Ethernet device structure.
166  */
167 static void
168 mlx5_rxq_stop(struct rte_eth_dev *dev)
169 {
170 	struct mlx5_priv *priv = dev->data->dev_private;
171 	unsigned int i;
172 
173 	for (i = 0; i != priv->rxqs_n; ++i)
174 		mlx5_rxq_release(dev, i);
175 }
176 
177 /**
178  * Start traffic on Rx queues.
179  *
180  * @param dev
181  *   Pointer to Ethernet device structure.
182  *
183  * @return
184  *   0 on success, a negative errno value otherwise and rte_errno is set.
185  */
186 static int
187 mlx5_rxq_start(struct rte_eth_dev *dev)
188 {
189 	struct mlx5_priv *priv = dev->data->dev_private;
190 	unsigned int i;
191 	int ret = 0;
192 
193 	/* Allocate/reuse/resize mempool for Multi-Packet RQ. */
194 	if (mlx5_mprq_alloc_mp(dev)) {
195 		/* Should not release Rx queues but return immediately. */
196 		return -rte_errno;
197 	}
198 	DRV_LOG(DEBUG, "Port %u device_attr.max_qp_wr is %d.",
199 		dev->data->port_id, priv->sh->device_attr.max_qp_wr);
200 	DRV_LOG(DEBUG, "Port %u device_attr.max_sge is %d.",
201 		dev->data->port_id, priv->sh->device_attr.max_sge);
202 	for (i = 0; i != priv->rxqs_n; ++i) {
203 		struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i);
204 
205 		if (!rxq_ctrl)
206 			continue;
207 		if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) {
208 			/*
209 			 * Pre-register the mempools. Regardless of whether
210 			 * the implicit registration is enabled or not,
211 			 * Rx mempool destruction is tracked to free MRs.
212 			 */
213 			if (mlx5_rxq_mempool_register(rxq_ctrl) < 0)
214 				goto error;
215 			ret = rxq_alloc_elts(rxq_ctrl);
216 			if (ret)
217 				goto error;
218 		}
219 		MLX5_ASSERT(!rxq_ctrl->obj);
220 		rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
221 					    sizeof(*rxq_ctrl->obj), 0,
222 					    rxq_ctrl->socket);
223 		if (!rxq_ctrl->obj) {
224 			DRV_LOG(ERR,
225 				"Port %u Rx queue %u can't allocate resources.",
226 				dev->data->port_id, (*priv->rxqs)[i]->idx);
227 			rte_errno = ENOMEM;
228 			goto error;
229 		}
230 		ret = priv->obj_ops.rxq_obj_new(dev, i);
231 		if (ret) {
232 			mlx5_free(rxq_ctrl->obj);
233 			rxq_ctrl->obj = NULL;
234 			goto error;
235 		}
236 		DRV_LOG(DEBUG, "Port %u rxq %u updated with %p.",
237 			dev->data->port_id, i, (void *)&rxq_ctrl->obj);
238 		LIST_INSERT_HEAD(&priv->rxqsobj, rxq_ctrl->obj, next);
239 	}
240 	return 0;
241 error:
242 	ret = rte_errno; /* Save rte_errno before cleanup. */
243 	do {
244 		mlx5_rxq_release(dev, i);
245 	} while (i-- != 0);
246 	rte_errno = ret; /* Restore rte_errno. */
247 	return -rte_errno;
248 }
249 
250 /**
251  * Automatically bind hairpin Tx queues to their peer Rx queues on the
252  * same port.
253  *
254  *
255  * @param dev
256  *   Pointer to Ethernet device structure.
257  *
258  * @return
259  *   0 on success, a negative errno value otherwise and rte_errno is set.
260  */
261 static int
262 mlx5_hairpin_auto_bind(struct rte_eth_dev *dev)
263 {
264 	struct mlx5_priv *priv = dev->data->dev_private;
265 	struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
266 	struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
267 	struct mlx5_txq_ctrl *txq_ctrl;
268 	struct mlx5_rxq_ctrl *rxq_ctrl;
269 	struct mlx5_devx_obj *sq;
270 	struct mlx5_devx_obj *rq;
271 	unsigned int i;
272 	int ret = 0;
273 	bool need_auto = false;
274 	uint16_t self_port = dev->data->port_id;
275 
276 	for (i = 0; i != priv->txqs_n; ++i) {
277 		txq_ctrl = mlx5_txq_get(dev, i);
278 		if (!txq_ctrl)
279 			continue;
280 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
281 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
282 			mlx5_txq_release(dev, i);
283 			continue;
284 		}
285 		if (txq_ctrl->hairpin_conf.manual_bind) {
286 			mlx5_txq_release(dev, i);
287 			return 0;
288 		}
289 		need_auto = true;
290 		mlx5_txq_release(dev, i);
291 	}
292 	if (!need_auto)
293 		return 0;
294 	for (i = 0; i != priv->txqs_n; ++i) {
295 		txq_ctrl = mlx5_txq_get(dev, i);
296 		if (!txq_ctrl)
297 			continue;
298 		/* Skip hairpin queues with other peer ports. */
299 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
300 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
301 			mlx5_txq_release(dev, i);
302 			continue;
303 		}
304 		if (!txq_ctrl->obj) {
305 			rte_errno = ENOMEM;
306 			DRV_LOG(ERR, "port %u no txq object found: %d",
307 				dev->data->port_id, i);
308 			mlx5_txq_release(dev, i);
309 			return -rte_errno;
310 		}
311 		sq = txq_ctrl->obj->sq;
312 		rxq_ctrl = mlx5_rxq_get(dev,
313 					txq_ctrl->hairpin_conf.peers[0].queue);
314 		if (!rxq_ctrl) {
315 			mlx5_txq_release(dev, i);
316 			rte_errno = EINVAL;
317 			DRV_LOG(ERR, "port %u no rxq object found: %d",
318 				dev->data->port_id,
319 				txq_ctrl->hairpin_conf.peers[0].queue);
320 			return -rte_errno;
321 		}
322 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN ||
323 		    rxq_ctrl->hairpin_conf.peers[0].queue != i) {
324 			rte_errno = ENOMEM;
325 			DRV_LOG(ERR, "port %u Tx queue %d can't be bound to "
326 				"Rx queue %d", dev->data->port_id,
327 				i, txq_ctrl->hairpin_conf.peers[0].queue);
328 			goto error;
329 		}
330 		rq = rxq_ctrl->obj->rq;
331 		if (!rq) {
332 			rte_errno = ENOMEM;
333 			DRV_LOG(ERR, "port %u hairpin no matching rxq: %d",
334 				dev->data->port_id,
335 				txq_ctrl->hairpin_conf.peers[0].queue);
336 			goto error;
337 		}
338 		sq_attr.state = MLX5_SQC_STATE_RDY;
339 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
340 		sq_attr.hairpin_peer_rq = rq->id;
341 		sq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
342 		ret = mlx5_devx_cmd_modify_sq(sq, &sq_attr);
343 		if (ret)
344 			goto error;
345 		rq_attr.state = MLX5_SQC_STATE_RDY;
346 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
347 		rq_attr.hairpin_peer_sq = sq->id;
348 		rq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
349 		ret = mlx5_devx_cmd_modify_rq(rq, &rq_attr);
350 		if (ret)
351 			goto error;
352 		/* Qs with auto-bind will be destroyed directly. */
353 		rxq_ctrl->hairpin_status = 1;
354 		txq_ctrl->hairpin_status = 1;
355 		mlx5_txq_release(dev, i);
356 		mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue);
357 	}
358 	return 0;
359 error:
360 	mlx5_txq_release(dev, i);
361 	mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue);
362 	return -rte_errno;
363 }
364 
365 /*
366  * Fetch the peer queue's SW & HW information.
367  *
368  * @param dev
369  *   Pointer to Ethernet device structure.
370  * @param peer_queue
371  *   Index of the queue to fetch the information from.
372  * @param current_info
373  *   Pointer to the input peer information, not used currently.
374  * @param peer_info
375  *   Pointer to the structure to store the information, output.
376  * @param direction
377  *   Positive to get the RxQ information, zero to get the TxQ information.
378  *
379  * @return
380  *   0 on success, a negative errno value otherwise and rte_errno is set.
381  */
382 int
383 mlx5_hairpin_queue_peer_update(struct rte_eth_dev *dev, uint16_t peer_queue,
384 			       struct rte_hairpin_peer_info *current_info,
385 			       struct rte_hairpin_peer_info *peer_info,
386 			       uint32_t direction)
387 {
388 	struct mlx5_priv *priv = dev->data->dev_private;
389 	RTE_SET_USED(current_info);
390 
391 	if (dev->data->dev_started == 0) {
392 		rte_errno = EBUSY;
393 		DRV_LOG(ERR, "peer port %u is not started",
394 			dev->data->port_id);
395 		return -rte_errno;
396 	}
397 	/*
398 	 * Peer port used as egress. In the current design, hairpin Tx queue
399 	 * will be bound to the peer Rx queue. Indeed, only the information of
400 	 * peer Rx queue needs to be fetched.
401 	 */
402 	if (direction == 0) {
403 		struct mlx5_txq_ctrl *txq_ctrl;
404 
405 		txq_ctrl = mlx5_txq_get(dev, peer_queue);
406 		if (txq_ctrl == NULL) {
407 			rte_errno = EINVAL;
408 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
409 				dev->data->port_id, peer_queue);
410 			return -rte_errno;
411 		}
412 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
413 			rte_errno = EINVAL;
414 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Txq",
415 				dev->data->port_id, peer_queue);
416 			mlx5_txq_release(dev, peer_queue);
417 			return -rte_errno;
418 		}
419 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
420 			rte_errno = ENOMEM;
421 			DRV_LOG(ERR, "port %u no Txq object found: %d",
422 				dev->data->port_id, peer_queue);
423 			mlx5_txq_release(dev, peer_queue);
424 			return -rte_errno;
425 		}
426 		peer_info->qp_id = txq_ctrl->obj->sq->id;
427 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
428 		/* 1-to-1 mapping, only the first one is used. */
429 		peer_info->peer_q = txq_ctrl->hairpin_conf.peers[0].queue;
430 		peer_info->tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
431 		peer_info->manual_bind = txq_ctrl->hairpin_conf.manual_bind;
432 		mlx5_txq_release(dev, peer_queue);
433 	} else { /* Peer port used as ingress. */
434 		struct mlx5_rxq_ctrl *rxq_ctrl;
435 
436 		rxq_ctrl = mlx5_rxq_get(dev, peer_queue);
437 		if (rxq_ctrl == NULL) {
438 			rte_errno = EINVAL;
439 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
440 				dev->data->port_id, peer_queue);
441 			return -rte_errno;
442 		}
443 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
444 			rte_errno = EINVAL;
445 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Rxq",
446 				dev->data->port_id, peer_queue);
447 			mlx5_rxq_release(dev, peer_queue);
448 			return -rte_errno;
449 		}
450 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
451 			rte_errno = ENOMEM;
452 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
453 				dev->data->port_id, peer_queue);
454 			mlx5_rxq_release(dev, peer_queue);
455 			return -rte_errno;
456 		}
457 		peer_info->qp_id = rxq_ctrl->obj->rq->id;
458 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
459 		peer_info->peer_q = rxq_ctrl->hairpin_conf.peers[0].queue;
460 		peer_info->tx_explicit = rxq_ctrl->hairpin_conf.tx_explicit;
461 		peer_info->manual_bind = rxq_ctrl->hairpin_conf.manual_bind;
462 		mlx5_rxq_release(dev, peer_queue);
463 	}
464 	return 0;
465 }
466 
467 /*
468  * Bind the hairpin queue with the peer HW information.
469  * This needs to be called twice both for Tx and Rx queues of a pair.
470  * If the queue is already bound, it is considered successful.
471  *
472  * @param dev
473  *   Pointer to Ethernet device structure.
474  * @param cur_queue
475  *   Index of the queue to change the HW configuration to bind.
476  * @param peer_info
477  *   Pointer to information of the peer queue.
478  * @param direction
479  *   Positive to configure the TxQ, zero to configure the RxQ.
480  *
481  * @return
482  *   0 on success, a negative errno value otherwise and rte_errno is set.
483  */
484 int
485 mlx5_hairpin_queue_peer_bind(struct rte_eth_dev *dev, uint16_t cur_queue,
486 			     struct rte_hairpin_peer_info *peer_info,
487 			     uint32_t direction)
488 {
489 	int ret = 0;
490 
491 	/*
492 	 * Consistency checking of the peer queue: opposite direction is used
493 	 * to get the peer queue info with ethdev port ID, no need to check.
494 	 */
495 	if (peer_info->peer_q != cur_queue) {
496 		rte_errno = EINVAL;
497 		DRV_LOG(ERR, "port %u queue %d and peer queue %d mismatch",
498 			dev->data->port_id, cur_queue, peer_info->peer_q);
499 		return -rte_errno;
500 	}
501 	if (direction != 0) {
502 		struct mlx5_txq_ctrl *txq_ctrl;
503 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
504 
505 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
506 		if (txq_ctrl == NULL) {
507 			rte_errno = EINVAL;
508 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
509 				dev->data->port_id, cur_queue);
510 			return -rte_errno;
511 		}
512 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
513 			rte_errno = EINVAL;
514 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
515 				dev->data->port_id, cur_queue);
516 			mlx5_txq_release(dev, cur_queue);
517 			return -rte_errno;
518 		}
519 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
520 			rte_errno = ENOMEM;
521 			DRV_LOG(ERR, "port %u no Txq object found: %d",
522 				dev->data->port_id, cur_queue);
523 			mlx5_txq_release(dev, cur_queue);
524 			return -rte_errno;
525 		}
526 		if (txq_ctrl->hairpin_status != 0) {
527 			DRV_LOG(DEBUG, "port %u Tx queue %d is already bound",
528 				dev->data->port_id, cur_queue);
529 			mlx5_txq_release(dev, cur_queue);
530 			return 0;
531 		}
532 		/*
533 		 * Consistency checking of all queues of one port is done in
534 		 * the bind() function, and that is optional.
535 		 */
536 		if (peer_info->tx_explicit !=
537 		    txq_ctrl->hairpin_conf.tx_explicit) {
538 			rte_errno = EINVAL;
539 			DRV_LOG(ERR, "port %u Tx queue %d and peer Tx rule mode"
540 				" mismatch", dev->data->port_id, cur_queue);
541 			mlx5_txq_release(dev, cur_queue);
542 			return -rte_errno;
543 		}
544 		if (peer_info->manual_bind !=
545 		    txq_ctrl->hairpin_conf.manual_bind) {
546 			rte_errno = EINVAL;
547 			DRV_LOG(ERR, "port %u Tx queue %d and peer binding mode"
548 				" mismatch", dev->data->port_id, cur_queue);
549 			mlx5_txq_release(dev, cur_queue);
550 			return -rte_errno;
551 		}
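		/* Move the SQ from RST to RDY with the peer RQ and VHCA ID set. */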
552 		sq_attr.state = MLX5_SQC_STATE_RDY;
553 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
554 		sq_attr.hairpin_peer_rq = peer_info->qp_id;
555 		sq_attr.hairpin_peer_vhca = peer_info->vhca_id;
556 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
557 		if (ret == 0)
558 			txq_ctrl->hairpin_status = 1;
559 		mlx5_txq_release(dev, cur_queue);
560 	} else {
561 		struct mlx5_rxq_ctrl *rxq_ctrl;
562 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
563 
564 		rxq_ctrl = mlx5_rxq_get(dev, cur_queue);
565 		if (rxq_ctrl == NULL) {
566 			rte_errno = EINVAL;
567 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
568 				dev->data->port_id, cur_queue);
569 			return -rte_errno;
570 		}
571 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
572 			rte_errno = EINVAL;
573 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
574 				dev->data->port_id, cur_queue);
575 			mlx5_rxq_release(dev, cur_queue);
576 			return -rte_errno;
577 		}
578 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
579 			rte_errno = ENOMEM;
580 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
581 				dev->data->port_id, cur_queue);
582 			mlx5_rxq_release(dev, cur_queue);
583 			return -rte_errno;
584 		}
585 		if (rxq_ctrl->hairpin_status != 0) {
586 			DRV_LOG(DEBUG, "port %u Rx queue %d is already bound",
587 				dev->data->port_id, cur_queue);
588 			mlx5_rxq_release(dev, cur_queue);
589 			return 0;
590 		}
591 		if (peer_info->tx_explicit !=
592 		    rxq_ctrl->hairpin_conf.tx_explicit) {
593 			rte_errno = EINVAL;
594 			DRV_LOG(ERR, "port %u Rx queue %d and peer Tx rule mode"
595 				" mismatch", dev->data->port_id, cur_queue);
596 			mlx5_rxq_release(dev, cur_queue);
597 			return -rte_errno;
598 		}
599 		if (peer_info->manual_bind !=
600 		    rxq_ctrl->hairpin_conf.manual_bind) {
601 			rte_errno = EINVAL;
602 			DRV_LOG(ERR, "port %u Rx queue %d and peer binding mode"
603 				" mismatch", dev->data->port_id, cur_queue);
604 			mlx5_rxq_release(dev, cur_queue);
605 			return -rte_errno;
606 		}
607 		rq_attr.state = MLX5_SQC_STATE_RDY;
608 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
609 		rq_attr.hairpin_peer_sq = peer_info->qp_id;
610 		rq_attr.hairpin_peer_vhca = peer_info->vhca_id;
611 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
612 		if (ret == 0)
613 			rxq_ctrl->hairpin_status = 1;
614 		mlx5_rxq_release(dev, cur_queue);
615 	}
616 	return ret;
617 }
618 
619 /*
620  * Unbind the hairpin queue and reset its HW configuration.
621  * This needs to be called twice both for Tx and Rx queues of a pair.
622  * If the queue is already unbound, it is considered successful.
623  *
624  * @param dev
625  *   Pointer to Ethernet device structure.
626  * @param cur_queue
627  *   Index of the queue to change the HW configuration to unbind.
628  * @param direction
629  *   Positive to reset the TxQ, zero to reset the RxQ.
630  *
631  * @return
632  *   0 on success, a negative errno value otherwise and rte_errno is set.
633  */
634 int
635 mlx5_hairpin_queue_peer_unbind(struct rte_eth_dev *dev, uint16_t cur_queue,
636 			       uint32_t direction)
637 {
638 	int ret = 0;
639 
640 	if (direction != 0) {
641 		struct mlx5_txq_ctrl *txq_ctrl;
642 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
643 
644 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
645 		if (txq_ctrl == NULL) {
646 			rte_errno = EINVAL;
647 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
648 				dev->data->port_id, cur_queue);
649 			return -rte_errno;
650 		}
651 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
652 			rte_errno = EINVAL;
653 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
654 				dev->data->port_id, cur_queue);
655 			mlx5_txq_release(dev, cur_queue);
656 			return -rte_errno;
657 		}
658 		/* Already unbound, return success before obj checking. */
659 		if (txq_ctrl->hairpin_status == 0) {
660 			DRV_LOG(DEBUG, "port %u Tx queue %d is already unbound",
661 				dev->data->port_id, cur_queue);
662 			mlx5_txq_release(dev, cur_queue);
663 			return 0;
664 		}
665 		if (!txq_ctrl->obj || !txq_ctrl->obj->sq) {
666 			rte_errno = ENOMEM;
667 			DRV_LOG(ERR, "port %u no Txq object found: %d",
668 				dev->data->port_id, cur_queue);
669 			mlx5_txq_release(dev, cur_queue);
670 			return -rte_errno;
671 		}
672 		sq_attr.state = MLX5_SQC_STATE_RST;
673 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
674 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
675 		if (ret == 0)
676 			txq_ctrl->hairpin_status = 0;
677 		mlx5_txq_release(dev, cur_queue);
678 	} else {
679 		struct mlx5_rxq_ctrl *rxq_ctrl;
680 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
681 
682 		rxq_ctrl = mlx5_rxq_get(dev, cur_queue);
683 		if (rxq_ctrl == NULL) {
684 			rte_errno = EINVAL;
685 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
686 				dev->data->port_id, cur_queue);
687 			return -rte_errno;
688 		}
689 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
690 			rte_errno = EINVAL;
691 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
692 				dev->data->port_id, cur_queue);
693 			mlx5_rxq_release(dev, cur_queue);
694 			return -rte_errno;
695 		}
696 		if (rxq_ctrl->hairpin_status == 0) {
697 			DRV_LOG(DEBUG, "port %u Rx queue %d is already unbound",
698 				dev->data->port_id, cur_queue);
699 			mlx5_rxq_release(dev, cur_queue);
700 			return 0;
701 		}
702 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
703 			rte_errno = ENOMEM;
704 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
705 				dev->data->port_id, cur_queue);
706 			mlx5_rxq_release(dev, cur_queue);
707 			return -rte_errno;
708 		}
709 		rq_attr.state = MLX5_SQC_STATE_RST;
710 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
711 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
712 		if (ret == 0)
713 			rxq_ctrl->hairpin_status = 0;
714 		mlx5_rxq_release(dev, cur_queue);
715 	}
716 	return ret;
717 }
718 
719 /*
720  * Bind the hairpin port pairs, from the Tx to the peer Rx.
721  * This function only supports binding the Tx port to one Rx port.
722  *
723  * @param dev
724  *   Pointer to Ethernet device structure.
725  * @param rx_port
726  *   Port identifier of the Rx port.
727  *
728  * @return
729  *   0 on success, a negative errno value otherwise and rte_errno is set.
730  */
731 static int
732 mlx5_hairpin_bind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
733 {
734 	struct mlx5_priv *priv = dev->data->dev_private;
735 	int ret = 0;
736 	struct mlx5_txq_ctrl *txq_ctrl;
737 	uint32_t i;
738 	struct rte_hairpin_peer_info peer = {0xffffff};
739 	struct rte_hairpin_peer_info cur;
740 	const struct rte_eth_hairpin_conf *conf;
741 	uint16_t num_q = 0;
742 	uint16_t local_port = priv->dev_data->port_id;
743 	uint32_t manual;
744 	uint32_t explicit;
745 	uint16_t rx_queue;
746 
747 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
748 		rte_errno = ENODEV;
749 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
750 		return -rte_errno;
751 	}
752 	/*
753 	 * Before binding TxQ to peer RxQ, a first loop over the queues checks
754 	 * their configuration consistency. This is a little time consuming
755 	 * but better than having to do a rollback later.
756 	 */
757 	for (i = 0; i != priv->txqs_n; i++) {
758 		txq_ctrl = mlx5_txq_get(dev, i);
759 		if (txq_ctrl == NULL)
760 			continue;
761 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
762 			mlx5_txq_release(dev, i);
763 			continue;
764 		}
765 		/*
766 		 * All hairpin Tx queues of a single port connected to the
767 		 * same peer Rx port should have the same "auto binding" and
768 		 * "implicit Tx flow" modes.
769 		 * Peer consistency checking will be done in per queue binding.
770 		 */
771 		conf = &txq_ctrl->hairpin_conf;
772 		if (conf->peers[0].port == rx_port) {
773 			if (num_q == 0) {
774 				manual = conf->manual_bind;
775 				explicit = conf->tx_explicit;
776 			} else {
777 				if (manual != conf->manual_bind ||
778 				    explicit != conf->tx_explicit) {
779 					rte_errno = EINVAL;
780 					DRV_LOG(ERR, "port %u queue %d mode"
781 						" mismatch: %u %u, %u %u",
782 						local_port, i, manual,
783 						conf->manual_bind, explicit,
784 						conf->tx_explicit);
785 					mlx5_txq_release(dev, i);
786 					return -rte_errno;
787 				}
788 			}
789 			num_q++;
790 		}
791 		mlx5_txq_release(dev, i);
792 	}
793 	/* If no queue is configured, return success directly. */
794 	if (num_q == 0)
795 		return ret;
796 	/* All the hairpin TX queues need to be traversed again. */
797 	for (i = 0; i != priv->txqs_n; i++) {
798 		txq_ctrl = mlx5_txq_get(dev, i);
799 		if (txq_ctrl == NULL)
800 			continue;
801 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
802 			mlx5_txq_release(dev, i);
803 			continue;
804 		}
805 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
806 			mlx5_txq_release(dev, i);
807 			continue;
808 		}
809 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
810 		/*
811 		 * Fetch peer RxQ's information.
812 		 * No need to pass the information of the current queue.
813 		 */
814 		ret = rte_eth_hairpin_queue_peer_update(rx_port, rx_queue,
815 							NULL, &peer, 1);
816 		if (ret != 0) {
817 			mlx5_txq_release(dev, i);
818 			goto error;
819 		}
820 		/* Accessing its own device, inside mlx5 PMD. */
821 		ret = mlx5_hairpin_queue_peer_bind(dev, i, &peer, 1);
822 		if (ret != 0) {
823 			mlx5_txq_release(dev, i);
824 			goto error;
825 		}
826 		/* Pass TxQ's information to peer RxQ and try binding. */
827 		cur.peer_q = rx_queue;
828 		cur.qp_id = txq_ctrl->obj->sq->id;
829 		cur.vhca_id = priv->config.hca_attr.vhca_id;
830 		cur.tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
831 		cur.manual_bind = txq_ctrl->hairpin_conf.manual_bind;
832 		/*
833 		 * In order to access another device in a proper way, the RTE
834 		 * level private function is needed.
835 		 */
836 		ret = rte_eth_hairpin_queue_peer_bind(rx_port, rx_queue,
837 						      &cur, 0);
838 		if (ret != 0) {
839 			mlx5_txq_release(dev, i);
840 			goto error;
841 		}
842 		mlx5_txq_release(dev, i);
843 	}
844 	return 0;
845 error:
846 	/*
847 	 * Do roll-back process for the queues already bound.
848 	 * No need to check the return value of the queue unbind function.
849 	 */
850 	do {
851 		/* No validation is needed here. */
852 		txq_ctrl = mlx5_txq_get(dev, i);
853 		if (txq_ctrl == NULL)
854 			continue;
855 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
856 		rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
857 		mlx5_hairpin_queue_peer_unbind(dev, i, 1);
858 		mlx5_txq_release(dev, i);
859 	} while (i--);
860 	return ret;
861 }
862 
863 /*
864  * Unbind the hairpin port pair; the HW configuration of both devices will be
865  * cleared and the status will be reset for all the queues used between them.
866  * This function only supports unbinding the Tx port from one Rx port.
867  *
868  * @param dev
869  *   Pointer to Ethernet device structure.
870  * @param rx_port
871  *   Port identifier of the Rx port.
872  *
873  * @return
874  *   0 on success, a negative errno value otherwise and rte_errno is set.
875  */
876 static int
877 mlx5_hairpin_unbind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
878 {
879 	struct mlx5_priv *priv = dev->data->dev_private;
880 	struct mlx5_txq_ctrl *txq_ctrl;
881 	uint32_t i;
882 	int ret;
883 	uint16_t cur_port = priv->dev_data->port_id;
884 
885 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
886 		rte_errno = ENODEV;
887 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
888 		return -rte_errno;
889 	}
890 	for (i = 0; i != priv->txqs_n; i++) {
891 		uint16_t rx_queue;
892 
893 		txq_ctrl = mlx5_txq_get(dev, i);
894 		if (txq_ctrl == NULL)
895 			continue;
896 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
897 			mlx5_txq_release(dev, i);
898 			continue;
899 		}
900 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
901 			mlx5_txq_release(dev, i);
902 			continue;
903 		}
904 		/* Indeed, only the first used queue needs to be checked. */
905 		if (txq_ctrl->hairpin_conf.manual_bind == 0) {
906 			if (cur_port != rx_port) {
907 				rte_errno = EINVAL;
908 				DRV_LOG(ERR, "port %u and port %u are in"
909 					" auto-bind mode", cur_port, rx_port);
910 				mlx5_txq_release(dev, i);
911 				return -rte_errno;
912 			} else {
913 				return 0;
914 			}
915 		}
916 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
917 		mlx5_txq_release(dev, i);
918 		ret = rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
919 		if (ret) {
920 			DRV_LOG(ERR, "port %u Rx queue %d unbind - failure",
921 				rx_port, rx_queue);
922 			return ret;
923 		}
924 		ret = mlx5_hairpin_queue_peer_unbind(dev, i, 1);
925 		if (ret) {
926 			DRV_LOG(ERR, "port %u Tx queue %d unbind - failure",
927 				cur_port, i);
928 			return ret;
929 		}
930 	}
931 	return 0;
932 }
933 
934 /*
935  * Bind hairpin ports, Rx could be all ports when using RTE_MAX_ETHPORTS.
936  * @see mlx5_hairpin_bind_single_port()
937  */
938 int
939 mlx5_hairpin_bind(struct rte_eth_dev *dev, uint16_t rx_port)
940 {
941 	int ret = 0;
942 	uint16_t p, pp;
943 
944 	/*
945 	 * If the Rx port has no hairpin configuration with the current port,
946 	 * the binding will be skipped by the single port bind function.
947 	 * The device started status is checked only before updating the
948 	 * queue information.
949 	 */
950 	if (rx_port == RTE_MAX_ETHPORTS) {
951 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
952 			ret = mlx5_hairpin_bind_single_port(dev, p);
953 			if (ret != 0)
954 				goto unbind;
955 		}
956 		return ret;
957 	} else {
958 		return mlx5_hairpin_bind_single_port(dev, rx_port);
959 	}
960 unbind:
961 	MLX5_ETH_FOREACH_DEV(pp, dev->device)
962 		if (pp < p)
963 			mlx5_hairpin_unbind_single_port(dev, pp);
964 	return ret;
965 }
966 
967 /*
968  * Unbind hairpin ports, Rx could be all ports when using RTE_MAX_ETHPORTS.
969  * @see mlx5_hairpin_unbind_single_port()
970  */
971 int
972 mlx5_hairpin_unbind(struct rte_eth_dev *dev, uint16_t rx_port)
973 {
974 	int ret = 0;
975 	uint16_t p;
976 
977 	if (rx_port == RTE_MAX_ETHPORTS)
978 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
979 			ret = mlx5_hairpin_unbind_single_port(dev, p);
980 			if (ret != 0)
981 				return ret;
982 		}
983 	else
984 		ret = mlx5_hairpin_unbind_single_port(dev, rx_port);
985 	return ret;
986 }
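
/*
 * Application-side usage sketch (an assumption, not part of this driver): the
 * two functions above are reached through the ethdev hairpin API, e.g.
 *
 *   ret = rte_eth_hairpin_bind(tx_port_id, rx_port_id);
 *   ...
 *   ret = rte_eth_hairpin_unbind(tx_port_id, RTE_MAX_ETHPORTS);
 *
 * where RTE_MAX_ETHPORTS as the Rx port means "all peer ports", as handled
 * above.
 */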
987 
988 /*
989  * DPDK callback to get the hairpin peer ports list.
990  * This will return the actual number of peer ports and save the identifiers
991  * into the array (sorted; the order may differ from the one used when
992  * setting up the hairpin peer queues).
993  * The peer port ID could be the same as the port ID of the current device.
994  *
995  * @param dev
996  *   Pointer to Ethernet device structure.
997  * @param peer_ports
998  *   Pointer to array to save the port identifiers.
999  * @param len
1000  *   The length of the array.
1001  * @param direction
1002  *   Current port to peer port direction.
1003  *   positive - current used as Tx to get all peer Rx ports.
1004  *   zero - current used as Rx to get all peer Tx ports.
1005  *
1006  * @return
1007  *   0 or positive value on success, actual number of peer ports.
1008  *   a negative errno value otherwise and rte_errno is set.
1009  */
1010 int
1011 mlx5_hairpin_get_peer_ports(struct rte_eth_dev *dev, uint16_t *peer_ports,
1012 			    size_t len, uint32_t direction)
1013 {
1014 	struct mlx5_priv *priv = dev->data->dev_private;
1015 	struct mlx5_txq_ctrl *txq_ctrl;
1016 	struct mlx5_rxq_ctrl *rxq_ctrl;
1017 	uint32_t i;
1018 	uint16_t pp;
1019 	uint32_t bits[(RTE_MAX_ETHPORTS + 31) / 32] = {0};
1020 	int ret = 0;
1021 
1022 	if (direction) {
1023 		for (i = 0; i < priv->txqs_n; i++) {
1024 			txq_ctrl = mlx5_txq_get(dev, i);
1025 			if (!txq_ctrl)
1026 				continue;
1027 			if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
1028 				mlx5_txq_release(dev, i);
1029 				continue;
1030 			}
1031 			pp = txq_ctrl->hairpin_conf.peers[0].port;
1032 			if (pp >= RTE_MAX_ETHPORTS) {
1033 				rte_errno = ERANGE;
1034 				mlx5_txq_release(dev, i);
1035 				DRV_LOG(ERR, "port %hu queue %u peer port "
1036 					"out of range %hu",
1037 					priv->dev_data->port_id, i, pp);
1038 				return -rte_errno;
1039 			}
1040 			bits[pp / 32] |= 1 << (pp % 32);
1041 			mlx5_txq_release(dev, i);
1042 		}
1043 	} else {
1044 		for (i = 0; i < priv->rxqs_n; i++) {
1045 			rxq_ctrl = mlx5_rxq_get(dev, i);
1046 			if (!rxq_ctrl)
1047 				continue;
1048 			if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
1049 				mlx5_rxq_release(dev, i);
1050 				continue;
1051 			}
1052 			pp = rxq_ctrl->hairpin_conf.peers[0].port;
1053 			if (pp >= RTE_MAX_ETHPORTS) {
1054 				rte_errno = ERANGE;
1055 				mlx5_rxq_release(dev, i);
1056 				DRV_LOG(ERR, "port %hu queue %u peer port "
1057 					"out of range %hu",
1058 					priv->dev_data->port_id, i, pp);
1059 				return -rte_errno;
1060 			}
1061 			bits[pp / 32] |= 1 << (pp % 32);
1062 			mlx5_rxq_release(dev, i);
1063 		}
1064 	}
1065 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1066 		if (bits[i / 32] & (1 << (i % 32))) {
1067 			if ((size_t)ret >= len) {
1068 				rte_errno = E2BIG;
1069 				return -rte_errno;
1070 			}
1071 			peer_ports[ret++] = i;
1072 		}
1073 	}
1074 	return ret;
1075 }
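
/*
 * Usage sketch (an assumption, application side): the peer port list is
 * typically queried before binding, e.g.
 *
 *   uint16_t peers[RTE_MAX_ETHPORTS];
 *   int n = rte_eth_hairpin_get_peer_ports(port_id, peers, RTE_DIM(peers), 1);
 *
 * with direction 1 listing the Rx peers of a Tx port, as implemented above.
 */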
1076 
1077 /**
1078  * DPDK callback to start the device.
1079  *
1080  * Simulate device start by attaching all configured flows.
1081  *
1082  * @param dev
1083  *   Pointer to Ethernet device structure.
1084  *
1085  * @return
1086  *   0 on success, a negative errno value otherwise and rte_errno is set.
1087  */
1088 int
1089 mlx5_dev_start(struct rte_eth_dev *dev)
1090 {
1091 	struct mlx5_priv *priv = dev->data->dev_private;
1092 	int ret;
1093 	int fine_inline;
1094 
1095 	DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id);
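	/*
	 * If the application registered the fine granularity inline dynamic
	 * mbuf flag, turn its bit number into a mask for the Tx path;
	 * otherwise keep the feature disabled.
	 */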
1096 	fine_inline = rte_mbuf_dynflag_lookup
1097 		(RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, NULL);
1098 	if (fine_inline >= 0)
1099 		rte_net_mlx5_dynf_inline_mask = 1UL << fine_inline;
1100 	else
1101 		rte_net_mlx5_dynf_inline_mask = 0;
1102 	if (dev->data->nb_rx_queues > 0) {
1103 		ret = mlx5_dev_configure_rss_reta(dev);
1104 		if (ret) {
1105 			DRV_LOG(ERR, "port %u reta config failed: %s",
1106 				dev->data->port_id, strerror(rte_errno));
1107 			return -rte_errno;
1108 		}
1109 	}
1110 	ret = mlx5_txpp_start(dev);
1111 	if (ret) {
1112 		DRV_LOG(ERR, "port %u Tx packet pacing init failed: %s",
1113 			dev->data->port_id, strerror(rte_errno));
1114 		goto error;
1115 	}
1116 	if ((priv->sh->devx && priv->config.dv_flow_en &&
1117 	    priv->config.dest_tir) && priv->obj_ops.lb_dummy_queue_create) {
1118 		ret = priv->obj_ops.lb_dummy_queue_create(dev);
1119 		if (ret)
1120 			goto error;
1121 	}
1122 	ret = mlx5_txq_start(dev);
1123 	if (ret) {
1124 		DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
1125 			dev->data->port_id, strerror(rte_errno));
1126 		goto error;
1127 	}
1128 	ret = mlx5_rxq_start(dev);
1129 	if (ret) {
1130 		DRV_LOG(ERR, "port %u Rx queue allocation failed: %s",
1131 			dev->data->port_id, strerror(rte_errno));
1132 		goto error;
1133 	}
1134 	/*
1135 	 * This step is skipped if there is no hairpin Tx queue configured
1136 	 * with an Rx peer queue on the same device.
1137 	 */
1138 	ret = mlx5_hairpin_auto_bind(dev);
1139 	if (ret) {
1140 		DRV_LOG(ERR, "port %u hairpin auto binding failed: %s",
1141 			dev->data->port_id, strerror(rte_errno));
1142 		goto error;
1143 	}
1144 	/* Set started flag here for the following steps like control flow. */
1145 	dev->data->dev_started = 1;
1146 	ret = mlx5_rx_intr_vec_enable(dev);
1147 	if (ret) {
1148 		DRV_LOG(ERR, "port %u Rx interrupt vector creation failed",
1149 			dev->data->port_id);
1150 		goto error;
1151 	}
1152 	mlx5_os_stats_init(dev);
1153 	ret = mlx5_traffic_enable(dev);
1154 	if (ret) {
1155 		DRV_LOG(ERR, "port %u failed to set default flows",
1156 			dev->data->port_id);
1157 		goto error;
1158 	}
1159 	/* Set a mask and offset of dynamic metadata flows into Rx queues. */
1160 	mlx5_flow_rxq_dynf_metadata_set(dev);
1161 	/* Set flags and context to convert Rx timestamps. */
1162 	mlx5_rxq_timestamp_set(dev);
1163 	/* Set a mask and offset of scheduling on timestamp into Tx queues. */
1164 	mlx5_txq_dynf_timestamp_set(dev);
1165 	/*
1166 	 * In non-cached mode, only the default mreg copy action needs to be
1167 	 * started, and no flow created by an application exists anymore.
1168 	 * But it is worth wrapping the interface for further usage.
1169 	 */
1170 	ret = mlx5_flow_start_default(dev);
1171 	if (ret) {
1172 		DRV_LOG(DEBUG, "port %u failed to start default actions: %s",
1173 			dev->data->port_id, strerror(rte_errno));
1174 		goto error;
1175 	}
1176 	if (mlx5_dev_ctx_shared_mempool_subscribe(dev) != 0) {
1177 		DRV_LOG(ERR, "port %u failed to subscribe for mempool life cycle: %s",
1178 			dev->data->port_id, rte_strerror(rte_errno));
1179 		goto error;
1180 	}
1181 	rte_wmb();
1182 	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
1183 	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
1184 	/* Enable datapath on secondary process. */
1185 	mlx5_mp_os_req_start_rxtx(dev);
1186 	if (priv->sh->intr_handle.fd >= 0) {
1187 		priv->sh->port[priv->dev_port - 1].ih_port_id =
1188 					(uint32_t)dev->data->port_id;
1189 	} else {
1190 		DRV_LOG(INFO, "port %u starts without LSC and RMV interrupts.",
1191 			dev->data->port_id);
1192 		dev->data->dev_conf.intr_conf.lsc = 0;
1193 		dev->data->dev_conf.intr_conf.rmv = 0;
1194 	}
1195 	if (priv->sh->intr_handle_devx.fd >= 0)
1196 		priv->sh->port[priv->dev_port - 1].devx_ih_port_id =
1197 					(uint32_t)dev->data->port_id;
1198 	return 0;
1199 error:
1200 	ret = rte_errno; /* Save rte_errno before cleanup. */
1201 	/* Rollback. */
1202 	dev->data->dev_started = 0;
1203 	mlx5_flow_stop_default(dev);
1204 	mlx5_traffic_disable(dev);
1205 	mlx5_txq_stop(dev);
1206 	mlx5_rxq_stop(dev);
1207 	if (priv->obj_ops.lb_dummy_queue_release)
1208 		priv->obj_ops.lb_dummy_queue_release(dev);
1209 	mlx5_txpp_stop(dev); /* Stop last. */
1210 	rte_errno = ret; /* Restore rte_errno. */
1211 	return -rte_errno;
1212 }
1213 
1214 /**
1215  * DPDK callback to stop the device.
1216  *
1217  * Simulate device stop by detaching all configured flows.
1218  *
1219  * @param dev
1220  *   Pointer to Ethernet device structure.
1221  */
1222 int
1223 mlx5_dev_stop(struct rte_eth_dev *dev)
1224 {
1225 	struct mlx5_priv *priv = dev->data->dev_private;
1226 
1227 	dev->data->dev_started = 0;
1228 	/* Prevent crashes when queues are still in use. */
1229 	dev->rx_pkt_burst = removed_rx_burst;
1230 	dev->tx_pkt_burst = removed_tx_burst;
1231 	rte_wmb();
1232 	/* Disable datapath on secondary process. */
1233 	mlx5_mp_os_req_stop_rxtx(dev);
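	/*
	 * Wait roughly 1 ms per Rx queue, presumably to let in-flight Rx/Tx
	 * bursts on other lcores return after the burst functions were
	 * replaced above.
	 */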
1234 	rte_delay_us_sleep(1000 * priv->rxqs_n);
1235 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
1236 	mlx5_flow_stop_default(dev);
1237 	/* Control flows for default traffic can be removed first. */
1238 	mlx5_traffic_disable(dev);
1239 	/* All RX queue flags will be cleared in the flush interface. */
1240 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_GEN, true);
1241 	mlx5_flow_meter_rxq_flush(dev);
1242 	mlx5_rx_intr_vec_disable(dev);
1243 	priv->sh->port[priv->dev_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1244 	priv->sh->port[priv->dev_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
1245 	mlx5_txq_stop(dev);
1246 	mlx5_rxq_stop(dev);
1247 	if (priv->obj_ops.lb_dummy_queue_release)
1248 		priv->obj_ops.lb_dummy_queue_release(dev);
1249 	mlx5_txpp_stop(dev);
1250 
1251 	return 0;
1252 }
1253 
1254 /**
1255  * Enable traffic flows configured by control plane.
1256  *
1257  * @param dev
1258  *   Pointer to Ethernet device structure.
1261  *
1262  * @return
1263  *   0 on success, a negative errno value otherwise and rte_errno is set.
1264  */
1265 int
1266 mlx5_traffic_enable(struct rte_eth_dev *dev)
1267 {
1268 	struct mlx5_priv *priv = dev->data->dev_private;
1269 	struct rte_flow_item_eth bcast = {
1270 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1271 	};
1272 	struct rte_flow_item_eth ipv6_multi_spec = {
1273 		.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
1274 	};
1275 	struct rte_flow_item_eth ipv6_multi_mask = {
1276 		.dst.addr_bytes = "\xff\xff\x00\x00\x00\x00",
1277 	};
1278 	struct rte_flow_item_eth unicast = {
1279 		.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1280 	};
1281 	struct rte_flow_item_eth unicast_mask = {
1282 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1283 	};
1284 	const unsigned int vlan_filter_n = priv->vlan_filter_n;
1285 	const struct rte_ether_addr cmp = {
1286 		.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1287 	};
1288 	unsigned int i;
1289 	unsigned int j;
1290 	int ret;
1291 
1292 	/*
1293 	 * The hairpin Txq default flow should be created no matter whether
1294 	 * isolation mode is enabled. Otherwise, all the packets to be sent
1295 	 * will go out directly without the Tx flow actions, e.g. encapsulation.
1296 	 */
1297 	for (i = 0; i != priv->txqs_n; ++i) {
1298 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
1299 		if (!txq_ctrl)
1300 			continue;
1301 		/* Only Tx implicit mode requires the default Tx flow. */
1302 		if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN &&
1303 		    txq_ctrl->hairpin_conf.tx_explicit == 0 &&
1304 		    txq_ctrl->hairpin_conf.peers[0].port ==
1305 		    priv->dev_data->port_id) {
1306 			ret = mlx5_ctrl_flow_source_queue(dev, i);
1307 			if (ret) {
1308 				mlx5_txq_release(dev, i);
1309 				goto error;
1310 			}
1311 		}
1312 		if ((priv->representor || priv->master) &&
1313 		    priv->config.dv_esw_en) {
1314 			if (mlx5_flow_create_devx_sq_miss_flow(dev, i) == 0) {
1315 				DRV_LOG(ERR,
1316 					"Port %u Tx queue %u SQ create representor devx default miss rule failed.",
1317 					dev->data->port_id, i);
1318 				goto error;
1319 			}
1320 		}
1321 		mlx5_txq_release(dev, i);
1322 	}
1323 	if ((priv->master || priv->representor) && priv->config.dv_esw_en) {
1324 		if (mlx5_flow_create_esw_table_zero_flow(dev))
1325 			priv->fdb_def_rule = 1;
1326 		else
1327 			DRV_LOG(INFO, "port %u FDB default rule cannot be"
1328 				" configured - only Eswitch group 0 flows are"
1329 				" supported.", dev->data->port_id);
1330 	}
1331 	if (!priv->config.lacp_by_user && priv->pf_bond >= 0) {
1332 		ret = mlx5_flow_lacp_miss(dev);
1333 		if (ret)
1334 			DRV_LOG(INFO, "port %u LACP rule cannot be created - "
1335 				"forward LACP to kernel.", dev->data->port_id);
1336 		else
1337 			DRV_LOG(INFO, "LACP traffic will be missed in port %u."
1338 				, dev->data->port_id);
1339 	}
1340 	if (priv->isolated)
1341 		return 0;
1342 	if (dev->data->promiscuous) {
1343 		struct rte_flow_item_eth promisc = {
1344 			.dst.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1345 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1346 			.type = 0,
1347 		};
1348 
1349 		ret = mlx5_ctrl_flow(dev, &promisc, &promisc);
1350 		if (ret)
1351 			goto error;
1352 	}
1353 	if (dev->data->all_multicast) {
1354 		struct rte_flow_item_eth multicast = {
1355 			.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
1356 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1357 			.type = 0,
1358 		};
1359 
1360 		ret = mlx5_ctrl_flow(dev, &multicast, &multicast);
1361 		if (ret)
1362 			goto error;
1363 	} else {
1364 		/* Add broadcast/multicast flows. */
1365 		for (i = 0; i != vlan_filter_n; ++i) {
1366 			uint16_t vlan = priv->vlan_filter[i];
1367 
1368 			struct rte_flow_item_vlan vlan_spec = {
1369 				.tci = rte_cpu_to_be_16(vlan),
1370 			};
1371 			struct rte_flow_item_vlan vlan_mask =
1372 				rte_flow_item_vlan_mask;
1373 
1374 			ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast,
1375 						  &vlan_spec, &vlan_mask);
1376 			if (ret)
1377 				goto error;
1378 			ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec,
1379 						  &ipv6_multi_mask,
1380 						  &vlan_spec, &vlan_mask);
1381 			if (ret)
1382 				goto error;
1383 		}
1384 		if (!vlan_filter_n) {
1385 			ret = mlx5_ctrl_flow(dev, &bcast, &bcast);
1386 			if (ret)
1387 				goto error;
1388 			ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec,
1389 					     &ipv6_multi_mask);
1390 			if (ret) {
1391 				/* Do not fail on IPv6 multicast creation failure. */
1392 				DRV_LOG(WARNING,
1393 					"IPv6 multicast is not supported");
1394 				ret = 0;
1395 			}
1396 		}
1397 	}
1398 	/* Add MAC address flows. */
1399 	for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
1400 		struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
1401 
1402 		if (!memcmp(mac, &cmp, sizeof(*mac)))
1403 			continue;
1404 		memcpy(&unicast.dst.addr_bytes,
1405 		       mac->addr_bytes,
1406 		       RTE_ETHER_ADDR_LEN);
1407 		for (j = 0; j != vlan_filter_n; ++j) {
1408 			uint16_t vlan = priv->vlan_filter[j];
1409 
1410 			struct rte_flow_item_vlan vlan_spec = {
1411 				.tci = rte_cpu_to_be_16(vlan),
1412 			};
1413 			struct rte_flow_item_vlan vlan_mask =
1414 				rte_flow_item_vlan_mask;
1415 
1416 			ret = mlx5_ctrl_flow_vlan(dev, &unicast,
1417 						  &unicast_mask,
1418 						  &vlan_spec,
1419 						  &vlan_mask);
1420 			if (ret)
1421 				goto error;
1422 		}
1423 		if (!vlan_filter_n) {
1424 			ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask);
1425 			if (ret)
1426 				goto error;
1427 		}
1428 	}
1429 	return 0;
1430 error:
1431 	ret = rte_errno; /* Save rte_errno before cleanup. */
1432 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1433 	rte_errno = ret; /* Restore rte_errno. */
1434 	return -rte_errno;
1435 }
1436 
1437 
1438 /**
1439  * Disable traffic flows configured by control plane.
1440  *
1441  * @param dev
1442  *   Pointer to Ethernet device structure.
1443  */
1444 void
1445 mlx5_traffic_disable(struct rte_eth_dev *dev)
1446 {
1447 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1448 }
1449 
1450 /**
1451  * Restart traffic flows configured by control plane.
1452  *
1453  * @param dev
1454  *   Pointer to Ethernet device structure.
1455  *
1456  * @return
1457  *   0 on success, a negative errno value otherwise and rte_errno is set.
1458  */
1459 int
1460 mlx5_traffic_restart(struct rte_eth_dev *dev)
1461 {
1462 	if (dev->data->dev_started) {
1463 		mlx5_traffic_disable(dev);
1464 		return mlx5_traffic_enable(dev);
1465 	}
1466 	return 0;
1467 }
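
/*
 * Note (an assumption based on the rest of the PMD, not verified here):
 * mlx5_traffic_restart() is expected to be called whenever the set of control
 * flows must be recomputed, e.g. after MAC address, VLAN filter or
 * promiscuous/all-multicast mode changes.
 */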
1468