xref: /dpdk/drivers/net/mlx5/mlx5_trigger.c (revision 1dc6665d364b06ad44423f9dfac3818924950593)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <unistd.h>
7 
8 #include <rte_ether.h>
9 #include <ethdev_driver.h>
10 #include <rte_interrupts.h>
11 #include <rte_alarm.h>
12 #include <rte_cycles.h>
13 
14 #include <mlx5_malloc.h>
15 
16 #include "mlx5.h"
17 #include "mlx5_mr.h"
18 #include "mlx5_rx.h"
19 #include "mlx5_tx.h"
20 #include "mlx5_utils.h"
21 #include "rte_pmd_mlx5.h"
22 
23 /**
24  * Stop traffic on Tx queues.
25  *
26  * @param dev
27  *   Pointer to Ethernet device structure.
28  */
29 static void
30 mlx5_txq_stop(struct rte_eth_dev *dev)
31 {
32 	struct mlx5_priv *priv = dev->data->dev_private;
33 	unsigned int i;
34 
35 	for (i = 0; i != priv->txqs_n; ++i)
36 		mlx5_txq_release(dev, i);
37 }
38 
39 /**
40  * Start traffic on Tx queues.
41  *
42  * @param dev
43  *   Pointer to Ethernet device structure.
44  *
45  * @return
46  *   0 on success, a negative errno value otherwise and rte_errno is set.
47  */
48 static int
49 mlx5_txq_start(struct rte_eth_dev *dev)
50 {
51 	struct mlx5_priv *priv = dev->data->dev_private;
52 	unsigned int i;
53 	int ret;
54 
55 	for (i = 0; i != priv->txqs_n; ++i) {
56 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
57 		struct mlx5_txq_data *txq_data;
58 		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
59 		if (!txq_ctrl)
60 			continue;
61 		txq_data = &txq_ctrl->txq;
62 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD)
63 			txq_alloc_elts(txq_ctrl);
64 		MLX5_ASSERT(!txq_ctrl->obj);
65 		txq_ctrl->obj = mlx5_malloc(flags, sizeof(struct mlx5_txq_obj),
66 					    0, txq_ctrl->socket);
67 		if (!txq_ctrl->obj) {
68 			DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
69 				"memory resources.", dev->data->port_id,
70 				txq_data->idx);
71 			rte_errno = ENOMEM;
72 			goto error;
73 		}
74 		ret = priv->obj_ops.txq_obj_new(dev, i);
75 		if (ret < 0) {
76 			mlx5_free(txq_ctrl->obj);
77 			txq_ctrl->obj = NULL;
78 			goto error;
79 		}
80 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD) {
81 			size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
82 
83 			txq_data->fcqs = mlx5_malloc(flags, size,
84 						     RTE_CACHE_LINE_SIZE,
85 						     txq_ctrl->socket);
86 			if (!txq_data->fcqs) {
87 				DRV_LOG(ERR, "Port %u Tx queue %u cannot "
88 					"allocate memory (FCQ).",
89 					dev->data->port_id, i);
90 				rte_errno = ENOMEM;
91 				goto error;
92 			}
93 		}
94 		DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
95 			dev->data->port_id, i, (void *)&txq_ctrl->obj);
96 		LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
97 	}
98 	return 0;
99 error:
100 	ret = rte_errno; /* Save rte_errno before cleanup. */
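	/*
	 * Release the Tx queue at the failing index together with all the
	 * queues set up before it.
	 */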
101 	do {
102 		mlx5_txq_release(dev, i);
103 	} while (i-- != 0);
104 	rte_errno = ret; /* Restore rte_errno. */
105 	return -rte_errno;
106 }
107 
108 /**
109  * Translate the chunk address to MR key in order to put it into the cache.
110  */
111 static void
112 mlx5_rxq_mempool_register_cb(struct rte_mempool *mp, void *opaque,
113 			     struct rte_mempool_memhdr *memhdr,
114 			     unsigned int idx)
115 {
116 	struct mlx5_rxq_data *rxq = opaque;
117 
118 	RTE_SET_USED(mp);
119 	RTE_SET_USED(idx);
120 	mlx5_rx_addr2mr(rxq, (uintptr_t)memhdr->addr);
121 }
122 
123 /**
124  * Register Rx queue mempools and fill the Rx queue cache.
125  * This function tolerates repeated mempool registration.
126  *
127  * @param[in] rxq_ctrl
128  *   Rx queue control data.
129  *
130  * @return
131  *   0 on success, (-1) on failure and rte_errno is set.
132  */
133 static int
134 mlx5_rxq_mempool_register(struct mlx5_rxq_ctrl *rxq_ctrl)
135 {
136 	struct mlx5_priv *priv = rxq_ctrl->priv;
137 	struct rte_mempool *mp;
138 	uint32_t s;
139 	int ret = 0;
140 
141 	mlx5_mr_flush_local_cache(&rxq_ctrl->rxq.mr_ctrl);
142 	/* MPRQ mempool is registered on creation, just fill the cache. */
143 	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) {
144 		rte_mempool_mem_iter(rxq_ctrl->rxq.mprq_mp,
145 				     mlx5_rxq_mempool_register_cb,
146 				     &rxq_ctrl->rxq);
147 		return 0;
148 	}
149 	for (s = 0; s < rxq_ctrl->rxq.rxseg_n; s++) {
150 		mp = rxq_ctrl->rxq.rxseg[s].mp;
151 		ret = mlx5_mr_mempool_register(&priv->sh->share_cache,
152 					       priv->sh->pd, mp, &priv->mp_id);
153 		if (ret < 0 && rte_errno != EEXIST)
154 			return ret;
155 		rte_mempool_mem_iter(mp, mlx5_rxq_mempool_register_cb,
156 				     &rxq_ctrl->rxq);
157 	}
158 	return 0;
159 }
160 
161 /**
162  * Stop traffic on Rx queues.
163  *
164  * @param dev
165  *   Pointer to Ethernet device structure.
166  */
167 static void
168 mlx5_rxq_stop(struct rte_eth_dev *dev)
169 {
170 	struct mlx5_priv *priv = dev->data->dev_private;
171 	unsigned int i;
172 
173 	for (i = 0; i != priv->rxqs_n; ++i)
174 		mlx5_rxq_release(dev, i);
175 }
176 
177 /**
178  * Start traffic on Rx queues.
179  *
180  * @param dev
181  *   Pointer to Ethernet device structure.
182  *
183  * @return
184  *   0 on success, a negative errno value otherwise and rte_errno is set.
185  */
186 static int
187 mlx5_rxq_start(struct rte_eth_dev *dev)
188 {
189 	struct mlx5_priv *priv = dev->data->dev_private;
190 	unsigned int i;
191 	int ret = 0;
192 
193 	/* Allocate/reuse/resize mempool for Multi-Packet RQ. */
194 	if (mlx5_mprq_alloc_mp(dev)) {
195 		/* Do not release Rx queues, just return immediately on failure. */
196 		return -rte_errno;
197 	}
198 	DRV_LOG(DEBUG, "Port %u device_attr.max_qp_wr is %d.",
199 		dev->data->port_id, priv->sh->device_attr.max_qp_wr);
200 	DRV_LOG(DEBUG, "Port %u device_attr.max_sge is %d.",
201 		dev->data->port_id, priv->sh->device_attr.max_sge);
202 	for (i = 0; i != priv->rxqs_n; ++i) {
203 		struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i);
204 
205 		if (!rxq_ctrl)
206 			continue;
207 		if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) {
208 			/*
209 			 * Pre-register the mempools. Regardless of whether
210 			 * the implicit registration is enabled or not,
211 			 * Rx mempool destruction is tracked to free MRs.
212 			 */
213 			if (mlx5_rxq_mempool_register(rxq_ctrl) < 0)
214 				goto error;
215 			ret = rxq_alloc_elts(rxq_ctrl);
216 			if (ret)
217 				goto error;
218 		}
219 		MLX5_ASSERT(!rxq_ctrl->obj);
220 		rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
221 					    sizeof(*rxq_ctrl->obj), 0,
222 					    rxq_ctrl->socket);
223 		if (!rxq_ctrl->obj) {
224 			DRV_LOG(ERR,
225 				"Port %u Rx queue %u can't allocate resources.",
226 				dev->data->port_id, (*priv->rxqs)[i]->idx);
227 			rte_errno = ENOMEM;
228 			goto error;
229 		}
230 		ret = priv->obj_ops.rxq_obj_new(dev, i);
231 		if (ret) {
232 			mlx5_free(rxq_ctrl->obj);
233 			goto error;
234 		}
235 		DRV_LOG(DEBUG, "Port %u rxq %u updated with %p.",
236 			dev->data->port_id, i, (void *)&rxq_ctrl->obj);
237 		LIST_INSERT_HEAD(&priv->rxqsobj, rxq_ctrl->obj, next);
238 	}
239 	return 0;
240 error:
241 	ret = rte_errno; /* Save rte_errno before cleanup. */
242 	do {
243 		mlx5_rxq_release(dev, i);
244 	} while (i-- != 0);
245 	rte_errno = ret; /* Restore rte_errno. */
246 	return -rte_errno;
247 }
248 
249 /**
250  * Automatically bind hairpin Tx queues to their peer Rx queues.
251  *
252  * Only the hairpin queues whose peer is the same port are handled here.
253  *
254  * @param dev
255  *   Pointer to Ethernet device structure.
256  *
257  * @return
258  *   0 on success, a negative errno value otherwise and rte_errno is set.
259  */
260 static int
261 mlx5_hairpin_auto_bind(struct rte_eth_dev *dev)
262 {
263 	struct mlx5_priv *priv = dev->data->dev_private;
264 	struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
265 	struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
266 	struct mlx5_txq_ctrl *txq_ctrl;
267 	struct mlx5_rxq_ctrl *rxq_ctrl;
268 	struct mlx5_devx_obj *sq;
269 	struct mlx5_devx_obj *rq;
270 	unsigned int i;
271 	int ret = 0;
272 	bool need_auto = false;
273 	uint16_t self_port = dev->data->port_id;
274 
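	/*
	 * First pass: check whether any hairpin Tx queue peered with this same
	 * port still requires automatic binding. A queue configured with
	 * manual binding means the port is bound explicitly by the application
	 * and nothing has to be done here.
	 */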
275 	for (i = 0; i != priv->txqs_n; ++i) {
276 		txq_ctrl = mlx5_txq_get(dev, i);
277 		if (!txq_ctrl)
278 			continue;
279 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
280 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
281 			mlx5_txq_release(dev, i);
282 			continue;
283 		}
284 		if (txq_ctrl->hairpin_conf.manual_bind) {
285 			mlx5_txq_release(dev, i);
286 			return 0;
287 		}
288 		need_auto = true;
289 		mlx5_txq_release(dev, i);
290 	}
291 	if (!need_auto)
292 		return 0;
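	/* Second pass: bind each self-peered hairpin Tx queue to its Rx peer. */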
293 	for (i = 0; i != priv->txqs_n; ++i) {
294 		txq_ctrl = mlx5_txq_get(dev, i);
295 		if (!txq_ctrl)
296 			continue;
297 		/* Skip hairpin queues with other peer ports. */
298 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
299 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
300 			mlx5_txq_release(dev, i);
301 			continue;
302 		}
303 		if (!txq_ctrl->obj) {
304 			rte_errno = ENOMEM;
305 			DRV_LOG(ERR, "port %u no txq object found: %d",
306 				dev->data->port_id, i);
307 			mlx5_txq_release(dev, i);
308 			return -rte_errno;
309 		}
310 		sq = txq_ctrl->obj->sq;
311 		rxq_ctrl = mlx5_rxq_get(dev,
312 					txq_ctrl->hairpin_conf.peers[0].queue);
313 		if (!rxq_ctrl) {
314 			mlx5_txq_release(dev, i);
315 			rte_errno = EINVAL;
316 			DRV_LOG(ERR, "port %u no rxq object found: %d",
317 				dev->data->port_id,
318 				txq_ctrl->hairpin_conf.peers[0].queue);
319 			return -rte_errno;
320 		}
321 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN ||
322 		    rxq_ctrl->hairpin_conf.peers[0].queue != i) {
323 			rte_errno = ENOMEM;
324 			DRV_LOG(ERR, "port %u Tx queue %d can't be bound to "
325 				"Rx queue %d", dev->data->port_id,
326 				i, txq_ctrl->hairpin_conf.peers[0].queue);
327 			goto error;
328 		}
329 		rq = rxq_ctrl->obj->rq;
330 		if (!rq) {
331 			rte_errno = ENOMEM;
332 			DRV_LOG(ERR, "port %u hairpin no matching rxq: %d",
333 				dev->data->port_id,
334 				txq_ctrl->hairpin_conf.peers[0].queue);
335 			goto error;
336 		}
337 		sq_attr.state = MLX5_SQC_STATE_RDY;
338 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
339 		sq_attr.hairpin_peer_rq = rq->id;
340 		sq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
341 		ret = mlx5_devx_cmd_modify_sq(sq, &sq_attr);
342 		if (ret)
343 			goto error;
344 		rq_attr.state = MLX5_SQC_STATE_RDY;
345 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
346 		rq_attr.hairpin_peer_sq = sq->id;
347 		rq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
348 		ret = mlx5_devx_cmd_modify_rq(rq, &rq_attr);
349 		if (ret)
350 			goto error;
351 		/* Qs with auto-bind will be destroyed directly. */
352 		rxq_ctrl->hairpin_status = 1;
353 		txq_ctrl->hairpin_status = 1;
354 		mlx5_txq_release(dev, i);
355 		mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue);
356 	}
357 	return 0;
358 error:
359 	mlx5_txq_release(dev, i);
360 	mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue);
361 	return -rte_errno;
362 }
363 
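/*
 * The three functions below implement the hairpin_queue_peer_update/bind/
 * unbind dev_ops callbacks. They are reached through the ethdev-internal
 * helpers rte_eth_hairpin_queue_peer_update()/_bind()/_unbind(), which is how
 * the PMD of one port configures the hairpin queues of its peer port.
 */
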
364 /*
365  * Fetch the peer queue's SW & HW information.
366  *
367  * @param dev
368  *   Pointer to Ethernet device structure.
369  * @param peer_queue
370  *   Index of the queue to fetch the information for.
371  * @param current_info
372  *   Pointer to the input peer information, not used currently.
373  * @param peer_info
374  *   Pointer to the structure to store the information, output.
375  * @param direction
376  *   Positive to get the RxQ information, zero to get the TxQ information.
377  *
378  * @return
379  *   0 on success, a negative errno value otherwise and rte_errno is set.
380  */
381 int
382 mlx5_hairpin_queue_peer_update(struct rte_eth_dev *dev, uint16_t peer_queue,
383 			       struct rte_hairpin_peer_info *current_info,
384 			       struct rte_hairpin_peer_info *peer_info,
385 			       uint32_t direction)
386 {
387 	struct mlx5_priv *priv = dev->data->dev_private;
388 	RTE_SET_USED(current_info);
389 
390 	if (dev->data->dev_started == 0) {
391 		rte_errno = EBUSY;
392 		DRV_LOG(ERR, "peer port %u is not started",
393 			dev->data->port_id);
394 		return -rte_errno;
395 	}
396 	/*
397 	 * Peer port used as egress. In the current design, hairpin Tx queue
398 	 * will be bound to the peer Rx queue. Indeed, only the information of
399 	 * peer Rx queue needs to be fetched.
400 	 */
401 	if (direction == 0) {
402 		struct mlx5_txq_ctrl *txq_ctrl;
403 
404 		txq_ctrl = mlx5_txq_get(dev, peer_queue);
405 		if (txq_ctrl == NULL) {
406 			rte_errno = EINVAL;
407 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
408 				dev->data->port_id, peer_queue);
409 			return -rte_errno;
410 		}
411 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
412 			rte_errno = EINVAL;
413 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Txq",
414 				dev->data->port_id, peer_queue);
415 			mlx5_txq_release(dev, peer_queue);
416 			return -rte_errno;
417 		}
418 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
419 			rte_errno = ENOMEM;
420 			DRV_LOG(ERR, "port %u no Txq object found: %d",
421 				dev->data->port_id, peer_queue);
422 			mlx5_txq_release(dev, peer_queue);
423 			return -rte_errno;
424 		}
425 		peer_info->qp_id = txq_ctrl->obj->sq->id;
426 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
427 		/* 1-to-1 mapping, only the first one is used. */
428 		peer_info->peer_q = txq_ctrl->hairpin_conf.peers[0].queue;
429 		peer_info->tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
430 		peer_info->manual_bind = txq_ctrl->hairpin_conf.manual_bind;
431 		mlx5_txq_release(dev, peer_queue);
432 	} else { /* Peer port used as ingress. */
433 		struct mlx5_rxq_ctrl *rxq_ctrl;
434 
435 		rxq_ctrl = mlx5_rxq_get(dev, peer_queue);
436 		if (rxq_ctrl == NULL) {
437 			rte_errno = EINVAL;
438 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
439 				dev->data->port_id, peer_queue);
440 			return -rte_errno;
441 		}
442 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
443 			rte_errno = EINVAL;
444 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Rxq",
445 				dev->data->port_id, peer_queue);
446 			mlx5_rxq_release(dev, peer_queue);
447 			return -rte_errno;
448 		}
449 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
450 			rte_errno = ENOMEM;
451 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
452 				dev->data->port_id, peer_queue);
453 			mlx5_rxq_release(dev, peer_queue);
454 			return -rte_errno;
455 		}
456 		peer_info->qp_id = rxq_ctrl->obj->rq->id;
457 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
458 		peer_info->peer_q = rxq_ctrl->hairpin_conf.peers[0].queue;
459 		peer_info->tx_explicit = rxq_ctrl->hairpin_conf.tx_explicit;
460 		peer_info->manual_bind = rxq_ctrl->hairpin_conf.manual_bind;
461 		mlx5_rxq_release(dev, peer_queue);
462 	}
463 	return 0;
464 }
465 
466 /*
467  * Bind the hairpin queue with the peer HW information.
468  * This needs to be called twice, for both the Tx and Rx queues of a pair.
469  * If the queue is already bound, it is considered successful.
470  *
471  * @param dev
472  *   Pointer to Ethernet device structure.
473  * @param cur_queue
474  *   Index of the queue to change the HW configuration to bind.
475  * @param peer_info
476  *   Pointer to information of the peer queue.
477  * @param direction
478  *   Positive to configure the TxQ, zero to configure the RxQ.
479  *
480  * @return
481  *   0 on success, a negative errno value otherwise and rte_errno is set.
482  */
483 int
484 mlx5_hairpin_queue_peer_bind(struct rte_eth_dev *dev, uint16_t cur_queue,
485 			     struct rte_hairpin_peer_info *peer_info,
486 			     uint32_t direction)
487 {
488 	int ret = 0;
489 
490 	/*
491 	 * Consistency checking of the peer queue: opposite direction is used
492 	 * to get the peer queue info with ethdev port ID, no need to check.
493 	 */
494 	if (peer_info->peer_q != cur_queue) {
495 		rte_errno = EINVAL;
496 		DRV_LOG(ERR, "port %u queue %d and peer queue %d mismatch",
497 			dev->data->port_id, cur_queue, peer_info->peer_q);
498 		return -rte_errno;
499 	}
500 	if (direction != 0) {
501 		struct mlx5_txq_ctrl *txq_ctrl;
502 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
503 
504 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
505 		if (txq_ctrl == NULL) {
506 			rte_errno = EINVAL;
507 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
508 				dev->data->port_id, cur_queue);
509 			return -rte_errno;
510 		}
511 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
512 			rte_errno = EINVAL;
513 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
514 				dev->data->port_id, cur_queue);
515 			mlx5_txq_release(dev, cur_queue);
516 			return -rte_errno;
517 		}
518 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
519 			rte_errno = ENOMEM;
520 			DRV_LOG(ERR, "port %u no Txq object found: %d",
521 				dev->data->port_id, cur_queue);
522 			mlx5_txq_release(dev, cur_queue);
523 			return -rte_errno;
524 		}
525 		if (txq_ctrl->hairpin_status != 0) {
526 			DRV_LOG(DEBUG, "port %u Tx queue %d is already bound",
527 				dev->data->port_id, cur_queue);
528 			mlx5_txq_release(dev, cur_queue);
529 			return 0;
530 		}
531 		/*
532 		 * Consistency checking across all queues of one port is done in
533 		 * the bind() function, and that check is optional.
534 		 */
535 		if (peer_info->tx_explicit !=
536 		    txq_ctrl->hairpin_conf.tx_explicit) {
537 			rte_errno = EINVAL;
538 			DRV_LOG(ERR, "port %u Tx queue %d and peer Tx rule mode"
539 				" mismatch", dev->data->port_id, cur_queue);
540 			mlx5_txq_release(dev, cur_queue);
541 			return -rte_errno;
542 		}
543 		if (peer_info->manual_bind !=
544 		    txq_ctrl->hairpin_conf.manual_bind) {
545 			rte_errno = EINVAL;
546 			DRV_LOG(ERR, "port %u Tx queue %d and peer binding mode"
547 				" mismatch", dev->data->port_id, cur_queue);
548 			mlx5_txq_release(dev, cur_queue);
549 			return -rte_errno;
550 		}
551 		sq_attr.state = MLX5_SQC_STATE_RDY;
552 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
553 		sq_attr.hairpin_peer_rq = peer_info->qp_id;
554 		sq_attr.hairpin_peer_vhca = peer_info->vhca_id;
555 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
556 		if (ret == 0)
557 			txq_ctrl->hairpin_status = 1;
558 		mlx5_txq_release(dev, cur_queue);
559 	} else {
560 		struct mlx5_rxq_ctrl *rxq_ctrl;
561 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
562 
563 		rxq_ctrl = mlx5_rxq_get(dev, cur_queue);
564 		if (rxq_ctrl == NULL) {
565 			rte_errno = EINVAL;
566 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
567 				dev->data->port_id, cur_queue);
568 			return -rte_errno;
569 		}
570 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
571 			rte_errno = EINVAL;
572 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
573 				dev->data->port_id, cur_queue);
574 			mlx5_rxq_release(dev, cur_queue);
575 			return -rte_errno;
576 		}
577 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
578 			rte_errno = ENOMEM;
579 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
580 				dev->data->port_id, cur_queue);
581 			mlx5_rxq_release(dev, cur_queue);
582 			return -rte_errno;
583 		}
584 		if (rxq_ctrl->hairpin_status != 0) {
585 			DRV_LOG(DEBUG, "port %u Rx queue %d is already bound",
586 				dev->data->port_id, cur_queue);
587 			mlx5_rxq_release(dev, cur_queue);
588 			return 0;
589 		}
590 		if (peer_info->tx_explicit !=
591 		    rxq_ctrl->hairpin_conf.tx_explicit) {
592 			rte_errno = EINVAL;
593 			DRV_LOG(ERR, "port %u Rx queue %d and peer Tx rule mode"
594 				" mismatch", dev->data->port_id, cur_queue);
595 			mlx5_rxq_release(dev, cur_queue);
596 			return -rte_errno;
597 		}
598 		if (peer_info->manual_bind !=
599 		    rxq_ctrl->hairpin_conf.manual_bind) {
600 			rte_errno = EINVAL;
601 			DRV_LOG(ERR, "port %u Rx queue %d and peer binding mode"
602 				" mismatch", dev->data->port_id, cur_queue);
603 			mlx5_rxq_release(dev, cur_queue);
604 			return -rte_errno;
605 		}
606 		rq_attr.state = MLX5_SQC_STATE_RDY;
607 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
608 		rq_attr.hairpin_peer_sq = peer_info->qp_id;
609 		rq_attr.hairpin_peer_vhca = peer_info->vhca_id;
610 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
611 		if (ret == 0)
612 			rxq_ctrl->hairpin_status = 1;
613 		mlx5_rxq_release(dev, cur_queue);
614 	}
615 	return ret;
616 }
617 
618 /*
619  * Unbind the hairpin queue and reset its HW configuration.
620  * This needs to be called twice, for both the Tx and Rx queues of a pair.
621  * If the queue is already unbound, it is considered successful.
622  *
623  * @param dev
624  *   Pointer to Ethernet device structure.
625  * @param cur_queue
626  *   Index of the queue to change the HW configuration to unbind.
627  * @param direction
628  *   Positive to reset the TxQ, zero to reset the RxQ.
629  *
630  * @return
631  *   0 on success, a negative errno value otherwise and rte_errno is set.
632  */
633 int
634 mlx5_hairpin_queue_peer_unbind(struct rte_eth_dev *dev, uint16_t cur_queue,
635 			       uint32_t direction)
636 {
637 	int ret = 0;
638 
639 	if (direction != 0) {
640 		struct mlx5_txq_ctrl *txq_ctrl;
641 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
642 
643 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
644 		if (txq_ctrl == NULL) {
645 			rte_errno = EINVAL;
646 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
647 				dev->data->port_id, cur_queue);
648 			return -rte_errno;
649 		}
650 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
651 			rte_errno = EINVAL;
652 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
653 				dev->data->port_id, cur_queue);
654 			mlx5_txq_release(dev, cur_queue);
655 			return -rte_errno;
656 		}
657 		/* Already unbound, return success before obj checking. */
658 		if (txq_ctrl->hairpin_status == 0) {
659 			DRV_LOG(DEBUG, "port %u Tx queue %d is already unbound",
660 				dev->data->port_id, cur_queue);
661 			mlx5_txq_release(dev, cur_queue);
662 			return 0;
663 		}
664 		if (!txq_ctrl->obj || !txq_ctrl->obj->sq) {
665 			rte_errno = ENOMEM;
666 			DRV_LOG(ERR, "port %u no Txq object found: %d",
667 				dev->data->port_id, cur_queue);
668 			mlx5_txq_release(dev, cur_queue);
669 			return -rte_errno;
670 		}
671 		sq_attr.state = MLX5_SQC_STATE_RST;
672 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
673 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
674 		if (ret == 0)
675 			txq_ctrl->hairpin_status = 0;
676 		mlx5_txq_release(dev, cur_queue);
677 	} else {
678 		struct mlx5_rxq_ctrl *rxq_ctrl;
679 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
680 
681 		rxq_ctrl = mlx5_rxq_get(dev, cur_queue);
682 		if (rxq_ctrl == NULL) {
683 			rte_errno = EINVAL;
684 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
685 				dev->data->port_id, cur_queue);
686 			return -rte_errno;
687 		}
688 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
689 			rte_errno = EINVAL;
690 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
691 				dev->data->port_id, cur_queue);
692 			mlx5_rxq_release(dev, cur_queue);
693 			return -rte_errno;
694 		}
695 		if (rxq_ctrl->hairpin_status == 0) {
696 			DRV_LOG(DEBUG, "port %u Rx queue %d is already unbound",
697 				dev->data->port_id, cur_queue);
698 			mlx5_rxq_release(dev, cur_queue);
699 			return 0;
700 		}
701 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
702 			rte_errno = ENOMEM;
703 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
704 				dev->data->port_id, cur_queue);
705 			mlx5_rxq_release(dev, cur_queue);
706 			return -rte_errno;
707 		}
708 		rq_attr.state = MLX5_SQC_STATE_RST;
709 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
710 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
711 		if (ret == 0)
712 			rxq_ctrl->hairpin_status = 0;
713 		mlx5_rxq_release(dev, cur_queue);
714 	}
715 	return ret;
716 }
717 
718 /*
719  * Bind the hairpin port pairs, from the Tx to the peer Rx.
720  * This function only supports binding the Tx port to one Rx port.
721  *
722  * @param dev
723  *   Pointer to Ethernet device structure.
724  * @param rx_port
725  *   Port identifier of the Rx port.
726  *
727  * @return
728  *   0 on success, a negative errno value otherwise and rte_errno is set.
729  */
730 static int
731 mlx5_hairpin_bind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
732 {
733 	struct mlx5_priv *priv = dev->data->dev_private;
734 	int ret = 0;
735 	struct mlx5_txq_ctrl *txq_ctrl;
736 	uint32_t i;
737 	struct rte_hairpin_peer_info peer = {0xffffff};
738 	struct rte_hairpin_peer_info cur;
739 	const struct rte_eth_hairpin_conf *conf;
740 	uint16_t num_q = 0;
741 	uint16_t local_port = priv->dev_data->port_id;
742 	uint32_t manual;
743 	uint32_t explicit;
744 	uint16_t rx_queue;
745 
746 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
747 		rte_errno = ENODEV;
748 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
749 		return -rte_errno;
750 	}
751 	/*
752 	 * Before binding TxQ to peer RxQ, a first pass over the queues checks
753 	 * their configuration consistency. This is a little time consuming but
754 	 * better than having to do a rollback afterwards.
755 	 */
756 	for (i = 0; i != priv->txqs_n; i++) {
757 		txq_ctrl = mlx5_txq_get(dev, i);
758 		if (txq_ctrl == NULL)
759 			continue;
760 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
761 			mlx5_txq_release(dev, i);
762 			continue;
763 		}
764 		/*
765 		 * All hairpin Tx queues of a single port that are connected to
766 		 * the same peer Rx port should have the same "auto binding" and
767 		 * "implicit Tx flow" modes.
768 		 * Peer consistency checking will be done in per queue binding.
769 		 */
770 		conf = &txq_ctrl->hairpin_conf;
771 		if (conf->peers[0].port == rx_port) {
772 			if (num_q == 0) {
773 				manual = conf->manual_bind;
774 				explicit = conf->tx_explicit;
775 			} else {
776 				if (manual != conf->manual_bind ||
777 				    explicit != conf->tx_explicit) {
778 					rte_errno = EINVAL;
779 					DRV_LOG(ERR, "port %u queue %d mode"
780 						" mismatch: %u %u, %u %u",
781 						local_port, i, manual,
782 						conf->manual_bind, explicit,
783 						conf->tx_explicit);
784 					mlx5_txq_release(dev, i);
785 					return -rte_errno;
786 				}
787 			}
788 			num_q++;
789 		}
790 		mlx5_txq_release(dev, i);
791 	}
792 	/* If no queue is configured, return success directly. */
793 	if (num_q == 0)
794 		return ret;
795 	/* All the hairpin TX queues need to be traversed again. */
796 	for (i = 0; i != priv->txqs_n; i++) {
797 		txq_ctrl = mlx5_txq_get(dev, i);
798 		if (txq_ctrl == NULL)
799 			continue;
800 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
801 			mlx5_txq_release(dev, i);
802 			continue;
803 		}
804 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
805 			mlx5_txq_release(dev, i);
806 			continue;
807 		}
808 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
809 		/*
810 		 * Fetch peer RxQ's information.
811 		 * No need to pass the information of the current queue.
812 		 */
813 		ret = rte_eth_hairpin_queue_peer_update(rx_port, rx_queue,
814 							NULL, &peer, 1);
815 		if (ret != 0) {
816 			mlx5_txq_release(dev, i);
817 			goto error;
818 		}
819 		/* Accessing its own device, inside mlx5 PMD. */
820 		ret = mlx5_hairpin_queue_peer_bind(dev, i, &peer, 1);
821 		if (ret != 0) {
822 			mlx5_txq_release(dev, i);
823 			goto error;
824 		}
825 		/* Pass TxQ's information to peer RxQ and try binding. */
826 		cur.peer_q = rx_queue;
827 		cur.qp_id = txq_ctrl->obj->sq->id;
828 		cur.vhca_id = priv->config.hca_attr.vhca_id;
829 		cur.tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
830 		cur.manual_bind = txq_ctrl->hairpin_conf.manual_bind;
831 		/*
832 		 * In order to access another device in a proper way, an RTE-level
833 		 * private function is needed.
834 		 */
835 		ret = rte_eth_hairpin_queue_peer_bind(rx_port, rx_queue,
836 						      &cur, 0);
837 		if (ret != 0) {
838 			mlx5_txq_release(dev, i);
839 			goto error;
840 		}
841 		mlx5_txq_release(dev, i);
842 	}
843 	return 0;
844 error:
845 	/*
846 	 * Do roll-back process for the queues already bound.
847 	 * No need to check the return value of the queue unbind function.
848 	 */
849 	do {
850 		/* No validation is needed here. */
851 		txq_ctrl = mlx5_txq_get(dev, i);
852 		if (txq_ctrl == NULL)
853 			continue;
854 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
855 		rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
856 		mlx5_hairpin_queue_peer_unbind(dev, i, 1);
857 		mlx5_txq_release(dev, i);
858 	} while (i--);
859 	return ret;
860 }
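
/*
 * For reference, a hairpin Tx queue that the function above can bind is
 * created by the application roughly as follows (illustrative values, not
 * taken from this file):
 *
 *	struct rte_eth_hairpin_conf conf = {
 *		.peer_count = 1,
 *		.manual_bind = 1,
 *		.tx_explicit = 1,
 *		.peers[0] = { .port = rx_port_id, .queue = rx_queue_id },
 *	};
 *	rte_eth_tx_hairpin_queue_setup(tx_port_id, tx_queue_id, nb_desc, &conf);
 *
 * The manual_bind and tx_explicit flags are exactly the fields checked for
 * consistency in the first loop above.
 */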
861 
862 /*
863  * Unbind the hairpin port pair. The HW configuration of both devices will be
864  * cleared and the status will be reset for all the queues used between them.
865  * This function only supports unbinding the Tx port from one Rx port.
866  *
867  * @param dev
868  *   Pointer to Ethernet device structure.
869  * @param rx_port
870  *   Port identifier of the Rx port.
871  *
872  * @return
873  *   0 on success, a negative errno value otherwise and rte_errno is set.
874  */
875 static int
876 mlx5_hairpin_unbind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
877 {
878 	struct mlx5_priv *priv = dev->data->dev_private;
879 	struct mlx5_txq_ctrl *txq_ctrl;
880 	uint32_t i;
881 	int ret;
882 	uint16_t cur_port = priv->dev_data->port_id;
883 
884 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
885 		rte_errno = ENODEV;
886 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
887 		return -rte_errno;
888 	}
889 	for (i = 0; i != priv->txqs_n; i++) {
890 		uint16_t rx_queue;
891 
892 		txq_ctrl = mlx5_txq_get(dev, i);
893 		if (txq_ctrl == NULL)
894 			continue;
895 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
896 			mlx5_txq_release(dev, i);
897 			continue;
898 		}
899 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
900 			mlx5_txq_release(dev, i);
901 			continue;
902 		}
903 		/* Indeed, only the first used queue needs to be checked. */
904 		if (txq_ctrl->hairpin_conf.manual_bind == 0) {
905 			if (cur_port != rx_port) {
906 				rte_errno = EINVAL;
907 				DRV_LOG(ERR, "port %u and port %u are in"
908 					" auto-bind mode", cur_port, rx_port);
909 				mlx5_txq_release(dev, i);
910 				return -rte_errno;
911 			}
912 			mlx5_txq_release(dev, i);
913 			return 0;
914 		}
915 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
916 		mlx5_txq_release(dev, i);
917 		ret = rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
918 		if (ret) {
919 			DRV_LOG(ERR, "port %u Rx queue %d unbind - failure",
920 				rx_port, rx_queue);
921 			return ret;
922 		}
923 		ret = mlx5_hairpin_queue_peer_unbind(dev, i, 1);
924 		if (ret) {
925 			DRV_LOG(ERR, "port %u Tx queue %d unbind - failure",
926 				cur_port, i);
927 			return ret;
928 		}
929 	}
930 	return 0;
931 }
932 
933 /*
934  * Bind hairpin ports, Rx could be all ports when using RTE_MAX_ETHPORTS.
935  * @see mlx5_hairpin_bind_single_port()
936  */
937 int
938 mlx5_hairpin_bind(struct rte_eth_dev *dev, uint16_t rx_port)
939 {
940 	int ret = 0;
941 	uint16_t p, pp;
942 
943 	/*
944 	 * If the Rx port has no hairpin configuration with the current port,
945 	 * the binding will be skipped in the single-port function called below.
946 	 * The device started status is checked only right before the queue
947 	 * information is updated.
948 	 */
949 	if (rx_port == RTE_MAX_ETHPORTS) {
950 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
951 			ret = mlx5_hairpin_bind_single_port(dev, p);
952 			if (ret != 0)
953 				goto unbind;
954 		}
955 		return ret;
956 	} else {
957 		return mlx5_hairpin_bind_single_port(dev, rx_port);
958 	}
959 unbind:
960 	MLX5_ETH_FOREACH_DEV(pp, dev->device)
961 		if (pp < p)
962 			mlx5_hairpin_unbind_single_port(dev, pp);
963 	return ret;
964 }
965 
966 /*
967  * Unbind hairpin ports, Rx could be all ports when using RTE_MAX_ETHPORTS.
968  * @see mlx5_hairpin_unbind_single_port()
969  */
970 int
971 mlx5_hairpin_unbind(struct rte_eth_dev *dev, uint16_t rx_port)
972 {
973 	int ret = 0;
974 	uint16_t p;
975 
976 	if (rx_port == RTE_MAX_ETHPORTS)
977 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
978 			ret = mlx5_hairpin_unbind_single_port(dev, p);
979 			if (ret != 0)
980 				return ret;
981 		}
982 	else
983 		ret = mlx5_hairpin_unbind_single_port(dev, rx_port);
984 	return ret;
985 }
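
/*
 * Illustrative application-level usage of the two callbacks above
 * (hypothetical port IDs):
 *
 *	ret = rte_eth_hairpin_bind(tx_port_id, rx_port_id);
 *	...
 *	ret = rte_eth_hairpin_unbind(tx_port_id, RTE_MAX_ETHPORTS);
 *
 * Passing RTE_MAX_ETHPORTS as the Rx port applies the operation to every mlx5
 * port sharing the device, as implemented above.
 */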
986 
987 /*
988  * DPDK callback to get the hairpin peer ports list.
989  * This will return the actual number of peer ports and save the identifiers
990  * into the array (sorted, and possibly differing from the order used when
991  * setting up the hairpin peer queues).
992  * The peer port ID could be the same as the port ID of the current device.
993  *
994  * @param dev
995  *   Pointer to Ethernet device structure.
996  * @param peer_ports
997  *   Pointer to array to save the port identifiers.
998  * @param len
999  *   The length of the array.
1000  * @param direction
1001  *   Current port to peer port direction.
1002  *   positive - current used as Tx to get all peer Rx ports.
1003  *   zero - current used as Rx to get all peer Tx ports.
1004  *
1005  * @return
1006  *   0 or positive value on success, actual number of peer ports.
1007  *   a negative errno value otherwise and rte_errno is set.
1008  */
1009 int
1010 mlx5_hairpin_get_peer_ports(struct rte_eth_dev *dev, uint16_t *peer_ports,
1011 			    size_t len, uint32_t direction)
1012 {
1013 	struct mlx5_priv *priv = dev->data->dev_private;
1014 	struct mlx5_txq_ctrl *txq_ctrl;
1015 	struct mlx5_rxq_ctrl *rxq_ctrl;
1016 	uint32_t i;
1017 	uint16_t pp;
1018 	uint32_t bits[(RTE_MAX_ETHPORTS + 31) / 32] = {0};
1019 	int ret = 0;
1020 
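	/*
	 * bits[] is a bitmap of peer port IDs: it removes duplicates and makes
	 * the resulting peer_ports[] array sorted by port ID.
	 */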
1021 	if (direction) {
1022 		for (i = 0; i < priv->txqs_n; i++) {
1023 			txq_ctrl = mlx5_txq_get(dev, i);
1024 			if (!txq_ctrl)
1025 				continue;
1026 			if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
1027 				mlx5_txq_release(dev, i);
1028 				continue;
1029 			}
1030 			pp = txq_ctrl->hairpin_conf.peers[0].port;
1031 			if (pp >= RTE_MAX_ETHPORTS) {
1032 				rte_errno = ERANGE;
1033 				mlx5_txq_release(dev, i);
1034 				DRV_LOG(ERR, "port %hu queue %u peer port "
1035 					"out of range %hu",
1036 					priv->dev_data->port_id, i, pp);
1037 				return -rte_errno;
1038 			}
1039 			bits[pp / 32] |= 1u << (pp % 32);
1040 			mlx5_txq_release(dev, i);
1041 		}
1042 	} else {
1043 		for (i = 0; i < priv->rxqs_n; i++) {
1044 			rxq_ctrl = mlx5_rxq_get(dev, i);
1045 			if (!rxq_ctrl)
1046 				continue;
1047 			if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
1048 				mlx5_rxq_release(dev, i);
1049 				continue;
1050 			}
1051 			pp = rxq_ctrl->hairpin_conf.peers[0].port;
1052 			if (pp >= RTE_MAX_ETHPORTS) {
1053 				rte_errno = ERANGE;
1054 				mlx5_rxq_release(dev, i);
1055 				DRV_LOG(ERR, "port %hu queue %u peer port "
1056 					"out of range %hu",
1057 					priv->dev_data->port_id, i, pp);
1058 				return -rte_errno;
1059 			}
1060 			bits[pp / 32] |= 1u << (pp % 32);
1061 			mlx5_rxq_release(dev, i);
1062 		}
1063 	}
1064 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1065 		if (bits[i / 32] & (1u << (i % 32))) {
1066 			if ((size_t)ret >= len) {
1067 				rte_errno = E2BIG;
1068 				return -rte_errno;
1069 			}
1070 			peer_ports[ret++] = i;
1071 		}
1072 	}
1073 	return ret;
1074 }
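
/*
 * Illustrative application-level counterpart (hypothetical Tx port): query all
 * peer Rx ports of a hairpin Tx port and bind to each of them.
 *
 *	uint16_t peers[RTE_MAX_ETHPORTS];
 *	int n = rte_eth_hairpin_get_peer_ports(tx_port_id, peers,
 *					       RTE_DIM(peers), 1);
 *	for (int k = 0; k < n; k++)
 *		rte_eth_hairpin_bind(tx_port_id, peers[k]);
 */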
1075 
1076 /**
1077  * DPDK callback to start the device.
1078  *
1079  * Simulate device start by attaching all configured flows.
1080  *
1081  * @param dev
1082  *   Pointer to Ethernet device structure.
1083  *
1084  * @return
1085  *   0 on success, a negative errno value otherwise and rte_errno is set.
1086  */
1087 int
1088 mlx5_dev_start(struct rte_eth_dev *dev)
1089 {
1090 	struct mlx5_priv *priv = dev->data->dev_private;
1091 	int ret;
1092 	int fine_inline;
1093 
1094 	DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id);
1095 	fine_inline = rte_mbuf_dynflag_lookup
1096 		(RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, NULL);
1097 	if (fine_inline >= 0)
1098 		rte_net_mlx5_dynf_inline_mask = 1UL << fine_inline;
1099 	else
1100 		rte_net_mlx5_dynf_inline_mask = 0;
1101 	if (dev->data->nb_rx_queues > 0) {
1102 		ret = mlx5_dev_configure_rss_reta(dev);
1103 		if (ret) {
1104 			DRV_LOG(ERR, "port %u reta config failed: %s",
1105 				dev->data->port_id, strerror(rte_errno));
1106 			return -rte_errno;
1107 		}
1108 	}
1109 	ret = mlx5_txpp_start(dev);
1110 	if (ret) {
1111 		DRV_LOG(ERR, "port %u Tx packet pacing init failed: %s",
1112 			dev->data->port_id, strerror(rte_errno));
1113 		goto error;
1114 	}
1115 	if ((priv->config.devx && priv->config.dv_flow_en &&
1116 	    priv->config.dest_tir) && priv->obj_ops.lb_dummy_queue_create) {
1117 		ret = priv->obj_ops.lb_dummy_queue_create(dev);
1118 		if (ret)
1119 			goto error;
1120 	}
1121 	ret = mlx5_txq_start(dev);
1122 	if (ret) {
1123 		DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
1124 			dev->data->port_id, strerror(rte_errno));
1125 		goto error;
1126 	}
1127 	ret = mlx5_rxq_start(dev);
1128 	if (ret) {
1129 		DRV_LOG(ERR, "port %u Rx queue allocation failed: %s",
1130 			dev->data->port_id, strerror(rte_errno));
1131 		goto error;
1132 	}
1133 	/*
1134 	 * This step will be skipped if there is no hairpin Tx queue configured
1135 	 * with an Rx peer queue from the same device.
1136 	 */
1137 	ret = mlx5_hairpin_auto_bind(dev);
1138 	if (ret) {
1139 		DRV_LOG(ERR, "port %u hairpin auto binding failed: %s",
1140 			dev->data->port_id, strerror(rte_errno));
1141 		goto error;
1142 	}
1143 	/* Set started flag here for the following steps like control flow. */
1144 	dev->data->dev_started = 1;
1145 	ret = mlx5_rx_intr_vec_enable(dev);
1146 	if (ret) {
1147 		DRV_LOG(ERR, "port %u Rx interrupt vector creation failed",
1148 			dev->data->port_id);
1149 		goto error;
1150 	}
1151 	mlx5_os_stats_init(dev);
1152 	ret = mlx5_traffic_enable(dev);
1153 	if (ret) {
1154 		DRV_LOG(ERR, "port %u failed to set default flows",
1155 			dev->data->port_id);
1156 		goto error;
1157 	}
1158 	/* Set a mask and offset of dynamic metadata flows into Rx queues. */
1159 	mlx5_flow_rxq_dynf_metadata_set(dev);
1160 	/* Set flags and context to convert Rx timestamps. */
1161 	mlx5_rxq_timestamp_set(dev);
1162 	/* Set a mask and offset of scheduling on timestamp into Tx queues. */
1163 	mlx5_txq_dynf_timestamp_set(dev);
1164 	/*
1165 	 * In non-cached mode, only the default mreg copy action needs to be
1166 	 * started, since no flow created by the application exists anymore.
1167 	 * But it is worth wrapping the interface for further usage.
1168 	 */
1169 	ret = mlx5_flow_start_default(dev);
1170 	if (ret) {
1171 		DRV_LOG(DEBUG, "port %u failed to start default actions: %s",
1172 			dev->data->port_id, strerror(rte_errno));
1173 		goto error;
1174 	}
1175 	if (mlx5_dev_ctx_shared_mempool_subscribe(dev) != 0) {
1176 		DRV_LOG(ERR, "port %u failed to subscribe for mempool life cycle: %s",
1177 			dev->data->port_id, rte_strerror(rte_errno));
1178 		goto error;
1179 	}
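	/*
	 * Make sure all the queue and flow state written above is globally
	 * visible before the datapath burst functions are published below.
	 */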
1180 	rte_wmb();
1181 	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
1182 	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
1183 	/* Enable datapath on secondary process. */
1184 	mlx5_mp_os_req_start_rxtx(dev);
1185 	if (priv->sh->intr_handle.fd >= 0) {
1186 		priv->sh->port[priv->dev_port - 1].ih_port_id =
1187 					(uint32_t)dev->data->port_id;
1188 	} else {
1189 		DRV_LOG(INFO, "port %u starts without LSC and RMV interrupts.",
1190 			dev->data->port_id);
1191 		dev->data->dev_conf.intr_conf.lsc = 0;
1192 		dev->data->dev_conf.intr_conf.rmv = 0;
1193 	}
1194 	if (priv->sh->intr_handle_devx.fd >= 0)
1195 		priv->sh->port[priv->dev_port - 1].devx_ih_port_id =
1196 					(uint32_t)dev->data->port_id;
1197 	return 0;
1198 error:
1199 	ret = rte_errno; /* Save rte_errno before cleanup. */
1200 	/* Rollback. */
1201 	dev->data->dev_started = 0;
1202 	mlx5_flow_stop_default(dev);
1203 	mlx5_traffic_disable(dev);
1204 	mlx5_txq_stop(dev);
1205 	mlx5_rxq_stop(dev);
1206 	if (priv->obj_ops.lb_dummy_queue_release)
1207 		priv->obj_ops.lb_dummy_queue_release(dev);
1208 	mlx5_txpp_stop(dev); /* Stop last. */
1209 	rte_errno = ret; /* Restore rte_errno. */
1210 	return -rte_errno;
1211 }
1212 
1213 /**
1214  * DPDK callback to stop the device.
1215  *
1216  * Simulate device stop by detaching all configured flows.
1217  *
1218  * @param dev
1219  *   Pointer to Ethernet device structure.
1220  */
1221 int
1222 mlx5_dev_stop(struct rte_eth_dev *dev)
1223 {
1224 	struct mlx5_priv *priv = dev->data->dev_private;
1225 
1226 	dev->data->dev_started = 0;
1227 	/* Prevent crashes when queues are still in use. */
1228 	dev->rx_pkt_burst = removed_rx_burst;
1229 	dev->tx_pkt_burst = removed_tx_burst;
1230 	rte_wmb();
1231 	/* Disable datapath on secondary process. */
1232 	mlx5_mp_os_req_stop_rxtx(dev);
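	/*
	 * Give roughly 1 ms per Rx queue for datapath bursts still running on
	 * other lcores to drain after the burst functions were replaced above.
	 */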
1233 	rte_delay_us_sleep(1000 * priv->rxqs_n);
1234 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
1235 	mlx5_flow_stop_default(dev);
1236 	/* Control flows for default traffic can be removed first. */
1237 	mlx5_traffic_disable(dev);
1238 	/* All RX queue flags will be cleared in the flush interface. */
1239 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_GEN, true);
1240 	mlx5_flow_meter_rxq_flush(dev);
1241 	mlx5_rx_intr_vec_disable(dev);
1242 	priv->sh->port[priv->dev_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1243 	priv->sh->port[priv->dev_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
1244 	mlx5_txq_stop(dev);
1245 	mlx5_rxq_stop(dev);
1246 	if (priv->obj_ops.lb_dummy_queue_release)
1247 		priv->obj_ops.lb_dummy_queue_release(dev);
1248 	mlx5_txpp_stop(dev);
1249 
1250 	return 0;
1251 }
1252 
1253 /**
1254  * Enable traffic flows configured by control plane
1255  *
1256  * @param dev
1257  *   Pointer to Ethernet device structure.
1260  *
1261  * @return
1262  *   0 on success, a negative errno value otherwise and rte_errno is set.
1263  */
1264 int
1265 mlx5_traffic_enable(struct rte_eth_dev *dev)
1266 {
1267 	struct mlx5_priv *priv = dev->data->dev_private;
1268 	struct rte_flow_item_eth bcast = {
1269 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1270 	};
1271 	struct rte_flow_item_eth ipv6_multi_spec = {
1272 		.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
1273 	};
1274 	struct rte_flow_item_eth ipv6_multi_mask = {
1275 		.dst.addr_bytes = "\xff\xff\x00\x00\x00\x00",
1276 	};
1277 	struct rte_flow_item_eth unicast = {
1278 		.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1279 	};
1280 	struct rte_flow_item_eth unicast_mask = {
1281 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1282 	};
1283 	const unsigned int vlan_filter_n = priv->vlan_filter_n;
1284 	const struct rte_ether_addr cmp = {
1285 		.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1286 	};
1287 	unsigned int i;
1288 	unsigned int j;
1289 	int ret;
1290 
1291 	/*
1292 	 * The default flow for a hairpin Tx queue should be created no matter
1293 	 * whether isolation mode is enabled. Otherwise, all packets to be sent
1294 	 * will go out directly without any Tx flow actions, e.g. encapsulation.
1295 	 */
1296 	for (i = 0; i != priv->txqs_n; ++i) {
1297 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
1298 		if (!txq_ctrl)
1299 			continue;
1300 		/* Only Tx implicit mode requires the default Tx flow. */
1301 		if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN &&
1302 		    txq_ctrl->hairpin_conf.tx_explicit == 0 &&
1303 		    txq_ctrl->hairpin_conf.peers[0].port ==
1304 		    priv->dev_data->port_id) {
1305 			ret = mlx5_ctrl_flow_source_queue(dev, i);
1306 			if (ret) {
1307 				mlx5_txq_release(dev, i);
1308 				goto error;
1309 			}
1310 		}
1311 		mlx5_txq_release(dev, i);
1312 	}
1313 	if (priv->config.dv_esw_en && !priv->config.vf && !priv->config.sf) {
1314 		if (mlx5_flow_create_esw_table_zero_flow(dev))
1315 			priv->fdb_def_rule = 1;
1316 		else
1317 			DRV_LOG(INFO, "port %u FDB default rule cannot be"
1318 				" configured - only Eswitch group 0 flows are"
1319 				" supported.", dev->data->port_id);
1320 	}
1321 	if (!priv->config.lacp_by_user && priv->pf_bond >= 0) {
1322 		ret = mlx5_flow_lacp_miss(dev);
1323 		if (ret)
1324 			DRV_LOG(INFO, "port %u LACP rule cannot be created - "
1325 				"forward LACP to kernel.", dev->data->port_id);
1326 		else
1327 			DRV_LOG(INFO, "LACP traffic will be missed in port %u."
1328 				, dev->data->port_id);
1329 	}
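	/*
	 * In isolated mode only the default Tx/FDB/LACP rules above are
	 * created; all Rx control flows below are left to the application.
	 */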
1330 	if (priv->isolated)
1331 		return 0;
1332 	if (dev->data->promiscuous) {
1333 		struct rte_flow_item_eth promisc = {
1334 			.dst.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1335 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1336 			.type = 0,
1337 		};
1338 
1339 		ret = mlx5_ctrl_flow(dev, &promisc, &promisc);
1340 		if (ret)
1341 			goto error;
1342 	}
1343 	if (dev->data->all_multicast) {
1344 		struct rte_flow_item_eth multicast = {
1345 			.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
1346 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1347 			.type = 0,
1348 		};
1349 
1350 		ret = mlx5_ctrl_flow(dev, &multicast, &multicast);
1351 		if (ret)
1352 			goto error;
1353 	} else {
1354 		/* Add broadcast/multicast flows. */
1355 		for (i = 0; i != vlan_filter_n; ++i) {
1356 			uint16_t vlan = priv->vlan_filter[i];
1357 
1358 			struct rte_flow_item_vlan vlan_spec = {
1359 				.tci = rte_cpu_to_be_16(vlan),
1360 			};
1361 			struct rte_flow_item_vlan vlan_mask =
1362 				rte_flow_item_vlan_mask;
1363 
1364 			ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast,
1365 						  &vlan_spec, &vlan_mask);
1366 			if (ret)
1367 				goto error;
1368 			ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec,
1369 						  &ipv6_multi_mask,
1370 						  &vlan_spec, &vlan_mask);
1371 			if (ret)
1372 				goto error;
1373 		}
1374 		if (!vlan_filter_n) {
1375 			ret = mlx5_ctrl_flow(dev, &bcast, &bcast);
1376 			if (ret)
1377 				goto error;
1378 			ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec,
1379 					     &ipv6_multi_mask);
1380 			if (ret) {
1381 				/* Do not fail on IPv6 multicast creation failure. */
1382 				DRV_LOG(WARNING,
1383 					"IPv6 multicast is not supported");
1384 				ret = 0;
1385 			}
1386 		}
1387 	}
1388 	/* Add MAC address flows. */
1389 	for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
1390 		struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
1391 
1392 		if (!memcmp(mac, &cmp, sizeof(*mac)))
1393 			continue;
1394 		memcpy(&unicast.dst.addr_bytes,
1395 		       mac->addr_bytes,
1396 		       RTE_ETHER_ADDR_LEN);
1397 		for (j = 0; j != vlan_filter_n; ++j) {
1398 			uint16_t vlan = priv->vlan_filter[j];
1399 
1400 			struct rte_flow_item_vlan vlan_spec = {
1401 				.tci = rte_cpu_to_be_16(vlan),
1402 			};
1403 			struct rte_flow_item_vlan vlan_mask =
1404 				rte_flow_item_vlan_mask;
1405 
1406 			ret = mlx5_ctrl_flow_vlan(dev, &unicast,
1407 						  &unicast_mask,
1408 						  &vlan_spec,
1409 						  &vlan_mask);
1410 			if (ret)
1411 				goto error;
1412 		}
1413 		if (!vlan_filter_n) {
1414 			ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask);
1415 			if (ret)
1416 				goto error;
1417 		}
1418 	}
1419 	return 0;
1420 error:
1421 	ret = rte_errno; /* Save rte_errno before cleanup. */
1422 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1423 	rte_errno = ret; /* Restore rte_errno. */
1424 	return -rte_errno;
1425 }
1426 
1427 
1428 /**
1429  * Disable traffic flows configured by control plane
1430  *
1431  * @param dev
1432  *   Pointer to Ethernet device structure.
1433  */
1434 void
1435 mlx5_traffic_disable(struct rte_eth_dev *dev)
1436 {
1437 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1438 }
1439 
1440 /**
1441  * Restart traffic flows configured by control plane
1442  *
1443  * @param dev
1444  *   Pointer to Ethernet device structure.
1445  *
1446  * @return
1447  *   0 on success, a negative errno value otherwise and rte_errno is set.
1448  */
1449 int
1450 mlx5_traffic_restart(struct rte_eth_dev *dev)
1451 {
1452 	if (dev->data->dev_started) {
1453 		mlx5_traffic_disable(dev);
1454 		return mlx5_traffic_enable(dev);
1455 	}
1456 	return 0;
1457 }
1458