xref: /dpdk/drivers/net/mlx5/mlx5_trigger.c (revision aaf3b44c6618f60ec878237e3a3dcad0912bedf4)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <unistd.h>
7 
8 #include <rte_ether.h>
9 #include <ethdev_driver.h>
10 #include <rte_interrupts.h>
11 #include <rte_alarm.h>
12 #include <rte_cycles.h>
13 
14 #include <mlx5_malloc.h>
15 
16 #include "mlx5.h"
17 #include "mlx5_flow.h"
18 #include "mlx5_rx.h"
19 #include "mlx5_tx.h"
20 #include "mlx5_utils.h"
21 #include "rte_pmd_mlx5.h"
22 
23 /**
24  * Stop traffic on Tx queues.
25  *
26  * @param dev
27  *   Pointer to Ethernet device structure.
28  */
29 static void
30 mlx5_txq_stop(struct rte_eth_dev *dev)
31 {
32 	struct mlx5_priv *priv = dev->data->dev_private;
33 	unsigned int i;
34 
35 	for (i = 0; i != priv->txqs_n; ++i)
36 		mlx5_txq_release(dev, i);
37 }
38 
39 /**
40  * Start traffic on Tx queues.
41  *
42  * @param dev
43  *   Pointer to Ethernet device structure.
44  *
45  * @return
46  *   0 on success, a negative errno value otherwise and rte_errno is set.
47  */
48 static int
49 mlx5_txq_start(struct rte_eth_dev *dev)
50 {
51 	struct mlx5_priv *priv = dev->data->dev_private;
52 	unsigned int i;
53 	int ret;
54 
55 	for (i = 0; i != priv->txqs_n; ++i) {
56 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
57 		struct mlx5_txq_data *txq_data;
58 		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
59 
60 		if (!txq_ctrl)
61 			continue;
		txq_data = &txq_ctrl->txq;
62 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD)
63 			txq_alloc_elts(txq_ctrl);
64 		MLX5_ASSERT(!txq_ctrl->obj);
65 		txq_ctrl->obj = mlx5_malloc(flags, sizeof(struct mlx5_txq_obj),
66 					    0, txq_ctrl->socket);
67 		if (!txq_ctrl->obj) {
68 			DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
69 				"memory resources.", dev->data->port_id,
70 				txq_data->idx);
71 			rte_errno = ENOMEM;
72 			goto error;
73 		}
74 		ret = priv->obj_ops.txq_obj_new(dev, i);
75 		if (ret < 0) {
76 			mlx5_free(txq_ctrl->obj);
77 			txq_ctrl->obj = NULL;
78 			goto error;
79 		}
80 		if (txq_ctrl->type == MLX5_TXQ_TYPE_STANDARD) {
81 			size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
82 
83 			txq_data->fcqs = mlx5_malloc(flags, size,
84 						     RTE_CACHE_LINE_SIZE,
85 						     txq_ctrl->socket);
86 			if (!txq_data->fcqs) {
87 				DRV_LOG(ERR, "Port %u Tx queue %u cannot "
88 					"allocate memory (FCQ).",
89 					dev->data->port_id, i);
90 				rte_errno = ENOMEM;
91 				goto error;
92 			}
93 		}
94 		DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
95 			dev->data->port_id, i, (void *)&txq_ctrl->obj);
96 		LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
97 	}
98 	return 0;
99 error:
100 	ret = rte_errno; /* Save rte_errno before cleanup. */
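	/*
	 * Walk back from the failing index down to 0, dropping the references
	 * taken by mlx5_txq_get() in the loop above so that the queue objects
	 * created so far can be released.
	 */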
101 	do {
102 		mlx5_txq_release(dev, i);
103 	} while (i-- != 0);
104 	rte_errno = ret; /* Restore rte_errno. */
105 	return -rte_errno;
106 }
107 
108 /**
109  * Translate the chunk address to an MR key in order to put it into the cache.
110  */
111 static void
112 mlx5_rxq_mempool_register_cb(struct rte_mempool *mp, void *opaque,
113 			     struct rte_mempool_memhdr *memhdr,
114 			     unsigned int idx)
115 {
116 	struct mlx5_rxq_data *rxq = opaque;
117 
118 	RTE_SET_USED(mp);
119 	RTE_SET_USED(idx);
120 	mlx5_rx_addr2mr(rxq, (uintptr_t)memhdr->addr);
121 }
122 
123 /**
124  * Register Rx queue mempools and fill the Rx queue cache.
125  * This function tolerates repeated mempool registration.
126  *
127  * @param[in] rxq_ctrl
128  *   Rx queue control data.
129  *
130  * @return
131  *   0 on success, (-1) on failure and rte_errno is set.
132  */
133 static int
134 mlx5_rxq_mempool_register(struct mlx5_rxq_ctrl *rxq_ctrl)
135 {
136 	struct mlx5_priv *priv = rxq_ctrl->priv;
137 	struct rte_mempool *mp;
138 	uint32_t s;
139 	int ret = 0;
140 
141 	mlx5_mr_flush_local_cache(&rxq_ctrl->rxq.mr_ctrl);
142 	/* MPRQ mempool is registered on creation, just fill the cache. */
143 	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) {
144 		rte_mempool_mem_iter(rxq_ctrl->rxq.mprq_mp,
145 				     mlx5_rxq_mempool_register_cb,
146 				     &rxq_ctrl->rxq);
147 		return 0;
148 	}
149 	for (s = 0; s < rxq_ctrl->rxq.rxseg_n; s++) {
150 		mp = rxq_ctrl->rxq.rxseg[s].mp;
151 		ret = mlx5_mr_mempool_register(&priv->sh->cdev->mr_scache,
152 					       priv->sh->cdev->pd, mp,
153 					       &priv->mp_id);
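		/*
		 * EEXIST means the mempool is already registered, e.g. it is
		 * shared with another queue; this is not an error here, only
		 * the local MR cache still has to be filled below.
		 */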
154 		if (ret < 0 && rte_errno != EEXIST)
155 			return ret;
156 		rte_mempool_mem_iter(mp, mlx5_rxq_mempool_register_cb,
157 				     &rxq_ctrl->rxq);
158 	}
159 	return 0;
160 }
161 
162 /**
163  * Stop traffic on Rx queues.
164  *
165  * @param dev
166  *   Pointer to Ethernet device structure.
167  */
168 static void
169 mlx5_rxq_stop(struct rte_eth_dev *dev)
170 {
171 	struct mlx5_priv *priv = dev->data->dev_private;
172 	unsigned int i;
173 
174 	for (i = 0; i != priv->rxqs_n; ++i)
175 		mlx5_rxq_release(dev, i);
176 }
177 
178 /**
179  * Start traffic on Rx queues.
180  *
181  * @param dev
182  *   Pointer to Ethernet device structure.
183  *
184  * @return
185  *   0 on success, a negative errno value otherwise and rte_errno is set.
186  */
187 static int
188 mlx5_rxq_start(struct rte_eth_dev *dev)
189 {
190 	struct mlx5_priv *priv = dev->data->dev_private;
191 	unsigned int i;
192 	int ret = 0;
193 
194 	/* Allocate/reuse/resize mempool for Multi-Packet RQ. */
195 	if (mlx5_mprq_alloc_mp(dev)) {
196 		/* Do not release Rx queues, just return immediately. */
197 		return -rte_errno;
198 	}
199 	DRV_LOG(DEBUG, "Port %u device_attr.max_qp_wr is %d.",
200 		dev->data->port_id, priv->sh->device_attr.max_qp_wr);
201 	DRV_LOG(DEBUG, "Port %u device_attr.max_sge is %d.",
202 		dev->data->port_id, priv->sh->device_attr.max_sge);
203 	for (i = 0; i != priv->rxqs_n; ++i) {
204 		struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i);
205 
206 		if (!rxq_ctrl)
207 			continue;
208 		if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) {
209 			/*
210 			 * Pre-register the mempools. Regardless of whether
211 			 * the implicit registration is enabled or not,
212 			 * Rx mempool destruction is tracked to free MRs.
213 			 */
214 			if (mlx5_rxq_mempool_register(rxq_ctrl) < 0)
215 				goto error;
216 			ret = rxq_alloc_elts(rxq_ctrl);
217 			if (ret)
218 				goto error;
219 		}
220 		MLX5_ASSERT(!rxq_ctrl->obj);
221 		rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
222 					    sizeof(*rxq_ctrl->obj), 0,
223 					    rxq_ctrl->socket);
224 		if (!rxq_ctrl->obj) {
225 			DRV_LOG(ERR,
226 				"Port %u Rx queue %u can't allocate resources.",
227 				dev->data->port_id, (*priv->rxqs)[i]->idx);
228 			rte_errno = ENOMEM;
229 			goto error;
230 		}
231 		ret = priv->obj_ops.rxq_obj_new(dev, i);
232 		if (ret) {
233 			mlx5_free(rxq_ctrl->obj);
234 			rxq_ctrl->obj = NULL;
235 			goto error;
236 		}
237 		DRV_LOG(DEBUG, "Port %u rxq %u updated with %p.",
238 			dev->data->port_id, i, (void *)&rxq_ctrl->obj);
239 		LIST_INSERT_HEAD(&priv->rxqsobj, rxq_ctrl->obj, next);
240 	}
241 	return 0;
242 error:
243 	ret = rte_errno; /* Save rte_errno before cleanup. */
244 	do {
245 		mlx5_rxq_release(dev, i);
246 	} while (i-- != 0);
247 	rte_errno = ret; /* Restore rte_errno. */
248 	return -rte_errno;
249 }
250 
251 /**
252  * Automatically bind Tx queues to their peer Rx queues for hairpin.
253  *
254  * Only hairpin queues whose peer port is the device itself are bound here.
255  *
256  * @param dev
257  *   Pointer to Ethernet device structure.
258  *
259  * @return
260  *   0 on success, a negative errno value otherwise and rte_errno is set.
261  */
262 static int
263 mlx5_hairpin_auto_bind(struct rte_eth_dev *dev)
264 {
265 	struct mlx5_priv *priv = dev->data->dev_private;
266 	struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
267 	struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
268 	struct mlx5_txq_ctrl *txq_ctrl;
269 	struct mlx5_rxq_ctrl *rxq_ctrl;
270 	struct mlx5_devx_obj *sq;
271 	struct mlx5_devx_obj *rq;
272 	unsigned int i;
273 	int ret = 0;
274 	bool need_auto = false;
275 	uint16_t self_port = dev->data->port_id;
276 
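	/*
	 * First pass: only detect whether any hairpin Tx queue whose peer is
	 * this same port still needs automatic binding; a queue configured
	 * for manual binding makes the whole function a no-op.
	 */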
277 	for (i = 0; i != priv->txqs_n; ++i) {
278 		txq_ctrl = mlx5_txq_get(dev, i);
279 		if (!txq_ctrl)
280 			continue;
281 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
282 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
283 			mlx5_txq_release(dev, i);
284 			continue;
285 		}
286 		if (txq_ctrl->hairpin_conf.manual_bind) {
287 			mlx5_txq_release(dev, i);
288 			return 0;
289 		}
290 		need_auto = true;
291 		mlx5_txq_release(dev, i);
292 	}
293 	if (!need_auto)
294 		return 0;
295 	for (i = 0; i != priv->txqs_n; ++i) {
296 		txq_ctrl = mlx5_txq_get(dev, i);
297 		if (!txq_ctrl)
298 			continue;
299 		/* Skip hairpin queues with other peer ports. */
300 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN ||
301 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
302 			mlx5_txq_release(dev, i);
303 			continue;
304 		}
305 		if (!txq_ctrl->obj) {
306 			rte_errno = ENOMEM;
307 			DRV_LOG(ERR, "port %u no txq object found: %d",
308 				dev->data->port_id, i);
309 			mlx5_txq_release(dev, i);
310 			return -rte_errno;
311 		}
312 		sq = txq_ctrl->obj->sq;
313 		rxq_ctrl = mlx5_rxq_get(dev,
314 					txq_ctrl->hairpin_conf.peers[0].queue);
315 		if (!rxq_ctrl) {
316 			mlx5_txq_release(dev, i);
317 			rte_errno = EINVAL;
318 			DRV_LOG(ERR, "port %u no rxq object found: %d",
319 				dev->data->port_id,
320 				txq_ctrl->hairpin_conf.peers[0].queue);
321 			return -rte_errno;
322 		}
323 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN ||
324 		    rxq_ctrl->hairpin_conf.peers[0].queue != i) {
325 			rte_errno = ENOMEM;
326 			DRV_LOG(ERR, "port %u Tx queue %d can't be bound to "
327 				"Rx queue %d", dev->data->port_id,
328 				i, txq_ctrl->hairpin_conf.peers[0].queue);
329 			goto error;
330 		}
331 		rq = rxq_ctrl->obj->rq;
332 		if (!rq) {
333 			rte_errno = ENOMEM;
334 			DRV_LOG(ERR, "port %u hairpin no matching rxq: %d",
335 				dev->data->port_id,
336 				txq_ctrl->hairpin_conf.peers[0].queue);
337 			goto error;
338 		}
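		/*
		 * Cross-wire the pair: the SQ is pointed at the peer RQ and
		 * the RQ at the peer SQ, then both are moved from RST to RDY.
		 */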
339 		sq_attr.state = MLX5_SQC_STATE_RDY;
340 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
341 		sq_attr.hairpin_peer_rq = rq->id;
342 		sq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
343 		ret = mlx5_devx_cmd_modify_sq(sq, &sq_attr);
344 		if (ret)
345 			goto error;
346 		rq_attr.state = MLX5_SQC_STATE_RDY;
347 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
348 		rq_attr.hairpin_peer_sq = sq->id;
349 		rq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
350 		ret = mlx5_devx_cmd_modify_rq(rq, &rq_attr);
351 		if (ret)
352 			goto error;
353 		/* Queues with auto-bind will be destroyed directly. */
354 		rxq_ctrl->hairpin_status = 1;
355 		txq_ctrl->hairpin_status = 1;
356 		mlx5_txq_release(dev, i);
357 		mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue);
358 	}
359 	return 0;
360 error:
361 	mlx5_txq_release(dev, i);
362 	mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue);
363 	return -rte_errno;
364 }
365 
366 /*
367  * Fetch the peer queue's SW & HW information.
368  *
369  * @param dev
370  *   Pointer to Ethernet device structure.
371  * @param peer_queue
372  *   Index of the queue whose information is fetched.
373  * @param current_info
374  *   Pointer to the input peer information, not used currently.
375  * @param peer_info
376  *   Pointer to the structure to store the information, output.
377  * @param direction
378  *   Positive to get the RxQ information, zero to get the TxQ information.
379  *
380  * @return
381  *   0 on success, a negative errno value otherwise and rte_errno is set.
382  */
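/*
 * Called through rte_eth_hairpin_queue_peer_update() when a peer port
 * collects this port's queue information during hairpin binding
 * (cf. the call in mlx5_hairpin_bind_single_port() below).
 */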
383 int
384 mlx5_hairpin_queue_peer_update(struct rte_eth_dev *dev, uint16_t peer_queue,
385 			       struct rte_hairpin_peer_info *current_info,
386 			       struct rte_hairpin_peer_info *peer_info,
387 			       uint32_t direction)
388 {
389 	struct mlx5_priv *priv = dev->data->dev_private;
390 	RTE_SET_USED(current_info);
391 
392 	if (dev->data->dev_started == 0) {
393 		rte_errno = EBUSY;
394 		DRV_LOG(ERR, "peer port %u is not started",
395 			dev->data->port_id);
396 		return -rte_errno;
397 	}
398 	/*
399 	 * Peer port used as egress. In the current design, hairpin Tx queue
400 	 * will be bound to the peer Rx queue. Indeed, only the information of
401 	 * peer Rx queue needs to be fetched.
402 	 */
403 	if (direction == 0) {
404 		struct mlx5_txq_ctrl *txq_ctrl;
405 
406 		txq_ctrl = mlx5_txq_get(dev, peer_queue);
407 		if (txq_ctrl == NULL) {
408 			rte_errno = EINVAL;
409 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
410 				dev->data->port_id, peer_queue);
411 			return -rte_errno;
412 		}
413 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
414 			rte_errno = EINVAL;
415 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Txq",
416 				dev->data->port_id, peer_queue);
417 			mlx5_txq_release(dev, peer_queue);
418 			return -rte_errno;
419 		}
420 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
421 			rte_errno = ENOMEM;
422 			DRV_LOG(ERR, "port %u no Txq object found: %d",
423 				dev->data->port_id, peer_queue);
424 			mlx5_txq_release(dev, peer_queue);
425 			return -rte_errno;
426 		}
427 		peer_info->qp_id = txq_ctrl->obj->sq->id;
428 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
429 		/* 1-to-1 mapping, only the first one is used. */
430 		peer_info->peer_q = txq_ctrl->hairpin_conf.peers[0].queue;
431 		peer_info->tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
432 		peer_info->manual_bind = txq_ctrl->hairpin_conf.manual_bind;
433 		mlx5_txq_release(dev, peer_queue);
434 	} else { /* Peer port used as ingress. */
435 		struct mlx5_rxq_ctrl *rxq_ctrl;
436 
437 		rxq_ctrl = mlx5_rxq_get(dev, peer_queue);
438 		if (rxq_ctrl == NULL) {
439 			rte_errno = EINVAL;
440 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
441 				dev->data->port_id, peer_queue);
442 			return -rte_errno;
443 		}
444 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
445 			rte_errno = EINVAL;
446 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Rxq",
447 				dev->data->port_id, peer_queue);
448 			mlx5_rxq_release(dev, peer_queue);
449 			return -rte_errno;
450 		}
451 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
452 			rte_errno = ENOMEM;
453 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
454 				dev->data->port_id, peer_queue);
455 			mlx5_rxq_release(dev, peer_queue);
456 			return -rte_errno;
457 		}
458 		peer_info->qp_id = rxq_ctrl->obj->rq->id;
459 		peer_info->vhca_id = priv->config.hca_attr.vhca_id;
460 		peer_info->peer_q = rxq_ctrl->hairpin_conf.peers[0].queue;
461 		peer_info->tx_explicit = rxq_ctrl->hairpin_conf.tx_explicit;
462 		peer_info->manual_bind = rxq_ctrl->hairpin_conf.manual_bind;
463 		mlx5_rxq_release(dev, peer_queue);
464 	}
465 	return 0;
466 }
467 
468 /*
469  * Bind the hairpin queue with the peer HW information.
470  * This needs to be called twice, for both the Tx and Rx queues of a pair.
471  * If the queue is already bound, it is considered successful.
472  *
473  * @param dev
474  *   Pointer to Ethernet device structure.
475  * @param cur_queue
476  *   Index of the queue to change the HW configuration to bind.
477  * @param peer_info
478  *   Pointer to information of the peer queue.
479  * @param direction
480  *   Positive to configure the TxQ, zero to configure the RxQ.
481  *
482  * @return
483  *   0 on success, a negative errno value otherwise and rte_errno is set.
484  */
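/*
 * For one Tx->Rx pair this is invoked once per side: with direction != 0 and
 * the peer RQ information to configure the local SQ, and with direction == 0
 * and the peer SQ information to configure the local RQ.
 */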
485 int
486 mlx5_hairpin_queue_peer_bind(struct rte_eth_dev *dev, uint16_t cur_queue,
487 			     struct rte_hairpin_peer_info *peer_info,
488 			     uint32_t direction)
489 {
490 	int ret = 0;
491 
492 	/*
493 	 * Consistency checking of the peer queue: the opposite direction was
494 	 * used to fetch the peer info via the ethdev port ID, so it is not checked here.
495 	 */
496 	if (peer_info->peer_q != cur_queue) {
497 		rte_errno = EINVAL;
498 		DRV_LOG(ERR, "port %u queue %d and peer queue %d mismatch",
499 			dev->data->port_id, cur_queue, peer_info->peer_q);
500 		return -rte_errno;
501 	}
502 	if (direction != 0) {
503 		struct mlx5_txq_ctrl *txq_ctrl;
504 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
505 
506 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
507 		if (txq_ctrl == NULL) {
508 			rte_errno = EINVAL;
509 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
510 				dev->data->port_id, cur_queue);
511 			return -rte_errno;
512 		}
513 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
514 			rte_errno = EINVAL;
515 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
516 				dev->data->port_id, cur_queue);
517 			mlx5_txq_release(dev, cur_queue);
518 			return -rte_errno;
519 		}
520 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
521 			rte_errno = ENOMEM;
522 			DRV_LOG(ERR, "port %u no Txq object found: %d",
523 				dev->data->port_id, cur_queue);
524 			mlx5_txq_release(dev, cur_queue);
525 			return -rte_errno;
526 		}
527 		if (txq_ctrl->hairpin_status != 0) {
528 			DRV_LOG(DEBUG, "port %u Tx queue %d is already bound",
529 				dev->data->port_id, cur_queue);
530 			mlx5_txq_release(dev, cur_queue);
531 			return 0;
532 		}
533 		/*
534 		 * Consistency checking across all queues of one port is done in
535 		 * the bind() function, and that is optional.
536 		 */
537 		if (peer_info->tx_explicit !=
538 		    txq_ctrl->hairpin_conf.tx_explicit) {
539 			rte_errno = EINVAL;
540 			DRV_LOG(ERR, "port %u Tx queue %d and peer Tx rule mode"
541 				" mismatch", dev->data->port_id, cur_queue);
542 			mlx5_txq_release(dev, cur_queue);
543 			return -rte_errno;
544 		}
545 		if (peer_info->manual_bind !=
546 		    txq_ctrl->hairpin_conf.manual_bind) {
547 			rte_errno = EINVAL;
548 			DRV_LOG(ERR, "port %u Tx queue %d and peer binding mode"
549 				" mismatch", dev->data->port_id, cur_queue);
550 			mlx5_txq_release(dev, cur_queue);
551 			return -rte_errno;
552 		}
553 		sq_attr.state = MLX5_SQC_STATE_RDY;
554 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
555 		sq_attr.hairpin_peer_rq = peer_info->qp_id;
556 		sq_attr.hairpin_peer_vhca = peer_info->vhca_id;
557 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
558 		if (ret == 0)
559 			txq_ctrl->hairpin_status = 1;
560 		mlx5_txq_release(dev, cur_queue);
561 	} else {
562 		struct mlx5_rxq_ctrl *rxq_ctrl;
563 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
564 
565 		rxq_ctrl = mlx5_rxq_get(dev, cur_queue);
566 		if (rxq_ctrl == NULL) {
567 			rte_errno = EINVAL;
568 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
569 				dev->data->port_id, cur_queue);
570 			return -rte_errno;
571 		}
572 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
573 			rte_errno = EINVAL;
574 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
575 				dev->data->port_id, cur_queue);
576 			mlx5_rxq_release(dev, cur_queue);
577 			return -rte_errno;
578 		}
579 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
580 			rte_errno = ENOMEM;
581 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
582 				dev->data->port_id, cur_queue);
583 			mlx5_rxq_release(dev, cur_queue);
584 			return -rte_errno;
585 		}
586 		if (rxq_ctrl->hairpin_status != 0) {
587 			DRV_LOG(DEBUG, "port %u Rx queue %d is already bound",
588 				dev->data->port_id, cur_queue);
589 			mlx5_rxq_release(dev, cur_queue);
590 			return 0;
591 		}
592 		if (peer_info->tx_explicit !=
593 		    rxq_ctrl->hairpin_conf.tx_explicit) {
594 			rte_errno = EINVAL;
595 			DRV_LOG(ERR, "port %u Rx queue %d and peer Tx rule mode"
596 				" mismatch", dev->data->port_id, cur_queue);
597 			mlx5_rxq_release(dev, cur_queue);
598 			return -rte_errno;
599 		}
600 		if (peer_info->manual_bind !=
601 		    rxq_ctrl->hairpin_conf.manual_bind) {
602 			rte_errno = EINVAL;
603 			DRV_LOG(ERR, "port %u Rx queue %d and peer binding mode"
604 				" mismatch", dev->data->port_id, cur_queue);
605 			mlx5_rxq_release(dev, cur_queue);
606 			return -rte_errno;
607 		}
608 		rq_attr.state = MLX5_SQC_STATE_RDY;
609 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
610 		rq_attr.hairpin_peer_sq = peer_info->qp_id;
611 		rq_attr.hairpin_peer_vhca = peer_info->vhca_id;
612 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
613 		if (ret == 0)
614 			rxq_ctrl->hairpin_status = 1;
615 		mlx5_rxq_release(dev, cur_queue);
616 	}
617 	return ret;
618 }
619 
620 /*
621  * Unbind the hairpin queue and reset its HW configuration.
622  * This needs to be called twice, for both the Tx and Rx queues of a pair.
623  * If the queue is already unbound, it is considered successful.
624  *
625  * @param dev
626  *   Pointer to Ethernet device structure.
627  * @param cur_queue
628  *   Index of the queue to change the HW configuration to unbind.
629  * @param direction
630  *   Positive to reset the TxQ, zero to reset the RxQ.
631  *
632  * @return
633  *   0 on success, a negative errno value otherwise and rte_errno is set.
634  */
635 int
636 mlx5_hairpin_queue_peer_unbind(struct rte_eth_dev *dev, uint16_t cur_queue,
637 			       uint32_t direction)
638 {
639 	int ret = 0;
640 
641 	if (direction != 0) {
642 		struct mlx5_txq_ctrl *txq_ctrl;
643 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
644 
645 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
646 		if (txq_ctrl == NULL) {
647 			rte_errno = EINVAL;
648 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
649 				dev->data->port_id, cur_queue);
650 			return -rte_errno;
651 		}
652 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
653 			rte_errno = EINVAL;
654 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
655 				dev->data->port_id, cur_queue);
656 			mlx5_txq_release(dev, cur_queue);
657 			return -rte_errno;
658 		}
659 		/* Already unbound, return success before obj checking. */
660 		if (txq_ctrl->hairpin_status == 0) {
661 			DRV_LOG(DEBUG, "port %u Tx queue %d is already unbound",
662 				dev->data->port_id, cur_queue);
663 			mlx5_txq_release(dev, cur_queue);
664 			return 0;
665 		}
666 		if (!txq_ctrl->obj || !txq_ctrl->obj->sq) {
667 			rte_errno = ENOMEM;
668 			DRV_LOG(ERR, "port %u no Txq object found: %d",
669 				dev->data->port_id, cur_queue);
670 			mlx5_txq_release(dev, cur_queue);
671 			return -rte_errno;
672 		}
673 		sq_attr.state = MLX5_SQC_STATE_RST;
674 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
675 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
676 		if (ret == 0)
677 			txq_ctrl->hairpin_status = 0;
678 		mlx5_txq_release(dev, cur_queue);
679 	} else {
680 		struct mlx5_rxq_ctrl *rxq_ctrl;
681 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
682 
683 		rxq_ctrl = mlx5_rxq_get(dev, cur_queue);
684 		if (rxq_ctrl == NULL) {
685 			rte_errno = EINVAL;
686 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
687 				dev->data->port_id, cur_queue);
688 			return -rte_errno;
689 		}
690 		if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
691 			rte_errno = EINVAL;
692 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
693 				dev->data->port_id, cur_queue);
694 			mlx5_rxq_release(dev, cur_queue);
695 			return -rte_errno;
696 		}
697 		if (rxq_ctrl->hairpin_status == 0) {
698 			DRV_LOG(DEBUG, "port %u Rx queue %d is already unbound",
699 				dev->data->port_id, cur_queue);
700 			mlx5_rxq_release(dev, cur_queue);
701 			return 0;
702 		}
703 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
704 			rte_errno = ENOMEM;
705 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
706 				dev->data->port_id, cur_queue);
707 			mlx5_rxq_release(dev, cur_queue);
708 			return -rte_errno;
709 		}
710 		rq_attr.state = MLX5_SQC_STATE_RST;
711 		rq_attr.rq_state = MLX5_SQC_STATE_RST;
712 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
713 		if (ret == 0)
714 			rxq_ctrl->hairpin_status = 0;
715 		mlx5_rxq_release(dev, cur_queue);
716 	}
717 	return ret;
718 }
719 
720 /*
721  * Bind the hairpin port pairs, from the Tx to the peer Rx.
722  * This function only supports binding the Tx to one Rx port.
723  *
724  * @param dev
725  *   Pointer to Ethernet device structure.
726  * @param rx_port
727  *   Port identifier of the Rx port.
728  *
729  * @return
730  *   0 on success, a negative errno value otherwise and rte_errno is set.
731  */
732 static int
733 mlx5_hairpin_bind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
734 {
735 	struct mlx5_priv *priv = dev->data->dev_private;
736 	int ret = 0;
737 	struct mlx5_txq_ctrl *txq_ctrl;
738 	uint32_t i;
739 	struct rte_hairpin_peer_info peer = {0xffffff};
740 	struct rte_hairpin_peer_info cur;
741 	const struct rte_eth_hairpin_conf *conf;
742 	uint16_t num_q = 0;
743 	uint16_t local_port = priv->dev_data->port_id;
744 	uint32_t manual;
745 	uint32_t explicit;
746 	uint16_t rx_queue;
747 
748 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
749 		rte_errno = ENODEV;
750 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
751 		return -rte_errno;
752 	}
753 	/*
754 	 * Before binding a TxQ to its peer RxQ, a first pass over the queues
755 	 * checks their configuration consistency. This is a little time
756 	 * consuming, but better than having to roll back afterwards.
757 	 */
758 	for (i = 0; i != priv->txqs_n; i++) {
759 		txq_ctrl = mlx5_txq_get(dev, i);
760 		if (txq_ctrl == NULL)
761 			continue;
762 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
763 			mlx5_txq_release(dev, i);
764 			continue;
765 		}
766 		/*
767 		 * All hairpin Tx queues of a single port that are connected to the
768 		 * same peer Rx port should have the same "auto binding" and
769 		 * "implicit Tx flow" modes.
770 		 * Peer consistency checking will be done in per queue binding.
771 		 */
772 		conf = &txq_ctrl->hairpin_conf;
773 		if (conf->peers[0].port == rx_port) {
774 			if (num_q == 0) {
775 				manual = conf->manual_bind;
776 				explicit = conf->tx_explicit;
777 			} else {
778 				if (manual != conf->manual_bind ||
779 				    explicit != conf->tx_explicit) {
780 					rte_errno = EINVAL;
781 					DRV_LOG(ERR, "port %u queue %d mode"
782 						" mismatch: %u %u, %u %u",
783 						local_port, i, manual,
784 						conf->manual_bind, explicit,
785 						conf->tx_explicit);
786 					mlx5_txq_release(dev, i);
787 					return -rte_errno;
788 				}
789 			}
790 			num_q++;
791 		}
792 		mlx5_txq_release(dev, i);
793 	}
794 	/* If no queue is configured, return success directly. */
795 	if (num_q == 0)
796 		return ret;
797 	/* All the hairpin TX queues need to be traversed again. */
798 	for (i = 0; i != priv->txqs_n; i++) {
799 		txq_ctrl = mlx5_txq_get(dev, i);
800 		if (txq_ctrl == NULL)
801 			continue;
802 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
803 			mlx5_txq_release(dev, i);
804 			continue;
805 		}
806 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
807 			mlx5_txq_release(dev, i);
808 			continue;
809 		}
810 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
811 		/*
812 		 * Fetch peer RxQ's information.
813 		 * No need to pass the information of the current queue.
814 		 */
815 		ret = rte_eth_hairpin_queue_peer_update(rx_port, rx_queue,
816 							NULL, &peer, 1);
817 		if (ret != 0) {
818 			mlx5_txq_release(dev, i);
819 			goto error;
820 		}
821 		/* Accessing its own device, inside mlx5 PMD. */
822 		ret = mlx5_hairpin_queue_peer_bind(dev, i, &peer, 1);
823 		if (ret != 0) {
824 			mlx5_txq_release(dev, i);
825 			goto error;
826 		}
827 		/* Pass TxQ's information to peer RxQ and try binding. */
828 		cur.peer_q = rx_queue;
829 		cur.qp_id = txq_ctrl->obj->sq->id;
830 		cur.vhca_id = priv->config.hca_attr.vhca_id;
831 		cur.tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
832 		cur.manual_bind = txq_ctrl->hairpin_conf.manual_bind;
833 		/*
834 		 * In order to access another device in a proper way, an RTE
835 		 * level private function is needed.
836 		 */
837 		ret = rte_eth_hairpin_queue_peer_bind(rx_port, rx_queue,
838 						      &cur, 0);
839 		if (ret != 0) {
840 			mlx5_txq_release(dev, i);
841 			goto error;
842 		}
843 		mlx5_txq_release(dev, i);
844 	}
845 	return 0;
846 error:
847 	/*
848 	 * Roll back the queues that were already bound.
849 	 * No need to check the return value of the queue unbind function.
850 	 */
851 	do {
852 		/* No validation is needed here. */
853 		txq_ctrl = mlx5_txq_get(dev, i);
854 		if (txq_ctrl == NULL)
855 			continue;
856 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
857 		rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
858 		mlx5_hairpin_queue_peer_unbind(dev, i, 1);
859 		mlx5_txq_release(dev, i);
860 	} while (i--);
861 	return ret;
862 }
863 
864 /*
865  * Unbind the hairpin port pair. The HW configuration of both devices will be
866  * cleared and the status will be reset for all the queues used between them.
867  * This function only supports unbinding the Tx from one Rx port.
868  *
869  * @param dev
870  *   Pointer to Ethernet device structure.
871  * @param rx_port
872  *   Port identifier of the Rx port.
873  *
874  * @return
875  *   0 on success, a negative errno value otherwise and rte_errno is set.
876  */
877 static int
878 mlx5_hairpin_unbind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
879 {
880 	struct mlx5_priv *priv = dev->data->dev_private;
881 	struct mlx5_txq_ctrl *txq_ctrl;
882 	uint32_t i;
883 	int ret;
884 	uint16_t cur_port = priv->dev_data->port_id;
885 
886 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
887 		rte_errno = ENODEV;
888 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
889 		return -rte_errno;
890 	}
891 	for (i = 0; i != priv->txqs_n; i++) {
892 		uint16_t rx_queue;
893 
894 		txq_ctrl = mlx5_txq_get(dev, i);
895 		if (txq_ctrl == NULL)
896 			continue;
897 		if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
898 			mlx5_txq_release(dev, i);
899 			continue;
900 		}
901 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
902 			mlx5_txq_release(dev, i);
903 			continue;
904 		}
905 		/* Indeed, only the first used queue needs to be checked. */
906 		if (txq_ctrl->hairpin_conf.manual_bind == 0) {
907 			if (cur_port != rx_port) {
908 				rte_errno = EINVAL;
909 				DRV_LOG(ERR, "port %u and port %u are in"
910 					" auto-bind mode", cur_port, rx_port);
911 				mlx5_txq_release(dev, i);
912 				return -rte_errno;
913 			}
914 			mlx5_txq_release(dev, i);
915 			return 0;
916 		}
917 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
918 		mlx5_txq_release(dev, i);
919 		ret = rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
920 		if (ret) {
921 			DRV_LOG(ERR, "port %u Rx queue %d unbind - failure",
922 				rx_port, rx_queue);
923 			return ret;
924 		}
925 		ret = mlx5_hairpin_queue_peer_unbind(dev, i, 1);
926 		if (ret) {
927 			DRV_LOG(ERR, "port %u Tx queue %d unbind - failure",
928 				cur_port, i);
929 			return ret;
930 		}
931 	}
932 	return 0;
933 }
934 
935 /*
936  * Bind hairpin ports; the Rx side may be all ports when RTE_MAX_ETHPORTS is used.
937  * @see mlx5_hairpin_bind_single_port()
938  */
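/*
 * A minimal caller-side sketch (hypothetical port number; the public
 * rte_eth_hairpin_bind() wrapper is what eventually lands here):
 *
 *   int ret = rte_eth_hairpin_bind(0, RTE_MAX_ETHPORTS);
 *
 *   if (ret < 0)
 *       printf("hairpin bind failed: %s\n", rte_strerror(-ret));
 */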
939 int
940 mlx5_hairpin_bind(struct rte_eth_dev *dev, uint16_t rx_port)
941 {
942 	int ret = 0;
943 	uint16_t p, pp;
944 
945 	/*
946 	 * If the Rx port has no hairpin configuration with the current port,
947 	 * the binding will be skipped in the single-port bind function.
948 	 * The device started status is only checked before the queue
949 	 * information is updated.
950 	 */
951 	if (rx_port == RTE_MAX_ETHPORTS) {
952 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
953 			ret = mlx5_hairpin_bind_single_port(dev, p);
954 			if (ret != 0)
955 				goto unbind;
956 		}
957 		return ret;
958 	} else {
959 		return mlx5_hairpin_bind_single_port(dev, rx_port);
960 	}
961 unbind:
962 	MLX5_ETH_FOREACH_DEV(pp, dev->device)
963 		if (pp < p)
964 			mlx5_hairpin_unbind_single_port(dev, pp);
965 	return ret;
966 }
967 
968 /*
969  * Unbind hairpin ports; the Rx side may be all ports when RTE_MAX_ETHPORTS is used.
970  * @see mlx5_hairpin_unbind_single_port()
971  */
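/*
 * Caller-side sketch (hypothetical port number): unbind from all peer ports
 * through the public rte_eth_hairpin_unbind() wrapper before stopping:
 *
 *   (void)rte_eth_hairpin_unbind(0, RTE_MAX_ETHPORTS);
 */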
972 int
973 mlx5_hairpin_unbind(struct rte_eth_dev *dev, uint16_t rx_port)
974 {
975 	int ret = 0;
976 	uint16_t p;
977 
978 	if (rx_port == RTE_MAX_ETHPORTS)
979 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
980 			ret = mlx5_hairpin_unbind_single_port(dev, p);
981 			if (ret != 0)
982 				return ret;
983 		}
984 	else
985 		ret = mlx5_hairpin_unbind_single_port(dev, rx_port);
986 	return ret;
987 }
988 
989 /*
990  * DPDK callback to get the hairpin peer ports list.
991  * This will return the actual number of peer ports and save the identifiers
992  * into the array (sorted, and possibly different from the order used when
993  * setting up the hairpin peer queues).
994  * The peer port ID could be the same as the port ID of the current device.
995  *
996  * @param dev
997  *   Pointer to Ethernet device structure.
998  * @param peer_ports
999  *   Pointer to array to save the port identifiers.
1000  * @param len
1001  *   The length of the array.
1002  * @param direction
1003  *   Current port to peer port direction.
1004  *   positive - current used as Tx to get all peer Rx ports.
1005  *   zero - current used as Rx to get all peer Tx ports.
1006  *
1007  * @return
1008  *   0 or positive value on success, actual number of peer ports.
1009  *   a negative errno value otherwise and rte_errno is set.
1010  */
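/*
 * A minimal caller-side sketch (hypothetical port number, Tx direction),
 * going through the public rte_eth_hairpin_get_peer_ports() wrapper:
 *
 *   uint16_t peers[RTE_MAX_ETHPORTS];
 *   int n = rte_eth_hairpin_get_peer_ports(0, peers, RTE_DIM(peers), 1);
 *
 * On success n is the number of peer Rx ports; on failure it is a negative
 * errno value.
 */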
1011 int
1012 mlx5_hairpin_get_peer_ports(struct rte_eth_dev *dev, uint16_t *peer_ports,
1013 			    size_t len, uint32_t direction)
1014 {
1015 	struct mlx5_priv *priv = dev->data->dev_private;
1016 	struct mlx5_txq_ctrl *txq_ctrl;
1017 	struct mlx5_rxq_ctrl *rxq_ctrl;
1018 	uint32_t i;
1019 	uint16_t pp;
1020 	uint32_t bits[(RTE_MAX_ETHPORTS + 31) / 32] = {0};
1021 	int ret = 0;
1022 
1023 	if (direction) {
1024 		for (i = 0; i < priv->txqs_n; i++) {
1025 			txq_ctrl = mlx5_txq_get(dev, i);
1026 			if (!txq_ctrl)
1027 				continue;
1028 			if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
1029 				mlx5_txq_release(dev, i);
1030 				continue;
1031 			}
1032 			pp = txq_ctrl->hairpin_conf.peers[0].port;
1033 			if (pp >= RTE_MAX_ETHPORTS) {
1034 				rte_errno = ERANGE;
1035 				mlx5_txq_release(dev, i);
1036 				DRV_LOG(ERR, "port %hu queue %u peer port "
1037 					"out of range %hu",
1038 					priv->dev_data->port_id, i, pp);
1039 				return -rte_errno;
1040 			}
1041 			bits[pp / 32] |= 1 << (pp % 32);
1042 			mlx5_txq_release(dev, i);
1043 		}
1044 	} else {
1045 		for (i = 0; i < priv->rxqs_n; i++) {
1046 			rxq_ctrl = mlx5_rxq_get(dev, i);
1047 			if (!rxq_ctrl)
1048 				continue;
1049 			if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN) {
1050 				mlx5_rxq_release(dev, i);
1051 				continue;
1052 			}
1053 			pp = rxq_ctrl->hairpin_conf.peers[0].port;
1054 			if (pp >= RTE_MAX_ETHPORTS) {
1055 				rte_errno = ERANGE;
1056 				mlx5_rxq_release(dev, i);
1057 				DRV_LOG(ERR, "port %hu queue %u peer port "
1058 					"out of range %hu",
1059 					priv->dev_data->port_id, i, pp);
1060 				return -rte_errno;
1061 			}
1062 			bits[pp / 32] |= 1 << (pp % 32);
1063 			mlx5_rxq_release(dev, i);
1064 		}
1065 	}
1066 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1067 		if (bits[i / 32] & (1 << (i % 32))) {
1068 			if ((size_t)ret >= len) {
1069 				rte_errno = E2BIG;
1070 				return -rte_errno;
1071 			}
1072 			peer_ports[ret++] = i;
1073 		}
1074 	}
1075 	return ret;
1076 }
1077 
1078 /**
1079  * DPDK callback to start the device.
1080  *
1081  * Simulate device start by attaching all configured flows.
1082  *
1083  * @param dev
1084  *   Pointer to Ethernet device structure.
1085  *
1086  * @return
1087  *   0 on success, a negative errno value otherwise and rte_errno is set.
1088  */
1089 int
1090 mlx5_dev_start(struct rte_eth_dev *dev)
1091 {
1092 	struct mlx5_priv *priv = dev->data->dev_private;
1093 	int ret;
1094 	int fine_inline;
1095 
1096 	DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id);
1097 	fine_inline = rte_mbuf_dynflag_lookup
1098 		(RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, NULL);
1099 	if (fine_inline >= 0)
1100 		rte_net_mlx5_dynf_inline_mask = 1UL << fine_inline;
1101 	else
1102 		rte_net_mlx5_dynf_inline_mask = 0;
1103 	if (dev->data->nb_rx_queues > 0) {
1104 		ret = mlx5_dev_configure_rss_reta(dev);
1105 		if (ret) {
1106 			DRV_LOG(ERR, "port %u reta config failed: %s",
1107 				dev->data->port_id, strerror(rte_errno));
1108 			return -rte_errno;
1109 		}
1110 	}
1111 	ret = mlx5_txpp_start(dev);
1112 	if (ret) {
1113 		DRV_LOG(ERR, "port %u Tx packet pacing init failed: %s",
1114 			dev->data->port_id, strerror(rte_errno));
1115 		goto error;
1116 	}
1117 	if ((priv->sh->devx && priv->config.dv_flow_en &&
1118 	    priv->config.dest_tir) && priv->obj_ops.lb_dummy_queue_create) {
1119 		ret = priv->obj_ops.lb_dummy_queue_create(dev);
1120 		if (ret)
1121 			goto error;
1122 	}
1123 	ret = mlx5_txq_start(dev);
1124 	if (ret) {
1125 		DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
1126 			dev->data->port_id, strerror(rte_errno));
1127 		goto error;
1128 	}
1129 	ret = mlx5_rxq_start(dev);
1130 	if (ret) {
1131 		DRV_LOG(ERR, "port %u Rx queue allocation failed: %s",
1132 			dev->data->port_id, strerror(rte_errno));
1133 		goto error;
1134 	}
1135 	/*
1136 	 * This step will be skipped if there is no hairpin Tx queue configured
1137 	 * with an Rx peer queue from the same device.
1138 	 */
1139 	ret = mlx5_hairpin_auto_bind(dev);
1140 	if (ret) {
1141 		DRV_LOG(ERR, "port %u hairpin auto binding failed: %s",
1142 			dev->data->port_id, strerror(rte_errno));
1143 		goto error;
1144 	}
1145 	/* Set the started flag here for the following steps like control flows. */
1146 	dev->data->dev_started = 1;
1147 	ret = mlx5_rx_intr_vec_enable(dev);
1148 	if (ret) {
1149 		DRV_LOG(ERR, "port %u Rx interrupt vector creation failed",
1150 			dev->data->port_id);
1151 		goto error;
1152 	}
1153 	mlx5_os_stats_init(dev);
1154 	ret = mlx5_traffic_enable(dev);
1155 	if (ret) {
1156 		DRV_LOG(ERR, "port %u failed to set default flows",
1157 			dev->data->port_id);
1158 		goto error;
1159 	}
1160 	/* Set a mask and offset of dynamic metadata flows into Rx queues. */
1161 	mlx5_flow_rxq_dynf_metadata_set(dev);
1162 	/* Set flags and context to convert Rx timestamps. */
1163 	mlx5_rxq_timestamp_set(dev);
1164 	/* Set a mask and offset of scheduling on timestamp into Tx queues. */
1165 	mlx5_txq_dynf_timestamp_set(dev);
1166 	/* Attach indirection table objects detached on port stop. */
1167 	ret = mlx5_action_handle_attach(dev);
1168 	if (ret) {
1169 		DRV_LOG(ERR,
1170 			"port %u failed to attach indirect actions: %s",
1171 			dev->data->port_id, rte_strerror(rte_errno));
1172 		goto error;
1173 	}
1174 	/*
1175 	 * In non-cached mode, only the default mreg copy action needs to be
1176 	 * started, since no flow created by the application exists anymore.
1177 	 * But it is worth wrapping the interface for further usage.
1178 	 */
1179 	ret = mlx5_flow_start_default(dev);
1180 	if (ret) {
1181 		DRV_LOG(DEBUG, "port %u failed to start default actions: %s",
1182 			dev->data->port_id, strerror(rte_errno));
1183 		goto error;
1184 	}
1185 	if (mlx5_dev_ctx_shared_mempool_subscribe(dev) != 0) {
1186 		DRV_LOG(ERR, "port %u failed to subscribe for mempool life cycle: %s",
1187 			dev->data->port_id, rte_strerror(rte_errno));
1188 		goto error;
1189 	}
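	/*
	 * Make sure all the queue and flow state set up above is globally
	 * visible before the real burst functions are installed and the
	 * secondary processes are requested to start their datapath.
	 */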
1190 	rte_wmb();
1191 	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
1192 	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
1193 	/* Enable datapath on secondary process. */
1194 	mlx5_mp_os_req_start_rxtx(dev);
1195 	if (rte_intr_fd_get(priv->sh->intr_handle) >= 0) {
1196 		priv->sh->port[priv->dev_port - 1].ih_port_id =
1197 					(uint32_t)dev->data->port_id;
1198 	} else {
1199 		DRV_LOG(INFO, "port %u starts without LSC and RMV interrupts.",
1200 			dev->data->port_id);
1201 		dev->data->dev_conf.intr_conf.lsc = 0;
1202 		dev->data->dev_conf.intr_conf.rmv = 0;
1203 	}
1204 	if (rte_intr_fd_get(priv->sh->intr_handle_devx) >= 0)
1205 		priv->sh->port[priv->dev_port - 1].devx_ih_port_id =
1206 					(uint32_t)dev->data->port_id;
1207 	return 0;
1208 error:
1209 	ret = rte_errno; /* Save rte_errno before cleanup. */
1210 	/* Rollback. */
1211 	dev->data->dev_started = 0;
1212 	mlx5_flow_stop_default(dev);
1213 	mlx5_traffic_disable(dev);
1214 	mlx5_txq_stop(dev);
1215 	mlx5_rxq_stop(dev);
1216 	if (priv->obj_ops.lb_dummy_queue_release)
1217 		priv->obj_ops.lb_dummy_queue_release(dev);
1218 	mlx5_txpp_stop(dev); /* Stop last. */
1219 	rte_errno = ret; /* Restore rte_errno. */
1220 	return -rte_errno;
1221 }
1222 
1223 /**
1224  * DPDK callback to stop the device.
1225  *
1226  * Simulate device stop by detaching all configured flows.
1227  *
1228  * @param dev
1229  *   Pointer to Ethernet device structure.
1230  */
1231 int
1232 mlx5_dev_stop(struct rte_eth_dev *dev)
1233 {
1234 	struct mlx5_priv *priv = dev->data->dev_private;
1235 
1236 	dev->data->dev_started = 0;
1237 	/* Prevent crashes when queues are still in use. */
1238 	dev->rx_pkt_burst = removed_rx_burst;
1239 	dev->tx_pkt_burst = removed_tx_burst;
1240 	rte_wmb();
1241 	/* Disable datapath on secondary process. */
1242 	mlx5_mp_os_req_stop_rxtx(dev);
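	/*
	 * Give in-flight bursts on other lcores (roughly 1 ms per Rx queue)
	 * time to observe the dummy burst functions installed above before
	 * the queues start being torn down.
	 */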
1243 	rte_delay_us_sleep(1000 * priv->rxqs_n);
1244 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
1245 	mlx5_flow_stop_default(dev);
1246 	/* Control flows for default traffic can be removed first. */
1247 	mlx5_traffic_disable(dev);
1248 	/* All RX queue flags will be cleared in the flush interface. */
1249 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_GEN, true);
1250 	mlx5_flow_meter_rxq_flush(dev);
1251 	mlx5_action_handle_detach(dev);
1252 	mlx5_rx_intr_vec_disable(dev);
1253 	priv->sh->port[priv->dev_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1254 	priv->sh->port[priv->dev_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
1255 	mlx5_txq_stop(dev);
1256 	mlx5_rxq_stop(dev);
1257 	if (priv->obj_ops.lb_dummy_queue_release)
1258 		priv->obj_ops.lb_dummy_queue_release(dev);
1259 	mlx5_txpp_stop(dev);
1260 
1261 	return 0;
1262 }
1263 
1264 /**
1265  * Enable traffic flows configured by control plane
1266  *
1267  * @param dev
1268  *   Pointer to Ethernet device structure.
1271  *
1272  * @return
1273  *   0 on success, a negative errno value otherwise and rte_errno is set.
1274  */
1275 int
1276 mlx5_traffic_enable(struct rte_eth_dev *dev)
1277 {
1278 	struct mlx5_priv *priv = dev->data->dev_private;
1279 	struct rte_flow_item_eth bcast = {
1280 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1281 	};
1282 	struct rte_flow_item_eth ipv6_multi_spec = {
1283 		.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
1284 	};
1285 	struct rte_flow_item_eth ipv6_multi_mask = {
1286 		.dst.addr_bytes = "\xff\xff\x00\x00\x00\x00",
1287 	};
1288 	struct rte_flow_item_eth unicast = {
1289 		.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1290 	};
1291 	struct rte_flow_item_eth unicast_mask = {
1292 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
1293 	};
1294 	const unsigned int vlan_filter_n = priv->vlan_filter_n;
1295 	const struct rte_ether_addr cmp = {
1296 		.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1297 	};
1298 	unsigned int i;
1299 	unsigned int j;
1300 	int ret;
1301 
1302 	/*
1303 	 * The hairpin Tx queue default flow should be created regardless of
1304 	 * isolated mode. Otherwise, all the packets to be sent would go out
1305 	 * directly without the Tx flow actions, e.g. encapsulation.
1306 	 */
1307 	for (i = 0; i != priv->txqs_n; ++i) {
1308 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
1309 		if (!txq_ctrl)
1310 			continue;
1311 		/* Only Tx implicit mode requires the default Tx flow. */
1312 		if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN &&
1313 		    txq_ctrl->hairpin_conf.tx_explicit == 0 &&
1314 		    txq_ctrl->hairpin_conf.peers[0].port ==
1315 		    priv->dev_data->port_id) {
1316 			ret = mlx5_ctrl_flow_source_queue(dev, i);
1317 			if (ret) {
1318 				mlx5_txq_release(dev, i);
1319 				goto error;
1320 			}
1321 		}
1322 		if ((priv->representor || priv->master) &&
1323 		    priv->config.dv_esw_en) {
1324 			if (mlx5_flow_create_devx_sq_miss_flow(dev, i) == 0) {
1325 				DRV_LOG(ERR, "Port %u Tx queue %u SQ create representor devx default miss rule failed.",
1326 					dev->data->port_id, i);
1327 				mlx5_txq_release(dev, i);
1328 				goto error;
1329 			}
1330 		}
1331 		mlx5_txq_release(dev, i);
1332 	}
1333 	if ((priv->master || priv->representor) && priv->config.dv_esw_en) {
1334 		if (mlx5_flow_create_esw_table_zero_flow(dev))
1335 			priv->fdb_def_rule = 1;
1336 		else
1337 			DRV_LOG(INFO, "port %u FDB default rule cannot be"
1338 				" configured - only Eswitch group 0 flows are"
1339 				" supported.", dev->data->port_id);
1340 	}
1341 	if (!priv->config.lacp_by_user && priv->pf_bond >= 0) {
1342 		ret = mlx5_flow_lacp_miss(dev);
1343 		if (ret)
1344 			DRV_LOG(INFO, "port %u LACP rule cannot be created - "
1345 				"forward LACP to kernel.", dev->data->port_id);
1346 		else
1347 			DRV_LOG(INFO, "LACP traffic will be missed in port %u."
1348 				, dev->data->port_id);
1349 	}
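	/*
	 * In isolated mode the application manages all flows itself, so none
	 * of the promiscuous/multicast/MAC control flows below are installed.
	 */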
1350 	if (priv->isolated)
1351 		return 0;
1352 	if (dev->data->promiscuous) {
1353 		struct rte_flow_item_eth promisc = {
1354 			.dst.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1355 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1356 			.type = 0,
1357 		};
1358 
1359 		ret = mlx5_ctrl_flow(dev, &promisc, &promisc);
1360 		if (ret)
1361 			goto error;
1362 	}
1363 	if (dev->data->all_multicast) {
1364 		struct rte_flow_item_eth multicast = {
1365 			.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
1366 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
1367 			.type = 0,
1368 		};
1369 
1370 		ret = mlx5_ctrl_flow(dev, &multicast, &multicast);
1371 		if (ret)
1372 			goto error;
1373 	} else {
1374 		/* Add broadcast/multicast flows. */
1375 		for (i = 0; i != vlan_filter_n; ++i) {
1376 			uint16_t vlan = priv->vlan_filter[i];
1377 
1378 			struct rte_flow_item_vlan vlan_spec = {
1379 				.tci = rte_cpu_to_be_16(vlan),
1380 			};
1381 			struct rte_flow_item_vlan vlan_mask =
1382 				rte_flow_item_vlan_mask;
1383 
1384 			ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast,
1385 						  &vlan_spec, &vlan_mask);
1386 			if (ret)
1387 				goto error;
1388 			ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec,
1389 						  &ipv6_multi_mask,
1390 						  &vlan_spec, &vlan_mask);
1391 			if (ret)
1392 				goto error;
1393 		}
1394 		if (!vlan_filter_n) {
1395 			ret = mlx5_ctrl_flow(dev, &bcast, &bcast);
1396 			if (ret)
1397 				goto error;
1398 			ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec,
1399 					     &ipv6_multi_mask);
1400 			if (ret) {
1401 				/* Do not fail on IPv6 broadcast creation failure. */
1402 				DRV_LOG(WARNING,
1403 					"IPv6 broadcast is not supported");
1404 				ret = 0;
1405 			}
1406 		}
1407 	}
1408 	/* Add MAC address flows. */
1409 	for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
1410 		struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
1411 
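		/* Skip unassigned (all-zero) entries of the MAC address table. */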
1412 		if (!memcmp(mac, &cmp, sizeof(*mac)))
1413 			continue;
1414 		memcpy(&unicast.dst.addr_bytes,
1415 		       mac->addr_bytes,
1416 		       RTE_ETHER_ADDR_LEN);
1417 		for (j = 0; j != vlan_filter_n; ++j) {
1418 			uint16_t vlan = priv->vlan_filter[j];
1419 
1420 			struct rte_flow_item_vlan vlan_spec = {
1421 				.tci = rte_cpu_to_be_16(vlan),
1422 			};
1423 			struct rte_flow_item_vlan vlan_mask =
1424 				rte_flow_item_vlan_mask;
1425 
1426 			ret = mlx5_ctrl_flow_vlan(dev, &unicast,
1427 						  &unicast_mask,
1428 						  &vlan_spec,
1429 						  &vlan_mask);
1430 			if (ret)
1431 				goto error;
1432 		}
1433 		if (!vlan_filter_n) {
1434 			ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask);
1435 			if (ret)
1436 				goto error;
1437 		}
1438 	}
1439 	return 0;
1440 error:
1441 	ret = rte_errno; /* Save rte_errno before cleanup. */
1442 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1443 	rte_errno = ret; /* Restore rte_errno. */
1444 	return -rte_errno;
1445 }
1446 
1447 
1448 /**
1449  * Disable traffic flows configured by control plane
1450  *
1451  * @param dev
1452  *   Pointer to Ethernet device private data.
1453  */
1454 void
1455 mlx5_traffic_disable(struct rte_eth_dev *dev)
1456 {
1457 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1458 }
1459 
1460 /**
1461  * Restart traffic flows configured by control plane
1462  *
1463  * @param dev
1464  *   Pointer to Ethernet device private data.
1465  *
1466  * @return
1467  *   0 on success, a negative errno value otherwise and rte_errno is set.
1468  */
1469 int
1470 mlx5_traffic_restart(struct rte_eth_dev *dev)
1471 {
1472 	if (dev->data->dev_started) {
1473 		mlx5_traffic_disable(dev);
1474 		return mlx5_traffic_enable(dev);
1475 	}
1476 	return 0;
1477 }
1478