xref: /dpdk/drivers/net/mlx5/mlx5_trigger.c (revision d46f3b525aafbb4c6c88d9c61b445eb0d93d2149)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <unistd.h>
7 
8 #include <rte_ether.h>
9 #include <ethdev_driver.h>
10 #include <rte_interrupts.h>
11 #include <rte_alarm.h>
12 #include <rte_cycles.h>
13 
14 #include <mlx5_malloc.h>
15 
16 #include "mlx5.h"
17 #include "mlx5_flow.h"
18 #include "mlx5_rx.h"
19 #include "mlx5_tx.h"
20 #include "mlx5_utils.h"
21 #include "rte_pmd_mlx5.h"
22 
23 static void mlx5_traffic_disable_legacy(struct rte_eth_dev *dev);
24 
25 /**
26  * Stop traffic on Tx queues.
27  *
28  * @param dev
29  *   Pointer to Ethernet device structure.
30  */
31 static void
32 mlx5_txq_stop(struct rte_eth_dev *dev)
33 {
34 	struct mlx5_priv *priv = dev->data->dev_private;
35 	unsigned int i;
36 
37 	for (i = 0; i != priv->txqs_n; ++i)
38 		mlx5_txq_release(dev, i);
39 }
40 
41 /**
42  * Start traffic on Tx queues.
43  *
44  * @param dev
45  *   Pointer to Ethernet device structure.
46  *
47  * @return
48  *   0 on success, a negative errno value otherwise and rte_errno is set.
49  */
50 static int
51 mlx5_txq_start(struct rte_eth_dev *dev)
52 {
53 	struct mlx5_priv *priv = dev->data->dev_private;
54 	unsigned int i;
55 	int ret;
56 
57 	for (i = 0; i != priv->txqs_n; ++i) {
58 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
59 		struct mlx5_txq_data *txq_data = txq_ctrl ? &txq_ctrl->txq : NULL;
60 		uint32_t flags = MLX5_MEM_RTE | MLX5_MEM_ZERO;
61 
62 		if (!txq_ctrl)
63 			continue;
64 		if (!txq_ctrl->is_hairpin)
65 			txq_alloc_elts(txq_ctrl);
66 		MLX5_ASSERT(!txq_ctrl->obj);
67 		txq_ctrl->obj = mlx5_malloc(flags, sizeof(struct mlx5_txq_obj),
68 					    0, txq_ctrl->socket);
69 		if (!txq_ctrl->obj) {
70 			DRV_LOG(ERR, "Port %u Tx queue %u cannot allocate "
71 				"memory resources.", dev->data->port_id,
72 				txq_data->idx);
73 			rte_errno = ENOMEM;
74 			goto error;
75 		}
76 		ret = priv->obj_ops.txq_obj_new(dev, i);
77 		if (ret < 0) {
78 			mlx5_free(txq_ctrl->obj);
79 			txq_ctrl->obj = NULL;
80 			goto error;
81 		}
82 		if (!txq_ctrl->is_hairpin) {
83 			size_t size = txq_data->cqe_s * sizeof(*txq_data->fcqs);
84 
85 			txq_data->fcqs = mlx5_malloc(flags, size,
86 						     RTE_CACHE_LINE_SIZE,
87 						     txq_ctrl->socket);
88 			if (!txq_data->fcqs) {
89 				DRV_LOG(ERR, "Port %u Tx queue %u cannot "
90 					"allocate memory (FCQ).",
91 					dev->data->port_id, i);
92 				rte_errno = ENOMEM;
93 				goto error;
94 			}
95 		}
96 		DRV_LOG(DEBUG, "Port %u txq %u updated with %p.",
97 			dev->data->port_id, i, (void *)&txq_ctrl->obj);
98 		LIST_INSERT_HEAD(&priv->txqsobj, txq_ctrl->obj, next);
99 	}
100 	return 0;
101 error:
102 	ret = rte_errno; /* Save rte_errno before cleanup. */
103 	do {
104 		mlx5_txq_release(dev, i);
105 	} while (i-- != 0);
106 	rte_errno = ret; /* Restore rte_errno. */
107 	return -rte_errno;
108 }
109 
110 /**
111  * Register Rx queue mempools and fill the Rx queue cache.
112  * This function tolerates repeated mempool registration.
113  *
114  * @param[in] rxq_ctrl
115  *   Rx queue control data.
116  *
117  * @return
118  *   0 on success, (-1) on failure and rte_errno is set.
119  */
120 static int
121 mlx5_rxq_mempool_register(struct mlx5_rxq_ctrl *rxq_ctrl)
122 {
123 	struct rte_mempool *mp;
124 	uint32_t s;
125 	int ret = 0;
126 
127 	mlx5_mr_flush_local_cache(&rxq_ctrl->rxq.mr_ctrl);
128 	/* MPRQ mempool is registered on creation, just fill the cache. */
129 	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
130 		return mlx5_mr_mempool_populate_cache(&rxq_ctrl->rxq.mr_ctrl,
131 						      rxq_ctrl->rxq.mprq_mp);
132 	for (s = 0; s < rxq_ctrl->rxq.rxseg_n; s++) {
133 		bool is_extmem;
134 
135 		mp = rxq_ctrl->rxq.rxseg[s].mp;
136 		is_extmem = (rte_pktmbuf_priv_flags(mp) &
137 			     RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) != 0;
138 		ret = mlx5_mr_mempool_register(rxq_ctrl->sh->cdev, mp,
139 					       is_extmem);
140 		if (ret < 0 && rte_errno != EEXIST)
141 			return ret;
142 		ret = mlx5_mr_mempool_populate_cache(&rxq_ctrl->rxq.mr_ctrl,
143 						     mp);
144 		if (ret < 0)
145 			return ret;
146 	}
147 	return 0;
148 }
149 
150 /**
151  * Stop traffic on Rx queues.
152  *
153  * @param dev
154  *   Pointer to Ethernet device structure.
155  */
156 static void
157 mlx5_rxq_stop(struct rte_eth_dev *dev)
158 {
159 	struct mlx5_priv *priv = dev->data->dev_private;
160 	unsigned int i;
161 
162 	for (i = 0; i != priv->rxqs_n; ++i)
163 		mlx5_rxq_release(dev, i);
164 }
165 
166 static int
167 mlx5_rxq_ctrl_prepare(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
168 		      unsigned int idx)
169 {
170 	int ret = 0;
171 
172 	if (!rxq_ctrl->is_hairpin) {
173 		/*
174 		 * Pre-register the mempools. Regardless of whether
175 		 * the implicit registration is enabled or not,
176 		 * Rx mempool destruction is tracked to free MRs.
177 		 */
178 		if (mlx5_rxq_mempool_register(rxq_ctrl) < 0)
179 			return -rte_errno;
180 		ret = rxq_alloc_elts(rxq_ctrl);
181 		if (ret)
182 			return ret;
183 	}
184 	MLX5_ASSERT(!rxq_ctrl->obj);
185 	rxq_ctrl->obj = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
186 				    sizeof(*rxq_ctrl->obj), 0,
187 				    rxq_ctrl->socket);
188 	if (!rxq_ctrl->obj) {
189 		DRV_LOG(ERR, "Port %u Rx queue %u can't allocate resources.",
190 			dev->data->port_id, idx);
191 		rte_errno = ENOMEM;
192 		return -rte_errno;
193 	}
194 	DRV_LOG(DEBUG, "Port %u rxq %u updated with %p.", dev->data->port_id,
195 		idx, (void *)&rxq_ctrl->obj);
196 	return 0;
197 }
198 
199 /**
200  * Start traffic on Rx queues.
201  *
202  * @param dev
203  *   Pointer to Ethernet device structure.
204  *
205  * @return
206  *   0 on success, a negative errno value otherwise and rte_errno is set.
207  */
208 static int
209 mlx5_rxq_start(struct rte_eth_dev *dev)
210 {
211 	struct mlx5_priv *priv = dev->data->dev_private;
212 	unsigned int i;
213 	int ret = 0;
214 
215 	/* Allocate/reuse/resize mempool for Multi-Packet RQ. */
216 	if (mlx5_mprq_alloc_mp(dev)) {
217 		/* Should not release Rx queues but return immediately. */
218 		return -rte_errno;
219 	}
220 	DRV_LOG(DEBUG, "Port %u dev_cap.max_qp_wr is %d.",
221 		dev->data->port_id, priv->sh->dev_cap.max_qp_wr);
222 	DRV_LOG(DEBUG, "Port %u dev_cap.max_sge is %d.",
223 		dev->data->port_id, priv->sh->dev_cap.max_sge);
224 	for (i = 0; i != priv->rxqs_n; ++i) {
225 		struct mlx5_rxq_priv *rxq = mlx5_rxq_ref(dev, i);
226 		struct mlx5_rxq_ctrl *rxq_ctrl;
227 
228 		if (rxq == NULL)
229 			continue;
230 		rxq_ctrl = rxq->ctrl;
231 		if (!rxq_ctrl->started)
232 			if (mlx5_rxq_ctrl_prepare(dev, rxq_ctrl, i) < 0)
233 				goto error;
234 		ret = priv->obj_ops.rxq_obj_new(rxq);
235 		if (ret) {
236 			mlx5_free(rxq_ctrl->obj);
237 			rxq_ctrl->obj = NULL;
238 			goto error;
239 		}
240 		if (!rxq_ctrl->started)
241 			LIST_INSERT_HEAD(&priv->rxqsobj, rxq_ctrl->obj, next);
242 		rxq_ctrl->started = true;
243 	}
244 	return 0;
245 error:
246 	ret = rte_errno; /* Save rte_errno before cleanup. */
247 	do {
248 		mlx5_rxq_release(dev, i);
249 	} while (i-- != 0);
250 	rte_errno = ret; /* Restore rte_errno. */
251 	return -rte_errno;
252 }
253 
254 /**
255  * Bind hairpin Tx queues to their peer Rx queues automatically.
256  *
257  * Only hairpin Tx queues whose peer is this same port are handled here.
258  *
259  * @param dev
260  *   Pointer to Ethernet device structure.
261  *
262  * @return
263  *   0 on success, a negative errno value otherwise and rte_errno is set.
264  */
265 static int
266 mlx5_hairpin_auto_bind(struct rte_eth_dev *dev)
267 {
268 	struct mlx5_priv *priv = dev->data->dev_private;
269 	struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
270 	struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
271 	struct mlx5_txq_ctrl *txq_ctrl;
272 	struct mlx5_rxq_priv *rxq;
273 	struct mlx5_rxq_ctrl *rxq_ctrl;
274 	struct mlx5_devx_obj *sq;
275 	struct mlx5_devx_obj *rq;
276 	unsigned int i;
277 	int ret = 0;
278 	bool need_auto = false;
279 	uint16_t self_port = dev->data->port_id;
280 
281 	for (i = 0; i != priv->txqs_n; ++i) {
282 		txq_ctrl = mlx5_txq_get(dev, i);
283 		if (!txq_ctrl)
284 			continue;
285 		if (!txq_ctrl->is_hairpin ||
286 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
287 			mlx5_txq_release(dev, i);
288 			continue;
289 		}
290 		if (txq_ctrl->hairpin_conf.manual_bind) {
291 			mlx5_txq_release(dev, i);
292 			return 0;
293 		}
294 		need_auto = true;
295 		mlx5_txq_release(dev, i);
296 	}
297 	if (!need_auto)
298 		return 0;
299 	for (i = 0; i != priv->txqs_n; ++i) {
300 		txq_ctrl = mlx5_txq_get(dev, i);
301 		if (!txq_ctrl)
302 			continue;
303 		/* Skip hairpin queues with other peer ports. */
304 		if (!txq_ctrl->is_hairpin ||
305 		    txq_ctrl->hairpin_conf.peers[0].port != self_port) {
306 			mlx5_txq_release(dev, i);
307 			continue;
308 		}
309 		if (!txq_ctrl->obj) {
310 			rte_errno = ENOMEM;
311 			DRV_LOG(ERR, "port %u no txq object found: %d",
312 				dev->data->port_id, i);
313 			mlx5_txq_release(dev, i);
314 			return -rte_errno;
315 		}
316 		sq = txq_ctrl->obj->sq;
317 		rxq = mlx5_rxq_get(dev, txq_ctrl->hairpin_conf.peers[0].queue);
318 		if (rxq == NULL) {
319 			mlx5_txq_release(dev, i);
320 			rte_errno = EINVAL;
321 			DRV_LOG(ERR, "port %u no rxq object found: %d",
322 				dev->data->port_id,
323 				txq_ctrl->hairpin_conf.peers[0].queue);
324 			return -rte_errno;
325 		}
326 		rxq_ctrl = rxq->ctrl;
327 		if (!rxq_ctrl->is_hairpin ||
328 		    rxq->hairpin_conf.peers[0].queue != i) {
329 			rte_errno = ENOMEM;
330 			DRV_LOG(ERR, "port %u Tx queue %d can't be bound to "
331 				"Rx queue %d", dev->data->port_id,
332 				i, txq_ctrl->hairpin_conf.peers[0].queue);
333 			goto error;
334 		}
335 		rq = rxq_ctrl->obj->rq;
336 		if (!rq) {
337 			rte_errno = ENOMEM;
338 			DRV_LOG(ERR, "port %u hairpin no matching rxq: %d",
339 				dev->data->port_id,
340 				txq_ctrl->hairpin_conf.peers[0].queue);
341 			goto error;
342 		}
343 		sq_attr.state = MLX5_SQC_STATE_RDY;
344 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
345 		sq_attr.hairpin_peer_rq = rq->id;
346 		sq_attr.hairpin_peer_vhca =
347 				priv->sh->cdev->config.hca_attr.vhca_id;
348 		ret = mlx5_devx_cmd_modify_sq(sq, &sq_attr);
349 		if (ret)
350 			goto error;
351 		rq_attr.state = MLX5_RQC_STATE_RDY;
352 		rq_attr.rq_state = MLX5_RQC_STATE_RST;
353 		rq_attr.hairpin_peer_sq = sq->id;
354 		rq_attr.hairpin_peer_vhca =
355 				priv->sh->cdev->config.hca_attr.vhca_id;
356 		ret = mlx5_devx_cmd_modify_rq(rq, &rq_attr);
357 		if (ret)
358 			goto error;
359 		/* Queues with auto binding will be destroyed directly. */
360 		rxq->hairpin_status = 1;
361 		txq_ctrl->hairpin_status = 1;
362 		mlx5_txq_release(dev, i);
363 	}
364 	return 0;
365 error:
366 	mlx5_txq_release(dev, i);
367 	return -rte_errno;
368 }
369 
370 /*
371  * Fetch the peer queue's SW & HW information.
372  *
373  * @param dev
374  *   Pointer to Ethernet device structure.
375  * @param peer_queue
376  *   Index of the queue whose information is fetched.
377  * @param current_info
378  *   Pointer to the input peer information, not used currently.
379  * @param peer_info
380  *   Pointer to the structure to store the information, output.
381  * @param direction
382  *   Positive to get the RxQ information, zero to get the TxQ information.
383  *
384  * @return
385  *   0 on success, a negative errno value otherwise and rte_errno is set.
386  */
387 int
388 mlx5_hairpin_queue_peer_update(struct rte_eth_dev *dev, uint16_t peer_queue,
389 			       struct rte_hairpin_peer_info *current_info,
390 			       struct rte_hairpin_peer_info *peer_info,
391 			       uint32_t direction)
392 {
393 	struct mlx5_priv *priv = dev->data->dev_private;
394 	RTE_SET_USED(current_info);
395 
396 	if (dev->data->dev_started == 0) {
397 		rte_errno = EBUSY;
398 		DRV_LOG(ERR, "peer port %u is not started",
399 			dev->data->port_id);
400 		return -rte_errno;
401 	}
402 	/*
403 	 * Peer port used as egress. In the current design, a hairpin Tx queue
404 	 * is bound to the peer Rx queue, so only the peer Rx queue information
405 	 * needs to be fetched.
406 	 */
407 	if (direction == 0) {
408 		struct mlx5_txq_ctrl *txq_ctrl;
409 
410 		txq_ctrl = mlx5_txq_get(dev, peer_queue);
411 		if (txq_ctrl == NULL) {
412 			rte_errno = EINVAL;
413 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
414 				dev->data->port_id, peer_queue);
415 			return -rte_errno;
416 		}
417 		if (!txq_ctrl->is_hairpin) {
418 			rte_errno = EINVAL;
419 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Txq",
420 				dev->data->port_id, peer_queue);
421 			mlx5_txq_release(dev, peer_queue);
422 			return -rte_errno;
423 		}
424 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
425 			rte_errno = ENOMEM;
426 			DRV_LOG(ERR, "port %u no Txq object found: %d",
427 				dev->data->port_id, peer_queue);
428 			mlx5_txq_release(dev, peer_queue);
429 			return -rte_errno;
430 		}
431 		peer_info->qp_id = mlx5_txq_get_sqn(txq_ctrl);
432 		peer_info->vhca_id = priv->sh->cdev->config.hca_attr.vhca_id;
433 		/* 1-to-1 mapping, only the first one is used. */
434 		peer_info->peer_q = txq_ctrl->hairpin_conf.peers[0].queue;
435 		peer_info->tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
436 		peer_info->manual_bind = txq_ctrl->hairpin_conf.manual_bind;
437 		mlx5_txq_release(dev, peer_queue);
438 	} else { /* Peer port used as ingress. */
439 		struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, peer_queue);
440 		struct mlx5_rxq_ctrl *rxq_ctrl;
441 
442 		if (rxq == NULL) {
443 			rte_errno = EINVAL;
444 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
445 				dev->data->port_id, peer_queue);
446 			return -rte_errno;
447 		}
448 		rxq_ctrl = rxq->ctrl;
449 		if (!rxq_ctrl->is_hairpin) {
450 			rte_errno = EINVAL;
451 			DRV_LOG(ERR, "port %u queue %d is not a hairpin Rxq",
452 				dev->data->port_id, peer_queue);
453 			return -rte_errno;
454 		}
455 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
456 			rte_errno = ENOMEM;
457 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
458 				dev->data->port_id, peer_queue);
459 			return -rte_errno;
460 		}
461 		peer_info->qp_id = rxq_ctrl->obj->rq->id;
462 		peer_info->vhca_id = priv->sh->cdev->config.hca_attr.vhca_id;
463 		peer_info->peer_q = rxq->hairpin_conf.peers[0].queue;
464 		peer_info->tx_explicit = rxq->hairpin_conf.tx_explicit;
465 		peer_info->manual_bind = rxq->hairpin_conf.manual_bind;
466 	}
467 	return 0;
468 }
469 
470 /*
471  * Bind the hairpin queue with the peer HW information.
472  * This needs to be called twice both for Tx and Rx queues of a pair.
473  * If the queue is already bound, it is considered successful.
474  *
475  * @param dev
476  *   Pointer to Ethernet device structure.
477  * @param cur_queue
478  *   Index of the queue to change the HW configuration to bind.
479  * @param peer_info
480  *   Pointer to information of the peer queue.
481  * @param direction
482  *   Positive to configure the TxQ, zero to configure the RxQ.
483  *
484  * @return
485  *   0 on success, a negative errno value otherwise and rte_errno is set.
486  */
487 int
488 mlx5_hairpin_queue_peer_bind(struct rte_eth_dev *dev, uint16_t cur_queue,
489 			     struct rte_hairpin_peer_info *peer_info,
490 			     uint32_t direction)
491 {
492 	int ret = 0;
493 
494 	/*
495 	 * Consistency check of the peer queue: its info was fetched with the
496 	 * opposite direction via ethdev port ID, only the index is checked.
497 	 */
498 	if (peer_info->peer_q != cur_queue) {
499 		rte_errno = EINVAL;
500 		DRV_LOG(ERR, "port %u queue %d and peer queue %d mismatch",
501 			dev->data->port_id, cur_queue, peer_info->peer_q);
502 		return -rte_errno;
503 	}
504 	if (direction != 0) {
505 		struct mlx5_txq_ctrl *txq_ctrl;
506 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
507 
508 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
509 		if (txq_ctrl == NULL) {
510 			rte_errno = EINVAL;
511 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
512 				dev->data->port_id, cur_queue);
513 			return -rte_errno;
514 		}
515 		if (!txq_ctrl->is_hairpin) {
516 			rte_errno = EINVAL;
517 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
518 				dev->data->port_id, cur_queue);
519 			mlx5_txq_release(dev, cur_queue);
520 			return -rte_errno;
521 		}
522 		if (txq_ctrl->obj == NULL || txq_ctrl->obj->sq == NULL) {
523 			rte_errno = ENOMEM;
524 			DRV_LOG(ERR, "port %u no Txq object found: %d",
525 				dev->data->port_id, cur_queue);
526 			mlx5_txq_release(dev, cur_queue);
527 			return -rte_errno;
528 		}
529 		if (txq_ctrl->hairpin_status != 0) {
530 			DRV_LOG(DEBUG, "port %u Tx queue %d is already bound",
531 				dev->data->port_id, cur_queue);
532 			mlx5_txq_release(dev, cur_queue);
533 			return 0;
534 		}
535 		/*
536 		 * Consistency checking of all queues of one port is done in
537 		 * the bind() function, and that check is optional.
538 		 */
539 		if (peer_info->tx_explicit !=
540 		    txq_ctrl->hairpin_conf.tx_explicit) {
541 			rte_errno = EINVAL;
542 			DRV_LOG(ERR, "port %u Tx queue %d and peer Tx rule mode"
543 				" mismatch", dev->data->port_id, cur_queue);
544 			mlx5_txq_release(dev, cur_queue);
545 			return -rte_errno;
546 		}
547 		if (peer_info->manual_bind !=
548 		    txq_ctrl->hairpin_conf.manual_bind) {
549 			rte_errno = EINVAL;
550 			DRV_LOG(ERR, "port %u Tx queue %d and peer binding mode"
551 				" mismatch", dev->data->port_id, cur_queue);
552 			mlx5_txq_release(dev, cur_queue);
553 			return -rte_errno;
554 		}
555 		sq_attr.state = MLX5_SQC_STATE_RDY;
556 		sq_attr.sq_state = MLX5_SQC_STATE_RST;
557 		sq_attr.hairpin_peer_rq = peer_info->qp_id;
558 		sq_attr.hairpin_peer_vhca = peer_info->vhca_id;
559 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
560 		if (ret == 0)
561 			txq_ctrl->hairpin_status = 1;
562 		mlx5_txq_release(dev, cur_queue);
563 	} else {
564 		struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, cur_queue);
565 		struct mlx5_rxq_ctrl *rxq_ctrl;
566 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
567 
568 		if (rxq == NULL) {
569 			rte_errno = EINVAL;
570 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
571 				dev->data->port_id, cur_queue);
572 			return -rte_errno;
573 		}
574 		rxq_ctrl = rxq->ctrl;
575 		if (!rxq_ctrl->is_hairpin) {
576 			rte_errno = EINVAL;
577 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
578 				dev->data->port_id, cur_queue);
579 			return -rte_errno;
580 		}
581 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
582 			rte_errno = ENOMEM;
583 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
584 				dev->data->port_id, cur_queue);
585 			return -rte_errno;
586 		}
587 		if (rxq->hairpin_status != 0) {
588 			DRV_LOG(DEBUG, "port %u Rx queue %d is already bound",
589 				dev->data->port_id, cur_queue);
590 			return 0;
591 		}
592 		if (peer_info->tx_explicit !=
593 		    rxq->hairpin_conf.tx_explicit) {
594 			rte_errno = EINVAL;
595 			DRV_LOG(ERR, "port %u Rx queue %d and peer Tx rule mode"
596 				" mismatch", dev->data->port_id, cur_queue);
597 			return -rte_errno;
598 		}
599 		if (peer_info->manual_bind !=
600 		    rxq->hairpin_conf.manual_bind) {
601 			rte_errno = EINVAL;
602 			DRV_LOG(ERR, "port %u Rx queue %d and peer binding mode"
603 				" mismatch", dev->data->port_id, cur_queue);
604 			return -rte_errno;
605 		}
606 		rq_attr.state = MLX5_RQC_STATE_RDY;
607 		rq_attr.rq_state = MLX5_RQC_STATE_RST;
608 		rq_attr.hairpin_peer_sq = peer_info->qp_id;
609 		rq_attr.hairpin_peer_vhca = peer_info->vhca_id;
610 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
611 		if (ret == 0)
612 			rxq->hairpin_status = 1;
613 	}
614 	return ret;
615 }
616 
617 /*
618  * Unbind the hairpin queue and reset its HW configuration.
619  * This needs to be called twice both for Tx and Rx queues of a pair.
620  * If the queue is already unbound, it is considered successful.
621  *
622  * @param dev
623  *   Pointer to Ethernet device structure.
624  * @param cur_queue
625  *   Index of the queue to change the HW configuration to unbind.
626  * @param direction
627  *   Positive to reset the TxQ, zero to reset the RxQ.
628  *
629  * @return
630  *   0 on success, a negative errno value otherwise and rte_errno is set.
631  */
632 int
633 mlx5_hairpin_queue_peer_unbind(struct rte_eth_dev *dev, uint16_t cur_queue,
634 			       uint32_t direction)
635 {
636 	int ret = 0;
637 
638 	if (direction != 0) {
639 		struct mlx5_txq_ctrl *txq_ctrl;
640 		struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
641 
642 		txq_ctrl = mlx5_txq_get(dev, cur_queue);
643 		if (txq_ctrl == NULL) {
644 			rte_errno = EINVAL;
645 			DRV_LOG(ERR, "Failed to get port %u Tx queue %d",
646 				dev->data->port_id, cur_queue);
647 			return -rte_errno;
648 		}
649 		if (!txq_ctrl->is_hairpin) {
650 			rte_errno = EINVAL;
651 			DRV_LOG(ERR, "port %u queue %d not a hairpin Txq",
652 				dev->data->port_id, cur_queue);
653 			mlx5_txq_release(dev, cur_queue);
654 			return -rte_errno;
655 		}
656 		/* Already unbound, return success before obj checking. */
657 		if (txq_ctrl->hairpin_status == 0) {
658 			DRV_LOG(DEBUG, "port %u Tx queue %d is already unbound",
659 				dev->data->port_id, cur_queue);
660 			mlx5_txq_release(dev, cur_queue);
661 			return 0;
662 		}
663 		if (!txq_ctrl->obj || !txq_ctrl->obj->sq) {
664 			rte_errno = ENOMEM;
665 			DRV_LOG(ERR, "port %u no Txq object found: %d",
666 				dev->data->port_id, cur_queue);
667 			mlx5_txq_release(dev, cur_queue);
668 			return -rte_errno;
669 		}
670 		sq_attr.state = MLX5_SQC_STATE_RST;
671 		sq_attr.sq_state = MLX5_SQC_STATE_RDY;
672 		ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq, &sq_attr);
673 		if (ret == 0)
674 			txq_ctrl->hairpin_status = 0;
675 		mlx5_txq_release(dev, cur_queue);
676 	} else {
677 		struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, cur_queue);
678 		struct mlx5_rxq_ctrl *rxq_ctrl;
679 		struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
680 
681 		if (rxq == NULL) {
682 			rte_errno = EINVAL;
683 			DRV_LOG(ERR, "Failed to get port %u Rx queue %d",
684 				dev->data->port_id, cur_queue);
685 			return -rte_errno;
686 		}
687 		rxq_ctrl = rxq->ctrl;
688 		if (!rxq_ctrl->is_hairpin) {
689 			rte_errno = EINVAL;
690 			DRV_LOG(ERR, "port %u queue %d not a hairpin Rxq",
691 				dev->data->port_id, cur_queue);
692 			return -rte_errno;
693 		}
694 		if (rxq->hairpin_status == 0) {
695 			DRV_LOG(DEBUG, "port %u Rx queue %d is already unbound",
696 				dev->data->port_id, cur_queue);
697 			return 0;
698 		}
699 		if (rxq_ctrl->obj == NULL || rxq_ctrl->obj->rq == NULL) {
700 			rte_errno = ENOMEM;
701 			DRV_LOG(ERR, "port %u no Rxq object found: %d",
702 				dev->data->port_id, cur_queue);
703 			return -rte_errno;
704 		}
705 		rq_attr.state = MLX5_RQC_STATE_RST;
706 		rq_attr.rq_state = MLX5_RQC_STATE_RDY;
707 		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
708 		if (ret == 0)
709 			rxq->hairpin_status = 0;
710 	}
711 	return ret;
712 }
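
/*
 * Direction convention of the three helpers above, summarized from the code:
 * mlx5_hairpin_queue_peer_update() with direction 1 returns Rx queue
 * information and with direction 0 returns Tx queue information, while
 * mlx5_hairpin_queue_peer_bind()/_unbind() with direction 1 act on the Tx
 * queue and with direction 0 on the Rx queue. Setting up one Tx-to-Rx pair
 * therefore takes two bind calls with opposite directions, as done in
 * mlx5_hairpin_bind_single_port() below.
 */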
713 
714 /*
715  * Bind the hairpin port pairs, from the Tx to the peer Rx.
716  * This function only supports binding the Tx port to one Rx port.
717  *
718  * @param dev
719  *   Pointer to Ethernet device structure.
720  * @param rx_port
721  *   Port identifier of the Rx port.
722  *
723  * @return
724  *   0 on success, a negative errno value otherwise and rte_errno is set.
725  */
726 static int
727 mlx5_hairpin_bind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
728 {
729 	struct mlx5_priv *priv = dev->data->dev_private;
730 	int ret = 0;
731 	struct mlx5_txq_ctrl *txq_ctrl;
732 	uint32_t i;
733 	struct rte_hairpin_peer_info peer = {0xffffff};
734 	struct rte_hairpin_peer_info cur;
735 	const struct rte_eth_hairpin_conf *conf;
736 	uint16_t num_q = 0;
737 	uint16_t local_port = priv->dev_data->port_id;
738 	uint32_t manual;
739 	uint32_t explicit;
740 	uint16_t rx_queue;
741 
742 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
743 		rte_errno = ENODEV;
744 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
745 		return -rte_errno;
746 	}
747 	/*
748 	 * Before binding TxQs to peer RxQs, a first pass over the queues is
749 	 * used to check their configuration consistency. This is a little
750 	 * time consuming, but better than having to roll back afterwards.
751 	 */
752 	for (i = 0; i != priv->txqs_n; i++) {
753 		txq_ctrl = mlx5_txq_get(dev, i);
754 		if (txq_ctrl == NULL)
755 			continue;
756 		if (!txq_ctrl->is_hairpin) {
757 			mlx5_txq_release(dev, i);
758 			continue;
759 		}
760 		/*
761 		 * All hairpin Tx queues of a single port connected to the same
762 		 * peer Rx port should have the same "auto binding" and
763 		 * "implicit Tx flow" modes.
764 		 * Peer consistency checks are done during per-queue binding.
765 		 */
766 		conf = &txq_ctrl->hairpin_conf;
767 		if (conf->peers[0].port == rx_port) {
768 			if (num_q == 0) {
769 				manual = conf->manual_bind;
770 				explicit = conf->tx_explicit;
771 			} else {
772 				if (manual != conf->manual_bind ||
773 				    explicit != conf->tx_explicit) {
774 					rte_errno = EINVAL;
775 					DRV_LOG(ERR, "port %u queue %d mode"
776 						" mismatch: %u %u, %u %u",
777 						local_port, i, manual,
778 						conf->manual_bind, explicit,
779 						conf->tx_explicit);
780 					mlx5_txq_release(dev, i);
781 					return -rte_errno;
782 				}
783 			}
784 			num_q++;
785 		}
786 		mlx5_txq_release(dev, i);
787 	}
788 	/* If no queue is configured, success is returned directly. */
789 	if (num_q == 0)
790 		return ret;
791 	/* All the hairpin TX queues need to be traversed again. */
792 	for (i = 0; i != priv->txqs_n; i++) {
793 		txq_ctrl = mlx5_txq_get(dev, i);
794 		if (txq_ctrl == NULL)
795 			continue;
796 		if (!txq_ctrl->is_hairpin) {
797 			mlx5_txq_release(dev, i);
798 			continue;
799 		}
800 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
801 			mlx5_txq_release(dev, i);
802 			continue;
803 		}
804 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
805 		/*
806 		 * Fetch peer RxQ's information.
807 		 * No need to pass the information of the current queue.
808 		 */
809 		ret = rte_eth_hairpin_queue_peer_update(rx_port, rx_queue,
810 							NULL, &peer, 1);
811 		if (ret != 0) {
812 			mlx5_txq_release(dev, i);
813 			goto error;
814 		}
815 		/* Accessing its own device, inside mlx5 PMD. */
816 		ret = mlx5_hairpin_queue_peer_bind(dev, i, &peer, 1);
817 		if (ret != 0) {
818 			mlx5_txq_release(dev, i);
819 			goto error;
820 		}
821 		/* Pass TxQ's information to peer RxQ and try binding. */
822 		cur.peer_q = rx_queue;
823 		cur.qp_id = mlx5_txq_get_sqn(txq_ctrl);
824 		cur.vhca_id = priv->sh->cdev->config.hca_attr.vhca_id;
825 		cur.tx_explicit = txq_ctrl->hairpin_conf.tx_explicit;
826 		cur.manual_bind = txq_ctrl->hairpin_conf.manual_bind;
827 		/*
828 		 * In order to access another device in a proper way, an
829 		 * RTE-level private function is needed.
830 		 */
831 		ret = rte_eth_hairpin_queue_peer_bind(rx_port, rx_queue,
832 						      &cur, 0);
833 		if (ret != 0) {
834 			mlx5_txq_release(dev, i);
835 			goto error;
836 		}
837 		mlx5_txq_release(dev, i);
838 	}
839 	return 0;
840 error:
841 	/*
842 	 * Roll back the queues that were already bound.
843 	 * No need to check the return value of the queue unbind function.
844 	 */
845 	do {
846 		/* No validation is needed here. */
847 		txq_ctrl = mlx5_txq_get(dev, i);
848 		if (txq_ctrl == NULL)
849 			continue;
850 		if (!txq_ctrl->is_hairpin ||
851 		    txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
852 			mlx5_txq_release(dev, i);
853 			continue;
854 		}
855 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
856 		rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
857 		mlx5_hairpin_queue_peer_unbind(dev, i, 1);
858 		mlx5_txq_release(dev, i);
859 	} while (i--);
860 	return ret;
861 }
862 
863 /*
864  * Unbind the hairpin port pair. The HW configuration of both devices will be
865  * cleared and the status will be reset for all the queues used between them.
866  * This function only supports unbinding the Tx port from one Rx port.
867  *
868  * @param dev
869  *   Pointer to Ethernet device structure.
870  * @param rx_port
871  *   Port identifier of the Rx port.
872  *
873  * @return
874  *   0 on success, a negative errno value otherwise and rte_errno is set.
875  */
876 static int
877 mlx5_hairpin_unbind_single_port(struct rte_eth_dev *dev, uint16_t rx_port)
878 {
879 	struct mlx5_priv *priv = dev->data->dev_private;
880 	struct mlx5_txq_ctrl *txq_ctrl;
881 	uint32_t i;
882 	int ret;
883 	uint16_t cur_port = priv->dev_data->port_id;
884 
885 	if (mlx5_eth_find_next(rx_port, dev->device) != rx_port) {
886 		rte_errno = ENODEV;
887 		DRV_LOG(ERR, "Rx port %u does not belong to mlx5", rx_port);
888 		return -rte_errno;
889 	}
890 	for (i = 0; i != priv->txqs_n; i++) {
891 		uint16_t rx_queue;
892 
893 		txq_ctrl = mlx5_txq_get(dev, i);
894 		if (txq_ctrl == NULL)
895 			continue;
896 		if (!txq_ctrl->is_hairpin) {
897 			mlx5_txq_release(dev, i);
898 			continue;
899 		}
900 		if (txq_ctrl->hairpin_conf.peers[0].port != rx_port) {
901 			mlx5_txq_release(dev, i);
902 			continue;
903 		}
904 		/* Only the first used hairpin queue needs to be checked. */
905 		if (txq_ctrl->hairpin_conf.manual_bind == 0) {
906 			mlx5_txq_release(dev, i);
907 			if (cur_port != rx_port) {
908 				rte_errno = EINVAL;
909 				DRV_LOG(ERR, "port %u and port %u are in"
910 					" auto-bind mode", cur_port, rx_port);
911 				return -rte_errno;
912 			} else {
913 				return 0;
914 			}
915 		}
916 		rx_queue = txq_ctrl->hairpin_conf.peers[0].queue;
917 		mlx5_txq_release(dev, i);
918 		ret = rte_eth_hairpin_queue_peer_unbind(rx_port, rx_queue, 0);
919 		if (ret) {
920 			DRV_LOG(ERR, "port %u Rx queue %d unbind - failure",
921 				rx_port, rx_queue);
922 			return ret;
923 		}
924 		ret = mlx5_hairpin_queue_peer_unbind(dev, i, 1);
925 		if (ret) {
926 			DRV_LOG(ERR, "port %u Tx queue %d unbind - failure",
927 				cur_port, i);
928 			return ret;
929 		}
930 	}
931 	return 0;
932 }
933 
934 /*
935  * Bind hairpin ports. Rx may be all ports when RTE_MAX_ETHPORTS is passed.
936  * @see mlx5_hairpin_bind_single_port()
937  */
938 int
939 mlx5_hairpin_bind(struct rte_eth_dev *dev, uint16_t rx_port)
940 {
941 	int ret = 0;
942 	uint16_t p, pp;
943 
944 	/*
945 	 * If the Rx port has no hairpin configuration with the current port,
946 	 * the binding is skipped inside the single-port bind function.
947 	 * The device started status is checked only right before the queue
948 	 * information is updated.
949 	 */
950 	if (rx_port == RTE_MAX_ETHPORTS) {
951 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
952 			ret = mlx5_hairpin_bind_single_port(dev, p);
953 			if (ret != 0)
954 				goto unbind;
955 		}
956 		return ret;
957 	} else {
958 		return mlx5_hairpin_bind_single_port(dev, rx_port);
959 	}
960 unbind:
961 	MLX5_ETH_FOREACH_DEV(pp, dev->device)
962 		if (pp < p)
963 			mlx5_hairpin_unbind_single_port(dev, pp);
964 	return ret;
965 }
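
/*
 * Usage sketch (illustrative only, not compiled): how an application is
 * expected to drive the manual binding path above through the ethdev API.
 * Port numbers are arbitrary examples and the hairpin queues are assumed to
 * be set up with manual_bind = 1 in struct rte_eth_hairpin_conf.
 *
 *	uint16_t tx_port = 0, rx_port = 1;
 *	int ret;
 *
 *	ret = rte_eth_dev_start(tx_port);
 *	if (ret == 0)
 *		ret = rte_eth_dev_start(rx_port);
 *	if (ret == 0)
 *		ret = rte_eth_hairpin_bind(tx_port, rx_port);
 *	if (ret == 0)
 *		ret = rte_eth_hairpin_bind(rx_port, tx_port);
 *
 * The first rte_eth_hairpin_bind() call ends up in mlx5_hairpin_bind() of the
 * Tx port; the second one is only needed for bidirectional hairpin.
 */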
966 
967 /*
968  * Unbind hairpin ports, Rx could be all ports when using RTE_MAX_ETHPORTS.
969  * @see mlx5_hairpin_unbind_single_port()
970  */
971 int
972 mlx5_hairpin_unbind(struct rte_eth_dev *dev, uint16_t rx_port)
973 {
974 	int ret = 0;
975 	uint16_t p;
976 
977 	if (rx_port == RTE_MAX_ETHPORTS)
978 		MLX5_ETH_FOREACH_DEV(p, dev->device) {
979 			ret = mlx5_hairpin_unbind_single_port(dev, p);
980 			if (ret != 0)
981 				return ret;
982 		}
983 	else
984 		ret = mlx5_hairpin_unbind_single_port(dev, rx_port);
985 	return ret;
986 }
987 
988 /*
989  * DPDK callback to get the hairpin peer ports list.
990  * This will return the actual number of peer ports and save the identifiers
991  * into the array (sorted, which may differ from the order used when setting
992  * up the hairpin peer queues).
993  * The peer port ID could be the same as the port ID of the current device.
994  *
995  * @param dev
996  *   Pointer to Ethernet device structure.
997  * @param peer_ports
998  *   Pointer to array to save the port identifiers.
999  * @param len
1000  *   The length of the array.
1001  * @param direction
1002  *   Current port to peer port direction.
1003  *   positive - current used as Tx to get all peer Rx ports.
1004  *   zero - current used as Rx to get all peer Tx ports.
1005  *
1006  * @return
1007  *   0 or positive value on success, actual number of peer ports.
1008  *   a negative errno value otherwise and rte_errno is set.
1009  */
1010 int
1011 mlx5_hairpin_get_peer_ports(struct rte_eth_dev *dev, uint16_t *peer_ports,
1012 			    size_t len, uint32_t direction)
1013 {
1014 	struct mlx5_priv *priv = dev->data->dev_private;
1015 	struct mlx5_txq_ctrl *txq_ctrl;
1016 	uint32_t i;
1017 	uint16_t pp;
1018 	uint32_t bits[(RTE_MAX_ETHPORTS + 31) / 32] = {0};
1019 	int ret = 0;
1020 
1021 	if (direction) {
1022 		for (i = 0; i < priv->txqs_n; i++) {
1023 			txq_ctrl = mlx5_txq_get(dev, i);
1024 			if (!txq_ctrl)
1025 				continue;
1026 			if (!txq_ctrl->is_hairpin) {
1027 				mlx5_txq_release(dev, i);
1028 				continue;
1029 			}
1030 			pp = txq_ctrl->hairpin_conf.peers[0].port;
1031 			if (pp >= RTE_MAX_ETHPORTS) {
1032 				rte_errno = ERANGE;
1033 				mlx5_txq_release(dev, i);
1034 				DRV_LOG(ERR, "port %hu queue %u peer port "
1035 					"out of range %hu",
1036 					priv->dev_data->port_id, i, pp);
1037 				return -rte_errno;
1038 			}
1039 			bits[pp / 32] |= 1u << (pp % 32);
1040 			mlx5_txq_release(dev, i);
1041 		}
1042 	} else {
1043 		for (i = 0; i < priv->rxqs_n; i++) {
1044 			struct mlx5_rxq_priv *rxq = mlx5_rxq_get(dev, i);
1045 			struct mlx5_rxq_ctrl *rxq_ctrl;
1046 
1047 			if (rxq == NULL)
1048 				continue;
1049 			rxq_ctrl = rxq->ctrl;
1050 			if (!rxq_ctrl->is_hairpin)
1051 				continue;
1052 			pp = rxq->hairpin_conf.peers[0].port;
1053 			if (pp >= RTE_MAX_ETHPORTS) {
1054 				rte_errno = ERANGE;
1055 				DRV_LOG(ERR, "port %hu queue %u peer port "
1056 					"out of range %hu",
1057 					priv->dev_data->port_id, i, pp);
1058 				return -rte_errno;
1059 			}
1060 			bits[pp / 32] |= 1u << (pp % 32);
1061 		}
1062 	}
1063 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1064 		if (bits[i / 32] & (1u << (i % 32))) {
1065 			if ((size_t)ret >= len) {
1066 				rte_errno = E2BIG;
1067 				return -rte_errno;
1068 			}
1069 			peer_ports[ret++] = i;
1070 		}
1071 	}
1072 	return ret;
1073 }
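
/*
 * Usage sketch (illustrative only, not compiled): querying the peer ports
 * from an application reaches the callback above through
 * rte_eth_hairpin_get_peer_ports(). The port_id value is an example.
 *
 *	uint16_t peers[RTE_MAX_ETHPORTS];
 *	uint16_t port_id = 0;
 *	int i, n;
 *
 *	n = rte_eth_hairpin_get_peer_ports(port_id, peers, RTE_DIM(peers), 1);
 *	if (n < 0)
 *		return n;
 *	for (i = 0; i < n; i++)
 *		printf("peer Rx port: %u\n", peers[i]);
 *
 * Direction 1 treats port_id as the Tx side and lists its peer Rx ports;
 * direction 0 lists the peer Tx ports of an Rx side port.
 */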
1074 
1075 #ifdef HAVE_MLX5_HWS_SUPPORT
1076 
1077 /**
1078  * Check if starting representor port is allowed.
1079  *
1080  * If transfer proxy port is configured for HWS, then starting representor port
1081  * is allowed if and only if transfer proxy port is started as well.
1082  *
1083  * @param dev
1084  *   Pointer to Ethernet device structure.
1085  *
1086  * @return
1087  *   If starting the representor port is allowed, then 0 is returned.
1088  *   Otherwise rte_errno is set, and negative errno value is returned.
1089  */
1090 static int
1091 mlx5_hw_representor_port_allowed_start(struct rte_eth_dev *dev)
1092 {
1093 	struct mlx5_priv *priv = dev->data->dev_private;
1094 	struct rte_eth_dev *proxy_dev;
1095 	struct mlx5_priv *proxy_priv;
1096 	uint16_t proxy_port_id = UINT16_MAX;
1097 	int ret;
1098 
1099 	MLX5_ASSERT(priv->sh->config.dv_flow_en == 2);
1100 	MLX5_ASSERT(priv->sh->config.dv_esw_en);
1101 	MLX5_ASSERT(priv->representor);
1102 	ret = rte_flow_pick_transfer_proxy(dev->data->port_id, &proxy_port_id, NULL);
1103 	if (ret) {
1104 		if (ret == -ENODEV)
1105 			DRV_LOG(ERR, "Starting representor port %u is not allowed. Transfer "
1106 				     "proxy port is not available.", dev->data->port_id);
1107 		else
1108 			DRV_LOG(ERR, "Failed to pick transfer proxy for port %u (ret = %d)",
1109 				dev->data->port_id, ret);
1110 		return ret;
1111 	}
1112 	proxy_dev = &rte_eth_devices[proxy_port_id];
1113 	proxy_priv = proxy_dev->data->dev_private;
1114 	if (proxy_priv->dr_ctx == NULL) {
1115 		DRV_LOG(DEBUG, "Starting representor port %u is allowed, but default traffic flows"
1116 			       " will not be created. Transfer proxy port must be configured"
1117 			       " for HWS and started.",
1118 			       dev->data->port_id);
1119 		return 0;
1120 	}
1121 	if (!proxy_dev->data->dev_started) {
1122 		DRV_LOG(ERR, "Failed to start port %u: transfer proxy (port %u) must be started",
1123 			     dev->data->port_id, proxy_port_id);
1124 		rte_errno = EAGAIN;
1125 		return -rte_errno;
1126 	}
1127 	if (priv->sh->config.repr_matching && !priv->dr_ctx) {
1128 		DRV_LOG(ERR, "Failed to start port %u: with representor matching enabled, port "
1129 			     "must be configured for HWS", dev->data->port_id);
1130 		rte_errno = EINVAL;
1131 		return -rte_errno;
1132 	}
1133 	return 0;
1134 }
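
/*
 * Start-order sketch for the check above (illustrative only, not compiled;
 * the representor port ID is an example value): with HWS and E-Switch
 * enabled, the transfer proxy port must be started before its representors.
 *
 *	uint16_t repr_port = 1;
 *	uint16_t proxy_port;
 *	int ret;
 *
 *	ret = rte_flow_pick_transfer_proxy(repr_port, &proxy_port, NULL);
 *	if (ret == 0)
 *		ret = rte_eth_dev_start(proxy_port);
 *	if (ret == 0)
 *		ret = rte_eth_dev_start(repr_port);
 *
 * Starting repr_port before proxy_port would fail here with -EAGAIN when the
 * proxy port is configured for HWS.
 */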
1135 
1136 #endif
1137 
1138 /**
1139  * DPDK callback to start the device.
1140  *
1141  * Simulate device start by attaching all configured flows.
1142  *
1143  * @param dev
1144  *   Pointer to Ethernet device structure.
1145  *
1146  * @return
1147  *   0 on success, a negative errno value otherwise and rte_errno is set.
1148  *   The following error values are defined:
1149  *
1150  *   - -EAGAIN: If port representor cannot be started,
1151  *     because transfer proxy port is not started.
1152  */
1153 int
1154 mlx5_dev_start(struct rte_eth_dev *dev)
1155 {
1156 	struct mlx5_priv *priv = dev->data->dev_private;
1157 	int ret;
1158 	int fine_inline;
1159 
1160 	DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id);
1161 #ifdef HAVE_MLX5_HWS_SUPPORT
1162 	if (priv->sh->config.dv_flow_en == 2) {
1163 		struct rte_flow_error error = { 0, };
1164 
1165 		/* If a previous configuration does not exist, initialize HWS. */
1166 		if (!(priv->dr_ctx)) {
1167 			ret = flow_hw_init(dev, &error);
1168 			if (ret) {
1169 				DRV_LOG(ERR, "Failed to start port %u %s: %s",
1170 					dev->data->port_id, dev->data->name,
1171 					error.message);
1172 				return ret;
1173 			}
1174 		}
1175 		/* If there is no E-Switch, then there are no start/stop order limitations. */
1176 		if (!priv->sh->config.dv_esw_en)
1177 			goto continue_dev_start;
1178 		/* If master is being started, then it is always allowed. */
1179 		if (priv->master)
1180 			goto continue_dev_start;
1181 		if (mlx5_hw_representor_port_allowed_start(dev))
1182 			return -rte_errno;
1183 	}
1184 continue_dev_start:
1185 #endif
1186 	fine_inline = rte_mbuf_dynflag_lookup
1187 		(RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, NULL);
1188 	if (fine_inline >= 0)
1189 		rte_net_mlx5_dynf_inline_mask = 1UL << fine_inline;
1190 	else
1191 		rte_net_mlx5_dynf_inline_mask = 0;
1192 	if (dev->data->nb_rx_queues > 0) {
1193 		uint32_t max_lro_msg_size = priv->max_lro_msg_size;
1194 
1195 		if (max_lro_msg_size < MLX5_LRO_SEG_CHUNK_SIZE) {
1196 			uint32_t i;
1197 			struct mlx5_rxq_priv *rxq;
1198 
1199 			for (i = 0; i != priv->rxqs_n; ++i) {
1200 				rxq = mlx5_rxq_get(dev, i);
1201 				if (rxq && rxq->ctrl && rxq->ctrl->rxq.lro) {
1202 					DRV_LOG(ERR, "port %u invalid max LRO size",
1203 						dev->data->port_id);
1204 					rte_errno = EINVAL;
1205 					return -rte_errno;
1206 				}
1207 			}
1208 		}
1209 		ret = mlx5_dev_configure_rss_reta(dev);
1210 		if (ret) {
1211 			DRV_LOG(ERR, "port %u reta config failed: %s",
1212 				dev->data->port_id, strerror(rte_errno));
1213 			return -rte_errno;
1214 		}
1215 	}
1216 	ret = mlx5_txpp_start(dev);
1217 	if (ret) {
1218 		DRV_LOG(ERR, "port %u Tx packet pacing init failed: %s",
1219 			dev->data->port_id, strerror(rte_errno));
1220 		goto error;
1221 	}
1222 	if (mlx5_devx_obj_ops_en(priv->sh) &&
1223 	    priv->obj_ops.lb_dummy_queue_create) {
1224 		ret = priv->obj_ops.lb_dummy_queue_create(dev);
1225 		if (ret)
1226 			goto error;
1227 	}
1228 	ret = mlx5_txq_start(dev);
1229 	if (ret) {
1230 		DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
1231 			dev->data->port_id, strerror(rte_errno));
1232 		goto error;
1233 	}
1234 	if (priv->config.std_delay_drop || priv->config.hp_delay_drop) {
1235 		if (!priv->sh->dev_cap.vf && !priv->sh->dev_cap.sf &&
1236 		    !priv->representor) {
1237 			ret = mlx5_get_flag_dropless_rq(dev);
1238 			if (ret < 0)
1239 				DRV_LOG(WARNING,
1240 					"port %u cannot query dropless flag",
1241 					dev->data->port_id);
1242 			else if (!ret)
1243 				DRV_LOG(WARNING,
1244 					"port %u dropless_rq OFF, no rearming",
1245 					dev->data->port_id);
1246 		} else {
1247 			DRV_LOG(DEBUG,
1248 				"port %u doesn't support dropless_rq flag",
1249 				dev->data->port_id);
1250 		}
1251 	}
1252 	ret = mlx5_rxq_start(dev);
1253 	if (ret) {
1254 		DRV_LOG(ERR, "port %u Rx queue allocation failed: %s",
1255 			dev->data->port_id, strerror(rte_errno));
1256 		goto error;
1257 	}
1258 	/*
1259 	 * This step will be skipped if there is no hairpin Tx queue configured
1260 	 * with an Rx peer queue from the same device.
1261 	 */
1262 	ret = mlx5_hairpin_auto_bind(dev);
1263 	if (ret) {
1264 		DRV_LOG(ERR, "port %u hairpin auto binding failed: %s",
1265 			dev->data->port_id, strerror(rte_errno));
1266 		goto error;
1267 	}
1268 	/* Set started flag here for the following steps like control flow. */
1269 	dev->data->dev_started = 1;
1270 	ret = mlx5_rx_intr_vec_enable(dev);
1271 	if (ret) {
1272 		DRV_LOG(ERR, "port %u Rx interrupt vector creation failed",
1273 			dev->data->port_id);
1274 		goto error;
1275 	}
1276 	mlx5_os_stats_init(dev);
1277 	/*
1278 	 * Attach indirection table objects detached on port stop.
1279 	 * They may be needed to create RSS in non-isolated mode.
1280 	 */
1281 	ret = mlx5_action_handle_attach(dev);
1282 	if (ret) {
1283 		DRV_LOG(ERR,
1284 			"port %u failed to attach indirect actions: %s",
1285 			dev->data->port_id, rte_strerror(rte_errno));
1286 		goto error;
1287 	}
1288 #ifdef HAVE_MLX5_HWS_SUPPORT
1289 	if (priv->sh->config.dv_flow_en == 2) {
1290 		ret = flow_hw_table_update(dev, NULL);
1291 		if (ret) {
1292 			DRV_LOG(ERR, "port %u failed to update HWS tables",
1293 				dev->data->port_id);
1294 			goto error;
1295 		}
1296 	}
1297 #endif
1298 	ret = mlx5_traffic_enable(dev);
1299 	if (ret) {
1300 		DRV_LOG(ERR, "port %u failed to set defaults flows",
1301 			dev->data->port_id);
1302 		goto error;
1303 	}
1304 	/* Set dynamic fields and flags into Rx queues. */
1305 	mlx5_flow_rxq_dynf_set(dev);
1306 	/* Set flags and context to convert Rx timestamps. */
1307 	mlx5_rxq_timestamp_set(dev);
1308 	/* Set a mask and offset of scheduling on timestamp into Tx queues. */
1309 	mlx5_txq_dynf_timestamp_set(dev);
1310 	/*
1311 	 * In non-cached mode, only the default mreg copy action needs to be
1312 	 * started, since no flow created by the application exists anymore.
1313 	 * But it is worth wrapping the interface for further usage.
1314 	 */
1315 	ret = mlx5_flow_start_default(dev);
1316 	if (ret) {
1317 		DRV_LOG(DEBUG, "port %u failed to start default actions: %s",
1318 			dev->data->port_id, strerror(rte_errno));
1319 		goto error;
1320 	}
1321 	if (mlx5_dev_ctx_shared_mempool_subscribe(dev) != 0) {
1322 		DRV_LOG(ERR, "port %u failed to subscribe for mempool life cycle: %s",
1323 			dev->data->port_id, rte_strerror(rte_errno));
1324 		goto error;
1325 	}
1326 	rte_wmb();
1327 	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
1328 	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
1329 	/* Enable datapath on secondary process. */
1330 	mlx5_mp_os_req_start_rxtx(dev);
1331 	if (rte_intr_fd_get(priv->sh->intr_handle) >= 0) {
1332 		priv->sh->port[priv->dev_port - 1].ih_port_id =
1333 					(uint32_t)dev->data->port_id;
1334 	} else {
1335 		DRV_LOG(INFO, "port %u starts without RMV interrupts.",
1336 			dev->data->port_id);
1337 		dev->data->dev_conf.intr_conf.rmv = 0;
1338 	}
1339 	if (rte_intr_fd_get(priv->sh->intr_handle_nl) >= 0) {
1340 		priv->sh->port[priv->dev_port - 1].nl_ih_port_id =
1341 					(uint32_t)dev->data->port_id;
1342 	} else {
1343 		DRV_LOG(INFO, "port %u starts without LSC interrupts.",
1344 			dev->data->port_id);
1345 		dev->data->dev_conf.intr_conf.lsc = 0;
1346 	}
1347 	if (rte_intr_fd_get(priv->sh->intr_handle_devx) >= 0)
1348 		priv->sh->port[priv->dev_port - 1].devx_ih_port_id =
1349 					(uint32_t)dev->data->port_id;
1350 	return 0;
1351 error:
1352 	ret = rte_errno; /* Save rte_errno before cleanup. */
1353 	/* Rollback. */
1354 	dev->data->dev_started = 0;
1355 	mlx5_flow_stop_default(dev);
1356 	mlx5_traffic_disable(dev);
1357 	mlx5_txq_stop(dev);
1358 	mlx5_rxq_stop(dev);
1359 	if (priv->obj_ops.lb_dummy_queue_release)
1360 		priv->obj_ops.lb_dummy_queue_release(dev);
1361 	mlx5_txpp_stop(dev); /* Stop last. */
1362 	rte_errno = ret; /* Restore rte_errno. */
1363 	return -rte_errno;
1364 }
1365 
1366 #ifdef HAVE_MLX5_HWS_SUPPORT
1367 /**
1368  * Check if stopping transfer proxy port is allowed.
1369  *
1370  * If transfer proxy port is configured for HWS, then it is allowed to stop it
1371  * if and only if all other representor ports are stopped.
1372  *
1373  * @param dev
1374  *   Pointer to Ethernet device structure.
1375  *
1376  * @return
1377  *   If stopping transfer proxy port is allowed, then 0 is returned.
1378  *   Otherwise rte_errno is set, and negative errno value is returned.
1379  */
1380 static int
1381 mlx5_hw_proxy_port_allowed_stop(struct rte_eth_dev *dev)
1382 {
1383 	struct mlx5_priv *priv = dev->data->dev_private;
1384 	bool representor_started = false;
1385 	uint16_t port_id;
1386 
1387 	MLX5_ASSERT(priv->sh->config.dv_flow_en == 2);
1388 	MLX5_ASSERT(priv->sh->config.dv_esw_en);
1389 	MLX5_ASSERT(priv->master);
1390 	/* If transfer proxy port was not configured for HWS, then stopping it is allowed. */
1391 	if (!priv->dr_ctx)
1392 		return 0;
1393 	MLX5_ETH_FOREACH_DEV(port_id, dev->device) {
1394 		const struct rte_eth_dev *port_dev = &rte_eth_devices[port_id];
1395 		const struct mlx5_priv *port_priv = port_dev->data->dev_private;
1396 
1397 		if (port_id != dev->data->port_id &&
1398 		    port_priv->domain_id == priv->domain_id &&
1399 		    port_dev->data->dev_started)
1400 			representor_started = true;
1401 	}
1402 	if (representor_started) {
1403 		DRV_LOG(ERR, "Failed to stop port %u: attached representor ports"
1404 			     " must be stopped before stopping transfer proxy port",
1405 			     dev->data->port_id);
1406 		rte_errno = EBUSY;
1407 		return -rte_errno;
1408 	}
1409 	return 0;
1410 }
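
/*
 * Stop-order sketch mirroring the check above (illustrative only, not
 * compiled; port variables are example values): representor ports of the
 * same E-Switch domain must be stopped before the transfer proxy port,
 * otherwise stopping the proxy fails with -EBUSY.
 *
 *	ret = rte_eth_dev_stop(repr_port);
 *	if (ret == 0)
 *		ret = rte_eth_dev_stop(proxy_port);
 */
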
1411 #endif
1412 
1413 /**
1414  * DPDK callback to stop the device.
1415  *
1416  * Simulate device stop by detaching all configured flows.
1417  *
1418  * @param dev
1419  *   Pointer to Ethernet device structure.
1420  *
1421  * @return
1422  *   0 on success, a negative errno value otherwise and rte_errno is set.
1423  *   The following error values are defined:
1424  *
1425  *   - -EBUSY: If transfer proxy port cannot be stopped,
1426  *     because other port representors are still running.
1427  */
1428 int
1429 mlx5_dev_stop(struct rte_eth_dev *dev)
1430 {
1431 	struct mlx5_priv *priv = dev->data->dev_private;
1432 
1433 #ifdef HAVE_MLX5_HWS_SUPPORT
1434 	if (priv->sh->config.dv_flow_en == 2) {
1435 		/* If there is no E-Switch, then there are no start/stop order limitations. */
1436 		if (!priv->sh->config.dv_esw_en)
1437 			goto continue_dev_stop;
1438 		/* If representor is being stopped, then it is always allowed. */
1439 		if (priv->representor)
1440 			goto continue_dev_stop;
1441 		if (mlx5_hw_proxy_port_allowed_stop(dev)) {
1442 			dev->data->dev_started = 1;
1443 			return -rte_errno;
1444 		}
1445 	}
1446 continue_dev_stop:
1447 #endif
1448 	dev->data->dev_started = 0;
1449 	/* Prevent crashes when queues are still in use. */
1450 	dev->rx_pkt_burst = rte_eth_pkt_burst_dummy;
1451 	dev->tx_pkt_burst = rte_eth_pkt_burst_dummy;
1452 	rte_wmb();
1453 	/* Disable datapath on secondary process. */
1454 	mlx5_mp_os_req_stop_rxtx(dev);
1455 	rte_delay_us_sleep(1000 * priv->rxqs_n);
1456 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
1457 	mlx5_flow_stop_default(dev);
1458 	/* Control flows for default traffic can be removed first. */
1459 	mlx5_traffic_disable(dev);
1460 	/* All RX queue flags will be cleared in the flush interface. */
1461 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_GEN, true);
1462 	mlx5_flow_meter_rxq_flush(dev);
1463 	mlx5_action_handle_detach(dev);
1464 #ifdef HAVE_MLX5_HWS_SUPPORT
1465 	mlx5_flow_hw_cleanup_ctrl_rx_templates(dev);
1466 #endif
1467 	mlx5_rx_intr_vec_disable(dev);
1468 	priv->sh->port[priv->dev_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1469 	priv->sh->port[priv->dev_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
1470 	priv->sh->port[priv->dev_port - 1].nl_ih_port_id = RTE_MAX_ETHPORTS;
1471 	mlx5_txq_stop(dev);
1472 	mlx5_rxq_stop(dev);
1473 	if (priv->obj_ops.lb_dummy_queue_release)
1474 		priv->obj_ops.lb_dummy_queue_release(dev);
1475 	mlx5_txpp_stop(dev);
1476 
1477 	return 0;
1478 }
1479 
1480 #ifdef HAVE_MLX5_HWS_SUPPORT
1481 
1482 static int
1483 mlx5_traffic_enable_hws(struct rte_eth_dev *dev)
1484 {
1485 	struct mlx5_priv *priv = dev->data->dev_private;
1486 	struct mlx5_sh_config *config = &priv->sh->config;
1487 	uint64_t flags = 0;
1488 	unsigned int i;
1489 	int ret;
1490 
1491 	/*
1492 	 * With extended metadata enabled, the Tx metadata copy is handled by default
1493 	 * Tx tagging flow rules, so the default Tx flow rule is not needed. It
1494 	 * is only required when representor matching is disabled.
1495 	 */
1496 	if (config->dv_esw_en &&
1497 	    !config->repr_matching &&
1498 	    config->dv_xmeta_en == MLX5_XMETA_MODE_META32_HWS &&
1499 	    priv->master) {
1500 		if (mlx5_flow_hw_create_tx_default_mreg_copy_flow(dev))
1501 			goto error;
1502 	}
1503 	for (i = 0; i < priv->txqs_n; ++i) {
1504 		struct mlx5_txq_ctrl *txq = mlx5_txq_get(dev, i);
1505 		uint32_t queue;
1506 
1507 		if (!txq)
1508 			continue;
1509 		queue = mlx5_txq_get_sqn(txq);
1510 		if ((priv->representor || priv->master) &&
1511 		    config->dv_esw_en &&
1512 		    config->fdb_def_rule) {
1513 			if (mlx5_flow_hw_esw_create_sq_miss_flow(dev, queue, false)) {
1514 				mlx5_txq_release(dev, i);
1515 				goto error;
1516 			}
1517 		}
1518 		if (config->dv_esw_en && config->repr_matching) {
1519 			if (mlx5_flow_hw_tx_repr_matching_flow(dev, queue, false)) {
1520 				mlx5_txq_release(dev, i);
1521 				goto error;
1522 			}
1523 		}
1524 		mlx5_txq_release(dev, i);
1525 	}
1526 	if (config->fdb_def_rule) {
1527 		if ((priv->master || priv->representor) && config->dv_esw_en) {
1528 			if (!mlx5_flow_hw_esw_create_default_jump_flow(dev))
1529 				priv->fdb_def_rule = 1;
1530 			else
1531 				goto error;
1532 		}
1533 	} else {
1534 		DRV_LOG(INFO, "port %u FDB default rule is disabled", dev->data->port_id);
1535 	}
1536 	if (priv->isolated)
1537 		return 0;
1538 	if (!priv->sh->config.lacp_by_user && priv->pf_bond >= 0 && priv->master)
1539 		if (mlx5_flow_hw_lacp_rx_flow(dev))
1540 			goto error;
1541 	if (dev->data->promiscuous)
1542 		flags |= MLX5_CTRL_PROMISCUOUS;
1543 	if (dev->data->all_multicast)
1544 		flags |= MLX5_CTRL_ALL_MULTICAST;
1545 	else
1546 		flags |= MLX5_CTRL_BROADCAST | MLX5_CTRL_IPV4_MULTICAST | MLX5_CTRL_IPV6_MULTICAST;
1547 	flags |= MLX5_CTRL_DMAC;
1548 	if (priv->vlan_filter_n)
1549 		flags |= MLX5_CTRL_VLAN_FILTER;
1550 	return mlx5_flow_hw_ctrl_flows(dev, flags);
1551 error:
1552 	ret = rte_errno;
1553 	mlx5_flow_hw_flush_ctrl_flows(dev);
1554 	rte_errno = ret;
1555 	return -rte_errno;
1556 }
1557 
1558 #endif
1559 
1560 /**
1561  * Enable traffic flows configured by control plane
1562  *
1563  * @param dev
1564  *   Pointer to Ethernet device structure.
1565  *
1566  * @return
1567  *   0 on success, a negative errno value otherwise and rte_errno is set.
1568  */
1569 int
1570 mlx5_traffic_enable(struct rte_eth_dev *dev)
1571 {
1572 	struct mlx5_priv *priv = dev->data->dev_private;
1573 	struct rte_flow_item_eth bcast = {
1574 		.hdr.dst_addr.addr_bytes = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
1575 	};
1576 	struct rte_flow_item_eth ipv6_multi_spec = {
1577 		.hdr.dst_addr.addr_bytes = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 },
1578 	};
1579 	struct rte_flow_item_eth ipv6_multi_mask = {
1580 		.hdr.dst_addr.addr_bytes = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
1581 	};
1582 	struct rte_flow_item_eth unicast = {
1583 		.hdr.src_addr.addr_bytes = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
1584 	};
1585 	struct rte_flow_item_eth unicast_mask = {
1586 		.hdr.dst_addr.addr_bytes = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
1587 	};
1588 	const unsigned int vlan_filter_n = priv->vlan_filter_n;
1589 	const struct rte_ether_addr cmp = {
1590 		.addr_bytes = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
1591 	};
1592 	unsigned int i;
1593 	unsigned int j;
1594 	int ret;
1595 
1596 #ifdef HAVE_MLX5_HWS_SUPPORT
1597 	if (priv->sh->config.dv_flow_en == 2)
1598 		return mlx5_traffic_enable_hws(dev);
1599 #endif
1600 	/*
1601 	 * The hairpin Tx queue default flow should be created regardless of
1602 	 * isolation mode. Otherwise, all packets to be sent would go out
1603 	 * directly without the Tx flow actions, e.g. encapsulation.
1604 	 */
1605 	for (i = 0; i != priv->txqs_n; ++i) {
1606 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
1607 		if (!txq_ctrl)
1608 			continue;
1609 		/* Only Tx implicit mode requires the default Tx flow. */
1610 		if (txq_ctrl->is_hairpin &&
1611 		    txq_ctrl->hairpin_conf.tx_explicit == 0 &&
1612 		    txq_ctrl->hairpin_conf.peers[0].port ==
1613 		    priv->dev_data->port_id) {
1614 			ret = mlx5_ctrl_flow_source_queue(dev,
1615 					mlx5_txq_get_sqn(txq_ctrl));
1616 			if (ret) {
1617 				mlx5_txq_release(dev, i);
1618 				goto error;
1619 			}
1620 		}
1621 		if (priv->sh->config.dv_esw_en) {
1622 			uint32_t q = mlx5_txq_get_sqn(txq_ctrl);
1623 
1624 			if (mlx5_flow_create_devx_sq_miss_flow(dev, q) == 0) {
1625 				mlx5_txq_release(dev, i);
1626 				DRV_LOG(ERR,
1627 					"Port %u Tx queue %u SQ create representor devx default miss rule failed.",
1628 					dev->data->port_id, i);
1629 				goto error;
1630 			}
1631 		}
1632 		mlx5_txq_release(dev, i);
1633 	}
1634 	if (priv->sh->config.fdb_def_rule) {
1635 		if (priv->sh->config.dv_esw_en) {
1636 			if (mlx5_flow_create_esw_table_zero_flow(dev))
1637 				priv->fdb_def_rule = 1;
1638 			else
1639 				DRV_LOG(INFO, "port %u FDB default rule cannot be configured - only Eswitch group 0 flows are supported.",
1640 					dev->data->port_id);
1641 		}
1642 	} else {
1643 		DRV_LOG(INFO, "port %u FDB default rule is disabled",
1644 			dev->data->port_id);
1645 	}
1646 	if (!priv->sh->config.lacp_by_user && priv->pf_bond >= 0 && priv->master) {
1647 		ret = mlx5_flow_lacp_miss(dev);
1648 		if (ret)
1649 			DRV_LOG(INFO, "port %u LACP rule cannot be created - "
1650 				"forward LACP to kernel.", dev->data->port_id);
1651 		else
1652 			DRV_LOG(INFO, "LACP traffic will be missed in port %u.",
1653 				dev->data->port_id);
1654 	}
1655 	if (priv->isolated)
1656 		return 0;
1657 	if (dev->data->promiscuous) {
1658 		struct rte_flow_item_eth promisc = {
1659 			.hdr.dst_addr.addr_bytes = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
1660 			.hdr.src_addr.addr_bytes = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
1661 			.hdr.ether_type = 0,
1662 		};
1663 
1664 		ret = mlx5_ctrl_flow(dev, &promisc, &promisc);
1665 		if (ret)
1666 			goto error;
1667 	}
1668 	if (dev->data->all_multicast) {
1669 		struct rte_flow_item_eth multicast = {
1670 			.hdr.dst_addr.addr_bytes = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 },
1671 			.hdr.src_addr.addr_bytes = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
1672 			.hdr.ether_type = 0,
1673 		};
1674 
1675 		ret = mlx5_ctrl_flow(dev, &multicast, &multicast);
1676 		if (ret)
1677 			goto error;
1678 	} else {
1679 		/* Add broadcast/multicast flows. */
1680 		for (i = 0; i != vlan_filter_n; ++i) {
1681 			uint16_t vlan = priv->vlan_filter[i];
1682 
1683 			struct rte_flow_item_vlan vlan_spec = {
1684 				.hdr.vlan_tci = rte_cpu_to_be_16(vlan),
1685 			};
1686 			struct rte_flow_item_vlan vlan_mask =
1687 				rte_flow_item_vlan_mask;
1688 
1689 			ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast,
1690 						  &vlan_spec, &vlan_mask);
1691 			if (ret)
1692 				goto error;
1693 			ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec,
1694 						  &ipv6_multi_mask,
1695 						  &vlan_spec, &vlan_mask);
1696 			if (ret)
1697 				goto error;
1698 		}
1699 		if (!vlan_filter_n) {
1700 			ret = mlx5_ctrl_flow(dev, &bcast, &bcast);
1701 			if (ret)
1702 				goto error;
1703 			ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec,
1704 					     &ipv6_multi_mask);
1705 			if (ret) {
1706 				/* Do not fail on IPv6 multicast creation failure. */
1707 				DRV_LOG(WARNING,
1708 					"IPv6 multicast is not supported");
1709 				ret = 0;
1710 			}
1711 		}
1712 	}
1713 	/* Add MAC address flows. */
1714 	for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
1715 		struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
1716 
1717 		if (!memcmp(mac, &cmp, sizeof(*mac)))
1718 			continue;
1719 		memcpy(&unicast.hdr.dst_addr.addr_bytes,
1720 		       mac->addr_bytes,
1721 		       RTE_ETHER_ADDR_LEN);
1722 		for (j = 0; j != vlan_filter_n; ++j) {
1723 			uint16_t vlan = priv->vlan_filter[j];
1724 
1725 			struct rte_flow_item_vlan vlan_spec = {
1726 				.hdr.vlan_tci = rte_cpu_to_be_16(vlan),
1727 			};
1728 			struct rte_flow_item_vlan vlan_mask =
1729 				rte_flow_item_vlan_mask;
1730 
1731 			ret = mlx5_ctrl_flow_vlan(dev, &unicast,
1732 						  &unicast_mask,
1733 						  &vlan_spec,
1734 						  &vlan_mask);
1735 			if (ret)
1736 				goto error;
1737 		}
1738 		if (!vlan_filter_n) {
1739 			ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask);
1740 			if (ret)
1741 				goto error;
1742 		}
1743 	}
1744 	return 0;
1745 error:
1746 	ret = rte_errno; /* Save rte_errno before cleanup. */
1747 	mlx5_traffic_disable_legacy(dev);
1748 	rte_errno = ret; /* Restore rte_errno. */
1749 	return -rte_errno;
1750 }
1751 
1752 static void
1753 mlx5_traffic_disable_legacy(struct rte_eth_dev *dev)
1754 {
1755 	struct mlx5_priv *priv = dev->data->dev_private;
1756 	struct mlx5_ctrl_flow_entry *entry;
1757 	struct mlx5_ctrl_flow_entry *tmp;
1758 
1759 	/*
1760 	 * Free registered control flow rules first,
1761 	 * to free the memory allocated for list entries
1762 	 */
1763 	entry = LIST_FIRST(&priv->hw_ctrl_flows);
1764 	while (entry != NULL) {
1765 		tmp = LIST_NEXT(entry, next);
1766 		mlx5_legacy_ctrl_flow_destroy(dev, entry);
1767 		entry = tmp;
1768 	}
1769 
1770 	mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_CTL, false);
1771 }
1772 
1773 /**
1774  * Disable traffic flows configured by the control plane.
1775  *
1776  * @param dev
1777  *   Pointer to Ethernet device structure.
1778  */
1779 void
1780 mlx5_traffic_disable(struct rte_eth_dev *dev)
1781 {
1782 #ifdef HAVE_MLX5_HWS_SUPPORT
1783 	struct mlx5_priv *priv = dev->data->dev_private;
1784 
1785 	if (priv->sh->config.dv_flow_en == 2)
1786 		mlx5_flow_hw_flush_ctrl_flows(dev);
1787 	else
1788 #endif
1789 		mlx5_traffic_disable_legacy(dev);
1790 }
1791 
1792 /**
1793  * Restart traffic flows configured by the control plane.
1794  *
1795  * @param dev
1796  *   Pointer to Ethernet device structure.
1797  *
1798  * @return
1799  *   0 on success, a negative errno value otherwise and rte_errno is set.
1800  */
1801 int
1802 mlx5_traffic_restart(struct rte_eth_dev *dev)
1803 {
1804 	if (dev->data->dev_started) {
1805 		mlx5_traffic_disable(dev);
1806 #ifdef HAVE_MLX5_HWS_SUPPORT
1807 		mlx5_flow_hw_cleanup_ctrl_rx_templates(dev);
1808 #endif
1809 		return mlx5_traffic_enable(dev);
1810 	}
1811 	return 0;
1812 }
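
/*
 * Illustrative sketch (editor's example, not part of this file): a rxmode
 * callback is expected to update the flag in dev->data first and only then
 * rebuild the control flow rules through mlx5_traffic_restart(), since
 * mlx5_traffic_enable() reads dev->data->promiscuous. The callback name and
 * the rollback policy below are assumptions for illustration only.
 */
#if 0
static int
example_promiscuous_enable(struct rte_eth_dev *dev)
{
	int ret;

	dev->data->promiscuous = 1;
	/* Re-create all control flows with the new promiscuous setting. */
	ret = mlx5_traffic_restart(dev);
	if (ret != 0)
		dev->data->promiscuous = 0; /* Roll back on failure. */
	return ret;
}
#endif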
1813 
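/**
 * Check whether unicast DMAC control flow rules have to be maintained.
 *
 * Such rules are needed only when the port is started and receives traffic
 * selectively, i.e. it is neither in promiscuous nor in isolated mode.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   true if MAC/VLAN control flow rules must be updated, false otherwise.
 */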
1814 static bool
1815 mac_flows_update_needed(struct rte_eth_dev *dev)
1816 {
1817 	struct mlx5_priv *priv = dev->data->dev_private;
1818 
1819 	if (!dev->data->dev_started)
1820 		return false;
1821 	if (dev->data->promiscuous)
1822 		return false;
1823 	if (priv->isolated)
1824 		return false;
1825 
1826 	return true;
1827 }
1828 
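/*
 * The helpers below dispatch creation/destruction of unicast DMAC
 * (and DMAC + VLAN) control flow rules either to the HW steering engine
 * when dv_flow_en == 2, or to the legacy flow engine otherwise.
 */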
1829 static int
1830 traffic_dmac_create(struct rte_eth_dev *dev, const struct rte_ether_addr *addr)
1831 {
1832 	struct mlx5_priv *priv = dev->data->dev_private;
1833 
1834 	if (priv->sh->config.dv_flow_en == 2)
1835 		return mlx5_flow_hw_ctrl_flow_dmac(dev, addr);
1836 	else
1837 		return mlx5_legacy_dmac_flow_create(dev, addr);
1838 }
1839 
1840 static int
1841 traffic_dmac_destroy(struct rte_eth_dev *dev, const struct rte_ether_addr *addr)
1842 {
1843 	struct mlx5_priv *priv = dev->data->dev_private;
1844 
1845 	if (priv->sh->config.dv_flow_en == 2)
1846 		return mlx5_flow_hw_ctrl_flow_dmac_destroy(dev, addr);
1847 	else
1848 		return mlx5_legacy_dmac_flow_destroy(dev, addr);
1849 }
1850 
1851 static int
1852 traffic_dmac_vlan_create(struct rte_eth_dev *dev,
1853 			 const struct rte_ether_addr *addr,
1854 			 const uint16_t vid)
1855 {
1856 	struct mlx5_priv *priv = dev->data->dev_private;
1857 
1858 	if (priv->sh->config.dv_flow_en == 2)
1859 		return mlx5_flow_hw_ctrl_flow_dmac_vlan(dev, addr, vid);
1860 	else
1861 		return mlx5_legacy_dmac_vlan_flow_create(dev, addr, vid);
1862 }
1863 
1864 static int
1865 traffic_dmac_vlan_destroy(struct rte_eth_dev *dev,
1866 			 const struct rte_ether_addr *addr,
1867 			 const uint16_t vid)
1868 {
1869 	struct mlx5_priv *priv = dev->data->dev_private;
1870 
1871 	if (priv->sh->config.dv_flow_en == 2)
1872 		return mlx5_flow_hw_ctrl_flow_dmac_vlan_destroy(dev, addr, vid);
1873 	else
1874 		return mlx5_legacy_dmac_vlan_flow_destroy(dev, addr, vid);
1875 }
1876 
1877 /**
1878  * Adjust Rx control flow rules to allow traffic on provided MAC address.
1879  */
1880 int
1881 mlx5_traffic_mac_add(struct rte_eth_dev *dev, const struct rte_ether_addr *addr)
1882 {
1883 	struct mlx5_priv *priv = dev->data->dev_private;
1884 
1885 	if (!mac_flows_update_needed(dev))
1886 		return 0;
1887 
1888 	if (priv->vlan_filter_n > 0) {
1889 		unsigned int i;
1890 
1891 		for (i = 0; i < priv->vlan_filter_n; ++i) {
1892 			uint16_t vlan = priv->vlan_filter[i];
1893 			int ret;
1894 
1895 			if (mlx5_ctrl_flow_uc_dmac_vlan_exists(dev, addr, vlan))
1896 				continue;
1897 
1898 			ret = traffic_dmac_vlan_create(dev, addr, vlan);
1899 			if (ret != 0)
1900 				return ret;
1901 		}
1902 
1903 		return 0;
1904 	}
1905 
1906 	if (mlx5_ctrl_flow_uc_dmac_exists(dev, addr))
1907 		return 0;
1908 
1909 	return traffic_dmac_create(dev, addr);
1910 }
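
/*
 * Illustrative sketch (editor's example, not part of this file): a MAC
 * address add callback would first store the address in dev->data->mac_addrs
 * and only then extend the Rx control flow rules for it. The callback name
 * is an assumption for illustration only.
 */
#if 0
static int
example_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
		     uint32_t index, uint32_t vmdq __rte_unused)
{
	dev->data->mac_addrs[index] = *mac;
	/* No-op when the port is stopped, promiscuous or isolated. */
	return mlx5_traffic_mac_add(dev, mac);
}
#endif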
1911 
1912 /**
1913  * Adjust Rx control flow rules to disallow traffic with removed MAC address.
1914  */
1915 int
1916 mlx5_traffic_mac_remove(struct rte_eth_dev *dev, const struct rte_ether_addr *addr)
1917 {
1918 	struct mlx5_priv *priv = dev->data->dev_private;
1919 
1920 	if (!mac_flows_update_needed(dev))
1921 		return 0;
1922 
1923 	if (priv->vlan_filter_n > 0) {
1924 		unsigned int i;
1925 
1926 		for (i = 0; i < priv->vlan_filter_n; ++i) {
1927 			uint16_t vlan = priv->vlan_filter[i];
1928 			int ret;
1929 
1930 			if (!mlx5_ctrl_flow_uc_dmac_vlan_exists(dev, addr, vlan))
1931 				continue;
1932 
1933 			ret = traffic_dmac_vlan_destroy(dev, addr, vlan);
1934 			if (ret != 0)
1935 				return ret;
1936 		}
1937 
1938 		return 0;
1939 	}
1940 
1941 	if (!mlx5_ctrl_flow_uc_dmac_exists(dev, addr))
1942 		return 0;
1943 
1944 	return traffic_dmac_destroy(dev, addr);
1945 }
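
/*
 * Illustrative sketch (editor's example, not part of this file): the removal
 * path has to capture the address before the slot in dev->data->mac_addrs is
 * cleared, otherwise there is nothing left to match the rules against. The
 * callback name is an assumption for illustration only.
 */
#if 0
static void
example_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
{
	struct rte_ether_addr addr = dev->data->mac_addrs[index];

	memset(&dev->data->mac_addrs[index], 0, sizeof(addr));
	/* Drop the Rx control flow rules that were matching this address. */
	(void)mlx5_traffic_mac_remove(dev, &addr);
}
#endif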
1946 
1947 /**
1948  * Adjust Rx control flow rules to allow traffic on provided VLAN.
1949  *
1950  * Assumptions:
1951  * - Called when VLAN is added.
1952  * - At least one VLAN is enabled before the function call.
1953  *
1954  * This function assumes that the VLAN is new and was not included in
1955  * the Rx control flow rules set up before calling it.
1956  */
1957 int
1958 mlx5_traffic_vlan_add(struct rte_eth_dev *dev, const uint16_t vid)
1959 {
1960 	struct mlx5_priv *priv = dev->data->dev_private;
1961 	unsigned int i;
1962 	int ret;
1963 
1964 	if (!mac_flows_update_needed(dev))
1965 		return 0;
1966 
1967 	/* Add all unicast DMAC flow rules with new VLAN attached. */
1968 	for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
1969 		struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
1970 
1971 		if (rte_is_zero_ether_addr(mac))
1972 			continue;
1973 
1974 		ret = traffic_dmac_vlan_create(dev, mac, vid);
1975 		if (ret != 0)
1976 			return ret;
1977 	}
1978 
1979 	if (priv->vlan_filter_n == 1) {
1980 		/*
1981 		 * First VLAN added: the plain (VLAN-less) unicast DMAC rules are now obsolete.
1982 		 * Remove them only after the VLAN rules above were created, so no traffic "gap" is introduced.
1983 		 */
1984 
1985 		for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
1986 			struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
1987 
1988 			if (rte_is_zero_ether_addr(mac))
1989 				continue;
1990 
1991 			ret = traffic_dmac_destroy(dev, mac);
1992 			if (ret != 0)
1993 				return ret;
1994 		}
1995 	}
1996 
1997 	return 0;
1998 }
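
/*
 * Worked example of the ordering above (editor's note): with one MAC M and
 * VLAN 100 being the first VLAN enabled, the DMAC(M) + VLAN(100) rule is
 * created first and only then the plain DMAC(M) rule is destroyed, so
 * traffic matching M keeps flowing at every point in time.
 */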
1999 
2000 /**
2001  * Adjust Rx control flow rules to disallow traffic with removed VLAN.
2002  *
2003  * Assumptions:
2004  *
2005  * - The VLAN was already removed from the VLAN filter before this call.
2006  */
2007 int
2008 mlx5_traffic_vlan_remove(struct rte_eth_dev *dev, const uint16_t vid)
2009 {
2010 	struct mlx5_priv *priv = dev->data->dev_private;
2011 	unsigned int i;
2012 	int ret;
2013 
2014 	if (!mac_flows_update_needed(dev))
2015 		return 0;
2016 
2017 	if (priv->vlan_filter_n == 0) {
2018 		/*
2019 		 * If there are no VLANs as a result, unicast DMAC flow rules must be recreated.
2020 		 * Recreate them first, before removing the VLAN rules below, so no traffic "gap" is introduced.
2021 		 */
2022 
2023 		for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
2024 			struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
2025 
2026 			if (rte_is_zero_ether_addr(mac))
2027 				continue;
2028 
2029 			ret = traffic_dmac_create(dev, mac);
2030 			if (ret != 0)
2031 				return ret;
2032 		}
2033 	}
2034 
2035 	/* Remove all unicast DMAC flow rules with this VLAN. */
2036 	for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
2037 		struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
2038 
2039 		if (rte_is_zero_ether_addr(mac))
2040 			continue;
2041 
2042 		ret = traffic_dmac_vlan_destroy(dev, mac, vid);
2043 		if (ret != 0)
2044 			return ret;
2045 	}
2046 
2047 	return 0;
2048 }
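
/*
 * Illustrative sketch (editor's example, not part of this file): a VLAN
 * filter callback is expected to update priv->vlan_filter[] first and then
 * call mlx5_traffic_vlan_add()/mlx5_traffic_vlan_remove(), matching the
 * assumptions documented above. The helpers vlan_filter_insert() and
 * vlan_filter_erase() are hypothetical placeholders.
 */
#if 0
/* Hypothetical helpers that only update the priv->vlan_filter[] bookkeeping. */
static int vlan_filter_insert(struct rte_eth_dev *dev, uint16_t vlan_id);
static int vlan_filter_erase(struct rte_eth_dev *dev, uint16_t vlan_id);

static int
example_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
{
	int ret;

	if (on) {
		ret = vlan_filter_insert(dev, vlan_id);
		if (ret == 0)
			ret = mlx5_traffic_vlan_add(dev, vlan_id);
	} else {
		ret = vlan_filter_erase(dev, vlan_id);
		if (ret == 0)
			ret = mlx5_traffic_vlan_remove(dev, vlan_id);
	}
	return ret;
}
#endif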
2049