xref: /dpdk/drivers/net/mlx5/mlx5_rxq.c (revision cc9ecbb48ee3a8fb80df6c470141260df3eacec0)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <stddef.h>
7 #include <assert.h>
8 #include <errno.h>
9 #include <string.h>
10 #include <stdint.h>
11 #include <fcntl.h>
12 #include <sys/queue.h>
13 
14 /* Verbs header. */
15 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
16 #ifdef PEDANTIC
17 #pragma GCC diagnostic ignored "-Wpedantic"
18 #endif
19 #include <infiniband/verbs.h>
20 #include <infiniband/mlx5dv.h>
21 #ifdef PEDANTIC
22 #pragma GCC diagnostic error "-Wpedantic"
23 #endif
24 
25 #include <rte_mbuf.h>
26 #include <rte_malloc.h>
27 #include <rte_ethdev_driver.h>
28 #include <rte_common.h>
29 #include <rte_interrupts.h>
30 #include <rte_debug.h>
31 #include <rte_io.h>
32 
33 #include "mlx5.h"
34 #include "mlx5_rxtx.h"
35 #include "mlx5_utils.h"
36 #include "mlx5_autoconf.h"
37 #include "mlx5_defs.h"
38 #include "mlx5_glue.h"
39 
40 /* Default RSS hash key also used for ConnectX-3. */
41 uint8_t rss_hash_default_key[] = {
42 	0x2c, 0xc6, 0x81, 0xd1,
43 	0x5b, 0xdb, 0xf4, 0xf7,
44 	0xfc, 0xa2, 0x83, 0x19,
45 	0xdb, 0x1a, 0x3e, 0x94,
46 	0x6b, 0x9e, 0x38, 0xd9,
47 	0x2c, 0x9c, 0x03, 0xd1,
48 	0xad, 0x99, 0x44, 0xa7,
49 	0xd9, 0x56, 0x3d, 0x59,
50 	0x06, 0x3c, 0x25, 0xf3,
51 	0xfc, 0x1f, 0xdc, 0x2a,
52 };
53 
54 /* Length of the default RSS hash key. */
55 static_assert(MLX5_RSS_HASH_KEY_LEN ==
56 	      (unsigned int)sizeof(rss_hash_default_key),
57 	      "wrong RSS default key size.");
58 
59 /**
60  * Check whether Multi-Packet RQ can be enabled for the device.
61  *
62  * @param dev
63  *   Pointer to Ethernet device.
64  *
65  * @return
66  *   1 if supported, negative errno value if not.
67  */
68 inline int
69 mlx5_check_mprq_support(struct rte_eth_dev *dev)
70 {
71 	struct priv *priv = dev->data->dev_private;
72 
73 	if (priv->config.mprq.enabled &&
74 	    priv->rxqs_n >= priv->config.mprq.min_rxqs_num)
75 		return 1;
76 	return -ENOTSUP;
77 }
78 
79 /**
80  * Check whether Multi-Packet RQ is enabled for the Rx queue.
81  *
82  * @param rxq
83  *   Pointer to receive queue structure.
84  *
85  * @return
86  *   0 if disabled, otherwise enabled.
87  */
88 inline int
89 mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq)
90 {
91 	return rxq->strd_num_n > 0;
92 }
93 
94 /**
95  * Check whether Multi-Packet RQ is enabled for the device.
96  *
97  * @param dev
98  *   Pointer to Ethernet device.
99  *
100  * @return
101  *   0 if disabled, otherwise enabled.
102  */
103 inline int
104 mlx5_mprq_enabled(struct rte_eth_dev *dev)
105 {
106 	struct priv *priv = dev->data->dev_private;
107 	uint16_t i;
108 	uint16_t n = 0;
109 
110 	if (mlx5_check_mprq_support(dev) < 0)
111 		return 0;
112 	/* All the configured queues should be enabled. */
113 	for (i = 0; i < priv->rxqs_n; ++i) {
114 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
115 
116 		if (!rxq)
117 			continue;
118 		if (mlx5_rxq_mprq_enabled(rxq))
119 			++n;
120 	}
121 	/* Multi-Packet RQ can't be partially configured. */
122 	assert(n == 0 || n == priv->rxqs_n);
123 	return n == priv->rxqs_n;
124 }
125 
126 /**
127  * Allocate RX queue elements for Multi-Packet RQ.
128  *
129  * @param rxq_ctrl
130  *   Pointer to RX queue structure.
131  *
132  * @return
133  *   0 on success, a negative errno value otherwise and rte_errno is set.
134  */
135 static int
136 rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
137 {
138 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
139 	unsigned int wqe_n = 1 << rxq->elts_n;
140 	unsigned int i;
141 	int err;
142 
143 	/* Allocate one buffer per WQE, plus one spare for replacement. */
144 	for (i = 0; i <= wqe_n; ++i) {
145 		struct mlx5_mprq_buf *buf;
146 
147 		if (rte_mempool_get(rxq->mprq_mp, (void **)&buf) < 0) {
148 			DRV_LOG(ERR, "port %u empty mbuf pool", rxq->port_id);
149 			rte_errno = ENOMEM;
150 			goto error;
151 		}
152 		if (i < wqe_n)
153 			(*rxq->mprq_bufs)[i] = buf;
154 		else
155 			rxq->mprq_repl = buf;
156 	}
157 	DRV_LOG(DEBUG,
158 		"port %u Rx queue %u allocated and configured %u segments",
159 		rxq->port_id, rxq_ctrl->idx, wqe_n);
160 	return 0;
161 error:
162 	err = rte_errno; /* Save rte_errno before cleanup. */
163 	wqe_n = i;
164 	for (i = 0; (i != wqe_n); ++i) {
165 		if ((*rxq->mprq_bufs)[i] != NULL)
166 			rte_mempool_put(rxq->mprq_mp,
167 					(*rxq->mprq_bufs)[i]);
168 		(*rxq->mprq_bufs)[i] = NULL;
169 	}
170 	DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
171 		rxq->port_id, rxq_ctrl->idx);
172 	rte_errno = err; /* Restore rte_errno. */
173 	return -rte_errno;
174 }
175 
176 /**
177  * Allocate RX queue elements for Single-Packet RQ.
178  *
179  * @param rxq_ctrl
180  *   Pointer to RX queue structure.
181  *
182  * @return
183  *   0 on success, a negative errno value otherwise and rte_errno is set.
184  */
185 static int
186 rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
187 {
188 	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
189 	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
190 	unsigned int i;
191 	int err;
192 
193 	/* Iterate on segments. */
194 	for (i = 0; (i != elts_n); ++i) {
195 		struct rte_mbuf *buf;
196 
197 		buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
198 		if (buf == NULL) {
199 			DRV_LOG(ERR, "port %u empty mbuf pool",
200 				PORT_ID(rxq_ctrl->priv));
201 			rte_errno = ENOMEM;
202 			goto error;
203 		}
204 		/* Headroom is reserved by rte_pktmbuf_alloc(). */
205 		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
206 		/* Buffer is supposed to be empty. */
207 		assert(rte_pktmbuf_data_len(buf) == 0);
208 		assert(rte_pktmbuf_pkt_len(buf) == 0);
209 		assert(!buf->next);
210 		/* Only the first segment keeps headroom. */
211 		if (i % sges_n)
212 			SET_DATA_OFF(buf, 0);
213 		PORT(buf) = rxq_ctrl->rxq.port_id;
214 		DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
215 		PKT_LEN(buf) = DATA_LEN(buf);
216 		NB_SEGS(buf) = 1;
217 		(*rxq_ctrl->rxq.elts)[i] = buf;
218 	}
219 	/* If Rx vector is activated. */
220 	if (mlx5_rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
221 		struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
222 		struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
223 		int j;
224 
225 		/* Initialize default rearm_data for vPMD. */
226 		mbuf_init->data_off = RTE_PKTMBUF_HEADROOM;
227 		rte_mbuf_refcnt_set(mbuf_init, 1);
228 		mbuf_init->nb_segs = 1;
229 		mbuf_init->port = rxq->port_id;
230 		/*
231 		 * prevent compiler reordering:
232 		 * rearm_data covers previous fields.
233 		 */
234 		rte_compiler_barrier();
235 		rxq->mbuf_initializer =
236 			*(uint64_t *)&mbuf_init->rearm_data;
237 		/* Padding with a fake mbuf for vectorized Rx. */
238 		for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
239 			(*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
240 	}
241 	DRV_LOG(DEBUG,
242 		"port %u Rx queue %u allocated and configured %u segments"
243 		" (max %u packets)",
244 		PORT_ID(rxq_ctrl->priv), rxq_ctrl->idx, elts_n,
245 		elts_n / (1 << rxq_ctrl->rxq.sges_n));
246 	return 0;
247 error:
248 	err = rte_errno; /* Save rte_errno before cleanup. */
249 	elts_n = i;
250 	for (i = 0; (i != elts_n); ++i) {
251 		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
252 			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
253 		(*rxq_ctrl->rxq.elts)[i] = NULL;
254 	}
255 	DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
256 		PORT_ID(rxq_ctrl->priv), rxq_ctrl->idx);
257 	rte_errno = err; /* Restore rte_errno. */
258 	return -rte_errno;
259 }
260 
261 /**
262  * Allocate RX queue elements.
263  *
264  * @param rxq_ctrl
265  *   Pointer to RX queue structure.
266  *
267  * @return
268  *   0 on success, a negative errno value otherwise and rte_errno is set.
269  */
270 int
271 rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
272 {
273 	return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
274 	       rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl);
275 }
276 
277 /**
278  * Free RX queue elements for Multi-Packet RQ.
279  *
280  * @param rxq_ctrl
281  *   Pointer to RX queue structure.
282  */
283 static void
284 rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
285 {
286 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
287 	uint16_t i;
288 
289 	DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing WRs",
290 		rxq->port_id, rxq_ctrl->idx);
291 	if (rxq->mprq_bufs == NULL)
292 		return;
293 	assert(mlx5_rxq_check_vec_support(rxq) < 0);
294 	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
295 		if ((*rxq->mprq_bufs)[i] != NULL)
296 			mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]);
297 		(*rxq->mprq_bufs)[i] = NULL;
298 	}
299 	if (rxq->mprq_repl != NULL) {
300 		mlx5_mprq_buf_free(rxq->mprq_repl);
301 		rxq->mprq_repl = NULL;
302 	}
303 }
304 
305 /**
306  * Free RX queue elements for Single-Packet RQ.
307  *
308  * @param rxq_ctrl
309  *   Pointer to RX queue structure.
310  */
311 static void
312 rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
313 {
314 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
315 	const uint16_t q_n = (1 << rxq->elts_n);
316 	const uint16_t q_mask = q_n - 1;
317 	uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
318 	uint16_t i;
319 
320 	DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs",
321 		PORT_ID(rxq_ctrl->priv), rxq_ctrl->idx);
322 	if (rxq->elts == NULL)
323 		return;
324 	/*
325 	 * Some mbufs in the ring still belong to the application;
326 	 * they cannot be freed.
327 	 */
328 	if (mlx5_rxq_check_vec_support(rxq) > 0) {
329 		for (i = 0; i < used; ++i)
330 			(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
331 		rxq->rq_pi = rxq->rq_ci;
332 	}
333 	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
334 		if ((*rxq->elts)[i] != NULL)
335 			rte_pktmbuf_free_seg((*rxq->elts)[i]);
336 		(*rxq->elts)[i] = NULL;
337 	}
338 }
339 
340 /**
341  * Free RX queue elements.
342  *
343  * @param rxq_ctrl
344  *   Pointer to RX queue structure.
345  */
346 static void
347 rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
348 {
349 	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
350 		rxq_free_elts_mprq(rxq_ctrl);
351 	else
352 		rxq_free_elts_sprq(rxq_ctrl);
353 }
354 
355 /**
356  * Clean up a RX queue.
357  *
358  * Destroy objects, free allocated memory and reset the structure for reuse.
359  *
360  * @param rxq_ctrl
361  *   Pointer to RX queue structure.
362  */
363 void
364 mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl)
365 {
366 	DRV_LOG(DEBUG, "port %u cleaning up Rx queue %u",
367 		PORT_ID(rxq_ctrl->priv), rxq_ctrl->idx);
368 	if (rxq_ctrl->ibv)
369 		mlx5_rxq_ibv_release(rxq_ctrl->ibv);
370 	memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
371 }
372 
373 /**
374  * Returns the per-queue supported offloads.
375  *
376  * @param dev
377  *   Pointer to Ethernet device.
378  *
379  * @return
380  *   Supported Rx offloads.
381  */
382 uint64_t
383 mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev)
384 {
385 	struct priv *priv = dev->data->dev_private;
386 	struct mlx5_dev_config *config = &priv->config;
387 	uint64_t offloads = (DEV_RX_OFFLOAD_SCATTER |
388 			     DEV_RX_OFFLOAD_TIMESTAMP |
389 			     DEV_RX_OFFLOAD_JUMBO_FRAME);
390 
391 	offloads |= DEV_RX_OFFLOAD_CRC_STRIP;
392 	if (config->hw_fcs_strip)
393 		offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
394 
395 	if (config->hw_csum)
396 		offloads |= (DEV_RX_OFFLOAD_IPV4_CKSUM |
397 			     DEV_RX_OFFLOAD_UDP_CKSUM |
398 			     DEV_RX_OFFLOAD_TCP_CKSUM);
399 	if (config->hw_vlan_strip)
400 		offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
401 	return offloads;
402 }
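
/*
 * These flags are reported to applications through rte_eth_dev_info_get();
 * a minimal sketch with a hypothetical port id, assuming the ethdev info
 * callback combines per-port and per-queue capabilities:
 *
 *   struct rte_eth_dev_info info;
 *
 *   rte_eth_dev_info_get(port_id, &info);
 *
 * info.rx_queue_offload_capa then reflects mlx5_get_rx_queue_offloads() and
 * info.rx_offload_capa additionally includes mlx5_get_rx_port_offloads().
 */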
403 
404 
405 /**
406  * Returns the per-port supported offloads.
407  *
408  * @return
409  *   Supported Rx offloads.
410  */
411 uint64_t
412 mlx5_get_rx_port_offloads(void)
413 {
414 	uint64_t offloads = DEV_RX_OFFLOAD_VLAN_FILTER;
415 
416 	return offloads;
417 }
418 
419 /**
420  * DPDK callback to configure a RX queue.
421  * @param dev
422  *   Pointer to Ethernet device structure.
423  * @param idx
424  *   RX queue index.
425  * @param desc
426  *   Number of descriptors to configure in queue.
427  * @param socket
428  *   NUMA socket on which memory must be allocated.
429  * @param[in] conf
430  *   Thresholds parameters.
431  * @param mp
432  *   Memory pool for buffer allocations.
433  *
434  * @return
435  *   0 on success, a negative errno value otherwise and rte_errno is set.
436  */
437 int
438 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
439 		    unsigned int socket, const struct rte_eth_rxconf *conf,
440 		    struct rte_mempool *mp)
441 {
442 	struct priv *priv = dev->data->dev_private;
443 	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
444 	struct mlx5_rxq_ctrl *rxq_ctrl =
445 		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
446 
447 	if (!rte_is_power_of_2(desc)) {
448 		desc = 1 << log2above(desc);
449 		DRV_LOG(WARNING,
450 			"port %u increased number of descriptors in Rx queue %u"
451 			" to the next power of two (%d)",
452 			dev->data->port_id, idx, desc);
453 	}
454 	DRV_LOG(DEBUG, "port %u configuring Rx queue %u for %u descriptors",
455 		dev->data->port_id, idx, desc);
456 	if (idx >= priv->rxqs_n) {
457 		DRV_LOG(ERR, "port %u Rx queue index out of range (%u >= %u)",
458 			dev->data->port_id, idx, priv->rxqs_n);
459 		rte_errno = EOVERFLOW;
460 		return -rte_errno;
461 	}
462 	if (!mlx5_rxq_releasable(dev, idx)) {
463 		DRV_LOG(ERR, "port %u unable to release queue index %u",
464 			dev->data->port_id, idx);
465 		rte_errno = EBUSY;
466 		return -rte_errno;
467 	}
468 	mlx5_rxq_release(dev, idx);
469 	rxq_ctrl = mlx5_rxq_new(dev, idx, desc, socket, conf, mp);
470 	if (!rxq_ctrl) {
471 		DRV_LOG(ERR, "port %u unable to allocate queue index %u",
472 			dev->data->port_id, idx);
473 		rte_errno = ENOMEM;
474 		return -rte_errno;
475 	}
476 	DRV_LOG(DEBUG, "port %u adding Rx queue %u to list",
477 		dev->data->port_id, idx);
478 	(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
479 	return 0;
480 }
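
/*
 * Applications do not call mlx5_rx_queue_setup() directly; it is reached
 * through the generic ethdev API. A minimal sketch with hypothetical port id
 * and pool parameters:
 *
 *   struct rte_mempool *mp = rte_pktmbuf_pool_create("rx_pool", 8192, 256, 0,
 *                                                    RTE_MBUF_DEFAULT_BUF_SIZE,
 *                                                    rte_socket_id());
 *   int ret = rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
 *                                    NULL, mp);
 *
 * A NULL rte_eth_rxconf selects the defaults; a descriptor count that is not
 * a power of two is rounded up by the code above.
 */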
481 
482 /**
483  * DPDK callback to release a RX queue.
484  *
485  * @param dpdk_rxq
486  *   Generic RX queue pointer.
487  */
488 void
489 mlx5_rx_queue_release(void *dpdk_rxq)
490 {
491 	struct mlx5_rxq_data *rxq = (struct mlx5_rxq_data *)dpdk_rxq;
492 	struct mlx5_rxq_ctrl *rxq_ctrl;
493 	struct priv *priv;
494 
495 	if (rxq == NULL)
496 		return;
497 	rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
498 	priv = rxq_ctrl->priv;
499 	if (!mlx5_rxq_releasable(ETH_DEV(priv), rxq_ctrl->rxq.stats.idx))
500 		rte_panic("port %u Rx queue %u is still used by a flow and"
501 			  " cannot be removed\n",
502 			  PORT_ID(priv), rxq_ctrl->idx);
503 	mlx5_rxq_release(ETH_DEV(priv), rxq_ctrl->rxq.stats.idx);
504 }
505 
506 /**
507  * Allocate queue vector and fill epoll fd list for Rx interrupts.
508  *
509  * @param dev
510  *   Pointer to Ethernet device.
511  *
512  * @return
513  *   0 on success, a negative errno value otherwise and rte_errno is set.
514  */
515 int
516 mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev)
517 {
518 	struct priv *priv = dev->data->dev_private;
519 	unsigned int i;
520 	unsigned int rxqs_n = priv->rxqs_n;
521 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
522 	unsigned int count = 0;
523 	struct rte_intr_handle *intr_handle = dev->intr_handle;
524 
525 	if (!dev->data->dev_conf.intr_conf.rxq)
526 		return 0;
527 	mlx5_rx_intr_vec_disable(dev);
528 	intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0]));
529 	if (intr_handle->intr_vec == NULL) {
530 		DRV_LOG(ERR,
531 			"port %u failed to allocate memory for interrupt"
532 			" vector, Rx interrupts will not be supported",
533 			dev->data->port_id);
534 		rte_errno = ENOMEM;
535 		return -rte_errno;
536 	}
537 	intr_handle->type = RTE_INTR_HANDLE_EXT;
538 	for (i = 0; i != n; ++i) {
539 		/* This rxq ibv must not be released in this function. */
540 		struct mlx5_rxq_ibv *rxq_ibv = mlx5_rxq_ibv_get(dev, i);
541 		int fd;
542 		int flags;
543 		int rc;
544 
545 		/* Skip queues that cannot request interrupts. */
546 		if (!rxq_ibv || !rxq_ibv->channel) {
547 			/* Use invalid intr_vec[] index to disable entry. */
548 			intr_handle->intr_vec[i] =
549 				RTE_INTR_VEC_RXTX_OFFSET +
550 				RTE_MAX_RXTX_INTR_VEC_ID;
551 			continue;
552 		}
553 		if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
554 			DRV_LOG(ERR,
555 				"port %u too many Rx queues for interrupt"
556 				" vector size (%d), Rx interrupts cannot be"
557 				" enabled",
558 				dev->data->port_id, RTE_MAX_RXTX_INTR_VEC_ID);
559 			mlx5_rx_intr_vec_disable(dev);
560 			rte_errno = ENOMEM;
561 			return -rte_errno;
562 		}
563 		fd = rxq_ibv->channel->fd;
564 		flags = fcntl(fd, F_GETFL);
565 		rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
566 		if (rc < 0) {
567 			rte_errno = errno;
568 			DRV_LOG(ERR,
569 				"port %u failed to make Rx interrupt file"
570 				" descriptor %d non-blocking for queue index"
571 				" %d",
572 				dev->data->port_id, fd, i);
573 			mlx5_rx_intr_vec_disable(dev);
574 			return -rte_errno;
575 		}
576 		intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
577 		intr_handle->efds[count] = fd;
578 		count++;
579 	}
580 	if (!count)
581 		mlx5_rx_intr_vec_disable(dev);
582 	else
583 		intr_handle->nb_efd = count;
584 	return 0;
585 }
586 
587 /**
588  * Clean up Rx interrupts handler.
589  *
590  * @param dev
591  *   Pointer to Ethernet device.
592  */
593 void
594 mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev)
595 {
596 	struct priv *priv = dev->data->dev_private;
597 	struct rte_intr_handle *intr_handle = dev->intr_handle;
598 	unsigned int i;
599 	unsigned int rxqs_n = priv->rxqs_n;
600 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
601 
602 	if (!dev->data->dev_conf.intr_conf.rxq)
603 		return;
604 	if (!intr_handle->intr_vec)
605 		goto free;
606 	for (i = 0; i != n; ++i) {
607 		struct mlx5_rxq_ctrl *rxq_ctrl;
608 		struct mlx5_rxq_data *rxq_data;
609 
610 		if (intr_handle->intr_vec[i] == RTE_INTR_VEC_RXTX_OFFSET +
611 		    RTE_MAX_RXTX_INTR_VEC_ID)
612 			continue;
613 		/*
614 		 * Need to access the queue directly to release the reference
615 		 * kept in mlx5_rx_intr_vec_enable().
616 		 */
617 		rxq_data = (*priv->rxqs)[i];
618 		rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
619 		mlx5_rxq_ibv_release(rxq_ctrl->ibv);
620 	}
621 free:
622 	rte_intr_free_epoll_fd(intr_handle);
623 	if (intr_handle->intr_vec)
624 		free(intr_handle->intr_vec);
625 	intr_handle->nb_efd = 0;
626 	intr_handle->intr_vec = NULL;
627 }
628 
629 /**
630  * MLX5 CQ notification.
631  *
632  * @param rxq
633  *   Pointer to receive queue structure.
634  * @param sq_n_rxq
635  *   Sequence number per receive queue.
636  */
637 static inline void
638 mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq)
639 {
640 	int sq_n = 0;
641 	uint32_t doorbell_hi;
642 	uint64_t doorbell;
643 	void *cq_db_reg = (char *)rxq->cq_uar + MLX5_CQ_DOORBELL;
644 
645 	sq_n = sq_n_rxq & MLX5_CQ_SQN_MASK;
646 	doorbell_hi = sq_n << MLX5_CQ_SQN_OFFSET | (rxq->cq_ci & MLX5_CI_MASK);
647 	doorbell = (uint64_t)doorbell_hi << 32;
648 	doorbell |= rxq->cqn;
649 	rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
650 	mlx5_uar_write64(rte_cpu_to_be_64(doorbell),
651 			 cq_db_reg, rxq->uar_lock_cq);
652 }
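
/*
 * The 64-bit doorbell written above is laid out as
 *   bits 63..32: (sq_n << MLX5_CQ_SQN_OFFSET) | (cq_ci & MLX5_CI_MASK)
 *   bits 31..0 : CQ number (rxq->cqn)
 * and the high word is also stored in cq_db[MLX5_CQ_ARM_DB], presumably so
 * that the device can track the arming sequence number.
 */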
653 
654 /**
655  * DPDK callback for Rx queue interrupt enable.
656  *
657  * @param dev
658  *   Pointer to Ethernet device structure.
659  * @param rx_queue_id
660  *   Rx queue number.
661  *
662  * @return
663  *   0 on success, a negative errno value otherwise and rte_errno is set.
664  */
665 int
666 mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
667 {
668 	struct priv *priv = dev->data->dev_private;
669 	struct mlx5_rxq_data *rxq_data;
670 	struct mlx5_rxq_ctrl *rxq_ctrl;
671 
672 	rxq_data = (*priv->rxqs)[rx_queue_id];
673 	if (!rxq_data) {
674 		rte_errno = EINVAL;
675 		return -rte_errno;
676 	}
677 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
678 	if (rxq_ctrl->irq) {
679 		struct mlx5_rxq_ibv *rxq_ibv;
680 
681 		rxq_ibv = mlx5_rxq_ibv_get(dev, rx_queue_id);
682 		if (!rxq_ibv) {
683 			rte_errno = EINVAL;
684 			return -rte_errno;
685 		}
686 		mlx5_arm_cq(rxq_data, rxq_data->cq_arm_sn);
687 		mlx5_rxq_ibv_release(rxq_ibv);
688 	}
689 	return 0;
690 }
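
/*
 * Usage sketch: the interrupt armed by this callback is typically consumed
 * from an application as follows (hypothetical port/queue ids, error
 * handling omitted):
 *
 *   struct rte_epoll_event ev;
 *
 *   rte_eth_dev_rx_intr_ctl_q(port_id, queue_id, RTE_EPOLL_PER_THREAD,
 *                             RTE_INTR_EVENT_ADD, NULL);
 *   rte_eth_dev_rx_intr_enable(port_id, queue_id);
 *   rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, -1);
 *   rte_eth_dev_rx_intr_disable(port_id, queue_id);
 *
 * rte_eth_dev_rx_intr_enable() lands in mlx5_rx_intr_enable() above, which
 * arms the CQ; the epoll wait then sleeps until the next completion event.
 */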
691 
692 /**
693  * DPDK callback for Rx queue interrupt disable.
694  *
695  * @param dev
696  *   Pointer to Ethernet device structure.
697  * @param rx_queue_id
698  *   Rx queue number.
699  *
700  * @return
701  *   0 on success, a negative errno value otherwise and rte_errno is set.
702  */
703 int
704 mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
705 {
706 	struct priv *priv = dev->data->dev_private;
707 	struct mlx5_rxq_data *rxq_data;
708 	struct mlx5_rxq_ctrl *rxq_ctrl;
709 	struct mlx5_rxq_ibv *rxq_ibv = NULL;
710 	struct ibv_cq *ev_cq;
711 	void *ev_ctx;
712 	int ret;
713 
714 	rxq_data = (*priv->rxqs)[rx_queue_id];
715 	if (!rxq_data) {
716 		rte_errno = EINVAL;
717 		return -rte_errno;
718 	}
719 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
720 	if (!rxq_ctrl->irq)
721 		return 0;
722 	rxq_ibv = mlx5_rxq_ibv_get(dev, rx_queue_id);
723 	if (!rxq_ibv) {
724 		rte_errno = EINVAL;
725 		return -rte_errno;
726 	}
727 	ret = mlx5_glue->get_cq_event(rxq_ibv->channel, &ev_cq, &ev_ctx);
728 	if (ret || ev_cq != rxq_ibv->cq) {
729 		rte_errno = EINVAL;
730 		goto exit;
731 	}
732 	rxq_data->cq_arm_sn++;
733 	mlx5_glue->ack_cq_events(rxq_ibv->cq, 1);
734 	return 0;
735 exit:
736 	ret = rte_errno; /* Save rte_errno before cleanup. */
737 	if (rxq_ibv)
738 		mlx5_rxq_ibv_release(rxq_ibv);
739 	DRV_LOG(WARNING, "port %u unable to disable interrupt on Rx queue %d",
740 		dev->data->port_id, rx_queue_id);
741 	rte_errno = ret; /* Restore rte_errno. */
742 	return -rte_errno;
743 }
744 
745 /**
746  * Create the Rx queue Verbs object.
747  *
748  * @param dev
749  *   Pointer to Ethernet device.
750  * @param idx
751  *   Queue index in DPDK Rx queue array.
752  *
753  * @return
754  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
755  */
756 struct mlx5_rxq_ibv *
757 mlx5_rxq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
758 {
759 	struct priv *priv = dev->data->dev_private;
760 	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
761 	struct mlx5_rxq_ctrl *rxq_ctrl =
762 		container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
763 	struct ibv_wq_attr mod;
764 	union {
765 		struct {
766 			struct ibv_cq_init_attr_ex ibv;
767 			struct mlx5dv_cq_init_attr mlx5;
768 		} cq;
769 		struct {
770 			struct ibv_wq_init_attr ibv;
771 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
772 			struct mlx5dv_wq_init_attr mlx5;
773 #endif
774 		} wq;
775 		struct ibv_cq_ex cq_attr;
776 	} attr;
777 	unsigned int cqe_n;
778 	unsigned int wqe_n = 1 << rxq_data->elts_n;
779 	struct mlx5_rxq_ibv *tmpl;
780 	struct mlx5dv_cq cq_info;
781 	struct mlx5dv_rwq rwq;
782 	unsigned int i;
783 	int ret = 0;
784 	struct mlx5dv_obj obj;
785 	struct mlx5_dev_config *config = &priv->config;
786 	const int mprq_en = mlx5_rxq_mprq_enabled(rxq_data);
787 
788 	assert(rxq_data);
789 	assert(!rxq_ctrl->ibv);
790 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_RX_QUEUE;
791 	priv->verbs_alloc_ctx.obj = rxq_ctrl;
792 	tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
793 				 rxq_ctrl->socket);
794 	if (!tmpl) {
795 		DRV_LOG(ERR,
796 			"port %u Rx queue %u cannot allocate verbs resources",
797 			dev->data->port_id, rxq_ctrl->idx);
798 		rte_errno = ENOMEM;
799 		goto error;
800 	}
801 	tmpl->rxq_ctrl = rxq_ctrl;
802 	if (rxq_ctrl->irq) {
803 		tmpl->channel = mlx5_glue->create_comp_channel(priv->ctx);
804 		if (!tmpl->channel) {
805 			DRV_LOG(ERR, "port %u: comp channel creation failure",
806 				dev->data->port_id);
807 			rte_errno = ENOMEM;
808 			goto error;
809 		}
810 	}
811 	if (mprq_en)
812 		cqe_n = wqe_n * (1 << rxq_data->strd_num_n) - 1;
813 	else
814 		cqe_n = wqe_n - 1;
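	/*
	 * Sizing example derived from the code above: wqe_n == 512 gives
	 * cqe_n == 511, while Multi-Packet RQ with strd_num_n == 6
	 * (64 strides per WQE) gives cqe_n == 512 * 64 - 1.
	 */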
815 	attr.cq.ibv = (struct ibv_cq_init_attr_ex){
816 		.cqe = cqe_n,
817 		.channel = tmpl->channel,
818 		.comp_mask = 0,
819 	};
820 	attr.cq.mlx5 = (struct mlx5dv_cq_init_attr){
821 		.comp_mask = 0,
822 	};
823 	if (config->cqe_comp && !rxq_data->hw_timestamp) {
824 		attr.cq.mlx5.comp_mask |=
825 			MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
826 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
827 		attr.cq.mlx5.cqe_comp_res_format =
828 			mprq_en ? MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX :
829 				  MLX5DV_CQE_RES_FORMAT_HASH;
830 #else
831 		attr.cq.mlx5.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
832 #endif
833 		/*
834 		 * Double the CQ size, except for vectorized Rx where it must
835 		 * not be doubled so that cq_ci and rq_ci stay aligned.
836 		 */
837 		if (mlx5_rxq_check_vec_support(rxq_data) < 0)
838 			attr.cq.ibv.cqe *= 2;
839 	} else if (config->cqe_comp && rxq_data->hw_timestamp) {
840 		DRV_LOG(DEBUG,
841 			"port %u Rx CQE compression is disabled for HW"
842 			" timestamp",
843 			dev->data->port_id);
844 	}
845 	tmpl->cq = mlx5_glue->cq_ex_to_cq
846 		(mlx5_glue->dv_create_cq(priv->ctx, &attr.cq.ibv,
847 					 &attr.cq.mlx5));
848 	if (tmpl->cq == NULL) {
849 		DRV_LOG(ERR, "port %u Rx queue %u CQ creation failure",
850 			dev->data->port_id, idx);
851 		rte_errno = ENOMEM;
852 		goto error;
853 	}
854 	DRV_LOG(DEBUG, "port %u priv->device_attr.max_qp_wr is %d",
855 		dev->data->port_id, priv->device_attr.orig_attr.max_qp_wr);
856 	DRV_LOG(DEBUG, "port %u priv->device_attr.max_sge is %d",
857 		dev->data->port_id, priv->device_attr.orig_attr.max_sge);
858 	attr.wq.ibv = (struct ibv_wq_init_attr){
859 		.wq_context = NULL, /* Could be useful in the future. */
860 		.wq_type = IBV_WQT_RQ,
861 		/* Max number of outstanding WRs. */
862 		.max_wr = wqe_n >> rxq_data->sges_n,
863 		/* Max number of scatter/gather elements in a WR. */
864 		.max_sge = 1 << rxq_data->sges_n,
865 		.pd = priv->pd,
866 		.cq = tmpl->cq,
867 		.comp_mask =
868 			IBV_WQ_FLAGS_CVLAN_STRIPPING |
869 			0,
870 		.create_flags = (rxq_data->vlan_strip ?
871 				 IBV_WQ_FLAGS_CVLAN_STRIPPING :
872 				 0),
873 	};
874 	/* By default, FCS (CRC) is stripped by hardware. */
875 	if (rxq_data->crc_present) {
876 		attr.wq.ibv.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
877 		attr.wq.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
878 	}
879 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
880 	if (config->hw_padding) {
881 		attr.wq.ibv.create_flags |= IBV_WQ_FLAG_RX_END_PADDING;
882 		attr.wq.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
883 	}
884 #endif
885 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
886 	attr.wq.mlx5 = (struct mlx5dv_wq_init_attr){
887 		.comp_mask = 0,
888 	};
889 	if (mprq_en) {
890 		struct mlx5dv_striding_rq_init_attr *mprq_attr =
891 			&attr.wq.mlx5.striding_rq_attrs;
892 
893 		attr.wq.mlx5.comp_mask |= MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ;
894 		*mprq_attr = (struct mlx5dv_striding_rq_init_attr){
895 			.single_stride_log_num_of_bytes = rxq_data->strd_sz_n,
896 			.single_wqe_log_num_of_strides = rxq_data->strd_num_n,
897 			.two_byte_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT,
898 		};
899 	}
900 	tmpl->wq = mlx5_glue->dv_create_wq(priv->ctx, &attr.wq.ibv,
901 					   &attr.wq.mlx5);
902 #else
903 	tmpl->wq = mlx5_glue->create_wq(priv->ctx, &attr.wq.ibv);
904 #endif
905 	if (tmpl->wq == NULL) {
906 		DRV_LOG(ERR, "port %u Rx queue %u WQ creation failure",
907 			dev->data->port_id, idx);
908 		rte_errno = ENOMEM;
909 		goto error;
910 	}
911 	/*
912 	 * Make sure the number of WRs*SGEs matches expectations, since a
913 	 * queue cannot allocate more than "desc" buffers.
914 	 */
915 	if (attr.wq.ibv.max_wr != (wqe_n >> rxq_data->sges_n) ||
916 	    attr.wq.ibv.max_sge != (1u << rxq_data->sges_n)) {
917 		DRV_LOG(ERR,
918 			"port %u Rx queue %u requested %u*%u but got %u*%u"
919 			" WRs*SGEs",
920 			dev->data->port_id, idx,
921 			wqe_n >> rxq_data->sges_n, (1 << rxq_data->sges_n),
922 			attr.wq.ibv.max_wr, attr.wq.ibv.max_sge);
923 		rte_errno = EINVAL;
924 		goto error;
925 	}
926 	/* Change queue state to ready. */
927 	mod = (struct ibv_wq_attr){
928 		.attr_mask = IBV_WQ_ATTR_STATE,
929 		.wq_state = IBV_WQS_RDY,
930 	};
931 	ret = mlx5_glue->modify_wq(tmpl->wq, &mod);
932 	if (ret) {
933 		DRV_LOG(ERR,
934 			"port %u Rx queue %u WQ state to IBV_WQS_RDY failed",
935 			dev->data->port_id, idx);
936 		rte_errno = ret;
937 		goto error;
938 	}
939 	obj.cq.in = tmpl->cq;
940 	obj.cq.out = &cq_info;
941 	obj.rwq.in = tmpl->wq;
942 	obj.rwq.out = &rwq;
943 	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ);
944 	if (ret) {
945 		rte_errno = ret;
946 		goto error;
947 	}
948 	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
949 		DRV_LOG(ERR,
950 			"port %u wrong MLX5_CQE_SIZE environment variable"
951 			" value: it should be set to %u",
952 			dev->data->port_id, RTE_CACHE_LINE_SIZE);
953 		rte_errno = EINVAL;
954 		goto error;
955 	}
956 	/* Fill the rings. */
957 	rxq_data->wqes = rwq.buf;
958 	for (i = 0; (i != wqe_n); ++i) {
959 		volatile struct mlx5_wqe_data_seg *scat;
960 		uintptr_t addr;
961 		uint32_t byte_count;
962 
963 		if (mprq_en) {
964 			struct mlx5_mprq_buf *buf = (*rxq_data->mprq_bufs)[i];
965 
966 			scat = &((volatile struct mlx5_wqe_mprq *)
967 				 rxq_data->wqes)[i].dseg;
968 			addr = (uintptr_t)mlx5_mprq_buf_addr(buf);
969 			byte_count = (1 << rxq_data->strd_sz_n) *
970 				     (1 << rxq_data->strd_num_n);
971 		} else {
972 			struct rte_mbuf *buf = (*rxq_data->elts)[i];
973 
974 			scat = &((volatile struct mlx5_wqe_data_seg *)
975 				 rxq_data->wqes)[i];
976 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
977 			byte_count = DATA_LEN(buf);
978 		}
979 		/* scat->addr must be able to store a pointer. */
980 		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
981 		*scat = (struct mlx5_wqe_data_seg){
982 			.addr = rte_cpu_to_be_64(addr),
983 			.byte_count = rte_cpu_to_be_32(byte_count),
984 			.lkey = mlx5_rx_addr2mr(rxq_data, addr),
985 		};
986 	}
987 	rxq_data->rq_db = rwq.dbrec;
988 	rxq_data->cqe_n = log2above(cq_info.cqe_cnt);
989 	rxq_data->cq_ci = 0;
990 	rxq_data->consumed_strd = 0;
991 	rxq_data->rq_pi = 0;
992 	rxq_data->zip = (struct rxq_zip){
993 		.ai = 0,
994 	};
995 	rxq_data->cq_db = cq_info.dbrec;
996 	rxq_data->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf;
997 	rxq_data->cq_uar = cq_info.cq_uar;
998 	rxq_data->cqn = cq_info.cqn;
999 	rxq_data->cq_arm_sn = 0;
1000 	/* Update doorbell counter. */
1001 	rxq_data->rq_ci = wqe_n >> rxq_data->sges_n;
1002 	rte_wmb();
1003 	*rxq_data->rq_db = rte_cpu_to_be_32(rxq_data->rq_ci);
1004 	DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data->port_id,
1005 		idx, (void *)&tmpl);
1006 	rte_atomic32_inc(&tmpl->refcnt);
1007 	LIST_INSERT_HEAD(&priv->rxqsibv, tmpl, next);
1008 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
1009 	return tmpl;
1010 error:
1011 	ret = rte_errno; /* Save rte_errno before cleanup. */
1012 	if (tmpl->wq)
1013 		claim_zero(mlx5_glue->destroy_wq(tmpl->wq));
1014 	if (tmpl->cq)
1015 		claim_zero(mlx5_glue->destroy_cq(tmpl->cq));
1016 	if (tmpl->channel)
1017 		claim_zero(mlx5_glue->destroy_comp_channel(tmpl->channel));
1018 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
1019 	rte_errno = ret; /* Restore rte_errno. */
1020 	return NULL;
1021 }
1022 
1023 /**
1024  * Get an Rx queue Verbs object.
1025  *
1026  * @param dev
1027  *   Pointer to Ethernet device.
1028  * @param idx
1029  *   Queue index in DPDK Rx queue array.
1030  *
1031  * @return
1032  *   The Verbs object if it exists.
1033  */
1034 struct mlx5_rxq_ibv *
1035 mlx5_rxq_ibv_get(struct rte_eth_dev *dev, uint16_t idx)
1036 {
1037 	struct priv *priv = dev->data->dev_private;
1038 	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
1039 	struct mlx5_rxq_ctrl *rxq_ctrl;
1040 
1041 	if (idx >= priv->rxqs_n)
1042 		return NULL;
1043 	if (!rxq_data)
1044 		return NULL;
1045 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
1046 	if (rxq_ctrl->ibv) {
1047 		rte_atomic32_inc(&rxq_ctrl->ibv->refcnt);
1048 	}
1049 	return rxq_ctrl->ibv;
1050 }
1051 
1052 /**
1053  * Release an Rx verbs queue object.
1054  *
1055  * @param rxq_ibv
1056  *   Verbs Rx queue object.
1057  *
1058  * @return
1059  *   1 while a reference on it exists, 0 when freed.
1060  */
1061 int
1062 mlx5_rxq_ibv_release(struct mlx5_rxq_ibv *rxq_ibv)
1063 {
1064 	assert(rxq_ibv);
1065 	assert(rxq_ibv->wq);
1066 	assert(rxq_ibv->cq);
1067 	if (rte_atomic32_dec_and_test(&rxq_ibv->refcnt)) {
1068 		rxq_free_elts(rxq_ibv->rxq_ctrl);
1069 		claim_zero(mlx5_glue->destroy_wq(rxq_ibv->wq));
1070 		claim_zero(mlx5_glue->destroy_cq(rxq_ibv->cq));
1071 		if (rxq_ibv->channel)
1072 			claim_zero(mlx5_glue->destroy_comp_channel
1073 				   (rxq_ibv->channel));
1074 		LIST_REMOVE(rxq_ibv, next);
1075 		rte_free(rxq_ibv);
1076 		return 0;
1077 	}
1078 	return 1;
1079 }
1080 
1081 /**
1082  * Verify the Verbs Rx queue list is empty
1083  *
1084  * @param dev
1085  *   Pointer to Ethernet device.
1086  *
1087  * @return
1088  *   The number of objects not released.
1089  */
1090 int
1091 mlx5_rxq_ibv_verify(struct rte_eth_dev *dev)
1092 {
1093 	struct priv *priv = dev->data->dev_private;
1094 	int ret = 0;
1095 	struct mlx5_rxq_ibv *rxq_ibv;
1096 
1097 	LIST_FOREACH(rxq_ibv, &priv->rxqsibv, next) {
1098 		DRV_LOG(DEBUG, "port %u Verbs Rx queue %u still referenced",
1099 			dev->data->port_id, rxq_ibv->rxq_ctrl->idx);
1100 		++ret;
1101 	}
1102 	return ret;
1103 }
1104 
1105 /**
1106  * Return true if a single reference exists on the object.
1107  *
1108  * @param rxq_ibv
1109  *   Verbs Rx queue object.
1110  */
1111 int
1112 mlx5_rxq_ibv_releasable(struct mlx5_rxq_ibv *rxq_ibv)
1113 {
1114 	assert(rxq_ibv);
1115 	return (rte_atomic32_read(&rxq_ibv->refcnt) == 1);
1116 }
1117 
1118 /**
1119  * Callback function to initialize mbufs for Multi-Packet RQ.
1120  */
1121 static inline void
1122 mlx5_mprq_buf_init(struct rte_mempool *mp, void *opaque_arg __rte_unused,
1123 		    void *_m, unsigned int i __rte_unused)
1124 {
1125 	struct mlx5_mprq_buf *buf = _m;
1126 
1127 	memset(_m, 0, sizeof(*buf));
1128 	buf->mp = mp;
1129 	rte_atomic16_set(&buf->refcnt, 1);
1130 }
1131 
1132 /**
1133  * Free mempool of Multi-Packet RQ.
1134  *
1135  * @param dev
1136  *   Pointer to Ethernet device.
1137  *
1138  * @return
1139  *   0 on success, negative errno value on failure.
1140  */
1141 int
1142 mlx5_mprq_free_mp(struct rte_eth_dev *dev)
1143 {
1144 	struct priv *priv = dev->data->dev_private;
1145 	struct rte_mempool *mp = priv->mprq_mp;
1146 	unsigned int i;
1147 
1148 	if (mp == NULL)
1149 		return 0;
1150 	DRV_LOG(DEBUG, "port %u freeing mempool (%s) for Multi-Packet RQ",
1151 		dev->data->port_id, mp->name);
1152 	/*
1153 	 * If a buffer in the pool has been externally attached to an mbuf and
1154 	 * is still in use by the application, destroying the Rx queue can
1155 	 * spoil the packet. It is unlikely, but it can happen if the
1156 	 * application dynamically creates and destroys queues while holding
1157 	 * Rx packets.
1158 	 *
1159 	 * TODO: It is unavoidable for now because the mempool for Multi-Packet
1160 	 * RQ isn't provided by the application but is managed by the PMD.
1160 	 */
1161 	if (!rte_mempool_full(mp)) {
1162 		DRV_LOG(ERR,
1163 			"port %u mempool for Multi-Packet RQ is still in use",
1164 			dev->data->port_id);
1165 		rte_errno = EBUSY;
1166 		return -rte_errno;
1167 	}
1168 	rte_mempool_free(mp);
1169 	/* Unset mempool for each Rx queue. */
1170 	for (i = 0; i != priv->rxqs_n; ++i) {
1171 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
1172 
1173 		if (rxq == NULL)
1174 			continue;
1175 		rxq->mprq_mp = NULL;
1176 	}
1177 	return 0;
1178 }
1179 
1180 /**
1181  * Allocate a mempool for Multi-Packet RQ. All configured Rx queues share the
1182  * mempool. If already allocated, reuse it if there are enough elements.
1183  * Otherwise, resize it.
1184  *
1185  * @param dev
1186  *   Pointer to Ethernet device.
1187  *
1188  * @return
1189  *   0 on success, negative errno value on failure.
1190  */
1191 int
1192 mlx5_mprq_alloc_mp(struct rte_eth_dev *dev)
1193 {
1194 	struct priv *priv = dev->data->dev_private;
1195 	struct rte_mempool *mp = priv->mprq_mp;
1196 	char name[RTE_MEMPOOL_NAMESIZE];
1197 	unsigned int desc = 0;
1198 	unsigned int buf_len;
1199 	unsigned int obj_num;
1200 	unsigned int obj_size;
1201 	unsigned int strd_num_n = 0;
1202 	unsigned int strd_sz_n = 0;
1203 	unsigned int i;
1204 
1205 	if (!mlx5_mprq_enabled(dev))
1206 		return 0;
1207 	/* Count the total number of descriptors configured. */
1208 	for (i = 0; i != priv->rxqs_n; ++i) {
1209 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
1210 
1211 		if (rxq == NULL)
1212 			continue;
1213 		desc += 1 << rxq->elts_n;
1214 		/* Get the max number of strides. */
1215 		if (strd_num_n < rxq->strd_num_n)
1216 			strd_num_n = rxq->strd_num_n;
1217 		/* Get the max size of a stride. */
1218 		if (strd_sz_n < rxq->strd_sz_n)
1219 			strd_sz_n = rxq->strd_sz_n;
1220 	}
1221 	assert(strd_num_n && strd_sz_n);
1222 	buf_len = (1 << strd_num_n) * (1 << strd_sz_n);
1223 	obj_size = buf_len + sizeof(struct mlx5_mprq_buf);
1224 	/*
1225 	 * Received packets can be either memcpy'd or externally referenced.
1226 	 * When a packet is attached to an mbuf as an external buffer, it isn't
1227 	 * possible to predict how the buffers will be queued by the
1228 	 * application, so there is no way to pre-allocate exactly the needed
1229 	 * buffers in advance; enough buffers have to be prepared speculatively.
1230 	 *
1231 	 * In the data path, if this mempool is depleted, the PMD will try to
1232 	 * memcpy received packets into buffers provided by the application
1233 	 * (rxq->mp) until this mempool becomes available again.
1234 	 */
1235 	desc *= 4;
1236 	obj_num = desc + MLX5_MPRQ_MP_CACHE_SZ * priv->rxqs_n;
1237 	/*
1238 	 * rte_mempool_create_empty() has sanity check to refuse large cache
1239 	 * size compared to the number of elements.
1240 	 * CACHE_FLUSHTHRESH_MULTIPLIER is defined in a C file, so using a
1241 	 * constant number 2 instead.
1242 	 */
1243 	obj_num = RTE_MAX(obj_num, MLX5_MPRQ_MP_CACHE_SZ * 2);
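	/*
	 * Sizing example: two MPRQ queues with elts_n == 9 contribute
	 * desc == 1024, quadrupled to 4096; MLX5_MPRQ_MP_CACHE_SZ extra
	 * objects are then added per configured Rx queue.
	 */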
1244 	/* Check whether a mempool is already allocated and can be reused. */
1245 	if (mp != NULL && mp->elt_size >= obj_size && mp->size >= obj_num) {
1246 		DRV_LOG(DEBUG, "port %u mempool %s is being reused",
1247 			dev->data->port_id, mp->name);
1248 		/* Reuse. */
1249 		goto exit;
1250 	} else if (mp != NULL) {
1251 		DRV_LOG(DEBUG, "port %u mempool %s should be resized, freeing it",
1252 			dev->data->port_id, mp->name);
1253 		/*
1254 		 * If failed to free, which means it may be still in use, no way
1255 		 * but to keep using the existing one. On buffer underrun,
1256 		 * packets will be memcpy'd instead of external buffer
1257 		 * attachment.
1258 		 */
1259 		if (mlx5_mprq_free_mp(dev)) {
1260 			if (mp->elt_size >= obj_size)
1261 				goto exit;
1262 			else
1263 				return -rte_errno;
1264 		}
1265 	}
1266 	snprintf(name, sizeof(name), "%s-mprq", dev->device->name);
1267 	mp = rte_mempool_create(name, obj_num, obj_size, MLX5_MPRQ_MP_CACHE_SZ,
1268 				0, NULL, NULL, mlx5_mprq_buf_init, NULL,
1269 				dev->device->numa_node, 0);
1270 	if (mp == NULL) {
1271 		DRV_LOG(ERR,
1272 			"port %u failed to allocate a mempool for"
1273 			" Multi-Packet RQ, count=%u, size=%u",
1274 			dev->data->port_id, obj_num, obj_size);
1275 		rte_errno = ENOMEM;
1276 		return -rte_errno;
1277 	}
1278 	priv->mprq_mp = mp;
1279 exit:
1280 	/* Set mempool for each Rx queue. */
1281 	for (i = 0; i != priv->rxqs_n; ++i) {
1282 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
1283 
1284 		if (rxq == NULL)
1285 			continue;
1286 		rxq->mprq_mp = mp;
1287 	}
1288 	DRV_LOG(INFO, "port %u Multi-Packet RQ is configured",
1289 		dev->data->port_id);
1290 	return 0;
1291 }
1292 
1293 /**
1294  * Create a DPDK Rx queue.
1295  *
1296  * @param dev
1297  *   Pointer to Ethernet device.
1298  * @param idx
1299  *   RX queue index.
1300  * @param desc
1301  *   Number of descriptors to configure in queue.
1302  * @param socket
1303  *   NUMA socket on which memory must be allocated.
1304  *
1305  * @return
1306  *   A DPDK queue object on success, NULL otherwise and rte_errno is set.
1307  */
1308 struct mlx5_rxq_ctrl *
1309 mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1310 	     unsigned int socket, const struct rte_eth_rxconf *conf,
1311 	     struct rte_mempool *mp)
1312 {
1313 	struct priv *priv = dev->data->dev_private;
1314 	struct mlx5_rxq_ctrl *tmpl;
1315 	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
1316 	unsigned int mprq_stride_size;
1317 	struct mlx5_dev_config *config = &priv->config;
1318 	/*
1319 	 * Always allocate extra slots, even if eventually
1320 	 * the vector Rx will not be used.
1321 	 */
1322 	uint16_t desc_n =
1323 		desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
1324 	uint64_t offloads = conf->offloads |
1325 			   dev->data->dev_conf.rxmode.offloads;
1326 	const int mprq_en = mlx5_check_mprq_support(dev) > 0;
1327 
1328 	tmpl = rte_calloc_socket("RXQ", 1,
1329 				 sizeof(*tmpl) +
1330 				 desc_n * sizeof(struct rte_mbuf *),
1331 				 0, socket);
1332 	if (!tmpl) {
1333 		rte_errno = ENOMEM;
1334 		return NULL;
1335 	}
1336 	if (mlx5_mr_btree_init(&tmpl->rxq.mr_ctrl.cache_bh,
1337 			       MLX5_MR_BTREE_CACHE_N, socket)) {
1338 		/* rte_errno is already set. */
1339 		goto error;
1340 	}
1341 	tmpl->socket = socket;
1342 	if (dev->data->dev_conf.intr_conf.rxq)
1343 		tmpl->irq = 1;
1344 	/*
1345 	 * This Rx queue can be configured as a Multi-Packet RQ if all of the
1346 	 * following conditions are met:
1347 	 *  - MPRQ is enabled.
1348 	 *  - The number of descs is more than the number of strides.
1349 	 *  - max_rx_pkt_len plus overhead is less than the max size of a
1350 	 *    stride.
1351 	 *  Otherwise, enable Rx scatter if necessary.
1352 	 */
1353 	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
1354 	mprq_stride_size =
1355 		dev->data->dev_conf.rxmode.max_rx_pkt_len +
1356 		sizeof(struct rte_mbuf_ext_shared_info) +
1357 		RTE_PKTMBUF_HEADROOM;
1358 	if (mprq_en &&
1359 	    desc > (1U << config->mprq.stride_num_n) &&
1360 	    mprq_stride_size <= (1U << config->mprq.max_stride_size_n)) {
1361 		/* TODO: Rx scatter isn't supported yet. */
1362 		tmpl->rxq.sges_n = 0;
1363 		/* Trim the number of descs needed. */
1364 		desc >>= config->mprq.stride_num_n;
1365 		tmpl->rxq.strd_num_n = config->mprq.stride_num_n;
1366 		tmpl->rxq.strd_sz_n = RTE_MAX(log2above(mprq_stride_size),
1367 					      config->mprq.min_stride_size_n);
1368 		tmpl->rxq.strd_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT;
1369 		tmpl->rxq.mprq_max_memcpy_len =
1370 			RTE_MIN(mb_len - RTE_PKTMBUF_HEADROOM,
1371 				config->mprq.max_memcpy_len);
1372 		DRV_LOG(DEBUG,
1373 			"port %u Rx queue %u: Multi-Packet RQ is enabled"
1374 			" strd_num_n = %u, strd_sz_n = %u",
1375 			dev->data->port_id, idx,
1376 			tmpl->rxq.strd_num_n, tmpl->rxq.strd_sz_n);
1377 	} else if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
1378 		   (mb_len - RTE_PKTMBUF_HEADROOM)) {
1379 		tmpl->rxq.sges_n = 0;
1380 	} else if (offloads & DEV_RX_OFFLOAD_SCATTER) {
1381 		unsigned int size =
1382 			RTE_PKTMBUF_HEADROOM +
1383 			dev->data->dev_conf.rxmode.max_rx_pkt_len;
1384 		unsigned int sges_n;
1385 
1386 		/*
1387 		 * Determine the number of SGEs needed for a full packet
1388 		 * and round it to the next power of two.
1389 		 */
1390 		sges_n = log2above((size / mb_len) + !!(size % mb_len));
1391 		tmpl->rxq.sges_n = sges_n;
1392 		/* Make sure rxq.sges_n did not overflow. */
1393 		size = mb_len * (1 << tmpl->rxq.sges_n);
1394 		size -= RTE_PKTMBUF_HEADROOM;
1395 		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
1396 			DRV_LOG(ERR,
1397 				"port %u too many SGEs (%u) needed to handle"
1398 				" requested maximum packet size %u",
1399 				dev->data->port_id,
1400 				1 << sges_n,
1401 				dev->data->dev_conf.rxmode.max_rx_pkt_len);
1402 			rte_errno = EOVERFLOW;
1403 			goto error;
1404 		}
1405 	} else {
1406 		DRV_LOG(WARNING,
1407 			"port %u the requested maximum Rx packet size (%u) is"
1408 			" larger than a single mbuf (%u) and scattered mode has"
1409 			" not been requested",
1410 			dev->data->port_id,
1411 			dev->data->dev_conf.rxmode.max_rx_pkt_len,
1412 			mb_len - RTE_PKTMBUF_HEADROOM);
1413 	}
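	/*
	 * Sizing example for the scattered case above: with mb_len == 2048,
	 * RTE_PKTMBUF_HEADROOM == 128 and max_rx_pkt_len == 9000,
	 * size == 9128 and sges_n == log2above(5) == 3, i.e. 8 SGEs covering
	 * 8 * 2048 - 128 bytes per packet.
	 */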
1414 	if (mprq_en && !mlx5_rxq_mprq_enabled(&tmpl->rxq))
1415 		DRV_LOG(WARNING,
1416 			"port %u MPRQ is requested but cannot be enabled"
1417 			" (requested: desc = %u, stride_sz = %u,"
1418 			" supported: min_stride_num = %u, max_stride_sz = %u).",
1419 			dev->data->port_id, desc, mprq_stride_size,
1420 			(1 << config->mprq.stride_num_n),
1421 			(1 << config->mprq.max_stride_size_n));
1422 	DRV_LOG(DEBUG, "port %u maximum number of segments per packet: %u",
1423 		dev->data->port_id, 1 << tmpl->rxq.sges_n);
1424 	if (desc % (1 << tmpl->rxq.sges_n)) {
1425 		DRV_LOG(ERR,
1426 			"port %u number of Rx queue descriptors (%u) is not a"
1427 			" multiple of SGEs per packet (%u)",
1428 			dev->data->port_id,
1429 			desc,
1430 			1 << tmpl->rxq.sges_n);
1431 		rte_errno = EINVAL;
1432 		goto error;
1433 	}
1434 	/* Toggle RX checksum offload if hardware supports it. */
1435 	tmpl->rxq.csum = !!(offloads & DEV_RX_OFFLOAD_CHECKSUM);
1436 	tmpl->rxq.hw_timestamp = !!(offloads & DEV_RX_OFFLOAD_TIMESTAMP);
1437 	/* Configure VLAN stripping. */
1438 	tmpl->rxq.vlan_strip = !!(offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1439 	/* By default, FCS (CRC) is stripped by hardware. */
1440 	tmpl->rxq.crc_present = 0;
1441 	if (rte_eth_dev_must_keep_crc(offloads)) {
1442 		if (config->hw_fcs_strip) {
1443 			tmpl->rxq.crc_present = 1;
1444 		} else {
1445 			DRV_LOG(WARNING,
1446 				"port %u CRC stripping has been disabled but will"
1447 				" still be performed by hardware, make sure MLNX_OFED"
1448 				" and firmware are up to date",
1449 				dev->data->port_id);
1450 		}
1451 	}
1452 	DRV_LOG(DEBUG,
1453 		"port %u CRC stripping is %s, %u bytes will be subtracted from"
1454 		" incoming frames to hide it",
1455 		dev->data->port_id,
1456 		tmpl->rxq.crc_present ? "disabled" : "enabled",
1457 		tmpl->rxq.crc_present << 2);
1458 	tmpl->rxq.rss_hash = !!priv->rss_conf.rss_hf &&
1459 		(!!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS));
1460 	/* Save port ID. */
1461 	tmpl->rxq.port_id = dev->data->port_id;
1462 	tmpl->priv = priv;
1463 	tmpl->rxq.mp = mp;
1464 	tmpl->rxq.stats.idx = idx;
1465 	tmpl->rxq.elts_n = log2above(desc);
1466 	tmpl->rxq.elts =
1467 		(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
1468 #ifndef RTE_ARCH_64
1469 	tmpl->rxq.uar_lock_cq = &priv->uar_lock_cq;
1470 #endif
1471 	tmpl->idx = idx;
1472 	rte_atomic32_inc(&tmpl->refcnt);
1473 	LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next);
1474 	return tmpl;
1475 error:
1476 	rte_free(tmpl);
1477 	return NULL;
1478 }
1479 
1480 /**
1481  * Get a Rx queue.
1482  *
1483  * @param dev
1484  *   Pointer to Ethernet device.
1485  * @param idx
1486  *   RX queue index.
1487  *
1488  * @return
1489  *   A pointer to the queue if it exists, NULL otherwise.
1490  */
1491 struct mlx5_rxq_ctrl *
1492 mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx)
1493 {
1494 	struct priv *priv = dev->data->dev_private;
1495 	struct mlx5_rxq_ctrl *rxq_ctrl = NULL;
1496 
1497 	if ((*priv->rxqs)[idx]) {
1498 		rxq_ctrl = container_of((*priv->rxqs)[idx],
1499 					struct mlx5_rxq_ctrl,
1500 					rxq);
1501 		mlx5_rxq_ibv_get(dev, idx);
1502 		rte_atomic32_inc(&rxq_ctrl->refcnt);
1503 	}
1504 	return rxq_ctrl;
1505 }
1506 
1507 /**
1508  * Release a Rx queue.
1509  *
1510  * @param dev
1511  *   Pointer to Ethernet device.
1512  * @param idx
1513  *   TX queue index.
1514  *
1515  * @return
1516  *   1 while a reference on it exists, 0 when freed.
1517  */
1518 int
1519 mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx)
1520 {
1521 	struct priv *priv = dev->data->dev_private;
1522 	struct mlx5_rxq_ctrl *rxq_ctrl;
1523 
1524 	if (!(*priv->rxqs)[idx])
1525 		return 0;
1526 	rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
1527 	assert(rxq_ctrl->priv);
1528 	if (rxq_ctrl->ibv && !mlx5_rxq_ibv_release(rxq_ctrl->ibv))
1529 		rxq_ctrl->ibv = NULL;
1530 	if (rte_atomic32_dec_and_test(&rxq_ctrl->refcnt)) {
1531 		mlx5_mr_btree_free(&rxq_ctrl->rxq.mr_ctrl.cache_bh);
1532 		LIST_REMOVE(rxq_ctrl, next);
1533 		rte_free(rxq_ctrl);
1534 		(*priv->rxqs)[idx] = NULL;
1535 		return 0;
1536 	}
1537 	return 1;
1538 }
1539 
1540 /**
1541  * Verify if the queue can be released.
1542  *
1543  * @param dev
1544  *   Pointer to Ethernet device.
1545  * @param idx
1546  *   RX queue index.
1547  *
1548  * @return
1549  *   1 if the queue can be released, 0 if it is still referenced, a negative
1550  *   errno value otherwise and rte_errno is set.
1551  */
1552 int
1553 mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx)
1554 {
1555 	struct priv *priv = dev->data->dev_private;
1556 	struct mlx5_rxq_ctrl *rxq_ctrl;
1557 
1558 	if (!(*priv->rxqs)[idx]) {
1559 		rte_errno = EINVAL;
1560 		return -rte_errno;
1561 	}
1562 	rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
1563 	return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1);
1564 }
1565 
1566 /**
1567  * Verify the Rx queue list is empty.
1568  *
1569  * @param dev
1570  *   Pointer to Ethernet device.
1571  *
1572  * @return
1573  *   The number of objects not released.
1574  */
1575 int
1576 mlx5_rxq_verify(struct rte_eth_dev *dev)
1577 {
1578 	struct priv *priv = dev->data->dev_private;
1579 	struct mlx5_rxq_ctrl *rxq_ctrl;
1580 	int ret = 0;
1581 
1582 	LIST_FOREACH(rxq_ctrl, &priv->rxqsctrl, next) {
1583 		DRV_LOG(DEBUG, "port %u Rx Queue %u still referenced",
1584 			dev->data->port_id, rxq_ctrl->idx);
1585 		++ret;
1586 	}
1587 	return ret;
1588 }
1589 
1590 /**
1591  * Create an indirection table.
1592  *
1593  * @param dev
1594  *   Pointer to Ethernet device.
1595  * @param queues
1596  *   Queues entering the indirection table.
1597  * @param queues_n
1598  *   Number of queues in the array.
1599  *
1600  * @return
1601  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
1602  */
1603 struct mlx5_ind_table_ibv *
1604 mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, const uint16_t *queues,
1605 		       uint32_t queues_n)
1606 {
1607 	struct priv *priv = dev->data->dev_private;
1608 	struct mlx5_ind_table_ibv *ind_tbl;
1609 	const unsigned int wq_n = rte_is_power_of_2(queues_n) ?
1610 		log2above(queues_n) :
1611 		log2above(priv->config.ind_table_max_size);
1612 	struct ibv_wq *wq[1 << wq_n];
1613 	unsigned int i;
1614 	unsigned int j;
1615 
1616 	ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl) +
1617 			     queues_n * sizeof(uint16_t), 0);
1618 	if (!ind_tbl) {
1619 		rte_errno = ENOMEM;
1620 		return NULL;
1621 	}
1622 	for (i = 0; i != queues_n; ++i) {
1623 		struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev, queues[i]);
1624 
1625 		if (!rxq)
1626 			goto error;
1627 		wq[i] = rxq->ibv->wq;
1628 		ind_tbl->queues[i] = queues[i];
1629 	}
1630 	ind_tbl->queues_n = queues_n;
1631 	/* Finalise indirection table. */
1632 	for (j = 0; i != (unsigned int)(1 << wq_n); ++i, ++j)
1633 		wq[i] = wq[j];
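	/*
	 * E.g. 6 queues with ind_table_max_size == 512 give wq_n == 9 and the
	 * entries repeat cyclically as 0,1,2,3,4,5,0,1,... until all
	 * (1 << wq_n) slots are filled.
	 */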
1634 	ind_tbl->ind_table = mlx5_glue->create_rwq_ind_table
1635 		(priv->ctx,
1636 		 &(struct ibv_rwq_ind_table_init_attr){
1637 			.log_ind_tbl_size = wq_n,
1638 			.ind_tbl = wq,
1639 			.comp_mask = 0,
1640 		 });
1641 	if (!ind_tbl->ind_table) {
1642 		rte_errno = errno;
1643 		goto error;
1644 	}
1645 	rte_atomic32_inc(&ind_tbl->refcnt);
1646 	LIST_INSERT_HEAD(&priv->ind_tbls, ind_tbl, next);
1647 	return ind_tbl;
1648 error:
1649 	rte_free(ind_tbl);
1650 	DEBUG("port %u cannot create indirection table", dev->data->port_id);
1651 	return NULL;
1652 }
1653 
1654 /**
1655  * Get an indirection table.
1656  *
1657  * @param dev
1658  *   Pointer to Ethernet device.
1659  * @param queues
1660  *   Queues entering the indirection table.
1661  * @param queues_n
1662  *   Number of queues in the array.
1663  *
1664  * @return
1665  *   An indirection table if found.
1666  */
1667 struct mlx5_ind_table_ibv *
1668 mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, const uint16_t *queues,
1669 		       uint32_t queues_n)
1670 {
1671 	struct priv *priv = dev->data->dev_private;
1672 	struct mlx5_ind_table_ibv *ind_tbl;
1673 
1674 	LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
1675 		if ((ind_tbl->queues_n == queues_n) &&
1676 		    (memcmp(ind_tbl->queues, queues,
1677 			    ind_tbl->queues_n * sizeof(ind_tbl->queues[0]))
1678 		     == 0))
1679 			break;
1680 	}
1681 	if (ind_tbl) {
1682 		unsigned int i;
1683 
1684 		rte_atomic32_inc(&ind_tbl->refcnt);
1685 		for (i = 0; i != ind_tbl->queues_n; ++i)
1686 			mlx5_rxq_get(dev, ind_tbl->queues[i]);
1687 	}
1688 	return ind_tbl;
1689 }
1690 
1691 /**
1692  * Release an indirection table.
1693  *
1694  * @param dev
1695  *   Pointer to Ethernet device.
1696  * @param ind_table
1697  *   Indirection table to release.
1698  *
1699  * @return
1700  *   1 while a reference on it exists, 0 when freed.
1701  */
1702 int
1703 mlx5_ind_table_ibv_release(struct rte_eth_dev *dev,
1704 			   struct mlx5_ind_table_ibv *ind_tbl)
1705 {
1706 	unsigned int i;
1707 
1708 	if (rte_atomic32_dec_and_test(&ind_tbl->refcnt))
1709 		claim_zero(mlx5_glue->destroy_rwq_ind_table
1710 			   (ind_tbl->ind_table));
1711 	for (i = 0; i != ind_tbl->queues_n; ++i)
1712 		claim_nonzero(mlx5_rxq_release(dev, ind_tbl->queues[i]));
1713 	if (!rte_atomic32_read(&ind_tbl->refcnt)) {
1714 		LIST_REMOVE(ind_tbl, next);
1715 		rte_free(ind_tbl);
1716 		return 0;
1717 	}
1718 	return 1;
1719 }
1720 
1721 /**
1722  * Verify the Verbs Rx indirection table list is empty.
1723  *
1724  * @param dev
1725  *   Pointer to Ethernet device.
1726  *
1727  * @return
1728  *   The number of objects not released.
1729  */
1730 int
1731 mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev)
1732 {
1733 	struct priv *priv = dev->data->dev_private;
1734 	struct mlx5_ind_table_ibv *ind_tbl;
1735 	int ret = 0;
1736 
1737 	LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
1738 		DRV_LOG(DEBUG,
1739 			"port %u Verbs indirection table %p still referenced",
1740 			dev->data->port_id, (void *)ind_tbl);
1741 		++ret;
1742 	}
1743 	return ret;
1744 }
1745 
1746 /**
1747  * Create an Rx Hash queue.
1748  *
1749  * @param dev
1750  *   Pointer to Ethernet device.
1751  * @param rss_key
1752  *   RSS key for the Rx hash queue.
1753  * @param rss_key_len
1754  *   RSS key length.
1755  * @param hash_fields
1756  *   Verbs protocol hash field to make the RSS on.
1757  * @param queues
1758  *   Queues entering the hash Rx queue. In case of empty hash_fields only
1759  *   the first queue index will be taken for the indirection table.
1760  * @param queues_n
1761  *   Number of queues.
1762  *
1763  * @return
1764  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
1765  */
1766 struct mlx5_hrxq *
1767 mlx5_hrxq_new(struct rte_eth_dev *dev,
1768 	      const uint8_t *rss_key, uint32_t rss_key_len,
1769 	      uint64_t hash_fields,
1770 	      const uint16_t *queues, uint32_t queues_n,
1771 	      int tunnel __rte_unused)
1772 {
1773 	struct priv *priv = dev->data->dev_private;
1774 	struct mlx5_hrxq *hrxq;
1775 	struct mlx5_ind_table_ibv *ind_tbl;
1776 	struct ibv_qp *qp;
1777 	int err;
1778 
1779 	queues_n = hash_fields ? queues_n : 1;
1780 	ind_tbl = mlx5_ind_table_ibv_get(dev, queues, queues_n);
1781 	if (!ind_tbl)
1782 		ind_tbl = mlx5_ind_table_ibv_new(dev, queues, queues_n);
1783 	if (!ind_tbl) {
1784 		rte_errno = ENOMEM;
1785 		return NULL;
1786 	}
1787 	if (!rss_key_len) {
1788 		rss_key_len = MLX5_RSS_HASH_KEY_LEN;
1789 		rss_key = rss_hash_default_key;
1790 	}
1791 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
1792 	qp = mlx5_glue->dv_create_qp
1793 		(priv->ctx,
1794 		 &(struct ibv_qp_init_attr_ex){
1795 			.qp_type = IBV_QPT_RAW_PACKET,
1796 			.comp_mask =
1797 				IBV_QP_INIT_ATTR_PD |
1798 				IBV_QP_INIT_ATTR_IND_TABLE |
1799 				IBV_QP_INIT_ATTR_RX_HASH,
1800 			.rx_hash_conf = (struct ibv_rx_hash_conf){
1801 				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
1802 				.rx_hash_key_len = rss_key_len ? rss_key_len :
1803 						   MLX5_RSS_HASH_KEY_LEN,
1804 				.rx_hash_key = rss_key ?
1805 					       (void *)(uintptr_t)rss_key :
1806 					       rss_hash_default_key,
1807 				.rx_hash_fields_mask = hash_fields,
1808 			},
1809 			.rwq_ind_tbl = ind_tbl->ind_table,
1810 			.pd = priv->pd,
1811 		 },
1812 		 &(struct mlx5dv_qp_init_attr){
1813 			.comp_mask = tunnel ?
1814 				MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS : 0,
1815 			.create_flags = MLX5DV_QP_CREATE_TUNNEL_OFFLOADS,
1816 		 });
1817 #else
1818 	qp = mlx5_glue->create_qp_ex
1819 		(priv->ctx,
1820 		 &(struct ibv_qp_init_attr_ex){
1821 			.qp_type = IBV_QPT_RAW_PACKET,
1822 			.comp_mask =
1823 				IBV_QP_INIT_ATTR_PD |
1824 				IBV_QP_INIT_ATTR_IND_TABLE |
1825 				IBV_QP_INIT_ATTR_RX_HASH,
1826 			.rx_hash_conf = (struct ibv_rx_hash_conf){
1827 				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
1828 				.rx_hash_key_len = rss_key_len ? rss_key_len :
1829 						   MLX5_RSS_HASH_KEY_LEN,
1830 				.rx_hash_key = rss_key ?
1831 					       (void *)(uintptr_t)rss_key :
1832 					       rss_hash_default_key,
1833 				.rx_hash_fields_mask = hash_fields,
1834 			},
1835 			.rwq_ind_tbl = ind_tbl->ind_table,
1836 			.pd = priv->pd,
1837 		 });
1838 #endif
1839 	if (!qp) {
1840 		rte_errno = errno;
1841 		goto error;
1842 	}
	hrxq = rte_calloc(__func__, 1, sizeof(*hrxq) + rss_key_len, 0);
	if (!hrxq) {
		rte_errno = ENOMEM;
		goto error;
	}
1846 	hrxq->ind_table = ind_tbl;
1847 	hrxq->qp = qp;
1848 	hrxq->rss_key_len = rss_key_len;
1849 	hrxq->hash_fields = hash_fields;
1850 	memcpy(hrxq->rss_key, rss_key, rss_key_len);
1851 	rte_atomic32_inc(&hrxq->refcnt);
1852 	LIST_INSERT_HEAD(&priv->hrxqs, hrxq, next);
1853 	return hrxq;
1854 error:
1855 	err = rte_errno; /* Save rte_errno before cleanup. */
1856 	mlx5_ind_table_ibv_release(dev, ind_tbl);
1857 	if (qp)
1858 		claim_zero(mlx5_glue->destroy_qp(qp));
1859 	rte_errno = err; /* Restore rte_errno. */
1860 	return NULL;
1861 }
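
/*
 * Usage sketch (illustrative only, not part of the driver): how a caller
 * could request a TCP/IPv4 hash Rx queue spreading over the first two Rx
 * queues with the default Toeplitz key.  The queue indices and hash field
 * mask below are made-up example values, and tunnel is left at 0 so no
 * tunnel offloads are requested.
 *
 *	static struct mlx5_hrxq *
 *	example_tcp4_hrxq(struct rte_eth_dev *dev)
 *	{
 *		const uint16_t queues[] = { 0, 1 };
 *		uint64_t fields = IBV_RX_HASH_SRC_IPV4 |
 *				  IBV_RX_HASH_DST_IPV4 |
 *				  IBV_RX_HASH_SRC_PORT_TCP |
 *				  IBV_RX_HASH_DST_PORT_TCP;
 *
 *		return mlx5_hrxq_new(dev, rss_hash_default_key,
 *				     MLX5_RSS_HASH_KEY_LEN, fields,
 *				     queues, RTE_DIM(queues), 0);
 *	}
 */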
1862 
1863 /**
1864  * Get an Rx Hash queue.
1865  *
1866  * @param dev
1867  *   Pointer to Ethernet device.
 * @param rss_key
 *   RSS key for the Rx hash queue.
 * @param rss_key_len
 *   RSS key length.
 * @param hash_fields
 *   Verbs protocol hash field to make the RSS on.
 * @param queues
 *   Queues entering the hash queue. When hash_fields is empty, only the
 *   first queue index is used for the indirection table.
1873  * @param queues_n
1874  *   Number of queues.
1875  *
1876  * @return
1877  *   A hash Rx queue on success, NULL otherwise.
1878  */
1879 struct mlx5_hrxq *
1880 mlx5_hrxq_get(struct rte_eth_dev *dev,
1881 	      const uint8_t *rss_key, uint32_t rss_key_len,
1882 	      uint64_t hash_fields,
1883 	      const uint16_t *queues, uint32_t queues_n)
1884 {
1885 	struct priv *priv = dev->data->dev_private;
1886 	struct mlx5_hrxq *hrxq;
1887 
1888 	queues_n = hash_fields ? queues_n : 1;
1889 	LIST_FOREACH(hrxq, &priv->hrxqs, next) {
1890 		struct mlx5_ind_table_ibv *ind_tbl;
1891 
1892 		if (hrxq->rss_key_len != rss_key_len)
1893 			continue;
1894 		if (memcmp(hrxq->rss_key, rss_key, rss_key_len))
1895 			continue;
1896 		if (hrxq->hash_fields != hash_fields)
1897 			continue;
1898 		ind_tbl = mlx5_ind_table_ibv_get(dev, queues, queues_n);
1899 		if (!ind_tbl)
1900 			continue;
1901 		if (ind_tbl != hrxq->ind_table) {
1902 			mlx5_ind_table_ibv_release(dev, ind_tbl);
1903 			continue;
1904 		}
1905 		rte_atomic32_inc(&hrxq->refcnt);
1906 		return hrxq;
1907 	}
1908 	return NULL;
1909 }
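
/*
 * Usage sketch (illustrative only): the lookup-or-create pattern a caller
 * would typically follow, where key, key_len, fields, queues and queues_n
 * are placeholders for the caller's RSS parameters.  mlx5_hrxq_get() takes
 * a reference on a match, so each successful lookup or creation must be
 * balanced by a later mlx5_hrxq_release().
 *
 *	hrxq = mlx5_hrxq_get(dev, key, key_len, fields, queues, queues_n);
 *	if (!hrxq)
 *		hrxq = mlx5_hrxq_new(dev, key, key_len, fields,
 *				     queues, queues_n, 0);
 *	if (!hrxq)
 *		return -rte_errno;
 *	(use hrxq->qp, then release it when the user is destroyed)
 *	mlx5_hrxq_release(dev, hrxq);
 */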
1910 
1911 /**
1912  * Release the hash Rx queue.
1913  *
1914  * @param dev
1915  *   Pointer to Ethernet device.
1916  * @param hrxq
1917  *   Pointer to Hash Rx queue to release.
1918  *
1919  * @return
1920  *   1 while a reference on it exists, 0 when freed.
1921  */
1922 int
1923 mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hrxq)
1924 {
1925 	if (rte_atomic32_dec_and_test(&hrxq->refcnt)) {
1926 		claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
1927 		mlx5_ind_table_ibv_release(dev, hrxq->ind_table);
1928 		LIST_REMOVE(hrxq, next);
1929 		rte_free(hrxq);
1930 		return 0;
1931 	}
1932 	claim_nonzero(mlx5_ind_table_ibv_release(dev, hrxq->ind_table));
1933 	return 1;
1934 }
1935 
1936 /**
1937  * Verify the hash Rx queue list is empty.
1938  *
1939  * @param dev
1940  *   Pointer to Ethernet device.
1941  *
1942  * @return
1943  *   The number of objects not released.
1944  */
1945 int
1946 mlx5_hrxq_ibv_verify(struct rte_eth_dev *dev)
1947 {
1948 	struct priv *priv = dev->data->dev_private;
1949 	struct mlx5_hrxq *hrxq;
1950 	int ret = 0;
1951 
1952 	LIST_FOREACH(hrxq, &priv->hrxqs, next) {
1953 		DRV_LOG(DEBUG,
1954 			"port %u Verbs hash Rx queue %p still referenced",
1955 			dev->data->port_id, (void *)hrxq);
1956 		++ret;
1957 	}
1958 	return ret;
1959 }
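
/*
 * Usage sketch (illustrative only): the verify helpers are meant for
 * teardown paths such as device close, where any non-zero count means
 * Verbs objects were leaked.  A hypothetical caller could aggregate the
 * counters as follows.
 *
 *	int leaked = 0;
 *
 *	leaked += mlx5_hrxq_ibv_verify(dev);
 *	leaked += mlx5_ind_table_ibv_verify(dev);
 *	if (leaked)
 *		DRV_LOG(WARNING,
 *			"port %u %d Rx objects still referenced",
 *			dev->data->port_id, leaked);
 */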
1960 
1961 /**
1962  * Create a drop Rx queue Verbs object.
1963  *
1964  * @param dev
1965  *   Pointer to Ethernet device.
1966  *
1967  * @return
1968  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
1969  */
1970 struct mlx5_rxq_ibv *
1971 mlx5_rxq_ibv_drop_new(struct rte_eth_dev *dev)
1972 {
1973 	struct priv *priv = dev->data->dev_private;
1974 	struct ibv_cq *cq;
1975 	struct ibv_wq *wq = NULL;
1976 	struct mlx5_rxq_ibv *rxq;
1977 
1978 	if (priv->drop_queue.rxq)
1979 		return priv->drop_queue.rxq;
1980 	cq = mlx5_glue->create_cq(priv->ctx, 1, NULL, NULL, 0);
1981 	if (!cq) {
1982 		DEBUG("port %u cannot allocate CQ for drop queue",
1983 		      dev->data->port_id);
1984 		rte_errno = errno;
1985 		goto error;
1986 	}
1987 	wq = mlx5_glue->create_wq(priv->ctx,
1988 		 &(struct ibv_wq_init_attr){
1989 			.wq_type = IBV_WQT_RQ,
1990 			.max_wr = 1,
1991 			.max_sge = 1,
1992 			.pd = priv->pd,
1993 			.cq = cq,
1994 		 });
1995 	if (!wq) {
1996 		DEBUG("port %u cannot allocate WQ for drop queue",
1997 		      dev->data->port_id);
1998 		rte_errno = errno;
1999 		goto error;
2000 	}
2001 	rxq = rte_calloc(__func__, 1, sizeof(*rxq), 0);
2002 	if (!rxq) {
2003 		DEBUG("port %u cannot allocate drop Rx queue memory",
2004 		      dev->data->port_id);
2005 		rte_errno = ENOMEM;
2006 		goto error;
2007 	}
2008 	rxq->cq = cq;
2009 	rxq->wq = wq;
2010 	priv->drop_queue.rxq = rxq;
2011 	return rxq;
2012 error:
2013 	if (wq)
2014 		claim_zero(mlx5_glue->destroy_wq(wq));
2015 	if (cq)
2016 		claim_zero(mlx5_glue->destroy_cq(cq));
2017 	return NULL;
2018 }
2019 
2020 /**
2021  * Release a drop Rx queue Verbs object.
2022  *
2023  * @param dev
2024  *   Pointer to Ethernet device.
2028  */
2029 void
2030 mlx5_rxq_ibv_drop_release(struct rte_eth_dev *dev)
2031 {
2032 	struct priv *priv = dev->data->dev_private;
2033 	struct mlx5_rxq_ibv *rxq = priv->drop_queue.rxq;
2034 
2035 	if (rxq->wq)
2036 		claim_zero(mlx5_glue->destroy_wq(rxq->wq));
2037 	if (rxq->cq)
2038 		claim_zero(mlx5_glue->destroy_cq(rxq->cq));
2039 	rte_free(rxq);
2040 	priv->drop_queue.rxq = NULL;
2041 }
2042 
2043 /**
2044  * Create a drop indirection table.
2045  *
2046  * @param dev
2047  *   Pointer to Ethernet device.
2048  *
2049  * @return
2050  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
2051  */
2052 struct mlx5_ind_table_ibv *
2053 mlx5_ind_table_ibv_drop_new(struct rte_eth_dev *dev)
2054 {
2055 	struct priv *priv = dev->data->dev_private;
2056 	struct mlx5_ind_table_ibv *ind_tbl;
2057 	struct mlx5_rxq_ibv *rxq;
2058 	struct mlx5_ind_table_ibv tmpl;
2059 
2060 	rxq = mlx5_rxq_ibv_drop_new(dev);
2061 	if (!rxq)
2062 		return NULL;
2063 	tmpl.ind_table = mlx5_glue->create_rwq_ind_table
2064 		(priv->ctx,
2065 		 &(struct ibv_rwq_ind_table_init_attr){
2066 			.log_ind_tbl_size = 0,
2067 			.ind_tbl = &rxq->wq,
2068 			.comp_mask = 0,
2069 		 });
2070 	if (!tmpl.ind_table) {
2071 		DEBUG("port %u cannot allocate indirection table for drop"
2072 		      " queue",
2073 		      dev->data->port_id);
2074 		rte_errno = errno;
2075 		goto error;
2076 	}
2077 	ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl), 0);
2078 	if (!ind_tbl) {
2079 		rte_errno = ENOMEM;
2080 		goto error;
2081 	}
2082 	ind_tbl->ind_table = tmpl.ind_table;
2083 	return ind_tbl;
error:
	/* Destroy the Verbs indirection table if it was created. */
	if (tmpl.ind_table)
		claim_zero(mlx5_glue->destroy_rwq_ind_table(tmpl.ind_table));
	mlx5_rxq_ibv_drop_release(dev);
	return NULL;
2087 }
2088 
2089 /**
2090  * Release a drop indirection table.
2091  *
2092  * @param dev
2093  *   Pointer to Ethernet device.
2094  */
2095 void
2096 mlx5_ind_table_ibv_drop_release(struct rte_eth_dev *dev)
2097 {
2098 	struct priv *priv = dev->data->dev_private;
2099 	struct mlx5_ind_table_ibv *ind_tbl = priv->drop_queue.hrxq->ind_table;
2100 
2101 	claim_zero(mlx5_glue->destroy_rwq_ind_table(ind_tbl->ind_table));
2102 	mlx5_rxq_ibv_drop_release(dev);
2103 	rte_free(ind_tbl);
2104 	priv->drop_queue.hrxq->ind_table = NULL;
2105 }
2106 
2107 /**
2108  * Create a drop Rx Hash queue.
2109  *
2110  * @param dev
2111  *   Pointer to Ethernet device.
2112  *
2113  * @return
2114  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
2115  */
2116 struct mlx5_hrxq *
2117 mlx5_hrxq_drop_new(struct rte_eth_dev *dev)
2118 {
2119 	struct priv *priv = dev->data->dev_private;
2120 	struct mlx5_ind_table_ibv *ind_tbl;
2121 	struct ibv_qp *qp;
2122 	struct mlx5_hrxq *hrxq;
2123 
2124 	if (priv->drop_queue.hrxq) {
2125 		rte_atomic32_inc(&priv->drop_queue.hrxq->refcnt);
2126 		return priv->drop_queue.hrxq;
2127 	}
2128 	ind_tbl = mlx5_ind_table_ibv_drop_new(dev);
2129 	if (!ind_tbl)
2130 		return NULL;
2131 	qp = mlx5_glue->create_qp_ex(priv->ctx,
2132 		 &(struct ibv_qp_init_attr_ex){
2133 			.qp_type = IBV_QPT_RAW_PACKET,
2134 			.comp_mask =
2135 				IBV_QP_INIT_ATTR_PD |
2136 				IBV_QP_INIT_ATTR_IND_TABLE |
2137 				IBV_QP_INIT_ATTR_RX_HASH,
2138 			.rx_hash_conf = (struct ibv_rx_hash_conf){
2139 				.rx_hash_function =
2140 					IBV_RX_HASH_FUNC_TOEPLITZ,
2141 				.rx_hash_key_len = MLX5_RSS_HASH_KEY_LEN,
2142 				.rx_hash_key = rss_hash_default_key,
2143 				.rx_hash_fields_mask = 0,
2144 				},
2145 			.rwq_ind_tbl = ind_tbl->ind_table,
2146 			.pd = priv->pd
2147 		 });
2148 	if (!qp) {
2149 		DEBUG("port %u cannot allocate QP for drop queue",
2150 		      dev->data->port_id);
2151 		rte_errno = errno;
2152 		goto error;
2153 	}
2154 	hrxq = rte_calloc(__func__, 1, sizeof(*hrxq), 0);
2155 	if (!hrxq) {
2156 		DRV_LOG(WARNING,
2157 			"port %u cannot allocate memory for drop queue",
2158 			dev->data->port_id);
2159 		rte_errno = ENOMEM;
2160 		goto error;
2161 	}
2162 	hrxq->ind_table = ind_tbl;
2163 	hrxq->qp = qp;
2164 	priv->drop_queue.hrxq = hrxq;
2165 	rte_atomic32_set(&hrxq->refcnt, 1);
2166 	return hrxq;
error:
	/* Destroy the QP if it was created before the allocation failed. */
	if (qp)
		claim_zero(mlx5_glue->destroy_qp(qp));
	if (ind_tbl)
		mlx5_ind_table_ibv_drop_release(dev);
	return NULL;
2171 }
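
/*
 * Usage sketch (illustrative only): the drop hash Rx queue is reference
 * counted, so a hypothetical caller needing a QP to attach drop flow rules
 * to would pair creation and release as shown below.
 *
 *	struct mlx5_hrxq *drop = mlx5_hrxq_drop_new(dev);
 *
 *	if (!drop)
 *		return -rte_errno;
 *	(create flow rules targeting drop->qp)
 *	mlx5_hrxq_drop_release(dev);
 */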
2172 
2173 /**
2174  * Release a drop hash Rx queue.
2175  *
2176  * @param dev
2177  *   Pointer to Ethernet device.
2178  */
2179 void
2180 mlx5_hrxq_drop_release(struct rte_eth_dev *dev)
2181 {
2182 	struct priv *priv = dev->data->dev_private;
2183 	struct mlx5_hrxq *hrxq = priv->drop_queue.hrxq;
2184 
2185 	if (rte_atomic32_dec_and_test(&hrxq->refcnt)) {
2186 		claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
2187 		mlx5_ind_table_ibv_drop_release(dev);
2188 		rte_free(hrxq);
2189 		priv->drop_queue.hrxq = NULL;
2190 	}
2191 }
2192