xref: /dpdk/drivers/net/mlx5/mlx5_rxq.c (revision 89f0711f9ddfb5822da9d34f384b92f72a61c4dc)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <stddef.h>
35 #include <assert.h>
36 #include <errno.h>
37 #include <string.h>
38 #include <stdint.h>
39 #include <fcntl.h>
40 #include <sys/queue.h>
41 
42 /* Verbs header. */
43 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
44 #ifdef PEDANTIC
45 #pragma GCC diagnostic ignored "-Wpedantic"
46 #endif
47 #include <infiniband/verbs.h>
48 #include <infiniband/mlx5dv.h>
49 #ifdef PEDANTIC
50 #pragma GCC diagnostic error "-Wpedantic"
51 #endif
52 
53 #include <rte_mbuf.h>
54 #include <rte_malloc.h>
55 #include <rte_ethdev_driver.h>
56 #include <rte_common.h>
57 #include <rte_interrupts.h>
58 #include <rte_debug.h>
59 #include <rte_io.h>
60 
61 #include "mlx5.h"
62 #include "mlx5_rxtx.h"
63 #include "mlx5_utils.h"
64 #include "mlx5_autoconf.h"
65 #include "mlx5_defs.h"
66 
67 /* Default RSS hash key also used for ConnectX-3. */
68 uint8_t rss_hash_default_key[] = {
69 	0x2c, 0xc6, 0x81, 0xd1,
70 	0x5b, 0xdb, 0xf4, 0xf7,
71 	0xfc, 0xa2, 0x83, 0x19,
72 	0xdb, 0x1a, 0x3e, 0x94,
73 	0x6b, 0x9e, 0x38, 0xd9,
74 	0x2c, 0x9c, 0x03, 0xd1,
75 	0xad, 0x99, 0x44, 0xa7,
76 	0xd9, 0x56, 0x3d, 0x59,
77 	0x06, 0x3c, 0x25, 0xf3,
78 	0xfc, 0x1f, 0xdc, 0x2a,
79 };
80 
81 /* Length of the default RSS hash key. */
82 const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
83 
84 /**
85  * Allocate RX queue elements.
86  *
87  * @param rxq_ctrl
88  *   Pointer to RX queue structure.
89  *
90  * @return
91  *   0 on success, errno value on failure.
92  */
93 int
94 rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
95 {
96 	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
97 	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
98 	unsigned int i;
99 	int ret = 0;
100 
101 	/* Iterate on segments. */
102 	for (i = 0; (i != elts_n); ++i) {
103 		struct rte_mbuf *buf;
104 
105 		buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
106 		if (buf == NULL) {
107 			ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
108 			ret = ENOMEM;
109 			goto error;
110 		}
111 		/* Headroom is reserved by rte_pktmbuf_alloc(). */
112 		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
113 		/* Buffer is supposed to be empty. */
114 		assert(rte_pktmbuf_data_len(buf) == 0);
115 		assert(rte_pktmbuf_pkt_len(buf) == 0);
116 		assert(!buf->next);
117 		/* Only the first segment keeps headroom. */
118 		if (i % sges_n)
119 			SET_DATA_OFF(buf, 0);
120 		PORT(buf) = rxq_ctrl->rxq.port_id;
121 		DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
122 		PKT_LEN(buf) = DATA_LEN(buf);
123 		NB_SEGS(buf) = 1;
124 		(*rxq_ctrl->rxq.elts)[i] = buf;
125 	}
126 	/* If Rx vector is activated. */
127 	if (rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
128 		struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
129 		struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
130 		int j;
131 
132 		/* Initialize default rearm_data for vPMD. */
133 		mbuf_init->data_off = RTE_PKTMBUF_HEADROOM;
134 		rte_mbuf_refcnt_set(mbuf_init, 1);
135 		mbuf_init->nb_segs = 1;
136 		mbuf_init->port = rxq->port_id;
137 		/*
138 		 * prevent compiler reordering:
139 		 * rearm_data covers previous fields.
140 		 */
141 		rte_compiler_barrier();
142 		rxq->mbuf_initializer =
143 			*(uint64_t *)&mbuf_init->rearm_data;
144 		/* Padding with a fake mbuf for vectorized Rx. */
145 		for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
146 			(*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
147 	}
148 	DEBUG("%p: allocated and configured %u segments (max %u packets)",
149 	      (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
150 	assert(ret == 0);
151 	return 0;
152 error:
153 	elts_n = i;
154 	for (i = 0; (i != elts_n); ++i) {
155 		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
156 			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
157 		(*rxq_ctrl->rxq.elts)[i] = NULL;
158 	}
159 	DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
160 	assert(ret > 0);
161 	return ret;
162 }
163 
164 /**
165  * Free RX queue elements.
166  *
167  * @param rxq_ctrl
168  *   Pointer to RX queue structure.
169  */
170 static void
171 rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
172 {
173 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
174 	const uint16_t q_n = (1 << rxq->elts_n);
175 	const uint16_t q_mask = q_n - 1;
176 	uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
177 	uint16_t i;
178 
179 	DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
180 	if (rxq->elts == NULL)
181 		return;
182 	/**
183 	 * Some mbufs in the ring belong to the application; they cannot be
184 	 * freed.
185 	 */
186 	if (rxq_check_vec_support(rxq) > 0) {
187 		for (i = 0; i < used; ++i)
188 			(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
189 		rxq->rq_pi = rxq->rq_ci;
190 	}
191 	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
192 		if ((*rxq->elts)[i] != NULL)
193 			rte_pktmbuf_free_seg((*rxq->elts)[i]);
194 		(*rxq->elts)[i] = NULL;
195 	}
196 }
197 
198 /**
199  * Clean up a RX queue.
200  *
201  * Destroy objects, free allocated memory and reset the structure for reuse.
202  *
203  * @param rxq_ctrl
204  *   Pointer to RX queue structure.
205  */
206 void
207 mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl)
208 {
209 	DEBUG("cleaning up %p", (void *)rxq_ctrl);
210 	if (rxq_ctrl->ibv)
211 		mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv);
212 	memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
213 }
214 
215 /**
216  * Returns the per-queue supported offloads.
217  *
218  * @param priv
219  *   Pointer to private structure.
220  *
221  * @return
222  *   Supported Rx offloads.
223  */
224 uint64_t
225 mlx5_priv_get_rx_queue_offloads(struct priv *priv)
226 {
227 	struct mlx5_dev_config *config = &priv->config;
228 	uint64_t offloads = (DEV_RX_OFFLOAD_SCATTER |
229 			     DEV_RX_OFFLOAD_TIMESTAMP |
230 			     DEV_RX_OFFLOAD_JUMBO_FRAME);
231 
232 	if (config->hw_fcs_strip)
233 		offloads |= DEV_RX_OFFLOAD_CRC_STRIP;
234 	if (config->hw_csum)
235 		offloads |= (DEV_RX_OFFLOAD_IPV4_CKSUM |
236 			     DEV_RX_OFFLOAD_UDP_CKSUM |
237 			     DEV_RX_OFFLOAD_TCP_CKSUM);
238 	if (config->hw_vlan_strip)
239 		offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
240 	return offloads;
241 }
242 
243 
244 /**
245  * Returns the per-port supported offloads.
246  *
247  * @param priv
248  *   Pointer to private structure.
249  * @return
250  *   Supported Rx offloads.
251  */
252 uint64_t
253 mlx5_priv_get_rx_port_offloads(struct priv *priv __rte_unused)
254 {
255 	uint64_t offloads = DEV_RX_OFFLOAD_VLAN_FILTER;
256 
257 	return offloads;
258 }
259 
260 /**
261  * Checks if the per-queue offload configuration is valid.
262  *
263  * @param priv
264  *   Pointer to private structure.
265  * @param offloads
266  *   Per-queue offloads configuration.
267  *
268  * @return
269  *   1 if the configuration is valid, 0 otherwise.
270  */
271 static int
272 priv_is_rx_queue_offloads_allowed(struct priv *priv, uint64_t offloads)
273 {
274 	uint64_t port_offloads = priv->dev->data->dev_conf.rxmode.offloads;
275 	uint64_t queue_supp_offloads =
276 		mlx5_priv_get_rx_queue_offloads(priv);
277 	uint64_t port_supp_offloads = mlx5_priv_get_rx_port_offloads(priv);
278 
279 	if ((offloads & (queue_supp_offloads | port_supp_offloads)) !=
280 	    offloads)
281 		return 0;
282 	if (((port_offloads ^ offloads) & port_supp_offloads))
283 		return 0;
284 	return 1;
285 }
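
/*
 * Editorial illustration (hypothetical values): with DEV_RX_OFFLOAD_VLAN_FILTER
 * absent from the port configuration, a queue requesting it is rejected by the
 * second check above because port-level offloads must match the port
 * configuration bit for bit; a queue requesting only queue-level offloads such
 * as DEV_RX_OFFLOAD_SCATTER passes both checks.
 */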
286 
287 /**
288  * DPDK callback to configure a RX queue.
289  * @param dev
290  *   Pointer to Ethernet device structure.
291  * @param idx
292  *   RX queue index.
293  * @param desc
294  *   Number of descriptors to configure in queue.
295  * @param socket
296  *   NUMA socket on which memory must be allocated.
297  * @param[in] conf
298  *   Thresholds parameters.
299  * @param mp
300  *   Memory pool for buffer allocations.
301  *
302  * @return
303  *   0 on success, negative errno value on failure.
304  */
305 int
306 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
307 		    unsigned int socket, const struct rte_eth_rxconf *conf,
308 		    struct rte_mempool *mp)
309 {
310 	struct priv *priv = dev->data->dev_private;
311 	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
312 	struct mlx5_rxq_ctrl *rxq_ctrl =
313 		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
314 	int ret = 0;
315 
316 	priv_lock(priv);
317 	if (!rte_is_power_of_2(desc)) {
318 		desc = 1 << log2above(desc);
319 		WARN("%p: increased number of descriptors in RX queue %u"
320 		     " to the next power of two (%d)",
321 		     (void *)dev, idx, desc);
322 	}
323 	DEBUG("%p: configuring queue %u for %u descriptors",
324 	      (void *)dev, idx, desc);
325 	if (idx >= priv->rxqs_n) {
326 		ERROR("%p: queue index out of range (%u >= %u)",
327 		      (void *)dev, idx, priv->rxqs_n);
328 		priv_unlock(priv);
329 		return -EOVERFLOW;
330 	}
331 	if (!priv_is_rx_queue_offloads_allowed(priv, conf->offloads)) {
332 		ret = ENOTSUP;
333 		ERROR("%p: Rx queue offloads 0x%" PRIx64 " don't match port "
334 		      "offloads 0x%" PRIx64 " or supported offloads 0x%" PRIx64,
335 		      (void *)dev, conf->offloads,
336 		      dev->data->dev_conf.rxmode.offloads,
337 		      (mlx5_priv_get_rx_port_offloads(priv) |
338 		       mlx5_priv_get_rx_queue_offloads(priv)));
339 		goto out;
340 	}
341 	if (!mlx5_priv_rxq_releasable(priv, idx)) {
342 		ret = EBUSY;
343 		ERROR("%p: unable to release queue index %u",
344 		      (void *)dev, idx);
345 		goto out;
346 	}
347 	mlx5_priv_rxq_release(priv, idx);
348 	rxq_ctrl = mlx5_priv_rxq_new(priv, idx, desc, socket, conf, mp);
349 	if (!rxq_ctrl) {
350 		ERROR("%p: unable to allocate queue index %u",
351 		      (void *)dev, idx);
352 		ret = ENOMEM;
353 		goto out;
354 	}
355 	DEBUG("%p: adding RX queue %p to list",
356 	      (void *)dev, (void *)rxq_ctrl);
357 	(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
358 out:
359 	priv_unlock(priv);
360 	return -ret;
361 }
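
/*
 * Usage sketch (editorial, hypothetical application values): this callback is
 * reached through the generic ethdev API, for instance:
 *
 *	struct rte_eth_rxconf rx_conf = dev_info.default_rxconf;
 *
 *	rx_conf.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
 *	ret = rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
 *				     &rx_conf, mbuf_pool);
 *
 * dev_info, port_id, ret and mbuf_pool are assumed to come from the
 * application.
 */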
362 
363 /**
364  * DPDK callback to release a RX queue.
365  *
366  * @param dpdk_rxq
367  *   Generic RX queue pointer.
368  */
369 void
370 mlx5_rx_queue_release(void *dpdk_rxq)
371 {
372 	struct mlx5_rxq_data *rxq = (struct mlx5_rxq_data *)dpdk_rxq;
373 	struct mlx5_rxq_ctrl *rxq_ctrl;
374 	struct priv *priv;
375 
376 	if (rxq == NULL)
377 		return;
378 	rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
379 	priv = rxq_ctrl->priv;
380 	priv_lock(priv);
381 	if (!mlx5_priv_rxq_releasable(priv, rxq_ctrl->rxq.stats.idx))
382 		rte_panic("Rx queue %p is still used by a flow and cannot be"
383 			  " removed\n", (void *)rxq_ctrl);
384 	mlx5_priv_rxq_release(priv, rxq_ctrl->rxq.stats.idx);
385 	priv_unlock(priv);
386 }
387 
388 /**
389  * Allocate queue vector and fill epoll fd list for Rx interrupts.
390  *
391  * @param priv
392  *   Pointer to private structure.
393  *
394  * @return
395  *   0 on success, negative on failure.
396  */
397 int
398 priv_rx_intr_vec_enable(struct priv *priv)
399 {
400 	unsigned int i;
401 	unsigned int rxqs_n = priv->rxqs_n;
402 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
403 	unsigned int count = 0;
404 	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
405 
406 	if (!priv->dev->data->dev_conf.intr_conf.rxq)
407 		return 0;
408 	priv_rx_intr_vec_disable(priv);
409 	intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0]));
410 	if (intr_handle->intr_vec == NULL) {
411 		ERROR("failed to allocate memory for interrupt vector,"
412 		      " Rx interrupts will not be supported");
413 		return -ENOMEM;
414 	}
415 	intr_handle->type = RTE_INTR_HANDLE_EXT;
416 	for (i = 0; i != n; ++i) {
417 		/* This rxq ibv must not be released in this function. */
418 		struct mlx5_rxq_ibv *rxq_ibv = mlx5_priv_rxq_ibv_get(priv, i);
419 		int fd;
420 		int flags;
421 		int rc;
422 
423 		/* Skip queues that cannot request interrupts. */
424 		if (!rxq_ibv || !rxq_ibv->channel) {
425 			/* Use invalid intr_vec[] index to disable entry. */
426 			intr_handle->intr_vec[i] =
427 				RTE_INTR_VEC_RXTX_OFFSET +
428 				RTE_MAX_RXTX_INTR_VEC_ID;
429 			continue;
430 		}
431 		if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
432 			ERROR("too many Rx queues for interrupt vector size"
433 			      " (%d), Rx interrupts cannot be enabled",
434 			      RTE_MAX_RXTX_INTR_VEC_ID);
435 			priv_rx_intr_vec_disable(priv);
436 			return -1;
437 		}
438 		fd = rxq_ibv->channel->fd;
439 		flags = fcntl(fd, F_GETFL);
440 		rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
441 		if (rc < 0) {
442 			ERROR("failed to make Rx interrupt file descriptor"
443 			      " %d non-blocking for queue index %d", fd, i);
444 			priv_rx_intr_vec_disable(priv);
445 			return -1;
446 		}
447 		intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
448 		intr_handle->efds[count] = fd;
449 		count++;
450 	}
451 	if (!count)
452 		priv_rx_intr_vec_disable(priv);
453 	else
454 		intr_handle->nb_efd = count;
455 	return 0;
456 }
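
/*
 * Editorial note: the efds[] filled above feed the generic Rx interrupt API;
 * a typical (hypothetical) application sequence is:
 *
 *	rte_eth_dev_rx_intr_ctl_q(port_id, queue_id, RTE_EPOLL_PER_THREAD,
 *				  RTE_INTR_EVENT_ADD, NULL);
 *	rte_eth_dev_rx_intr_enable(port_id, queue_id);
 *	rte_epoll_wait(RTE_EPOLL_PER_THREAD, &event, 1, timeout_ms);
 *
 * port_id, queue_id, event and timeout_ms are application variables.
 */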
457 
458 /**
459  * Clean up the Rx interrupt handler.
460  *
461  * @param priv
462  *   Pointer to private structure.
463  */
464 void
465 priv_rx_intr_vec_disable(struct priv *priv)
466 {
467 	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
468 	unsigned int i;
469 	unsigned int rxqs_n = priv->rxqs_n;
470 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
471 
472 	if (!priv->dev->data->dev_conf.intr_conf.rxq)
473 		return;
474 	if (!intr_handle->intr_vec)
475 		goto free;
476 	for (i = 0; i != n; ++i) {
477 		struct mlx5_rxq_ctrl *rxq_ctrl;
478 		struct mlx5_rxq_data *rxq_data;
479 
480 		if (intr_handle->intr_vec[i] == RTE_INTR_VEC_RXTX_OFFSET +
481 		    RTE_MAX_RXTX_INTR_VEC_ID)
482 			continue;
483 		/**
484 		 * Need to access the queue directly to release the reference
485 		 * kept in priv_rx_intr_vec_enable().
486 		 */
487 		rxq_data = (*priv->rxqs)[i];
488 		rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
489 		mlx5_priv_rxq_ibv_release(priv, rxq_ctrl->ibv);
490 	}
491 free:
492 	rte_intr_free_epoll_fd(intr_handle);
493 	if (intr_handle->intr_vec)
494 		free(intr_handle->intr_vec);
495 	intr_handle->nb_efd = 0;
496 	intr_handle->intr_vec = NULL;
497 }
498 
499 /**
500  * MLX5 CQ notification.
501  *
502  * @param rxq
503  *   Pointer to receive queue structure.
504  * @param sq_n_rxq
505  *   Sequence number per receive queue.
506  */
507 static inline void
508 mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq)
509 {
510 	int sq_n = 0;
511 	uint32_t doorbell_hi;
512 	uint64_t doorbell;
513 	void *cq_db_reg = (char *)rxq->cq_uar + MLX5_CQ_DOORBELL;
514 
515 	sq_n = sq_n_rxq & MLX5_CQ_SQN_MASK;
516 	doorbell_hi = sq_n << MLX5_CQ_SQN_OFFSET | (rxq->cq_ci & MLX5_CI_MASK);
517 	doorbell = (uint64_t)doorbell_hi << 32;
518 	doorbell |=  rxq->cqn;
519 	rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
520 	rte_write64(rte_cpu_to_be_64(doorbell), cq_db_reg);
521 }
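
/*
 * Editorial note: mlx5_rx_intr_enable() below calls this helper with
 * rxq->cq_arm_sn to request a single completion event; mlx5_rx_intr_disable()
 * increments cq_arm_sn once the event has been acknowledged, so the next arm
 * request is seen as new by the hardware. The 64-bit doorbell written to the
 * UAR register combines the arm sequence number, the CQ consumer index and
 * the CQ number.
 */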
522 
523 /**
524  * DPDK callback for Rx queue interrupt enable.
525  *
526  * @param dev
527  *   Pointer to Ethernet device structure.
528  * @param rx_queue_id
529  *   Rx queue number.
530  *
531  * @return
532  *   0 on success, negative on failure.
533  */
534 int
535 mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
536 {
537 	struct priv *priv = dev->data->dev_private;
538 	struct mlx5_rxq_data *rxq_data;
539 	struct mlx5_rxq_ctrl *rxq_ctrl;
540 	int ret = 0;
541 
542 	priv_lock(priv);
543 	rxq_data = (*priv->rxqs)[rx_queue_id];
544 	if (!rxq_data) {
545 		ret = EINVAL;
546 		goto exit;
547 	}
548 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
549 	if (rxq_ctrl->irq) {
550 		struct mlx5_rxq_ibv *rxq_ibv;
551 
552 		rxq_ibv = mlx5_priv_rxq_ibv_get(priv, rx_queue_id);
553 		if (!rxq_ibv) {
554 			ret = EINVAL;
555 			goto exit;
556 		}
557 		mlx5_arm_cq(rxq_data, rxq_data->cq_arm_sn);
558 		mlx5_priv_rxq_ibv_release(priv, rxq_ibv);
559 	}
560 exit:
561 	priv_unlock(priv);
562 	if (ret)
563 		WARN("unable to arm interrupt on rx queue %d", rx_queue_id);
564 	return -ret;
565 }
566 
567 /**
568  * DPDK callback for Rx queue interrupt disable.
569  *
570  * @param dev
571  *   Pointer to Ethernet device structure.
572  * @param rx_queue_id
573  *   Rx queue number.
574  *
575  * @return
576  *   0 on success, negative on failure.
577  */
578 int
579 mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
580 {
581 	struct priv *priv = dev->data->dev_private;
582 	struct mlx5_rxq_data *rxq_data;
583 	struct mlx5_rxq_ctrl *rxq_ctrl;
584 	struct mlx5_rxq_ibv *rxq_ibv = NULL;
585 	struct ibv_cq *ev_cq;
586 	void *ev_ctx;
587 	int ret = 0;
588 
589 	priv_lock(priv);
590 	rxq_data = (*priv->rxqs)[rx_queue_id];
591 	if (!rxq_data) {
592 		ret = EINVAL;
593 		goto exit;
594 	}
595 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
596 	if (!rxq_ctrl->irq)
597 		goto exit;
598 	rxq_ibv = mlx5_priv_rxq_ibv_get(priv, rx_queue_id);
599 	if (!rxq_ibv) {
600 		ret = EINVAL;
601 		goto exit;
602 	}
603 	ret = ibv_get_cq_event(rxq_ibv->channel, &ev_cq, &ev_ctx);
604 	if (ret || ev_cq != rxq_ibv->cq) {
605 		ret = EINVAL;
606 		goto exit;
607 	}
608 	rxq_data->cq_arm_sn++;
609 	ibv_ack_cq_events(rxq_ibv->cq, 1);
610 exit:
611 	if (rxq_ibv)
612 		mlx5_priv_rxq_ibv_release(priv, rxq_ibv);
613 	priv_unlock(priv);
614 	if (ret)
615 		WARN("unable to disable interrupt on rx queue %d",
616 		     rx_queue_id);
617 	return -ret;
618 }
619 
620 /**
621  * Create the Rx queue Verbs object.
622  *
623  * @param priv
624  *   Pointer to private structure.
625  * @param idx
626  *   Queue index in DPDK Rx queue array.
627  *
628  * @return
629  *   The Verbs object initialised if it can be created.
630  */
631 struct mlx5_rxq_ibv*
632 mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx)
633 {
634 	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
635 	struct mlx5_rxq_ctrl *rxq_ctrl =
636 		container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
637 	struct ibv_wq_attr mod;
638 	union {
639 		struct {
640 			struct ibv_cq_init_attr_ex ibv;
641 			struct mlx5dv_cq_init_attr mlx5;
642 		} cq;
643 		struct ibv_wq_init_attr wq;
644 		struct ibv_cq_ex cq_attr;
645 	} attr;
646 	unsigned int cqe_n = (1 << rxq_data->elts_n) - 1;
647 	struct mlx5_rxq_ibv *tmpl;
648 	struct mlx5dv_cq cq_info;
649 	struct mlx5dv_rwq rwq;
650 	unsigned int i;
651 	int ret = 0;
652 	struct mlx5dv_obj obj;
653 	struct mlx5_dev_config *config = &priv->config;
654 
655 	assert(rxq_data);
656 	assert(!rxq_ctrl->ibv);
657 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_RX_QUEUE;
658 	priv->verbs_alloc_ctx.obj = rxq_ctrl;
659 	tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
660 				 rxq_ctrl->socket);
661 	if (!tmpl) {
662 		ERROR("%p: cannot allocate verbs resources",
663 		       (void *)rxq_ctrl);
664 		goto error;
665 	}
666 	tmpl->rxq_ctrl = rxq_ctrl;
667 	/* Use the entire RX mempool as the memory region. */
668 	tmpl->mr = priv_mr_get(priv, rxq_data->mp);
669 	if (!tmpl->mr) {
670 		tmpl->mr = priv_mr_new(priv, rxq_data->mp);
671 		if (!tmpl->mr) {
672 			ERROR("%p: MR creation failure", (void *)rxq_ctrl);
673 			goto error;
674 		}
675 	}
676 	if (rxq_ctrl->irq) {
677 		tmpl->channel = ibv_create_comp_channel(priv->ctx);
678 		if (!tmpl->channel) {
679 			ERROR("%p: Comp Channel creation failure",
680 			      (void *)rxq_ctrl);
681 			goto error;
682 		}
683 	}
684 	attr.cq.ibv = (struct ibv_cq_init_attr_ex){
685 		.cqe = cqe_n,
686 		.channel = tmpl->channel,
687 		.comp_mask = 0,
688 	};
689 	attr.cq.mlx5 = (struct mlx5dv_cq_init_attr){
690 		.comp_mask = 0,
691 	};
692 	if (config->cqe_comp && !rxq_data->hw_timestamp) {
693 		attr.cq.mlx5.comp_mask |=
694 			MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
695 		attr.cq.mlx5.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
696 		/*
697 		 * For vectorized Rx, it must not be doubled in order to
698 		 * make cq_ci and rq_ci aligned.
699 		 */
700 		if (rxq_check_vec_support(rxq_data) < 0)
701 			attr.cq.ibv.cqe *= 2;
702 	} else if (config->cqe_comp && rxq_data->hw_timestamp) {
703 		DEBUG("Rx CQE compression is disabled for HW timestamp");
704 	}
705 	tmpl->cq = ibv_cq_ex_to_cq(mlx5dv_create_cq(priv->ctx, &attr.cq.ibv,
706 						    &attr.cq.mlx5));
707 	if (tmpl->cq == NULL) {
708 		ERROR("%p: CQ creation failure", (void *)rxq_ctrl);
709 		goto error;
710 	}
711 	DEBUG("priv->device_attr.max_qp_wr is %d",
712 	      priv->device_attr.orig_attr.max_qp_wr);
713 	DEBUG("priv->device_attr.max_sge is %d",
714 	      priv->device_attr.orig_attr.max_sge);
715 	attr.wq = (struct ibv_wq_init_attr){
716 		.wq_context = NULL, /* Could be useful in the future. */
717 		.wq_type = IBV_WQT_RQ,
718 		/* Max number of outstanding WRs. */
719 		.max_wr = (1 << rxq_data->elts_n) >> rxq_data->sges_n,
720 		/* Max number of scatter/gather elements in a WR. */
721 		.max_sge = 1 << rxq_data->sges_n,
722 		.pd = priv->pd,
723 		.cq = tmpl->cq,
724 		.comp_mask =
725 			IBV_WQ_FLAGS_CVLAN_STRIPPING |
726 			0,
727 		.create_flags = (rxq_data->vlan_strip ?
728 				 IBV_WQ_FLAGS_CVLAN_STRIPPING :
729 				 0),
730 	};
731 	/* By default, FCS (CRC) is stripped by hardware. */
732 	if (rxq_data->crc_present) {
733 		attr.wq.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
734 		attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
735 	}
736 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
737 	if (config->hw_padding) {
738 		attr.wq.create_flags |= IBV_WQ_FLAG_RX_END_PADDING;
739 		attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
740 	}
741 #endif
742 	tmpl->wq = ibv_create_wq(priv->ctx, &attr.wq);
743 	if (tmpl->wq == NULL) {
744 		ERROR("%p: WQ creation failure", (void *)rxq_ctrl);
745 		goto error;
746 	}
747 	/*
748 	 * Make sure the number of WRs*SGEs matches expectations since a queue
749 	 * cannot allocate more than "desc" buffers.
750 	 */
751 	if (((int)attr.wq.max_wr !=
752 	     ((1 << rxq_data->elts_n) >> rxq_data->sges_n)) ||
753 	    ((int)attr.wq.max_sge != (1 << rxq_data->sges_n))) {
754 		ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
755 		      (void *)rxq_ctrl,
756 		      ((1 << rxq_data->elts_n) >> rxq_data->sges_n),
757 		      (1 << rxq_data->sges_n),
758 		      attr.wq.max_wr, attr.wq.max_sge);
759 		goto error;
760 	}
761 	/* Change queue state to ready. */
762 	mod = (struct ibv_wq_attr){
763 		.attr_mask = IBV_WQ_ATTR_STATE,
764 		.wq_state = IBV_WQS_RDY,
765 	};
766 	ret = ibv_modify_wq(tmpl->wq, &mod);
767 	if (ret) {
768 		ERROR("%p: WQ state to IBV_WQS_RDY failed",
769 		      (void *)rxq_ctrl);
770 		goto error;
771 	}
772 	obj.cq.in = tmpl->cq;
773 	obj.cq.out = &cq_info;
774 	obj.rwq.in = tmpl->wq;
775 	obj.rwq.out = &rwq;
776 	ret = mlx5dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ);
777 	if (ret != 0)
778 		goto error;
779 	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
780 		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
781 		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
782 		goto error;
783 	}
784 	/* Fill the rings. */
785 	rxq_data->wqes = (volatile struct mlx5_wqe_data_seg (*)[])
786 		(uintptr_t)rwq.buf;
787 	for (i = 0; (i != (unsigned int)(1 << rxq_data->elts_n)); ++i) {
788 		struct rte_mbuf *buf = (*rxq_data->elts)[i];
789 		volatile struct mlx5_wqe_data_seg *scat = &(*rxq_data->wqes)[i];
790 
791 		/* scat->addr must be able to store a pointer. */
792 		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
793 		*scat = (struct mlx5_wqe_data_seg){
794 			.addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
795 								  uintptr_t)),
796 			.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
797 			.lkey = tmpl->mr->lkey,
798 		};
799 	}
800 	rxq_data->rq_db = rwq.dbrec;
801 	rxq_data->cqe_n = log2above(cq_info.cqe_cnt);
802 	rxq_data->cq_ci = 0;
803 	rxq_data->rq_ci = 0;
804 	rxq_data->rq_pi = 0;
805 	rxq_data->zip = (struct rxq_zip){
806 		.ai = 0,
807 	};
808 	rxq_data->cq_db = cq_info.dbrec;
809 	rxq_data->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf;
810 	rxq_data->cq_uar = cq_info.cq_uar;
811 	rxq_data->cqn = cq_info.cqn;
812 	rxq_data->cq_arm_sn = 0;
813 	/* Update doorbell counter. */
814 	rxq_data->rq_ci = (1 << rxq_data->elts_n) >> rxq_data->sges_n;
815 	rte_wmb();
816 	*rxq_data->rq_db = rte_cpu_to_be_32(rxq_data->rq_ci);
817 	DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)tmpl);
818 	rte_atomic32_inc(&tmpl->refcnt);
819 	DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv,
820 	      (void *)tmpl, rte_atomic32_read(&tmpl->refcnt));
821 	LIST_INSERT_HEAD(&priv->rxqsibv, tmpl, next);
822 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
823 	return tmpl;
824 error:
825 	if (tmpl->wq)
826 		claim_zero(ibv_destroy_wq(tmpl->wq));
827 	if (tmpl->cq)
828 		claim_zero(ibv_destroy_cq(tmpl->cq));
829 	if (tmpl->channel)
830 		claim_zero(ibv_destroy_comp_channel(tmpl->channel));
831 	if (tmpl->mr)
832 		priv_mr_release(priv, tmpl->mr);
833 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
834 	return NULL;
835 }
836 
837 /**
838  * Get an Rx queue Verbs object.
839  *
840  * @param priv
841  *   Pointer to private structure.
842  * @param idx
843  *   Queue index in DPDK Rx queue array.
844  *
845  * @return
846  *   The Verbs object if it exists.
847  */
848 struct mlx5_rxq_ibv*
849 mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx)
850 {
851 	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
852 	struct mlx5_rxq_ctrl *rxq_ctrl;
853 
854 	if (idx >= priv->rxqs_n)
855 		return NULL;
856 	if (!rxq_data)
857 		return NULL;
858 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
859 	if (rxq_ctrl->ibv) {
860 		priv_mr_get(priv, rxq_data->mp);
861 		rte_atomic32_inc(&rxq_ctrl->ibv->refcnt);
862 		DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv,
863 		      (void *)rxq_ctrl->ibv,
864 		      rte_atomic32_read(&rxq_ctrl->ibv->refcnt));
865 	}
866 	return rxq_ctrl->ibv;
867 }
868 
869 /**
870  * Release an Rx verbs queue object.
871  *
872  * @param priv
873  *   Pointer to private structure.
874  * @param rxq_ibv
875  *   Verbs Rx queue object.
876  *
877  * @return
878  *   0 on success, errno value on failure.
879  */
880 int
881 mlx5_priv_rxq_ibv_release(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv)
882 {
883 	int ret;
884 
885 	assert(rxq_ibv);
886 	assert(rxq_ibv->wq);
887 	assert(rxq_ibv->cq);
888 	assert(rxq_ibv->mr);
889 	ret = priv_mr_release(priv, rxq_ibv->mr);
890 	if (!ret)
891 		rxq_ibv->mr = NULL;
892 	DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv,
893 	      (void *)rxq_ibv, rte_atomic32_read(&rxq_ibv->refcnt));
894 	if (rte_atomic32_dec_and_test(&rxq_ibv->refcnt)) {
895 		rxq_free_elts(rxq_ibv->rxq_ctrl);
896 		claim_zero(ibv_destroy_wq(rxq_ibv->wq));
897 		claim_zero(ibv_destroy_cq(rxq_ibv->cq));
898 		if (rxq_ibv->channel)
899 			claim_zero(ibv_destroy_comp_channel(rxq_ibv->channel));
900 		LIST_REMOVE(rxq_ibv, next);
901 		rte_free(rxq_ibv);
902 		return 0;
903 	}
904 	return EBUSY;
905 }
906 
907 /**
908  * Verify the Verbs Rx queue list is empty
909  *
910  * @param priv
911  *  Pointer to private structure.
912  *
913  * @return the number of object not released.
914  */
915 int
916 mlx5_priv_rxq_ibv_verify(struct priv *priv)
917 {
918 	int ret = 0;
919 	struct mlx5_rxq_ibv *rxq_ibv;
920 
921 	LIST_FOREACH(rxq_ibv, &priv->rxqsibv, next) {
922 		DEBUG("%p: Verbs Rx queue %p still referenced", (void *)priv,
923 		      (void *)rxq_ibv);
924 		++ret;
925 	}
926 	return ret;
927 }
928 
929 /**
930  * Return true if a single reference exists on the object.
931  *
932  * @param priv
933  *   Pointer to private structure.
934  * @param rxq_ibv
935  *   Verbs Rx queue object.
936  */
937 int
938 mlx5_priv_rxq_ibv_releasable(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv)
939 {
940 	(void)priv;
941 	assert(rxq_ibv);
942 	return (rte_atomic32_read(&rxq_ibv->refcnt) == 1);
943 }
944 
945 /**
946  * Create a DPDK Rx queue.
947  *
948  * @param priv
949  *   Pointer to private structure.
950  * @param idx
951  *   RX queue index.
952  * @param desc
953  *   Number of descriptors to configure in queue.
954  * @param socket
955  *   NUMA socket on which memory must be allocated.
956  *
957  * @return
958  *   A DPDK queue object on success.
959  */
960 struct mlx5_rxq_ctrl*
961 mlx5_priv_rxq_new(struct priv *priv, uint16_t idx, uint16_t desc,
962 		  unsigned int socket, const struct rte_eth_rxconf *conf,
963 		  struct rte_mempool *mp)
964 {
965 	struct rte_eth_dev *dev = priv->dev;
966 	struct mlx5_rxq_ctrl *tmpl;
967 	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
968 	struct mlx5_dev_config *config = &priv->config;
969 	/*
970 	 * Always allocate extra slots, even if eventually
971 	 * the vector Rx will not be used.
972 	 */
973 	const uint16_t desc_n =
974 		desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
975 
976 	tmpl = rte_calloc_socket("RXQ", 1,
977 				 sizeof(*tmpl) +
978 				 desc_n * sizeof(struct rte_mbuf *),
979 				 0, socket);
980 	if (!tmpl)
981 		return NULL;
982 	tmpl->socket = socket;
983 	if (priv->dev->data->dev_conf.intr_conf.rxq)
984 		tmpl->irq = 1;
985 	/* Enable scattered packets support for this queue if necessary. */
986 	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
987 	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
988 	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
989 		tmpl->rxq.sges_n = 0;
990 	} else if (conf->offloads & DEV_RX_OFFLOAD_SCATTER) {
991 		unsigned int size =
992 			RTE_PKTMBUF_HEADROOM +
993 			dev->data->dev_conf.rxmode.max_rx_pkt_len;
994 		unsigned int sges_n;
995 
996 		/*
997 		 * Determine the number of SGEs needed for a full packet
998 		 * and round it to the next power of two.
999 		 */
1000 		sges_n = log2above((size / mb_len) + !!(size % mb_len));
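		/*
		 * Worked example (editorial, assuming the default 128 byte
		 * RTE_PKTMBUF_HEADROOM): with mb_len = 2048 and
		 * max_rx_pkt_len = 9000, size = 9128, size / mb_len rounds up
		 * to 5 and sges_n = log2above(5) = 3, i.e. 8 SGEs per packet.
		 */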
1001 		tmpl->rxq.sges_n = sges_n;
1002 		/* Make sure rxq.sges_n did not overflow. */
1003 		size = mb_len * (1 << tmpl->rxq.sges_n);
1004 		size -= RTE_PKTMBUF_HEADROOM;
1005 		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
1006 			ERROR("%p: too many SGEs (%u) needed to handle"
1007 			      " requested maximum packet size %u",
1008 			      (void *)dev,
1009 			      1 << sges_n,
1010 			      dev->data->dev_conf.rxmode.max_rx_pkt_len);
1011 			goto error;
1012 		}
1013 	} else {
1014 		WARN("%p: the requested maximum Rx packet size (%u) is"
1015 		     " larger than a single mbuf (%u) and scattered"
1016 		     " mode has not been requested",
1017 		     (void *)dev,
1018 		     dev->data->dev_conf.rxmode.max_rx_pkt_len,
1019 		     mb_len - RTE_PKTMBUF_HEADROOM);
1020 	}
1021 	DEBUG("%p: maximum number of segments per packet: %u",
1022 	      (void *)dev, 1 << tmpl->rxq.sges_n);
1023 	if (desc % (1 << tmpl->rxq.sges_n)) {
1024 		ERROR("%p: number of RX queue descriptors (%u) is not a"
1025 		      " multiple of SGEs per packet (%u)",
1026 		      (void *)dev,
1027 		      desc,
1028 		      1 << tmpl->rxq.sges_n);
1029 		goto error;
1030 	}
1031 	/* Toggle RX checksum offload if hardware supports it. */
1032 	tmpl->rxq.csum = !!(conf->offloads & DEV_RX_OFFLOAD_CHECKSUM);
1033 	tmpl->rxq.csum_l2tun = (!!(conf->offloads & DEV_RX_OFFLOAD_CHECKSUM) &&
1034 				priv->config.hw_csum_l2tun);
1035 	tmpl->rxq.hw_timestamp = !!(conf->offloads & DEV_RX_OFFLOAD_TIMESTAMP);
1036 	/* Configure VLAN stripping. */
1037 	tmpl->rxq.vlan_strip = !!(conf->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1038 	/* By default, FCS (CRC) is stripped by hardware. */
1039 	if (conf->offloads & DEV_RX_OFFLOAD_CRC_STRIP) {
1040 		tmpl->rxq.crc_present = 0;
1041 	} else if (config->hw_fcs_strip) {
1042 		tmpl->rxq.crc_present = 1;
1043 	} else {
1044 		WARN("%p: CRC stripping has been disabled but will still"
1045 		     " be performed by hardware, make sure MLNX_OFED and"
1046 		     " firmware are up to date",
1047 		     (void *)dev);
1048 		tmpl->rxq.crc_present = 0;
1049 	}
1050 	DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
1051 	      " incoming frames to hide it",
1052 	      (void *)dev,
1053 	      tmpl->rxq.crc_present ? "disabled" : "enabled",
1054 	      tmpl->rxq.crc_present << 2);
1055 	/* Save port ID. */
1056 	tmpl->rxq.rss_hash = priv->rxqs_n > 1;
1057 	tmpl->rxq.port_id = dev->data->port_id;
1058 	tmpl->priv = priv;
1059 	tmpl->rxq.mp = mp;
1060 	tmpl->rxq.stats.idx = idx;
1061 	tmpl->rxq.elts_n = log2above(desc);
1062 	tmpl->rxq.elts =
1063 		(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
1064 	rte_atomic32_inc(&tmpl->refcnt);
1065 	DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv,
1066 	      (void *)tmpl, rte_atomic32_read(&tmpl->refcnt));
1067 	LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next);
1068 	return tmpl;
1069 error:
1070 	rte_free(tmpl);
1071 	return NULL;
1072 }
1073 
1074 /**
1075  * Get a Rx queue.
1076  *
1077  * @param priv
1078  *   Pointer to private structure.
1079  * @param idx
1080  *   RX queue index.
1081  *
1082  * @return
1083  *   A pointer to the queue if it exists.
1084  */
1085 struct mlx5_rxq_ctrl*
1086 mlx5_priv_rxq_get(struct priv *priv, uint16_t idx)
1087 {
1088 	struct mlx5_rxq_ctrl *rxq_ctrl = NULL;
1089 
1090 	if ((*priv->rxqs)[idx]) {
1091 		rxq_ctrl = container_of((*priv->rxqs)[idx],
1092 					struct mlx5_rxq_ctrl,
1093 					rxq);
1094 
1095 		mlx5_priv_rxq_ibv_get(priv, idx);
1096 		rte_atomic32_inc(&rxq_ctrl->refcnt);
1097 		DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv,
1098 		      (void *)rxq_ctrl, rte_atomic32_read(&rxq_ctrl->refcnt));
1099 	}
1100 	return rxq_ctrl;
1101 }
1102 
1103 /**
1104  * Release a Rx queue.
1105  *
1106  * @param priv
1107  *   Pointer to private structure.
1108  * @param idx
1109  *   RX queue index.
1110  *
1111  * @return
1112  *   0 on success, errno value on failure.
1113  */
1114 int
1115 mlx5_priv_rxq_release(struct priv *priv, uint16_t idx)
1116 {
1117 	struct mlx5_rxq_ctrl *rxq_ctrl;
1118 
1119 	if (!(*priv->rxqs)[idx])
1120 		return 0;
1121 	rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
1122 	assert(rxq_ctrl->priv);
1123 	if (rxq_ctrl->ibv) {
1124 		int ret;
1125 
1126 		ret = mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv);
1127 		if (!ret)
1128 			rxq_ctrl->ibv = NULL;
1129 	}
1130 	DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv,
1131 	      (void *)rxq_ctrl, rte_atomic32_read(&rxq_ctrl->refcnt));
1132 	if (rte_atomic32_dec_and_test(&rxq_ctrl->refcnt)) {
1133 		LIST_REMOVE(rxq_ctrl, next);
1134 		rte_free(rxq_ctrl);
1135 		(*priv->rxqs)[idx] = NULL;
1136 		return 0;
1137 	}
1138 	return EBUSY;
1139 }
1140 
1141 /**
1142  * Verify if the queue can be released.
1143  *
1144  * @param priv
1145  *   Pointer to private structure.
1146  * @param idx
1147  *   RX queue index.
1148  *
1149  * @return
1150  *   1 if the queue can be released.
1151  */
1152 int
1153 mlx5_priv_rxq_releasable(struct priv *priv, uint16_t idx)
1154 {
1155 	struct mlx5_rxq_ctrl *rxq_ctrl;
1156 
1157 	if (!(*priv->rxqs)[idx])
1158 		return -1;
1159 	rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
1160 	return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1);
1161 }
1162 
1163 /**
1164  * Verify the Rx queue list is empty.
1165  *
1166  * @param priv
1167  *  Pointer to private structure.
1168  *
1169  * @return the number of objects not released.
1170  */
1171 int
1172 mlx5_priv_rxq_verify(struct priv *priv)
1173 {
1174 	struct mlx5_rxq_ctrl *rxq_ctrl;
1175 	int ret = 0;
1176 
1177 	LIST_FOREACH(rxq_ctrl, &priv->rxqsctrl, next) {
1178 		DEBUG("%p: Rx Queue %p still referenced", (void *)priv,
1179 		      (void *)rxq_ctrl);
1180 		++ret;
1181 	}
1182 	return ret;
1183 }
1184 
1185 /**
1186  * Create an indirection table.
1187  *
1188  * @param priv
1189  *   Pointer to private structure.
1190  * @param queues
1191  *   Queues entering the indirection table.
1192  * @param queues_n
1193  *   Number of queues in the array.
1194  *
1195  * @return
1196  *   A new indirection table.
1197  */
1198 struct mlx5_ind_table_ibv*
1199 mlx5_priv_ind_table_ibv_new(struct priv *priv, uint16_t queues[],
1200 			    uint16_t queues_n)
1201 {
1202 	struct mlx5_ind_table_ibv *ind_tbl;
1203 	const unsigned int wq_n = rte_is_power_of_2(queues_n) ?
1204 		log2above(queues_n) :
1205 		log2above(priv->config.ind_table_max_size);
1206 	struct ibv_wq *wq[1 << wq_n];
1207 	unsigned int i;
1208 	unsigned int j;
1209 
1210 	ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl) +
1211 			     queues_n * sizeof(uint16_t), 0);
1212 	if (!ind_tbl)
1213 		return NULL;
1214 	for (i = 0; i != queues_n; ++i) {
1215 		struct mlx5_rxq_ctrl *rxq =
1216 			mlx5_priv_rxq_get(priv, queues[i]);
1217 
1218 		if (!rxq)
1219 			goto error;
1220 		wq[i] = rxq->ibv->wq;
1221 		ind_tbl->queues[i] = queues[i];
1222 	}
1223 	ind_tbl->queues_n = queues_n;
1224 	/* Finalise indirection table. */
1225 	for (j = 0; i != (unsigned int)(1 << wq_n); ++i, ++j)
1226 		wq[i] = wq[j];
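	/*
	 * Editorial example (hypothetical sizes): with 3 queues and an 8-entry
	 * table, the loop above yields {q0, q1, q2, q0, q1, q2, q0, q1}, i.e.
	 * the configured queues repeated until the power-of-two table size is
	 * reached.
	 */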
1227 	ind_tbl->ind_table = ibv_create_rwq_ind_table(
1228 		priv->ctx,
1229 		&(struct ibv_rwq_ind_table_init_attr){
1230 			.log_ind_tbl_size = wq_n,
1231 			.ind_tbl = wq,
1232 			.comp_mask = 0,
1233 		});
1234 	if (!ind_tbl->ind_table)
1235 		goto error;
1236 	rte_atomic32_inc(&ind_tbl->refcnt);
1237 	LIST_INSERT_HEAD(&priv->ind_tbls, ind_tbl, next);
1238 	DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv,
1239 	      (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt));
1240 	return ind_tbl;
1241 error:
1242 	rte_free(ind_tbl);
1243 	DEBUG("%p cannot create indirection table", (void *)priv);
1244 	return NULL;
1245 }
1246 
1247 /**
1248  * Get an indirection table.
1249  *
1250  * @param priv
1251  *   Pointer to private structure.
1252  * @param queues
1253  *   Queues entering the indirection table.
1254  * @param queues_n
1255  *   Number of queues in the array.
1256  *
1257  * @return
1258  *   An indirection table if found.
1259  */
1260 struct mlx5_ind_table_ibv*
1261 mlx5_priv_ind_table_ibv_get(struct priv *priv, uint16_t queues[],
1262 			    uint16_t queues_n)
1263 {
1264 	struct mlx5_ind_table_ibv *ind_tbl;
1265 
1266 	LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
1267 		if ((ind_tbl->queues_n == queues_n) &&
1268 		    (memcmp(ind_tbl->queues, queues,
1269 			    ind_tbl->queues_n * sizeof(ind_tbl->queues[0]))
1270 		     == 0))
1271 			break;
1272 	}
1273 	if (ind_tbl) {
1274 		unsigned int i;
1275 
1276 		rte_atomic32_inc(&ind_tbl->refcnt);
1277 		DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv,
1278 		      (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt));
1279 		for (i = 0; i != ind_tbl->queues_n; ++i)
1280 			mlx5_priv_rxq_get(priv, ind_tbl->queues[i]);
1281 	}
1282 	return ind_tbl;
1283 }
1284 
1285 /**
1286  * Release an indirection table.
1287  *
1288  * @param priv
1289  *   Pointer to private structure.
1290  * @param ind_table
1291  *   Indirection table to release.
1292  *
1293  * @return
1294  *   0 on success, errno value on failure.
1295  */
1296 int
1297 mlx5_priv_ind_table_ibv_release(struct priv *priv,
1298 				struct mlx5_ind_table_ibv *ind_tbl)
1299 {
1300 	unsigned int i;
1301 
1302 	DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv,
1303 	      (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt));
1304 	if (rte_atomic32_dec_and_test(&ind_tbl->refcnt))
1305 		claim_zero(ibv_destroy_rwq_ind_table(ind_tbl->ind_table));
1306 	for (i = 0; i != ind_tbl->queues_n; ++i)
1307 		claim_nonzero(mlx5_priv_rxq_release(priv, ind_tbl->queues[i]));
1308 	if (!rte_atomic32_read(&ind_tbl->refcnt)) {
1309 		LIST_REMOVE(ind_tbl, next);
1310 		rte_free(ind_tbl);
1311 		return 0;
1312 	}
1313 	return EBUSY;
1314 }
1315 
1316 /**
1317  * Verify the Verbs indirection table list is empty.
1318  *
1319  * @param priv
1320  *  Pointer to private structure.
1321  *
1322  * @return the number of objects not released.
1323  */
1324 int
1325 mlx5_priv_ind_table_ibv_verify(struct priv *priv)
1326 {
1327 	struct mlx5_ind_table_ibv *ind_tbl;
1328 	int ret = 0;
1329 
1330 	LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
1331 		DEBUG("%p: Verbs indirection table %p still referenced",
1332 		      (void *)priv, (void *)ind_tbl);
1333 		++ret;
1334 	}
1335 	return ret;
1336 }
1337 
1338 /**
1339  * Create an Rx Hash queue.
1340  *
1341  * @param priv
1342  *   Pointer to private structure.
1343  * @param rss_key
1344  *   RSS key for the Rx hash queue.
1345  * @param rss_key_len
1346  *   RSS key length.
1347  * @param hash_fields
1348  *   Verbs protocol hash field to make the RSS on.
1349  * @param queues
1350  *   Queues entering the hash queue. If hash_fields is empty, only the
1351  *   first queue index is used for the indirection table.
1352  * @param queues_n
1353  *   Number of queues.
1354  *
1355  * @return
1356  *   A hash Rx queue on success.
1357  */
1358 struct mlx5_hrxq*
1359 mlx5_priv_hrxq_new(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len,
1360 		   uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
1361 {
1362 	struct mlx5_hrxq *hrxq;
1363 	struct mlx5_ind_table_ibv *ind_tbl;
1364 	struct ibv_qp *qp;
1365 
1366 	queues_n = hash_fields ? queues_n : 1;
1367 	ind_tbl = mlx5_priv_ind_table_ibv_get(priv, queues, queues_n);
1368 	if (!ind_tbl)
1369 		ind_tbl = mlx5_priv_ind_table_ibv_new(priv, queues, queues_n);
1370 	if (!ind_tbl)
1371 		return NULL;
1372 	qp = ibv_create_qp_ex(
1373 		priv->ctx,
1374 		&(struct ibv_qp_init_attr_ex){
1375 			.qp_type = IBV_QPT_RAW_PACKET,
1376 			.comp_mask =
1377 				IBV_QP_INIT_ATTR_PD |
1378 				IBV_QP_INIT_ATTR_IND_TABLE |
1379 				IBV_QP_INIT_ATTR_RX_HASH,
1380 			.rx_hash_conf = (struct ibv_rx_hash_conf){
1381 				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
1382 				.rx_hash_key_len = rss_key_len,
1383 				.rx_hash_key = rss_key,
1384 				.rx_hash_fields_mask = hash_fields,
1385 			},
1386 			.rwq_ind_tbl = ind_tbl->ind_table,
1387 			.pd = priv->pd,
1388 		});
1389 	if (!qp)
1390 		goto error;
1391 	hrxq = rte_calloc(__func__, 1, sizeof(*hrxq) + rss_key_len, 0);
1392 	if (!hrxq)
1393 		goto error;
1394 	hrxq->ind_table = ind_tbl;
1395 	hrxq->qp = qp;
1396 	hrxq->rss_key_len = rss_key_len;
1397 	hrxq->hash_fields = hash_fields;
1398 	memcpy(hrxq->rss_key, rss_key, rss_key_len);
1399 	rte_atomic32_inc(&hrxq->refcnt);
1400 	LIST_INSERT_HEAD(&priv->hrxqs, hrxq, next);
1401 	DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv,
1402 	      (void *)hrxq, rte_atomic32_read(&hrxq->refcnt));
1403 	return hrxq;
1404 error:
1405 	mlx5_priv_ind_table_ibv_release(priv, ind_tbl);
1406 	if (qp)
1407 		claim_zero(ibv_destroy_qp(qp));
1408 	return NULL;
1409 }
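
/*
 * Usage sketch (editorial, hypothetical queue list): a hash Rx queue spreading
 * IPv4/TCP traffic over two queues with the default key could be requested as:
 *
 *	uint16_t queues[] = { 0, 1 };
 *	struct mlx5_hrxq *hrxq =
 *		mlx5_priv_hrxq_new(priv, rss_hash_default_key,
 *				   rss_hash_default_key_len,
 *				   IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 |
 *				   IBV_RX_HASH_SRC_PORT_TCP |
 *				   IBV_RX_HASH_DST_PORT_TCP,
 *				   queues, RTE_DIM(queues));
 */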
1410 
1411 /**
1412  * Get an Rx Hash queue.
1413  *
1414  * @param priv
1415  *   Pointer to private structure.
1416  * @param rss_key
1417  *   RSS key (rss_key_len bytes long) for the Rx hash queue.
1418  * @param queues
1419  *   Queues entering the hash queue. If hash_fields is empty, only the
1420  *   first queue index is used for the indirection table.
1421  * @param queues_n
1422  *   Number of queues.
1423  *
1424  * @return
1425  *   A hash Rx queue on success.
1426  */
1427 struct mlx5_hrxq*
1428 mlx5_priv_hrxq_get(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len,
1429 		   uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
1430 {
1431 	struct mlx5_hrxq *hrxq;
1432 
1433 	queues_n = hash_fields ? queues_n : 1;
1434 	LIST_FOREACH(hrxq, &priv->hrxqs, next) {
1435 		struct mlx5_ind_table_ibv *ind_tbl;
1436 
1437 		if (hrxq->rss_key_len != rss_key_len)
1438 			continue;
1439 		if (memcmp(hrxq->rss_key, rss_key, rss_key_len))
1440 			continue;
1441 		if (hrxq->hash_fields != hash_fields)
1442 			continue;
1443 		ind_tbl = mlx5_priv_ind_table_ibv_get(priv, queues, queues_n);
1444 		if (!ind_tbl)
1445 			continue;
1446 		if (ind_tbl != hrxq->ind_table) {
1447 			mlx5_priv_ind_table_ibv_release(priv, ind_tbl);
1448 			continue;
1449 		}
1450 		rte_atomic32_inc(&hrxq->refcnt);
1451 		DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv,
1452 		      (void *)hrxq, rte_atomic32_read(&hrxq->refcnt));
1453 		return hrxq;
1454 	}
1455 	return NULL;
1456 }
1457 
1458 /**
1459  * Release the hash Rx queue.
1460  *
1461  * @param priv
1462  *   Pointer to private structure.
1463  * @param hrxq
1464  *   Pointer to Hash Rx queue to release.
1465  *
1466  * @return
1467  *   0 on success, errno value on failure.
1468  */
1469 int
1470 mlx5_priv_hrxq_release(struct priv *priv, struct mlx5_hrxq *hrxq)
1471 {
1472 	DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv,
1473 	      (void *)hrxq, rte_atomic32_read(&hrxq->refcnt));
1474 	if (rte_atomic32_dec_and_test(&hrxq->refcnt)) {
1475 		claim_zero(ibv_destroy_qp(hrxq->qp));
1476 		mlx5_priv_ind_table_ibv_release(priv, hrxq->ind_table);
1477 		LIST_REMOVE(hrxq, next);
1478 		rte_free(hrxq);
1479 		return 0;
1480 	}
1481 	claim_nonzero(mlx5_priv_ind_table_ibv_release(priv, hrxq->ind_table));
1482 	return EBUSY;
1483 }
1484 
1485 /**
1486  * Verify the hash Rx queue list is empty.
1487  *
1488  * @param priv
1489  *  Pointer to private structure.
1490  *
1491  * @return the number of objects not released.
1492  */
1493 int
1494 mlx5_priv_hrxq_ibv_verify(struct priv *priv)
1495 {
1496 	struct mlx5_hrxq *hrxq;
1497 	int ret = 0;
1498 
1499 	LIST_FOREACH(hrxq, &priv->hrxqs, next) {
1500 		DEBUG("%p: Verbs Hash Rx queue %p still referenced",
1501 		      (void *)priv, (void *)hrxq);
1502 		++ret;
1503 	}
1504 	return ret;
1505 }
1506