1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <stddef.h>
7 #include <assert.h>
8 #include <errno.h>
9 #include <string.h>
10 #include <stdint.h>
11 #include <fcntl.h>
12 #include <sys/queue.h>
13 
14 /* Verbs header. */
15 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
16 #ifdef PEDANTIC
17 #pragma GCC diagnostic ignored "-Wpedantic"
18 #endif
19 #include <infiniband/verbs.h>
20 #include <infiniband/mlx5dv.h>
21 #ifdef PEDANTIC
22 #pragma GCC diagnostic error "-Wpedantic"
23 #endif
24 
25 #include <rte_mbuf.h>
26 #include <rte_malloc.h>
27 #include <rte_ethdev_driver.h>
28 #include <rte_common.h>
29 #include <rte_interrupts.h>
30 #include <rte_debug.h>
31 #include <rte_io.h>
32 
33 #include "mlx5.h"
34 #include "mlx5_rxtx.h"
35 #include "mlx5_utils.h"
36 #include "mlx5_autoconf.h"
37 #include "mlx5_defs.h"
38 #include "mlx5_glue.h"
39 
40 /* Default RSS hash key also used for ConnectX-3. */
41 uint8_t rss_hash_default_key[] = {
42 	0x2c, 0xc6, 0x81, 0xd1,
43 	0x5b, 0xdb, 0xf4, 0xf7,
44 	0xfc, 0xa2, 0x83, 0x19,
45 	0xdb, 0x1a, 0x3e, 0x94,
46 	0x6b, 0x9e, 0x38, 0xd9,
47 	0x2c, 0x9c, 0x03, 0xd1,
48 	0xad, 0x99, 0x44, 0xa7,
49 	0xd9, 0x56, 0x3d, 0x59,
50 	0x06, 0x3c, 0x25, 0xf3,
51 	0xfc, 0x1f, 0xdc, 0x2a,
52 };
53 
54 /* Length of the default RSS hash key. */
55 static_assert(MLX5_RSS_HASH_KEY_LEN ==
56 	      (unsigned int)sizeof(rss_hash_default_key),
57 	      "wrong RSS default key size.");
58 
59 /**
60  * Check whether Multi-Packet RQ can be enabled for the device.
61  *
62  * @param dev
63  *   Pointer to Ethernet device.
64  *
65  * @return
66  *   1 if supported, negative errno value if not.
67  */
68 inline int
69 mlx5_check_mprq_support(struct rte_eth_dev *dev)
70 {
71 	struct mlx5_priv *priv = dev->data->dev_private;
72 
73 	if (priv->config.mprq.enabled &&
74 	    priv->rxqs_n >= priv->config.mprq.min_rxqs_num)
75 		return 1;
76 	return -ENOTSUP;
77 }
78 
79 /**
80  * Check whether Multi-Packet RQ is enabled for the Rx queue.
81  *
82  * @param rxq
83  *   Pointer to receive queue structure.
84  *
85  * @return
86  *   0 if disabled, otherwise enabled.
87  */
88 inline int
89 mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq)
90 {
91 	return rxq->strd_num_n > 0;
92 }
93 
94 /**
95  * Check whether Multi-Packet RQ is enabled for the device.
96  *
97  * @param dev
98  *   Pointer to Ethernet device.
99  *
100  * @return
101  *   0 if disabled, otherwise enabled.
102  */
103 inline int
104 mlx5_mprq_enabled(struct rte_eth_dev *dev)
105 {
106 	struct mlx5_priv *priv = dev->data->dev_private;
107 	uint16_t i;
108 	uint16_t n = 0;
109 
110 	if (mlx5_check_mprq_support(dev) < 0)
111 		return 0;
112 	/* All the configured queues should be enabled. */
113 	for (i = 0; i < priv->rxqs_n; ++i) {
114 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
115 
116 		if (!rxq)
117 			continue;
118 		if (mlx5_rxq_mprq_enabled(rxq))
119 			++n;
120 	}
121 	/* Multi-Packet RQ can't be partially configured. */
122 	assert(n == 0 || n == priv->rxqs_n);
123 	return n == priv->rxqs_n;
124 }
125 
126 /**
127  * Allocate RX queue elements for Multi-Packet RQ.
128  *
129  * @param rxq_ctrl
130  *   Pointer to RX queue structure.
131  *
132  * @return
133  *   0 on success, a negative errno value otherwise and rte_errno is set.
134  */
135 static int
136 rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
137 {
138 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
139 	unsigned int wqe_n = 1 << rxq->elts_n;
140 	unsigned int i;
141 	int err;
142 
143 	/* Iterate on segments; one extra buffer is kept as mprq_repl. */
144 	for (i = 0; i <= wqe_n; ++i) {
145 		struct mlx5_mprq_buf *buf;
146 
147 		if (rte_mempool_get(rxq->mprq_mp, (void **)&buf) < 0) {
148 			DRV_LOG(ERR, "port %u empty mbuf pool", rxq->port_id);
149 			rte_errno = ENOMEM;
150 			goto error;
151 		}
152 		if (i < wqe_n)
153 			(*rxq->mprq_bufs)[i] = buf;
154 		else
155 			rxq->mprq_repl = buf;
156 	}
157 	DRV_LOG(DEBUG,
158 		"port %u Rx queue %u allocated and configured %u segments",
159 		rxq->port_id, rxq->idx, wqe_n);
160 	return 0;
161 error:
162 	err = rte_errno; /* Save rte_errno before cleanup. */
163 	wqe_n = i;
164 	for (i = 0; (i != wqe_n); ++i) {
165 		if ((*rxq->mprq_bufs)[i] != NULL)
166 			rte_mempool_put(rxq->mprq_mp,
167 					(*rxq->mprq_bufs)[i]);
168 		(*rxq->mprq_bufs)[i] = NULL;
169 	}
170 	DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
171 		rxq->port_id, rxq->idx);
172 	rte_errno = err; /* Restore rte_errno. */
173 	return -rte_errno;
174 }
175 
176 /**
177  * Allocate RX queue elements for Single-Packet RQ.
178  *
179  * @param rxq_ctrl
180  *   Pointer to RX queue structure.
181  *
182  * @return
183  *   0 on success, a negative errno value otherwise and rte_errno is set.
184  */
185 static int
186 rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
187 {
188 	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
189 	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
190 	unsigned int i;
191 	int err;
192 
193 	/* Iterate on segments. */
194 	for (i = 0; (i != elts_n); ++i) {
195 		struct rte_mbuf *buf;
196 
197 		buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
198 		if (buf == NULL) {
199 			DRV_LOG(ERR, "port %u empty mbuf pool",
200 				PORT_ID(rxq_ctrl->priv));
201 			rte_errno = ENOMEM;
202 			goto error;
203 		}
204 		/* Headroom is reserved by rte_pktmbuf_alloc(). */
205 		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
206 		/* Buffer is supposed to be empty. */
207 		assert(rte_pktmbuf_data_len(buf) == 0);
208 		assert(rte_pktmbuf_pkt_len(buf) == 0);
209 		assert(!buf->next);
210 		/* Only the first segment keeps headroom. */
211 		if (i % sges_n)
212 			SET_DATA_OFF(buf, 0);
213 		PORT(buf) = rxq_ctrl->rxq.port_id;
214 		DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
215 		PKT_LEN(buf) = DATA_LEN(buf);
216 		NB_SEGS(buf) = 1;
217 		(*rxq_ctrl->rxq.elts)[i] = buf;
218 	}
219 	/* If Rx vector is activated. */
220 	if (mlx5_rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
221 		struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
222 		struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
223 		int j;
224 
225 		/* Initialize default rearm_data for vPMD. */
226 		mbuf_init->data_off = RTE_PKTMBUF_HEADROOM;
227 		rte_mbuf_refcnt_set(mbuf_init, 1);
228 		mbuf_init->nb_segs = 1;
229 		mbuf_init->port = rxq->port_id;
230 		/*
231 		 * prevent compiler reordering:
232 		 * rearm_data covers previous fields.
233 		 */
234 		rte_compiler_barrier();
235 		rxq->mbuf_initializer =
236 			*(uint64_t *)&mbuf_init->rearm_data;
237 		/* Padding with a fake mbuf for vectorized Rx. */
238 		for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
239 			(*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
240 	}
241 	DRV_LOG(DEBUG,
242 		"port %u Rx queue %u allocated and configured %u segments"
243 		" (max %u packets)",
244 		PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx, elts_n,
245 		elts_n / (1 << rxq_ctrl->rxq.sges_n));
246 	return 0;
247 error:
248 	err = rte_errno; /* Save rte_errno before cleanup. */
249 	elts_n = i;
250 	for (i = 0; (i != elts_n); ++i) {
251 		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
252 			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
253 		(*rxq_ctrl->rxq.elts)[i] = NULL;
254 	}
255 	DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
256 		PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx);
257 	rte_errno = err; /* Restore rte_errno. */
258 	return -rte_errno;
259 }
260 
261 /**
262  * Allocate RX queue elements.
263  *
264  * @param rxq_ctrl
265  *   Pointer to RX queue structure.
266  *
267  * @return
268  *   0 on success, a negative errno value otherwise and rte_errno is set.
269  */
270 int
271 rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
272 {
273 	return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
274 	       rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl);
275 }
276 
277 /**
278  * Free RX queue elements for Multi-Packet RQ.
279  *
280  * @param rxq_ctrl
281  *   Pointer to RX queue structure.
282  */
283 static void
284 rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
285 {
286 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
287 	uint16_t i;
288 
289 	DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing WRs",
290 		rxq->port_id, rxq->idx);
291 	if (rxq->mprq_bufs == NULL)
292 		return;
293 	assert(mlx5_rxq_check_vec_support(rxq) < 0);
294 	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
295 		if ((*rxq->mprq_bufs)[i] != NULL)
296 			mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]);
297 		(*rxq->mprq_bufs)[i] = NULL;
298 	}
299 	if (rxq->mprq_repl != NULL) {
300 		mlx5_mprq_buf_free(rxq->mprq_repl);
301 		rxq->mprq_repl = NULL;
302 	}
303 }
304 
305 /**
306  * Free RX queue elements for Single-Packet RQ.
307  *
308  * @param rxq_ctrl
309  *   Pointer to RX queue structure.
310  */
311 static void
312 rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
313 {
314 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
315 	const uint16_t q_n = (1 << rxq->elts_n);
316 	const uint16_t q_mask = q_n - 1;
317 	uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
318 	uint16_t i;
319 
320 	DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs",
321 		PORT_ID(rxq_ctrl->priv), rxq->idx);
322 	if (rxq->elts == NULL)
323 		return;
324 	/*
325 	 * Some mbufs in the ring belong to the application and cannot be
326 	 * freed.
327 	 */
328 	if (mlx5_rxq_check_vec_support(rxq) > 0) {
329 		for (i = 0; i < used; ++i)
330 			(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
331 		rxq->rq_pi = rxq->rq_ci;
332 	}
333 	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
334 		if ((*rxq->elts)[i] != NULL)
335 			rte_pktmbuf_free_seg((*rxq->elts)[i]);
336 		(*rxq->elts)[i] = NULL;
337 	}
338 }
339 
340 /**
341  * Free RX queue elements.
342  *
343  * @param rxq_ctrl
344  *   Pointer to RX queue structure.
345  */
346 static void
347 rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
348 {
349 	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
350 		rxq_free_elts_mprq(rxq_ctrl);
351 	else
352 		rxq_free_elts_sprq(rxq_ctrl);
353 }
354 
355 /**
356  * Returns the per-queue supported offloads.
357  *
358  * @param dev
359  *   Pointer to Ethernet device.
360  *
361  * @return
362  *   Supported Rx offloads.
363  */
364 uint64_t
365 mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev)
366 {
367 	struct mlx5_priv *priv = dev->data->dev_private;
368 	struct mlx5_dev_config *config = &priv->config;
369 	uint64_t offloads = (DEV_RX_OFFLOAD_SCATTER |
370 			     DEV_RX_OFFLOAD_TIMESTAMP |
371 			     DEV_RX_OFFLOAD_JUMBO_FRAME);
372 
373 	if (config->hw_fcs_strip)
374 		offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
375 
376 	if (config->hw_csum)
377 		offloads |= (DEV_RX_OFFLOAD_IPV4_CKSUM |
378 			     DEV_RX_OFFLOAD_UDP_CKSUM |
379 			     DEV_RX_OFFLOAD_TCP_CKSUM);
380 	if (config->hw_vlan_strip)
381 		offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
382 	return offloads;
383 }
384 
385 
386 /**
387  * Returns the per-port supported offloads.
388  *
389  * @return
390  *   Supported Rx offloads.
391  */
392 uint64_t
393 mlx5_get_rx_port_offloads(void)
394 {
395 	uint64_t offloads = DEV_RX_OFFLOAD_VLAN_FILTER;
396 
397 	return offloads;
398 }
399 
400 /**
401  * Verify if the queue can be released.
402  *
403  * @param dev
404  *   Pointer to Ethernet device.
405  * @param idx
406  *   RX queue index.
407  *
408  * @return
409  *   1 if the queue can be released
410  *   0 if the queue cannot be released because there are references to it.
411  *   A negative errno value (rte_errno is set) if the queue doesn't exist.
412  */
413 static int
414 mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx)
415 {
416 	struct mlx5_priv *priv = dev->data->dev_private;
417 	struct mlx5_rxq_ctrl *rxq_ctrl;
418 
419 	if (!(*priv->rxqs)[idx]) {
420 		rte_errno = EINVAL;
421 		return -rte_errno;
422 	}
423 	rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
424 	return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1);
425 }
426 
427 /**
428  * DPDK callback to configure a RX queue.
429  * @param dev
430  *   Pointer to Ethernet device structure.
431  * @param idx
432  *   RX queue index.
433  * @param desc
434  *   Number of descriptors to configure in queue.
435  * @param socket
436  *   NUMA socket on which memory must be allocated.
437  * @param[in] conf
438  *   Thresholds parameters.
439  * @param mp
440  *   Memory pool for buffer allocations.
441  *
442  * @return
443  *   0 on success, a negative errno value otherwise and rte_errno is set.
444  */
445 int
446 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
447 		    unsigned int socket, const struct rte_eth_rxconf *conf,
448 		    struct rte_mempool *mp)
449 {
450 	struct mlx5_priv *priv = dev->data->dev_private;
451 	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
452 	struct mlx5_rxq_ctrl *rxq_ctrl =
453 		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
454 
455 	if (!rte_is_power_of_2(desc)) {
456 		desc = 1 << log2above(desc);
457 		DRV_LOG(WARNING,
458 			"port %u increased number of descriptors in Rx queue %u"
459 			" to the next power of two (%d)",
460 			dev->data->port_id, idx, desc);
461 	}
462 	DRV_LOG(DEBUG, "port %u configuring Rx queue %u for %u descriptors",
463 		dev->data->port_id, idx, desc);
464 	if (idx >= priv->rxqs_n) {
465 		DRV_LOG(ERR, "port %u Rx queue index out of range (%u >= %u)",
466 			dev->data->port_id, idx, priv->rxqs_n);
467 		rte_errno = EOVERFLOW;
468 		return -rte_errno;
469 	}
470 	if (!mlx5_rxq_releasable(dev, idx)) {
471 		DRV_LOG(ERR, "port %u unable to release queue index %u",
472 			dev->data->port_id, idx);
473 		rte_errno = EBUSY;
474 		return -rte_errno;
475 	}
476 	mlx5_rxq_release(dev, idx);
477 	rxq_ctrl = mlx5_rxq_new(dev, idx, desc, socket, conf, mp);
478 	if (!rxq_ctrl) {
479 		DRV_LOG(ERR, "port %u unable to allocate queue index %u",
480 			dev->data->port_id, idx);
481 		rte_errno = ENOMEM;
482 		return -rte_errno;
483 	}
484 	DRV_LOG(DEBUG, "port %u adding Rx queue %u to list",
485 		dev->data->port_id, idx);
486 	(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
487 	return 0;
488 }
489 
490 /**
491  * DPDK callback to release a RX queue.
492  *
493  * @param dpdk_rxq
494  *   Generic RX queue pointer.
495  */
496 void
497 mlx5_rx_queue_release(void *dpdk_rxq)
498 {
499 	struct mlx5_rxq_data *rxq = (struct mlx5_rxq_data *)dpdk_rxq;
500 	struct mlx5_rxq_ctrl *rxq_ctrl;
501 	struct mlx5_priv *priv;
502 
503 	if (rxq == NULL)
504 		return;
505 	rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
506 	priv = rxq_ctrl->priv;
507 	if (!mlx5_rxq_releasable(ETH_DEV(priv), rxq_ctrl->rxq.idx))
508 		rte_panic("port %u Rx queue %u is still used by a flow and"
509 			  " cannot be removed\n",
510 			  PORT_ID(priv), rxq->idx);
511 	mlx5_rxq_release(ETH_DEV(priv), rxq_ctrl->rxq.idx);
512 }
513 
514 /**
515  * Get an Rx queue Verbs object.
516  *
517  * @param dev
518  *   Pointer to Ethernet device.
519  * @param idx
520  *   Queue index in DPDK Rx queue array
521  *
522  * @return
523  *   The Verbs object if it exists, NULL otherwise.
524  */
525 static struct mlx5_rxq_ibv *
526 mlx5_rxq_ibv_get(struct rte_eth_dev *dev, uint16_t idx)
527 {
528 	struct mlx5_priv *priv = dev->data->dev_private;
529 	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
530 	struct mlx5_rxq_ctrl *rxq_ctrl;
531 
532 	if (idx >= priv->rxqs_n)
533 		return NULL;
534 	if (!rxq_data)
535 		return NULL;
536 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
537 	if (rxq_ctrl->ibv)
538 		rte_atomic32_inc(&rxq_ctrl->ibv->refcnt);
539 	return rxq_ctrl->ibv;
540 }
541 
542 /**
543  * Release an Rx verbs queue object.
544  *
545  * @param rxq_ibv
546  *   Verbs Rx queue object.
547  *
548  * @return
549  *   1 while a reference on it exists, 0 when freed.
550  */
551 static int
552 mlx5_rxq_ibv_release(struct mlx5_rxq_ibv *rxq_ibv)
553 {
554 	assert(rxq_ibv);
555 	assert(rxq_ibv->wq);
556 	assert(rxq_ibv->cq);
557 	if (rte_atomic32_dec_and_test(&rxq_ibv->refcnt)) {
558 		rxq_free_elts(rxq_ibv->rxq_ctrl);
559 		claim_zero(mlx5_glue->destroy_wq(rxq_ibv->wq));
560 		claim_zero(mlx5_glue->destroy_cq(rxq_ibv->cq));
561 		if (rxq_ibv->channel)
562 			claim_zero(mlx5_glue->destroy_comp_channel
563 				   (rxq_ibv->channel));
564 		LIST_REMOVE(rxq_ibv, next);
565 		rte_free(rxq_ibv);
566 		return 0;
567 	}
568 	return 1;
569 }
570 
571 /**
572  * Allocate queue vector and fill epoll fd list for Rx interrupts.
573  *
574  * @param dev
575  *   Pointer to Ethernet device.
576  *
577  * @return
578  *   0 on success, a negative errno value otherwise and rte_errno is set.
579  */
580 int
581 mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev)
582 {
583 	struct mlx5_priv *priv = dev->data->dev_private;
584 	unsigned int i;
585 	unsigned int rxqs_n = priv->rxqs_n;
586 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
587 	unsigned int count = 0;
588 	struct rte_intr_handle *intr_handle = dev->intr_handle;
589 
590 	if (!dev->data->dev_conf.intr_conf.rxq)
591 		return 0;
592 	mlx5_rx_intr_vec_disable(dev);
593 	intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0]));
594 	if (intr_handle->intr_vec == NULL) {
595 		DRV_LOG(ERR,
596 			"port %u failed to allocate memory for interrupt"
597 			" vector, Rx interrupts will not be supported",
598 			dev->data->port_id);
599 		rte_errno = ENOMEM;
600 		return -rte_errno;
601 	}
602 	intr_handle->type = RTE_INTR_HANDLE_EXT;
603 	for (i = 0; i != n; ++i) {
604 		/* This rxq ibv must not be released in this function. */
605 		struct mlx5_rxq_ibv *rxq_ibv = mlx5_rxq_ibv_get(dev, i);
606 		int fd;
607 		int flags;
608 		int rc;
609 
610 		/* Skip queues that cannot request interrupts. */
611 		if (!rxq_ibv || !rxq_ibv->channel) {
612 			/* Use invalid intr_vec[] index to disable entry. */
613 			intr_handle->intr_vec[i] =
614 				RTE_INTR_VEC_RXTX_OFFSET +
615 				RTE_MAX_RXTX_INTR_VEC_ID;
616 			continue;
617 		}
618 		if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
619 			DRV_LOG(ERR,
620 				"port %u too many Rx queues for interrupt"
621 				" vector size (%d), Rx interrupts cannot be"
622 				" enabled",
623 				dev->data->port_id, RTE_MAX_RXTX_INTR_VEC_ID);
624 			mlx5_rx_intr_vec_disable(dev);
625 			rte_errno = ENOMEM;
626 			return -rte_errno;
627 		}
628 		fd = rxq_ibv->channel->fd;
629 		flags = fcntl(fd, F_GETFL);
630 		rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
631 		if (rc < 0) {
632 			rte_errno = errno;
633 			DRV_LOG(ERR,
634 				"port %u failed to make Rx interrupt file"
635 				" descriptor %d non-blocking for queue index"
636 				" %d",
637 				dev->data->port_id, fd, i);
638 			mlx5_rx_intr_vec_disable(dev);
639 			return -rte_errno;
640 		}
641 		intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
642 		intr_handle->efds[count] = fd;
643 		count++;
644 	}
645 	if (!count)
646 		mlx5_rx_intr_vec_disable(dev);
647 	else
648 		intr_handle->nb_efd = count;
649 	return 0;
650 }
651 
652 /**
653  * Clean up Rx interrupts handler.
654  *
655  * @param dev
656  *   Pointer to Ethernet device.
657  */
658 void
659 mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev)
660 {
661 	struct mlx5_priv *priv = dev->data->dev_private;
662 	struct rte_intr_handle *intr_handle = dev->intr_handle;
663 	unsigned int i;
664 	unsigned int rxqs_n = priv->rxqs_n;
665 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
666 
667 	if (!dev->data->dev_conf.intr_conf.rxq)
668 		return;
669 	if (!intr_handle->intr_vec)
670 		goto free;
671 	for (i = 0; i != n; ++i) {
672 		struct mlx5_rxq_ctrl *rxq_ctrl;
673 		struct mlx5_rxq_data *rxq_data;
674 
675 		if (intr_handle->intr_vec[i] == RTE_INTR_VEC_RXTX_OFFSET +
676 		    RTE_MAX_RXTX_INTR_VEC_ID)
677 			continue;
678 		/*
679 		 * Need to access the queue directly to release the reference
680 		 * kept in mlx5_rx_intr_vec_enable().
681 		 */
682 		rxq_data = (*priv->rxqs)[i];
683 		rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
684 		if (rxq_ctrl->ibv)
685 			mlx5_rxq_ibv_release(rxq_ctrl->ibv);
686 	}
687 free:
688 	rte_intr_free_epoll_fd(intr_handle);
689 	if (intr_handle->intr_vec)
690 		free(intr_handle->intr_vec);
691 	intr_handle->nb_efd = 0;
692 	intr_handle->intr_vec = NULL;
693 }
694 
695 /**
696  * MLX5 CQ notification.
697  *
698  * @param rxq
699  *   Pointer to receive queue structure.
700  * @param sq_n_rxq
701  *   Sequence number per receive queue.
702  */
703 static inline void
704 mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq)
705 {
706 	int sq_n = 0;
707 	uint32_t doorbell_hi;
708 	uint64_t doorbell;
709 	void *cq_db_reg = (char *)rxq->cq_uar + MLX5_CQ_DOORBELL;
710 
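	/*
	 * Layout of the 64-bit arm doorbell written below:
	 *   bits 63..32 - arm sequence number and current CQ consumer index,
	 *   bits 31..0  - CQ number.
	 * The high word is also stored in the CQ doorbell record before the
	 * UAR register is rung.
	 */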
711 	sq_n = sq_n_rxq & MLX5_CQ_SQN_MASK;
712 	doorbell_hi = sq_n << MLX5_CQ_SQN_OFFSET | (rxq->cq_ci & MLX5_CI_MASK);
713 	doorbell = (uint64_t)doorbell_hi << 32;
714 	doorbell |=  rxq->cqn;
715 	rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
716 	mlx5_uar_write64(rte_cpu_to_be_64(doorbell),
717 			 cq_db_reg, rxq->uar_lock_cq);
718 }
719 
720 /**
721  * DPDK callback for Rx queue interrupt enable.
722  *
723  * @param dev
724  *   Pointer to Ethernet device structure.
725  * @param rx_queue_id
726  *   Rx queue number.
727  *
728  * @return
729  *   0 on success, a negative errno value otherwise and rte_errno is set.
730  */
731 int
732 mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
733 {
734 	struct mlx5_priv *priv = dev->data->dev_private;
735 	struct mlx5_rxq_data *rxq_data;
736 	struct mlx5_rxq_ctrl *rxq_ctrl;
737 
738 	rxq_data = (*priv->rxqs)[rx_queue_id];
739 	if (!rxq_data) {
740 		rte_errno = EINVAL;
741 		return -rte_errno;
742 	}
743 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
744 	if (rxq_ctrl->irq) {
745 		struct mlx5_rxq_ibv *rxq_ibv;
746 
747 		rxq_ibv = mlx5_rxq_ibv_get(dev, rx_queue_id);
748 		if (!rxq_ibv) {
749 			rte_errno = EINVAL;
750 			return -rte_errno;
751 		}
752 		mlx5_arm_cq(rxq_data, rxq_data->cq_arm_sn);
753 		mlx5_rxq_ibv_release(rxq_ibv);
754 	}
755 	return 0;
756 }
757 
758 /**
759  * DPDK callback for Rx queue interrupt disable.
760  *
761  * @param dev
762  *   Pointer to Ethernet device structure.
763  * @param rx_queue_id
764  *   Rx queue number.
765  *
766  * @return
767  *   0 on success, a negative errno value otherwise and rte_errno is set.
768  */
769 int
770 mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
771 {
772 	struct mlx5_priv *priv = dev->data->dev_private;
773 	struct mlx5_rxq_data *rxq_data;
774 	struct mlx5_rxq_ctrl *rxq_ctrl;
775 	struct mlx5_rxq_ibv *rxq_ibv = NULL;
776 	struct ibv_cq *ev_cq;
777 	void *ev_ctx;
778 	int ret;
779 
780 	rxq_data = (*priv->rxqs)[rx_queue_id];
781 	if (!rxq_data) {
782 		rte_errno = EINVAL;
783 		return -rte_errno;
784 	}
785 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
786 	if (!rxq_ctrl->irq)
787 		return 0;
788 	rxq_ibv = mlx5_rxq_ibv_get(dev, rx_queue_id);
789 	if (!rxq_ibv) {
790 		rte_errno = EINVAL;
791 		return -rte_errno;
792 	}
793 	ret = mlx5_glue->get_cq_event(rxq_ibv->channel, &ev_cq, &ev_ctx);
794 	if (ret || ev_cq != rxq_ibv->cq) {
795 		rte_errno = EINVAL;
796 		goto exit;
797 	}
798 	rxq_data->cq_arm_sn++;
799 	mlx5_glue->ack_cq_events(rxq_ibv->cq, 1);
800 	mlx5_rxq_ibv_release(rxq_ibv);
801 	return 0;
802 exit:
803 	ret = rte_errno; /* Save rte_errno before cleanup. */
804 	if (rxq_ibv)
805 		mlx5_rxq_ibv_release(rxq_ibv);
806 	DRV_LOG(WARNING, "port %u unable to disable interrupt on Rx queue %d",
807 		dev->data->port_id, rx_queue_id);
808 	rte_errno = ret; /* Restore rte_errno. */
809 	return -rte_errno;
810 }
811 
812 /**
813  * Create the Rx queue Verbs object.
814  *
815  * @param dev
816  *   Pointer to Ethernet device.
817  * @param idx
818  *   Queue index in DPDK Rx queue array
819  *
820  * @return
821  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
822  */
823 struct mlx5_rxq_ibv *
824 mlx5_rxq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
825 {
826 	struct mlx5_priv *priv = dev->data->dev_private;
827 	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
828 	struct mlx5_rxq_ctrl *rxq_ctrl =
829 		container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
830 	struct ibv_wq_attr mod;
831 	union {
832 		struct {
833 			struct ibv_cq_init_attr_ex ibv;
834 			struct mlx5dv_cq_init_attr mlx5;
835 		} cq;
836 		struct {
837 			struct ibv_wq_init_attr ibv;
838 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
839 			struct mlx5dv_wq_init_attr mlx5;
840 #endif
841 		} wq;
842 		struct ibv_cq_ex cq_attr;
843 	} attr;
844 	unsigned int cqe_n;
845 	unsigned int wqe_n = 1 << rxq_data->elts_n;
846 	struct mlx5_rxq_ibv *tmpl = NULL;
847 	struct mlx5dv_cq cq_info;
848 	struct mlx5dv_rwq rwq;
849 	unsigned int i;
850 	int ret = 0;
851 	struct mlx5dv_obj obj;
852 	struct mlx5_dev_config *config = &priv->config;
853 	const int mprq_en = mlx5_rxq_mprq_enabled(rxq_data);
854 
855 	assert(rxq_data);
856 	assert(!rxq_ctrl->ibv);
857 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_RX_QUEUE;
858 	priv->verbs_alloc_ctx.obj = rxq_ctrl;
859 	tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
860 				 rxq_ctrl->socket);
861 	if (!tmpl) {
862 		DRV_LOG(ERR,
863 			"port %u Rx queue %u cannot allocate verbs resources",
864 			dev->data->port_id, rxq_data->idx);
865 		rte_errno = ENOMEM;
866 		goto error;
867 	}
868 	tmpl->rxq_ctrl = rxq_ctrl;
869 	if (rxq_ctrl->irq) {
870 		tmpl->channel = mlx5_glue->create_comp_channel(priv->sh->ctx);
871 		if (!tmpl->channel) {
872 			DRV_LOG(ERR, "port %u: comp channel creation failure",
873 				dev->data->port_id);
874 			rte_errno = ENOMEM;
875 			goto error;
876 		}
877 	}
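	/*
	 * Size the CQ for one completion per packet: with Multi-Packet RQ a
	 * single WQE can produce up to 2^strd_num_n completions (one per
	 * stride), otherwise at most one completion per WQE.
	 */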
878 	if (mprq_en)
879 		cqe_n = wqe_n * (1 << rxq_data->strd_num_n) - 1;
880 	else
881 		cqe_n = wqe_n  - 1;
882 	attr.cq.ibv = (struct ibv_cq_init_attr_ex){
883 		.cqe = cqe_n,
884 		.channel = tmpl->channel,
885 		.comp_mask = 0,
886 	};
887 	attr.cq.mlx5 = (struct mlx5dv_cq_init_attr){
888 		.comp_mask = 0,
889 	};
890 	if (config->cqe_comp && !rxq_data->hw_timestamp) {
891 		attr.cq.mlx5.comp_mask |=
892 			MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
893 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
894 		attr.cq.mlx5.cqe_comp_res_format =
895 			mprq_en ? MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX :
896 				  MLX5DV_CQE_RES_FORMAT_HASH;
897 #else
898 		attr.cq.mlx5.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
899 #endif
900 		/*
901 		 * For vectorized Rx, the CQE count must not be doubled to keep
902 		 * cq_ci and rq_ci aligned.
903 		 */
904 		if (mlx5_rxq_check_vec_support(rxq_data) < 0)
905 			attr.cq.ibv.cqe *= 2;
906 	} else if (config->cqe_comp && rxq_data->hw_timestamp) {
907 		DRV_LOG(DEBUG,
908 			"port %u Rx CQE compression is disabled for HW"
909 			" timestamp",
910 			dev->data->port_id);
911 	}
912 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
913 	if (config->cqe_pad) {
914 		attr.cq.mlx5.comp_mask |= MLX5DV_CQ_INIT_ATTR_MASK_FLAGS;
915 		attr.cq.mlx5.flags |= MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD;
916 	}
917 #endif
918 	tmpl->cq = mlx5_glue->cq_ex_to_cq
919 		(mlx5_glue->dv_create_cq(priv->sh->ctx, &attr.cq.ibv,
920 					 &attr.cq.mlx5));
921 	if (tmpl->cq == NULL) {
922 		DRV_LOG(ERR, "port %u Rx queue %u CQ creation failure",
923 			dev->data->port_id, idx);
924 		rte_errno = ENOMEM;
925 		goto error;
926 	}
927 	DRV_LOG(DEBUG, "port %u device_attr.max_qp_wr is %d",
928 		dev->data->port_id, priv->sh->device_attr.orig_attr.max_qp_wr);
929 	DRV_LOG(DEBUG, "port %u device_attr.max_sge is %d",
930 		dev->data->port_id, priv->sh->device_attr.orig_attr.max_sge);
931 	attr.wq.ibv = (struct ibv_wq_init_attr){
932 		.wq_context = NULL, /* Could be useful in the future. */
933 		.wq_type = IBV_WQT_RQ,
934 		/* Max number of outstanding WRs. */
935 		.max_wr = wqe_n >> rxq_data->sges_n,
936 		/* Max number of scatter/gather elements in a WR. */
937 		.max_sge = 1 << rxq_data->sges_n,
938 		.pd = priv->sh->pd,
939 		.cq = tmpl->cq,
940 		.comp_mask =
941 			IBV_WQ_FLAGS_CVLAN_STRIPPING |
942 			0,
943 		.create_flags = (rxq_data->vlan_strip ?
944 				 IBV_WQ_FLAGS_CVLAN_STRIPPING :
945 				 0),
946 	};
947 	/* By default, FCS (CRC) is stripped by hardware. */
948 	if (rxq_data->crc_present) {
949 		attr.wq.ibv.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
950 		attr.wq.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
951 	}
952 	if (config->hw_padding) {
953 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
954 		attr.wq.ibv.create_flags |= IBV_WQ_FLAG_RX_END_PADDING;
955 		attr.wq.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
956 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
957 		attr.wq.ibv.create_flags |= IBV_WQ_FLAGS_PCI_WRITE_END_PADDING;
958 		attr.wq.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
959 #endif
960 	}
961 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
962 	attr.wq.mlx5 = (struct mlx5dv_wq_init_attr){
963 		.comp_mask = 0,
964 	};
965 	if (mprq_en) {
966 		struct mlx5dv_striding_rq_init_attr *mprq_attr =
967 			&attr.wq.mlx5.striding_rq_attrs;
968 
969 		attr.wq.mlx5.comp_mask |= MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ;
970 		*mprq_attr = (struct mlx5dv_striding_rq_init_attr){
971 			.single_stride_log_num_of_bytes = rxq_data->strd_sz_n,
972 			.single_wqe_log_num_of_strides = rxq_data->strd_num_n,
973 			.two_byte_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT,
974 		};
975 	}
976 	tmpl->wq = mlx5_glue->dv_create_wq(priv->sh->ctx, &attr.wq.ibv,
977 					   &attr.wq.mlx5);
978 #else
979 	tmpl->wq = mlx5_glue->create_wq(priv->sh->ctx, &attr.wq.ibv);
980 #endif
981 	if (tmpl->wq == NULL) {
982 		DRV_LOG(ERR, "port %u Rx queue %u WQ creation failure",
983 			dev->data->port_id, idx);
984 		rte_errno = ENOMEM;
985 		goto error;
986 	}
987 	/*
988 	 * Make sure the number of WRs*SGEs matches expectations since a queue
989 	 * cannot allocate more than "desc" buffers.
990 	 */
991 	if (attr.wq.ibv.max_wr != (wqe_n >> rxq_data->sges_n) ||
992 	    attr.wq.ibv.max_sge != (1u << rxq_data->sges_n)) {
993 		DRV_LOG(ERR,
994 			"port %u Rx queue %u requested %u*%u but got %u*%u"
995 			" WRs*SGEs",
996 			dev->data->port_id, idx,
997 			wqe_n >> rxq_data->sges_n, (1 << rxq_data->sges_n),
998 			attr.wq.ibv.max_wr, attr.wq.ibv.max_sge);
999 		rte_errno = EINVAL;
1000 		goto error;
1001 	}
1002 	/* Change queue state to ready. */
1003 	mod = (struct ibv_wq_attr){
1004 		.attr_mask = IBV_WQ_ATTR_STATE,
1005 		.wq_state = IBV_WQS_RDY,
1006 	};
1007 	ret = mlx5_glue->modify_wq(tmpl->wq, &mod);
1008 	if (ret) {
1009 		DRV_LOG(ERR,
1010 			"port %u Rx queue %u WQ state to IBV_WQS_RDY failed",
1011 			dev->data->port_id, idx);
1012 		rte_errno = ret;
1013 		goto error;
1014 	}
1015 	obj.cq.in = tmpl->cq;
1016 	obj.cq.out = &cq_info;
1017 	obj.rwq.in = tmpl->wq;
1018 	obj.rwq.out = &rwq;
1019 	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ);
1020 	if (ret) {
1021 		rte_errno = ret;
1022 		goto error;
1023 	}
1024 	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
1025 		DRV_LOG(ERR,
1026 			"port %u wrong MLX5_CQE_SIZE environment variable"
1027 			" value: it should be set to %u",
1028 			dev->data->port_id, RTE_CACHE_LINE_SIZE);
1029 		rte_errno = EINVAL;
1030 		goto error;
1031 	}
1032 	/* Fill the rings. */
1033 	rxq_data->wqes = rwq.buf;
1034 	for (i = 0; (i != wqe_n); ++i) {
1035 		volatile struct mlx5_wqe_data_seg *scat;
1036 		uintptr_t addr;
1037 		uint32_t byte_count;
1038 
1039 		if (mprq_en) {
1040 			struct mlx5_mprq_buf *buf = (*rxq_data->mprq_bufs)[i];
1041 
1042 			scat = &((volatile struct mlx5_wqe_mprq *)
1043 				 rxq_data->wqes)[i].dseg;
1044 			addr = (uintptr_t)mlx5_mprq_buf_addr(buf);
1045 			byte_count = (1 << rxq_data->strd_sz_n) *
1046 				     (1 << rxq_data->strd_num_n);
1047 		} else {
1048 			struct rte_mbuf *buf = (*rxq_data->elts)[i];
1049 
1050 			scat = &((volatile struct mlx5_wqe_data_seg *)
1051 				 rxq_data->wqes)[i];
1052 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1053 			byte_count = DATA_LEN(buf);
1054 		}
1055 		/* scat->addr must be able to store a pointer. */
1056 		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
1057 		*scat = (struct mlx5_wqe_data_seg){
1058 			.addr = rte_cpu_to_be_64(addr),
1059 			.byte_count = rte_cpu_to_be_32(byte_count),
1060 			.lkey = mlx5_rx_addr2mr(rxq_data, addr),
1061 		};
1062 	}
1063 	rxq_data->rq_db = rwq.dbrec;
1064 	rxq_data->cqe_n = log2above(cq_info.cqe_cnt);
1065 	rxq_data->cq_ci = 0;
1066 	rxq_data->consumed_strd = 0;
1067 	rxq_data->rq_pi = 0;
1068 	rxq_data->zip = (struct rxq_zip){
1069 		.ai = 0,
1070 	};
1071 	rxq_data->cq_db = cq_info.dbrec;
1072 	rxq_data->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf;
1073 	rxq_data->cq_uar = cq_info.cq_uar;
1074 	rxq_data->cqn = cq_info.cqn;
1075 	rxq_data->cq_arm_sn = 0;
1076 	/* Update doorbell counter. */
1077 	rxq_data->rq_ci = wqe_n >> rxq_data->sges_n;
1078 	rte_wmb();
1079 	*rxq_data->rq_db = rte_cpu_to_be_32(rxq_data->rq_ci);
1080 	DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data->port_id,
1081 		idx, (void *)tmpl);
1082 	rte_atomic32_inc(&tmpl->refcnt);
1083 	LIST_INSERT_HEAD(&priv->rxqsibv, tmpl, next);
1084 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
1085 	return tmpl;
1086 error:
1087 	if (tmpl) {
1088 		ret = rte_errno; /* Save rte_errno before cleanup. */
1089 		if (tmpl->wq)
1090 			claim_zero(mlx5_glue->destroy_wq(tmpl->wq));
1091 		if (tmpl->cq)
1092 			claim_zero(mlx5_glue->destroy_cq(tmpl->cq));
1093 		if (tmpl->channel)
1094 			claim_zero(mlx5_glue->destroy_comp_channel
1095 							(tmpl->channel));
1096 		rte_free(tmpl);
1097 		rte_errno = ret; /* Restore rte_errno. */
1098 	}
1099 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
1100 	return NULL;
1101 }
1102 
1103 /**
1104  * Verify the Verbs Rx queue list is empty.
1105  *
1106  * @param dev
1107  *   Pointer to Ethernet device.
1108  *
1109  * @return
1110  *   The number of objects not released.
1111  */
1112 int
1113 mlx5_rxq_ibv_verify(struct rte_eth_dev *dev)
1114 {
1115 	struct mlx5_priv *priv = dev->data->dev_private;
1116 	int ret = 0;
1117 	struct mlx5_rxq_ibv *rxq_ibv;
1118 
1119 	LIST_FOREACH(rxq_ibv, &priv->rxqsibv, next) {
1120 		DRV_LOG(DEBUG, "port %u Verbs Rx queue %u still referenced",
1121 			dev->data->port_id, rxq_ibv->rxq_ctrl->rxq.idx);
1122 		++ret;
1123 	}
1124 	return ret;
1125 }
1126 
1127 /**
1128  * Callback function to initialize mbufs for Multi-Packet RQ.
1129  */
1130 static inline void
1131 mlx5_mprq_buf_init(struct rte_mempool *mp, void *opaque_arg __rte_unused,
1132 		    void *_m, unsigned int i __rte_unused)
1133 {
1134 	struct mlx5_mprq_buf *buf = _m;
1135 
1136 	memset(_m, 0, sizeof(*buf));
1137 	buf->mp = mp;
1138 	rte_atomic16_set(&buf->refcnt, 1);
1139 }
1140 
1141 /**
1142  * Free mempool of Multi-Packet RQ.
1143  *
1144  * @param dev
1145  *   Pointer to Ethernet device.
1146  *
1147  * @return
1148  *   0 on success, negative errno value on failure.
1149  */
1150 int
1151 mlx5_mprq_free_mp(struct rte_eth_dev *dev)
1152 {
1153 	struct mlx5_priv *priv = dev->data->dev_private;
1154 	struct rte_mempool *mp = priv->mprq_mp;
1155 	unsigned int i;
1156 
1157 	if (mp == NULL)
1158 		return 0;
1159 	DRV_LOG(DEBUG, "port %u freeing mempool (%s) for Multi-Packet RQ",
1160 		dev->data->port_id, mp->name);
1161 	/*
1162 	 * If a buffer in the pool has been externally attached to a mbuf and is
1163 	 * still in use by the application, destroying the Rx queue can spoil
1164 	 * the packet. It is unlikely to happen but it can if the application
1165 	 * dynamically creates and destroys queues while holding Rx packets.
1166 	 *
1167 	 * TODO: It is unavoidable for now because the mempool for Multi-Packet
1168 	 * RQ isn't provided by the application but managed by the PMD.
1169 	 */
1170 	if (!rte_mempool_full(mp)) {
1171 		DRV_LOG(ERR,
1172 			"port %u mempool for Multi-Packet RQ is still in use",
1173 			dev->data->port_id);
1174 		rte_errno = EBUSY;
1175 		return -rte_errno;
1176 	}
1177 	rte_mempool_free(mp);
1178 	/* Unset mempool for each Rx queue. */
1179 	for (i = 0; i != priv->rxqs_n; ++i) {
1180 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
1181 
1182 		if (rxq == NULL)
1183 			continue;
1184 		rxq->mprq_mp = NULL;
1185 	}
1186 	priv->mprq_mp = NULL;
1187 	return 0;
1188 }
1189 
1190 /**
1191  * Allocate a mempool for Multi-Packet RQ. All configured Rx queues share the
1192  * mempool. If already allocated, reuse it if there are enough elements.
1193  * Otherwise, resize it.
1194  *
1195  * @param dev
1196  *   Pointer to Ethernet device.
1197  *
1198  * @return
1199  *   0 on success, negative errno value on failure.
1200  */
1201 int
1202 mlx5_mprq_alloc_mp(struct rte_eth_dev *dev)
1203 {
1204 	struct mlx5_priv *priv = dev->data->dev_private;
1205 	struct rte_mempool *mp = priv->mprq_mp;
1206 	char name[RTE_MEMPOOL_NAMESIZE];
1207 	unsigned int desc = 0;
1208 	unsigned int buf_len;
1209 	unsigned int obj_num;
1210 	unsigned int obj_size;
1211 	unsigned int strd_num_n = 0;
1212 	unsigned int strd_sz_n = 0;
1213 	unsigned int i;
1214 
1215 	if (!mlx5_mprq_enabled(dev))
1216 		return 0;
1217 	/* Count the total number of descriptors configured. */
1218 	for (i = 0; i != priv->rxqs_n; ++i) {
1219 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
1220 
1221 		if (rxq == NULL)
1222 			continue;
1223 		desc += 1 << rxq->elts_n;
1224 		/* Get the max number of strides. */
1225 		if (strd_num_n < rxq->strd_num_n)
1226 			strd_num_n = rxq->strd_num_n;
1227 		/* Get the max size of a stride. */
1228 		if (strd_sz_n < rxq->strd_sz_n)
1229 			strd_sz_n = rxq->strd_sz_n;
1230 	}
1231 	assert(strd_num_n && strd_sz_n);
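	/*
	 * Each mempool object holds a mlx5_mprq_buf header followed by one
	 * contiguous buffer covering all strides of a single WQE, sized for
	 * the largest stride layout found above.
	 */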
1232 	buf_len = (1 << strd_num_n) * (1 << strd_sz_n);
1233 	obj_size = buf_len + sizeof(struct mlx5_mprq_buf);
1234 	/*
1235 	 * Received packets can be either memcpy'd or externally referenced.
1236 	 * When a packet is attached to an mbuf as an external buffer, it isn't
1237 	 * possible to predict how the buffers will be queued by the
1238 	 * application, so there's no way to pre-allocate the exact number of
1239 	 * buffers in advance; enough buffers must be prepared speculatively.
1240 	 *
1241 	 * In the data path, if this mempool is depleted, the PMD will memcpy
1242 	 * received packets into buffers provided by the application (rxq->mp)
1243 	 * until this mempool becomes available again.
1244 	 */
1245 	desc *= 4;
1246 	obj_num = desc + MLX5_MPRQ_MP_CACHE_SZ * priv->rxqs_n;
1247 	/*
1248 	 * rte_mempool_create_empty() has a sanity check that refuses a cache
1249 	 * size too large compared to the number of elements.
1250 	 * CACHE_FLUSHTHRESH_MULTIPLIER is defined in a C file, so using a
1251 	 * constant number 2 instead.
1252 	 */
1253 	obj_num = RTE_MAX(obj_num, MLX5_MPRQ_MP_CACHE_SZ * 2);
1254 	/* Check whether a mempool is already allocated and can be reused. */
1255 	if (mp != NULL && mp->elt_size >= obj_size && mp->size >= obj_num) {
1256 		DRV_LOG(DEBUG, "port %u mempool %s is being reused",
1257 			dev->data->port_id, mp->name);
1258 		/* Reuse. */
1259 		goto exit;
1260 	} else if (mp != NULL) {
1261 		DRV_LOG(DEBUG, "port %u mempool %s should be resized, freeing it",
1262 			dev->data->port_id, mp->name);
1263 		/*
1264 		 * If freeing fails, the mempool may still be in use and there
1265 		 * is no choice but to keep using the existing one. On buffer
1266 		 * underrun, packets will be memcpy'd instead of being attached
1267 		 * as external buffers.
1268 		 */
1269 		if (mlx5_mprq_free_mp(dev)) {
1270 			if (mp->elt_size >= obj_size)
1271 				goto exit;
1272 			else
1273 				return -rte_errno;
1274 		}
1275 	}
1276 	snprintf(name, sizeof(name), "port-%u-mprq", dev->data->port_id);
1277 	mp = rte_mempool_create(name, obj_num, obj_size, MLX5_MPRQ_MP_CACHE_SZ,
1278 				0, NULL, NULL, mlx5_mprq_buf_init, NULL,
1279 				dev->device->numa_node, 0);
1280 	if (mp == NULL) {
1281 		DRV_LOG(ERR,
1282 			"port %u failed to allocate a mempool for"
1283 			" Multi-Packet RQ, count=%u, size=%u",
1284 			dev->data->port_id, obj_num, obj_size);
1285 		rte_errno = ENOMEM;
1286 		return -rte_errno;
1287 	}
1288 	priv->mprq_mp = mp;
1289 exit:
1290 	/* Set mempool for each Rx queue. */
1291 	for (i = 0; i != priv->rxqs_n; ++i) {
1292 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
1293 
1294 		if (rxq == NULL)
1295 			continue;
1296 		rxq->mprq_mp = mp;
1297 	}
1298 	DRV_LOG(INFO, "port %u Multi-Packet RQ is configured",
1299 		dev->data->port_id);
1300 	return 0;
1301 }
1302 
1303 /**
1304  * Create a DPDK Rx queue.
1305  *
1306  * @param dev
1307  *   Pointer to Ethernet device.
1308  * @param idx
1309  *   RX queue index.
1310  * @param desc
1311  *   Number of descriptors to configure in queue.
1312  * @param socket
1313  *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
1314  *
1315  * @return
1316  *   A DPDK queue object on success, NULL otherwise and rte_errno is set.
1317  */
1318 struct mlx5_rxq_ctrl *
1319 mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1320 	     unsigned int socket, const struct rte_eth_rxconf *conf,
1321 	     struct rte_mempool *mp)
1322 {
1323 	struct mlx5_priv *priv = dev->data->dev_private;
1324 	struct mlx5_rxq_ctrl *tmpl;
1325 	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
1326 	unsigned int mprq_stride_size;
1327 	struct mlx5_dev_config *config = &priv->config;
1328 	/*
1329 	 * Always allocate extra slots, even if eventually
1330 	 * the vector Rx will not be used.
1331 	 */
1332 	uint16_t desc_n =
1333 		desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
1334 	uint64_t offloads = conf->offloads |
1335 			   dev->data->dev_conf.rxmode.offloads;
1336 	const int mprq_en = mlx5_check_mprq_support(dev) > 0;
1337 
1338 	tmpl = rte_calloc_socket("RXQ", 1,
1339 				 sizeof(*tmpl) +
1340 				 desc_n * sizeof(struct rte_mbuf *),
1341 				 0, socket);
1342 	if (!tmpl) {
1343 		rte_errno = ENOMEM;
1344 		return NULL;
1345 	}
1346 	if (mlx5_mr_btree_init(&tmpl->rxq.mr_ctrl.cache_bh,
1347 			       MLX5_MR_BTREE_CACHE_N, socket)) {
1348 		/* rte_errno is already set. */
1349 		goto error;
1350 	}
1351 	tmpl->socket = socket;
1352 	if (dev->data->dev_conf.intr_conf.rxq)
1353 		tmpl->irq = 1;
1354 	/*
1355 	 * This Rx queue can be configured as a Multi-Packet RQ if all of the
1356 	 * following conditions are met:
1357 	 *  - MPRQ is enabled.
1358 	 *  - The number of descs is more than the number of strides.
1359 	 *  - max_rx_pkt_len plus overhead is less than the max size of a
1360 	 *    stride.
1361 	 *  Otherwise, enable Rx scatter if necessary.
1362 	 */
1363 	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
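	/*
	 * A stride must fit a complete packet plus the mbuf headroom and the
	 * rte_mbuf_ext_shared_info footer used when a stride is attached to
	 * an mbuf as an external buffer.
	 */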
1364 	mprq_stride_size =
1365 		dev->data->dev_conf.rxmode.max_rx_pkt_len +
1366 		sizeof(struct rte_mbuf_ext_shared_info) +
1367 		RTE_PKTMBUF_HEADROOM;
1368 	if (mprq_en &&
1369 	    desc > (1U << config->mprq.stride_num_n) &&
1370 	    mprq_stride_size <= (1U << config->mprq.max_stride_size_n)) {
1371 		/* TODO: Rx scatter isn't supported yet. */
1372 		tmpl->rxq.sges_n = 0;
1373 		/* Trim the number of descs needed. */
1374 		desc >>= config->mprq.stride_num_n;
1375 		tmpl->rxq.strd_num_n = config->mprq.stride_num_n;
1376 		tmpl->rxq.strd_sz_n = RTE_MAX(log2above(mprq_stride_size),
1377 					      config->mprq.min_stride_size_n);
1378 		tmpl->rxq.strd_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT;
1379 		tmpl->rxq.mprq_max_memcpy_len =
1380 			RTE_MIN(mb_len - RTE_PKTMBUF_HEADROOM,
1381 				config->mprq.max_memcpy_len);
1382 		DRV_LOG(DEBUG,
1383 			"port %u Rx queue %u: Multi-Packet RQ is enabled"
1384 			" strd_num_n = %u, strd_sz_n = %u",
1385 			dev->data->port_id, idx,
1386 			tmpl->rxq.strd_num_n, tmpl->rxq.strd_sz_n);
1387 	} else if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
1388 		   (mb_len - RTE_PKTMBUF_HEADROOM)) {
1389 		tmpl->rxq.sges_n = 0;
1390 	} else if (offloads & DEV_RX_OFFLOAD_SCATTER) {
1391 		unsigned int size =
1392 			RTE_PKTMBUF_HEADROOM +
1393 			dev->data->dev_conf.rxmode.max_rx_pkt_len;
1394 		unsigned int sges_n;
1395 
1396 		/*
1397 		 * Determine the number of SGEs needed for a full packet
1398 		 * and round it to the next power of two.
1399 		 */
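		/*
		 * Illustrative example (assuming the default 128-byte
		 * headroom): a 9000-byte frame with 2048-byte data rooms
		 * needs 5 buffers, rounded up to 8 SGEs (2^3).
		 */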
1400 		sges_n = log2above((size / mb_len) + !!(size % mb_len));
1401 		tmpl->rxq.sges_n = sges_n;
1402 		/* Make sure rxq.sges_n did not overflow. */
1403 		size = mb_len * (1 << tmpl->rxq.sges_n);
1404 		size -= RTE_PKTMBUF_HEADROOM;
1405 		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
1406 			DRV_LOG(ERR,
1407 				"port %u too many SGEs (%u) needed to handle"
1408 				" requested maximum packet size %u",
1409 				dev->data->port_id,
1410 				1 << sges_n,
1411 				dev->data->dev_conf.rxmode.max_rx_pkt_len);
1412 			rte_errno = EOVERFLOW;
1413 			goto error;
1414 		}
1415 	} else {
1416 		DRV_LOG(WARNING,
1417 			"port %u the requested maximum Rx packet size (%u) is"
1418 			" larger than a single mbuf (%u) and scattered mode has"
1419 			" not been requested",
1420 			dev->data->port_id,
1421 			dev->data->dev_conf.rxmode.max_rx_pkt_len,
1422 			mb_len - RTE_PKTMBUF_HEADROOM);
1423 	}
1424 	if (mprq_en && !mlx5_rxq_mprq_enabled(&tmpl->rxq))
1425 		DRV_LOG(WARNING,
1426 			"port %u MPRQ is requested but cannot be enabled"
1427 			" (requested: desc = %u, stride_sz = %u,"
1428 			" supported: min_stride_num = %u, max_stride_sz = %u).",
1429 			dev->data->port_id, desc, mprq_stride_size,
1430 			(1 << config->mprq.stride_num_n),
1431 			(1 << config->mprq.max_stride_size_n));
1432 	DRV_LOG(DEBUG, "port %u maximum number of segments per packet: %u",
1433 		dev->data->port_id, 1 << tmpl->rxq.sges_n);
1434 	if (desc % (1 << tmpl->rxq.sges_n)) {
1435 		DRV_LOG(ERR,
1436 			"port %u number of Rx queue descriptors (%u) is not a"
1437 			" multiple of SGEs per packet (%u)",
1438 			dev->data->port_id,
1439 			desc,
1440 			1 << tmpl->rxq.sges_n);
1441 		rte_errno = EINVAL;
1442 		goto error;
1443 	}
1444 	/* Toggle RX checksum offload if hardware supports it. */
1445 	tmpl->rxq.csum = !!(offloads & DEV_RX_OFFLOAD_CHECKSUM);
1446 	tmpl->rxq.hw_timestamp = !!(offloads & DEV_RX_OFFLOAD_TIMESTAMP);
1447 	/* Configure VLAN stripping. */
1448 	tmpl->rxq.vlan_strip = !!(offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1449 	/* By default, FCS (CRC) is stripped by hardware. */
1450 	tmpl->rxq.crc_present = 0;
1451 	if (offloads & DEV_RX_OFFLOAD_KEEP_CRC) {
1452 		if (config->hw_fcs_strip) {
1453 			tmpl->rxq.crc_present = 1;
1454 		} else {
1455 			DRV_LOG(WARNING,
1456 				"port %u CRC stripping has been disabled but will"
1457 				" still be performed by hardware, make sure MLNX_OFED"
1458 				" and firmware are up to date",
1459 				dev->data->port_id);
1460 		}
1461 	}
1462 	DRV_LOG(DEBUG,
1463 		"port %u CRC stripping is %s, %u bytes will be subtracted from"
1464 		" incoming frames to hide it",
1465 		dev->data->port_id,
1466 		tmpl->rxq.crc_present ? "disabled" : "enabled",
1467 		tmpl->rxq.crc_present << 2);
1468 	tmpl->rxq.rss_hash = !!priv->rss_conf.rss_hf &&
1469 		(!!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS));
1470 	/* Save port ID. */
1471 	tmpl->rxq.port_id = dev->data->port_id;
1472 	tmpl->priv = priv;
1473 	tmpl->rxq.mp = mp;
1474 	tmpl->rxq.elts_n = log2above(desc);
1475 	tmpl->rxq.rq_repl_thresh =
1476 		MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);
1477 	tmpl->rxq.elts =
1478 		(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
1479 #ifndef RTE_ARCH_64
1480 	tmpl->rxq.uar_lock_cq = &priv->uar_lock_cq;
1481 #endif
1482 	tmpl->rxq.idx = idx;
1483 	rte_atomic32_inc(&tmpl->refcnt);
1484 	LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next);
1485 	return tmpl;
1486 error:
1487 	rte_free(tmpl);
1488 	return NULL;
1489 }
1490 
1491 /**
1492  * Get a Rx queue.
1493  *
1494  * @param dev
1495  *   Pointer to Ethernet device.
1496  * @param idx
1497  *   RX queue index.
1498  *
1499  * @return
1500  *   A pointer to the queue if it exists, NULL otherwise.
1501  */
1502 struct mlx5_rxq_ctrl *
1503 mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx)
1504 {
1505 	struct mlx5_priv *priv = dev->data->dev_private;
1506 	struct mlx5_rxq_ctrl *rxq_ctrl = NULL;
1507 
1508 	if ((*priv->rxqs)[idx]) {
1509 		rxq_ctrl = container_of((*priv->rxqs)[idx],
1510 					struct mlx5_rxq_ctrl,
1511 					rxq);
1512 		mlx5_rxq_ibv_get(dev, idx);
1513 		rte_atomic32_inc(&rxq_ctrl->refcnt);
1514 	}
1515 	return rxq_ctrl;
1516 }
1517 
1518 /**
1519  * Release a Rx queue.
1520  *
1521  * @param dev
1522  *   Pointer to Ethernet device.
1523  * @param idx
1524  *   RX queue index.
1525  *
1526  * @return
1527  *   1 while a reference on it exists, 0 when freed.
1528  */
1529 int
1530 mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx)
1531 {
1532 	struct mlx5_priv *priv = dev->data->dev_private;
1533 	struct mlx5_rxq_ctrl *rxq_ctrl;
1534 
1535 	if (!(*priv->rxqs)[idx])
1536 		return 0;
1537 	rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
1538 	assert(rxq_ctrl->priv);
1539 	if (rxq_ctrl->ibv && !mlx5_rxq_ibv_release(rxq_ctrl->ibv))
1540 		rxq_ctrl->ibv = NULL;
1541 	if (rte_atomic32_dec_and_test(&rxq_ctrl->refcnt)) {
1542 		mlx5_mr_btree_free(&rxq_ctrl->rxq.mr_ctrl.cache_bh);
1543 		LIST_REMOVE(rxq_ctrl, next);
1544 		rte_free(rxq_ctrl);
1545 		(*priv->rxqs)[idx] = NULL;
1546 		return 0;
1547 	}
1548 	return 1;
1549 }
1550 
1551 /**
1552  * Verify the Rx queue list is empty.
1553  *
1554  * @param dev
1555  *   Pointer to Ethernet device.
1556  *
1557  * @return
1558  *   The number of objects not released.
1559  */
1560 int
1561 mlx5_rxq_verify(struct rte_eth_dev *dev)
1562 {
1563 	struct mlx5_priv *priv = dev->data->dev_private;
1564 	struct mlx5_rxq_ctrl *rxq_ctrl;
1565 	int ret = 0;
1566 
1567 	LIST_FOREACH(rxq_ctrl, &priv->rxqsctrl, next) {
1568 		DRV_LOG(DEBUG, "port %u Rx Queue %u still referenced",
1569 			dev->data->port_id, rxq_ctrl->rxq.idx);
1570 		++ret;
1571 	}
1572 	return ret;
1573 }
1574 
1575 /**
1576  * Create an indirection table.
1577  *
1578  * @param dev
1579  *   Pointer to Ethernet device.
1580  * @param queues
1581  *   Queues entering the indirection table.
1582  * @param queues_n
1583  *   Number of queues in the array.
1584  *
1585  * @return
1586  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
1587  */
1588 static struct mlx5_ind_table_ibv *
1589 mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, const uint16_t *queues,
1590 		       uint32_t queues_n)
1591 {
1592 	struct mlx5_priv *priv = dev->data->dev_private;
1593 	struct mlx5_ind_table_ibv *ind_tbl;
1594 	const unsigned int wq_n = rte_is_power_of_2(queues_n) ?
1595 		log2above(queues_n) :
1596 		log2above(priv->config.ind_table_max_size);
1597 	struct ibv_wq *wq[1 << wq_n];
1598 	unsigned int i;
1599 	unsigned int j;
1600 
1601 	ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl) +
1602 			     queues_n * sizeof(uint16_t), 0);
1603 	if (!ind_tbl) {
1604 		rte_errno = ENOMEM;
1605 		return NULL;
1606 	}
1607 	for (i = 0; i != queues_n; ++i) {
1608 		struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev, queues[i]);
1609 
1610 		if (!rxq)
1611 			goto error;
1612 		wq[i] = rxq->ibv->wq;
1613 		ind_tbl->queues[i] = queues[i];
1614 	}
1615 	ind_tbl->queues_n = queues_n;
1616 	/* Finalise indirection table (pad by replicating queues). */
1617 	for (j = 0; i != (unsigned int)(1 << wq_n); ++i, ++j)
1618 		wq[i] = wq[j];
1619 	ind_tbl->ind_table = mlx5_glue->create_rwq_ind_table
1620 		(priv->sh->ctx,
1621 		 &(struct ibv_rwq_ind_table_init_attr){
1622 			.log_ind_tbl_size = wq_n,
1623 			.ind_tbl = wq,
1624 			.comp_mask = 0,
1625 		 });
1626 	if (!ind_tbl->ind_table) {
1627 		rte_errno = errno;
1628 		goto error;
1629 	}
1630 	rte_atomic32_inc(&ind_tbl->refcnt);
1631 	LIST_INSERT_HEAD(&priv->ind_tbls, ind_tbl, next);
1632 	return ind_tbl;
1633 error:
1634 	rte_free(ind_tbl);
1635 	DEBUG("port %u cannot create indirection table", dev->data->port_id);
1636 	return NULL;
1637 }
1638 
1639 /**
1640  * Get an indirection table.
1641  *
1642  * @param dev
1643  *   Pointer to Ethernet device.
1644  * @param queues
1645  *   Queues entering the indirection table.
1646  * @param queues_n
1647  *   Number of queues in the array.
1648  *
1649  * @return
1650  *   An indirection table if found.
1651  */
1652 static struct mlx5_ind_table_ibv *
1653 mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, const uint16_t *queues,
1654 		       uint32_t queues_n)
1655 {
1656 	struct mlx5_priv *priv = dev->data->dev_private;
1657 	struct mlx5_ind_table_ibv *ind_tbl;
1658 
1659 	LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
1660 		if ((ind_tbl->queues_n == queues_n) &&
1661 		    (memcmp(ind_tbl->queues, queues,
1662 			    ind_tbl->queues_n * sizeof(ind_tbl->queues[0]))
1663 		     == 0))
1664 			break;
1665 	}
1666 	if (ind_tbl) {
1667 		unsigned int i;
1668 
1669 		rte_atomic32_inc(&ind_tbl->refcnt);
1670 		for (i = 0; i != ind_tbl->queues_n; ++i)
1671 			mlx5_rxq_get(dev, ind_tbl->queues[i]);
1672 	}
1673 	return ind_tbl;
1674 }
1675 
1676 /**
1677  * Release an indirection table.
1678  *
1679  * @param dev
1680  *   Pointer to Ethernet device.
1681  * @param ind_table
1682  *   Indirection table to release.
1683  *
1684  * @return
1685  *   1 while a reference on it exists, 0 when freed.
1686  */
1687 static int
1688 mlx5_ind_table_ibv_release(struct rte_eth_dev *dev,
1689 			   struct mlx5_ind_table_ibv *ind_tbl)
1690 {
1691 	unsigned int i;
1692 
1693 	if (rte_atomic32_dec_and_test(&ind_tbl->refcnt))
1694 		claim_zero(mlx5_glue->destroy_rwq_ind_table
1695 			   (ind_tbl->ind_table));
1696 	for (i = 0; i != ind_tbl->queues_n; ++i)
1697 		claim_nonzero(mlx5_rxq_release(dev, ind_tbl->queues[i]));
1698 	if (!rte_atomic32_read(&ind_tbl->refcnt)) {
1699 		LIST_REMOVE(ind_tbl, next);
1700 		rte_free(ind_tbl);
1701 		return 0;
1702 	}
1703 	return 1;
1704 }
1705 
1706 /**
1707  * Verify the indirection table list is empty.
1708  *
1709  * @param dev
1710  *   Pointer to Ethernet device.
1711  *
1712  * @return
1713  *   The number of objects not released.
1714  */
1715 int
1716 mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev)
1717 {
1718 	struct mlx5_priv *priv = dev->data->dev_private;
1719 	struct mlx5_ind_table_ibv *ind_tbl;
1720 	int ret = 0;
1721 
1722 	LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
1723 		DRV_LOG(DEBUG,
1724 			"port %u Verbs indirection table %p still referenced",
1725 			dev->data->port_id, (void *)ind_tbl);
1726 		++ret;
1727 	}
1728 	return ret;
1729 }
1730 
1731 /**
1732  * Create an Rx Hash queue.
1733  *
1734  * @param dev
1735  *   Pointer to Ethernet device.
1736  * @param rss_key
1737  *   RSS key for the Rx hash queue.
1738  * @param rss_key_len
1739  *   RSS key length.
1740  * @param hash_fields
1741  *   Verbs protocol hash field to make the RSS on.
1742  * @param queues
1743  *   Queues entering the hash queue. In case of empty hash_fields only the
1744  *   first queue index will be taken for the indirection table.
1745  * @param queues_n
1746  *   Number of queues.
1747  * @param tunnel
1748  *   Tunnel type.
1749  *
1750  * @return
1751  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
1752  */
1753 struct mlx5_hrxq *
1754 mlx5_hrxq_new(struct rte_eth_dev *dev,
1755 	      const uint8_t *rss_key, uint32_t rss_key_len,
1756 	      uint64_t hash_fields,
1757 	      const uint16_t *queues, uint32_t queues_n,
1758 	      int tunnel __rte_unused)
1759 {
1760 	struct mlx5_priv *priv = dev->data->dev_private;
1761 	struct mlx5_hrxq *hrxq;
1762 	struct mlx5_ind_table_ibv *ind_tbl;
1763 	struct ibv_qp *qp;
1764 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
1765 	struct mlx5dv_qp_init_attr qp_init_attr;
1766 #endif
1767 	int err;
1768 
1769 	queues_n = hash_fields ? queues_n : 1;
1770 	ind_tbl = mlx5_ind_table_ibv_get(dev, queues, queues_n);
1771 	if (!ind_tbl)
1772 		ind_tbl = mlx5_ind_table_ibv_new(dev, queues, queues_n);
1773 	if (!ind_tbl) {
1774 		rte_errno = ENOMEM;
1775 		return NULL;
1776 	}
1777 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
1778 	memset(&qp_init_attr, 0, sizeof(qp_init_attr));
1779 	if (tunnel) {
1780 		qp_init_attr.comp_mask =
1781 				MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
1782 		qp_init_attr.create_flags = MLX5DV_QP_CREATE_TUNNEL_OFFLOADS;
1783 	}
1784 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
1785 	if (dev->data->dev_conf.lpbk_mode) {
1786 		/* Allow packet sent from NIC loop back w/o source MAC check. */
1787 		qp_init_attr.comp_mask |=
1788 				MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
1789 		qp_init_attr.create_flags |=
1790 				MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC;
1791 	}
1792 #endif
1793 	qp = mlx5_glue->dv_create_qp
1794 		(priv->sh->ctx,
1795 		 &(struct ibv_qp_init_attr_ex){
1796 			.qp_type = IBV_QPT_RAW_PACKET,
1797 			.comp_mask =
1798 				IBV_QP_INIT_ATTR_PD |
1799 				IBV_QP_INIT_ATTR_IND_TABLE |
1800 				IBV_QP_INIT_ATTR_RX_HASH,
1801 			.rx_hash_conf = (struct ibv_rx_hash_conf){
1802 				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
1803 				.rx_hash_key_len = rss_key_len,
1804 				.rx_hash_key = (void *)(uintptr_t)rss_key,
1805 				.rx_hash_fields_mask = hash_fields,
1806 			},
1807 			.rwq_ind_tbl = ind_tbl->ind_table,
1808 			.pd = priv->sh->pd,
1809 		 },
1810 		 &qp_init_attr);
1811 #else
1812 	qp = mlx5_glue->create_qp_ex
1813 		(priv->sh->ctx,
1814 		 &(struct ibv_qp_init_attr_ex){
1815 			.qp_type = IBV_QPT_RAW_PACKET,
1816 			.comp_mask =
1817 				IBV_QP_INIT_ATTR_PD |
1818 				IBV_QP_INIT_ATTR_IND_TABLE |
1819 				IBV_QP_INIT_ATTR_RX_HASH,
1820 			.rx_hash_conf = (struct ibv_rx_hash_conf){
1821 				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
1822 				.rx_hash_key_len = rss_key_len,
1823 				.rx_hash_key = (void *)(uintptr_t)rss_key,
1824 				.rx_hash_fields_mask = hash_fields,
1825 			},
1826 			.rwq_ind_tbl = ind_tbl->ind_table,
1827 			.pd = priv->sh->pd,
1828 		 });
1829 #endif
1830 	if (!qp) {
1831 		rte_errno = errno;
1832 		goto error;
1833 	}
1834 	hrxq = rte_calloc(__func__, 1, sizeof(*hrxq) + rss_key_len, 0);
1835 	if (!hrxq) {
1836 		rte_errno = ENOMEM;
		goto error;
	}
1837 	hrxq->ind_table = ind_tbl;
1838 	hrxq->qp = qp;
1839 	hrxq->rss_key_len = rss_key_len;
1840 	hrxq->hash_fields = hash_fields;
1841 	memcpy(hrxq->rss_key, rss_key, rss_key_len);
1842 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
1843 	hrxq->action = mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp);
1844 	if (!hrxq->action) {
1845 		rte_errno = errno;
1846 		goto error;
1847 	}
1848 #endif
1849 	rte_atomic32_inc(&hrxq->refcnt);
1850 	LIST_INSERT_HEAD(&priv->hrxqs, hrxq, next);
1851 	return hrxq;
1852 error:
1853 	err = rte_errno; /* Save rte_errno before cleanup. */
1854 	mlx5_ind_table_ibv_release(dev, ind_tbl);
1855 	if (qp)
1856 		claim_zero(mlx5_glue->destroy_qp(qp));
1857 	rte_errno = err; /* Restore rte_errno. */
1858 	return NULL;
1859 }
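
/*
 * Illustrative sketch, not part of the original driver: one plausible way a
 * caller could build an IPv4/TCP hash Rx queue over a set of queues, reusing
 * the default Toeplitz key defined at the top of this file.  The guard macro
 * and function name are hypothetical and shown for documentation only.
 */
#ifdef MLX5_RXQ_USAGE_SKETCH
static struct mlx5_hrxq *
example_hrxq_ipv4_tcp(struct rte_eth_dev *dev,
		      const uint16_t *queues, uint32_t queues_n)
{
	/* Hash on the IPv4 addresses and TCP ports. */
	const uint64_t hash_fields = IBV_RX_HASH_SRC_IPV4 |
				     IBV_RX_HASH_DST_IPV4 |
				     IBV_RX_HASH_SRC_PORT_TCP |
				     IBV_RX_HASH_DST_PORT_TCP;

	/* Last argument: no tunnel offload requested. */
	return mlx5_hrxq_new(dev, rss_hash_default_key, MLX5_RSS_HASH_KEY_LEN,
			     hash_fields, queues, queues_n, 0);
}
#endif /* MLX5_RXQ_USAGE_SKETCH */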
1860 
1861 /**
1862  * Get an Rx Hash queue.
1863  *
1864  * @param dev
1865  *   Pointer to Ethernet device.
1866  * @param rss_key
1867  *   RSS key (rss_key_len bytes) of the Rx hash queue to look up.
1868  * @param queues
1869  *   Queues entering in the hash Rx queue. If hash_fields is zero, only the
1870  *   first queue index is used for the indirection table.
1871  * @param queues_n
1872  *   Number of queues.
1873  *
1874  * @return
1875  *   An hash Rx queue on success.
1876  */
1877 struct mlx5_hrxq *
1878 mlx5_hrxq_get(struct rte_eth_dev *dev,
1879 	      const uint8_t *rss_key, uint32_t rss_key_len,
1880 	      uint64_t hash_fields,
1881 	      const uint16_t *queues, uint32_t queues_n)
1882 {
1883 	struct mlx5_priv *priv = dev->data->dev_private;
1884 	struct mlx5_hrxq *hrxq;
1885 
1886 	queues_n = hash_fields ? queues_n : 1;
1887 	LIST_FOREACH(hrxq, &priv->hrxqs, next) {
1888 		struct mlx5_ind_table_ibv *ind_tbl;
1889 
1890 		if (hrxq->rss_key_len != rss_key_len)
1891 			continue;
1892 		if (memcmp(hrxq->rss_key, rss_key, rss_key_len))
1893 			continue;
1894 		if (hrxq->hash_fields != hash_fields)
1895 			continue;
1896 		ind_tbl = mlx5_ind_table_ibv_get(dev, queues, queues_n);
1897 		if (!ind_tbl)
1898 			continue;
1899 		if (ind_tbl != hrxq->ind_table) {
1900 			mlx5_ind_table_ibv_release(dev, ind_tbl);
1901 			continue;
1902 		}
1903 		rte_atomic32_inc(&hrxq->refcnt);
1904 		return hrxq;
1905 	}
1906 	return NULL;
1907 }
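
/*
 * Illustrative sketch, not part of the original driver: a caller (e.g. the
 * flow engine) would normally try mlx5_hrxq_get() first to share an existing
 * hash Rx queue and fall back to mlx5_hrxq_new() only on a miss.  The guard
 * macro and function name are hypothetical.
 */
#ifdef MLX5_RXQ_USAGE_SKETCH
static struct mlx5_hrxq *
example_hrxq_get_or_new(struct rte_eth_dev *dev,
			const uint8_t *rss_key, uint32_t rss_key_len,
			uint64_t hash_fields,
			const uint16_t *queues, uint32_t queues_n,
			int tunnel)
{
	struct mlx5_hrxq *hrxq;

	/* Reuse a queue whose key, hash fields and queue list all match. */
	hrxq = mlx5_hrxq_get(dev, rss_key, rss_key_len, hash_fields,
			     queues, queues_n);
	if (!hrxq)
		hrxq = mlx5_hrxq_new(dev, rss_key, rss_key_len, hash_fields,
				     queues, queues_n, tunnel);
	return hrxq;
}
#endif /* MLX5_RXQ_USAGE_SKETCH */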
1908 
1909 /**
1910  * Release the hash Rx queue.
1911  *
1912  * @param dev
1913  *   Pointer to Ethernet device.
1914  * @param hrxq
1915  *   Pointer to Hash Rx queue to release.
1916  *
1917  * @return
1918  *   1 while a reference on it exists, 0 when freed.
1919  */
1920 int
1921 mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hrxq)
1922 {
1923 	if (rte_atomic32_dec_and_test(&hrxq->refcnt)) {
1924 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
1925 		mlx5_glue->destroy_flow_action(hrxq->action);
1926 #endif
1927 		claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
1928 		mlx5_ind_table_ibv_release(dev, hrxq->ind_table);
1929 		LIST_REMOVE(hrxq, next);
1930 		rte_free(hrxq);
1931 		return 0;
1932 	}
1933 	claim_nonzero(mlx5_ind_table_ibv_release(dev, hrxq->ind_table));
1934 	return 1;
1935 }
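
/*
 * Illustrative sketch, not part of the original driver: every successful
 * mlx5_hrxq_get()/mlx5_hrxq_new() must eventually be balanced by one
 * mlx5_hrxq_release(); the Verbs objects are destroyed only when the last
 * reference is dropped.  The guard macro and function name are hypothetical.
 */
#ifdef MLX5_RXQ_USAGE_SKETCH
static void
example_hrxq_put(struct rte_eth_dev *dev, struct mlx5_hrxq *hrxq)
{
	/* mlx5_hrxq_release() returns 1 while references remain, 0 once freed. */
	if (mlx5_hrxq_release(dev, hrxq) == 0)
		DRV_LOG(DEBUG, "port %u hash Rx queue fully released",
			dev->data->port_id);
}
#endif /* MLX5_RXQ_USAGE_SKETCH */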
1936 
1937 /**
1938  * Verify the Verbs hash Rx queue list is empty.
1939  *
1940  * @param dev
1941  *   Pointer to Ethernet device.
1942  *
1943  * @return
1944  *   The number of objects not released.
1945  */
1946 int
1947 mlx5_hrxq_ibv_verify(struct rte_eth_dev *dev)
1948 {
1949 	struct mlx5_priv *priv = dev->data->dev_private;
1950 	struct mlx5_hrxq *hrxq;
1951 	int ret = 0;
1952 
1953 	LIST_FOREACH(hrxq, &priv->hrxqs, next) {
1954 		DRV_LOG(DEBUG,
1955 			"port %u Verbs hash Rx queue %p still referenced",
1956 			dev->data->port_id, (void *)hrxq);
1957 		++ret;
1958 	}
1959 	return ret;
1960 }
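
/*
 * Illustrative sketch, not part of the original driver: a teardown path such
 * as a device close handler could combine mlx5_hrxq_ibv_verify() and
 * mlx5_ind_table_ibv_verify() to report leaked Rx objects.  The guard macro
 * and function name are hypothetical.
 */
#ifdef MLX5_RXQ_USAGE_SKETCH
static void
example_rx_leak_check(struct rte_eth_dev *dev)
{
	int leaks;

	leaks = mlx5_hrxq_ibv_verify(dev) + mlx5_ind_table_ibv_verify(dev);
	if (leaks)
		DRV_LOG(WARNING,
			"port %u %d Rx object(s) still referenced",
			dev->data->port_id, leaks);
}
#endif /* MLX5_RXQ_USAGE_SKETCH */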
1961 
1962 /**
1963  * Create a drop Rx queue Verbs object.
1964  *
1965  * @param dev
1966  *   Pointer to Ethernet device.
1967  *
1968  * @return
1969  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
1970  */
1971 static struct mlx5_rxq_ibv *
1972 mlx5_rxq_ibv_drop_new(struct rte_eth_dev *dev)
1973 {
1974 	struct mlx5_priv *priv = dev->data->dev_private;
1975 	struct ibv_context *ctx = priv->sh->ctx;
1976 	struct ibv_cq *cq;
1977 	struct ibv_wq *wq = NULL;
1978 	struct mlx5_rxq_ibv *rxq;
1979 
1980 	if (priv->drop_queue.rxq)
1981 		return priv->drop_queue.rxq;
1982 	cq = mlx5_glue->create_cq(ctx, 1, NULL, NULL, 0);
1983 	if (!cq) {
1984 		DEBUG("port %u cannot allocate CQ for drop queue",
1985 		      dev->data->port_id);
1986 		rte_errno = errno;
1987 		goto error;
1988 	}
1989 	wq = mlx5_glue->create_wq(ctx,
1990 		 &(struct ibv_wq_init_attr){
1991 			.wq_type = IBV_WQT_RQ,
1992 			.max_wr = 1,
1993 			.max_sge = 1,
1994 			.pd = priv->sh->pd,
1995 			.cq = cq,
1996 		 });
1997 	if (!wq) {
1998 		DEBUG("port %u cannot allocate WQ for drop queue",
1999 		      dev->data->port_id);
2000 		rte_errno = errno;
2001 		goto error;
2002 	}
2003 	rxq = rte_calloc(__func__, 1, sizeof(*rxq), 0);
2004 	if (!rxq) {
2005 		DEBUG("port %u cannot allocate drop Rx queue memory",
2006 		      dev->data->port_id);
2007 		rte_errno = ENOMEM;
2008 		goto error;
2009 	}
2010 	rxq->cq = cq;
2011 	rxq->wq = wq;
2012 	priv->drop_queue.rxq = rxq;
2013 	return rxq;
2014 error:
2015 	if (wq)
2016 		claim_zero(mlx5_glue->destroy_wq(wq));
2017 	if (cq)
2018 		claim_zero(mlx5_glue->destroy_cq(cq));
2019 	return NULL;
2020 }
2021 
2022 /**
2023  * Release a drop Rx queue Verbs object.
2024  *
2025  * @param dev
2026  *   Pointer to Ethernet device.
2027  *
2030  */
2031 static void
2032 mlx5_rxq_ibv_drop_release(struct rte_eth_dev *dev)
2033 {
2034 	struct mlx5_priv *priv = dev->data->dev_private;
2035 	struct mlx5_rxq_ibv *rxq = priv->drop_queue.rxq;
2036 
2037 	if (rxq->wq)
2038 		claim_zero(mlx5_glue->destroy_wq(rxq->wq));
2039 	if (rxq->cq)
2040 		claim_zero(mlx5_glue->destroy_cq(rxq->cq));
2041 	rte_free(rxq);
2042 	priv->drop_queue.rxq = NULL;
2043 }
2044 
2045 /**
2046  * Create a drop indirection table.
2047  *
2048  * @param dev
2049  *   Pointer to Ethernet device.
2050  *
2051  * @return
2052  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
2053  */
2054 static struct mlx5_ind_table_ibv *
2055 mlx5_ind_table_ibv_drop_new(struct rte_eth_dev *dev)
2056 {
2057 	struct mlx5_priv *priv = dev->data->dev_private;
2058 	struct mlx5_ind_table_ibv *ind_tbl;
2059 	struct mlx5_rxq_ibv *rxq;
2060 	struct mlx5_ind_table_ibv tmpl;
2061 
2062 	rxq = mlx5_rxq_ibv_drop_new(dev);
2063 	if (!rxq)
2064 		return NULL;
2065 	tmpl.ind_table = mlx5_glue->create_rwq_ind_table
2066 		(priv->sh->ctx,
2067 		 &(struct ibv_rwq_ind_table_init_attr){
2068 			.log_ind_tbl_size = 0,
2069 			.ind_tbl = &rxq->wq,
2070 			.comp_mask = 0,
2071 		 });
2072 	if (!tmpl.ind_table) {
2073 		DEBUG("port %u cannot allocate indirection table for drop"
2074 		      " queue",
2075 		      dev->data->port_id);
2076 		rte_errno = errno;
2077 		goto error;
2078 	}
2079 	ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl), 0);
2080 	if (!ind_tbl) {
2081 		rte_errno = ENOMEM;
2082 		goto error;
2083 	}
2084 	ind_tbl->ind_table = tmpl.ind_table;
2085 	return ind_tbl;
2086 error:
2087 	mlx5_rxq_ibv_drop_release(dev);
2088 	return NULL;
2089 }
2090 
2091 /**
2092  * Release a drop indirection table.
2093  *
2094  * @param dev
2095  *   Pointer to Ethernet device.
2096  */
2097 static void
2098 mlx5_ind_table_ibv_drop_release(struct rte_eth_dev *dev)
2099 {
2100 	struct mlx5_priv *priv = dev->data->dev_private;
2101 	struct mlx5_ind_table_ibv *ind_tbl = priv->drop_queue.hrxq->ind_table;
2102 
2103 	claim_zero(mlx5_glue->destroy_rwq_ind_table(ind_tbl->ind_table));
2104 	mlx5_rxq_ibv_drop_release(dev);
2105 	rte_free(ind_tbl);
2106 	priv->drop_queue.hrxq->ind_table = NULL;
2107 }
2108 
2109 /**
2110  * Create a drop Rx Hash queue.
2111  *
2112  * @param dev
2113  *   Pointer to Ethernet device.
2114  *
2115  * @return
2116  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
2117  */
2118 struct mlx5_hrxq *
2119 mlx5_hrxq_drop_new(struct rte_eth_dev *dev)
2120 {
2121 	struct mlx5_priv *priv = dev->data->dev_private;
2122 	struct mlx5_ind_table_ibv *ind_tbl;
2123 	struct ibv_qp *qp;
2124 	struct mlx5_hrxq *hrxq;
2125 
2126 	if (priv->drop_queue.hrxq) {
2127 		rte_atomic32_inc(&priv->drop_queue.hrxq->refcnt);
2128 		return priv->drop_queue.hrxq;
2129 	}
2130 	ind_tbl = mlx5_ind_table_ibv_drop_new(dev);
2131 	if (!ind_tbl)
2132 		return NULL;
2133 	qp = mlx5_glue->create_qp_ex(priv->sh->ctx,
2134 		 &(struct ibv_qp_init_attr_ex){
2135 			.qp_type = IBV_QPT_RAW_PACKET,
2136 			.comp_mask =
2137 				IBV_QP_INIT_ATTR_PD |
2138 				IBV_QP_INIT_ATTR_IND_TABLE |
2139 				IBV_QP_INIT_ATTR_RX_HASH,
2140 			.rx_hash_conf = (struct ibv_rx_hash_conf){
2141 				.rx_hash_function =
2142 					IBV_RX_HASH_FUNC_TOEPLITZ,
2143 				.rx_hash_key_len = MLX5_RSS_HASH_KEY_LEN,
2144 				.rx_hash_key = rss_hash_default_key,
2145 				.rx_hash_fields_mask = 0,
2146 			},
2147 			.rwq_ind_tbl = ind_tbl->ind_table,
2148 			.pd = priv->sh->pd
2149 		 });
2150 	if (!qp) {
2151 		DEBUG("port %u cannot allocate QP for drop queue",
2152 		      dev->data->port_id);
2153 		rte_errno = errno;
2154 		goto error;
2155 	}
2156 	hrxq = rte_calloc(__func__, 1, sizeof(*hrxq), 0);
2157 	if (!hrxq) {
2158 		DRV_LOG(WARNING,
2159 			"port %u cannot allocate memory for drop queue",
2160 			dev->data->port_id);
2161 		rte_errno = ENOMEM;
2162 		goto error;
2163 	}
2164 	hrxq->ind_table = ind_tbl;
2165 	hrxq->qp = qp;
2166 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
2167 	hrxq->action = mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp);
2168 	if (!hrxq->action) {
2169 		rte_errno = errno;
2170 		goto error;
2171 	}
2172 #endif
2173 	priv->drop_queue.hrxq = hrxq;
2174 	rte_atomic32_set(&hrxq->refcnt, 1);
2175 	return hrxq;
2176 error:
2177 	if (ind_tbl)
2178 		mlx5_ind_table_ibv_drop_release(dev);
2179 	return NULL;
2180 }
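
/*
 * Illustrative sketch, not part of the original driver: the drop hash Rx
 * queue is a shared, reference-counted singleton, so a caller acquires it for
 * the lifetime of a drop flow rule and releases it afterwards.  The guard
 * macro and function name are hypothetical.
 */
#ifdef MLX5_RXQ_USAGE_SKETCH
static int
example_with_drop_hrxq(struct rte_eth_dev *dev)
{
	struct mlx5_hrxq *hrxq = mlx5_hrxq_drop_new(dev);

	if (!hrxq)
		return -rte_errno;
	/* ... point a flow rule at hrxq->qp (or hrxq->action with DV) ... */
	mlx5_hrxq_drop_release(dev);
	return 0;
}
#endif /* MLX5_RXQ_USAGE_SKETCH */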
2181 
2182 /**
2183  * Release a drop hash Rx queue.
2184  *
2185  * @param dev
2186  *   Pointer to Ethernet device.
2187  */
2188 void
2189 mlx5_hrxq_drop_release(struct rte_eth_dev *dev)
2190 {
2191 	struct mlx5_priv *priv = dev->data->dev_private;
2192 	struct mlx5_hrxq *hrxq = priv->drop_queue.hrxq;
2193 
2194 	if (rte_atomic32_dec_and_test(&hrxq->refcnt)) {
2195 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
2196 		mlx5_glue->destroy_flow_action(hrxq->action);
2197 #endif
2198 		claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
2199 		mlx5_ind_table_ibv_drop_release(dev);
2200 		rte_free(hrxq);
2201 		priv->drop_queue.hrxq = NULL;
2202 	}
2203 }
2204