xref: /dpdk/drivers/net/mlx5/mlx5_rxq.c (revision 089e5ed727a15da2729cfee9b63533dd120bd04c)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <stddef.h>
7 #include <assert.h>
8 #include <errno.h>
9 #include <string.h>
10 #include <stdint.h>
11 #include <fcntl.h>
12 #include <sys/queue.h>
13 
14 /* Verbs header. */
15 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
16 #ifdef PEDANTIC
17 #pragma GCC diagnostic ignored "-Wpedantic"
18 #endif
19 #include <infiniband/verbs.h>
20 #include <infiniband/mlx5dv.h>
21 #ifdef PEDANTIC
22 #pragma GCC diagnostic error "-Wpedantic"
23 #endif
24 
25 #include <rte_mbuf.h>
26 #include <rte_malloc.h>
27 #include <rte_ethdev_driver.h>
28 #include <rte_common.h>
29 #include <rte_interrupts.h>
30 #include <rte_debug.h>
31 #include <rte_io.h>
32 
33 #include "mlx5.h"
34 #include "mlx5_rxtx.h"
35 #include "mlx5_utils.h"
36 #include "mlx5_autoconf.h"
37 #include "mlx5_defs.h"
38 #include "mlx5_glue.h"
39 
40 /* Default RSS hash key also used for ConnectX-3. */
41 uint8_t rss_hash_default_key[] = {
42 	0x2c, 0xc6, 0x81, 0xd1,
43 	0x5b, 0xdb, 0xf4, 0xf7,
44 	0xfc, 0xa2, 0x83, 0x19,
45 	0xdb, 0x1a, 0x3e, 0x94,
46 	0x6b, 0x9e, 0x38, 0xd9,
47 	0x2c, 0x9c, 0x03, 0xd1,
48 	0xad, 0x99, 0x44, 0xa7,
49 	0xd9, 0x56, 0x3d, 0x59,
50 	0x06, 0x3c, 0x25, 0xf3,
51 	0xfc, 0x1f, 0xdc, 0x2a,
52 };
53 
54 /* Length of the default RSS hash key. */
55 static_assert(MLX5_RSS_HASH_KEY_LEN ==
56 	      (unsigned int)sizeof(rss_hash_default_key),
57 	      "wrong RSS default key size.");
58 
59 /**
60  * Check whether Multi-Packet RQ can be enabled for the device.
61  *
62  * @param dev
63  *   Pointer to Ethernet device.
64  *
65  * @return
66  *   1 if supported, negative errno value if not.
67  */
68 inline int
69 mlx5_check_mprq_support(struct rte_eth_dev *dev)
70 {
71 	struct mlx5_priv *priv = dev->data->dev_private;
72 
73 	if (priv->config.mprq.enabled &&
74 	    priv->rxqs_n >= priv->config.mprq.min_rxqs_num)
75 		return 1;
76 	return -ENOTSUP;
77 }
78 
79 /**
80  * Check whether Multi-Packet RQ is enabled for the Rx queue.
81  *
82  * @param rxq
83  *   Pointer to receive queue structure.
84  *
85  * @return
86  *   0 if disabled, otherwise enabled.
87  */
88 inline int
89 mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq)
90 {
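	/* A non-zero log number of strides per WQE means the queue uses MPRQ. */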
91 	return rxq->strd_num_n > 0;
92 }
93 
94 /**
95  * Check whether Multi-Packet RQ is enabled for the device.
96  *
97  * @param dev
98  *   Pointer to Ethernet device.
99  *
100  * @return
101  *   0 if disabled, otherwise enabled.
102  */
103 inline int
104 mlx5_mprq_enabled(struct rte_eth_dev *dev)
105 {
106 	struct mlx5_priv *priv = dev->data->dev_private;
107 	uint16_t i;
108 	uint16_t n = 0;
109 
110 	if (mlx5_check_mprq_support(dev) < 0)
111 		return 0;
112 	/* All the configured queues should be enabled. */
113 	for (i = 0; i < priv->rxqs_n; ++i) {
114 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
115 
116 		if (!rxq)
117 			continue;
118 		if (mlx5_rxq_mprq_enabled(rxq))
119 			++n;
120 	}
121 	/* Multi-Packet RQ can't be partially configured. */
122 	assert(n == 0 || n == priv->rxqs_n);
123 	return n == priv->rxqs_n;
124 }
125 
126 /**
127  * Allocate RX queue elements for Multi-Packet RQ.
128  *
129  * @param rxq_ctrl
130  *   Pointer to RX queue structure.
131  *
132  * @return
133  *   0 on success, a negative errno value otherwise and rte_errno is set.
134  */
135 static int
136 rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
137 {
138 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
139 	unsigned int wqe_n = 1 << rxq->elts_n;
140 	unsigned int i;
141 	int err;
142 
143 	/* Allocate one buffer per WQE, plus one spare kept for replenishment. */
144 	for (i = 0; i <= wqe_n; ++i) {
145 		struct mlx5_mprq_buf *buf;
146 
147 		if (rte_mempool_get(rxq->mprq_mp, (void **)&buf) < 0) {
148 			DRV_LOG(ERR, "port %u empty mbuf pool", rxq->port_id);
149 			rte_errno = ENOMEM;
150 			goto error;
151 		}
152 		if (i < wqe_n)
153 			(*rxq->mprq_bufs)[i] = buf;
154 		else
155 			rxq->mprq_repl = buf;
156 	}
157 	DRV_LOG(DEBUG,
158 		"port %u Rx queue %u allocated and configured %u segments",
159 		rxq->port_id, rxq->idx, wqe_n);
160 	return 0;
161 error:
162 	err = rte_errno; /* Save rte_errno before cleanup. */
163 	wqe_n = i;
164 	for (i = 0; (i != wqe_n); ++i) {
165 		if ((*rxq->mprq_bufs)[i] != NULL)
166 			rte_mempool_put(rxq->mprq_mp,
167 					(*rxq->mprq_bufs)[i]);
168 		(*rxq->mprq_bufs)[i] = NULL;
169 	}
170 	DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
171 		rxq->port_id, rxq->idx);
172 	rte_errno = err; /* Restore rte_errno. */
173 	return -rte_errno;
174 }
175 
176 /**
177  * Allocate RX queue elements for Single-Packet RQ.
178  *
179  * @param rxq_ctrl
180  *   Pointer to RX queue structure.
181  *
182  * @return
183  *   0 on success, a negative errno value otherwise and rte_errno is set.
184  */
185 static int
186 rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
187 {
188 	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
189 	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
190 	unsigned int i;
191 	int err;
192 
193 	/* Iterate on segments. */
194 	for (i = 0; (i != elts_n); ++i) {
195 		struct rte_mbuf *buf;
196 
197 		buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
198 		if (buf == NULL) {
199 			DRV_LOG(ERR, "port %u empty mbuf pool",
200 				PORT_ID(rxq_ctrl->priv));
201 			rte_errno = ENOMEM;
202 			goto error;
203 		}
204 		/* Headroom is reserved by rte_pktmbuf_alloc(). */
205 		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
206 		/* Buffer is supposed to be empty. */
207 		assert(rte_pktmbuf_data_len(buf) == 0);
208 		assert(rte_pktmbuf_pkt_len(buf) == 0);
209 		assert(!buf->next);
210 		/* Only the first segment keeps headroom. */
211 		if (i % sges_n)
212 			SET_DATA_OFF(buf, 0);
213 		PORT(buf) = rxq_ctrl->rxq.port_id;
214 		DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
215 		PKT_LEN(buf) = DATA_LEN(buf);
216 		NB_SEGS(buf) = 1;
217 		(*rxq_ctrl->rxq.elts)[i] = buf;
218 	}
219 	/* Extra initialization if the vectorized Rx path is enabled. */
220 	if (mlx5_rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
221 		struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
222 		struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
223 		int j;
224 
225 		/* Initialize default rearm_data for vPMD. */
226 		mbuf_init->data_off = RTE_PKTMBUF_HEADROOM;
227 		rte_mbuf_refcnt_set(mbuf_init, 1);
228 		mbuf_init->nb_segs = 1;
229 		mbuf_init->port = rxq->port_id;
230 		/*
231 		 * prevent compiler reordering:
232 		 * rearm_data covers previous fields.
233 		 */
234 		rte_compiler_barrier();
235 		rxq->mbuf_initializer =
236 			*(uint64_t *)&mbuf_init->rearm_data;
237 		/* Padding with a fake mbuf for vectorized Rx. */
238 		for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
239 			(*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
240 	}
241 	DRV_LOG(DEBUG,
242 		"port %u Rx queue %u allocated and configured %u segments"
243 		" (max %u packets)",
244 		PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx, elts_n,
245 		elts_n / (1 << rxq_ctrl->rxq.sges_n));
246 	return 0;
247 error:
248 	err = rte_errno; /* Save rte_errno before cleanup. */
249 	elts_n = i;
250 	for (i = 0; (i != elts_n); ++i) {
251 		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
252 			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
253 		(*rxq_ctrl->rxq.elts)[i] = NULL;
254 	}
255 	DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
256 		PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx);
257 	rte_errno = err; /* Restore rte_errno. */
258 	return -rte_errno;
259 }
260 
261 /**
262  * Allocate RX queue elements.
263  *
264  * @param rxq_ctrl
265  *   Pointer to RX queue structure.
266  *
267  * @return
268  *   0 on success, a negative errno value otherwise and rte_errno is set.
269  */
270 int
271 rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
272 {
273 	return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
274 	       rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl);
275 }
276 
277 /**
278  * Free RX queue elements for Multi-Packet RQ.
279  *
280  * @param rxq_ctrl
281  *   Pointer to RX queue structure.
282  */
283 static void
284 rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
285 {
286 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
287 	uint16_t i;
288 
289 	DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing WRs",
290 		rxq->port_id, rxq->idx);
291 	if (rxq->mprq_bufs == NULL)
292 		return;
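	/* Vectorized Rx is not used with MPRQ, as the assertion below checks. */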
293 	assert(mlx5_rxq_check_vec_support(rxq) < 0);
294 	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
295 		if ((*rxq->mprq_bufs)[i] != NULL)
296 			mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]);
297 		(*rxq->mprq_bufs)[i] = NULL;
298 	}
299 	if (rxq->mprq_repl != NULL) {
300 		mlx5_mprq_buf_free(rxq->mprq_repl);
301 		rxq->mprq_repl = NULL;
302 	}
303 }
304 
305 /**
306  * Free RX queue elements for Single-Packet RQ.
307  *
308  * @param rxq_ctrl
309  *   Pointer to RX queue structure.
310  */
311 static void
312 rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
313 {
314 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
315 	const uint16_t q_n = (1 << rxq->elts_n);
316 	const uint16_t q_mask = q_n - 1;
317 	uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
318 	uint16_t i;
319 
320 	DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs",
321 		PORT_ID(rxq_ctrl->priv), rxq->idx);
322 	if (rxq->elts == NULL)
323 		return;
324 	/**
325 	 * Some mbufs in the ring belong to the application; they cannot be
326 	 * freed.
327 	 */
328 	if (mlx5_rxq_check_vec_support(rxq) > 0) {
329 		for (i = 0; i < used; ++i)
330 			(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
331 		rxq->rq_pi = rxq->rq_ci;
332 	}
333 	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
334 		if ((*rxq->elts)[i] != NULL)
335 			rte_pktmbuf_free_seg((*rxq->elts)[i]);
336 		(*rxq->elts)[i] = NULL;
337 	}
338 }
339 
340 /**
341  * Free RX queue elements.
342  *
343  * @param rxq_ctrl
344  *   Pointer to RX queue structure.
345  */
346 static void
347 rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
348 {
349 	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
350 		rxq_free_elts_mprq(rxq_ctrl);
351 	else
352 		rxq_free_elts_sprq(rxq_ctrl);
353 }
354 
355 /**
356  * Returns the per-queue supported offloads.
357  *
358  * @param dev
359  *   Pointer to Ethernet device.
360  *
361  * @return
362  *   Supported Rx offloads.
363  */
364 uint64_t
365 mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev)
366 {
367 	struct mlx5_priv *priv = dev->data->dev_private;
368 	struct mlx5_dev_config *config = &priv->config;
369 	uint64_t offloads = (DEV_RX_OFFLOAD_SCATTER |
370 			     DEV_RX_OFFLOAD_TIMESTAMP |
371 			     DEV_RX_OFFLOAD_JUMBO_FRAME);
372 
373 	if (config->hw_fcs_strip)
374 		offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
375 
376 	if (config->hw_csum)
377 		offloads |= (DEV_RX_OFFLOAD_IPV4_CKSUM |
378 			     DEV_RX_OFFLOAD_UDP_CKSUM |
379 			     DEV_RX_OFFLOAD_TCP_CKSUM);
380 	if (config->hw_vlan_strip)
381 		offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
382 	if (MLX5_LRO_SUPPORTED(dev))
383 		offloads |= DEV_RX_OFFLOAD_TCP_LRO;
384 	return offloads;
385 }
386 
387 
388 /**
389  * Returns the per-port supported offloads.
390  *
391  * @return
392  *   Supported Rx offloads.
393  */
394 uint64_t
395 mlx5_get_rx_port_offloads(void)
396 {
397 	uint64_t offloads = DEV_RX_OFFLOAD_VLAN_FILTER;
398 
399 	return offloads;
400 }
401 
402 /**
403  * Verify if the queue can be released.
404  *
405  * @param dev
406  *   Pointer to Ethernet device.
407  * @param idx
408  *   RX queue index.
409  *
410  * @return
411  *   1 if the queue can be released,
412  *   0 if the queue cannot be released because references to it still exist,
413  *   a negative errno value and rte_errno is set if the queue doesn't exist.
414  */
415 static int
416 mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx)
417 {
418 	struct mlx5_priv *priv = dev->data->dev_private;
419 	struct mlx5_rxq_ctrl *rxq_ctrl;
420 
421 	if (!(*priv->rxqs)[idx]) {
422 		rte_errno = EINVAL;
423 		return -rte_errno;
424 	}
425 	rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
426 	return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1);
427 }
428 
429 /**
430  * DPDK callback to configure a Rx queue.
431  * @param dev
432  *   Pointer to Ethernet device structure.
433  * @param idx
434  *   RX queue index.
435  * @param desc
436  *   Number of descriptors to configure in queue.
437  * @param socket
438  *   NUMA socket on which memory must be allocated.
439  * @param[in] conf
440  *   Thresholds parameters.
441  * @param mp
442  *   Memory pool for buffer allocations.
443  *
444  * @return
445  *   0 on success, a negative errno value otherwise and rte_errno is set.
446  */
447 int
448 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
449 		    unsigned int socket, const struct rte_eth_rxconf *conf,
450 		    struct rte_mempool *mp)
451 {
452 	struct mlx5_priv *priv = dev->data->dev_private;
453 	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
454 	struct mlx5_rxq_ctrl *rxq_ctrl =
455 		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
456 
457 	if (!rte_is_power_of_2(desc)) {
458 		desc = 1 << log2above(desc);
459 		DRV_LOG(WARNING,
460 			"port %u increased number of descriptors in Rx queue %u"
461 			" to the next power of two (%d)",
462 			dev->data->port_id, idx, desc);
463 	}
464 	DRV_LOG(DEBUG, "port %u configuring Rx queue %u for %u descriptors",
465 		dev->data->port_id, idx, desc);
466 	if (idx >= priv->rxqs_n) {
467 		DRV_LOG(ERR, "port %u Rx queue index out of range (%u >= %u)",
468 			dev->data->port_id, idx, priv->rxqs_n);
469 		rte_errno = EOVERFLOW;
470 		return -rte_errno;
471 	}
472 	if (!mlx5_rxq_releasable(dev, idx)) {
473 		DRV_LOG(ERR, "port %u unable to release queue index %u",
474 			dev->data->port_id, idx);
475 		rte_errno = EBUSY;
476 		return -rte_errno;
477 	}
478 	mlx5_rxq_release(dev, idx);
479 	rxq_ctrl = mlx5_rxq_new(dev, idx, desc, socket, conf, mp);
480 	if (!rxq_ctrl) {
481 		DRV_LOG(ERR, "port %u unable to allocate queue index %u",
482 			dev->data->port_id, idx);
483 		rte_errno = ENOMEM;
484 		return -rte_errno;
485 	}
486 	DRV_LOG(DEBUG, "port %u adding Rx queue %u to list",
487 		dev->data->port_id, idx);
488 	(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
489 	return 0;
490 }
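/*
 * Illustrative sketch (not driver code): applications reach the
 * mlx5_rx_queue_setup() callback above through the generic ethdev API,
 * roughly as below. Identifiers such as "port_id" and "pool" are
 * placeholders for application-defined values.
 *
 *	struct rte_mempool *pool =
 *		rte_pktmbuf_pool_create("rx_pool", 8192, 256, 0,
 *					RTE_MBUF_DEFAULT_BUF_SIZE,
 *					rte_socket_id());
 *	int ret = rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
 *					 NULL, pool);
 *	if (ret < 0)
 *		rte_exit(EXIT_FAILURE, "Rx queue setup failed: %d\n", ret);
 */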
491 
492 /**
493  * DPDK callback to release a RX queue.
494  *
495  * @param dpdk_rxq
496  *   Generic RX queue pointer.
497  */
498 void
499 mlx5_rx_queue_release(void *dpdk_rxq)
500 {
501 	struct mlx5_rxq_data *rxq = (struct mlx5_rxq_data *)dpdk_rxq;
502 	struct mlx5_rxq_ctrl *rxq_ctrl;
503 	struct mlx5_priv *priv;
504 
505 	if (rxq == NULL)
506 		return;
507 	rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
508 	priv = rxq_ctrl->priv;
509 	if (!mlx5_rxq_releasable(ETH_DEV(priv), rxq_ctrl->rxq.idx))
510 		rte_panic("port %u Rx queue %u is still used by a flow and"
511 			  " cannot be removed\n",
512 			  PORT_ID(priv), rxq->idx);
513 	mlx5_rxq_release(ETH_DEV(priv), rxq_ctrl->rxq.idx);
514 }
515 
516 /**
517  * Get an Rx queue Verbs/DevX object.
518  *
519  * @param dev
520  *   Pointer to Ethernet device.
521  * @param idx
522  *   Queue index in DPDK Rx queue array
523  *
524  * @return
525  *   The Verbs/DevX object if it exists, NULL otherwise.
526  */
527 static struct mlx5_rxq_obj *
528 mlx5_rxq_obj_get(struct rte_eth_dev *dev, uint16_t idx)
529 {
530 	struct mlx5_priv *priv = dev->data->dev_private;
531 	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
532 	struct mlx5_rxq_ctrl *rxq_ctrl;
533 
534 	if (idx >= priv->rxqs_n)
535 		return NULL;
536 	if (!rxq_data)
537 		return NULL;
538 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
539 	if (rxq_ctrl->obj)
540 		rte_atomic32_inc(&rxq_ctrl->obj->refcnt);
541 	return rxq_ctrl->obj;
542 }
543 
544 /**
545  * Release the resources allocated for an RQ DevX object.
546  *
547  * @param rxq_ctrl
548  *   Pointer to Rx queue control structure.
549  */
550 static void
551 rxq_release_rq_resources(struct mlx5_rxq_ctrl *rxq_ctrl)
552 {
553 	if (rxq_ctrl->rxq.wqes) {
554 		rte_free((void *)(uintptr_t)rxq_ctrl->rxq.wqes);
555 		rxq_ctrl->rxq.wqes = NULL;
556 	}
557 	if (rxq_ctrl->wq_umem) {
558 		mlx5_glue->devx_umem_dereg(rxq_ctrl->wq_umem);
559 		rxq_ctrl->wq_umem = NULL;
560 	}
561 }
562 
563 /**
564  * Release an Rx verbs/DevX queue object.
565  *
566  * @param rxq_obj
567  *   Verbs/DevX Rx queue object.
568  *
569  * @return
570  *   1 while a reference on it exists, 0 when freed.
571  */
572 static int
573 mlx5_rxq_obj_release(struct mlx5_rxq_obj *rxq_obj)
574 {
575 	assert(rxq_obj);
576 	if (rxq_obj->type == MLX5_RXQ_OBJ_TYPE_IBV)
577 		assert(rxq_obj->wq);
578 	assert(rxq_obj->cq);
579 	if (rte_atomic32_dec_and_test(&rxq_obj->refcnt)) {
580 		rxq_free_elts(rxq_obj->rxq_ctrl);
581 		if (rxq_obj->type == MLX5_RXQ_OBJ_TYPE_IBV) {
582 			claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq));
583 		} else if (rxq_obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) {
584 			claim_zero(mlx5_devx_cmd_destroy(rxq_obj->rq));
585 			rxq_release_rq_resources(rxq_obj->rxq_ctrl);
586 		}
587 		claim_zero(mlx5_glue->destroy_cq(rxq_obj->cq));
588 		if (rxq_obj->channel)
589 			claim_zero(mlx5_glue->destroy_comp_channel
590 				   (rxq_obj->channel));
591 		LIST_REMOVE(rxq_obj, next);
592 		rte_free(rxq_obj);
593 		return 0;
594 	}
595 	return 1;
596 }
597 
598 /**
599  * Allocate queue vector and fill epoll fd list for Rx interrupts.
600  *
601  * @param dev
602  *   Pointer to Ethernet device.
603  *
604  * @return
605  *   0 on success, a negative errno value otherwise and rte_errno is set.
606  */
607 int
608 mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev)
609 {
610 	struct mlx5_priv *priv = dev->data->dev_private;
611 	unsigned int i;
612 	unsigned int rxqs_n = priv->rxqs_n;
613 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
614 	unsigned int count = 0;
615 	struct rte_intr_handle *intr_handle = dev->intr_handle;
616 
617 	if (!dev->data->dev_conf.intr_conf.rxq)
618 		return 0;
619 	mlx5_rx_intr_vec_disable(dev);
620 	intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0]));
621 	if (intr_handle->intr_vec == NULL) {
622 		DRV_LOG(ERR,
623 			"port %u failed to allocate memory for interrupt"
624 			" vector, Rx interrupts will not be supported",
625 			dev->data->port_id);
626 		rte_errno = ENOMEM;
627 		return -rte_errno;
628 	}
629 	intr_handle->type = RTE_INTR_HANDLE_EXT;
630 	for (i = 0; i != n; ++i) {
631 		/* This rxq obj must not be released in this function. */
632 		struct mlx5_rxq_obj *rxq_obj = mlx5_rxq_obj_get(dev, i);
633 		int fd;
634 		int flags;
635 		int rc;
636 
637 		/* Skip queues that cannot request interrupts. */
638 		if (!rxq_obj || !rxq_obj->channel) {
639 			/* Use invalid intr_vec[] index to disable entry. */
640 			intr_handle->intr_vec[i] =
641 				RTE_INTR_VEC_RXTX_OFFSET +
642 				RTE_MAX_RXTX_INTR_VEC_ID;
643 			continue;
644 		}
645 		if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
646 			DRV_LOG(ERR,
647 				"port %u too many Rx queues for interrupt"
648 				" vector size (%d), Rx interrupts cannot be"
649 				" enabled",
650 				dev->data->port_id, RTE_MAX_RXTX_INTR_VEC_ID);
651 			mlx5_rx_intr_vec_disable(dev);
652 			rte_errno = ENOMEM;
653 			return -rte_errno;
654 		}
655 		fd = rxq_obj->channel->fd;
656 		flags = fcntl(fd, F_GETFL);
657 		rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
658 		if (rc < 0) {
659 			rte_errno = errno;
660 			DRV_LOG(ERR,
661 				"port %u failed to make Rx interrupt file"
662 				" descriptor %d non-blocking for queue index"
663 				" %d",
664 				dev->data->port_id, fd, i);
665 			mlx5_rx_intr_vec_disable(dev);
666 			return -rte_errno;
667 		}
668 		intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
669 		intr_handle->efds[count] = fd;
670 		count++;
671 	}
672 	if (!count)
673 		mlx5_rx_intr_vec_disable(dev);
674 	else
675 		intr_handle->nb_efd = count;
676 	return 0;
677 }
678 
679 /**
680  * Clean up Rx interrupts handler.
681  *
682  * @param dev
683  *   Pointer to Ethernet device.
684  */
685 void
686 mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev)
687 {
688 	struct mlx5_priv *priv = dev->data->dev_private;
689 	struct rte_intr_handle *intr_handle = dev->intr_handle;
690 	unsigned int i;
691 	unsigned int rxqs_n = priv->rxqs_n;
692 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
693 
694 	if (!dev->data->dev_conf.intr_conf.rxq)
695 		return;
696 	if (!intr_handle->intr_vec)
697 		goto free;
698 	for (i = 0; i != n; ++i) {
699 		struct mlx5_rxq_ctrl *rxq_ctrl;
700 		struct mlx5_rxq_data *rxq_data;
701 
702 		if (intr_handle->intr_vec[i] == RTE_INTR_VEC_RXTX_OFFSET +
703 		    RTE_MAX_RXTX_INTR_VEC_ID)
704 			continue;
705 		/**
706 		 * Need to access directly the queue to release the reference
707 		 * kept in mlx5_rx_intr_vec_enable().
708 		 */
709 		rxq_data = (*priv->rxqs)[i];
710 		rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
711 		if (rxq_ctrl->obj)
712 			mlx5_rxq_obj_release(rxq_ctrl->obj);
713 	}
714 free:
715 	rte_intr_free_epoll_fd(intr_handle);
716 	if (intr_handle->intr_vec)
717 		free(intr_handle->intr_vec);
718 	intr_handle->nb_efd = 0;
719 	intr_handle->intr_vec = NULL;
720 }
721 
722 /**
723  * MLX5 CQ notification.
724  *
725  * @param rxq
726  *   Pointer to receive queue structure.
727  * @param sq_n_rxq
728  *   Sequence number per receive queue.
729  */
730 static inline void
731 mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq)
732 {
733 	int sq_n = 0;
734 	uint32_t doorbell_hi;
735 	uint64_t doorbell;
736 	void *cq_db_reg = (char *)rxq->cq_uar + MLX5_CQ_DOORBELL;
737 
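	/*
	 * Compose the CQ arm doorbell: the upper 32 bits carry the arm
	 * sequence number and the current CQ consumer index, the lower
	 * 32 bits carry the CQ number. The upper half is also written to
	 * the CQ doorbell record before ringing the UAR register.
	 */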
738 	sq_n = sq_n_rxq & MLX5_CQ_SQN_MASK;
739 	doorbell_hi = sq_n << MLX5_CQ_SQN_OFFSET | (rxq->cq_ci & MLX5_CI_MASK);
740 	doorbell = (uint64_t)doorbell_hi << 32;
741 	doorbell |= rxq->cqn;
742 	rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
743 	mlx5_uar_write64(rte_cpu_to_be_64(doorbell),
744 			 cq_db_reg, rxq->uar_lock_cq);
745 }
746 
747 /**
748  * DPDK callback for Rx queue interrupt enable.
749  *
750  * @param dev
751  *   Pointer to Ethernet device structure.
752  * @param rx_queue_id
753  *   Rx queue number.
754  *
755  * @return
756  *   0 on success, a negative errno value otherwise and rte_errno is set.
757  */
758 int
759 mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
760 {
761 	struct mlx5_priv *priv = dev->data->dev_private;
762 	struct mlx5_rxq_data *rxq_data;
763 	struct mlx5_rxq_ctrl *rxq_ctrl;
764 
765 	rxq_data = (*priv->rxqs)[rx_queue_id];
766 	if (!rxq_data) {
767 		rte_errno = EINVAL;
768 		return -rte_errno;
769 	}
770 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
771 	if (rxq_ctrl->irq) {
772 		struct mlx5_rxq_obj *rxq_obj;
773 
774 		rxq_obj = mlx5_rxq_obj_get(dev, rx_queue_id);
775 		if (!rxq_obj) {
776 			rte_errno = EINVAL;
777 			return -rte_errno;
778 		}
779 		mlx5_arm_cq(rxq_data, rxq_data->cq_arm_sn);
780 		mlx5_rxq_obj_release(rxq_obj);
781 	}
782 	return 0;
783 }
784 
785 /**
786  * DPDK callback for Rx queue interrupt disable.
787  *
788  * @param dev
789  *   Pointer to Ethernet device structure.
790  * @param rx_queue_id
791  *   Rx queue number.
792  *
793  * @return
794  *   0 on success, a negative errno value otherwise and rte_errno is set.
795  */
796 int
797 mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
798 {
799 	struct mlx5_priv *priv = dev->data->dev_private;
800 	struct mlx5_rxq_data *rxq_data;
801 	struct mlx5_rxq_ctrl *rxq_ctrl;
802 	struct mlx5_rxq_obj *rxq_obj = NULL;
803 	struct ibv_cq *ev_cq;
804 	void *ev_ctx;
805 	int ret;
806 
807 	rxq_data = (*priv->rxqs)[rx_queue_id];
808 	if (!rxq_data) {
809 		rte_errno = EINVAL;
810 		return -rte_errno;
811 	}
812 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
813 	if (!rxq_ctrl->irq)
814 		return 0;
815 	rxq_obj = mlx5_rxq_obj_get(dev, rx_queue_id);
816 	if (!rxq_obj) {
817 		rte_errno = EINVAL;
818 		return -rte_errno;
819 	}
820 	ret = mlx5_glue->get_cq_event(rxq_obj->channel, &ev_cq, &ev_ctx);
821 	if (ret || ev_cq != rxq_obj->cq) {
822 		rte_errno = EINVAL;
823 		goto exit;
824 	}
825 	rxq_data->cq_arm_sn++;
826 	mlx5_glue->ack_cq_events(rxq_obj->cq, 1);
827 	mlx5_rxq_obj_release(rxq_obj);
828 	return 0;
829 exit:
830 	ret = rte_errno; /* Save rte_errno before cleanup. */
831 	if (rxq_obj)
832 		mlx5_rxq_obj_release(rxq_obj);
833 	DRV_LOG(WARNING, "port %u unable to disable interrupt on Rx queue %d",
834 		dev->data->port_id, rx_queue_id);
835 	rte_errno = ret; /* Restore rte_errno. */
836 	return -rte_errno;
837 }
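/*
 * Illustrative sketch (not driver code): an application typically arms the
 * Rx interrupt through the ethdev layer and waits on the epoll fd prepared
 * by mlx5_rx_intr_vec_enable(). "port_id" and "queue_id" are placeholders.
 *
 *	rte_eth_dev_rx_intr_ctl_q(port_id, queue_id, RTE_EPOLL_PER_THREAD,
 *				  RTE_INTR_EVENT_ADD, NULL);
 *	rte_eth_dev_rx_intr_enable(port_id, queue_id);
 *	struct rte_epoll_event ev;
 *	rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, -1);
 *	rte_eth_dev_rx_intr_disable(port_id, queue_id);
 */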
838 
839 /**
840  * Create a CQ Verbs object.
841  *
842  * @param dev
843  *   Pointer to Ethernet device.
844  * @param priv
845  *   Pointer to device private data.
846  * @param rxq_data
847  *   Pointer to Rx queue data.
848  * @param cqe_n
849  *   Number of CQEs in CQ.
850  * @param rxq_obj
851  *   Pointer to Rx queue object data.
852  *
853  * @return
854  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
855  */
856 static struct ibv_cq *
857 mlx5_ibv_cq_new(struct rte_eth_dev *dev, struct mlx5_priv *priv,
858 		struct mlx5_rxq_data *rxq_data,
859 		unsigned int cqe_n, struct mlx5_rxq_obj *rxq_obj)
860 {
861 	struct {
862 		struct ibv_cq_init_attr_ex ibv;
863 		struct mlx5dv_cq_init_attr mlx5;
864 	} cq_attr;
865 
866 	cq_attr.ibv = (struct ibv_cq_init_attr_ex){
867 		.cqe = cqe_n,
868 		.channel = rxq_obj->channel,
869 		.comp_mask = 0,
870 	};
871 	cq_attr.mlx5 = (struct mlx5dv_cq_init_attr){
872 		.comp_mask = 0,
873 	};
874 	if (priv->config.cqe_comp && !rxq_data->hw_timestamp &&
875 	    !rxq_data->lro) {
876 		cq_attr.mlx5.comp_mask |=
877 				MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
878 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
879 		cq_attr.mlx5.cqe_comp_res_format =
880 				mlx5_rxq_mprq_enabled(rxq_data) ?
881 				MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX :
882 				MLX5DV_CQE_RES_FORMAT_HASH;
883 #else
884 		cq_attr.mlx5.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
885 #endif
886 		/*
887 		 * For vectorized Rx, it must not be doubled in order to
888 		 * make cq_ci and rq_ci aligned.
889 		 */
890 		if (mlx5_rxq_check_vec_support(rxq_data) < 0)
891 			cq_attr.ibv.cqe *= 2;
892 	} else if (priv->config.cqe_comp && rxq_data->hw_timestamp) {
893 		DRV_LOG(DEBUG,
894 			"port %u Rx CQE compression is disabled for HW"
895 			" timestamp",
896 			dev->data->port_id);
897 	} else if (priv->config.cqe_comp && rxq_data->lro) {
898 		DRV_LOG(DEBUG,
899 			"port %u Rx CQE compression is disabled for LRO",
900 			dev->data->port_id);
901 	}
902 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
903 	if (priv->config.cqe_pad) {
904 		cq_attr.mlx5.comp_mask |= MLX5DV_CQ_INIT_ATTR_MASK_FLAGS;
905 		cq_attr.mlx5.flags |= MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD;
906 	}
907 #endif
908 	return mlx5_glue->cq_ex_to_cq(mlx5_glue->dv_create_cq(priv->sh->ctx,
909 							      &cq_attr.ibv,
910 							      &cq_attr.mlx5));
911 }
912 
913 /**
914  * Create a WQ Verbs object.
915  *
916  * @param dev
917  *   Pointer to Ethernet device.
918  * @param priv
919  *   Pointer to device private data.
920  * @param rxq_data
921  *   Pointer to Rx queue data.
922  * @param idx
923  *   Queue index in DPDK Rx queue array
924  * @param wqe_n
925  *   Number of WQEs in WQ.
926  * @param rxq_obj
927  *   Pointer to Rx queue object data.
928  *
929  * @return
930  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
931  */
932 static struct ibv_wq *
933 mlx5_ibv_wq_new(struct rte_eth_dev *dev, struct mlx5_priv *priv,
934 		struct mlx5_rxq_data *rxq_data, uint16_t idx,
935 		unsigned int wqe_n, struct mlx5_rxq_obj *rxq_obj)
936 {
937 	struct {
938 		struct ibv_wq_init_attr ibv;
939 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
940 		struct mlx5dv_wq_init_attr mlx5;
941 #endif
942 	} wq_attr;
943 
944 	wq_attr.ibv = (struct ibv_wq_init_attr){
945 		.wq_context = NULL, /* Could be useful in the future. */
946 		.wq_type = IBV_WQT_RQ,
947 		/* Max number of outstanding WRs. */
948 		.max_wr = wqe_n >> rxq_data->sges_n,
949 		/* Max number of scatter/gather elements in a WR. */
950 		.max_sge = 1 << rxq_data->sges_n,
951 		.pd = priv->sh->pd,
952 		.cq = rxq_obj->cq,
953 		.comp_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING | 0,
954 		.create_flags = (rxq_data->vlan_strip ?
955 				 IBV_WQ_FLAGS_CVLAN_STRIPPING : 0),
956 	};
957 	/* By default, FCS (CRC) is stripped by hardware. */
958 	if (rxq_data->crc_present) {
959 		wq_attr.ibv.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
960 		wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
961 	}
962 	if (priv->config.hw_padding) {
963 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
964 		wq_attr.ibv.create_flags |= IBV_WQ_FLAG_RX_END_PADDING;
965 		wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
966 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
967 		wq_attr.ibv.create_flags |= IBV_WQ_FLAGS_PCI_WRITE_END_PADDING;
968 		wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
969 #endif
970 	}
971 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
972 	wq_attr.mlx5 = (struct mlx5dv_wq_init_attr){
973 		.comp_mask = 0,
974 	};
975 	if (mlx5_rxq_mprq_enabled(rxq_data)) {
976 		struct mlx5dv_striding_rq_init_attr *mprq_attr =
977 						&wq_attr.mlx5.striding_rq_attrs;
978 
979 		wq_attr.mlx5.comp_mask |= MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ;
980 		*mprq_attr = (struct mlx5dv_striding_rq_init_attr){
981 			.single_stride_log_num_of_bytes = rxq_data->strd_sz_n,
982 			.single_wqe_log_num_of_strides = rxq_data->strd_num_n,
983 			.two_byte_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT,
984 		};
985 	}
986 	rxq_obj->wq = mlx5_glue->dv_create_wq(priv->sh->ctx, &wq_attr.ibv,
987 					      &wq_attr.mlx5);
988 #else
989 	rxq_obj->wq = mlx5_glue->create_wq(priv->sh->ctx, &wq_attr.ibv);
990 #endif
991 	if (rxq_obj->wq) {
992 		/*
993 		 * Make sure number of WRs*SGEs match expectations since a queue
994 		 * cannot allocate more than "desc" buffers.
995 		 */
996 		if (wq_attr.ibv.max_wr != (wqe_n >> rxq_data->sges_n) ||
997 		    wq_attr.ibv.max_sge != (1u << rxq_data->sges_n)) {
998 			DRV_LOG(ERR,
999 				"port %u Rx queue %u requested %u*%u but got"
1000 				" %u*%u WRs*SGEs",
1001 				dev->data->port_id, idx,
1002 				wqe_n >> rxq_data->sges_n,
1003 				(1 << rxq_data->sges_n),
1004 				wq_attr.ibv.max_wr, wq_attr.ibv.max_sge);
1005 			claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq));
1006 			rxq_obj->wq = NULL;
1007 			rte_errno = EINVAL;
1008 		}
1009 	}
1010 	return rxq_obj->wq;
1011 }
1012 
1013 /**
1014  * Fill common fields of create RQ attributes structure.
1015  *
1016  * @param rxq_data
1017  *   Pointer to Rx queue data.
1018  * @param cqn
1019  *   CQ number to use with this RQ.
1020  * @param rq_attr
1021  *   RQ attributes structure to fill.
1022  */
1023 static void
1024 mlx5_devx_create_rq_attr_fill(struct mlx5_rxq_data *rxq_data, uint32_t cqn,
1025 			      struct mlx5_devx_create_rq_attr *rq_attr)
1026 {
1027 	rq_attr->state = MLX5_RQC_STATE_RST;
1028 	rq_attr->vsd = (rxq_data->vlan_strip) ? 0 : 1;
1029 	rq_attr->cqn = cqn;
1030 	rq_attr->scatter_fcs = (rxq_data->crc_present) ? 1 : 0;
1031 }
1032 
1033 /**
1034  * Fill common fields of DevX WQ attributes structure.
1035  *
1036  * @param priv
1037  *   Pointer to device private data.
1038  * @param rxq_ctrl
1039  *   Pointer to Rx queue control structure.
1040  * @param wq_attr
1041  *   WQ attributes structure to fill.
1042  */
1043 static void
1044 mlx5_devx_wq_attr_fill(struct mlx5_priv *priv, struct mlx5_rxq_ctrl *rxq_ctrl,
1045 		       struct mlx5_devx_wq_attr *wq_attr)
1046 {
1047 	wq_attr->end_padding_mode = priv->config.cqe_pad ?
1048 					MLX5_WQ_END_PAD_MODE_ALIGN :
1049 					MLX5_WQ_END_PAD_MODE_NONE;
1050 	wq_attr->pd = priv->sh->pdn;
1051 	wq_attr->dbr_addr = rxq_ctrl->dbr_offset;
1052 	wq_attr->dbr_umem_id = rxq_ctrl->dbr_umem_id;
1053 	wq_attr->dbr_umem_valid = 1;
1054 	wq_attr->wq_umem_id = rxq_ctrl->wq_umem->umem_id;
1055 	wq_attr->wq_umem_valid = 1;
1056 }
1057 
1058 /**
1059  * Create a RQ object using DevX.
1060  *
1061  * @param dev
1062  *   Pointer to Ethernet device.
1063  * @param idx
1064  *   Queue index in DPDK Rx queue array
1065  * @param cqn
1066  *   CQ number to use with this RQ.
1067  *
1068  * @return
1069  *   The DevX object initialised, NULL otherwise and rte_errno is set.
1070  */
1071 static struct mlx5_devx_obj *
1072 mlx5_devx_rq_new(struct rte_eth_dev *dev, uint16_t idx, uint32_t cqn)
1073 {
1074 	struct mlx5_priv *priv = dev->data->dev_private;
1075 	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
1076 	struct mlx5_rxq_ctrl *rxq_ctrl =
1077 		container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
1078 	struct mlx5_devx_create_rq_attr rq_attr;
1079 	uint32_t wqe_n = 1 << (rxq_data->elts_n - rxq_data->sges_n);
1080 	uint32_t wq_size = 0;
1081 	uint32_t wqe_size = 0;
1082 	uint32_t log_wqe_size = 0;
1083 	void *buf = NULL;
1084 	struct mlx5_devx_obj *rq;
1085 
1086 	memset(&rq_attr, 0, sizeof(rq_attr));
1087 	/* Fill RQ attributes. */
1088 	rq_attr.mem_rq_type = MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE;
1089 	rq_attr.flush_in_error_en = 1;
1090 	mlx5_devx_create_rq_attr_fill(rxq_data, cqn, &rq_attr);
1091 	/* Fill WQ attributes for this RQ. */
1092 	if (mlx5_rxq_mprq_enabled(rxq_data)) {
1093 		rq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC_STRIDING_RQ;
1094 		/*
1095 		 * Number of strides in each WQE:
1096 		 * 512*2^single_wqe_log_num_of_strides.
1097 		 */
1098 		rq_attr.wq_attr.single_wqe_log_num_of_strides =
1099 				rxq_data->strd_num_n -
1100 				MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
1101 		/* Stride size = (2^single_stride_log_num_of_bytes)*64B. */
1102 		rq_attr.wq_attr.single_stride_log_num_of_bytes =
1103 				rxq_data->strd_sz_n -
1104 				MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES;
1105 		wqe_size = sizeof(struct mlx5_wqe_mprq);
1106 	} else {
1107 		rq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC;
1108 		wqe_size = sizeof(struct mlx5_wqe_data_seg);
1109 	}
1110 	log_wqe_size = log2above(wqe_size) + rxq_data->sges_n;
1111 	rq_attr.wq_attr.log_wq_stride = log_wqe_size;
1112 	rq_attr.wq_attr.log_wq_sz = rxq_data->elts_n - rxq_data->sges_n;
1113 	/* Calculate and allocate WQ memory space. */
1114 	wqe_size = 1 << log_wqe_size; /* Round up to a power of two. */
1115 	wq_size = wqe_n * wqe_size;
1116 	buf = rte_calloc_socket(__func__, 1, wq_size, MLX5_WQE_BUF_ALIGNMENT,
1117 				rxq_ctrl->socket);
1118 	if (!buf)
1119 		return NULL;
1120 	rxq_data->wqes = buf;
1121 	rxq_ctrl->wq_umem = mlx5_glue->devx_umem_reg(priv->sh->ctx,
1122 						     buf, wq_size, 0);
1123 	if (!rxq_ctrl->wq_umem) {
1124 		rte_free(buf);
1125 		return NULL;
1126 	}
1127 	mlx5_devx_wq_attr_fill(priv, rxq_ctrl, &rq_attr.wq_attr);
1128 	rq = mlx5_devx_cmd_create_rq(priv->sh->ctx, &rq_attr, rxq_ctrl->socket);
1129 	if (!rq)
1130 		rxq_release_rq_resources(rxq_ctrl);
1131 	return rq;
1132 }
1133 
1134 /**
1135  * Create the Rx queue Verbs/DevX object.
1136  *
1137  * @param dev
1138  *   Pointer to Ethernet device.
1139  * @param idx
1140  *   Queue index in DPDK Rx queue array
1141  * @param type
1142  *   Type of Rx queue object to create.
1143  *
1144  * @return
1145  *   The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
1146  */
1147 struct mlx5_rxq_obj *
1148 mlx5_rxq_obj_new(struct rte_eth_dev *dev, uint16_t idx,
1149 		 enum mlx5_rxq_obj_type type)
1150 {
1151 	struct mlx5_priv *priv = dev->data->dev_private;
1152 	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
1153 	struct mlx5_rxq_ctrl *rxq_ctrl =
1154 		container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
1155 	struct ibv_wq_attr mod;
1156 	unsigned int cqe_n;
1157 	unsigned int wqe_n = 1 << rxq_data->elts_n;
1158 	struct mlx5_rxq_obj *tmpl = NULL;
1159 	struct mlx5dv_cq cq_info;
1160 	struct mlx5dv_rwq rwq;
1161 	int ret = 0;
1162 	struct mlx5dv_obj obj;
1163 
1164 	assert(rxq_data);
1165 	assert(!rxq_ctrl->obj);
1166 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_RX_QUEUE;
1167 	priv->verbs_alloc_ctx.obj = rxq_ctrl;
1168 	tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
1169 				 rxq_ctrl->socket);
1170 	if (!tmpl) {
1171 		DRV_LOG(ERR,
1172 			"port %u Rx queue %u cannot allocate verbs resources",
1173 			dev->data->port_id, rxq_data->idx);
1174 		rte_errno = ENOMEM;
1175 		goto error;
1176 	}
1177 	tmpl->type = type;
1178 	tmpl->rxq_ctrl = rxq_ctrl;
1179 	if (rxq_ctrl->irq) {
1180 		tmpl->channel = mlx5_glue->create_comp_channel(priv->sh->ctx);
1181 		if (!tmpl->channel) {
1182 			DRV_LOG(ERR, "port %u: comp channel creation failure",
1183 				dev->data->port_id);
1184 			rte_errno = ENOMEM;
1185 			goto error;
1186 		}
1187 	}
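	/*
	 * With MPRQ every stride of every WQE may complete with its own CQE,
	 * so the CQ must hold wqe_n * (1 << strd_num_n) entries; otherwise
	 * one CQE per WQE is enough.
	 */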
1188 	if (mlx5_rxq_mprq_enabled(rxq_data))
1189 		cqe_n = wqe_n * (1 << rxq_data->strd_num_n) - 1;
1190 	else
1191 		cqe_n = wqe_n - 1;
1192 	tmpl->cq = mlx5_ibv_cq_new(dev, priv, rxq_data, cqe_n, tmpl);
1193 	if (!tmpl->cq) {
1194 		DRV_LOG(ERR, "port %u Rx queue %u CQ creation failure",
1195 			dev->data->port_id, idx);
1196 		rte_errno = ENOMEM;
1197 		goto error;
1198 	}
1199 	obj.cq.in = tmpl->cq;
1200 	obj.cq.out = &cq_info;
1201 	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ);
1202 	if (ret) {
1203 		rte_errno = ret;
1204 		goto error;
1205 	}
1206 	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
1207 		DRV_LOG(ERR,
1208 			"port %u wrong MLX5_CQE_SIZE environment variable"
1209 			" value: it should be set to %u",
1210 			dev->data->port_id, RTE_CACHE_LINE_SIZE);
1211 		rte_errno = EINVAL;
1212 		goto error;
1213 	}
1214 	DRV_LOG(DEBUG, "port %u device_attr.max_qp_wr is %d",
1215 		dev->data->port_id, priv->sh->device_attr.orig_attr.max_qp_wr);
1216 	DRV_LOG(DEBUG, "port %u device_attr.max_sge is %d",
1217 		dev->data->port_id, priv->sh->device_attr.orig_attr.max_sge);
1218 	/* Allocate door-bell for types created with DevX. */
1219 	if (tmpl->type != MLX5_RXQ_OBJ_TYPE_IBV) {
1220 		struct mlx5_devx_dbr_page *dbr_page;
1221 		int64_t dbr_offset;
1222 
1223 		dbr_offset = mlx5_get_dbr(dev, &dbr_page);
1224 		if (dbr_offset < 0)
1225 			goto error;
1226 		rxq_ctrl->dbr_offset = dbr_offset;
1227 		rxq_ctrl->dbr_umem_id = dbr_page->umem->umem_id;
1228 		rxq_ctrl->dbr_umem_id_valid = 1;
1229 		rxq_data->rq_db = (uint32_t *)((uintptr_t)dbr_page->dbrs +
1230 					       (uintptr_t)rxq_ctrl->dbr_offset);
1231 	}
1232 	if (tmpl->type == MLX5_RXQ_OBJ_TYPE_IBV) {
1233 		tmpl->wq = mlx5_ibv_wq_new(dev, priv, rxq_data, idx, wqe_n,
1234 					   tmpl);
1235 		if (!tmpl->wq) {
1236 			DRV_LOG(ERR, "port %u Rx queue %u WQ creation failure",
1237 				dev->data->port_id, idx);
1238 			rte_errno = ENOMEM;
1239 			goto error;
1240 		}
1241 		/* Change queue state to ready. */
1242 		mod = (struct ibv_wq_attr){
1243 			.attr_mask = IBV_WQ_ATTR_STATE,
1244 			.wq_state = IBV_WQS_RDY,
1245 		};
1246 		ret = mlx5_glue->modify_wq(tmpl->wq, &mod);
1247 		if (ret) {
1248 			DRV_LOG(ERR,
1249 				"port %u Rx queue %u WQ state to IBV_WQS_RDY"
1250 				" failed", dev->data->port_id, idx);
1251 			rte_errno = ret;
1252 			goto error;
1253 		}
1254 		obj.rwq.in = tmpl->wq;
1255 		obj.rwq.out = &rwq;
1256 		ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_RWQ);
1257 		if (ret) {
1258 			rte_errno = ret;
1259 			goto error;
1260 		}
1261 		rxq_data->wqes = rwq.buf;
1262 		rxq_data->rq_db = rwq.dbrec;
1263 	} else if (tmpl->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) {
1264 		struct mlx5_devx_modify_rq_attr rq_attr;
1265 
1266 		memset(&rq_attr, 0, sizeof(rq_attr));
1267 		tmpl->rq = mlx5_devx_rq_new(dev, idx, cq_info.cqn);
1268 		if (!tmpl->rq) {
1269 			DRV_LOG(ERR, "port %u Rx queue %u RQ creation failure",
1270 				dev->data->port_id, idx);
1271 			rte_errno = ENOMEM;
1272 			goto error;
1273 		}
1274 		/* Change queue state to ready. */
1275 		rq_attr.rq_state = MLX5_RQC_STATE_RST;
1276 		rq_attr.state = MLX5_RQC_STATE_RDY;
1277 		ret = mlx5_devx_cmd_modify_rq(tmpl->rq, &rq_attr);
1278 		if (ret)
1279 			goto error;
1280 	}
1281 	/* Fill the rings. */
1282 	rxq_data->cqe_n = log2above(cq_info.cqe_cnt);
1283 	rxq_data->cq_db = cq_info.dbrec;
1284 	rxq_data->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf;
1285 	rxq_data->cq_uar = cq_info.cq_uar;
1286 	rxq_data->cqn = cq_info.cqn;
1287 	rxq_data->cq_arm_sn = 0;
1288 	mlx5_rxq_initialize(rxq_data);
1289 	rxq_data->cq_ci = 0;
1290 	DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data->port_id,
1291 		idx, (void *)&tmpl);
1292 	rte_atomic32_inc(&tmpl->refcnt);
1293 	LIST_INSERT_HEAD(&priv->rxqsobj, tmpl, next);
1294 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
1295 	return tmpl;
1296 error:
1297 	if (tmpl) {
1298 		ret = rte_errno; /* Save rte_errno before cleanup. */
1299 		if (tmpl->type == MLX5_RXQ_OBJ_TYPE_IBV && tmpl->wq)
1300 			claim_zero(mlx5_glue->destroy_wq(tmpl->wq));
1301 		else if (tmpl->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ && tmpl->rq)
1302 			claim_zero(mlx5_devx_cmd_destroy(tmpl->rq));
1303 		if (tmpl->cq)
1304 			claim_zero(mlx5_glue->destroy_cq(tmpl->cq));
1305 		if (tmpl->channel)
1306 			claim_zero(mlx5_glue->destroy_comp_channel
1307 							(tmpl->channel));
1308 		rte_free(tmpl);
1309 		rte_errno = ret; /* Restore rte_errno. */
1310 	}
1311 	if (type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ)
1312 		rxq_release_rq_resources(rxq_ctrl);
1313 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
1314 	return NULL;
1315 }
1316 
1317 /**
1318  * Verify the Rx queue objects list is empty.
1319  *
1320  * @param dev
1321  *   Pointer to Ethernet device.
1322  *
1323  * @return
1324  *   The number of objects not released.
1325  */
1326 int
1327 mlx5_rxq_obj_verify(struct rte_eth_dev *dev)
1328 {
1329 	struct mlx5_priv *priv = dev->data->dev_private;
1330 	int ret = 0;
1331 	struct mlx5_rxq_obj *rxq_obj;
1332 
1333 	LIST_FOREACH(rxq_obj, &priv->rxqsobj, next) {
1334 		DRV_LOG(DEBUG, "port %u Rx queue %u still referenced",
1335 			dev->data->port_id, rxq_obj->rxq_ctrl->rxq.idx);
1336 		++ret;
1337 	}
1338 	return ret;
1339 }
1340 
1341 /**
1342  * Callback function to initialize mbufs for Multi-Packet RQ.
1343  */
1344 static inline void
1345 mlx5_mprq_buf_init(struct rte_mempool *mp, void *opaque_arg,
1346 		    void *_m, unsigned int i __rte_unused)
1347 {
1348 	struct mlx5_mprq_buf *buf = _m;
1349 	struct rte_mbuf_ext_shared_info *shinfo;
1350 	unsigned int strd_n = (unsigned int)(uintptr_t)opaque_arg;
1351 	unsigned int j;
1352 
1353 	memset(_m, 0, sizeof(*buf));
1354 	buf->mp = mp;
1355 	rte_atomic16_set(&buf->refcnt, 1);
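	/* Set up one shared-info descriptor per stride for external buffer attachment. */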
1356 	for (j = 0; j != strd_n; ++j) {
1357 		shinfo = &buf->shinfos[j];
1358 		shinfo->free_cb = mlx5_mprq_buf_free_cb;
1359 		shinfo->fcb_opaque = buf;
1360 	}
1361 }
1362 
1363 /**
1364  * Free mempool of Multi-Packet RQ.
1365  *
1366  * @param dev
1367  *   Pointer to Ethernet device.
1368  *
1369  * @return
1370  *   0 on success, negative errno value on failure.
1371  */
1372 int
1373 mlx5_mprq_free_mp(struct rte_eth_dev *dev)
1374 {
1375 	struct mlx5_priv *priv = dev->data->dev_private;
1376 	struct rte_mempool *mp = priv->mprq_mp;
1377 	unsigned int i;
1378 
1379 	if (mp == NULL)
1380 		return 0;
1381 	DRV_LOG(DEBUG, "port %u freeing mempool (%s) for Multi-Packet RQ",
1382 		dev->data->port_id, mp->name);
1383 	/*
1384 	 * If a buffer in the pool has been externally attached to an mbuf and is
1385 	 * still in use by the application, destroying the Rx queue can spoil
1386 	 * the packet. This is unlikely but can happen if the application
1387 	 * dynamically creates and destroys queues while holding Rx packets.
1388 	 *
1389 	 * TODO: It is unavoidable for now because the mempool for Multi-Packet
1390 	 * RQ isn't provided by the application but managed by the PMD.
1391 	 */
1392 	if (!rte_mempool_full(mp)) {
1393 		DRV_LOG(ERR,
1394 			"port %u mempool for Multi-Packet RQ is still in use",
1395 			dev->data->port_id);
1396 		rte_errno = EBUSY;
1397 		return -rte_errno;
1398 	}
1399 	rte_mempool_free(mp);
1400 	/* Unset mempool for each Rx queue. */
1401 	for (i = 0; i != priv->rxqs_n; ++i) {
1402 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
1403 
1404 		if (rxq == NULL)
1405 			continue;
1406 		rxq->mprq_mp = NULL;
1407 	}
1408 	priv->mprq_mp = NULL;
1409 	return 0;
1410 }
1411 
1412 /**
1413  * Allocate a mempool for Multi-Packet RQ. All configured Rx queues share the
1414  * mempool. If already allocated, reuse it if there are enough elements.
1415  * Otherwise, resize it.
1416  *
1417  * @param dev
1418  *   Pointer to Ethernet device.
1419  *
1420  * @return
1421  *   0 on success, negative errno value on failure.
1422  */
1423 int
1424 mlx5_mprq_alloc_mp(struct rte_eth_dev *dev)
1425 {
1426 	struct mlx5_priv *priv = dev->data->dev_private;
1427 	struct rte_mempool *mp = priv->mprq_mp;
1428 	char name[RTE_MEMPOOL_NAMESIZE];
1429 	unsigned int desc = 0;
1430 	unsigned int buf_len;
1431 	unsigned int obj_num;
1432 	unsigned int obj_size;
1433 	unsigned int strd_num_n = 0;
1434 	unsigned int strd_sz_n = 0;
1435 	unsigned int i;
1436 
1437 	if (!mlx5_mprq_enabled(dev))
1438 		return 0;
1439 	/* Count the total number of descriptors configured. */
1440 	for (i = 0; i != priv->rxqs_n; ++i) {
1441 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
1442 
1443 		if (rxq == NULL)
1444 			continue;
1445 		desc += 1 << rxq->elts_n;
1446 		/* Get the max number of strides. */
1447 		if (strd_num_n < rxq->strd_num_n)
1448 			strd_num_n = rxq->strd_num_n;
1449 		/* Get the max size of a stride. */
1450 		if (strd_sz_n < rxq->strd_sz_n)
1451 			strd_sz_n = rxq->strd_sz_n;
1452 	}
1453 	assert(strd_num_n && strd_sz_n);
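	/*
	 * Each pool object must hold the mlx5_mprq_buf descriptor, the whole
	 * stride memory, one shared-info structure per stride and head-room.
	 */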
1454 	buf_len = (1 << strd_num_n) * (1 << strd_sz_n);
1455 	obj_size = sizeof(struct mlx5_mprq_buf) + buf_len + (1 << strd_num_n) *
1456 		sizeof(struct rte_mbuf_ext_shared_info) + RTE_PKTMBUF_HEADROOM;
1457 	/*
1458 	 * Received packets can be either memcpy'd or externally referenced. When
1459 	 * a packet is attached to an mbuf as an external buffer, it isn't
1460 	 * possible to predict how the buffers will be queued by the application,
1461 	 * so there is no way to pre-allocate the exact number of buffers in
1462 	 * advance; enough buffers must be prepared speculatively.
1463 	 *
1464 	 * In the data path, if this mempool is depleted, the PMD will memcpy
1465 	 * received packets into buffers provided by the application (rxq->mp)
1466 	 * until this mempool has free entries again.
1467 	 */
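	/* Speculatively allocate four buffers per configured descriptor. */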
1468 	desc *= 4;
1469 	obj_num = desc + MLX5_MPRQ_MP_CACHE_SZ * priv->rxqs_n;
1470 	/*
1471 	 * rte_mempool_create_empty() has sanity check to refuse large cache
1472 	 * size compared to the number of elements.
1473 	 * CACHE_FLUSHTHRESH_MULTIPLIER is defined in a C file, so using a
1474 	 * constant number 2 instead.
1475 	 */
1476 	obj_num = RTE_MAX(obj_num, MLX5_MPRQ_MP_CACHE_SZ * 2);
1477 	/* Check whether a mempool is already allocated and if it can be reused. */
1478 	if (mp != NULL && mp->elt_size >= obj_size && mp->size >= obj_num) {
1479 		DRV_LOG(DEBUG, "port %u mempool %s is being reused",
1480 			dev->data->port_id, mp->name);
1481 		/* Reuse. */
1482 		goto exit;
1483 	} else if (mp != NULL) {
1484 		DRV_LOG(DEBUG, "port %u mempool %s should be resized, freeing it",
1485 			dev->data->port_id, mp->name);
1486 		/*
1487 		 * If freeing fails, the mempool may still be in use and there
1488 		 * is no choice but to keep using the existing one. On buffer
1489 		 * underrun, packets will be memcpy'd instead of attached as
1490 		 * external buffers.
1491 		 */
1492 		if (mlx5_mprq_free_mp(dev)) {
1493 			if (mp->elt_size >= obj_size)
1494 				goto exit;
1495 			else
1496 				return -rte_errno;
1497 		}
1498 	}
1499 	snprintf(name, sizeof(name), "port-%u-mprq", dev->data->port_id);
1500 	mp = rte_mempool_create(name, obj_num, obj_size, MLX5_MPRQ_MP_CACHE_SZ,
1501 				0, NULL, NULL, mlx5_mprq_buf_init,
1502 				(void *)(uintptr_t)(1 << strd_num_n),
1503 				dev->device->numa_node, 0);
1504 	if (mp == NULL) {
1505 		DRV_LOG(ERR,
1506 			"port %u failed to allocate a mempool for"
1507 			" Multi-Packet RQ, count=%u, size=%u",
1508 			dev->data->port_id, obj_num, obj_size);
1509 		rte_errno = ENOMEM;
1510 		return -rte_errno;
1511 	}
1512 	priv->mprq_mp = mp;
1513 exit:
1514 	/* Set mempool for each Rx queue. */
1515 	for (i = 0; i != priv->rxqs_n; ++i) {
1516 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
1517 
1518 		if (rxq == NULL)
1519 			continue;
1520 		rxq->mprq_mp = mp;
1521 	}
1522 	DRV_LOG(INFO, "port %u Multi-Packet RQ is configured",
1523 		dev->data->port_id);
1524 	return 0;
1525 }
1526 
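/*
 * MLX5_MAX_LRO_SIZE: maximum LRO message size, UINT8_MAX units of 256 bytes.
 * MLX5_MAX_TCP_HDR_OFFSET: largest L2/L3 header that may precede the TCP
 * header (Ethernet + two VLAN tags + IPv6).
 * MLX5_MAX_LRO_HEADER_FIX: the above plus a TCP header with maximum options.
 */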
1527 #define MLX5_MAX_LRO_SIZE (UINT8_MAX * 256u)
1528 #define MLX5_MAX_TCP_HDR_OFFSET ((unsigned int)(sizeof(struct rte_ether_hdr) + \
1529 					sizeof(struct rte_vlan_hdr) * 2 + \
1530 					sizeof(struct rte_ipv6_hdr)))
1531 #define MAX_TCP_OPTION_SIZE 40u
1532 #define MLX5_MAX_LRO_HEADER_FIX ((unsigned int)(MLX5_MAX_TCP_HDR_OFFSET + \
1533 				 sizeof(struct rte_tcp_hdr) + \
1534 				 MAX_TCP_OPTION_SIZE))
1535 
1536 /**
1537  * Adjust the maximum LRO message size.
1538  *
1539  * @param dev
1540  *   Pointer to Ethernet device.
1541  * @param max_lro_size
1542  *   The maximum size for LRO packet.
1543  */
1544 static void
1545 mlx5_max_lro_msg_size_adjust(struct rte_eth_dev *dev, uint32_t max_lro_size)
1546 {
1547 	struct mlx5_priv *priv = dev->data->dev_private;
1548 
1549 	if (priv->config.hca_attr.lro_max_msg_sz_mode ==
1550 	    MLX5_LRO_MAX_MSG_SIZE_START_FROM_L4 && max_lro_size >
1551 	    MLX5_MAX_TCP_HDR_OFFSET)
1552 		max_lro_size -= MLX5_MAX_TCP_HDR_OFFSET;
1553 	max_lro_size = RTE_MIN(max_lro_size, MLX5_MAX_LRO_SIZE);
1554 	assert(max_lro_size >= 256u);
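	/* The maximum LRO message size is kept in 256-byte units (see MLX5_MAX_LRO_SIZE). */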
1555 	max_lro_size /= 256u;
1556 	if (priv->max_lro_msg_size)
1557 		priv->max_lro_msg_size =
1558 			RTE_MIN((uint32_t)priv->max_lro_msg_size, max_lro_size);
1559 	else
1560 		priv->max_lro_msg_size = max_lro_size;
1561 }
1562 
1563 /**
1564  * Create a DPDK Rx queue.
1565  *
1566  * @param dev
1567  *   Pointer to Ethernet device.
1568  * @param idx
1569  *   RX queue index.
1570  * @param desc
1571  *   Number of descriptors to configure in queue.
1572  * @param socket
1573  *   NUMA socket on which memory must be allocated.
1574  *
1575  * @return
1576  *   A DPDK queue object on success, NULL otherwise and rte_errno is set.
1577  */
1578 struct mlx5_rxq_ctrl *
1579 mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1580 	     unsigned int socket, const struct rte_eth_rxconf *conf,
1581 	     struct rte_mempool *mp)
1582 {
1583 	struct mlx5_priv *priv = dev->data->dev_private;
1584 	struct mlx5_rxq_ctrl *tmpl;
1585 	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
1586 	unsigned int mprq_stride_size;
1587 	struct mlx5_dev_config *config = &priv->config;
1588 	unsigned int strd_headroom_en;
1589 	/*
1590 	 * Always allocate extra slots, even if eventually
1591 	 * the vector Rx will not be used.
1592 	 */
1593 	uint16_t desc_n =
1594 		desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
1595 	uint64_t offloads = conf->offloads |
1596 			   dev->data->dev_conf.rxmode.offloads;
1597 	unsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO);
1598 	const int mprq_en = mlx5_check_mprq_support(dev) > 0;
1599 	unsigned int max_rx_pkt_len = dev->data->dev_conf.rxmode.max_rx_pkt_len;
1600 	unsigned int non_scatter_min_mbuf_size = max_rx_pkt_len +
1601 							RTE_PKTMBUF_HEADROOM;
1602 	unsigned int max_lro_size = 0;
1603 	unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM;
1604 
1605 	if (non_scatter_min_mbuf_size > mb_len && !(offloads &
1606 						    DEV_RX_OFFLOAD_SCATTER)) {
1607 		DRV_LOG(ERR, "port %u Rx queue %u: Scatter offload is not"
1608 			" configured and not enough mbuf space (%u) to contain "
1609 			"the maximum RX packet length (%u) with head-room (%u)",
1610 			dev->data->port_id, idx, mb_len, max_rx_pkt_len,
1611 			RTE_PKTMBUF_HEADROOM);
1612 		rte_errno = ENOSPC;
1613 		return NULL;
1614 	}
1615 	tmpl = rte_calloc_socket("RXQ", 1,
1616 				 sizeof(*tmpl) +
1617 				 desc_n * sizeof(struct rte_mbuf *),
1618 				 0, socket);
1619 	if (!tmpl) {
1620 		rte_errno = ENOMEM;
1621 		return NULL;
1622 	}
1623 	if (mlx5_mr_btree_init(&tmpl->rxq.mr_ctrl.cache_bh,
1624 			       MLX5_MR_BTREE_CACHE_N, socket)) {
1625 		/* rte_errno is already set. */
1626 		goto error;
1627 	}
1628 	tmpl->socket = socket;
1629 	if (dev->data->dev_conf.intr_conf.rxq)
1630 		tmpl->irq = 1;
1631 	/*
1632 	 * An LRO packet may consume all the stride memory, hence we cannot
1633 	 * guarantee head-room near the packet memory in the stride.
1634 	 * In this case scatter is certainly enabled and an empty mbuf may be
1635 	 * added at the start for the head-room.
1636 	 */
1637 	if (lro_on_queue && RTE_PKTMBUF_HEADROOM > 0 &&
1638 	    non_scatter_min_mbuf_size > mb_len) {
1639 		strd_headroom_en = 0;
1640 		mprq_stride_size = RTE_MIN(max_rx_pkt_len,
1641 					1u << config->mprq.max_stride_size_n);
1642 	} else {
1643 		strd_headroom_en = 1;
1644 		mprq_stride_size = non_scatter_min_mbuf_size;
1645 	}
1646 	/*
1647 	 * This Rx queue can be configured as a Multi-Packet RQ if all of the
1648 	 * following conditions are met:
1649 	 *  - MPRQ is enabled.
1650 	 *  - The number of descs is more than the number of strides.
1651 	 *  - max_rx_pkt_len plus overhead is less than the max size of a
1652 	 *    stride.
1653 	 *  Otherwise, enable Rx scatter if necessary.
1654 	 */
1655 	if (mprq_en &&
1656 	    desc > (1U << config->mprq.stride_num_n) &&
1657 	    mprq_stride_size <= (1U << config->mprq.max_stride_size_n)) {
1658 		/* TODO: Rx scatter isn't supported yet. */
1659 		tmpl->rxq.sges_n = 0;
1660 		/* Trim the number of descs needed. */
1661 		desc >>= config->mprq.stride_num_n;
1662 		tmpl->rxq.strd_num_n = config->mprq.stride_num_n;
1663 		tmpl->rxq.strd_sz_n = RTE_MAX(log2above(mprq_stride_size),
1664 					      config->mprq.min_stride_size_n);
1665 		tmpl->rxq.strd_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT;
1666 		tmpl->rxq.strd_headroom_en = strd_headroom_en;
1667 		tmpl->rxq.mprq_max_memcpy_len = RTE_MIN(first_mb_free_size,
1668 				config->mprq.max_memcpy_len);
1669 		max_lro_size = RTE_MIN(max_rx_pkt_len,
1670 				       (1u << tmpl->rxq.strd_num_n) *
1671 				       (1u << tmpl->rxq.strd_sz_n));
1672 		DRV_LOG(DEBUG,
1673 			"port %u Rx queue %u: Multi-Packet RQ is enabled"
1674 			" strd_num_n = %u, strd_sz_n = %u",
1675 			dev->data->port_id, idx,
1676 			tmpl->rxq.strd_num_n, tmpl->rxq.strd_sz_n);
1677 	} else if (max_rx_pkt_len <= first_mb_free_size) {
1678 		tmpl->rxq.sges_n = 0;
1679 		max_lro_size = max_rx_pkt_len;
1680 	} else if (offloads & DEV_RX_OFFLOAD_SCATTER) {
1681 		unsigned int size = non_scatter_min_mbuf_size;
1682 		unsigned int sges_n;
1683 
1684 		if (lro_on_queue && first_mb_free_size <
1685 		    MLX5_MAX_LRO_HEADER_FIX) {
1686 			DRV_LOG(ERR, "Not enough space in the first segment(%u)"
1687 				" to include the max header size(%u) for LRO",
1688 				first_mb_free_size, MLX5_MAX_LRO_HEADER_FIX);
1689 			rte_errno = ENOTSUP;
1690 			goto error;
1691 		}
1692 		/*
1693 		 * Determine the number of SGEs needed for a full packet
1694 		 * and round it to the next power of two.
1695 		 */
1696 		sges_n = log2above((size / mb_len) + !!(size % mb_len));
1697 		if (sges_n > MLX5_MAX_LOG_RQ_SEGS) {
1698 			DRV_LOG(ERR,
1699 				"port %u too many SGEs (%u) needed to handle"
1700 				" requested maximum packet size %u, the maximum"
1701 				" supported are %u", dev->data->port_id,
1702 				1 << sges_n, max_rx_pkt_len,
1703 				1u << MLX5_MAX_LOG_RQ_SEGS);
1704 			rte_errno = ENOTSUP;
1705 			goto error;
1706 		}
1707 		tmpl->rxq.sges_n = sges_n;
1708 		max_lro_size = max_rx_pkt_len;
1709 	}
1710 	if (mprq_en && !mlx5_rxq_mprq_enabled(&tmpl->rxq))
1711 		DRV_LOG(WARNING,
1712 			"port %u MPRQ is requested but cannot be enabled"
1713 			" (requested: desc = %u, stride_sz = %u,"
1714 			" supported: min_stride_num = %u, max_stride_sz = %u).",
1715 			dev->data->port_id, desc, mprq_stride_size,
1716 			(1 << config->mprq.stride_num_n),
1717 			(1 << config->mprq.max_stride_size_n));
1718 	DRV_LOG(DEBUG, "port %u maximum number of segments per packet: %u",
1719 		dev->data->port_id, 1 << tmpl->rxq.sges_n);
1720 	if (desc % (1 << tmpl->rxq.sges_n)) {
1721 		DRV_LOG(ERR,
1722 			"port %u number of Rx queue descriptors (%u) is not a"
1723 			" multiple of SGEs per packet (%u)",
1724 			dev->data->port_id,
1725 			desc,
1726 			1 << tmpl->rxq.sges_n);
1727 		rte_errno = EINVAL;
1728 		goto error;
1729 	}
1730 	mlx5_max_lro_msg_size_adjust(dev, max_lro_size);
1731 	/* Toggle RX checksum offload if hardware supports it. */
1732 	tmpl->rxq.csum = !!(offloads & DEV_RX_OFFLOAD_CHECKSUM);
1733 	tmpl->rxq.hw_timestamp = !!(offloads & DEV_RX_OFFLOAD_TIMESTAMP);
1734 	/* Configure VLAN stripping. */
1735 	tmpl->rxq.vlan_strip = !!(offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1736 	/* By default, FCS (CRC) is stripped by hardware. */
1737 	tmpl->rxq.crc_present = 0;
1738 	tmpl->rxq.lro = lro_on_queue;
1739 	if (offloads & DEV_RX_OFFLOAD_KEEP_CRC) {
1740 		if (config->hw_fcs_strip) {
1741 			/*
1742 			 * RQs used for LRO-enabled TIRs should not be
1743 			 * configured to scatter the FCS.
1744 			 */
1745 			if (lro_on_queue)
1746 				DRV_LOG(WARNING,
1747 					"port %u CRC stripping has been "
1748 					"disabled but will still be performed "
1749 					"by hardware, because LRO is enabled",
1750 					dev->data->port_id);
1751 			else
1752 				tmpl->rxq.crc_present = 1;
1753 		} else {
1754 			DRV_LOG(WARNING,
1755 				"port %u CRC stripping has been disabled but will"
1756 				" still be performed by hardware, make sure MLNX_OFED"
1757 				" and firmware are up to date",
1758 				dev->data->port_id);
1759 		}
1760 	}
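	/*
	 * crc_present is a single flag bit, so the log below reports either
	 * 0 or 4 (crc_present << 2) bytes being subtracted per frame.
	 */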
1761 	DRV_LOG(DEBUG,
1762 		"port %u CRC stripping is %s, %u bytes will be subtracted from"
1763 		" incoming frames to hide it",
1764 		dev->data->port_id,
1765 		tmpl->rxq.crc_present ? "disabled" : "enabled",
1766 		tmpl->rxq.crc_present << 2);
1767 	tmpl->rxq.rss_hash = !!priv->rss_conf.rss_hf &&
1768 		(!!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS));
1769 	/* Save port ID. */
1770 	tmpl->rxq.port_id = dev->data->port_id;
1771 	tmpl->priv = priv;
1772 	tmpl->rxq.mp = mp;
1773 	tmpl->rxq.elts_n = log2above(desc);
1774 	tmpl->rxq.rq_repl_thresh =
1775 		MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);
1776 	tmpl->rxq.elts =
1777 		(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
1778 #ifndef RTE_ARCH_64
1779 	tmpl->rxq.uar_lock_cq = &priv->uar_lock_cq;
1780 #endif
1781 	tmpl->rxq.idx = idx;
1782 	rte_atomic32_inc(&tmpl->refcnt);
1783 	LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next);
1784 	return tmpl;
1785 error:
1786 	rte_free(tmpl);
1787 	return NULL;
1788 }
1789 
1790 /**
1791  * Get an Rx queue.
1792  *
1793  * @param dev
1794  *   Pointer to Ethernet device.
1795  * @param idx
1796  *   RX queue index.
1797  *
1798  * @return
1799  *   A pointer to the queue if it exists, NULL otherwise.
1800  */
1801 struct mlx5_rxq_ctrl *
1802 mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx)
1803 {
1804 	struct mlx5_priv *priv = dev->data->dev_private;
1805 	struct mlx5_rxq_ctrl *rxq_ctrl = NULL;
1806 
1807 	if ((*priv->rxqs)[idx]) {
1808 		rxq_ctrl = container_of((*priv->rxqs)[idx],
1809 					struct mlx5_rxq_ctrl,
1810 					rxq);
1811 		mlx5_rxq_obj_get(dev, idx);
1812 		rte_atomic32_inc(&rxq_ctrl->refcnt);
1813 	}
1814 	return rxq_ctrl;
1815 }
1816 
1817 /**
1818  * Release an Rx queue.
1819  *
1820  * @param dev
1821  *   Pointer to Ethernet device.
1822  * @param idx
1823  *   RX queue index.
1824  *
1825  * @return
1826  *   1 while a reference on it exists, 0 when freed.
1827  */
1828 int
1829 mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx)
1830 {
1831 	struct mlx5_priv *priv = dev->data->dev_private;
1832 	struct mlx5_rxq_ctrl *rxq_ctrl;
1833 
1834 	if (!(*priv->rxqs)[idx])
1835 		return 0;
1836 	rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
1837 	assert(rxq_ctrl->priv);
1838 	if (rxq_ctrl->obj && !mlx5_rxq_obj_release(rxq_ctrl->obj))
1839 		rxq_ctrl->obj = NULL;
1840 	if (rte_atomic32_dec_and_test(&rxq_ctrl->refcnt)) {
1841 		if (rxq_ctrl->dbr_umem_id_valid)
1842 			claim_zero(mlx5_release_dbr(dev, rxq_ctrl->dbr_umem_id,
1843 						    rxq_ctrl->dbr_offset));
1844 		mlx5_mr_btree_free(&rxq_ctrl->rxq.mr_ctrl.cache_bh);
1845 		LIST_REMOVE(rxq_ctrl, next);
1846 		rte_free(rxq_ctrl);
1847 		(*priv->rxqs)[idx] = NULL;
1848 		return 0;
1849 	}
1850 	return 1;
1851 }
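/*
 * Usage sketch (illustrative only, not part of the driver): every
 * mlx5_rxq_get() must be balanced by a matching mlx5_rxq_release() so
 * that the control structure is freed once the last reference is gone.
 *
 *	struct mlx5_rxq_ctrl *ctrl = mlx5_rxq_get(dev, idx);
 *
 *	if (ctrl != NULL) {
 *		(access ctrl->rxq as needed)
 *		mlx5_rxq_release(dev, idx);
 *	}
 */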
1852 
1853 /**
1854  * Verify the Rx Queue list is empty
1855  *
1856  * @param dev
1857  *   Pointer to Ethernet device.
1858  *
1859  * @return
1860  *   The number of objects not released.
1861  */
1862 int
1863 mlx5_rxq_verify(struct rte_eth_dev *dev)
1864 {
1865 	struct mlx5_priv *priv = dev->data->dev_private;
1866 	struct mlx5_rxq_ctrl *rxq_ctrl;
1867 	int ret = 0;
1868 
1869 	LIST_FOREACH(rxq_ctrl, &priv->rxqsctrl, next) {
1870 		DRV_LOG(DEBUG, "port %u Rx Queue %u still referenced",
1871 			dev->data->port_id, rxq_ctrl->rxq.idx);
1872 		++ret;
1873 	}
1874 	return ret;
1875 }
1876 
1877 /**
1878  * Create an indirection table.
1879  *
1880  * @param dev
1881  *   Pointer to Ethernet device.
1882  * @param queues
1883  *   Queues to include in the indirection table.
1884  * @param queues_n
1885  *   Number of queues in the array.
1886  *
1887  * @return
1888  *   The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
1889  */
1890 static struct mlx5_ind_table_obj *
1891 mlx5_ind_table_obj_new(struct rte_eth_dev *dev, const uint16_t *queues,
1892 		       uint32_t queues_n, enum mlx5_ind_tbl_type type)
1893 {
1894 	struct mlx5_priv *priv = dev->data->dev_private;
1895 	struct mlx5_ind_table_obj *ind_tbl;
1896 	unsigned int i = 0, j = 0, k = 0;
1897 
1898 	ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl) +
1899 			     queues_n * sizeof(uint16_t), 0);
1900 	if (!ind_tbl) {
1901 		rte_errno = ENOMEM;
1902 		return NULL;
1903 	}
1904 	ind_tbl->type = type;
1905 	if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) {
1906 		const unsigned int wq_n = rte_is_power_of_2(queues_n) ?
1907 			log2above(queues_n) :
1908 			log2above(priv->config.ind_table_max_size);
1909 		struct ibv_wq *wq[1 << wq_n];
1910 
1911 		for (i = 0; i != queues_n; ++i) {
1912 			struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev,
1913 								 queues[i]);
1914 			if (!rxq)
1915 				goto error;
1916 			wq[i] = rxq->obj->wq;
1917 			ind_tbl->queues[i] = queues[i];
1918 		}
1919 		ind_tbl->queues_n = queues_n;
1920 		/* Finalise indirection table. */
1921 		k = i; /* Retain value of i for use in error case. */
1922 		for (j = 0; k != (unsigned int)(1 << wq_n); ++k, ++j)
1923 			wq[k] = wq[j];
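		/*
		 * Example (illustrative): with 6 queues and
		 * ind_table_max_size = 512, wq_n = log2above(512) = 9 and
		 * the loop above pads the remaining 512 - 6 slots by copying
		 * the already-filled entries, so every slot references one
		 * of the 6 original WQs.
		 */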
1924 		ind_tbl->ind_table = mlx5_glue->create_rwq_ind_table
1925 			(priv->sh->ctx,
1926 			 &(struct ibv_rwq_ind_table_init_attr){
1927 				.log_ind_tbl_size = wq_n,
1928 				.ind_tbl = wq,
1929 				.comp_mask = 0,
1930 			});
1931 		if (!ind_tbl->ind_table) {
1932 			rte_errno = errno;
1933 			goto error;
1934 		}
1935 	} else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */
1936 		struct mlx5_devx_rqt_attr *rqt_attr = NULL;
1937 
1938 		rqt_attr = rte_calloc(__func__, 1, sizeof(*rqt_attr) +
1939 				      queues_n * sizeof(uint16_t), 0);
1940 		if (!rqt_attr) {
1941 			DRV_LOG(ERR, "port %u cannot allocate RQT resources",
1942 				dev->data->port_id);
1943 			rte_errno = ENOMEM;
1944 			goto error;
1945 		}
1946 		rqt_attr->rqt_max_size = priv->config.ind_table_max_size;
1947 		rqt_attr->rqt_actual_size = queues_n;
1948 		for (i = 0; i != queues_n; ++i) {
1949 			struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev,
1950 								 queues[i]);
1951 			if (!rxq)
1952 				goto error;
1953 			rqt_attr->rq_list[i] = rxq->obj->rq->id;
1954 			ind_tbl->queues[i] = queues[i];
1955 		}
1956 		ind_tbl->rqt = mlx5_devx_cmd_create_rqt(priv->sh->ctx,
1957 							rqt_attr);
1958 		rte_free(rqt_attr);
1959 		if (!ind_tbl->rqt) {
1960 			DRV_LOG(ERR, "port %u cannot create DevX RQT",
1961 				dev->data->port_id);
1962 			rte_errno = errno;
1963 			goto error;
1964 		}
1965 		ind_tbl->queues_n = queues_n;
1966 	}
1967 	rte_atomic32_inc(&ind_tbl->refcnt);
1968 	LIST_INSERT_HEAD(&priv->ind_tbls, ind_tbl, next);
1969 	return ind_tbl;
1970 error:
1971 	for (j = 0; j < i; j++)
1972 		mlx5_rxq_release(dev, ind_tbl->queues[j]);
1973 	rte_free(ind_tbl);
1974 	DEBUG("port %u cannot create indirection table", dev->data->port_id);
1975 	return NULL;
1976 }
1977 
1978 /**
1979  * Get an indirection table.
1980  *
1981  * @param dev
1982  *   Pointer to Ethernet device.
1983  * @param queues
1984  *   Queues entering in the indirection table.
1985  * @param queues_n
1986  *   Number of queues in the array.
1987  *
1988  * @return
1989  *   An indirection table if found.
1990  */
1991 static struct mlx5_ind_table_obj *
1992 mlx5_ind_table_obj_get(struct rte_eth_dev *dev, const uint16_t *queues,
1993 		       uint32_t queues_n)
1994 {
1995 	struct mlx5_priv *priv = dev->data->dev_private;
1996 	struct mlx5_ind_table_obj *ind_tbl;
1997 
1998 	LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
1999 		if ((ind_tbl->queues_n == queues_n) &&
2000 		    (memcmp(ind_tbl->queues, queues,
2001 			    ind_tbl->queues_n * sizeof(ind_tbl->queues[0]))
2002 		     == 0))
2003 			break;
2004 	}
2005 	if (ind_tbl) {
2006 		unsigned int i;
2007 
2008 		rte_atomic32_inc(&ind_tbl->refcnt);
2009 		for (i = 0; i != ind_tbl->queues_n; ++i)
2010 			mlx5_rxq_get(dev, ind_tbl->queues[i]);
2011 	}
2012 	return ind_tbl;
2013 }
2014 
2015 /**
2016  * Release an indirection table.
2017  *
2018  * @param dev
2019  *   Pointer to Ethernet device.
2020  * @param ind_table
2021  *   Indirection table to release.
2022  *
2023  * @return
2024  *   1 while a reference on it exists, 0 when freed.
2025  */
2026 static int
2027 mlx5_ind_table_obj_release(struct rte_eth_dev *dev,
2028 			   struct mlx5_ind_table_obj *ind_tbl)
2029 {
2030 	unsigned int i;
2031 
2032 	if (rte_atomic32_dec_and_test(&ind_tbl->refcnt)) {
2033 		if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV)
2034 			claim_zero(mlx5_glue->destroy_rwq_ind_table
2035 							(ind_tbl->ind_table));
2036 		else if (ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX)
2037 			claim_zero(mlx5_devx_cmd_destroy(ind_tbl->rqt));
2038 	}
2039 	for (i = 0; i != ind_tbl->queues_n; ++i)
2040 		claim_nonzero(mlx5_rxq_release(dev, ind_tbl->queues[i]));
2041 	if (!rte_atomic32_read(&ind_tbl->refcnt)) {
2042 		LIST_REMOVE(ind_tbl, next);
2043 		rte_free(ind_tbl);
2044 		return 0;
2045 	}
2046 	return 1;
2047 }
2048 
2049 /**
2050  * Verify the indirection table list is empty
2051  *
2052  * @param dev
2053  *   Pointer to Ethernet device.
2054  *
2055  * @return
2056  *   The number of objects not released.
2057  */
2058 int
2059 mlx5_ind_table_obj_verify(struct rte_eth_dev *dev)
2060 {
2061 	struct mlx5_priv *priv = dev->data->dev_private;
2062 	struct mlx5_ind_table_obj *ind_tbl;
2063 	int ret = 0;
2064 
2065 	LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
2066 		DRV_LOG(DEBUG,
2067 			"port %u indirection table obj %p still referenced",
2068 			dev->data->port_id, (void *)ind_tbl);
2069 		++ret;
2070 	}
2071 	return ret;
2072 }
2073 
2074 /**
2075  * Create an Rx Hash queue.
2076  *
2077  * @param dev
2078  *   Pointer to Ethernet device.
2079  * @param rss_key
2080  *   RSS key for the Rx hash queue.
2081  * @param rss_key_len
2082  *   RSS key length.
2083  * @param hash_fields
2084  *   Verbs protocol hash field to make the RSS on.
2085  * @param queues
2086  *   Queues to include in the hash queue. If hash_fields is empty, only
2087  *   the first queue index is used for the indirection table.
2088  * @param queues_n
2089  *   Number of queues.
2090  * @param tunnel
2091  *   Tunnel type.
2092  *
2093  * @return
2094  *   The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
2095  */
2096 struct mlx5_hrxq *
2097 mlx5_hrxq_new(struct rte_eth_dev *dev,
2098 	      const uint8_t *rss_key, uint32_t rss_key_len,
2099 	      uint64_t hash_fields,
2100 	      const uint16_t *queues, uint32_t queues_n,
2101 	      int tunnel __rte_unused)
2102 {
2103 	struct mlx5_priv *priv = dev->data->dev_private;
2104 	struct mlx5_hrxq *hrxq;
2105 	struct ibv_qp *qp = NULL;
2106 	struct mlx5_ind_table_obj *ind_tbl;
2107 	int err;
2108 	struct mlx5_devx_obj *tir = NULL;
2109 
2110 	queues_n = hash_fields ? queues_n : 1;
2111 	ind_tbl = mlx5_ind_table_obj_get(dev, queues, queues_n);
2112 	if (!ind_tbl) {
2113 		struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[queues[0]];
2114 		struct mlx5_rxq_ctrl *rxq_ctrl =
2115 			container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
2116 		enum mlx5_ind_tbl_type type;
2117 
2118 		type = rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV ?
2119 				MLX5_IND_TBL_TYPE_IBV : MLX5_IND_TBL_TYPE_DEVX;
2120 		ind_tbl = mlx5_ind_table_obj_new(dev, queues, queues_n, type);
2121 	}
2122 	if (!ind_tbl) {
2123 		rte_errno = ENOMEM;
2124 		return NULL;
2125 	}
2126 	if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) {
2127 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
2128 		struct mlx5dv_qp_init_attr qp_init_attr;
2129 
2130 		memset(&qp_init_attr, 0, sizeof(qp_init_attr));
2131 		if (tunnel) {
2132 			qp_init_attr.comp_mask =
2133 				MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
2134 			qp_init_attr.create_flags =
2135 				MLX5DV_QP_CREATE_TUNNEL_OFFLOADS;
2136 		}
2137 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
2138 		if (dev->data->dev_conf.lpbk_mode) {
2139 			/*
2140 			 * Allow packets sent from the NIC to loop back
2141 			 * without the source MAC check.
2142 			 */
2143 			qp_init_attr.comp_mask |=
2144 				MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
2145 			qp_init_attr.create_flags |=
2146 				MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC;
2147 		}
2148 #endif
2149 		qp = mlx5_glue->dv_create_qp
2150 			(priv->sh->ctx,
2151 			 &(struct ibv_qp_init_attr_ex){
2152 				.qp_type = IBV_QPT_RAW_PACKET,
2153 				.comp_mask =
2154 					IBV_QP_INIT_ATTR_PD |
2155 					IBV_QP_INIT_ATTR_IND_TABLE |
2156 					IBV_QP_INIT_ATTR_RX_HASH,
2157 				.rx_hash_conf = (struct ibv_rx_hash_conf){
2158 					.rx_hash_function =
2159 						IBV_RX_HASH_FUNC_TOEPLITZ,
2160 					.rx_hash_key_len = rss_key_len,
2161 					.rx_hash_key =
2162 						(void *)(uintptr_t)rss_key,
2163 					.rx_hash_fields_mask = hash_fields,
2164 				},
2165 				.rwq_ind_tbl = ind_tbl->ind_table,
2166 				.pd = priv->sh->pd,
2167 			  },
2168 			  &qp_init_attr);
2169 #else
2170 		qp = mlx5_glue->create_qp_ex
2171 			(priv->sh->ctx,
2172 			 &(struct ibv_qp_init_attr_ex){
2173 				.qp_type = IBV_QPT_RAW_PACKET,
2174 				.comp_mask =
2175 					IBV_QP_INIT_ATTR_PD |
2176 					IBV_QP_INIT_ATTR_IND_TABLE |
2177 					IBV_QP_INIT_ATTR_RX_HASH,
2178 				.rx_hash_conf = (struct ibv_rx_hash_conf){
2179 					.rx_hash_function =
2180 						IBV_RX_HASH_FUNC_TOEPLITZ,
2181 					.rx_hash_key_len = rss_key_len,
2182 					.rx_hash_key =
2183 						(void *)(uintptr_t)rss_key,
2184 					.rx_hash_fields_mask = hash_fields,
2185 				},
2186 				.rwq_ind_tbl = ind_tbl->ind_table,
2187 				.pd = priv->sh->pd,
2188 			 });
2189 #endif
2190 		if (!qp) {
2191 			rte_errno = errno;
2192 			goto error;
2193 		}
2194 	} else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */
2195 		struct mlx5_devx_tir_attr tir_attr;
2196 		uint32_t i;
2197 		uint32_t lro = 1;
2198 
2199 		/* Enable TIR LRO only if all the queues were configured for it. */
2200 		for (i = 0; i < queues_n; ++i) {
2201 			if (!(*priv->rxqs)[queues[i]]->lro) {
2202 				lro = 0;
2203 				break;
2204 			}
2205 		}
2206 		memset(&tir_attr, 0, sizeof(tir_attr));
2207 		tir_attr.disp_type = MLX5_TIRC_DISP_TYPE_INDIRECT;
2208 		tir_attr.rx_hash_fn = MLX5_RX_HASH_FN_TOEPLITZ;
2209 		memcpy(&tir_attr.rx_hash_field_selector_outer, &hash_fields,
2210 		       sizeof(uint64_t));
2211 		tir_attr.transport_domain = priv->sh->tdn;
2212 		memcpy(tir_attr.rx_hash_toeplitz_key, rss_key, rss_key_len);
2213 		tir_attr.indirect_table = ind_tbl->rqt->id;
2214 		if (dev->data->dev_conf.lpbk_mode)
2215 			tir_attr.self_lb_block =
2216 					MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST;
2217 		if (lro) {
2218 			tir_attr.lro_timeout_period_usecs =
2219 					priv->config.lro.timeout;
2220 			tir_attr.lro_max_msg_sz = priv->max_lro_msg_size;
2221 			tir_attr.lro_enable_mask =
2222 					MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
2223 					MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO;
2224 		}
2225 		tir = mlx5_devx_cmd_create_tir(priv->sh->ctx, &tir_attr);
2226 		if (!tir) {
2227 			DRV_LOG(ERR, "port %u cannot create DevX TIR",
2228 				dev->data->port_id);
2229 			rte_errno = errno;
2230 			goto error;
2231 		}
2232 	}
2233 	hrxq = rte_calloc(__func__, 1, sizeof(*hrxq) + rss_key_len, 0);
2234 	if (!hrxq)
2235 		goto error;
2236 	hrxq->ind_table = ind_tbl;
2237 	if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) {
2238 		hrxq->qp = qp;
2239 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
2240 		hrxq->action =
2241 			mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp);
2242 		if (!hrxq->action) {
2243 			rte_errno = errno;
2244 			goto error;
2245 		}
2246 #endif
2247 	} else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */
2248 		hrxq->tir = tir;
2249 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
2250 		hrxq->action = mlx5_glue->dv_create_flow_action_dest_devx_tir
2251 							(hrxq->tir->obj);
2252 		if (!hrxq->action) {
2253 			rte_errno = errno;
2254 			goto error;
2255 		}
2256 #endif
2257 	}
2258 	hrxq->rss_key_len = rss_key_len;
2259 	hrxq->hash_fields = hash_fields;
2260 	memcpy(hrxq->rss_key, rss_key, rss_key_len);
2261 	rte_atomic32_inc(&hrxq->refcnt);
2262 	LIST_INSERT_HEAD(&priv->hrxqs, hrxq, next);
2263 	return hrxq;
2264 error:
2265 	err = rte_errno; /* Save rte_errno before cleanup. */
2266 	mlx5_ind_table_obj_release(dev, ind_tbl);
2267 	if (qp)
2268 		claim_zero(mlx5_glue->destroy_qp(qp));
2269 	else if (tir)
2270 		claim_zero(mlx5_devx_cmd_destroy(tir));
2271 	rte_errno = err; /* Restore rte_errno. */
2272 	return NULL;
2273 }
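/*
 * Usage sketch (illustrative only; the queue list and hash fields below
 * are assumptions, not driver defaults): build a hash Rx queue over two
 * queues with the default RSS key, then drop the reference when done.
 *
 *	static const uint16_t queues[] = { 0, 1 };
 *	struct mlx5_hrxq *hrxq;
 *
 *	hrxq = mlx5_hrxq_new(dev, rss_hash_default_key,
 *			     MLX5_RSS_HASH_KEY_LEN,
 *			     IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4,
 *			     queues, RTE_DIM(queues), 0);
 *	if (hrxq != NULL)
 *		mlx5_hrxq_release(dev, hrxq);
 */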
2274 
2275 /**
2276  * Get an Rx Hash queue.
2277  *
2278  * @param dev
2279  *   Pointer to Ethernet device.
2280  * @param rss_conf
2281  *   RSS configuration for the Rx hash queue.
2282  * @param queues
2283  *   Queues to include in the hash queue. If hash_fields is empty, only
2284  *   the first queue index is used for the indirection table.
2285  * @param queues_n
2286  *   Number of queues.
2287  *
2288  * @return
2289  *   A hash Rx queue on success, NULL otherwise.
2290  */
2291 struct mlx5_hrxq *
2292 mlx5_hrxq_get(struct rte_eth_dev *dev,
2293 	      const uint8_t *rss_key, uint32_t rss_key_len,
2294 	      uint64_t hash_fields,
2295 	      const uint16_t *queues, uint32_t queues_n)
2296 {
2297 	struct mlx5_priv *priv = dev->data->dev_private;
2298 	struct mlx5_hrxq *hrxq;
2299 
2300 	queues_n = hash_fields ? queues_n : 1;
2301 	LIST_FOREACH(hrxq, &priv->hrxqs, next) {
2302 		struct mlx5_ind_table_obj *ind_tbl;
2303 
2304 		if (hrxq->rss_key_len != rss_key_len)
2305 			continue;
2306 		if (memcmp(hrxq->rss_key, rss_key, rss_key_len))
2307 			continue;
2308 		if (hrxq->hash_fields != hash_fields)
2309 			continue;
2310 		ind_tbl = mlx5_ind_table_obj_get(dev, queues, queues_n);
2311 		if (!ind_tbl)
2312 			continue;
2313 		if (ind_tbl != hrxq->ind_table) {
2314 			mlx5_ind_table_obj_release(dev, ind_tbl);
2315 			continue;
2316 		}
2317 		rte_atomic32_inc(&hrxq->refcnt);
2318 		return hrxq;
2319 	}
2320 	return NULL;
2321 }
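/*
 * Illustrative lookup-or-create pattern (a sketch, not a mandated API
 * contract): callers typically try mlx5_hrxq_get() first and fall back
 * to mlx5_hrxq_new() only when no matching hash Rx queue exists yet.
 *
 *	hrxq = mlx5_hrxq_get(dev, rss_key, rss_key_len, hash_fields,
 *			     queues, queues_n);
 *	if (hrxq == NULL)
 *		hrxq = mlx5_hrxq_new(dev, rss_key, rss_key_len, hash_fields,
 *				     queues, queues_n, tunnel);
 */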
2322 
2323 /**
2324  * Release the hash Rx queue.
2325  *
2326  * @param dev
2327  *   Pointer to Ethernet device.
2328  * @param hrxq
2329  *   Pointer to Hash Rx queue to release.
2330  *
2331  * @return
2332  *   1 while a reference on it exists, 0 when freed.
2333  */
2334 int
2335 mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hrxq)
2336 {
2337 	if (rte_atomic32_dec_and_test(&hrxq->refcnt)) {
2338 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
2339 		mlx5_glue->destroy_flow_action(hrxq->action);
2340 #endif
2341 		if (hrxq->ind_table->type == MLX5_IND_TBL_TYPE_IBV)
2342 			claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
2343 		else /* hrxq->ind_table->type == MLX5_IND_TBL_TYPE_DEVX */
2344 			claim_zero(mlx5_devx_cmd_destroy(hrxq->tir));
2345 		mlx5_ind_table_obj_release(dev, hrxq->ind_table);
2346 		LIST_REMOVE(hrxq, next);
2347 		rte_free(hrxq);
2348 		return 0;
2349 	}
2350 	claim_nonzero(mlx5_ind_table_obj_release(dev, hrxq->ind_table));
2351 	return 1;
2352 }
2353 
2354 /**
2355  * Verify the hash Rx queue list is empty
2356  *
2357  * @param dev
2358  *   Pointer to Ethernet device.
2359  *
2360  * @return
2361  *   The number of objects not released.
2362  */
2363 int
2364 mlx5_hrxq_verify(struct rte_eth_dev *dev)
2365 {
2366 	struct mlx5_priv *priv = dev->data->dev_private;
2367 	struct mlx5_hrxq *hrxq;
2368 	int ret = 0;
2369 
2370 	LIST_FOREACH(hrxq, &priv->hrxqs, next) {
2371 		DRV_LOG(DEBUG,
2372 			"port %u hash Rx queue %p still referenced",
2373 			dev->data->port_id, (void *)hrxq);
2374 		++ret;
2375 	}
2376 	return ret;
2377 }
2378 
2379 /**
2380  * Create a drop Rx queue Verbs/DevX object.
2381  *
2382  * @param dev
2383  *   Pointer to Ethernet device.
2384  *
2385  * @return
2386  *   The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
2387  */
2388 static struct mlx5_rxq_obj *
2389 mlx5_rxq_obj_drop_new(struct rte_eth_dev *dev)
2390 {
2391 	struct mlx5_priv *priv = dev->data->dev_private;
2392 	struct ibv_context *ctx = priv->sh->ctx;
2393 	struct ibv_cq *cq;
2394 	struct ibv_wq *wq = NULL;
2395 	struct mlx5_rxq_obj *rxq;
2396 
2397 	if (priv->drop_queue.rxq)
2398 		return priv->drop_queue.rxq;
2399 	cq = mlx5_glue->create_cq(ctx, 1, NULL, NULL, 0);
2400 	if (!cq) {
2401 		DEBUG("port %u cannot allocate CQ for drop queue",
2402 		      dev->data->port_id);
2403 		rte_errno = errno;
2404 		goto error;
2405 	}
2406 	wq = mlx5_glue->create_wq(ctx,
2407 		 &(struct ibv_wq_init_attr){
2408 			.wq_type = IBV_WQT_RQ,
2409 			.max_wr = 1,
2410 			.max_sge = 1,
2411 			.pd = priv->sh->pd,
2412 			.cq = cq,
2413 		 });
2414 	if (!wq) {
2415 		DEBUG("port %u cannot allocate WQ for drop queue",
2416 		      dev->data->port_id);
2417 		rte_errno = errno;
2418 		goto error;
2419 	}
2420 	rxq = rte_calloc(__func__, 1, sizeof(*rxq), 0);
2421 	if (!rxq) {
2422 		DEBUG("port %u cannot allocate drop Rx queue memory",
2423 		      dev->data->port_id);
2424 		rte_errno = ENOMEM;
2425 		goto error;
2426 	}
2427 	rxq->cq = cq;
2428 	rxq->wq = wq;
2429 	priv->drop_queue.rxq = rxq;
2430 	return rxq;
2431 error:
2432 	if (wq)
2433 		claim_zero(mlx5_glue->destroy_wq(wq));
2434 	if (cq)
2435 		claim_zero(mlx5_glue->destroy_cq(cq));
2436 	return NULL;
2437 }
2438 
2439 /**
2440  * Release a drop Rx queue Verbs/DevX object.
2441  *
2442  * @param dev
2443  *   Pointer to Ethernet device.
2447  */
2448 static void
2449 mlx5_rxq_obj_drop_release(struct rte_eth_dev *dev)
2450 {
2451 	struct mlx5_priv *priv = dev->data->dev_private;
2452 	struct mlx5_rxq_obj *rxq = priv->drop_queue.rxq;
2453 
2454 	if (rxq->wq)
2455 		claim_zero(mlx5_glue->destroy_wq(rxq->wq));
2456 	if (rxq->cq)
2457 		claim_zero(mlx5_glue->destroy_cq(rxq->cq));
2458 	rte_free(rxq);
2459 	priv->drop_queue.rxq = NULL;
2460 }
2461 
2462 /**
2463  * Create a drop indirection table.
2464  *
2465  * @param dev
2466  *   Pointer to Ethernet device.
2467  *
2468  * @return
2469  *   The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
2470  */
2471 static struct mlx5_ind_table_obj *
2472 mlx5_ind_table_obj_drop_new(struct rte_eth_dev *dev)
2473 {
2474 	struct mlx5_priv *priv = dev->data->dev_private;
2475 	struct mlx5_ind_table_obj *ind_tbl;
2476 	struct mlx5_rxq_obj *rxq;
2477 	struct mlx5_ind_table_obj tmpl;
2478 
2479 	rxq = mlx5_rxq_obj_drop_new(dev);
2480 	if (!rxq)
2481 		return NULL;
2482 	tmpl.ind_table = mlx5_glue->create_rwq_ind_table
2483 		(priv->sh->ctx,
2484 		 &(struct ibv_rwq_ind_table_init_attr){
2485 			.log_ind_tbl_size = 0,
2486 			.ind_tbl = &rxq->wq,
2487 			.comp_mask = 0,
2488 		 });
2489 	if (!tmpl.ind_table) {
2490 		DEBUG("port %u cannot allocate indirection table for drop"
2491 		      " queue",
2492 		      dev->data->port_id);
2493 		rte_errno = errno;
2494 		goto error;
2495 	}
2496 	ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl), 0);
2497 	if (!ind_tbl) {
2498 		rte_errno = ENOMEM;
2499 		goto error;
2500 	}
2501 	ind_tbl->ind_table = tmpl.ind_table;
2502 	return ind_tbl;
2503 error:
2504 	mlx5_rxq_obj_drop_release(dev);
2505 	return NULL;
2506 }
2507 
2508 /**
2509  * Release a drop indirection table.
2510  *
2511  * @param dev
2512  *   Pointer to Ethernet device.
2513  */
2514 static void
2515 mlx5_ind_table_obj_drop_release(struct rte_eth_dev *dev)
2516 {
2517 	struct mlx5_priv *priv = dev->data->dev_private;
2518 	struct mlx5_ind_table_obj *ind_tbl = priv->drop_queue.hrxq->ind_table;
2519 
2520 	claim_zero(mlx5_glue->destroy_rwq_ind_table(ind_tbl->ind_table));
2521 	mlx5_rxq_obj_drop_release(dev);
2522 	rte_free(ind_tbl);
2523 	priv->drop_queue.hrxq->ind_table = NULL;
2524 }
2525 
2526 /**
2527  * Create a drop Rx Hash queue.
2528  *
2529  * @param dev
2530  *   Pointer to Ethernet device.
2531  *
2532  * @return
2533  *   The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
2534  */
2535 struct mlx5_hrxq *
2536 mlx5_hrxq_drop_new(struct rte_eth_dev *dev)
2537 {
2538 	struct mlx5_priv *priv = dev->data->dev_private;
2539 	struct mlx5_ind_table_obj *ind_tbl;
2540 	struct ibv_qp *qp;
2541 	struct mlx5_hrxq *hrxq;
2542 
2543 	if (priv->drop_queue.hrxq) {
2544 		rte_atomic32_inc(&priv->drop_queue.hrxq->refcnt);
2545 		return priv->drop_queue.hrxq;
2546 	}
2547 	ind_tbl = mlx5_ind_table_obj_drop_new(dev);
2548 	if (!ind_tbl)
2549 		return NULL;
2550 	qp = mlx5_glue->create_qp_ex(priv->sh->ctx,
2551 		 &(struct ibv_qp_init_attr_ex){
2552 			.qp_type = IBV_QPT_RAW_PACKET,
2553 			.comp_mask =
2554 				IBV_QP_INIT_ATTR_PD |
2555 				IBV_QP_INIT_ATTR_IND_TABLE |
2556 				IBV_QP_INIT_ATTR_RX_HASH,
2557 			.rx_hash_conf = (struct ibv_rx_hash_conf){
2558 				.rx_hash_function =
2559 					IBV_RX_HASH_FUNC_TOEPLITZ,
2560 				.rx_hash_key_len = MLX5_RSS_HASH_KEY_LEN,
2561 				.rx_hash_key = rss_hash_default_key,
2562 				.rx_hash_fields_mask = 0,
2563 				},
2564 			.rwq_ind_tbl = ind_tbl->ind_table,
2565 			.pd = priv->sh->pd
2566 		 });
2567 	if (!qp) {
2568 		DEBUG("port %u cannot allocate QP for drop queue",
2569 		      dev->data->port_id);
2570 		rte_errno = errno;
2571 		goto error;
2572 	}
2573 	hrxq = rte_calloc(__func__, 1, sizeof(*hrxq), 0);
2574 	if (!hrxq) {
2575 		DRV_LOG(WARNING,
2576 			"port %u cannot allocate memory for drop queue",
2577 			dev->data->port_id);
2578 		rte_errno = ENOMEM;
2579 		goto error;
2580 	}
2581 	hrxq->ind_table = ind_tbl;
2582 	hrxq->qp = qp;
2583 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
2584 	hrxq->action = mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp);
2585 	if (!hrxq->action) {
2586 		rte_errno = errno;
2587 		goto error;
2588 	}
2589 #endif
2590 	priv->drop_queue.hrxq = hrxq;
2591 	rte_atomic32_set(&hrxq->refcnt, 1);
2592 	return hrxq;
2593 error:
2594 	if (ind_tbl)
2595 		mlx5_ind_table_obj_drop_release(dev);
2596 	return NULL;
2597 }
2598 
2599 /**
2600  * Release a drop hash Rx queue.
2601  *
2602  * @param dev
2603  *   Pointer to Ethernet device.
2604  */
2605 void
2606 mlx5_hrxq_drop_release(struct rte_eth_dev *dev)
2607 {
2608 	struct mlx5_priv *priv = dev->data->dev_private;
2609 	struct mlx5_hrxq *hrxq = priv->drop_queue.hrxq;
2610 
2611 	if (rte_atomic32_dec_and_test(&hrxq->refcnt)) {
2612 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
2613 		mlx5_glue->destroy_flow_action(hrxq->action);
2614 #endif
2615 		claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
2616 		mlx5_ind_table_obj_drop_release(dev);
2617 		rte_free(hrxq);
2618 		priv->drop_queue.hrxq = NULL;
2619 	}
2620 }
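/*
 * Usage sketch (illustrative only): flow rules that drop traffic share a
 * single reference-counted drop hash Rx queue.
 *
 *	struct mlx5_hrxq *drop = mlx5_hrxq_drop_new(dev);
 *
 *	if (drop != NULL) {
 *		(attach drop->action or drop->qp to the flow rule)
 *		mlx5_hrxq_drop_release(dev);
 *	}
 */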
2621