xref: /dpdk/drivers/net/mlx5/mlx5_rxq.c (revision 0857b942113874c69dc3db5df11a828ee3cc9b6b)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <stddef.h>
35 #include <assert.h>
36 #include <errno.h>
37 #include <string.h>
38 #include <stdint.h>
39 #include <fcntl.h>
40 
41 /* Verbs header. */
42 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
43 #ifdef PEDANTIC
44 #pragma GCC diagnostic ignored "-Wpedantic"
45 #endif
46 #include <infiniband/verbs.h>
47 #include <infiniband/arch.h>
48 #include <infiniband/mlx5_hw.h>
49 #ifdef PEDANTIC
50 #pragma GCC diagnostic error "-Wpedantic"
51 #endif
52 
53 /* DPDK headers don't like -pedantic. */
54 #ifdef PEDANTIC
55 #pragma GCC diagnostic ignored "-Wpedantic"
56 #endif
57 #include <rte_mbuf.h>
58 #include <rte_malloc.h>
59 #include <rte_ethdev.h>
60 #include <rte_common.h>
61 #include <rte_interrupts.h>
62 #ifdef PEDANTIC
63 #pragma GCC diagnostic error "-Wpedantic"
64 #endif
65 
66 #include "mlx5.h"
67 #include "mlx5_rxtx.h"
68 #include "mlx5_utils.h"
69 #include "mlx5_autoconf.h"
70 #include "mlx5_defs.h"
71 
72 /* Initialization data for hash RX queues. */
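/* Each entry references its underlayer so that flow specifications can be
 * chained through the underlayer pointers, e.g. TCPv4 -> IPv4 -> ETH. */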
73 const struct hash_rxq_init hash_rxq_init[] = {
74 	[HASH_RXQ_TCPV4] = {
75 		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
76 				IBV_EXP_RX_HASH_DST_IPV4 |
77 				IBV_EXP_RX_HASH_SRC_PORT_TCP |
78 				IBV_EXP_RX_HASH_DST_PORT_TCP),
79 		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
80 		.flow_priority = 0,
81 		.flow_spec.tcp_udp = {
82 			.type = IBV_EXP_FLOW_SPEC_TCP,
83 			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
84 		},
85 		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
86 	},
87 	[HASH_RXQ_UDPV4] = {
88 		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
89 				IBV_EXP_RX_HASH_DST_IPV4 |
90 				IBV_EXP_RX_HASH_SRC_PORT_UDP |
91 				IBV_EXP_RX_HASH_DST_PORT_UDP),
92 		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
93 		.flow_priority = 0,
94 		.flow_spec.tcp_udp = {
95 			.type = IBV_EXP_FLOW_SPEC_UDP,
96 			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
97 		},
98 		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
99 	},
100 	[HASH_RXQ_IPV4] = {
101 		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
102 				IBV_EXP_RX_HASH_DST_IPV4),
103 		.dpdk_rss_hf = (ETH_RSS_IPV4 |
104 				ETH_RSS_FRAG_IPV4),
105 		.flow_priority = 1,
106 		.flow_spec.ipv4 = {
107 			.type = IBV_EXP_FLOW_SPEC_IPV4,
108 			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
109 		},
110 		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
111 	},
112 	[HASH_RXQ_TCPV6] = {
113 		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
114 				IBV_EXP_RX_HASH_DST_IPV6 |
115 				IBV_EXP_RX_HASH_SRC_PORT_TCP |
116 				IBV_EXP_RX_HASH_DST_PORT_TCP),
117 		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
118 		.flow_priority = 0,
119 		.flow_spec.tcp_udp = {
120 			.type = IBV_EXP_FLOW_SPEC_TCP,
121 			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
122 		},
123 		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
124 	},
125 	[HASH_RXQ_UDPV6] = {
126 		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
127 				IBV_EXP_RX_HASH_DST_IPV6 |
128 				IBV_EXP_RX_HASH_SRC_PORT_UDP |
129 				IBV_EXP_RX_HASH_DST_PORT_UDP),
130 		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
131 		.flow_priority = 0,
132 		.flow_spec.tcp_udp = {
133 			.type = IBV_EXP_FLOW_SPEC_UDP,
134 			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
135 		},
136 		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
137 	},
138 	[HASH_RXQ_IPV6] = {
139 		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
140 				IBV_EXP_RX_HASH_DST_IPV6),
141 		.dpdk_rss_hf = (ETH_RSS_IPV6 |
142 				ETH_RSS_FRAG_IPV6),
143 		.flow_priority = 1,
144 		.flow_spec.ipv6 = {
145 			.type = IBV_EXP_FLOW_SPEC_IPV6,
146 			.size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
147 		},
148 		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
149 	},
150 	[HASH_RXQ_ETH] = {
151 		.hash_fields = 0,
152 		.dpdk_rss_hf = 0,
153 		.flow_priority = 2,
154 		.flow_spec.eth = {
155 			.type = IBV_EXP_FLOW_SPEC_ETH,
156 			.size = sizeof(hash_rxq_init[0].flow_spec.eth),
157 		},
158 		.underlayer = NULL,
159 	},
160 };
161 
162 /* Number of entries in hash_rxq_init[]. */
163 const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
164 
165 /* Initialization data for hash RX queue indirection tables. */
166 static const struct ind_table_init ind_table_init[] = {
167 	{
168 		.max_size = -1u, /* Superseded by HW limitations. */
169 		.hash_types =
170 			1 << HASH_RXQ_TCPV4 |
171 			1 << HASH_RXQ_UDPV4 |
172 			1 << HASH_RXQ_IPV4 |
173 			1 << HASH_RXQ_TCPV6 |
174 			1 << HASH_RXQ_UDPV6 |
175 			1 << HASH_RXQ_IPV6 |
176 			0,
177 		.hash_types_n = 6,
178 	},
179 	{
180 		.max_size = 1,
181 		.hash_types = 1 << HASH_RXQ_ETH,
182 		.hash_types_n = 1,
183 	},
184 };
185 
186 #define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
187 
188 /* Default RSS hash key also used for ConnectX-3. */
189 uint8_t rss_hash_default_key[] = {
190 	0x2c, 0xc6, 0x81, 0xd1,
191 	0x5b, 0xdb, 0xf4, 0xf7,
192 	0xfc, 0xa2, 0x83, 0x19,
193 	0xdb, 0x1a, 0x3e, 0x94,
194 	0x6b, 0x9e, 0x38, 0xd9,
195 	0x2c, 0x9c, 0x03, 0xd1,
196 	0xad, 0x99, 0x44, 0xa7,
197 	0xd9, 0x56, 0x3d, 0x59,
198 	0x06, 0x3c, 0x25, 0xf3,
199 	0xfc, 0x1f, 0xdc, 0x2a,
200 };
201 
202 /* Length of the default RSS hash key. */
203 const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
204 
205 /**
206  * Populate flow steering rule for a given hash RX queue type using
207  * information from hash_rxq_init[]. Nothing is written to flow_attr when
208  * flow_attr_size is not large enough, but the required size is still returned.
209  *
210  * @param priv
211  *   Pointer to private structure.
212  * @param[out] flow_attr
213  *   Pointer to flow attribute structure to fill. Note that the allocated
214  *   area must be large enough to hold the structure and all trailing flow specifications.
215  * @param flow_attr_size
216  *   Entire size of flow_attr and trailing room for flow specifications.
217  * @param type
218  *   Hash RX queue type to use for flow steering rule.
219  *
220  * @return
221  *   Total size of the flow attribute buffer. No errors are defined.
222  */
223 size_t
224 priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
225 	       size_t flow_attr_size, enum hash_rxq_type type)
226 {
227 	size_t offset = sizeof(*flow_attr);
228 	const struct hash_rxq_init *init = &hash_rxq_init[type];
229 
230 	assert(priv != NULL);
231 	assert((size_t)type < RTE_DIM(hash_rxq_init));
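	/* Add the size of one flow specification per protocol layer along the
	 * underlayer chain to get the total buffer size needed. */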
232 	do {
233 		offset += init->flow_spec.hdr.size;
234 		init = init->underlayer;
235 	} while (init != NULL);
236 	if (offset > flow_attr_size)
237 		return offset;
238 	flow_attr_size = offset;
239 	init = &hash_rxq_init[type];
240 	*flow_attr = (struct ibv_exp_flow_attr){
241 		.type = IBV_EXP_FLOW_ATTR_NORMAL,
242 		/* Priorities < 3 are reserved for flow director. */
243 		.priority = init->flow_priority + 3,
244 		.num_of_specs = 0,
245 		.port = priv->port,
246 		.flags = 0,
247 	};
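	/* Copy specifications from the requested type down its underlayer
	 * chain; decreasing offsets lay them out with the outermost spec
	 * (e.g. ETH) right after the attribute header. */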
248 	do {
249 		offset -= init->flow_spec.hdr.size;
250 		memcpy((void *)((uintptr_t)flow_attr + offset),
251 		       &init->flow_spec,
252 		       init->flow_spec.hdr.size);
253 		++flow_attr->num_of_specs;
254 		init = init->underlayer;
255 	} while (init != NULL);
256 	return flow_attr_size;
257 }
258 
259 /**
260  * Convert hash type position in indirection table initializer to
261  * hash RX queue type.
262  *
263  * @param table
264  *   Indirection table initializer.
265  * @param pos
266  *   Hash type position.
267  *
268  * @return
269  *   Hash RX queue type.
270  */
271 static enum hash_rxq_type
272 hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
273 {
274 	enum hash_rxq_type type = HASH_RXQ_TCPV4;
275 
276 	assert(pos < table->hash_types_n);
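	/* Walk hash types in order, counting only those enabled in this
	 * table, and stop at the requested position. */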
277 	do {
278 		if ((table->hash_types & (1 << type)) && (pos-- == 0))
279 			break;
280 		++type;
281 	} while (1);
282 	return type;
283 }
284 
285 /**
286  * Filter out disabled hash RX queue types from ind_table_init[].
287  *
288  * @param priv
289  *   Pointer to private structure.
290  * @param[out] table
291  *   Output table.
292  *
293  * @return
294  *   Number of table entries.
295  */
296 static unsigned int
297 priv_make_ind_table_init(struct priv *priv,
298 			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
299 {
300 	uint64_t rss_hf;
301 	unsigned int i;
302 	unsigned int j;
303 	unsigned int table_n = 0;
304 	/* Mandatory to receive frames not handled by normal hash RX queues. */
305 	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;
306 
307 	rss_hf = priv->rss_hf;
308 	/* Process other protocols only if more than one queue. */
309 	if (priv->rxqs_n > 1)
310 		for (i = 0; (i != hash_rxq_init_n); ++i)
311 			if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
312 				hash_types_sup |= (1 << i);
313 
314 	/* Filter out entries whose protocols are not in the set. */
315 	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
316 		unsigned int nb;
317 		unsigned int h;
318 
319 		/* j is increased only if the table has valid protocols. */
320 		assert(j <= i);
321 		(*table)[j] = ind_table_init[i];
322 		(*table)[j].hash_types &= hash_types_sup;
323 		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
324 			if (((*table)[j].hash_types >> h) & 0x1)
325 				++nb;
326 		(*table)[j].hash_types_n = nb;
327 		if (nb) {
328 			++table_n;
329 			++j;
330 		}
331 	}
332 	return table_n;
333 }
334 
335 /**
336  * Initialize hash RX queues and indirection table.
337  *
338  * @param priv
339  *   Pointer to private structure.
340  *
341  * @return
342  *   0 on success, errno value on failure.
343  */
344 int
345 priv_create_hash_rxqs(struct priv *priv)
346 {
347 	struct ibv_exp_wq *wqs[priv->reta_idx_n];
348 	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
349 	unsigned int ind_tables_n =
350 		priv_make_ind_table_init(priv, &ind_table_init);
351 	unsigned int hash_rxqs_n = 0;
352 	struct hash_rxq (*hash_rxqs)[] = NULL;
353 	struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
354 	unsigned int i;
355 	unsigned int j;
356 	unsigned int k;
357 	int err = 0;
358 
359 	assert(priv->ind_tables == NULL);
360 	assert(priv->ind_tables_n == 0);
361 	assert(priv->hash_rxqs == NULL);
362 	assert(priv->hash_rxqs_n == 0);
363 	assert(priv->pd != NULL);
364 	assert(priv->ctx != NULL);
365 	if (priv->rxqs_n == 0)
366 		return EINVAL;
367 	assert(priv->rxqs != NULL);
368 	if (ind_tables_n == 0) {
369 		ERROR("all hash RX queue types have been filtered out,"
370 		      " indirection table cannot be created");
371 		return EINVAL;
372 	}
373 	if (priv->rxqs_n & (priv->rxqs_n - 1)) {
374 		INFO("%u RX queues are configured, consider rounding this"
375 		     " number to the next power of two for better balancing",
376 		     priv->rxqs_n);
377 		DEBUG("indirection table extended to assume %u WQs",
378 		      priv->reta_idx_n);
379 	}
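	/* Collect the WQ of every RX queue referenced by the RSS indirection
	 * (RETA) entries. */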
380 	for (i = 0; (i != priv->reta_idx_n); ++i) {
381 		struct rxq_ctrl *rxq_ctrl;
382 
383 		rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
384 					struct rxq_ctrl, rxq);
385 		wqs[i] = rxq_ctrl->wq;
386 	}
387 	/* Get number of hash RX queues to configure. */
388 	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
389 		hash_rxqs_n += ind_table_init[i].hash_types_n;
390 	DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
391 	      hash_rxqs_n, priv->rxqs_n, ind_tables_n);
392 	/* Create indirection tables. */
393 	ind_tables = rte_calloc(__func__, ind_tables_n,
394 				sizeof((*ind_tables)[0]), 0);
395 	if (ind_tables == NULL) {
396 		err = ENOMEM;
397 		ERROR("cannot allocate indirection tables container: %s",
398 		      strerror(err));
399 		goto error;
400 	}
401 	for (i = 0; (i != ind_tables_n); ++i) {
402 		struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
403 			.pd = priv->pd,
404 			.log_ind_tbl_size = 0, /* Set below. */
405 			.ind_tbl = wqs,
406 			.comp_mask = 0,
407 		};
408 		unsigned int ind_tbl_size = ind_table_init[i].max_size;
409 		struct ibv_exp_rwq_ind_table *ind_table;
410 
411 		if (priv->reta_idx_n < ind_tbl_size)
412 			ind_tbl_size = priv->reta_idx_n;
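		/* Verbs expects the log2 of the table size; log2above() rounds
		 * non-power-of-two sizes up. */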
413 		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
414 		errno = 0;
415 		ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
416 							 &ind_init_attr);
417 		if (ind_table != NULL) {
418 			(*ind_tables)[i] = ind_table;
419 			continue;
420 		}
421 		/* Not clear whether errno is set. */
422 		err = (errno ? errno : EINVAL);
423 		ERROR("RX indirection table creation failed with error %d: %s",
424 		      err, strerror(err));
425 		goto error;
426 	}
427 	/* Allocate array that holds hash RX queues and related data. */
428 	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
429 			       sizeof((*hash_rxqs)[0]), 0);
430 	if (hash_rxqs == NULL) {
431 		err = ENOMEM;
432 		ERROR("cannot allocate hash RX queues container: %s",
433 		      strerror(err));
434 		goto error;
435 	}
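	/* i indexes hash RX queues, j the current indirection table and k the
	 * hash type position within that table. */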
436 	for (i = 0, j = 0, k = 0;
437 	     ((i != hash_rxqs_n) && (j != ind_tables_n));
438 	     ++i) {
439 		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
440 		enum hash_rxq_type type =
441 			hash_rxq_type_from_pos(&ind_table_init[j], k);
442 		struct rte_eth_rss_conf *priv_rss_conf =
443 			(*priv->rss_conf)[type];
444 		struct ibv_exp_rx_hash_conf hash_conf = {
445 			.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
446 			.rx_hash_key_len = (priv_rss_conf ?
447 					    priv_rss_conf->rss_key_len :
448 					    rss_hash_default_key_len),
449 			.rx_hash_key = (priv_rss_conf ?
450 					priv_rss_conf->rss_key :
451 					rss_hash_default_key),
452 			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
453 			.rwq_ind_tbl = (*ind_tables)[j],
454 		};
455 		struct ibv_exp_qp_init_attr qp_init_attr = {
456 			.max_inl_recv = 0, /* Currently not supported. */
457 			.qp_type = IBV_QPT_RAW_PACKET,
458 			.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
459 				      IBV_EXP_QP_INIT_ATTR_RX_HASH),
460 			.pd = priv->pd,
461 			.rx_hash_conf = &hash_conf,
462 			.port_num = priv->port,
463 		};
464 
465 		DEBUG("using indirection table %u for hash RX queue %u type %d",
466 		      j, i, type);
467 		*hash_rxq = (struct hash_rxq){
468 			.priv = priv,
469 			.qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
470 			.type = type,
471 		};
472 		if (hash_rxq->qp == NULL) {
473 			err = (errno ? errno : EINVAL);
474 			ERROR("Hash RX QP creation failure: %s",
475 			      strerror(err));
476 			goto error;
477 		}
478 		if (++k < ind_table_init[j].hash_types_n)
479 			continue;
480 		/* Switch to the next indirection table and reset hash RX
481 		 * queue type array index. */
482 		++j;
483 		k = 0;
484 	}
485 	priv->ind_tables = ind_tables;
486 	priv->ind_tables_n = ind_tables_n;
487 	priv->hash_rxqs = hash_rxqs;
488 	priv->hash_rxqs_n = hash_rxqs_n;
489 	assert(err == 0);
490 	return 0;
491 error:
492 	if (hash_rxqs != NULL) {
493 		for (i = 0; (i != hash_rxqs_n); ++i) {
494 			struct ibv_qp *qp = (*hash_rxqs)[i].qp;
495 
496 			if (qp == NULL)
497 				continue;
498 			claim_zero(ibv_destroy_qp(qp));
499 		}
500 		rte_free(hash_rxqs);
501 	}
502 	if (ind_tables != NULL) {
503 		for (j = 0; (j != ind_tables_n); ++j) {
504 			struct ibv_exp_rwq_ind_table *ind_table =
505 				(*ind_tables)[j];
506 
507 			if (ind_table == NULL)
508 				continue;
509 			claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
510 		}
511 		rte_free(ind_tables);
512 	}
513 	return err;
514 }
515 
516 /**
517  * Clean up hash RX queues and indirection table.
518  *
519  * @param priv
520  *   Pointer to private structure.
521  */
522 void
523 priv_destroy_hash_rxqs(struct priv *priv)
524 {
525 	unsigned int i;
526 
527 	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
528 	if (priv->hash_rxqs_n == 0) {
529 		assert(priv->hash_rxqs == NULL);
530 		assert(priv->ind_tables == NULL);
531 		return;
532 	}
533 	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
534 		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
535 		unsigned int j, k;
536 
537 		assert(hash_rxq->priv == priv);
538 		assert(hash_rxq->qp != NULL);
539 		/* Also check that there are no remaining flows. */
540 		for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
541 			for (k = 0;
542 			     (k != RTE_DIM(hash_rxq->special_flow[j]));
543 			     ++k)
544 				assert(hash_rxq->special_flow[j][k] == NULL);
545 		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
546 			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
547 				assert(hash_rxq->mac_flow[j][k] == NULL);
548 		claim_zero(ibv_destroy_qp(hash_rxq->qp));
549 	}
550 	priv->hash_rxqs_n = 0;
551 	rte_free(priv->hash_rxqs);
552 	priv->hash_rxqs = NULL;
553 	for (i = 0; (i != priv->ind_tables_n); ++i) {
554 		struct ibv_exp_rwq_ind_table *ind_table =
555 			(*priv->ind_tables)[i];
556 
557 		assert(ind_table != NULL);
558 		claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
559 	}
560 	priv->ind_tables_n = 0;
561 	rte_free(priv->ind_tables);
562 	priv->ind_tables = NULL;
563 }
564 
565 /**
566  * Check whether a given flow type is allowed.
567  *
568  * @param priv
569  *   Pointer to private structure.
570  * @param type
571  *   Flow type to check.
572  *
573  * @return
574  *   Nonzero if the given flow type is allowed.
575  */
576 int
577 priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
578 {
579 	/* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
580 	 * has been requested. */
581 	if (priv->promisc_req)
582 		return type == HASH_RXQ_FLOW_TYPE_PROMISC;
583 	switch (type) {
584 	case HASH_RXQ_FLOW_TYPE_PROMISC:
585 		return !!priv->promisc_req;
586 	case HASH_RXQ_FLOW_TYPE_ALLMULTI:
587 		return !!priv->allmulti_req;
588 	case HASH_RXQ_FLOW_TYPE_BROADCAST:
589 	case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
590 		/* If allmulti is enabled, broadcast and ipv6multi
591 		 * are unnecessary. */
592 		return !priv->allmulti_req;
593 	case HASH_RXQ_FLOW_TYPE_MAC:
594 		return 1;
595 	default:
596 		/* Unsupported flow type is not allowed. */
597 		return 0;
598 	}
599 	return 0;
600 }
601 
602 /**
603  * Automatically enable/disable flows according to configuration.
604  *
605  * @param priv
606  *   Private structure.
607  *
608  * @return
609  *   0 on success, errno value on failure.
610  */
611 int
612 priv_rehash_flows(struct priv *priv)
613 {
614 	enum hash_rxq_flow_type i;
615 
616 	for (i = HASH_RXQ_FLOW_TYPE_PROMISC;
617 			i != RTE_DIM((*priv->hash_rxqs)[0].special_flow);
618 			++i)
619 		if (!priv_allow_flow_type(priv, i)) {
620 			priv_special_flow_disable(priv, i);
621 		} else {
622 			int ret = priv_special_flow_enable(priv, i);
623 
624 			if (ret)
625 				return ret;
626 		}
627 	if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
628 		return priv_mac_addrs_enable(priv);
629 	priv_mac_addrs_disable(priv);
630 	return 0;
631 }
632 
633 /**
634  * Allocate RX queue elements.
635  *
636  * @param rxq_ctrl
637  *   Pointer to RX queue structure.
638  * @param elts_n
639  *   Number of elements to allocate.
640  * @param[in] pool
641  *   If not NULL, fetch buffers from this array instead of allocating them
642  *   with rte_pktmbuf_alloc().
643  *
644  * @return
645  *   0 on success, errno value on failure.
646  */
647 static int
648 rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
649 	       struct rte_mbuf *(*pool)[])
650 {
651 	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
652 	unsigned int i;
653 	int ret = 0;
654 
655 	/* Iterate on segments. */
656 	for (i = 0; (i != elts_n); ++i) {
657 		struct rte_mbuf *buf;
658 		volatile struct mlx5_wqe_data_seg *scat =
659 			&(*rxq_ctrl->rxq.wqes)[i];
660 
661 		if (pool != NULL) {
662 			buf = (*pool)[i];
663 			assert(buf != NULL);
664 			rte_pktmbuf_reset(buf);
665 			rte_pktmbuf_refcnt_update(buf, 1);
666 		} else
667 			buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
668 		if (buf == NULL) {
669 			assert(pool == NULL);
670 			ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
671 			ret = ENOMEM;
672 			goto error;
673 		}
674 		/* Headroom is reserved by rte_pktmbuf_alloc(). */
675 		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
676 		/* Buffer is supposed to be empty. */
677 		assert(rte_pktmbuf_data_len(buf) == 0);
678 		assert(rte_pktmbuf_pkt_len(buf) == 0);
679 		assert(!buf->next);
680 		/* Only the first segment keeps headroom. */
681 		if (i % sges_n)
682 			SET_DATA_OFF(buf, 0);
683 		PORT(buf) = rxq_ctrl->rxq.port_id;
684 		DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
685 		PKT_LEN(buf) = DATA_LEN(buf);
686 		NB_SEGS(buf) = 1;
687 		/* scat->addr must be able to store a pointer. */
688 		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
689 		*scat = (struct mlx5_wqe_data_seg){
690 			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
691 			.byte_count = htonl(DATA_LEN(buf)),
692 			.lkey = htonl(rxq_ctrl->mr->lkey),
693 		};
694 		(*rxq_ctrl->rxq.elts)[i] = buf;
695 	}
696 	DEBUG("%p: allocated and configured %u segments (max %u packets)",
697 	      (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
698 	assert(ret == 0);
699 	return 0;
700 error:
701 	assert(pool == NULL);
702 	elts_n = i;
703 	for (i = 0; (i != elts_n); ++i) {
704 		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
705 			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
706 		(*rxq_ctrl->rxq.elts)[i] = NULL;
707 	}
708 	DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
709 	assert(ret > 0);
710 	return ret;
711 }
712 
713 /**
714  * Free RX queue elements.
715  *
716  * @param rxq_ctrl
717  *   Pointer to RX queue structure.
718  */
719 static void
720 rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
721 {
722 	unsigned int i;
723 
724 	DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
725 	if (rxq_ctrl->rxq.elts == NULL)
726 		return;
727 
728 	for (i = 0; (i != (1u << rxq_ctrl->rxq.elts_n)); ++i) {
729 		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
730 			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
731 		(*rxq_ctrl->rxq.elts)[i] = NULL;
732 	}
733 }
734 
735 /**
736  * Clean up an RX queue.
737  *
738  * Destroy objects, free allocated memory and reset the structure for reuse.
739  *
740  * @param rxq_ctrl
741  *   Pointer to RX queue structure.
742  */
743 void
744 rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
745 {
746 	DEBUG("cleaning up %p", (void *)rxq_ctrl);
747 	rxq_free_elts(rxq_ctrl);
748 	if (rxq_ctrl->fdir_queue != NULL)
749 		priv_fdir_queue_destroy(rxq_ctrl->priv, rxq_ctrl->fdir_queue);
750 	if (rxq_ctrl->wq != NULL)
751 		claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
752 	if (rxq_ctrl->cq != NULL)
753 		claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
754 	if (rxq_ctrl->channel != NULL)
755 		claim_zero(ibv_destroy_comp_channel(rxq_ctrl->channel));
756 	if (rxq_ctrl->mr != NULL)
757 		claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
758 	memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
759 }
760 
761 /**
762  * Reconfigure RX queue buffers.
763  *
764  * rxq_rehash() does not allocate mbufs, since allocating them from a thread
765  * other than a control thread could corrupt the pool.
766  * In case of failure, the queue is left untouched.
767  *
768  * @param dev
769  *   Pointer to Ethernet device structure.
770  * @param rxq_ctrl
771  *   RX queue pointer.
772  *
773  * @return
774  *   0 on success, errno value on failure.
775  */
776 int
777 rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
778 {
779 	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
780 	unsigned int i;
781 	struct ibv_exp_wq_attr mod;
782 	int err;
783 
784 	DEBUG("%p: rehashing queue %p with %u SGE(s) per packet",
785 	      (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n);
786 	assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n)));
787 	/* From now on, any failure will render the queue unusable.
788 	 * Reinitialize WQ. */
789 	mod = (struct ibv_exp_wq_attr){
790 		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
791 		.wq_state = IBV_EXP_WQS_RESET,
792 	};
793 	err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
794 	if (err) {
795 		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
796 		assert(err > 0);
797 		return err;
798 	}
799 	/* Snatch mbufs from original queue. */
800 	claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts));
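	/* rxq_alloc_elts() bumped each mbuf reference count to 2; drop the
	 * extra reference so the queue remains their only owner. */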
801 	for (i = 0; i != elts_n; ++i) {
802 		struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i];
803 
804 		assert(rte_mbuf_refcnt_read(buf) == 2);
805 		rte_pktmbuf_free_seg(buf);
806 	}
807 	/* Change queue state to ready. */
808 	mod = (struct ibv_exp_wq_attr){
809 		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
810 		.wq_state = IBV_EXP_WQS_RDY,
811 	};
812 	err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
813 	if (err) {
814 		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
815 		      (void *)dev, strerror(err));
816 		goto error;
817 	}
818 	/* Update doorbell counter. */
819 	rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n;
820 	rte_wmb();
821 	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
822 error:
823 	assert(err >= 0);
824 	return err;
825 }
826 
827 /**
828  * Initialize RX queue.
829  *
830  * @param tmpl
831  *   Pointer to RX queue control template.
832  *
833  * @return
834  *   0 on success, errno value on failure.
835  */
836 static inline int
837 rxq_setup(struct rxq_ctrl *tmpl)
838 {
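	/* Extract the doorbell records and buffer pointers needed by the
	 * datapath from the verbs CQ and WQ objects. */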
839 	struct ibv_cq *ibcq = tmpl->cq;
840 	struct mlx5_cq *cq = to_mxxx(cq, cq);
841 	struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
842 	struct rte_mbuf *(*elts)[1 << tmpl->rxq.elts_n] =
843 		rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
844 
845 	if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
846 		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
847 		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
848 		return EINVAL;
849 	}
850 	if (elts == NULL)
851 		return ENOMEM;
852 	tmpl->rxq.rq_db = rwq->rq.db;
853 	tmpl->rxq.cqe_n = log2above(ibcq->cqe);
854 	tmpl->rxq.cq_ci = 0;
855 	tmpl->rxq.rq_ci = 0;
856 	tmpl->rxq.cq_db = cq->dbrec;
857 	tmpl->rxq.wqes =
858 		(volatile struct mlx5_wqe_data_seg (*)[])
859 		(uintptr_t)rwq->rq.buff;
860 	tmpl->rxq.cqes =
861 		(volatile struct mlx5_cqe (*)[])
862 		(uintptr_t)cq->active_buf->buf;
863 	tmpl->rxq.elts = elts;
864 	return 0;
865 }
866 
867 /**
868  * Configure an RX queue.
869  *
870  * @param dev
871  *   Pointer to Ethernet device structure.
872  * @param rxq_ctrl
873  *   Pointer to RX queue structure.
874  * @param desc
875  *   Number of descriptors to configure in queue.
876  * @param socket
877  *   NUMA socket on which memory must be allocated.
878  * @param[in] conf
879  *   Thresholds parameters.
880  * @param mp
881  *   Memory pool for buffer allocations.
882  *
883  * @return
884  *   0 on success, errno value on failure.
885  */
886 int
887 rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
888 	       uint16_t desc, unsigned int socket,
889 	       const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
890 {
891 	struct priv *priv = dev->data->dev_private;
892 	struct rxq_ctrl tmpl = {
893 		.priv = priv,
894 		.socket = socket,
895 		.rxq = {
896 			.elts_n = log2above(desc),
897 			.mp = mp,
898 			.rss_hash = priv->rxqs_n > 1,
899 		},
900 	};
901 	struct ibv_exp_wq_attr mod;
902 	union {
903 		struct ibv_exp_cq_init_attr cq;
904 		struct ibv_exp_wq_init_attr wq;
905 		struct ibv_exp_cq_attr cq_attr;
906 	} attr;
907 	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
908 	unsigned int cqe_n = desc - 1;
909 	struct rte_mbuf *(*elts)[desc] = NULL;
910 	int ret = 0;
911 
912 	(void)conf; /* Thresholds configuration (ignored). */
913 	/* Enable scattered packets support for this queue if necessary. */
914 	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
915 	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
916 	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
917 		tmpl.rxq.sges_n = 0;
918 	} else if (dev->data->dev_conf.rxmode.enable_scatter) {
919 		unsigned int size =
920 			RTE_PKTMBUF_HEADROOM +
921 			dev->data->dev_conf.rxmode.max_rx_pkt_len;
922 		unsigned int sges_n;
923 
924 		/*
925 		 * Determine the number of SGEs needed for a full packet
926 		 * and round it to the next power of two.
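		 * For example, a frame spanning three mbufs yields sges_n = 2,
		 * i.e. four SGEs per packet.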
927 		 */
928 		sges_n = log2above((size / mb_len) + !!(size % mb_len));
929 		tmpl.rxq.sges_n = sges_n;
930 		/* Make sure rxq.sges_n did not overflow. */
931 		size = mb_len * (1 << tmpl.rxq.sges_n);
932 		size -= RTE_PKTMBUF_HEADROOM;
933 		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
934 			ERROR("%p: too many SGEs (%u) needed to handle"
935 			      " requested maximum packet size %u",
936 			      (void *)dev,
937 			      1 << sges_n,
938 			      dev->data->dev_conf.rxmode.max_rx_pkt_len);
939 			return EOVERFLOW;
940 		}
941 	} else {
942 		WARN("%p: the requested maximum Rx packet size (%u) is"
943 		     " larger than a single mbuf (%u) and scattered"
944 		     " mode has not been requested",
945 		     (void *)dev,
946 		     dev->data->dev_conf.rxmode.max_rx_pkt_len,
947 		     mb_len - RTE_PKTMBUF_HEADROOM);
948 	}
949 	DEBUG("%p: maximum number of segments per packet: %u",
950 	      (void *)dev, 1 << tmpl.rxq.sges_n);
951 	if (desc % (1 << tmpl.rxq.sges_n)) {
952 		ERROR("%p: number of RX queue descriptors (%u) is not a"
953 		      " multiple of SGEs per packet (%u)",
954 		      (void *)dev,
955 		      desc,
956 		      1 << tmpl.rxq.sges_n);
957 		return EINVAL;
958 	}
959 	/* Toggle RX checksum offload if hardware supports it. */
960 	if (priv->hw_csum)
961 		tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
962 	if (priv->hw_csum_l2tun)
963 		tmpl.rxq.csum_l2tun =
964 			!!dev->data->dev_conf.rxmode.hw_ip_checksum;
965 	/* Use the entire RX mempool as the memory region. */
966 	tmpl.mr = mlx5_mp2mr(priv->pd, mp);
967 	if (tmpl.mr == NULL) {
968 		ret = EINVAL;
969 		ERROR("%p: MR creation failure: %s",
970 		      (void *)dev, strerror(ret));
971 		goto error;
972 	}
973 	if (dev->data->dev_conf.intr_conf.rxq) {
974 		tmpl.channel = ibv_create_comp_channel(priv->ctx);
975 		if (tmpl.channel == NULL) {
976 			dev->data->dev_conf.intr_conf.rxq = 0;
977 			ret = ENOMEM;
978 			ERROR("%p: Comp Channel creation failure: %s",
979 			      (void *)dev, strerror(ret));
980 			goto error;
981 		}
982 	}
983 	attr.cq = (struct ibv_exp_cq_init_attr){
984 		.comp_mask = 0,
985 	};
986 	if (priv->cqe_comp) {
987 		attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
988 		attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
989 		cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
990 	}
991 	tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, tmpl.channel, 0,
992 				    &attr.cq);
993 	if (tmpl.cq == NULL) {
994 		ret = ENOMEM;
995 		ERROR("%p: CQ creation failure: %s",
996 		      (void *)dev, strerror(ret));
997 		goto error;
998 	}
999 	DEBUG("priv->device_attr.max_qp_wr is %d",
1000 	      priv->device_attr.max_qp_wr);
1001 	DEBUG("priv->device_attr.max_sge is %d",
1002 	      priv->device_attr.max_sge);
1003 	/* Configure VLAN stripping. */
1004 	tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
1005 			       !!dev->data->dev_conf.rxmode.hw_vlan_strip);
1006 	attr.wq = (struct ibv_exp_wq_init_attr){
1007 		.wq_context = NULL, /* Could be useful in the future. */
1008 		.wq_type = IBV_EXP_WQT_RQ,
1009 		/* Max number of outstanding WRs. */
1010 		.max_recv_wr = desc >> tmpl.rxq.sges_n,
1011 		/* Max number of scatter/gather elements in a WR. */
1012 		.max_recv_sge = 1 << tmpl.rxq.sges_n,
1013 		.pd = priv->pd,
1014 		.cq = tmpl.cq,
1015 		.comp_mask =
1016 			IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
1017 			0,
1018 		.vlan_offloads = (tmpl.rxq.vlan_strip ?
1019 				  IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
1020 				  0),
1021 	};
1022 	/* By default, FCS (CRC) is stripped by hardware. */
1023 	if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1024 		tmpl.rxq.crc_present = 0;
1025 	} else if (priv->hw_fcs_strip) {
1026 		/* Ask HW/Verbs to leave CRC in place when supported. */
1027 		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
1028 		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1029 		tmpl.rxq.crc_present = 1;
1030 	} else {
1031 		WARN("%p: CRC stripping has been disabled but will still"
1032 		     " be performed by hardware, make sure MLNX_OFED and"
1033 		     " firmware are up to date",
1034 		     (void *)dev);
1035 		tmpl.rxq.crc_present = 0;
1036 	}
1037 	DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
1038 	      " incoming frames to hide it",
1039 	      (void *)dev,
1040 	      tmpl.rxq.crc_present ? "disabled" : "enabled",
1041 	      tmpl.rxq.crc_present << 2);
1042 	if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
1043 		; /* Nothing else to do. */
1044 	else if (priv->hw_padding) {
1045 		INFO("%p: enabling packet padding on queue %p",
1046 		     (void *)dev, (void *)rxq_ctrl);
1047 		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
1048 		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1049 	} else
1050 		WARN("%p: packet padding has been requested but is not"
1051 		     " supported, make sure MLNX_OFED and firmware are"
1052 		     " up to date",
1053 		     (void *)dev);
1054 
1055 	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
1056 	if (tmpl.wq == NULL) {
1057 		ret = (errno ? errno : EINVAL);
1058 		ERROR("%p: WQ creation failure: %s",
1059 		      (void *)dev, strerror(ret));
1060 		goto error;
1061 	}
1062 	/*
1063 	 * Make sure number of WRs*SGEs match expectations since a queue
1064 	 * cannot allocate more than "desc" buffers.
1065 	 */
1066 	if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
1067 	    ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
1068 		ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
1069 		      (void *)dev,
1070 		      (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
1071 		      attr.wq.max_recv_wr, attr.wq.max_recv_sge);
1072 		ret = EINVAL;
1073 		goto error;
1074 	}
1075 	/* Save port ID. */
1076 	tmpl.rxq.port_id = dev->data->port_id;
1077 	DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
1078 	/* Change queue state to ready. */
1079 	mod = (struct ibv_exp_wq_attr){
1080 		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
1081 		.wq_state = IBV_EXP_WQS_RDY,
1082 	};
1083 	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
1084 	if (ret) {
1085 		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
1086 		      (void *)dev, strerror(ret));
1087 		goto error;
1088 	}
1089 	ret = rxq_setup(&tmpl);
1090 	if (ret) {
1091 		ERROR("%p: cannot initialize RX queue structure: %s",
1092 		      (void *)dev, strerror(ret));
1093 		goto error;
1094 	}
1095 	/* Reuse buffers from original queue if possible. */
1096 	if (rxq_ctrl->rxq.elts_n) {
1097 		assert(1 << rxq_ctrl->rxq.elts_n == desc);
1098 		assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
1099 		ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
1100 	} else
1101 		ret = rxq_alloc_elts(&tmpl, desc, NULL);
1102 	if (ret) {
1103 		ERROR("%p: RXQ allocation failed: %s",
1104 		      (void *)dev, strerror(ret));
1105 		goto error;
1106 	}
1107 	/* Clean up rxq in case we're reinitializing it. */
1108 	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
1109 	rxq_cleanup(rxq_ctrl);
1110 	/* Move mbuf pointers to dedicated storage area in RX queue. */
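	/* That area immediately follows the rxq_ctrl structure, as allocated
	 * by mlx5_rx_queue_setup(). */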
1111 	elts = (void *)(rxq_ctrl + 1);
1112 	rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
1113 #ifndef NDEBUG
1114 	memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
1115 #endif
1116 	rte_free(tmpl.rxq.elts);
1117 	tmpl.rxq.elts = elts;
1118 	*rxq_ctrl = tmpl;
1119 	/* Update doorbell counter. */
1120 	rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
1121 	rte_wmb();
1122 	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
1123 	DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
1124 	assert(ret == 0);
1125 	return 0;
1126 error:
1127 	elts = tmpl.rxq.elts;
1128 	rxq_cleanup(&tmpl);
1129 	rte_free(elts);
1130 	assert(ret > 0);
1131 	return ret;
1132 }
1133 
1134 /**
1135  * DPDK callback to configure an RX queue.
1136  *
1137  * @param dev
1138  *   Pointer to Ethernet device structure.
1139  * @param idx
1140  *   RX queue index.
1141  * @param desc
1142  *   Number of descriptors to configure in queue.
1143  * @param socket
1144  *   NUMA socket on which memory must be allocated.
1145  * @param[in] conf
1146  *   Thresholds parameters.
1147  * @param mp
1148  *   Memory pool for buffer allocations.
1149  *
1150  * @return
1151  *   0 on success, negative errno value on failure.
1152  */
1153 int
1154 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1155 		    unsigned int socket, const struct rte_eth_rxconf *conf,
1156 		    struct rte_mempool *mp)
1157 {
1158 	struct priv *priv = dev->data->dev_private;
1159 	struct rxq *rxq = (*priv->rxqs)[idx];
1160 	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1161 	int ret;
1162 
1163 	if (mlx5_is_secondary())
1164 		return -E_RTE_SECONDARY;
1165 
1166 	priv_lock(priv);
1167 	if (!rte_is_power_of_2(desc)) {
1168 		desc = 1 << log2above(desc);
1169 		WARN("%p: increased number of descriptors in RX queue %u"
1170 		     " to the next power of two (%d)",
1171 		     (void *)dev, idx, desc);
1172 	}
1173 	DEBUG("%p: configuring queue %u for %u descriptors",
1174 	      (void *)dev, idx, desc);
1175 	if (idx >= priv->rxqs_n) {
1176 		ERROR("%p: queue index out of range (%u >= %u)",
1177 		      (void *)dev, idx, priv->rxqs_n);
1178 		priv_unlock(priv);
1179 		return -EOVERFLOW;
1180 	}
1181 	if (rxq != NULL) {
1182 		DEBUG("%p: reusing already allocated queue index %u (%p)",
1183 		      (void *)dev, idx, (void *)rxq);
1184 		if (priv->started) {
1185 			priv_unlock(priv);
1186 			return -EEXIST;
1187 		}
1188 		(*priv->rxqs)[idx] = NULL;
1189 		rxq_cleanup(rxq_ctrl);
1190 		/* Resize if rxq size is changed. */
1191 		if (rxq_ctrl->rxq.elts_n != log2above(desc)) {
1192 			rxq_ctrl = rte_realloc(rxq_ctrl,
1193 					       sizeof(*rxq_ctrl) +
1194 					       desc * sizeof(struct rte_mbuf *),
1195 					       RTE_CACHE_LINE_SIZE);
1196 			if (!rxq_ctrl) {
1197 				ERROR("%p: unable to reallocate queue index %u",
1198 					(void *)dev, idx);
1199 				priv_unlock(priv);
1200 				return -ENOMEM;
1201 			}
1202 		}
1203 	} else {
1204 		rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
1205 					     desc * sizeof(struct rte_mbuf *),
1206 					     0, socket);
1207 		if (rxq_ctrl == NULL) {
1208 			ERROR("%p: unable to allocate queue index %u",
1209 			      (void *)dev, idx);
1210 			priv_unlock(priv);
1211 			return -ENOMEM;
1212 		}
1213 	}
1214 	ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
1215 	if (ret)
1216 		rte_free(rxq_ctrl);
1217 	else {
1218 		rxq_ctrl->rxq.stats.idx = idx;
1219 		DEBUG("%p: adding RX queue %p to list",
1220 		      (void *)dev, (void *)rxq_ctrl);
1221 		(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
1222 		/* Update receive callback. */
1223 		priv_select_rx_function(priv);
1224 	}
1225 	priv_unlock(priv);
1226 	return -ret;
1227 }
1228 
1229 /**
1230  * DPDK callback to release an RX queue.
1231  *
1232  * @param dpdk_rxq
1233  *   Generic RX queue pointer.
1234  */
1235 void
1236 mlx5_rx_queue_release(void *dpdk_rxq)
1237 {
1238 	struct rxq *rxq = (struct rxq *)dpdk_rxq;
1239 	struct rxq_ctrl *rxq_ctrl;
1240 	struct priv *priv;
1241 	unsigned int i;
1242 
1243 	if (mlx5_is_secondary())
1244 		return;
1245 
1246 	if (rxq == NULL)
1247 		return;
1248 	rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1249 	priv = rxq_ctrl->priv;
1250 	priv_lock(priv);
1251 	for (i = 0; (i != priv->rxqs_n); ++i)
1252 		if ((*priv->rxqs)[i] == rxq) {
1253 			DEBUG("%p: removing RX queue %p from list",
1254 			      (void *)priv->dev, (void *)rxq_ctrl);
1255 			(*priv->rxqs)[i] = NULL;
1256 			break;
1257 		}
1258 	rxq_cleanup(rxq_ctrl);
1259 	rte_free(rxq_ctrl);
1260 	priv_unlock(priv);
1261 }
1262 
1263 /**
1264  * DPDK callback for RX in secondary processes.
1265  *
1266  * This function configures all queues from primary process information
1267  * if necessary before reverting to the normal RX burst callback.
1268  *
1269  * @param dpdk_rxq
1270  *   Generic pointer to RX queue structure.
1271  * @param[out] pkts
1272  *   Array to store received packets.
1273  * @param pkts_n
1274  *   Maximum number of packets in array.
1275  *
1276  * @return
1277  *   Number of packets successfully received (<= pkts_n).
1278  */
1279 uint16_t
1280 mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
1281 			      uint16_t pkts_n)
1282 {
1283 	struct rxq *rxq = dpdk_rxq;
1284 	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1285 	struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
1286 	struct priv *primary_priv;
1287 	unsigned int index;
1288 
1289 	if (priv == NULL)
1290 		return 0;
1291 	primary_priv =
1292 		mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
1293 	/* Look for queue index in both private structures. */
1294 	for (index = 0; index != priv->rxqs_n; ++index)
1295 		if (((*primary_priv->rxqs)[index] == rxq) ||
1296 		    ((*priv->rxqs)[index] == rxq))
1297 			break;
1298 	if (index == priv->rxqs_n)
1299 		return 0;
1300 	rxq = (*priv->rxqs)[index];
1301 	return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
1302 }
1303 
1304 /**
1305  * Fill epoll fd list for rxq interrupts.
1306  *
1307  * @param priv
1308  *   Private structure.
1309  *
1310  * @return
1311  *   0 on success, negative on failure.
1312  */
1313 int
1314 priv_intr_efd_enable(struct priv *priv)
1315 {
1316 	unsigned int i;
1317 	unsigned int rxqs_n = priv->rxqs_n;
1318 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1319 	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
1320 
1321 	if (n == 0)
1322 		return 0;
1323 	if (n < rxqs_n) {
1324 		WARN("rxqs num is larger than EAL max interrupt vector "
1325 		     "%u > %u unable to supprt rxq interrupts",
1326 		     rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1327 		return -EINVAL;
1328 	}
1329 	intr_handle->type = RTE_INTR_HANDLE_EXT;
1330 	for (i = 0; i != n; ++i) {
1331 		struct rxq *rxq = (*priv->rxqs)[i];
1332 		struct rxq_ctrl *rxq_ctrl =
1333 			container_of(rxq, struct rxq_ctrl, rxq);
1334 		int fd = rxq_ctrl->channel->fd;
1335 		int flags;
1336 		int rc;
1337 
1338 		flags = fcntl(fd, F_GETFL);
1339 		rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
1340 		if (rc < 0) {
1341 			WARN("failed to change rxq interrupt file "
1342 			     "descriptor %d for queue index %d", fd, i);
1343 			return -1;
1344 		}
1345 		intr_handle->efds[i] = fd;
1346 	}
1347 	intr_handle->nb_efd = n;
1348 	return 0;
1349 }
1350 
1351 /**
1352  * Clean epoll fd list for rxq interrupts.
1353  *
1354  * @param priv
1355  *   Private structure.
1356  */
1357 void
1358 priv_intr_efd_disable(struct priv *priv)
1359 {
1360 	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
1361 
1362 	rte_intr_free_epoll_fd(intr_handle);
1363 }
1364 
1365 /**
1366  * Create and init interrupt vector array.
1367  *
1368  * @param priv
1369  *   Private structure.
1370  *
1371  * @return
1372  *   0 on success, negative on failure.
1373  */
1374 int
1375 priv_create_intr_vec(struct priv *priv)
1376 {
1377 	unsigned int rxqs_n = priv->rxqs_n;
1378 	unsigned int i;
1379 	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
1380 
1381 	if (rxqs_n == 0)
1382 		return 0;
1383 	intr_handle->intr_vec = (int *)
1384 		rte_malloc("intr_vec", rxqs_n * sizeof(int), 0);
1385 	if (intr_handle->intr_vec == NULL) {
1386 		WARN("Failed to allocate memory for intr_vec "
1387 		     "rxq interrupt will not be supported");
1388 		return -ENOMEM;
1389 	}
1390 	for (i = 0; i != rxqs_n; ++i) {
1391 		/* 1:1 mapping between rxq and interrupt. */
1392 		intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
1393 	}
1394 	return 0;
1395 }
1396 
1397 /**
1398  * Destroy the interrupt vector array.
1399  *
1400  * @param priv
1401  *   Private structure.
1405  */
1406 void
1407 priv_destroy_intr_vec(struct priv *priv)
1408 {
1409 	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
1410 
1411 	rte_free(intr_handle->intr_vec);
1412 }
1413