xref: /dpdk/drivers/net/mlx5/mlx5_rxq.c (revision ceb1ccd5d50c1a89ba8bdd97cc199e7f07422b98)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <stddef.h>
35 #include <assert.h>
36 #include <errno.h>
37 #include <string.h>
38 #include <stdint.h>
39 
40 /* Verbs header. */
41 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
42 #ifdef PEDANTIC
43 #pragma GCC diagnostic ignored "-pedantic"
44 #endif
45 #include <infiniband/verbs.h>
46 #ifdef PEDANTIC
47 #pragma GCC diagnostic error "-pedantic"
48 #endif
49 
50 /* DPDK headers don't like -pedantic. */
51 #ifdef PEDANTIC
52 #pragma GCC diagnostic ignored "-pedantic"
53 #endif
54 #include <rte_mbuf.h>
55 #include <rte_malloc.h>
56 #include <rte_ethdev.h>
57 #include <rte_common.h>
58 #ifdef PEDANTIC
59 #pragma GCC diagnostic error "-pedantic"
60 #endif
61 
62 #include "mlx5.h"
63 #include "mlx5_rxtx.h"
64 #include "mlx5_utils.h"
65 #include "mlx5_autoconf.h"
66 #include "mlx5_defs.h"
67 
68 /* Initialization data for hash RX queues. */
69 const struct hash_rxq_init hash_rxq_init[] = {
70 	[HASH_RXQ_TCPV4] = {
71 		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
72 				IBV_EXP_RX_HASH_DST_IPV4 |
73 				IBV_EXP_RX_HASH_SRC_PORT_TCP |
74 				IBV_EXP_RX_HASH_DST_PORT_TCP),
75 		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
76 		.flow_priority = 0,
77 		.flow_spec.tcp_udp = {
78 			.type = IBV_EXP_FLOW_SPEC_TCP,
79 			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
80 		},
81 		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
82 	},
83 	[HASH_RXQ_UDPV4] = {
84 		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
85 				IBV_EXP_RX_HASH_DST_IPV4 |
86 				IBV_EXP_RX_HASH_SRC_PORT_UDP |
87 				IBV_EXP_RX_HASH_DST_PORT_UDP),
88 		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
89 		.flow_priority = 0,
90 		.flow_spec.tcp_udp = {
91 			.type = IBV_EXP_FLOW_SPEC_UDP,
92 			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
93 		},
94 		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
95 	},
96 	[HASH_RXQ_IPV4] = {
97 		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
98 				IBV_EXP_RX_HASH_DST_IPV4),
99 		.dpdk_rss_hf = (ETH_RSS_IPV4 |
100 				ETH_RSS_FRAG_IPV4),
101 		.flow_priority = 1,
102 		.flow_spec.ipv4 = {
103 			.type = IBV_EXP_FLOW_SPEC_IPV4,
104 			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
105 		},
106 		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
107 	},
108 #ifdef HAVE_FLOW_SPEC_IPV6
109 	[HASH_RXQ_TCPV6] = {
110 		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
111 				IBV_EXP_RX_HASH_DST_IPV6 |
112 				IBV_EXP_RX_HASH_SRC_PORT_TCP |
113 				IBV_EXP_RX_HASH_DST_PORT_TCP),
114 		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
115 		.flow_priority = 0,
116 		.flow_spec.tcp_udp = {
117 			.type = IBV_EXP_FLOW_SPEC_TCP,
118 			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
119 		},
120 		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
121 	},
122 	[HASH_RXQ_UDPV6] = {
123 		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
124 				IBV_EXP_RX_HASH_DST_IPV6 |
125 				IBV_EXP_RX_HASH_SRC_PORT_UDP |
126 				IBV_EXP_RX_HASH_DST_PORT_UDP),
127 		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
128 		.flow_priority = 0,
129 		.flow_spec.tcp_udp = {
130 			.type = IBV_EXP_FLOW_SPEC_UDP,
131 			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
132 		},
133 		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
134 	},
135 	[HASH_RXQ_IPV6] = {
136 		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
137 				IBV_EXP_RX_HASH_DST_IPV6),
138 		.dpdk_rss_hf = (ETH_RSS_IPV6 |
139 				ETH_RSS_FRAG_IPV6),
140 		.flow_priority = 1,
141 		.flow_spec.ipv6 = {
142 			.type = IBV_EXP_FLOW_SPEC_IPV6,
143 			.size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
144 		},
145 		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
146 	},
147 #endif /* HAVE_FLOW_SPEC_IPV6 */
148 	[HASH_RXQ_ETH] = {
149 		.hash_fields = 0,
150 		.dpdk_rss_hf = 0,
151 		.flow_priority = 2,
152 		.flow_spec.eth = {
153 			.type = IBV_EXP_FLOW_SPEC_ETH,
154 			.size = sizeof(hash_rxq_init[0].flow_spec.eth),
155 		},
156 		.underlayer = NULL,
157 	},
158 };
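
/*
 * Note (illustration): the .underlayer pointers chain every entry down to
 * HASH_RXQ_ETH, e.g. HASH_RXQ_TCPV4 -> HASH_RXQ_IPV4 -> HASH_RXQ_ETH, so a
 * steering rule built by priv_flow_attr() for TCPv4 ends up carrying three
 * specifications (TCP, IPv4 and Ethernet).
 */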
159 
160 /* Number of entries in hash_rxq_init[]. */
161 const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
162 
163 /* Initialization data for hash RX queue indirection tables. */
164 static const struct ind_table_init ind_table_init[] = {
165 	{
166 		.max_size = -1u, /* Superseded by HW limitations. */
167 		.hash_types =
168 			1 << HASH_RXQ_TCPV4 |
169 			1 << HASH_RXQ_UDPV4 |
170 			1 << HASH_RXQ_IPV4 |
171 #ifdef HAVE_FLOW_SPEC_IPV6
172 			1 << HASH_RXQ_TCPV6 |
173 			1 << HASH_RXQ_UDPV6 |
174 			1 << HASH_RXQ_IPV6 |
175 #endif /* HAVE_FLOW_SPEC_IPV6 */
176 			0,
177 #ifdef HAVE_FLOW_SPEC_IPV6
178 		.hash_types_n = 6,
179 #else /* HAVE_FLOW_SPEC_IPV6 */
180 		.hash_types_n = 3,
181 #endif /* HAVE_FLOW_SPEC_IPV6 */
182 	},
183 	{
184 		.max_size = 1,
185 		.hash_types = 1 << HASH_RXQ_ETH,
186 		.hash_types_n = 1,
187 	},
188 };
189 
190 #define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
191 
192 /* Default RSS hash key also used for ConnectX-3. */
193 uint8_t rss_hash_default_key[] = {
194 	0x2c, 0xc6, 0x81, 0xd1,
195 	0x5b, 0xdb, 0xf4, 0xf7,
196 	0xfc, 0xa2, 0x83, 0x19,
197 	0xdb, 0x1a, 0x3e, 0x94,
198 	0x6b, 0x9e, 0x38, 0xd9,
199 	0x2c, 0x9c, 0x03, 0xd1,
200 	0xad, 0x99, 0x44, 0xa7,
201 	0xd9, 0x56, 0x3d, 0x59,
202 	0x06, 0x3c, 0x25, 0xf3,
203 	0xfc, 0x1f, 0xdc, 0x2a,
204 };
205 
206 /* Length of the default RSS hash key. */
207 const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
208 
209 /**
210  * Populate flow steering rule for a given hash RX queue type using
211  * information from hash_rxq_init[]. Nothing is written to flow_attr when
212  * flow_attr_size is not large enough, but the required size is still returned.
213  *
214  * @param priv
215  *   Pointer to private structure.
216  * @param[out] flow_attr
217  *   Pointer to flow attribute structure to fill. Note that the allocated
218  *   area must be large enough to hold all trailing flow specifications.
219  * @param flow_attr_size
220  *   Entire size of flow_attr and trailing room for flow specifications.
221  * @param type
222  *   Hash RX queue type to use for flow steering rule.
223  *
224  * @return
225  *   Total size of the flow attribute buffer. No errors are defined.
226  */
227 size_t
228 priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
229 	       size_t flow_attr_size, enum hash_rxq_type type)
230 {
231 	size_t offset = sizeof(*flow_attr);
232 	const struct hash_rxq_init *init = &hash_rxq_init[type];
233 
234 	assert(priv != NULL);
235 	assert((size_t)type < RTE_DIM(hash_rxq_init));
236 	do {
237 		offset += init->flow_spec.hdr.size;
238 		init = init->underlayer;
239 	} while (init != NULL);
240 	if (offset > flow_attr_size)
241 		return offset;
242 	flow_attr_size = offset;
243 	init = &hash_rxq_init[type];
244 	*flow_attr = (struct ibv_exp_flow_attr){
245 		.type = IBV_EXP_FLOW_ATTR_NORMAL,
246 #ifdef MLX5_FDIR_SUPPORT
247 		/* Priorities < 3 are reserved for flow director. */
248 		.priority = init->flow_priority + 3,
249 #else /* MLX5_FDIR_SUPPORT */
250 		.priority = init->flow_priority,
251 #endif /* MLX5_FDIR_SUPPORT */
252 		.num_of_specs = 0,
253 		.port = priv->port,
254 		.flags = 0,
255 	};
256 	do {
257 		offset -= init->flow_spec.hdr.size;
258 		memcpy((void *)((uintptr_t)flow_attr + offset),
259 		       &init->flow_spec,
260 		       init->flow_spec.hdr.size);
261 		++flow_attr->num_of_specs;
262 		init = init->underlayer;
263 	} while (init != NULL);
264 	return flow_attr_size;
265 }
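
/*
 * Usage sketch (illustrative only, not part of the driver): callers can query
 * the required size with an undersized buffer first, then call again once
 * enough room is available for the attribute and its trailing specifications.
 *
 *	size_t size = priv_flow_attr(priv, NULL, 0, HASH_RXQ_TCPV4);
 *	struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *	if (attr != NULL) {
 *		(void)priv_flow_attr(priv, attr, size, HASH_RXQ_TCPV4);
 *		...
 *		rte_free(attr);
 *	}
 */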
266 
267 /**
268  * Convert hash type position in indirection table initializer to
269  * hash RX queue type.
270  *
271  * @param table
272  *   Indirection table initializer.
273  * @param pos
274  *   Hash type position.
275  *
276  * @return
277  *   Hash RX queue type.
278  */
279 static enum hash_rxq_type
280 hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
281 {
282 	enum hash_rxq_type type = 0;
283 
284 	assert(pos < table->hash_types_n);
285 	do {
286 		if ((table->hash_types & (1 << type)) && (pos-- == 0))
287 			break;
288 		++type;
289 	} while (1);
290 	return type;
291 }
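
/*
 * Example (illustration, assuming the enum order matches hash_rxq_init[]):
 * with hash_types covering TCPv4, UDPv4 and IPv4, pos 0 resolves to
 * HASH_RXQ_TCPV4, pos 1 to HASH_RXQ_UDPV4 and pos 2 to HASH_RXQ_IPV4;
 * disabled types are simply skipped over.
 */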
292 
293 /**
294  * Filter out disabled hash RX queue types from ind_table_init[].
295  *
296  * @param priv
297  *   Pointer to private structure.
298  * @param[out] table
299  *   Output table.
300  *
301  * @return
302  *   Number of table entries.
303  */
304 static unsigned int
305 priv_make_ind_table_init(struct priv *priv,
306 			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
307 {
308 	uint64_t rss_hf;
309 	unsigned int i;
310 	unsigned int j;
311 	unsigned int table_n = 0;
312 	/* Mandatory to receive frames not handled by normal hash RX queues. */
313 	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;
314 
315 	rss_hf = priv->rss_hf;
316 	/* Process other protocols only if more than one queue. */
317 	if (priv->rxqs_n > 1)
318 		for (i = 0; (i != hash_rxq_init_n); ++i)
319 			if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
320 				hash_types_sup |= (1 << i);
321 
322 	/* Filter out entries whose protocols are not in the set. */
323 	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
324 		unsigned int nb;
325 		unsigned int h;
326 
327 		/* j is increased only if the table has valid protocols. */
328 		assert(j <= i);
329 		(*table)[j] = ind_table_init[i];
330 		(*table)[j].hash_types &= hash_types_sup;
331 		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
332 			if (((*table)[j].hash_types >> h) & 0x1)
333 				++nb;
334 		(*table)[j].hash_types_n = nb;
335 		if (nb) {
336 			++table_n;
337 			++j;
338 		}
339 	}
340 	return table_n;
341 }
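
/*
 * Example (illustration): with a single RX queue, or with rss_hf masking out
 * every protocol, only the HASH_RXQ_ETH entry survives and this function
 * returns 1; with several queues and TCP/UDP/IP hashing enabled in rss_hf,
 * both entries are kept and it returns 2.
 */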
342 
343 /**
344  * Initialize hash RX queues and indirection table.
345  *
346  * @param priv
347  *   Pointer to private structure.
348  *
349  * @return
350  *   0 on success, errno value on failure.
351  */
352 int
353 priv_create_hash_rxqs(struct priv *priv)
354 {
355 	struct ibv_exp_wq *wqs[priv->reta_idx_n];
356 	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
357 	unsigned int ind_tables_n =
358 		priv_make_ind_table_init(priv, &ind_table_init);
359 	unsigned int hash_rxqs_n = 0;
360 	struct hash_rxq (*hash_rxqs)[] = NULL;
361 	struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
362 	unsigned int i;
363 	unsigned int j;
364 	unsigned int k;
365 	int err = 0;
366 
367 	assert(priv->ind_tables == NULL);
368 	assert(priv->ind_tables_n == 0);
369 	assert(priv->hash_rxqs == NULL);
370 	assert(priv->hash_rxqs_n == 0);
371 	assert(priv->pd != NULL);
372 	assert(priv->ctx != NULL);
373 	if (priv->rxqs_n == 0)
374 		return EINVAL;
375 	assert(priv->rxqs != NULL);
376 	if (ind_tables_n == 0) {
377 		ERROR("all hash RX queue types have been filtered out,"
378 		      " indirection table cannot be created");
379 		return EINVAL;
380 	}
381 	if (priv->rxqs_n & (priv->rxqs_n - 1)) {
382 		INFO("%u RX queues are configured, consider rounding this"
383 		     " number to the next power of two for better balancing",
384 		     priv->rxqs_n);
385 		DEBUG("indirection table extended to assume %u WQs",
386 		      priv->reta_idx_n);
387 	}
388 	for (i = 0; (i != priv->reta_idx_n); ++i)
389 		wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq;
390 	/* Get number of hash RX queues to configure. */
391 	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
392 		hash_rxqs_n += ind_table_init[i].hash_types_n;
393 	DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
394 	      hash_rxqs_n, priv->rxqs_n, ind_tables_n);
395 	/* Create indirection tables. */
396 	ind_tables = rte_calloc(__func__, ind_tables_n,
397 				sizeof((*ind_tables)[0]), 0);
398 	if (ind_tables == NULL) {
399 		err = ENOMEM;
400 		ERROR("cannot allocate indirection tables container: %s",
401 		      strerror(err));
402 		goto error;
403 	}
404 	for (i = 0; (i != ind_tables_n); ++i) {
405 		struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
406 			.pd = priv->pd,
407 			.log_ind_tbl_size = 0, /* Set below. */
408 			.ind_tbl = wqs,
409 			.comp_mask = 0,
410 		};
411 		unsigned int ind_tbl_size = ind_table_init[i].max_size;
412 		struct ibv_exp_rwq_ind_table *ind_table;
413 
414 		if (priv->reta_idx_n < ind_tbl_size)
415 			ind_tbl_size = priv->reta_idx_n;
416 		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
417 		errno = 0;
418 		ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
419 							 &ind_init_attr);
420 		if (ind_table != NULL) {
421 			(*ind_tables)[i] = ind_table;
422 			continue;
423 		}
424 		/* Not clear whether errno is set. */
425 		err = (errno ? errno : EINVAL);
426 		ERROR("RX indirection table creation failed with error %d: %s",
427 		      err, strerror(err));
428 		goto error;
429 	}
430 	/* Allocate array that holds hash RX queues and related data. */
431 	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
432 			       sizeof((*hash_rxqs)[0]), 0);
433 	if (hash_rxqs == NULL) {
434 		err = ENOMEM;
435 		ERROR("cannot allocate hash RX queues container: %s",
436 		      strerror(err));
437 		goto error;
438 	}
439 	for (i = 0, j = 0, k = 0;
440 	     ((i != hash_rxqs_n) && (j != ind_tables_n));
441 	     ++i) {
442 		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
443 		enum hash_rxq_type type =
444 			hash_rxq_type_from_pos(&ind_table_init[j], k);
445 		struct rte_eth_rss_conf *priv_rss_conf =
446 			(*priv->rss_conf)[type];
447 		struct ibv_exp_rx_hash_conf hash_conf = {
448 			.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
449 			.rx_hash_key_len = (priv_rss_conf ?
450 					    priv_rss_conf->rss_key_len :
451 					    rss_hash_default_key_len),
452 			.rx_hash_key = (priv_rss_conf ?
453 					priv_rss_conf->rss_key :
454 					rss_hash_default_key),
455 			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
456 			.rwq_ind_tbl = (*ind_tables)[j],
457 		};
458 		struct ibv_exp_qp_init_attr qp_init_attr = {
459 			.max_inl_recv = 0, /* Currently not supported. */
460 			.qp_type = IBV_QPT_RAW_PACKET,
461 			.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
462 				      IBV_EXP_QP_INIT_ATTR_RX_HASH),
463 			.pd = priv->pd,
464 			.rx_hash_conf = &hash_conf,
465 			.port_num = priv->port,
466 		};
467 
468 		DEBUG("using indirection table %u for hash RX queue %u type %d",
469 		      j, i, type);
470 		*hash_rxq = (struct hash_rxq){
471 			.priv = priv,
472 			.qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
473 			.type = type,
474 		};
475 		if (hash_rxq->qp == NULL) {
476 			err = (errno ? errno : EINVAL);
477 			ERROR("Hash RX QP creation failure: %s",
478 			      strerror(err));
479 			goto error;
480 		}
481 		if (++k < ind_table_init[j].hash_types_n)
482 			continue;
483 		/* Switch to the next indirection table and reset hash RX
484 		 * queue type array index. */
485 		++j;
486 		k = 0;
487 	}
488 	priv->ind_tables = ind_tables;
489 	priv->ind_tables_n = ind_tables_n;
490 	priv->hash_rxqs = hash_rxqs;
491 	priv->hash_rxqs_n = hash_rxqs_n;
492 	assert(err == 0);
493 	return 0;
494 error:
495 	if (hash_rxqs != NULL) {
496 		for (i = 0; (i != hash_rxqs_n); ++i) {
497 			struct ibv_qp *qp = (*hash_rxqs)[i].qp;
498 
499 			if (qp == NULL)
500 				continue;
501 			claim_zero(ibv_destroy_qp(qp));
502 		}
503 		rte_free(hash_rxqs);
504 	}
505 	if (ind_tables != NULL) {
506 		for (j = 0; (j != ind_tables_n); ++j) {
507 			struct ibv_exp_rwq_ind_table *ind_table =
508 				(*ind_tables)[j];
509 
510 			if (ind_table == NULL)
511 				continue;
512 			claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
513 		}
514 		rte_free(ind_tables);
515 	}
516 	return err;
517 }
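
/*
 * Call sequence sketch (illustration; actual call sites may differ): hash RX
 * queues are expected to be created once all regular RX queues exist,
 * typically from the device start handler, and torn down from the stop
 * handler:
 *
 *	err = priv_create_hash_rxqs(priv);
 *	...
 *	priv_destroy_hash_rxqs(priv);
 */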
518 
519 /**
520  * Clean up hash RX queues and indirection table.
521  *
522  * @param priv
523  *   Pointer to private structure.
524  */
525 void
526 priv_destroy_hash_rxqs(struct priv *priv)
527 {
528 	unsigned int i;
529 
530 	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
531 	if (priv->hash_rxqs_n == 0) {
532 		assert(priv->hash_rxqs == NULL);
533 		assert(priv->ind_tables == NULL);
534 		return;
535 	}
536 	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
537 		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
538 		unsigned int j, k;
539 
540 		assert(hash_rxq->priv == priv);
541 		assert(hash_rxq->qp != NULL);
542 		/* Also check that there are no remaining flows. */
543 		for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
544 			for (k = 0;
545 			     (k != RTE_DIM(hash_rxq->special_flow[j]));
546 			     ++k)
547 				assert(hash_rxq->special_flow[j][k] == NULL);
548 		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
549 			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
550 				assert(hash_rxq->mac_flow[j][k] == NULL);
551 		claim_zero(ibv_destroy_qp(hash_rxq->qp));
552 	}
553 	priv->hash_rxqs_n = 0;
554 	rte_free(priv->hash_rxqs);
555 	priv->hash_rxqs = NULL;
556 	for (i = 0; (i != priv->ind_tables_n); ++i) {
557 		struct ibv_exp_rwq_ind_table *ind_table =
558 			(*priv->ind_tables)[i];
559 
560 		assert(ind_table != NULL);
561 		claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
562 	}
563 	priv->ind_tables_n = 0;
564 	rte_free(priv->ind_tables);
565 	priv->ind_tables = NULL;
566 }
567 
568 /**
569  * Check whether a given flow type is allowed.
570  *
571  * @param priv
572  *   Pointer to private structure.
573  * @param type
574  *   Flow type to check.
575  *
576  * @return
577  *   Nonzero if the given flow type is allowed.
578  */
579 int
580 priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
581 {
582 	/* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
583 	 * has been requested. */
584 	if (priv->promisc_req)
585 		return type == HASH_RXQ_FLOW_TYPE_PROMISC;
586 	switch (type) {
587 	case HASH_RXQ_FLOW_TYPE_PROMISC:
588 		return !!priv->promisc_req;
589 	case HASH_RXQ_FLOW_TYPE_ALLMULTI:
590 		return !!priv->allmulti_req;
591 	case HASH_RXQ_FLOW_TYPE_BROADCAST:
592 #ifdef HAVE_FLOW_SPEC_IPV6
593 	case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
594 #endif /* HAVE_FLOW_SPEC_IPV6 */
595 		/* If allmulti is enabled, broadcast and ipv6multi
596 		 * are unnecessary. */
597 		return !priv->allmulti_req;
598 	case HASH_RXQ_FLOW_TYPE_MAC:
599 		return 1;
600 	default:
601 		/* Unsupported flow type is not allowed. */
602 		return 0;
603 	}
604 	return 0;
605 }
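
/*
 * Example (illustration): when promiscuous mode is requested, only
 * HASH_RXQ_FLOW_TYPE_PROMISC is reported as allowed; when allmulti is
 * requested instead, broadcast and IPv6 multicast flows become unnecessary
 * and are refused while MAC flows remain allowed.
 */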
606 
607 /**
608  * Automatically enable/disable flows according to configuration.
609  *
610  * @param priv
611  *   Private structure.
612  *
613  * @return
614  *   0 on success, errno value on failure.
615  */
616 int
617 priv_rehash_flows(struct priv *priv)
618 {
619 	unsigned int i;
620 
621 	for (i = 0; (i != RTE_DIM((*priv->hash_rxqs)[0].special_flow)); ++i)
622 		if (!priv_allow_flow_type(priv, i)) {
623 			priv_special_flow_disable(priv, i);
624 		} else {
625 			int ret = priv_special_flow_enable(priv, i);
626 
627 			if (ret)
628 				return ret;
629 		}
630 	if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
631 		return priv_mac_addrs_enable(priv);
632 	priv_mac_addrs_disable(priv);
633 	return 0;
634 }
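
/*
 * Usage sketch (hypothetical caller): toggle a global RX mode flag, then let
 * the special flows follow the new configuration:
 *
 *	priv->promisc_req = 1;
 *	(void)priv_rehash_flows(priv);
 */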
635 
636 /**
637  * Allocate RX queue elements with scattered packets support.
638  *
639  * @param rxq
640  *   Pointer to RX queue structure.
641  * @param elts_n
642  *   Number of elements to allocate.
643  * @param[in] pool
644  *   If not NULL, fetch buffers from this array instead of allocating them
645  *   with rte_pktmbuf_alloc().
646  *
647  * @return
648  *   0 on success, errno value on failure.
649  */
650 static int
651 rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
652 		  struct rte_mbuf **pool)
653 {
654 	unsigned int i;
655 	struct rxq_elt_sp (*elts)[elts_n] =
656 		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
657 				  rxq->socket);
658 	int ret = 0;
659 
660 	if (elts == NULL) {
661 		ERROR("%p: can't allocate packets array", (void *)rxq);
662 		ret = ENOMEM;
663 		goto error;
664 	}
665 	/* For each WR (packet). */
666 	for (i = 0; (i != elts_n); ++i) {
667 		unsigned int j;
668 		struct rxq_elt_sp *elt = &(*elts)[i];
669 		struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;
670 
671 		/* These two arrays must have the same size. */
672 		assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
673 		/* For each SGE (segment). */
674 		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
675 			struct ibv_sge *sge = &(*sges)[j];
676 			struct rte_mbuf *buf;
677 
678 			if (pool != NULL) {
679 				buf = *(pool++);
680 				assert(buf != NULL);
681 				rte_pktmbuf_reset(buf);
682 			} else
683 				buf = rte_pktmbuf_alloc(rxq->mp);
684 			if (buf == NULL) {
685 				assert(pool == NULL);
686 				ERROR("%p: empty mbuf pool", (void *)rxq);
687 				ret = ENOMEM;
688 				goto error;
689 			}
690 			elt->bufs[j] = buf;
691 			/* Headroom is reserved by rte_pktmbuf_alloc(). */
692 			assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
693 			/* Buffer is supposed to be empty. */
694 			assert(rte_pktmbuf_data_len(buf) == 0);
695 			assert(rte_pktmbuf_pkt_len(buf) == 0);
696 			/* sge->addr must be able to store a pointer. */
697 			assert(sizeof(sge->addr) >= sizeof(uintptr_t));
698 			if (j == 0) {
699 				/* The first SGE keeps its headroom. */
700 				sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
701 				sge->length = (buf->buf_len -
702 					       RTE_PKTMBUF_HEADROOM);
703 			} else {
704 				/* Subsequent SGEs lose theirs. */
705 				assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
706 				SET_DATA_OFF(buf, 0);
707 				sge->addr = (uintptr_t)buf->buf_addr;
708 				sge->length = buf->buf_len;
709 			}
710 			sge->lkey = rxq->mr->lkey;
711 			/* Redundant check for tailroom. */
712 			assert(sge->length == rte_pktmbuf_tailroom(buf));
713 		}
714 	}
715 	DEBUG("%p: allocated and configured %u WRs (%zu segments)",
716 	      (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
717 	rxq->elts_n = elts_n;
718 	rxq->elts_head = 0;
719 	rxq->elts.sp = elts;
720 	assert(ret == 0);
721 	return 0;
722 error:
723 	if (elts != NULL) {
724 		assert(pool == NULL);
725 		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
726 			unsigned int j;
727 			struct rxq_elt_sp *elt = &(*elts)[i];
728 
729 			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
730 				struct rte_mbuf *buf = elt->bufs[j];
731 
732 				if (buf != NULL)
733 					rte_pktmbuf_free_seg(buf);
734 			}
735 		}
736 		rte_free(elts);
737 	}
738 	DEBUG("%p: failed, freed everything", (void *)rxq);
739 	assert(ret > 0);
740 	return ret;
741 }
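
/*
 * Sizing note (illustration): in scattered mode each WR carries
 * MLX5_PMD_SGE_WR_N segments, so a queue configured with desc descriptors
 * ends up with desc / MLX5_PMD_SGE_WR_N WRs while still holding desc mbufs
 * overall (see rxq_setup() and rxq_rehash()).
 */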
742 
743 /**
744  * Free RX queue elements with scattered packets support.
745  *
746  * @param rxq
747  *   Pointer to RX queue structure.
748  */
749 static void
750 rxq_free_elts_sp(struct rxq *rxq)
751 {
752 	unsigned int i;
753 	unsigned int elts_n = rxq->elts_n;
754 	struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;
755 
756 	DEBUG("%p: freeing WRs", (void *)rxq);
757 	rxq->elts_n = 0;
758 	rxq->elts.sp = NULL;
759 	if (elts == NULL)
760 		return;
761 	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
762 		unsigned int j;
763 		struct rxq_elt_sp *elt = &(*elts)[i];
764 
765 		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
766 			struct rte_mbuf *buf = elt->bufs[j];
767 
768 			if (buf != NULL)
769 				rte_pktmbuf_free_seg(buf);
770 		}
771 	}
772 	rte_free(elts);
773 }
774 
775 /**
776  * Allocate RX queue elements.
777  *
778  * @param rxq
779  *   Pointer to RX queue structure.
780  * @param elts_n
781  *   Number of elements to allocate.
782  * @param[in] pool
783  *   If not NULL, fetch buffers from this array instead of allocating them
784  *   with rte_pktmbuf_alloc().
785  *
786  * @return
787  *   0 on success, errno value on failure.
788  */
789 static int
790 rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
791 {
792 	unsigned int i;
793 	struct rxq_elt (*elts)[elts_n] =
794 		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
795 				  rxq->socket);
796 	int ret = 0;
797 
798 	if (elts == NULL) {
799 		ERROR("%p: can't allocate packets array", (void *)rxq);
800 		ret = ENOMEM;
801 		goto error;
802 	}
803 	/* For each WR (packet). */
804 	for (i = 0; (i != elts_n); ++i) {
805 		struct rxq_elt *elt = &(*elts)[i];
806 		struct ibv_sge *sge = &(*elts)[i].sge;
807 		struct rte_mbuf *buf;
808 
809 		if (pool != NULL) {
810 			buf = *(pool++);
811 			assert(buf != NULL);
812 			rte_pktmbuf_reset(buf);
813 		} else
814 			buf = rte_pktmbuf_alloc(rxq->mp);
815 		if (buf == NULL) {
816 			assert(pool == NULL);
817 			ERROR("%p: empty mbuf pool", (void *)rxq);
818 			ret = ENOMEM;
819 			goto error;
820 		}
821 		elt->buf = buf;
822 		/* Headroom is reserved by rte_pktmbuf_alloc(). */
823 		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
824 		/* Buffer is supposed to be empty. */
825 		assert(rte_pktmbuf_data_len(buf) == 0);
826 		assert(rte_pktmbuf_pkt_len(buf) == 0);
827 		/* sge->addr must be able to store a pointer. */
828 		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
829 		/* SGE keeps its headroom. */
830 		sge->addr = (uintptr_t)
831 			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
832 		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
833 		sge->lkey = rxq->mr->lkey;
834 		/* Redundant check for tailroom. */
835 		assert(sge->length == rte_pktmbuf_tailroom(buf));
836 	}
837 	DEBUG("%p: allocated and configured %u single-segment WRs",
838 	      (void *)rxq, elts_n);
839 	rxq->elts_n = elts_n;
840 	rxq->elts_head = 0;
841 	rxq->elts.no_sp = elts;
842 	assert(ret == 0);
843 	return 0;
844 error:
845 	if (elts != NULL) {
846 		assert(pool == NULL);
847 		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
848 			struct rxq_elt *elt = &(*elts)[i];
849 			struct rte_mbuf *buf = elt->buf;
850 
851 			if (buf != NULL)
852 				rte_pktmbuf_free_seg(buf);
853 		}
854 		rte_free(elts);
855 	}
856 	DEBUG("%p: failed, freed everything", (void *)rxq);
857 	assert(ret > 0);
858 	return ret;
859 }
860 
861 /**
862  * Free RX queue elements.
863  *
864  * @param rxq
865  *   Pointer to RX queue structure.
866  */
867 static void
868 rxq_free_elts(struct rxq *rxq)
869 {
870 	unsigned int i;
871 	unsigned int elts_n = rxq->elts_n;
872 	struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;
873 
874 	DEBUG("%p: freeing WRs", (void *)rxq);
875 	rxq->elts_n = 0;
876 	rxq->elts.no_sp = NULL;
877 	if (elts == NULL)
878 		return;
879 	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
880 		struct rxq_elt *elt = &(*elts)[i];
881 		struct rte_mbuf *buf = elt->buf;
882 
883 		if (buf != NULL)
884 			rte_pktmbuf_free_seg(buf);
885 	}
886 	rte_free(elts);
887 }
888 
889 /**
890  * Clean up an RX queue.
891  *
892  * Destroy objects, free allocated memory and reset the structure for reuse.
893  *
894  * @param rxq
895  *   Pointer to RX queue structure.
896  */
897 void
898 rxq_cleanup(struct rxq *rxq)
899 {
900 	struct ibv_exp_release_intf_params params;
901 
902 	DEBUG("cleaning up %p", (void *)rxq);
903 	if (rxq->sp)
904 		rxq_free_elts_sp(rxq);
905 	else
906 		rxq_free_elts(rxq);
907 	rxq->poll = NULL;
908 	rxq->recv = NULL;
909 	if (rxq->if_wq != NULL) {
910 		assert(rxq->priv != NULL);
911 		assert(rxq->priv->ctx != NULL);
912 		assert(rxq->wq != NULL);
913 		params = (struct ibv_exp_release_intf_params){
914 			.comp_mask = 0,
915 		};
916 		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
917 						rxq->if_wq,
918 						&params));
919 	}
920 	if (rxq->if_cq != NULL) {
921 		assert(rxq->priv != NULL);
922 		assert(rxq->priv->ctx != NULL);
923 		assert(rxq->cq != NULL);
924 		params = (struct ibv_exp_release_intf_params){
925 			.comp_mask = 0,
926 		};
927 		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
928 						rxq->if_cq,
929 						&params));
930 	}
931 	if (rxq->wq != NULL)
932 		claim_zero(ibv_exp_destroy_wq(rxq->wq));
933 	if (rxq->cq != NULL)
934 		claim_zero(ibv_destroy_cq(rxq->cq));
935 	if (rxq->rd != NULL) {
936 		struct ibv_exp_destroy_res_domain_attr attr = {
937 			.comp_mask = 0,
938 		};
939 
940 		assert(rxq->priv != NULL);
941 		assert(rxq->priv->ctx != NULL);
942 		claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
943 						      rxq->rd,
944 						      &attr));
945 	}
946 	if (rxq->mr != NULL)
947 		claim_zero(ibv_dereg_mr(rxq->mr));
948 	memset(rxq, 0, sizeof(*rxq));
949 }
950 
951 /**
952  * Reconfigure an RX queue with new parameters.
953  *
954  * rxq_rehash() does not allocate mbufs: doing so from the wrong thread
955  * (such as a control thread) could corrupt the pool, so previously posted
956  * mbufs are reused instead. A failure may leave the queue unusable.
957  *
958  * @param dev
959  *   Pointer to Ethernet device structure.
960  * @param rxq
961  *   RX queue pointer.
962  *
963  * @return
964  *   0 on success, errno value on failure.
965  */
966 int
967 rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
968 {
969 	struct priv *priv = rxq->priv;
970 	struct rxq tmpl = *rxq;
971 	unsigned int mbuf_n;
972 	unsigned int desc_n;
973 	struct rte_mbuf **pool;
974 	unsigned int i, k;
975 	struct ibv_exp_wq_attr mod;
976 	int err;
977 
978 	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
979 	/* Number of descriptors and mbufs currently allocated. */
980 	desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
981 	mbuf_n = desc_n;
982 	/* Toggle RX checksum offload if hardware supports it. */
983 	if (priv->hw_csum) {
984 		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
985 		rxq->csum = tmpl.csum;
986 	}
987 	if (priv->hw_csum_l2tun) {
988 		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
989 		rxq->csum_l2tun = tmpl.csum_l2tun;
990 	}
991 	/* Enable scattered packets support for this queue if necessary. */
992 	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
993 	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
994 	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
995 		tmpl.sp = 1;
996 		desc_n /= MLX5_PMD_SGE_WR_N;
997 	} else
998 		tmpl.sp = 0;
999 	DEBUG("%p: %s scattered packets support (%u WRs)",
1000 	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
1001 	/* If scatter mode is the same as before, nothing to do. */
1002 	if (tmpl.sp == rxq->sp) {
1003 		DEBUG("%p: nothing to do", (void *)dev);
1004 		return 0;
1005 	}
1006 	/* From now on, any failure will render the queue unusable.
1007 	 * Reinitialize WQ. */
1008 	mod = (struct ibv_exp_wq_attr){
1009 		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
1010 		.wq_state = IBV_EXP_WQS_RESET,
1011 	};
1012 	err = ibv_exp_modify_wq(tmpl.wq, &mod);
1013 	if (err) {
1014 		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
1015 		assert(err > 0);
1016 		return err;
1017 	}
1018 	/* Allocate pool. */
1019 	pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
1020 	if (pool == NULL) {
1021 		ERROR("%p: cannot allocate memory", (void *)dev);
1022 		return ENOBUFS;
1023 	}
1024 	/* Snatch mbufs from original queue. */
1025 	k = 0;
1026 	if (rxq->sp) {
1027 		struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
1028 
1029 		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1030 			struct rxq_elt_sp *elt = &(*elts)[i];
1031 			unsigned int j;
1032 
1033 			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
1034 				assert(elt->bufs[j] != NULL);
1035 				pool[k++] = elt->bufs[j];
1036 			}
1037 		}
1038 	} else {
1039 		struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
1040 
1041 		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1042 			struct rxq_elt *elt = &(*elts)[i];
1043 			struct rte_mbuf *buf = elt->buf;
1044 
1045 			pool[k++] = buf;
1046 		}
1047 	}
1048 	assert(k == mbuf_n);
1049 	tmpl.elts_n = 0;
1050 	tmpl.elts.sp = NULL;
1051 	assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
1052 	err = ((tmpl.sp) ?
1053 	       rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
1054 	       rxq_alloc_elts(&tmpl, desc_n, pool));
1055 	if (err) {
1056 		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
1057 		rte_free(pool);
1058 		assert(err > 0);
1059 		return err;
1060 	}
1061 	assert(tmpl.elts_n == desc_n);
1062 	assert(tmpl.elts.sp != NULL);
1063 	rte_free(pool);
1064 	/* Clean up original data. */
1065 	rxq->elts_n = 0;
1066 	rte_free(rxq->elts.sp);
1067 	rxq->elts.sp = NULL;
1068 	/* Change queue state to ready. */
1069 	mod = (struct ibv_exp_wq_attr){
1070 		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
1071 		.wq_state = IBV_EXP_WQS_RDY,
1072 	};
1073 	err = ibv_exp_modify_wq(tmpl.wq, &mod);
1074 	if (err) {
1075 		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
1076 		      (void *)dev, strerror(err));
1077 		goto error;
1078 	}
1079 	/* Post SGEs. */
1080 	assert(tmpl.if_wq != NULL);
1081 	if (tmpl.sp) {
1082 		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;
1083 
1084 		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1085 			err = tmpl.if_wq->recv_sg_list
1086 				(tmpl.wq,
1087 				 (*elts)[i].sges,
1088 				 RTE_DIM((*elts)[i].sges));
1089 			if (err)
1090 				break;
1091 		}
1092 	} else {
1093 		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;
1094 
1095 		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1096 			err = tmpl.if_wq->recv_burst(
1097 				tmpl.wq,
1098 				&(*elts)[i].sge,
1099 				1);
1100 			if (err)
1101 				break;
1102 		}
1103 	}
1104 	if (err) {
1105 		ERROR("%p: failed to post SGEs with error %d",
1106 		      (void *)dev, err);
1107 		/* The value returned is not a valid errno, use EIO instead. */
1108 		err = EIO;
1109 		goto error;
1110 	}
1111 	if (tmpl.sp)
1112 		tmpl.recv = tmpl.if_wq->recv_sg_list;
1113 	else
1114 		tmpl.recv = tmpl.if_wq->recv_burst;
1115 error:
1116 	*rxq = tmpl;
1117 	assert(err >= 0);
1118 	return err;
1119 }
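
/*
 * Usage sketch (hypothetical control-path caller, e.g. after an MTU change
 * toggles scattered packets support):
 *
 *	for (i = 0; i != priv->rxqs_n; ++i) {
 *		struct rxq *rxq_i = (*priv->rxqs)[i];
 *
 *		if (rxq_i == NULL)
 *			continue;
 *		if (rxq_rehash(priv->dev, rxq_i))
 *			break;
 *	}
 */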
1120 
1121 /**
1122  * Configure an RX queue.
1123  *
1124  * @param dev
1125  *   Pointer to Ethernet device structure.
1126  * @param rxq
1127  *   Pointer to RX queue structure.
1128  * @param desc
1129  *   Number of descriptors to configure in queue.
1130  * @param socket
1131  *   NUMA socket on which memory must be allocated.
1132  * @param[in] conf
1133  *   Thresholds parameters.
1134  * @param mp
1135  *   Memory pool for buffer allocations.
1136  *
1137  * @return
1138  *   0 on success, errno value on failure.
1139  */
1140 int
1141 rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
1142 	  unsigned int socket, const struct rte_eth_rxconf *conf,
1143 	  struct rte_mempool *mp)
1144 {
1145 	struct priv *priv = dev->data->dev_private;
1146 	struct rxq tmpl = {
1147 		.priv = priv,
1148 		.mp = mp,
1149 		.socket = socket
1150 	};
1151 	struct ibv_exp_wq_attr mod;
1152 	union {
1153 		struct ibv_exp_query_intf_params params;
1154 		struct ibv_exp_cq_init_attr cq;
1155 		struct ibv_exp_res_domain_init_attr rd;
1156 		struct ibv_exp_wq_init_attr wq;
1157 	} attr;
1158 	enum ibv_exp_query_intf_status status;
1159 	struct rte_mbuf *buf;
1160 	int ret = 0;
1161 	unsigned int i;
1162 	unsigned int cq_size = desc;
1163 
1164 	(void)conf; /* Thresholds configuration (ignored). */
1165 	if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
1166 		ERROR("%p: invalid number of RX descriptors (must be a"
1167 		      " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
1168 		return EINVAL;
1169 	}
1170 	/* Get mbuf length. */
1171 	buf = rte_pktmbuf_alloc(mp);
1172 	if (buf == NULL) {
1173 		ERROR("%p: unable to allocate mbuf", (void *)dev);
1174 		return ENOMEM;
1175 	}
1176 	tmpl.mb_len = buf->buf_len;
1177 	assert((rte_pktmbuf_headroom(buf) +
1178 		rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
1179 	assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
1180 	rte_pktmbuf_free(buf);
1181 	/* Toggle RX checksum offload if hardware supports it. */
1182 	if (priv->hw_csum)
1183 		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
1184 	if (priv->hw_csum_l2tun)
1185 		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
1186 	/* Enable scattered packets support for this queue if necessary. */
1187 	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
1188 	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
1189 	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
1190 		tmpl.sp = 1;
1191 		desc /= MLX5_PMD_SGE_WR_N;
1192 	}
1193 	DEBUG("%p: %s scattered packets support (%u WRs)",
1194 	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
1195 	/* Use the entire RX mempool as the memory region. */
1196 	tmpl.mr = mlx5_mp2mr(priv->pd, mp);
1197 	if (tmpl.mr == NULL) {
1198 		ret = EINVAL;
1199 		ERROR("%p: MR creation failure: %s",
1200 		      (void *)dev, strerror(ret));
1201 		goto error;
1202 	}
1203 	attr.rd = (struct ibv_exp_res_domain_init_attr){
1204 		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
1205 			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
1206 		.thread_model = IBV_EXP_THREAD_SINGLE,
1207 		.msg_model = IBV_EXP_MSG_HIGH_BW,
1208 	};
1209 	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
1210 	if (tmpl.rd == NULL) {
1211 		ret = ENOMEM;
1212 		ERROR("%p: RD creation failure: %s",
1213 		      (void *)dev, strerror(ret));
1214 		goto error;
1215 	}
1216 	attr.cq = (struct ibv_exp_cq_init_attr){
1217 		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
1218 		.res_domain = tmpl.rd,
1219 	};
1220 	tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
1221 				    &attr.cq);
1222 	if (tmpl.cq == NULL) {
1223 		ret = ENOMEM;
1224 		ERROR("%p: CQ creation failure: %s",
1225 		      (void *)dev, strerror(ret));
1226 		goto error;
1227 	}
1228 	DEBUG("priv->device_attr.max_qp_wr is %d",
1229 	      priv->device_attr.max_qp_wr);
1230 	DEBUG("priv->device_attr.max_sge is %d",
1231 	      priv->device_attr.max_sge);
1232 	/* Configure VLAN stripping. */
1233 	tmpl.vlan_strip = dev->data->dev_conf.rxmode.hw_vlan_strip;
1234 	attr.wq = (struct ibv_exp_wq_init_attr){
1235 		.wq_context = NULL, /* Could be useful in the future. */
1236 		.wq_type = IBV_EXP_WQT_RQ,
1237 		/* Max number of outstanding WRs. */
1238 		.max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
1239 				priv->device_attr.max_qp_wr :
1240 				(int)cq_size),
1241 		/* Max number of scatter/gather elements in a WR. */
1242 		.max_recv_sge = ((priv->device_attr.max_sge <
1243 				  MLX5_PMD_SGE_WR_N) ?
1244 				 priv->device_attr.max_sge :
1245 				 MLX5_PMD_SGE_WR_N),
1246 		.pd = priv->pd,
1247 		.cq = tmpl.cq,
1248 		.comp_mask =
1249 			IBV_EXP_CREATE_WQ_RES_DOMAIN |
1250 #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
1251 			IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
1252 #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
1253 			0,
1254 		.res_domain = tmpl.rd,
1255 #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
1256 		.vlan_offloads = (tmpl.vlan_strip ?
1257 				  IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
1258 				  0),
1259 #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
1260 	};
1261 
1262 #ifdef HAVE_VERBS_FCS
1263 	/* By default, FCS (CRC) is stripped by hardware. */
1264 	if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1265 		tmpl.crc_present = 0;
1266 	} else if (priv->hw_fcs_strip) {
1267 		/* Ask HW/Verbs to leave CRC in place when supported. */
1268 		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
1269 		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1270 		tmpl.crc_present = 1;
1271 	} else {
1272 		WARN("%p: CRC stripping has been disabled but will still"
1273 		     " be performed by hardware, make sure MLNX_OFED and"
1274 		     " firmware are up to date",
1275 		     (void *)dev);
1276 		tmpl.crc_present = 0;
1277 	}
1278 	DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
1279 	      " incoming frames to hide it",
1280 	      (void *)dev,
1281 	      tmpl.crc_present ? "disabled" : "enabled",
1282 	      tmpl.crc_present << 2);
1283 #endif /* HAVE_VERBS_FCS */
1284 
1285 #ifdef HAVE_VERBS_RX_END_PADDING
1286 	if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
1287 		; /* Nothing else to do. */
1288 	else if (priv->hw_padding) {
1289 		INFO("%p: enabling packet padding on queue %p",
1290 		     (void *)dev, (void *)rxq);
1291 		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
1292 		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1293 	} else
1294 		WARN("%p: packet padding has been requested but is not"
1295 		     " supported, make sure MLNX_OFED and firmware are"
1296 		     " up to date",
1297 		     (void *)dev);
1298 #endif /* HAVE_VERBS_RX_END_PADDING */
1299 
1300 	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
1301 	if (tmpl.wq == NULL) {
1302 		ret = (errno ? errno : EINVAL);
1303 		ERROR("%p: WQ creation failure: %s",
1304 		      (void *)dev, strerror(ret));
1305 		goto error;
1306 	}
1307 	if (tmpl.sp)
1308 		ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
1309 	else
1310 		ret = rxq_alloc_elts(&tmpl, desc, NULL);
1311 	if (ret) {
1312 		ERROR("%p: RXQ allocation failed: %s",
1313 		      (void *)dev, strerror(ret));
1314 		goto error;
1315 	}
1316 	/* Save port ID. */
1317 	tmpl.port_id = dev->data->port_id;
1318 	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
1319 	attr.params = (struct ibv_exp_query_intf_params){
1320 		.intf_scope = IBV_EXP_INTF_GLOBAL,
1321 #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
1322 		.intf_version = 1,
1323 #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
1324 		.intf = IBV_EXP_INTF_CQ,
1325 		.obj = tmpl.cq,
1326 	};
1327 	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
1328 	if (tmpl.if_cq == NULL) {
		ret = EINVAL;
1329 		ERROR("%p: CQ interface family query failed with status %d",
1330 		      (void *)dev, status);
1331 		goto error;
1332 	}
1333 	attr.params = (struct ibv_exp_query_intf_params){
1334 		.intf_scope = IBV_EXP_INTF_GLOBAL,
1335 		.intf = IBV_EXP_INTF_WQ,
1336 		.obj = tmpl.wq,
1337 	};
1338 	tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
1339 	if (tmpl.if_wq == NULL) {
1340 	if (tmpl.if_wq == NULL) {
		ret = EINVAL;
1341 		      (void *)dev, status);
1342 		goto error;
1343 	}
1344 	/* Change queue state to ready. */
1345 	mod = (struct ibv_exp_wq_attr){
1346 		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
1347 		.wq_state = IBV_EXP_WQS_RDY,
1348 	};
1349 	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
1350 	if (ret) {
1351 		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
1352 		      (void *)dev, strerror(ret));
1353 		goto error;
1354 	}
1355 	/* Post SGEs. */
1356 	if (tmpl.sp) {
1357 		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;
1358 
1359 		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1360 			ret = tmpl.if_wq->recv_sg_list
1361 				(tmpl.wq,
1362 				 (*elts)[i].sges,
1363 				 RTE_DIM((*elts)[i].sges));
1364 			if (ret)
1365 				break;
1366 		}
1367 	} else {
1368 		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;
1369 
1370 		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1371 			ret = tmpl.if_wq->recv_burst(
1372 				tmpl.wq,
1373 				&(*elts)[i].sge,
1374 				1);
1375 			if (ret)
1376 				break;
1377 		}
1378 	}
1379 	if (ret) {
1380 		ERROR("%p: failed to post SGEs with error %d",
1381 		      (void *)dev, ret);
1382 		/* The value returned is not a valid errno, use EIO instead. */
1383 		ret = EIO;
1384 		goto error;
1385 	}
1386 	/* Clean up rxq in case we're reinitializing it. */
1387 	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
1388 	rxq_cleanup(rxq);
1389 	*rxq = tmpl;
1390 	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
1391 	assert(ret == 0);
1392 	/* Assign function in queue. */
1393 #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
1394 	rxq->poll = rxq->if_cq->poll_length_flags_cvlan;
1395 #else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
1396 	rxq->poll = rxq->if_cq->poll_length_flags;
1397 #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
1398 	if (rxq->sp)
1399 		rxq->recv = rxq->if_wq->recv_sg_list;
1400 	else
1401 		rxq->recv = rxq->if_wq->recv_burst;
1402 	return 0;
1403 error:
1404 	rxq_cleanup(&tmpl);
1405 	assert(ret > 0);
1406 	return ret;
1407 }
1408 
1409 /**
1410  * DPDK callback to configure an RX queue.
1411  *
1412  * @param dev
1413  *   Pointer to Ethernet device structure.
1414  * @param idx
1415  *   RX queue index.
1416  * @param desc
1417  *   Number of descriptors to configure in queue.
1418  * @param socket
1419  *   NUMA socket on which memory must be allocated.
1420  * @param[in] conf
1421  *   Thresholds parameters.
1422  * @param mp
1423  *   Memory pool for buffer allocations.
1424  *
1425  * @return
1426  *   0 on success, negative errno value on failure.
1427  */
1428 int
1429 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1430 		    unsigned int socket, const struct rte_eth_rxconf *conf,
1431 		    struct rte_mempool *mp)
1432 {
1433 	struct priv *priv = dev->data->dev_private;
1434 	struct rxq *rxq = (*priv->rxqs)[idx];
1435 	int ret;
1436 
1437 	if (mlx5_is_secondary())
1438 		return -E_RTE_SECONDARY;
1439 
1440 	priv_lock(priv);
1441 	DEBUG("%p: configuring queue %u for %u descriptors",
1442 	      (void *)dev, idx, desc);
1443 	if (idx >= priv->rxqs_n) {
1444 		ERROR("%p: queue index out of range (%u >= %u)",
1445 		      (void *)dev, idx, priv->rxqs_n);
1446 		priv_unlock(priv);
1447 		return -EOVERFLOW;
1448 	}
1449 	if (rxq != NULL) {
1450 		DEBUG("%p: reusing already allocated queue index %u (%p)",
1451 		      (void *)dev, idx, (void *)rxq);
1452 		if (priv->started) {
1453 			priv_unlock(priv);
1454 			return -EEXIST;
1455 		}
1456 		(*priv->rxqs)[idx] = NULL;
1457 		rxq_cleanup(rxq);
1458 	} else {
1459 		rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
1460 		if (rxq == NULL) {
1461 			ERROR("%p: unable to allocate queue index %u",
1462 			      (void *)dev, idx);
1463 			priv_unlock(priv);
1464 			return -ENOMEM;
1465 		}
1466 	}
1467 	ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
1468 	if (ret)
1469 		rte_free(rxq);
1470 	else {
1471 		rxq->stats.idx = idx;
1472 		DEBUG("%p: adding RX queue %p to list",
1473 		      (void *)dev, (void *)rxq);
1474 		(*priv->rxqs)[idx] = rxq;
1475 		/* Update receive callback. */
1476 		if (rxq->sp)
1477 			dev->rx_pkt_burst = mlx5_rx_burst_sp;
1478 		else
1479 			dev->rx_pkt_burst = mlx5_rx_burst;
1480 	}
1481 	priv_unlock(priv);
1482 	return -ret;
1483 }
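
/*
 * Application-side sketch (illustrative): this callback is reached through the
 * generic ethdev API, e.g.:
 *
 *	ret = rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
 *				     NULL, mbuf_pool);
 *	if (ret < 0)
 *		rte_exit(EXIT_FAILURE, "RX queue setup failed\n");
 *
 * Note that desc must be a multiple of MLX5_PMD_SGE_WR_N (see rxq_setup()).
 */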
1484 
1485 /**
1486  * DPDK callback to release an RX queue.
1487  *
1488  * @param dpdk_rxq
1489  *   Generic RX queue pointer.
1490  */
1491 void
1492 mlx5_rx_queue_release(void *dpdk_rxq)
1493 {
1494 	struct rxq *rxq = (struct rxq *)dpdk_rxq;
1495 	struct priv *priv;
1496 	unsigned int i;
1497 
1498 	if (mlx5_is_secondary())
1499 		return;
1500 
1501 	if (rxq == NULL)
1502 		return;
1503 	priv = rxq->priv;
1504 	priv_lock(priv);
1505 	for (i = 0; (i != priv->rxqs_n); ++i)
1506 		if ((*priv->rxqs)[i] == rxq) {
1507 			DEBUG("%p: removing RX queue %p from list",
1508 			      (void *)priv->dev, (void *)rxq);
1509 			(*priv->rxqs)[i] = NULL;
1510 			break;
1511 		}
1512 	rxq_cleanup(rxq);
1513 	rte_free(rxq);
1514 	priv_unlock(priv);
1515 }
1516 
1517 /**
1518  * DPDK callback for RX in secondary processes.
1519  *
1520  * This function configures all queues from primary process information
1521  * if necessary before reverting to the normal RX burst callback.
1522  *
1523  * @param dpdk_rxq
1524  *   Generic pointer to RX queue structure.
1525  * @param[out] pkts
1526  *   Array to store received packets.
1527  * @param pkts_n
1528  *   Maximum number of packets in array.
1529  *
1530  * @return
1531  *   Number of packets successfully received (<= pkts_n).
1532  */
1533 uint16_t
1534 mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
1535 			      uint16_t pkts_n)
1536 {
1537 	struct rxq *rxq = dpdk_rxq;
1538 	struct priv *priv = mlx5_secondary_data_setup(rxq->priv);
1539 	struct priv *primary_priv;
1540 	unsigned int index;
1541 
1542 	if (priv == NULL)
1543 		return 0;
1544 	primary_priv =
1545 		mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
1546 	/* Look for queue index in both private structures. */
1547 	for (index = 0; index != priv->rxqs_n; ++index)
1548 		if (((*primary_priv->rxqs)[index] == rxq) ||
1549 		    ((*priv->rxqs)[index] == rxq))
1550 			break;
1551 	if (index == priv->rxqs_n)
1552 		return 0;
1553 	rxq = (*priv->rxqs)[index];
1554 	return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
1555 }
1556