xref: /dpdk/drivers/net/sfc/sfc_rx.c (revision 5ecb687a5698d2d8ec1f3b3b5a7a16bceca3e29c)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  *
3  * Copyright (c) 2016-2018 Solarflare Communications Inc.
4  * All rights reserved.
5  *
6  * This software was jointly developed between OKTET Labs (under contract
7  * for Solarflare) and Solarflare Communications, Inc.
8  */
9 
10 #include <rte_mempool.h>
11 
12 #include "efx.h"
13 
14 #include "sfc.h"
15 #include "sfc_debug.h"
16 #include "sfc_log.h"
17 #include "sfc_ev.h"
18 #include "sfc_rx.h"
19 #include "sfc_kvargs.h"
20 #include "sfc_tweak.h"
21 
/*
 * Maximum number of Rx queue flush attempts in the case of failure or
 * flush timeout
 */
26 #define SFC_RX_QFLUSH_ATTEMPTS		(3)
27 
28 /*
29  * Time to wait between event queue polling attempts when waiting for Rx
30  * queue flush done or failed events.
31  */
32 #define SFC_RX_QFLUSH_POLL_WAIT_MS	(1)
33 
34 /*
35  * Maximum number of event queue polling attempts when waiting for Rx queue
36  * flush done or failed events. It defines Rx queue flush attempt timeout
37  * together with SFC_RX_QFLUSH_POLL_WAIT_MS.
38  */
39 #define SFC_RX_QFLUSH_POLL_ATTEMPTS	(2000)
40 
41 void
42 sfc_rx_qflush_done(struct sfc_rxq_info *rxq_info)
43 {
44 	rxq_info->state |= SFC_RXQ_FLUSHED;
45 	rxq_info->state &= ~SFC_RXQ_FLUSHING;
46 }
47 
48 void
49 sfc_rx_qflush_failed(struct sfc_rxq_info *rxq_info)
50 {
51 	rxq_info->state |= SFC_RXQ_FLUSH_FAILED;
52 	rxq_info->state &= ~SFC_RXQ_FLUSHING;
53 }
54 
55 static void
56 sfc_efx_rx_qrefill(struct sfc_efx_rxq *rxq)
57 {
58 	unsigned int free_space;
59 	unsigned int bulks;
60 	void *objs[SFC_RX_REFILL_BULK];
61 	efsys_dma_addr_t addr[RTE_DIM(objs)];
62 	unsigned int added = rxq->added;
63 	unsigned int id;
64 	unsigned int i;
65 	struct sfc_efx_rx_sw_desc *rxd;
66 	struct rte_mbuf *m;
67 	uint16_t port_id = rxq->dp.dpq.port_id;
68 
69 	free_space = rxq->max_fill_level - (added - rxq->completed);
70 
71 	if (free_space < rxq->refill_threshold)
72 		return;
73 
74 	bulks = free_space / RTE_DIM(objs);
75 	/* refill_threshold guarantees that bulks is positive */
76 	SFC_ASSERT(bulks > 0);
77 
78 	id = added & rxq->ptr_mask;
79 	do {
80 		if (unlikely(rte_mempool_get_bulk(rxq->refill_mb_pool, objs,
81 						  RTE_DIM(objs)) < 0)) {
82 			/*
83 			 * It is hardly a safe way to increment counter
84 			 * from different contexts, but all PMDs do it.
85 			 */
86 			rxq->evq->sa->eth_dev->data->rx_mbuf_alloc_failed +=
87 				RTE_DIM(objs);
88 			/* Return if we have posted nothing yet */
89 			if (added == rxq->added)
90 				return;
91 			/* Push posted */
92 			break;
93 		}
94 
95 		for (i = 0; i < RTE_DIM(objs);
96 		     ++i, id = (id + 1) & rxq->ptr_mask) {
97 			m = objs[i];
98 
99 			MBUF_RAW_ALLOC_CHECK(m);
100 
101 			rxd = &rxq->sw_desc[id];
102 			rxd->mbuf = m;
103 
104 			m->data_off = RTE_PKTMBUF_HEADROOM;
105 			m->port = port_id;
106 
107 			addr[i] = rte_pktmbuf_iova(m);
108 		}
109 
110 		efx_rx_qpost(rxq->common, addr, rxq->buf_size,
111 			     RTE_DIM(objs), rxq->completed, added);
112 		added += RTE_DIM(objs);
113 	} while (--bulks > 0);
114 
115 	SFC_ASSERT(added != rxq->added);
116 	rxq->added = added;
117 	efx_rx_qpush(rxq->common, added, &rxq->pushed);
118 }
119 
120 static uint64_t
121 sfc_efx_rx_desc_flags_to_offload_flags(const unsigned int desc_flags)
122 {
123 	uint64_t mbuf_flags = 0;
124 
125 	switch (desc_flags & (EFX_PKT_IPV4 | EFX_CKSUM_IPV4)) {
126 	case (EFX_PKT_IPV4 | EFX_CKSUM_IPV4):
127 		mbuf_flags |= PKT_RX_IP_CKSUM_GOOD;
128 		break;
129 	case EFX_PKT_IPV4:
130 		mbuf_flags |= PKT_RX_IP_CKSUM_BAD;
131 		break;
132 	default:
133 		RTE_BUILD_BUG_ON(PKT_RX_IP_CKSUM_UNKNOWN != 0);
134 		SFC_ASSERT((mbuf_flags & PKT_RX_IP_CKSUM_MASK) ==
135 			   PKT_RX_IP_CKSUM_UNKNOWN);
136 		break;
137 	}
138 
139 	switch ((desc_flags &
140 		 (EFX_PKT_TCP | EFX_PKT_UDP | EFX_CKSUM_TCPUDP))) {
141 	case (EFX_PKT_TCP | EFX_CKSUM_TCPUDP):
142 	case (EFX_PKT_UDP | EFX_CKSUM_TCPUDP):
143 		mbuf_flags |= PKT_RX_L4_CKSUM_GOOD;
144 		break;
145 	case EFX_PKT_TCP:
146 	case EFX_PKT_UDP:
147 		mbuf_flags |= PKT_RX_L4_CKSUM_BAD;
148 		break;
149 	default:
150 		RTE_BUILD_BUG_ON(PKT_RX_L4_CKSUM_UNKNOWN != 0);
151 		SFC_ASSERT((mbuf_flags & PKT_RX_L4_CKSUM_MASK) ==
152 			   PKT_RX_L4_CKSUM_UNKNOWN);
153 		break;
154 	}
155 
156 	return mbuf_flags;
157 }
158 
159 static uint32_t
160 sfc_efx_rx_desc_flags_to_packet_type(const unsigned int desc_flags)
161 {
162 	return RTE_PTYPE_L2_ETHER |
163 		((desc_flags & EFX_PKT_IPV4) ?
164 			RTE_PTYPE_L3_IPV4_EXT_UNKNOWN : 0) |
165 		((desc_flags & EFX_PKT_IPV6) ?
166 			RTE_PTYPE_L3_IPV6_EXT_UNKNOWN : 0) |
167 		((desc_flags & EFX_PKT_TCP) ? RTE_PTYPE_L4_TCP : 0) |
168 		((desc_flags & EFX_PKT_UDP) ? RTE_PTYPE_L4_UDP : 0);
169 }
170 
171 static const uint32_t *
172 sfc_efx_supported_ptypes_get(__rte_unused uint32_t tunnel_encaps)
173 {
174 	static const uint32_t ptypes[] = {
175 		RTE_PTYPE_L2_ETHER,
176 		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
177 		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
178 		RTE_PTYPE_L4_TCP,
179 		RTE_PTYPE_L4_UDP,
180 		RTE_PTYPE_UNKNOWN
181 	};
182 
183 	return ptypes;
184 }
185 
186 static void
187 sfc_efx_rx_set_rss_hash(struct sfc_efx_rxq *rxq, unsigned int flags,
188 			struct rte_mbuf *m)
189 {
190 	uint8_t *mbuf_data;
191 
192 
193 	if ((rxq->flags & SFC_EFX_RXQ_FLAG_RSS_HASH) == 0)
194 		return;
195 
196 	mbuf_data = rte_pktmbuf_mtod(m, uint8_t *);
197 
198 	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
199 		m->hash.rss = efx_pseudo_hdr_hash_get(rxq->common,
200 						      EFX_RX_HASHALG_TOEPLITZ,
201 						      mbuf_data);
202 
203 		m->ol_flags |= PKT_RX_RSS_HASH;
204 	}
205 }
206 
207 static uint16_t
208 sfc_efx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
209 {
210 	struct sfc_dp_rxq *dp_rxq = rx_queue;
211 	struct sfc_efx_rxq *rxq = sfc_efx_rxq_by_dp_rxq(dp_rxq);
212 	unsigned int completed;
213 	unsigned int prefix_size = rxq->prefix_size;
214 	unsigned int done_pkts = 0;
215 	boolean_t discard_next = B_FALSE;
216 	struct rte_mbuf *scatter_pkt = NULL;
217 
218 	if (unlikely((rxq->flags & SFC_EFX_RXQ_FLAG_RUNNING) == 0))
219 		return 0;
220 
221 	sfc_ev_qpoll(rxq->evq);
222 
223 	completed = rxq->completed;
224 	while (completed != rxq->pending && done_pkts < nb_pkts) {
225 		unsigned int id;
226 		struct sfc_efx_rx_sw_desc *rxd;
227 		struct rte_mbuf *m;
228 		unsigned int seg_len;
229 		unsigned int desc_flags;
230 
231 		id = completed++ & rxq->ptr_mask;
232 		rxd = &rxq->sw_desc[id];
233 		m = rxd->mbuf;
234 		desc_flags = rxd->flags;
235 
236 		if (discard_next)
237 			goto discard;
238 
239 		if (desc_flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
240 			goto discard;
241 
242 		if (desc_flags & EFX_PKT_PREFIX_LEN) {
243 			uint16_t tmp_size;
244 			int rc __rte_unused;
245 
246 			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
247 				rte_pktmbuf_mtod(m, uint8_t *), &tmp_size);
248 			SFC_ASSERT(rc == 0);
249 			seg_len = tmp_size;
250 		} else {
251 			seg_len = rxd->size - prefix_size;
252 		}
253 
254 		rte_pktmbuf_data_len(m) = seg_len;
255 		rte_pktmbuf_pkt_len(m) = seg_len;
256 
257 		if (scatter_pkt != NULL) {
258 			if (rte_pktmbuf_chain(scatter_pkt, m) != 0) {
259 				rte_pktmbuf_free(scatter_pkt);
260 				goto discard;
261 			}
262 			/* The packet to deliver */
263 			m = scatter_pkt;
264 		}
265 
266 		if (desc_flags & EFX_PKT_CONT) {
267 			/* The packet is scattered, more fragments to come */
268 			scatter_pkt = m;
269 			/* Further fragments have no prefix */
270 			prefix_size = 0;
271 			continue;
272 		}
273 
274 		/* Scattered packet is done */
275 		scatter_pkt = NULL;
276 		/* The first fragment of the packet has prefix */
277 		prefix_size = rxq->prefix_size;
278 
279 		m->ol_flags =
280 			sfc_efx_rx_desc_flags_to_offload_flags(desc_flags);
281 		m->packet_type =
282 			sfc_efx_rx_desc_flags_to_packet_type(desc_flags);
283 
284 		/*
285 		 * Extract RSS hash from the packet prefix and
286 		 * set the corresponding field (if needed and possible)
287 		 */
288 		sfc_efx_rx_set_rss_hash(rxq, desc_flags, m);
289 
290 		m->data_off += prefix_size;
291 
292 		*rx_pkts++ = m;
293 		done_pkts++;
294 		continue;
295 
296 discard:
297 		discard_next = ((desc_flags & EFX_PKT_CONT) != 0);
298 		rte_mbuf_raw_free(m);
299 		rxd->mbuf = NULL;
300 	}
301 
302 	/* pending is only moved when entire packet is received */
303 	SFC_ASSERT(scatter_pkt == NULL);
304 
305 	rxq->completed = completed;
306 
307 	sfc_efx_rx_qrefill(rxq);
308 
309 	return done_pkts;
310 }
311 
312 static sfc_dp_rx_qdesc_npending_t sfc_efx_rx_qdesc_npending;
313 static unsigned int
314 sfc_efx_rx_qdesc_npending(struct sfc_dp_rxq *dp_rxq)
315 {
316 	struct sfc_efx_rxq *rxq = sfc_efx_rxq_by_dp_rxq(dp_rxq);
317 
318 	if ((rxq->flags & SFC_EFX_RXQ_FLAG_RUNNING) == 0)
319 		return 0;
320 
321 	sfc_ev_qpoll(rxq->evq);
322 
323 	return rxq->pending - rxq->completed;
324 }
325 
326 static sfc_dp_rx_qdesc_status_t sfc_efx_rx_qdesc_status;
327 static int
328 sfc_efx_rx_qdesc_status(struct sfc_dp_rxq *dp_rxq, uint16_t offset)
329 {
330 	struct sfc_efx_rxq *rxq = sfc_efx_rxq_by_dp_rxq(dp_rxq);
331 
332 	if (unlikely(offset > rxq->ptr_mask))
333 		return -EINVAL;
334 
335 	/*
336 	 * Poll EvQ to derive up-to-date 'rxq->pending' figure;
337 	 * it is required for the queue to be running, but the
338 	 * check is omitted because API design assumes that it
339 	 * is the duty of the caller to satisfy all conditions
340 	 */
341 	SFC_ASSERT((rxq->flags & SFC_EFX_RXQ_FLAG_RUNNING) ==
342 		   SFC_EFX_RXQ_FLAG_RUNNING);
343 	sfc_ev_qpoll(rxq->evq);
344 
345 	/*
346 	 * There is a handful of reserved entries in the ring,
347 	 * but an explicit check whether the offset points to
348 	 * a reserved entry is neglected since the two checks
349 	 * below rely on the figures which take the HW limits
350 	 * into account and thus if an entry is reserved, the
351 	 * checks will fail and UNAVAIL code will be returned
352 	 */
353 
354 	if (offset < (rxq->pending - rxq->completed))
355 		return RTE_ETH_RX_DESC_DONE;
356 
357 	if (offset < (rxq->added - rxq->completed))
358 		return RTE_ETH_RX_DESC_AVAIL;
359 
360 	return RTE_ETH_RX_DESC_UNAVAIL;
361 }
362 
363 /** Get Rx datapath ops by the datapath RxQ handle */
364 const struct sfc_dp_rx *
365 sfc_dp_rx_by_dp_rxq(const struct sfc_dp_rxq *dp_rxq)
366 {
367 	const struct sfc_dp_queue *dpq = &dp_rxq->dpq;
368 	struct rte_eth_dev *eth_dev;
369 	struct sfc_adapter_priv *sap;
370 
371 	SFC_ASSERT(rte_eth_dev_is_valid_port(dpq->port_id));
372 	eth_dev = &rte_eth_devices[dpq->port_id];
373 
374 	sap = sfc_adapter_priv_by_eth_dev(eth_dev);
375 
376 	return sap->dp_rx;
377 }
378 
379 struct sfc_rxq_info *
380 sfc_rxq_info_by_dp_rxq(const struct sfc_dp_rxq *dp_rxq)
381 {
382 	const struct sfc_dp_queue *dpq = &dp_rxq->dpq;
383 	struct rte_eth_dev *eth_dev;
384 	struct sfc_adapter_shared *sas;
385 
386 	SFC_ASSERT(rte_eth_dev_is_valid_port(dpq->port_id));
387 	eth_dev = &rte_eth_devices[dpq->port_id];
388 
389 	sas = sfc_adapter_shared_by_eth_dev(eth_dev);
390 
391 	SFC_ASSERT(dpq->queue_id < sas->rxq_count);
392 	return &sas->rxq_info[dpq->queue_id];
393 }
394 
395 struct sfc_rxq *
396 sfc_rxq_by_dp_rxq(const struct sfc_dp_rxq *dp_rxq)
397 {
398 	const struct sfc_dp_queue *dpq = &dp_rxq->dpq;
399 	struct rte_eth_dev *eth_dev;
400 	struct sfc_adapter *sa;
401 
402 	SFC_ASSERT(rte_eth_dev_is_valid_port(dpq->port_id));
403 	eth_dev = &rte_eth_devices[dpq->port_id];
404 
405 	sa = sfc_adapter_by_eth_dev(eth_dev);
406 
407 	SFC_ASSERT(dpq->queue_id < sfc_sa2shared(sa)->rxq_count);
408 	return &sa->rxq_ctrl[dpq->queue_id];
409 }
410 
411 static sfc_dp_rx_qsize_up_rings_t sfc_efx_rx_qsize_up_rings;
412 static int
413 sfc_efx_rx_qsize_up_rings(uint16_t nb_rx_desc,
414 			  __rte_unused struct sfc_dp_rx_hw_limits *limits,
415 			  __rte_unused struct rte_mempool *mb_pool,
416 			  unsigned int *rxq_entries,
417 			  unsigned int *evq_entries,
418 			  unsigned int *rxq_max_fill_level)
419 {
420 	*rxq_entries = nb_rx_desc;
421 	*evq_entries = nb_rx_desc;
422 	*rxq_max_fill_level = EFX_RXQ_LIMIT(*rxq_entries);
423 	return 0;
424 }
425 
426 static sfc_dp_rx_qcreate_t sfc_efx_rx_qcreate;
427 static int
428 sfc_efx_rx_qcreate(uint16_t port_id, uint16_t queue_id,
429 		   const struct rte_pci_addr *pci_addr, int socket_id,
430 		   const struct sfc_dp_rx_qcreate_info *info,
431 		   struct sfc_dp_rxq **dp_rxqp)
432 {
433 	struct sfc_efx_rxq *rxq;
434 	int rc;
435 
436 	rc = ENOMEM;
437 	rxq = rte_zmalloc_socket("sfc-efx-rxq", sizeof(*rxq),
438 				 RTE_CACHE_LINE_SIZE, socket_id);
439 	if (rxq == NULL)
440 		goto fail_rxq_alloc;
441 
442 	sfc_dp_queue_init(&rxq->dp.dpq, port_id, queue_id, pci_addr);
443 
444 	rc = ENOMEM;
445 	rxq->sw_desc = rte_calloc_socket("sfc-efx-rxq-sw_desc",
446 					 info->rxq_entries,
447 					 sizeof(*rxq->sw_desc),
448 					 RTE_CACHE_LINE_SIZE, socket_id);
449 	if (rxq->sw_desc == NULL)
450 		goto fail_desc_alloc;
451 
452 	/* efx datapath is bound to efx control path */
453 	rxq->evq = sfc_rxq_by_dp_rxq(&rxq->dp)->evq;
454 	if (info->flags & SFC_RXQ_FLAG_RSS_HASH)
455 		rxq->flags |= SFC_EFX_RXQ_FLAG_RSS_HASH;
456 	rxq->ptr_mask = info->rxq_entries - 1;
457 	rxq->batch_max = info->batch_max;
458 	rxq->prefix_size = info->prefix_size;
459 	rxq->max_fill_level = info->max_fill_level;
460 	rxq->refill_threshold = info->refill_threshold;
461 	rxq->buf_size = info->buf_size;
462 	rxq->refill_mb_pool = info->refill_mb_pool;
463 
464 	*dp_rxqp = &rxq->dp;
465 	return 0;
466 
467 fail_desc_alloc:
468 	rte_free(rxq);
469 
470 fail_rxq_alloc:
471 	return rc;
472 }
473 
474 static sfc_dp_rx_qdestroy_t sfc_efx_rx_qdestroy;
475 static void
476 sfc_efx_rx_qdestroy(struct sfc_dp_rxq *dp_rxq)
477 {
478 	struct sfc_efx_rxq *rxq = sfc_efx_rxq_by_dp_rxq(dp_rxq);
479 
480 	rte_free(rxq->sw_desc);
481 	rte_free(rxq);
482 }
483 
484 static sfc_dp_rx_qstart_t sfc_efx_rx_qstart;
485 static int
486 sfc_efx_rx_qstart(struct sfc_dp_rxq *dp_rxq,
487 		  __rte_unused unsigned int evq_read_ptr)
488 {
489 	/* libefx-based datapath is specific to libefx-based PMD */
490 	struct sfc_efx_rxq *rxq = sfc_efx_rxq_by_dp_rxq(dp_rxq);
491 	struct sfc_rxq *crxq = sfc_rxq_by_dp_rxq(dp_rxq);
492 
493 	rxq->common = crxq->common;
494 
495 	rxq->pending = rxq->completed = rxq->added = rxq->pushed = 0;
496 
497 	sfc_efx_rx_qrefill(rxq);
498 
499 	rxq->flags |= (SFC_EFX_RXQ_FLAG_STARTED | SFC_EFX_RXQ_FLAG_RUNNING);
500 
501 	return 0;
502 }
503 
504 static sfc_dp_rx_qstop_t sfc_efx_rx_qstop;
505 static void
506 sfc_efx_rx_qstop(struct sfc_dp_rxq *dp_rxq,
507 		 __rte_unused unsigned int *evq_read_ptr)
508 {
509 	struct sfc_efx_rxq *rxq = sfc_efx_rxq_by_dp_rxq(dp_rxq);
510 
511 	rxq->flags &= ~SFC_EFX_RXQ_FLAG_RUNNING;
512 
513 	/* libefx-based datapath is bound to libefx-based PMD and uses
514 	 * event queue structure directly. So, there is no necessity to
515 	 * return EvQ read pointer.
516 	 */
517 }
518 
519 static sfc_dp_rx_qpurge_t sfc_efx_rx_qpurge;
520 static void
521 sfc_efx_rx_qpurge(struct sfc_dp_rxq *dp_rxq)
522 {
523 	struct sfc_efx_rxq *rxq = sfc_efx_rxq_by_dp_rxq(dp_rxq);
524 	unsigned int i;
525 	struct sfc_efx_rx_sw_desc *rxd;
526 
527 	for (i = rxq->completed; i != rxq->added; ++i) {
528 		rxd = &rxq->sw_desc[i & rxq->ptr_mask];
529 		rte_mbuf_raw_free(rxd->mbuf);
530 		rxd->mbuf = NULL;
531 		/* Packed stream relies on 0 in inactive SW desc.
532 		 * Rx queue stop is not performance critical, so
533 		 * there is no harm to do it always.
534 		 */
535 		rxd->flags = 0;
536 		rxd->size = 0;
537 	}
538 
539 	rxq->flags &= ~SFC_EFX_RXQ_FLAG_STARTED;
540 }
541 
/*
 * Rx datapath definition of the libefx-based datapath: identification,
 * advertised features and the table of callbacks implemented above.
 */
struct sfc_dp_rx sfc_efx_rx = {
	.dp = {
		.name		= SFC_KVARG_DATAPATH_EFX,
		.type		= SFC_DP_RX,
		/* No HW/FW capability bits are set for this datapath */
		.hw_fw_caps	= 0,
	},
	/* Scattered Rx and checksum offload are handled in pkt_burst */
	.features		= SFC_DP_RX_FEAT_SCATTER |
				  SFC_DP_RX_FEAT_CHECKSUM,
	.qsize_up_rings		= sfc_efx_rx_qsize_up_rings,
	.qcreate		= sfc_efx_rx_qcreate,
	.qdestroy		= sfc_efx_rx_qdestroy,
	.qstart			= sfc_efx_rx_qstart,
	.qstop			= sfc_efx_rx_qstop,
	.qpurge			= sfc_efx_rx_qpurge,
	.supported_ptypes_get	= sfc_efx_supported_ptypes_get,
	.qdesc_npending		= sfc_efx_rx_qdesc_npending,
	.qdesc_status		= sfc_efx_rx_qdesc_status,
	.pkt_burst		= sfc_efx_recv_pkts,
};
561 
/*
 * Flush the HW Rx queue with the given software index and purge the
 * datapath queue afterwards.
 *
 * The flush is requested up to SFC_RX_QFLUSH_ATTEMPTS times. After each
 * request the event queue is polled until the FLUSHING bit is cleared
 * (which sfc_rx_qflush_done() and sfc_rx_qflush_failed() do) or the
 * polling attempts are exhausted. The datapath purge callback is invoked
 * regardless of the flush outcome.
 */
static void
sfc_rx_qflush(struct sfc_adapter *sa, unsigned int sw_index)
{
	struct sfc_rxq_info *rxq_info;
	struct sfc_rxq *rxq;
	unsigned int retry_count;
	unsigned int wait_count;
	int rc;

	rxq_info = &sfc_sa2shared(sa)->rxq_info[sw_index];
	SFC_ASSERT(rxq_info->state & SFC_RXQ_STARTED);

	rxq = &sa->rxq_ctrl[sw_index];

	/*
	 * Retry Rx queue flushing in the case of flush failed or
	 * timeout. In the worst case it can delay for 6 seconds.
	 */
	for (retry_count = 0;
	     ((rxq_info->state & SFC_RXQ_FLUSHED) == 0) &&
	     (retry_count < SFC_RX_QFLUSH_ATTEMPTS);
	     ++retry_count) {
		rc = efx_rx_qflush(rxq->common);
		if (rc != 0) {
			/*
			 * The request itself was rejected: EALREADY is
			 * treated as flushed, anything else as a failure;
			 * no further attempts are made.
			 */
			rxq_info->state |= (rc == EALREADY) ?
				SFC_RXQ_FLUSHED : SFC_RXQ_FLUSH_FAILED;
			break;
		}
		/* Forget the outcome of the previous attempt */
		rxq_info->state &= ~SFC_RXQ_FLUSH_FAILED;
		rxq_info->state |= SFC_RXQ_FLUSHING;

		/*
		 * Wait for Rx queue flush done or failed event at least
		 * SFC_RX_QFLUSH_POLL_WAIT_MS milliseconds and not more
		 * than 2 seconds (SFC_RX_QFLUSH_POLL_WAIT_MS multiplied
		 * by SFC_RX_QFLUSH_POLL_ATTEMPTS).
		 */
		wait_count = 0;
		do {
			rte_delay_ms(SFC_RX_QFLUSH_POLL_WAIT_MS);
			sfc_ev_qpoll(rxq->evq);
		} while ((rxq_info->state & SFC_RXQ_FLUSHING) &&
			 (wait_count++ < SFC_RX_QFLUSH_POLL_ATTEMPTS));

		/* FLUSHING still set means no done/failed event arrived */
		if (rxq_info->state & SFC_RXQ_FLUSHING)
			sfc_err(sa, "RxQ %u flush timed out", sw_index);

		if (rxq_info->state & SFC_RXQ_FLUSH_FAILED)
			sfc_err(sa, "RxQ %u flush failed", sw_index);

		if (rxq_info->state & SFC_RXQ_FLUSHED)
			sfc_notice(sa, "RxQ %u flushed", sw_index);
	}

	/* Let the datapath release whatever is left in the ring */
	sa->priv.dp_rx->qpurge(rxq_info->dp);
}
618 
619 static int
620 sfc_rx_default_rxq_set_filter(struct sfc_adapter *sa, struct sfc_rxq *rxq)
621 {
622 	struct sfc_rss *rss = &sfc_sa2shared(sa)->rss;
623 	boolean_t need_rss = (rss->channels > 0) ? B_TRUE : B_FALSE;
624 	struct sfc_port *port = &sa->port;
625 	int rc;
626 
627 	/*
628 	 * If promiscuous or all-multicast mode has been requested, setting
629 	 * filter for the default Rx queue might fail, in particular, while
630 	 * running over PCI function which is not a member of corresponding
631 	 * privilege groups; if this occurs, few iterations will be made to
632 	 * repeat this step without promiscuous and all-multicast flags set
633 	 */
634 retry:
635 	rc = efx_mac_filter_default_rxq_set(sa->nic, rxq->common, need_rss);
636 	if (rc == 0)
637 		return 0;
638 	else if (rc != EOPNOTSUPP)
639 		return rc;
640 
641 	if (port->promisc) {
642 		sfc_warn(sa, "promiscuous mode has been requested, "
643 			     "but the HW rejects it");
644 		sfc_warn(sa, "promiscuous mode will be disabled");
645 
646 		port->promisc = B_FALSE;
647 		rc = sfc_set_rx_mode(sa);
648 		if (rc != 0)
649 			return rc;
650 
651 		goto retry;
652 	}
653 
654 	if (port->allmulti) {
655 		sfc_warn(sa, "all-multicast mode has been requested, "
656 			     "but the HW rejects it");
657 		sfc_warn(sa, "all-multicast mode will be disabled");
658 
659 		port->allmulti = B_FALSE;
660 		rc = sfc_set_rx_mode(sa);
661 		if (rc != 0)
662 			return rc;
663 
664 		goto retry;
665 	}
666 
667 	return rc;
668 }
669 
670 int
671 sfc_rx_qstart(struct sfc_adapter *sa, unsigned int sw_index)
672 {
673 	struct sfc_rxq_info *rxq_info;
674 	struct sfc_rxq *rxq;
675 	struct sfc_evq *evq;
676 	int rc;
677 
678 	sfc_log_init(sa, "sw_index=%u", sw_index);
679 
680 	SFC_ASSERT(sw_index < sfc_sa2shared(sa)->rxq_count);
681 
682 	rxq_info = &sfc_sa2shared(sa)->rxq_info[sw_index];
683 	SFC_ASSERT(rxq_info->state == SFC_RXQ_INITIALIZED);
684 
685 	rxq = &sa->rxq_ctrl[sw_index];
686 	evq = rxq->evq;
687 
688 	rc = sfc_ev_qstart(evq, sfc_evq_index_by_rxq_sw_index(sa, sw_index));
689 	if (rc != 0)
690 		goto fail_ev_qstart;
691 
692 	switch (rxq_info->type) {
693 	case EFX_RXQ_TYPE_DEFAULT:
694 		rc = efx_rx_qcreate(sa->nic, rxq->hw_index, 0, rxq_info->type,
695 			rxq->buf_size,
696 			&rxq->mem, rxq_info->entries, 0 /* not used on EF10 */,
697 			rxq_info->type_flags, evq->common, &rxq->common);
698 		break;
699 	case EFX_RXQ_TYPE_ES_SUPER_BUFFER: {
700 		struct rte_mempool *mp = rxq_info->refill_mb_pool;
701 		struct rte_mempool_info mp_info;
702 
703 		rc = rte_mempool_ops_get_info(mp, &mp_info);
704 		if (rc != 0) {
705 			/* Positive errno is used in the driver */
706 			rc = -rc;
707 			goto fail_mp_get_info;
708 		}
709 		if (mp_info.contig_block_size <= 0) {
710 			rc = EINVAL;
711 			goto fail_bad_contig_block_size;
712 		}
713 		rc = efx_rx_qcreate_es_super_buffer(sa->nic, rxq->hw_index, 0,
714 			mp_info.contig_block_size, rxq->buf_size,
715 			mp->header_size + mp->elt_size + mp->trailer_size,
716 			sa->rxd_wait_timeout_ns,
717 			&rxq->mem, rxq_info->entries, rxq_info->type_flags,
718 			evq->common, &rxq->common);
719 		break;
720 	}
721 	default:
722 		rc = ENOTSUP;
723 	}
724 	if (rc != 0)
725 		goto fail_rx_qcreate;
726 
727 	efx_rx_qenable(rxq->common);
728 
729 	rc = sa->priv.dp_rx->qstart(rxq_info->dp, evq->read_ptr);
730 	if (rc != 0)
731 		goto fail_dp_qstart;
732 
733 	rxq_info->state |= SFC_RXQ_STARTED;
734 
735 	if (sw_index == 0 && !sfc_sa2shared(sa)->isolated) {
736 		rc = sfc_rx_default_rxq_set_filter(sa, rxq);
737 		if (rc != 0)
738 			goto fail_mac_filter_default_rxq_set;
739 	}
740 
741 	/* It seems to be used by DPDK for debug purposes only ('rte_ether') */
742 	sa->eth_dev->data->rx_queue_state[sw_index] =
743 		RTE_ETH_QUEUE_STATE_STARTED;
744 
745 	return 0;
746 
747 fail_mac_filter_default_rxq_set:
748 	sa->priv.dp_rx->qstop(rxq_info->dp, &rxq->evq->read_ptr);
749 
750 fail_dp_qstart:
751 	sfc_rx_qflush(sa, sw_index);
752 
753 fail_rx_qcreate:
754 fail_bad_contig_block_size:
755 fail_mp_get_info:
756 	sfc_ev_qstop(evq);
757 
758 fail_ev_qstart:
759 	return rc;
760 }
761 
762 void
763 sfc_rx_qstop(struct sfc_adapter *sa, unsigned int sw_index)
764 {
765 	struct sfc_rxq_info *rxq_info;
766 	struct sfc_rxq *rxq;
767 
768 	sfc_log_init(sa, "sw_index=%u", sw_index);
769 
770 	SFC_ASSERT(sw_index < sfc_sa2shared(sa)->rxq_count);
771 
772 	rxq_info = &sfc_sa2shared(sa)->rxq_info[sw_index];
773 
774 	if (rxq_info->state == SFC_RXQ_INITIALIZED)
775 		return;
776 	SFC_ASSERT(rxq_info->state & SFC_RXQ_STARTED);
777 
778 	/* It seems to be used by DPDK for debug purposes only ('rte_ether') */
779 	sa->eth_dev->data->rx_queue_state[sw_index] =
780 		RTE_ETH_QUEUE_STATE_STOPPED;
781 
782 	rxq = &sa->rxq_ctrl[sw_index];
783 	sa->priv.dp_rx->qstop(rxq_info->dp, &rxq->evq->read_ptr);
784 
785 	if (sw_index == 0)
786 		efx_mac_filter_default_rxq_clear(sa->nic);
787 
788 	sfc_rx_qflush(sa, sw_index);
789 
790 	rxq_info->state = SFC_RXQ_INITIALIZED;
791 
792 	efx_rx_qdestroy(rxq->common);
793 
794 	sfc_ev_qstop(rxq->evq);
795 }
796 
797 uint64_t
798 sfc_rx_get_dev_offload_caps(struct sfc_adapter *sa)
799 {
800 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sa->nic);
801 	uint64_t caps = 0;
802 
803 	caps |= DEV_RX_OFFLOAD_JUMBO_FRAME;
804 
805 	if (sa->priv.dp_rx->features & SFC_DP_RX_FEAT_CHECKSUM) {
806 		caps |= DEV_RX_OFFLOAD_IPV4_CKSUM;
807 		caps |= DEV_RX_OFFLOAD_UDP_CKSUM;
808 		caps |= DEV_RX_OFFLOAD_TCP_CKSUM;
809 	}
810 
811 	if (encp->enc_tunnel_encapsulations_supported &&
812 	    (sa->priv.dp_rx->features & SFC_DP_RX_FEAT_TUNNELS))
813 		caps |= DEV_RX_OFFLOAD_OUTER_IPV4_CKSUM;
814 
815 	return caps;
816 }
817 
818 uint64_t
819 sfc_rx_get_queue_offload_caps(struct sfc_adapter *sa)
820 {
821 	uint64_t caps = 0;
822 
823 	if (sa->priv.dp_rx->features & SFC_DP_RX_FEAT_SCATTER)
824 		caps |= DEV_RX_OFFLOAD_SCATTER;
825 
826 	return caps;
827 }
828 
829 static int
830 sfc_rx_qcheck_conf(struct sfc_adapter *sa, unsigned int rxq_max_fill_level,
831 		   const struct rte_eth_rxconf *rx_conf,
832 		   __rte_unused uint64_t offloads)
833 {
834 	int rc = 0;
835 
836 	if (rx_conf->rx_thresh.pthresh != 0 ||
837 	    rx_conf->rx_thresh.hthresh != 0 ||
838 	    rx_conf->rx_thresh.wthresh != 0) {
839 		sfc_warn(sa,
840 			"RxQ prefetch/host/writeback thresholds are not supported");
841 	}
842 
843 	if (rx_conf->rx_free_thresh > rxq_max_fill_level) {
844 		sfc_err(sa,
845 			"RxQ free threshold too large: %u vs maximum %u",
846 			rx_conf->rx_free_thresh, rxq_max_fill_level);
847 		rc = EINVAL;
848 	}
849 
850 	if (rx_conf->rx_drop_en == 0) {
851 		sfc_err(sa, "RxQ drop disable is not supported");
852 		rc = EINVAL;
853 	}
854 
855 	return rc;
856 }
857 
858 static unsigned int
859 sfc_rx_mbuf_data_alignment(struct rte_mempool *mb_pool)
860 {
861 	uint32_t data_off;
862 	uint32_t order;
863 
864 	/* The mbuf object itself is always cache line aligned */
865 	order = rte_bsf32(RTE_CACHE_LINE_SIZE);
866 
867 	/* Data offset from mbuf object start */
868 	data_off = sizeof(struct rte_mbuf) + rte_pktmbuf_priv_size(mb_pool) +
869 		RTE_PKTMBUF_HEADROOM;
870 
871 	order = MIN(order, rte_bsf32(data_off));
872 
873 	return 1u << order;
874 }
875 
876 static uint16_t
877 sfc_rx_mb_pool_buf_size(struct sfc_adapter *sa, struct rte_mempool *mb_pool)
878 {
879 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sa->nic);
880 	const uint32_t nic_align_start = MAX(1, encp->enc_rx_buf_align_start);
881 	const uint32_t nic_align_end = MAX(1, encp->enc_rx_buf_align_end);
882 	uint16_t buf_size;
883 	unsigned int buf_aligned;
884 	unsigned int start_alignment;
885 	unsigned int end_padding_alignment;
886 
887 	/* Below it is assumed that both alignments are power of 2 */
888 	SFC_ASSERT(rte_is_power_of_2(nic_align_start));
889 	SFC_ASSERT(rte_is_power_of_2(nic_align_end));
890 
891 	/*
892 	 * mbuf is always cache line aligned, double-check
893 	 * that it meets rx buffer start alignment requirements.
894 	 */
895 
896 	/* Start from mbuf pool data room size */
897 	buf_size = rte_pktmbuf_data_room_size(mb_pool);
898 
899 	/* Remove headroom */
900 	if (buf_size <= RTE_PKTMBUF_HEADROOM) {
901 		sfc_err(sa,
902 			"RxQ mbuf pool %s object data room size %u is smaller than headroom %u",
903 			mb_pool->name, buf_size, RTE_PKTMBUF_HEADROOM);
904 		return 0;
905 	}
906 	buf_size -= RTE_PKTMBUF_HEADROOM;
907 
908 	/* Calculate guaranteed data start alignment */
909 	buf_aligned = sfc_rx_mbuf_data_alignment(mb_pool);
910 
911 	/* Reserve space for start alignment */
912 	if (buf_aligned < nic_align_start) {
913 		start_alignment = nic_align_start - buf_aligned;
914 		if (buf_size <= start_alignment) {
915 			sfc_err(sa,
916 				"RxQ mbuf pool %s object data room size %u is insufficient for headroom %u and buffer start alignment %u required by NIC",
917 				mb_pool->name,
918 				rte_pktmbuf_data_room_size(mb_pool),
919 				RTE_PKTMBUF_HEADROOM, start_alignment);
920 			return 0;
921 		}
922 		buf_aligned = nic_align_start;
923 		buf_size -= start_alignment;
924 	} else {
925 		start_alignment = 0;
926 	}
927 
928 	/* Make sure that end padding does not write beyond the buffer */
929 	if (buf_aligned < nic_align_end) {
930 		/*
931 		 * Estimate space which can be lost. If guarnteed buffer
932 		 * size is odd, lost space is (nic_align_end - 1). More
933 		 * accurate formula is below.
934 		 */
935 		end_padding_alignment = nic_align_end -
936 			MIN(buf_aligned, 1u << (rte_bsf32(buf_size) - 1));
937 		if (buf_size <= end_padding_alignment) {
938 			sfc_err(sa,
939 				"RxQ mbuf pool %s object data room size %u is insufficient for headroom %u, buffer start alignment %u and end padding alignment %u required by NIC",
940 				mb_pool->name,
941 				rte_pktmbuf_data_room_size(mb_pool),
942 				RTE_PKTMBUF_HEADROOM, start_alignment,
943 				end_padding_alignment);
944 			return 0;
945 		}
946 		buf_size -= end_padding_alignment;
947 	} else {
948 		/*
949 		 * Start is aligned the same or better than end,
950 		 * just align length.
951 		 */
952 		buf_size = P2ALIGN(buf_size, nic_align_end);
953 	}
954 
955 	return buf_size;
956 }
957 
958 int
959 sfc_rx_qinit(struct sfc_adapter *sa, unsigned int sw_index,
960 	     uint16_t nb_rx_desc, unsigned int socket_id,
961 	     const struct rte_eth_rxconf *rx_conf,
962 	     struct rte_mempool *mb_pool)
963 {
964 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sa->nic);
965 	struct sfc_rss *rss = &sfc_sa2shared(sa)->rss;
966 	int rc;
967 	unsigned int rxq_entries;
968 	unsigned int evq_entries;
969 	unsigned int rxq_max_fill_level;
970 	uint64_t offloads;
971 	uint16_t buf_size;
972 	struct sfc_rxq_info *rxq_info;
973 	struct sfc_evq *evq;
974 	struct sfc_rxq *rxq;
975 	struct sfc_dp_rx_qcreate_info info;
976 	struct sfc_dp_rx_hw_limits hw_limits;
977 	uint16_t rx_free_thresh;
978 
979 	memset(&hw_limits, 0, sizeof(hw_limits));
980 	hw_limits.rxq_max_entries = sa->rxq_max_entries;
981 	hw_limits.rxq_min_entries = sa->rxq_min_entries;
982 	hw_limits.evq_max_entries = sa->evq_max_entries;
983 	hw_limits.evq_min_entries = sa->evq_min_entries;
984 
985 	rc = sa->priv.dp_rx->qsize_up_rings(nb_rx_desc, &hw_limits, mb_pool,
986 					    &rxq_entries, &evq_entries,
987 					    &rxq_max_fill_level);
988 	if (rc != 0)
989 		goto fail_size_up_rings;
990 	SFC_ASSERT(rxq_entries >= sa->rxq_min_entries);
991 	SFC_ASSERT(rxq_entries <= sa->rxq_max_entries);
992 	SFC_ASSERT(rxq_max_fill_level <= nb_rx_desc);
993 
994 	offloads = rx_conf->offloads |
995 		sa->eth_dev->data->dev_conf.rxmode.offloads;
996 	rc = sfc_rx_qcheck_conf(sa, rxq_max_fill_level, rx_conf, offloads);
997 	if (rc != 0)
998 		goto fail_bad_conf;
999 
1000 	buf_size = sfc_rx_mb_pool_buf_size(sa, mb_pool);
1001 	if (buf_size == 0) {
1002 		sfc_err(sa, "RxQ %u mbuf pool object size is too small",
1003 			sw_index);
1004 		rc = EINVAL;
1005 		goto fail_bad_conf;
1006 	}
1007 
1008 	if ((buf_size < sa->port.pdu + encp->enc_rx_prefix_size) &&
1009 	    (~offloads & DEV_RX_OFFLOAD_SCATTER)) {
1010 		sfc_err(sa, "Rx scatter is disabled and RxQ %u mbuf pool "
1011 			"object size is too small", sw_index);
1012 		sfc_err(sa, "RxQ %u calculated Rx buffer size is %u vs "
1013 			"PDU size %u plus Rx prefix %u bytes",
1014 			sw_index, buf_size, (unsigned int)sa->port.pdu,
1015 			encp->enc_rx_prefix_size);
1016 		rc = EINVAL;
1017 		goto fail_bad_conf;
1018 	}
1019 
1020 	SFC_ASSERT(sw_index < sfc_sa2shared(sa)->rxq_count);
1021 	rxq_info = &sfc_sa2shared(sa)->rxq_info[sw_index];
1022 
1023 	SFC_ASSERT(rxq_entries <= rxq_info->max_entries);
1024 	rxq_info->entries = rxq_entries;
1025 
1026 	if (sa->priv.dp_rx->dp.hw_fw_caps & SFC_DP_HW_FW_CAP_RX_ES_SUPER_BUFFER)
1027 		rxq_info->type = EFX_RXQ_TYPE_ES_SUPER_BUFFER;
1028 	else
1029 		rxq_info->type = EFX_RXQ_TYPE_DEFAULT;
1030 
1031 	rxq_info->type_flags =
1032 		(offloads & DEV_RX_OFFLOAD_SCATTER) ?
1033 		EFX_RXQ_FLAG_SCATTER : EFX_RXQ_FLAG_NONE;
1034 
1035 	if ((encp->enc_tunnel_encapsulations_supported != 0) &&
1036 	    (sa->priv.dp_rx->features & SFC_DP_RX_FEAT_TUNNELS))
1037 		rxq_info->type_flags |= EFX_RXQ_FLAG_INNER_CLASSES;
1038 
1039 	rc = sfc_ev_qinit(sa, SFC_EVQ_TYPE_RX, sw_index,
1040 			  evq_entries, socket_id, &evq);
1041 	if (rc != 0)
1042 		goto fail_ev_qinit;
1043 
1044 	rxq = &sa->rxq_ctrl[sw_index];
1045 	rxq->evq = evq;
1046 	rxq->hw_index = sw_index;
1047 	/*
1048 	 * If Rx refill threshold is specified (its value is non zero) in
1049 	 * Rx configuration, use specified value. Otherwise use 1/8 of
1050 	 * the Rx descriptors number as the default. It allows to keep
1051 	 * Rx ring full-enough and does not refill too aggressive if
1052 	 * packet rate is high.
1053 	 *
1054 	 * Since PMD refills in bulks waiting for full bulk may be
1055 	 * refilled (basically round down), it is better to round up
1056 	 * here to mitigate it a bit.
1057 	 */
1058 	rx_free_thresh = (rx_conf->rx_free_thresh != 0) ?
1059 		rx_conf->rx_free_thresh : EFX_DIV_ROUND_UP(nb_rx_desc, 8);
1060 	/* Rx refill threshold cannot be smaller than refill bulk */
1061 	rxq_info->refill_threshold =
1062 		RTE_MAX(rx_free_thresh, SFC_RX_REFILL_BULK);
1063 	rxq_info->refill_mb_pool = mb_pool;
1064 	rxq->buf_size = buf_size;
1065 
1066 	rc = sfc_dma_alloc(sa, "rxq", sw_index,
1067 			   efx_rxq_size(sa->nic, rxq_info->entries),
1068 			   socket_id, &rxq->mem);
1069 	if (rc != 0)
1070 		goto fail_dma_alloc;
1071 
1072 	memset(&info, 0, sizeof(info));
1073 	info.refill_mb_pool = rxq_info->refill_mb_pool;
1074 	info.max_fill_level = rxq_max_fill_level;
1075 	info.refill_threshold = rxq_info->refill_threshold;
1076 	info.buf_size = buf_size;
1077 	info.batch_max = encp->enc_rx_batch_max;
1078 	info.prefix_size = encp->enc_rx_prefix_size;
1079 
1080 	if (rss->hash_support == EFX_RX_HASH_AVAILABLE && rss->channels > 0)
1081 		info.flags |= SFC_RXQ_FLAG_RSS_HASH;
1082 
1083 	info.rxq_entries = rxq_info->entries;
1084 	info.rxq_hw_ring = rxq->mem.esm_base;
1085 	info.evq_entries = evq_entries;
1086 	info.evq_hw_ring = evq->mem.esm_base;
1087 	info.hw_index = rxq->hw_index;
1088 	info.mem_bar = sa->mem_bar.esb_base;
1089 	info.vi_window_shift = encp->enc_vi_window_shift;
1090 
1091 	rc = sa->priv.dp_rx->qcreate(sa->eth_dev->data->port_id, sw_index,
1092 				     &RTE_ETH_DEV_TO_PCI(sa->eth_dev)->addr,
1093 				     socket_id, &info, &rxq_info->dp);
1094 	if (rc != 0)
1095 		goto fail_dp_rx_qcreate;
1096 
1097 	evq->dp_rxq = rxq_info->dp;
1098 
1099 	rxq_info->state = SFC_RXQ_INITIALIZED;
1100 
1101 	rxq_info->deferred_start = (rx_conf->rx_deferred_start != 0);
1102 
1103 	return 0;
1104 
1105 fail_dp_rx_qcreate:
1106 	sfc_dma_free(sa, &rxq->mem);
1107 
1108 fail_dma_alloc:
1109 	sfc_ev_qfini(evq);
1110 
1111 fail_ev_qinit:
1112 	rxq_info->entries = 0;
1113 
1114 fail_bad_conf:
1115 fail_size_up_rings:
1116 	sfc_log_init(sa, "failed %d", rc);
1117 	return rc;
1118 }
1119 
1120 void
1121 sfc_rx_qfini(struct sfc_adapter *sa, unsigned int sw_index)
1122 {
1123 	struct sfc_rxq_info *rxq_info;
1124 	struct sfc_rxq *rxq;
1125 
1126 	SFC_ASSERT(sw_index < sfc_sa2shared(sa)->rxq_count);
1127 	sa->eth_dev->data->rx_queues[sw_index] = NULL;
1128 
1129 	rxq_info = &sfc_sa2shared(sa)->rxq_info[sw_index];
1130 
1131 	SFC_ASSERT(rxq_info->state == SFC_RXQ_INITIALIZED);
1132 
1133 	sa->priv.dp_rx->qdestroy(rxq_info->dp);
1134 	rxq_info->dp = NULL;
1135 
1136 	rxq_info->state &= ~SFC_RXQ_INITIALIZED;
1137 	rxq_info->entries = 0;
1138 
1139 	rxq = &sa->rxq_ctrl[sw_index];
1140 
1141 	sfc_dma_free(sa, &rxq->mem);
1142 
1143 	sfc_ev_qfini(rxq->evq);
1144 	rxq->evq = NULL;
1145 }
1146 
1147 /*
1148  * Mapping between RTE RSS hash functions and their EFX counterparts.
1149  */
1150 static const struct sfc_rss_hf_rte_to_efx sfc_rss_hf_map[] = {
1151 	{ ETH_RSS_NONFRAG_IPV4_TCP,
1152 	  EFX_RX_HASH(IPV4_TCP, 4TUPLE) },
1153 	{ ETH_RSS_NONFRAG_IPV4_UDP,
1154 	  EFX_RX_HASH(IPV4_UDP, 4TUPLE) },
1155 	{ ETH_RSS_NONFRAG_IPV6_TCP | ETH_RSS_IPV6_TCP_EX,
1156 	  EFX_RX_HASH(IPV6_TCP, 4TUPLE) },
1157 	{ ETH_RSS_NONFRAG_IPV6_UDP | ETH_RSS_IPV6_UDP_EX,
1158 	  EFX_RX_HASH(IPV6_UDP, 4TUPLE) },
1159 	{ ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | ETH_RSS_NONFRAG_IPV4_OTHER,
1160 	  EFX_RX_HASH(IPV4_TCP, 2TUPLE) | EFX_RX_HASH(IPV4_UDP, 2TUPLE) |
1161 	  EFX_RX_HASH(IPV4, 2TUPLE) },
1162 	{ ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 | ETH_RSS_NONFRAG_IPV6_OTHER |
1163 	  ETH_RSS_IPV6_EX,
1164 	  EFX_RX_HASH(IPV6_TCP, 2TUPLE) | EFX_RX_HASH(IPV6_UDP, 2TUPLE) |
1165 	  EFX_RX_HASH(IPV6, 2TUPLE) }
1166 };
1167 
1168 static efx_rx_hash_type_t
1169 sfc_rx_hash_types_mask_supp(efx_rx_hash_type_t hash_type,
1170 			    unsigned int *hash_type_flags_supported,
1171 			    unsigned int nb_hash_type_flags_supported)
1172 {
1173 	efx_rx_hash_type_t hash_type_masked = 0;
1174 	unsigned int i, j;
1175 
1176 	for (i = 0; i < nb_hash_type_flags_supported; ++i) {
1177 		unsigned int class_tuple_lbn[] = {
1178 			EFX_RX_CLASS_IPV4_TCP_LBN,
1179 			EFX_RX_CLASS_IPV4_UDP_LBN,
1180 			EFX_RX_CLASS_IPV4_LBN,
1181 			EFX_RX_CLASS_IPV6_TCP_LBN,
1182 			EFX_RX_CLASS_IPV6_UDP_LBN,
1183 			EFX_RX_CLASS_IPV6_LBN
1184 		};
1185 
1186 		for (j = 0; j < RTE_DIM(class_tuple_lbn); ++j) {
1187 			unsigned int tuple_mask = EFX_RX_CLASS_HASH_4TUPLE;
1188 			unsigned int flag;
1189 
1190 			tuple_mask <<= class_tuple_lbn[j];
1191 			flag = hash_type & tuple_mask;
1192 
1193 			if (flag == hash_type_flags_supported[i])
1194 				hash_type_masked |= flag;
1195 		}
1196 	}
1197 
1198 	return hash_type_masked;
1199 }
1200 
1201 int
1202 sfc_rx_hash_init(struct sfc_adapter *sa)
1203 {
1204 	struct sfc_rss *rss = &sfc_sa2shared(sa)->rss;
1205 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sa->nic);
1206 	uint32_t alg_mask = encp->enc_rx_scale_hash_alg_mask;
1207 	efx_rx_hash_alg_t alg;
1208 	unsigned int flags_supp[EFX_RX_HASH_NFLAGS];
1209 	unsigned int nb_flags_supp;
1210 	struct sfc_rss_hf_rte_to_efx *hf_map;
1211 	struct sfc_rss_hf_rte_to_efx *entry;
1212 	efx_rx_hash_type_t efx_hash_types;
1213 	unsigned int i;
1214 	int rc;
1215 
1216 	if (alg_mask & (1U << EFX_RX_HASHALG_TOEPLITZ))
1217 		alg = EFX_RX_HASHALG_TOEPLITZ;
1218 	else if (alg_mask & (1U << EFX_RX_HASHALG_PACKED_STREAM))
1219 		alg = EFX_RX_HASHALG_PACKED_STREAM;
1220 	else
1221 		return EINVAL;
1222 
1223 	rc = efx_rx_scale_hash_flags_get(sa->nic, alg, flags_supp,
1224 					 RTE_DIM(flags_supp), &nb_flags_supp);
1225 	if (rc != 0)
1226 		return rc;
1227 
1228 	hf_map = rte_calloc_socket("sfc-rss-hf-map",
1229 				   RTE_DIM(sfc_rss_hf_map),
1230 				   sizeof(*hf_map), 0, sa->socket_id);
1231 	if (hf_map == NULL)
1232 		return ENOMEM;
1233 
1234 	entry = hf_map;
1235 	efx_hash_types = 0;
1236 	for (i = 0; i < RTE_DIM(sfc_rss_hf_map); ++i) {
1237 		efx_rx_hash_type_t ht;
1238 
1239 		ht = sfc_rx_hash_types_mask_supp(sfc_rss_hf_map[i].efx,
1240 						 flags_supp, nb_flags_supp);
1241 		if (ht != 0) {
1242 			entry->rte = sfc_rss_hf_map[i].rte;
1243 			entry->efx = ht;
1244 			efx_hash_types |= ht;
1245 			++entry;
1246 		}
1247 	}
1248 
1249 	rss->hash_alg = alg;
1250 	rss->hf_map_nb_entries = (unsigned int)(entry - hf_map);
1251 	rss->hf_map = hf_map;
1252 	rss->hash_types = efx_hash_types;
1253 
1254 	return 0;
1255 }
1256 
1257 void
1258 sfc_rx_hash_fini(struct sfc_adapter *sa)
1259 {
1260 	struct sfc_rss *rss = &sfc_sa2shared(sa)->rss;
1261 
1262 	rte_free(rss->hf_map);
1263 }
1264 
1265 int
1266 sfc_rx_hf_rte_to_efx(struct sfc_adapter *sa, uint64_t rte,
1267 		     efx_rx_hash_type_t *efx)
1268 {
1269 	struct sfc_rss *rss = &sfc_sa2shared(sa)->rss;
1270 	efx_rx_hash_type_t hash_types = 0;
1271 	unsigned int i;
1272 
1273 	for (i = 0; i < rss->hf_map_nb_entries; ++i) {
1274 		uint64_t rte_mask = rss->hf_map[i].rte;
1275 
1276 		if ((rte & rte_mask) != 0) {
1277 			rte &= ~rte_mask;
1278 			hash_types |= rss->hf_map[i].efx;
1279 		}
1280 	}
1281 
1282 	if (rte != 0) {
1283 		sfc_err(sa, "unsupported hash functions requested");
1284 		return EINVAL;
1285 	}
1286 
1287 	*efx = hash_types;
1288 
1289 	return 0;
1290 }
1291 
1292 uint64_t
1293 sfc_rx_hf_efx_to_rte(struct sfc_rss *rss, efx_rx_hash_type_t efx)
1294 {
1295 	uint64_t rte = 0;
1296 	unsigned int i;
1297 
1298 	for (i = 0; i < rss->hf_map_nb_entries; ++i) {
1299 		efx_rx_hash_type_t hash_type = rss->hf_map[i].efx;
1300 
1301 		if ((efx & hash_type) == hash_type)
1302 			rte |= rss->hf_map[i].rte;
1303 	}
1304 
1305 	return rte;
1306 }
1307 
1308 static int
1309 sfc_rx_process_adv_conf_rss(struct sfc_adapter *sa,
1310 			    struct rte_eth_rss_conf *conf)
1311 {
1312 	struct sfc_rss *rss = &sfc_sa2shared(sa)->rss;
1313 	efx_rx_hash_type_t efx_hash_types = rss->hash_types;
1314 	uint64_t rss_hf = sfc_rx_hf_efx_to_rte(rss, efx_hash_types);
1315 	int rc;
1316 
1317 	if (rss->context_type != EFX_RX_SCALE_EXCLUSIVE) {
1318 		if ((conf->rss_hf != 0 && conf->rss_hf != rss_hf) ||
1319 		    conf->rss_key != NULL)
1320 			return EINVAL;
1321 	}
1322 
1323 	if (conf->rss_hf != 0) {
1324 		rc = sfc_rx_hf_rte_to_efx(sa, conf->rss_hf, &efx_hash_types);
1325 		if (rc != 0)
1326 			return rc;
1327 	}
1328 
1329 	if (conf->rss_key != NULL) {
1330 		if (conf->rss_key_len != sizeof(rss->key)) {
1331 			sfc_err(sa, "RSS key size is wrong (should be %lu)",
1332 				sizeof(rss->key));
1333 			return EINVAL;
1334 		}
1335 		rte_memcpy(rss->key, conf->rss_key, sizeof(rss->key));
1336 	}
1337 
1338 	rss->hash_types = efx_hash_types;
1339 
1340 	return 0;
1341 }
1342 
1343 static int
1344 sfc_rx_rss_config(struct sfc_adapter *sa)
1345 {
1346 	struct sfc_rss *rss = &sfc_sa2shared(sa)->rss;
1347 	int rc = 0;
1348 
1349 	if (rss->channels > 0) {
1350 		rc = efx_rx_scale_mode_set(sa->nic, EFX_RSS_CONTEXT_DEFAULT,
1351 					   rss->hash_alg, rss->hash_types,
1352 					   B_TRUE);
1353 		if (rc != 0)
1354 			goto finish;
1355 
1356 		rc = efx_rx_scale_key_set(sa->nic, EFX_RSS_CONTEXT_DEFAULT,
1357 					  rss->key, sizeof(rss->key));
1358 		if (rc != 0)
1359 			goto finish;
1360 
1361 		rc = efx_rx_scale_tbl_set(sa->nic, EFX_RSS_CONTEXT_DEFAULT,
1362 					  rss->tbl, RTE_DIM(rss->tbl));
1363 	}
1364 
1365 finish:
1366 	return rc;
1367 }
1368 
1369 int
1370 sfc_rx_start(struct sfc_adapter *sa)
1371 {
1372 	struct sfc_adapter_shared * const sas = sfc_sa2shared(sa);
1373 	unsigned int sw_index;
1374 	int rc;
1375 
1376 	sfc_log_init(sa, "rxq_count=%u", sas->rxq_count);
1377 
1378 	rc = efx_rx_init(sa->nic);
1379 	if (rc != 0)
1380 		goto fail_rx_init;
1381 
1382 	rc = sfc_rx_rss_config(sa);
1383 	if (rc != 0)
1384 		goto fail_rss_config;
1385 
1386 	for (sw_index = 0; sw_index < sas->rxq_count; ++sw_index) {
1387 		if (sas->rxq_info[sw_index].state == SFC_RXQ_INITIALIZED &&
1388 		    (!sas->rxq_info[sw_index].deferred_start ||
1389 		     sas->rxq_info[sw_index].deferred_started)) {
1390 			rc = sfc_rx_qstart(sa, sw_index);
1391 			if (rc != 0)
1392 				goto fail_rx_qstart;
1393 		}
1394 	}
1395 
1396 	return 0;
1397 
1398 fail_rx_qstart:
1399 	while (sw_index-- > 0)
1400 		sfc_rx_qstop(sa, sw_index);
1401 
1402 fail_rss_config:
1403 	efx_rx_fini(sa->nic);
1404 
1405 fail_rx_init:
1406 	sfc_log_init(sa, "failed %d", rc);
1407 	return rc;
1408 }
1409 
1410 void
1411 sfc_rx_stop(struct sfc_adapter *sa)
1412 {
1413 	struct sfc_adapter_shared * const sas = sfc_sa2shared(sa);
1414 	unsigned int sw_index;
1415 
1416 	sfc_log_init(sa, "rxq_count=%u", sas->rxq_count);
1417 
1418 	sw_index = sas->rxq_count;
1419 	while (sw_index-- > 0) {
1420 		if (sas->rxq_info[sw_index].state & SFC_RXQ_STARTED)
1421 			sfc_rx_qstop(sa, sw_index);
1422 	}
1423 
1424 	efx_rx_fini(sa->nic);
1425 }
1426 
1427 static int
1428 sfc_rx_qinit_info(struct sfc_adapter *sa, unsigned int sw_index)
1429 {
1430 	struct sfc_adapter_shared * const sas = sfc_sa2shared(sa);
1431 	struct sfc_rxq_info *rxq_info = &sas->rxq_info[sw_index];
1432 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sa->nic);
1433 	unsigned int max_entries;
1434 
1435 	max_entries = encp->enc_rxq_max_ndescs;
1436 	SFC_ASSERT(rte_is_power_of_2(max_entries));
1437 
1438 	rxq_info->max_entries = max_entries;
1439 
1440 	return 0;
1441 }
1442 
1443 static int
1444 sfc_rx_check_mode(struct sfc_adapter *sa, struct rte_eth_rxmode *rxmode)
1445 {
1446 	struct sfc_adapter_shared * const sas = sfc_sa2shared(sa);
1447 	uint64_t offloads_supported = sfc_rx_get_dev_offload_caps(sa) |
1448 				      sfc_rx_get_queue_offload_caps(sa);
1449 	struct sfc_rss *rss = &sas->rss;
1450 	int rc = 0;
1451 
1452 	switch (rxmode->mq_mode) {
1453 	case ETH_MQ_RX_NONE:
1454 		/* No special checks are required */
1455 		break;
1456 	case ETH_MQ_RX_RSS:
1457 		if (rss->context_type == EFX_RX_SCALE_UNAVAILABLE) {
1458 			sfc_err(sa, "RSS is not available");
1459 			rc = EINVAL;
1460 		}
1461 		break;
1462 	default:
1463 		sfc_err(sa, "Rx multi-queue mode %u not supported",
1464 			rxmode->mq_mode);
1465 		rc = EINVAL;
1466 	}
1467 
1468 	/*
1469 	 * Requested offloads are validated against supported by ethdev,
1470 	 * so unsupported offloads cannot be added as the result of
1471 	 * below check.
1472 	 */
1473 	if ((rxmode->offloads & DEV_RX_OFFLOAD_CHECKSUM) !=
1474 	    (offloads_supported & DEV_RX_OFFLOAD_CHECKSUM)) {
1475 		sfc_warn(sa, "Rx checksum offloads cannot be disabled - always on (IPv4/TCP/UDP)");
1476 		rxmode->offloads |= DEV_RX_OFFLOAD_CHECKSUM;
1477 	}
1478 
1479 	if ((offloads_supported & DEV_RX_OFFLOAD_OUTER_IPV4_CKSUM) &&
1480 	    (~rxmode->offloads & DEV_RX_OFFLOAD_OUTER_IPV4_CKSUM)) {
1481 		sfc_warn(sa, "Rx outer IPv4 checksum offload cannot be disabled - always on");
1482 		rxmode->offloads |= DEV_RX_OFFLOAD_OUTER_IPV4_CKSUM;
1483 	}
1484 
1485 	return rc;
1486 }
1487 
1488 /**
1489  * Destroy excess queues that are no longer needed after reconfiguration
1490  * or complete close.
1491  */
1492 static void
1493 sfc_rx_fini_queues(struct sfc_adapter *sa, unsigned int nb_rx_queues)
1494 {
1495 	struct sfc_adapter_shared * const sas = sfc_sa2shared(sa);
1496 	int sw_index;
1497 
1498 	SFC_ASSERT(nb_rx_queues <= sas->rxq_count);
1499 
1500 	sw_index = sas->rxq_count;
1501 	while (--sw_index >= (int)nb_rx_queues) {
1502 		if (sas->rxq_info[sw_index].state & SFC_RXQ_INITIALIZED)
1503 			sfc_rx_qfini(sa, sw_index);
1504 	}
1505 
1506 	sas->rxq_count = nb_rx_queues;
1507 }
1508 
1509 /**
1510  * Initialize Rx subsystem.
1511  *
1512  * Called at device (re)configuration stage when number of receive queues is
1513  * specified together with other device level receive configuration.
1514  *
1515  * It should be used to allocate NUMA-unaware resources.
1516  */
1517 int
1518 sfc_rx_configure(struct sfc_adapter *sa)
1519 {
1520 	struct sfc_adapter_shared * const sas = sfc_sa2shared(sa);
1521 	struct sfc_rss *rss = &sas->rss;
1522 	struct rte_eth_conf *dev_conf = &sa->eth_dev->data->dev_conf;
1523 	const unsigned int nb_rx_queues = sa->eth_dev->data->nb_rx_queues;
1524 	int rc;
1525 
1526 	sfc_log_init(sa, "nb_rx_queues=%u (old %u)",
1527 		     nb_rx_queues, sas->rxq_count);
1528 
1529 	rc = sfc_rx_check_mode(sa, &dev_conf->rxmode);
1530 	if (rc != 0)
1531 		goto fail_check_mode;
1532 
1533 	if (nb_rx_queues == sas->rxq_count)
1534 		goto configure_rss;
1535 
1536 	if (sas->rxq_info == NULL) {
1537 		rc = ENOMEM;
1538 		sas->rxq_info = rte_calloc_socket("sfc-rxqs", nb_rx_queues,
1539 						  sizeof(sas->rxq_info[0]), 0,
1540 						  sa->socket_id);
1541 		if (sas->rxq_info == NULL)
1542 			goto fail_rxqs_alloc;
1543 
1544 		/*
1545 		 * Allocate primary process only RxQ control from heap
1546 		 * since it should not be shared.
1547 		 */
1548 		rc = ENOMEM;
1549 		sa->rxq_ctrl = calloc(nb_rx_queues, sizeof(sa->rxq_ctrl[0]));
1550 		if (sa->rxq_ctrl == NULL)
1551 			goto fail_rxqs_ctrl_alloc;
1552 	} else {
1553 		struct sfc_rxq_info *new_rxq_info;
1554 		struct sfc_rxq *new_rxq_ctrl;
1555 
1556 		if (nb_rx_queues < sas->rxq_count)
1557 			sfc_rx_fini_queues(sa, nb_rx_queues);
1558 
1559 		rc = ENOMEM;
1560 		new_rxq_info =
1561 			rte_realloc(sas->rxq_info,
1562 				    nb_rx_queues * sizeof(sas->rxq_info[0]), 0);
1563 		if (new_rxq_info == NULL && nb_rx_queues > 0)
1564 			goto fail_rxqs_realloc;
1565 
1566 		rc = ENOMEM;
1567 		new_rxq_ctrl = realloc(sa->rxq_ctrl,
1568 				       nb_rx_queues * sizeof(sa->rxq_ctrl[0]));
1569 		if (new_rxq_ctrl == NULL && nb_rx_queues > 0)
1570 			goto fail_rxqs_ctrl_realloc;
1571 
1572 		sas->rxq_info = new_rxq_info;
1573 		sa->rxq_ctrl = new_rxq_ctrl;
1574 		if (nb_rx_queues > sas->rxq_count) {
1575 			memset(&sas->rxq_info[sas->rxq_count], 0,
1576 			       (nb_rx_queues - sas->rxq_count) *
1577 			       sizeof(sas->rxq_info[0]));
1578 			memset(&sa->rxq_ctrl[sas->rxq_count], 0,
1579 			       (nb_rx_queues - sas->rxq_count) *
1580 			       sizeof(sa->rxq_ctrl[0]));
1581 		}
1582 	}
1583 
1584 	while (sas->rxq_count < nb_rx_queues) {
1585 		rc = sfc_rx_qinit_info(sa, sas->rxq_count);
1586 		if (rc != 0)
1587 			goto fail_rx_qinit_info;
1588 
1589 		sas->rxq_count++;
1590 	}
1591 
1592 configure_rss:
1593 	rss->channels = (dev_conf->rxmode.mq_mode == ETH_MQ_RX_RSS) ?
1594 			 MIN(sas->rxq_count, EFX_MAXRSS) : 0;
1595 
1596 	if (rss->channels > 0) {
1597 		struct rte_eth_rss_conf *adv_conf_rss;
1598 		unsigned int sw_index;
1599 
1600 		for (sw_index = 0; sw_index < EFX_RSS_TBL_SIZE; ++sw_index)
1601 			rss->tbl[sw_index] = sw_index % rss->channels;
1602 
1603 		adv_conf_rss = &dev_conf->rx_adv_conf.rss_conf;
1604 		rc = sfc_rx_process_adv_conf_rss(sa, adv_conf_rss);
1605 		if (rc != 0)
1606 			goto fail_rx_process_adv_conf_rss;
1607 	}
1608 
1609 	return 0;
1610 
1611 fail_rx_process_adv_conf_rss:
1612 fail_rx_qinit_info:
1613 fail_rxqs_ctrl_realloc:
1614 fail_rxqs_realloc:
1615 fail_rxqs_ctrl_alloc:
1616 fail_rxqs_alloc:
1617 	sfc_rx_close(sa);
1618 
1619 fail_check_mode:
1620 	sfc_log_init(sa, "failed %d", rc);
1621 	return rc;
1622 }
1623 
1624 /**
1625  * Shutdown Rx subsystem.
1626  *
1627  * Called at device close stage, for example, before device shutdown.
1628  */
1629 void
1630 sfc_rx_close(struct sfc_adapter *sa)
1631 {
1632 	struct sfc_rss *rss = &sfc_sa2shared(sa)->rss;
1633 
1634 	sfc_rx_fini_queues(sa, 0);
1635 
1636 	rss->channels = 0;
1637 
1638 	free(sa->rxq_ctrl);
1639 	sa->rxq_ctrl = NULL;
1640 
1641 	rte_free(sfc_sa2shared(sa)->rxq_info);
1642 	sfc_sa2shared(sa)->rxq_info = NULL;
1643 }
1644