xref: /dpdk/drivers/net/sfc/sfc_ef10_tx.c (revision f5057be340e44f3edc0fe90fa875eb89a4c49b4f)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  *
3  * Copyright(c) 2019-2020 Xilinx, Inc.
4  * Copyright(c) 2016-2019 Solarflare Communications Inc.
5  *
6  * This software was jointly developed between OKTET Labs (under contract
7  * for Solarflare) and Solarflare Communications, Inc.
8  */
9 
10 #include <stdbool.h>
11 
12 #include <rte_mbuf.h>
13 #include <rte_io.h>
14 #include <rte_ip.h>
15 #include <rte_tcp.h>
16 
17 #include "efx.h"
18 #include "efx_types.h"
19 #include "efx_regs.h"
20 #include "efx_regs_ef10.h"
21 
22 #include "sfc_debug.h"
23 #include "sfc_dp_tx.h"
24 #include "sfc_tweak.h"
25 #include "sfc_kvargs.h"
26 #include "sfc_ef10.h"
27 #include "sfc_tso.h"
28 
29 #define sfc_ef10_tx_err(dpq, ...) \
30 	SFC_DP_LOG(SFC_KVARG_DATAPATH_EF10, ERR, dpq, __VA_ARGS__)
31 
32 /** Maximum length of the DMA descriptor data */
33 #define SFC_EF10_TX_DMA_DESC_LEN_MAX \
34 	((1u << ESF_DZ_TX_KER_BYTE_CNT_WIDTH) - 1)
35 
36 /**
37  * Maximum number of descriptors/buffers in the Tx ring.
38  * It should guarantee that the corresponding event queue never overfills.
39  * The EF10 native datapath uses an event queue of the same size as the Tx
40  * queue. The maximum number of events on the datapath can be estimated as
41  * the number of Tx queue entries (one event per Tx buffer in the worst case)
42  * plus Tx error and flush events.
43  */
44 #define SFC_EF10_TXQ_LIMIT(_ndesc) \
45 	((_ndesc) - 1 /* head must not step on tail */ - \
46 	 (SFC_EF10_EV_PER_CACHE_LINE - 1) /* max unused EvQ entries */ - \
47 	 1 /* Tx error */ - 1 /* flush */)
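/*
 * Illustrative arithmetic (assuming SFC_EF10_EV_PER_CACHE_LINE is 8, i.e.
 * 64-byte cache lines holding 8-byte events): for a 1024-entry Tx queue the
 * limit is 1024 - 1 - 7 - 1 - 1 = 1014 descriptors.
 */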
48 
49 struct sfc_ef10_tx_sw_desc {
50 	struct rte_mbuf			*mbuf;
51 };
52 
53 struct sfc_ef10_txq {
54 	unsigned int			flags;
55 #define SFC_EF10_TXQ_STARTED		0x1
56 #define SFC_EF10_TXQ_NOT_RUNNING	0x2
57 #define SFC_EF10_TXQ_EXCEPTION		0x4
58 
59 	unsigned int			ptr_mask;
60 	unsigned int			added;
61 	unsigned int			completed;
62 	unsigned int			max_fill_level;
63 	unsigned int			free_thresh;
64 	unsigned int			evq_read_ptr;
65 	struct sfc_ef10_tx_sw_desc	*sw_ring;
66 	efx_qword_t			*txq_hw_ring;
67 	volatile void			*doorbell;
68 	efx_qword_t			*evq_hw_ring;
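	/*
	 * Per-descriptor buffers (DMA-mapped) used to linearize TSO headers
	 * that are fragmented across mbuf segments
	 */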
69 	uint8_t				*tsoh;
70 	rte_iova_t			tsoh_iova;
71 	uint16_t			tso_tcp_header_offset_limit;
72 
73 	/* Datapath transmit queue anchor */
74 	struct sfc_dp_txq		dp;
75 };
76 
77 static inline struct sfc_ef10_txq *
78 sfc_ef10_txq_by_dp_txq(struct sfc_dp_txq *dp_txq)
79 {
80 	return container_of(dp_txq, struct sfc_ef10_txq, dp);
81 }
82 
83 static bool
84 sfc_ef10_tx_get_event(struct sfc_ef10_txq *txq, efx_qword_t *tx_ev)
85 {
86 	volatile efx_qword_t *evq_hw_ring = txq->evq_hw_ring;
87 
88 	/*
89 	 * The exception flag is set during reap when an unexpected event
90 	 * is encountered. Reap is done at most once per packet burst get,
91 	 * and absence of the flag is checked on burst get entry.
92 	 */
93 	SFC_ASSERT((txq->flags & SFC_EF10_TXQ_EXCEPTION) == 0);
94 
95 	*tx_ev = evq_hw_ring[txq->evq_read_ptr & txq->ptr_mask];
96 
97 	if (!sfc_ef10_ev_present(*tx_ev))
98 		return false;
99 
100 	if (unlikely(EFX_QWORD_FIELD(*tx_ev, FSF_AZ_EV_CODE) !=
101 		     FSE_AZ_EV_CODE_TX_EV)) {
102 		/*
103 		 * Do not move read_ptr to keep the event for exception
104 		 * handling by the control path.
105 		 */
106 		txq->flags |= SFC_EF10_TXQ_EXCEPTION;
107 		sfc_ef10_tx_err(&txq->dp.dpq,
108 				"TxQ exception at EvQ read ptr %#x",
109 				txq->evq_read_ptr);
110 		return false;
111 	}
112 
113 	txq->evq_read_ptr++;
114 	return true;
115 }
116 
117 static unsigned int
118 sfc_ef10_tx_process_events(struct sfc_ef10_txq *txq)
119 {
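	/*
	 * Tx events carry the index of the latest completed descriptor.
	 * Compare it with the descriptor right before the first one that is
	 * not completed yet to count how many new completions have arrived.
	 */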
120 	const unsigned int curr_done = txq->completed - 1;
121 	unsigned int anew_done = curr_done;
122 	efx_qword_t tx_ev;
123 
124 	while (sfc_ef10_tx_get_event(txq, &tx_ev)) {
125 		/*
126 		 * DROP_EVENT is internal to the NIC; software should
127 		 * never see it and, therefore, may ignore it.
128 		 */
129 
130 		/* Update the latest done descriptor */
131 		anew_done = EFX_QWORD_FIELD(tx_ev, ESF_DZ_TX_DESCR_INDX);
132 	}
133 	return (anew_done - curr_done) & txq->ptr_mask;
134 }
135 
136 static void
137 sfc_ef10_tx_reap(struct sfc_ef10_txq *txq)
138 {
139 	const unsigned int old_read_ptr = txq->evq_read_ptr;
140 	const unsigned int ptr_mask = txq->ptr_mask;
141 	unsigned int completed = txq->completed;
142 	unsigned int pending = completed;
143 
144 	pending += sfc_ef10_tx_process_events(txq);
145 
146 	if (pending != completed) {
147 		struct rte_mbuf *bulk[SFC_TX_REAP_BULK_SIZE];
148 		unsigned int nb = 0;
149 
150 		do {
151 			struct sfc_ef10_tx_sw_desc *txd;
152 			struct rte_mbuf *m;
153 
154 			txd = &txq->sw_ring[completed & ptr_mask];
155 			if (txd->mbuf == NULL)
156 				continue;
157 
158 			m = rte_pktmbuf_prefree_seg(txd->mbuf);
159 			txd->mbuf = NULL;
160 			if (m == NULL)
161 				continue;
162 
163 			if ((nb == RTE_DIM(bulk)) ||
164 			    ((nb != 0) && (m->pool != bulk[0]->pool))) {
165 				rte_mempool_put_bulk(bulk[0]->pool,
166 						     (void *)bulk, nb);
167 				nb = 0;
168 			}
169 
170 			bulk[nb++] = m;
171 		} while (++completed != pending);
172 
173 		if (nb != 0)
174 			rte_mempool_put_bulk(bulk[0]->pool, (void *)bulk, nb);
175 
176 		txq->completed = completed;
177 	}
178 
179 	sfc_ef10_ev_qclear(txq->evq_hw_ring, ptr_mask, old_read_ptr,
180 			   txq->evq_read_ptr);
181 }
182 
183 static void
184 sfc_ef10_tx_qdesc_dma_create(rte_iova_t addr, uint16_t size, bool eop,
185 			     efx_qword_t *edp)
186 {
187 	EFX_POPULATE_QWORD_4(*edp,
188 			     ESF_DZ_TX_KER_TYPE, 0,
189 			     ESF_DZ_TX_KER_CONT, !eop,
190 			     ESF_DZ_TX_KER_BYTE_CNT, size,
191 			     ESF_DZ_TX_KER_BUF_ADDR, addr);
192 }
193 
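/*
 * Compose the pair of FATSOv2 option descriptors (2A carries the IP ID and
 * TCP sequence number, 2B carries the TCP MSS and outer IP ID) that start
 * a TSO transaction.
 */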
194 static void
195 sfc_ef10_tx_qdesc_tso2_create(struct sfc_ef10_txq * const txq,
196 			      unsigned int added, uint16_t ipv4_id,
197 			      uint16_t outer_ipv4_id, uint32_t tcp_seq,
198 			      uint16_t tcp_mss)
199 {
200 	EFX_POPULATE_QWORD_5(txq->txq_hw_ring[added & txq->ptr_mask],
201 			    ESF_DZ_TX_DESC_IS_OPT, 1,
202 			    ESF_DZ_TX_OPTION_TYPE,
203 			    ESE_DZ_TX_OPTION_DESC_TSO,
204 			    ESF_DZ_TX_TSO_OPTION_TYPE,
205 			    ESE_DZ_TX_TSO_OPTION_DESC_FATSO2A,
206 			    ESF_DZ_TX_TSO_IP_ID, ipv4_id,
207 			    ESF_DZ_TX_TSO_TCP_SEQNO, tcp_seq);
208 	EFX_POPULATE_QWORD_5(txq->txq_hw_ring[(added + 1) & txq->ptr_mask],
209 			    ESF_DZ_TX_DESC_IS_OPT, 1,
210 			    ESF_DZ_TX_OPTION_TYPE,
211 			    ESE_DZ_TX_OPTION_DESC_TSO,
212 			    ESF_DZ_TX_TSO_OPTION_TYPE,
213 			    ESE_DZ_TX_TSO_OPTION_DESC_FATSO2B,
214 			    ESF_DZ_TX_TSO_TCP_MSS, tcp_mss,
215 			    ESF_DZ_TX_TSO_OUTER_IPID, outer_ipv4_id);
216 }
217 
218 static inline void
219 sfc_ef10_tx_qpush(struct sfc_ef10_txq *txq, unsigned int added,
220 		  unsigned int pushed)
221 {
222 	efx_qword_t desc;
223 	efx_oword_t oword;
224 
225 	/*
226 	 * This improves performance by pushing a TX descriptor at the same
227 	 * time as the doorbell. The descriptor must be added to the TXQ,
228 	 * so that it can be used if the hardware decides not to use the
229 	 * pushed descriptor.
230 	 */
231 	desc.eq_u64[0] = txq->txq_hw_ring[pushed & txq->ptr_mask].eq_u64[0];
232 	EFX_POPULATE_OWORD_3(oword,
233 		ERF_DZ_TX_DESC_WPTR, added & txq->ptr_mask,
234 		ERF_DZ_TX_DESC_HWORD, EFX_QWORD_FIELD(desc, EFX_DWORD_1),
235 		ERF_DZ_TX_DESC_LWORD, EFX_QWORD_FIELD(desc, EFX_DWORD_0));
236 
237 	/* DMA sync to device is not required */
238 
239 	/*
240 	 * rte_io_wmb() guarantees that the STORE operations
241 	 * (i.e. Tx and event descriptor updates) that precede
242 	 * the rte_io_wmb() call are visible to the NIC before the STORE
243 	 * operations that follow it (i.e. the doorbell write).
244 	 */
245 	rte_io_wmb();
246 
247 	*(volatile __m128i *)txq->doorbell = oword.eo_u128[0];
248 }
249 
250 static unsigned int
251 sfc_ef10_tx_pkt_descs_max(const struct rte_mbuf *m)
252 {
253 	unsigned int extra_descs_per_seg;
254 	unsigned int extra_descs_per_pkt;
255 
256 	/*
257 	 * VLAN offload is not supported yet, so no extra descriptors
258 	 * are required for a VLAN option descriptor.
259 	 */
260 
261 /** Maximum length of the mbuf segment data */
262 #define SFC_MBUF_SEG_LEN_MAX		UINT16_MAX
263 	RTE_BUILD_BUG_ON(sizeof(m->data_len) != 2);
264 
265 	/*
266 	 * Each segment is already counted once below.  So, calculate
267 	 * how many extra DMA descriptors may be required per segment in
268 	 * the worst case because of the maximum DMA descriptor length limit.
269 	 * If the maximum segment length is less than or equal to the maximum
270 	 * DMA descriptor length, no extra DMA descriptors are required.
271 	 */
272 	extra_descs_per_seg =
273 		(SFC_MBUF_SEG_LEN_MAX - 1) / SFC_EF10_TX_DMA_DESC_LEN_MAX;
274 
275 /** Maximum length of the packet */
276 #define SFC_MBUF_PKT_LEN_MAX		UINT32_MAX
277 	RTE_BUILD_BUG_ON(sizeof(m->pkt_len) != 4);
278 
279 	/*
280 	 * One more limit on the number of extra DMA descriptors comes from
281 	 * slicing the entire packet because of the DMA descriptor length
282 	 * limit, taking into account that at least one segment is already
283 	 * counted below (hence the round-down division of the maximum
284 	 * packet length minus one).
285 	 * TSO packets are handled separately, so here the packet length is
286 	 * limited by the maximum PDU size.
287 	 */
288 	extra_descs_per_pkt =
289 		(RTE_MIN((unsigned int)EFX_MAC_PDU_MAX,
290 			 SFC_MBUF_PKT_LEN_MAX) - 1) /
291 		SFC_EF10_TX_DMA_DESC_LEN_MAX;
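	/*
	 * Illustrative numbers, assuming the 14-bit byte count field noted
	 * in sfc_ef10_prepare_pkts() (i.e. a 16383 byte descriptor limit):
	 * a 64 KB mbuf segment may need up to 4 extra descriptors, but since
	 * the maximum PDU (no TSO here) is well below the limit, the
	 * per-packet bound is normally zero extra descriptors.
	 */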
292 
293 	return m->nb_segs + RTE_MIN(m->nb_segs * extra_descs_per_seg,
294 				    extra_descs_per_pkt);
295 }
296 
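/*
 * Push the descriptors prepared so far (if any), reap completed descriptors
 * once per burst and report whether the required descriptor space is now
 * available.
 */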
297 static bool
298 sfc_ef10_try_reap(struct sfc_ef10_txq * const txq, unsigned int added,
299 		  unsigned int needed_desc, unsigned int *dma_desc_space,
300 		  bool *reap_done)
301 {
302 	if (*reap_done)
303 		return false;
304 
305 	if (added != txq->added) {
306 		sfc_ef10_tx_qpush(txq, added, txq->added);
307 		txq->added = added;
308 	}
309 
310 	sfc_ef10_tx_reap(txq);
311 	*reap_done = true;
312 
313 	/*
314 	 * Recalculate DMA descriptor space since Tx reap may change
315 	 * the number of completed descriptors
316 	 */
317 	*dma_desc_space = txq->max_fill_level -
318 		(added - txq->completed);
319 
320 	return (needed_desc <= *dma_desc_space);
321 }
322 
323 static uint16_t
324 sfc_ef10_prepare_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
325 		      uint16_t nb_pkts)
326 {
327 	struct sfc_ef10_txq * const txq = sfc_ef10_txq_by_dp_txq(tx_queue);
328 	uint16_t i;
329 
330 	for (i = 0; i < nb_pkts; i++) {
331 		struct rte_mbuf *m = tx_pkts[i];
332 		int ret;
333 
334 #ifdef RTE_LIBRTE_SFC_EFX_DEBUG
335 		/*
336 		 * In the non-TSO case, check that packet segments do not exceed
337 		 * the size limit. Perform the check in debug mode only, since an
338 		 * MTU above 9K is not supported while the limit here is 16K-1.
339 		 */
340 		if (!(m->ol_flags & PKT_TX_TCP_SEG)) {
341 			struct rte_mbuf *m_seg;
342 
343 			for (m_seg = m; m_seg != NULL; m_seg = m_seg->next) {
344 				if (m_seg->data_len >
345 				    SFC_EF10_TX_DMA_DESC_LEN_MAX) {
346 					rte_errno = EINVAL;
347 					break;
348 				}
349 			}
350 		}
351 #endif
352 		ret = sfc_dp_tx_prepare_pkt(m,
353 				txq->tso_tcp_header_offset_limit,
354 				txq->max_fill_level,
355 				SFC_EF10_TSO_OPT_DESCS_NUM, 0);
356 		if (unlikely(ret != 0)) {
357 			rte_errno = ret;
358 			break;
359 		}
360 	}
361 
362 	return i;
363 }
364 
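/*
 * A TSO transaction on EF10 consists of two FATSOv2 option descriptors,
 * one DMA descriptor for the (possibly linearized) header and one or more
 * DMA descriptors for the payload.
 */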
365 static int
366 sfc_ef10_xmit_tso_pkt(struct sfc_ef10_txq * const txq, struct rte_mbuf *m_seg,
367 		      unsigned int *added, unsigned int *dma_desc_space,
368 		      bool *reap_done)
369 {
370 	size_t iph_off = ((m_seg->ol_flags & PKT_TX_TUNNEL_MASK) ?
371 			  m_seg->outer_l2_len + m_seg->outer_l3_len : 0) +
372 			 m_seg->l2_len;
373 	size_t tcph_off = iph_off + m_seg->l3_len;
374 	size_t header_len = tcph_off + m_seg->l4_len;
375 	/* Offset of the payload in the last segment that contains the header */
376 	size_t in_off = 0;
377 	const struct rte_tcp_hdr *th;
378 	uint16_t packet_id = 0;
379 	uint16_t outer_packet_id = 0;
380 	uint32_t sent_seq;
381 	uint8_t *hdr_addr;
382 	rte_iova_t hdr_iova;
383 	struct rte_mbuf *first_m_seg = m_seg;
384 	unsigned int pkt_start = *added;
385 	unsigned int needed_desc;
386 	struct rte_mbuf *m_seg_to_free_up_to = first_m_seg;
387 	bool eop;
388 
389 	/*
390 	 * Preliminary estimate of the required DMA descriptors, including an
391 	 * extra descriptor for the TSO header; it is needed when the header
392 	 * shares a segment with the payload and must be described separately.
393 	 * The estimate does not include extra descriptors that may appear when
394 	 * a big segment is split across several descriptors.
395 	 */
396 	needed_desc = m_seg->nb_segs +
397 			(unsigned int)SFC_EF10_TSO_OPT_DESCS_NUM +
398 			(unsigned int)SFC_EF10_TSO_HDR_DESCS_NUM;
399 
400 	if (needed_desc > *dma_desc_space &&
401 	    !sfc_ef10_try_reap(txq, pkt_start, needed_desc,
402 			       dma_desc_space, reap_done)) {
403 		/*
404 		 * If a future Tx reap may increase the available DMA descriptor
405 		 * space, do not send the packet now so that it can be retried.
406 		 */
407 		if (txq->completed != pkt_start)
408 			return ENOSPC;
409 		/*
410 		 * Do not allow sending the packet if even the maximum DMA
411 		 * descriptor space is not sufficient to hold the TSO option
412 		 * descriptors, the header descriptor and at least one
413 		 * segment descriptor.
414 		 */
415 		if (*dma_desc_space < SFC_EF10_TSO_OPT_DESCS_NUM +
416 				SFC_EF10_TSO_HDR_DESCS_NUM + 1)
417 			return EMSGSIZE;
418 	}
419 
420 	/* Check if the header is not fragmented */
421 	if (rte_pktmbuf_data_len(m_seg) >= header_len) {
422 		hdr_addr = rte_pktmbuf_mtod(m_seg, uint8_t *);
423 		hdr_iova = rte_mbuf_data_iova(m_seg);
424 		if (rte_pktmbuf_data_len(m_seg) == header_len) {
425 			/* Cannot send a packet that consists only of header */
426 			if (unlikely(m_seg->next == NULL))
427 				return EMSGSIZE;
428 			/*
429 			 * Associate header mbuf with header descriptor
430 			 * which is located after TSO descriptors.
431 			 */
432 			txq->sw_ring[(pkt_start + SFC_EF10_TSO_OPT_DESCS_NUM) &
433 				     txq->ptr_mask].mbuf = m_seg;
434 			m_seg = m_seg->next;
435 			in_off = 0;
436 
437 			/*
438 			 * There is no payload offset (the payload starts at the
439 			 * beginning of the next segment), so an extra descriptor
440 			 * for a separated header is not needed.
441 			 */
442 			needed_desc--;
443 		} else {
444 			in_off = header_len;
445 		}
446 	} else {
447 		unsigned int copied_segs;
448 		unsigned int hdr_addr_off = (*added & txq->ptr_mask) *
449 				SFC_TSOH_STD_LEN;
450 
451 		/*
452 		 * Discard the packet if header linearization is needed but
453 		 * the header is too big.
454 		 * Duplicate the Tx prepare check here to avoid memory
455 		 * corruption if Tx prepare is skipped.
456 		 */
457 		if (unlikely(header_len > SFC_TSOH_STD_LEN))
458 			return EMSGSIZE;
459 
460 		hdr_addr = txq->tsoh + hdr_addr_off;
461 		hdr_iova = txq->tsoh_iova + hdr_addr_off;
462 		copied_segs = sfc_tso_prepare_header(hdr_addr, header_len,
463 						     &m_seg, &in_off);
464 
465 		/* Cannot send a packet that consists only of header */
466 		if (unlikely(m_seg == NULL))
467 			return EMSGSIZE;
468 
469 		m_seg_to_free_up_to = m_seg;
470 		/*
471 		 * Reduce the number of needed descriptors by the number of
472 		 * segments that entirely consist of header data.
473 		 */
474 		needed_desc -= copied_segs;
475 
476 		/* Extra descriptor for separated header is not needed */
477 		if (in_off == 0)
478 			needed_desc--;
479 	}
480 
481 	/*
482 	 * Tx prepare has debug-only checks that offload flags are correctly
483 	 * filled in the TSO mbuf. Use a zero IPID if there is no IPv4 flag.
484 	 * If the packet is IPv4 after all, HW simply starts from a zero IPID.
485 	 */
486 	if (first_m_seg->ol_flags & PKT_TX_IPV4)
487 		packet_id = sfc_tso_ip4_get_ipid(hdr_addr, iph_off);
488 
489 	if (first_m_seg->ol_flags & PKT_TX_OUTER_IPV4)
490 		outer_packet_id = sfc_tso_ip4_get_ipid(hdr_addr,
491 						first_m_seg->outer_l2_len);
492 
493 	th = (const struct rte_tcp_hdr *)(hdr_addr + tcph_off);
494 	rte_memcpy(&sent_seq, &th->sent_seq, sizeof(uint32_t));
495 	sent_seq = rte_be_to_cpu_32(sent_seq);
496 
497 	sfc_ef10_tx_qdesc_tso2_create(txq, *added, packet_id, outer_packet_id,
498 			sent_seq, first_m_seg->tso_segsz);
499 	(*added) += SFC_EF10_TSO_OPT_DESCS_NUM;
500 
501 	sfc_ef10_tx_qdesc_dma_create(hdr_iova, header_len, false,
502 			&txq->txq_hw_ring[(*added) & txq->ptr_mask]);
503 	(*added)++;
504 
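	/*
	 * Map the payload: walk the remaining segments and slice any segment
	 * longer than the DMA descriptor length limit into several
	 * descriptors.
	 */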
505 	do {
506 		rte_iova_t next_frag = rte_mbuf_data_iova(m_seg);
507 		unsigned int seg_len = rte_pktmbuf_data_len(m_seg);
508 		unsigned int id;
509 
510 		next_frag += in_off;
511 		seg_len -= in_off;
512 		in_off = 0;
513 
514 		do {
515 			rte_iova_t frag_addr = next_frag;
516 			size_t frag_len;
517 
518 			frag_len = RTE_MIN(seg_len,
519 					   SFC_EF10_TX_DMA_DESC_LEN_MAX);
520 
521 			next_frag += frag_len;
522 			seg_len -= frag_len;
523 
524 			eop = (seg_len == 0 && m_seg->next == NULL);
525 
526 			id = (*added) & txq->ptr_mask;
527 			(*added)++;
528 
529 			/*
530 			 * Initially we assume that one DMA descriptor is needed
531 			 * for every segment. When a segment is split across
532 			 * several DMA descriptors, increase the estimate.
533 			 */
534 			needed_desc += (seg_len != 0);
535 
536 			/*
537 			 * Handle the case when no more descriptors can be added,
538 			 * but not all segments have been processed yet.
539 			 */
540 			if (*added - pkt_start == *dma_desc_space &&
541 			    !eop &&
542 			    !sfc_ef10_try_reap(txq, pkt_start, needed_desc,
543 						dma_desc_space, reap_done)) {
544 				struct rte_mbuf *m;
545 				struct rte_mbuf *m_next;
546 
547 				if (txq->completed != pkt_start) {
548 					unsigned int i;
549 
550 					/*
551 					 * Reset mbuf associations with added
552 					 * descriptors.
553 					 */
554 					for (i = pkt_start; i != *added; i++) {
555 						id = i & txq->ptr_mask;
556 						txq->sw_ring[id].mbuf = NULL;
557 					}
558 					return ENOSPC;
559 				}
560 
561 				/* Free the segments that cannot be sent */
562 				for (m = m_seg->next; m != NULL; m = m_next) {
563 					m_next = m->next;
564 					rte_pktmbuf_free_seg(m);
565 				}
566 				eop = true;
567 				/* Ignore the rest of the segment */
568 				seg_len = 0;
569 			}
570 
571 			sfc_ef10_tx_qdesc_dma_create(frag_addr, frag_len,
572 					eop, &txq->txq_hw_ring[id]);
573 
574 		} while (seg_len != 0);
575 
576 		txq->sw_ring[id].mbuf = m_seg;
577 
578 		m_seg = m_seg->next;
579 	} while (!eop);
580 
581 	/*
582 	 * Free the segments whose content was entirely copied to the TSO
583 	 * header memory space of the Tx queue.
584 	 */
585 	for (m_seg = first_m_seg; m_seg != m_seg_to_free_up_to;) {
586 		struct rte_mbuf *seg_to_free = m_seg;
587 
588 		m_seg = m_seg->next;
589 		rte_pktmbuf_free_seg(seg_to_free);
590 	}
591 
592 	return 0;
593 }
594 
595 static uint16_t
596 sfc_ef10_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
597 {
598 	struct sfc_ef10_txq * const txq = sfc_ef10_txq_by_dp_txq(tx_queue);
599 	unsigned int added;
600 	unsigned int dma_desc_space;
601 	bool reap_done;
602 	struct rte_mbuf **pktp;
603 	struct rte_mbuf **pktp_end;
604 
605 	if (unlikely(txq->flags &
606 		     (SFC_EF10_TXQ_NOT_RUNNING | SFC_EF10_TXQ_EXCEPTION)))
607 		return 0;
608 
609 	added = txq->added;
610 	dma_desc_space = txq->max_fill_level - (added - txq->completed);
611 
612 	reap_done = (dma_desc_space < txq->free_thresh);
613 	if (reap_done) {
614 		sfc_ef10_tx_reap(txq);
615 		dma_desc_space = txq->max_fill_level - (added - txq->completed);
616 	}
617 
618 	for (pktp = &tx_pkts[0], pktp_end = &tx_pkts[nb_pkts];
619 	     pktp != pktp_end;
620 	     ++pktp) {
621 		struct rte_mbuf *m_seg = *pktp;
622 		unsigned int pkt_start = added;
623 		uint32_t pkt_len;
624 
625 		if (likely(pktp + 1 != pktp_end))
626 			rte_mbuf_prefetch_part1(pktp[1]);
627 
628 		if (m_seg->ol_flags & PKT_TX_TCP_SEG) {
629 			int rc;
630 
631 			rc = sfc_ef10_xmit_tso_pkt(txq, m_seg, &added,
632 					&dma_desc_space, &reap_done);
633 			if (rc != 0) {
634 				added = pkt_start;
635 
636 				/* Packet can be sent in following xmit calls */
637 				if (likely(rc == ENOSPC))
638 					break;
639 
640 				/*
641 				 * The packet cannot be sent. Report it to
642 				 * the caller as sent, but actually drop it
643 				 * and continue with the next packet.
644 				 */
645 				rte_pktmbuf_free(*pktp);
646 				continue;
647 			}
648 
649 			goto dma_desc_space_update;
650 		}
651 
652 		if (sfc_ef10_tx_pkt_descs_max(m_seg) > dma_desc_space) {
653 			if (reap_done)
654 				break;
655 
656 			/* Push already prepared descriptors before polling */
657 			if (added != txq->added) {
658 				sfc_ef10_tx_qpush(txq, added, txq->added);
659 				txq->added = added;
660 			}
661 
662 			sfc_ef10_tx_reap(txq);
663 			reap_done = true;
664 			dma_desc_space = txq->max_fill_level -
665 				(added - txq->completed);
666 			if (sfc_ef10_tx_pkt_descs_max(m_seg) > dma_desc_space)
667 				break;
668 		}
669 
670 		pkt_len = m_seg->pkt_len;
671 		do {
672 			rte_iova_t seg_addr = rte_mbuf_data_iova(m_seg);
673 			unsigned int seg_len = rte_pktmbuf_data_len(m_seg);
674 			unsigned int id = added & txq->ptr_mask;
675 
676 			SFC_ASSERT(seg_len <= SFC_EF10_TX_DMA_DESC_LEN_MAX);
677 
678 			pkt_len -= seg_len;
679 
680 			sfc_ef10_tx_qdesc_dma_create(seg_addr,
681 				seg_len, (pkt_len == 0),
682 				&txq->txq_hw_ring[id]);
683 
684 			/*
685 			 * rte_pktmbuf_free() is commonly used in DPDK for
686 			 * recycling packets: the function checks every
687 			 * segment's reference counter and returns the
688 			 * buffer to its pool whenever possible.
689 			 * Nevertheless, freeing mbuf segments one by one
690 			 * may entail some performance decline.
691 			 * For this reason, sfc_ef10_tx_reap() does the same
692 			 * job on its own and frees buffers in bulk (all
693 			 * mbufs within a bulk belong to the same pool).
694 			 * From this perspective, individual segment pointers
695 			 * must be associated with the corresponding SW
696 			 * descriptors independently so that a single loop
697 			 * on reap is sufficient to inspect all the buffers.
698 			 */
699 			txq->sw_ring[id].mbuf = m_seg;
700 
701 			++added;
702 
703 		} while ((m_seg = m_seg->next) != 0);
704 
705 dma_desc_space_update:
706 		dma_desc_space -= (added - pkt_start);
707 	}
708 
709 	if (likely(added != txq->added)) {
710 		sfc_ef10_tx_qpush(txq, added, txq->added);
711 		txq->added = added;
712 	}
713 
714 #if SFC_TX_XMIT_PKTS_REAP_AT_LEAST_ONCE
715 	if (!reap_done)
716 		sfc_ef10_tx_reap(txq);
717 #endif
718 
719 	return pktp - &tx_pkts[0];
720 }
721 
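/*
 * Simplified reap used by the ef10_simple datapath: fast free is mandatory
 * there, so all mbufs belong to the same mempool and reference counters do
 * not need to be checked.
 */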
722 static void
723 sfc_ef10_simple_tx_reap(struct sfc_ef10_txq *txq)
724 {
725 	const unsigned int old_read_ptr = txq->evq_read_ptr;
726 	const unsigned int ptr_mask = txq->ptr_mask;
727 	unsigned int completed = txq->completed;
728 	unsigned int pending = completed;
729 
730 	pending += sfc_ef10_tx_process_events(txq);
731 
732 	if (pending != completed) {
733 		struct rte_mbuf *bulk[SFC_TX_REAP_BULK_SIZE];
734 		unsigned int nb = 0;
735 
736 		do {
737 			struct sfc_ef10_tx_sw_desc *txd;
738 
739 			txd = &txq->sw_ring[completed & ptr_mask];
740 
741 			if (nb == RTE_DIM(bulk)) {
742 				rte_mempool_put_bulk(bulk[0]->pool,
743 						     (void *)bulk, nb);
744 				nb = 0;
745 			}
746 
747 			bulk[nb++] = txd->mbuf;
748 		} while (++completed != pending);
749 
750 		rte_mempool_put_bulk(bulk[0]->pool, (void *)bulk, nb);
751 
752 		txq->completed = completed;
753 	}
754 
755 	sfc_ef10_ev_qclear(txq->evq_hw_ring, ptr_mask, old_read_ptr,
756 			   txq->evq_read_ptr);
757 }
758 
759 #ifdef RTE_LIBRTE_SFC_EFX_DEBUG
760 static uint16_t
761 sfc_ef10_simple_prepare_pkts(__rte_unused void *tx_queue,
762 			     struct rte_mbuf **tx_pkts,
763 			     uint16_t nb_pkts)
764 {
765 	uint16_t i;
766 
767 	for (i = 0; i < nb_pkts; i++) {
768 		struct rte_mbuf *m = tx_pkts[i];
769 		int ret;
770 
771 		ret = rte_validate_tx_offload(m);
772 		if (unlikely(ret != 0)) {
773 			/*
774 			 * A negative error code is returned by
775 			 * rte_validate_tx_offload(), but positive error codes
776 			 * are used inside the net/sfc PMD.
777 			 */
778 			SFC_ASSERT(ret < 0);
779 			rte_errno = -ret;
780 			break;
781 		}
782 
783 		/* ef10_simple does not support TSO and VLAN insertion */
784 		if (unlikely(m->ol_flags &
785 			     (PKT_TX_TCP_SEG | PKT_TX_VLAN_PKT))) {
786 			rte_errno = ENOTSUP;
787 			break;
788 		}
789 
790 		/* ef10_simple does not support scattered packets */
791 		if (unlikely(m->nb_segs != 1)) {
792 			rte_errno = ENOTSUP;
793 			break;
794 		}
795 
796 		/*
797 		 * ef10_simple requires fast-free which ignores reference
798 		 * counters
799 		 */
800 		if (unlikely(rte_mbuf_refcnt_read(m) != 1)) {
801 			rte_errno = ENOTSUP;
802 			break;
803 		}
804 
805 		/* ef10_simple requires single pool for all packets */
806 		if (unlikely(m->pool != tx_pkts[0]->pool)) {
807 			rte_errno = ENOTSUP;
808 			break;
809 		}
810 	}
811 
812 	return i;
813 }
814 #endif
815 
816 static uint16_t
817 sfc_ef10_simple_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
818 			  uint16_t nb_pkts)
819 {
820 	struct sfc_ef10_txq * const txq = sfc_ef10_txq_by_dp_txq(tx_queue);
821 	unsigned int ptr_mask;
822 	unsigned int added;
823 	unsigned int dma_desc_space;
824 	bool reap_done;
825 	struct rte_mbuf **pktp;
826 	struct rte_mbuf **pktp_end;
827 
828 	if (unlikely(txq->flags &
829 		     (SFC_EF10_TXQ_NOT_RUNNING | SFC_EF10_TXQ_EXCEPTION)))
830 		return 0;
831 
832 	ptr_mask = txq->ptr_mask;
833 	added = txq->added;
834 	dma_desc_space = txq->max_fill_level - (added - txq->completed);
835 
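	/* Reap if remaining space may be insufficient for the whole burst */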
836 	reap_done = (dma_desc_space < RTE_MAX(txq->free_thresh, nb_pkts));
837 	if (reap_done) {
838 		sfc_ef10_simple_tx_reap(txq);
839 		dma_desc_space = txq->max_fill_level - (added - txq->completed);
840 	}
841 
842 	pktp_end = &tx_pkts[MIN(nb_pkts, dma_desc_space)];
843 	for (pktp = &tx_pkts[0]; pktp != pktp_end; ++pktp) {
844 		struct rte_mbuf *pkt = *pktp;
845 		unsigned int id = added & ptr_mask;
846 
847 		SFC_ASSERT(rte_pktmbuf_data_len(pkt) <=
848 			   SFC_EF10_TX_DMA_DESC_LEN_MAX);
849 
850 		sfc_ef10_tx_qdesc_dma_create(rte_mbuf_data_iova(pkt),
851 					     rte_pktmbuf_data_len(pkt),
852 					     true, &txq->txq_hw_ring[id]);
853 
854 		txq->sw_ring[id].mbuf = pkt;
855 
856 		++added;
857 	}
858 
859 	if (likely(added != txq->added)) {
860 		sfc_ef10_tx_qpush(txq, added, txq->added);
861 		txq->added = added;
862 	}
863 
864 #if SFC_TX_XMIT_PKTS_REAP_AT_LEAST_ONCE
865 	if (!reap_done)
866 		sfc_ef10_simple_tx_reap(txq);
867 #endif
868 
869 	return pktp - &tx_pkts[0];
870 }
871 
872 static sfc_dp_tx_get_dev_info_t sfc_ef10_get_dev_info;
873 static void
874 sfc_ef10_get_dev_info(struct rte_eth_dev_info *dev_info)
875 {
876 	/*
877 	 * The number of descriptors just defines the maximum number of
878 	 * pushed descriptors (fill level).
879 	 */
880 	dev_info->tx_desc_lim.nb_min = 1;
881 	dev_info->tx_desc_lim.nb_align = 1;
882 }
883 
884 static sfc_dp_tx_qsize_up_rings_t sfc_ef10_tx_qsize_up_rings;
885 static int
886 sfc_ef10_tx_qsize_up_rings(uint16_t nb_tx_desc,
887 			   struct sfc_dp_tx_hw_limits *limits,
888 			   unsigned int *txq_entries,
889 			   unsigned int *evq_entries,
890 			   unsigned int *txq_max_fill_level)
891 {
892 	/*
893 	 * rte_ethdev API guarantees that the number meets min, max and
894 	 * alignment requirements.
895 	 */
896 	if (nb_tx_desc <= limits->txq_min_entries)
897 		*txq_entries = limits->txq_min_entries;
898 	else
899 		*txq_entries = rte_align32pow2(nb_tx_desc);
900 
901 	*evq_entries = *txq_entries;
902 
903 	*txq_max_fill_level = RTE_MIN(nb_tx_desc,
904 				      SFC_EF10_TXQ_LIMIT(*evq_entries));
905 	return 0;
906 }
907 
908 static sfc_dp_tx_qcreate_t sfc_ef10_tx_qcreate;
909 static int
910 sfc_ef10_tx_qcreate(uint16_t port_id, uint16_t queue_id,
911 		    const struct rte_pci_addr *pci_addr, int socket_id,
912 		    const struct sfc_dp_tx_qcreate_info *info,
913 		    struct sfc_dp_txq **dp_txqp)
914 {
915 	struct sfc_ef10_txq *txq;
916 	int rc;
917 
918 	rc = EINVAL;
919 	if (info->txq_entries != info->evq_entries)
920 		goto fail_bad_args;
921 
922 	rc = ENOMEM;
923 	txq = rte_zmalloc_socket("sfc-ef10-txq", sizeof(*txq),
924 				 RTE_CACHE_LINE_SIZE, socket_id);
925 	if (txq == NULL)
926 		goto fail_txq_alloc;
927 
928 	sfc_dp_queue_init(&txq->dp.dpq, port_id, queue_id, pci_addr);
929 
930 	rc = ENOMEM;
931 	txq->sw_ring = rte_calloc_socket("sfc-ef10-txq-sw_ring",
932 					 info->txq_entries,
933 					 sizeof(*txq->sw_ring),
934 					 RTE_CACHE_LINE_SIZE, socket_id);
935 	if (txq->sw_ring == NULL)
936 		goto fail_sw_ring_alloc;
937 
938 	if (info->offloads & (DEV_TX_OFFLOAD_TCP_TSO |
939 			      DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
940 			      DEV_TX_OFFLOAD_GENEVE_TNL_TSO)) {
941 		txq->tsoh = rte_calloc_socket("sfc-ef10-txq-tsoh",
942 					      info->txq_entries,
943 					      SFC_TSOH_STD_LEN,
944 					      RTE_CACHE_LINE_SIZE,
945 					      socket_id);
946 		if (txq->tsoh == NULL)
947 			goto fail_tsoh_alloc;
948 
949 		txq->tsoh_iova = rte_malloc_virt2iova(txq->tsoh);
950 	}
951 
952 	txq->flags = SFC_EF10_TXQ_NOT_RUNNING;
953 	txq->ptr_mask = info->txq_entries - 1;
954 	txq->max_fill_level = info->max_fill_level;
955 	txq->free_thresh = info->free_thresh;
956 	txq->txq_hw_ring = info->txq_hw_ring;
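	/*
	 * Doorbell: the TX descriptor update register within this queue's
	 * VI window in the PCI memory BAR
	 */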
957 	txq->doorbell = (volatile uint8_t *)info->mem_bar +
958 			ER_DZ_TX_DESC_UPD_REG_OFST +
959 			(info->hw_index << info->vi_window_shift);
960 	txq->evq_hw_ring = info->evq_hw_ring;
961 	txq->tso_tcp_header_offset_limit = info->tso_tcp_header_offset_limit;
962 
963 	*dp_txqp = &txq->dp;
964 	return 0;
965 
966 fail_tsoh_alloc:
967 	rte_free(txq->sw_ring);
968 
969 fail_sw_ring_alloc:
970 	rte_free(txq);
971 
972 fail_txq_alloc:
973 fail_bad_args:
974 	return rc;
975 }
976 
977 static sfc_dp_tx_qdestroy_t sfc_ef10_tx_qdestroy;
978 static void
979 sfc_ef10_tx_qdestroy(struct sfc_dp_txq *dp_txq)
980 {
981 	struct sfc_ef10_txq *txq = sfc_ef10_txq_by_dp_txq(dp_txq);
982 
983 	rte_free(txq->tsoh);
984 	rte_free(txq->sw_ring);
985 	rte_free(txq);
986 }
987 
988 static sfc_dp_tx_qstart_t sfc_ef10_tx_qstart;
989 static int
990 sfc_ef10_tx_qstart(struct sfc_dp_txq *dp_txq, unsigned int evq_read_ptr,
991 		   unsigned int txq_desc_index)
992 {
993 	struct sfc_ef10_txq *txq = sfc_ef10_txq_by_dp_txq(dp_txq);
994 
995 	txq->evq_read_ptr = evq_read_ptr;
996 	txq->added = txq->completed = txq_desc_index;
997 
998 	txq->flags |= SFC_EF10_TXQ_STARTED;
999 	txq->flags &= ~(SFC_EF10_TXQ_NOT_RUNNING | SFC_EF10_TXQ_EXCEPTION);
1000 
1001 	return 0;
1002 }
1003 
1004 static sfc_dp_tx_qstop_t sfc_ef10_tx_qstop;
1005 static void
1006 sfc_ef10_tx_qstop(struct sfc_dp_txq *dp_txq, unsigned int *evq_read_ptr)
1007 {
1008 	struct sfc_ef10_txq *txq = sfc_ef10_txq_by_dp_txq(dp_txq);
1009 
1010 	txq->flags |= SFC_EF10_TXQ_NOT_RUNNING;
1011 
1012 	*evq_read_ptr = txq->evq_read_ptr;
1013 }
1014 
1015 static sfc_dp_tx_qtx_ev_t sfc_ef10_tx_qtx_ev;
1016 static bool
1017 sfc_ef10_tx_qtx_ev(struct sfc_dp_txq *dp_txq, __rte_unused unsigned int id)
1018 {
1019 	__rte_unused struct sfc_ef10_txq *txq = sfc_ef10_txq_by_dp_txq(dp_txq);
1020 
1021 	SFC_ASSERT(txq->flags & SFC_EF10_TXQ_NOT_RUNNING);
1022 
1023 	/*
1024 	 * It is safe to ignore Tx event since we reap all mbufs on
1025 	 * queue purge anyway.
1026 	 */
1027 
1028 	return false;
1029 }
1030 
1031 static sfc_dp_tx_qreap_t sfc_ef10_tx_qreap;
1032 static void
1033 sfc_ef10_tx_qreap(struct sfc_dp_txq *dp_txq)
1034 {
1035 	struct sfc_ef10_txq *txq = sfc_ef10_txq_by_dp_txq(dp_txq);
1036 	unsigned int completed;
1037 
1038 	for (completed = txq->completed; completed != txq->added; ++completed) {
1039 		struct sfc_ef10_tx_sw_desc *txd;
1040 
1041 		txd = &txq->sw_ring[completed & txq->ptr_mask];
1042 		if (txd->mbuf != NULL) {
1043 			rte_pktmbuf_free_seg(txd->mbuf);
1044 			txd->mbuf = NULL;
1045 		}
1046 	}
1047 
1048 	txq->flags &= ~SFC_EF10_TXQ_STARTED;
1049 }
1050 
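/*
 * Count Tx descriptors that have been completed by the HW but not reaped
 * yet, without consuming the corresponding events.
 */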
1051 static unsigned int
1052 sfc_ef10_tx_qdesc_npending(struct sfc_ef10_txq *txq)
1053 {
1054 	const unsigned int curr_done = txq->completed - 1;
1055 	unsigned int anew_done = curr_done;
1056 	efx_qword_t tx_ev;
1057 	const unsigned int evq_old_read_ptr = txq->evq_read_ptr;
1058 
1059 	if (unlikely(txq->flags &
1060 		     (SFC_EF10_TXQ_NOT_RUNNING | SFC_EF10_TXQ_EXCEPTION)))
1061 		return 0;
1062 
1063 	while (sfc_ef10_tx_get_event(txq, &tx_ev))
1064 		anew_done = EFX_QWORD_FIELD(tx_ev, ESF_DZ_TX_DESCR_INDX);
1065 
1066 	/*
1067 	 * The function does not process events, so restore the event queue
1068 	 * read pointer to its original position to allow the events that
1069 	 * were read to be processed later.
1070 	 */
1071 	txq->evq_read_ptr = evq_old_read_ptr;
1072 
1073 	return (anew_done - curr_done) & txq->ptr_mask;
1074 }
1075 
1076 static sfc_dp_tx_qdesc_status_t sfc_ef10_tx_qdesc_status;
1077 static int
1078 sfc_ef10_tx_qdesc_status(struct sfc_dp_txq *dp_txq,
1079 			 uint16_t offset)
1080 {
1081 	struct sfc_ef10_txq *txq = sfc_ef10_txq_by_dp_txq(dp_txq);
1082 	unsigned int npending = sfc_ef10_tx_qdesc_npending(txq);
1083 
1084 	if (unlikely(offset > txq->ptr_mask))
1085 		return -EINVAL;
1086 
1087 	if (unlikely(offset >= txq->max_fill_level))
1088 		return RTE_ETH_TX_DESC_UNAVAIL;
1089 
1090 	if (unlikely(offset < npending))
1091 		return RTE_ETH_TX_DESC_FULL;
1092 
1093 	return RTE_ETH_TX_DESC_DONE;
1094 }
1095 
1096 struct sfc_dp_tx sfc_ef10_tx = {
1097 	.dp = {
1098 		.name		= SFC_KVARG_DATAPATH_EF10,
1099 		.type		= SFC_DP_TX,
1100 		.hw_fw_caps	= SFC_DP_HW_FW_CAP_EF10,
1101 	},
1102 	.features		= SFC_DP_TX_FEAT_MULTI_PROCESS,
1103 	.dev_offload_capa	= DEV_TX_OFFLOAD_MULTI_SEGS,
1104 	.queue_offload_capa	= DEV_TX_OFFLOAD_IPV4_CKSUM |
1105 				  DEV_TX_OFFLOAD_UDP_CKSUM |
1106 				  DEV_TX_OFFLOAD_TCP_CKSUM |
1107 				  DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM |
1108 				  DEV_TX_OFFLOAD_TCP_TSO |
1109 				  DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
1110 				  DEV_TX_OFFLOAD_GENEVE_TNL_TSO,
1111 	.get_dev_info		= sfc_ef10_get_dev_info,
1112 	.qsize_up_rings		= sfc_ef10_tx_qsize_up_rings,
1113 	.qcreate		= sfc_ef10_tx_qcreate,
1114 	.qdestroy		= sfc_ef10_tx_qdestroy,
1115 	.qstart			= sfc_ef10_tx_qstart,
1116 	.qtx_ev			= sfc_ef10_tx_qtx_ev,
1117 	.qstop			= sfc_ef10_tx_qstop,
1118 	.qreap			= sfc_ef10_tx_qreap,
1119 	.qdesc_status		= sfc_ef10_tx_qdesc_status,
1120 	.pkt_prepare		= sfc_ef10_prepare_pkts,
1121 	.pkt_burst		= sfc_ef10_xmit_pkts,
1122 };
1123 
1124 struct sfc_dp_tx sfc_ef10_simple_tx = {
1125 	.dp = {
1126 		.name		= SFC_KVARG_DATAPATH_EF10_SIMPLE,
1127 		.type		= SFC_DP_TX,
1128 	},
1129 	.features		= SFC_DP_TX_FEAT_MULTI_PROCESS,
1130 	.dev_offload_capa	= DEV_TX_OFFLOAD_MBUF_FAST_FREE,
1131 	.queue_offload_capa	= DEV_TX_OFFLOAD_IPV4_CKSUM |
1132 				  DEV_TX_OFFLOAD_UDP_CKSUM |
1133 				  DEV_TX_OFFLOAD_TCP_CKSUM |
1134 				  DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM,
1135 	.get_dev_info		= sfc_ef10_get_dev_info,
1136 	.qsize_up_rings		= sfc_ef10_tx_qsize_up_rings,
1137 	.qcreate		= sfc_ef10_tx_qcreate,
1138 	.qdestroy		= sfc_ef10_tx_qdestroy,
1139 	.qstart			= sfc_ef10_tx_qstart,
1140 	.qtx_ev			= sfc_ef10_tx_qtx_ev,
1141 	.qstop			= sfc_ef10_tx_qstop,
1142 	.qreap			= sfc_ef10_tx_qreap,
1143 	.qdesc_status		= sfc_ef10_tx_qdesc_status,
1144 #ifdef RTE_LIBRTE_SFC_EFX_DEBUG
1145 	.pkt_prepare		= sfc_ef10_simple_prepare_pkts,
1146 #endif
1147 	.pkt_burst		= sfc_ef10_simple_xmit_pkts,
1148 };
1149