xref: /dpdk/drivers/net/sfc/sfc_ef100_tx.c (revision 68a03efeed657e6e05f281479b33b51102797e15)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  *
3  * Copyright(c) 2019-2021 Xilinx, Inc.
4  * Copyright(c) 2018-2019 Solarflare Communications Inc.
5  *
6  * This software was jointly developed between OKTET Labs (under contract
7  * for Solarflare) and Solarflare Communications, Inc.
8  */
9 
10 #include <stdbool.h>
11 
12 #include <rte_mbuf.h>
13 #include <rte_io.h>
14 #include <rte_net.h>
15 
16 #include "efx.h"
17 #include "efx_types.h"
18 #include "efx_regs.h"
19 #include "efx_regs_ef100.h"
20 
21 #include "sfc_debug.h"
22 #include "sfc_dp_tx.h"
23 #include "sfc_tweak.h"
24 #include "sfc_kvargs.h"
25 #include "sfc_ef100.h"
26 
27 
28 #define sfc_ef100_tx_err(_txq, ...) \
29 	SFC_DP_LOG(SFC_KVARG_DATAPATH_EF100, ERR, &(_txq)->dp.dpq, __VA_ARGS__)
30 
31 #define sfc_ef100_tx_debug(_txq, ...) \
32 	SFC_DP_LOG(SFC_KVARG_DATAPATH_EF100, DEBUG, &(_txq)->dp.dpq, \
33 		   __VA_ARGS__)
34 
35 
36 /** Maximum length of the send descriptor data */
37 #define SFC_EF100_TX_SEND_DESC_LEN_MAX \
38 	((1u << ESF_GZ_TX_SEND_LEN_WIDTH) - 1)
39 
40 /** Maximum length of the segment descriptor data */
41 #define SFC_EF100_TX_SEG_DESC_LEN_MAX \
42 	((1u << ESF_GZ_TX_SEG_LEN_WIDTH) - 1)
43 
44 /**
45  * Maximum number of descriptors/buffers in the Tx ring.
46  * It should guarantee that the corresponding event queue never overfills.
47  * The EF100 native datapath uses an event queue of the same size as the
48  * Tx queue. The maximum number of events on the datapath can be estimated
49  * as the number of Tx queue entries (one event per Tx buffer in the worst
50  * case) plus Tx error and flush events.
51  */
52 #define SFC_EF100_TXQ_LIMIT(_ndesc) \
53 	((_ndesc) - 1 /* head must not step on tail */ - \
54 	 1 /* Tx error */ - 1 /* flush */)
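/*
 * Illustrative arithmetic (the ring size below is an example only):
 * SFC_EF100_TXQ_LIMIT(1024) = 1024 - 1 - 1 - 1 = 1021, i.e. at most
 * 1021 descriptors may be in flight on a 1024-entry Tx ring.
 */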
55 
56 struct sfc_ef100_tx_sw_desc {
57 	struct rte_mbuf			*mbuf;
58 };
59 
60 struct sfc_ef100_txq {
61 	unsigned int			flags;
62 #define SFC_EF100_TXQ_STARTED		0x1
63 #define SFC_EF100_TXQ_NOT_RUNNING	0x2
64 #define SFC_EF100_TXQ_EXCEPTION		0x4
65 
66 	unsigned int			ptr_mask;
67 	unsigned int			added;
68 	unsigned int			completed;
69 	unsigned int			max_fill_level;
70 	unsigned int			free_thresh;
71 	struct sfc_ef100_tx_sw_desc	*sw_ring;
72 	efx_oword_t			*txq_hw_ring;
73 	volatile void			*doorbell;
74 
75 	/* Completion/reap */
76 	unsigned int			evq_read_ptr;
77 	unsigned int			evq_phase_bit_shift;
78 	volatile efx_qword_t		*evq_hw_ring;
79 
80 	uint16_t			tso_tcp_header_offset_limit;
81 	uint16_t			tso_max_nb_header_descs;
82 	uint16_t			tso_max_header_len;
83 	uint16_t			tso_max_nb_payload_descs;
84 	uint32_t			tso_max_payload_len;
85 	uint32_t			tso_max_nb_outgoing_frames;
86 
87 	/* Datapath transmit queue anchor */
88 	struct sfc_dp_txq		dp;
89 };
90 
91 static inline struct sfc_ef100_txq *
92 sfc_ef100_txq_by_dp_txq(struct sfc_dp_txq *dp_txq)
93 {
94 	return container_of(dp_txq, struct sfc_ef100_txq, dp);
95 }
96 
97 static int
98 sfc_ef100_tx_prepare_pkt_tso(struct sfc_ef100_txq * const txq,
99 			     struct rte_mbuf *m)
100 {
101 	size_t header_len = ((m->ol_flags & PKT_TX_TUNNEL_MASK) ?
102 			     m->outer_l2_len + m->outer_l3_len : 0) +
103 			    m->l2_len + m->l3_len + m->l4_len;
104 	size_t payload_len = m->pkt_len - header_len;
105 	unsigned long mss_conformant_max_payload_len;
106 	unsigned int nb_payload_descs;
107 
108 #ifdef RTE_LIBRTE_SFC_EFX_DEBUG
109 	switch (m->ol_flags & PKT_TX_TUNNEL_MASK) {
110 	case 0:
111 		/* FALLTHROUGH */
112 	case PKT_TX_TUNNEL_VXLAN:
113 		/* FALLTHROUGH */
114 	case PKT_TX_TUNNEL_GENEVE:
115 		break;
116 	default:
117 		return ENOTSUP;
118 	}
119 #endif
120 
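	/*
	 * A TSO packet is MSS-conformant only if its payload fits into the
	 * number of outgoing frames the NIC is willing to generate for it.
	 * For instance (illustrative values only), with tso_segsz of 1448
	 * bytes and a limit of 32768 outgoing frames the payload may not
	 * exceed 1448 * 32768 bytes.
	 */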
121 	mss_conformant_max_payload_len =
122 		m->tso_segsz * txq->tso_max_nb_outgoing_frames;
123 
124 	/*
125 	 * The exact number of payload segments is not really needed here.
126 	 * Just use the total number of segments as an upper limit. In
127 	 * practice the maximum number of payload segments is significantly
128 	 * bigger than the maximum number of header segments, so header
129 	 * segments may be neglected when the total number of segments is
130 	 * used to estimate the number of payload segments required.
131 	 */
132 	nb_payload_descs = m->nb_segs;
133 
134 	/*
135 	 * Carry out multiple independent checks using bitwise OR
136 	 * to avoid unnecessary conditional branching.
137 	 */
138 	if (unlikely((header_len > txq->tso_max_header_len) |
139 		     (nb_payload_descs > txq->tso_max_nb_payload_descs) |
140 		     (payload_len > txq->tso_max_payload_len) |
141 		     (payload_len > mss_conformant_max_payload_len) |
142 		     (m->pkt_len == header_len)))
143 		return EINVAL;
144 
145 	return 0;
146 }
147 
148 static uint16_t
149 sfc_ef100_tx_prepare_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
150 			  uint16_t nb_pkts)
151 {
152 	struct sfc_ef100_txq * const txq = sfc_ef100_txq_by_dp_txq(tx_queue);
153 	uint16_t i;
154 
155 	for (i = 0; i < nb_pkts; i++) {
156 		struct rte_mbuf *m = tx_pkts[i];
157 		unsigned int max_nb_header_segs = 0;
158 		bool calc_phdr_cksum = false;
159 		int ret;
160 
161 		/*
162 		 * Partial checksum offload is used in the case of
163 		 * inner TCP/UDP checksum offload. It requires a
164 		 * pseudo-header checksum, calculated below, and the
165 		 * calculation requires contiguous packet headers.
166 		 */
167 		if ((m->ol_flags & PKT_TX_TUNNEL_MASK) &&
168 		    (m->ol_flags & PKT_TX_L4_MASK)) {
169 			calc_phdr_cksum = true;
170 			max_nb_header_segs = 1;
171 		} else if (m->ol_flags & PKT_TX_TCP_SEG) {
172 			max_nb_header_segs = txq->tso_max_nb_header_descs;
173 		}
174 
175 		ret = sfc_dp_tx_prepare_pkt(m, max_nb_header_segs, 0,
176 					    txq->tso_tcp_header_offset_limit,
177 					    txq->max_fill_level, 1, 0);
178 		if (unlikely(ret != 0)) {
179 			rte_errno = ret;
180 			break;
181 		}
182 
183 		if (m->ol_flags & PKT_TX_TCP_SEG) {
184 			ret = sfc_ef100_tx_prepare_pkt_tso(txq, m);
185 			if (unlikely(ret != 0)) {
186 				rte_errno = ret;
187 				break;
188 			}
189 		} else if (m->nb_segs > EFX_MASK32(ESF_GZ_TX_SEND_NUM_SEGS)) {
190 			rte_errno = EINVAL;
191 			break;
192 		}
193 
194 		if (calc_phdr_cksum) {
195 			/*
196 			 * Full checksum offload does IPv4 header checksum
197 			 * and does not require any assistance.
198 			 */
199 			ret = rte_net_intel_cksum_flags_prepare(m,
200 					m->ol_flags & ~PKT_TX_IP_CKSUM);
201 			if (unlikely(ret != 0)) {
202 				rte_errno = -ret;
203 				break;
204 			}
205 		}
206 	}
207 
208 	return i;
209 }
210 
211 static bool
212 sfc_ef100_tx_get_event(struct sfc_ef100_txq *txq, efx_qword_t *ev)
213 {
214 	volatile efx_qword_t *evq_hw_ring = txq->evq_hw_ring;
215 
216 	/*
217 	 * The exception flag may only be set while reaping; reap is never
218 	 * done twice per packet burst get, and absence of the flag is
219 	 * checked on burst get entry, so the flag must be clear here.
220 	 */
221 	SFC_ASSERT((txq->flags & SFC_EF100_TXQ_EXCEPTION) == 0);
222 
223 	*ev = evq_hw_ring[txq->evq_read_ptr & txq->ptr_mask];
224 
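	/*
	 * The event queue phase bit toggles on each ring wrap. The event in
	 * the slot is valid only if its phase bit matches the phase expected
	 * from the current read pointer; otherwise the slot has not been
	 * written by the NIC yet.
	 */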
225 	if (!sfc_ef100_ev_present(ev,
226 			(txq->evq_read_ptr >> txq->evq_phase_bit_shift) & 1))
227 		return false;
228 
229 	if (unlikely(!sfc_ef100_ev_type_is(ev,
230 					   ESE_GZ_EF100_EV_TX_COMPLETION))) {
231 		/*
232 		 * Do not move read_ptr to keep the event for exception
233 		 * handling by the control path.
234 		 */
235 		txq->flags |= SFC_EF100_TXQ_EXCEPTION;
236 		sfc_ef100_tx_err(txq,
237 			"TxQ exception at EvQ ptr %u(%#x), event %08x:%08x",
238 			txq->evq_read_ptr, txq->evq_read_ptr & txq->ptr_mask,
239 			EFX_QWORD_FIELD(*ev, EFX_DWORD_1),
240 			EFX_QWORD_FIELD(*ev, EFX_DWORD_0));
241 		return false;
242 	}
243 
244 	sfc_ef100_tx_debug(txq, "TxQ got event %08x:%08x at %u (%#x)",
245 			   EFX_QWORD_FIELD(*ev, EFX_DWORD_1),
246 			   EFX_QWORD_FIELD(*ev, EFX_DWORD_0),
247 			   txq->evq_read_ptr,
248 			   txq->evq_read_ptr & txq->ptr_mask);
249 
250 	txq->evq_read_ptr++;
251 	return true;
252 }
253 
254 static unsigned int
255 sfc_ef100_tx_process_events(struct sfc_ef100_txq *txq)
256 {
257 	unsigned int num_descs = 0;
258 	efx_qword_t tx_ev;
259 
260 	while (sfc_ef100_tx_get_event(txq, &tx_ev))
261 		num_descs += EFX_QWORD_FIELD(tx_ev, ESF_GZ_EV_TXCMPL_NUM_DESC);
262 
263 	return num_descs;
264 }
265 
266 static void
267 sfc_ef100_tx_reap_num_descs(struct sfc_ef100_txq *txq, unsigned int num_descs)
268 {
269 	if (num_descs > 0) {
270 		unsigned int completed = txq->completed;
271 		unsigned int pending = completed + num_descs;
272 		struct rte_mbuf *bulk[SFC_TX_REAP_BULK_SIZE];
273 		unsigned int nb = 0;
274 
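		/*
		 * Walk the completed descriptors and free the associated
		 * mbufs in bulk. The bulk buffer is flushed whenever it is
		 * full or the mempool changes, since rte_mempool_put_bulk()
		 * returns all objects of a bulk to a single mempool.
		 */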
275 		do {
276 			struct sfc_ef100_tx_sw_desc *txd;
277 			struct rte_mbuf *m;
278 
279 			txd = &txq->sw_ring[completed & txq->ptr_mask];
280 			if (txd->mbuf == NULL)
281 				continue;
282 
283 			m = rte_pktmbuf_prefree_seg(txd->mbuf);
284 			if (m == NULL)
285 				continue;
286 
287 			txd->mbuf = NULL;
288 
289 			if (nb == RTE_DIM(bulk) ||
290 			    (nb != 0 && m->pool != bulk[0]->pool)) {
291 				rte_mempool_put_bulk(bulk[0]->pool,
292 						     (void *)bulk, nb);
293 				nb = 0;
294 			}
295 
296 			bulk[nb++] = m;
297 		} while (++completed != pending);
298 
299 		if (nb != 0)
300 			rte_mempool_put_bulk(bulk[0]->pool, (void *)bulk, nb);
301 
302 		txq->completed = completed;
303 	}
304 }
305 
306 static void
307 sfc_ef100_tx_reap(struct sfc_ef100_txq *txq)
308 {
309 	sfc_ef100_tx_reap_num_descs(txq, sfc_ef100_tx_process_events(txq));
310 }
311 
312 static uint8_t
313 sfc_ef100_tx_qdesc_cso_inner_l3(uint64_t tx_tunnel)
314 {
315 	uint8_t inner_l3;
316 
317 	switch (tx_tunnel) {
318 	case PKT_TX_TUNNEL_VXLAN:
319 		inner_l3 = ESE_GZ_TX_DESC_CS_INNER_L3_VXLAN;
320 		break;
321 	case PKT_TX_TUNNEL_GENEVE:
322 		inner_l3 = ESE_GZ_TX_DESC_CS_INNER_L3_GENEVE;
323 		break;
324 	default:
325 		inner_l3 = ESE_GZ_TX_DESC_CS_INNER_L3_OFF;
326 		break;
327 	}
328 	return inner_l3;
329 }
330 
331 static void
332 sfc_ef100_tx_qdesc_send_create(const struct rte_mbuf *m, efx_oword_t *tx_desc)
333 {
334 	bool outer_l3;
335 	bool outer_l4;
336 	uint8_t inner_l3;
337 	uint8_t partial_en;
338 	uint16_t part_cksum_w;
339 	uint16_t l4_offset_w;
340 
341 	if ((m->ol_flags & PKT_TX_TUNNEL_MASK) == 0) {
342 		outer_l3 = (m->ol_flags & PKT_TX_IP_CKSUM);
343 		outer_l4 = (m->ol_flags & PKT_TX_L4_MASK);
344 		inner_l3 = ESE_GZ_TX_DESC_CS_INNER_L3_OFF;
345 		partial_en = ESE_GZ_TX_DESC_CSO_PARTIAL_EN_OFF;
346 		part_cksum_w = 0;
347 		l4_offset_w = 0;
348 	} else {
349 		outer_l3 = (m->ol_flags & PKT_TX_OUTER_IP_CKSUM);
350 		outer_l4 = (m->ol_flags & PKT_TX_OUTER_UDP_CKSUM);
351 		inner_l3 = sfc_ef100_tx_qdesc_cso_inner_l3(m->ol_flags &
352 							   PKT_TX_TUNNEL_MASK);
353 
354 		switch (m->ol_flags & PKT_TX_L4_MASK) {
355 		case PKT_TX_TCP_CKSUM:
356 			partial_en = ESE_GZ_TX_DESC_CSO_PARTIAL_EN_TCP;
357 			part_cksum_w = offsetof(struct rte_tcp_hdr, cksum) >> 1;
358 			break;
359 		case PKT_TX_UDP_CKSUM:
360 			partial_en = ESE_GZ_TX_DESC_CSO_PARTIAL_EN_UDP;
361 			part_cksum_w = offsetof(struct rte_udp_hdr,
362 						dgram_cksum) >> 1;
363 			break;
364 		default:
365 			partial_en = ESE_GZ_TX_DESC_CSO_PARTIAL_EN_OFF;
366 			part_cksum_w = 0;
367 			break;
368 		}
369 		l4_offset_w = (m->outer_l2_len + m->outer_l3_len +
370 				m->l2_len + m->l3_len) >> 1;
371 	}
372 
373 	EFX_POPULATE_OWORD_10(*tx_desc,
374 			ESF_GZ_TX_SEND_ADDR, rte_mbuf_data_iova(m),
375 			ESF_GZ_TX_SEND_LEN, rte_pktmbuf_data_len(m),
376 			ESF_GZ_TX_SEND_NUM_SEGS, m->nb_segs,
377 			ESF_GZ_TX_SEND_CSO_PARTIAL_START_W, l4_offset_w,
378 			ESF_GZ_TX_SEND_CSO_PARTIAL_CSUM_W, part_cksum_w,
379 			ESF_GZ_TX_SEND_CSO_PARTIAL_EN, partial_en,
380 			ESF_GZ_TX_SEND_CSO_INNER_L3, inner_l3,
381 			ESF_GZ_TX_SEND_CSO_OUTER_L3, outer_l3,
382 			ESF_GZ_TX_SEND_CSO_OUTER_L4, outer_l4,
383 			ESF_GZ_TX_DESC_TYPE, ESE_GZ_TX_DESC_TYPE_SEND);
384 
385 	if (m->ol_flags & PKT_TX_VLAN_PKT) {
386 		efx_oword_t tx_desc_extra_fields;
387 
388 		EFX_POPULATE_OWORD_2(tx_desc_extra_fields,
389 				ESF_GZ_TX_SEND_VLAN_INSERT_EN, 1,
390 				ESF_GZ_TX_SEND_VLAN_INSERT_TCI, m->vlan_tci);
391 
392 		EFX_OR_OWORD(*tx_desc, tx_desc_extra_fields);
393 	}
394 }
395 
396 static void
397 sfc_ef100_tx_qdesc_seg_create(rte_iova_t addr, uint16_t len,
398 			      efx_oword_t *tx_desc)
399 {
400 	EFX_POPULATE_OWORD_3(*tx_desc,
401 			ESF_GZ_TX_SEG_ADDR, addr,
402 			ESF_GZ_TX_SEG_LEN, len,
403 			ESF_GZ_TX_DESC_TYPE, ESE_GZ_TX_DESC_TYPE_SEG);
404 }
405 
406 static void
407 sfc_ef100_tx_qdesc_tso_create(const struct rte_mbuf *m,
408 			      uint16_t nb_header_descs,
409 			      uint16_t nb_payload_descs,
410 			      size_t header_len, size_t payload_len,
411 			      size_t outer_iph_off, size_t outer_udph_off,
412 			      size_t iph_off, size_t tcph_off,
413 			      efx_oword_t *tx_desc)
414 {
415 	efx_oword_t tx_desc_extra_fields;
416 	int ed_outer_udp_len = (outer_udph_off != 0) ? 1 : 0;
417 	int ed_outer_ip_len = (outer_iph_off != 0) ? 1 : 0;
418 	int ed_outer_ip_id = (outer_iph_off != 0) ?
419 		ESE_GZ_TX_DESC_IP4_ID_INC_MOD16 : 0;
420 	/*
421 	 * If no tunnel encapsulation is present, then the ED_INNER
422 	 * fields should be used.
423 	 */
424 	int ed_inner_ip_id = ESE_GZ_TX_DESC_IP4_ID_INC_MOD16;
425 	uint8_t inner_l3 = sfc_ef100_tx_qdesc_cso_inner_l3(
426 					m->ol_flags & PKT_TX_TUNNEL_MASK);
427 
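	/*
	 * Note that fields with the _W suffix are expressed in 2-byte words,
	 * hence the ">> 1" conversions of byte offsets and lengths below.
	 */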
428 	EFX_POPULATE_OWORD_10(*tx_desc,
429 			ESF_GZ_TX_TSO_MSS, m->tso_segsz,
430 			ESF_GZ_TX_TSO_HDR_NUM_SEGS, nb_header_descs,
431 			ESF_GZ_TX_TSO_PAYLOAD_NUM_SEGS, nb_payload_descs,
432 			ESF_GZ_TX_TSO_ED_OUTER_IP4_ID, ed_outer_ip_id,
433 			ESF_GZ_TX_TSO_ED_INNER_IP4_ID, ed_inner_ip_id,
434 			ESF_GZ_TX_TSO_ED_OUTER_IP_LEN, ed_outer_ip_len,
435 			ESF_GZ_TX_TSO_ED_INNER_IP_LEN, 1,
436 			ESF_GZ_TX_TSO_ED_OUTER_UDP_LEN, ed_outer_udp_len,
437 			ESF_GZ_TX_TSO_HDR_LEN_W, header_len >> 1,
438 			ESF_GZ_TX_TSO_PAYLOAD_LEN, payload_len);
439 
440 	EFX_POPULATE_OWORD_9(tx_desc_extra_fields,
441 			/*
442 			 * Outer offsets are required for outer IPv4 ID
443 			 * and length edits in the case of tunnel TSO.
444 			 */
445 			ESF_GZ_TX_TSO_OUTER_L3_OFF_W, outer_iph_off >> 1,
446 			ESF_GZ_TX_TSO_OUTER_L4_OFF_W, outer_udph_off >> 1,
447 			/*
448 			 * Inner offsets are required for inner IPv4 ID
449 			 * and IP length edits and partial checksum
450 			 * offload in the case of tunnel TSO.
451 			 */
452 			ESF_GZ_TX_TSO_INNER_L3_OFF_W, iph_off >> 1,
453 			ESF_GZ_TX_TSO_INNER_L4_OFF_W, tcph_off >> 1,
454 			ESF_GZ_TX_TSO_CSO_INNER_L4,
455 				inner_l3 != ESE_GZ_TX_DESC_CS_INNER_L3_OFF,
456 			ESF_GZ_TX_TSO_CSO_INNER_L3, inner_l3,
457 			/*
458 			 * Use outer full checksum offloads which do
459 			 * not require any extra information.
460 			 */
461 			ESF_GZ_TX_TSO_CSO_OUTER_L3, 1,
462 			ESF_GZ_TX_TSO_CSO_OUTER_L4, 1,
463 			ESF_GZ_TX_DESC_TYPE, ESE_GZ_TX_DESC_TYPE_TSO);
464 
465 	EFX_OR_OWORD(*tx_desc, tx_desc_extra_fields);
466 
467 	if (m->ol_flags & PKT_TX_VLAN_PKT) {
468 		EFX_POPULATE_OWORD_2(tx_desc_extra_fields,
469 				ESF_GZ_TX_TSO_VLAN_INSERT_EN, 1,
470 				ESF_GZ_TX_TSO_VLAN_INSERT_TCI, m->vlan_tci);
471 
472 		EFX_OR_OWORD(*tx_desc, tx_desc_extra_fields);
473 	}
474 }
475 
476 static inline void
477 sfc_ef100_tx_qpush(struct sfc_ef100_txq *txq, unsigned int added)
478 {
479 	efx_dword_t dword;
480 
481 	EFX_POPULATE_DWORD_1(dword, ERF_GZ_TX_RING_PIDX, added & txq->ptr_mask);
482 
483 	/* DMA sync to device is not required */
484 
485 	/*
486 	 * rte_write32() has rte_io_wmb() which guarantees that the STORE
487 	 * operations (i.e. Tx descriptor updates) that precede
488 	 * the rte_io_wmb() call are visible to NIC before the STORE
489 	 * operations that follow it (i.e. doorbell write).
490 	 */
491 	rte_write32(dword.ed_u32[0], txq->doorbell);
492 
493 	sfc_ef100_tx_debug(txq, "TxQ pushed doorbell at pidx %u (added=%u)",
494 			   EFX_DWORD_FIELD(dword, ERF_GZ_TX_RING_PIDX),
495 			   added);
496 }
497 
498 static unsigned int
499 sfc_ef100_tx_pkt_descs_max(const struct rte_mbuf *m)
500 {
501 	unsigned int extra_descs = 0;
502 
503 /** Maximum length of mbuf segment data */
504 #define SFC_MBUF_SEG_LEN_MAX		UINT16_MAX
505 	RTE_BUILD_BUG_ON(sizeof(m->data_len) != 2);
506 
507 	if (m->ol_flags & PKT_TX_TCP_SEG) {
508 		/* Tx TSO descriptor */
509 		extra_descs++;
510 		/*
511 		 * An extra Tx segment descriptor may be required if the
512 		 * header ends in the middle of a segment.
513 		 */
514 		extra_descs++;
515 	} else {
516 		/*
517 		 * An mbuf segment cannot be bigger than the maximum segment
518 		 * length and, without TSO, the packet cannot exceed the
519 		 * maximum packet length. Make sure that the first segment
520 		 * does not need fragmentation (split into many Tx descriptors).
521 		 */
522 		RTE_BUILD_BUG_ON(SFC_EF100_TX_SEND_DESC_LEN_MAX <
523 				 RTE_MIN((unsigned int)EFX_MAC_PDU_MAX,
524 				 SFC_MBUF_SEG_LEN_MAX));
525 	}
526 
527 	/*
528 	 * No segment of a scattered packet can be bigger than the maximum
529 	 * segment length. Make sure that subsequent segments do not need
530 	 * fragmentation (split into many Tx descriptors).
531 	 */
532 	RTE_BUILD_BUG_ON(SFC_EF100_TX_SEG_DESC_LEN_MAX < SFC_MBUF_SEG_LEN_MAX);
533 
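	/*
	 * Worst-case estimate. For example (figures are illustrative), a TSO
	 * mbuf chain of 4 segments may need up to 4 + 2 = 6 descriptors:
	 * one per segment, one TX_TSO descriptor and one extra segment
	 * descriptor for a possible header/payload split.
	 */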
534 	return m->nb_segs + extra_descs;
535 }
536 
537 static struct rte_mbuf *
538 sfc_ef100_xmit_tso_pkt(struct sfc_ef100_txq * const txq,
539 		       struct rte_mbuf *m, unsigned int *added)
540 {
541 	struct rte_mbuf *m_seg = m;
542 	unsigned int nb_hdr_descs;
543 	unsigned int nb_pld_descs;
544 	unsigned int seg_split = 0;
545 	unsigned int tso_desc_id;
546 	unsigned int id;
547 	size_t outer_iph_off;
548 	size_t outer_udph_off;
549 	size_t iph_off;
550 	size_t tcph_off;
551 	size_t header_len;
552 	size_t remaining_hdr_len;
553 
554 	if (m->ol_flags & PKT_TX_TUNNEL_MASK) {
555 		outer_iph_off = m->outer_l2_len;
556 		outer_udph_off = outer_iph_off + m->outer_l3_len;
557 	} else {
558 		outer_iph_off = 0;
559 		outer_udph_off = 0;
560 	}
561 	iph_off = outer_udph_off + m->l2_len;
562 	tcph_off = iph_off + m->l3_len;
563 	header_len = tcph_off + m->l4_len;
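	/*
	 * For instance, a non-tunnel TCP packet with a 14-byte Ethernet
	 * header, a 20-byte IPv4 header and a 20-byte option-free TCP header
	 * (illustrative sizes) gives iph_off = 14, tcph_off = 34 and
	 * header_len = 54.
	 */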
564 
565 	/*
566 	 * Remember the ID of the TX_TSO descriptor to be filled in.
567 	 * It cannot be filled in right now since the number of header
568 	 * and payload segments must be calculated first, and the mbuf
569 	 * chain should not be traversed twice here.
570 	 */
571 	tso_desc_id = (*added)++ & txq->ptr_mask;
572 
573 	remaining_hdr_len = header_len;
574 	do {
575 		id = (*added)++ & txq->ptr_mask;
576 		if (rte_pktmbuf_data_len(m_seg) <= remaining_hdr_len) {
577 			/* The segment is entirely a header segment */
578 			sfc_ef100_tx_qdesc_seg_create(
579 				rte_mbuf_data_iova(m_seg),
580 				rte_pktmbuf_data_len(m_seg),
581 				&txq->txq_hw_ring[id]);
582 			remaining_hdr_len -= rte_pktmbuf_data_len(m_seg);
583 		} else {
584 			/*
585 			 * The segment must be split into header and
586 			 * payload segments
587 			 */
588 			sfc_ef100_tx_qdesc_seg_create(
589 				rte_mbuf_data_iova(m_seg),
590 				remaining_hdr_len,
591 				&txq->txq_hw_ring[id]);
592 			SFC_ASSERT(txq->sw_ring[id].mbuf == NULL);
593 
594 			id = (*added)++ & txq->ptr_mask;
595 			sfc_ef100_tx_qdesc_seg_create(
596 				rte_mbuf_data_iova(m_seg) + remaining_hdr_len,
597 				rte_pktmbuf_data_len(m_seg) - remaining_hdr_len,
598 				&txq->txq_hw_ring[id]);
599 			remaining_hdr_len = 0;
600 			seg_split = 1;
601 		}
602 		txq->sw_ring[id].mbuf = m_seg;
603 		m_seg = m_seg->next;
604 	} while (remaining_hdr_len > 0);
605 
606 	/*
607 	 * If a segment is split into header and payload segments, the added
608 	 * counter counts it twice and we must correct for that.
609 	 */
610 	nb_hdr_descs = ((id - tso_desc_id) & txq->ptr_mask) - seg_split;
611 	nb_pld_descs = m->nb_segs - nb_hdr_descs + seg_split;
612 
613 	sfc_ef100_tx_qdesc_tso_create(m, nb_hdr_descs, nb_pld_descs, header_len,
614 				      rte_pktmbuf_pkt_len(m) - header_len,
615 				      outer_iph_off, outer_udph_off,
616 				      iph_off, tcph_off,
617 				      &txq->txq_hw_ring[tso_desc_id]);
618 
619 	return m_seg;
620 }
621 
622 static uint16_t
623 sfc_ef100_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
624 {
625 	struct sfc_ef100_txq * const txq = sfc_ef100_txq_by_dp_txq(tx_queue);
626 	unsigned int added;
627 	unsigned int dma_desc_space;
628 	bool reap_done;
629 	struct rte_mbuf **pktp;
630 	struct rte_mbuf **pktp_end;
631 
632 	if (unlikely(txq->flags &
633 		     (SFC_EF100_TXQ_NOT_RUNNING | SFC_EF100_TXQ_EXCEPTION)))
634 		return 0;
635 
636 	added = txq->added;
637 	dma_desc_space = txq->max_fill_level - (added - txq->completed);
638 
639 	reap_done = (dma_desc_space < txq->free_thresh);
640 	if (reap_done) {
641 		sfc_ef100_tx_reap(txq);
642 		dma_desc_space = txq->max_fill_level - (added - txq->completed);
643 	}
644 
645 	for (pktp = &tx_pkts[0], pktp_end = &tx_pkts[nb_pkts];
646 	     pktp != pktp_end;
647 	     ++pktp) {
648 		struct rte_mbuf *m_seg = *pktp;
649 		unsigned int pkt_start = added;
650 		unsigned int id;
651 
652 		if (likely(pktp + 1 != pktp_end))
653 			rte_mbuf_prefetch_part1(pktp[1]);
654 
655 		if (sfc_ef100_tx_pkt_descs_max(m_seg) > dma_desc_space) {
656 			if (reap_done)
657 				break;
658 
659 			/* Push already prepared descriptors before polling */
660 			if (added != txq->added) {
661 				sfc_ef100_tx_qpush(txq, added);
662 				txq->added = added;
663 			}
664 
665 			sfc_ef100_tx_reap(txq);
666 			reap_done = true;
667 			dma_desc_space = txq->max_fill_level -
668 				(added - txq->completed);
669 			if (sfc_ef100_tx_pkt_descs_max(m_seg) > dma_desc_space)
670 				break;
671 		}
672 
673 		if (m_seg->ol_flags & PKT_TX_TCP_SEG) {
674 			m_seg = sfc_ef100_xmit_tso_pkt(txq, m_seg, &added);
675 		} else {
676 			id = added++ & txq->ptr_mask;
677 			sfc_ef100_tx_qdesc_send_create(m_seg,
678 						       &txq->txq_hw_ring[id]);
679 
680 			/*
681 			 * rte_pktmbuf_free() is commonly used in DPDK for
682 			 * recycling packets: the function checks every
683 			 * segment's reference counter and returns the
684 			 * buffer to its pool whenever possible.
685 			 * Nevertheless, freeing mbuf segments one by one
686 			 * may entail some performance decline.
687 			 * Instead, sfc_ef100_tx_reap() does the same job
688 			 * on its own and frees buffers in bulks (all mbufs
689 			 * within a bulk belong to the same pool).
690 			 * For this to work, individual segment pointers
691 			 * must be associated with the corresponding SW
692 			 * descriptors independently so that a single loop
693 			 * on reap is sufficient to inspect all the buffers.
694 			 */
695 			txq->sw_ring[id].mbuf = m_seg;
696 			m_seg = m_seg->next;
697 		}
698 
699 		while (m_seg != NULL) {
700 			RTE_BUILD_BUG_ON(SFC_MBUF_SEG_LEN_MAX >
701 					 SFC_EF100_TX_SEG_DESC_LEN_MAX);
702 
703 			id = added++ & txq->ptr_mask;
704 			sfc_ef100_tx_qdesc_seg_create(rte_mbuf_data_iova(m_seg),
705 					rte_pktmbuf_data_len(m_seg),
706 					&txq->txq_hw_ring[id]);
707 			txq->sw_ring[id].mbuf = m_seg;
708 			m_seg = m_seg->next;
709 		}
710 
711 		dma_desc_space -= (added - pkt_start);
712 	}
713 
714 	if (likely(added != txq->added)) {
715 		sfc_ef100_tx_qpush(txq, added);
716 		txq->added = added;
717 	}
718 
719 #if SFC_TX_XMIT_PKTS_REAP_AT_LEAST_ONCE
720 	if (!reap_done)
721 		sfc_ef100_tx_reap(txq);
722 #endif
723 
724 	return pktp - &tx_pkts[0];
725 }
726 
727 static sfc_dp_tx_get_dev_info_t sfc_ef100_get_dev_info;
728 static void
729 sfc_ef100_get_dev_info(struct rte_eth_dev_info *dev_info)
730 {
731 	/*
732 	 * The number of descriptors just defines the maximum number of
733 	 * pushed descriptors (fill level).
734 	 */
735 	dev_info->tx_desc_lim.nb_min = 1;
736 	dev_info->tx_desc_lim.nb_align = 1;
737 }
738 
739 static sfc_dp_tx_qsize_up_rings_t sfc_ef100_tx_qsize_up_rings;
740 static int
741 sfc_ef100_tx_qsize_up_rings(uint16_t nb_tx_desc,
742 			   struct sfc_dp_tx_hw_limits *limits,
743 			   unsigned int *txq_entries,
744 			   unsigned int *evq_entries,
745 			   unsigned int *txq_max_fill_level)
746 {
747 	/*
748 	 * rte_ethdev API guarantees that the number meets min, max and
749 	 * alignment requirements.
750 	 */
751 	if (nb_tx_desc <= limits->txq_min_entries)
752 		*txq_entries = limits->txq_min_entries;
753 	else
754 		*txq_entries = rte_align32pow2(nb_tx_desc);
755 
756 	*evq_entries = *txq_entries;
757 
758 	*txq_max_fill_level = RTE_MIN(nb_tx_desc,
759 				      SFC_EF100_TXQ_LIMIT(*evq_entries));
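	/*
	 * Illustrative example (requested size and limit chosen arbitrarily):
	 * nb_tx_desc = 1000 with txq_min_entries = 512 yields
	 * txq_entries = evq_entries = 1024 and
	 * txq_max_fill_level = min(1000, SFC_EF100_TXQ_LIMIT(1024)) =
	 * min(1000, 1021) = 1000.
	 */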
760 	return 0;
761 }
762 
763 static sfc_dp_tx_qcreate_t sfc_ef100_tx_qcreate;
764 static int
765 sfc_ef100_tx_qcreate(uint16_t port_id, uint16_t queue_id,
766 		    const struct rte_pci_addr *pci_addr, int socket_id,
767 		    const struct sfc_dp_tx_qcreate_info *info,
768 		    struct sfc_dp_txq **dp_txqp)
769 {
770 	struct sfc_ef100_txq *txq;
771 	int rc;
772 
773 	rc = EINVAL;
774 	if (info->txq_entries != info->evq_entries)
775 		goto fail_bad_args;
776 
777 	rc = ENOMEM;
778 	txq = rte_zmalloc_socket("sfc-ef100-txq", sizeof(*txq),
779 				 RTE_CACHE_LINE_SIZE, socket_id);
780 	if (txq == NULL)
781 		goto fail_txq_alloc;
782 
783 	sfc_dp_queue_init(&txq->dp.dpq, port_id, queue_id, pci_addr);
784 
785 	rc = ENOMEM;
786 	txq->sw_ring = rte_calloc_socket("sfc-ef100-txq-sw_ring",
787 					 info->txq_entries,
788 					 sizeof(*txq->sw_ring),
789 					 RTE_CACHE_LINE_SIZE, socket_id);
790 	if (txq->sw_ring == NULL)
791 		goto fail_sw_ring_alloc;
792 
793 	txq->flags = SFC_EF100_TXQ_NOT_RUNNING;
794 	txq->ptr_mask = info->txq_entries - 1;
795 	txq->max_fill_level = info->max_fill_level;
796 	txq->free_thresh = info->free_thresh;
797 	txq->evq_phase_bit_shift = rte_bsf32(info->evq_entries);
798 	txq->txq_hw_ring = info->txq_hw_ring;
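	/*
	 * The Tx doorbell register is located in the queue's VI window of
	 * the memory BAR: the register offset within a window is fixed and
	 * the window of queue hw_index starts hw_index << vi_window_shift
	 * bytes into the BAR, as computed below.
	 */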
799 	txq->doorbell = (volatile uint8_t *)info->mem_bar +
800 			ER_GZ_TX_RING_DOORBELL_OFST +
801 			(info->hw_index << info->vi_window_shift);
802 	txq->evq_hw_ring = info->evq_hw_ring;
803 
804 	txq->tso_tcp_header_offset_limit = info->tso_tcp_header_offset_limit;
805 	txq->tso_max_nb_header_descs = info->tso_max_nb_header_descs;
806 	txq->tso_max_header_len = info->tso_max_header_len;
807 	txq->tso_max_nb_payload_descs = info->tso_max_nb_payload_descs;
808 	txq->tso_max_payload_len = info->tso_max_payload_len;
809 	txq->tso_max_nb_outgoing_frames = info->tso_max_nb_outgoing_frames;
810 
811 	sfc_ef100_tx_debug(txq, "TxQ doorbell is %p", txq->doorbell);
812 
813 	*dp_txqp = &txq->dp;
814 	return 0;
815 
816 fail_sw_ring_alloc:
817 	rte_free(txq);
818 
819 fail_txq_alloc:
820 fail_bad_args:
821 	return rc;
822 }
823 
824 static sfc_dp_tx_qdestroy_t sfc_ef100_tx_qdestroy;
825 static void
826 sfc_ef100_tx_qdestroy(struct sfc_dp_txq *dp_txq)
827 {
828 	struct sfc_ef100_txq *txq = sfc_ef100_txq_by_dp_txq(dp_txq);
829 
830 	rte_free(txq->sw_ring);
831 	rte_free(txq);
832 }
833 
834 static sfc_dp_tx_qstart_t sfc_ef100_tx_qstart;
835 static int
836 sfc_ef100_tx_qstart(struct sfc_dp_txq *dp_txq, unsigned int evq_read_ptr,
837 		   unsigned int txq_desc_index)
838 {
839 	struct sfc_ef100_txq *txq = sfc_ef100_txq_by_dp_txq(dp_txq);
840 
841 	txq->evq_read_ptr = evq_read_ptr;
842 	txq->added = txq->completed = txq_desc_index;
843 
844 	txq->flags |= SFC_EF100_TXQ_STARTED;
845 	txq->flags &= ~(SFC_EF100_TXQ_NOT_RUNNING | SFC_EF100_TXQ_EXCEPTION);
846 
847 	return 0;
848 }
849 
850 static sfc_dp_tx_qstop_t sfc_ef100_tx_qstop;
851 static void
852 sfc_ef100_tx_qstop(struct sfc_dp_txq *dp_txq, unsigned int *evq_read_ptr)
853 {
854 	struct sfc_ef100_txq *txq = sfc_ef100_txq_by_dp_txq(dp_txq);
855 
856 	txq->flags |= SFC_EF100_TXQ_NOT_RUNNING;
857 
858 	*evq_read_ptr = txq->evq_read_ptr;
859 }
860 
861 static sfc_dp_tx_qtx_ev_t sfc_ef100_tx_qtx_ev;
862 static bool
863 sfc_ef100_tx_qtx_ev(struct sfc_dp_txq *dp_txq, unsigned int num_descs)
864 {
865 	struct sfc_ef100_txq *txq = sfc_ef100_txq_by_dp_txq(dp_txq);
866 
867 	SFC_ASSERT(txq->flags & SFC_EF100_TXQ_NOT_RUNNING);
868 
869 	sfc_ef100_tx_reap_num_descs(txq, num_descs);
870 
871 	return false;
872 }
873 
874 static sfc_dp_tx_qreap_t sfc_ef100_tx_qreap;
875 static void
876 sfc_ef100_tx_qreap(struct sfc_dp_txq *dp_txq)
877 {
878 	struct sfc_ef100_txq *txq = sfc_ef100_txq_by_dp_txq(dp_txq);
879 	unsigned int completed;
880 
881 	for (completed = txq->completed; completed != txq->added; ++completed) {
882 		struct sfc_ef100_tx_sw_desc *txd;
883 
884 		txd = &txq->sw_ring[completed & txq->ptr_mask];
885 		if (txd->mbuf != NULL) {
886 			rte_pktmbuf_free_seg(txd->mbuf);
887 			txd->mbuf = NULL;
888 		}
889 	}
890 
891 	txq->flags &= ~SFC_EF100_TXQ_STARTED;
892 }
893 
894 static unsigned int
895 sfc_ef100_tx_qdesc_npending(struct sfc_ef100_txq *txq)
896 {
897 	const unsigned int evq_old_read_ptr = txq->evq_read_ptr;
898 	unsigned int npending = 0;
899 	efx_qword_t tx_ev;
900 
901 	if (unlikely(txq->flags &
902 		     (SFC_EF100_TXQ_NOT_RUNNING | SFC_EF100_TXQ_EXCEPTION)))
903 		return 0;
904 
905 	while (sfc_ef100_tx_get_event(txq, &tx_ev))
906 		npending += EFX_QWORD_FIELD(tx_ev, ESF_GZ_EV_TXCMPL_NUM_DESC);
907 
908 	/*
909 	 * The function does not process events, so restore the event queue
910 	 * read pointer to its original position to allow the events that
911 	 * were read to be processed later.
912 	 */
913 	txq->evq_read_ptr = evq_old_read_ptr;
914 
915 	return npending;
916 }
917 
918 static sfc_dp_tx_qdesc_status_t sfc_ef100_tx_qdesc_status;
919 static int
920 sfc_ef100_tx_qdesc_status(struct sfc_dp_txq *dp_txq, uint16_t offset)
921 {
922 	struct sfc_ef100_txq *txq = sfc_ef100_txq_by_dp_txq(dp_txq);
923 	unsigned int pushed = txq->added - txq->completed;
924 
925 	if (unlikely(offset > txq->ptr_mask))
926 		return -EINVAL;
927 
928 	if (unlikely(offset >= txq->max_fill_level))
929 		return RTE_ETH_TX_DESC_UNAVAIL;
930 
931 	return (offset >= pushed ||
932 		offset < sfc_ef100_tx_qdesc_npending(txq)) ?
933 		RTE_ETH_TX_DESC_DONE : RTE_ETH_TX_DESC_FULL;
934 }
935 
936 struct sfc_dp_tx sfc_ef100_tx = {
937 	.dp = {
938 		.name		= SFC_KVARG_DATAPATH_EF100,
939 		.type		= SFC_DP_TX,
940 		.hw_fw_caps	= SFC_DP_HW_FW_CAP_EF100,
941 	},
942 	.features		= SFC_DP_TX_FEAT_MULTI_PROCESS,
943 	.dev_offload_capa	= 0,
944 	.queue_offload_capa	= DEV_TX_OFFLOAD_VLAN_INSERT |
945 				  DEV_TX_OFFLOAD_IPV4_CKSUM |
946 				  DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM |
947 				  DEV_TX_OFFLOAD_OUTER_UDP_CKSUM |
948 				  DEV_TX_OFFLOAD_UDP_CKSUM |
949 				  DEV_TX_OFFLOAD_TCP_CKSUM |
950 				  DEV_TX_OFFLOAD_MULTI_SEGS |
951 				  DEV_TX_OFFLOAD_TCP_TSO |
952 				  DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
953 				  DEV_TX_OFFLOAD_GENEVE_TNL_TSO,
954 	.get_dev_info		= sfc_ef100_get_dev_info,
955 	.qsize_up_rings		= sfc_ef100_tx_qsize_up_rings,
956 	.qcreate		= sfc_ef100_tx_qcreate,
957 	.qdestroy		= sfc_ef100_tx_qdestroy,
958 	.qstart			= sfc_ef100_tx_qstart,
959 	.qtx_ev			= sfc_ef100_tx_qtx_ev,
960 	.qstop			= sfc_ef100_tx_qstop,
961 	.qreap			= sfc_ef100_tx_qreap,
962 	.qdesc_status		= sfc_ef100_tx_qdesc_status,
963 	.pkt_prepare		= sfc_ef100_tx_prepare_pkts,
964 	.pkt_burst		= sfc_ef100_xmit_pkts,
965 };
966