xref: /dpdk/drivers/net/sfc/sfc_tx.c (revision 5ecb687a5698d2d8ec1f3b3b5a7a16bceca3e29c)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  *
3  * Copyright (c) 2016-2018 Solarflare Communications Inc.
4  * All rights reserved.
5  *
6  * This software was jointly developed between OKTET Labs (under contract
7  * for Solarflare) and Solarflare Communications, Inc.
8  */
9 
10 #include "sfc.h"
11 #include "sfc_debug.h"
12 #include "sfc_log.h"
13 #include "sfc_ev.h"
14 #include "sfc_tx.h"
15 #include "sfc_tweak.h"
16 #include "sfc_kvargs.h"
17 
18 /*
19  * Maximum number of TX queue flush attempts in case of
20  * failure or flush timeout
21  */
22 #define SFC_TX_QFLUSH_ATTEMPTS		(3)
23 
24 /*
25  * Time to wait between event queue polling attempts when waiting for TX
26  * queue flush done or flush failed events
27  */
28 #define SFC_TX_QFLUSH_POLL_WAIT_MS	(1)
29 
30 /*
31  * Maximum number of event queue polling attempts when waiting for TX queue
32  * flush done or flush failed events; it defines TX queue flush attempt timeout
33  * together with SFC_TX_QFLUSH_POLL_WAIT_MS
34  */
35 #define SFC_TX_QFLUSH_POLL_ATTEMPTS	(2000)
36 
37 uint64_t
38 sfc_tx_get_dev_offload_caps(struct sfc_adapter *sa)
39 {
40 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sa->nic);
41 	uint64_t caps = 0;
42 
43 	if ((sa->priv.dp_tx->features & SFC_DP_TX_FEAT_VLAN_INSERT) &&
44 	    encp->enc_hw_tx_insert_vlan_enabled)
45 		caps |= DEV_TX_OFFLOAD_VLAN_INSERT;
46 
47 	if (sa->priv.dp_tx->features & SFC_DP_TX_FEAT_MULTI_SEG)
48 		caps |= DEV_TX_OFFLOAD_MULTI_SEGS;
49 
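	/*
	 * Mbuf fast free may be claimed only if the datapath supports
	 * neither multiple mempools nor mbuf reference counters, i.e. it
	 * already relies on the constraints this offload implies.
	 */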
50 	if ((~sa->priv.dp_tx->features & SFC_DP_TX_FEAT_MULTI_POOL) &&
51 	    (~sa->priv.dp_tx->features & SFC_DP_TX_FEAT_REFCNT))
52 		caps |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
53 
54 	return caps;
55 }
56 
57 uint64_t
58 sfc_tx_get_queue_offload_caps(struct sfc_adapter *sa)
59 {
60 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sa->nic);
61 	uint64_t caps = 0;
62 
63 	caps |= DEV_TX_OFFLOAD_IPV4_CKSUM;
64 	caps |= DEV_TX_OFFLOAD_UDP_CKSUM;
65 	caps |= DEV_TX_OFFLOAD_TCP_CKSUM;
66 
67 	if (encp->enc_tunnel_encapsulations_supported)
68 		caps |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
69 
70 	if (sa->tso)
71 		caps |= DEV_TX_OFFLOAD_TCP_TSO;
72 
73 	if (sa->tso_encap)
74 		caps |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
75 			 DEV_TX_OFFLOAD_GENEVE_TNL_TSO);
76 
77 	return caps;
78 }
79 
80 static int
81 sfc_tx_qcheck_conf(struct sfc_adapter *sa, unsigned int txq_max_fill_level,
82 		   const struct rte_eth_txconf *tx_conf,
83 		   uint64_t offloads)
84 {
85 	int rc = 0;
86 
87 	if (tx_conf->tx_rs_thresh != 0) {
88 		sfc_err(sa, "RS bit in transmit descriptor is not supported");
89 		rc = EINVAL;
90 	}
91 
92 	if (tx_conf->tx_free_thresh > txq_max_fill_level) {
93 		sfc_err(sa,
94 			"TxQ free threshold too large: %u vs maximum %u",
95 			tx_conf->tx_free_thresh, txq_max_fill_level);
96 		rc = EINVAL;
97 	}
98 
99 	if (tx_conf->tx_thresh.pthresh != 0 ||
100 	    tx_conf->tx_thresh.hthresh != 0 ||
101 	    tx_conf->tx_thresh.wthresh != 0) {
102 		sfc_warn(sa,
103 			"prefetch/host/writeback thresholds are not supported");
104 	}
105 
106 	/* We either perform both TCP and UDP offload, or no offload at all */
107 	if (((offloads & DEV_TX_OFFLOAD_TCP_CKSUM) == 0) !=
108 	    ((offloads & DEV_TX_OFFLOAD_UDP_CKSUM) == 0)) {
109 		sfc_err(sa, "TCP and UDP offloads can't be set independently");
110 		rc = EINVAL;
111 	}
112 
113 	return rc;
114 }
115 
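/*
 * Mark the TxQ as flushed; called from event queue processing when a
 * flush done event is received for the queue.
 */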
116 void
117 sfc_tx_qflush_done(struct sfc_txq_info *txq_info)
118 {
119 	txq_info->state |= SFC_TXQ_FLUSHED;
120 	txq_info->state &= ~SFC_TXQ_FLUSHING;
121 }
122 
123 int
124 sfc_tx_qinit(struct sfc_adapter *sa, unsigned int sw_index,
125 	     uint16_t nb_tx_desc, unsigned int socket_id,
126 	     const struct rte_eth_txconf *tx_conf)
127 {
128 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sa->nic);
129 	unsigned int txq_entries;
130 	unsigned int evq_entries;
131 	unsigned int txq_max_fill_level;
132 	struct sfc_txq_info *txq_info;
133 	struct sfc_evq *evq;
134 	struct sfc_txq *txq;
135 	int rc = 0;
136 	struct sfc_dp_tx_qcreate_info info;
137 	uint64_t offloads;
138 	struct sfc_dp_tx_hw_limits hw_limits;
139 
140 	sfc_log_init(sa, "TxQ = %u", sw_index);
141 
142 	memset(&hw_limits, 0, sizeof(hw_limits));
143 	hw_limits.txq_max_entries = sa->txq_max_entries;
144 	hw_limits.txq_min_entries = sa->txq_min_entries;
145 
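	/*
	 * Ask the datapath implementation to size up the TxQ and EvQ rings
	 * for the requested number of descriptors within the HW limits and
	 * report the resulting maximum fill level.
	 */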
146 	rc = sa->priv.dp_tx->qsize_up_rings(nb_tx_desc, &hw_limits,
147 					    &txq_entries, &evq_entries,
148 					    &txq_max_fill_level);
149 	if (rc != 0)
150 		goto fail_size_up_rings;
151 	SFC_ASSERT(txq_entries >= sa->txq_min_entries);
152 	SFC_ASSERT(txq_entries <= sa->txq_max_entries);
153 	SFC_ASSERT(txq_entries >= nb_tx_desc);
154 	SFC_ASSERT(txq_max_fill_level <= nb_tx_desc);
155 
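	/* Per-queue offloads are combined with device-level Tx offloads */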
156 	offloads = tx_conf->offloads |
157 		sa->eth_dev->data->dev_conf.txmode.offloads;
158 	rc = sfc_tx_qcheck_conf(sa, txq_max_fill_level, tx_conf, offloads);
159 	if (rc != 0)
160 		goto fail_bad_conf;
161 
162 	SFC_ASSERT(sw_index < sfc_sa2shared(sa)->txq_count);
163 	txq_info = &sfc_sa2shared(sa)->txq_info[sw_index];
164 
165 	txq_info->entries = txq_entries;
166 
167 	rc = sfc_ev_qinit(sa, SFC_EVQ_TYPE_TX, sw_index,
168 			  evq_entries, socket_id, &evq);
169 	if (rc != 0)
170 		goto fail_ev_qinit;
171 
172 	txq = &sa->txq_ctrl[sw_index];
173 	txq->hw_index = sw_index;
174 	txq->evq = evq;
175 	txq_info->free_thresh =
176 		(tx_conf->tx_free_thresh) ? tx_conf->tx_free_thresh :
177 		SFC_TX_DEFAULT_FREE_THRESH;
178 	txq_info->offloads = offloads;
179 
180 	rc = sfc_dma_alloc(sa, "txq", sw_index,
181 			   efx_txq_size(sa->nic, txq_info->entries),
182 			   socket_id, &txq->mem);
183 	if (rc != 0)
184 		goto fail_dma_alloc;
185 
186 	memset(&info, 0, sizeof(info));
187 	info.max_fill_level = txq_max_fill_level;
188 	info.free_thresh = txq_info->free_thresh;
189 	info.offloads = offloads;
190 	info.txq_entries = txq_info->entries;
191 	info.dma_desc_size_max = encp->enc_tx_dma_desc_size_max;
192 	info.txq_hw_ring = txq->mem.esm_base;
193 	info.evq_entries = evq_entries;
194 	info.evq_hw_ring = evq->mem.esm_base;
195 	info.hw_index = txq->hw_index;
196 	info.mem_bar = sa->mem_bar.esb_base;
197 	info.vi_window_shift = encp->enc_vi_window_shift;
198 	info.tso_tcp_header_offset_limit =
199 		encp->enc_tx_tso_tcp_header_offset_limit;
200 
201 	rc = sa->priv.dp_tx->qcreate(sa->eth_dev->data->port_id, sw_index,
202 				     &RTE_ETH_DEV_TO_PCI(sa->eth_dev)->addr,
203 				     socket_id, &info, &txq_info->dp);
204 	if (rc != 0)
205 		goto fail_dp_tx_qinit;
206 
207 	evq->dp_txq = txq_info->dp;
208 
209 	txq_info->state = SFC_TXQ_INITIALIZED;
210 
211 	txq_info->deferred_start = (tx_conf->tx_deferred_start != 0);
212 
213 	return 0;
214 
215 fail_dp_tx_qinit:
216 	sfc_dma_free(sa, &txq->mem);
217 
218 fail_dma_alloc:
219 	sfc_ev_qfini(evq);
220 
221 fail_ev_qinit:
222 	txq_info->entries = 0;
223 
224 fail_bad_conf:
225 fail_size_up_rings:
226 	sfc_log_init(sa, "failed (TxQ = %u, rc = %d)", sw_index, rc);
227 	return rc;
228 }
229 
230 void
231 sfc_tx_qfini(struct sfc_adapter *sa, unsigned int sw_index)
232 {
233 	struct sfc_txq_info *txq_info;
234 	struct sfc_txq *txq;
235 
236 	sfc_log_init(sa, "TxQ = %u", sw_index);
237 
238 	SFC_ASSERT(sw_index < sfc_sa2shared(sa)->txq_count);
239 	sa->eth_dev->data->tx_queues[sw_index] = NULL;
240 
241 	txq_info = &sfc_sa2shared(sa)->txq_info[sw_index];
242 
243 	SFC_ASSERT(txq_info->state == SFC_TXQ_INITIALIZED);
244 
245 	sa->priv.dp_tx->qdestroy(txq_info->dp);
246 	txq_info->dp = NULL;
247 
248 	txq_info->state &= ~SFC_TXQ_INITIALIZED;
249 	txq_info->entries = 0;
250 
251 	txq = &sa->txq_ctrl[sw_index];
252 
253 	sfc_dma_free(sa, &txq->mem);
254 
255 	sfc_ev_qfini(txq->evq);
256 	txq->evq = NULL;
257 }
258 
259 static int
260 sfc_tx_qinit_info(struct sfc_adapter *sa, unsigned int sw_index)
261 {
262 	sfc_log_init(sa, "TxQ = %u", sw_index);
263 
264 	return 0;
265 }
266 
267 static int
268 sfc_tx_check_mode(struct sfc_adapter *sa, const struct rte_eth_txmode *txmode)
269 {
270 	int rc = 0;
271 
272 	switch (txmode->mq_mode) {
273 	case ETH_MQ_TX_NONE:
274 		break;
275 	default:
276 		sfc_err(sa, "Tx multi-queue mode %u not supported",
277 			txmode->mq_mode);
278 		rc = EINVAL;
279 	}
280 
281 	/*
282 	 * These features are claimed to be i40e-specific,
283 	 * but it does make sense to double-check their absence
284 	 */
285 	if (txmode->hw_vlan_reject_tagged) {
286 		sfc_err(sa, "Rejecting tagged packets not supported");
287 		rc = EINVAL;
288 	}
289 
290 	if (txmode->hw_vlan_reject_untagged) {
291 		sfc_err(sa, "Rejecting untagged packets not supported");
292 		rc = EINVAL;
293 	}
294 
295 	if (txmode->hw_vlan_insert_pvid) {
296 		sfc_err(sa, "Port-based VLAN insertion not supported");
297 		rc = EINVAL;
298 	}
299 
300 	return rc;
301 }
302 
303 /**
304  * Destroy excess queues that are no longer needed after reconfiguration
305  * or complete close.
306  */
307 static void
308 sfc_tx_fini_queues(struct sfc_adapter *sa, unsigned int nb_tx_queues)
309 {
310 	struct sfc_adapter_shared * const sas = sfc_sa2shared(sa);
311 	int sw_index;
312 
313 	SFC_ASSERT(nb_tx_queues <= sas->txq_count);
314 
315 	sw_index = sas->txq_count;
316 	while (--sw_index >= (int)nb_tx_queues) {
317 		if (sas->txq_info[sw_index].state & SFC_TXQ_INITIALIZED)
318 			sfc_tx_qfini(sa, sw_index);
319 	}
320 
321 	sas->txq_count = nb_tx_queues;
322 }
323 
324 int
325 sfc_tx_configure(struct sfc_adapter *sa)
326 {
327 	struct sfc_adapter_shared * const sas = sfc_sa2shared(sa);
328 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sa->nic);
329 	const struct rte_eth_conf *dev_conf = &sa->eth_dev->data->dev_conf;
330 	const unsigned int nb_tx_queues = sa->eth_dev->data->nb_tx_queues;
331 	int rc = 0;
332 
333 	sfc_log_init(sa, "nb_tx_queues=%u (old %u)",
334 		     nb_tx_queues, sas->txq_count);
335 
336 	/*
337 	 * The datapath implementation assumes absence of boundary
338 	 * limits on Tx DMA descriptors. Adding such checks to the
339 	 * datapath would simply make it slower.
340 	 */
341 	if (encp->enc_tx_dma_desc_boundary != 0) {
342 		rc = ENOTSUP;
343 		goto fail_tx_dma_desc_boundary;
344 	}
345 
346 	rc = sfc_tx_check_mode(sa, &dev_conf->txmode);
347 	if (rc != 0)
348 		goto fail_check_mode;
349 
350 	if (nb_tx_queues == sas->txq_count)
351 		goto done;
352 
353 	if (sas->txq_info == NULL) {
354 		sas->txq_info = rte_calloc_socket("sfc-txqs", nb_tx_queues,
355 						  sizeof(sas->txq_info[0]), 0,
356 						  sa->socket_id);
357 		if (sas->txq_info == NULL)
358 			goto fail_txqs_alloc;
359 
360 		/*
361 		 * Allocate the primary-process-only TxQ control from the heap
362 		 * since it should not be shared with secondary processes.
363 		 */
364 		rc = ENOMEM;
365 		sa->txq_ctrl = calloc(nb_tx_queues, sizeof(sa->txq_ctrl[0]));
366 		if (sa->txq_ctrl == NULL)
367 			goto fail_txqs_ctrl_alloc;
368 	} else {
369 		struct sfc_txq_info *new_txq_info;
370 		struct sfc_txq *new_txq_ctrl;
371 
372 		if (nb_tx_queues < sas->txq_count)
373 			sfc_tx_fini_queues(sa, nb_tx_queues);
374 
375 		new_txq_info =
376 			rte_realloc(sas->txq_info,
377 				    nb_tx_queues * sizeof(sas->txq_info[0]), 0);
378 		if (new_txq_info == NULL && nb_tx_queues > 0)
379 			goto fail_txqs_realloc;
380 
381 		new_txq_ctrl = realloc(sa->txq_ctrl,
382 				       nb_tx_queues * sizeof(sa->txq_ctrl[0]));
383 		if (new_txq_ctrl == NULL && nb_tx_queues > 0)
384 			goto fail_txqs_ctrl_realloc;
385 
386 		sas->txq_info = new_txq_info;
387 		sa->txq_ctrl = new_txq_ctrl;
388 		if (nb_tx_queues > sas->txq_count) {
389 			memset(&sas->txq_info[sas->txq_count], 0,
390 			       (nb_tx_queues - sas->txq_count) *
391 			       sizeof(sas->txq_info[0]));
392 			memset(&sa->txq_ctrl[sas->txq_count], 0,
393 			       (nb_tx_queues - sas->txq_count) *
394 			       sizeof(sa->txq_ctrl[0]));
395 		}
396 	}
397 
398 	while (sas->txq_count < nb_tx_queues) {
399 		rc = sfc_tx_qinit_info(sa, sas->txq_count);
400 		if (rc != 0)
401 			goto fail_tx_qinit_info;
402 
403 		sas->txq_count++;
404 	}
405 
406 done:
407 	return 0;
408 
409 fail_tx_qinit_info:
410 fail_txqs_ctrl_realloc:
411 fail_txqs_realloc:
412 fail_txqs_ctrl_alloc:
413 fail_txqs_alloc:
414 	sfc_tx_close(sa);
415 
416 fail_check_mode:
417 fail_tx_dma_desc_boundary:
418 	sfc_log_init(sa, "failed (rc = %d)", rc);
419 	return rc;
420 }
421 
422 void
423 sfc_tx_close(struct sfc_adapter *sa)
424 {
425 	sfc_tx_fini_queues(sa, 0);
426 
427 	free(sa->txq_ctrl);
428 	sa->txq_ctrl = NULL;
429 
430 	rte_free(sfc_sa2shared(sa)->txq_info);
431 	sfc_sa2shared(sa)->txq_info = NULL;
432 }
433 
434 int
435 sfc_tx_qstart(struct sfc_adapter *sa, unsigned int sw_index)
436 {
437 	struct sfc_adapter_shared * const sas = sfc_sa2shared(sa);
438 	uint64_t offloads_supported = sfc_tx_get_dev_offload_caps(sa) |
439 				      sfc_tx_get_queue_offload_caps(sa);
440 	struct rte_eth_dev_data *dev_data;
441 	struct sfc_txq_info *txq_info;
442 	struct sfc_txq *txq;
443 	struct sfc_evq *evq;
444 	uint16_t flags = 0;
445 	unsigned int desc_index;
446 	int rc = 0;
447 
448 	sfc_log_init(sa, "TxQ = %u", sw_index);
449 
450 	SFC_ASSERT(sw_index < sas->txq_count);
451 	txq_info = &sas->txq_info[sw_index];
452 
453 	SFC_ASSERT(txq_info->state == SFC_TXQ_INITIALIZED);
454 
455 	txq = &sa->txq_ctrl[sw_index];
456 	evq = txq->evq;
457 
458 	rc = sfc_ev_qstart(evq, sfc_evq_index_by_txq_sw_index(sa, sw_index));
459 	if (rc != 0)
460 		goto fail_ev_qstart;
461 
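	/* Translate the requested Tx offloads into libefx TxQ creation flags */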
462 	if (txq_info->offloads & DEV_TX_OFFLOAD_IPV4_CKSUM)
463 		flags |= EFX_TXQ_CKSUM_IPV4;
464 
465 	if (txq_info->offloads & DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)
466 		flags |= EFX_TXQ_CKSUM_INNER_IPV4;
467 
468 	if ((txq_info->offloads & DEV_TX_OFFLOAD_TCP_CKSUM) ||
469 	    (txq_info->offloads & DEV_TX_OFFLOAD_UDP_CKSUM)) {
470 		flags |= EFX_TXQ_CKSUM_TCPUDP;
471 
472 		if (offloads_supported & DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)
473 			flags |= EFX_TXQ_CKSUM_INNER_TCPUDP;
474 	}
475 
476 	if (txq_info->offloads & (DEV_TX_OFFLOAD_TCP_TSO |
477 				  DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
478 				  DEV_TX_OFFLOAD_GENEVE_TNL_TSO))
479 		flags |= EFX_TXQ_FATSOV2;
480 
481 	rc = efx_tx_qcreate(sa->nic, txq->hw_index, 0, &txq->mem,
482 			    txq_info->entries, 0 /* not used on EF10 */,
483 			    flags, evq->common,
484 			    &txq->common, &desc_index);
485 	if (rc != 0) {
486 		if (sa->tso && (rc == ENOSPC))
487 			sfc_err(sa, "ran out of TSO contexts");
488 
489 		goto fail_tx_qcreate;
490 	}
491 
492 	efx_tx_qenable(txq->common);
493 
494 	txq_info->state |= SFC_TXQ_STARTED;
495 
496 	rc = sa->priv.dp_tx->qstart(txq_info->dp, evq->read_ptr, desc_index);
497 	if (rc != 0)
498 		goto fail_dp_qstart;
499 
500 	/*
501 	 * It seems to be used by DPDK for debug purposes only ('rte_ether')
502 	 */
503 	dev_data = sa->eth_dev->data;
504 	dev_data->tx_queue_state[sw_index] = RTE_ETH_QUEUE_STATE_STARTED;
505 
506 	return 0;
507 
508 fail_dp_qstart:
509 	txq_info->state = SFC_TXQ_INITIALIZED;
510 	efx_tx_qdestroy(txq->common);
511 
512 fail_tx_qcreate:
513 	sfc_ev_qstop(evq);
514 
515 fail_ev_qstart:
516 	return rc;
517 }
518 
519 void
520 sfc_tx_qstop(struct sfc_adapter *sa, unsigned int sw_index)
521 {
522 	struct sfc_adapter_shared * const sas = sfc_sa2shared(sa);
523 	struct rte_eth_dev_data *dev_data;
524 	struct sfc_txq_info *txq_info;
525 	struct sfc_txq *txq;
526 	unsigned int retry_count;
527 	unsigned int wait_count;
528 	int rc;
529 
530 	sfc_log_init(sa, "TxQ = %u", sw_index);
531 
532 	SFC_ASSERT(sw_index < sas->txq_count);
533 	txq_info = &sas->txq_info[sw_index];
534 
535 	if (txq_info->state == SFC_TXQ_INITIALIZED)
536 		return;
537 
538 	SFC_ASSERT(txq_info->state & SFC_TXQ_STARTED);
539 
540 	txq = &sa->txq_ctrl[sw_index];
541 	sa->priv.dp_tx->qstop(txq_info->dp, &txq->evq->read_ptr);
542 
543 	/*
544 	 * Retry TX queue flushing if the flush failed or timed out; in the
545 	 * worst case it can delay for 6 seconds (3 attempts * 2000 polls * 1 ms)
546 	 */
547 	for (retry_count = 0;
548 	     ((txq_info->state & SFC_TXQ_FLUSHED) == 0) &&
549 	     (retry_count < SFC_TX_QFLUSH_ATTEMPTS);
550 	     ++retry_count) {
551 		rc = efx_tx_qflush(txq->common);
552 		if (rc != 0) {
553 			txq_info->state |= (rc == EALREADY) ?
554 				SFC_TXQ_FLUSHED : SFC_TXQ_FLUSH_FAILED;
555 			break;
556 		}
557 
558 		/*
559 		 * Wait for the TX queue flush done or flush failed event for at
560 		 * least SFC_TX_QFLUSH_POLL_WAIT_MS milliseconds and not more
561 		 * than 2 seconds (SFC_TX_QFLUSH_POLL_WAIT_MS multiplied
562 		 * by SFC_TX_QFLUSH_POLL_ATTEMPTS)
563 		 */
564 		wait_count = 0;
565 		do {
566 			rte_delay_ms(SFC_TX_QFLUSH_POLL_WAIT_MS);
567 			sfc_ev_qpoll(txq->evq);
568 		} while ((txq_info->state & SFC_TXQ_FLUSHING) &&
569 			 wait_count++ < SFC_TX_QFLUSH_POLL_ATTEMPTS);
570 
571 		if (txq_info->state & SFC_TXQ_FLUSHING)
572 			sfc_err(sa, "TxQ %u flush timed out", sw_index);
573 
574 		if (txq_info->state & SFC_TXQ_FLUSHED)
575 			sfc_notice(sa, "TxQ %u flushed", sw_index);
576 	}
577 
578 	sa->priv.dp_tx->qreap(txq_info->dp);
579 
580 	txq_info->state = SFC_TXQ_INITIALIZED;
581 
582 	efx_tx_qdestroy(txq->common);
583 
584 	sfc_ev_qstop(txq->evq);
585 
586 	/*
587 	 * It seems to be used by DPDK for debug purposes only ('rte_ether')
588 	 */
589 	dev_data = sa->eth_dev->data;
590 	dev_data->tx_queue_state[sw_index] = RTE_ETH_QUEUE_STATE_STOPPED;
591 }
592 
593 int
594 sfc_tx_start(struct sfc_adapter *sa)
595 {
596 	struct sfc_adapter_shared * const sas = sfc_sa2shared(sa);
597 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sa->nic);
598 	unsigned int sw_index;
599 	int rc = 0;
600 
601 	sfc_log_init(sa, "txq_count = %u", sas->txq_count);
602 
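	/*
	 * Re-check FW-assisted TSO availability on (re)start since the
	 * firmware capabilities may have changed; disable TSO if it can
	 * no longer be supported.
	 */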
603 	if (sa->tso) {
604 		if (!encp->enc_fw_assisted_tso_v2_enabled) {
605 			sfc_warn(sa, "TSO support was unable to be restored");
606 			sa->tso = B_FALSE;
607 			sa->tso_encap = B_FALSE;
608 		}
609 	}
610 
611 	if (sa->tso_encap && !encp->enc_fw_assisted_tso_v2_encap_enabled) {
612 		sfc_warn(sa, "Encapsulated TSO support was unable to be restored");
613 		sa->tso_encap = B_FALSE;
614 	}
615 
616 	rc = efx_tx_init(sa->nic);
617 	if (rc != 0)
618 		goto fail_efx_tx_init;
619 
620 	for (sw_index = 0; sw_index < sas->txq_count; ++sw_index) {
621 		if (sas->txq_info[sw_index].state == SFC_TXQ_INITIALIZED &&
622 		    (!(sas->txq_info[sw_index].deferred_start) ||
623 		     sas->txq_info[sw_index].deferred_started)) {
624 			rc = sfc_tx_qstart(sa, sw_index);
625 			if (rc != 0)
626 				goto fail_tx_qstart;
627 		}
628 	}
629 
630 	return 0;
631 
632 fail_tx_qstart:
633 	while (sw_index-- > 0)
634 		sfc_tx_qstop(sa, sw_index);
635 
636 	efx_tx_fini(sa->nic);
637 
638 fail_efx_tx_init:
639 	sfc_log_init(sa, "failed (rc = %d)", rc);
640 	return rc;
641 }
642 
643 void
644 sfc_tx_stop(struct sfc_adapter *sa)
645 {
646 	struct sfc_adapter_shared * const sas = sfc_sa2shared(sa);
647 	unsigned int sw_index;
648 
649 	sfc_log_init(sa, "txq_count = %u", sas->txq_count);
650 
651 	sw_index = sas->txq_count;
652 	while (sw_index-- > 0) {
653 		if (sas->txq_info[sw_index].state & SFC_TXQ_STARTED)
654 			sfc_tx_qstop(sa, sw_index);
655 	}
656 
657 	efx_tx_fini(sa->nic);
658 }
659 
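/*
 * Reap completed Tx descriptors: poll the event queue to update the
 * 'pending' pointer and free mbufs attached to completed descriptors.
 */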
660 static void
661 sfc_efx_tx_reap(struct sfc_efx_txq *txq)
662 {
663 	unsigned int completed;
664 
665 	sfc_ev_qpoll(txq->evq);
666 
667 	for (completed = txq->completed;
668 	     completed != txq->pending; completed++) {
669 		struct sfc_efx_tx_sw_desc *txd;
670 
671 		txd = &txq->sw_ring[completed & txq->ptr_mask];
672 
673 		if (txd->mbuf != NULL) {
674 			rte_pktmbuf_free(txd->mbuf);
675 			txd->mbuf = NULL;
676 		}
677 	}
678 
679 	txq->completed = completed;
680 }
681 
682 /*
683  * The function is used to insert or update a VLAN tag;
684  * the firmware keeps per-TxQ state of the VLAN tag to insert
685  * (controlled by option descriptors), hence, if the tag of the
686  * packet to be sent differs from the one remembered by the firmware,
687  * the function updates it
688  */
689 static unsigned int
690 sfc_efx_tx_maybe_insert_tag(struct sfc_efx_txq *txq, struct rte_mbuf *m,
691 			    efx_desc_t **pend)
692 {
693 	uint16_t this_tag = ((m->ol_flags & PKT_TX_VLAN_PKT) ?
694 			     m->vlan_tci : 0);
695 
696 	if (this_tag == txq->hw_vlan_tci)
697 		return 0;
698 
699 	/*
700 	 * The expression inside SFC_ASSERT() is deliberately not checked in
701 	 * a non-debug build because it might be too expensive on the data path
702 	 */
703 	SFC_ASSERT(efx_nic_cfg_get(txq->evq->sa->nic)->enc_hw_tx_insert_vlan_enabled);
704 
705 	efx_tx_qdesc_vlantci_create(txq->common, rte_cpu_to_be_16(this_tag),
706 				    *pend);
707 	(*pend)++;
708 	txq->hw_vlan_tci = this_tag;
709 
710 	return 1;
711 }
712 
713 static uint16_t
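/*
 * Tx prepare callback of the libefx datapath: run generic per-packet
 * checks before the packets are passed to the burst routine.
 */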
714 sfc_efx_prepare_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
715 		     uint16_t nb_pkts)
716 {
717 	struct sfc_dp_txq *dp_txq = tx_queue;
718 	struct sfc_efx_txq *txq = sfc_efx_txq_by_dp_txq(dp_txq);
719 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(txq->evq->sa->nic);
720 	uint16_t i;
721 
722 	for (i = 0; i < nb_pkts; i++) {
723 		int ret;
724 
725 		/*
726 		 * The EFX Tx datapath may need an extra VLAN descriptor for VLAN
727 		 * insertion, so account for one such descriptor regardless of
728 		 * whether the offload is actually requested/supported.
729 		 */
730 		ret = sfc_dp_tx_prepare_pkt(tx_pkts[i],
731 				encp->enc_tx_tso_tcp_header_offset_limit,
732 				txq->max_fill_level, EFX_TX_FATSOV2_OPT_NDESCS,
733 				1);
734 		if (unlikely(ret != 0)) {
735 			rte_errno = ret;
736 			break;
737 		}
738 	}
739 
740 	return i;
741 }
742 
743 static uint16_t
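/*
 * Tx burst callback of the libefx datapath: build DMA (and option)
 * descriptors for each packet and push them to the hardware.
 */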
744 sfc_efx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
745 {
746 	struct sfc_dp_txq *dp_txq = (struct sfc_dp_txq *)tx_queue;
747 	struct sfc_efx_txq *txq = sfc_efx_txq_by_dp_txq(dp_txq);
748 	unsigned int added = txq->added;
749 	unsigned int pushed = added;
750 	unsigned int pkts_sent = 0;
751 	efx_desc_t *pend = &txq->pend_desc[0];
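	/*
	 * The hard limit is the ring fill level that must never be exceeded;
	 * the soft limit keeps 'free_thresh' descriptors of headroom below it.
	 */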
752 	const unsigned int hard_max_fill = txq->max_fill_level;
753 	const unsigned int soft_max_fill = hard_max_fill - txq->free_thresh;
754 	unsigned int fill_level = added - txq->completed;
755 	boolean_t reap_done;
756 	int rc __rte_unused;
757 	struct rte_mbuf **pktp;
758 
759 	if (unlikely((txq->flags & SFC_EFX_TXQ_FLAG_RUNNING) == 0))
760 		goto done;
761 
762 	/*
763 	 * If there is insufficient space left for even a single packet,
764 	 * we should reap; otherwise, we shouldn't reap on every call
765 	 * to avoid a latency increase
766 	 */
767 	reap_done = (fill_level > soft_max_fill);
768 
769 	if (reap_done) {
770 		sfc_efx_tx_reap(txq);
771 		/*
772 		 * Recalculate fill level since 'txq->completed'
773 		 * might have changed on reap
774 		 */
775 		fill_level = added - txq->completed;
776 	}
777 
778 	for (pkts_sent = 0, pktp = &tx_pkts[0];
779 	     (pkts_sent < nb_pkts) && (fill_level <= soft_max_fill);
780 	     pkts_sent++, pktp++) {
781 		uint16_t		hw_vlan_tci_prev = txq->hw_vlan_tci;
782 		struct rte_mbuf		*m_seg = *pktp;
783 		size_t			pkt_len = m_seg->pkt_len;
784 		unsigned int		pkt_descs = 0;
785 		size_t			in_off = 0;
786 
787 		/*
788 		 * Here VLAN TCI is expected to be zero if the
789 		 * DEV_TX_OFFLOAD_VLAN_INSERT capability is not advertised;
790 		 * if the calling app ignores the absence of
791 		 * DEV_TX_OFFLOAD_VLAN_INSERT and pushes a VLAN TCI, then
792 		 * TX_ERROR will occur
793 		 */
794 		pkt_descs += sfc_efx_tx_maybe_insert_tag(txq, m_seg, &pend);
795 
796 		if (m_seg->ol_flags & PKT_TX_TCP_SEG) {
797 			/*
798 			 * We expect the 'pkt->l[2, 3, 4]_len' values
799 			 * to be set correctly by the caller
800 			 */
801 			if (sfc_efx_tso_do(txq, added, &m_seg, &in_off, &pend,
802 					   &pkt_descs, &pkt_len) != 0) {
803 				/* We may have reached this place if packet
804 				 * header linearization is needed but the
805 				 * header length is greater than
806 				 * SFC_TSOH_STD_LEN
807 				 *
808 				 * We will deceive RTE into thinking we have
809 				 * sent the packet, but actually drop it.
810 				 * Hence, we should revert 'pend' to its
811 				 * previous state (in case we have added a
812 				 * VLAN descriptor) and start processing the
813 				 * next packet. The original mbuf must not
814 				 * be leaked, so it is freed here
815 				 */
816 				pend -= pkt_descs;
817 				txq->hw_vlan_tci = hw_vlan_tci_prev;
818 
819 				rte_pktmbuf_free(*pktp);
820 
821 				continue;
822 			}
823 
824 			/*
825 			 * We've only added 2 FATSOv2 option descriptors
826 			 * and 1 descriptor for the linearized packet header.
827 			 * The remaining work will be done in the same manner
828 			 * as for the usual non-TSO path
829 			 */
830 		}
831 
832 		for (; m_seg != NULL; m_seg = m_seg->next) {
833 			efsys_dma_addr_t	next_frag;
834 			size_t			seg_len;
835 
836 			seg_len = m_seg->data_len;
837 			next_frag = rte_mbuf_data_iova(m_seg);
838 
839 			/*
840 			 * If we've started a TSO transaction a few steps earlier,
841 			 * we'll skip the packet header using an offset into the
842 			 * current segment (which has been set to the
843 			 * first one containing payload)
844 			 */
845 			seg_len -= in_off;
846 			next_frag += in_off;
847 			in_off = 0;
848 
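			/*
			 * Split the segment into DMA descriptors of at most
			 * 'dma_desc_size_max' bytes each.
			 */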
849 			do {
850 				efsys_dma_addr_t	frag_addr = next_frag;
851 				size_t			frag_len;
852 
853 				/*
854 				 * It is assumed here that there is no
855 				 * limitation on address boundary
856 				 * crossing by DMA descriptor.
857 				 */
858 				frag_len = MIN(seg_len, txq->dma_desc_size_max);
859 				next_frag += frag_len;
860 				seg_len -= frag_len;
861 				pkt_len -= frag_len;
862 
863 				efx_tx_qdesc_dma_create(txq->common,
864 							frag_addr, frag_len,
865 							(pkt_len == 0),
866 							pend++);
867 
868 				pkt_descs++;
869 			} while (seg_len != 0);
870 		}
871 
872 		added += pkt_descs;
873 
874 		fill_level += pkt_descs;
875 		if (unlikely(fill_level > hard_max_fill)) {
876 			/*
877 			 * Our estimate of the maximum number of descriptors
878 			 * required to send a packet seems to be wrong.
879 			 * Try to reap (if we haven't yet).
880 			 */
881 			if (!reap_done) {
882 				sfc_efx_tx_reap(txq);
883 				reap_done = B_TRUE;
884 				fill_level = added - txq->completed;
885 				if (fill_level > hard_max_fill) {
886 					pend -= pkt_descs;
887 					txq->hw_vlan_tci = hw_vlan_tci_prev;
888 					break;
889 				}
890 			} else {
891 				pend -= pkt_descs;
892 				txq->hw_vlan_tci = hw_vlan_tci_prev;
893 				break;
894 			}
895 		}
896 
897 		/* Assign mbuf to the last used desc */
898 		txq->sw_ring[(added - 1) & txq->ptr_mask].mbuf = *pktp;
899 	}
900 
901 	if (likely(pkts_sent > 0)) {
902 		rc = efx_tx_qdesc_post(txq->common, txq->pend_desc,
903 				       pend - &txq->pend_desc[0],
904 				       txq->completed, &txq->added);
905 		SFC_ASSERT(rc == 0);
906 
907 		if (likely(pushed != txq->added))
908 			efx_tx_qpush(txq->common, txq->added, pushed);
909 	}
910 
911 #if SFC_TX_XMIT_PKTS_REAP_AT_LEAST_ONCE
912 	if (!reap_done)
913 		sfc_efx_tx_reap(txq);
914 #endif
915 
916 done:
917 	return pkts_sent;
918 }
919 
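/*
 * The following lookup helpers map a datapath TxQ back to adapter-level
 * structures using the port ID saved in the generic queue information.
 */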
920 const struct sfc_dp_tx *
921 sfc_dp_tx_by_dp_txq(const struct sfc_dp_txq *dp_txq)
922 {
923 	const struct sfc_dp_queue *dpq = &dp_txq->dpq;
924 	struct rte_eth_dev *eth_dev;
925 	struct sfc_adapter_priv *sap;
926 
927 	SFC_ASSERT(rte_eth_dev_is_valid_port(dpq->port_id));
928 	eth_dev = &rte_eth_devices[dpq->port_id];
929 
930 	sap = sfc_adapter_priv_by_eth_dev(eth_dev);
931 
932 	return sap->dp_tx;
933 }
934 
935 struct sfc_txq_info *
936 sfc_txq_info_by_dp_txq(const struct sfc_dp_txq *dp_txq)
937 {
938 	const struct sfc_dp_queue *dpq = &dp_txq->dpq;
939 	struct rte_eth_dev *eth_dev;
940 	struct sfc_adapter_shared *sas;
941 
942 	SFC_ASSERT(rte_eth_dev_is_valid_port(dpq->port_id));
943 	eth_dev = &rte_eth_devices[dpq->port_id];
944 
945 	sas = sfc_adapter_shared_by_eth_dev(eth_dev);
946 
947 	SFC_ASSERT(dpq->queue_id < sas->txq_count);
948 	return &sas->txq_info[dpq->queue_id];
949 }
950 
951 struct sfc_txq *
952 sfc_txq_by_dp_txq(const struct sfc_dp_txq *dp_txq)
953 {
954 	const struct sfc_dp_queue *dpq = &dp_txq->dpq;
955 	struct rte_eth_dev *eth_dev;
956 	struct sfc_adapter *sa;
957 
958 	SFC_ASSERT(rte_eth_dev_is_valid_port(dpq->port_id));
959 	eth_dev = &rte_eth_devices[dpq->port_id];
960 
961 	sa = sfc_adapter_by_eth_dev(eth_dev);
962 
963 	SFC_ASSERT(dpq->queue_id < sfc_sa2shared(sa)->txq_count);
964 	return &sa->txq_ctrl[dpq->queue_id];
965 }
966 
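/*
 * The libefx datapath uses the requested descriptor count as-is for both
 * the TxQ and the EvQ; the maximum fill level is capped by EFX_TXQ_LIMIT(),
 * which reserves a number of ring entries and thus prevents the ring from
 * being completely filled.
 */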
967 static sfc_dp_tx_qsize_up_rings_t sfc_efx_tx_qsize_up_rings;
968 static int
969 sfc_efx_tx_qsize_up_rings(uint16_t nb_tx_desc,
970 			  __rte_unused struct sfc_dp_tx_hw_limits *limits,
971 			  unsigned int *txq_entries,
972 			  unsigned int *evq_entries,
973 			  unsigned int *txq_max_fill_level)
974 {
975 	*txq_entries = nb_tx_desc;
976 	*evq_entries = nb_tx_desc;
977 	*txq_max_fill_level = EFX_TXQ_LIMIT(*txq_entries);
978 	return 0;
979 }
980 
981 static sfc_dp_tx_qcreate_t sfc_efx_tx_qcreate;
982 static int
983 sfc_efx_tx_qcreate(uint16_t port_id, uint16_t queue_id,
984 		   const struct rte_pci_addr *pci_addr,
985 		   int socket_id,
986 		   const struct sfc_dp_tx_qcreate_info *info,
987 		   struct sfc_dp_txq **dp_txqp)
988 {
989 	struct sfc_efx_txq *txq;
990 	struct sfc_txq *ctrl_txq;
991 	int rc;
992 
993 	rc = ENOMEM;
994 	txq = rte_zmalloc_socket("sfc-efx-txq", sizeof(*txq),
995 				 RTE_CACHE_LINE_SIZE, socket_id);
996 	if (txq == NULL)
997 		goto fail_txq_alloc;
998 
999 	sfc_dp_queue_init(&txq->dp.dpq, port_id, queue_id, pci_addr);
1000 
1001 	rc = ENOMEM;
1002 	txq->pend_desc = rte_calloc_socket("sfc-efx-txq-pend-desc",
1003 					   EFX_TXQ_LIMIT(info->txq_entries),
1004 					   sizeof(*txq->pend_desc), 0,
1005 					   socket_id);
1006 	if (txq->pend_desc == NULL)
1007 		goto fail_pend_desc_alloc;
1008 
1009 	rc = ENOMEM;
1010 	txq->sw_ring = rte_calloc_socket("sfc-efx-txq-sw_ring",
1011 					 info->txq_entries,
1012 					 sizeof(*txq->sw_ring),
1013 					 RTE_CACHE_LINE_SIZE, socket_id);
1014 	if (txq->sw_ring == NULL)
1015 		goto fail_sw_ring_alloc;
1016 
1017 	ctrl_txq = sfc_txq_by_dp_txq(&txq->dp);
1018 	if (ctrl_txq->evq->sa->tso) {
1019 		rc = sfc_efx_tso_alloc_tsoh_objs(txq->sw_ring,
1020 						 info->txq_entries, socket_id);
1021 		if (rc != 0)
1022 			goto fail_alloc_tsoh_objs;
1023 	}
1024 
1025 	txq->evq = ctrl_txq->evq;
1026 	txq->ptr_mask = info->txq_entries - 1;
1027 	txq->max_fill_level = info->max_fill_level;
1028 	txq->free_thresh = info->free_thresh;
1029 	txq->dma_desc_size_max = info->dma_desc_size_max;
1030 
1031 	*dp_txqp = &txq->dp;
1032 	return 0;
1033 
1034 fail_alloc_tsoh_objs:
1035 	rte_free(txq->sw_ring);
1036 
1037 fail_sw_ring_alloc:
1038 	rte_free(txq->pend_desc);
1039 
1040 fail_pend_desc_alloc:
1041 	rte_free(txq);
1042 
1043 fail_txq_alloc:
1044 	return rc;
1045 }
1046 
1047 static sfc_dp_tx_qdestroy_t sfc_efx_tx_qdestroy;
1048 static void
1049 sfc_efx_tx_qdestroy(struct sfc_dp_txq *dp_txq)
1050 {
1051 	struct sfc_efx_txq *txq = sfc_efx_txq_by_dp_txq(dp_txq);
1052 
1053 	sfc_efx_tso_free_tsoh_objs(txq->sw_ring, txq->ptr_mask + 1);
1054 	rte_free(txq->sw_ring);
1055 	rte_free(txq->pend_desc);
1056 	rte_free(txq);
1057 }
1058 
1059 static sfc_dp_tx_qstart_t sfc_efx_tx_qstart;
1060 static int
1061 sfc_efx_tx_qstart(struct sfc_dp_txq *dp_txq,
1062 		  __rte_unused unsigned int evq_read_ptr,
1063 		  unsigned int txq_desc_index)
1064 {
1065 	/* libefx-based datapath is specific to libefx-based PMD */
1066 	struct sfc_efx_txq *txq = sfc_efx_txq_by_dp_txq(dp_txq);
1067 	struct sfc_txq *ctrl_txq = sfc_txq_by_dp_txq(dp_txq);
1068 
1069 	txq->common = ctrl_txq->common;
1070 
1071 	txq->pending = txq->completed = txq->added = txq_desc_index;
1072 	txq->hw_vlan_tci = 0;
1073 
1074 	txq->flags |= (SFC_EFX_TXQ_FLAG_STARTED | SFC_EFX_TXQ_FLAG_RUNNING);
1075 
1076 	return 0;
1077 }
1078 
1079 static sfc_dp_tx_qstop_t sfc_efx_tx_qstop;
1080 static void
1081 sfc_efx_tx_qstop(struct sfc_dp_txq *dp_txq,
1082 		 __rte_unused unsigned int *evq_read_ptr)
1083 {
1084 	struct sfc_efx_txq *txq = sfc_efx_txq_by_dp_txq(dp_txq);
1085 
1086 	txq->flags &= ~SFC_EFX_TXQ_FLAG_RUNNING;
1087 }
1088 
1089 static sfc_dp_tx_qreap_t sfc_efx_tx_qreap;
1090 static void
1091 sfc_efx_tx_qreap(struct sfc_dp_txq *dp_txq)
1092 {
1093 	struct sfc_efx_txq *txq = sfc_efx_txq_by_dp_txq(dp_txq);
1094 	unsigned int txds;
1095 
1096 	sfc_efx_tx_reap(txq);
1097 
1098 	for (txds = 0; txds <= txq->ptr_mask; txds++) {
1099 		if (txq->sw_ring[txds].mbuf != NULL) {
1100 			rte_pktmbuf_free(txq->sw_ring[txds].mbuf);
1101 			txq->sw_ring[txds].mbuf = NULL;
1102 		}
1103 	}
1104 
1105 	txq->flags &= ~SFC_EFX_TXQ_FLAG_STARTED;
1106 }
1107 
1108 static sfc_dp_tx_qdesc_status_t sfc_efx_tx_qdesc_status;
1109 static int
1110 sfc_efx_tx_qdesc_status(struct sfc_dp_txq *dp_txq, uint16_t offset)
1111 {
1112 	struct sfc_efx_txq *txq = sfc_efx_txq_by_dp_txq(dp_txq);
1113 
1114 	if (unlikely(offset > txq->ptr_mask))
1115 		return -EINVAL;
1116 
1117 	if (unlikely(offset >= txq->max_fill_level))
1118 		return RTE_ETH_TX_DESC_UNAVAIL;
1119 
1120 	/*
1121 	 * Poll the EvQ to derive an up-to-date 'txq->pending' figure;
1122 	 * the queue is required to be running, but the
1123 	 * check is omitted because the API design assumes that it
1124 	 * is the caller's duty to satisfy all conditions
1125 	 */
1126 	SFC_ASSERT((txq->flags & SFC_EFX_TXQ_FLAG_RUNNING) ==
1127 		   SFC_EFX_TXQ_FLAG_RUNNING);
1128 	sfc_ev_qpoll(txq->evq);
1129 
1130 	/*
1131 	 * Ring tail is 'txq->pending', and although descriptors
1132 	 * between 'txq->completed' and 'txq->pending' are still
1133 	 * in use by the driver, they should be reported as DONE
1134 	 */
1135 	if (unlikely(offset < (txq->added - txq->pending)))
1136 		return RTE_ETH_TX_DESC_FULL;
1137 
1138 	/*
1139 	 * There is no separate return value for unused descriptors;
1140 	 * the latter will be reported as DONE because genuine DONE
1141 	 * descriptors will be freed anyway in SW on the next burst
1142 	 */
1143 	return RTE_ETH_TX_DESC_DONE;
1144 }
1145 
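/* libefx-based Tx datapath operations and supported features */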
1146 struct sfc_dp_tx sfc_efx_tx = {
1147 	.dp = {
1148 		.name		= SFC_KVARG_DATAPATH_EFX,
1149 		.type		= SFC_DP_TX,
1150 		.hw_fw_caps	= 0,
1151 	},
1152 	.features		= SFC_DP_TX_FEAT_VLAN_INSERT |
1153 				  SFC_DP_TX_FEAT_TSO |
1154 				  SFC_DP_TX_FEAT_MULTI_POOL |
1155 				  SFC_DP_TX_FEAT_REFCNT |
1156 				  SFC_DP_TX_FEAT_MULTI_SEG,
1157 	.qsize_up_rings		= sfc_efx_tx_qsize_up_rings,
1158 	.qcreate		= sfc_efx_tx_qcreate,
1159 	.qdestroy		= sfc_efx_tx_qdestroy,
1160 	.qstart			= sfc_efx_tx_qstart,
1161 	.qstop			= sfc_efx_tx_qstop,
1162 	.qreap			= sfc_efx_tx_qreap,
1163 	.qdesc_status		= sfc_efx_tx_qdesc_status,
1164 	.pkt_prepare		= sfc_efx_prepare_pkts,
1165 	.pkt_burst		= sfc_efx_xmit_pkts,
1166 };
1167