xref: /dpdk/drivers/net/virtio/virtio_rxtx.c (revision a49342abbb5d68fafab1d2ba4c669c0e76e32c65)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <stdint.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <errno.h>
39 
40 #include <rte_cycles.h>
41 #include <rte_memory.h>
42 #include <rte_memzone.h>
43 #include <rte_branch_prediction.h>
44 #include <rte_mempool.h>
45 #include <rte_malloc.h>
46 #include <rte_mbuf.h>
47 #include <rte_ether.h>
48 #include <rte_ethdev.h>
49 #include <rte_prefetch.h>
50 #include <rte_string_fns.h>
51 #include <rte_errno.h>
52 #include <rte_byteorder.h>
53 #include <rte_cpuflags.h>
54 #include <rte_net.h>
55 #include <rte_ip.h>
56 #include <rte_udp.h>
57 #include <rte_tcp.h>
58 
59 #include "virtio_logs.h"
60 #include "virtio_ethdev.h"
61 #include "virtio_pci.h"
62 #include "virtqueue.h"
63 #include "virtio_rxtx.h"
64 
/*
 * Packet dump helper. When RTE_LIBRTE_VIRTIO_DEBUG_DUMP is defined, the mbuf
 * 'm' and the length 'len' are handed to rte_pktmbuf_dump() with stdout as
 * the output stream; otherwise the macro expands to an empty statement.
 */
#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP
#define VIRTIO_DUMP_PACKET(m, len) rte_pktmbuf_dump(stdout, m, len)
#else
#define  VIRTIO_DUMP_PACKET(m, len) do { } while (0)
#endif


/*
 * Tx queue flags (no multi-segment packets, no offloads) that must all be
 * set in tx_conf->txq_flags before virtio_update_rxtx_handler() selects the
 * simple rx/tx path.
 */
#define VIRTIO_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \
	ETH_TXQ_FLAGS_NOOFFLOADS)
74 
75 static void
76 vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
77 {
78 	struct vring_desc *dp, *dp_tail;
79 	struct vq_desc_extra *dxp;
80 	uint16_t desc_idx_last = desc_idx;
81 
82 	dp  = &vq->vq_ring.desc[desc_idx];
83 	dxp = &vq->vq_descx[desc_idx];
84 	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs);
85 	if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) {
86 		while (dp->flags & VRING_DESC_F_NEXT) {
87 			desc_idx_last = dp->next;
88 			dp = &vq->vq_ring.desc[dp->next];
89 		}
90 	}
91 	dxp->ndescs = 0;
92 
93 	/*
94 	 * We must append the existing free chain, if any, to the end of
95 	 * newly freed chain. If the virtqueue was completely used, then
96 	 * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above).
97 	 */
98 	if (vq->vq_desc_tail_idx == VQ_RING_DESC_CHAIN_END) {
99 		vq->vq_desc_head_idx = desc_idx;
100 	} else {
101 		dp_tail = &vq->vq_ring.desc[vq->vq_desc_tail_idx];
102 		dp_tail->next = desc_idx;
103 	}
104 
105 	vq->vq_desc_tail_idx = desc_idx_last;
106 	dp->next = VQ_RING_DESC_CHAIN_END;
107 }
108 
109 static uint16_t
110 virtqueue_dequeue_burst_rx(struct virtqueue *vq, struct rte_mbuf **rx_pkts,
111 			   uint32_t *len, uint16_t num)
112 {
113 	struct vring_used_elem *uep;
114 	struct rte_mbuf *cookie;
115 	uint16_t used_idx, desc_idx;
116 	uint16_t i;
117 
118 	/*  Caller does the check */
119 	for (i = 0; i < num ; i++) {
120 		used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
121 		uep = &vq->vq_ring.used->ring[used_idx];
122 		desc_idx = (uint16_t) uep->id;
123 		len[i] = uep->len;
124 		cookie = (struct rte_mbuf *)vq->vq_descx[desc_idx].cookie;
125 
126 		if (unlikely(cookie == NULL)) {
127 			PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u\n",
128 				vq->vq_used_cons_idx);
129 			break;
130 		}
131 
132 		rte_prefetch0(cookie);
133 		rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
134 		rx_pkts[i]  = cookie;
135 		vq->vq_used_cons_idx++;
136 		vq_ring_free_chain(vq, desc_idx);
137 		vq->vq_descx[desc_idx].cookie = NULL;
138 	}
139 
140 	return i;
141 }
142 
/*
 * Upper bound for the default Tx free threshold: virtio_dev_tx_queue_setup()
 * uses min(ring size / 4, DEFAULT_TX_FREE_THRESH) when the application leaves
 * tx_conf->tx_free_thresh at 0. May be overridden at build time.
 */
#ifndef DEFAULT_TX_FREE_THRESH
#define DEFAULT_TX_FREE_THRESH 32
#endif
146 
147 /* Cleanup from completed transmits. */
148 static void
149 virtio_xmit_cleanup(struct virtqueue *vq, uint16_t num)
150 {
151 	uint16_t i, used_idx, desc_idx;
152 	for (i = 0; i < num; i++) {
153 		struct vring_used_elem *uep;
154 		struct vq_desc_extra *dxp;
155 
156 		used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
157 		uep = &vq->vq_ring.used->ring[used_idx];
158 
159 		desc_idx = (uint16_t) uep->id;
160 		dxp = &vq->vq_descx[desc_idx];
161 		vq->vq_used_cons_idx++;
162 		vq_ring_free_chain(vq, desc_idx);
163 
164 		if (dxp->cookie != NULL) {
165 			rte_pktmbuf_free(dxp->cookie);
166 			dxp->cookie = NULL;
167 		}
168 	}
169 }
170 
171 
172 static inline int
173 virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
174 {
175 	struct vq_desc_extra *dxp;
176 	struct virtio_hw *hw = vq->hw;
177 	struct vring_desc *start_dp;
178 	uint16_t needed = 1;
179 	uint16_t head_idx, idx;
180 
181 	if (unlikely(vq->vq_free_cnt == 0))
182 		return -ENOSPC;
183 	if (unlikely(vq->vq_free_cnt < needed))
184 		return -EMSGSIZE;
185 
186 	head_idx = vq->vq_desc_head_idx;
187 	if (unlikely(head_idx >= vq->vq_nentries))
188 		return -EFAULT;
189 
190 	idx = head_idx;
191 	dxp = &vq->vq_descx[idx];
192 	dxp->cookie = (void *)cookie;
193 	dxp->ndescs = needed;
194 
195 	start_dp = vq->vq_ring.desc;
196 	start_dp[idx].addr =
197 		VIRTIO_MBUF_ADDR(cookie, vq) +
198 		RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
199 	start_dp[idx].len =
200 		cookie->buf_len - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
201 	start_dp[idx].flags =  VRING_DESC_F_WRITE;
202 	idx = start_dp[idx].next;
203 	vq->vq_desc_head_idx = idx;
204 	if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
205 		vq->vq_desc_tail_idx = idx;
206 	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);
207 	vq_update_avail_ring(vq, head_idx);
208 
209 	return 0;
210 }
211 
212 /* When doing TSO, the IP length is not included in the pseudo header
213  * checksum of the packet given to the PMD, but for virtio it is
214  * expected.
215  */
216 static void
217 virtio_tso_fix_cksum(struct rte_mbuf *m)
218 {
219 	/* common case: header is not fragmented */
220 	if (likely(rte_pktmbuf_data_len(m) >= m->l2_len + m->l3_len +
221 			m->l4_len)) {
222 		struct ipv4_hdr *iph;
223 		struct ipv6_hdr *ip6h;
224 		struct tcp_hdr *th;
225 		uint16_t prev_cksum, new_cksum, ip_len, ip_paylen;
226 		uint32_t tmp;
227 
228 		iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len);
229 		th = RTE_PTR_ADD(iph, m->l3_len);
230 		if ((iph->version_ihl >> 4) == 4) {
231 			iph->hdr_checksum = 0;
232 			iph->hdr_checksum = rte_ipv4_cksum(iph);
233 			ip_len = iph->total_length;
234 			ip_paylen = rte_cpu_to_be_16(rte_be_to_cpu_16(ip_len) -
235 				m->l3_len);
236 		} else {
237 			ip6h = (struct ipv6_hdr *)iph;
238 			ip_paylen = ip6h->payload_len;
239 		}
240 
241 		/* calculate the new phdr checksum not including ip_paylen */
242 		prev_cksum = th->cksum;
243 		tmp = prev_cksum;
244 		tmp += ip_paylen;
245 		tmp = (tmp & 0xffff) + (tmp >> 16);
246 		new_cksum = tmp;
247 
248 		/* replace it in the packet */
249 		th->cksum = new_cksum;
250 	}
251 }
252 
253 static inline int
254 tx_offload_enabled(struct virtio_hw *hw)
255 {
256 	return vtpci_with_feature(hw, VIRTIO_NET_F_CSUM) ||
257 		vtpci_with_feature(hw, VIRTIO_NET_F_HOST_TSO4) ||
258 		vtpci_with_feature(hw, VIRTIO_NET_F_HOST_TSO6);
259 }
260 
261 static inline void
262 virtqueue_enqueue_xmit(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
263 		       uint16_t needed, int use_indirect, int can_push)
264 {
265 	struct virtio_tx_region *txr = txvq->virtio_net_hdr_mz->addr;
266 	struct vq_desc_extra *dxp;
267 	struct virtqueue *vq = txvq->vq;
268 	struct vring_desc *start_dp;
269 	uint16_t seg_num = cookie->nb_segs;
270 	uint16_t head_idx, idx;
271 	uint16_t head_size = vq->hw->vtnet_hdr_size;
272 	struct virtio_net_hdr *hdr;
273 	int offload;
274 
275 	offload = tx_offload_enabled(vq->hw);
276 	head_idx = vq->vq_desc_head_idx;
277 	idx = head_idx;
278 	dxp = &vq->vq_descx[idx];
279 	dxp->cookie = (void *)cookie;
280 	dxp->ndescs = needed;
281 
282 	start_dp = vq->vq_ring.desc;
283 
284 	if (can_push) {
285 		/* prepend cannot fail, checked by caller */
286 		hdr = (struct virtio_net_hdr *)
287 			rte_pktmbuf_prepend(cookie, head_size);
288 		/* if offload disabled, it is not zeroed below, do it now */
289 		if (offload == 0)
290 			memset(hdr, 0, head_size);
291 	} else if (use_indirect) {
292 		/* setup tx ring slot to point to indirect
293 		 * descriptor list stored in reserved region.
294 		 *
295 		 * the first slot in indirect ring is already preset
296 		 * to point to the header in reserved region
297 		 */
298 		start_dp[idx].addr  = txvq->virtio_net_hdr_mem +
299 			RTE_PTR_DIFF(&txr[idx].tx_indir, txr);
300 		start_dp[idx].len   = (seg_num + 1) * sizeof(struct vring_desc);
301 		start_dp[idx].flags = VRING_DESC_F_INDIRECT;
302 		hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr;
303 
304 		/* loop below will fill in rest of the indirect elements */
305 		start_dp = txr[idx].tx_indir;
306 		idx = 1;
307 	} else {
308 		/* setup first tx ring slot to point to header
309 		 * stored in reserved region.
310 		 */
311 		start_dp[idx].addr  = txvq->virtio_net_hdr_mem +
312 			RTE_PTR_DIFF(&txr[idx].tx_hdr, txr);
313 		start_dp[idx].len   = vq->hw->vtnet_hdr_size;
314 		start_dp[idx].flags = VRING_DESC_F_NEXT;
315 		hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr;
316 
317 		idx = start_dp[idx].next;
318 	}
319 
320 	/* Checksum Offload / TSO */
321 	if (offload) {
322 		if (cookie->ol_flags & PKT_TX_TCP_SEG)
323 			cookie->ol_flags |= PKT_TX_TCP_CKSUM;
324 
325 		switch (cookie->ol_flags & PKT_TX_L4_MASK) {
326 		case PKT_TX_UDP_CKSUM:
327 			hdr->csum_start = cookie->l2_len + cookie->l3_len;
328 			hdr->csum_offset = offsetof(struct udp_hdr,
329 				dgram_cksum);
330 			hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
331 			break;
332 
333 		case PKT_TX_TCP_CKSUM:
334 			hdr->csum_start = cookie->l2_len + cookie->l3_len;
335 			hdr->csum_offset = offsetof(struct tcp_hdr, cksum);
336 			hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
337 			break;
338 
339 		default:
340 			hdr->csum_start = 0;
341 			hdr->csum_offset = 0;
342 			hdr->flags = 0;
343 			break;
344 		}
345 
346 		/* TCP Segmentation Offload */
347 		if (cookie->ol_flags & PKT_TX_TCP_SEG) {
348 			virtio_tso_fix_cksum(cookie);
349 			hdr->gso_type = (cookie->ol_flags & PKT_TX_IPV6) ?
350 				VIRTIO_NET_HDR_GSO_TCPV6 :
351 				VIRTIO_NET_HDR_GSO_TCPV4;
352 			hdr->gso_size = cookie->tso_segsz;
353 			hdr->hdr_len =
354 				cookie->l2_len +
355 				cookie->l3_len +
356 				cookie->l4_len;
357 		} else {
358 			hdr->gso_type = 0;
359 			hdr->gso_size = 0;
360 			hdr->hdr_len = 0;
361 		}
362 	}
363 
364 	do {
365 		start_dp[idx].addr  = VIRTIO_MBUF_DATA_DMA_ADDR(cookie, vq);
366 		start_dp[idx].len   = cookie->data_len;
367 		start_dp[idx].flags = cookie->next ? VRING_DESC_F_NEXT : 0;
368 		idx = start_dp[idx].next;
369 	} while ((cookie = cookie->next) != NULL);
370 
371 	if (use_indirect)
372 		idx = vq->vq_ring.desc[head_idx].next;
373 
374 	vq->vq_desc_head_idx = idx;
375 	if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
376 		vq->vq_desc_tail_idx = idx;
377 	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);
378 	vq_update_avail_ring(vq, head_idx);
379 }
380 
381 void
382 virtio_dev_cq_start(struct rte_eth_dev *dev)
383 {
384 	struct virtio_hw *hw = dev->data->dev_private;
385 
386 	if (hw->cvq && hw->cvq->vq) {
387 		VIRTQUEUE_DUMP((struct virtqueue *)hw->cvq->vq);
388 	}
389 }
390 
391 int
392 virtio_dev_rx_queue_setup(struct rte_eth_dev *dev,
393 			uint16_t queue_idx,
394 			uint16_t nb_desc,
395 			unsigned int socket_id __rte_unused,
396 			__rte_unused const struct rte_eth_rxconf *rx_conf,
397 			struct rte_mempool *mp)
398 {
399 	uint16_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_RQ_QUEUE_IDX;
400 	struct virtio_hw *hw = dev->data->dev_private;
401 	struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
402 	struct virtnet_rx *rxvq;
403 	int error, nbufs;
404 	struct rte_mbuf *m;
405 	uint16_t desc_idx;
406 
407 	PMD_INIT_FUNC_TRACE();
408 
409 	if (nb_desc == 0 || nb_desc > vq->vq_nentries)
410 		nb_desc = vq->vq_nentries;
411 	vq->vq_free_cnt = RTE_MIN(vq->vq_free_cnt, nb_desc);
412 
413 	rxvq = &vq->rxq;
414 	rxvq->queue_id = queue_idx;
415 	rxvq->mpool = mp;
416 	if (rxvq->mpool == NULL) {
417 		rte_exit(EXIT_FAILURE,
418 			"Cannot allocate mbufs for rx virtqueue");
419 	}
420 	dev->data->rx_queues[queue_idx] = rxvq;
421 
422 
423 	/* Allocate blank mbufs for the each rx descriptor */
424 	nbufs = 0;
425 	error = ENOSPC;
426 
427 	if (hw->use_simple_rxtx) {
428 		for (desc_idx = 0; desc_idx < vq->vq_nentries;
429 		     desc_idx++) {
430 			vq->vq_ring.avail->ring[desc_idx] = desc_idx;
431 			vq->vq_ring.desc[desc_idx].flags =
432 				VRING_DESC_F_WRITE;
433 		}
434 	}
435 
436 	memset(&rxvq->fake_mbuf, 0, sizeof(rxvq->fake_mbuf));
437 	for (desc_idx = 0; desc_idx < RTE_PMD_VIRTIO_RX_MAX_BURST;
438 	     desc_idx++) {
439 		vq->sw_ring[vq->vq_nentries + desc_idx] =
440 			&rxvq->fake_mbuf;
441 	}
442 
443 	while (!virtqueue_full(vq)) {
444 		m = rte_mbuf_raw_alloc(rxvq->mpool);
445 		if (m == NULL)
446 			break;
447 
448 		/* Enqueue allocated buffers */
449 		if (hw->use_simple_rxtx)
450 			error = virtqueue_enqueue_recv_refill_simple(vq, m);
451 		else
452 			error = virtqueue_enqueue_recv_refill(vq, m);
453 
454 		if (error) {
455 			rte_pktmbuf_free(m);
456 			break;
457 		}
458 		nbufs++;
459 	}
460 
461 	vq_update_avail_idx(vq);
462 
463 	PMD_INIT_LOG(DEBUG, "Allocated %d bufs", nbufs);
464 
465 	virtio_rxq_vec_setup(rxvq);
466 
467 	VIRTQUEUE_DUMP(vq);
468 
469 	return 0;
470 }
471 
472 static void
473 virtio_update_rxtx_handler(struct rte_eth_dev *dev,
474 			   const struct rte_eth_txconf *tx_conf)
475 {
476 	uint8_t use_simple_rxtx = 0;
477 	struct virtio_hw *hw = dev->data->dev_private;
478 
479 #if defined RTE_ARCH_X86
480 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
481 		use_simple_rxtx = 1;
482 #elif defined RTE_ARCH_ARM64 || defined CONFIG_RTE_ARCH_ARM
483 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
484 		use_simple_rxtx = 1;
485 #endif
486 	/* Use simple rx/tx func if single segment and no offloads */
487 	if (use_simple_rxtx &&
488 	    (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
489 	    !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
490 		PMD_INIT_LOG(INFO, "Using simple rx/tx path");
491 		dev->tx_pkt_burst = virtio_xmit_pkts_simple;
492 		dev->rx_pkt_burst = virtio_recv_pkts_vec;
493 		hw->use_simple_rxtx = use_simple_rxtx;
494 	}
495 }
496 
497 /*
498  * struct rte_eth_dev *dev: Used to update dev
499  * uint16_t nb_desc: Defaults to values read from config space
500  * unsigned int socket_id: Used to allocate memzone
501  * const struct rte_eth_txconf *tx_conf: Used to setup tx engine
502  * uint16_t queue_idx: Just used as an index in dev txq list
503  */
504 int
505 virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
506 			uint16_t queue_idx,
507 			uint16_t nb_desc,
508 			unsigned int socket_id __rte_unused,
509 			const struct rte_eth_txconf *tx_conf)
510 {
511 	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
512 	struct virtio_hw *hw = dev->data->dev_private;
513 	struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
514 	struct virtnet_tx *txvq;
515 	uint16_t tx_free_thresh;
516 	uint16_t desc_idx;
517 
518 	PMD_INIT_FUNC_TRACE();
519 
520 	virtio_update_rxtx_handler(dev, tx_conf);
521 
522 	if (nb_desc == 0 || nb_desc > vq->vq_nentries)
523 		nb_desc = vq->vq_nentries;
524 	vq->vq_free_cnt = RTE_MIN(vq->vq_free_cnt, nb_desc);
525 
526 	txvq = &vq->txq;
527 	txvq->queue_id = queue_idx;
528 
529 	tx_free_thresh = tx_conf->tx_free_thresh;
530 	if (tx_free_thresh == 0)
531 		tx_free_thresh =
532 			RTE_MIN(vq->vq_nentries / 4, DEFAULT_TX_FREE_THRESH);
533 
534 	if (tx_free_thresh >= (vq->vq_nentries - 3)) {
535 		RTE_LOG(ERR, PMD, "tx_free_thresh must be less than the "
536 			"number of TX entries minus 3 (%u)."
537 			" (tx_free_thresh=%u port=%u queue=%u)\n",
538 			vq->vq_nentries - 3,
539 			tx_free_thresh, dev->data->port_id, queue_idx);
540 		return -EINVAL;
541 	}
542 
543 	vq->vq_free_thresh = tx_free_thresh;
544 
545 	if (hw->use_simple_rxtx) {
546 		uint16_t mid_idx  = vq->vq_nentries >> 1;
547 
548 		for (desc_idx = 0; desc_idx < mid_idx; desc_idx++) {
549 			vq->vq_ring.avail->ring[desc_idx] =
550 				desc_idx + mid_idx;
551 			vq->vq_ring.desc[desc_idx + mid_idx].next =
552 				desc_idx;
553 			vq->vq_ring.desc[desc_idx + mid_idx].addr =
554 				txvq->virtio_net_hdr_mem +
555 				offsetof(struct virtio_tx_region, tx_hdr);
556 			vq->vq_ring.desc[desc_idx + mid_idx].len =
557 				vq->hw->vtnet_hdr_size;
558 			vq->vq_ring.desc[desc_idx + mid_idx].flags =
559 				VRING_DESC_F_NEXT;
560 			vq->vq_ring.desc[desc_idx].flags = 0;
561 		}
562 		for (desc_idx = mid_idx; desc_idx < vq->vq_nentries;
563 		     desc_idx++)
564 			vq->vq_ring.avail->ring[desc_idx] = desc_idx;
565 	}
566 
567 	VIRTQUEUE_DUMP(vq);
568 
569 	dev->data->tx_queues[queue_idx] = txvq;
570 	return 0;
571 }
572 
573 static void
574 virtio_discard_rxbuf(struct virtqueue *vq, struct rte_mbuf *m)
575 {
576 	int error;
577 	/*
578 	 * Requeue the discarded mbuf. This should always be
579 	 * successful since it was just dequeued.
580 	 */
581 	error = virtqueue_enqueue_recv_refill(vq, m);
582 	if (unlikely(error)) {
583 		RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf");
584 		rte_pktmbuf_free(m);
585 	}
586 }
587 
588 static void
589 virtio_update_packet_stats(struct virtnet_stats *stats, struct rte_mbuf *mbuf)
590 {
591 	uint32_t s = mbuf->pkt_len;
592 	struct ether_addr *ea;
593 
594 	if (s == 64) {
595 		stats->size_bins[1]++;
596 	} else if (s > 64 && s < 1024) {
597 		uint32_t bin;
598 
599 		/* count zeros, and offset into correct bin */
600 		bin = (sizeof(s) * 8) - __builtin_clz(s) - 5;
601 		stats->size_bins[bin]++;
602 	} else {
603 		if (s < 64)
604 			stats->size_bins[0]++;
605 		else if (s < 1519)
606 			stats->size_bins[6]++;
607 		else if (s >= 1519)
608 			stats->size_bins[7]++;
609 	}
610 
611 	ea = rte_pktmbuf_mtod(mbuf, struct ether_addr *);
612 	if (is_multicast_ether_addr(ea)) {
613 		if (is_broadcast_ether_addr(ea))
614 			stats->broadcast++;
615 		else
616 			stats->multicast++;
617 	}
618 }
619 
620 /* Optionally fill offload information in structure */
621 static int
622 virtio_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
623 {
624 	struct rte_net_hdr_lens hdr_lens;
625 	uint32_t hdrlen, ptype;
626 	int l4_supported = 0;
627 
628 	/* nothing to do */
629 	if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
630 		return 0;
631 
632 	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
633 
634 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
635 	m->packet_type = ptype;
636 	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
637 	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
638 	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
639 		l4_supported = 1;
640 
641 	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
642 		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
643 		if (hdr->csum_start <= hdrlen && l4_supported) {
644 			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
645 		} else {
646 			/* Unknown proto or tunnel, do sw cksum. We can assume
647 			 * the cksum field is in the first segment since the
648 			 * buffers we provided to the host are large enough.
649 			 * In case of SCTP, this will be wrong since it's a CRC
650 			 * but there's nothing we can do.
651 			 */
652 			uint16_t csum, off;
653 
654 			rte_raw_cksum_mbuf(m, hdr->csum_start,
655 				rte_pktmbuf_pkt_len(m) - hdr->csum_start,
656 				&csum);
657 			if (likely(csum != 0xffff))
658 				csum = ~csum;
659 			off = hdr->csum_offset + hdr->csum_start;
660 			if (rte_pktmbuf_data_len(m) >= off + 1)
661 				*rte_pktmbuf_mtod_offset(m, uint16_t *,
662 					off) = csum;
663 		}
664 	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
665 		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
666 	}
667 
668 	/* GSO request, save required information in mbuf */
669 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
670 		/* Check unsupported modes */
671 		if ((hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) ||
672 		    (hdr->gso_size == 0)) {
673 			return -EINVAL;
674 		}
675 
676 		/* Update mss lengthes in mbuf */
677 		m->tso_segsz = hdr->gso_size;
678 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
679 			case VIRTIO_NET_HDR_GSO_TCPV4:
680 			case VIRTIO_NET_HDR_GSO_TCPV6:
681 				m->ol_flags |= PKT_RX_LRO | \
682 					PKT_RX_L4_CKSUM_NONE;
683 				break;
684 			default:
685 				return -EINVAL;
686 		}
687 	}
688 
689 	return 0;
690 }
691 
692 static inline int
693 rx_offload_enabled(struct virtio_hw *hw)
694 {
695 	return vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_CSUM) ||
696 		vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO4) ||
697 		vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO6);
698 }
699 
/* Size of the on-stack mbuf/length arrays used by the receive functions. */
#define VIRTIO_MBUF_BURST_SZ 64
/*
 * Number of vring descriptors per cache line; virtio_recv_pkts() trims its
 * burst so the used-ring consumer index ends on a multiple of this value.
 */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
702 uint16_t
703 virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
704 {
705 	struct virtnet_rx *rxvq = rx_queue;
706 	struct virtqueue *vq = rxvq->vq;
707 	struct virtio_hw *hw;
708 	struct rte_mbuf *rxm, *new_mbuf;
709 	uint16_t nb_used, num, nb_rx;
710 	uint32_t len[VIRTIO_MBUF_BURST_SZ];
711 	struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
712 	int error;
713 	uint32_t i, nb_enqueued;
714 	uint32_t hdr_size;
715 	int offload;
716 	struct virtio_net_hdr *hdr;
717 
718 	nb_used = VIRTQUEUE_NUSED(vq);
719 
720 	virtio_rmb();
721 
722 	num = (uint16_t)(likely(nb_used <= nb_pkts) ? nb_used : nb_pkts);
723 	num = (uint16_t)(likely(num <= VIRTIO_MBUF_BURST_SZ) ? num : VIRTIO_MBUF_BURST_SZ);
724 	if (likely(num > DESC_PER_CACHELINE))
725 		num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE);
726 
727 	num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, num);
728 	PMD_RX_LOG(DEBUG, "used:%d dequeue:%d", nb_used, num);
729 
730 	hw = vq->hw;
731 	nb_rx = 0;
732 	nb_enqueued = 0;
733 	hdr_size = hw->vtnet_hdr_size;
734 	offload = rx_offload_enabled(hw);
735 
736 	for (i = 0; i < num ; i++) {
737 		rxm = rcv_pkts[i];
738 
739 		PMD_RX_LOG(DEBUG, "packet len:%d", len[i]);
740 
741 		if (unlikely(len[i] < hdr_size + ETHER_HDR_LEN)) {
742 			PMD_RX_LOG(ERR, "Packet drop");
743 			nb_enqueued++;
744 			virtio_discard_rxbuf(vq, rxm);
745 			rxvq->stats.errors++;
746 			continue;
747 		}
748 
749 		rxm->port = rxvq->port_id;
750 		rxm->data_off = RTE_PKTMBUF_HEADROOM;
751 		rxm->ol_flags = 0;
752 		rxm->vlan_tci = 0;
753 
754 		rxm->nb_segs = 1;
755 		rxm->next = NULL;
756 		rxm->pkt_len = (uint32_t)(len[i] - hdr_size);
757 		rxm->data_len = (uint16_t)(len[i] - hdr_size);
758 
759 		hdr = (struct virtio_net_hdr *)((char *)rxm->buf_addr +
760 			RTE_PKTMBUF_HEADROOM - hdr_size);
761 
762 		if (hw->vlan_strip)
763 			rte_vlan_strip(rxm);
764 
765 		if (offload && virtio_rx_offload(rxm, hdr) < 0) {
766 			virtio_discard_rxbuf(vq, rxm);
767 			rxvq->stats.errors++;
768 			continue;
769 		}
770 
771 		VIRTIO_DUMP_PACKET(rxm, rxm->data_len);
772 
773 		rx_pkts[nb_rx++] = rxm;
774 
775 		rxvq->stats.bytes += rx_pkts[nb_rx - 1]->pkt_len;
776 		virtio_update_packet_stats(&rxvq->stats, rxm);
777 	}
778 
779 	rxvq->stats.packets += nb_rx;
780 
781 	/* Allocate new mbuf for the used descriptor */
782 	error = ENOSPC;
783 	while (likely(!virtqueue_full(vq))) {
784 		new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
785 		if (unlikely(new_mbuf == NULL)) {
786 			struct rte_eth_dev *dev
787 				= &rte_eth_devices[rxvq->port_id];
788 			dev->data->rx_mbuf_alloc_failed++;
789 			break;
790 		}
791 		error = virtqueue_enqueue_recv_refill(vq, new_mbuf);
792 		if (unlikely(error)) {
793 			rte_pktmbuf_free(new_mbuf);
794 			break;
795 		}
796 		nb_enqueued++;
797 	}
798 
799 	if (likely(nb_enqueued)) {
800 		vq_update_avail_idx(vq);
801 
802 		if (unlikely(virtqueue_kick_prepare(vq))) {
803 			virtqueue_notify(vq);
804 			PMD_RX_LOG(DEBUG, "Notified");
805 		}
806 	}
807 
808 	return nb_rx;
809 }
810 
811 uint16_t
812 virtio_recv_mergeable_pkts(void *rx_queue,
813 			struct rte_mbuf **rx_pkts,
814 			uint16_t nb_pkts)
815 {
816 	struct virtnet_rx *rxvq = rx_queue;
817 	struct virtqueue *vq = rxvq->vq;
818 	struct virtio_hw *hw;
819 	struct rte_mbuf *rxm, *new_mbuf;
820 	uint16_t nb_used, num, nb_rx;
821 	uint32_t len[VIRTIO_MBUF_BURST_SZ];
822 	struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
823 	struct rte_mbuf *prev;
824 	int error;
825 	uint32_t i, nb_enqueued;
826 	uint32_t seg_num;
827 	uint16_t extra_idx;
828 	uint32_t seg_res;
829 	uint32_t hdr_size;
830 	int offload;
831 
832 	nb_used = VIRTQUEUE_NUSED(vq);
833 
834 	virtio_rmb();
835 
836 	PMD_RX_LOG(DEBUG, "used:%d", nb_used);
837 
838 	hw = vq->hw;
839 	nb_rx = 0;
840 	i = 0;
841 	nb_enqueued = 0;
842 	seg_num = 0;
843 	extra_idx = 0;
844 	seg_res = 0;
845 	hdr_size = hw->vtnet_hdr_size;
846 	offload = rx_offload_enabled(hw);
847 
848 	while (i < nb_used) {
849 		struct virtio_net_hdr_mrg_rxbuf *header;
850 
851 		if (nb_rx == nb_pkts)
852 			break;
853 
854 		num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, 1);
855 		if (num != 1)
856 			continue;
857 
858 		i++;
859 
860 		PMD_RX_LOG(DEBUG, "dequeue:%d", num);
861 		PMD_RX_LOG(DEBUG, "packet len:%d", len[0]);
862 
863 		rxm = rcv_pkts[0];
864 
865 		if (unlikely(len[0] < hdr_size + ETHER_HDR_LEN)) {
866 			PMD_RX_LOG(ERR, "Packet drop");
867 			nb_enqueued++;
868 			virtio_discard_rxbuf(vq, rxm);
869 			rxvq->stats.errors++;
870 			continue;
871 		}
872 
873 		header = (struct virtio_net_hdr_mrg_rxbuf *)((char *)rxm->buf_addr +
874 			RTE_PKTMBUF_HEADROOM - hdr_size);
875 		seg_num = header->num_buffers;
876 
877 		if (seg_num == 0)
878 			seg_num = 1;
879 
880 		rxm->data_off = RTE_PKTMBUF_HEADROOM;
881 		rxm->nb_segs = seg_num;
882 		rxm->next = NULL;
883 		rxm->ol_flags = 0;
884 		rxm->vlan_tci = 0;
885 		rxm->pkt_len = (uint32_t)(len[0] - hdr_size);
886 		rxm->data_len = (uint16_t)(len[0] - hdr_size);
887 
888 		rxm->port = rxvq->port_id;
889 		rx_pkts[nb_rx] = rxm;
890 		prev = rxm;
891 
892 		if (offload && virtio_rx_offload(rxm, &header->hdr) < 0) {
893 			virtio_discard_rxbuf(vq, rxm);
894 			rxvq->stats.errors++;
895 			continue;
896 		}
897 
898 		seg_res = seg_num - 1;
899 
900 		while (seg_res != 0) {
901 			/*
902 			 * Get extra segments for current uncompleted packet.
903 			 */
904 			uint16_t  rcv_cnt =
905 				RTE_MIN(seg_res, RTE_DIM(rcv_pkts));
906 			if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
907 				uint32_t rx_num =
908 					virtqueue_dequeue_burst_rx(vq,
909 					rcv_pkts, len, rcv_cnt);
910 				i += rx_num;
911 				rcv_cnt = rx_num;
912 			} else {
913 				PMD_RX_LOG(ERR,
914 					   "No enough segments for packet.");
915 				nb_enqueued++;
916 				virtio_discard_rxbuf(vq, rxm);
917 				rxvq->stats.errors++;
918 				break;
919 			}
920 
921 			extra_idx = 0;
922 
923 			while (extra_idx < rcv_cnt) {
924 				rxm = rcv_pkts[extra_idx];
925 
926 				rxm->data_off = RTE_PKTMBUF_HEADROOM - hdr_size;
927 				rxm->next = NULL;
928 				rxm->pkt_len = (uint32_t)(len[extra_idx]);
929 				rxm->data_len = (uint16_t)(len[extra_idx]);
930 
931 				if (prev)
932 					prev->next = rxm;
933 
934 				prev = rxm;
935 				rx_pkts[nb_rx]->pkt_len += rxm->pkt_len;
936 				extra_idx++;
937 			};
938 			seg_res -= rcv_cnt;
939 		}
940 
941 		if (hw->vlan_strip)
942 			rte_vlan_strip(rx_pkts[nb_rx]);
943 
944 		VIRTIO_DUMP_PACKET(rx_pkts[nb_rx],
945 			rx_pkts[nb_rx]->data_len);
946 
947 		rxvq->stats.bytes += rx_pkts[nb_rx]->pkt_len;
948 		virtio_update_packet_stats(&rxvq->stats, rx_pkts[nb_rx]);
949 		nb_rx++;
950 	}
951 
952 	rxvq->stats.packets += nb_rx;
953 
954 	/* Allocate new mbuf for the used descriptor */
955 	error = ENOSPC;
956 	while (likely(!virtqueue_full(vq))) {
957 		new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
958 		if (unlikely(new_mbuf == NULL)) {
959 			struct rte_eth_dev *dev
960 				= &rte_eth_devices[rxvq->port_id];
961 			dev->data->rx_mbuf_alloc_failed++;
962 			break;
963 		}
964 		error = virtqueue_enqueue_recv_refill(vq, new_mbuf);
965 		if (unlikely(error)) {
966 			rte_pktmbuf_free(new_mbuf);
967 			break;
968 		}
969 		nb_enqueued++;
970 	}
971 
972 	if (likely(nb_enqueued)) {
973 		vq_update_avail_idx(vq);
974 
975 		if (unlikely(virtqueue_kick_prepare(vq))) {
976 			virtqueue_notify(vq);
977 			PMD_RX_LOG(DEBUG, "Notified");
978 		}
979 	}
980 
981 	return nb_rx;
982 }
983 
984 uint16_t
985 virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
986 {
987 	struct virtnet_tx *txvq = tx_queue;
988 	struct virtqueue *vq = txvq->vq;
989 	struct virtio_hw *hw = vq->hw;
990 	uint16_t hdr_size = hw->vtnet_hdr_size;
991 	uint16_t nb_used, nb_tx;
992 	int error;
993 
994 	if (unlikely(nb_pkts < 1))
995 		return nb_pkts;
996 
997 	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
998 	nb_used = VIRTQUEUE_NUSED(vq);
999 
1000 	virtio_rmb();
1001 	if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
1002 		virtio_xmit_cleanup(vq, nb_used);
1003 
1004 	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
1005 		struct rte_mbuf *txm = tx_pkts[nb_tx];
1006 		int can_push = 0, use_indirect = 0, slots, need;
1007 
1008 		/* Do VLAN tag insertion */
1009 		if (unlikely(txm->ol_flags & PKT_TX_VLAN_PKT)) {
1010 			error = rte_vlan_insert(&txm);
1011 			if (unlikely(error)) {
1012 				rte_pktmbuf_free(txm);
1013 				continue;
1014 			}
1015 		}
1016 
1017 		/* optimize ring usage */
1018 		if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
1019 		    rte_mbuf_refcnt_read(txm) == 1 &&
1020 		    RTE_MBUF_DIRECT(txm) &&
1021 		    txm->nb_segs == 1 &&
1022 		    rte_pktmbuf_headroom(txm) >= hdr_size &&
1023 		    rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
1024 				   __alignof__(struct virtio_net_hdr_mrg_rxbuf)))
1025 			can_push = 1;
1026 		else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
1027 			 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
1028 			use_indirect = 1;
1029 
1030 		/* How many main ring entries are needed to this Tx?
1031 		 * any_layout => number of segments
1032 		 * indirect   => 1
1033 		 * default    => number of segments + 1
1034 		 */
1035 		slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
1036 		need = slots - vq->vq_free_cnt;
1037 
1038 		/* Positive value indicates it need free vring descriptors */
1039 		if (unlikely(need > 0)) {
1040 			nb_used = VIRTQUEUE_NUSED(vq);
1041 			virtio_rmb();
1042 			need = RTE_MIN(need, (int)nb_used);
1043 
1044 			virtio_xmit_cleanup(vq, need);
1045 			need = slots - vq->vq_free_cnt;
1046 			if (unlikely(need > 0)) {
1047 				PMD_TX_LOG(ERR,
1048 					   "No free tx descriptors to transmit");
1049 				break;
1050 			}
1051 		}
1052 
1053 		/* Enqueue Packet buffers */
1054 		virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect, can_push);
1055 
1056 		txvq->stats.bytes += txm->pkt_len;
1057 		virtio_update_packet_stats(&txvq->stats, txm);
1058 	}
1059 
1060 	txvq->stats.packets += nb_tx;
1061 
1062 	if (likely(nb_tx)) {
1063 		vq_update_avail_idx(vq);
1064 
1065 		if (unlikely(virtqueue_kick_prepare(vq))) {
1066 			virtqueue_notify(vq);
1067 			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
1068 		}
1069 	}
1070 
1071 	return nb_tx;
1072 }
1073