xref: /dpdk/lib/vhost/virtio_net.c (revision 65c2bbf41f2258fea8e1639a86598f48d8251756)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4 
5 #include <stdint.h>
6 #include <stdbool.h>
7 #include <linux/virtio_net.h>
8 
9 #include <rte_mbuf.h>
10 #include <rte_memcpy.h>
11 #include <rte_net.h>
12 #include <rte_ether.h>
13 #include <rte_ip.h>
14 #include <rte_vhost.h>
15 #include <rte_tcp.h>
16 #include <rte_udp.h>
17 #include <rte_sctp.h>
18 #include <rte_arp.h>
19 #include <rte_spinlock.h>
20 #include <rte_malloc.h>
21 #include <rte_vhost_async.h>
22 
23 #include "iotlb.h"
24 #include "vhost.h"
25 
26 #define MAX_BATCH_LEN 256
27 
28 #define VHOST_ASYNC_BATCH_THRESHOLD 32
29 
30 static  __rte_always_inline bool
31 rxvq_is_mergeable(struct virtio_net *dev)
32 {
33 	return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
34 }
35 
36 static  __rte_always_inline bool
37 virtio_net_is_inorder(struct virtio_net *dev)
38 {
39 	return dev->features & (1ULL << VIRTIO_F_IN_ORDER);
40 }
41 
42 static bool
43 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
44 {
45 	return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
46 }
47 
48 static inline void
49 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
50 {
51 	struct batch_copy_elem *elem = vq->batch_copy_elems;
52 	uint16_t count = vq->batch_copy_nb_elems;
53 	int i;
54 
55 	for (i = 0; i < count; i++) {
56 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
57 		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
58 					   elem[i].len);
59 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
60 	}
61 
62 	vq->batch_copy_nb_elems = 0;
63 }
64 
65 static inline void
66 do_data_copy_dequeue(struct vhost_virtqueue *vq)
67 {
68 	struct batch_copy_elem *elem = vq->batch_copy_elems;
69 	uint16_t count = vq->batch_copy_nb_elems;
70 	int i;
71 
72 	for (i = 0; i < count; i++)
73 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
74 
75 	vq->batch_copy_nb_elems = 0;
76 }
77 
78 static __rte_always_inline void
79 do_flush_shadow_used_ring_split(struct virtio_net *dev,
80 			struct vhost_virtqueue *vq,
81 			uint16_t to, uint16_t from, uint16_t size)
82 {
83 	rte_memcpy(&vq->used->ring[to],
84 			&vq->shadow_used_split[from],
85 			size * sizeof(struct vring_used_elem));
86 	vhost_log_cache_used_vring(dev, vq,
87 			offsetof(struct vring_used, ring[to]),
88 			size * sizeof(struct vring_used_elem));
89 }
90 
91 static __rte_always_inline void
92 flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
93 {
94 	uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
95 
96 	if (used_idx + vq->shadow_used_idx <= vq->size) {
97 		do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
98 					  vq->shadow_used_idx);
99 	} else {
100 		uint16_t size;
101 
102 		/* update used ring interval [used_idx, vq->size] */
103 		size = vq->size - used_idx;
104 		do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
105 
106 		/* update the left half used ring interval [0, left_size] */
107 		do_flush_shadow_used_ring_split(dev, vq, 0, size,
108 					  vq->shadow_used_idx - size);
109 	}
110 	vq->last_used_idx += vq->shadow_used_idx;
111 
112 	vhost_log_cache_sync(dev, vq);
113 
114 	__atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
115 			   __ATOMIC_RELEASE);
116 	vq->shadow_used_idx = 0;
117 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
118 		sizeof(vq->used->idx));
119 }
120 
121 static __rte_always_inline void
122 update_shadow_used_ring_split(struct vhost_virtqueue *vq,
123 			 uint16_t desc_idx, uint32_t len)
124 {
125 	uint16_t i = vq->shadow_used_idx++;
126 
127 	vq->shadow_used_split[i].id  = desc_idx;
128 	vq->shadow_used_split[i].len = len;
129 }
130 
131 static __rte_always_inline void
132 vhost_flush_enqueue_shadow_packed(struct virtio_net *dev,
133 				  struct vhost_virtqueue *vq)
134 {
135 	int i;
136 	uint16_t used_idx = vq->last_used_idx;
137 	uint16_t head_idx = vq->last_used_idx;
138 	uint16_t head_flags = 0;
139 
140 	/* Split loop in two to save memory barriers */
141 	for (i = 0; i < vq->shadow_used_idx; i++) {
142 		vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
143 		vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
144 
145 		used_idx += vq->shadow_used_packed[i].count;
146 		if (used_idx >= vq->size)
147 			used_idx -= vq->size;
148 	}
149 
150 	/* The ordering for storing desc flags needs to be enforced. */
151 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
152 
153 	for (i = 0; i < vq->shadow_used_idx; i++) {
154 		uint16_t flags;
155 
156 		if (vq->shadow_used_packed[i].len)
157 			flags = VRING_DESC_F_WRITE;
158 		else
159 			flags = 0;
160 
161 		if (vq->used_wrap_counter) {
162 			flags |= VRING_DESC_F_USED;
163 			flags |= VRING_DESC_F_AVAIL;
164 		} else {
165 			flags &= ~VRING_DESC_F_USED;
166 			flags &= ~VRING_DESC_F_AVAIL;
167 		}
168 
169 		if (i > 0) {
170 			vq->desc_packed[vq->last_used_idx].flags = flags;
171 
172 			vhost_log_cache_used_vring(dev, vq,
173 					vq->last_used_idx *
174 					sizeof(struct vring_packed_desc),
175 					sizeof(struct vring_packed_desc));
176 		} else {
177 			head_idx = vq->last_used_idx;
178 			head_flags = flags;
179 		}
180 
181 		vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count);
182 	}
183 
184 	vq->desc_packed[head_idx].flags = head_flags;
185 
186 	vhost_log_cache_used_vring(dev, vq,
187 				head_idx *
188 				sizeof(struct vring_packed_desc),
189 				sizeof(struct vring_packed_desc));
190 
191 	vq->shadow_used_idx = 0;
192 	vhost_log_cache_sync(dev, vq);
193 }
194 
195 static __rte_always_inline void
196 vhost_flush_dequeue_shadow_packed(struct virtio_net *dev,
197 				  struct vhost_virtqueue *vq)
198 {
199 	struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0];
200 
201 	vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id;
202 	/* desc flags is the synchronization point for virtio packed vring */
203 	__atomic_store_n(&vq->desc_packed[vq->shadow_last_used_idx].flags,
204 			 used_elem->flags, __ATOMIC_RELEASE);
205 
206 	vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx *
207 				   sizeof(struct vring_packed_desc),
208 				   sizeof(struct vring_packed_desc));
209 	vq->shadow_used_idx = 0;
210 	vhost_log_cache_sync(dev, vq);
211 }
212 
213 static __rte_always_inline void
214 vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
215 				 struct vhost_virtqueue *vq,
216 				 uint64_t *lens,
217 				 uint16_t *ids)
218 {
219 	uint16_t i;
220 	uint16_t flags;
221 	uint16_t last_used_idx;
222 	struct vring_packed_desc *desc_base;
223 
224 	last_used_idx = vq->last_used_idx;
225 	desc_base = &vq->desc_packed[last_used_idx];
226 
227 	flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter);
228 
229 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
230 		desc_base[i].id = ids[i];
231 		desc_base[i].len = lens[i];
232 	}
233 
234 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
235 
236 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
237 		desc_base[i].flags = flags;
238 	}
239 
240 	vhost_log_cache_used_vring(dev, vq, last_used_idx *
241 				   sizeof(struct vring_packed_desc),
242 				   sizeof(struct vring_packed_desc) *
243 				   PACKED_BATCH_SIZE);
244 	vhost_log_cache_sync(dev, vq);
245 
246 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
247 }
248 
249 static __rte_always_inline void
250 vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
251 					  uint16_t id)
252 {
253 	vq->shadow_used_packed[0].id = id;
254 
255 	if (!vq->shadow_used_idx) {
256 		vq->shadow_last_used_idx = vq->last_used_idx;
257 		vq->shadow_used_packed[0].flags =
258 			PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
259 		vq->shadow_used_packed[0].len = 0;
260 		vq->shadow_used_packed[0].count = 1;
261 		vq->shadow_used_idx++;
262 	}
263 
264 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
265 }
266 
267 static __rte_always_inline void
268 vhost_shadow_dequeue_batch_packed(struct virtio_net *dev,
269 				  struct vhost_virtqueue *vq,
270 				  uint16_t *ids)
271 {
272 	uint16_t flags;
273 	uint16_t i;
274 	uint16_t begin;
275 
276 	flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
277 
278 	if (!vq->shadow_used_idx) {
279 		vq->shadow_last_used_idx = vq->last_used_idx;
280 		vq->shadow_used_packed[0].id  = ids[0];
281 		vq->shadow_used_packed[0].len = 0;
282 		vq->shadow_used_packed[0].count = 1;
283 		vq->shadow_used_packed[0].flags = flags;
284 		vq->shadow_used_idx++;
285 		begin = 1;
286 	} else
287 		begin = 0;
288 
289 	vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) {
290 		vq->desc_packed[vq->last_used_idx + i].id = ids[i];
291 		vq->desc_packed[vq->last_used_idx + i].len = 0;
292 	}
293 
294 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
295 	vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE)
296 		vq->desc_packed[vq->last_used_idx + i].flags = flags;
297 
298 	vhost_log_cache_used_vring(dev, vq, vq->last_used_idx *
299 				   sizeof(struct vring_packed_desc),
300 				   sizeof(struct vring_packed_desc) *
301 				   PACKED_BATCH_SIZE);
302 	vhost_log_cache_sync(dev, vq);
303 
304 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
305 }
306 
307 static __rte_always_inline void
308 vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq,
309 				   uint16_t buf_id,
310 				   uint16_t count)
311 {
312 	uint16_t flags;
313 
314 	flags = vq->desc_packed[vq->last_used_idx].flags;
315 	if (vq->used_wrap_counter) {
316 		flags |= VRING_DESC_F_USED;
317 		flags |= VRING_DESC_F_AVAIL;
318 	} else {
319 		flags &= ~VRING_DESC_F_USED;
320 		flags &= ~VRING_DESC_F_AVAIL;
321 	}
322 
323 	if (!vq->shadow_used_idx) {
324 		vq->shadow_last_used_idx = vq->last_used_idx;
325 
326 		vq->shadow_used_packed[0].id  = buf_id;
327 		vq->shadow_used_packed[0].len = 0;
328 		vq->shadow_used_packed[0].flags = flags;
329 		vq->shadow_used_idx++;
330 	} else {
331 		vq->desc_packed[vq->last_used_idx].id = buf_id;
332 		vq->desc_packed[vq->last_used_idx].len = 0;
333 		vq->desc_packed[vq->last_used_idx].flags = flags;
334 	}
335 
336 	vq_inc_last_used_packed(vq, count);
337 }
338 
339 static __rte_always_inline void
340 vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
341 					   uint16_t buf_id,
342 					   uint16_t count)
343 {
344 	uint16_t flags;
345 
346 	vq->shadow_used_packed[0].id = buf_id;
347 
348 	flags = vq->desc_packed[vq->last_used_idx].flags;
349 	if (vq->used_wrap_counter) {
350 		flags |= VRING_DESC_F_USED;
351 		flags |= VRING_DESC_F_AVAIL;
352 	} else {
353 		flags &= ~VRING_DESC_F_USED;
354 		flags &= ~VRING_DESC_F_AVAIL;
355 	}
356 
357 	if (!vq->shadow_used_idx) {
358 		vq->shadow_last_used_idx = vq->last_used_idx;
359 		vq->shadow_used_packed[0].len = 0;
360 		vq->shadow_used_packed[0].flags = flags;
361 		vq->shadow_used_idx++;
362 	}
363 
364 	vq_inc_last_used_packed(vq, count);
365 }
366 
367 static __rte_always_inline void
368 vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
369 				   uint32_t *len,
370 				   uint16_t *id,
371 				   uint16_t *count,
372 				   uint16_t num_buffers)
373 {
374 	uint16_t i;
375 
376 	for (i = 0; i < num_buffers; i++) {
377 		/* enqueue shadow flush action aligned with batch num */
378 		if (!vq->shadow_used_idx)
379 			vq->shadow_aligned_idx = vq->last_used_idx &
380 				PACKED_BATCH_MASK;
381 		vq->shadow_used_packed[vq->shadow_used_idx].id  = id[i];
382 		vq->shadow_used_packed[vq->shadow_used_idx].len = len[i];
383 		vq->shadow_used_packed[vq->shadow_used_idx].count = count[i];
384 		vq->shadow_aligned_idx += count[i];
385 		vq->shadow_used_idx++;
386 	}
387 }
388 
389 static __rte_always_inline void
390 vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
391 				   struct vhost_virtqueue *vq,
392 				   uint32_t *len,
393 				   uint16_t *id,
394 				   uint16_t *count,
395 				   uint16_t num_buffers)
396 {
397 	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
398 
399 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
400 		do_data_copy_enqueue(dev, vq);
401 		vhost_flush_enqueue_shadow_packed(dev, vq);
402 	}
403 }
404 
405 /* avoid write operation when necessary, to lessen cache issues */
406 #define ASSIGN_UNLESS_EQUAL(var, val) do {	\
407 	if ((var) != (val))			\
408 		(var) = (val);			\
409 } while (0)
410 
411 static __rte_always_inline void
412 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
413 {
414 	uint64_t csum_l4 = m_buf->ol_flags & RTE_MBUF_F_TX_L4_MASK;
415 
416 	if (m_buf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)
417 		csum_l4 |= RTE_MBUF_F_TX_TCP_CKSUM;
418 
419 	if (csum_l4) {
420 		net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
421 		net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
422 
423 		switch (csum_l4) {
424 		case RTE_MBUF_F_TX_TCP_CKSUM:
425 			net_hdr->csum_offset = (offsetof(struct rte_tcp_hdr,
426 						cksum));
427 			break;
428 		case RTE_MBUF_F_TX_UDP_CKSUM:
429 			net_hdr->csum_offset = (offsetof(struct rte_udp_hdr,
430 						dgram_cksum));
431 			break;
432 		case RTE_MBUF_F_TX_SCTP_CKSUM:
433 			net_hdr->csum_offset = (offsetof(struct rte_sctp_hdr,
434 						cksum));
435 			break;
436 		}
437 	} else {
438 		ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
439 		ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
440 		ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
441 	}
442 
443 	/* IP cksum verification cannot be bypassed, then calculate here */
444 	if (m_buf->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) {
445 		struct rte_ipv4_hdr *ipv4_hdr;
446 
447 		ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct rte_ipv4_hdr *,
448 						   m_buf->l2_len);
449 		ipv4_hdr->hdr_checksum = 0;
450 		ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
451 	}
452 
453 	if (m_buf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
454 		if (m_buf->ol_flags & RTE_MBUF_F_TX_IPV4)
455 			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
456 		else
457 			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
458 		net_hdr->gso_size = m_buf->tso_segsz;
459 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
460 					+ m_buf->l4_len;
461 	} else if (m_buf->ol_flags & RTE_MBUF_F_TX_UDP_SEG) {
462 		net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
463 		net_hdr->gso_size = m_buf->tso_segsz;
464 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
465 			m_buf->l4_len;
466 	} else {
467 		ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
468 		ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
469 		ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
470 	}
471 }
472 
473 static __rte_always_inline int
474 map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
475 		struct buf_vector *buf_vec, uint16_t *vec_idx,
476 		uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
477 {
478 	uint16_t vec_id = *vec_idx;
479 
480 	while (desc_len) {
481 		uint64_t desc_addr;
482 		uint64_t desc_chunck_len = desc_len;
483 
484 		if (unlikely(vec_id >= BUF_VECTOR_MAX))
485 			return -1;
486 
487 		desc_addr = vhost_iova_to_vva(dev, vq,
488 				desc_iova,
489 				&desc_chunck_len,
490 				perm);
491 		if (unlikely(!desc_addr))
492 			return -1;
493 
494 		rte_prefetch0((void *)(uintptr_t)desc_addr);
495 
496 		buf_vec[vec_id].buf_iova = desc_iova;
497 		buf_vec[vec_id].buf_addr = desc_addr;
498 		buf_vec[vec_id].buf_len  = desc_chunck_len;
499 
500 		desc_len -= desc_chunck_len;
501 		desc_iova += desc_chunck_len;
502 		vec_id++;
503 	}
504 	*vec_idx = vec_id;
505 
506 	return 0;
507 }
508 
509 static __rte_always_inline int
510 fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
511 			 uint32_t avail_idx, uint16_t *vec_idx,
512 			 struct buf_vector *buf_vec, uint16_t *desc_chain_head,
513 			 uint32_t *desc_chain_len, uint8_t perm)
514 {
515 	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
516 	uint16_t vec_id = *vec_idx;
517 	uint32_t len    = 0;
518 	uint64_t dlen;
519 	uint32_t nr_descs = vq->size;
520 	uint32_t cnt    = 0;
521 	struct vring_desc *descs = vq->desc;
522 	struct vring_desc *idesc = NULL;
523 
524 	if (unlikely(idx >= vq->size))
525 		return -1;
526 
527 	*desc_chain_head = idx;
528 
529 	if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
530 		dlen = vq->desc[idx].len;
531 		nr_descs = dlen / sizeof(struct vring_desc);
532 		if (unlikely(nr_descs > vq->size))
533 			return -1;
534 
535 		descs = (struct vring_desc *)(uintptr_t)
536 			vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
537 						&dlen,
538 						VHOST_ACCESS_RO);
539 		if (unlikely(!descs))
540 			return -1;
541 
542 		if (unlikely(dlen < vq->desc[idx].len)) {
543 			/*
544 			 * The indirect desc table is not contiguous
545 			 * in process VA space, we have to copy it.
546 			 */
547 			idesc = vhost_alloc_copy_ind_table(dev, vq,
548 					vq->desc[idx].addr, vq->desc[idx].len);
549 			if (unlikely(!idesc))
550 				return -1;
551 
552 			descs = idesc;
553 		}
554 
555 		idx = 0;
556 	}
557 
558 	while (1) {
559 		if (unlikely(idx >= nr_descs || cnt++ >= nr_descs)) {
560 			free_ind_table(idesc);
561 			return -1;
562 		}
563 
564 		dlen = descs[idx].len;
565 		len += dlen;
566 
567 		if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
568 						descs[idx].addr, dlen,
569 						perm))) {
570 			free_ind_table(idesc);
571 			return -1;
572 		}
573 
574 		if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
575 			break;
576 
577 		idx = descs[idx].next;
578 	}
579 
580 	*desc_chain_len = len;
581 	*vec_idx = vec_id;
582 
583 	if (unlikely(!!idesc))
584 		free_ind_table(idesc);
585 
586 	return 0;
587 }
588 
589 /*
590  * Returns -1 on fail, 0 on success
591  */
592 static inline int
593 reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
594 				uint32_t size, struct buf_vector *buf_vec,
595 				uint16_t *num_buffers, uint16_t avail_head,
596 				uint16_t *nr_vec)
597 {
598 	uint16_t cur_idx;
599 	uint16_t vec_idx = 0;
600 	uint16_t max_tries, tries = 0;
601 
602 	uint16_t head_idx = 0;
603 	uint32_t len = 0;
604 
605 	*num_buffers = 0;
606 	cur_idx  = vq->last_avail_idx;
607 
608 	if (rxvq_is_mergeable(dev))
609 		max_tries = vq->size - 1;
610 	else
611 		max_tries = 1;
612 
613 	while (size > 0) {
614 		if (unlikely(cur_idx == avail_head))
615 			return -1;
616 		/*
617 		 * if we tried all available ring items, and still
618 		 * can't get enough buf, it means something abnormal
619 		 * happened.
620 		 */
621 		if (unlikely(++tries > max_tries))
622 			return -1;
623 
624 		if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
625 						&vec_idx, buf_vec,
626 						&head_idx, &len,
627 						VHOST_ACCESS_RW) < 0))
628 			return -1;
629 		len = RTE_MIN(len, size);
630 		update_shadow_used_ring_split(vq, head_idx, len);
631 		size -= len;
632 
633 		cur_idx++;
634 		*num_buffers += 1;
635 	}
636 
637 	*nr_vec = vec_idx;
638 
639 	return 0;
640 }
641 
642 static __rte_always_inline int
643 fill_vec_buf_packed_indirect(struct virtio_net *dev,
644 			struct vhost_virtqueue *vq,
645 			struct vring_packed_desc *desc, uint16_t *vec_idx,
646 			struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
647 {
648 	uint16_t i;
649 	uint32_t nr_descs;
650 	uint16_t vec_id = *vec_idx;
651 	uint64_t dlen;
652 	struct vring_packed_desc *descs, *idescs = NULL;
653 
654 	dlen = desc->len;
655 	descs = (struct vring_packed_desc *)(uintptr_t)
656 		vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
657 	if (unlikely(!descs))
658 		return -1;
659 
660 	if (unlikely(dlen < desc->len)) {
661 		/*
662 		 * The indirect desc table is not contiguous
663 		 * in process VA space, we have to copy it.
664 		 */
665 		idescs = vhost_alloc_copy_ind_table(dev,
666 				vq, desc->addr, desc->len);
667 		if (unlikely(!idescs))
668 			return -1;
669 
670 		descs = idescs;
671 	}
672 
673 	nr_descs =  desc->len / sizeof(struct vring_packed_desc);
674 	if (unlikely(nr_descs >= vq->size)) {
675 		free_ind_table(idescs);
676 		return -1;
677 	}
678 
679 	for (i = 0; i < nr_descs; i++) {
680 		if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
681 			free_ind_table(idescs);
682 			return -1;
683 		}
684 
685 		dlen = descs[i].len;
686 		*len += dlen;
687 		if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
688 						descs[i].addr, dlen,
689 						perm)))
690 			return -1;
691 	}
692 	*vec_idx = vec_id;
693 
694 	if (unlikely(!!idescs))
695 		free_ind_table(idescs);
696 
697 	return 0;
698 }
699 
700 static __rte_always_inline int
701 fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
702 				uint16_t avail_idx, uint16_t *desc_count,
703 				struct buf_vector *buf_vec, uint16_t *vec_idx,
704 				uint16_t *buf_id, uint32_t *len, uint8_t perm)
705 {
706 	bool wrap_counter = vq->avail_wrap_counter;
707 	struct vring_packed_desc *descs = vq->desc_packed;
708 	uint16_t vec_id = *vec_idx;
709 	uint64_t dlen;
710 
711 	if (avail_idx < vq->last_avail_idx)
712 		wrap_counter ^= 1;
713 
714 	/*
715 	 * Perform a load-acquire barrier in desc_is_avail to
716 	 * enforce the ordering between desc flags and desc
717 	 * content.
718 	 */
719 	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
720 		return -1;
721 
722 	*desc_count = 0;
723 	*len = 0;
724 
725 	while (1) {
726 		if (unlikely(vec_id >= BUF_VECTOR_MAX))
727 			return -1;
728 
729 		if (unlikely(*desc_count >= vq->size))
730 			return -1;
731 
732 		*desc_count += 1;
733 		*buf_id = descs[avail_idx].id;
734 
735 		if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
736 			if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
737 							&descs[avail_idx],
738 							&vec_id, buf_vec,
739 							len, perm) < 0))
740 				return -1;
741 		} else {
742 			dlen = descs[avail_idx].len;
743 			*len += dlen;
744 
745 			if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
746 							descs[avail_idx].addr,
747 							dlen,
748 							perm)))
749 				return -1;
750 		}
751 
752 		if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
753 			break;
754 
755 		if (++avail_idx >= vq->size) {
756 			avail_idx -= vq->size;
757 			wrap_counter ^= 1;
758 		}
759 	}
760 
761 	*vec_idx = vec_id;
762 
763 	return 0;
764 }
765 
766 static __rte_noinline void
767 copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
768 		struct buf_vector *buf_vec,
769 		struct virtio_net_hdr_mrg_rxbuf *hdr)
770 {
771 	uint64_t len;
772 	uint64_t remain = dev->vhost_hlen;
773 	uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
774 	uint64_t iova = buf_vec->buf_iova;
775 
776 	while (remain) {
777 		len = RTE_MIN(remain,
778 				buf_vec->buf_len);
779 		dst = buf_vec->buf_addr;
780 		rte_memcpy((void *)(uintptr_t)dst,
781 				(void *)(uintptr_t)src,
782 				len);
783 
784 		PRINT_PACKET(dev, (uintptr_t)dst,
785 				(uint32_t)len, 0);
786 		vhost_log_cache_write_iova(dev, vq,
787 				iova, len);
788 
789 		remain -= len;
790 		iova += len;
791 		src += len;
792 		buf_vec++;
793 	}
794 }
795 
796 static __rte_always_inline int
797 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
798 			    struct rte_mbuf *m, struct buf_vector *buf_vec,
799 			    uint16_t nr_vec, uint16_t num_buffers)
800 {
801 	uint32_t vec_idx = 0;
802 	uint32_t mbuf_offset, mbuf_avail;
803 	uint32_t buf_offset, buf_avail;
804 	uint64_t buf_addr, buf_iova, buf_len;
805 	uint32_t cpy_len;
806 	uint64_t hdr_addr;
807 	struct rte_mbuf *hdr_mbuf;
808 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
809 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
810 	int error = 0;
811 
812 	if (unlikely(m == NULL)) {
813 		error = -1;
814 		goto out;
815 	}
816 
817 	buf_addr = buf_vec[vec_idx].buf_addr;
818 	buf_iova = buf_vec[vec_idx].buf_iova;
819 	buf_len = buf_vec[vec_idx].buf_len;
820 
821 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
822 		error = -1;
823 		goto out;
824 	}
825 
826 	hdr_mbuf = m;
827 	hdr_addr = buf_addr;
828 	if (unlikely(buf_len < dev->vhost_hlen)) {
829 		memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
830 		hdr = &tmp_hdr;
831 	} else
832 		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
833 
834 	VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
835 		dev->vid, num_buffers);
836 
837 	if (unlikely(buf_len < dev->vhost_hlen)) {
838 		buf_offset = dev->vhost_hlen - buf_len;
839 		vec_idx++;
840 		buf_addr = buf_vec[vec_idx].buf_addr;
841 		buf_iova = buf_vec[vec_idx].buf_iova;
842 		buf_len = buf_vec[vec_idx].buf_len;
843 		buf_avail = buf_len - buf_offset;
844 	} else {
845 		buf_offset = dev->vhost_hlen;
846 		buf_avail = buf_len - dev->vhost_hlen;
847 	}
848 
849 	mbuf_avail  = rte_pktmbuf_data_len(m);
850 	mbuf_offset = 0;
851 	while (mbuf_avail != 0 || m->next != NULL) {
852 		/* done with current buf, get the next one */
853 		if (buf_avail == 0) {
854 			vec_idx++;
855 			if (unlikely(vec_idx >= nr_vec)) {
856 				error = -1;
857 				goto out;
858 			}
859 
860 			buf_addr = buf_vec[vec_idx].buf_addr;
861 			buf_iova = buf_vec[vec_idx].buf_iova;
862 			buf_len = buf_vec[vec_idx].buf_len;
863 
864 			buf_offset = 0;
865 			buf_avail  = buf_len;
866 		}
867 
868 		/* done with current mbuf, get the next one */
869 		if (mbuf_avail == 0) {
870 			m = m->next;
871 
872 			mbuf_offset = 0;
873 			mbuf_avail  = rte_pktmbuf_data_len(m);
874 		}
875 
876 		if (hdr_addr) {
877 			virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
878 			if (rxvq_is_mergeable(dev))
879 				ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
880 						num_buffers);
881 
882 			if (unlikely(hdr == &tmp_hdr)) {
883 				copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
884 			} else {
885 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
886 						dev->vhost_hlen, 0);
887 				vhost_log_cache_write_iova(dev, vq,
888 						buf_vec[0].buf_iova,
889 						dev->vhost_hlen);
890 			}
891 
892 			hdr_addr = 0;
893 		}
894 
895 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
896 
897 		if (likely(cpy_len > MAX_BATCH_LEN ||
898 					vq->batch_copy_nb_elems >= vq->size)) {
899 			rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
900 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
901 				cpy_len);
902 			vhost_log_cache_write_iova(dev, vq,
903 						   buf_iova + buf_offset,
904 						   cpy_len);
905 			PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
906 				cpy_len, 0);
907 		} else {
908 			batch_copy[vq->batch_copy_nb_elems].dst =
909 				(void *)((uintptr_t)(buf_addr + buf_offset));
910 			batch_copy[vq->batch_copy_nb_elems].src =
911 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
912 			batch_copy[vq->batch_copy_nb_elems].log_addr =
913 				buf_iova + buf_offset;
914 			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
915 			vq->batch_copy_nb_elems++;
916 		}
917 
918 		mbuf_avail  -= cpy_len;
919 		mbuf_offset += cpy_len;
920 		buf_avail  -= cpy_len;
921 		buf_offset += cpy_len;
922 	}
923 
924 out:
925 
926 	return error;
927 }
928 
929 static __rte_always_inline void
930 async_fill_vec(struct iovec *v, void *base, size_t len)
931 {
932 	v->iov_base = base;
933 	v->iov_len = len;
934 }
935 
936 static __rte_always_inline void
937 async_fill_iter(struct rte_vhost_iov_iter *it, size_t count,
938 	struct iovec *vec, unsigned long nr_seg)
939 {
940 	it->offset = 0;
941 	it->count = count;
942 
943 	if (count) {
944 		it->iov = vec;
945 		it->nr_segs = nr_seg;
946 	} else {
947 		it->iov = 0;
948 		it->nr_segs = 0;
949 	}
950 }
951 
952 static __rte_always_inline void
953 async_fill_desc(struct rte_vhost_async_desc *desc,
954 	struct rte_vhost_iov_iter *src, struct rte_vhost_iov_iter *dst)
955 {
956 	desc->src = src;
957 	desc->dst = dst;
958 }
959 
960 static __rte_always_inline int
961 async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
962 			struct rte_mbuf *m, struct buf_vector *buf_vec,
963 			uint16_t nr_vec, uint16_t num_buffers,
964 			struct iovec *src_iovec, struct iovec *dst_iovec,
965 			struct rte_vhost_iov_iter *src_it,
966 			struct rte_vhost_iov_iter *dst_it)
967 {
968 	struct rte_mbuf *hdr_mbuf;
969 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
970 	uint64_t buf_addr, buf_iova;
971 	uint64_t hdr_addr;
972 	uint64_t mapped_len;
973 	uint32_t vec_idx = 0;
974 	uint32_t mbuf_offset, mbuf_avail;
975 	uint32_t buf_offset, buf_avail;
976 	uint32_t cpy_len, buf_len;
977 	int error = 0;
978 
979 	uint32_t tlen = 0;
980 	int tvec_idx = 0;
981 	void *hpa;
982 
983 	if (unlikely(m == NULL)) {
984 		error = -1;
985 		goto out;
986 	}
987 
988 	buf_addr = buf_vec[vec_idx].buf_addr;
989 	buf_iova = buf_vec[vec_idx].buf_iova;
990 	buf_len = buf_vec[vec_idx].buf_len;
991 
992 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
993 		error = -1;
994 		goto out;
995 	}
996 
997 	hdr_mbuf = m;
998 	hdr_addr = buf_addr;
999 	if (unlikely(buf_len < dev->vhost_hlen)) {
1000 		memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
1001 		hdr = &tmp_hdr;
1002 	} else
1003 		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
1004 
1005 	VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
1006 		dev->vid, num_buffers);
1007 
1008 	if (unlikely(buf_len < dev->vhost_hlen)) {
1009 		buf_offset = dev->vhost_hlen - buf_len;
1010 		vec_idx++;
1011 		buf_addr = buf_vec[vec_idx].buf_addr;
1012 		buf_iova = buf_vec[vec_idx].buf_iova;
1013 		buf_len = buf_vec[vec_idx].buf_len;
1014 		buf_avail = buf_len - buf_offset;
1015 	} else {
1016 		buf_offset = dev->vhost_hlen;
1017 		buf_avail = buf_len - dev->vhost_hlen;
1018 	}
1019 
1020 	mbuf_avail  = rte_pktmbuf_data_len(m);
1021 	mbuf_offset = 0;
1022 
1023 	while (mbuf_avail != 0 || m->next != NULL) {
1024 		/* done with current buf, get the next one */
1025 		if (buf_avail == 0) {
1026 			vec_idx++;
1027 			if (unlikely(vec_idx >= nr_vec)) {
1028 				error = -1;
1029 				goto out;
1030 			}
1031 
1032 			buf_addr = buf_vec[vec_idx].buf_addr;
1033 			buf_iova = buf_vec[vec_idx].buf_iova;
1034 			buf_len = buf_vec[vec_idx].buf_len;
1035 
1036 			buf_offset = 0;
1037 			buf_avail = buf_len;
1038 		}
1039 
1040 		/* done with current mbuf, get the next one */
1041 		if (mbuf_avail == 0) {
1042 			m = m->next;
1043 
1044 			mbuf_offset = 0;
1045 			mbuf_avail = rte_pktmbuf_data_len(m);
1046 		}
1047 
1048 		if (hdr_addr) {
1049 			virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
1050 			if (rxvq_is_mergeable(dev))
1051 				ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
1052 						num_buffers);
1053 
1054 			if (unlikely(hdr == &tmp_hdr)) {
1055 				copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
1056 			} else {
1057 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
1058 						dev->vhost_hlen, 0);
1059 				vhost_log_cache_write_iova(dev, vq,
1060 						buf_vec[0].buf_iova,
1061 						dev->vhost_hlen);
1062 			}
1063 
1064 			hdr_addr = 0;
1065 		}
1066 
1067 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
1068 
1069 		while (unlikely(cpy_len)) {
1070 			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
1071 					buf_iova + buf_offset,
1072 					cpy_len, &mapped_len);
1073 			if (unlikely(!hpa)) {
1074 				VHOST_LOG_DATA(ERR, "(%d) %s: failed to get hpa.\n",
1075 				dev->vid, __func__);
1076 				error = -1;
1077 				goto out;
1078 			}
1079 
1080 			async_fill_vec(src_iovec + tvec_idx,
1081 				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
1082 				mbuf_offset), (size_t)mapped_len);
1083 			async_fill_vec(dst_iovec + tvec_idx,
1084 					hpa, (size_t)mapped_len);
1085 
1086 			tlen += (uint32_t)mapped_len;
1087 			cpy_len -= (uint32_t)mapped_len;
1088 			mbuf_avail  -= (uint32_t)mapped_len;
1089 			mbuf_offset += (uint32_t)mapped_len;
1090 			buf_avail  -= (uint32_t)mapped_len;
1091 			buf_offset += (uint32_t)mapped_len;
1092 			tvec_idx++;
1093 		}
1094 	}
1095 
1096 	async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
1097 	async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
1098 out:
1099 	return error;
1100 }
1101 
1102 static __rte_always_inline int
1103 vhost_enqueue_single_packed(struct virtio_net *dev,
1104 			    struct vhost_virtqueue *vq,
1105 			    struct rte_mbuf *pkt,
1106 			    struct buf_vector *buf_vec,
1107 			    uint16_t *nr_descs)
1108 {
1109 	uint16_t nr_vec = 0;
1110 	uint16_t avail_idx = vq->last_avail_idx;
1111 	uint16_t max_tries, tries = 0;
1112 	uint16_t buf_id = 0;
1113 	uint32_t len = 0;
1114 	uint16_t desc_count;
1115 	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1116 	uint16_t num_buffers = 0;
1117 	uint32_t buffer_len[vq->size];
1118 	uint16_t buffer_buf_id[vq->size];
1119 	uint16_t buffer_desc_count[vq->size];
1120 
1121 	if (rxvq_is_mergeable(dev))
1122 		max_tries = vq->size - 1;
1123 	else
1124 		max_tries = 1;
1125 
1126 	while (size > 0) {
1127 		/*
1128 		 * if we tried all available ring items, and still
1129 		 * can't get enough buf, it means something abnormal
1130 		 * happened.
1131 		 */
1132 		if (unlikely(++tries > max_tries))
1133 			return -1;
1134 
1135 		if (unlikely(fill_vec_buf_packed(dev, vq,
1136 						avail_idx, &desc_count,
1137 						buf_vec, &nr_vec,
1138 						&buf_id, &len,
1139 						VHOST_ACCESS_RW) < 0))
1140 			return -1;
1141 
1142 		len = RTE_MIN(len, size);
1143 		size -= len;
1144 
1145 		buffer_len[num_buffers] = len;
1146 		buffer_buf_id[num_buffers] = buf_id;
1147 		buffer_desc_count[num_buffers] = desc_count;
1148 		num_buffers += 1;
1149 
1150 		*nr_descs += desc_count;
1151 		avail_idx += desc_count;
1152 		if (avail_idx >= vq->size)
1153 			avail_idx -= vq->size;
1154 	}
1155 
1156 	if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0)
1157 		return -1;
1158 
1159 	vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id,
1160 					   buffer_desc_count, num_buffers);
1161 
1162 	return 0;
1163 }
1164 
1165 static __rte_noinline uint32_t
1166 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
1167 	struct rte_mbuf **pkts, uint32_t count)
1168 {
1169 	uint32_t pkt_idx = 0;
1170 	uint16_t num_buffers;
1171 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1172 	uint16_t avail_head;
1173 
1174 	/*
1175 	 * The ordering between avail index and
1176 	 * desc reads needs to be enforced.
1177 	 */
1178 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1179 
1180 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1181 
1182 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1183 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1184 		uint16_t nr_vec = 0;
1185 
1186 		if (unlikely(reserve_avail_buf_split(dev, vq,
1187 						pkt_len, buf_vec, &num_buffers,
1188 						avail_head, &nr_vec) < 0)) {
1189 			VHOST_LOG_DATA(DEBUG,
1190 				"(%d) failed to get enough desc from vring\n",
1191 				dev->vid);
1192 			vq->shadow_used_idx -= num_buffers;
1193 			break;
1194 		}
1195 
1196 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1197 			dev->vid, vq->last_avail_idx,
1198 			vq->last_avail_idx + num_buffers);
1199 
1200 		if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
1201 						buf_vec, nr_vec,
1202 						num_buffers) < 0) {
1203 			vq->shadow_used_idx -= num_buffers;
1204 			break;
1205 		}
1206 
1207 		vq->last_avail_idx += num_buffers;
1208 	}
1209 
1210 	do_data_copy_enqueue(dev, vq);
1211 
1212 	if (likely(vq->shadow_used_idx)) {
1213 		flush_shadow_used_ring_split(dev, vq);
1214 		vhost_vring_call_split(dev, vq);
1215 	}
1216 
1217 	return pkt_idx;
1218 }
1219 
1220 static __rte_always_inline int
1221 virtio_dev_rx_sync_batch_check(struct virtio_net *dev,
1222 			   struct vhost_virtqueue *vq,
1223 			   struct rte_mbuf **pkts,
1224 			   uint64_t *desc_addrs,
1225 			   uint64_t *lens)
1226 {
1227 	bool wrap_counter = vq->avail_wrap_counter;
1228 	struct vring_packed_desc *descs = vq->desc_packed;
1229 	uint16_t avail_idx = vq->last_avail_idx;
1230 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1231 	uint16_t i;
1232 
1233 	if (unlikely(avail_idx & PACKED_BATCH_MASK))
1234 		return -1;
1235 
1236 	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
1237 		return -1;
1238 
1239 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1240 		if (unlikely(pkts[i]->next != NULL))
1241 			return -1;
1242 		if (unlikely(!desc_is_avail(&descs[avail_idx + i],
1243 					    wrap_counter)))
1244 			return -1;
1245 	}
1246 
1247 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1248 		lens[i] = descs[avail_idx + i].len;
1249 
1250 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1251 		if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
1252 			return -1;
1253 	}
1254 
1255 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1256 		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
1257 						  descs[avail_idx + i].addr,
1258 						  &lens[i],
1259 						  VHOST_ACCESS_RW);
1260 
1261 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1262 		if (unlikely(!desc_addrs[i]))
1263 			return -1;
1264 		if (unlikely(lens[i] != descs[avail_idx + i].len))
1265 			return -1;
1266 	}
1267 
1268 	return 0;
1269 }
1270 
1271 static __rte_always_inline void
1272 virtio_dev_rx_batch_packed_copy(struct virtio_net *dev,
1273 			   struct vhost_virtqueue *vq,
1274 			   struct rte_mbuf **pkts,
1275 			   uint64_t *desc_addrs,
1276 			   uint64_t *lens)
1277 {
1278 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1279 	struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE];
1280 	struct vring_packed_desc *descs = vq->desc_packed;
1281 	uint16_t avail_idx = vq->last_avail_idx;
1282 	uint16_t ids[PACKED_BATCH_SIZE];
1283 	uint16_t i;
1284 
1285 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1286 		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
1287 		hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)
1288 					(uintptr_t)desc_addrs[i];
1289 		lens[i] = pkts[i]->pkt_len +
1290 			sizeof(struct virtio_net_hdr_mrg_rxbuf);
1291 	}
1292 
1293 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1294 		virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr);
1295 
1296 	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
1297 
1298 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1299 		rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset),
1300 			   rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
1301 			   pkts[i]->pkt_len);
1302 	}
1303 
1304 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1305 		vhost_log_cache_write_iova(dev, vq, descs[avail_idx + i].addr,
1306 					   lens[i]);
1307 
1308 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1309 		ids[i] = descs[avail_idx + i].id;
1310 
1311 	vhost_flush_enqueue_batch_packed(dev, vq, lens, ids);
1312 }
1313 
1314 static __rte_always_inline int
1315 virtio_dev_rx_sync_batch_packed(struct virtio_net *dev,
1316 			   struct vhost_virtqueue *vq,
1317 			   struct rte_mbuf **pkts)
1318 {
1319 	uint64_t desc_addrs[PACKED_BATCH_SIZE];
1320 	uint64_t lens[PACKED_BATCH_SIZE];
1321 
1322 	if (virtio_dev_rx_sync_batch_check(dev, vq, pkts, desc_addrs, lens) == -1)
1323 		return -1;
1324 
1325 	if (vq->shadow_used_idx) {
1326 		do_data_copy_enqueue(dev, vq);
1327 		vhost_flush_enqueue_shadow_packed(dev, vq);
1328 	}
1329 
1330 	virtio_dev_rx_batch_packed_copy(dev, vq, pkts, desc_addrs, lens);
1331 
1332 	return 0;
1333 }
1334 
1335 static __rte_always_inline int16_t
1336 virtio_dev_rx_single_packed(struct virtio_net *dev,
1337 			    struct vhost_virtqueue *vq,
1338 			    struct rte_mbuf *pkt)
1339 {
1340 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1341 	uint16_t nr_descs = 0;
1342 
1343 	if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec,
1344 						 &nr_descs) < 0)) {
1345 		VHOST_LOG_DATA(DEBUG,
1346 				"(%d) failed to get enough desc from vring\n",
1347 				dev->vid);
1348 		return -1;
1349 	}
1350 
1351 	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1352 			dev->vid, vq->last_avail_idx,
1353 			vq->last_avail_idx + nr_descs);
1354 
1355 	vq_inc_last_avail_packed(vq, nr_descs);
1356 
1357 	return 0;
1358 }
1359 
1360 static __rte_noinline uint32_t
1361 virtio_dev_rx_packed(struct virtio_net *dev,
1362 		     struct vhost_virtqueue *__rte_restrict vq,
1363 		     struct rte_mbuf **__rte_restrict pkts,
1364 		     uint32_t count)
1365 {
1366 	uint32_t pkt_idx = 0;
1367 
1368 	do {
1369 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1370 
1371 		if (count - pkt_idx >= PACKED_BATCH_SIZE) {
1372 			if (!virtio_dev_rx_sync_batch_packed(dev, vq,
1373 							&pkts[pkt_idx])) {
1374 				pkt_idx += PACKED_BATCH_SIZE;
1375 				continue;
1376 			}
1377 		}
1378 
1379 		if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]))
1380 			break;
1381 		pkt_idx++;
1382 
1383 	} while (pkt_idx < count);
1384 
1385 	if (vq->shadow_used_idx) {
1386 		do_data_copy_enqueue(dev, vq);
1387 		vhost_flush_enqueue_shadow_packed(dev, vq);
1388 	}
1389 
1390 	if (pkt_idx)
1391 		vhost_vring_call_packed(dev, vq);
1392 
1393 	return pkt_idx;
1394 }
1395 
1396 static __rte_always_inline uint32_t
1397 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
1398 	struct rte_mbuf **pkts, uint32_t count)
1399 {
1400 	struct vhost_virtqueue *vq;
1401 	uint32_t nb_tx = 0;
1402 
1403 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
1404 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
1405 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
1406 			dev->vid, __func__, queue_id);
1407 		return 0;
1408 	}
1409 
1410 	vq = dev->virtqueue[queue_id];
1411 
1412 	rte_spinlock_lock(&vq->access_lock);
1413 
1414 	if (unlikely(!vq->enabled))
1415 		goto out_access_unlock;
1416 
1417 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1418 		vhost_user_iotlb_rd_lock(vq);
1419 
1420 	if (unlikely(!vq->access_ok))
1421 		if (unlikely(vring_translate(dev, vq) < 0))
1422 			goto out;
1423 
1424 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1425 	if (count == 0)
1426 		goto out;
1427 
1428 	if (vq_is_packed(dev))
1429 		nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
1430 	else
1431 		nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
1432 
1433 out:
1434 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1435 		vhost_user_iotlb_rd_unlock(vq);
1436 
1437 out_access_unlock:
1438 	rte_spinlock_unlock(&vq->access_lock);
1439 
1440 	return nb_tx;
1441 }
1442 
1443 uint16_t
1444 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
1445 	struct rte_mbuf **__rte_restrict pkts, uint16_t count)
1446 {
1447 	struct virtio_net *dev = get_device(vid);
1448 
1449 	if (!dev)
1450 		return 0;
1451 
1452 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1453 		VHOST_LOG_DATA(ERR,
1454 			"(%d) %s: built-in vhost net backend is disabled.\n",
1455 			dev->vid, __func__);
1456 		return 0;
1457 	}
1458 
1459 	return virtio_dev_rx(dev, queue_id, pkts, count);
1460 }
1461 
1462 static __rte_always_inline uint16_t
1463 virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
1464 	uint16_t vq_size, uint16_t n_inflight)
1465 {
1466 	return pkts_idx > n_inflight ? (pkts_idx - n_inflight) :
1467 		(vq_size - n_inflight + pkts_idx) % vq_size;
1468 }
1469 
1470 static __rte_always_inline void
1471 store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
1472 		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1473 {
1474 	size_t elem_size = sizeof(struct vring_used_elem);
1475 
1476 	if (d_idx + count <= ring_size) {
1477 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1478 	} else {
1479 		uint16_t size = ring_size - d_idx;
1480 
1481 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1482 		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1483 	}
1484 }
1485 
1486 static __rte_always_inline void
1487 store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
1488 		struct vring_used_elem_packed *d_ring,
1489 		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1490 {
1491 	size_t elem_size = sizeof(struct vring_used_elem_packed);
1492 
1493 	if (d_idx + count <= ring_size) {
1494 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1495 	} else {
1496 		uint16_t size = ring_size - d_idx;
1497 
1498 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1499 		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1500 	}
1501 }
1502 
1503 static __rte_noinline uint32_t
1504 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
1505 	struct vhost_virtqueue *vq, uint16_t queue_id,
1506 	struct rte_mbuf **pkts, uint32_t count)
1507 {
1508 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1509 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
1510 	uint16_t num_buffers;
1511 	uint16_t avail_head;
1512 
1513 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
1514 	struct iovec *vec_pool = vq->vec_pool;
1515 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
1516 	struct iovec *src_iovec = vec_pool;
1517 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
1518 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
1519 	uint32_t n_pkts = 0, pkt_err = 0;
1520 	int32_t n_xfer;
1521 	uint16_t segs_await = 0;
1522 	uint16_t iovec_idx = 0, it_idx = 0, slot_idx = 0;
1523 
1524 	/*
1525 	 * The ordering between avail index and desc reads need to be enforced.
1526 	 */
1527 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1528 
1529 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1530 
1531 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1532 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1533 		uint16_t nr_vec = 0;
1534 
1535 		if (unlikely(reserve_avail_buf_split(dev, vq,
1536 						pkt_len, buf_vec, &num_buffers,
1537 						avail_head, &nr_vec) < 0)) {
1538 			VHOST_LOG_DATA(DEBUG,
1539 				"(%d) failed to get enough desc from vring\n",
1540 				dev->vid);
1541 			vq->shadow_used_idx -= num_buffers;
1542 			break;
1543 		}
1544 
1545 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1546 			dev->vid, vq->last_avail_idx,
1547 			vq->last_avail_idx + num_buffers);
1548 
1549 		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers,
1550 				&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
1551 				&it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
1552 			vq->shadow_used_idx -= num_buffers;
1553 			break;
1554 		}
1555 
1556 		async_fill_desc(&tdes[pkt_burst_idx++], &it_pool[it_idx],
1557 				&it_pool[it_idx + 1]);
1558 
1559 		slot_idx = (vq->async_pkts_idx + pkt_idx) & (vq->size - 1);
1560 		pkts_info[slot_idx].descs = num_buffers;
1561 		pkts_info[slot_idx].mbuf = pkts[pkt_idx];
1562 
1563 		iovec_idx += it_pool[it_idx].nr_segs;
1564 		segs_await += it_pool[it_idx].nr_segs;
1565 		it_idx += 2;
1566 
1567 		vq->last_avail_idx += num_buffers;
1568 
1569 		/*
1570 		 * conditions to trigger async device transfer:
1571 		 * - buffered packet number reaches transfer threshold
1572 		 * - unused async iov number is less than max vhost vector
1573 		 */
1574 		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
1575 			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
1576 			BUF_VECTOR_MAX))) {
1577 			n_xfer = vq->async_ops.transfer_data(dev->vid,
1578 					queue_id, tdes, 0, pkt_burst_idx);
1579 			if (likely(n_xfer >= 0)) {
1580 				n_pkts = n_xfer;
1581 			} else {
1582 				VHOST_LOG_DATA(ERR,
1583 					"(%d) %s: failed to transfer data for queue id %d.\n",
1584 					dev->vid, __func__, queue_id);
1585 				n_pkts = 0;
1586 			}
1587 
1588 			iovec_idx = 0;
1589 			it_idx = 0;
1590 			segs_await = 0;
1591 
1592 			if (unlikely(n_pkts < pkt_burst_idx)) {
1593 				/*
1594 				 * log error packets number here and do actual
1595 				 * error processing when applications poll
1596 				 * completion
1597 				 */
1598 				pkt_err = pkt_burst_idx - n_pkts;
1599 				pkt_idx++;
1600 				pkt_burst_idx = 0;
1601 				break;
1602 			}
1603 
1604 			pkt_burst_idx = 0;
1605 		}
1606 	}
1607 
1608 	if (pkt_burst_idx) {
1609 		n_xfer = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
1610 		if (likely(n_xfer >= 0)) {
1611 			n_pkts = n_xfer;
1612 		} else {
1613 			VHOST_LOG_DATA(ERR, "(%d) %s: failed to transfer data for queue id %d.\n",
1614 				dev->vid, __func__, queue_id);
1615 			n_pkts = 0;
1616 		}
1617 
1618 		if (unlikely(n_pkts < pkt_burst_idx))
1619 			pkt_err = pkt_burst_idx - n_pkts;
1620 	}
1621 
1622 	if (unlikely(pkt_err)) {
1623 		uint16_t num_descs = 0;
1624 
1625 		/* update number of completed packets */
1626 		pkt_idx -= pkt_err;
1627 
1628 		/* calculate the sum of descriptors to revert */
1629 		while (pkt_err-- > 0) {
1630 			num_descs += pkts_info[slot_idx & (vq->size - 1)].descs;
1631 			slot_idx--;
1632 		}
1633 
1634 		/* recover shadow used ring and available ring */
1635 		vq->shadow_used_idx -= num_descs;
1636 		vq->last_avail_idx -= num_descs;
1637 	}
1638 
1639 	/* keep used descriptors */
1640 	if (likely(vq->shadow_used_idx)) {
1641 		uint16_t to = vq->async_desc_idx_split & (vq->size - 1);
1642 
1643 		store_dma_desc_info_split(vq->shadow_used_split,
1644 				vq->async_descs_split, vq->size, 0, to,
1645 				vq->shadow_used_idx);
1646 
1647 		vq->async_desc_idx_split += vq->shadow_used_idx;
1648 		vq->async_pkts_idx += pkt_idx;
1649 		vq->async_pkts_inflight_n += pkt_idx;
1650 		vq->shadow_used_idx = 0;
1651 	}
1652 
1653 	return pkt_idx;
1654 }
1655 
1656 static __rte_always_inline void
1657 vhost_update_used_packed(struct vhost_virtqueue *vq,
1658 			struct vring_used_elem_packed *shadow_ring,
1659 			uint16_t count)
1660 {
1661 	int i;
1662 	uint16_t used_idx = vq->last_used_idx;
1663 	uint16_t head_idx = vq->last_used_idx;
1664 	uint16_t head_flags = 0;
1665 
1666 	if (count == 0)
1667 		return;
1668 
1669 	/* Split loop in two to save memory barriers */
1670 	for (i = 0; i < count; i++) {
1671 		vq->desc_packed[used_idx].id = shadow_ring[i].id;
1672 		vq->desc_packed[used_idx].len = shadow_ring[i].len;
1673 
1674 		used_idx += shadow_ring[i].count;
1675 		if (used_idx >= vq->size)
1676 			used_idx -= vq->size;
1677 	}
1678 
1679 	/* The ordering for storing desc flags needs to be enforced. */
1680 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
1681 
1682 	for (i = 0; i < count; i++) {
1683 		uint16_t flags;
1684 
1685 		if (vq->shadow_used_packed[i].len)
1686 			flags = VRING_DESC_F_WRITE;
1687 		else
1688 			flags = 0;
1689 
1690 		if (vq->used_wrap_counter) {
1691 			flags |= VRING_DESC_F_USED;
1692 			flags |= VRING_DESC_F_AVAIL;
1693 		} else {
1694 			flags &= ~VRING_DESC_F_USED;
1695 			flags &= ~VRING_DESC_F_AVAIL;
1696 		}
1697 
1698 		if (i > 0) {
1699 			vq->desc_packed[vq->last_used_idx].flags = flags;
1700 		} else {
1701 			head_idx = vq->last_used_idx;
1702 			head_flags = flags;
1703 		}
1704 
1705 		vq_inc_last_used_packed(vq, shadow_ring[i].count);
1706 	}
1707 
1708 	vq->desc_packed[head_idx].flags = head_flags;
1709 }
1710 
1711 static __rte_always_inline int
1712 vhost_enqueue_async_packed(struct virtio_net *dev,
1713 			    struct vhost_virtqueue *vq,
1714 			    struct rte_mbuf *pkt,
1715 			    struct buf_vector *buf_vec,
1716 			    uint16_t *nr_descs,
1717 			    uint16_t *nr_buffers,
1718 			    struct iovec *src_iovec, struct iovec *dst_iovec,
1719 			    struct rte_vhost_iov_iter *src_it,
1720 			    struct rte_vhost_iov_iter *dst_it)
1721 {
1722 	uint16_t nr_vec = 0;
1723 	uint16_t avail_idx = vq->last_avail_idx;
1724 	uint16_t max_tries, tries = 0;
1725 	uint16_t buf_id = 0;
1726 	uint32_t len = 0;
1727 	uint16_t desc_count = 0;
1728 	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1729 	uint32_t buffer_len[vq->size];
1730 	uint16_t buffer_buf_id[vq->size];
1731 	uint16_t buffer_desc_count[vq->size];
1732 
1733 	if (rxvq_is_mergeable(dev))
1734 		max_tries = vq->size - 1;
1735 	else
1736 		max_tries = 1;
1737 
1738 	while (size > 0) {
1739 		/*
1740 		 * if we tried all available ring items, and still
1741 		 * can't get enough buf, it means something abnormal
1742 		 * happened.
1743 		 */
1744 		if (unlikely(++tries > max_tries))
1745 			return -1;
1746 
1747 		if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx, &desc_count, buf_vec, &nr_vec,
1748 						&buf_id, &len, VHOST_ACCESS_RW) < 0))
1749 			return -1;
1750 
1751 		len = RTE_MIN(len, size);
1752 		size -= len;
1753 
1754 		buffer_len[*nr_buffers] = len;
1755 		buffer_buf_id[*nr_buffers] = buf_id;
1756 		buffer_desc_count[*nr_buffers] = desc_count;
1757 		*nr_buffers += 1;
1758 		*nr_descs += desc_count;
1759 		avail_idx += desc_count;
1760 		if (avail_idx >= vq->size)
1761 			avail_idx -= vq->size;
1762 	}
1763 
1764 	if (unlikely(async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec,
1765 					*nr_buffers, src_iovec, dst_iovec,
1766 					src_it, dst_it) < 0))
1767 		return -1;
1768 
1769 	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
1770 
1771 	return 0;
1772 }
1773 
1774 static __rte_always_inline int16_t
1775 virtio_dev_rx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
1776 			    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers,
1777 			    struct iovec *src_iovec, struct iovec *dst_iovec,
1778 			    struct rte_vhost_iov_iter *src_it, struct rte_vhost_iov_iter *dst_it)
1779 {
1780 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1781 
1782 	if (unlikely(vhost_enqueue_async_packed(dev, vq, pkt, buf_vec, nr_descs, nr_buffers,
1783 						 src_iovec, dst_iovec,
1784 						 src_it, dst_it) < 0)) {
1785 		VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n", dev->vid);
1786 		return -1;
1787 	}
1788 
1789 	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1790 			dev->vid, vq->last_avail_idx, vq->last_avail_idx + *nr_descs);
1791 
1792 	return 0;
1793 }
1794 
1795 static __rte_always_inline void
1796 dma_error_handler_packed(struct vhost_virtqueue *vq, uint16_t slot_idx,
1797 			uint32_t nr_err, uint32_t *pkt_idx)
1798 {
1799 	uint16_t descs_err = 0;
1800 	uint16_t buffers_err = 0;
1801 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
1802 
1803 	*pkt_idx -= nr_err;
1804 	/* calculate the sum of buffers and descs of DMA-error packets. */
1805 	while (nr_err-- > 0) {
1806 		descs_err += pkts_info[slot_idx % vq->size].descs;
1807 		buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
1808 		slot_idx--;
1809 	}
1810 
1811 	if (vq->last_avail_idx >= descs_err) {
1812 		vq->last_avail_idx -= descs_err;
1813 	} else {
1814 		vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
1815 		vq->avail_wrap_counter ^= 1;
1816 	}
1817 
1818 	vq->shadow_used_idx -= buffers_err;
1819 }
1820 
1821 static __rte_noinline uint32_t
1822 virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
1823 	struct vhost_virtqueue *vq, uint16_t queue_id,
1824 	struct rte_mbuf **pkts, uint32_t count)
1825 {
1826 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
1827 	uint32_t remained = count;
1828 	int32_t n_xfer;
1829 	uint16_t num_buffers;
1830 	uint16_t num_descs;
1831 
1832 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
1833 	struct iovec *vec_pool = vq->vec_pool;
1834 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
1835 	struct iovec *src_iovec = vec_pool;
1836 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
1837 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
1838 	uint32_t n_pkts = 0, pkt_err = 0;
1839 	uint16_t slot_idx = 0;
1840 	uint16_t segs_await = 0;
1841 	uint16_t iovec_idx = 0, it_idx = 0;
1842 
1843 	do {
1844 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1845 
1846 		num_buffers = 0;
1847 		num_descs = 0;
1848 		if (unlikely(virtio_dev_rx_async_packed(dev, vq, pkts[pkt_idx],
1849 						&num_descs, &num_buffers,
1850 						&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
1851 						&it_pool[it_idx], &it_pool[it_idx + 1]) < 0))
1852 			break;
1853 
1854 		slot_idx = (vq->async_pkts_idx + pkt_idx) % vq->size;
1855 
1856 		async_fill_desc(&tdes[pkt_burst_idx++], &it_pool[it_idx],
1857 				&it_pool[it_idx + 1]);
1858 		pkts_info[slot_idx].descs = num_descs;
1859 		pkts_info[slot_idx].nr_buffers = num_buffers;
1860 		pkts_info[slot_idx].mbuf = pkts[pkt_idx];
1861 		iovec_idx += it_pool[it_idx].nr_segs;
1862 		segs_await += it_pool[it_idx].nr_segs;
1863 		it_idx += 2;
1864 
1865 		pkt_idx++;
1866 		remained--;
1867 		vq_inc_last_avail_packed(vq, num_descs);
1868 
1869 		/*
1870 		 * conditions to trigger async device transfer:
1871 		 * - buffered packet number reaches transfer threshold
1872 		 * - unused async iov number is less than max vhost vector
1873 		 */
1874 		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
1875 			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await < BUF_VECTOR_MAX))) {
1876 			n_xfer = vq->async_ops.transfer_data(dev->vid,
1877 					queue_id, tdes, 0, pkt_burst_idx);
1878 			if (likely(n_xfer >= 0)) {
1879 				n_pkts = n_xfer;
1880 			} else {
1881 				VHOST_LOG_DATA(ERR,
1882 					"(%d) %s: failed to transfer data for queue id %d.\n",
1883 					dev->vid, __func__, queue_id);
1884 				n_pkts = 0;
1885 			}
1886 
1887 			iovec_idx = 0;
1888 			it_idx = 0;
1889 			segs_await = 0;
1890 
1891 			if (unlikely(n_pkts < pkt_burst_idx)) {
1892 				/*
1893 				 * log error packets number here and do actual
1894 				 * error processing when applications poll
1895 				 * completion
1896 				 */
1897 				pkt_err = pkt_burst_idx - n_pkts;
1898 				pkt_burst_idx = 0;
1899 				break;
1900 			}
1901 
1902 			pkt_burst_idx = 0;
1903 		}
1904 	} while (pkt_idx < count);
1905 
1906 	if (pkt_burst_idx) {
1907 		n_xfer = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
1908 		if (likely(n_xfer >= 0)) {
1909 			n_pkts = n_xfer;
1910 		} else {
1911 			VHOST_LOG_DATA(ERR, "(%d) %s: failed to transfer data for queue id %d.\n",
1912 				dev->vid, __func__, queue_id);
1913 			n_pkts = 0;
1914 		}
1915 
1916 		if (unlikely(n_pkts < pkt_burst_idx))
1917 			pkt_err = pkt_burst_idx - n_pkts;
1918 	}
1919 
1920 	if (unlikely(pkt_err))
1921 		dma_error_handler_packed(vq, slot_idx, pkt_err, &pkt_idx);
1922 
1923 	if (likely(vq->shadow_used_idx)) {
1924 		/* keep used descriptors. */
1925 		store_dma_desc_info_packed(vq->shadow_used_packed, vq->async_buffers_packed,
1926 					vq->size, 0, vq->async_buffer_idx_packed,
1927 					vq->shadow_used_idx);
1928 
1929 		vq->async_buffer_idx_packed += vq->shadow_used_idx;
1930 		if (vq->async_buffer_idx_packed >= vq->size)
1931 			vq->async_buffer_idx_packed -= vq->size;
1932 
1933 		vq->async_pkts_idx += pkt_idx;
1934 		if (vq->async_pkts_idx >= vq->size)
1935 			vq->async_pkts_idx -= vq->size;
1936 
1937 		vq->shadow_used_idx = 0;
1938 		vq->async_pkts_inflight_n += pkt_idx;
1939 	}
1940 
1941 	return pkt_idx;
1942 }
1943 
1944 static __rte_always_inline void
1945 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
1946 {
1947 	uint16_t nr_left = n_descs;
1948 	uint16_t nr_copy;
1949 	uint16_t to, from;
1950 
1951 	do {
1952 		from = vq->last_async_desc_idx_split & (vq->size - 1);
1953 		nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
1954 		to = vq->last_used_idx & (vq->size - 1);
1955 
1956 		if (to + nr_copy <= vq->size) {
1957 			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
1958 					nr_copy * sizeof(struct vring_used_elem));
1959 		} else {
1960 			uint16_t size = vq->size - to;
1961 
1962 			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
1963 					size * sizeof(struct vring_used_elem));
1964 			rte_memcpy(&vq->used->ring[0], &vq->async_descs_split[from + size],
1965 					(nr_copy - size) * sizeof(struct vring_used_elem));
1966 		}
1967 
1968 		vq->last_async_desc_idx_split += nr_copy;
1969 		vq->last_used_idx += nr_copy;
1970 		nr_left -= nr_copy;
1971 	} while (nr_left > 0);
1972 }
1973 
1974 static __rte_always_inline void
1975 write_back_completed_descs_packed(struct vhost_virtqueue *vq,
1976 				uint16_t n_buffers)
1977 {
1978 	uint16_t nr_left = n_buffers;
1979 	uint16_t from, to;
1980 
1981 	do {
1982 		from = vq->last_async_buffer_idx_packed;
1983 		to = (from + nr_left) % vq->size;
1984 		if (to > from) {
1985 			vhost_update_used_packed(vq, vq->async_buffers_packed + from, to - from);
1986 			vq->last_async_buffer_idx_packed += nr_left;
1987 			nr_left = 0;
1988 		} else {
1989 			vhost_update_used_packed(vq, vq->async_buffers_packed + from,
1990 				vq->size - from);
1991 			vq->last_async_buffer_idx_packed = 0;
1992 			nr_left -= vq->size - from;
1993 		}
1994 	} while (nr_left > 0);
1995 }
1996 
1997 static __rte_always_inline uint16_t
1998 vhost_poll_enqueue_completed(struct virtio_net *dev, uint16_t queue_id,
1999 		struct rte_mbuf **pkts, uint16_t count)
2000 {
2001 	struct vhost_virtqueue *vq;
2002 	struct async_inflight_info *pkts_info;
2003 	int32_t n_cpl;
2004 	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
2005 	uint16_t start_idx, pkts_idx, vq_size;
2006 	uint16_t from, i;
2007 
2008 	vq = dev->virtqueue[queue_id];
2009 	pkts_idx = vq->async_pkts_idx % vq->size;
2010 	pkts_info = vq->async_pkts_info;
2011 	vq_size = vq->size;
2012 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
2013 		vq_size, vq->async_pkts_inflight_n);
2014 
2015 	if (count > vq->async_last_pkts_n) {
2016 		n_cpl = vq->async_ops.check_completed_copies(dev->vid,
2017 			queue_id, 0, count - vq->async_last_pkts_n);
2018 		if (likely(n_cpl >= 0)) {
2019 			n_pkts_cpl = n_cpl;
2020 		} else {
2021 			VHOST_LOG_DATA(ERR,
2022 				"(%d) %s: failed to check completed copies for queue id %d.\n",
2023 				dev->vid, __func__, queue_id);
2024 			n_pkts_cpl = 0;
2025 		}
2026 	}
2027 
2028 	n_pkts_cpl += vq->async_last_pkts_n;
2029 	n_pkts_put = RTE_MIN(n_pkts_cpl, count);
2030 	if (unlikely(n_pkts_put == 0)) {
2031 		vq->async_last_pkts_n = n_pkts_cpl;
2032 		return 0;
2033 	}
2034 
2035 	if (vq_is_packed(dev)) {
2036 		for (i = 0; i < n_pkts_put; i++) {
2037 			from = (start_idx + i) % vq_size;
2038 			n_buffers += pkts_info[from].nr_buffers;
2039 			pkts[i] = pkts_info[from].mbuf;
2040 		}
2041 	} else {
2042 		for (i = 0; i < n_pkts_put; i++) {
2043 			from = (start_idx + i) & (vq_size - 1);
2044 			n_descs += pkts_info[from].descs;
2045 			pkts[i] = pkts_info[from].mbuf;
2046 		}
2047 	}
2048 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
2049 	vq->async_pkts_inflight_n -= n_pkts_put;
2050 
2051 	if (likely(vq->enabled && vq->access_ok)) {
2052 		if (vq_is_packed(dev)) {
2053 			write_back_completed_descs_packed(vq, n_buffers);
2054 
2055 			vhost_vring_call_packed(dev, vq);
2056 		} else {
2057 			write_back_completed_descs_split(vq, n_descs);
2058 
2059 			__atomic_add_fetch(&vq->used->idx, n_descs,
2060 					__ATOMIC_RELEASE);
2061 			vhost_vring_call_split(dev, vq);
2062 		}
2063 	} else {
2064 		if (vq_is_packed(dev)) {
2065 			vq->last_async_buffer_idx_packed += n_buffers;
2066 			if (vq->last_async_buffer_idx_packed >= vq->size)
2067 				vq->last_async_buffer_idx_packed -= vq->size;
2068 		} else {
2069 			vq->last_async_desc_idx_split += n_descs;
2070 		}
2071 	}
2072 
2073 	return n_pkts_put;
2074 }
2075 
2076 uint16_t
2077 rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
2078 		struct rte_mbuf **pkts, uint16_t count)
2079 {
2080 	struct virtio_net *dev = get_device(vid);
2081 	struct vhost_virtqueue *vq;
2082 	uint16_t n_pkts_cpl = 0;
2083 
2084 	if (unlikely(!dev))
2085 		return 0;
2086 
2087 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2088 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2089 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2090 			dev->vid, __func__, queue_id);
2091 		return 0;
2092 	}
2093 
2094 	vq = dev->virtqueue[queue_id];
2095 
2096 	if (unlikely(!vq->async_registered)) {
2097 		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
2098 			dev->vid, __func__, queue_id);
2099 		return 0;
2100 	}
2101 
2102 	rte_spinlock_lock(&vq->access_lock);
2103 
2104 	n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count);
2105 
2106 	rte_spinlock_unlock(&vq->access_lock);
2107 
2108 	return n_pkts_cpl;
2109 }
2110 
2111 uint16_t
2112 rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id,
2113 		struct rte_mbuf **pkts, uint16_t count)
2114 {
2115 	struct virtio_net *dev = get_device(vid);
2116 	struct vhost_virtqueue *vq;
2117 	uint16_t n_pkts_cpl = 0;
2118 
2119 	if (!dev)
2120 		return 0;
2121 
2122 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2123 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2124 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2125 			dev->vid, __func__, queue_id);
2126 		return 0;
2127 	}
2128 
2129 	vq = dev->virtqueue[queue_id];
2130 
2131 	if (unlikely(!vq->async_registered)) {
2132 		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
2133 			dev->vid, __func__, queue_id);
2134 		return 0;
2135 	}
2136 
2137 	n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count);
2138 
2139 	return n_pkts_cpl;
2140 }
2141 
2142 static __rte_always_inline uint32_t
2143 virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
2144 	struct rte_mbuf **pkts, uint32_t count)
2145 {
2146 	struct vhost_virtqueue *vq;
2147 	uint32_t nb_tx = 0;
2148 
2149 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2150 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2151 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2152 			dev->vid, __func__, queue_id);
2153 		return 0;
2154 	}
2155 
2156 	vq = dev->virtqueue[queue_id];
2157 
2158 	rte_spinlock_lock(&vq->access_lock);
2159 
2160 	if (unlikely(!vq->enabled || !vq->async_registered))
2161 		goto out_access_unlock;
2162 
2163 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2164 		vhost_user_iotlb_rd_lock(vq);
2165 
2166 	if (unlikely(!vq->access_ok))
2167 		if (unlikely(vring_translate(dev, vq) < 0))
2168 			goto out;
2169 
2170 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
2171 	if (count == 0)
2172 		goto out;
2173 
2174 	if (vq_is_packed(dev))
2175 		nb_tx = virtio_dev_rx_async_submit_packed(dev, vq, queue_id,
2176 				pkts, count);
2177 	else
2178 		nb_tx = virtio_dev_rx_async_submit_split(dev, vq, queue_id,
2179 				pkts, count);
2180 
2181 out:
2182 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2183 		vhost_user_iotlb_rd_unlock(vq);
2184 
2185 out_access_unlock:
2186 	rte_spinlock_unlock(&vq->access_lock);
2187 
2188 	return nb_tx;
2189 }
2190 
2191 uint16_t
2192 rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
2193 		struct rte_mbuf **pkts, uint16_t count)
2194 {
2195 	struct virtio_net *dev = get_device(vid);
2196 
2197 	if (!dev)
2198 		return 0;
2199 
2200 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
2201 		VHOST_LOG_DATA(ERR,
2202 			"(%d) %s: built-in vhost net backend is disabled.\n",
2203 			dev->vid, __func__);
2204 		return 0;
2205 	}
2206 
2207 	return virtio_dev_rx_async_submit(dev, queue_id, pkts, count);
2208 }
2209 
2210 static inline bool
2211 virtio_net_with_host_offload(struct virtio_net *dev)
2212 {
2213 	if (dev->features &
2214 			((1ULL << VIRTIO_NET_F_CSUM) |
2215 			 (1ULL << VIRTIO_NET_F_HOST_ECN) |
2216 			 (1ULL << VIRTIO_NET_F_HOST_TSO4) |
2217 			 (1ULL << VIRTIO_NET_F_HOST_TSO6) |
2218 			 (1ULL << VIRTIO_NET_F_HOST_UFO)))
2219 		return true;
2220 
2221 	return false;
2222 }
2223 
2224 static int
2225 parse_headers(struct rte_mbuf *m, uint8_t *l4_proto)
2226 {
2227 	struct rte_ipv4_hdr *ipv4_hdr;
2228 	struct rte_ipv6_hdr *ipv6_hdr;
2229 	struct rte_ether_hdr *eth_hdr;
2230 	uint16_t ethertype;
2231 	uint16_t data_len = rte_pktmbuf_data_len(m);
2232 
2233 	if (data_len < sizeof(struct rte_ether_hdr))
2234 		return -EINVAL;
2235 
2236 	eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
2237 
2238 	m->l2_len = sizeof(struct rte_ether_hdr);
2239 	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
2240 
2241 	if (ethertype == RTE_ETHER_TYPE_VLAN) {
2242 		if (data_len < sizeof(struct rte_ether_hdr) +
2243 				sizeof(struct rte_vlan_hdr))
2244 			goto error;
2245 
2246 		struct rte_vlan_hdr *vlan_hdr =
2247 			(struct rte_vlan_hdr *)(eth_hdr + 1);
2248 
2249 		m->l2_len += sizeof(struct rte_vlan_hdr);
2250 		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
2251 	}
2252 
2253 	switch (ethertype) {
2254 	case RTE_ETHER_TYPE_IPV4:
2255 		if (data_len < m->l2_len + sizeof(struct rte_ipv4_hdr))
2256 			goto error;
2257 		ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *,
2258 				m->l2_len);
2259 		m->l3_len = rte_ipv4_hdr_len(ipv4_hdr);
2260 		if (data_len < m->l2_len + m->l3_len)
2261 			goto error;
2262 		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
2263 		*l4_proto = ipv4_hdr->next_proto_id;
2264 		break;
2265 	case RTE_ETHER_TYPE_IPV6:
2266 		if (data_len < m->l2_len + sizeof(struct rte_ipv6_hdr))
2267 			goto error;
2268 		ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *,
2269 				m->l2_len);
2270 		m->l3_len = sizeof(struct rte_ipv6_hdr);
2271 		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
2272 		*l4_proto = ipv6_hdr->proto;
2273 		break;
2274 	default:
2275 		/* a valid L3 header is needed for further L4 parsing */
2276 		goto error;
2277 	}
2278 
2279 	/* both CSUM and GSO need a valid L4 header */
2280 	switch (*l4_proto) {
2281 	case IPPROTO_TCP:
2282 		if (data_len < m->l2_len + m->l3_len +
2283 				sizeof(struct rte_tcp_hdr))
2284 			goto error;
2285 		break;
2286 	case IPPROTO_UDP:
2287 		if (data_len < m->l2_len + m->l3_len +
2288 				sizeof(struct rte_udp_hdr))
2289 			goto error;
2290 		break;
2291 	case IPPROTO_SCTP:
2292 		if (data_len < m->l2_len + m->l3_len +
2293 				sizeof(struct rte_sctp_hdr))
2294 			goto error;
2295 		break;
2296 	default:
2297 		goto error;
2298 	}
2299 
2300 	return 0;
2301 
2302 error:
2303 	m->l2_len = 0;
2304 	m->l3_len = 0;
2305 	m->ol_flags = 0;
2306 	return -EINVAL;
2307 }
2308 
2309 static __rte_always_inline void
2310 vhost_dequeue_offload_legacy(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
2311 {
2312 	uint8_t l4_proto = 0;
2313 	struct rte_tcp_hdr *tcp_hdr = NULL;
2314 	uint16_t tcp_len;
2315 	uint16_t data_len = rte_pktmbuf_data_len(m);
2316 
2317 	if (parse_headers(m, &l4_proto) < 0)
2318 		return;
2319 
2320 	if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2321 		if (hdr->csum_start == (m->l2_len + m->l3_len)) {
2322 			switch (hdr->csum_offset) {
2323 			case (offsetof(struct rte_tcp_hdr, cksum)):
2324 				if (l4_proto != IPPROTO_TCP)
2325 					goto error;
2326 				m->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
2327 				break;
2328 			case (offsetof(struct rte_udp_hdr, dgram_cksum)):
2329 				if (l4_proto != IPPROTO_UDP)
2330 					goto error;
2331 				m->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM;
2332 				break;
2333 			case (offsetof(struct rte_sctp_hdr, cksum)):
2334 				if (l4_proto != IPPROTO_SCTP)
2335 					goto error;
2336 				m->ol_flags |= RTE_MBUF_F_TX_SCTP_CKSUM;
2337 				break;
2338 			default:
2339 				goto error;
2340 			}
2341 		} else {
2342 			goto error;
2343 		}
2344 	}
2345 
2346 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2347 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2348 		case VIRTIO_NET_HDR_GSO_TCPV4:
2349 		case VIRTIO_NET_HDR_GSO_TCPV6:
2350 			if (l4_proto != IPPROTO_TCP)
2351 				goto error;
2352 			tcp_hdr = rte_pktmbuf_mtod_offset(m,
2353 					struct rte_tcp_hdr *,
2354 					m->l2_len + m->l3_len);
2355 			tcp_len = (tcp_hdr->data_off & 0xf0) >> 2;
2356 			if (data_len < m->l2_len + m->l3_len + tcp_len)
2357 				goto error;
2358 			m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
2359 			m->tso_segsz = hdr->gso_size;
2360 			m->l4_len = tcp_len;
2361 			break;
2362 		case VIRTIO_NET_HDR_GSO_UDP:
2363 			if (l4_proto != IPPROTO_UDP)
2364 				goto error;
2365 			m->ol_flags |= RTE_MBUF_F_TX_UDP_SEG;
2366 			m->tso_segsz = hdr->gso_size;
2367 			m->l4_len = sizeof(struct rte_udp_hdr);
2368 			break;
2369 		default:
2370 			VHOST_LOG_DATA(WARNING,
2371 				"unsupported gso type %u.\n", hdr->gso_type);
2372 			goto error;
2373 		}
2374 	}
2375 	return;
2376 
2377 error:
2378 	m->l2_len = 0;
2379 	m->l3_len = 0;
2380 	m->ol_flags = 0;
2381 }
2382 
2383 static __rte_always_inline void
2384 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m,
2385 	bool legacy_ol_flags)
2386 {
2387 	struct rte_net_hdr_lens hdr_lens;
2388 	int l4_supported = 0;
2389 	uint32_t ptype;
2390 
2391 	if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
2392 		return;
2393 
2394 	if (legacy_ol_flags) {
2395 		vhost_dequeue_offload_legacy(hdr, m);
2396 		return;
2397 	}
2398 
2399 	m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN;
2400 
2401 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
2402 	m->packet_type = ptype;
2403 	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
2404 	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
2405 	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
2406 		l4_supported = 1;
2407 
2408 	/* According to Virtio 1.1 spec, the device only needs to look at
2409 	 * VIRTIO_NET_HDR_F_NEEDS_CSUM in the packet transmission path.
2410 	 * This differs from the processing incoming packets path where the
2411 	 * driver could rely on VIRTIO_NET_HDR_F_DATA_VALID flag set by the
2412 	 * device.
2413 	 *
2414 	 * 5.1.6.2.1 Driver Requirements: Packet Transmission
2415 	 * The driver MUST NOT set the VIRTIO_NET_HDR_F_DATA_VALID and
2416 	 * VIRTIO_NET_HDR_F_RSC_INFO bits in flags.
2417 	 *
2418 	 * 5.1.6.2.2 Device Requirements: Packet Transmission
2419 	 * The device MUST ignore flag bits that it does not recognize.
2420 	 */
2421 	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2422 		uint32_t hdrlen;
2423 
2424 		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
2425 		if (hdr->csum_start <= hdrlen && l4_supported != 0) {
2426 			m->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_NONE;
2427 		} else {
2428 			/* Unknown proto or tunnel, do sw cksum. We can assume
2429 			 * the cksum field is in the first segment since the
2430 			 * buffers we provided to the host are large enough.
2431 			 * In case of SCTP, this will be wrong since it's a CRC
2432 			 * but there's nothing we can do.
2433 			 */
2434 			uint16_t csum = 0, off;
2435 
2436 			if (rte_raw_cksum_mbuf(m, hdr->csum_start,
2437 					rte_pktmbuf_pkt_len(m) - hdr->csum_start, &csum) < 0)
2438 				return;
2439 			if (likely(csum != 0xffff))
2440 				csum = ~csum;
2441 			off = hdr->csum_offset + hdr->csum_start;
2442 			if (rte_pktmbuf_data_len(m) >= off + 1)
2443 				*rte_pktmbuf_mtod_offset(m, uint16_t *, off) = csum;
2444 		}
2445 	}
2446 
2447 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2448 		if (hdr->gso_size == 0)
2449 			return;
2450 
2451 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2452 		case VIRTIO_NET_HDR_GSO_TCPV4:
2453 		case VIRTIO_NET_HDR_GSO_TCPV6:
2454 			if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_TCP)
2455 				break;
2456 			m->ol_flags |= RTE_MBUF_F_RX_LRO | RTE_MBUF_F_RX_L4_CKSUM_NONE;
2457 			m->tso_segsz = hdr->gso_size;
2458 			break;
2459 		case VIRTIO_NET_HDR_GSO_UDP:
2460 			if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_UDP)
2461 				break;
2462 			m->ol_flags |= RTE_MBUF_F_RX_LRO | RTE_MBUF_F_RX_L4_CKSUM_NONE;
2463 			m->tso_segsz = hdr->gso_size;
2464 			break;
2465 		default:
2466 			break;
2467 		}
2468 	}
2469 }
2470 
2471 static __rte_noinline void
2472 copy_vnet_hdr_from_desc(struct virtio_net_hdr *hdr,
2473 		struct buf_vector *buf_vec)
2474 {
2475 	uint64_t len;
2476 	uint64_t remain = sizeof(struct virtio_net_hdr);
2477 	uint64_t src;
2478 	uint64_t dst = (uint64_t)(uintptr_t)hdr;
2479 
2480 	while (remain) {
2481 		len = RTE_MIN(remain, buf_vec->buf_len);
2482 		src = buf_vec->buf_addr;
2483 		rte_memcpy((void *)(uintptr_t)dst,
2484 				(void *)(uintptr_t)src, len);
2485 
2486 		remain -= len;
2487 		dst += len;
2488 		buf_vec++;
2489 	}
2490 }
2491 
2492 static __rte_always_inline int
2493 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
2494 		  struct buf_vector *buf_vec, uint16_t nr_vec,
2495 		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
2496 		  bool legacy_ol_flags)
2497 {
2498 	uint32_t buf_avail, buf_offset;
2499 	uint64_t buf_addr, buf_len;
2500 	uint32_t mbuf_avail, mbuf_offset;
2501 	uint32_t cpy_len;
2502 	struct rte_mbuf *cur = m, *prev = m;
2503 	struct virtio_net_hdr tmp_hdr;
2504 	struct virtio_net_hdr *hdr = NULL;
2505 	/* A counter to avoid desc dead loop chain */
2506 	uint16_t vec_idx = 0;
2507 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
2508 	int error = 0;
2509 
2510 	buf_addr = buf_vec[vec_idx].buf_addr;
2511 	buf_len = buf_vec[vec_idx].buf_len;
2512 
2513 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
2514 		error = -1;
2515 		goto out;
2516 	}
2517 
2518 	if (virtio_net_with_host_offload(dev)) {
2519 		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
2520 			/*
2521 			 * No luck, the virtio-net header doesn't fit
2522 			 * in a contiguous virtual area.
2523 			 */
2524 			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
2525 			hdr = &tmp_hdr;
2526 		} else {
2527 			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
2528 		}
2529 	}
2530 
2531 	/*
2532 	 * A virtio driver normally uses at least 2 desc buffers
2533 	 * for Tx: the first for storing the header, and others
2534 	 * for storing the data.
2535 	 */
2536 	if (unlikely(buf_len < dev->vhost_hlen)) {
2537 		buf_offset = dev->vhost_hlen - buf_len;
2538 		vec_idx++;
2539 		buf_addr = buf_vec[vec_idx].buf_addr;
2540 		buf_len = buf_vec[vec_idx].buf_len;
2541 		buf_avail  = buf_len - buf_offset;
2542 	} else if (buf_len == dev->vhost_hlen) {
2543 		if (unlikely(++vec_idx >= nr_vec))
2544 			goto out;
2545 		buf_addr = buf_vec[vec_idx].buf_addr;
2546 		buf_len = buf_vec[vec_idx].buf_len;
2547 
2548 		buf_offset = 0;
2549 		buf_avail = buf_len;
2550 	} else {
2551 		buf_offset = dev->vhost_hlen;
2552 		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
2553 	}
2554 
2555 	PRINT_PACKET(dev,
2556 			(uintptr_t)(buf_addr + buf_offset),
2557 			(uint32_t)buf_avail, 0);
2558 
2559 	mbuf_offset = 0;
2560 	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
2561 	while (1) {
2562 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
2563 
2564 		if (likely(cpy_len > MAX_BATCH_LEN ||
2565 					vq->batch_copy_nb_elems >= vq->size ||
2566 					(hdr && cur == m))) {
2567 			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
2568 						mbuf_offset),
2569 					(void *)((uintptr_t)(buf_addr +
2570 							buf_offset)), cpy_len);
2571 		} else {
2572 			batch_copy[vq->batch_copy_nb_elems].dst =
2573 				rte_pktmbuf_mtod_offset(cur, void *,
2574 						mbuf_offset);
2575 			batch_copy[vq->batch_copy_nb_elems].src =
2576 				(void *)((uintptr_t)(buf_addr + buf_offset));
2577 			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
2578 			vq->batch_copy_nb_elems++;
2579 		}
2580 
2581 		mbuf_avail  -= cpy_len;
2582 		mbuf_offset += cpy_len;
2583 		buf_avail -= cpy_len;
2584 		buf_offset += cpy_len;
2585 
2586 		/* This buf reaches to its end, get the next one */
2587 		if (buf_avail == 0) {
2588 			if (++vec_idx >= nr_vec)
2589 				break;
2590 
2591 			buf_addr = buf_vec[vec_idx].buf_addr;
2592 			buf_len = buf_vec[vec_idx].buf_len;
2593 
2594 			buf_offset = 0;
2595 			buf_avail  = buf_len;
2596 
2597 			PRINT_PACKET(dev, (uintptr_t)buf_addr,
2598 					(uint32_t)buf_avail, 0);
2599 		}
2600 
2601 		/*
2602 		 * This mbuf reaches to its end, get a new one
2603 		 * to hold more data.
2604 		 */
2605 		if (mbuf_avail == 0) {
2606 			cur = rte_pktmbuf_alloc(mbuf_pool);
2607 			if (unlikely(cur == NULL)) {
2608 				VHOST_LOG_DATA(ERR, "Failed to "
2609 					"allocate memory for mbuf.\n");
2610 				error = -1;
2611 				goto out;
2612 			}
2613 
2614 			prev->next = cur;
2615 			prev->data_len = mbuf_offset;
2616 			m->nb_segs += 1;
2617 			m->pkt_len += mbuf_offset;
2618 			prev = cur;
2619 
2620 			mbuf_offset = 0;
2621 			mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
2622 		}
2623 	}
2624 
2625 	prev->data_len = mbuf_offset;
2626 	m->pkt_len    += mbuf_offset;
2627 
2628 	if (hdr)
2629 		vhost_dequeue_offload(hdr, m, legacy_ol_flags);
2630 
2631 out:
2632 
2633 	return error;
2634 }
2635 
2636 static void
2637 virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
2638 {
2639 	rte_free(opaque);
2640 }
2641 
2642 static int
2643 virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
2644 {
2645 	struct rte_mbuf_ext_shared_info *shinfo = NULL;
2646 	uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
2647 	uint16_t buf_len;
2648 	rte_iova_t iova;
2649 	void *buf;
2650 
2651 	total_len += sizeof(*shinfo) + sizeof(uintptr_t);
2652 	total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
2653 
2654 	if (unlikely(total_len > UINT16_MAX))
2655 		return -ENOSPC;
2656 
2657 	buf_len = total_len;
2658 	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
2659 	if (unlikely(buf == NULL))
2660 		return -ENOMEM;
2661 
2662 	/* Initialize shinfo */
2663 	shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
2664 						virtio_dev_extbuf_free, buf);
2665 	if (unlikely(shinfo == NULL)) {
2666 		rte_free(buf);
2667 		VHOST_LOG_DATA(ERR, "Failed to init shinfo\n");
2668 		return -1;
2669 	}
2670 
2671 	iova = rte_malloc_virt2iova(buf);
2672 	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
2673 	rte_pktmbuf_reset_headroom(pkt);
2674 
2675 	return 0;
2676 }
2677 
2678 /*
2679  * Prepare a host supported pktmbuf.
2680  */
2681 static __rte_always_inline int
2682 virtio_dev_pktmbuf_prep(struct virtio_net *dev, struct rte_mbuf *pkt,
2683 			 uint32_t data_len)
2684 {
2685 	if (rte_pktmbuf_tailroom(pkt) >= data_len)
2686 		return 0;
2687 
2688 	/* attach an external buffer if supported */
2689 	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
2690 		return 0;
2691 
2692 	/* check if chained buffers are allowed */
2693 	if (!dev->linearbuf)
2694 		return 0;
2695 
2696 	return -1;
2697 }
2698 
2699 __rte_always_inline
2700 static uint16_t
2701 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
2702 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
2703 	bool legacy_ol_flags)
2704 {
2705 	uint16_t i;
2706 	uint16_t free_entries;
2707 	uint16_t dropped = 0;
2708 	static bool allocerr_warned;
2709 
2710 	/*
2711 	 * The ordering between avail index and
2712 	 * desc reads needs to be enforced.
2713 	 */
2714 	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
2715 			vq->last_avail_idx;
2716 	if (free_entries == 0)
2717 		return 0;
2718 
2719 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
2720 
2721 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2722 
2723 	count = RTE_MIN(count, MAX_PKT_BURST);
2724 	count = RTE_MIN(count, free_entries);
2725 	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
2726 			dev->vid, count);
2727 
2728 	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
2729 		return 0;
2730 
2731 	for (i = 0; i < count; i++) {
2732 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
2733 		uint16_t head_idx;
2734 		uint32_t buf_len;
2735 		uint16_t nr_vec = 0;
2736 		int err;
2737 
2738 		if (unlikely(fill_vec_buf_split(dev, vq,
2739 						vq->last_avail_idx + i,
2740 						&nr_vec, buf_vec,
2741 						&head_idx, &buf_len,
2742 						VHOST_ACCESS_RO) < 0))
2743 			break;
2744 
2745 		update_shadow_used_ring_split(vq, head_idx, 0);
2746 
2747 		err = virtio_dev_pktmbuf_prep(dev, pkts[i], buf_len);
2748 		if (unlikely(err)) {
2749 			/*
2750 			 * mbuf allocation fails for jumbo packets when external
2751 			 * buffer allocation is not allowed and linear buffer
2752 			 * is required. Drop this packet.
2753 			 */
2754 			if (!allocerr_warned) {
2755 				VHOST_LOG_DATA(ERR,
2756 					"Failed mbuf alloc of size %d from %s on %s.\n",
2757 					buf_len, mbuf_pool->name, dev->ifname);
2758 				allocerr_warned = true;
2759 			}
2760 			dropped += 1;
2761 			i++;
2762 			break;
2763 		}
2764 
2765 		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
2766 				mbuf_pool, legacy_ol_flags);
2767 		if (unlikely(err)) {
2768 			if (!allocerr_warned) {
2769 				VHOST_LOG_DATA(ERR,
2770 					"Failed to copy desc to mbuf on %s.\n",
2771 					dev->ifname);
2772 				allocerr_warned = true;
2773 			}
2774 			dropped += 1;
2775 			i++;
2776 			break;
2777 		}
2778 	}
2779 
2780 	if (dropped)
2781 		rte_pktmbuf_free_bulk(&pkts[i - 1], count - i + 1);
2782 
2783 	vq->last_avail_idx += i;
2784 
2785 	do_data_copy_dequeue(vq);
2786 	if (unlikely(i < count))
2787 		vq->shadow_used_idx = i;
2788 	if (likely(vq->shadow_used_idx)) {
2789 		flush_shadow_used_ring_split(dev, vq);
2790 		vhost_vring_call_split(dev, vq);
2791 	}
2792 
2793 	return (i - dropped);
2794 }
2795 
2796 __rte_noinline
2797 static uint16_t
2798 virtio_dev_tx_split_legacy(struct virtio_net *dev,
2799 	struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
2800 	struct rte_mbuf **pkts, uint16_t count)
2801 {
2802 	return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, true);
2803 }
2804 
2805 __rte_noinline
2806 static uint16_t
2807 virtio_dev_tx_split_compliant(struct virtio_net *dev,
2808 	struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
2809 	struct rte_mbuf **pkts, uint16_t count)
2810 {
2811 	return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, false);
2812 }
2813 
2814 static __rte_always_inline int
2815 vhost_reserve_avail_batch_packed(struct virtio_net *dev,
2816 				 struct vhost_virtqueue *vq,
2817 				 struct rte_mbuf **pkts,
2818 				 uint16_t avail_idx,
2819 				 uintptr_t *desc_addrs,
2820 				 uint16_t *ids)
2821 {
2822 	bool wrap = vq->avail_wrap_counter;
2823 	struct vring_packed_desc *descs = vq->desc_packed;
2824 	uint64_t lens[PACKED_BATCH_SIZE];
2825 	uint64_t buf_lens[PACKED_BATCH_SIZE];
2826 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2827 	uint16_t flags, i;
2828 
2829 	if (unlikely(avail_idx & PACKED_BATCH_MASK))
2830 		return -1;
2831 	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
2832 		return -1;
2833 
2834 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2835 		flags = descs[avail_idx + i].flags;
2836 		if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
2837 			     (wrap == !!(flags & VRING_DESC_F_USED))  ||
2838 			     (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
2839 			return -1;
2840 	}
2841 
2842 	rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
2843 
2844 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2845 		lens[i] = descs[avail_idx + i].len;
2846 
2847 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2848 		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
2849 						  descs[avail_idx + i].addr,
2850 						  &lens[i], VHOST_ACCESS_RW);
2851 	}
2852 
2853 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2854 		if (unlikely(!desc_addrs[i]))
2855 			return -1;
2856 		if (unlikely((lens[i] != descs[avail_idx + i].len)))
2857 			return -1;
2858 	}
2859 
2860 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2861 		if (virtio_dev_pktmbuf_prep(dev, pkts[i], lens[i]))
2862 			goto err;
2863 	}
2864 
2865 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2866 		buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
2867 
2868 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2869 		if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
2870 			goto err;
2871 	}
2872 
2873 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2874 		pkts[i]->pkt_len = lens[i] - buf_offset;
2875 		pkts[i]->data_len = pkts[i]->pkt_len;
2876 		ids[i] = descs[avail_idx + i].id;
2877 	}
2878 
2879 	return 0;
2880 
2881 err:
2882 	return -1;
2883 }
2884 
2885 static __rte_always_inline int
2886 virtio_dev_tx_batch_packed(struct virtio_net *dev,
2887 			   struct vhost_virtqueue *vq,
2888 			   struct rte_mbuf **pkts,
2889 			   bool legacy_ol_flags)
2890 {
2891 	uint16_t avail_idx = vq->last_avail_idx;
2892 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2893 	struct virtio_net_hdr *hdr;
2894 	uintptr_t desc_addrs[PACKED_BATCH_SIZE];
2895 	uint16_t ids[PACKED_BATCH_SIZE];
2896 	uint16_t i;
2897 
2898 	if (vhost_reserve_avail_batch_packed(dev, vq, pkts, avail_idx,
2899 					     desc_addrs, ids))
2900 		return -1;
2901 
2902 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2903 		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
2904 
2905 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2906 		rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
2907 			   (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
2908 			   pkts[i]->pkt_len);
2909 
2910 	if (virtio_net_with_host_offload(dev)) {
2911 		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2912 			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
2913 			vhost_dequeue_offload(hdr, pkts[i], legacy_ol_flags);
2914 		}
2915 	}
2916 
2917 	if (virtio_net_is_inorder(dev))
2918 		vhost_shadow_dequeue_batch_packed_inorder(vq,
2919 			ids[PACKED_BATCH_SIZE - 1]);
2920 	else
2921 		vhost_shadow_dequeue_batch_packed(dev, vq, ids);
2922 
2923 	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
2924 
2925 	return 0;
2926 }
2927 
2928 static __rte_always_inline int
2929 vhost_dequeue_single_packed(struct virtio_net *dev,
2930 			    struct vhost_virtqueue *vq,
2931 			    struct rte_mempool *mbuf_pool,
2932 			    struct rte_mbuf *pkts,
2933 			    uint16_t *buf_id,
2934 			    uint16_t *desc_count,
2935 			    bool legacy_ol_flags)
2936 {
2937 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
2938 	uint32_t buf_len;
2939 	uint16_t nr_vec = 0;
2940 	int err;
2941 	static bool allocerr_warned;
2942 
2943 	if (unlikely(fill_vec_buf_packed(dev, vq,
2944 					 vq->last_avail_idx, desc_count,
2945 					 buf_vec, &nr_vec,
2946 					 buf_id, &buf_len,
2947 					 VHOST_ACCESS_RO) < 0))
2948 		return -1;
2949 
2950 	if (unlikely(virtio_dev_pktmbuf_prep(dev, pkts, buf_len))) {
2951 		if (!allocerr_warned) {
2952 			VHOST_LOG_DATA(ERR,
2953 				"Failed mbuf alloc of size %d from %s on %s.\n",
2954 				buf_len, mbuf_pool->name, dev->ifname);
2955 			allocerr_warned = true;
2956 		}
2957 		return -1;
2958 	}
2959 
2960 	err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts,
2961 				mbuf_pool, legacy_ol_flags);
2962 	if (unlikely(err)) {
2963 		if (!allocerr_warned) {
2964 			VHOST_LOG_DATA(ERR,
2965 				"Failed to copy desc to mbuf on %s.\n",
2966 				dev->ifname);
2967 			allocerr_warned = true;
2968 		}
2969 		return -1;
2970 	}
2971 
2972 	return 0;
2973 }
2974 
2975 static __rte_always_inline int
2976 virtio_dev_tx_single_packed(struct virtio_net *dev,
2977 			    struct vhost_virtqueue *vq,
2978 			    struct rte_mempool *mbuf_pool,
2979 			    struct rte_mbuf *pkts,
2980 			    bool legacy_ol_flags)
2981 {
2982 
2983 	uint16_t buf_id, desc_count = 0;
2984 	int ret;
2985 
2986 	ret = vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
2987 					&desc_count, legacy_ol_flags);
2988 
2989 	if (likely(desc_count > 0)) {
2990 		if (virtio_net_is_inorder(dev))
2991 			vhost_shadow_dequeue_single_packed_inorder(vq, buf_id,
2992 								   desc_count);
2993 		else
2994 			vhost_shadow_dequeue_single_packed(vq, buf_id,
2995 					desc_count);
2996 
2997 		vq_inc_last_avail_packed(vq, desc_count);
2998 	}
2999 
3000 	return ret;
3001 }
3002 
3003 __rte_always_inline
3004 static uint16_t
3005 virtio_dev_tx_packed(struct virtio_net *dev,
3006 		     struct vhost_virtqueue *__rte_restrict vq,
3007 		     struct rte_mempool *mbuf_pool,
3008 		     struct rte_mbuf **__rte_restrict pkts,
3009 		     uint32_t count,
3010 		     bool legacy_ol_flags)
3011 {
3012 	uint32_t pkt_idx = 0;
3013 
3014 	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
3015 		return 0;
3016 
3017 	do {
3018 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
3019 
3020 		if (count - pkt_idx >= PACKED_BATCH_SIZE) {
3021 			if (!virtio_dev_tx_batch_packed(dev, vq,
3022 							&pkts[pkt_idx],
3023 							legacy_ol_flags)) {
3024 				pkt_idx += PACKED_BATCH_SIZE;
3025 				continue;
3026 			}
3027 		}
3028 
3029 		if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool,
3030 						pkts[pkt_idx],
3031 						legacy_ol_flags))
3032 			break;
3033 		pkt_idx++;
3034 	} while (pkt_idx < count);
3035 
3036 	if (pkt_idx != count)
3037 		rte_pktmbuf_free_bulk(&pkts[pkt_idx], count - pkt_idx);
3038 
3039 	if (vq->shadow_used_idx) {
3040 		do_data_copy_dequeue(vq);
3041 
3042 		vhost_flush_dequeue_shadow_packed(dev, vq);
3043 		vhost_vring_call_packed(dev, vq);
3044 	}
3045 
3046 	return pkt_idx;
3047 }
3048 
3049 __rte_noinline
3050 static uint16_t
3051 virtio_dev_tx_packed_legacy(struct virtio_net *dev,
3052 	struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
3053 	struct rte_mbuf **__rte_restrict pkts, uint32_t count)
3054 {
3055 	return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, true);
3056 }
3057 
3058 __rte_noinline
3059 static uint16_t
3060 virtio_dev_tx_packed_compliant(struct virtio_net *dev,
3061 	struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
3062 	struct rte_mbuf **__rte_restrict pkts, uint32_t count)
3063 {
3064 	return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, false);
3065 }
3066 
3067 uint16_t
3068 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
3069 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
3070 {
3071 	struct virtio_net *dev;
3072 	struct rte_mbuf *rarp_mbuf = NULL;
3073 	struct vhost_virtqueue *vq;
3074 	int16_t success = 1;
3075 
3076 	dev = get_device(vid);
3077 	if (!dev)
3078 		return 0;
3079 
3080 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
3081 		VHOST_LOG_DATA(ERR,
3082 			"(%d) %s: built-in vhost net backend is disabled.\n",
3083 			dev->vid, __func__);
3084 		return 0;
3085 	}
3086 
3087 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
3088 		VHOST_LOG_DATA(ERR,
3089 			"(%d) %s: invalid virtqueue idx %d.\n",
3090 			dev->vid, __func__, queue_id);
3091 		return 0;
3092 	}
3093 
3094 	vq = dev->virtqueue[queue_id];
3095 
3096 	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
3097 		return 0;
3098 
3099 	if (unlikely(!vq->enabled)) {
3100 		count = 0;
3101 		goto out_access_unlock;
3102 	}
3103 
3104 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3105 		vhost_user_iotlb_rd_lock(vq);
3106 
3107 	if (unlikely(!vq->access_ok))
3108 		if (unlikely(vring_translate(dev, vq) < 0)) {
3109 			count = 0;
3110 			goto out;
3111 		}
3112 
3113 	/*
3114 	 * Construct a RARP broadcast packet, and inject it to the "pkts"
3115 	 * array, to looks like that guest actually send such packet.
3116 	 *
3117 	 * Check user_send_rarp() for more information.
3118 	 *
3119 	 * broadcast_rarp shares a cacheline in the virtio_net structure
3120 	 * with some fields that are accessed during enqueue and
3121 	 * __atomic_compare_exchange_n causes a write if performed compare
3122 	 * and exchange. This could result in false sharing between enqueue
3123 	 * and dequeue.
3124 	 *
3125 	 * Prevent unnecessary false sharing by reading broadcast_rarp first
3126 	 * and only performing compare and exchange if the read indicates it
3127 	 * is likely to be set.
3128 	 */
3129 	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
3130 			__atomic_compare_exchange_n(&dev->broadcast_rarp,
3131 			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
3132 
3133 		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
3134 		if (rarp_mbuf == NULL) {
3135 			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
3136 			count = 0;
3137 			goto out;
3138 		}
3139 		/*
3140 		 * Inject it to the head of "pkts" array, so that switch's mac
3141 		 * learning table will get updated first.
3142 		 */
3143 		pkts[0] = rarp_mbuf;
3144 		pkts++;
3145 		count -= 1;
3146 	}
3147 
3148 	if (vq_is_packed(dev)) {
3149 		if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3150 			count = virtio_dev_tx_packed_legacy(dev, vq, mbuf_pool, pkts, count);
3151 		else
3152 			count = virtio_dev_tx_packed_compliant(dev, vq, mbuf_pool, pkts, count);
3153 	} else {
3154 		if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3155 			count = virtio_dev_tx_split_legacy(dev, vq, mbuf_pool, pkts, count);
3156 		else
3157 			count = virtio_dev_tx_split_compliant(dev, vq, mbuf_pool, pkts, count);
3158 	}
3159 
3160 out:
3161 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3162 		vhost_user_iotlb_rd_unlock(vq);
3163 
3164 out_access_unlock:
3165 	rte_spinlock_unlock(&vq->access_lock);
3166 
3167 	if (unlikely(rarp_mbuf != NULL))
3168 		count += 1;
3169 
3170 	return count;
3171 }
3172