xref: /dpdk/lib/vhost/virtio_net.c (revision cf8a8a8f4896c0885d3996716f73513c4317e545)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4 
5 #include <stdint.h>
6 #include <stdbool.h>
7 #include <linux/virtio_net.h>
8 
9 #include <rte_mbuf.h>
10 #include <rte_memcpy.h>
11 #include <rte_net.h>
12 #include <rte_ether.h>
13 #include <rte_ip.h>
14 #include <rte_vhost.h>
15 #include <rte_tcp.h>
16 #include <rte_udp.h>
17 #include <rte_sctp.h>
18 #include <rte_arp.h>
19 #include <rte_spinlock.h>
20 #include <rte_malloc.h>
21 #include <rte_vhost_async.h>
22 
23 #include "iotlb.h"
24 #include "vhost.h"
25 
26 #define MAX_BATCH_LEN 256
27 
28 #define VHOST_ASYNC_BATCH_THRESHOLD 32
29 
30 static __rte_always_inline bool
31 rxvq_is_mergeable(struct virtio_net *dev)
32 {
33 	return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
34 }
35 
36 static __rte_always_inline bool
37 virtio_net_is_inorder(struct virtio_net *dev)
38 {
39 	return dev->features & (1ULL << VIRTIO_F_IN_ORDER);
40 }
41 
42 static bool
43 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
44 {
45 	return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
46 }
47 
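/*
 * Flush the enqueue-side batched copies: perform every copy queued in
 * vq->batch_copy_elems, log the written guest regions for dirty-page
 * tracking, then reset the batch counter.
 */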
48 static inline void
49 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
50 {
51 	struct batch_copy_elem *elem = vq->batch_copy_elems;
52 	uint16_t count = vq->batch_copy_nb_elems;
53 	int i;
54 
55 	for (i = 0; i < count; i++) {
56 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
57 		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
58 					   elem[i].len);
59 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
60 	}
61 
62 	vq->batch_copy_nb_elems = 0;
63 }
64 
65 static inline void
66 do_data_copy_dequeue(struct vhost_virtqueue *vq)
67 {
68 	struct batch_copy_elem *elem = vq->batch_copy_elems;
69 	uint16_t count = vq->batch_copy_nb_elems;
70 	int i;
71 
72 	for (i = 0; i < count; i++)
73 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
74 
75 	vq->batch_copy_nb_elems = 0;
76 }
77 
78 static __rte_always_inline void
79 do_flush_shadow_used_ring_split(struct virtio_net *dev,
80 			struct vhost_virtqueue *vq,
81 			uint16_t to, uint16_t from, uint16_t size)
82 {
83 	rte_memcpy(&vq->used->ring[to],
84 			&vq->shadow_used_split[from],
85 			size * sizeof(struct vring_used_elem));
86 	vhost_log_cache_used_vring(dev, vq,
87 			offsetof(struct vring_used, ring[to]),
88 			size * sizeof(struct vring_used_elem));
89 }
90 
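/*
 * Flush the shadow used ring to the guest-visible split used ring,
 * handling a possible wrap-around with two copies, then publish the new
 * used index with release semantics.
 */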
91 static __rte_always_inline void
92 flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
93 {
94 	uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
95 
96 	if (used_idx + vq->shadow_used_idx <= vq->size) {
97 		do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
98 					  vq->shadow_used_idx);
99 	} else {
100 		uint16_t size;
101 
102 		/* update the used ring interval [used_idx, vq->size) */
103 		size = vq->size - used_idx;
104 		do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
105 
106 		/* update the remaining interval [0, shadow_used_idx - size) */
107 		do_flush_shadow_used_ring_split(dev, vq, 0, size,
108 					  vq->shadow_used_idx - size);
109 	}
110 	vq->last_used_idx += vq->shadow_used_idx;
111 
112 	vhost_log_cache_sync(dev, vq);
113 
114 	__atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
115 			   __ATOMIC_RELEASE);
116 	vq->shadow_used_idx = 0;
117 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
118 		sizeof(vq->used->idx));
119 }
120 
121 static __rte_always_inline void
122 update_shadow_used_ring_split(struct vhost_virtqueue *vq,
123 			 uint16_t desc_idx, uint32_t len)
124 {
125 	uint16_t i = vq->shadow_used_idx++;
126 
127 	vq->shadow_used_split[i].id  = desc_idx;
128 	vq->shadow_used_split[i].len = len;
129 }
130 
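/*
 * Flush shadowed enqueue completions to the packed descriptor ring.
 * Descriptor ids and lengths are written first; the head descriptor's
 * flags are written last so the guest only sees fully updated entries.
 */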
131 static __rte_always_inline void
132 vhost_flush_enqueue_shadow_packed(struct virtio_net *dev,
133 				  struct vhost_virtqueue *vq)
134 {
135 	int i;
136 	uint16_t used_idx = vq->last_used_idx;
137 	uint16_t head_idx = vq->last_used_idx;
138 	uint16_t head_flags = 0;
139 
140 	/* Split loop in two to save memory barriers */
141 	for (i = 0; i < vq->shadow_used_idx; i++) {
142 		vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
143 		vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
144 
145 		used_idx += vq->shadow_used_packed[i].count;
146 		if (used_idx >= vq->size)
147 			used_idx -= vq->size;
148 	}
149 
150 	/* The ordering for storing desc flags needs to be enforced. */
151 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
152 
153 	for (i = 0; i < vq->shadow_used_idx; i++) {
154 		uint16_t flags;
155 
156 		if (vq->shadow_used_packed[i].len)
157 			flags = VRING_DESC_F_WRITE;
158 		else
159 			flags = 0;
160 
161 		if (vq->used_wrap_counter) {
162 			flags |= VRING_DESC_F_USED;
163 			flags |= VRING_DESC_F_AVAIL;
164 		} else {
165 			flags &= ~VRING_DESC_F_USED;
166 			flags &= ~VRING_DESC_F_AVAIL;
167 		}
168 
169 		if (i > 0) {
170 			vq->desc_packed[vq->last_used_idx].flags = flags;
171 
172 			vhost_log_cache_used_vring(dev, vq,
173 					vq->last_used_idx *
174 					sizeof(struct vring_packed_desc),
175 					sizeof(struct vring_packed_desc));
176 		} else {
177 			head_idx = vq->last_used_idx;
178 			head_flags = flags;
179 		}
180 
181 		vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count);
182 	}
183 
184 	vq->desc_packed[head_idx].flags = head_flags;
185 
186 	vhost_log_cache_used_vring(dev, vq,
187 				head_idx *
188 				sizeof(struct vring_packed_desc),
189 				sizeof(struct vring_packed_desc));
190 
191 	vq->shadow_used_idx = 0;
192 	vhost_log_cache_sync(dev, vq);
193 }
194 
195 static __rte_always_inline void
196 vhost_flush_dequeue_shadow_packed(struct virtio_net *dev,
197 				  struct vhost_virtqueue *vq)
198 {
199 	struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0];
200 
201 	vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id;
202 	/* The desc flags field is the synchronization point for the virtio packed vring */
203 	__atomic_store_n(&vq->desc_packed[vq->shadow_last_used_idx].flags,
204 			 used_elem->flags, __ATOMIC_RELEASE);
205 
206 	vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx *
207 				   sizeof(struct vring_packed_desc),
208 				   sizeof(struct vring_packed_desc));
209 	vq->shadow_used_idx = 0;
210 	vhost_log_cache_sync(dev, vq);
211 }
212 
213 static __rte_always_inline void
214 vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
215 				 struct vhost_virtqueue *vq,
216 				 uint64_t *lens,
217 				 uint16_t *ids)
218 {
219 	uint16_t i;
220 	uint16_t flags;
221 	uint16_t last_used_idx;
222 	struct vring_packed_desc *desc_base;
223 
224 	last_used_idx = vq->last_used_idx;
225 	desc_base = &vq->desc_packed[last_used_idx];
226 
227 	flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter);
228 
229 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
230 		desc_base[i].id = ids[i];
231 		desc_base[i].len = lens[i];
232 	}
233 
234 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
235 
236 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
237 		desc_base[i].flags = flags;
238 	}
239 
240 	vhost_log_cache_used_vring(dev, vq, last_used_idx *
241 				   sizeof(struct vring_packed_desc),
242 				   sizeof(struct vring_packed_desc) *
243 				   PACKED_BATCH_SIZE);
244 	vhost_log_cache_sync(dev, vq);
245 
246 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
247 }
248 
249 static __rte_always_inline void
250 vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
251 					  uint16_t id)
252 {
253 	vq->shadow_used_packed[0].id = id;
254 
255 	if (!vq->shadow_used_idx) {
256 		vq->shadow_last_used_idx = vq->last_used_idx;
257 		vq->shadow_used_packed[0].flags =
258 			PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
259 		vq->shadow_used_packed[0].len = 0;
260 		vq->shadow_used_packed[0].count = 1;
261 		vq->shadow_used_idx++;
262 	}
263 
264 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
265 }
266 
267 static __rte_always_inline void
268 vhost_shadow_dequeue_batch_packed(struct virtio_net *dev,
269 				  struct vhost_virtqueue *vq,
270 				  uint16_t *ids)
271 {
272 	uint16_t flags;
273 	uint16_t i;
274 	uint16_t begin;
275 
276 	flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
277 
278 	if (!vq->shadow_used_idx) {
279 		vq->shadow_last_used_idx = vq->last_used_idx;
280 		vq->shadow_used_packed[0].id  = ids[0];
281 		vq->shadow_used_packed[0].len = 0;
282 		vq->shadow_used_packed[0].count = 1;
283 		vq->shadow_used_packed[0].flags = flags;
284 		vq->shadow_used_idx++;
285 		begin = 1;
286 	} else
287 		begin = 0;
288 
289 	vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) {
290 		vq->desc_packed[vq->last_used_idx + i].id = ids[i];
291 		vq->desc_packed[vq->last_used_idx + i].len = 0;
292 	}
293 
294 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
295 	vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE)
296 		vq->desc_packed[vq->last_used_idx + i].flags = flags;
297 
298 	vhost_log_cache_used_vring(dev, vq, vq->last_used_idx *
299 				   sizeof(struct vring_packed_desc),
300 				   sizeof(struct vring_packed_desc) *
301 				   PACKED_BATCH_SIZE);
302 	vhost_log_cache_sync(dev, vq);
303 
304 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
305 }
306 
307 static __rte_always_inline void
308 vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq,
309 				   uint16_t buf_id,
310 				   uint16_t count)
311 {
312 	uint16_t flags;
313 
314 	flags = vq->desc_packed[vq->last_used_idx].flags;
315 	if (vq->used_wrap_counter) {
316 		flags |= VRING_DESC_F_USED;
317 		flags |= VRING_DESC_F_AVAIL;
318 	} else {
319 		flags &= ~VRING_DESC_F_USED;
320 		flags &= ~VRING_DESC_F_AVAIL;
321 	}
322 
323 	if (!vq->shadow_used_idx) {
324 		vq->shadow_last_used_idx = vq->last_used_idx;
325 
326 		vq->shadow_used_packed[0].id  = buf_id;
327 		vq->shadow_used_packed[0].len = 0;
328 		vq->shadow_used_packed[0].flags = flags;
329 		vq->shadow_used_idx++;
330 	} else {
331 		vq->desc_packed[vq->last_used_idx].id = buf_id;
332 		vq->desc_packed[vq->last_used_idx].len = 0;
333 		vq->desc_packed[vq->last_used_idx].flags = flags;
334 	}
335 
336 	vq_inc_last_used_packed(vq, count);
337 }
338 
339 static __rte_always_inline void
340 vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
341 					   uint16_t buf_id,
342 					   uint16_t count)
343 {
344 	uint16_t flags;
345 
346 	vq->shadow_used_packed[0].id = buf_id;
347 
348 	flags = vq->desc_packed[vq->last_used_idx].flags;
349 	if (vq->used_wrap_counter) {
350 		flags |= VRING_DESC_F_USED;
351 		flags |= VRING_DESC_F_AVAIL;
352 	} else {
353 		flags &= ~VRING_DESC_F_USED;
354 		flags &= ~VRING_DESC_F_AVAIL;
355 	}
356 
357 	if (!vq->shadow_used_idx) {
358 		vq->shadow_last_used_idx = vq->last_used_idx;
359 		vq->shadow_used_packed[0].len = 0;
360 		vq->shadow_used_packed[0].flags = flags;
361 		vq->shadow_used_idx++;
362 	}
363 
364 	vq_inc_last_used_packed(vq, count);
365 }
366 
367 static __rte_always_inline void
368 vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
369 				   uint32_t *len,
370 				   uint16_t *id,
371 				   uint16_t *count,
372 				   uint16_t num_buffers)
373 {
374 	uint16_t i;
375 
376 	for (i = 0; i < num_buffers; i++) {
377 		/* enqueue shadow flush action aligned with batch num */
378 		if (!vq->shadow_used_idx)
379 			vq->shadow_aligned_idx = vq->last_used_idx &
380 				PACKED_BATCH_MASK;
381 		vq->shadow_used_packed[vq->shadow_used_idx].id  = id[i];
382 		vq->shadow_used_packed[vq->shadow_used_idx].len = len[i];
383 		vq->shadow_used_packed[vq->shadow_used_idx].count = count[i];
384 		vq->shadow_aligned_idx += count[i];
385 		vq->shadow_used_idx++;
386 	}
387 }
388 
389 static __rte_always_inline void
390 vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
391 				   struct vhost_virtqueue *vq,
392 				   uint32_t *len,
393 				   uint16_t *id,
394 				   uint16_t *count,
395 				   uint16_t num_buffers)
396 {
397 	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
398 
399 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
400 		do_data_copy_enqueue(dev, vq);
401 		vhost_flush_enqueue_shadow_packed(dev, vq);
402 	}
403 }
404 
405 /* avoid unnecessary write operations, to lessen cache issues */
406 #define ASSIGN_UNLESS_EQUAL(var, val) do {	\
407 	if ((var) != (val))			\
408 		(var) = (val);			\
409 } while (0)
410 
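/*
 * Translate the mbuf Tx offload requests (L4 checksum, TSO/UFO) into the
 * virtio-net header consumed by the guest. The IPv4 header checksum is
 * computed in software because it cannot be requested via the header.
 */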
411 static __rte_always_inline void
412 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
413 {
414 	uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
415 
416 	if (m_buf->ol_flags & PKT_TX_TCP_SEG)
417 		csum_l4 |= PKT_TX_TCP_CKSUM;
418 
419 	if (csum_l4) {
420 		net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
421 		net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
422 
423 		switch (csum_l4) {
424 		case PKT_TX_TCP_CKSUM:
425 			net_hdr->csum_offset = (offsetof(struct rte_tcp_hdr,
426 						cksum));
427 			break;
428 		case PKT_TX_UDP_CKSUM:
429 			net_hdr->csum_offset = (offsetof(struct rte_udp_hdr,
430 						dgram_cksum));
431 			break;
432 		case PKT_TX_SCTP_CKSUM:
433 			net_hdr->csum_offset = (offsetof(struct rte_sctp_hdr,
434 						cksum));
435 			break;
436 		}
437 	} else {
438 		ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
439 		ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
440 		ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
441 	}
442 
443 	/* IP checksum offload cannot be requested via the virtio-net header, so compute it here */
444 	if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
445 		struct rte_ipv4_hdr *ipv4_hdr;
446 
447 		ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct rte_ipv4_hdr *,
448 						   m_buf->l2_len);
449 		ipv4_hdr->hdr_checksum = 0;
450 		ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
451 	}
452 
453 	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
454 		if (m_buf->ol_flags & PKT_TX_IPV4)
455 			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
456 		else
457 			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
458 		net_hdr->gso_size = m_buf->tso_segsz;
459 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
460 					+ m_buf->l4_len;
461 	} else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
462 		net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
463 		net_hdr->gso_size = m_buf->tso_segsz;
464 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
465 			m_buf->l4_len;
466 	} else {
467 		ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
468 		ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
469 		ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
470 	}
471 }
472 
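/*
 * Map one guest descriptor into host virtual address space, splitting it
 * into multiple buf_vector entries when the IOVA range is not contiguous
 * in the host mapping.
 */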
473 static __rte_always_inline int
474 map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
475 		struct buf_vector *buf_vec, uint16_t *vec_idx,
476 		uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
477 {
478 	uint16_t vec_id = *vec_idx;
479 
480 	while (desc_len) {
481 		uint64_t desc_addr;
482 		uint64_t desc_chunck_len = desc_len;
483 
484 		if (unlikely(vec_id >= BUF_VECTOR_MAX))
485 			return -1;
486 
487 		desc_addr = vhost_iova_to_vva(dev, vq,
488 				desc_iova,
489 				&desc_chunck_len,
490 				perm);
491 		if (unlikely(!desc_addr))
492 			return -1;
493 
494 		rte_prefetch0((void *)(uintptr_t)desc_addr);
495 
496 		buf_vec[vec_id].buf_iova = desc_iova;
497 		buf_vec[vec_id].buf_addr = desc_addr;
498 		buf_vec[vec_id].buf_len  = desc_chunck_len;
499 
500 		desc_len -= desc_chunck_len;
501 		desc_iova += desc_chunck_len;
502 		vec_id++;
503 	}
504 	*vec_idx = vec_id;
505 
506 	return 0;
507 }
508 
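/*
 * Walk one descriptor chain of the split ring (following an indirect
 * table if present) and fill buf_vec with the host-mapped buffers of the
 * chain.
 */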
509 static __rte_always_inline int
510 fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
511 			 uint32_t avail_idx, uint16_t *vec_idx,
512 			 struct buf_vector *buf_vec, uint16_t *desc_chain_head,
513 			 uint32_t *desc_chain_len, uint8_t perm)
514 {
515 	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
516 	uint16_t vec_id = *vec_idx;
517 	uint32_t len    = 0;
518 	uint64_t dlen;
519 	uint32_t nr_descs = vq->size;
520 	uint32_t cnt    = 0;
521 	struct vring_desc *descs = vq->desc;
522 	struct vring_desc *idesc = NULL;
523 
524 	if (unlikely(idx >= vq->size))
525 		return -1;
526 
527 	*desc_chain_head = idx;
528 
529 	if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
530 		dlen = vq->desc[idx].len;
531 		nr_descs = dlen / sizeof(struct vring_desc);
532 		if (unlikely(nr_descs > vq->size))
533 			return -1;
534 
535 		descs = (struct vring_desc *)(uintptr_t)
536 			vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
537 						&dlen,
538 						VHOST_ACCESS_RO);
539 		if (unlikely(!descs))
540 			return -1;
541 
542 		if (unlikely(dlen < vq->desc[idx].len)) {
543 			/*
544 			 * The indirect desc table is not contiguous
545 			 * in process VA space, so we have to copy it.
546 			 */
547 			idesc = vhost_alloc_copy_ind_table(dev, vq,
548 					vq->desc[idx].addr, vq->desc[idx].len);
549 			if (unlikely(!idesc))
550 				return -1;
551 
552 			descs = idesc;
553 		}
554 
555 		idx = 0;
556 	}
557 
558 	while (1) {
559 		if (unlikely(idx >= nr_descs || cnt++ >= nr_descs)) {
560 			free_ind_table(idesc);
561 			return -1;
562 		}
563 
564 		dlen = descs[idx].len;
565 		len += dlen;
566 
567 		if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
568 						descs[idx].addr, dlen,
569 						perm))) {
570 			free_ind_table(idesc);
571 			return -1;
572 		}
573 
574 		if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
575 			break;
576 
577 		idx = descs[idx].next;
578 	}
579 
580 	*desc_chain_len = len;
581 	*vec_idx = vec_id;
582 
583 	if (unlikely(!!idesc))
584 		free_ind_table(idesc);
585 
586 	return 0;
587 }
588 
589 /*
590  * Returns -1 on failure, 0 on success
591  */
592 static inline int
593 reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
594 				uint32_t size, struct buf_vector *buf_vec,
595 				uint16_t *num_buffers, uint16_t avail_head,
596 				uint16_t *nr_vec)
597 {
598 	uint16_t cur_idx;
599 	uint16_t vec_idx = 0;
600 	uint16_t max_tries, tries = 0;
601 
602 	uint16_t head_idx = 0;
603 	uint32_t len = 0;
604 
605 	*num_buffers = 0;
606 	cur_idx  = vq->last_avail_idx;
607 
608 	if (rxvq_is_mergeable(dev))
609 		max_tries = vq->size - 1;
610 	else
611 		max_tries = 1;
612 
613 	while (size > 0) {
614 		if (unlikely(cur_idx == avail_head))
615 			return -1;
616 		/*
617 		 * If we have tried all available ring items and still
618 		 * cannot get enough buffers, something abnormal has
619 		 * happened.
620 		 */
621 		if (unlikely(++tries > max_tries))
622 			return -1;
623 
624 		if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
625 						&vec_idx, buf_vec,
626 						&head_idx, &len,
627 						VHOST_ACCESS_RW) < 0))
628 			return -1;
629 		len = RTE_MIN(len, size);
630 		update_shadow_used_ring_split(vq, head_idx, len);
631 		size -= len;
632 
633 		cur_idx++;
634 		*num_buffers += 1;
635 	}
636 
637 	*nr_vec = vec_idx;
638 
639 	return 0;
640 }
641 
642 static __rte_always_inline int
643 fill_vec_buf_packed_indirect(struct virtio_net *dev,
644 			struct vhost_virtqueue *vq,
645 			struct vring_packed_desc *desc, uint16_t *vec_idx,
646 			struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
647 {
648 	uint16_t i;
649 	uint32_t nr_descs;
650 	uint16_t vec_id = *vec_idx;
651 	uint64_t dlen;
652 	struct vring_packed_desc *descs, *idescs = NULL;
653 
654 	dlen = desc->len;
655 	descs = (struct vring_packed_desc *)(uintptr_t)
656 		vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
657 	if (unlikely(!descs))
658 		return -1;
659 
660 	if (unlikely(dlen < desc->len)) {
661 		/*
662 		 * The indirect desc table is not contiguous
663 			 * in process VA space, so we have to copy it.
664 		 */
665 		idescs = vhost_alloc_copy_ind_table(dev,
666 				vq, desc->addr, desc->len);
667 		if (unlikely(!idescs))
668 			return -1;
669 
670 		descs = idescs;
671 	}
672 
673 	nr_descs =  desc->len / sizeof(struct vring_packed_desc);
674 	if (unlikely(nr_descs >= vq->size)) {
675 		free_ind_table(idescs);
676 		return -1;
677 	}
678 
679 	for (i = 0; i < nr_descs; i++) {
680 		if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
681 			free_ind_table(idescs);
682 			return -1;
683 		}
684 
685 		dlen = descs[i].len;
686 		*len += dlen;
687 		if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
688 						descs[i].addr, dlen,
689 						perm)))
690 			return -1;
691 	}
692 	*vec_idx = vec_id;
693 
694 	if (unlikely(!!idescs))
695 		free_ind_table(idescs);
696 
697 	return 0;
698 }
699 
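/*
 * Walk one descriptor chain of the packed ring starting at avail_idx and
 * fill buf_vec with the host-mapped buffers, returning the buffer id and
 * the number of descriptors consumed.
 */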
700 static __rte_always_inline int
701 fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
702 				uint16_t avail_idx, uint16_t *desc_count,
703 				struct buf_vector *buf_vec, uint16_t *vec_idx,
704 				uint16_t *buf_id, uint32_t *len, uint8_t perm)
705 {
706 	bool wrap_counter = vq->avail_wrap_counter;
707 	struct vring_packed_desc *descs = vq->desc_packed;
708 	uint16_t vec_id = *vec_idx;
709 	uint64_t dlen;
710 
711 	if (avail_idx < vq->last_avail_idx)
712 		wrap_counter ^= 1;
713 
714 	/*
715 	 * Perform a load-acquire barrier in desc_is_avail to
716 	 * enforce the ordering between desc flags and desc
717 	 * content.
718 	 */
719 	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
720 		return -1;
721 
722 	*desc_count = 0;
723 	*len = 0;
724 
725 	while (1) {
726 		if (unlikely(vec_id >= BUF_VECTOR_MAX))
727 			return -1;
728 
729 		if (unlikely(*desc_count >= vq->size))
730 			return -1;
731 
732 		*desc_count += 1;
733 		*buf_id = descs[avail_idx].id;
734 
735 		if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
736 			if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
737 							&descs[avail_idx],
738 							&vec_id, buf_vec,
739 							len, perm) < 0))
740 				return -1;
741 		} else {
742 			dlen = descs[avail_idx].len;
743 			*len += dlen;
744 
745 			if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
746 							descs[avail_idx].addr,
747 							dlen,
748 							perm)))
749 				return -1;
750 		}
751 
752 		if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
753 			break;
754 
755 		if (++avail_idx >= vq->size) {
756 			avail_idx -= vq->size;
757 			wrap_counter ^= 1;
758 		}
759 	}
760 
761 	*vec_idx = vec_id;
762 
763 	return 0;
764 }
765 
766 static __rte_noinline void
767 copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
768 		struct buf_vector *buf_vec,
769 		struct virtio_net_hdr_mrg_rxbuf *hdr)
770 {
771 	uint64_t len;
772 	uint64_t remain = dev->vhost_hlen;
773 	uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
774 	uint64_t iova = buf_vec->buf_iova;
775 
776 	while (remain) {
777 		len = RTE_MIN(remain,
778 				buf_vec->buf_len);
779 		dst = buf_vec->buf_addr;
780 		rte_memcpy((void *)(uintptr_t)dst,
781 				(void *)(uintptr_t)src,
782 				len);
783 
784 		PRINT_PACKET(dev, (uintptr_t)dst,
785 				(uint32_t)len, 0);
786 		vhost_log_cache_write_iova(dev, vq,
787 				iova, len);
788 
789 		remain -= len;
790 		iova += len;
791 		src += len;
792 		buf_vec++;
793 	}
794 }
795 
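/*
 * Copy an mbuf chain into the guest buffers described by buf_vec,
 * prepending the virtio-net header. Small copies are queued in
 * vq->batch_copy_elems and performed later by do_data_copy_enqueue().
 */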
796 static __rte_always_inline int
797 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
798 			    struct rte_mbuf *m, struct buf_vector *buf_vec,
799 			    uint16_t nr_vec, uint16_t num_buffers)
800 {
801 	uint32_t vec_idx = 0;
802 	uint32_t mbuf_offset, mbuf_avail;
803 	uint32_t buf_offset, buf_avail;
804 	uint64_t buf_addr, buf_iova, buf_len;
805 	uint32_t cpy_len;
806 	uint64_t hdr_addr;
807 	struct rte_mbuf *hdr_mbuf;
808 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
809 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
810 	int error = 0;
811 
812 	if (unlikely(m == NULL)) {
813 		error = -1;
814 		goto out;
815 	}
816 
817 	buf_addr = buf_vec[vec_idx].buf_addr;
818 	buf_iova = buf_vec[vec_idx].buf_iova;
819 	buf_len = buf_vec[vec_idx].buf_len;
820 
821 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
822 		error = -1;
823 		goto out;
824 	}
825 
826 	hdr_mbuf = m;
827 	hdr_addr = buf_addr;
828 	if (unlikely(buf_len < dev->vhost_hlen)) {
829 		memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
830 		hdr = &tmp_hdr;
831 	} else
832 		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
833 
834 	VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
835 		dev->vid, num_buffers);
836 
837 	if (unlikely(buf_len < dev->vhost_hlen)) {
838 		buf_offset = dev->vhost_hlen - buf_len;
839 		vec_idx++;
840 		buf_addr = buf_vec[vec_idx].buf_addr;
841 		buf_iova = buf_vec[vec_idx].buf_iova;
842 		buf_len = buf_vec[vec_idx].buf_len;
843 		buf_avail = buf_len - buf_offset;
844 	} else {
845 		buf_offset = dev->vhost_hlen;
846 		buf_avail = buf_len - dev->vhost_hlen;
847 	}
848 
849 	mbuf_avail  = rte_pktmbuf_data_len(m);
850 	mbuf_offset = 0;
851 	while (mbuf_avail != 0 || m->next != NULL) {
852 		/* done with current buf, get the next one */
853 		if (buf_avail == 0) {
854 			vec_idx++;
855 			if (unlikely(vec_idx >= nr_vec)) {
856 				error = -1;
857 				goto out;
858 			}
859 
860 			buf_addr = buf_vec[vec_idx].buf_addr;
861 			buf_iova = buf_vec[vec_idx].buf_iova;
862 			buf_len = buf_vec[vec_idx].buf_len;
863 
864 			buf_offset = 0;
865 			buf_avail  = buf_len;
866 		}
867 
868 		/* done with current mbuf, get the next one */
869 		if (mbuf_avail == 0) {
870 			m = m->next;
871 
872 			mbuf_offset = 0;
873 			mbuf_avail  = rte_pktmbuf_data_len(m);
874 		}
875 
876 		if (hdr_addr) {
877 			virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
878 			if (rxvq_is_mergeable(dev))
879 				ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
880 						num_buffers);
881 
882 			if (unlikely(hdr == &tmp_hdr)) {
883 				copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
884 			} else {
885 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
886 						dev->vhost_hlen, 0);
887 				vhost_log_cache_write_iova(dev, vq,
888 						buf_vec[0].buf_iova,
889 						dev->vhost_hlen);
890 			}
891 
892 			hdr_addr = 0;
893 		}
894 
895 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
896 
897 		if (likely(cpy_len > MAX_BATCH_LEN ||
898 					vq->batch_copy_nb_elems >= vq->size)) {
899 			rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
900 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
901 				cpy_len);
902 			vhost_log_cache_write_iova(dev, vq,
903 						   buf_iova + buf_offset,
904 						   cpy_len);
905 			PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
906 				cpy_len, 0);
907 		} else {
908 			batch_copy[vq->batch_copy_nb_elems].dst =
909 				(void *)((uintptr_t)(buf_addr + buf_offset));
910 			batch_copy[vq->batch_copy_nb_elems].src =
911 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
912 			batch_copy[vq->batch_copy_nb_elems].log_addr =
913 				buf_iova + buf_offset;
914 			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
915 			vq->batch_copy_nb_elems++;
916 		}
917 
918 		mbuf_avail  -= cpy_len;
919 		mbuf_offset += cpy_len;
920 		buf_avail  -= cpy_len;
921 		buf_offset += cpy_len;
922 	}
923 
924 out:
925 
926 	return error;
927 }
928 
929 static __rte_always_inline void
930 async_fill_vec(struct iovec *v, void *base, size_t len)
931 {
932 	v->iov_base = base;
933 	v->iov_len = len;
934 }
935 
936 static __rte_always_inline void
937 async_fill_iter(struct rte_vhost_iov_iter *it, size_t count,
938 	struct iovec *vec, unsigned long nr_seg)
939 {
940 	it->offset = 0;
941 	it->count = count;
942 
943 	if (count) {
944 		it->iov = vec;
945 		it->nr_segs = nr_seg;
946 	} else {
947 		it->iov = 0;
948 		it->nr_segs = 0;
949 	}
950 }
951 
952 static __rte_always_inline void
953 async_fill_desc(struct rte_vhost_async_desc *desc,
954 	struct rte_vhost_iov_iter *src, struct rte_vhost_iov_iter *dst)
955 {
956 	desc->src = src;
957 	desc->dst = dst;
958 }
959 
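/*
 * Asynchronous variant of copy_mbuf_to_desc(): segments at or above the
 * async copy threshold are described as src/dst iovecs for the DMA
 * engine, while the remainder is copied by the CPU (batched when
 * possible).
 */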
960 static __rte_always_inline int
961 async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
962 			struct rte_mbuf *m, struct buf_vector *buf_vec,
963 			uint16_t nr_vec, uint16_t num_buffers,
964 			struct iovec *src_iovec, struct iovec *dst_iovec,
965 			struct rte_vhost_iov_iter *src_it,
966 			struct rte_vhost_iov_iter *dst_it)
967 {
968 	uint32_t vec_idx = 0;
969 	uint32_t mbuf_offset, mbuf_avail;
970 	uint32_t buf_offset, buf_avail;
971 	uint64_t buf_addr, buf_iova, buf_len;
972 	uint32_t cpy_len, cpy_threshold;
973 	uint64_t hdr_addr;
974 	struct rte_mbuf *hdr_mbuf;
975 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
976 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
977 	int error = 0;
978 	uint64_t mapped_len;
979 
980 	uint32_t tlen = 0;
981 	int tvec_idx = 0;
982 	void *hpa;
983 
984 	if (unlikely(m == NULL)) {
985 		error = -1;
986 		goto out;
987 	}
988 
989 	cpy_threshold = vq->async_threshold;
990 
991 	buf_addr = buf_vec[vec_idx].buf_addr;
992 	buf_iova = buf_vec[vec_idx].buf_iova;
993 	buf_len = buf_vec[vec_idx].buf_len;
994 
995 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
996 		error = -1;
997 		goto out;
998 	}
999 
1000 	hdr_mbuf = m;
1001 	hdr_addr = buf_addr;
1002 	if (unlikely(buf_len < dev->vhost_hlen)) {
1003 		memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
1004 		hdr = &tmp_hdr;
1005 	} else
1006 		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
1007 
1008 	VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
1009 		dev->vid, num_buffers);
1010 
1011 	if (unlikely(buf_len < dev->vhost_hlen)) {
1012 		buf_offset = dev->vhost_hlen - buf_len;
1013 		vec_idx++;
1014 		buf_addr = buf_vec[vec_idx].buf_addr;
1015 		buf_iova = buf_vec[vec_idx].buf_iova;
1016 		buf_len = buf_vec[vec_idx].buf_len;
1017 		buf_avail = buf_len - buf_offset;
1018 	} else {
1019 		buf_offset = dev->vhost_hlen;
1020 		buf_avail = buf_len - dev->vhost_hlen;
1021 	}
1022 
1023 	mbuf_avail  = rte_pktmbuf_data_len(m);
1024 	mbuf_offset = 0;
1025 
1026 	while (mbuf_avail != 0 || m->next != NULL) {
1027 		/* done with current buf, get the next one */
1028 		if (buf_avail == 0) {
1029 			vec_idx++;
1030 			if (unlikely(vec_idx >= nr_vec)) {
1031 				error = -1;
1032 				goto out;
1033 			}
1034 
1035 			buf_addr = buf_vec[vec_idx].buf_addr;
1036 			buf_iova = buf_vec[vec_idx].buf_iova;
1037 			buf_len = buf_vec[vec_idx].buf_len;
1038 
1039 			buf_offset = 0;
1040 			buf_avail  = buf_len;
1041 		}
1042 
1043 		/* done with current mbuf, get the next one */
1044 		if (mbuf_avail == 0) {
1045 			m = m->next;
1046 
1047 			mbuf_offset = 0;
1048 			mbuf_avail  = rte_pktmbuf_data_len(m);
1049 		}
1050 
1051 		if (hdr_addr) {
1052 			virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
1053 			if (rxvq_is_mergeable(dev))
1054 				ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
1055 						num_buffers);
1056 
1057 			if (unlikely(hdr == &tmp_hdr)) {
1058 				copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
1059 			} else {
1060 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
1061 						dev->vhost_hlen, 0);
1062 				vhost_log_cache_write_iova(dev, vq,
1063 						buf_vec[0].buf_iova,
1064 						dev->vhost_hlen);
1065 			}
1066 
1067 			hdr_addr = 0;
1068 		}
1069 
1070 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
1071 
1072 		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
1073 			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
1074 					buf_iova + buf_offset,
1075 					cpy_len, &mapped_len);
1076 
1077 			if (unlikely(!hpa || mapped_len < cpy_threshold))
1078 				break;
1079 
1080 			async_fill_vec(src_iovec + tvec_idx,
1081 				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
1082 				mbuf_offset), (size_t)mapped_len);
1083 
1084 			async_fill_vec(dst_iovec + tvec_idx,
1085 					hpa, (size_t)mapped_len);
1086 
1087 			tlen += (uint32_t)mapped_len;
1088 			cpy_len -= (uint32_t)mapped_len;
1089 			mbuf_avail  -= (uint32_t)mapped_len;
1090 			mbuf_offset += (uint32_t)mapped_len;
1091 			buf_avail  -= (uint32_t)mapped_len;
1092 			buf_offset += (uint32_t)mapped_len;
1093 			tvec_idx++;
1094 		}
1095 
1096 		if (likely(cpy_len)) {
1097 			if (unlikely(vq->batch_copy_nb_elems >= vq->size)) {
1098 				rte_memcpy(
1099 				(void *)((uintptr_t)(buf_addr + buf_offset)),
1100 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
1101 				cpy_len);
1102 
1103 				PRINT_PACKET(dev,
1104 					(uintptr_t)(buf_addr + buf_offset),
1105 					cpy_len, 0);
1106 			} else {
1107 				batch_copy[vq->batch_copy_nb_elems].dst =
1108 				(void *)((uintptr_t)(buf_addr + buf_offset));
1109 				batch_copy[vq->batch_copy_nb_elems].src =
1110 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
1111 				batch_copy[vq->batch_copy_nb_elems].log_addr =
1112 					buf_iova + buf_offset;
1113 				batch_copy[vq->batch_copy_nb_elems].len =
1114 					cpy_len;
1115 				vq->batch_copy_nb_elems++;
1116 			}
1117 
1118 			mbuf_avail  -= cpy_len;
1119 			mbuf_offset += cpy_len;
1120 			buf_avail  -= cpy_len;
1121 			buf_offset += cpy_len;
1122 		}
1123 
1124 	}
1125 
1126 out:
1127 	if (tlen) {
1128 		async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
1129 		async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
1130 	} else {
1131 		src_it->count = 0;
1132 	}
1133 
1134 	return error;
1135 }
1136 
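/*
 * Enqueue a single mbuf into the packed ring, reserving as many
 * descriptor chains as needed (mergeable Rx) and recording them in the
 * shadow used ring.
 */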
1137 static __rte_always_inline int
1138 vhost_enqueue_single_packed(struct virtio_net *dev,
1139 			    struct vhost_virtqueue *vq,
1140 			    struct rte_mbuf *pkt,
1141 			    struct buf_vector *buf_vec,
1142 			    uint16_t *nr_descs)
1143 {
1144 	uint16_t nr_vec = 0;
1145 	uint16_t avail_idx = vq->last_avail_idx;
1146 	uint16_t max_tries, tries = 0;
1147 	uint16_t buf_id = 0;
1148 	uint32_t len = 0;
1149 	uint16_t desc_count;
1150 	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1151 	uint16_t num_buffers = 0;
1152 	uint32_t buffer_len[vq->size];
1153 	uint16_t buffer_buf_id[vq->size];
1154 	uint16_t buffer_desc_count[vq->size];
1155 
1156 	if (rxvq_is_mergeable(dev))
1157 		max_tries = vq->size - 1;
1158 	else
1159 		max_tries = 1;
1160 
1161 	while (size > 0) {
1162 		/*
1163 		 * If we have tried all available ring items and still
1164 		 * cannot get enough buffers, something abnormal has
1165 		 * happened.
1166 		 */
1167 		if (unlikely(++tries > max_tries))
1168 			return -1;
1169 
1170 		if (unlikely(fill_vec_buf_packed(dev, vq,
1171 						avail_idx, &desc_count,
1172 						buf_vec, &nr_vec,
1173 						&buf_id, &len,
1174 						VHOST_ACCESS_RW) < 0))
1175 			return -1;
1176 
1177 		len = RTE_MIN(len, size);
1178 		size -= len;
1179 
1180 		buffer_len[num_buffers] = len;
1181 		buffer_buf_id[num_buffers] = buf_id;
1182 		buffer_desc_count[num_buffers] = desc_count;
1183 		num_buffers += 1;
1184 
1185 		*nr_descs += desc_count;
1186 		avail_idx += desc_count;
1187 		if (avail_idx >= vq->size)
1188 			avail_idx -= vq->size;
1189 	}
1190 
1191 	if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0)
1192 		return -1;
1193 
1194 	vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id,
1195 					   buffer_desc_count, num_buffers);
1196 
1197 	return 0;
1198 }
1199 
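/*
 * Enqueue a burst of mbufs into a split virtqueue using synchronous CPU
 * copies, then flush the shadow used ring and kick the guest if needed.
 */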
1200 static __rte_noinline uint32_t
1201 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
1202 	struct rte_mbuf **pkts, uint32_t count)
1203 {
1204 	uint32_t pkt_idx = 0;
1205 	uint16_t num_buffers;
1206 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1207 	uint16_t avail_head;
1208 
1209 	/*
1210 	 * The ordering between avail index and
1211 	 * desc reads needs to be enforced.
1212 	 */
1213 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1214 
1215 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1216 
1217 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1218 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1219 		uint16_t nr_vec = 0;
1220 
1221 		if (unlikely(reserve_avail_buf_split(dev, vq,
1222 						pkt_len, buf_vec, &num_buffers,
1223 						avail_head, &nr_vec) < 0)) {
1224 			VHOST_LOG_DATA(DEBUG,
1225 				"(%d) failed to get enough desc from vring\n",
1226 				dev->vid);
1227 			vq->shadow_used_idx -= num_buffers;
1228 			break;
1229 		}
1230 
1231 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1232 			dev->vid, vq->last_avail_idx,
1233 			vq->last_avail_idx + num_buffers);
1234 
1235 		if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
1236 						buf_vec, nr_vec,
1237 						num_buffers) < 0) {
1238 			vq->shadow_used_idx -= num_buffers;
1239 			break;
1240 		}
1241 
1242 		vq->last_avail_idx += num_buffers;
1243 	}
1244 
1245 	do_data_copy_enqueue(dev, vq);
1246 
1247 	if (likely(vq->shadow_used_idx)) {
1248 		flush_shadow_used_ring_split(dev, vq);
1249 		vhost_vring_call_split(dev, vq);
1250 	}
1251 
1252 	return pkt_idx;
1253 }
1254 
1255 static __rte_always_inline int
1256 virtio_dev_rx_sync_batch_check(struct virtio_net *dev,
1257 			   struct vhost_virtqueue *vq,
1258 			   struct rte_mbuf **pkts,
1259 			   uint64_t *desc_addrs,
1260 			   uint64_t *lens)
1261 {
1262 	bool wrap_counter = vq->avail_wrap_counter;
1263 	struct vring_packed_desc *descs = vq->desc_packed;
1264 	uint16_t avail_idx = vq->last_avail_idx;
1265 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1266 	uint16_t i;
1267 
1268 	if (unlikely(avail_idx & PACKED_BATCH_MASK))
1269 		return -1;
1270 
1271 	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
1272 		return -1;
1273 
1274 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1275 		if (unlikely(pkts[i]->next != NULL))
1276 			return -1;
1277 		if (unlikely(!desc_is_avail(&descs[avail_idx + i],
1278 					    wrap_counter)))
1279 			return -1;
1280 	}
1281 
1282 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1283 		lens[i] = descs[avail_idx + i].len;
1284 
1285 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1286 		if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
1287 			return -1;
1288 	}
1289 
1290 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1291 		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
1292 						  descs[avail_idx + i].addr,
1293 						  &lens[i],
1294 						  VHOST_ACCESS_RW);
1295 
1296 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1297 		if (unlikely(!desc_addrs[i]))
1298 			return -1;
1299 		if (unlikely(lens[i] != descs[avail_idx + i].len))
1300 			return -1;
1301 	}
1302 
1303 	return 0;
1304 }
1305 
1306 static __rte_always_inline int
1307 virtio_dev_rx_async_batch_check(struct virtio_net *dev,
1308 			   struct vhost_virtqueue *vq,
1309 			   struct rte_mbuf **pkts,
1310 			   uint64_t *desc_addrs,
1311 			   uint64_t *lens)
1312 {
1313 	bool wrap_counter = vq->avail_wrap_counter;
1314 	struct vring_packed_desc *descs = vq->desc_packed;
1315 	uint16_t avail_idx = vq->last_avail_idx;
1316 	uint16_t used_idx = vq->last_used_idx;
1317 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1318 	uint32_t cpy_threshold = vq->async_threshold;
1319 	uint16_t i;
1320 
1321 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1322 		if (unlikely(pkts[i]->data_len >= cpy_threshold))
1323 			return -1;
1324 	}
1325 
1326 	if (unlikely(avail_idx & PACKED_BATCH_MASK))
1327 		return -1;
1328 
1329 	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
1330 		return -1;
1331 
1332 	if (unlikely((used_idx + PACKED_BATCH_SIZE) > vq->size))
1333 		return -1;
1334 
1335 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1336 		if (unlikely(pkts[i]->next != NULL))
1337 			return -1;
1338 		if (unlikely(!desc_is_avail(&descs[avail_idx + i],
1339 					    wrap_counter)))
1340 			return -1;
1341 	}
1342 
1343 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1344 		lens[i] = descs[avail_idx + i].len;
1345 
1346 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1347 		if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
1348 			return -1;
1349 	}
1350 
1351 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1352 		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
1353 						  descs[avail_idx + i].addr,
1354 						  &lens[i],
1355 						  VHOST_ACCESS_RW);
1356 
1357 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1358 		if (unlikely(!desc_addrs[i]))
1359 			return -1;
1360 		if (unlikely(lens[i] != descs[avail_idx + i].len))
1361 			return -1;
1362 	}
1363 
1364 	return 0;
1365 }
1366 
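/*
 * Copy a full batch of mbufs into the packed-ring buffers validated by
 * the batch-check helpers, fill their virtio-net headers and flush the
 * batch to the used ring.
 */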
1367 static __rte_always_inline void
1368 virtio_dev_rx_batch_packed_copy(struct virtio_net *dev,
1369 			   struct vhost_virtqueue *vq,
1370 			   struct rte_mbuf **pkts,
1371 			   uint64_t *desc_addrs,
1372 			   uint64_t *lens)
1373 {
1374 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1375 	struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE];
1376 	struct vring_packed_desc *descs = vq->desc_packed;
1377 	uint16_t avail_idx = vq->last_avail_idx;
1378 	uint16_t ids[PACKED_BATCH_SIZE];
1379 	uint16_t i;
1380 
1381 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1382 		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
1383 		hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)
1384 					(uintptr_t)desc_addrs[i];
1385 		lens[i] = pkts[i]->pkt_len +
1386 			sizeof(struct virtio_net_hdr_mrg_rxbuf);
1387 	}
1388 
1389 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1390 		virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr);
1391 
1392 	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
1393 
1394 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1395 		rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset),
1396 			   rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
1397 			   pkts[i]->pkt_len);
1398 	}
1399 
1400 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1401 		vhost_log_cache_write_iova(dev, vq, descs[avail_idx + i].addr,
1402 					   lens[i]);
1403 
1404 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1405 		ids[i] = descs[avail_idx + i].id;
1406 
1407 	vhost_flush_enqueue_batch_packed(dev, vq, lens, ids);
1408 }
1409 
1410 static __rte_always_inline int
1411 virtio_dev_rx_sync_batch_packed(struct virtio_net *dev,
1412 			   struct vhost_virtqueue *vq,
1413 			   struct rte_mbuf **pkts)
1414 {
1415 	uint64_t desc_addrs[PACKED_BATCH_SIZE];
1416 	uint64_t lens[PACKED_BATCH_SIZE];
1417 
1418 	if (virtio_dev_rx_sync_batch_check(dev, vq, pkts, desc_addrs, lens) == -1)
1419 		return -1;
1420 
1421 	if (vq->shadow_used_idx) {
1422 		do_data_copy_enqueue(dev, vq);
1423 		vhost_flush_enqueue_shadow_packed(dev, vq);
1424 	}
1425 
1426 	virtio_dev_rx_batch_packed_copy(dev, vq, pkts, desc_addrs, lens);
1427 
1428 	return 0;
1429 }
1430 
1431 static __rte_always_inline int
1432 virtio_dev_rx_async_batch_packed(struct virtio_net *dev,
1433 			   struct vhost_virtqueue *vq,
1434 			   struct rte_mbuf **pkts,
1435 			   struct rte_mbuf **comp_pkts, uint32_t *pkt_done)
1436 {
1437 	uint16_t i;
1438 	uint64_t desc_addrs[PACKED_BATCH_SIZE];
1439 	uint64_t lens[PACKED_BATCH_SIZE];
1440 
1441 	if (virtio_dev_rx_async_batch_check(dev, vq, pkts, desc_addrs, lens) == -1)
1442 		return -1;
1443 
1444 	virtio_dev_rx_batch_packed_copy(dev, vq, pkts, desc_addrs, lens);
1445 
1446 	if (vq->shadow_used_idx) {
1447 		do_data_copy_enqueue(dev, vq);
1448 		vhost_flush_enqueue_shadow_packed(dev, vq);
1449 	}
1450 
1451 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1452 		comp_pkts[(*pkt_done)++] = pkts[i];
1453 
1454 	return 0;
1455 }
1456 
1457 static __rte_always_inline int16_t
1458 virtio_dev_rx_single_packed(struct virtio_net *dev,
1459 			    struct vhost_virtqueue *vq,
1460 			    struct rte_mbuf *pkt)
1461 {
1462 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1463 	uint16_t nr_descs = 0;
1464 
1465 	if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec,
1466 						 &nr_descs) < 0)) {
1467 		VHOST_LOG_DATA(DEBUG,
1468 				"(%d) failed to get enough desc from vring\n",
1469 				dev->vid);
1470 		return -1;
1471 	}
1472 
1473 	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1474 			dev->vid, vq->last_avail_idx,
1475 			vq->last_avail_idx + nr_descs);
1476 
1477 	vq_inc_last_avail_packed(vq, nr_descs);
1478 
1479 	return 0;
1480 }
1481 
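/*
 * Enqueue a burst of mbufs into a packed virtqueue, trying the batched
 * fast path first and falling back to single-packet enqueue.
 */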
1482 static __rte_noinline uint32_t
1483 virtio_dev_rx_packed(struct virtio_net *dev,
1484 		     struct vhost_virtqueue *__rte_restrict vq,
1485 		     struct rte_mbuf **__rte_restrict pkts,
1486 		     uint32_t count)
1487 {
1488 	uint32_t pkt_idx = 0;
1489 
1490 	do {
1491 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1492 
1493 		if (count - pkt_idx >= PACKED_BATCH_SIZE) {
1494 			if (!virtio_dev_rx_sync_batch_packed(dev, vq,
1495 							&pkts[pkt_idx])) {
1496 				pkt_idx += PACKED_BATCH_SIZE;
1497 				continue;
1498 			}
1499 		}
1500 
1501 		if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]))
1502 			break;
1503 		pkt_idx++;
1504 
1505 	} while (pkt_idx < count);
1506 
1507 	if (vq->shadow_used_idx) {
1508 		do_data_copy_enqueue(dev, vq);
1509 		vhost_flush_enqueue_shadow_packed(dev, vq);
1510 	}
1511 
1512 	if (pkt_idx)
1513 		vhost_vring_call_packed(dev, vq);
1514 
1515 	return pkt_idx;
1516 }
1517 
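/*
 * Common enqueue entry point: validate the virtqueue index, take the
 * access and IOTLB locks, then dispatch to the split or packed
 * implementation.
 */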
1518 static __rte_always_inline uint32_t
1519 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
1520 	struct rte_mbuf **pkts, uint32_t count)
1521 {
1522 	struct vhost_virtqueue *vq;
1523 	uint32_t nb_tx = 0;
1524 
1525 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
1526 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
1527 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
1528 			dev->vid, __func__, queue_id);
1529 		return 0;
1530 	}
1531 
1532 	vq = dev->virtqueue[queue_id];
1533 
1534 	rte_spinlock_lock(&vq->access_lock);
1535 
1536 	if (unlikely(!vq->enabled))
1537 		goto out_access_unlock;
1538 
1539 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1540 		vhost_user_iotlb_rd_lock(vq);
1541 
1542 	if (unlikely(!vq->access_ok))
1543 		if (unlikely(vring_translate(dev, vq) < 0))
1544 			goto out;
1545 
1546 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1547 	if (count == 0)
1548 		goto out;
1549 
1550 	if (vq_is_packed(dev))
1551 		nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
1552 	else
1553 		nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
1554 
1555 out:
1556 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1557 		vhost_user_iotlb_rd_unlock(vq);
1558 
1559 out_access_unlock:
1560 	rte_spinlock_unlock(&vq->access_lock);
1561 
1562 	return nb_tx;
1563 }
1564 
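/*
 * Hedged usage sketch (illustration only, not part of this file): feeding
 * packets received from a NIC port into a vhost device. "port", "vid" and
 * the mempool behind the mbufs are assumed to be set up elsewhere; queue
 * index 0 is the first guest Rx queue.
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t nb_rx = rte_eth_rx_burst(port, 0, pkts, 32);
 *	uint16_t nb_tx = rte_vhost_enqueue_burst(vid, 0, pkts, nb_rx);
 *
 *	// free whatever the guest could not take
 *	while (nb_tx < nb_rx)
 *		rte_pktmbuf_free(pkts[nb_tx++]);
 */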
1565 uint16_t
1566 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
1567 	struct rte_mbuf **__rte_restrict pkts, uint16_t count)
1568 {
1569 	struct virtio_net *dev = get_device(vid);
1570 
1571 	if (!dev)
1572 		return 0;
1573 
1574 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1575 		VHOST_LOG_DATA(ERR,
1576 			"(%d) %s: built-in vhost net backend is disabled.\n",
1577 			dev->vid, __func__);
1578 		return 0;
1579 	}
1580 
1581 	return virtio_dev_rx(dev, queue_id, pkts, count);
1582 }
1583 
1584 static __rte_always_inline uint16_t
1585 virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
1586 	uint16_t vq_size, uint16_t n_inflight)
1587 {
1588 	return pkts_idx > n_inflight ? (pkts_idx - n_inflight) :
1589 		(vq_size - n_inflight + pkts_idx) % vq_size;
1590 }
1591 
1592 static __rte_always_inline void
1593 store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
1594 		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1595 {
1596 	size_t elem_size = sizeof(struct vring_used_elem);
1597 
1598 	if (d_idx + count <= ring_size) {
1599 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1600 	} else {
1601 		uint16_t size = ring_size - d_idx;
1602 
1603 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1604 		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1605 	}
1606 }
1607 
1608 static __rte_always_inline void
1609 store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
1610 		struct vring_used_elem_packed *d_ring,
1611 		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1612 {
1613 	size_t elem_size = sizeof(struct vring_used_elem_packed);
1614 
1615 	if (d_idx + count <= ring_size) {
1616 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1617 	} else {
1618 		uint16_t size = ring_size - d_idx;
1619 
1620 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1621 		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1622 	}
1623 }
1624 
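/*
 * Enqueue a burst of mbufs into a split virtqueue using the registered
 * async (DMA) channel. Packets copied entirely by the CPU are returned in
 * comp_pkts; the rest stay in flight until the application polls for
 * completions.
 */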
1625 static __rte_noinline uint32_t
1626 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
1627 	struct vhost_virtqueue *vq, uint16_t queue_id,
1628 	struct rte_mbuf **pkts, uint32_t count,
1629 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
1630 {
1631 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
1632 	uint16_t num_buffers;
1633 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1634 	uint16_t avail_head;
1635 
1636 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
1637 	struct iovec *vec_pool = vq->vec_pool;
1638 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
1639 	struct iovec *src_iovec = vec_pool;
1640 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
1641 	uint16_t slot_idx = 0;
1642 	uint16_t segs_await = 0;
1643 	uint16_t iovec_idx = 0, it_idx = 0;
1644 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
1645 	uint32_t n_pkts = 0, pkt_err = 0;
1646 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
1647 	struct {
1648 		uint16_t pkt_idx;
1649 		uint16_t last_avail_idx;
1650 	} async_pkts_log[MAX_PKT_BURST];
1651 
1652 	/*
1653 	 * The ordering between avail index and desc reads needs to be enforced.
1654 	 */
1655 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1656 
1657 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1658 
1659 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1660 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1661 		uint16_t nr_vec = 0;
1662 
1663 		if (unlikely(reserve_avail_buf_split(dev, vq,
1664 						pkt_len, buf_vec, &num_buffers,
1665 						avail_head, &nr_vec) < 0)) {
1666 			VHOST_LOG_DATA(DEBUG,
1667 				"(%d) failed to get enough desc from vring\n",
1668 				dev->vid);
1669 			vq->shadow_used_idx -= num_buffers;
1670 			break;
1671 		}
1672 
1673 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1674 			dev->vid, vq->last_avail_idx,
1675 			vq->last_avail_idx + num_buffers);
1676 
1677 		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers,
1678 				&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
1679 				&it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
1680 			vq->shadow_used_idx -= num_buffers;
1681 			break;
1682 		}
1683 
1684 		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
1685 			(vq->size - 1);
1686 		if (it_pool[it_idx].count) {
1687 			uint16_t from, to;
1688 
1689 			async_fill_desc(&tdes[pkt_burst_idx++],
1690 				&it_pool[it_idx], &it_pool[it_idx + 1]);
1691 			pkts_info[slot_idx].descs = num_buffers;
1692 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
1693 			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
1694 			async_pkts_log[num_async_pkts++].last_avail_idx =
1695 				vq->last_avail_idx;
1696 
1697 			iovec_idx += it_pool[it_idx].nr_segs;
1698 			it_idx += 2;
1699 
1700 			segs_await += it_pool[it_idx].nr_segs;
1701 
1702 			/*
1703 			 * recover shadow used ring and keep DMA-occupied
1704 			 * descriptors.
1705 			 */
1706 			from = vq->shadow_used_idx - num_buffers;
1707 			to = vq->async_desc_idx_split & (vq->size - 1);
1708 
1709 			store_dma_desc_info_split(vq->shadow_used_split,
1710 					vq->async_descs_split, vq->size, from, to, num_buffers);
1711 
1712 			vq->async_desc_idx_split += num_buffers;
1713 			vq->shadow_used_idx -= num_buffers;
1714 		} else
1715 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
1716 
1717 		vq->last_avail_idx += num_buffers;
1718 
1719 		/*
1720 		 * Conditions that trigger an async device transfer:
1721 		 * - the number of buffered packets reaches the transfer threshold
1722 		 * - fewer than BUF_VECTOR_MAX unused async iovec slots remain
1723 		 */
1724 		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
1725 			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
1726 			BUF_VECTOR_MAX))) {
1727 			n_pkts = vq->async_ops.transfer_data(dev->vid,
1728 					queue_id, tdes, 0, pkt_burst_idx);
1729 			iovec_idx = 0;
1730 			it_idx = 0;
1731 
1732 			segs_await = 0;
1733 			vq->async_pkts_inflight_n += n_pkts;
1734 
1735 			if (unlikely(n_pkts < pkt_burst_idx)) {
1736 				/*
1737 				 * Record the number of error packets here; the
1738 				 * actual error processing is done when the
1739 				 * application polls for completions.
1740 				 */
1741 				pkt_err = pkt_burst_idx - n_pkts;
1742 				pkt_burst_idx = 0;
1743 				break;
1744 			}
1745 
1746 			pkt_burst_idx = 0;
1747 		}
1748 	}
1749 
1750 	if (pkt_burst_idx) {
1751 		n_pkts = vq->async_ops.transfer_data(dev->vid,
1752 				queue_id, tdes, 0, pkt_burst_idx);
1753 		vq->async_pkts_inflight_n += n_pkts;
1754 
1755 		if (unlikely(n_pkts < pkt_burst_idx))
1756 			pkt_err = pkt_burst_idx - n_pkts;
1757 	}
1758 
1759 	do_data_copy_enqueue(dev, vq);
1760 
1761 	if (unlikely(pkt_err)) {
1762 		uint16_t num_descs = 0;
1763 
1764 		num_async_pkts -= pkt_err;
1765 		/* count the descriptors used by the DMA-error packets. */
1766 		while (pkt_err-- > 0) {
1767 			num_descs += pkts_info[slot_idx & (vq->size - 1)].descs;
1768 			slot_idx--;
1769 		}
1770 		vq->async_desc_idx_split -= num_descs;
1771 		/* recover shadow used ring and available ring */
1772 		vq->shadow_used_idx -= (vq->last_avail_idx -
1773 				async_pkts_log[num_async_pkts].last_avail_idx -
1774 				num_descs);
1775 		vq->last_avail_idx =
1776 			async_pkts_log[num_async_pkts].last_avail_idx;
1777 		pkt_idx = async_pkts_log[num_async_pkts].pkt_idx;
1778 		num_done_pkts = pkt_idx - num_async_pkts;
1779 	}
1780 
1781 	vq->async_pkts_idx += num_async_pkts;
1782 	*comp_count = num_done_pkts;
1783 
1784 	if (likely(vq->shadow_used_idx)) {
1785 		flush_shadow_used_ring_split(dev, vq);
1786 		vhost_vring_call_split(dev, vq);
1787 	}
1788 
1789 	return pkt_idx;
1790 }
1791 
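/*
 * Write a set of shadowed completions to the packed used ring. The head
 * descriptor's flags are stored last, after a release fence, so the guest
 * never observes a partially updated batch.
 */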
1792 static __rte_always_inline void
1793 vhost_update_used_packed(struct vhost_virtqueue *vq,
1794 			struct vring_used_elem_packed *shadow_ring,
1795 			uint16_t count)
1796 {
1797 	int i;
1798 	uint16_t used_idx = vq->last_used_idx;
1799 	uint16_t head_idx = vq->last_used_idx;
1800 	uint16_t head_flags = 0;
1801 
1802 	if (count == 0)
1803 		return;
1804 
1805 	/* Split loop in two to save memory barriers */
1806 	for (i = 0; i < count; i++) {
1807 		vq->desc_packed[used_idx].id = shadow_ring[i].id;
1808 		vq->desc_packed[used_idx].len = shadow_ring[i].len;
1809 
1810 		used_idx += shadow_ring[i].count;
1811 		if (used_idx >= vq->size)
1812 			used_idx -= vq->size;
1813 	}
1814 
1815 	/* The ordering for storing desc flags needs to be enforced. */
1816 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
1817 
1818 	for (i = 0; i < count; i++) {
1819 		uint16_t flags;
1820 
1821 		if (vq->shadow_used_packed[i].len)
1822 			flags = VRING_DESC_F_WRITE;
1823 		else
1824 			flags = 0;
1825 
1826 		if (vq->used_wrap_counter) {
1827 			flags |= VRING_DESC_F_USED;
1828 			flags |= VRING_DESC_F_AVAIL;
1829 		} else {
1830 			flags &= ~VRING_DESC_F_USED;
1831 			flags &= ~VRING_DESC_F_AVAIL;
1832 		}
1833 
1834 		if (i > 0) {
1835 			vq->desc_packed[vq->last_used_idx].flags = flags;
1836 		} else {
1837 			head_idx = vq->last_used_idx;
1838 			head_flags = flags;
1839 		}
1840 
1841 		vq_inc_last_used_packed(vq, shadow_ring[i].count);
1842 	}
1843 
1844 	vq->desc_packed[head_idx].flags = head_flags;
1845 }
1846 
1847 static __rte_always_inline int
1848 vhost_enqueue_async_single_packed(struct virtio_net *dev,
1849 			    struct vhost_virtqueue *vq,
1850 			    struct rte_mbuf *pkt,
1851 			    struct buf_vector *buf_vec,
1852 			    uint16_t *nr_descs,
1853 			    uint16_t *nr_buffers,
1854 			    struct vring_packed_desc *async_descs,
1855 			    struct iovec *src_iovec, struct iovec *dst_iovec,
1856 			    struct rte_vhost_iov_iter *src_it,
1857 			    struct rte_vhost_iov_iter *dst_it)
1858 {
1859 	uint16_t nr_vec = 0;
1860 	uint16_t avail_idx = vq->last_avail_idx;
1861 	uint16_t max_tries, tries = 0;
1862 	uint16_t buf_id = 0;
1863 	uint32_t len = 0;
1864 	uint16_t desc_count = 0;
1865 	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1866 	uint32_t buffer_len[vq->size];
1867 	uint16_t buffer_buf_id[vq->size];
1868 	uint16_t buffer_desc_count[vq->size];
1869 
1870 	if (rxvq_is_mergeable(dev))
1871 		max_tries = vq->size - 1;
1872 	else
1873 		max_tries = 1;
1874 
1875 	while (size > 0) {
1876 		/*
1877 		 * If we have tried all available ring items and still
1878 		 * cannot get enough buffers, something abnormal has
1879 		 * happened.
1880 		 */
1881 		if (unlikely(++tries > max_tries))
1882 			return -1;
1883 
1884 		if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx, &desc_count, buf_vec, &nr_vec,
1885 						&buf_id, &len, VHOST_ACCESS_RW) < 0))
1886 			return -1;
1887 
1888 		len = RTE_MIN(len, size);
1889 		size -= len;
1890 
1891 		buffer_len[*nr_buffers] = len;
1892 		buffer_buf_id[*nr_buffers] = buf_id;
1893 		buffer_desc_count[*nr_buffers] = desc_count;
1894 		*nr_buffers += 1;
1895 
1896 		*nr_descs += desc_count;
1897 		avail_idx += desc_count;
1898 		if (avail_idx >= vq->size)
1899 			avail_idx -= vq->size;
1900 	}
1901 
1902 	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, src_iovec, dst_iovec,
1903 			src_it, dst_it) < 0)
1904 		return -1;
1905 	/* store descriptors for DMA */
1906 	if (avail_idx >= *nr_descs) {
1907 		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
1908 			*nr_descs * sizeof(struct vring_packed_desc));
1909 	} else {
1910 		uint16_t nr_copy = vq->size - vq->last_avail_idx;
1911 
1912 		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
1913 			nr_copy * sizeof(struct vring_packed_desc));
1914 		rte_memcpy(async_descs + nr_copy, vq->desc_packed,
1915 			(*nr_descs - nr_copy) * sizeof(struct vring_packed_desc));
1916 	}
1917 
1918 	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
1919 
1920 	return 0;
1921 }
1922 
1923 static __rte_always_inline int16_t
1924 virtio_dev_rx_async_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
1925 			    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers,
1926 			    struct vring_packed_desc *async_descs,
1927 			    struct iovec *src_iovec, struct iovec *dst_iovec,
1928 			    struct rte_vhost_iov_iter *src_it, struct rte_vhost_iov_iter *dst_it)
1929 {
1930 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1931 
1932 	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec, nr_descs, nr_buffers,
1933 						 async_descs, src_iovec, dst_iovec,
1934 						 src_it, dst_it) < 0)) {
1935 		VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n", dev->vid);
1936 		return -1;
1937 	}
1938 
1939 	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1940 			dev->vid, vq->last_avail_idx, vq->last_avail_idx + *nr_descs);
1941 
1942 	return 0;
1943 }
1944 
1945 static __rte_always_inline void
1946 dma_error_handler_packed(struct vhost_virtqueue *vq, struct vring_packed_desc *async_descs,
1947 			uint16_t async_descs_idx, uint16_t slot_idx, uint32_t nr_err,
1948 			uint32_t *pkt_idx, uint32_t *num_async_pkts, uint32_t *num_done_pkts)
1949 {
1950 	uint16_t descs_err = 0;
1951 	uint16_t buffers_err = 0;
1952 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
1953 
1954 	*num_async_pkts -= nr_err;
1955 	*pkt_idx -= nr_err;
1956 	/* calculate the sum of buffers and descs of DMA-error packets. */
1957 	while (nr_err-- > 0) {
1958 		descs_err += pkts_info[slot_idx % vq->size].descs;
1959 		buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
1960 		slot_idx--;
1961 	}
1962 
1963 	vq->async_buffer_idx_packed -= buffers_err;
1964 
1965 	if (vq->last_avail_idx >= descs_err) {
1966 		vq->last_avail_idx -= descs_err;
1967 
1968 		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
1969 			&async_descs[async_descs_idx - descs_err],
1970 			descs_err * sizeof(struct vring_packed_desc));
1971 	} else {
1972 		uint16_t nr_copy;
1973 
1974 		vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
1975 		nr_copy = vq->size - vq->last_avail_idx;
1976 		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
1977 			&async_descs[async_descs_idx - descs_err],
1978 			nr_copy * sizeof(struct vring_packed_desc));
1979 		descs_err -= nr_copy;
1980 		rte_memcpy(&vq->desc_packed[0], &async_descs[async_descs_idx - descs_err],
1981 			descs_err * sizeof(struct vring_packed_desc));
1982 		vq->avail_wrap_counter ^= 1;
1983 	}
1984 
1985 	*num_done_pkts = *pkt_idx - *num_async_pkts;
1986 }
1987 
1988 static __rte_noinline uint32_t
1989 virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
1990 	struct vhost_virtqueue *vq, uint16_t queue_id,
1991 	struct rte_mbuf **pkts, uint32_t count,
1992 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
1993 {
1994 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
1995 	uint32_t remained = count;
1996 	uint16_t async_descs_idx = 0;
1997 	uint16_t num_buffers;
1998 	uint16_t num_descs;
1999 
2000 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
2001 	struct iovec *vec_pool = vq->vec_pool;
2002 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
2003 	struct iovec *src_iovec = vec_pool;
2004 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
2005 	uint16_t slot_idx = 0;
2006 	uint16_t segs_await = 0;
2007 	uint16_t iovec_idx = 0, it_idx = 0;
2008 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
2009 	uint32_t n_pkts = 0, pkt_err = 0;
2010 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
2011 	struct vring_packed_desc async_descs[vq->size];
2012 
2013 	do {
2014 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
2015 		if (remained >= PACKED_BATCH_SIZE) {
2016 			if (!virtio_dev_rx_async_batch_packed(dev, vq,
2017 				&pkts[pkt_idx], comp_pkts, &num_done_pkts)) {
2018 				pkt_idx += PACKED_BATCH_SIZE;
2019 				remained -= PACKED_BATCH_SIZE;
2020 				continue;
2021 			}
2022 		}
2023 
2024 		num_buffers = 0;
2025 		num_descs = 0;
2026 		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
2027 						&num_descs, &num_buffers,
2028 						&async_descs[async_descs_idx],
2029 						&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
2030 						&it_pool[it_idx], &it_pool[it_idx + 1]) < 0))
2031 			break;
2032 
2033 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
2034 			dev->vid, vq->last_avail_idx,
2035 			vq->last_avail_idx + num_descs);
2036 
2037 		slot_idx = (vq->async_pkts_idx + num_async_pkts) % vq->size;
2038 		if (it_pool[it_idx].count) {
2039 			uint16_t from;
2040 
2041 			async_descs_idx += num_descs;
2042 			async_fill_desc(&tdes[pkt_burst_idx++],
2043 				&it_pool[it_idx], &it_pool[it_idx + 1]);
2044 			pkts_info[slot_idx].descs = num_descs;
2045 			pkts_info[slot_idx].nr_buffers = num_buffers;
2046 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
2047 			num_async_pkts++;
2048 			iovec_idx += it_pool[it_idx].nr_segs;
2049 			segs_await += it_pool[it_idx].nr_segs;
2050 
2051 			it_idx += 2;
2052 
2053 			/*
2054 			 * Pull the DMA-occupied buffers out of the shadow
2055 			 * used ring and stash them until the copies complete.
2056 			 */
2057 			from = vq->shadow_used_idx - num_buffers;
2058 			store_dma_desc_info_packed(vq->shadow_used_packed,
2059 					vq->async_buffers_packed, vq->size, from,
2060 					vq->async_buffer_idx_packed, num_buffers);
2061 
2062 			vq->async_buffer_idx_packed += num_buffers;
2063 			if (vq->async_buffer_idx_packed >= vq->size)
2064 				vq->async_buffer_idx_packed -= vq->size;
2065 			vq->shadow_used_idx -= num_buffers;
2066 		} else {
2067 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
2068 		}
2069 
2070 		pkt_idx++;
2071 		remained--;
2072 		vq_inc_last_avail_packed(vq, num_descs);
2073 
2074 		/*
2075 		 * Conditions that trigger a transfer to the async device:
2076 		 * - the number of buffered packets reaches the transfer threshold
2077 		 * - fewer than BUF_VECTOR_MAX async iovec slots remain free
2078 		 */
2079 		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
2080 			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await < BUF_VECTOR_MAX))) {
2081 			n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
2082 				tdes, 0, pkt_burst_idx);
2083 			iovec_idx = 0;
2084 			it_idx = 0;
2085 			segs_await = 0;
2086 			vq->async_pkts_inflight_n += n_pkts;
2087 
2088 			if (unlikely(n_pkts < pkt_burst_idx)) {
2089 				/*
2090 				 * Record the number of failed packets here;
2091 				 * the actual error handling is done when the
2092 				 * application polls for completions.
2093 				 */
2094 				pkt_err = pkt_burst_idx - n_pkts;
2095 				pkt_burst_idx = 0;
2096 				break;
2097 			}
2098 
2099 			pkt_burst_idx = 0;
2100 		}
2101 	} while (pkt_idx < count);
2102 
2103 	if (pkt_burst_idx) {
2104 		n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
2105 		vq->async_pkts_inflight_n += n_pkts;
2106 
2107 		if (unlikely(n_pkts < pkt_burst_idx))
2108 			pkt_err = pkt_burst_idx - n_pkts;
2109 	}
2110 
2111 	do_data_copy_enqueue(dev, vq);
2112 
2113 	if (unlikely(pkt_err))
2114 		dma_error_handler_packed(vq, async_descs, async_descs_idx, slot_idx, pkt_err,
2115 					&pkt_idx, &num_async_pkts, &num_done_pkts);
2116 	vq->async_pkts_idx += num_async_pkts;
2117 	if (vq->async_pkts_idx >= vq->size)
2118 		vq->async_pkts_idx -= vq->size;
2119 	*comp_count = num_done_pkts;
2120 
2121 	if (likely(vq->shadow_used_idx)) {
2122 		vhost_flush_enqueue_shadow_packed(dev, vq);
2123 		vhost_vring_call_packed(dev, vq);
2124 	}
2125 
2126 	return pkt_idx;
2127 }
2128 
2129 static __rte_always_inline void
2130 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
2131 {
2132 	uint16_t nr_left = n_descs;
2133 	uint16_t nr_copy;
2134 	uint16_t to, from;
2135 
2136 	do {
2137 		from = vq->last_async_desc_idx_split & (vq->size - 1);
2138 		nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
2139 		to = vq->last_used_idx & (vq->size - 1);
2140 
2141 		if (to + nr_copy <= vq->size) {
2142 			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
2143 					nr_copy * sizeof(struct vring_used_elem));
2144 		} else {
2145 			uint16_t size = vq->size - to;
2146 
2147 			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
2148 					size * sizeof(struct vring_used_elem));
2149 			rte_memcpy(&vq->used->ring[0], &vq->async_descs_split[from + size],
2150 					(nr_copy - size) * sizeof(struct vring_used_elem));
2151 		}
2152 
2153 		vq->last_async_desc_idx_split += nr_copy;
2154 		vq->last_used_idx += nr_copy;
2155 		nr_left -= nr_copy;
2156 	} while (nr_left > 0);
2157 }
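/*
 * A short worked example of the wrap handling above, assuming vq->size is
 * 256, last_async_desc_idx_split is 254, last_used_idx is 250 and n_descs
 * is 4 (the numbers are illustrative): the first pass copies
 * async_descs_split[254..255] into used->ring[250..251]; the second pass
 * wraps the source and copies async_descs_split[0..1] into
 * used->ring[252..253].
 */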
2158 
2159 static __rte_always_inline void
2160 write_back_completed_descs_packed(struct vhost_virtqueue *vq,
2161 				uint16_t n_buffers)
2162 {
2163 	uint16_t nr_left = n_buffers;
2164 	uint16_t from, to;
2165 
2166 	do {
2167 		from = vq->last_async_buffer_idx_packed;
2168 		to = (from + nr_left) % vq->size;
2169 		if (to > from) {
2170 			vhost_update_used_packed(vq, vq->async_buffers_packed + from, to - from);
2171 			vq->last_async_buffer_idx_packed += nr_left;
2172 			nr_left = 0;
2173 		} else {
2174 			vhost_update_used_packed(vq, vq->async_buffers_packed + from,
2175 				vq->size - from);
2176 			vq->last_async_buffer_idx_packed = 0;
2177 			nr_left -= vq->size - from;
2178 		}
2179 	} while (nr_left > 0);
2180 }
2181 
2182 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
2183 		struct rte_mbuf **pkts, uint16_t count)
2184 {
2185 	struct virtio_net *dev = get_device(vid);
2186 	struct vhost_virtqueue *vq;
2187 	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
2188 	uint16_t start_idx, pkts_idx, vq_size;
2189 	struct async_inflight_info *pkts_info;
2190 	uint16_t from, i;
2191 
2192 	if (!dev)
2193 		return 0;
2194 
2195 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2196 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2197 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2198 			dev->vid, __func__, queue_id);
2199 		return 0;
2200 	}
2201 
2202 	vq = dev->virtqueue[queue_id];
2203 
2204 	if (unlikely(!vq->async_registered)) {
2205 		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
2206 			dev->vid, __func__, queue_id);
2207 		return 0;
2208 	}
2209 
2210 	rte_spinlock_lock(&vq->access_lock);
2211 
2212 	pkts_idx = vq->async_pkts_idx % vq->size;
2213 	pkts_info = vq->async_pkts_info;
2214 	vq_size = vq->size;
2215 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
2216 		vq_size, vq->async_pkts_inflight_n);
2217 
2218 	if (count > vq->async_last_pkts_n)
2219 		n_pkts_cpl = vq->async_ops.check_completed_copies(vid,
2220 			queue_id, 0, count - vq->async_last_pkts_n);
2221 	n_pkts_cpl += vq->async_last_pkts_n;
2222 
2223 	n_pkts_put = RTE_MIN(count, n_pkts_cpl);
2224 	if (unlikely(n_pkts_put == 0)) {
2225 		vq->async_last_pkts_n = n_pkts_cpl;
2226 		goto done;
2227 	}
2228 
2229 	if (vq_is_packed(dev)) {
2230 		for (i = 0; i < n_pkts_put; i++) {
2231 			from = (start_idx + i) % vq_size;
2232 			n_buffers += pkts_info[from].nr_buffers;
2233 			pkts[i] = pkts_info[from].mbuf;
2234 		}
2235 	} else {
2236 		for (i = 0; i < n_pkts_put; i++) {
2237 			from = (start_idx + i) & (vq_size - 1);
2238 			n_descs += pkts_info[from].descs;
2239 			pkts[i] = pkts_info[from].mbuf;
2240 		}
2241 	}
2242 
2243 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
2244 	vq->async_pkts_inflight_n -= n_pkts_put;
2245 
2246 	if (likely(vq->enabled && vq->access_ok)) {
2247 		if (vq_is_packed(dev)) {
2248 			write_back_completed_descs_packed(vq, n_buffers);
2249 
2250 			vhost_vring_call_packed(dev, vq);
2251 		} else {
2252 			write_back_completed_descs_split(vq, n_descs);
2253 
2254 			__atomic_add_fetch(&vq->used->idx, n_descs,
2255 					__ATOMIC_RELEASE);
2256 			vhost_vring_call_split(dev, vq);
2257 		}
2258 	} else {
2259 		if (vq_is_packed(dev)) {
2260 			vq->last_async_buffer_idx_packed += n_buffers;
2261 			if (vq->last_async_buffer_idx_packed >= vq->size)
2262 				vq->last_async_buffer_idx_packed -= vq->size;
2263 		} else {
2264 			vq->last_async_desc_idx_split += n_descs;
2265 		}
2266 	}
2267 
2268 done:
2269 	rte_spinlock_unlock(&vq->access_lock);
2270 
2271 	return n_pkts_put;
2272 }
2273 
2274 static __rte_always_inline uint32_t
2275 virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
2276 	struct rte_mbuf **pkts, uint32_t count,
2277 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
2278 {
2279 	struct vhost_virtqueue *vq;
2280 	uint32_t nb_tx = 0;
2281 
2282 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2283 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2284 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2285 			dev->vid, __func__, queue_id);
2286 		return 0;
2287 	}
2288 
2289 	vq = dev->virtqueue[queue_id];
2290 
2291 	rte_spinlock_lock(&vq->access_lock);
2292 
2293 	if (unlikely(!vq->enabled || !vq->async_registered))
2294 		goto out_access_unlock;
2295 
2296 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2297 		vhost_user_iotlb_rd_lock(vq);
2298 
2299 	if (unlikely(!vq->access_ok))
2300 		if (unlikely(vring_translate(dev, vq) < 0))
2301 			goto out;
2302 
2303 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
2304 	if (count == 0)
2305 		goto out;
2306 
2307 	if (vq_is_packed(dev))
2308 		nb_tx = virtio_dev_rx_async_submit_packed(dev,
2309 				vq, queue_id, pkts, count, comp_pkts,
2310 				comp_count);
2311 	else
2312 		nb_tx = virtio_dev_rx_async_submit_split(dev,
2313 				vq, queue_id, pkts, count, comp_pkts,
2314 				comp_count);
2315 
2316 out:
2317 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2318 		vhost_user_iotlb_rd_unlock(vq);
2319 
2320 out_access_unlock:
2321 	rte_spinlock_unlock(&vq->access_lock);
2322 
2323 	return nb_tx;
2324 }
2325 
2326 uint16_t
2327 rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
2328 		struct rte_mbuf **pkts, uint16_t count,
2329 		struct rte_mbuf **comp_pkts, uint32_t *comp_count)
2330 {
2331 	struct virtio_net *dev = get_device(vid);
2332 
2333 	*comp_count = 0;
2334 	if (!dev)
2335 		return 0;
2336 
2337 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
2338 		VHOST_LOG_DATA(ERR,
2339 			"(%d) %s: built-in vhost net backend is disabled.\n",
2340 			dev->vid, __func__);
2341 		return 0;
2342 	}
2343 
2344 	return virtio_dev_rx_async_submit(dev, queue_id, pkts, count, comp_pkts,
2345 			comp_count);
2346 }
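/*
 * A minimal usage sketch for the async enqueue API above, assuming an
 * async channel was already registered on the virtqueue with
 * rte_vhost_async_channel_register() and that "vid", "queue_id" and the
 * "n_pkts" mbufs in "pkts" come from the caller (those names are
 * illustrative):
 *
 *	struct rte_mbuf *comp[32];
 *	uint32_t n_comp = 0;
 *	uint16_t n_enq, n_done;
 *
 *	n_enq = rte_vhost_submit_enqueue_burst(vid, queue_id, pkts, n_pkts,
 *						comp, &n_comp);
 *	// Packets copied synchronously by the CPU are returned in "comp"
 *	// and may be freed right away; the other accepted packets stay
 *	// under DMA ownership until they are returned by
 *	// rte_vhost_poll_enqueue_completed().
 *	rte_pktmbuf_free_bulk(comp, n_comp);
 *
 *	n_done = rte_vhost_poll_enqueue_completed(vid, queue_id, comp, 32);
 *	rte_pktmbuf_free_bulk(comp, n_done);
 */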
2347 
2348 static inline bool
2349 virtio_net_with_host_offload(struct virtio_net *dev)
2350 {
2351 	if (dev->features &
2352 			((1ULL << VIRTIO_NET_F_CSUM) |
2353 			 (1ULL << VIRTIO_NET_F_HOST_ECN) |
2354 			 (1ULL << VIRTIO_NET_F_HOST_TSO4) |
2355 			 (1ULL << VIRTIO_NET_F_HOST_TSO6) |
2356 			 (1ULL << VIRTIO_NET_F_HOST_UFO)))
2357 		return true;
2358 
2359 	return false;
2360 }
2361 
2362 static int
2363 parse_headers(struct rte_mbuf *m, uint8_t *l4_proto)
2364 {
2365 	struct rte_ipv4_hdr *ipv4_hdr;
2366 	struct rte_ipv6_hdr *ipv6_hdr;
2367 	struct rte_ether_hdr *eth_hdr;
2368 	uint16_t ethertype;
2369 	uint16_t data_len = rte_pktmbuf_data_len(m);
2370 
2371 	if (data_len < sizeof(struct rte_ether_hdr))
2372 		return -EINVAL;
2373 
2374 	eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
2375 
2376 	m->l2_len = sizeof(struct rte_ether_hdr);
2377 	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
2378 
2379 	if (ethertype == RTE_ETHER_TYPE_VLAN) {
2380 		if (data_len < sizeof(struct rte_ether_hdr) +
2381 				sizeof(struct rte_vlan_hdr))
2382 			goto error;
2383 
2384 		struct rte_vlan_hdr *vlan_hdr =
2385 			(struct rte_vlan_hdr *)(eth_hdr + 1);
2386 
2387 		m->l2_len += sizeof(struct rte_vlan_hdr);
2388 		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
2389 	}
2390 
2391 	switch (ethertype) {
2392 	case RTE_ETHER_TYPE_IPV4:
2393 		if (data_len < m->l2_len + sizeof(struct rte_ipv4_hdr))
2394 			goto error;
2395 		ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *,
2396 				m->l2_len);
2397 		m->l3_len = rte_ipv4_hdr_len(ipv4_hdr);
2398 		if (data_len < m->l2_len + m->l3_len)
2399 			goto error;
2400 		m->ol_flags |= PKT_TX_IPV4;
2401 		*l4_proto = ipv4_hdr->next_proto_id;
2402 		break;
2403 	case RTE_ETHER_TYPE_IPV6:
2404 		if (data_len < m->l2_len + sizeof(struct rte_ipv6_hdr))
2405 			goto error;
2406 		ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *,
2407 				m->l2_len);
2408 		m->l3_len = sizeof(struct rte_ipv6_hdr);
2409 		m->ol_flags |= PKT_TX_IPV6;
2410 		*l4_proto = ipv6_hdr->proto;
2411 		break;
2412 	default:
2413 		/* a valid L3 header is needed for further L4 parsing */
2414 		goto error;
2415 	}
2416 
2417 	/* both CSUM and GSO need a valid L4 header */
2418 	switch (*l4_proto) {
2419 	case IPPROTO_TCP:
2420 		if (data_len < m->l2_len + m->l3_len +
2421 				sizeof(struct rte_tcp_hdr))
2422 			goto error;
2423 		break;
2424 	case IPPROTO_UDP:
2425 		if (data_len < m->l2_len + m->l3_len +
2426 				sizeof(struct rte_udp_hdr))
2427 			goto error;
2428 		break;
2429 	case IPPROTO_SCTP:
2430 		if (data_len < m->l2_len + m->l3_len +
2431 				sizeof(struct rte_sctp_hdr))
2432 			goto error;
2433 		break;
2434 	default:
2435 		goto error;
2436 	}
2437 
2438 	return 0;
2439 
2440 error:
2441 	m->l2_len = 0;
2442 	m->l3_len = 0;
2443 	m->ol_flags = 0;
2444 	return -EINVAL;
2445 }
2446 
2447 static __rte_always_inline void
2448 vhost_dequeue_offload_legacy(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
2449 {
2450 	uint8_t l4_proto = 0;
2451 	struct rte_tcp_hdr *tcp_hdr = NULL;
2452 	uint16_t tcp_len;
2453 	uint16_t data_len = rte_pktmbuf_data_len(m);
2454 
2455 	if (parse_headers(m, &l4_proto) < 0)
2456 		return;
2457 
2458 	if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2459 		if (hdr->csum_start == (m->l2_len + m->l3_len)) {
2460 			switch (hdr->csum_offset) {
2461 			case (offsetof(struct rte_tcp_hdr, cksum)):
2462 				if (l4_proto != IPPROTO_TCP)
2463 					goto error;
2464 				m->ol_flags |= PKT_TX_TCP_CKSUM;
2465 				break;
2466 			case (offsetof(struct rte_udp_hdr, dgram_cksum)):
2467 				if (l4_proto != IPPROTO_UDP)
2468 					goto error;
2469 				m->ol_flags |= PKT_TX_UDP_CKSUM;
2470 				break;
2471 			case (offsetof(struct rte_sctp_hdr, cksum)):
2472 				if (l4_proto != IPPROTO_SCTP)
2473 					goto error;
2474 				m->ol_flags |= PKT_TX_SCTP_CKSUM;
2475 				break;
2476 			default:
2477 				goto error;
2478 			}
2479 		} else {
2480 			goto error;
2481 		}
2482 	}
2483 
2484 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2485 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2486 		case VIRTIO_NET_HDR_GSO_TCPV4:
2487 		case VIRTIO_NET_HDR_GSO_TCPV6:
2488 			if (l4_proto != IPPROTO_TCP)
2489 				goto error;
2490 			tcp_hdr = rte_pktmbuf_mtod_offset(m,
2491 					struct rte_tcp_hdr *,
2492 					m->l2_len + m->l3_len);
2493 			tcp_len = (tcp_hdr->data_off & 0xf0) >> 2;
2494 			if (data_len < m->l2_len + m->l3_len + tcp_len)
2495 				goto error;
2496 			m->ol_flags |= PKT_TX_TCP_SEG;
2497 			m->tso_segsz = hdr->gso_size;
2498 			m->l4_len = tcp_len;
2499 			break;
2500 		case VIRTIO_NET_HDR_GSO_UDP:
2501 			if (l4_proto != IPPROTO_UDP)
2502 				goto error;
2503 			m->ol_flags |= PKT_TX_UDP_SEG;
2504 			m->tso_segsz = hdr->gso_size;
2505 			m->l4_len = sizeof(struct rte_udp_hdr);
2506 			break;
2507 		default:
2508 			VHOST_LOG_DATA(WARNING,
2509 				"unsupported gso type %u.\n", hdr->gso_type);
2510 			goto error;
2511 		}
2512 	}
2513 	return;
2514 
2515 error:
2516 	m->l2_len = 0;
2517 	m->l3_len = 0;
2518 	m->ol_flags = 0;
2519 }
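/*
 * A worked example of the guest-provided header fields handled above: for
 * a TCP packet over IPv4 with no VLAN tag and no IP options, the guest
 * sets csum_start = 14 + 20 = 34 (Ethernet plus IPv4 header) and
 * csum_offset = offsetof(struct rte_tcp_hdr, cksum) = 16, i.e. the
 * checksum field sits at byte 50 of the frame. The legacy path above only
 * translates those fields into PKT_TX_TCP_CKSUM and the l2/l3 lengths; the
 * checksum itself is expected to be filled in later, e.g. by a NIC TX
 * offload.
 */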
2520 
2521 static __rte_always_inline void
2522 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m,
2523 	bool legacy_ol_flags)
2524 {
2525 	struct rte_net_hdr_lens hdr_lens;
2526 	int l4_supported = 0;
2527 	uint32_t ptype;
2528 
2529 	if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
2530 		return;
2531 
2532 	if (legacy_ol_flags) {
2533 		vhost_dequeue_offload_legacy(hdr, m);
2534 		return;
2535 	}
2536 
2537 	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
2538 
2539 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
2540 	m->packet_type = ptype;
2541 	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
2542 	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
2543 	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
2544 		l4_supported = 1;
2545 
2546 	/* According to the Virtio 1.1 spec, the device only needs to look at
2547 	 * VIRTIO_NET_HDR_F_NEEDS_CSUM in the packet transmission path.
2548 	 * This differs from the incoming packet processing path, where the
2549 	 * driver can rely on the VIRTIO_NET_HDR_F_DATA_VALID flag set by the
2550 	 * device.
2551 	 *
2552 	 * 5.1.6.2.1 Driver Requirements: Packet Transmission
2553 	 * The driver MUST NOT set the VIRTIO_NET_HDR_F_DATA_VALID and
2554 	 * VIRTIO_NET_HDR_F_RSC_INFO bits in flags.
2555 	 *
2556 	 * 5.1.6.2.2 Device Requirements: Packet Transmission
2557 	 * The device MUST ignore flag bits that it does not recognize.
2558 	 */
2559 	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2560 		uint32_t hdrlen;
2561 
2562 		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
2563 		if (hdr->csum_start <= hdrlen && l4_supported != 0) {
2564 			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
2565 		} else {
2566 			/* Unknown proto or tunnel, do sw cksum. We can assume
2567 			 * the cksum field is in the first segment since the
2568 			 * buffers we provided to the host are large enough.
2569 			 * In case of SCTP, this will be wrong since it's a CRC
2570 			 * but there's nothing we can do.
2571 			 */
2572 			uint16_t csum = 0, off;
2573 
2574 			if (rte_raw_cksum_mbuf(m, hdr->csum_start,
2575 					rte_pktmbuf_pkt_len(m) - hdr->csum_start, &csum) < 0)
2576 				return;
2577 			if (likely(csum != 0xffff))
2578 				csum = ~csum;
2579 			off = hdr->csum_offset + hdr->csum_start;
2580 			if (rte_pktmbuf_data_len(m) >= off + 1)
2581 				*rte_pktmbuf_mtod_offset(m, uint16_t *, off) = csum;
2582 		}
2583 	}
2584 
2585 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2586 		if (hdr->gso_size == 0)
2587 			return;
2588 
2589 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2590 		case VIRTIO_NET_HDR_GSO_TCPV4:
2591 		case VIRTIO_NET_HDR_GSO_TCPV6:
2592 			if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_TCP)
2593 				break;
2594 			m->ol_flags |= PKT_RX_LRO | PKT_RX_L4_CKSUM_NONE;
2595 			m->tso_segsz = hdr->gso_size;
2596 			break;
2597 		case VIRTIO_NET_HDR_GSO_UDP:
2598 			if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_UDP)
2599 				break;
2600 			m->ol_flags |= PKT_RX_LRO | PKT_RX_L4_CKSUM_NONE;
2601 			m->tso_segsz = hdr->gso_size;
2602 			break;
2603 		default:
2604 			break;
2605 		}
2606 	}
2607 }
2608 
2609 static __rte_noinline void
2610 copy_vnet_hdr_from_desc(struct virtio_net_hdr *hdr,
2611 		struct buf_vector *buf_vec)
2612 {
2613 	uint64_t len;
2614 	uint64_t remain = sizeof(struct virtio_net_hdr);
2615 	uint64_t src;
2616 	uint64_t dst = (uint64_t)(uintptr_t)hdr;
2617 
2618 	while (remain) {
2619 		len = RTE_MIN(remain, buf_vec->buf_len);
2620 		src = buf_vec->buf_addr;
2621 		rte_memcpy((void *)(uintptr_t)dst,
2622 				(void *)(uintptr_t)src, len);
2623 
2624 		remain -= len;
2625 		dst += len;
2626 		buf_vec++;
2627 	}
2628 }
2629 
2630 static __rte_always_inline int
2631 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
2632 		  struct buf_vector *buf_vec, uint16_t nr_vec,
2633 		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
2634 		  bool legacy_ol_flags)
2635 {
2636 	uint32_t buf_avail, buf_offset;
2637 	uint64_t buf_addr, buf_len;
2638 	uint32_t mbuf_avail, mbuf_offset;
2639 	uint32_t cpy_len;
2640 	struct rte_mbuf *cur = m, *prev = m;
2641 	struct virtio_net_hdr tmp_hdr;
2642 	struct virtio_net_hdr *hdr = NULL;
2643 	/* A counter to avoid a dead loop on the desc chain */
2644 	uint16_t vec_idx = 0;
2645 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
2646 	int error = 0;
2647 
2648 	buf_addr = buf_vec[vec_idx].buf_addr;
2649 	buf_len = buf_vec[vec_idx].buf_len;
2650 
2651 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
2652 		error = -1;
2653 		goto out;
2654 	}
2655 
2656 	if (virtio_net_with_host_offload(dev)) {
2657 		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
2658 			/*
2659 			 * No luck, the virtio-net header doesn't fit
2660 			 * in a contiguous virtual area.
2661 			 */
2662 			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
2663 			hdr = &tmp_hdr;
2664 		} else {
2665 			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
2666 		}
2667 	}
2668 
2669 	/*
2670 	 * A virtio driver normally uses at least 2 desc buffers
2671 	 * for Tx: the first for storing the header, and the others
2672 	 * for storing the data.
2673 	 */
2674 	if (unlikely(buf_len < dev->vhost_hlen)) {
2675 		buf_offset = dev->vhost_hlen - buf_len;
2676 		vec_idx++;
2677 		buf_addr = buf_vec[vec_idx].buf_addr;
2678 		buf_len = buf_vec[vec_idx].buf_len;
2679 		buf_avail  = buf_len - buf_offset;
2680 	} else if (buf_len == dev->vhost_hlen) {
2681 		if (unlikely(++vec_idx >= nr_vec))
2682 			goto out;
2683 		buf_addr = buf_vec[vec_idx].buf_addr;
2684 		buf_len = buf_vec[vec_idx].buf_len;
2685 
2686 		buf_offset = 0;
2687 		buf_avail = buf_len;
2688 	} else {
2689 		buf_offset = dev->vhost_hlen;
2690 		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
2691 	}
2692 
2693 	PRINT_PACKET(dev,
2694 			(uintptr_t)(buf_addr + buf_offset),
2695 			(uint32_t)buf_avail, 0);
2696 
2697 	mbuf_offset = 0;
2698 	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
2699 	while (1) {
2700 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
2701 
2702 		if (likely(cpy_len > MAX_BATCH_LEN ||
2703 					vq->batch_copy_nb_elems >= vq->size ||
2704 					(hdr && cur == m))) {
2705 			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
2706 						mbuf_offset),
2707 					(void *)((uintptr_t)(buf_addr +
2708 							buf_offset)), cpy_len);
2709 		} else {
2710 			batch_copy[vq->batch_copy_nb_elems].dst =
2711 				rte_pktmbuf_mtod_offset(cur, void *,
2712 						mbuf_offset);
2713 			batch_copy[vq->batch_copy_nb_elems].src =
2714 				(void *)((uintptr_t)(buf_addr + buf_offset));
2715 			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
2716 			vq->batch_copy_nb_elems++;
2717 		}
2718 
2719 		mbuf_avail  -= cpy_len;
2720 		mbuf_offset += cpy_len;
2721 		buf_avail -= cpy_len;
2722 		buf_offset += cpy_len;
2723 
2724 		/* This buf has reached its end, get the next one */
2725 		if (buf_avail == 0) {
2726 			if (++vec_idx >= nr_vec)
2727 				break;
2728 
2729 			buf_addr = buf_vec[vec_idx].buf_addr;
2730 			buf_len = buf_vec[vec_idx].buf_len;
2731 
2732 			buf_offset = 0;
2733 			buf_avail  = buf_len;
2734 
2735 			PRINT_PACKET(dev, (uintptr_t)buf_addr,
2736 					(uint32_t)buf_avail, 0);
2737 		}
2738 
2739 		/*
2740 		 * This mbuf has reached its end, get a new one
2741 		 * to hold more data.
2742 		 */
2743 		if (mbuf_avail == 0) {
2744 			cur = rte_pktmbuf_alloc(mbuf_pool);
2745 			if (unlikely(cur == NULL)) {
2746 				VHOST_LOG_DATA(ERR, "Failed to "
2747 					"allocate memory for mbuf.\n");
2748 				error = -1;
2749 				goto out;
2750 			}
2751 
2752 			prev->next = cur;
2753 			prev->data_len = mbuf_offset;
2754 			m->nb_segs += 1;
2755 			m->pkt_len += mbuf_offset;
2756 			prev = cur;
2757 
2758 			mbuf_offset = 0;
2759 			mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
2760 		}
2761 	}
2762 
2763 	prev->data_len = mbuf_offset;
2764 	m->pkt_len    += mbuf_offset;
2765 
2766 	if (hdr)
2767 		vhost_dequeue_offload(hdr, m, legacy_ol_flags);
2768 
2769 out:
2770 
2771 	return error;
2772 }
2773 
2774 static void
2775 virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
2776 {
2777 	rte_free(opaque);
2778 }
2779 
2780 static int
2781 virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
2782 {
2783 	struct rte_mbuf_ext_shared_info *shinfo = NULL;
2784 	uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
2785 	uint16_t buf_len;
2786 	rte_iova_t iova;
2787 	void *buf;
2788 
2789 	total_len += sizeof(*shinfo) + sizeof(uintptr_t);
2790 	total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
2791 
2792 	if (unlikely(total_len > UINT16_MAX))
2793 		return -ENOSPC;
2794 
2795 	buf_len = total_len;
2796 	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
2797 	if (unlikely(buf == NULL))
2798 		return -ENOMEM;
2799 
2800 	/* Initialize shinfo */
2801 	shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
2802 						virtio_dev_extbuf_free, buf);
2803 	if (unlikely(shinfo == NULL)) {
2804 		rte_free(buf);
2805 		VHOST_LOG_DATA(ERR, "Failed to init shinfo\n");
2806 		return -1;
2807 	}
2808 
2809 	iova = rte_malloc_virt2iova(buf);
2810 	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
2811 	rte_pktmbuf_reset_headroom(pkt);
2812 
2813 	return 0;
2814 }
2815 
2816 /*
2817  * Prepare a pktmbuf that can hold data_len bytes: tailroom, external buffer, or chained mbufs.
2818  */
2819 static __rte_always_inline int
2820 virtio_dev_pktmbuf_prep(struct virtio_net *dev, struct rte_mbuf *pkt,
2821 			 uint32_t data_len)
2822 {
2823 	if (rte_pktmbuf_tailroom(pkt) >= data_len)
2824 		return 0;
2825 
2826 	/* attach an external buffer if supported */
2827 	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
2828 		return 0;
2829 
2830 	/* check if chained buffers are allowed */
2831 	if (!dev->linearbuf)
2832 		return 0;
2833 
2834 	return -1;
2835 }
2836 
2837 __rte_always_inline
2838 static uint16_t
2839 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
2840 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
2841 	bool legacy_ol_flags)
2842 {
2843 	uint16_t i;
2844 	uint16_t free_entries;
2845 	uint16_t dropped = 0;
2846 	static bool allocerr_warned;
2847 
2848 	/*
2849 	 * The ordering between avail index and
2850 	 * desc reads needs to be enforced.
2851 	 */
2852 	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
2853 			vq->last_avail_idx;
2854 	if (free_entries == 0)
2855 		return 0;
2856 
2857 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
2858 
2859 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2860 
2861 	count = RTE_MIN(count, MAX_PKT_BURST);
2862 	count = RTE_MIN(count, free_entries);
2863 	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
2864 			dev->vid, count);
2865 
2866 	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
2867 		return 0;
2868 
2869 	for (i = 0; i < count; i++) {
2870 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
2871 		uint16_t head_idx;
2872 		uint32_t buf_len;
2873 		uint16_t nr_vec = 0;
2874 		int err;
2875 
2876 		if (unlikely(fill_vec_buf_split(dev, vq,
2877 						vq->last_avail_idx + i,
2878 						&nr_vec, buf_vec,
2879 						&head_idx, &buf_len,
2880 						VHOST_ACCESS_RO) < 0))
2881 			break;
2882 
2883 		update_shadow_used_ring_split(vq, head_idx, 0);
2884 
2885 		err = virtio_dev_pktmbuf_prep(dev, pkts[i], buf_len);
2886 		if (unlikely(err)) {
2887 			/*
2888 			 * Preparing the mbuf fails for jumbo packets when an
2889 			 * external buffer is not allowed and a linear buffer
2890 			 * is required. Drop this packet.
2891 			 */
2892 			if (!allocerr_warned) {
2893 				VHOST_LOG_DATA(ERR,
2894 					"Failed mbuf alloc of size %d from %s on %s.\n",
2895 					buf_len, mbuf_pool->name, dev->ifname);
2896 				allocerr_warned = true;
2897 			}
2898 			dropped += 1;
2899 			i++;
2900 			break;
2901 		}
2902 
2903 		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
2904 				mbuf_pool, legacy_ol_flags);
2905 		if (unlikely(err)) {
2906 			if (!allocerr_warned) {
2907 				VHOST_LOG_DATA(ERR,
2908 					"Failed to copy desc to mbuf on %s.\n",
2909 					dev->ifname);
2910 				allocerr_warned = true;
2911 			}
2912 			dropped += 1;
2913 			i++;
2914 			break;
2915 		}
2916 	}
2917 
2918 	if (dropped)
2919 		rte_pktmbuf_free_bulk(&pkts[i - 1], count - i + 1);
2920 
2921 	vq->last_avail_idx += i;
2922 
2923 	do_data_copy_dequeue(vq);
2924 	if (unlikely(i < count))
2925 		vq->shadow_used_idx = i;
2926 	if (likely(vq->shadow_used_idx)) {
2927 		flush_shadow_used_ring_split(dev, vq);
2928 		vhost_vring_call_split(dev, vq);
2929 	}
2930 
2931 	return (i - dropped);
2932 }
2933 
2934 __rte_noinline
2935 static uint16_t
2936 virtio_dev_tx_split_legacy(struct virtio_net *dev,
2937 	struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
2938 	struct rte_mbuf **pkts, uint16_t count)
2939 {
2940 	return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, true);
2941 }
2942 
2943 __rte_noinline
2944 static uint16_t
2945 virtio_dev_tx_split_compliant(struct virtio_net *dev,
2946 	struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
2947 	struct rte_mbuf **pkts, uint16_t count)
2948 {
2949 	return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, false);
2950 }
2951 
2952 static __rte_always_inline int
2953 vhost_reserve_avail_batch_packed(struct virtio_net *dev,
2954 				 struct vhost_virtqueue *vq,
2955 				 struct rte_mbuf **pkts,
2956 				 uint16_t avail_idx,
2957 				 uintptr_t *desc_addrs,
2958 				 uint16_t *ids)
2959 {
2960 	bool wrap = vq->avail_wrap_counter;
2961 	struct vring_packed_desc *descs = vq->desc_packed;
2962 	uint64_t lens[PACKED_BATCH_SIZE];
2963 	uint64_t buf_lens[PACKED_BATCH_SIZE];
2964 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2965 	uint16_t flags, i;
2966 
2967 	if (unlikely(avail_idx & PACKED_BATCH_MASK))
2968 		return -1;
2969 	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
2970 		return -1;
2971 
2972 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2973 		flags = descs[avail_idx + i].flags;
2974 		if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
2975 			     (wrap == !!(flags & VRING_DESC_F_USED))  ||
2976 			     (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
2977 			return -1;
2978 	}
2979 
2980 	rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
2981 
2982 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2983 		lens[i] = descs[avail_idx + i].len;
2984 
2985 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2986 		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
2987 						  descs[avail_idx + i].addr,
2988 						  &lens[i], VHOST_ACCESS_RW);
2989 	}
2990 
2991 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2992 		if (unlikely(!desc_addrs[i]))
2993 			return -1;
2994 		if (unlikely((lens[i] != descs[avail_idx + i].len)))
2995 			return -1;
2996 	}
2997 
2998 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2999 		if (virtio_dev_pktmbuf_prep(dev, pkts[i], lens[i]))
3000 			goto err;
3001 	}
3002 
3003 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
3004 		buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
3005 
3006 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3007 		if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
3008 			goto err;
3009 	}
3010 
3011 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3012 		pkts[i]->pkt_len = lens[i] - buf_offset;
3013 		pkts[i]->data_len = pkts[i]->pkt_len;
3014 		ids[i] = descs[avail_idx + i].id;
3015 	}
3016 
3017 	return 0;
3018 
3019 err:
3020 	return -1;
3021 }
3022 
3023 static __rte_always_inline int
3024 virtio_dev_tx_batch_packed(struct virtio_net *dev,
3025 			   struct vhost_virtqueue *vq,
3026 			   struct rte_mbuf **pkts,
3027 			   bool legacy_ol_flags)
3028 {
3029 	uint16_t avail_idx = vq->last_avail_idx;
3030 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
3031 	struct virtio_net_hdr *hdr;
3032 	uintptr_t desc_addrs[PACKED_BATCH_SIZE];
3033 	uint16_t ids[PACKED_BATCH_SIZE];
3034 	uint16_t i;
3035 
3036 	if (vhost_reserve_avail_batch_packed(dev, vq, pkts, avail_idx,
3037 					     desc_addrs, ids))
3038 		return -1;
3039 
3040 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
3041 		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
3042 
3043 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
3044 		rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
3045 			   (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
3046 			   pkts[i]->pkt_len);
3047 
3048 	if (virtio_net_with_host_offload(dev)) {
3049 		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3050 			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
3051 			vhost_dequeue_offload(hdr, pkts[i], legacy_ol_flags);
3052 		}
3053 	}
3054 
3055 	if (virtio_net_is_inorder(dev))
3056 		vhost_shadow_dequeue_batch_packed_inorder(vq,
3057 			ids[PACKED_BATCH_SIZE - 1]);
3058 	else
3059 		vhost_shadow_dequeue_batch_packed(dev, vq, ids);
3060 
3061 	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
3062 
3063 	return 0;
3064 }
3065 
3066 static __rte_always_inline int
3067 vhost_dequeue_single_packed(struct virtio_net *dev,
3068 			    struct vhost_virtqueue *vq,
3069 			    struct rte_mempool *mbuf_pool,
3070 			    struct rte_mbuf *pkts,
3071 			    uint16_t *buf_id,
3072 			    uint16_t *desc_count,
3073 			    bool legacy_ol_flags)
3074 {
3075 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
3076 	uint32_t buf_len;
3077 	uint16_t nr_vec = 0;
3078 	int err;
3079 	static bool allocerr_warned;
3080 
3081 	if (unlikely(fill_vec_buf_packed(dev, vq,
3082 					 vq->last_avail_idx, desc_count,
3083 					 buf_vec, &nr_vec,
3084 					 buf_id, &buf_len,
3085 					 VHOST_ACCESS_RO) < 0))
3086 		return -1;
3087 
3088 	if (unlikely(virtio_dev_pktmbuf_prep(dev, pkts, buf_len))) {
3089 		if (!allocerr_warned) {
3090 			VHOST_LOG_DATA(ERR,
3091 				"Failed mbuf alloc of size %d from %s on %s.\n",
3092 				buf_len, mbuf_pool->name, dev->ifname);
3093 			allocerr_warned = true;
3094 		}
3095 		return -1;
3096 	}
3097 
3098 	err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts,
3099 				mbuf_pool, legacy_ol_flags);
3100 	if (unlikely(err)) {
3101 		if (!allocerr_warned) {
3102 			VHOST_LOG_DATA(ERR,
3103 				"Failed to copy desc to mbuf on %s.\n",
3104 				dev->ifname);
3105 			allocerr_warned = true;
3106 		}
3107 		return -1;
3108 	}
3109 
3110 	return 0;
3111 }
3112 
3113 static __rte_always_inline int
3114 virtio_dev_tx_single_packed(struct virtio_net *dev,
3115 			    struct vhost_virtqueue *vq,
3116 			    struct rte_mempool *mbuf_pool,
3117 			    struct rte_mbuf *pkts,
3118 			    bool legacy_ol_flags)
3119 {
3120 
3121 	uint16_t buf_id, desc_count = 0;
3122 	int ret;
3123 
3124 	ret = vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
3125 					&desc_count, legacy_ol_flags);
3126 
3127 	if (likely(desc_count > 0)) {
3128 		if (virtio_net_is_inorder(dev))
3129 			vhost_shadow_dequeue_single_packed_inorder(vq, buf_id,
3130 								   desc_count);
3131 		else
3132 			vhost_shadow_dequeue_single_packed(vq, buf_id,
3133 					desc_count);
3134 
3135 		vq_inc_last_avail_packed(vq, desc_count);
3136 	}
3137 
3138 	return ret;
3139 }
3140 
3141 __rte_always_inline
3142 static uint16_t
3143 virtio_dev_tx_packed(struct virtio_net *dev,
3144 		     struct vhost_virtqueue *__rte_restrict vq,
3145 		     struct rte_mempool *mbuf_pool,
3146 		     struct rte_mbuf **__rte_restrict pkts,
3147 		     uint32_t count,
3148 		     bool legacy_ol_flags)
3149 {
3150 	uint32_t pkt_idx = 0;
3151 
3152 	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
3153 		return 0;
3154 
3155 	do {
3156 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
3157 
3158 		if (count - pkt_idx >= PACKED_BATCH_SIZE) {
3159 			if (!virtio_dev_tx_batch_packed(dev, vq,
3160 							&pkts[pkt_idx],
3161 							legacy_ol_flags)) {
3162 				pkt_idx += PACKED_BATCH_SIZE;
3163 				continue;
3164 			}
3165 		}
3166 
3167 		if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool,
3168 						pkts[pkt_idx],
3169 						legacy_ol_flags))
3170 			break;
3171 		pkt_idx++;
3172 	} while (pkt_idx < count);
3173 
3174 	if (pkt_idx != count)
3175 		rte_pktmbuf_free_bulk(&pkts[pkt_idx], count - pkt_idx);
3176 
3177 	if (vq->shadow_used_idx) {
3178 		do_data_copy_dequeue(vq);
3179 
3180 		vhost_flush_dequeue_shadow_packed(dev, vq);
3181 		vhost_vring_call_packed(dev, vq);
3182 	}
3183 
3184 	return pkt_idx;
3185 }
3186 
3187 __rte_noinline
3188 static uint16_t
3189 virtio_dev_tx_packed_legacy(struct virtio_net *dev,
3190 	struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
3191 	struct rte_mbuf **__rte_restrict pkts, uint32_t count)
3192 {
3193 	return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, true);
3194 }
3195 
3196 __rte_noinline
3197 static uint16_t
3198 virtio_dev_tx_packed_compliant(struct virtio_net *dev,
3199 	struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
3200 	struct rte_mbuf **__rte_restrict pkts, uint32_t count)
3201 {
3202 	return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, false);
3203 }
3204 
3205 uint16_t
3206 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
3207 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
3208 {
3209 	struct virtio_net *dev;
3210 	struct rte_mbuf *rarp_mbuf = NULL;
3211 	struct vhost_virtqueue *vq;
3212 	int16_t success = 1;
3213 
3214 	dev = get_device(vid);
3215 	if (!dev)
3216 		return 0;
3217 
3218 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
3219 		VHOST_LOG_DATA(ERR,
3220 			"(%d) %s: built-in vhost net backend is disabled.\n",
3221 			dev->vid, __func__);
3222 		return 0;
3223 	}
3224 
3225 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
3226 		VHOST_LOG_DATA(ERR,
3227 			"(%d) %s: invalid virtqueue idx %d.\n",
3228 			dev->vid, __func__, queue_id);
3229 		return 0;
3230 	}
3231 
3232 	vq = dev->virtqueue[queue_id];
3233 
3234 	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
3235 		return 0;
3236 
3237 	if (unlikely(!vq->enabled)) {
3238 		count = 0;
3239 		goto out_access_unlock;
3240 	}
3241 
3242 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3243 		vhost_user_iotlb_rd_lock(vq);
3244 
3245 	if (unlikely(!vq->access_ok))
3246 		if (unlikely(vring_translate(dev, vq) < 0)) {
3247 			count = 0;
3248 			goto out;
3249 		}
3250 
3251 	/*
3252 	 * Construct a RARP broadcast packet and inject it into the "pkts"
3253 	 * array, so that it looks like the guest actually sent such a packet.
3254 	 *
3255 	 * Check user_send_rarp() for more information.
3256 	 *
3257 	 * broadcast_rarp shares a cacheline in the virtio_net structure
3258 	 * with some fields that are accessed during enqueue, and
3259 	 * __atomic_compare_exchange_n causes a write when it performs the
3260 	 * compare and exchange. This could result in false sharing between
3261 	 * enqueue and dequeue.
3262 	 *
3263 	 * Prevent unnecessary false sharing by reading broadcast_rarp first
3264 	 * and only performing the compare and exchange if the read indicates
3265 	 * it is likely to be set.
3266 	 */
3267 	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
3268 			__atomic_compare_exchange_n(&dev->broadcast_rarp,
3269 			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
3270 
3271 		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
3272 		if (rarp_mbuf == NULL) {
3273 			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
3274 			count = 0;
3275 			goto out;
3276 		}
3277 		count -= 1;
3278 	}
3279 
3280 	if (vq_is_packed(dev)) {
3281 		if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3282 			count = virtio_dev_tx_packed_legacy(dev, vq, mbuf_pool, pkts, count);
3283 		else
3284 			count = virtio_dev_tx_packed_compliant(dev, vq, mbuf_pool, pkts, count);
3285 	} else {
3286 		if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3287 			count = virtio_dev_tx_split_legacy(dev, vq, mbuf_pool, pkts, count);
3288 		else
3289 			count = virtio_dev_tx_split_compliant(dev, vq, mbuf_pool, pkts, count);
3290 	}
3291 
3292 out:
3293 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3294 		vhost_user_iotlb_rd_unlock(vq);
3295 
3296 out_access_unlock:
3297 	rte_spinlock_unlock(&vq->access_lock);
3298 
3299 	if (unlikely(rarp_mbuf != NULL)) {
3300 		/*
3301 		 * Inject it at the head of the "pkts" array, so that the
3302 		 * switch's MAC learning table gets updated first.
3303 		 */
3304 		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
3305 		pkts[0] = rarp_mbuf;
3306 		count += 1;
3307 	}
3308 
3309 	return count;
3310 }
3311
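/*
 * A minimal usage sketch for the dequeue API above, assuming "mbuf_pool"
 * is an initialized mempool and queue_id refers to a guest TX queue (odd
 * index); names other than the rte_vhost_*() and rte_pktmbuf_*() calls are
 * illustrative:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t i, n;
 *
 *	n = rte_vhost_dequeue_burst(vid, queue_id, mbuf_pool, pkts, 32);
 *	for (i = 0; i < n; i++) {
 *		// a real application would forward pkts[i]; here it is dropped
 *		rte_pktmbuf_free(pkts[i]);
 *	}
 */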