xref: /dpdk/lib/vhost/virtio_net.c (revision a7db3afce75346832059d8bfe54a8f81945fb213)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4 
5 #include <stdint.h>
6 #include <stdbool.h>
7 #include <linux/virtio_net.h>
8 
9 #include <rte_mbuf.h>
10 #include <rte_memcpy.h>
11 #include <rte_net.h>
12 #include <rte_ether.h>
13 #include <rte_ip.h>
14 #include <rte_vhost.h>
15 #include <rte_tcp.h>
16 #include <rte_udp.h>
17 #include <rte_sctp.h>
18 #include <rte_arp.h>
19 #include <rte_spinlock.h>
20 #include <rte_malloc.h>
21 #include <rte_vhost_async.h>
22 
23 #include "iotlb.h"
24 #include "vhost.h"
25 
26 #define MAX_BATCH_LEN 256
27 
28 #define VHOST_ASYNC_BATCH_THRESHOLD 32
29 
30 static __rte_always_inline bool
31 rxvq_is_mergeable(struct virtio_net *dev)
32 {
33 	return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
34 }
35 
36 static __rte_always_inline bool
37 virtio_net_is_inorder(struct virtio_net *dev)
38 {
39 	return dev->features & (1ULL << VIRTIO_F_IN_ORDER);
40 }
41 
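/*
 * Virtio-net virtqueue indices alternate between RX and TX: even indices
 * are guest RX rings (the host enqueue path), odd indices are guest TX
 * rings (the host dequeue path).  is_tx selects which parity is accepted.
 */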
42 static bool
43 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
44 {
45 	return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
46 }
47 
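/*
 * Flush the small copies batched in vq->batch_copy_elems.  The enqueue
 * variant also logs every write for live migration; the dequeue variant
 * below only performs the copies.
 */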
48 static inline void
49 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
50 {
51 	struct batch_copy_elem *elem = vq->batch_copy_elems;
52 	uint16_t count = vq->batch_copy_nb_elems;
53 	int i;
54 
55 	for (i = 0; i < count; i++) {
56 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
57 		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
58 					   elem[i].len);
59 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
60 	}
61 
62 	vq->batch_copy_nb_elems = 0;
63 }
64 
65 static inline void
66 do_data_copy_dequeue(struct vhost_virtqueue *vq)
67 {
68 	struct batch_copy_elem *elem = vq->batch_copy_elems;
69 	uint16_t count = vq->batch_copy_nb_elems;
70 	int i;
71 
72 	for (i = 0; i < count; i++)
73 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
74 
75 	vq->batch_copy_nb_elems = 0;
76 }
77 
78 static __rte_always_inline void
79 do_flush_shadow_used_ring_split(struct virtio_net *dev,
80 			struct vhost_virtqueue *vq,
81 			uint16_t to, uint16_t from, uint16_t size)
82 {
83 	rte_memcpy(&vq->used->ring[to],
84 			&vq->shadow_used_split[from],
85 			size * sizeof(struct vring_used_elem));
86 	vhost_log_cache_used_vring(dev, vq,
87 			offsetof(struct vring_used, ring[to]),
88 			size * sizeof(struct vring_used_elem));
89 }
90 
91 static __rte_always_inline void
92 flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
93 {
94 	uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
95 
96 	if (used_idx + vq->shadow_used_idx <= vq->size) {
97 		do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
98 					  vq->shadow_used_idx);
99 	} else {
100 		uint16_t size;
101 
102 		/* update the used ring interval [used_idx, vq->size) */
103 		size = vq->size - used_idx;
104 		do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
105 
106 		/* update the remaining interval [0, shadow_used_idx - size) */
107 		do_flush_shadow_used_ring_split(dev, vq, 0, size,
108 					  vq->shadow_used_idx - size);
109 	}
110 	vq->last_used_idx += vq->shadow_used_idx;
111 
112 	vhost_log_cache_sync(dev, vq);
113 
114 	__atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
115 			   __ATOMIC_RELEASE);
116 	vq->shadow_used_idx = 0;
117 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
118 		sizeof(vq->used->idx));
119 }
120 
121 static __rte_always_inline void
122 update_shadow_used_ring_split(struct vhost_virtqueue *vq,
123 			 uint16_t desc_idx, uint32_t len)
124 {
125 	uint16_t i = vq->shadow_used_idx++;
126 
127 	vq->shadow_used_split[i].id  = desc_idx;
128 	vq->shadow_used_split[i].len = len;
129 }
130 
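/*
 * Write the shadow used entries back into the packed descriptor ring:
 * id/len first, then the flags after a release fence, with the head
 * descriptor's flags stored last so the guest never sees a partially
 * updated chain.
 */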
131 static __rte_always_inline void
132 vhost_flush_enqueue_shadow_packed(struct virtio_net *dev,
133 				  struct vhost_virtqueue *vq)
134 {
135 	int i;
136 	uint16_t used_idx = vq->last_used_idx;
137 	uint16_t head_idx = vq->last_used_idx;
138 	uint16_t head_flags = 0;
139 
140 	/* Split loop in two to save memory barriers */
141 	for (i = 0; i < vq->shadow_used_idx; i++) {
142 		vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
143 		vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
144 
145 		used_idx += vq->shadow_used_packed[i].count;
146 		if (used_idx >= vq->size)
147 			used_idx -= vq->size;
148 	}
149 
150 	/* The ordering for storing desc flags needs to be enforced. */
151 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
152 
153 	for (i = 0; i < vq->shadow_used_idx; i++) {
154 		uint16_t flags;
155 
156 		if (vq->shadow_used_packed[i].len)
157 			flags = VRING_DESC_F_WRITE;
158 		else
159 			flags = 0;
160 
161 		if (vq->used_wrap_counter) {
162 			flags |= VRING_DESC_F_USED;
163 			flags |= VRING_DESC_F_AVAIL;
164 		} else {
165 			flags &= ~VRING_DESC_F_USED;
166 			flags &= ~VRING_DESC_F_AVAIL;
167 		}
168 
169 		if (i > 0) {
170 			vq->desc_packed[vq->last_used_idx].flags = flags;
171 
172 			vhost_log_cache_used_vring(dev, vq,
173 					vq->last_used_idx *
174 					sizeof(struct vring_packed_desc),
175 					sizeof(struct vring_packed_desc));
176 		} else {
177 			head_idx = vq->last_used_idx;
178 			head_flags = flags;
179 		}
180 
181 		vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count);
182 	}
183 
184 	vq->desc_packed[head_idx].flags = head_flags;
185 
186 	vhost_log_cache_used_vring(dev, vq,
187 				head_idx *
188 				sizeof(struct vring_packed_desc),
189 				sizeof(struct vring_packed_desc));
190 
191 	vq->shadow_used_idx = 0;
192 	vhost_log_cache_sync(dev, vq);
193 }
194 
195 static __rte_always_inline void
196 vhost_flush_dequeue_shadow_packed(struct virtio_net *dev,
197 				  struct vhost_virtqueue *vq)
198 {
199 	struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0];
200 
201 	vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id;
202 	/* The desc flags field is the synchronization point for the virtio packed vring */
203 	__atomic_store_n(&vq->desc_packed[vq->shadow_last_used_idx].flags,
204 			 used_elem->flags, __ATOMIC_RELEASE);
205 
206 	vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx *
207 				   sizeof(struct vring_packed_desc),
208 				   sizeof(struct vring_packed_desc));
209 	vq->shadow_used_idx = 0;
210 	vhost_log_cache_sync(dev, vq);
211 }
212 
213 static __rte_always_inline void
214 vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
215 				 struct vhost_virtqueue *vq,
216 				 uint64_t *lens,
217 				 uint16_t *ids)
218 {
219 	uint16_t i;
220 	uint16_t flags;
221 	uint16_t last_used_idx;
222 	struct vring_packed_desc *desc_base;
223 
224 	last_used_idx = vq->last_used_idx;
225 	desc_base = &vq->desc_packed[last_used_idx];
226 
227 	flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter);
228 
229 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
230 		desc_base[i].id = ids[i];
231 		desc_base[i].len = lens[i];
232 	}
233 
234 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
235 
236 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
237 		desc_base[i].flags = flags;
238 	}
239 
240 	vhost_log_cache_used_vring(dev, vq, last_used_idx *
241 				   sizeof(struct vring_packed_desc),
242 				   sizeof(struct vring_packed_desc) *
243 				   PACKED_BATCH_SIZE);
244 	vhost_log_cache_sync(dev, vq);
245 
246 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
247 }
248 
249 static __rte_always_inline void
250 vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
251 					  uint16_t id)
252 {
253 	vq->shadow_used_packed[0].id = id;
254 
255 	if (!vq->shadow_used_idx) {
256 		vq->shadow_last_used_idx = vq->last_used_idx;
257 		vq->shadow_used_packed[0].flags =
258 			PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
259 		vq->shadow_used_packed[0].len = 0;
260 		vq->shadow_used_packed[0].count = 1;
261 		vq->shadow_used_idx++;
262 	}
263 
264 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
265 }
266 
267 static __rte_always_inline void
268 vhost_shadow_dequeue_batch_packed(struct virtio_net *dev,
269 				  struct vhost_virtqueue *vq,
270 				  uint16_t *ids)
271 {
272 	uint16_t flags;
273 	uint16_t i;
274 	uint16_t begin;
275 
276 	flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
277 
278 	if (!vq->shadow_used_idx) {
279 		vq->shadow_last_used_idx = vq->last_used_idx;
280 		vq->shadow_used_packed[0].id  = ids[0];
281 		vq->shadow_used_packed[0].len = 0;
282 		vq->shadow_used_packed[0].count = 1;
283 		vq->shadow_used_packed[0].flags = flags;
284 		vq->shadow_used_idx++;
285 		begin = 1;
286 	} else
287 		begin = 0;
288 
289 	vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) {
290 		vq->desc_packed[vq->last_used_idx + i].id = ids[i];
291 		vq->desc_packed[vq->last_used_idx + i].len = 0;
292 	}
293 
294 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
295 	vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE)
296 		vq->desc_packed[vq->last_used_idx + i].flags = flags;
297 
298 	vhost_log_cache_used_vring(dev, vq, vq->last_used_idx *
299 				   sizeof(struct vring_packed_desc),
300 				   sizeof(struct vring_packed_desc) *
301 				   PACKED_BATCH_SIZE);
302 	vhost_log_cache_sync(dev, vq);
303 
304 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
305 }
306 
307 static __rte_always_inline void
308 vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq,
309 				   uint16_t buf_id,
310 				   uint16_t count)
311 {
312 	uint16_t flags;
313 
314 	flags = vq->desc_packed[vq->last_used_idx].flags;
315 	if (vq->used_wrap_counter) {
316 		flags |= VRING_DESC_F_USED;
317 		flags |= VRING_DESC_F_AVAIL;
318 	} else {
319 		flags &= ~VRING_DESC_F_USED;
320 		flags &= ~VRING_DESC_F_AVAIL;
321 	}
322 
323 	if (!vq->shadow_used_idx) {
324 		vq->shadow_last_used_idx = vq->last_used_idx;
325 
326 		vq->shadow_used_packed[0].id  = buf_id;
327 		vq->shadow_used_packed[0].len = 0;
328 		vq->shadow_used_packed[0].flags = flags;
329 		vq->shadow_used_idx++;
330 	} else {
331 		vq->desc_packed[vq->last_used_idx].id = buf_id;
332 		vq->desc_packed[vq->last_used_idx].len = 0;
333 		vq->desc_packed[vq->last_used_idx].flags = flags;
334 	}
335 
336 	vq_inc_last_used_packed(vq, count);
337 }
338 
339 static __rte_always_inline void
340 vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
341 					   uint16_t buf_id,
342 					   uint16_t count)
343 {
344 	uint16_t flags;
345 
346 	vq->shadow_used_packed[0].id = buf_id;
347 
348 	flags = vq->desc_packed[vq->last_used_idx].flags;
349 	if (vq->used_wrap_counter) {
350 		flags |= VRING_DESC_F_USED;
351 		flags |= VRING_DESC_F_AVAIL;
352 	} else {
353 		flags &= ~VRING_DESC_F_USED;
354 		flags &= ~VRING_DESC_F_AVAIL;
355 	}
356 
357 	if (!vq->shadow_used_idx) {
358 		vq->shadow_last_used_idx = vq->last_used_idx;
359 		vq->shadow_used_packed[0].len = 0;
360 		vq->shadow_used_packed[0].flags = flags;
361 		vq->shadow_used_idx++;
362 	}
363 
364 	vq_inc_last_used_packed(vq, count);
365 }
366 
367 static __rte_always_inline void
368 vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
369 				   uint32_t *len,
370 				   uint16_t *id,
371 				   uint16_t *count,
372 				   uint16_t num_buffers)
373 {
374 	uint16_t i;
375 
376 	for (i = 0; i < num_buffers; i++) {
377 		/* align the shadow flush boundary with the packed batch size */
378 		if (!vq->shadow_used_idx)
379 			vq->shadow_aligned_idx = vq->last_used_idx &
380 				PACKED_BATCH_MASK;
381 		vq->shadow_used_packed[vq->shadow_used_idx].id  = id[i];
382 		vq->shadow_used_packed[vq->shadow_used_idx].len = len[i];
383 		vq->shadow_used_packed[vq->shadow_used_idx].count = count[i];
384 		vq->shadow_aligned_idx += count[i];
385 		vq->shadow_used_idx++;
386 	}
387 }
388 
389 static __rte_always_inline void
390 vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
391 				   struct vhost_virtqueue *vq,
392 				   uint32_t *len,
393 				   uint16_t *id,
394 				   uint16_t *count,
395 				   uint16_t num_buffers)
396 {
397 	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
398 
399 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
400 		do_data_copy_enqueue(dev, vq);
401 		vhost_flush_enqueue_shadow_packed(dev, vq);
402 	}
403 }
404 
405 /* avoid the write operation when it is not necessary, to lessen cache issues */
406 #define ASSIGN_UNLESS_EQUAL(var, val) do {	\
407 	if ((var) != (val))			\
408 		(var) = (val);			\
409 } while (0)
410 
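/*
 * Translate the mbuf Tx offload requests (L4 checksum, TSO/UFO) into the
 * virtio-net header fields consumed by the guest.
 */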
411 static __rte_always_inline void
412 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
413 {
414 	uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
415 
416 	if (m_buf->ol_flags & PKT_TX_TCP_SEG)
417 		csum_l4 |= PKT_TX_TCP_CKSUM;
418 
419 	if (csum_l4) {
420 		net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
421 		net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
422 
423 		switch (csum_l4) {
424 		case PKT_TX_TCP_CKSUM:
425 			net_hdr->csum_offset = (offsetof(struct rte_tcp_hdr,
426 						cksum));
427 			break;
428 		case PKT_TX_UDP_CKSUM:
429 			net_hdr->csum_offset = (offsetof(struct rte_udp_hdr,
430 						dgram_cksum));
431 			break;
432 		case PKT_TX_SCTP_CKSUM:
433 			net_hdr->csum_offset = (offsetof(struct rte_sctp_hdr,
434 						cksum));
435 			break;
436 		}
437 	} else {
438 		ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
439 		ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
440 		ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
441 	}
442 
443 	/* IP cksum verification cannot be bypassed, so the cksum must be computed here */
444 	if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
445 		struct rte_ipv4_hdr *ipv4_hdr;
446 
447 		ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct rte_ipv4_hdr *,
448 						   m_buf->l2_len);
449 		ipv4_hdr->hdr_checksum = 0;
450 		ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
451 	}
452 
453 	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
454 		if (m_buf->ol_flags & PKT_TX_IPV4)
455 			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
456 		else
457 			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
458 		net_hdr->gso_size = m_buf->tso_segsz;
459 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
460 					+ m_buf->l4_len;
461 	} else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
462 		net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
463 		net_hdr->gso_size = m_buf->tso_segsz;
464 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
465 			m_buf->l4_len;
466 	} else {
467 		ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
468 		ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
469 		ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
470 	}
471 }
472 
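/*
 * Translate one descriptor's IOVA range into host virtual addresses and
 * append the resulting chunks to buf_vec; a single descriptor may map to
 * several host-contiguous chunks, hence the loop.
 */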
473 static __rte_always_inline int
474 map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
475 		struct buf_vector *buf_vec, uint16_t *vec_idx,
476 		uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
477 {
478 	uint16_t vec_id = *vec_idx;
479 
480 	while (desc_len) {
481 		uint64_t desc_addr;
482 		uint64_t desc_chunck_len = desc_len;
483 
484 		if (unlikely(vec_id >= BUF_VECTOR_MAX))
485 			return -1;
486 
487 		desc_addr = vhost_iova_to_vva(dev, vq,
488 				desc_iova,
489 				&desc_chunck_len,
490 				perm);
491 		if (unlikely(!desc_addr))
492 			return -1;
493 
494 		rte_prefetch0((void *)(uintptr_t)desc_addr);
495 
496 		buf_vec[vec_id].buf_iova = desc_iova;
497 		buf_vec[vec_id].buf_addr = desc_addr;
498 		buf_vec[vec_id].buf_len  = desc_chunck_len;
499 
500 		desc_len -= desc_chunck_len;
501 		desc_iova += desc_chunck_len;
502 		vec_id++;
503 	}
504 	*vec_idx = vec_id;
505 
506 	return 0;
507 }
508 
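/*
 * Walk the split-ring descriptor chain referenced by 'avail_idx'
 * (following an indirect table if present) and collect the mapped
 * buffers into buf_vec.
 */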
509 static __rte_always_inline int
510 fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
511 			 uint32_t avail_idx, uint16_t *vec_idx,
512 			 struct buf_vector *buf_vec, uint16_t *desc_chain_head,
513 			 uint32_t *desc_chain_len, uint8_t perm)
514 {
515 	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
516 	uint16_t vec_id = *vec_idx;
517 	uint32_t len    = 0;
518 	uint64_t dlen;
519 	uint32_t nr_descs = vq->size;
520 	uint32_t cnt    = 0;
521 	struct vring_desc *descs = vq->desc;
522 	struct vring_desc *idesc = NULL;
523 
524 	if (unlikely(idx >= vq->size))
525 		return -1;
526 
527 	*desc_chain_head = idx;
528 
529 	if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
530 		dlen = vq->desc[idx].len;
531 		nr_descs = dlen / sizeof(struct vring_desc);
532 		if (unlikely(nr_descs > vq->size))
533 			return -1;
534 
535 		descs = (struct vring_desc *)(uintptr_t)
536 			vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
537 						&dlen,
538 						VHOST_ACCESS_RO);
539 		if (unlikely(!descs))
540 			return -1;
541 
542 		if (unlikely(dlen < vq->desc[idx].len)) {
543 			/*
544 			 * The indirect desc table is not contiguous
545 			 * in the process VA space, so we have to copy it.
546 			 */
547 			idesc = vhost_alloc_copy_ind_table(dev, vq,
548 					vq->desc[idx].addr, vq->desc[idx].len);
549 			if (unlikely(!idesc))
550 				return -1;
551 
552 			descs = idesc;
553 		}
554 
555 		idx = 0;
556 	}
557 
558 	while (1) {
559 		if (unlikely(idx >= nr_descs || cnt++ >= nr_descs)) {
560 			free_ind_table(idesc);
561 			return -1;
562 		}
563 
564 		dlen = descs[idx].len;
565 		len += dlen;
566 
567 		if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
568 						descs[idx].addr, dlen,
569 						perm))) {
570 			free_ind_table(idesc);
571 			return -1;
572 		}
573 
574 		if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
575 			break;
576 
577 		idx = descs[idx].next;
578 	}
579 
580 	*desc_chain_len = len;
581 	*vec_idx = vec_id;
582 
583 	if (unlikely(!!idesc))
584 		free_ind_table(idesc);
585 
586 	return 0;
587 }
588 
589 /*
590  * Returns -1 on fail, 0 on success
591  */
592 static inline int
593 reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
594 				uint32_t size, struct buf_vector *buf_vec,
595 				uint16_t *num_buffers, uint16_t avail_head,
596 				uint16_t *nr_vec)
597 {
598 	uint16_t cur_idx;
599 	uint16_t vec_idx = 0;
600 	uint16_t max_tries, tries = 0;
601 
602 	uint16_t head_idx = 0;
603 	uint32_t len = 0;
604 
605 	*num_buffers = 0;
606 	cur_idx  = vq->last_avail_idx;
607 
608 	if (rxvq_is_mergeable(dev))
609 		max_tries = vq->size - 1;
610 	else
611 		max_tries = 1;
612 
613 	while (size > 0) {
614 		if (unlikely(cur_idx == avail_head))
615 			return -1;
616 		/*
617 		 * If we have tried all available ring items and still
618 		 * cannot get enough buffers, something abnormal has
619 		 * happened.
620 		 */
621 		if (unlikely(++tries > max_tries))
622 			return -1;
623 
624 		if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
625 						&vec_idx, buf_vec,
626 						&head_idx, &len,
627 						VHOST_ACCESS_RW) < 0))
628 			return -1;
629 		len = RTE_MIN(len, size);
630 		update_shadow_used_ring_split(vq, head_idx, len);
631 		size -= len;
632 
633 		cur_idx++;
634 		*num_buffers += 1;
635 	}
636 
637 	*nr_vec = vec_idx;
638 
639 	return 0;
640 }
641 
642 static __rte_always_inline int
643 fill_vec_buf_packed_indirect(struct virtio_net *dev,
644 			struct vhost_virtqueue *vq,
645 			struct vring_packed_desc *desc, uint16_t *vec_idx,
646 			struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
647 {
648 	uint16_t i;
649 	uint32_t nr_descs;
650 	uint16_t vec_id = *vec_idx;
651 	uint64_t dlen;
652 	struct vring_packed_desc *descs, *idescs = NULL;
653 
654 	dlen = desc->len;
655 	descs = (struct vring_packed_desc *)(uintptr_t)
656 		vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
657 	if (unlikely(!descs))
658 		return -1;
659 
660 	if (unlikely(dlen < desc->len)) {
661 		/*
662 		 * The indirect desc table is not contiguous
663 		 * in the process VA space, so we have to copy it.
664 		 */
665 		idescs = vhost_alloc_copy_ind_table(dev,
666 				vq, desc->addr, desc->len);
667 		if (unlikely(!idescs))
668 			return -1;
669 
670 		descs = idescs;
671 	}
672 
673 	nr_descs =  desc->len / sizeof(struct vring_packed_desc);
674 	if (unlikely(nr_descs >= vq->size)) {
675 		free_ind_table(idescs);
676 		return -1;
677 	}
678 
679 	for (i = 0; i < nr_descs; i++) {
680 		if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
681 			free_ind_table(idescs);
682 			return -1;
683 		}
684 
685 		dlen = descs[i].len;
686 		*len += dlen;
687 		if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
688 						descs[i].addr, dlen,
689 						perm)))
690 			return -1;
691 	}
692 	*vec_idx = vec_id;
693 
694 	if (unlikely(!!idescs))
695 		free_ind_table(idescs);
696 
697 	return 0;
698 }
699 
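/*
 * Collect the packed-ring descriptor chain starting at 'avail_idx' into
 * buf_vec, handling indirect descriptors and ring wrap-around.
 */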
700 static __rte_always_inline int
701 fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
702 				uint16_t avail_idx, uint16_t *desc_count,
703 				struct buf_vector *buf_vec, uint16_t *vec_idx,
704 				uint16_t *buf_id, uint32_t *len, uint8_t perm)
705 {
706 	bool wrap_counter = vq->avail_wrap_counter;
707 	struct vring_packed_desc *descs = vq->desc_packed;
708 	uint16_t vec_id = *vec_idx;
709 	uint64_t dlen;
710 
711 	if (avail_idx < vq->last_avail_idx)
712 		wrap_counter ^= 1;
713 
714 	/*
715 	 * Perform a load-acquire barrier in desc_is_avail to
716 	 * enforce the ordering between desc flags and desc
717 	 * content.
718 	 */
719 	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
720 		return -1;
721 
722 	*desc_count = 0;
723 	*len = 0;
724 
725 	while (1) {
726 		if (unlikely(vec_id >= BUF_VECTOR_MAX))
727 			return -1;
728 
729 		if (unlikely(*desc_count >= vq->size))
730 			return -1;
731 
732 		*desc_count += 1;
733 		*buf_id = descs[avail_idx].id;
734 
735 		if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
736 			if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
737 							&descs[avail_idx],
738 							&vec_id, buf_vec,
739 							len, perm) < 0))
740 				return -1;
741 		} else {
742 			dlen = descs[avail_idx].len;
743 			*len += dlen;
744 
745 			if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
746 							descs[avail_idx].addr,
747 							dlen,
748 							perm)))
749 				return -1;
750 		}
751 
752 		if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
753 			break;
754 
755 		if (++avail_idx >= vq->size) {
756 			avail_idx -= vq->size;
757 			wrap_counter ^= 1;
758 		}
759 	}
760 
761 	*vec_idx = vec_id;
762 
763 	return 0;
764 }
765 
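/*
 * Copy the virtio-net header into the descriptor chain when the first
 * buffer is too small to hold it contiguously.
 */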
766 static __rte_noinline void
767 copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
768 		struct buf_vector *buf_vec,
769 		struct virtio_net_hdr_mrg_rxbuf *hdr)
770 {
771 	uint64_t len;
772 	uint64_t remain = dev->vhost_hlen;
773 	uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
774 	uint64_t iova = buf_vec->buf_iova;
775 
776 	while (remain) {
777 		len = RTE_MIN(remain,
778 				buf_vec->buf_len);
779 		dst = buf_vec->buf_addr;
780 		rte_memcpy((void *)(uintptr_t)dst,
781 				(void *)(uintptr_t)src,
782 				len);
783 
784 		PRINT_PACKET(dev, (uintptr_t)dst,
785 				(uint32_t)len, 0);
786 		vhost_log_cache_write_iova(dev, vq,
787 				iova, len);
788 
789 		remain -= len;
790 		iova += len;
791 		src += len;
792 		buf_vec++;
793 	}
794 }
795 
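/*
 * Synchronous enqueue: copy one mbuf chain into the guest buffers
 * described by buf_vec, writing the virtio-net header first.  Small
 * copies are deferred to the batch_copy array and flushed later by
 * do_data_copy_enqueue().
 */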
796 static __rte_always_inline int
797 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
798 			    struct rte_mbuf *m, struct buf_vector *buf_vec,
799 			    uint16_t nr_vec, uint16_t num_buffers)
800 {
801 	uint32_t vec_idx = 0;
802 	uint32_t mbuf_offset, mbuf_avail;
803 	uint32_t buf_offset, buf_avail;
804 	uint64_t buf_addr, buf_iova, buf_len;
805 	uint32_t cpy_len;
806 	uint64_t hdr_addr;
807 	struct rte_mbuf *hdr_mbuf;
808 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
809 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
810 	int error = 0;
811 
812 	if (unlikely(m == NULL)) {
813 		error = -1;
814 		goto out;
815 	}
816 
817 	buf_addr = buf_vec[vec_idx].buf_addr;
818 	buf_iova = buf_vec[vec_idx].buf_iova;
819 	buf_len = buf_vec[vec_idx].buf_len;
820 
821 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
822 		error = -1;
823 		goto out;
824 	}
825 
826 	hdr_mbuf = m;
827 	hdr_addr = buf_addr;
828 	if (unlikely(buf_len < dev->vhost_hlen)) {
829 		memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
830 		hdr = &tmp_hdr;
831 	} else
832 		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
833 
834 	VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
835 		dev->vid, num_buffers);
836 
837 	if (unlikely(buf_len < dev->vhost_hlen)) {
838 		buf_offset = dev->vhost_hlen - buf_len;
839 		vec_idx++;
840 		buf_addr = buf_vec[vec_idx].buf_addr;
841 		buf_iova = buf_vec[vec_idx].buf_iova;
842 		buf_len = buf_vec[vec_idx].buf_len;
843 		buf_avail = buf_len - buf_offset;
844 	} else {
845 		buf_offset = dev->vhost_hlen;
846 		buf_avail = buf_len - dev->vhost_hlen;
847 	}
848 
849 	mbuf_avail  = rte_pktmbuf_data_len(m);
850 	mbuf_offset = 0;
851 	while (mbuf_avail != 0 || m->next != NULL) {
852 		/* done with current buf, get the next one */
853 		if (buf_avail == 0) {
854 			vec_idx++;
855 			if (unlikely(vec_idx >= nr_vec)) {
856 				error = -1;
857 				goto out;
858 			}
859 
860 			buf_addr = buf_vec[vec_idx].buf_addr;
861 			buf_iova = buf_vec[vec_idx].buf_iova;
862 			buf_len = buf_vec[vec_idx].buf_len;
863 
864 			buf_offset = 0;
865 			buf_avail  = buf_len;
866 		}
867 
868 		/* done with current mbuf, get the next one */
869 		if (mbuf_avail == 0) {
870 			m = m->next;
871 
872 			mbuf_offset = 0;
873 			mbuf_avail  = rte_pktmbuf_data_len(m);
874 		}
875 
876 		if (hdr_addr) {
877 			virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
878 			if (rxvq_is_mergeable(dev))
879 				ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
880 						num_buffers);
881 
882 			if (unlikely(hdr == &tmp_hdr)) {
883 				copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
884 			} else {
885 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
886 						dev->vhost_hlen, 0);
887 				vhost_log_cache_write_iova(dev, vq,
888 						buf_vec[0].buf_iova,
889 						dev->vhost_hlen);
890 			}
891 
892 			hdr_addr = 0;
893 		}
894 
895 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
896 
897 		if (likely(cpy_len > MAX_BATCH_LEN ||
898 					vq->batch_copy_nb_elems >= vq->size)) {
899 			rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
900 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
901 				cpy_len);
902 			vhost_log_cache_write_iova(dev, vq,
903 						   buf_iova + buf_offset,
904 						   cpy_len);
905 			PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
906 				cpy_len, 0);
907 		} else {
908 			batch_copy[vq->batch_copy_nb_elems].dst =
909 				(void *)((uintptr_t)(buf_addr + buf_offset));
910 			batch_copy[vq->batch_copy_nb_elems].src =
911 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
912 			batch_copy[vq->batch_copy_nb_elems].log_addr =
913 				buf_iova + buf_offset;
914 			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
915 			vq->batch_copy_nb_elems++;
916 		}
917 
918 		mbuf_avail  -= cpy_len;
919 		mbuf_offset += cpy_len;
920 		buf_avail  -= cpy_len;
921 		buf_offset += cpy_len;
922 	}
923 
924 out:
925 
926 	return error;
927 }
928 
929 static __rte_always_inline void
930 async_fill_vec(struct iovec *v, void *base, size_t len)
931 {
932 	v->iov_base = base;
933 	v->iov_len = len;
934 }
935 
936 static __rte_always_inline void
937 async_fill_iter(struct rte_vhost_iov_iter *it, size_t count,
938 	struct iovec *vec, unsigned long nr_seg)
939 {
940 	it->offset = 0;
941 	it->count = count;
942 
943 	if (count) {
944 		it->iov = vec;
945 		it->nr_segs = nr_seg;
946 	} else {
947 		it->iov = 0;
948 		it->nr_segs = 0;
949 	}
950 }
951 
952 static __rte_always_inline void
953 async_fill_desc(struct rte_vhost_async_desc *desc,
954 	struct rte_vhost_iov_iter *src, struct rte_vhost_iov_iter *dst)
955 {
956 	desc->src = src;
957 	desc->dst = dst;
958 }
959 
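/*
 * Asynchronous enqueue: segments of at least vq->async_threshold bytes
 * are described as source/destination iovecs for the DMA engine, while
 * the remainder is copied by the CPU.
 */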
960 static __rte_always_inline int
961 async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
962 			struct rte_mbuf *m, struct buf_vector *buf_vec,
963 			uint16_t nr_vec, uint16_t num_buffers,
964 			struct iovec *src_iovec, struct iovec *dst_iovec,
965 			struct rte_vhost_iov_iter *src_it,
966 			struct rte_vhost_iov_iter *dst_it)
967 {
968 	uint32_t vec_idx = 0;
969 	uint32_t mbuf_offset, mbuf_avail;
970 	uint32_t buf_offset, buf_avail;
971 	uint64_t buf_addr, buf_iova, buf_len;
972 	uint32_t cpy_len, cpy_threshold;
973 	uint64_t hdr_addr;
974 	struct rte_mbuf *hdr_mbuf;
975 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
976 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
977 	int error = 0;
978 	uint64_t mapped_len;
979 
980 	uint32_t tlen = 0;
981 	int tvec_idx = 0;
982 	void *hpa;
983 
984 	if (unlikely(m == NULL)) {
985 		error = -1;
986 		goto out;
987 	}
988 
989 	cpy_threshold = vq->async_threshold;
990 
991 	buf_addr = buf_vec[vec_idx].buf_addr;
992 	buf_iova = buf_vec[vec_idx].buf_iova;
993 	buf_len = buf_vec[vec_idx].buf_len;
994 
995 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
996 		error = -1;
997 		goto out;
998 	}
999 
1000 	hdr_mbuf = m;
1001 	hdr_addr = buf_addr;
1002 	if (unlikely(buf_len < dev->vhost_hlen)) {
1003 		memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
1004 		hdr = &tmp_hdr;
1005 	} else
1006 		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
1007 
1008 	VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
1009 		dev->vid, num_buffers);
1010 
1011 	if (unlikely(buf_len < dev->vhost_hlen)) {
1012 		buf_offset = dev->vhost_hlen - buf_len;
1013 		vec_idx++;
1014 		buf_addr = buf_vec[vec_idx].buf_addr;
1015 		buf_iova = buf_vec[vec_idx].buf_iova;
1016 		buf_len = buf_vec[vec_idx].buf_len;
1017 		buf_avail = buf_len - buf_offset;
1018 	} else {
1019 		buf_offset = dev->vhost_hlen;
1020 		buf_avail = buf_len - dev->vhost_hlen;
1021 	}
1022 
1023 	mbuf_avail  = rte_pktmbuf_data_len(m);
1024 	mbuf_offset = 0;
1025 
1026 	while (mbuf_avail != 0 || m->next != NULL) {
1027 		/* done with current buf, get the next one */
1028 		if (buf_avail == 0) {
1029 			vec_idx++;
1030 			if (unlikely(vec_idx >= nr_vec)) {
1031 				error = -1;
1032 				goto out;
1033 			}
1034 
1035 			buf_addr = buf_vec[vec_idx].buf_addr;
1036 			buf_iova = buf_vec[vec_idx].buf_iova;
1037 			buf_len = buf_vec[vec_idx].buf_len;
1038 
1039 			buf_offset = 0;
1040 			buf_avail  = buf_len;
1041 		}
1042 
1043 		/* done with current mbuf, get the next one */
1044 		if (mbuf_avail == 0) {
1045 			m = m->next;
1046 
1047 			mbuf_offset = 0;
1048 			mbuf_avail  = rte_pktmbuf_data_len(m);
1049 		}
1050 
1051 		if (hdr_addr) {
1052 			virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
1053 			if (rxvq_is_mergeable(dev))
1054 				ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
1055 						num_buffers);
1056 
1057 			if (unlikely(hdr == &tmp_hdr)) {
1058 				copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
1059 			} else {
1060 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
1061 						dev->vhost_hlen, 0);
1062 				vhost_log_cache_write_iova(dev, vq,
1063 						buf_vec[0].buf_iova,
1064 						dev->vhost_hlen);
1065 			}
1066 
1067 			hdr_addr = 0;
1068 		}
1069 
1070 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
1071 
1072 		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
1073 			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
1074 					buf_iova + buf_offset,
1075 					cpy_len, &mapped_len);
1076 
1077 			if (unlikely(!hpa || mapped_len < cpy_threshold))
1078 				break;
1079 
1080 			async_fill_vec(src_iovec + tvec_idx,
1081 				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
1082 				mbuf_offset), (size_t)mapped_len);
1083 
1084 			async_fill_vec(dst_iovec + tvec_idx,
1085 					hpa, (size_t)mapped_len);
1086 
1087 			tlen += (uint32_t)mapped_len;
1088 			cpy_len -= (uint32_t)mapped_len;
1089 			mbuf_avail  -= (uint32_t)mapped_len;
1090 			mbuf_offset += (uint32_t)mapped_len;
1091 			buf_avail  -= (uint32_t)mapped_len;
1092 			buf_offset += (uint32_t)mapped_len;
1093 			tvec_idx++;
1094 		}
1095 
1096 		if (likely(cpy_len)) {
1097 			if (unlikely(vq->batch_copy_nb_elems >= vq->size)) {
1098 				rte_memcpy(
1099 				(void *)((uintptr_t)(buf_addr + buf_offset)),
1100 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
1101 				cpy_len);
1102 
1103 				PRINT_PACKET(dev,
1104 					(uintptr_t)(buf_addr + buf_offset),
1105 					cpy_len, 0);
1106 			} else {
1107 				batch_copy[vq->batch_copy_nb_elems].dst =
1108 				(void *)((uintptr_t)(buf_addr + buf_offset));
1109 				batch_copy[vq->batch_copy_nb_elems].src =
1110 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
1111 				batch_copy[vq->batch_copy_nb_elems].log_addr =
1112 					buf_iova + buf_offset;
1113 				batch_copy[vq->batch_copy_nb_elems].len =
1114 					cpy_len;
1115 				vq->batch_copy_nb_elems++;
1116 			}
1117 
1118 			mbuf_avail  -= cpy_len;
1119 			mbuf_offset += cpy_len;
1120 			buf_avail  -= cpy_len;
1121 			buf_offset += cpy_len;
1122 		}
1123 
1124 	}
1125 
1126 out:
1127 	if (tlen) {
1128 		async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
1129 		async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
1130 	} else {
1131 		src_it->count = 0;
1132 	}
1133 
1134 	return error;
1135 }
1136 
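/*
 * Reserve enough packed-ring buffers for one packet, copy it in, and
 * queue the used-ring updates in the shadow ring.
 */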
1137 static __rte_always_inline int
1138 vhost_enqueue_single_packed(struct virtio_net *dev,
1139 			    struct vhost_virtqueue *vq,
1140 			    struct rte_mbuf *pkt,
1141 			    struct buf_vector *buf_vec,
1142 			    uint16_t *nr_descs)
1143 {
1144 	uint16_t nr_vec = 0;
1145 	uint16_t avail_idx = vq->last_avail_idx;
1146 	uint16_t max_tries, tries = 0;
1147 	uint16_t buf_id = 0;
1148 	uint32_t len = 0;
1149 	uint16_t desc_count;
1150 	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1151 	uint16_t num_buffers = 0;
1152 	uint32_t buffer_len[vq->size];
1153 	uint16_t buffer_buf_id[vq->size];
1154 	uint16_t buffer_desc_count[vq->size];
1155 
1156 	if (rxvq_is_mergeable(dev))
1157 		max_tries = vq->size - 1;
1158 	else
1159 		max_tries = 1;
1160 
1161 	while (size > 0) {
1162 		/*
1163 		 * If we have tried all available ring items and still
1164 		 * cannot get enough buffers, something abnormal has
1165 		 * happened.
1166 		 */
1167 		if (unlikely(++tries > max_tries))
1168 			return -1;
1169 
1170 		if (unlikely(fill_vec_buf_packed(dev, vq,
1171 						avail_idx, &desc_count,
1172 						buf_vec, &nr_vec,
1173 						&buf_id, &len,
1174 						VHOST_ACCESS_RW) < 0))
1175 			return -1;
1176 
1177 		len = RTE_MIN(len, size);
1178 		size -= len;
1179 
1180 		buffer_len[num_buffers] = len;
1181 		buffer_buf_id[num_buffers] = buf_id;
1182 		buffer_desc_count[num_buffers] = desc_count;
1183 		num_buffers += 1;
1184 
1185 		*nr_descs += desc_count;
1186 		avail_idx += desc_count;
1187 		if (avail_idx >= vq->size)
1188 			avail_idx -= vq->size;
1189 	}
1190 
1191 	if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0)
1192 		return -1;
1193 
1194 	vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id,
1195 					   buffer_desc_count, num_buffers);
1196 
1197 	return 0;
1198 }
1199 
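/*
 * Enqueue a burst of packets into a split virtqueue, then flush the
 * shadow used ring and notify the guest if anything was enqueued.
 */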
1200 static __rte_noinline uint32_t
1201 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
1202 	struct rte_mbuf **pkts, uint32_t count)
1203 {
1204 	uint32_t pkt_idx = 0;
1205 	uint16_t num_buffers;
1206 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1207 	uint16_t avail_head;
1208 
1209 	/*
1210 	 * The ordering between avail index and
1211 	 * desc reads needs to be enforced.
1212 	 */
1213 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1214 
1215 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1216 
1217 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1218 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1219 		uint16_t nr_vec = 0;
1220 
1221 		if (unlikely(reserve_avail_buf_split(dev, vq,
1222 						pkt_len, buf_vec, &num_buffers,
1223 						avail_head, &nr_vec) < 0)) {
1224 			VHOST_LOG_DATA(DEBUG,
1225 				"(%d) failed to get enough desc from vring\n",
1226 				dev->vid);
1227 			vq->shadow_used_idx -= num_buffers;
1228 			break;
1229 		}
1230 
1231 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1232 			dev->vid, vq->last_avail_idx,
1233 			vq->last_avail_idx + num_buffers);
1234 
1235 		if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
1236 						buf_vec, nr_vec,
1237 						num_buffers) < 0) {
1238 			vq->shadow_used_idx -= num_buffers;
1239 			break;
1240 		}
1241 
1242 		vq->last_avail_idx += num_buffers;
1243 	}
1244 
1245 	do_data_copy_enqueue(dev, vq);
1246 
1247 	if (likely(vq->shadow_used_idx)) {
1248 		flush_shadow_used_ring_split(dev, vq);
1249 		vhost_vring_call_split(dev, vq);
1250 	}
1251 
1252 	return pkt_idx;
1253 }
1254 
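/*
 * Check whether the next PACKED_BATCH_SIZE descriptors can be filled as a
 * single batch: aligned index, single-segment mbufs, and descriptors that
 * are large enough and translatable.
 */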
1255 static __rte_always_inline int
1256 virtio_dev_rx_sync_batch_check(struct virtio_net *dev,
1257 			   struct vhost_virtqueue *vq,
1258 			   struct rte_mbuf **pkts,
1259 			   uint64_t *desc_addrs,
1260 			   uint64_t *lens)
1261 {
1262 	bool wrap_counter = vq->avail_wrap_counter;
1263 	struct vring_packed_desc *descs = vq->desc_packed;
1264 	uint16_t avail_idx = vq->last_avail_idx;
1265 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1266 	uint16_t i;
1267 
1268 	if (unlikely(avail_idx & PACKED_BATCH_MASK))
1269 		return -1;
1270 
1271 	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
1272 		return -1;
1273 
1274 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1275 		if (unlikely(pkts[i]->next != NULL))
1276 			return -1;
1277 		if (unlikely(!desc_is_avail(&descs[avail_idx + i],
1278 					    wrap_counter)))
1279 			return -1;
1280 	}
1281 
1282 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1283 		lens[i] = descs[avail_idx + i].len;
1284 
1285 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1286 		if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
1287 			return -1;
1288 	}
1289 
1290 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1291 		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
1292 						  descs[avail_idx + i].addr,
1293 						  &lens[i],
1294 						  VHOST_ACCESS_RW);
1295 
1296 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1297 		if (unlikely(!desc_addrs[i]))
1298 			return -1;
1299 		if (unlikely(lens[i] != descs[avail_idx + i].len))
1300 			return -1;
1301 	}
1302 
1303 	return 0;
1304 }
1305 
1306 static __rte_always_inline int
1307 virtio_dev_rx_async_batch_check(struct virtio_net *dev,
1308 			   struct vhost_virtqueue *vq,
1309 			   struct rte_mbuf **pkts,
1310 			   uint64_t *desc_addrs,
1311 			   uint64_t *lens)
1312 {
1313 	bool wrap_counter = vq->avail_wrap_counter;
1314 	struct vring_packed_desc *descs = vq->desc_packed;
1315 	uint16_t avail_idx = vq->last_avail_idx;
1316 	uint16_t used_idx = vq->last_used_idx;
1317 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1318 	uint32_t cpy_threshold = vq->async_threshold;
1319 	uint16_t i;
1320 
1321 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1322 		if (unlikely(pkts[i]->data_len >= cpy_threshold))
1323 			return -1;
1324 	}
1325 
1326 	if (unlikely(avail_idx & PACKED_BATCH_MASK))
1327 		return -1;
1328 
1329 	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
1330 		return -1;
1331 
1332 	if (unlikely((used_idx + PACKED_BATCH_SIZE) > vq->size))
1333 		return -1;
1334 
1335 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1336 		if (unlikely(pkts[i]->next != NULL))
1337 			return -1;
1338 		if (unlikely(!desc_is_avail(&descs[avail_idx + i],
1339 					    wrap_counter)))
1340 			return -1;
1341 	}
1342 
1343 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1344 		lens[i] = descs[avail_idx + i].len;
1345 
1346 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1347 		if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
1348 			return -1;
1349 	}
1350 
1351 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1352 		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
1353 						  descs[avail_idx + i].addr,
1354 						  &lens[i],
1355 						  VHOST_ACCESS_RW);
1356 
1357 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1358 		if (unlikely(!desc_addrs[i]))
1359 			return -1;
1360 		if (unlikely(lens[i] != descs[avail_idx + i].len))
1361 			return -1;
1362 	}
1363 
1364 	return 0;
1365 }
1366 
1367 static __rte_always_inline void
1368 virtio_dev_rx_batch_packed_copy(struct virtio_net *dev,
1369 			   struct vhost_virtqueue *vq,
1370 			   struct rte_mbuf **pkts,
1371 			   uint64_t *desc_addrs,
1372 			   uint64_t *lens)
1373 {
1374 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1375 	struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE];
1376 	struct vring_packed_desc *descs = vq->desc_packed;
1377 	uint16_t avail_idx = vq->last_avail_idx;
1378 	uint16_t ids[PACKED_BATCH_SIZE];
1379 	uint16_t i;
1380 
1381 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1382 		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
1383 		hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)
1384 					(uintptr_t)desc_addrs[i];
1385 		lens[i] = pkts[i]->pkt_len +
1386 			sizeof(struct virtio_net_hdr_mrg_rxbuf);
1387 	}
1388 
1389 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1390 		virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr);
1391 
1392 	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
1393 
1394 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1395 		rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset),
1396 			   rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
1397 			   pkts[i]->pkt_len);
1398 	}
1399 
1400 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1401 		vhost_log_cache_write_iova(dev, vq, descs[avail_idx + i].addr,
1402 					   lens[i]);
1403 
1404 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1405 		ids[i] = descs[avail_idx + i].id;
1406 
1407 	vhost_flush_enqueue_batch_packed(dev, vq, lens, ids);
1408 }
1409 
1410 static __rte_always_inline int
1411 virtio_dev_rx_sync_batch_packed(struct virtio_net *dev,
1412 			   struct vhost_virtqueue *vq,
1413 			   struct rte_mbuf **pkts)
1414 {
1415 	uint64_t desc_addrs[PACKED_BATCH_SIZE];
1416 	uint64_t lens[PACKED_BATCH_SIZE];
1417 
1418 	if (virtio_dev_rx_sync_batch_check(dev, vq, pkts, desc_addrs, lens) == -1)
1419 		return -1;
1420 
1421 	if (vq->shadow_used_idx) {
1422 		do_data_copy_enqueue(dev, vq);
1423 		vhost_flush_enqueue_shadow_packed(dev, vq);
1424 	}
1425 
1426 	virtio_dev_rx_batch_packed_copy(dev, vq, pkts, desc_addrs, lens);
1427 
1428 	return 0;
1429 }
1430 
1431 static __rte_always_inline int
1432 virtio_dev_rx_async_batch_packed(struct virtio_net *dev,
1433 			   struct vhost_virtqueue *vq,
1434 			   struct rte_mbuf **pkts,
1435 			   struct rte_mbuf **comp_pkts, uint32_t *pkt_done)
1436 {
1437 	uint16_t i;
1438 	uint64_t desc_addrs[PACKED_BATCH_SIZE];
1439 	uint64_t lens[PACKED_BATCH_SIZE];
1440 
1441 	if (virtio_dev_rx_async_batch_check(dev, vq, pkts, desc_addrs, lens) == -1)
1442 		return -1;
1443 
1444 	virtio_dev_rx_batch_packed_copy(dev, vq, pkts, desc_addrs, lens);
1445 
1446 	if (vq->shadow_used_idx) {
1447 		do_data_copy_enqueue(dev, vq);
1448 		vhost_flush_enqueue_shadow_packed(dev, vq);
1449 	}
1450 
1451 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1452 		comp_pkts[(*pkt_done)++] = pkts[i];
1453 
1454 	return 0;
1455 }
1456 
1457 static __rte_always_inline int16_t
1458 virtio_dev_rx_single_packed(struct virtio_net *dev,
1459 			    struct vhost_virtqueue *vq,
1460 			    struct rte_mbuf *pkt)
1461 {
1462 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1463 	uint16_t nr_descs = 0;
1464 
1465 	if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec,
1466 						 &nr_descs) < 0)) {
1467 		VHOST_LOG_DATA(DEBUG,
1468 				"(%d) failed to get enough desc from vring\n",
1469 				dev->vid);
1470 		return -1;
1471 	}
1472 
1473 	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1474 			dev->vid, vq->last_avail_idx,
1475 			vq->last_avail_idx + nr_descs);
1476 
1477 	vq_inc_last_avail_packed(vq, nr_descs);
1478 
1479 	return 0;
1480 }
1481 
1482 static __rte_noinline uint32_t
1483 virtio_dev_rx_packed(struct virtio_net *dev,
1484 		     struct vhost_virtqueue *__rte_restrict vq,
1485 		     struct rte_mbuf **__rte_restrict pkts,
1486 		     uint32_t count)
1487 {
1488 	uint32_t pkt_idx = 0;
1489 
1490 	do {
1491 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1492 
1493 		if (count - pkt_idx >= PACKED_BATCH_SIZE) {
1494 			if (!virtio_dev_rx_sync_batch_packed(dev, vq,
1495 							&pkts[pkt_idx])) {
1496 				pkt_idx += PACKED_BATCH_SIZE;
1497 				continue;
1498 			}
1499 		}
1500 
1501 		if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]))
1502 			break;
1503 		pkt_idx++;
1504 
1505 	} while (pkt_idx < count);
1506 
1507 	if (vq->shadow_used_idx) {
1508 		do_data_copy_enqueue(dev, vq);
1509 		vhost_flush_enqueue_shadow_packed(dev, vq);
1510 	}
1511 
1512 	if (pkt_idx)
1513 		vhost_vring_call_packed(dev, vq);
1514 
1515 	return pkt_idx;
1516 }
1517 
1518 static __rte_always_inline uint32_t
1519 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
1520 	struct rte_mbuf **pkts, uint32_t count)
1521 {
1522 	struct vhost_virtqueue *vq;
1523 	uint32_t nb_tx = 0;
1524 
1525 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
1526 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
1527 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
1528 			dev->vid, __func__, queue_id);
1529 		return 0;
1530 	}
1531 
1532 	vq = dev->virtqueue[queue_id];
1533 
1534 	rte_spinlock_lock(&vq->access_lock);
1535 
1536 	if (unlikely(!vq->enabled))
1537 		goto out_access_unlock;
1538 
1539 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1540 		vhost_user_iotlb_rd_lock(vq);
1541 
1542 	if (unlikely(!vq->access_ok))
1543 		if (unlikely(vring_translate(dev, vq) < 0))
1544 			goto out;
1545 
1546 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1547 	if (count == 0)
1548 		goto out;
1549 
1550 	if (vq_is_packed(dev))
1551 		nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
1552 	else
1553 		nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
1554 
1555 out:
1556 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1557 		vhost_user_iotlb_rd_unlock(vq);
1558 
1559 out_access_unlock:
1560 	rte_spinlock_unlock(&vq->access_lock);
1561 
1562 	return nb_tx;
1563 }
1564 
1565 uint16_t
1566 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
1567 	struct rte_mbuf **__rte_restrict pkts, uint16_t count)
1568 {
1569 	struct virtio_net *dev = get_device(vid);
1570 
1571 	if (!dev)
1572 		return 0;
1573 
1574 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1575 		VHOST_LOG_DATA(ERR,
1576 			"(%d) %s: built-in vhost net backend is disabled.\n",
1577 			dev->vid, __func__);
1578 		return 0;
1579 	}
1580 
1581 	return virtio_dev_rx(dev, queue_id, pkts, count);
1582 }
1583 
1584 static __rte_always_inline uint16_t
1585 virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
1586 	uint16_t vq_size, uint16_t n_inflight)
1587 {
1588 	return pkts_idx > n_inflight ? (pkts_idx - n_inflight) :
1589 		(vq_size - n_inflight + pkts_idx) % vq_size;
1590 }
1591 
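/*
 * Copy 'count' used elements from s_ring to d_ring, wrapping around the
 * end of d_ring when needed.
 */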
1592 static __rte_always_inline void
1593 store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
1594 		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1595 {
1596 	size_t elem_size = sizeof(struct vring_used_elem);
1597 
1598 	if (d_idx + count <= ring_size) {
1599 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1600 	} else {
1601 		uint16_t size = ring_size - d_idx;
1602 
1603 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1604 		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1605 	}
1606 }
1607 
1608 static __rte_always_inline void
1609 store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
1610 		struct vring_used_elem_packed *d_ring,
1611 		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1612 {
1613 	size_t elem_size = sizeof(struct vring_used_elem_packed);
1614 
1615 	if (d_idx + count <= ring_size) {
1616 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1617 	} else {
1618 		uint16_t size = ring_size - d_idx;
1619 
1620 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1621 		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1622 	}
1623 }
1624 
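/*
 * Asynchronous enqueue for split rings: build iovec descriptors for the
 * DMA engine, submit them in bursts through the async transfer_data
 * callback, and roll back the shadow and avail indexes for any packets
 * the engine failed to accept.
 */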
1625 static __rte_noinline uint32_t
1626 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
1627 	struct vhost_virtqueue *vq, uint16_t queue_id,
1628 	struct rte_mbuf **pkts, uint32_t count,
1629 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
1630 {
1631 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
1632 	uint16_t num_buffers;
1633 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1634 	uint16_t avail_head;
1635 
1636 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
1637 	struct iovec *vec_pool = vq->vec_pool;
1638 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
1639 	struct iovec *src_iovec = vec_pool;
1640 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
1641 	uint16_t slot_idx = 0;
1642 	uint16_t segs_await = 0;
1643 	uint16_t iovec_idx = 0, it_idx = 0;
1644 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
1645 	uint32_t n_pkts = 0, pkt_err = 0;
1646 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
1647 	int32_t n_xfer;
1648 	struct {
1649 		uint16_t pkt_idx;
1650 		uint16_t last_avail_idx;
1651 	} async_pkts_log[MAX_PKT_BURST];
1652 
1653 	/*
1654 	 * The ordering between avail index and desc reads needs to be enforced.
1655 	 */
1656 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1657 
1658 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1659 
1660 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1661 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1662 		uint16_t nr_vec = 0;
1663 
1664 		if (unlikely(reserve_avail_buf_split(dev, vq,
1665 						pkt_len, buf_vec, &num_buffers,
1666 						avail_head, &nr_vec) < 0)) {
1667 			VHOST_LOG_DATA(DEBUG,
1668 				"(%d) failed to get enough desc from vring\n",
1669 				dev->vid);
1670 			vq->shadow_used_idx -= num_buffers;
1671 			break;
1672 		}
1673 
1674 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1675 			dev->vid, vq->last_avail_idx,
1676 			vq->last_avail_idx + num_buffers);
1677 
1678 		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers,
1679 				&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
1680 				&it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
1681 			vq->shadow_used_idx -= num_buffers;
1682 			break;
1683 		}
1684 
1685 		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
1686 			(vq->size - 1);
1687 		if (it_pool[it_idx].count) {
1688 			uint16_t from, to;
1689 
1690 			async_fill_desc(&tdes[pkt_burst_idx++],
1691 				&it_pool[it_idx], &it_pool[it_idx + 1]);
1692 			pkts_info[slot_idx].descs = num_buffers;
1693 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
1694 			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
1695 			async_pkts_log[num_async_pkts++].last_avail_idx =
1696 				vq->last_avail_idx;
1697 
1698 			iovec_idx += it_pool[it_idx].nr_segs;
1699 			segs_await += it_pool[it_idx].nr_segs;
1700 
1701 			it_idx += 2;
1702 
1703 			/*
1704 			 * Pull the new entries out of the shadow used ring and
1705 			 * keep the DMA-occupied descriptors aside until the transfer completes.
1706 			 */
1707 			from = vq->shadow_used_idx - num_buffers;
1708 			to = vq->async_desc_idx_split & (vq->size - 1);
1709 
1710 			store_dma_desc_info_split(vq->shadow_used_split,
1711 					vq->async_descs_split, vq->size, from, to, num_buffers);
1712 
1713 			vq->async_desc_idx_split += num_buffers;
1714 			vq->shadow_used_idx -= num_buffers;
1715 		} else
1716 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
1717 
1718 		vq->last_avail_idx += num_buffers;
1719 
1720 		/*
1721 		 * Conditions to trigger the async device transfer:
1722 		 * - the number of buffered packets reaches the transfer threshold
1723 		 * - the number of unused async iovecs drops below the max vhost vector size
1724 		 */
1725 		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
1726 			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
1727 			BUF_VECTOR_MAX))) {
1728 			n_xfer = vq->async_ops.transfer_data(dev->vid,
1729 					queue_id, tdes, 0, pkt_burst_idx);
1730 			if (n_xfer >= 0) {
1731 				n_pkts = n_xfer;
1732 			} else {
1733 				VHOST_LOG_DATA(ERR,
1734 					"(%d) %s: failed to transfer data for queue id %d.\n",
1735 					dev->vid, __func__, queue_id);
1736 				n_pkts = 0;
1737 			}
1738 
1739 			iovec_idx = 0;
1740 			it_idx = 0;
1741 
1742 			segs_await = 0;
1743 			vq->async_pkts_inflight_n += n_pkts;
1744 
1745 			if (unlikely(n_pkts < pkt_burst_idx)) {
1746 				/*
1747 				 * Record the number of failed packets here; the
1748 				 * actual error handling is done when the
1749 				 * application polls for completions.
1750 				 */
1751 				pkt_err = pkt_burst_idx - n_pkts;
1752 				pkt_burst_idx = 0;
1753 				break;
1754 			}
1755 
1756 			pkt_burst_idx = 0;
1757 		}
1758 	}
1759 
1760 	if (pkt_burst_idx) {
1761 		n_xfer = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
1762 		if (n_xfer >= 0) {
1763 			n_pkts = n_xfer;
1764 		} else {
1765 			VHOST_LOG_DATA(ERR, "(%d) %s: failed to transfer data for queue id %d.\n",
1766 				dev->vid, __func__, queue_id);
1767 			n_pkts = 0;
1768 		}
1769 
1770 		vq->async_pkts_inflight_n += n_pkts;
1771 
1772 		if (unlikely(n_pkts < pkt_burst_idx))
1773 			pkt_err = pkt_burst_idx - n_pkts;
1774 	}
1775 
1776 	do_data_copy_enqueue(dev, vq);
1777 
1778 	if (unlikely(pkt_err)) {
1779 		uint16_t num_descs = 0;
1780 
1781 		num_async_pkts -= pkt_err;
1782 		/* calculate the sum of descriptors of DMA-error packets. */
1783 		while (pkt_err-- > 0) {
1784 			num_descs += pkts_info[slot_idx & (vq->size - 1)].descs;
1785 			slot_idx--;
1786 		}
1787 		vq->async_desc_idx_split -= num_descs;
1788 		/* recover shadow used ring and available ring */
1789 		vq->shadow_used_idx -= (vq->last_avail_idx -
1790 				async_pkts_log[num_async_pkts].last_avail_idx -
1791 				num_descs);
1792 		vq->last_avail_idx =
1793 			async_pkts_log[num_async_pkts].last_avail_idx;
1794 		pkt_idx = async_pkts_log[num_async_pkts].pkt_idx;
1795 		num_done_pkts = pkt_idx - num_async_pkts;
1796 	}
1797 
1798 	vq->async_pkts_idx += num_async_pkts;
1799 	*comp_count = num_done_pkts;
1800 
1801 	if (likely(vq->shadow_used_idx)) {
1802 		flush_shadow_used_ring_split(dev, vq);
1803 		vhost_vring_call_split(dev, vq);
1804 	}
1805 
1806 	return pkt_idx;
1807 }
1808 
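/*
 * Write 'count' shadow used elements back into the packed descriptor
 * ring; as in vhost_flush_enqueue_shadow_packed(), the head descriptor's
 * flags are stored last.
 */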
1809 static __rte_always_inline void
1810 vhost_update_used_packed(struct vhost_virtqueue *vq,
1811 			struct vring_used_elem_packed *shadow_ring,
1812 			uint16_t count)
1813 {
1814 	int i;
1815 	uint16_t used_idx = vq->last_used_idx;
1816 	uint16_t head_idx = vq->last_used_idx;
1817 	uint16_t head_flags = 0;
1818 
1819 	if (count == 0)
1820 		return;
1821 
1822 	/* Split loop in two to save memory barriers */
1823 	for (i = 0; i < count; i++) {
1824 		vq->desc_packed[used_idx].id = shadow_ring[i].id;
1825 		vq->desc_packed[used_idx].len = shadow_ring[i].len;
1826 
1827 		used_idx += shadow_ring[i].count;
1828 		if (used_idx >= vq->size)
1829 			used_idx -= vq->size;
1830 	}
1831 
1832 	/* The ordering for storing desc flags needs to be enforced. */
1833 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
1834 
1835 	for (i = 0; i < count; i++) {
1836 		uint16_t flags;
1837 
1838 		if (vq->shadow_used_packed[i].len)
1839 			flags = VRING_DESC_F_WRITE;
1840 		else
1841 			flags = 0;
1842 
1843 		if (vq->used_wrap_counter) {
1844 			flags |= VRING_DESC_F_USED;
1845 			flags |= VRING_DESC_F_AVAIL;
1846 		} else {
1847 			flags &= ~VRING_DESC_F_USED;
1848 			flags &= ~VRING_DESC_F_AVAIL;
1849 		}
1850 
1851 		if (i > 0) {
1852 			vq->desc_packed[vq->last_used_idx].flags = flags;
1853 		} else {
1854 			head_idx = vq->last_used_idx;
1855 			head_flags = flags;
1856 		}
1857 
1858 		vq_inc_last_used_packed(vq, shadow_ring[i].count);
1859 	}
1860 
1861 	vq->desc_packed[head_idx].flags = head_flags;
1862 }
1863 
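/*
 * Async variant of vhost_enqueue_single_packed(): besides filling buf_vec
 * and the shadow ring, the consumed descriptors are saved in 'async_descs'
 * so they can be restored if the DMA transfer fails.
 */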
1864 static __rte_always_inline int
1865 vhost_enqueue_async_single_packed(struct virtio_net *dev,
1866 			    struct vhost_virtqueue *vq,
1867 			    struct rte_mbuf *pkt,
1868 			    struct buf_vector *buf_vec,
1869 			    uint16_t *nr_descs,
1870 			    uint16_t *nr_buffers,
1871 			    struct vring_packed_desc *async_descs,
1872 			    struct iovec *src_iovec, struct iovec *dst_iovec,
1873 			    struct rte_vhost_iov_iter *src_it,
1874 			    struct rte_vhost_iov_iter *dst_it)
1875 {
1876 	uint16_t nr_vec = 0;
1877 	uint16_t avail_idx = vq->last_avail_idx;
1878 	uint16_t max_tries, tries = 0;
1879 	uint16_t buf_id = 0;
1880 	uint32_t len = 0;
1881 	uint16_t desc_count = 0;
1882 	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1883 	uint32_t buffer_len[vq->size];
1884 	uint16_t buffer_buf_id[vq->size];
1885 	uint16_t buffer_desc_count[vq->size];
1886 
1887 	if (rxvq_is_mergeable(dev))
1888 		max_tries = vq->size - 1;
1889 	else
1890 		max_tries = 1;
1891 
1892 	while (size > 0) {
1893 		/*
1894 		 * If we have tried all available ring items and still
1895 		 * cannot get enough buffers, something abnormal has
1896 		 * happened.
1897 		 */
1898 		if (unlikely(++tries > max_tries))
1899 			return -1;
1900 
1901 		if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx, &desc_count, buf_vec, &nr_vec,
1902 						&buf_id, &len, VHOST_ACCESS_RW) < 0))
1903 			return -1;
1904 
1905 		len = RTE_MIN(len, size);
1906 		size -= len;
1907 
1908 		buffer_len[*nr_buffers] = len;
1909 		buffer_buf_id[*nr_buffers] = buf_id;
1910 		buffer_desc_count[*nr_buffers] = desc_count;
1911 		*nr_buffers += 1;
1912 
1913 		*nr_descs += desc_count;
1914 		avail_idx += desc_count;
1915 		if (avail_idx >= vq->size)
1916 			avail_idx -= vq->size;
1917 	}
1918 
1919 	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, src_iovec, dst_iovec,
1920 			src_it, dst_it) < 0)
1921 		return -1;
1922 	/* store descriptors for DMA */
1923 	if (avail_idx >= *nr_descs) {
1924 		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
1925 			*nr_descs * sizeof(struct vring_packed_desc));
1926 	} else {
1927 		uint16_t nr_copy = vq->size - vq->last_avail_idx;
1928 
1929 		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
1930 			nr_copy * sizeof(struct vring_packed_desc));
1931 		rte_memcpy(async_descs + nr_copy, vq->desc_packed,
1932 			(*nr_descs - nr_copy) * sizeof(struct vring_packed_desc));
1933 	}
1934 
1935 	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
1936 
1937 	return 0;
1938 }
1939 
1940 static __rte_always_inline int16_t
1941 virtio_dev_rx_async_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
1942 			    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers,
1943 			    struct vring_packed_desc *async_descs,
1944 			    struct iovec *src_iovec, struct iovec *dst_iovec,
1945 			    struct rte_vhost_iov_iter *src_it, struct rte_vhost_iov_iter *dst_it)
1946 {
1947 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1948 
1949 	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec, nr_descs, nr_buffers,
1950 						 async_descs, src_iovec, dst_iovec,
1951 						 src_it, dst_it) < 0)) {
1952 		VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n", dev->vid);
1953 		return -1;
1954 	}
1955 
1956 	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1957 			dev->vid, vq->last_avail_idx, vq->last_avail_idx + *nr_descs);
1958 
1959 	return 0;
1960 }
1961 
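/*
 * Roll back the virtqueue state for the last @nr_err packets whose
 * async transfer was not accepted: restore the avail index, copy the
 * saved descriptors back into the ring (handling wrap-around) and
 * recompute the number of packets completed by the CPU copy path.
 */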
1962 static __rte_always_inline void
1963 dma_error_handler_packed(struct vhost_virtqueue *vq, struct vring_packed_desc *async_descs,
1964 			uint16_t async_descs_idx, uint16_t slot_idx, uint32_t nr_err,
1965 			uint32_t *pkt_idx, uint32_t *num_async_pkts, uint32_t *num_done_pkts)
1966 {
1967 	uint16_t descs_err = 0;
1968 	uint16_t buffers_err = 0;
1969 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
1970 
1971 	*num_async_pkts -= nr_err;
1972 	*pkt_idx -= nr_err;
1973 	/* calculate the sum of buffers and descs of DMA-error packets. */
1974 	while (nr_err-- > 0) {
1975 		descs_err += pkts_info[slot_idx % vq->size].descs;
1976 		buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
1977 		slot_idx--;
1978 	}
1979 
1980 	vq->async_buffer_idx_packed -= buffers_err;
1981 
1982 	if (vq->last_avail_idx >= descs_err) {
1983 		vq->last_avail_idx -= descs_err;
1984 
1985 		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
1986 			&async_descs[async_descs_idx - descs_err],
1987 			descs_err * sizeof(struct vring_packed_desc));
1988 	} else {
1989 		uint16_t nr_copy;
1990 
1991 		vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
1992 		nr_copy = vq->size - vq->last_avail_idx;
1993 		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
1994 			&async_descs[async_descs_idx - descs_err],
1995 			nr_copy * sizeof(struct vring_packed_desc));
1996 		descs_err -= nr_copy;
1997 		rte_memcpy(&vq->desc_packed[0], &async_descs[async_descs_idx - descs_err],
1998 			descs_err * sizeof(struct vring_packed_desc));
1999 		vq->avail_wrap_counter ^= 1;
2000 	}
2001 
2002 	*num_done_pkts = *pkt_idx - *num_async_pkts;
2003 }
2004 
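/*
 * Async enqueue path for packed virtqueues. Packets whose data is
 * copied synchronously are returned immediately through
 * comp_pkts/comp_count; the others are described with iovecs and
 * handed to the registered async channel in bursts. Packets the
 * channel fails to accept are rolled back by dma_error_handler_packed()
 * so the caller sees them as not enqueued.
 */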
2005 static __rte_noinline uint32_t
2006 virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
2007 	struct vhost_virtqueue *vq, uint16_t queue_id,
2008 	struct rte_mbuf **pkts, uint32_t count,
2009 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
2010 {
2011 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
2012 	uint32_t remained = count;
2013 	uint16_t async_descs_idx = 0;
2014 	uint16_t num_buffers;
2015 	uint16_t num_descs;
2016 	int32_t n_xfer;
2017 
2018 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
2019 	struct iovec *vec_pool = vq->vec_pool;
2020 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
2021 	struct iovec *src_iovec = vec_pool;
2022 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
2023 	uint16_t slot_idx = 0;
2024 	uint16_t segs_await = 0;
2025 	uint16_t iovec_idx = 0, it_idx = 0;
2026 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
2027 	uint32_t n_pkts = 0, pkt_err = 0;
2028 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
2029 	struct vring_packed_desc async_descs[vq->size];
2030 
2031 	do {
2032 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
2033 		if (remained >= PACKED_BATCH_SIZE) {
2034 			if (!virtio_dev_rx_async_batch_packed(dev, vq,
2035 				&pkts[pkt_idx], comp_pkts, &num_done_pkts)) {
2036 				pkt_idx += PACKED_BATCH_SIZE;
2037 				remained -= PACKED_BATCH_SIZE;
2038 				continue;
2039 			}
2040 		}
2041 
2042 		num_buffers = 0;
2043 		num_descs = 0;
2044 		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
2045 						&num_descs, &num_buffers,
2046 						&async_descs[async_descs_idx],
2047 						&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
2048 						&it_pool[it_idx], &it_pool[it_idx + 1]) < 0))
2049 			break;
2050 
2051 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
2052 			dev->vid, vq->last_avail_idx,
2053 			vq->last_avail_idx + num_descs);
2054 
2055 		slot_idx = (vq->async_pkts_idx + num_async_pkts) % vq->size;
2056 		if (it_pool[it_idx].count) {
2057 			uint16_t from;
2058 
2059 			async_descs_idx += num_descs;
2060 			async_fill_desc(&tdes[pkt_burst_idx++],
2061 				&it_pool[it_idx], &it_pool[it_idx + 1]);
2062 			pkts_info[slot_idx].descs = num_descs;
2063 			pkts_info[slot_idx].nr_buffers = num_buffers;
2064 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
2065 			num_async_pkts++;
2066 			iovec_idx += it_pool[it_idx].nr_segs;
2067 			it_idx += 2;
2068 
2069 			segs_await += it_pool[it_idx].nr_segs;
2070 
2071 			/**
2072 			 * Save the DMA-occupied descriptors aside and remove
2073 			 * them from the shadow used ring.
2074 			 */
2075 			from = vq->shadow_used_idx - num_buffers;
2076 			store_dma_desc_info_packed(vq->shadow_used_packed,
2077 					vq->async_buffers_packed, vq->size, from,
2078 					vq->async_buffer_idx_packed, num_buffers);
2079 
2080 			vq->async_buffer_idx_packed += num_buffers;
2081 			if (vq->async_buffer_idx_packed >= vq->size)
2082 				vq->async_buffer_idx_packed -= vq->size;
2083 			vq->shadow_used_idx -= num_buffers;
2084 		} else {
2085 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
2086 		}
2087 
2088 		pkt_idx++;
2089 		remained--;
2090 		vq_inc_last_avail_packed(vq, num_descs);
2091 
2092 		/*
2093 		 * Conditions to trigger an async device transfer:
2094 		 * - the number of buffered packets reaches the transfer threshold
2095 		 * - the number of unused async iovecs is less than the max vhost vector
2096 		 */
2097 		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
2098 			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await < BUF_VECTOR_MAX))) {
2099 			n_xfer = vq->async_ops.transfer_data(dev->vid,
2100 					queue_id, tdes, 0, pkt_burst_idx);
2101 			if (n_xfer >= 0) {
2102 				n_pkts = n_xfer;
2103 			} else {
2104 				VHOST_LOG_DATA(ERR,
2105 					"(%d) %s: failed to transfer data for queue id %d.\n",
2106 					dev->vid, __func__, queue_id);
2107 				n_pkts = 0;
2108 			}
2109 
2110 			iovec_idx = 0;
2111 			it_idx = 0;
2112 			segs_await = 0;
2113 			vq->async_pkts_inflight_n += n_pkts;
2114 
2115 			if (unlikely(n_pkts < pkt_burst_idx)) {
2116 				/*
2117 				 * Record the number of error packets here; the actual
2118 				 * error processing is done when the application polls
2119 				 * for completions.
2120 				 */
2121 				pkt_err = pkt_burst_idx - n_pkts;
2122 				pkt_burst_idx = 0;
2123 				break;
2124 			}
2125 
2126 			pkt_burst_idx = 0;
2127 		}
2128 	} while (pkt_idx < count);
2129 
2130 	if (pkt_burst_idx) {
2131 		n_xfer = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
2132 		if (n_xfer >= 0) {
2133 			n_pkts = n_xfer;
2134 		} else {
2135 			VHOST_LOG_DATA(ERR, "(%d) %s: failed to transfer data for queue id %d.\n",
2136 				dev->vid, __func__, queue_id);
2137 			n_pkts = 0;
2138 		}
2139 
2140 		vq->async_pkts_inflight_n += n_pkts;
2141 
2142 		if (unlikely(n_pkts < pkt_burst_idx))
2143 			pkt_err = pkt_burst_idx - n_pkts;
2144 	}
2145 
2146 	do_data_copy_enqueue(dev, vq);
2147 
2148 	if (unlikely(pkt_err))
2149 		dma_error_handler_packed(vq, async_descs, async_descs_idx, slot_idx, pkt_err,
2150 					&pkt_idx, &num_async_pkts, &num_done_pkts);
2151 	vq->async_pkts_idx += num_async_pkts;
2152 	if (vq->async_pkts_idx >= vq->size)
2153 		vq->async_pkts_idx -= vq->size;
2154 	*comp_count = num_done_pkts;
2155 
2156 	if (likely(vq->shadow_used_idx)) {
2157 		vhost_flush_enqueue_shadow_packed(dev, vq);
2158 		vhost_vring_call_packed(dev, vq);
2159 	}
2160 
2161 	return pkt_idx;
2162 }
2163 
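/*
 * Copy @n_descs completed async descriptors from the async shadow ring
 * into the split used ring, wrapping around both rings as needed; the
 * used index itself is advanced by the caller.
 */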
2164 static __rte_always_inline void
2165 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
2166 {
2167 	uint16_t nr_left = n_descs;
2168 	uint16_t nr_copy;
2169 	uint16_t to, from;
2170 
2171 	do {
2172 		from = vq->last_async_desc_idx_split & (vq->size - 1);
2173 		nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
2174 		to = vq->last_used_idx & (vq->size - 1);
2175 
2176 		if (to + nr_copy <= vq->size) {
2177 			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
2178 					nr_copy * sizeof(struct vring_used_elem));
2179 		} else {
2180 			uint16_t size = vq->size - to;
2181 
2182 			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
2183 					size * sizeof(struct vring_used_elem));
2184 			rte_memcpy(&vq->used->ring[0], &vq->async_descs_split[from + size],
2185 					(nr_copy - size) * sizeof(struct vring_used_elem));
2186 		}
2187 
2188 		vq->last_async_desc_idx_split += nr_copy;
2189 		vq->last_used_idx += nr_copy;
2190 		nr_left -= nr_copy;
2191 	} while (nr_left > 0);
2192 }
2193 
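/*
 * Flush @n_buffers completed async buffers into the packed ring via
 * vhost_update_used_packed(), chunking at the end of the shadow buffer
 * array so each call gets a contiguous range.
 */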
2194 static __rte_always_inline void
2195 write_back_completed_descs_packed(struct vhost_virtqueue *vq,
2196 				uint16_t n_buffers)
2197 {
2198 	uint16_t nr_left = n_buffers;
2199 	uint16_t from, to;
2200 
2201 	do {
2202 		from = vq->last_async_buffer_idx_packed;
2203 		to = (from + nr_left) % vq->size;
2204 		if (to > from) {
2205 			vhost_update_used_packed(vq, vq->async_buffers_packed + from, to - from);
2206 			vq->last_async_buffer_idx_packed += nr_left;
2207 			nr_left = 0;
2208 		} else {
2209 			vhost_update_used_packed(vq, vq->async_buffers_packed + from,
2210 				vq->size - from);
2211 			vq->last_async_buffer_idx_packed = 0;
2212 			nr_left -= vq->size - from;
2213 		}
2214 	} while (nr_left > 0);
2215 }
2216 
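/*
 * Poll the async channel for completed enqueue copies, return the
 * corresponding mbufs to the application and write the completed
 * descriptors back to the used ring, deferring that write-back when
 * the virtqueue is currently disabled or not accessible.
 */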
2217 static __rte_always_inline uint16_t
2218 vhost_poll_enqueue_completed(struct virtio_net *dev, uint16_t queue_id,
2219 		struct rte_mbuf **pkts, uint16_t count)
2220 {
2221 	struct vhost_virtqueue *vq;
2222 	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
2223 	uint16_t start_idx, pkts_idx, vq_size;
2224 	struct async_inflight_info *pkts_info;
2225 	uint16_t from, i;
2226 	int32_t n_cpl;
2227 
2228 	vq = dev->virtqueue[queue_id];
2229 
2230 	pkts_idx = vq->async_pkts_idx % vq->size;
2231 	pkts_info = vq->async_pkts_info;
2232 	vq_size = vq->size;
2233 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
2234 		vq_size, vq->async_pkts_inflight_n);
2235 
2236 	if (count > vq->async_last_pkts_n) {
2237 		n_cpl = vq->async_ops.check_completed_copies(dev->vid,
2238 			queue_id, 0, count - vq->async_last_pkts_n);
2239 		if (n_cpl >= 0) {
2240 			n_pkts_cpl = n_cpl;
2241 		} else {
2242 			VHOST_LOG_DATA(ERR,
2243 				"(%d) %s: failed to check completed copies for queue id %d.\n",
2244 				dev->vid, __func__, queue_id);
2245 			n_pkts_cpl = 0;
2246 		}
2247 	}
2248 	n_pkts_cpl += vq->async_last_pkts_n;
2249 
2250 	n_pkts_put = RTE_MIN(count, n_pkts_cpl);
2251 	if (unlikely(n_pkts_put == 0)) {
2252 		vq->async_last_pkts_n = n_pkts_cpl;
2253 		return 0;
2254 	}
2255 
2256 	if (vq_is_packed(dev)) {
2257 		for (i = 0; i < n_pkts_put; i++) {
2258 			from = (start_idx + i) % vq_size;
2259 			n_buffers += pkts_info[from].nr_buffers;
2260 			pkts[i] = pkts_info[from].mbuf;
2261 		}
2262 	} else {
2263 		for (i = 0; i < n_pkts_put; i++) {
2264 			from = (start_idx + i) & (vq_size - 1);
2265 			n_descs += pkts_info[from].descs;
2266 			pkts[i] = pkts_info[from].mbuf;
2267 		}
2268 	}
2269 
2270 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
2271 	vq->async_pkts_inflight_n -= n_pkts_put;
2272 
2273 	if (likely(vq->enabled && vq->access_ok)) {
2274 		if (vq_is_packed(dev)) {
2275 			write_back_completed_descs_packed(vq, n_buffers);
2276 
2277 			vhost_vring_call_packed(dev, vq);
2278 		} else {
2279 			write_back_completed_descs_split(vq, n_descs);
2280 
2281 			__atomic_add_fetch(&vq->used->idx, n_descs,
2282 					__ATOMIC_RELEASE);
2283 			vhost_vring_call_split(dev, vq);
2284 		}
2285 	} else {
2286 		if (vq_is_packed(dev)) {
2287 			vq->last_async_buffer_idx_packed += n_buffers;
2288 			if (vq->last_async_buffer_idx_packed >= vq->size)
2289 				vq->last_async_buffer_idx_packed -= vq->size;
2290 		} else {
2291 			vq->last_async_desc_idx_split += n_descs;
2292 		}
2293 	}
2294 
2295 	return n_pkts_put;
2296 }
2297 
2298 uint16_t
2299 rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
2300 		struct rte_mbuf **pkts, uint16_t count)
2301 {
2302 	struct virtio_net *dev = get_device(vid);
2303 	struct vhost_virtqueue *vq;
2304 	uint16_t n_pkts_cpl = 0;
2305 
2306 	if (!dev)
2307 		return 0;
2308 
2309 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2310 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2311 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2312 			dev->vid, __func__, queue_id);
2313 		return 0;
2314 	}
2315 
2316 	vq = dev->virtqueue[queue_id];
2317 
2318 	if (unlikely(!vq->async_registered)) {
2319 		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
2320 			dev->vid, __func__, queue_id);
2321 		return 0;
2322 	}
2323 
2324 	rte_spinlock_lock(&vq->access_lock);
2325 
2326 	n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count);
2327 
2328 	rte_spinlock_unlock(&vq->access_lock);
2329 
2330 	return n_pkts_cpl;
2331 }
2332 
2333 uint16_t
2334 rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id,
2335 		struct rte_mbuf **pkts, uint16_t count)
2336 {
2337 	struct virtio_net *dev = get_device(vid);
2338 	struct vhost_virtqueue *vq;
2339 	uint16_t n_pkts_cpl = 0;
2340 
2341 	if (!dev)
2342 		return 0;
2343 
2344 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2345 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2346 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2347 			dev->vid, __func__, queue_id);
2348 		return 0;
2349 	}
2350 
2351 	vq = dev->virtqueue[queue_id];
2352 
2353 	if (unlikely(!vq->async_registered)) {
2354 		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
2355 			dev->vid, __func__, queue_id);
2356 		return 0;
2357 	}
2358 
2359 	n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count);
2360 
2361 	return n_pkts_cpl;
2362 }
2363 
2364 static __rte_always_inline uint32_t
2365 virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
2366 	struct rte_mbuf **pkts, uint32_t count,
2367 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
2368 {
2369 	struct vhost_virtqueue *vq;
2370 	uint32_t nb_tx = 0;
2371 
2372 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2373 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2374 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2375 			dev->vid, __func__, queue_id);
2376 		return 0;
2377 	}
2378 
2379 	vq = dev->virtqueue[queue_id];
2380 
2381 	rte_spinlock_lock(&vq->access_lock);
2382 
2383 	if (unlikely(!vq->enabled || !vq->async_registered))
2384 		goto out_access_unlock;
2385 
2386 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2387 		vhost_user_iotlb_rd_lock(vq);
2388 
2389 	if (unlikely(!vq->access_ok))
2390 		if (unlikely(vring_translate(dev, vq) < 0))
2391 			goto out;
2392 
2393 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
2394 	if (count == 0)
2395 		goto out;
2396 
2397 	if (vq_is_packed(dev))
2398 		nb_tx = virtio_dev_rx_async_submit_packed(dev,
2399 				vq, queue_id, pkts, count, comp_pkts,
2400 				comp_count);
2401 	else
2402 		nb_tx = virtio_dev_rx_async_submit_split(dev,
2403 				vq, queue_id, pkts, count, comp_pkts,
2404 				comp_count);
2405 
2406 out:
2407 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2408 		vhost_user_iotlb_rd_unlock(vq);
2409 
2410 out_access_unlock:
2411 	rte_spinlock_unlock(&vq->access_lock);
2412 
2413 	return nb_tx;
2414 }
2415 
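/*
 * Public async enqueue API. A minimal usage sketch (array names and
 * burst sizes below are illustrative, not taken from this file):
 *
 *	uint32_t n_cpu_done = 0;
 *	uint16_t n_enq = rte_vhost_submit_enqueue_burst(vid, queue_id,
 *				pkts, nb_pkts, cpu_done, &n_cpu_done);
 *	...
 *	uint16_t n_cpl = rte_vhost_poll_enqueue_completed(vid, queue_id,
 *				cpl_pkts, MAX_PKT_BURST);
 *
 * Mbufs handed back through either completion path are again owned by
 * the application.
 */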
2416 uint16_t
2417 rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
2418 		struct rte_mbuf **pkts, uint16_t count,
2419 		struct rte_mbuf **comp_pkts, uint32_t *comp_count)
2420 {
2421 	struct virtio_net *dev = get_device(vid);
2422 
2423 	*comp_count = 0;
2424 	if (!dev)
2425 		return 0;
2426 
2427 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
2428 		VHOST_LOG_DATA(ERR,
2429 			"(%d) %s: built-in vhost net backend is disabled.\n",
2430 			dev->vid, __func__);
2431 		return 0;
2432 	}
2433 
2434 	return virtio_dev_rx_async_submit(dev, queue_id, pkts, count, comp_pkts,
2435 			comp_count);
2436 }
2437 
2438 static inline bool
2439 virtio_net_with_host_offload(struct virtio_net *dev)
2440 {
2441 	if (dev->features &
2442 			((1ULL << VIRTIO_NET_F_CSUM) |
2443 			 (1ULL << VIRTIO_NET_F_HOST_ECN) |
2444 			 (1ULL << VIRTIO_NET_F_HOST_TSO4) |
2445 			 (1ULL << VIRTIO_NET_F_HOST_TSO6) |
2446 			 (1ULL << VIRTIO_NET_F_HOST_UFO)))
2447 		return true;
2448 
2449 	return false;
2450 }
2451 
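/*
 * Parse and validate the Ethernet/VLAN, L3 and L4 headers of @m,
 * setting m->l2_len, m->l3_len and the IPv4/IPv6 ol_flags and returning
 * the L4 protocol through @l4_proto. Returns -EINVAL (and clears the
 * offload fields) when a header does not fit in the first segment.
 */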
2452 static int
2453 parse_headers(struct rte_mbuf *m, uint8_t *l4_proto)
2454 {
2455 	struct rte_ipv4_hdr *ipv4_hdr;
2456 	struct rte_ipv6_hdr *ipv6_hdr;
2457 	struct rte_ether_hdr *eth_hdr;
2458 	uint16_t ethertype;
2459 	uint16_t data_len = rte_pktmbuf_data_len(m);
2460 
2461 	if (data_len < sizeof(struct rte_ether_hdr))
2462 		return -EINVAL;
2463 
2464 	eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
2465 
2466 	m->l2_len = sizeof(struct rte_ether_hdr);
2467 	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
2468 
2469 	if (ethertype == RTE_ETHER_TYPE_VLAN) {
2470 		if (data_len < sizeof(struct rte_ether_hdr) +
2471 				sizeof(struct rte_vlan_hdr))
2472 			goto error;
2473 
2474 		struct rte_vlan_hdr *vlan_hdr =
2475 			(struct rte_vlan_hdr *)(eth_hdr + 1);
2476 
2477 		m->l2_len += sizeof(struct rte_vlan_hdr);
2478 		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
2479 	}
2480 
2481 	switch (ethertype) {
2482 	case RTE_ETHER_TYPE_IPV4:
2483 		if (data_len < m->l2_len + sizeof(struct rte_ipv4_hdr))
2484 			goto error;
2485 		ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *,
2486 				m->l2_len);
2487 		m->l3_len = rte_ipv4_hdr_len(ipv4_hdr);
2488 		if (data_len < m->l2_len + m->l3_len)
2489 			goto error;
2490 		m->ol_flags |= PKT_TX_IPV4;
2491 		*l4_proto = ipv4_hdr->next_proto_id;
2492 		break;
2493 	case RTE_ETHER_TYPE_IPV6:
2494 		if (data_len < m->l2_len + sizeof(struct rte_ipv6_hdr))
2495 			goto error;
2496 		ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *,
2497 				m->l2_len);
2498 		m->l3_len = sizeof(struct rte_ipv6_hdr);
2499 		m->ol_flags |= PKT_TX_IPV6;
2500 		*l4_proto = ipv6_hdr->proto;
2501 		break;
2502 	default:
2503 		/* a valid L3 header is needed for further L4 parsing */
2504 		goto error;
2505 	}
2506 
2507 	/* both CSUM and GSO need a valid L4 header */
2508 	switch (*l4_proto) {
2509 	case IPPROTO_TCP:
2510 		if (data_len < m->l2_len + m->l3_len +
2511 				sizeof(struct rte_tcp_hdr))
2512 			goto error;
2513 		break;
2514 	case IPPROTO_UDP:
2515 		if (data_len < m->l2_len + m->l3_len +
2516 				sizeof(struct rte_udp_hdr))
2517 			goto error;
2518 		break;
2519 	case IPPROTO_SCTP:
2520 		if (data_len < m->l2_len + m->l3_len +
2521 				sizeof(struct rte_sctp_hdr))
2522 			goto error;
2523 		break;
2524 	default:
2525 		goto error;
2526 	}
2527 
2528 	return 0;
2529 
2530 error:
2531 	m->l2_len = 0;
2532 	m->l3_len = 0;
2533 	m->ol_flags = 0;
2534 	return -EINVAL;
2535 }
2536 
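/*
 * Legacy offload path: translate the virtio-net header checksum and GSO
 * requests into PKT_TX_* mbuf flags. Any mismatch between the header
 * and the parsed packet clears the offload fields so the packet is
 * treated as plain data.
 */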
2537 static __rte_always_inline void
2538 vhost_dequeue_offload_legacy(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
2539 {
2540 	uint8_t l4_proto = 0;
2541 	struct rte_tcp_hdr *tcp_hdr = NULL;
2542 	uint16_t tcp_len;
2543 	uint16_t data_len = rte_pktmbuf_data_len(m);
2544 
2545 	if (parse_headers(m, &l4_proto) < 0)
2546 		return;
2547 
2548 	if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2549 		if (hdr->csum_start == (m->l2_len + m->l3_len)) {
2550 			switch (hdr->csum_offset) {
2551 			case (offsetof(struct rte_tcp_hdr, cksum)):
2552 				if (l4_proto != IPPROTO_TCP)
2553 					goto error;
2554 				m->ol_flags |= PKT_TX_TCP_CKSUM;
2555 				break;
2556 			case (offsetof(struct rte_udp_hdr, dgram_cksum)):
2557 				if (l4_proto != IPPROTO_UDP)
2558 					goto error;
2559 				m->ol_flags |= PKT_TX_UDP_CKSUM;
2560 				break;
2561 			case (offsetof(struct rte_sctp_hdr, cksum)):
2562 				if (l4_proto != IPPROTO_SCTP)
2563 					goto error;
2564 				m->ol_flags |= PKT_TX_SCTP_CKSUM;
2565 				break;
2566 			default:
2567 				goto error;
2568 			}
2569 		} else {
2570 			goto error;
2571 		}
2572 	}
2573 
2574 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2575 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2576 		case VIRTIO_NET_HDR_GSO_TCPV4:
2577 		case VIRTIO_NET_HDR_GSO_TCPV6:
2578 			if (l4_proto != IPPROTO_TCP)
2579 				goto error;
2580 			tcp_hdr = rte_pktmbuf_mtod_offset(m,
2581 					struct rte_tcp_hdr *,
2582 					m->l2_len + m->l3_len);
2583 			tcp_len = (tcp_hdr->data_off & 0xf0) >> 2;
2584 			if (data_len < m->l2_len + m->l3_len + tcp_len)
2585 				goto error;
2586 			m->ol_flags |= PKT_TX_TCP_SEG;
2587 			m->tso_segsz = hdr->gso_size;
2588 			m->l4_len = tcp_len;
2589 			break;
2590 		case VIRTIO_NET_HDR_GSO_UDP:
2591 			if (l4_proto != IPPROTO_UDP)
2592 				goto error;
2593 			m->ol_flags |= PKT_TX_UDP_SEG;
2594 			m->tso_segsz = hdr->gso_size;
2595 			m->l4_len = sizeof(struct rte_udp_hdr);
2596 			break;
2597 		default:
2598 			VHOST_LOG_DATA(WARNING,
2599 				"unsupported gso type %u.\n", hdr->gso_type);
2600 			goto error;
2601 		}
2602 	}
2603 	return;
2604 
2605 error:
2606 	m->l2_len = 0;
2607 	m->l3_len = 0;
2608 	m->ol_flags = 0;
2609 }
2610 
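/*
 * Translate the virtio-net header into mbuf offload metadata. The
 * compliant path below maps checksum requests to PKT_RX_* flags,
 * falling back to a software checksum for unrecognized protocols, and
 * derives LRO information from the GSO fields; legacy_ol_flags selects
 * the PKT_TX_* behaviour above instead.
 */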
2611 static __rte_always_inline void
2612 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m,
2613 	bool legacy_ol_flags)
2614 {
2615 	struct rte_net_hdr_lens hdr_lens;
2616 	int l4_supported = 0;
2617 	uint32_t ptype;
2618 
2619 	if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
2620 		return;
2621 
2622 	if (legacy_ol_flags) {
2623 		vhost_dequeue_offload_legacy(hdr, m);
2624 		return;
2625 	}
2626 
2627 	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
2628 
2629 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
2630 	m->packet_type = ptype;
2631 	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
2632 	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
2633 	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
2634 		l4_supported = 1;
2635 
2636 	/* According to Virtio 1.1 spec, the device only needs to look at
2637 	 * VIRTIO_NET_HDR_F_NEEDS_CSUM in the packet transmission path.
2638 	 * This differs from the processing incoming packets path where the
2639 	 * driver could rely on VIRTIO_NET_HDR_F_DATA_VALID flag set by the
2640 	 * device.
2641 	 *
2642 	 * 5.1.6.2.1 Driver Requirements: Packet Transmission
2643 	 * The driver MUST NOT set the VIRTIO_NET_HDR_F_DATA_VALID and
2644 	 * VIRTIO_NET_HDR_F_RSC_INFO bits in flags.
2645 	 *
2646 	 * 5.1.6.2.2 Device Requirements: Packet Transmission
2647 	 * The device MUST ignore flag bits that it does not recognize.
2648 	 */
2649 	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2650 		uint32_t hdrlen;
2651 
2652 		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
2653 		if (hdr->csum_start <= hdrlen && l4_supported != 0) {
2654 			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
2655 		} else {
2656 			/* Unknown proto or tunnel, do sw cksum. We can assume
2657 			 * the cksum field is in the first segment since the
2658 			 * buffers we provided to the host are large enough.
2659 			 * In case of SCTP, this will be wrong since it's a CRC
2660 			 * but there's nothing we can do.
2661 			 */
2662 			uint16_t csum = 0, off;
2663 
2664 			if (rte_raw_cksum_mbuf(m, hdr->csum_start,
2665 					rte_pktmbuf_pkt_len(m) - hdr->csum_start, &csum) < 0)
2666 				return;
2667 			if (likely(csum != 0xffff))
2668 				csum = ~csum;
2669 			off = hdr->csum_offset + hdr->csum_start;
2670 			if (rte_pktmbuf_data_len(m) >= off + 1)
2671 				*rte_pktmbuf_mtod_offset(m, uint16_t *, off) = csum;
2672 		}
2673 	}
2674 
2675 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2676 		if (hdr->gso_size == 0)
2677 			return;
2678 
2679 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2680 		case VIRTIO_NET_HDR_GSO_TCPV4:
2681 		case VIRTIO_NET_HDR_GSO_TCPV6:
2682 			if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_TCP)
2683 				break;
2684 			m->ol_flags |= PKT_RX_LRO | PKT_RX_L4_CKSUM_NONE;
2685 			m->tso_segsz = hdr->gso_size;
2686 			break;
2687 		case VIRTIO_NET_HDR_GSO_UDP:
2688 			if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_UDP)
2689 				break;
2690 			m->ol_flags |= PKT_RX_LRO | PKT_RX_L4_CKSUM_NONE;
2691 			m->tso_segsz = hdr->gso_size;
2692 			break;
2693 		default:
2694 			break;
2695 		}
2696 	}
2697 }
2698 
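/*
 * Gather a virtio-net header that is scattered across several buffer
 * vector entries into the contiguous copy pointed to by @hdr.
 */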
2699 static __rte_noinline void
2700 copy_vnet_hdr_from_desc(struct virtio_net_hdr *hdr,
2701 		struct buf_vector *buf_vec)
2702 {
2703 	uint64_t len;
2704 	uint64_t remain = sizeof(struct virtio_net_hdr);
2705 	uint64_t src;
2706 	uint64_t dst = (uint64_t)(uintptr_t)hdr;
2707 
2708 	while (remain) {
2709 		len = RTE_MIN(remain, buf_vec->buf_len);
2710 		src = buf_vec->buf_addr;
2711 		rte_memcpy((void *)(uintptr_t)dst,
2712 				(void *)(uintptr_t)src, len);
2713 
2714 		remain -= len;
2715 		dst += len;
2716 		buf_vec++;
2717 	}
2718 }
2719 
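/*
 * Copy one descriptor chain into the mbuf @m, allocating extra chained
 * mbufs from @mbuf_pool as needed. Small copies are deferred to the
 * batched copy array; the virtio-net header, if present, is applied to
 * the mbuf offload fields once the payload has been copied.
 */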
2720 static __rte_always_inline int
2721 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
2722 		  struct buf_vector *buf_vec, uint16_t nr_vec,
2723 		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
2724 		  bool legacy_ol_flags)
2725 {
2726 	uint32_t buf_avail, buf_offset;
2727 	uint64_t buf_addr, buf_len;
2728 	uint32_t mbuf_avail, mbuf_offset;
2729 	uint32_t cpy_len;
2730 	struct rte_mbuf *cur = m, *prev = m;
2731 	struct virtio_net_hdr tmp_hdr;
2732 	struct virtio_net_hdr *hdr = NULL;
2733 	/* A counter to avoid desc dead loop chain */
2734 	/* A counter to avoid an endless loop on a malformed desc chain */
2735 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
2736 	int error = 0;
2737 
2738 	buf_addr = buf_vec[vec_idx].buf_addr;
2739 	buf_len = buf_vec[vec_idx].buf_len;
2740 
2741 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
2742 		error = -1;
2743 		goto out;
2744 	}
2745 
2746 	if (virtio_net_with_host_offload(dev)) {
2747 		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
2748 			/*
2749 			 * No luck, the virtio-net header doesn't fit
2750 			 * in a contiguous virtual area.
2751 			 */
2752 			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
2753 			hdr = &tmp_hdr;
2754 		} else {
2755 			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
2756 		}
2757 	}
2758 
2759 	/*
2760 	 * A virtio driver normally uses at least 2 desc buffers
2761 	 * for Tx: the first for storing the header, and the others
2762 	 * for storing the data.
2763 	 */
2764 	if (unlikely(buf_len < dev->vhost_hlen)) {
2765 		buf_offset = dev->vhost_hlen - buf_len;
2766 		vec_idx++;
2767 		buf_addr = buf_vec[vec_idx].buf_addr;
2768 		buf_len = buf_vec[vec_idx].buf_len;
2769 		buf_avail  = buf_len - buf_offset;
2770 	} else if (buf_len == dev->vhost_hlen) {
2771 		if (unlikely(++vec_idx >= nr_vec))
2772 			goto out;
2773 		buf_addr = buf_vec[vec_idx].buf_addr;
2774 		buf_len = buf_vec[vec_idx].buf_len;
2775 
2776 		buf_offset = 0;
2777 		buf_avail = buf_len;
2778 	} else {
2779 		buf_offset = dev->vhost_hlen;
2780 		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
2781 	}
2782 
2783 	PRINT_PACKET(dev,
2784 			(uintptr_t)(buf_addr + buf_offset),
2785 			(uint32_t)buf_avail, 0);
2786 
2787 	mbuf_offset = 0;
2788 	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
2789 	while (1) {
2790 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
2791 
2792 		if (likely(cpy_len > MAX_BATCH_LEN ||
2793 					vq->batch_copy_nb_elems >= vq->size ||
2794 					(hdr && cur == m))) {
2795 			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
2796 						mbuf_offset),
2797 					(void *)((uintptr_t)(buf_addr +
2798 							buf_offset)), cpy_len);
2799 		} else {
2800 			batch_copy[vq->batch_copy_nb_elems].dst =
2801 				rte_pktmbuf_mtod_offset(cur, void *,
2802 						mbuf_offset);
2803 			batch_copy[vq->batch_copy_nb_elems].src =
2804 				(void *)((uintptr_t)(buf_addr + buf_offset));
2805 			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
2806 			vq->batch_copy_nb_elems++;
2807 		}
2808 
2809 		mbuf_avail  -= cpy_len;
2810 		mbuf_offset += cpy_len;
2811 		buf_avail -= cpy_len;
2812 		buf_offset += cpy_len;
2813 
2814 		/* This buf has reached its end, get the next one */
2815 		if (buf_avail == 0) {
2816 			if (++vec_idx >= nr_vec)
2817 				break;
2818 
2819 			buf_addr = buf_vec[vec_idx].buf_addr;
2820 			buf_len = buf_vec[vec_idx].buf_len;
2821 
2822 			buf_offset = 0;
2823 			buf_avail  = buf_len;
2824 
2825 			PRINT_PACKET(dev, (uintptr_t)buf_addr,
2826 					(uint32_t)buf_avail, 0);
2827 		}
2828 
2829 		/*
2830 		 * This mbuf has reached its end, get a new one
2831 		 * to hold more data.
2832 		 */
2833 		if (mbuf_avail == 0) {
2834 			cur = rte_pktmbuf_alloc(mbuf_pool);
2835 			if (unlikely(cur == NULL)) {
2836 				VHOST_LOG_DATA(ERR, "Failed to "
2837 					"allocate memory for mbuf.\n");
2838 				error = -1;
2839 				goto out;
2840 			}
2841 
2842 			prev->next = cur;
2843 			prev->data_len = mbuf_offset;
2844 			m->nb_segs += 1;
2845 			m->pkt_len += mbuf_offset;
2846 			prev = cur;
2847 
2848 			mbuf_offset = 0;
2849 			mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
2850 		}
2851 	}
2852 
2853 	prev->data_len = mbuf_offset;
2854 	m->pkt_len    += mbuf_offset;
2855 
2856 	if (hdr)
2857 		vhost_dequeue_offload(hdr, m, legacy_ol_flags);
2858 
2859 out:
2860 
2861 	return error;
2862 }
2863 
2864 static void
2865 virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
2866 {
2867 	rte_free(opaque);
2868 }
2869 
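/*
 * Allocate an external data buffer large enough for @size bytes plus
 * headroom and shared-info metadata, and attach it to @pkt. Used when
 * the mbuf's own data room is too small for the packet.
 */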
2870 static int
2871 virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
2872 {
2873 	struct rte_mbuf_ext_shared_info *shinfo = NULL;
2874 	uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
2875 	uint16_t buf_len;
2876 	rte_iova_t iova;
2877 	void *buf;
2878 
2879 	total_len += sizeof(*shinfo) + sizeof(uintptr_t);
2880 	total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
2881 
2882 	if (unlikely(total_len > UINT16_MAX))
2883 		return -ENOSPC;
2884 
2885 	buf_len = total_len;
2886 	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
2887 	if (unlikely(buf == NULL))
2888 		return -ENOMEM;
2889 
2890 	/* Initialize shinfo */
2891 	shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
2892 						virtio_dev_extbuf_free, buf);
2893 	if (unlikely(shinfo == NULL)) {
2894 		rte_free(buf);
2895 		VHOST_LOG_DATA(ERR, "Failed to init shinfo\n");
2896 		return -1;
2897 	}
2898 
2899 	iova = rte_malloc_virt2iova(buf);
2900 	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
2901 	rte_pktmbuf_reset_headroom(pkt);
2902 
2903 	return 0;
2904 }
2905 
2906 /*
2907  * Prepare a pktmbuf able to receive a packet of the given data length.
2908  */
2909 static __rte_always_inline int
2910 virtio_dev_pktmbuf_prep(struct virtio_net *dev, struct rte_mbuf *pkt,
2911 			 uint32_t data_len)
2912 {
2913 	if (rte_pktmbuf_tailroom(pkt) >= data_len)
2914 		return 0;
2915 
2916 	/* attach an external buffer if supported */
2917 	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
2918 		return 0;
2919 
2920 	/* check if chained buffers are allowed */
2921 	if (!dev->linearbuf)
2922 		return 0;
2923 
2924 	return -1;
2925 }
2926 
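/*
 * Dequeue path for split virtqueues: reserve up to @count available
 * descriptor chains, copy them into pre-allocated mbufs and flush the
 * shadow used ring. Packets that cannot be prepared or copied are
 * dropped and the remaining mbufs freed.
 */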
2927 __rte_always_inline
2928 static uint16_t
2929 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
2930 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
2931 	bool legacy_ol_flags)
2932 {
2933 	uint16_t i;
2934 	uint16_t free_entries;
2935 	uint16_t dropped = 0;
2936 	static bool allocerr_warned;
2937 
2938 	/*
2939 	 * The ordering between avail index and
2940 	 * desc reads needs to be enforced.
2941 	 */
2942 	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
2943 			vq->last_avail_idx;
2944 	if (free_entries == 0)
2945 		return 0;
2946 
2947 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
2948 
2949 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2950 
2951 	count = RTE_MIN(count, MAX_PKT_BURST);
2952 	count = RTE_MIN(count, free_entries);
2953 	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
2954 			dev->vid, count);
2955 
2956 	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
2957 		return 0;
2958 
2959 	for (i = 0; i < count; i++) {
2960 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
2961 		uint16_t head_idx;
2962 		uint32_t buf_len;
2963 		uint16_t nr_vec = 0;
2964 		int err;
2965 
2966 		if (unlikely(fill_vec_buf_split(dev, vq,
2967 						vq->last_avail_idx + i,
2968 						&nr_vec, buf_vec,
2969 						&head_idx, &buf_len,
2970 						VHOST_ACCESS_RO) < 0))
2971 			break;
2972 
2973 		update_shadow_used_ring_split(vq, head_idx, 0);
2974 
2975 		err = virtio_dev_pktmbuf_prep(dev, pkts[i], buf_len);
2976 		if (unlikely(err)) {
2977 			/*
2978 			 * mbuf allocation fails for jumbo packets when external
2979 			 * buffer allocation is not allowed and linear buffer
2980 			 * is required. Drop this packet.
2981 			 */
2982 			if (!allocerr_warned) {
2983 				VHOST_LOG_DATA(ERR,
2984 					"Failed mbuf alloc of size %d from %s on %s.\n",
2985 					buf_len, mbuf_pool->name, dev->ifname);
2986 				allocerr_warned = true;
2987 			}
2988 			dropped += 1;
2989 			i++;
2990 			break;
2991 		}
2992 
2993 		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
2994 				mbuf_pool, legacy_ol_flags);
2995 		if (unlikely(err)) {
2996 			if (!allocerr_warned) {
2997 				VHOST_LOG_DATA(ERR,
2998 					"Failed to copy desc to mbuf on %s.\n",
2999 					dev->ifname);
3000 				allocerr_warned = true;
3001 			}
3002 			dropped += 1;
3003 			i++;
3004 			break;
3005 		}
3006 	}
3007 
3008 	if (dropped)
3009 		rte_pktmbuf_free_bulk(&pkts[i - 1], count - i + 1);
3010 
3011 	vq->last_avail_idx += i;
3012 
3013 	do_data_copy_dequeue(vq);
3014 	if (unlikely(i < count))
3015 		vq->shadow_used_idx = i;
3016 	if (likely(vq->shadow_used_idx)) {
3017 		flush_shadow_used_ring_split(dev, vq);
3018 		vhost_vring_call_split(dev, vq);
3019 	}
3020 
3021 	return (i - dropped);
3022 }
3023 
3024 __rte_noinline
3025 static uint16_t
3026 virtio_dev_tx_split_legacy(struct virtio_net *dev,
3027 	struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
3028 	struct rte_mbuf **pkts, uint16_t count)
3029 {
3030 	return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, true);
3031 }
3032 
3033 __rte_noinline
3034 static uint16_t
3035 virtio_dev_tx_split_compliant(struct virtio_net *dev,
3036 	struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
3037 	struct rte_mbuf **pkts, uint16_t count)
3038 {
3039 	return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, false);
3040 }
3041 
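/*
 * Check that a full batch of PACKED_BATCH_SIZE descriptors is
 * available, translate their guest addresses and make sure every
 * destination mbuf can hold its payload. Returns -1 as soon as any
 * check fails so the caller can fall back to the single-packet path.
 */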
3042 static __rte_always_inline int
3043 vhost_reserve_avail_batch_packed(struct virtio_net *dev,
3044 				 struct vhost_virtqueue *vq,
3045 				 struct rte_mbuf **pkts,
3046 				 uint16_t avail_idx,
3047 				 uintptr_t *desc_addrs,
3048 				 uint16_t *ids)
3049 {
3050 	bool wrap = vq->avail_wrap_counter;
3051 	struct vring_packed_desc *descs = vq->desc_packed;
3052 	uint64_t lens[PACKED_BATCH_SIZE];
3053 	uint64_t buf_lens[PACKED_BATCH_SIZE];
3054 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
3055 	uint16_t flags, i;
3056 
3057 	if (unlikely(avail_idx & PACKED_BATCH_MASK))
3058 		return -1;
3059 	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
3060 		return -1;
3061 
3062 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3063 		flags = descs[avail_idx + i].flags;
3064 		if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
3065 			     (wrap == !!(flags & VRING_DESC_F_USED))  ||
3066 			     (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
3067 			return -1;
3068 	}
3069 
3070 	rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
3071 
3072 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
3073 		lens[i] = descs[avail_idx + i].len;
3074 
3075 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3076 		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
3077 						  descs[avail_idx + i].addr,
3078 						  &lens[i], VHOST_ACCESS_RW);
3079 	}
3080 
3081 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3082 		if (unlikely(!desc_addrs[i]))
3083 			return -1;
3084 		if (unlikely((lens[i] != descs[avail_idx + i].len)))
3085 			return -1;
3086 	}
3087 
3088 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3089 		if (virtio_dev_pktmbuf_prep(dev, pkts[i], lens[i]))
3090 			goto err;
3091 	}
3092 
3093 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
3094 		buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
3095 
3096 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3097 		if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
3098 			goto err;
3099 	}
3100 
3101 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3102 		pkts[i]->pkt_len = lens[i] - buf_offset;
3103 		pkts[i]->data_len = pkts[i]->pkt_len;
3104 		ids[i] = descs[avail_idx + i].id;
3105 	}
3106 
3107 	return 0;
3108 
3109 err:
3110 	return -1;
3111 }
3112 
3113 static __rte_always_inline int
3114 virtio_dev_tx_batch_packed(struct virtio_net *dev,
3115 			   struct vhost_virtqueue *vq,
3116 			   struct rte_mbuf **pkts,
3117 			   bool legacy_ol_flags)
3118 {
3119 	uint16_t avail_idx = vq->last_avail_idx;
3120 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
3121 	struct virtio_net_hdr *hdr;
3122 	uintptr_t desc_addrs[PACKED_BATCH_SIZE];
3123 	uint16_t ids[PACKED_BATCH_SIZE];
3124 	uint16_t i;
3125 
3126 	if (vhost_reserve_avail_batch_packed(dev, vq, pkts, avail_idx,
3127 					     desc_addrs, ids))
3128 		return -1;
3129 
3130 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
3131 		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
3132 
3133 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
3134 		rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
3135 			   (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
3136 			   pkts[i]->pkt_len);
3137 
3138 	if (virtio_net_with_host_offload(dev)) {
3139 		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3140 			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
3141 			vhost_dequeue_offload(hdr, pkts[i], legacy_ol_flags);
3142 		}
3143 	}
3144 
3145 	if (virtio_net_is_inorder(dev))
3146 		vhost_shadow_dequeue_batch_packed_inorder(vq,
3147 			ids[PACKED_BATCH_SIZE - 1]);
3148 	else
3149 		vhost_shadow_dequeue_batch_packed(dev, vq, ids);
3150 
3151 	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
3152 
3153 	return 0;
3154 }
3155 
3156 static __rte_always_inline int
3157 vhost_dequeue_single_packed(struct virtio_net *dev,
3158 			    struct vhost_virtqueue *vq,
3159 			    struct rte_mempool *mbuf_pool,
3160 			    struct rte_mbuf *pkts,
3161 			    uint16_t *buf_id,
3162 			    uint16_t *desc_count,
3163 			    bool legacy_ol_flags)
3164 {
3165 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
3166 	uint32_t buf_len;
3167 	uint16_t nr_vec = 0;
3168 	int err;
3169 	static bool allocerr_warned;
3170 
3171 	if (unlikely(fill_vec_buf_packed(dev, vq,
3172 					 vq->last_avail_idx, desc_count,
3173 					 buf_vec, &nr_vec,
3174 					 buf_id, &buf_len,
3175 					 VHOST_ACCESS_RO) < 0))
3176 		return -1;
3177 
3178 	if (unlikely(virtio_dev_pktmbuf_prep(dev, pkts, buf_len))) {
3179 		if (!allocerr_warned) {
3180 			VHOST_LOG_DATA(ERR,
3181 				"Failed mbuf alloc of size %d from %s on %s.\n",
3182 				buf_len, mbuf_pool->name, dev->ifname);
3183 			allocerr_warned = true;
3184 		}
3185 		return -1;
3186 	}
3187 
3188 	err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts,
3189 				mbuf_pool, legacy_ol_flags);
3190 	if (unlikely(err)) {
3191 		if (!allocerr_warned) {
3192 			VHOST_LOG_DATA(ERR,
3193 				"Failed to copy desc to mbuf on %s.\n",
3194 				dev->ifname);
3195 			allocerr_warned = true;
3196 		}
3197 		return -1;
3198 	}
3199 
3200 	return 0;
3201 }
3202 
3203 static __rte_always_inline int
3204 virtio_dev_tx_single_packed(struct virtio_net *dev,
3205 			    struct vhost_virtqueue *vq,
3206 			    struct rte_mempool *mbuf_pool,
3207 			    struct rte_mbuf *pkts,
3208 			    bool legacy_ol_flags)
3209 {
3210 
3211 	uint16_t buf_id, desc_count = 0;
3212 	int ret;
3213 
3214 	ret = vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
3215 					&desc_count, legacy_ol_flags);
3216 
3217 	if (likely(desc_count > 0)) {
3218 		if (virtio_net_is_inorder(dev))
3219 			vhost_shadow_dequeue_single_packed_inorder(vq, buf_id,
3220 								   desc_count);
3221 		else
3222 			vhost_shadow_dequeue_single_packed(vq, buf_id,
3223 					desc_count);
3224 
3225 		vq_inc_last_avail_packed(vq, desc_count);
3226 	}
3227 
3228 	return ret;
3229 }
3230 
3231 __rte_always_inline
3232 static uint16_t
3233 virtio_dev_tx_packed(struct virtio_net *dev,
3234 		     struct vhost_virtqueue *__rte_restrict vq,
3235 		     struct rte_mempool *mbuf_pool,
3236 		     struct rte_mbuf **__rte_restrict pkts,
3237 		     uint32_t count,
3238 		     bool legacy_ol_flags)
3239 {
3240 	uint32_t pkt_idx = 0;
3241 
3242 	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
3243 		return 0;
3244 
3245 	do {
3246 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
3247 
3248 		if (count - pkt_idx >= PACKED_BATCH_SIZE) {
3249 			if (!virtio_dev_tx_batch_packed(dev, vq,
3250 							&pkts[pkt_idx],
3251 							legacy_ol_flags)) {
3252 				pkt_idx += PACKED_BATCH_SIZE;
3253 				continue;
3254 			}
3255 		}
3256 
3257 		if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool,
3258 						pkts[pkt_idx],
3259 						legacy_ol_flags))
3260 			break;
3261 		pkt_idx++;
3262 	} while (pkt_idx < count);
3263 
3264 	if (pkt_idx != count)
3265 		rte_pktmbuf_free_bulk(&pkts[pkt_idx], count - pkt_idx);
3266 
3267 	if (vq->shadow_used_idx) {
3268 		do_data_copy_dequeue(vq);
3269 
3270 		vhost_flush_dequeue_shadow_packed(dev, vq);
3271 		vhost_vring_call_packed(dev, vq);
3272 	}
3273 
3274 	return pkt_idx;
3275 }
3276 
3277 __rte_noinline
3278 static uint16_t
3279 virtio_dev_tx_packed_legacy(struct virtio_net *dev,
3280 	struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
3281 	struct rte_mbuf **__rte_restrict pkts, uint32_t count)
3282 {
3283 	return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, true);
3284 }
3285 
3286 __rte_noinline
3287 static uint16_t
3288 virtio_dev_tx_packed_compliant(struct virtio_net *dev,
3289 	struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
3290 	struct rte_mbuf **__rte_restrict pkts, uint32_t count)
3291 {
3292 	return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, false);
3293 }
3294 
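/*
 * Public dequeue API. A minimal polling loop on the application side
 * could look roughly like the sketch below (mempool, burst size and
 * process_and_free() are illustrative, not defined in this file):
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n = rte_vhost_dequeue_burst(vid, vring_idx,
 *					     mbuf_pool, pkts, 32);
 *	for (uint16_t k = 0; k < n; k++)
 *		process_and_free(pkts[k]);
 *
 * The vring index must be a TX (odd) queue index and the returned
 * mbufs are owned by the caller.
 */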
3295 uint16_t
3296 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
3297 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
3298 {
3299 	struct virtio_net *dev;
3300 	struct rte_mbuf *rarp_mbuf = NULL;
3301 	struct vhost_virtqueue *vq;
3302 	int16_t success = 1;
3303 
3304 	dev = get_device(vid);
3305 	if (!dev)
3306 		return 0;
3307 
3308 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
3309 		VHOST_LOG_DATA(ERR,
3310 			"(%d) %s: built-in vhost net backend is disabled.\n",
3311 			dev->vid, __func__);
3312 		return 0;
3313 	}
3314 
3315 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
3316 		VHOST_LOG_DATA(ERR,
3317 			"(%d) %s: invalid virtqueue idx %d.\n",
3318 			dev->vid, __func__, queue_id);
3319 		return 0;
3320 	}
3321 
3322 	vq = dev->virtqueue[queue_id];
3323 
3324 	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
3325 		return 0;
3326 
3327 	if (unlikely(!vq->enabled)) {
3328 		count = 0;
3329 		goto out_access_unlock;
3330 	}
3331 
3332 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3333 		vhost_user_iotlb_rd_lock(vq);
3334 
3335 	if (unlikely(!vq->access_ok))
3336 		if (unlikely(vring_translate(dev, vq) < 0)) {
3337 			count = 0;
3338 			goto out;
3339 		}
3340 
3341 	/*
3342 	 * Construct a RARP broadcast packet and inject it into the "pkts"
3343 	 * array, so that it looks like the guest actually sent such a packet.
3344 	 *
3345 	 * Check user_send_rarp() for more information.
3346 	 *
3347 	 * broadcast_rarp shares a cacheline in the virtio_net structure
3348 	 * with some fields that are accessed during enqueue and
3349 	 * __atomic_compare_exchange_n causes a write when it performs the
3350 	 * compare and exchange. This could result in false sharing between
3351 	 * enqueue and dequeue.
3352 	 *
3353 	 * Prevent unnecessary false sharing by reading broadcast_rarp first
3354 	 * and only performing compare and exchange if the read indicates it
3355 	 * is likely to be set.
3356 	 */
3357 	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
3358 			__atomic_compare_exchange_n(&dev->broadcast_rarp,
3359 			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
3360 
3361 		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
3362 		if (rarp_mbuf == NULL) {
3363 			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
3364 			count = 0;
3365 			goto out;
3366 		}
3367 		count -= 1;
3368 	}
3369 
3370 	if (vq_is_packed(dev)) {
3371 		if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3372 			count = virtio_dev_tx_packed_legacy(dev, vq, mbuf_pool, pkts, count);
3373 		else
3374 			count = virtio_dev_tx_packed_compliant(dev, vq, mbuf_pool, pkts, count);
3375 	} else {
3376 		if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3377 			count = virtio_dev_tx_split_legacy(dev, vq, mbuf_pool, pkts, count);
3378 		else
3379 			count = virtio_dev_tx_split_compliant(dev, vq, mbuf_pool, pkts, count);
3380 	}
3381 
3382 out:
3383 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3384 		vhost_user_iotlb_rd_unlock(vq);
3385 
3386 out_access_unlock:
3387 	rte_spinlock_unlock(&vq->access_lock);
3388 
3389 	if (unlikely(rarp_mbuf != NULL)) {
3390 		/*
3391 		 * Inject it at the head of the "pkts" array, so that the switch's
3392 		 * MAC learning table gets updated first.
3393 		 */
3394 		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
3395 		pkts[0] = rarp_mbuf;
3396 		count += 1;
3397 	}
3398 
3399 	return count;
3400 }
3401