xref: /dpdk/lib/vhost/virtio_net.c (revision c5c507100ea58e24f812401c77c66cdb9bceee36)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4 
5 #include <stdint.h>
6 #include <stdbool.h>
7 #include <linux/virtio_net.h>
8 
9 #include <rte_mbuf.h>
10 #include <rte_memcpy.h>
11 #include <rte_net.h>
12 #include <rte_ether.h>
13 #include <rte_ip.h>
14 #include <rte_vhost.h>
15 #include <rte_tcp.h>
16 #include <rte_udp.h>
17 #include <rte_sctp.h>
18 #include <rte_arp.h>
19 #include <rte_spinlock.h>
20 #include <rte_malloc.h>
21 #include <rte_vhost_async.h>
22 
23 #include "iotlb.h"
24 #include "vhost.h"
25 
26 #define MAX_BATCH_LEN 256
27 
28 #define VHOST_ASYNC_BATCH_THRESHOLD 32
29 
30 static __rte_always_inline bool
31 rxvq_is_mergeable(struct virtio_net *dev)
32 {
33 	return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
34 }
35 
36 static __rte_always_inline bool
37 virtio_net_is_inorder(struct virtio_net *dev)
38 {
39 	return dev->features & (1ULL << VIRTIO_F_IN_ORDER);
40 }
41 
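/*
 * A virtqueue index is valid when it is below nr_vring and its parity
 * matches the requested direction: is_tx selects odd indexes, !is_tx
 * even ones.
 */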
42 static bool
43 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
44 {
45 	return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
46 }
47 
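/*
 * Flush the copies batched on the enqueue path and log each written
 * range for dirty page tracking.
 */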
48 static inline void
49 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
50 {
51 	struct batch_copy_elem *elem = vq->batch_copy_elems;
52 	uint16_t count = vq->batch_copy_nb_elems;
53 	int i;
54 
55 	for (i = 0; i < count; i++) {
56 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
57 		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
58 					   elem[i].len);
59 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
60 	}
61 
62 	vq->batch_copy_nb_elems = 0;
63 }
64 
65 static inline void
66 do_data_copy_dequeue(struct vhost_virtqueue *vq)
67 {
68 	struct batch_copy_elem *elem = vq->batch_copy_elems;
69 	uint16_t count = vq->batch_copy_nb_elems;
70 	int i;
71 
72 	for (i = 0; i < count; i++)
73 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
74 
75 	vq->batch_copy_nb_elems = 0;
76 }
77 
78 static __rte_always_inline void
79 do_flush_shadow_used_ring_split(struct virtio_net *dev,
80 			struct vhost_virtqueue *vq,
81 			uint16_t to, uint16_t from, uint16_t size)
82 {
83 	rte_memcpy(&vq->used->ring[to],
84 			&vq->shadow_used_split[from],
85 			size * sizeof(struct vring_used_elem));
86 	vhost_log_cache_used_vring(dev, vq,
87 			offsetof(struct vring_used, ring[to]),
88 			size * sizeof(struct vring_used_elem));
89 }
90 
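/*
 * Flush the shadow used entries to the split used ring, handling the
 * wrap-around case, then publish the new used index with a release store.
 */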
91 static __rte_always_inline void
92 flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
93 {
94 	uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
95 
96 	if (used_idx + vq->shadow_used_idx <= vq->size) {
97 		do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
98 					  vq->shadow_used_idx);
99 	} else {
100 		uint16_t size;
101 
102 		/* update used ring interval [used_idx, vq->size] */
103 		size = vq->size - used_idx;
104 		do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
105 
106 		/* update the remaining used ring interval [0, shadow_used_idx - size] */
107 		do_flush_shadow_used_ring_split(dev, vq, 0, size,
108 					  vq->shadow_used_idx - size);
109 	}
110 	vq->last_used_idx += vq->shadow_used_idx;
111 
112 	vhost_log_cache_sync(dev, vq);
113 
114 	__atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
115 			   __ATOMIC_RELEASE);
116 	vq->shadow_used_idx = 0;
117 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
118 		sizeof(vq->used->idx));
119 }
120 
121 static __rte_always_inline void
122 update_shadow_used_ring_split(struct vhost_virtqueue *vq,
123 			 uint16_t desc_idx, uint32_t len)
124 {
125 	uint16_t i = vq->shadow_used_idx++;
126 
127 	vq->shadow_used_split[i].id  = desc_idx;
128 	vq->shadow_used_split[i].len = len;
129 }
130 
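/*
 * Write the shadowed used entries back to the packed descriptor ring.
 * The head descriptor's flags are written last, after a release fence,
 * so the guest never sees a partially updated chain.
 */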
131 static __rte_always_inline void
132 vhost_flush_enqueue_shadow_packed(struct virtio_net *dev,
133 				  struct vhost_virtqueue *vq)
134 {
135 	int i;
136 	uint16_t used_idx = vq->last_used_idx;
137 	uint16_t head_idx = vq->last_used_idx;
138 	uint16_t head_flags = 0;
139 
140 	/* Split loop in two to save memory barriers */
141 	for (i = 0; i < vq->shadow_used_idx; i++) {
142 		vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
143 		vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
144 
145 		used_idx += vq->shadow_used_packed[i].count;
146 		if (used_idx >= vq->size)
147 			used_idx -= vq->size;
148 	}
149 
150 	/* The ordering for storing desc flags needs to be enforced. */
151 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
152 
153 	for (i = 0; i < vq->shadow_used_idx; i++) {
154 		uint16_t flags;
155 
156 		if (vq->shadow_used_packed[i].len)
157 			flags = VRING_DESC_F_WRITE;
158 		else
159 			flags = 0;
160 
161 		if (vq->used_wrap_counter) {
162 			flags |= VRING_DESC_F_USED;
163 			flags |= VRING_DESC_F_AVAIL;
164 		} else {
165 			flags &= ~VRING_DESC_F_USED;
166 			flags &= ~VRING_DESC_F_AVAIL;
167 		}
168 
169 		if (i > 0) {
170 			vq->desc_packed[vq->last_used_idx].flags = flags;
171 
172 			vhost_log_cache_used_vring(dev, vq,
173 					vq->last_used_idx *
174 					sizeof(struct vring_packed_desc),
175 					sizeof(struct vring_packed_desc));
176 		} else {
177 			head_idx = vq->last_used_idx;
178 			head_flags = flags;
179 		}
180 
181 		vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count);
182 	}
183 
184 	vq->desc_packed[head_idx].flags = head_flags;
185 
186 	vhost_log_cache_used_vring(dev, vq,
187 				head_idx *
188 				sizeof(struct vring_packed_desc),
189 				sizeof(struct vring_packed_desc));
190 
191 	vq->shadow_used_idx = 0;
192 	vhost_log_cache_sync(dev, vq);
193 }
194 
195 static __rte_always_inline void
196 vhost_flush_dequeue_shadow_packed(struct virtio_net *dev,
197 				  struct vhost_virtqueue *vq)
198 {
199 	struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0];
200 
201 	vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id;
202 	/* desc flags are the synchronization point for virtio packed vring */
203 	__atomic_store_n(&vq->desc_packed[vq->shadow_last_used_idx].flags,
204 			 used_elem->flags, __ATOMIC_RELEASE);
205 
206 	vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx *
207 				   sizeof(struct vring_packed_desc),
208 				   sizeof(struct vring_packed_desc));
209 	vq->shadow_used_idx = 0;
210 	vhost_log_cache_sync(dev, vq);
211 }
212 
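/*
 * Mark a full batch of descriptors as used in the packed ring: ids and
 * lengths are written first, the flags only after a release fence.
 */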
213 static __rte_always_inline void
214 vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
215 				 struct vhost_virtqueue *vq,
216 				 uint64_t *lens,
217 				 uint16_t *ids)
218 {
219 	uint16_t i;
220 	uint16_t flags;
221 	uint16_t last_used_idx = vq->last_used_idx;
222 	struct vring_packed_desc *desc_base = &vq->desc_packed[last_used_idx];
223 
224 	if (vq->shadow_used_idx) {
225 		do_data_copy_enqueue(dev, vq);
226 		vhost_flush_enqueue_shadow_packed(dev, vq);
227 	}
228 
229 	flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter);
230 
231 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
232 		desc_base[i].id = ids[i];
233 		desc_base[i].len = lens[i];
234 	}
235 
236 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
237 
238 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
239 		desc_base[i].flags = flags;
240 	}
241 
242 	vhost_log_cache_used_vring(dev, vq, last_used_idx *
243 				   sizeof(struct vring_packed_desc),
244 				   sizeof(struct vring_packed_desc) *
245 				   PACKED_BATCH_SIZE);
246 	vhost_log_cache_sync(dev, vq);
247 
248 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
249 }
250 
251 static __rte_always_inline void
252 vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
253 					  uint16_t id)
254 {
255 	vq->shadow_used_packed[0].id = id;
256 
257 	if (!vq->shadow_used_idx) {
258 		vq->shadow_last_used_idx = vq->last_used_idx;
259 		vq->shadow_used_packed[0].flags =
260 			PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
261 		vq->shadow_used_packed[0].len = 0;
262 		vq->shadow_used_packed[0].count = 1;
263 		vq->shadow_used_idx++;
264 	}
265 
266 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
267 }
268 
269 static __rte_always_inline void
270 vhost_shadow_dequeue_batch_packed(struct virtio_net *dev,
271 				  struct vhost_virtqueue *vq,
272 				  uint16_t *ids)
273 {
274 	uint16_t flags;
275 	uint16_t i;
276 	uint16_t begin;
277 
278 	flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
279 
280 	if (!vq->shadow_used_idx) {
281 		vq->shadow_last_used_idx = vq->last_used_idx;
282 		vq->shadow_used_packed[0].id  = ids[0];
283 		vq->shadow_used_packed[0].len = 0;
284 		vq->shadow_used_packed[0].count = 1;
285 		vq->shadow_used_packed[0].flags = flags;
286 		vq->shadow_used_idx++;
287 		begin = 1;
288 	} else
289 		begin = 0;
290 
291 	vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) {
292 		vq->desc_packed[vq->last_used_idx + i].id = ids[i];
293 		vq->desc_packed[vq->last_used_idx + i].len = 0;
294 	}
295 
296 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
297 	vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE)
298 		vq->desc_packed[vq->last_used_idx + i].flags = flags;
299 
300 	vhost_log_cache_used_vring(dev, vq, vq->last_used_idx *
301 				   sizeof(struct vring_packed_desc),
302 				   sizeof(struct vring_packed_desc) *
303 				   PACKED_BATCH_SIZE);
304 	vhost_log_cache_sync(dev, vq);
305 
306 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
307 }
308 
309 static __rte_always_inline void
310 vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq,
311 				   uint16_t buf_id,
312 				   uint16_t count)
313 {
314 	uint16_t flags;
315 
316 	flags = vq->desc_packed[vq->last_used_idx].flags;
317 	if (vq->used_wrap_counter) {
318 		flags |= VRING_DESC_F_USED;
319 		flags |= VRING_DESC_F_AVAIL;
320 	} else {
321 		flags &= ~VRING_DESC_F_USED;
322 		flags &= ~VRING_DESC_F_AVAIL;
323 	}
324 
325 	if (!vq->shadow_used_idx) {
326 		vq->shadow_last_used_idx = vq->last_used_idx;
327 
328 		vq->shadow_used_packed[0].id  = buf_id;
329 		vq->shadow_used_packed[0].len = 0;
330 		vq->shadow_used_packed[0].flags = flags;
331 		vq->shadow_used_idx++;
332 	} else {
333 		vq->desc_packed[vq->last_used_idx].id = buf_id;
334 		vq->desc_packed[vq->last_used_idx].len = 0;
335 		vq->desc_packed[vq->last_used_idx].flags = flags;
336 	}
337 
338 	vq_inc_last_used_packed(vq, count);
339 }
340 
341 static __rte_always_inline void
342 vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
343 					   uint16_t buf_id,
344 					   uint16_t count)
345 {
346 	uint16_t flags;
347 
348 	vq->shadow_used_packed[0].id = buf_id;
349 
350 	flags = vq->desc_packed[vq->last_used_idx].flags;
351 	if (vq->used_wrap_counter) {
352 		flags |= VRING_DESC_F_USED;
353 		flags |= VRING_DESC_F_AVAIL;
354 	} else {
355 		flags &= ~VRING_DESC_F_USED;
356 		flags &= ~VRING_DESC_F_AVAIL;
357 	}
358 
359 	if (!vq->shadow_used_idx) {
360 		vq->shadow_last_used_idx = vq->last_used_idx;
361 		vq->shadow_used_packed[0].len = 0;
362 		vq->shadow_used_packed[0].flags = flags;
363 		vq->shadow_used_idx++;
364 	}
365 
366 	vq_inc_last_used_packed(vq, count);
367 }
368 
369 static __rte_always_inline void
370 vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
371 				   uint32_t *len,
372 				   uint16_t *id,
373 				   uint16_t *count,
374 				   uint16_t num_buffers)
375 {
376 	uint16_t i;
377 
378 	for (i = 0; i < num_buffers; i++) {
379 		/* keep the enqueue shadow flush aligned with the batch size */
380 		if (!vq->shadow_used_idx)
381 			vq->shadow_aligned_idx = vq->last_used_idx &
382 				PACKED_BATCH_MASK;
383 		vq->shadow_used_packed[vq->shadow_used_idx].id  = id[i];
384 		vq->shadow_used_packed[vq->shadow_used_idx].len = len[i];
385 		vq->shadow_used_packed[vq->shadow_used_idx].count = count[i];
386 		vq->shadow_aligned_idx += count[i];
387 		vq->shadow_used_idx++;
388 	}
389 }
390 
391 static __rte_always_inline void
392 vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
393 				   struct vhost_virtqueue *vq,
394 				   uint32_t *len,
395 				   uint16_t *id,
396 				   uint16_t *count,
397 				   uint16_t num_buffers)
398 {
399 	vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
400 
401 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
402 		do_data_copy_enqueue(dev, vq);
403 		vhost_flush_enqueue_shadow_packed(dev, vq);
404 	}
405 }
406 
407 /* avoid the write operation when it is not needed, to lessen cache issues */
408 #define ASSIGN_UNLESS_EQUAL(var, val) do {	\
409 	if ((var) != (val))			\
410 		(var) = (val);			\
411 } while (0)
412 
413 static __rte_always_inline void
414 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
415 {
416 	uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
417 
418 	if (m_buf->ol_flags & PKT_TX_TCP_SEG)
419 		csum_l4 |= PKT_TX_TCP_CKSUM;
420 
421 	if (csum_l4) {
422 		net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
423 		net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
424 
425 		switch (csum_l4) {
426 		case PKT_TX_TCP_CKSUM:
427 			net_hdr->csum_offset = (offsetof(struct rte_tcp_hdr,
428 						cksum));
429 			break;
430 		case PKT_TX_UDP_CKSUM:
431 			net_hdr->csum_offset = (offsetof(struct rte_udp_hdr,
432 						dgram_cksum));
433 			break;
434 		case PKT_TX_SCTP_CKSUM:
435 			net_hdr->csum_offset = (offsetof(struct rte_sctp_hdr,
436 						cksum));
437 			break;
438 		}
439 	} else {
440 		ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
441 		ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
442 		ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
443 	}
444 
445 	/* IP cksum verification cannot be bypassed, so calculate it here */
446 	if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
447 		struct rte_ipv4_hdr *ipv4_hdr;
448 
449 		ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct rte_ipv4_hdr *,
450 						   m_buf->l2_len);
451 		ipv4_hdr->hdr_checksum = 0;
452 		ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
453 	}
454 
455 	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
456 		if (m_buf->ol_flags & PKT_TX_IPV4)
457 			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
458 		else
459 			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
460 		net_hdr->gso_size = m_buf->tso_segsz;
461 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
462 					+ m_buf->l4_len;
463 	} else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
464 		net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
465 		net_hdr->gso_size = m_buf->tso_segsz;
466 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
467 			m_buf->l4_len;
468 	} else {
469 		ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
470 		ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
471 		ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
472 	}
473 }
474 
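/*
 * Map a single guest descriptor into host virtual addresses. A descriptor
 * that is not contiguous in host VA space is split across several buf_vec
 * entries.
 */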
475 static __rte_always_inline int
476 map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
477 		struct buf_vector *buf_vec, uint16_t *vec_idx,
478 		uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
479 {
480 	uint16_t vec_id = *vec_idx;
481 
482 	while (desc_len) {
483 		uint64_t desc_addr;
484 		uint64_t desc_chunck_len = desc_len;
485 
486 		if (unlikely(vec_id >= BUF_VECTOR_MAX))
487 			return -1;
488 
489 		desc_addr = vhost_iova_to_vva(dev, vq,
490 				desc_iova,
491 				&desc_chunck_len,
492 				perm);
493 		if (unlikely(!desc_addr))
494 			return -1;
495 
496 		rte_prefetch0((void *)(uintptr_t)desc_addr);
497 
498 		buf_vec[vec_id].buf_iova = desc_iova;
499 		buf_vec[vec_id].buf_addr = desc_addr;
500 		buf_vec[vec_id].buf_len  = desc_chunck_len;
501 
502 		desc_len -= desc_chunck_len;
503 		desc_iova += desc_chunck_len;
504 		vec_id++;
505 	}
506 	*vec_idx = vec_id;
507 
508 	return 0;
509 }
510 
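/*
 * Collect the buffers of one split-ring descriptor chain (following an
 * indirect table when present) into buf_vec, returning the chain head
 * index and the total buffer length.
 */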
511 static __rte_always_inline int
512 fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
513 			 uint32_t avail_idx, uint16_t *vec_idx,
514 			 struct buf_vector *buf_vec, uint16_t *desc_chain_head,
515 			 uint32_t *desc_chain_len, uint8_t perm)
516 {
517 	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
518 	uint16_t vec_id = *vec_idx;
519 	uint32_t len    = 0;
520 	uint64_t dlen;
521 	uint32_t nr_descs = vq->size;
522 	uint32_t cnt    = 0;
523 	struct vring_desc *descs = vq->desc;
524 	struct vring_desc *idesc = NULL;
525 
526 	if (unlikely(idx >= vq->size))
527 		return -1;
528 
529 	*desc_chain_head = idx;
530 
531 	if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
532 		dlen = vq->desc[idx].len;
533 		nr_descs = dlen / sizeof(struct vring_desc);
534 		if (unlikely(nr_descs > vq->size))
535 			return -1;
536 
537 		descs = (struct vring_desc *)(uintptr_t)
538 			vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
539 						&dlen,
540 						VHOST_ACCESS_RO);
541 		if (unlikely(!descs))
542 			return -1;
543 
544 		if (unlikely(dlen < vq->desc[idx].len)) {
545 			/*
546 			 * The indirect desc table is not contiguous
547 			 * in process VA space, so we have to copy it.
548 			 */
549 			idesc = vhost_alloc_copy_ind_table(dev, vq,
550 					vq->desc[idx].addr, vq->desc[idx].len);
551 			if (unlikely(!idesc))
552 				return -1;
553 
554 			descs = idesc;
555 		}
556 
557 		idx = 0;
558 	}
559 
560 	while (1) {
561 		if (unlikely(idx >= nr_descs || cnt++ >= nr_descs)) {
562 			free_ind_table(idesc);
563 			return -1;
564 		}
565 
566 		dlen = descs[idx].len;
567 		len += dlen;
568 
569 		if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
570 						descs[idx].addr, dlen,
571 						perm))) {
572 			free_ind_table(idesc);
573 			return -1;
574 		}
575 
576 		if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
577 			break;
578 
579 		idx = descs[idx].next;
580 	}
581 
582 	*desc_chain_len = len;
583 	*vec_idx = vec_id;
584 
585 	if (unlikely(!!idesc))
586 		free_ind_table(idesc);
587 
588 	return 0;
589 }
590 
591 /*
592  * Returns -1 on fail, 0 on success
593  */
594 static inline int
595 reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
596 				uint32_t size, struct buf_vector *buf_vec,
597 				uint16_t *num_buffers, uint16_t avail_head,
598 				uint16_t *nr_vec)
599 {
600 	uint16_t cur_idx;
601 	uint16_t vec_idx = 0;
602 	uint16_t max_tries, tries = 0;
603 
604 	uint16_t head_idx = 0;
605 	uint32_t len = 0;
606 
607 	*num_buffers = 0;
608 	cur_idx  = vq->last_avail_idx;
609 
610 	if (rxvq_is_mergeable(dev))
611 		max_tries = vq->size - 1;
612 	else
613 		max_tries = 1;
614 
615 	while (size > 0) {
616 		if (unlikely(cur_idx == avail_head))
617 			return -1;
618 		/*
619 		 * if we tried all available ring items and still
620 		 * cannot get enough buffers, something abnormal
621 		 * happened.
622 		 */
623 		if (unlikely(++tries > max_tries))
624 			return -1;
625 
626 		if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
627 						&vec_idx, buf_vec,
628 						&head_idx, &len,
629 						VHOST_ACCESS_RW) < 0))
630 			return -1;
631 		len = RTE_MIN(len, size);
632 		update_shadow_used_ring_split(vq, head_idx, len);
633 		size -= len;
634 
635 		cur_idx++;
636 		*num_buffers += 1;
637 	}
638 
639 	*nr_vec = vec_idx;
640 
641 	return 0;
642 }
643 
644 static __rte_always_inline int
645 fill_vec_buf_packed_indirect(struct virtio_net *dev,
646 			struct vhost_virtqueue *vq,
647 			struct vring_packed_desc *desc, uint16_t *vec_idx,
648 			struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
649 {
650 	uint16_t i;
651 	uint32_t nr_descs;
652 	uint16_t vec_id = *vec_idx;
653 	uint64_t dlen;
654 	struct vring_packed_desc *descs, *idescs = NULL;
655 
656 	dlen = desc->len;
657 	descs = (struct vring_packed_desc *)(uintptr_t)
658 		vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
659 	if (unlikely(!descs))
660 		return -1;
661 
662 	if (unlikely(dlen < desc->len)) {
663 		/*
664 		 * The indirect desc table is not contiguous
665 			 * in process VA space, so we have to copy it.
666 		 */
667 		idescs = vhost_alloc_copy_ind_table(dev,
668 				vq, desc->addr, desc->len);
669 		if (unlikely(!idescs))
670 			return -1;
671 
672 		descs = idescs;
673 	}
674 
675 	nr_descs = desc->len / sizeof(struct vring_packed_desc);
676 	if (unlikely(nr_descs >= vq->size)) {
677 		free_ind_table(idescs);
678 		return -1;
679 	}
680 
681 	for (i = 0; i < nr_descs; i++) {
682 		if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
683 			free_ind_table(idescs);
684 			return -1;
685 		}
686 
687 		dlen = descs[i].len;
688 		*len += dlen;
689 		if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
690 						descs[i].addr, dlen,
691 						perm)))
692 			return -1;
693 	}
694 	*vec_idx = vec_id;
695 
696 	if (unlikely(!!idescs))
697 		free_ind_table(idescs);
698 
699 	return 0;
700 }
701 
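/*
 * Collect the buffers of one packed-ring descriptor chain starting at
 * avail_idx into buf_vec, handling indirect descriptors and ring
 * wrap-around.
 */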
702 static __rte_always_inline int
703 fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
704 				uint16_t avail_idx, uint16_t *desc_count,
705 				struct buf_vector *buf_vec, uint16_t *vec_idx,
706 				uint16_t *buf_id, uint32_t *len, uint8_t perm)
707 {
708 	bool wrap_counter = vq->avail_wrap_counter;
709 	struct vring_packed_desc *descs = vq->desc_packed;
710 	uint16_t vec_id = *vec_idx;
711 	uint64_t dlen;
712 
713 	if (avail_idx < vq->last_avail_idx)
714 		wrap_counter ^= 1;
715 
716 	/*
717 	 * Perform a load-acquire barrier in desc_is_avail to
718 	 * enforce the ordering between desc flags and desc
719 	 * content.
720 	 */
721 	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
722 		return -1;
723 
724 	*desc_count = 0;
725 	*len = 0;
726 
727 	while (1) {
728 		if (unlikely(vec_id >= BUF_VECTOR_MAX))
729 			return -1;
730 
731 		if (unlikely(*desc_count >= vq->size))
732 			return -1;
733 
734 		*desc_count += 1;
735 		*buf_id = descs[avail_idx].id;
736 
737 		if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
738 			if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
739 							&descs[avail_idx],
740 							&vec_id, buf_vec,
741 							len, perm) < 0))
742 				return -1;
743 		} else {
744 			dlen = descs[avail_idx].len;
745 			*len += dlen;
746 
747 			if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
748 							descs[avail_idx].addr,
749 							dlen,
750 							perm)))
751 				return -1;
752 		}
753 
754 		if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
755 			break;
756 
757 		if (++avail_idx >= vq->size) {
758 			avail_idx -= vq->size;
759 			wrap_counter ^= 1;
760 		}
761 	}
762 
763 	*vec_idx = vec_id;
764 
765 	return 0;
766 }
767 
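/*
 * Copy the virtio net header into the guest buffers when the first
 * descriptor is too small to hold it in one contiguous chunk.
 */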
768 static __rte_noinline void
769 copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
770 		struct buf_vector *buf_vec,
771 		struct virtio_net_hdr_mrg_rxbuf *hdr)
772 {
773 	uint64_t len;
774 	uint64_t remain = dev->vhost_hlen;
775 	uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
776 	uint64_t iova = buf_vec->buf_iova;
777 
778 	while (remain) {
779 		len = RTE_MIN(remain,
780 				buf_vec->buf_len);
781 		dst = buf_vec->buf_addr;
782 		rte_memcpy((void *)(uintptr_t)dst,
783 				(void *)(uintptr_t)src,
784 				len);
785 
786 		PRINT_PACKET(dev, (uintptr_t)dst,
787 				(uint32_t)len, 0);
788 		vhost_log_cache_write_iova(dev, vq,
789 				iova, len);
790 
791 		remain -= len;
792 		iova += len;
793 		src += len;
794 		buf_vec++;
795 	}
796 }
797 
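/*
 * Copy one mbuf chain into the guest buffers described by buf_vec using
 * CPU copies, writing the virtio net header into the first buffer.
 */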
798 static __rte_always_inline int
799 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
800 			    struct rte_mbuf *m, struct buf_vector *buf_vec,
801 			    uint16_t nr_vec, uint16_t num_buffers)
802 {
803 	uint32_t vec_idx = 0;
804 	uint32_t mbuf_offset, mbuf_avail;
805 	uint32_t buf_offset, buf_avail;
806 	uint64_t buf_addr, buf_iova, buf_len;
807 	uint32_t cpy_len;
808 	uint64_t hdr_addr;
809 	struct rte_mbuf *hdr_mbuf;
810 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
811 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
812 	int error = 0;
813 
814 	if (unlikely(m == NULL)) {
815 		error = -1;
816 		goto out;
817 	}
818 
819 	buf_addr = buf_vec[vec_idx].buf_addr;
820 	buf_iova = buf_vec[vec_idx].buf_iova;
821 	buf_len = buf_vec[vec_idx].buf_len;
822 
823 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
824 		error = -1;
825 		goto out;
826 	}
827 
828 	hdr_mbuf = m;
829 	hdr_addr = buf_addr;
830 	if (unlikely(buf_len < dev->vhost_hlen)) {
831 		memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
832 		hdr = &tmp_hdr;
833 	} else
834 		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
835 
836 	VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
837 		dev->vid, num_buffers);
838 
839 	if (unlikely(buf_len < dev->vhost_hlen)) {
840 		buf_offset = dev->vhost_hlen - buf_len;
841 		vec_idx++;
842 		buf_addr = buf_vec[vec_idx].buf_addr;
843 		buf_iova = buf_vec[vec_idx].buf_iova;
844 		buf_len = buf_vec[vec_idx].buf_len;
845 		buf_avail = buf_len - buf_offset;
846 	} else {
847 		buf_offset = dev->vhost_hlen;
848 		buf_avail = buf_len - dev->vhost_hlen;
849 	}
850 
851 	mbuf_avail  = rte_pktmbuf_data_len(m);
852 	mbuf_offset = 0;
853 	while (mbuf_avail != 0 || m->next != NULL) {
854 		/* done with current buf, get the next one */
855 		if (buf_avail == 0) {
856 			vec_idx++;
857 			if (unlikely(vec_idx >= nr_vec)) {
858 				error = -1;
859 				goto out;
860 			}
861 
862 			buf_addr = buf_vec[vec_idx].buf_addr;
863 			buf_iova = buf_vec[vec_idx].buf_iova;
864 			buf_len = buf_vec[vec_idx].buf_len;
865 
866 			buf_offset = 0;
867 			buf_avail  = buf_len;
868 		}
869 
870 		/* done with current mbuf, get the next one */
871 		if (mbuf_avail == 0) {
872 			m = m->next;
873 
874 			mbuf_offset = 0;
875 			mbuf_avail  = rte_pktmbuf_data_len(m);
876 		}
877 
878 		if (hdr_addr) {
879 			virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
880 			if (rxvq_is_mergeable(dev))
881 				ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
882 						num_buffers);
883 
884 			if (unlikely(hdr == &tmp_hdr)) {
885 				copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
886 			} else {
887 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
888 						dev->vhost_hlen, 0);
889 				vhost_log_cache_write_iova(dev, vq,
890 						buf_vec[0].buf_iova,
891 						dev->vhost_hlen);
892 			}
893 
894 			hdr_addr = 0;
895 		}
896 
897 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
898 
899 		if (likely(cpy_len > MAX_BATCH_LEN ||
900 					vq->batch_copy_nb_elems >= vq->size)) {
901 			rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
902 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
903 				cpy_len);
904 			vhost_log_cache_write_iova(dev, vq,
905 						   buf_iova + buf_offset,
906 						   cpy_len);
907 			PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
908 				cpy_len, 0);
909 		} else {
910 			batch_copy[vq->batch_copy_nb_elems].dst =
911 				(void *)((uintptr_t)(buf_addr + buf_offset));
912 			batch_copy[vq->batch_copy_nb_elems].src =
913 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
914 			batch_copy[vq->batch_copy_nb_elems].log_addr =
915 				buf_iova + buf_offset;
916 			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
917 			vq->batch_copy_nb_elems++;
918 		}
919 
920 		mbuf_avail  -= cpy_len;
921 		mbuf_offset += cpy_len;
922 		buf_avail  -= cpy_len;
923 		buf_offset += cpy_len;
924 	}
925 
926 out:
927 
928 	return error;
929 }
930 
931 static __rte_always_inline void
932 async_fill_vec(struct iovec *v, void *base, size_t len)
933 {
934 	v->iov_base = base;
935 	v->iov_len = len;
936 }
937 
938 static __rte_always_inline void
939 async_fill_iter(struct rte_vhost_iov_iter *it, size_t count,
940 	struct iovec *vec, unsigned long nr_seg)
941 {
942 	it->offset = 0;
943 	it->count = count;
944 
945 	if (count) {
946 		it->iov = vec;
947 		it->nr_segs = nr_seg;
948 	} else {
949 		it->iov = 0;
950 		it->nr_segs = 0;
951 	}
952 }
953 
954 static __rte_always_inline void
955 async_fill_desc(struct rte_vhost_async_desc *desc,
956 	struct rte_vhost_iov_iter *src, struct rte_vhost_iov_iter *dst)
957 {
958 	desc->src = src;
959 	desc->dst = dst;
960 }
961 
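/*
 * Async variant of copy_mbuf_to_desc(): contiguous host-physical stretches
 * of at least vq->async_threshold bytes are described as src/dst iovecs for
 * the async (DMA) channel, while the remainder is copied by the CPU.
 */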
962 static __rte_always_inline int
963 async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
964 			struct rte_mbuf *m, struct buf_vector *buf_vec,
965 			uint16_t nr_vec, uint16_t num_buffers,
966 			struct iovec *src_iovec, struct iovec *dst_iovec,
967 			struct rte_vhost_iov_iter *src_it,
968 			struct rte_vhost_iov_iter *dst_it)
969 {
970 	uint32_t vec_idx = 0;
971 	uint32_t mbuf_offset, mbuf_avail;
972 	uint32_t buf_offset, buf_avail;
973 	uint64_t buf_addr, buf_iova, buf_len;
974 	uint32_t cpy_len, cpy_threshold;
975 	uint64_t hdr_addr;
976 	struct rte_mbuf *hdr_mbuf;
977 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
978 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
979 	int error = 0;
980 	uint64_t mapped_len;
981 
982 	uint32_t tlen = 0;
983 	int tvec_idx = 0;
984 	void *hpa;
985 
986 	if (unlikely(m == NULL)) {
987 		error = -1;
988 		goto out;
989 	}
990 
991 	cpy_threshold = vq->async_threshold;
992 
993 	buf_addr = buf_vec[vec_idx].buf_addr;
994 	buf_iova = buf_vec[vec_idx].buf_iova;
995 	buf_len = buf_vec[vec_idx].buf_len;
996 
997 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
998 		error = -1;
999 		goto out;
1000 	}
1001 
1002 	hdr_mbuf = m;
1003 	hdr_addr = buf_addr;
1004 	if (unlikely(buf_len < dev->vhost_hlen)) {
1005 		memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
1006 		hdr = &tmp_hdr;
1007 	} else
1008 		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
1009 
1010 	VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
1011 		dev->vid, num_buffers);
1012 
1013 	if (unlikely(buf_len < dev->vhost_hlen)) {
1014 		buf_offset = dev->vhost_hlen - buf_len;
1015 		vec_idx++;
1016 		buf_addr = buf_vec[vec_idx].buf_addr;
1017 		buf_iova = buf_vec[vec_idx].buf_iova;
1018 		buf_len = buf_vec[vec_idx].buf_len;
1019 		buf_avail = buf_len - buf_offset;
1020 	} else {
1021 		buf_offset = dev->vhost_hlen;
1022 		buf_avail = buf_len - dev->vhost_hlen;
1023 	}
1024 
1025 	mbuf_avail  = rte_pktmbuf_data_len(m);
1026 	mbuf_offset = 0;
1027 
1028 	while (mbuf_avail != 0 || m->next != NULL) {
1029 		/* done with current buf, get the next one */
1030 		if (buf_avail == 0) {
1031 			vec_idx++;
1032 			if (unlikely(vec_idx >= nr_vec)) {
1033 				error = -1;
1034 				goto out;
1035 			}
1036 
1037 			buf_addr = buf_vec[vec_idx].buf_addr;
1038 			buf_iova = buf_vec[vec_idx].buf_iova;
1039 			buf_len = buf_vec[vec_idx].buf_len;
1040 
1041 			buf_offset = 0;
1042 			buf_avail  = buf_len;
1043 		}
1044 
1045 		/* done with current mbuf, get the next one */
1046 		if (mbuf_avail == 0) {
1047 			m = m->next;
1048 
1049 			mbuf_offset = 0;
1050 			mbuf_avail  = rte_pktmbuf_data_len(m);
1051 		}
1052 
1053 		if (hdr_addr) {
1054 			virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
1055 			if (rxvq_is_mergeable(dev))
1056 				ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
1057 						num_buffers);
1058 
1059 			if (unlikely(hdr == &tmp_hdr)) {
1060 				copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
1061 			} else {
1062 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
1063 						dev->vhost_hlen, 0);
1064 				vhost_log_cache_write_iova(dev, vq,
1065 						buf_vec[0].buf_iova,
1066 						dev->vhost_hlen);
1067 			}
1068 
1069 			hdr_addr = 0;
1070 		}
1071 
1072 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
1073 
1074 		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
1075 			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
1076 					buf_iova + buf_offset,
1077 					cpy_len, &mapped_len);
1078 
1079 			if (unlikely(!hpa || mapped_len < cpy_threshold))
1080 				break;
1081 
1082 			async_fill_vec(src_iovec + tvec_idx,
1083 				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
1084 				mbuf_offset), (size_t)mapped_len);
1085 
1086 			async_fill_vec(dst_iovec + tvec_idx,
1087 					hpa, (size_t)mapped_len);
1088 
1089 			tlen += (uint32_t)mapped_len;
1090 			cpy_len -= (uint32_t)mapped_len;
1091 			mbuf_avail  -= (uint32_t)mapped_len;
1092 			mbuf_offset += (uint32_t)mapped_len;
1093 			buf_avail  -= (uint32_t)mapped_len;
1094 			buf_offset += (uint32_t)mapped_len;
1095 			tvec_idx++;
1096 		}
1097 
1098 		if (likely(cpy_len)) {
1099 			if (unlikely(vq->batch_copy_nb_elems >= vq->size)) {
1100 				rte_memcpy(
1101 				(void *)((uintptr_t)(buf_addr + buf_offset)),
1102 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
1103 				cpy_len);
1104 
1105 				PRINT_PACKET(dev,
1106 					(uintptr_t)(buf_addr + buf_offset),
1107 					cpy_len, 0);
1108 			} else {
1109 				batch_copy[vq->batch_copy_nb_elems].dst =
1110 				(void *)((uintptr_t)(buf_addr + buf_offset));
1111 				batch_copy[vq->batch_copy_nb_elems].src =
1112 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
1113 				batch_copy[vq->batch_copy_nb_elems].log_addr =
1114 					buf_iova + buf_offset;
1115 				batch_copy[vq->batch_copy_nb_elems].len =
1116 					cpy_len;
1117 				vq->batch_copy_nb_elems++;
1118 			}
1119 
1120 			mbuf_avail  -= cpy_len;
1121 			mbuf_offset += cpy_len;
1122 			buf_avail  -= cpy_len;
1123 			buf_offset += cpy_len;
1124 		}
1125 
1126 	}
1127 
1128 out:
1129 	if (tlen) {
1130 		async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
1131 		async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
1132 	} else {
1133 		src_it->count = 0;
1134 	}
1135 
1136 	return error;
1137 }
1138 
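/*
 * Reserve enough packed-ring descriptors for one packet (possibly several
 * buffers when mergeable RX buffers are negotiated), copy it in, and record
 * the used buffers in the shadow ring.
 */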
1139 static __rte_always_inline int
1140 vhost_enqueue_single_packed(struct virtio_net *dev,
1141 			    struct vhost_virtqueue *vq,
1142 			    struct rte_mbuf *pkt,
1143 			    struct buf_vector *buf_vec,
1144 			    uint16_t *nr_descs)
1145 {
1146 	uint16_t nr_vec = 0;
1147 	uint16_t avail_idx = vq->last_avail_idx;
1148 	uint16_t max_tries, tries = 0;
1149 	uint16_t buf_id = 0;
1150 	uint32_t len = 0;
1151 	uint16_t desc_count;
1152 	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1153 	uint16_t num_buffers = 0;
1154 	uint32_t buffer_len[vq->size];
1155 	uint16_t buffer_buf_id[vq->size];
1156 	uint16_t buffer_desc_count[vq->size];
1157 
1158 	if (rxvq_is_mergeable(dev))
1159 		max_tries = vq->size - 1;
1160 	else
1161 		max_tries = 1;
1162 
1163 	while (size > 0) {
1164 		/*
1165 		 * if we tried all available ring items and still
1166 		 * cannot get enough buffers, something abnormal
1167 		 * happened.
1168 		 */
1169 		if (unlikely(++tries > max_tries))
1170 			return -1;
1171 
1172 		if (unlikely(fill_vec_buf_packed(dev, vq,
1173 						avail_idx, &desc_count,
1174 						buf_vec, &nr_vec,
1175 						&buf_id, &len,
1176 						VHOST_ACCESS_RW) < 0))
1177 			return -1;
1178 
1179 		len = RTE_MIN(len, size);
1180 		size -= len;
1181 
1182 		buffer_len[num_buffers] = len;
1183 		buffer_buf_id[num_buffers] = buf_id;
1184 		buffer_desc_count[num_buffers] = desc_count;
1185 		num_buffers += 1;
1186 
1187 		*nr_descs += desc_count;
1188 		avail_idx += desc_count;
1189 		if (avail_idx >= vq->size)
1190 			avail_idx -= vq->size;
1191 	}
1192 
1193 	if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0)
1194 		return -1;
1195 
1196 	vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id,
1197 					   buffer_desc_count, num_buffers);
1198 
1199 	return 0;
1200 }
1201 
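/*
 * Enqueue a burst of packets into a split virtqueue, then flush the shadow
 * used ring and notify the guest if required.
 */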
1202 static __rte_noinline uint32_t
1203 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
1204 	struct rte_mbuf **pkts, uint32_t count)
1205 {
1206 	uint32_t pkt_idx = 0;
1207 	uint16_t num_buffers;
1208 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1209 	uint16_t avail_head;
1210 
1211 	/*
1212 	 * The ordering between avail index and
1213 	 * desc reads needs to be enforced.
1214 	 */
1215 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1216 
1217 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1218 
1219 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1220 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1221 		uint16_t nr_vec = 0;
1222 
1223 		if (unlikely(reserve_avail_buf_split(dev, vq,
1224 						pkt_len, buf_vec, &num_buffers,
1225 						avail_head, &nr_vec) < 0)) {
1226 			VHOST_LOG_DATA(DEBUG,
1227 				"(%d) failed to get enough desc from vring\n",
1228 				dev->vid);
1229 			vq->shadow_used_idx -= num_buffers;
1230 			break;
1231 		}
1232 
1233 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1234 			dev->vid, vq->last_avail_idx,
1235 			vq->last_avail_idx + num_buffers);
1236 
1237 		if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
1238 						buf_vec, nr_vec,
1239 						num_buffers) < 0) {
1240 			vq->shadow_used_idx -= num_buffers;
1241 			break;
1242 		}
1243 
1244 		vq->last_avail_idx += num_buffers;
1245 	}
1246 
1247 	do_data_copy_enqueue(dev, vq);
1248 
1249 	if (likely(vq->shadow_used_idx)) {
1250 		flush_shadow_used_ring_split(dev, vq);
1251 		vhost_vring_call_split(dev, vq);
1252 	}
1253 
1254 	return pkt_idx;
1255 }
1256 
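/*
 * Fast path for the packed ring: enqueue PACKED_BATCH_SIZE single-segment
 * packets at once when a full, aligned batch of descriptors is available.
 */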
1257 static __rte_always_inline int
1258 virtio_dev_rx_batch_packed(struct virtio_net *dev,
1259 			   struct vhost_virtqueue *vq,
1260 			   struct rte_mbuf **pkts)
1261 {
1262 	bool wrap_counter = vq->avail_wrap_counter;
1263 	struct vring_packed_desc *descs = vq->desc_packed;
1264 	uint16_t avail_idx = vq->last_avail_idx;
1265 	uint64_t desc_addrs[PACKED_BATCH_SIZE];
1266 	struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE];
1267 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1268 	uint64_t lens[PACKED_BATCH_SIZE];
1269 	uint16_t ids[PACKED_BATCH_SIZE];
1270 	uint16_t i;
1271 
1272 	if (unlikely(avail_idx & PACKED_BATCH_MASK))
1273 		return -1;
1274 
1275 	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
1276 		return -1;
1277 
1278 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1279 		if (unlikely(pkts[i]->next != NULL))
1280 			return -1;
1281 		if (unlikely(!desc_is_avail(&descs[avail_idx + i],
1282 					    wrap_counter)))
1283 			return -1;
1284 	}
1285 
1286 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1287 		lens[i] = descs[avail_idx + i].len;
1288 
1289 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1290 		if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
1291 			return -1;
1292 	}
1293 
1294 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1295 		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
1296 						  descs[avail_idx + i].addr,
1297 						  &lens[i],
1298 						  VHOST_ACCESS_RW);
1299 
1300 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1301 		if (unlikely(!desc_addrs[i]))
1302 			return -1;
1303 		if (unlikely(lens[i] != descs[avail_idx + i].len))
1304 			return -1;
1305 	}
1306 
1307 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1308 		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
1309 		hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)
1310 					(uintptr_t)desc_addrs[i];
1311 		lens[i] = pkts[i]->pkt_len +
1312 			sizeof(struct virtio_net_hdr_mrg_rxbuf);
1313 	}
1314 
1315 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1316 		virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr);
1317 
1318 	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
1319 
1320 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1321 		rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset),
1322 			   rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
1323 			   pkts[i]->pkt_len);
1324 	}
1325 
1326 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1327 		vhost_log_cache_write_iova(dev, vq, descs[avail_idx + i].addr,
1328 					   lens[i]);
1329 
1330 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1331 		ids[i] = descs[avail_idx + i].id;
1332 
1333 	vhost_flush_enqueue_batch_packed(dev, vq, lens, ids);
1334 
1335 	return 0;
1336 }
1337 
1338 static __rte_always_inline int16_t
1339 virtio_dev_rx_single_packed(struct virtio_net *dev,
1340 			    struct vhost_virtqueue *vq,
1341 			    struct rte_mbuf *pkt)
1342 {
1343 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1344 	uint16_t nr_descs = 0;
1345 
1346 	if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec,
1347 						 &nr_descs) < 0)) {
1348 		VHOST_LOG_DATA(DEBUG,
1349 				"(%d) failed to get enough desc from vring\n",
1350 				dev->vid);
1351 		return -1;
1352 	}
1353 
1354 	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1355 			dev->vid, vq->last_avail_idx,
1356 			vq->last_avail_idx + nr_descs);
1357 
1358 	vq_inc_last_avail_packed(vq, nr_descs);
1359 
1360 	return 0;
1361 }
1362 
1363 static __rte_noinline uint32_t
1364 virtio_dev_rx_packed(struct virtio_net *dev,
1365 		     struct vhost_virtqueue *__rte_restrict vq,
1366 		     struct rte_mbuf **__rte_restrict pkts,
1367 		     uint32_t count)
1368 {
1369 	uint32_t pkt_idx = 0;
1370 
1371 	do {
1372 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1373 
1374 		if (count - pkt_idx >= PACKED_BATCH_SIZE) {
1375 			if (!virtio_dev_rx_batch_packed(dev, vq,
1376 							&pkts[pkt_idx])) {
1377 				pkt_idx += PACKED_BATCH_SIZE;
1378 				continue;
1379 			}
1380 		}
1381 
1382 		if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]))
1383 			break;
1384 		pkt_idx++;
1385 
1386 	} while (pkt_idx < count);
1387 
1388 	if (vq->shadow_used_idx) {
1389 		do_data_copy_enqueue(dev, vq);
1390 		vhost_flush_enqueue_shadow_packed(dev, vq);
1391 	}
1392 
1393 	if (pkt_idx)
1394 		vhost_vring_call_packed(dev, vq);
1395 
1396 	return pkt_idx;
1397 }
1398 
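/*
 * Common enqueue entry point: validate the virtqueue, take the access lock
 * (and the IOTLB read lock when an IOMMU is in use), then dispatch to the
 * packed or split implementation.
 */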
1399 static __rte_always_inline uint32_t
1400 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
1401 	struct rte_mbuf **pkts, uint32_t count)
1402 {
1403 	struct vhost_virtqueue *vq;
1404 	uint32_t nb_tx = 0;
1405 
1406 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
1407 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
1408 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
1409 			dev->vid, __func__, queue_id);
1410 		return 0;
1411 	}
1412 
1413 	vq = dev->virtqueue[queue_id];
1414 
1415 	rte_spinlock_lock(&vq->access_lock);
1416 
1417 	if (unlikely(!vq->enabled))
1418 		goto out_access_unlock;
1419 
1420 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1421 		vhost_user_iotlb_rd_lock(vq);
1422 
1423 	if (unlikely(!vq->access_ok))
1424 		if (unlikely(vring_translate(dev, vq) < 0))
1425 			goto out;
1426 
1427 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1428 	if (count == 0)
1429 		goto out;
1430 
1431 	if (vq_is_packed(dev))
1432 		nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
1433 	else
1434 		nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
1435 
1436 out:
1437 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1438 		vhost_user_iotlb_rd_unlock(vq);
1439 
1440 out_access_unlock:
1441 	rte_spinlock_unlock(&vq->access_lock);
1442 
1443 	return nb_tx;
1444 }
1445 
1446 uint16_t
1447 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
1448 	struct rte_mbuf **__rte_restrict pkts, uint16_t count)
1449 {
1450 	struct virtio_net *dev = get_device(vid);
1451 
1452 	if (!dev)
1453 		return 0;
1454 
1455 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1456 		VHOST_LOG_DATA(ERR,
1457 			"(%d) %s: built-in vhost net backend is disabled.\n",
1458 			dev->vid, __func__);
1459 		return 0;
1460 	}
1461 
1462 	return virtio_dev_rx(dev, queue_id, pkts, count);
1463 }
1464 
1465 static __rte_always_inline uint16_t
1466 virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
1467 	uint16_t vq_size, uint16_t n_inflight)
1468 {
1469 	return pkts_idx > n_inflight ? (pkts_idx - n_inflight) :
1470 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
1471 }
1472 
1473 static __rte_always_inline void
1474 store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
1475 		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1476 {
1477 	size_t elem_size = sizeof(struct vring_used_elem);
1478 
1479 	if (d_idx + count <= ring_size) {
1480 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1481 	} else {
1482 		uint16_t size = ring_size - d_idx;
1483 
1484 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1485 		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1486 	}
1487 }
1488 
1489 static __rte_always_inline void
1490 store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
1491 		struct vring_used_elem_packed *d_ring,
1492 		uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1493 {
1494 	size_t elem_size = sizeof(struct vring_used_elem_packed);
1495 
1496 	if (d_idx + count <= ring_size) {
1497 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1498 	} else {
1499 		uint16_t size = ring_size - d_idx;
1500 
1501 		rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1502 		rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1503 	}
1504 }
1505 
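/*
 * Enqueue a burst of packets into a split virtqueue through the registered
 * async channel; packets copied entirely by the CPU are returned in
 * comp_pkts as already completed.
 */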
1506 static __rte_noinline uint32_t
1507 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
1508 	struct vhost_virtqueue *vq, uint16_t queue_id,
1509 	struct rte_mbuf **pkts, uint32_t count,
1510 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
1511 {
1512 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
1513 	uint16_t num_buffers;
1514 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1515 	uint16_t avail_head;
1516 
1517 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
1518 	struct iovec *vec_pool = vq->vec_pool;
1519 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
1520 	struct iovec *src_iovec = vec_pool;
1521 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
1522 	uint16_t slot_idx = 0;
1523 	uint16_t segs_await = 0;
1524 	uint16_t iovec_idx = 0, it_idx = 0;
1525 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
1526 	uint32_t n_pkts = 0, pkt_err = 0;
1527 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
1528 	struct {
1529 		uint16_t pkt_idx;
1530 		uint16_t last_avail_idx;
1531 	} async_pkts_log[MAX_PKT_BURST];
1532 
1533 	/*
1534 	 * The ordering between avail index and desc reads needs to be enforced.
1535 	 */
1536 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1537 
1538 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1539 
1540 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1541 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1542 		uint16_t nr_vec = 0;
1543 
1544 		if (unlikely(reserve_avail_buf_split(dev, vq,
1545 						pkt_len, buf_vec, &num_buffers,
1546 						avail_head, &nr_vec) < 0)) {
1547 			VHOST_LOG_DATA(DEBUG,
1548 				"(%d) failed to get enough desc from vring\n",
1549 				dev->vid);
1550 			vq->shadow_used_idx -= num_buffers;
1551 			break;
1552 		}
1553 
1554 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1555 			dev->vid, vq->last_avail_idx,
1556 			vq->last_avail_idx + num_buffers);
1557 
1558 		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers,
1559 				&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
1560 				&it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
1561 			vq->shadow_used_idx -= num_buffers;
1562 			break;
1563 		}
1564 
1565 		slot_idx = (vq->async_pkts_idx + num_async_pkts) &
1566 			(vq->size - 1);
1567 		if (it_pool[it_idx].count) {
1568 			uint16_t from, to;
1569 
1570 			async_fill_desc(&tdes[pkt_burst_idx++],
1571 				&it_pool[it_idx], &it_pool[it_idx + 1]);
1572 			pkts_info[slot_idx].descs = num_buffers;
1573 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
1574 			async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
1575 			async_pkts_log[num_async_pkts++].last_avail_idx =
1576 				vq->last_avail_idx;
1577 
1578 			iovec_idx += it_pool[it_idx].nr_segs;
1579 			it_idx += 2;
1580 
1581 			segs_await += it_pool[it_idx].nr_segs;
1582 
1583 			/**
1584 			 * recover shadow used ring and keep DMA-occupied
1585 			 * descriptors.
1586 			 */
1587 			from = vq->shadow_used_idx - num_buffers;
1588 			to = vq->async_desc_idx_split & (vq->size - 1);
1589 
1590 			store_dma_desc_info_split(vq->shadow_used_split,
1591 					vq->async_descs_split, vq->size, from, to, num_buffers);
1592 
1593 			vq->async_desc_idx_split += num_buffers;
1594 			vq->shadow_used_idx -= num_buffers;
1595 		} else
1596 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
1597 
1598 		vq->last_avail_idx += num_buffers;
1599 
1600 		/*
1601 		 * conditions to trigger async device transfer:
1602 		 * - buffered packet number reaches transfer threshold
1603 		 * - unused async iov number is less than max vhost vector
1604 		 */
1605 		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
1606 			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
1607 			BUF_VECTOR_MAX))) {
1608 			n_pkts = vq->async_ops.transfer_data(dev->vid,
1609 					queue_id, tdes, 0, pkt_burst_idx);
1610 			iovec_idx = 0;
1611 			it_idx = 0;
1612 
1613 			segs_await = 0;
1614 			vq->async_pkts_inflight_n += n_pkts;
1615 
1616 			if (unlikely(n_pkts < pkt_burst_idx)) {
1617 				/*
1618 				 * log error packets number here and do actual
1619 				 * error processing when applications poll
1620 				 * completion
1621 				 */
1622 				pkt_err = pkt_burst_idx - n_pkts;
1623 				pkt_burst_idx = 0;
1624 				break;
1625 			}
1626 
1627 			pkt_burst_idx = 0;
1628 		}
1629 	}
1630 
1631 	if (pkt_burst_idx) {
1632 		n_pkts = vq->async_ops.transfer_data(dev->vid,
1633 				queue_id, tdes, 0, pkt_burst_idx);
1634 		vq->async_pkts_inflight_n += n_pkts;
1635 
1636 		if (unlikely(n_pkts < pkt_burst_idx))
1637 			pkt_err = pkt_burst_idx - n_pkts;
1638 	}
1639 
1640 	do_data_copy_enqueue(dev, vq);
1641 
1642 	if (unlikely(pkt_err)) {
1643 		uint16_t num_descs = 0;
1644 
1645 		num_async_pkts -= pkt_err;
1646 		/* calculate the sum of descriptors of DMA-error packets. */
1647 		while (pkt_err-- > 0) {
1648 			num_descs += pkts_info[slot_idx & (vq->size - 1)].descs;
1649 			slot_idx--;
1650 		}
1651 		vq->async_desc_idx_split -= num_descs;
1652 		/* recover shadow used ring and available ring */
1653 		vq->shadow_used_idx -= (vq->last_avail_idx -
1654 				async_pkts_log[num_async_pkts].last_avail_idx -
1655 				num_descs);
1656 		vq->last_avail_idx =
1657 			async_pkts_log[num_async_pkts].last_avail_idx;
1658 		pkt_idx = async_pkts_log[num_async_pkts].pkt_idx;
1659 		num_done_pkts = pkt_idx - num_async_pkts;
1660 	}
1661 
1662 	vq->async_pkts_idx += num_async_pkts;
1663 	*comp_count = num_done_pkts;
1664 
1665 	if (likely(vq->shadow_used_idx)) {
1666 		flush_shadow_used_ring_split(dev, vq);
1667 		vhost_vring_call_split(dev, vq);
1668 	}
1669 
1670 	return pkt_idx;
1671 }
1672 
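/*
 * Write 'count' shadowed used entries back to the packed descriptor ring;
 * as in the other flush helpers, the head descriptor's flags are stored
 * last.
 */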
1673 static __rte_always_inline void
1674 vhost_update_used_packed(struct vhost_virtqueue *vq,
1675 			struct vring_used_elem_packed *shadow_ring,
1676 			uint16_t count)
1677 {
1678 	int i;
1679 	uint16_t used_idx = vq->last_used_idx;
1680 	uint16_t head_idx = vq->last_used_idx;
1681 	uint16_t head_flags = 0;
1682 
1683 	if (count == 0)
1684 		return;
1685 
1686 	/* Split loop in two to save memory barriers */
1687 	for (i = 0; i < count; i++) {
1688 		vq->desc_packed[used_idx].id = shadow_ring[i].id;
1689 		vq->desc_packed[used_idx].len = shadow_ring[i].len;
1690 
1691 		used_idx += shadow_ring[i].count;
1692 		if (used_idx >= vq->size)
1693 			used_idx -= vq->size;
1694 	}
1695 
1696 	/* The ordering for storing desc flags needs to be enforced. */
1697 	rte_atomic_thread_fence(__ATOMIC_RELEASE);
1698 
1699 	for (i = 0; i < count; i++) {
1700 		uint16_t flags;
1701 
1702 		if (vq->shadow_used_packed[i].len)
1703 			flags = VRING_DESC_F_WRITE;
1704 		else
1705 			flags = 0;
1706 
1707 		if (vq->used_wrap_counter) {
1708 			flags |= VRING_DESC_F_USED;
1709 			flags |= VRING_DESC_F_AVAIL;
1710 		} else {
1711 			flags &= ~VRING_DESC_F_USED;
1712 			flags &= ~VRING_DESC_F_AVAIL;
1713 		}
1714 
1715 		if (i > 0) {
1716 			vq->desc_packed[vq->last_used_idx].flags = flags;
1717 		} else {
1718 			head_idx = vq->last_used_idx;
1719 			head_flags = flags;
1720 		}
1721 
1722 		vq_inc_last_used_packed(vq, shadow_ring[i].count);
1723 	}
1724 
1725 	vq->desc_packed[head_idx].flags = head_flags;
1726 }
1727 
1728 static __rte_always_inline int
1729 virtio_dev_rx_async_batch_packed(struct virtio_net *dev,
1730 			   struct vhost_virtqueue *vq,
1731 			   struct rte_mbuf **pkts,
1732 			   struct rte_mbuf **comp_pkts, uint32_t *pkt_done)
1733 {
1734 	uint16_t i;
1735 	uint32_t cpy_threshold = vq->async_threshold;
1736 
1737 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1738 		if (unlikely(pkts[i]->pkt_len >= cpy_threshold))
1739 			return -1;
1740 	}
1741 	if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) {
1742 		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1743 			comp_pkts[(*pkt_done)++] = pkts[i];
1744 
1745 		return 0;
1746 	}
1747 
1748 	return -1;
1749 }
1750 
1751 static __rte_always_inline int
1752 vhost_enqueue_async_single_packed(struct virtio_net *dev,
1753 			    struct vhost_virtqueue *vq,
1754 			    struct rte_mbuf *pkt,
1755 			    struct buf_vector *buf_vec,
1756 			    uint16_t *nr_descs,
1757 			    uint16_t *nr_buffers,
1758 			    struct vring_packed_desc *async_descs,
1759 			    struct iovec *src_iovec, struct iovec *dst_iovec,
1760 			    struct rte_vhost_iov_iter *src_it,
1761 			    struct rte_vhost_iov_iter *dst_it)
1762 {
1763 	uint16_t nr_vec = 0;
1764 	uint16_t avail_idx = vq->last_avail_idx;
1765 	uint16_t max_tries, tries = 0;
1766 	uint16_t buf_id = 0;
1767 	uint32_t len = 0;
1768 	uint16_t desc_count = 0;
1769 	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1770 	uint32_t buffer_len[vq->size];
1771 	uint16_t buffer_buf_id[vq->size];
1772 	uint16_t buffer_desc_count[vq->size];
1773 
1774 	if (rxvq_is_mergeable(dev))
1775 		max_tries = vq->size - 1;
1776 	else
1777 		max_tries = 1;
1778 
1779 	while (size > 0) {
1780 		/*
1781 		 * if we tried all available ring items and still
1782 		 * cannot get enough buffers, something abnormal
1783 		 * happened.
1784 		 */
1785 		if (unlikely(++tries > max_tries))
1786 			return -1;
1787 
1788 		if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx, &desc_count, buf_vec, &nr_vec,
1789 						&buf_id, &len, VHOST_ACCESS_RW) < 0))
1790 			return -1;
1791 
1792 		len = RTE_MIN(len, size);
1793 		size -= len;
1794 
1795 		buffer_len[*nr_buffers] = len;
1796 		buffer_buf_id[*nr_buffers] = buf_id;
1797 		buffer_desc_count[*nr_buffers] = desc_count;
1798 		*nr_buffers += 1;
1799 
1800 		*nr_descs += desc_count;
1801 		avail_idx += desc_count;
1802 		if (avail_idx >= vq->size)
1803 			avail_idx -= vq->size;
1804 	}
1805 
1806 	if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, src_iovec, dst_iovec,
1807 			src_it, dst_it) < 0)
1808 		return -1;
1809 	/* store descriptors for DMA */
1810 	if (avail_idx >= *nr_descs) {
1811 		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
1812 			*nr_descs * sizeof(struct vring_packed_desc));
1813 	} else {
1814 		uint16_t nr_copy = vq->size - vq->last_avail_idx;
1815 
1816 		rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
1817 			nr_copy * sizeof(struct vring_packed_desc));
1818 		rte_memcpy(async_descs + nr_copy, vq->desc_packed,
1819 			(*nr_descs - nr_copy) * sizeof(struct vring_packed_desc));
1820 	}
1821 
1822 	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
1823 
1824 	return 0;
1825 }
1826 
1827 static __rte_always_inline int16_t
1828 virtio_dev_rx_async_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
1829 			    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers,
1830 			    struct vring_packed_desc *async_descs,
1831 			    struct iovec *src_iovec, struct iovec *dst_iovec,
1832 			    struct rte_vhost_iov_iter *src_it, struct rte_vhost_iov_iter *dst_it)
1833 {
1834 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1835 
1836 	if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec, nr_descs, nr_buffers,
1837 						 async_descs, src_iovec, dst_iovec,
1838 						 src_it, dst_it) < 0)) {
1839 		VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n", dev->vid);
1840 		return -1;
1841 	}
1842 
1843 	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1844 			dev->vid, vq->last_avail_idx, vq->last_avail_idx + *nr_descs);
1845 
1846 	return 0;
1847 }
1848 
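/*
 * Roll back the ring state for packets whose async transfer was not
 * accepted: restore the saved descriptors and rewind the avail index.
 */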
1849 static __rte_always_inline void
1850 dma_error_handler_packed(struct vhost_virtqueue *vq, struct vring_packed_desc *async_descs,
1851 			uint16_t async_descs_idx, uint16_t slot_idx, uint32_t nr_err,
1852 			uint32_t *pkt_idx, uint32_t *num_async_pkts, uint32_t *num_done_pkts)
1853 {
1854 	uint16_t descs_err = 0;
1855 	uint16_t buffers_err = 0;
1856 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
1857 
1858 	*num_async_pkts -= nr_err;
1859 	*pkt_idx -= nr_err;
1860 	/* calculate the sum of buffers and descs of DMA-error packets. */
1861 	while (nr_err-- > 0) {
1862 		descs_err += pkts_info[slot_idx % vq->size].descs;
1863 		buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
1864 		slot_idx--;
1865 	}
1866 
1867 	vq->async_buffer_idx_packed -= buffers_err;
1868 
1869 	if (vq->last_avail_idx >= descs_err) {
1870 		vq->last_avail_idx -= descs_err;
1871 
1872 		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
1873 			&async_descs[async_descs_idx - descs_err],
1874 			descs_err * sizeof(struct vring_packed_desc));
1875 	} else {
1876 		uint16_t nr_copy;
1877 
1878 		vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
1879 		nr_copy = vq->size - vq->last_avail_idx;
1880 		rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
1881 			&async_descs[async_descs_idx - descs_err],
1882 			nr_copy * sizeof(struct vring_packed_desc));
1883 		descs_err -= nr_copy;
1884 		rte_memcpy(&vq->desc_packed[0], &async_descs[async_descs_idx - descs_err],
1885 			descs_err * sizeof(struct vring_packed_desc));
1886 		vq->avail_wrap_counter ^= 1;
1887 	}
1888 
1889 	*num_done_pkts = *pkt_idx - *num_async_pkts;
1890 }
1891 
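/*
 * Async enqueue path for packed rings.  Full batches taken by the CPU-copy
 * fast path, as well as packets whose iovec iterator ends up empty (fully
 * copied by the CPU), are completed immediately and reported via comp_pkts.
 * The remaining packets are described as src/dst iovec pairs and handed to
 * the async copy device in bursts, either when VHOST_ASYNC_BATCH_THRESHOLD
 * packets are pending or when the iovec pool is nearly exhausted.
 * Descriptors handed to the device are backed up in async_descs[] so that a
 * failed transfer can be rolled back by dma_error_handler_packed().
 */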
1892 static __rte_noinline uint32_t
1893 virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
1894 	struct vhost_virtqueue *vq, uint16_t queue_id,
1895 	struct rte_mbuf **pkts, uint32_t count,
1896 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
1897 {
1898 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
1899 	uint32_t remained = count;
1900 	uint16_t async_descs_idx = 0;
1901 	uint16_t num_buffers;
1902 	uint16_t num_descs;
1903 
1904 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
1905 	struct iovec *vec_pool = vq->vec_pool;
1906 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
1907 	struct iovec *src_iovec = vec_pool;
1908 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
1909 	uint16_t slot_idx = 0;
1910 	uint16_t segs_await = 0;
1911 	uint16_t iovec_idx = 0, it_idx = 0;
1912 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
1913 	uint32_t n_pkts = 0, pkt_err = 0;
1914 	uint32_t num_async_pkts = 0, num_done_pkts = 0;
1915 	struct vring_packed_desc async_descs[vq->size];
1916 
1917 	do {
1918 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1919 		if (remained >= PACKED_BATCH_SIZE) {
1920 			if (!virtio_dev_rx_async_batch_packed(dev, vq,
1921 				&pkts[pkt_idx], comp_pkts, &num_done_pkts)) {
1922 				pkt_idx += PACKED_BATCH_SIZE;
1923 				remained -= PACKED_BATCH_SIZE;
1924 				continue;
1925 			}
1926 		}
1927 
1928 		num_buffers = 0;
1929 		num_descs = 0;
1930 		if (unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
1931 						&num_descs, &num_buffers,
1932 						&async_descs[async_descs_idx],
1933 						&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
1934 						&it_pool[it_idx], &it_pool[it_idx + 1]) < 0))
1935 			break;
1936 
1937 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1938 			dev->vid, vq->last_avail_idx,
1939 			vq->last_avail_idx + num_descs);
1940 
1941 		slot_idx = (vq->async_pkts_idx + num_async_pkts) % vq->size;
1942 		if (it_pool[it_idx].count) {
1943 			uint16_t from, to;
1944 
1945 			async_descs_idx += num_descs;
1946 			async_fill_desc(&tdes[pkt_burst_idx++],
1947 				&it_pool[it_idx], &it_pool[it_idx + 1]);
1948 			pkts_info[slot_idx].descs = num_descs;
1949 			pkts_info[slot_idx].nr_buffers = num_buffers;
1950 			pkts_info[slot_idx].mbuf = pkts[pkt_idx];
1951 			num_async_pkts++;
1952 			iovec_idx += it_pool[it_idx].nr_segs;
1953 			it_idx += 2;
1954 
1955 			segs_await += it_pool[it_idx - 2].nr_segs;
1956 
1957 			/*
1958 			 * Pull the DMA-occupied buffers out of the shadow used
1959 			 * ring; they are written back once the copies complete.
1960 			 */
1961 			from = vq->shadow_used_idx - num_buffers;
1962 			to = vq->async_buffer_idx_packed % vq->size;
1963 			store_dma_desc_info_packed(vq->shadow_used_packed,
1964 					vq->async_buffers_packed, vq->size, from, to, num_buffers);
1965 
1966 			vq->async_buffer_idx_packed += num_buffers;
1967 			vq->shadow_used_idx -= num_buffers;
1968 		} else {
1969 			comp_pkts[num_done_pkts++] = pkts[pkt_idx];
1970 		}
1971 
1972 		pkt_idx++;
1973 		remained--;
1974 		vq_inc_last_avail_packed(vq, num_descs);
1975 
1976 		/*
1977 		 * Conditions to kick off the async device transfer:
1978 		 * - the number of buffered packets reaches the transfer threshold
1979 		 * - fewer than one buffer vector worth of async iovec slots remains
1980 		 */
1981 		if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
1982 			((VHOST_MAX_ASYNC_VEC >> 1) - segs_await < BUF_VECTOR_MAX))) {
1983 			n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
1984 				tdes, 0, pkt_burst_idx);
1985 			iovec_idx = 0;
1986 			it_idx = 0;
1987 			segs_await = 0;
1988 			vq->async_pkts_inflight_n += n_pkts;
1989 
1990 			if (unlikely(n_pkts < pkt_burst_idx)) {
1991 				/*
1992 				 * log error packets number here and do actual
1993 				 * error processing when applications poll
1994 				 * completion
1995 				 */
1996 				pkt_err = pkt_burst_idx - n_pkts;
1997 				pkt_burst_idx = 0;
1998 				break;
1999 			}
2000 
2001 			pkt_burst_idx = 0;
2002 		}
2003 	} while (pkt_idx < count);
2004 
2005 	if (pkt_burst_idx) {
2006 		n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
2007 		vq->async_pkts_inflight_n += n_pkts;
2008 
2009 		if (unlikely(n_pkts < pkt_burst_idx))
2010 			pkt_err = pkt_burst_idx - n_pkts;
2011 	}
2012 
2013 	do_data_copy_enqueue(dev, vq);
2014 
2015 	if (unlikely(pkt_err))
2016 		dma_error_handler_packed(vq, async_descs, async_descs_idx, slot_idx, pkt_err,
2017 					&pkt_idx, &num_async_pkts, &num_done_pkts);
2018 	vq->async_pkts_idx += num_async_pkts;
2019 	*comp_count = num_done_pkts;
2020 
2021 	if (likely(vq->shadow_used_idx)) {
2022 		vhost_flush_enqueue_shadow_packed(dev, vq);
2023 		vhost_vring_call_packed(dev, vq);
2024 	}
2025 
2026 	return pkt_idx;
2027 }
2028 
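/*
 * Copy the used-ring entries of completed async descriptors from the
 * async_descs_split backlog into the split ring's used ring, splitting each
 * memcpy where either the backlog or the used ring wraps; e.g. (illustrative)
 * with vq->size = 256, last_async_desc_idx_split at 250 and n_descs = 10,
 * the first pass writes 6 entries and the second pass the remaining 4
 * starting from backlog index 0.
 */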
2029 static __rte_always_inline void
2030 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
2031 {
2032 	uint16_t nr_left = n_descs;
2033 	uint16_t nr_copy;
2034 	uint16_t to, from;
2035 
2036 	do {
2037 		from = vq->last_async_desc_idx_split & (vq->size - 1);
2038 		nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
2039 		to = vq->last_used_idx & (vq->size - 1);
2040 
2041 		if (to + nr_copy <= vq->size) {
2042 			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
2043 					nr_copy * sizeof(struct vring_used_elem));
2044 		} else {
2045 			uint16_t size = vq->size - to;
2046 
2047 			rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
2048 					size * sizeof(struct vring_used_elem));
2049 			rte_memcpy(&vq->used->ring[0], &vq->async_descs_split[from + size],
2050 					(nr_copy - size) * sizeof(struct vring_used_elem));
2051 		}
2052 
2053 		vq->last_async_desc_idx_split += nr_copy;
2054 		vq->last_used_idx += nr_copy;
2055 		nr_left -= nr_copy;
2056 	} while (nr_left > 0);
2057 }
2058 
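/*
 * Packed-ring counterpart of the function above: flush the completed async
 * buffers from async_buffers_packed into the used ring through
 * vhost_update_used_packed(), in at most two chunks when the region wraps
 * around the end of the backlog.
 */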
2059 static __rte_always_inline void
2060 write_back_completed_descs_packed(struct vhost_virtqueue *vq,
2061 				uint16_t n_buffers)
2062 {
2063 	uint16_t nr_left = n_buffers;
2064 	uint16_t from, to;
2065 
2066 	do {
2067 		from = vq->last_async_buffer_idx_packed % vq->size;
2068 		to = (from + nr_left) % vq->size;
2069 		if (to > from) {
2070 			vhost_update_used_packed(vq, vq->async_buffers_packed + from, to - from);
2071 			vq->last_async_buffer_idx_packed += nr_left;
2072 			nr_left = 0;
2073 		} else {
2074 			vhost_update_used_packed(vq, vq->async_buffers_packed + from,
2075 				vq->size - from);
2076 			vq->last_async_buffer_idx_packed += vq->size - from;
2077 			nr_left -= vq->size - from;
2078 		}
2079 	} while (nr_left > 0);
2080 }
2081 
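/*
 * Poll the async copy device for completed enqueue copies, write the
 * corresponding used-ring entries back, and return ownership of the finished
 * mbufs to the caller.  A minimal polling loop could look like the sketch
 * below (illustrative only; "done_pkts" and the burst size are
 * application-side choices):
 *
 *	struct rte_mbuf *done_pkts[32];
 *	uint16_t n;
 *
 *	n = rte_vhost_poll_enqueue_completed(vid, queue_id, done_pkts, 32);
 *	if (n)
 *		rte_pktmbuf_free_bulk(done_pkts, n);
 */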
2082 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
2083 		struct rte_mbuf **pkts, uint16_t count)
2084 {
2085 	struct virtio_net *dev = get_device(vid);
2086 	struct vhost_virtqueue *vq;
2087 	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
2088 	uint16_t start_idx, pkts_idx, vq_size;
2089 	struct async_inflight_info *pkts_info;
2090 	uint16_t from, i;
2091 
2092 	if (!dev)
2093 		return 0;
2094 
2095 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2096 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2097 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2098 			dev->vid, __func__, queue_id);
2099 		return 0;
2100 	}
2101 
2102 	vq = dev->virtqueue[queue_id];
2103 
2104 	if (unlikely(!vq->async_registered)) {
2105 		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
2106 			dev->vid, __func__, queue_id);
2107 		return 0;
2108 	}
2109 
2110 	rte_spinlock_lock(&vq->access_lock);
2111 
2112 	pkts_idx = vq->async_pkts_idx % vq->size;
2113 	pkts_info = vq->async_pkts_info;
2114 	vq_size = vq->size;
2115 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
2116 		vq_size, vq->async_pkts_inflight_n);
2117 
2118 	if (count > vq->async_last_pkts_n)
2119 		n_pkts_cpl = vq->async_ops.check_completed_copies(vid,
2120 			queue_id, 0, count - vq->async_last_pkts_n);
2121 	n_pkts_cpl += vq->async_last_pkts_n;
2122 
2123 	n_pkts_put = RTE_MIN(count, n_pkts_cpl);
2124 	if (unlikely(n_pkts_put == 0)) {
2125 		vq->async_last_pkts_n = n_pkts_cpl;
2126 		goto done;
2127 	}
2128 
2129 	if (vq_is_packed(dev)) {
2130 		for (i = 0; i < n_pkts_put; i++) {
2131 			from = (start_idx + i) & (vq_size - 1);
2132 			n_buffers += pkts_info[from].nr_buffers;
2133 			pkts[i] = pkts_info[from].mbuf;
2134 		}
2135 	} else {
2136 		for (i = 0; i < n_pkts_put; i++) {
2137 			from = (start_idx + i) & (vq_size - 1);
2138 			n_descs += pkts_info[from].descs;
2139 			pkts[i] = pkts_info[from].mbuf;
2140 		}
2141 	}
2142 
2143 	vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
2144 	vq->async_pkts_inflight_n -= n_pkts_put;
2145 
2146 	if (likely(vq->enabled && vq->access_ok)) {
2147 		if (vq_is_packed(dev)) {
2148 			write_back_completed_descs_packed(vq, n_buffers);
2149 
2150 			vhost_vring_call_packed(dev, vq);
2151 		} else {
2152 			write_back_completed_descs_split(vq, n_descs);
2153 
2154 			__atomic_add_fetch(&vq->used->idx, n_descs,
2155 					__ATOMIC_RELEASE);
2156 			vhost_vring_call_split(dev, vq);
2157 		}
2158 	} else {
2159 		if (vq_is_packed(dev))
2160 			vq->last_async_buffer_idx_packed += n_buffers;
2161 		else
2162 			vq->last_async_desc_idx_split += n_descs;
2163 	}
2164 
2165 done:
2166 	rte_spinlock_unlock(&vq->access_lock);
2167 
2168 	return n_pkts_put;
2169 }
2170 
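/*
 * Common entry point for the async enqueue paths: validate the RX virtqueue,
 * take the access lock (and the IOTLB read lock when VIRTIO_F_IOMMU_PLATFORM
 * is negotiated), translate the vring if needed, then dispatch to the packed
 * or split implementation.
 */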
2171 static __rte_always_inline uint32_t
2172 virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
2173 	struct rte_mbuf **pkts, uint32_t count,
2174 	struct rte_mbuf **comp_pkts, uint32_t *comp_count)
2175 {
2176 	struct vhost_virtqueue *vq;
2177 	uint32_t nb_tx = 0;
2178 
2179 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2180 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2181 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2182 			dev->vid, __func__, queue_id);
2183 		return 0;
2184 	}
2185 
2186 	vq = dev->virtqueue[queue_id];
2187 
2188 	rte_spinlock_lock(&vq->access_lock);
2189 
2190 	if (unlikely(!vq->enabled || !vq->async_registered))
2191 		goto out_access_unlock;
2192 
2193 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2194 		vhost_user_iotlb_rd_lock(vq);
2195 
2196 	if (unlikely(!vq->access_ok))
2197 		if (unlikely(vring_translate(dev, vq) < 0))
2198 			goto out;
2199 
2200 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
2201 	if (count == 0)
2202 		goto out;
2203 
2204 	if (vq_is_packed(dev))
2205 		nb_tx = virtio_dev_rx_async_submit_packed(dev,
2206 				vq, queue_id, pkts, count, comp_pkts,
2207 				comp_count);
2208 	else
2209 		nb_tx = virtio_dev_rx_async_submit_split(dev,
2210 				vq, queue_id, pkts, count, comp_pkts,
2211 				comp_count);
2212 
2213 out:
2214 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2215 		vhost_user_iotlb_rd_unlock(vq);
2216 
2217 out_access_unlock:
2218 	rte_spinlock_unlock(&vq->access_lock);
2219 
2220 	return nb_tx;
2221 }
2222 
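/*
 * Public API wrapping virtio_dev_rx_async_submit().  Packets copied
 * synchronously by the CPU are returned in comp_pkts/comp_count and may be
 * freed right away; the other accepted packets remain owned by the vhost
 * library until rte_vhost_poll_enqueue_completed() reports them done.
 * Illustrative usage sketch ("pkts", "comp_pkts" and the burst size are
 * application-side placeholders):
 *
 *	uint32_t comp = 0;
 *	uint16_t n = rte_vhost_submit_enqueue_burst(vid, queue_id, pkts, 32,
 *						    comp_pkts, &comp);
 *	if (comp)
 *		rte_pktmbuf_free_bulk(comp_pkts, comp);
 *	// (n - comp) packets are still in flight on the copy device
 */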
2223 uint16_t
2224 rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
2225 		struct rte_mbuf **pkts, uint16_t count,
2226 		struct rte_mbuf **comp_pkts, uint32_t *comp_count)
2227 {
2228 	struct virtio_net *dev = get_device(vid);
2229 
2230 	*comp_count = 0;
2231 	if (!dev)
2232 		return 0;
2233 
2234 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
2235 		VHOST_LOG_DATA(ERR,
2236 			"(%d) %s: built-in vhost net backend is disabled.\n",
2237 			dev->vid, __func__);
2238 		return 0;
2239 	}
2240 
2241 	return virtio_dev_rx_async_submit(dev, queue_id, pkts, count, comp_pkts,
2242 			comp_count);
2243 }
2244 
2245 static inline bool
2246 virtio_net_with_host_offload(struct virtio_net *dev)
2247 {
2248 	if (dev->features &
2249 			((1ULL << VIRTIO_NET_F_CSUM) |
2250 			 (1ULL << VIRTIO_NET_F_HOST_ECN) |
2251 			 (1ULL << VIRTIO_NET_F_HOST_TSO4) |
2252 			 (1ULL << VIRTIO_NET_F_HOST_TSO6) |
2253 			 (1ULL << VIRTIO_NET_F_HOST_UFO)))
2254 		return true;
2255 
2256 	return false;
2257 }
2258 
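/*
 * Walk the Ethernet (plus one optional VLAN tag) and IP headers of the mbuf
 * to set l2_len/l3_len and the IPv4/IPv6 Tx flags, and report the L4
 * protocol number and the location of the L4 header; used by the legacy
 * dequeue offload path below.
 */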
2259 static void
2260 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
2261 {
2262 	struct rte_ipv4_hdr *ipv4_hdr;
2263 	struct rte_ipv6_hdr *ipv6_hdr;
2264 	void *l3_hdr = NULL;
2265 	struct rte_ether_hdr *eth_hdr;
2266 	uint16_t ethertype;
2267 
2268 	eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
2269 
2270 	m->l2_len = sizeof(struct rte_ether_hdr);
2271 	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
2272 
2273 	if (ethertype == RTE_ETHER_TYPE_VLAN) {
2274 		struct rte_vlan_hdr *vlan_hdr =
2275 			(struct rte_vlan_hdr *)(eth_hdr + 1);
2276 
2277 		m->l2_len += sizeof(struct rte_vlan_hdr);
2278 		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
2279 	}
2280 
2281 	l3_hdr = (char *)eth_hdr + m->l2_len;
2282 
2283 	switch (ethertype) {
2284 	case RTE_ETHER_TYPE_IPV4:
2285 		ipv4_hdr = l3_hdr;
2286 		*l4_proto = ipv4_hdr->next_proto_id;
2287 		m->l3_len = rte_ipv4_hdr_len(ipv4_hdr);
2288 		*l4_hdr = (char *)l3_hdr + m->l3_len;
2289 		m->ol_flags |= PKT_TX_IPV4;
2290 		break;
2291 	case RTE_ETHER_TYPE_IPV6:
2292 		ipv6_hdr = l3_hdr;
2293 		*l4_proto = ipv6_hdr->proto;
2294 		m->l3_len = sizeof(struct rte_ipv6_hdr);
2295 		*l4_hdr = (char *)l3_hdr + m->l3_len;
2296 		m->ol_flags |= PKT_TX_IPV6;
2297 		break;
2298 	default:
2299 		m->l3_len = 0;
2300 		*l4_proto = 0;
2301 		*l4_hdr = NULL;
2302 		break;
2303 	}
2304 }
2305 
2306 static __rte_always_inline void
2307 vhost_dequeue_offload_legacy(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
2308 {
2309 	uint16_t l4_proto = 0;
2310 	void *l4_hdr = NULL;
2311 	struct rte_tcp_hdr *tcp_hdr = NULL;
2312 
2313 	parse_ethernet(m, &l4_proto, &l4_hdr);
2314 	if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2315 		if (hdr->csum_start == (m->l2_len + m->l3_len)) {
2316 			switch (hdr->csum_offset) {
2317 			case (offsetof(struct rte_tcp_hdr, cksum)):
2318 				if (l4_proto == IPPROTO_TCP)
2319 					m->ol_flags |= PKT_TX_TCP_CKSUM;
2320 				break;
2321 			case (offsetof(struct rte_udp_hdr, dgram_cksum)):
2322 				if (l4_proto == IPPROTO_UDP)
2323 					m->ol_flags |= PKT_TX_UDP_CKSUM;
2324 				break;
2325 			case (offsetof(struct rte_sctp_hdr, cksum)):
2326 				if (l4_proto == IPPROTO_SCTP)
2327 					m->ol_flags |= PKT_TX_SCTP_CKSUM;
2328 				break;
2329 			default:
2330 				break;
2331 			}
2332 		}
2333 	}
2334 
2335 	if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2336 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2337 		case VIRTIO_NET_HDR_GSO_TCPV4:
2338 		case VIRTIO_NET_HDR_GSO_TCPV6:
2339 			tcp_hdr = l4_hdr;
2340 			m->ol_flags |= PKT_TX_TCP_SEG;
2341 			m->tso_segsz = hdr->gso_size;
2342 			m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
2343 			break;
2344 		case VIRTIO_NET_HDR_GSO_UDP:
2345 			m->ol_flags |= PKT_TX_UDP_SEG;
2346 			m->tso_segsz = hdr->gso_size;
2347 			m->l4_len = sizeof(struct rte_udp_hdr);
2348 			break;
2349 		default:
2350 			VHOST_LOG_DATA(WARNING,
2351 				"unsupported gso type %u.\n", hdr->gso_type);
2352 			break;
2353 		}
2354 	}
2355 }
2356 
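/*
 * Translate the virtio-net header of a dequeued packet into mbuf offload
 * flags.  The legacy path keeps the historical behaviour of setting PKT_TX_*
 * flags; the compliant path sets PKT_RX_* flags as a real NIC would, computes
 * a software checksum when NEEDS_CSUM points at a protocol that cannot be
 * reported as offloaded, and fills tso_segsz for coalesced (LRO) packets.
 */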
2357 static __rte_always_inline void
2358 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m,
2359 	bool legacy_ol_flags)
2360 {
2361 	struct rte_net_hdr_lens hdr_lens;
2362 	int l4_supported = 0;
2363 	uint32_t ptype;
2364 
2365 	if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
2366 		return;
2367 
2368 	if (legacy_ol_flags) {
2369 		vhost_dequeue_offload_legacy(hdr, m);
2370 		return;
2371 	}
2372 
2373 	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
2374 
2375 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
2376 	m->packet_type = ptype;
2377 	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
2378 	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
2379 	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
2380 		l4_supported = 1;
2381 
2382 	/* According to Virtio 1.1 spec, the device only needs to look at
2383 	 * VIRTIO_NET_HDR_F_NEEDS_CSUM in the packet transmission path.
2384 	 * This differs from the packet reception path, where the driver
2385 	 * could rely on the VIRTIO_NET_HDR_F_DATA_VALID flag set by the
2386 	 * device.
2387 	 *
2388 	 * 5.1.6.2.1 Driver Requirements: Packet Transmission
2389 	 * The driver MUST NOT set the VIRTIO_NET_HDR_F_DATA_VALID and
2390 	 * VIRTIO_NET_HDR_F_RSC_INFO bits in flags.
2391 	 *
2392 	 * 5.1.6.2.2 Device Requirements: Packet Transmission
2393 	 * The device MUST ignore flag bits that it does not recognize.
2394 	 */
2395 	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2396 		uint32_t hdrlen;
2397 
2398 		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
2399 		if (hdr->csum_start <= hdrlen && l4_supported != 0) {
2400 			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
2401 		} else {
2402 			/* Unknown proto or tunnel, do sw cksum. We can assume
2403 			 * the cksum field is in the first segment since the
2404 			 * buffers we provided to the host are large enough.
2405 			 * In case of SCTP, this will be wrong since it's a CRC
2406 			 * but there's nothing we can do.
2407 			 */
2408 			uint16_t csum = 0, off;
2409 
2410 			if (rte_raw_cksum_mbuf(m, hdr->csum_start,
2411 					rte_pktmbuf_pkt_len(m) - hdr->csum_start, &csum) < 0)
2412 				return;
2413 			if (likely(csum != 0xffff))
2414 				csum = ~csum;
2415 			off = hdr->csum_offset + hdr->csum_start;
2416 			if (rte_pktmbuf_data_len(m) >= off + 1)
2417 				*rte_pktmbuf_mtod_offset(m, uint16_t *, off) = csum;
2418 		}
2419 	}
2420 
2421 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2422 		if (hdr->gso_size == 0)
2423 			return;
2424 
2425 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2426 		case VIRTIO_NET_HDR_GSO_TCPV4:
2427 		case VIRTIO_NET_HDR_GSO_TCPV6:
2428 			if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_TCP)
2429 				break;
2430 			m->ol_flags |= PKT_RX_LRO | PKT_RX_L4_CKSUM_NONE;
2431 			m->tso_segsz = hdr->gso_size;
2432 			break;
2433 		case VIRTIO_NET_HDR_GSO_UDP:
2434 			if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_UDP)
2435 				break;
2436 			m->ol_flags |= PKT_RX_LRO | PKT_RX_L4_CKSUM_NONE;
2437 			m->tso_segsz = hdr->gso_size;
2438 			break;
2439 		default:
2440 			break;
2441 		}
2442 	}
2443 }
2444 
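/*
 * Gather the virtio-net header into a local copy when it is not contiguous
 * in host virtual memory, i.e. when the first descriptor buffer is smaller
 * than the header and the header spills into the following buffers of the
 * chain.
 */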
2445 static __rte_noinline void
2446 copy_vnet_hdr_from_desc(struct virtio_net_hdr *hdr,
2447 		struct buf_vector *buf_vec)
2448 {
2449 	uint64_t len;
2450 	uint64_t remain = sizeof(struct virtio_net_hdr);
2451 	uint64_t src;
2452 	uint64_t dst = (uint64_t)(uintptr_t)hdr;
2453 
2454 	while (remain) {
2455 		len = RTE_MIN(remain, buf_vec->buf_len);
2456 		src = buf_vec->buf_addr;
2457 		rte_memcpy((void *)(uintptr_t)dst,
2458 				(void *)(uintptr_t)src, len);
2459 
2460 		remain -= len;
2461 		dst += len;
2462 		buf_vec++;
2463 	}
2464 }
2465 
2466 static __rte_always_inline int
2467 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
2468 		  struct buf_vector *buf_vec, uint16_t nr_vec,
2469 		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
2470 		  bool legacy_ol_flags)
2471 {
2472 	uint32_t buf_avail, buf_offset;
2473 	uint64_t buf_addr, buf_len;
2474 	uint32_t mbuf_avail, mbuf_offset;
2475 	uint32_t cpy_len;
2476 	struct rte_mbuf *cur = m, *prev = m;
2477 	struct virtio_net_hdr tmp_hdr;
2478 	struct virtio_net_hdr *hdr = NULL;
2479 	/* An index into buf_vec, bounded by nr_vec to avoid a desc dead loop */
2480 	uint16_t vec_idx = 0;
2481 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
2482 	int error = 0;
2483 
2484 	buf_addr = buf_vec[vec_idx].buf_addr;
2485 	buf_len = buf_vec[vec_idx].buf_len;
2486 
2487 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
2488 		error = -1;
2489 		goto out;
2490 	}
2491 
2492 	if (virtio_net_with_host_offload(dev)) {
2493 		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
2494 			/*
2495 			 * No luck, the virtio-net header doesn't fit
2496 			 * in a contiguous virtual area.
2497 			 */
2498 			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
2499 			hdr = &tmp_hdr;
2500 		} else {
2501 			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
2502 		}
2503 	}
2504 
2505 	/*
2506 	 * A virtio driver normally uses at least 2 desc buffers
2507 	 * for Tx: the first for storing the header, and others
2508 	 * for storing the data.
2509 	 */
2510 	if (unlikely(buf_len < dev->vhost_hlen)) {
2511 		buf_offset = dev->vhost_hlen - buf_len;
2512 		vec_idx++;
2513 		buf_addr = buf_vec[vec_idx].buf_addr;
2514 		buf_len = buf_vec[vec_idx].buf_len;
2515 		buf_avail  = buf_len - buf_offset;
2516 	} else if (buf_len == dev->vhost_hlen) {
2517 		if (unlikely(++vec_idx >= nr_vec))
2518 			goto out;
2519 		buf_addr = buf_vec[vec_idx].buf_addr;
2520 		buf_len = buf_vec[vec_idx].buf_len;
2521 
2522 		buf_offset = 0;
2523 		buf_avail = buf_len;
2524 	} else {
2525 		buf_offset = dev->vhost_hlen;
2526 		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
2527 	}
2528 
2529 	PRINT_PACKET(dev,
2530 			(uintptr_t)(buf_addr + buf_offset),
2531 			(uint32_t)buf_avail, 0);
2532 
2533 	mbuf_offset = 0;
2534 	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
2535 	while (1) {
2536 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
2537 
2538 		if (likely(cpy_len > MAX_BATCH_LEN ||
2539 					vq->batch_copy_nb_elems >= vq->size ||
2540 					(hdr && cur == m))) {
2541 			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
2542 						mbuf_offset),
2543 					(void *)((uintptr_t)(buf_addr +
2544 							buf_offset)), cpy_len);
2545 		} else {
2546 			batch_copy[vq->batch_copy_nb_elems].dst =
2547 				rte_pktmbuf_mtod_offset(cur, void *,
2548 						mbuf_offset);
2549 			batch_copy[vq->batch_copy_nb_elems].src =
2550 				(void *)((uintptr_t)(buf_addr + buf_offset));
2551 			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
2552 			vq->batch_copy_nb_elems++;
2553 		}
2554 
2555 		mbuf_avail  -= cpy_len;
2556 		mbuf_offset += cpy_len;
2557 		buf_avail -= cpy_len;
2558 		buf_offset += cpy_len;
2559 
2560 		/* This buf has reached its end, get the next one */
2561 		if (buf_avail == 0) {
2562 			if (++vec_idx >= nr_vec)
2563 				break;
2564 
2565 			buf_addr = buf_vec[vec_idx].buf_addr;
2566 			buf_len = buf_vec[vec_idx].buf_len;
2567 
2568 			buf_offset = 0;
2569 			buf_avail  = buf_len;
2570 
2571 			PRINT_PACKET(dev, (uintptr_t)buf_addr,
2572 					(uint32_t)buf_avail, 0);
2573 		}
2574 
2575 		/*
2576 		 * This mbuf has reached its end; allocate a new one
2577 		 * to hold more data.
2578 		 */
2579 		if (mbuf_avail == 0) {
2580 			cur = rte_pktmbuf_alloc(mbuf_pool);
2581 			if (unlikely(cur == NULL)) {
2582 				VHOST_LOG_DATA(ERR, "Failed to "
2583 					"allocate memory for mbuf.\n");
2584 				error = -1;
2585 				goto out;
2586 			}
2587 
2588 			prev->next = cur;
2589 			prev->data_len = mbuf_offset;
2590 			m->nb_segs += 1;
2591 			m->pkt_len += mbuf_offset;
2592 			prev = cur;
2593 
2594 			mbuf_offset = 0;
2595 			mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
2596 		}
2597 	}
2598 
2599 	prev->data_len = mbuf_offset;
2600 	m->pkt_len    += mbuf_offset;
2601 
2602 	if (hdr)
2603 		vhost_dequeue_offload(hdr, m, legacy_ol_flags);
2604 
2605 out:
2606 
2607 	return error;
2608 }
2609 
2610 static void
2611 virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
2612 {
2613 	rte_free(opaque);
2614 }
2615 
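/*
 * Attach an external, rte_malloc()'d data buffer to the mbuf so that a
 * packet larger than the mbuf's own data room can still be received without
 * chaining.  The single allocation roughly lays out as
 * [headroom | packet data | shared info], with the shared info placed at the
 * tail by rte_pktmbuf_ext_shinfo_init_helper().
 */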
2616 static int
2617 virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
2618 {
2619 	struct rte_mbuf_ext_shared_info *shinfo = NULL;
2620 	uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
2621 	uint16_t buf_len;
2622 	rte_iova_t iova;
2623 	void *buf;
2624 
2625 	total_len += sizeof(*shinfo) + sizeof(uintptr_t);
2626 	total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
2627 
2628 	if (unlikely(total_len > UINT16_MAX))
2629 		return -ENOSPC;
2630 
2631 	buf_len = total_len;
2632 	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
2633 	if (unlikely(buf == NULL))
2634 		return -ENOMEM;
2635 
2636 	/* Initialize shinfo */
2637 	shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
2638 						virtio_dev_extbuf_free, buf);
2639 	if (unlikely(shinfo == NULL)) {
2640 		rte_free(buf);
2641 		VHOST_LOG_DATA(ERR, "Failed to init shinfo\n");
2642 		return -1;
2643 	}
2644 
2645 	iova = rte_malloc_virt2iova(buf);
2646 	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
2647 	rte_pktmbuf_reset_headroom(pkt);
2648 
2649 	return 0;
2650 }
2651 
2652 static __rte_always_inline int
2653 virtio_dev_pktmbuf_prep(struct virtio_net *dev, struct rte_mbuf *pkt,
2654 			 uint32_t data_len)
2655 {
2656 	if (rte_pktmbuf_tailroom(pkt) >= data_len)
2657 		return 0;
2658 
2659 	/* attach an external buffer if supported */
2660 	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
2661 		return 0;
2662 
2663 	/* check if chained buffers are allowed */
2664 	if (!dev->linearbuf)
2665 		return 0;
2666 
2667 	return -1;
2668 }
2669 
2670 /*
2671  * Allocate a host-supported pktmbuf.
2672  */
2673 static __rte_always_inline struct rte_mbuf *
2674 virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
2675 			 uint32_t data_len)
2676 {
2677 	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
2678 
2679 	if (unlikely(pkt == NULL)) {
2680 		VHOST_LOG_DATA(ERR,
2681 			"Failed to allocate memory for mbuf.\n");
2682 		return NULL;
2683 	}
2684 
2685 	if (virtio_dev_pktmbuf_prep(dev, pkt, data_len)) {
2686 		/* Data doesn't fit into the buffer and the host supports
2687 		 * only linear buffers
2688 		 */
2689 		rte_pktmbuf_free(pkt);
2690 		return NULL;
2691 	}
2692 
2693 	return pkt;
2694 }
2695 
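/*
 * Dequeue loop for split rings: read the avail index with acquire semantics,
 * gather each descriptor chain into buf_vec, allocate an mbuf sized for the
 * chain, copy the data out, and record the head index in the shadow used
 * ring, which is flushed (and the guest notified) once the burst is done.
 */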
2696 __rte_always_inline
2697 static uint16_t
2698 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
2699 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
2700 	bool legacy_ol_flags)
2701 {
2702 	uint16_t i;
2703 	uint16_t free_entries;
2704 	uint16_t dropped = 0;
2705 	static bool allocerr_warned;
2706 
2707 	/*
2708 	 * The ordering between avail index and
2709 	 * desc reads needs to be enforced.
2710 	 */
2711 	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
2712 			vq->last_avail_idx;
2713 	if (free_entries == 0)
2714 		return 0;
2715 
2716 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
2717 
2718 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2719 
2720 	count = RTE_MIN(count, MAX_PKT_BURST);
2721 	count = RTE_MIN(count, free_entries);
2722 	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
2723 			dev->vid, count);
2724 
2725 	for (i = 0; i < count; i++) {
2726 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
2727 		uint16_t head_idx;
2728 		uint32_t buf_len;
2729 		uint16_t nr_vec = 0;
2730 		int err;
2731 
2732 		if (unlikely(fill_vec_buf_split(dev, vq,
2733 						vq->last_avail_idx + i,
2734 						&nr_vec, buf_vec,
2735 						&head_idx, &buf_len,
2736 						VHOST_ACCESS_RO) < 0))
2737 			break;
2738 
2739 		update_shadow_used_ring_split(vq, head_idx, 0);
2740 
2741 		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
2742 		if (unlikely(pkts[i] == NULL)) {
2743 			/*
2744 			 * mbuf allocation fails for jumbo packets when external
2745 			 * buffer allocation is not allowed and a linear buffer
2746 			 * is required. Drop this packet.
2747 			 */
2748 			if (!allocerr_warned) {
2749 				VHOST_LOG_DATA(ERR,
2750 					"Failed mbuf alloc of size %d from %s on %s.\n",
2751 					buf_len, mbuf_pool->name, dev->ifname);
2752 				allocerr_warned = true;
2753 			}
2754 			dropped += 1;
2755 			i++;
2756 			break;
2757 		}
2758 
2759 		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
2760 				mbuf_pool, legacy_ol_flags);
2761 		if (unlikely(err)) {
2762 			rte_pktmbuf_free(pkts[i]);
2763 			if (!allocerr_warned) {
2764 				VHOST_LOG_DATA(ERR,
2765 					"Failed to copy desc to mbuf on %s.\n",
2766 					dev->ifname);
2767 				allocerr_warned = true;
2768 			}
2769 			dropped += 1;
2770 			i++;
2771 			break;
2772 		}
2773 	}
2774 
2775 	vq->last_avail_idx += i;
2776 
2777 	do_data_copy_dequeue(vq);
2778 	if (unlikely(i < count))
2779 		vq->shadow_used_idx = i;
2780 	if (likely(vq->shadow_used_idx)) {
2781 		flush_shadow_used_ring_split(dev, vq);
2782 		vhost_vring_call_split(dev, vq);
2783 	}
2784 
2785 	return (i - dropped);
2786 }
2787 
2788 __rte_noinline
2789 static uint16_t
2790 virtio_dev_tx_split_legacy(struct virtio_net *dev,
2791 	struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
2792 	struct rte_mbuf **pkts, uint16_t count)
2793 {
2794 	return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, true);
2795 }
2796 
2797 __rte_noinline
2798 static uint16_t
2799 virtio_dev_tx_split_compliant(struct virtio_net *dev,
2800 	struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
2801 	struct rte_mbuf **pkts, uint16_t count)
2802 {
2803 	return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, false);
2804 }
2805 
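/*
 * Check whether the next PACKED_BATCH_SIZE descriptors can be dequeued as a
 * single batch: all of them must be available for the current wrap counter,
 * not flagged for single dequeue, translate to valid host addresses of
 * unchanged length, and fit into the preallocated mbufs.  On success the
 * descriptor addresses and ids are returned for the batched copy done by
 * virtio_dev_tx_batch_packed().
 */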
2806 static __rte_always_inline int
2807 vhost_reserve_avail_batch_packed(struct virtio_net *dev,
2808 				 struct vhost_virtqueue *vq,
2809 				 struct rte_mbuf **pkts,
2810 				 uint16_t avail_idx,
2811 				 uintptr_t *desc_addrs,
2812 				 uint16_t *ids)
2813 {
2814 	bool wrap = vq->avail_wrap_counter;
2815 	struct vring_packed_desc *descs = vq->desc_packed;
2816 	uint64_t lens[PACKED_BATCH_SIZE];
2817 	uint64_t buf_lens[PACKED_BATCH_SIZE];
2818 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2819 	uint16_t flags, i;
2820 
2821 	if (unlikely(avail_idx & PACKED_BATCH_MASK))
2822 		return -1;
2823 	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
2824 		return -1;
2825 
2826 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2827 		flags = descs[avail_idx + i].flags;
2828 		if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
2829 			     (wrap == !!(flags & VRING_DESC_F_USED))  ||
2830 			     (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
2831 			return -1;
2832 	}
2833 
2834 	rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
2835 
2836 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2837 		lens[i] = descs[avail_idx + i].len;
2838 
2839 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2840 		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
2841 						  descs[avail_idx + i].addr,
2842 						  &lens[i], VHOST_ACCESS_RW);
2843 	}
2844 
2845 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2846 		if (unlikely(!desc_addrs[i]))
2847 			return -1;
2848 		if (unlikely((lens[i] != descs[avail_idx + i].len)))
2849 			return -1;
2850 	}
2851 
2852 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2853 		if (virtio_dev_pktmbuf_prep(dev, pkts[i], lens[i]))
2854 			goto err;
2855 	}
2856 
2857 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2858 		buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
2859 
2860 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2861 		if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
2862 			goto err;
2863 	}
2864 
2865 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2866 		pkts[i]->pkt_len = lens[i] - buf_offset;
2867 		pkts[i]->data_len = pkts[i]->pkt_len;
2868 		ids[i] = descs[avail_idx + i].id;
2869 	}
2870 
2871 	return 0;
2872 
2873 err:
2874 	return -1;
2875 }
2876 
2877 static __rte_always_inline int
2878 virtio_dev_tx_batch_packed(struct virtio_net *dev,
2879 			   struct vhost_virtqueue *vq,
2880 			   struct rte_mbuf **pkts,
2881 			   bool legacy_ol_flags)
2882 {
2883 	uint16_t avail_idx = vq->last_avail_idx;
2884 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2885 	struct virtio_net_hdr *hdr;
2886 	uintptr_t desc_addrs[PACKED_BATCH_SIZE];
2887 	uint16_t ids[PACKED_BATCH_SIZE];
2888 	uint16_t i;
2889 
2890 	if (vhost_reserve_avail_batch_packed(dev, vq, pkts, avail_idx,
2891 					     desc_addrs, ids))
2892 		return -1;
2893 
2894 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2895 		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
2896 
2897 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2898 		rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
2899 			   (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
2900 			   pkts[i]->pkt_len);
2901 
2902 	if (virtio_net_with_host_offload(dev)) {
2903 		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2904 			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
2905 			vhost_dequeue_offload(hdr, pkts[i], legacy_ol_flags);
2906 		}
2907 	}
2908 
2909 	if (virtio_net_is_inorder(dev))
2910 		vhost_shadow_dequeue_batch_packed_inorder(vq,
2911 			ids[PACKED_BATCH_SIZE - 1]);
2912 	else
2913 		vhost_shadow_dequeue_batch_packed(dev, vq, ids);
2914 
2915 	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
2916 
2917 	return 0;
2918 }
2919 
2920 static __rte_always_inline int
2921 vhost_dequeue_single_packed(struct virtio_net *dev,
2922 			    struct vhost_virtqueue *vq,
2923 			    struct rte_mempool *mbuf_pool,
2924 			    struct rte_mbuf *pkts,
2925 			    uint16_t *buf_id,
2926 			    uint16_t *desc_count,
2927 			    bool legacy_ol_flags)
2928 {
2929 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
2930 	uint32_t buf_len;
2931 	uint16_t nr_vec = 0;
2932 	int err;
2933 	static bool allocerr_warned;
2934 
2935 	if (unlikely(fill_vec_buf_packed(dev, vq,
2936 					 vq->last_avail_idx, desc_count,
2937 					 buf_vec, &nr_vec,
2938 					 buf_id, &buf_len,
2939 					 VHOST_ACCESS_RO) < 0))
2940 		return -1;
2941 
2942 	if (unlikely(virtio_dev_pktmbuf_prep(dev, pkts, buf_len))) {
2943 		if (!allocerr_warned) {
2944 			VHOST_LOG_DATA(ERR,
2945 				"Failed mbuf alloc of size %d from %s on %s.\n",
2946 				buf_len, mbuf_pool->name, dev->ifname);
2947 			allocerr_warned = true;
2948 		}
2949 		return -1;
2950 	}
2951 
2952 	err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts,
2953 				mbuf_pool, legacy_ol_flags);
2954 	if (unlikely(err)) {
2955 		if (!allocerr_warned) {
2956 			VHOST_LOG_DATA(ERR,
2957 				"Failed to copy desc to mbuf on %s.\n",
2958 				dev->ifname);
2959 			allocerr_warned = true;
2960 		}
2961 		return -1;
2962 	}
2963 
2964 	return 0;
2965 }
2966 
2967 static __rte_always_inline int
2968 virtio_dev_tx_single_packed(struct virtio_net *dev,
2969 			    struct vhost_virtqueue *vq,
2970 			    struct rte_mempool *mbuf_pool,
2971 			    struct rte_mbuf *pkts,
2972 			    bool legacy_ol_flags)
2973 {
2974 
2975 	uint16_t buf_id, desc_count = 0;
2976 	int ret;
2977 
2978 	ret = vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
2979 					&desc_count, legacy_ol_flags);
2980 
2981 	if (likely(desc_count > 0)) {
2982 		if (virtio_net_is_inorder(dev))
2983 			vhost_shadow_dequeue_single_packed_inorder(vq, buf_id,
2984 								   desc_count);
2985 		else
2986 			vhost_shadow_dequeue_single_packed(vq, buf_id,
2987 					desc_count);
2988 
2989 		vq_inc_last_avail_packed(vq, desc_count);
2990 	}
2991 
2992 	return ret;
2993 }
2994 
2995 __rte_always_inline
2996 static uint16_t
2997 virtio_dev_tx_packed(struct virtio_net *dev,
2998 		     struct vhost_virtqueue *__rte_restrict vq,
2999 		     struct rte_mempool *mbuf_pool,
3000 		     struct rte_mbuf **__rte_restrict pkts,
3001 		     uint32_t count,
3002 		     bool legacy_ol_flags)
3003 {
3004 	uint32_t pkt_idx = 0;
3005 
3006 	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
3007 		return 0;
3008 
3009 	do {
3010 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
3011 
3012 		if (count - pkt_idx >= PACKED_BATCH_SIZE) {
3013 			if (!virtio_dev_tx_batch_packed(dev, vq,
3014 							&pkts[pkt_idx],
3015 							legacy_ol_flags)) {
3016 				pkt_idx += PACKED_BATCH_SIZE;
3017 				continue;
3018 			}
3019 		}
3020 
3021 		if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool,
3022 						pkts[pkt_idx],
3023 						legacy_ol_flags))
3024 			break;
3025 		pkt_idx++;
3026 	} while (pkt_idx < count);
3027 
3028 	if (pkt_idx != count)
3029 		rte_pktmbuf_free_bulk(&pkts[pkt_idx], count - pkt_idx);
3030 
3031 	if (vq->shadow_used_idx) {
3032 		do_data_copy_dequeue(vq);
3033 
3034 		vhost_flush_dequeue_shadow_packed(dev, vq);
3035 		vhost_vring_call_packed(dev, vq);
3036 	}
3037 
3038 	return pkt_idx;
3039 }
3040 
3041 __rte_noinline
3042 static uint16_t
3043 virtio_dev_tx_packed_legacy(struct virtio_net *dev,
3044 	struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
3045 	struct rte_mbuf **__rte_restrict pkts, uint32_t count)
3046 {
3047 	return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, true);
3048 }
3049 
3050 __rte_noinline
3051 static uint16_t
3052 virtio_dev_tx_packed_compliant(struct virtio_net *dev,
3053 	struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
3054 	struct rte_mbuf **__rte_restrict pkts, uint32_t count)
3055 {
3056 	return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, false);
3057 }
3058 
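/*
 * Public dequeue API.  queue_id must name a guest TX virtqueue (odd index);
 * up to count mbufs are filled from mbuf_pool, and a pending RARP
 * announcement, if any, is injected at pkts[0].  Illustrative usage sketch
 * (burst size and array name are application-side placeholders):
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n = rte_vhost_dequeue_burst(vid, 1, mbuf_pool, pkts, 32);
 *	// process the n packets, then free them, e.g. with
 *	// rte_pktmbuf_free_bulk(pkts, n);
 */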
3059 uint16_t
3060 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
3061 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
3062 {
3063 	struct virtio_net *dev;
3064 	struct rte_mbuf *rarp_mbuf = NULL;
3065 	struct vhost_virtqueue *vq;
3066 	int16_t success = 1;
3067 
3068 	dev = get_device(vid);
3069 	if (!dev)
3070 		return 0;
3071 
3072 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
3073 		VHOST_LOG_DATA(ERR,
3074 			"(%d) %s: built-in vhost net backend is disabled.\n",
3075 			dev->vid, __func__);
3076 		return 0;
3077 	}
3078 
3079 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
3080 		VHOST_LOG_DATA(ERR,
3081 			"(%d) %s: invalid virtqueue idx %d.\n",
3082 			dev->vid, __func__, queue_id);
3083 		return 0;
3084 	}
3085 
3086 	vq = dev->virtqueue[queue_id];
3087 
3088 	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
3089 		return 0;
3090 
3091 	if (unlikely(!vq->enabled)) {
3092 		count = 0;
3093 		goto out_access_unlock;
3094 	}
3095 
3096 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3097 		vhost_user_iotlb_rd_lock(vq);
3098 
3099 	if (unlikely(!vq->access_ok))
3100 		if (unlikely(vring_translate(dev, vq) < 0)) {
3101 			count = 0;
3102 			goto out;
3103 		}
3104 
3105 	/*
3106 	 * Construct a RARP broadcast packet, and inject it into the "pkts"
3107 	 * array, so it looks like the guest actually sent such a packet.
3108 	 *
3109 	 * Check user_send_rarp() for more information.
3110 	 *
3111 	 * broadcast_rarp shares a cacheline in the virtio_net structure
3112 	 * with some fields that are accessed during enqueue, and
3113 	 * __atomic_compare_exchange_n causes a write when it performs the
3114 	 * compare and exchange. This could result in false sharing between
3115 	 * enqueue and dequeue.
3116 	 *
3117 	 * Prevent unnecessary false sharing by reading broadcast_rarp first
3118 	 * and only performing compare and exchange if the read indicates it
3119 	 * is likely to be set.
3120 	 */
3121 	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
3122 			__atomic_compare_exchange_n(&dev->broadcast_rarp,
3123 			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
3124 
3125 		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
3126 		if (rarp_mbuf == NULL) {
3127 			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
3128 			count = 0;
3129 			goto out;
3130 		}
3131 		count -= 1;
3132 	}
3133 
3134 	if (vq_is_packed(dev)) {
3135 		if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3136 			count = virtio_dev_tx_packed_legacy(dev, vq, mbuf_pool, pkts, count);
3137 		else
3138 			count = virtio_dev_tx_packed_compliant(dev, vq, mbuf_pool, pkts, count);
3139 	} else {
3140 		if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3141 			count = virtio_dev_tx_split_legacy(dev, vq, mbuf_pool, pkts, count);
3142 		else
3143 			count = virtio_dev_tx_split_compliant(dev, vq, mbuf_pool, pkts, count);
3144 	}
3145 
3146 out:
3147 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3148 		vhost_user_iotlb_rd_unlock(vq);
3149 
3150 out_access_unlock:
3151 	rte_spinlock_unlock(&vq->access_lock);
3152 
3153 	if (unlikely(rarp_mbuf != NULL)) {
3154 		/*
3155 		 * Inject it at the head of the "pkts" array, so that the switch's
3156 		 * MAC learning table gets updated first.
3157 		 */
3158 		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
3159 		pkts[0] = rarp_mbuf;
3160 		count += 1;
3161 	}
3162 
3163 	return count;
3164 }
3165