xref: /dpdk/examples/vhost/main.c (revision 68fa37e021a1c44c6b2a947cefc20eb61c729947)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 
53 #include "main.h"
54 #include "virtio-net.h"
55 #include "vhost-net-cdev.h"
56 
57 #define MAX_QUEUES 128
58 
59 /* the maximum number of external ports supported */
60 #define MAX_SUP_PORTS 1
61 
62 /*
63  * Calculate the number of mbufs needed per port: RX descriptors for all queues, plus per-core packet bursts, TX descriptors and mempool cache.
64  */
65 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
66 							(num_switching_cores*MAX_PKT_BURST) +  			\
67 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
68 							(num_switching_cores*MBUF_CACHE_SIZE))
69 
70 #define MBUF_CACHE_SIZE 128
71 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
72 
73 /*
74  * No frame data buffers allocated by the host are required for the zero
75  * copy implementation; the guest allocates the frame data buffers and
76  * vhost uses them directly.
77  */
78 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
79 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
80 	+ RTE_PKTMBUF_HEADROOM)
81 #define MBUF_CACHE_SIZE_ZCP 0
82 
83 /*
84  * RX and TX Prefetch, Host, and Write-back threshold values should be
85  * carefully set for optimal performance. Consult the network
86  * controller's datasheet and supporting DPDK documentation for guidance
87  * on how these parameters should be set.
88  */
89 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
90 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
91 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
92 
93 /*
94  * These default values are optimized for use with the Intel(R) 82599 10 GbE
95  * Controller and the DPDK ixgbe PMD. Consider using other values for other
96  * network controllers and/or network drivers.
97  */
98 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
99 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
100 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
101 
102 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
103 #define MAX_MRG_PKT_BURST 16 	/* Max burst for merge buffers. Set to 1 due to performance issue. */
104 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
105 
106 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
107 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
108 
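/* Max RX packet length used when mergeable buffers (jumbo frames) are enabled. */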
109 #define JUMBO_FRAME_MAX_SIZE    0x2600
110 
111 /* State of virtio device. */
112 #define DEVICE_MAC_LEARNING 0
113 #define DEVICE_RX			1
114 #define DEVICE_SAFE_REMOVE	2
115 
116 /* Config_core_flag status definitions. */
117 #define REQUEST_DEV_REMOVAL 1
118 #define ACK_DEV_REMOVAL 0
119 
120 /* Configurable number of RX/TX ring descriptors */
121 #define RTE_TEST_RX_DESC_DEFAULT 1024
122 #define RTE_TEST_TX_DESC_DEFAULT 512
123 
124 /*
125  * These two macros need refining for the legacy and DPDK-based front ends:
126  * take the maximum number of available vring descriptors/entries from the
127  * guest, subtract MAX_PKT_BURST, and then round to a power of 2.
128  */
129 /*
130  * For the legacy front end there are 128 descriptors:
131  * half for virtio headers and half for mbuf data.
132  */
133 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
134 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
135 
136 /* Get first 4 bytes in mbuf headroom. */
137 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
138 		+ sizeof(struct rte_mbuf)))
139 
140 /* true if x is a power of 2 */
141 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
142 
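/* Value used to mark an entry in the ports array as invalid. */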
143 #define INVALID_PORT_ID 0xFF
144 
145 /* Max number of devices. Limited by vmdq. */
146 #define MAX_DEVICES 64
147 
148 /* Size of buffers used for snprintfs. */
149 #define MAX_PRINT_BUFF 6072
150 
151 /* Maximum character device basename size. */
152 #define MAX_BASENAME_SZ 10
153 
154 /* Maximum long option length for option parsing. */
155 #define MAX_LONG_OPT_SZ 64
156 
157 /* Used to compare MAC addresses. */
158 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
159 
160 /* Number of descriptors per cacheline. */
161 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
162 
163 /* mask of enabled ports */
164 static uint32_t enabled_port_mask = 0;
165 
166 /*Number of switching cores enabled*/
167 static uint32_t num_switching_cores = 0;
168 
169 /* number of devices/queues to support*/
170 static uint32_t num_queues = 0;
171 uint32_t num_devices = 0;
172 
173 /*
174  * Enable zero copy: packet buffers are DMA'd directly to/from the guest
175  * buffers referenced by the HW descriptors. Disabled by default.
176  */
177 static uint32_t zero_copy;
178 
179 /* Number of RX/TX descriptors to use; only applied when zero copy is enabled. */
180 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
181 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
182 
183 /* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
184 #define MAX_RING_DESC 4096
185 
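/* Mempool, companion ring and buffer size used for each RX/TX queue. */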
186 struct vpool {
187 	struct rte_mempool *pool;
188 	struct rte_ring *ring;
189 	uint32_t buf_size;
190 } vpool_array[MAX_QUEUES+MAX_QUEUES];
191 
192 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
193 typedef enum {
194 	VM2VM_DISABLED = 0,
195 	VM2VM_SOFTWARE = 1,
196 	VM2VM_HARDWARE = 2,
197 	VM2VM_LAST
198 } vm2vm_type;
199 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
200 
201 /* The type of host physical address translated from guest physical address. */
202 typedef enum {
203 	PHYS_ADDR_CONTINUOUS = 0,
204 	PHYS_ADDR_CROSS_SUBREG = 1,
205 	PHYS_ADDR_INVALID = 2,
206 	PHYS_ADDR_LAST
207 } hpa_type;
208 
209 /* Enable stats. */
210 static uint32_t enable_stats = 0;
211 /* Enable retries on RX. */
212 static uint32_t enable_retry = 1;
213 /* Specify timeout (in microseconds) between retries on RX. */
214 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
215 /* Specify the number of retries on RX. */
216 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
217 
218 /* Character device basename. Can be set by user. */
219 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
220 
221 /* Character device index. Can be set by user. */
222 static uint32_t dev_index = 0;
223 
224 /* This can be set by the user so it is made available here. */
225 extern uint64_t VHOST_FEATURES;
226 
227 /* Default configuration for rx and tx thresholds etc. */
228 static struct rte_eth_rxconf rx_conf_default = {
229 	.rx_thresh = {
230 		.pthresh = RX_PTHRESH,
231 		.hthresh = RX_HTHRESH,
232 		.wthresh = RX_WTHRESH,
233 	},
234 	.rx_drop_en = 1,
235 };
236 
237 /*
238  * These default values are optimized for use with the Intel(R) 82599 10 GbE
239  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
240  * network controllers and/or network drivers.
241  */
242 static struct rte_eth_txconf tx_conf_default = {
243 	.tx_thresh = {
244 		.pthresh = TX_PTHRESH,
245 		.hthresh = TX_HTHRESH,
246 		.wthresh = TX_WTHRESH,
247 	},
248 	.tx_free_thresh = 0, /* Use PMD default values */
249 	.tx_rs_thresh = 0, /* Use PMD default values */
250 };
251 
252 /* Empty VMDQ configuration structure. Filled in programmatically. */
253 static struct rte_eth_conf vmdq_conf_default = {
254 	.rxmode = {
255 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
256 		.split_hdr_size = 0,
257 		.header_split   = 0, /**< Header Split disabled */
258 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
259 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
260 		/*
261 		 * This is necessary for 1G NICs such as the I350;
262 		 * it fixes a bug where IPv4 forwarding in the guest could not
263 		 * forward packets from one virtio device to another.
264 		 */
265 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
266 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
267 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
268 	},
269 
270 	.txmode = {
271 		.mq_mode = ETH_MQ_TX_NONE,
272 	},
273 	.rx_adv_conf = {
274 		/*
275 		 * should be overridden separately in code with
276 		 * appropriate values
277 		 */
278 		.vmdq_rx_conf = {
279 			.nb_queue_pools = ETH_8_POOLS,
280 			.enable_default_pool = 0,
281 			.default_pool = 0,
282 			.nb_pool_maps = 0,
283 			.pool_map = {{0, 0},},
284 		},
285 	},
286 };
287 
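/* IDs of the enabled lcores and the physical ports in use. */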
288 static unsigned lcore_ids[RTE_MAX_LCORE];
289 static uint8_t ports[RTE_MAX_ETHPORTS];
290 static unsigned num_ports = 0; /**< The number of ports specified in command line */
291 
292 static const uint16_t external_pkt_default_vlan_tag = 2000;
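/*
 * VLAN tags used for the VMDQ pools; a virtio device is assigned
 * vlan_tags[device_fh] when its MAC address is learned.
 */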
293 const uint16_t vlan_tags[] = {
294 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
295 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
296 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
297 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
298 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
299 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
300 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
301 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
302 };
303 
304 /* ethernet addresses of ports */
305 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
306 
307 /* heads for the main used and free linked lists for the data path. */
308 static struct virtio_net_data_ll *ll_root_used = NULL;
309 static struct virtio_net_data_ll *ll_root_free = NULL;
310 
311 /* Array of data core structures containing information on individual core linked lists. */
312 static struct lcore_info lcore_info[RTE_MAX_LCORE];
313 
314 /* Used for queueing bursts of TX packets. */
315 struct mbuf_table {
316 	unsigned len;
317 	unsigned txq_id;
318 	struct rte_mbuf *m_table[MAX_PKT_BURST];
319 };
320 
321 /* TX queue for each data core. */
322 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
323 
324 /* TX queue for each virtio device for zero copy. */
325 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
326 
327 /* Vlan header struct used to insert vlan tags on TX. */
328 struct vlan_ethhdr {
329 	unsigned char   h_dest[ETH_ALEN];
330 	unsigned char   h_source[ETH_ALEN];
331 	__be16          h_vlan_proto;
332 	__be16          h_vlan_TCI;
333 	__be16          h_vlan_encapsulated_proto;
334 };
335 
336 /* IPv4 Header */
337 struct ipv4_hdr {
338 	uint8_t  version_ihl;		/**< version and header length */
339 	uint8_t  type_of_service;	/**< type of service */
340 	uint16_t total_length;		/**< length of packet */
341 	uint16_t packet_id;		/**< packet ID */
342 	uint16_t fragment_offset;	/**< fragmentation offset */
343 	uint8_t  time_to_live;		/**< time to live */
344 	uint8_t  next_proto_id;		/**< protocol ID */
345 	uint16_t hdr_checksum;		/**< header checksum */
346 	uint32_t src_addr;		/**< source address */
347 	uint32_t dst_addr;		/**< destination address */
348 } __attribute__((__packed__));
349 
350 /* Header lengths. */
351 #define VLAN_HLEN       4
352 #define VLAN_ETH_HLEN   18
353 
354 /* Per-device statistics struct */
355 struct device_statistics {
356 	uint64_t tx_total;
357 	rte_atomic64_t rx_total_atomic;
358 	uint64_t rx_total;
359 	uint64_t tx;
360 	rte_atomic64_t rx_atomic;
361 	uint64_t rx;
362 } __rte_cache_aligned;
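/* Per-device statistics, indexed by device_fh. */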
363 struct device_statistics dev_statistics[MAX_DEVICES];
364 
365 /*
366  * Builds up the correct configuration for VMDQ VLAN pool map
367  * according to the pool & queue limits.
368  */
369 static inline int
370 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
371 {
372 	struct rte_eth_vmdq_rx_conf conf;
373 	unsigned i;
374 
375 	memset(&conf, 0, sizeof(conf));
376 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
377 	conf.nb_pool_maps = num_devices;
378 	conf.enable_loop_back =
379 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
380 
381 	for (i = 0; i < conf.nb_pool_maps; i++) {
382 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
383 		conf.pool_map[i].pools = (1UL << i);
384 	}
385 
386 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
387 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
388 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
389 	return 0;
390 }
391 
392 /*
393  * Validate the device number against the max pool number obtained from
394  * dev_info. If the device number is invalid, print an error message and
395  * return -1. Each device must have its own pool.
396  */
397 static inline int
398 validate_num_devices(uint32_t max_nb_devices)
399 {
400 	if (num_devices > max_nb_devices) {
401 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
402 		return -1;
403 	}
404 	return 0;
405 }
406 
407 /*
408  * Initialises a given port using global settings, with the RX buffers
409  * coming from the vpool_array mempools.
410  */
411 static inline int
412 port_init(uint8_t port)
413 {
414 	struct rte_eth_dev_info dev_info;
415 	struct rte_eth_conf port_conf;
416 	uint16_t rx_rings, tx_rings;
417 	uint16_t rx_ring_size, tx_ring_size;
418 	int retval;
419 	uint16_t q;
420 
421 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
422 	rte_eth_dev_info_get (port, &dev_info);
423 
424 	/*configure the number of supported virtio devices based on VMDQ limits */
425 	num_devices = dev_info.max_vmdq_pools;
426 	num_queues = dev_info.max_rx_queues;
427 
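	/*
	 * In zero copy mode the descriptor counts come from --rx-desc-num /
	 * --tx-desc-num (or their zero copy defaults) and a TX queue is set up
	 * per HW queue; otherwise the defaults are used with one TX queue per
	 * enabled lcore.
	 */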
428 	if (zero_copy) {
429 		rx_ring_size = num_rx_descriptor;
430 		tx_ring_size = num_tx_descriptor;
431 		tx_rings = dev_info.max_tx_queues;
432 	} else {
433 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
434 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
435 		tx_rings = (uint16_t)rte_lcore_count();
436 	}
437 
438 	retval = validate_num_devices(MAX_DEVICES);
439 	if (retval < 0)
440 		return retval;
441 
442 	/* Get port configuration. */
443 	retval = get_eth_conf(&port_conf, num_devices);
444 	if (retval < 0)
445 		return retval;
446 
447 	if (port >= rte_eth_dev_count()) return -1;
448 
449 	rx_rings = (uint16_t)num_queues;
450 	/* Configure ethernet device. */
451 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452 	if (retval != 0)
453 		return retval;
454 
455 	/* Setup the queues. */
456 	for (q = 0; q < rx_rings; q ++) {
457 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458 						rte_eth_dev_socket_id(port), &rx_conf_default,
459 						vpool_array[q].pool);
460 		if (retval < 0)
461 			return retval;
462 	}
463 	for (q = 0; q < tx_rings; q ++) {
464 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
465 						rte_eth_dev_socket_id(port), &tx_conf_default);
466 		if (retval < 0)
467 			return retval;
468 	}
469 
470 	/* Start the device. */
471 	retval  = rte_eth_dev_start(port);
472 	if (retval < 0) {
473 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
474 		return retval;
475 	}
476 
477 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
478 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
479 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
480 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
481 			(unsigned)port,
482 			vmdq_ports_eth_addr[port].addr_bytes[0],
483 			vmdq_ports_eth_addr[port].addr_bytes[1],
484 			vmdq_ports_eth_addr[port].addr_bytes[2],
485 			vmdq_ports_eth_addr[port].addr_bytes[3],
486 			vmdq_ports_eth_addr[port].addr_bytes[4],
487 			vmdq_ports_eth_addr[port].addr_bytes[5]);
488 
489 	return 0;
490 }
491 
492 /*
493  * Set character device basename.
494  */
495 static int
496 us_vhost_parse_basename(const char *q_arg)
497 {
498 	/* The basename must fit in the buffer, including the null terminator. */
499 
500 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
501 		return -1;
502 	else
503 		snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
504 
505 	return 0;
506 }
507 
508 /*
509  * Parse the portmask provided at run time.
510  */
511 static int
512 parse_portmask(const char *portmask)
513 {
514 	char *end = NULL;
515 	unsigned long pm;
516 
517 	errno = 0;
518 
519 	/* parse hexadecimal string */
520 	pm = strtoul(portmask, &end, 16);
521 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
522 		return -1;
523 
524 	if (pm == 0)
525 		return -1;
526 
527 	return pm;
528 
529 }
530 
531 /*
532  * Parse num options at run time.
533  */
534 static int
535 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
536 {
537 	char *end = NULL;
538 	unsigned long num;
539 
540 	errno = 0;
541 
542 	/* parse unsigned int string */
543 	num = strtoul(q_arg, &end, 10);
544 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
545 		return -1;
546 
547 	if (num > max_valid_value)
548 		return -1;
549 
550 	return num;
551 
552 }
553 
554 /*
555  * Display usage
556  */
557 static void
558 us_vhost_usage(const char *prgname)
559 {
560 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
561 	"		--vm2vm [0|1|2]\n"
562 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
563 	"		--dev-basename <name> --dev-index [0-N]\n"
564 	"		--nb-devices ND\n"
565 	"		-p PORTMASK: Set mask for ports to be used by application\n"
566 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
567 	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Retries are used when the destination queue is full\n"
568 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
569 	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
570 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
571 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
572 	"		--dev-basename: The basename to be used for the character device.\n"
573 	"		--dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
574 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
575 			"zero copy\n"
576 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
577 			"used only when zero copy is enabled.\n"
578 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
579 			"used only when zero copy is enabled.\n",
580 	       prgname);
581 }
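/*
 * Example invocation (illustrative only; the binary name and the EAL core,
 * channel and hugepage options depend on the build and target system):
 *
 *   ./build/vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --rx-retry 1 \
 *       --stats 2 --dev-basename vhost-net --dev-index 0
 */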
582 
583 /*
584  * Parse the arguments given in the command line of the application.
585  */
586 static int
587 us_vhost_parse_args(int argc, char **argv)
588 {
589 	int opt, ret;
590 	int option_index;
591 	unsigned i;
592 	const char *prgname = argv[0];
593 	static struct option long_option[] = {
594 		{"vm2vm", required_argument, NULL, 0},
595 		{"rx-retry", required_argument, NULL, 0},
596 		{"rx-retry-delay", required_argument, NULL, 0},
597 		{"rx-retry-num", required_argument, NULL, 0},
598 		{"mergeable", required_argument, NULL, 0},
599 		{"stats", required_argument, NULL, 0},
600 		{"dev-basename", required_argument, NULL, 0},
601 		{"dev-index", required_argument, NULL, 0},
602 		{"zero-copy", required_argument, NULL, 0},
603 		{"rx-desc-num", required_argument, NULL, 0},
604 		{"tx-desc-num", required_argument, NULL, 0},
605 		{NULL, 0, 0, 0},
606 	};
607 
608 	/* Parse command line */
609 	while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
610 		switch (opt) {
611 		/* Portmask */
612 		case 'p':
613 			enabled_port_mask = parse_portmask(optarg);
614 			if (enabled_port_mask == 0) {
615 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
616 				us_vhost_usage(prgname);
617 				return -1;
618 			}
619 			break;
620 
621 		case 0:
622 			/* Enable/disable vm2vm comms. */
623 			if (!strncmp(long_option[option_index].name, "vm2vm",
624 				MAX_LONG_OPT_SZ)) {
625 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
626 				if (ret == -1) {
627 					RTE_LOG(INFO, VHOST_CONFIG,
628 						"Invalid argument for "
629 						"vm2vm [0|1|2]\n");
630 					us_vhost_usage(prgname);
631 					return -1;
632 				} else {
633 					vm2vm_mode = (vm2vm_type)ret;
634 				}
635 			}
636 
637 			/* Enable/disable retries on RX. */
638 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
639 				ret = parse_num_opt(optarg, 1);
640 				if (ret == -1) {
641 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
642 					us_vhost_usage(prgname);
643 					return -1;
644 				} else {
645 					enable_retry = ret;
646 				}
647 			}
648 
649 			/* Specify the delay time (in microseconds) between retries on RX. */
650 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
651 				ret = parse_num_opt(optarg, INT32_MAX);
652 				if (ret == -1) {
653 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
654 					us_vhost_usage(prgname);
655 					return -1;
656 				} else {
657 					burst_rx_delay_time = ret;
658 				}
659 			}
660 
661 			/* Specify the retries number on RX. */
662 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
663 				ret = parse_num_opt(optarg, INT32_MAX);
664 				if (ret == -1) {
665 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
666 					us_vhost_usage(prgname);
667 					return -1;
668 				} else {
669 					burst_rx_retry_num = ret;
670 				}
671 			}
672 
673 			/* Enable/disable RX mergeable buffers. */
674 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
675 				ret = parse_num_opt(optarg, 1);
676 				if (ret == -1) {
677 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
678 					us_vhost_usage(prgname);
679 					return -1;
680 				} else {
681 					if (ret) {
682 						vmdq_conf_default.rxmode.jumbo_frame = 1;
683 						vmdq_conf_default.rxmode.max_rx_pkt_len
684 							= JUMBO_FRAME_MAX_SIZE;
685 						VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
686 					}
687 				}
688 			}
689 
690 			/* Enable/disable stats. */
691 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
692 				ret = parse_num_opt(optarg, INT32_MAX);
693 				if (ret == -1) {
694 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
695 					us_vhost_usage(prgname);
696 					return -1;
697 				} else {
698 					enable_stats = ret;
699 				}
700 			}
701 
702 			/* Set character device basename. */
703 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
704 				if (us_vhost_parse_basename(optarg) == -1) {
705 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
706 					us_vhost_usage(prgname);
707 					return -1;
708 				}
709 			}
710 
711 			/* Set character device index. */
712 			if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
713 				ret = parse_num_opt(optarg, INT32_MAX);
714 				if (ret == -1) {
715 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
716 					us_vhost_usage(prgname);
717 					return -1;
718 				} else
719 					dev_index = ret;
720 			}
721 
722 			/* Enable/disable rx/tx zero copy. */
723 			if (!strncmp(long_option[option_index].name,
724 				"zero-copy", MAX_LONG_OPT_SZ)) {
725 				ret = parse_num_opt(optarg, 1);
726 				if (ret == -1) {
727 					RTE_LOG(INFO, VHOST_CONFIG,
728 						"Invalid argument"
729 						" for zero-copy [0|1]\n");
730 					us_vhost_usage(prgname);
731 					return -1;
732 				} else
733 					zero_copy = ret;
734 
735 				if (zero_copy) {
736 #ifdef RTE_MBUF_REFCNT
737 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
738 					"the zero copy vhost application, please "
739 					"disable RTE_MBUF_REFCNT\n"
740 					"in the config file and then rebuild the "
741 					"DPDK core library!\n"
742 					"Otherwise, please disable the zero copy "
743 					"flag on the command line!\n");
744 					return -1;
745 #endif
746 				}
747 			}
748 
749 			/* Specify the descriptor number on RX. */
750 			if (!strncmp(long_option[option_index].name,
751 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
752 				ret = parse_num_opt(optarg, MAX_RING_DESC);
753 				if ((ret == -1) || (!POWEROF2(ret))) {
754 					RTE_LOG(INFO, VHOST_CONFIG,
755 					"Invalid argument for rx-desc-num [0-N], "
756 					"power of 2 required.\n");
757 					us_vhost_usage(prgname);
758 					return -1;
759 				} else {
760 					num_rx_descriptor = ret;
761 				}
762 			}
763 
764 			/* Specify the descriptor number on TX. */
765 			if (!strncmp(long_option[option_index].name,
766 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
767 				ret = parse_num_opt(optarg, MAX_RING_DESC);
768 				if ((ret == -1) || (!POWEROF2(ret))) {
769 					RTE_LOG(INFO, VHOST_CONFIG,
770 					"Invalid argument for tx-desc-num [0-N], "
771 					"power of 2 required.\n");
772 					us_vhost_usage(prgname);
773 					return -1;
774 				} else {
775 					num_tx_descriptor = ret;
776 				}
777 			}
778 
779 			break;
780 
781 			/* Invalid option - print options. */
782 		default:
783 			us_vhost_usage(prgname);
784 			return -1;
785 		}
786 	}
787 
788 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
789 		if (enabled_port_mask & (1 << i))
790 			ports[num_ports++] = (uint8_t)i;
791 	}
792 
793 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
794 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
795 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
796 		return -1;
797 	}
798 
799 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
800 		RTE_LOG(INFO, VHOST_PORT,
801 			"Vhost zero copy doesn't support software vm2vm, "
802 			"please specify '--vm2vm 2' to use hardware vm2vm.\n");
803 		return -1;
804 	}
805 
806 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
807 		RTE_LOG(INFO, VHOST_PORT,
808 			"Vhost zero copy doesn't support jumbo frames, "
809 			"please specify '--mergeable 0' to disable the "
810 			"mergeable feature.\n");
811 		return -1;
812 	}
813 
814 	return 0;
815 }
816 
817 /*
818  * Update the global variable num_ports and the ports array according to the
819  * number of ports in the system, and return the number of valid ports.
820  */
821 static unsigned check_ports_num(unsigned nb_ports)
822 {
823 	unsigned valid_num_ports = num_ports;
824 	unsigned portid;
825 
826 	if (num_ports > nb_ports) {
827 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
828 			num_ports, nb_ports);
829 		num_ports = nb_ports;
830 	}
831 
832 	for (portid = 0; portid < num_ports; portid ++) {
833 		if (ports[portid] >= nb_ports) {
834 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
835 				ports[portid], (nb_ports - 1));
836 			ports[portid] = INVALID_PORT_ID;
837 			valid_num_ports--;
838 		}
839 	}
840 	return valid_num_ports;
841 }
842 
843 /*
844  * Macro to print out packet contents. Wrapped in debug define so that the
845  * data path is not affected when debug is disabled.
846  */
847 #ifdef DEBUG
848 #define PRINT_PACKET(device, addr, size, header) do {																\
849 	char *pkt_addr = (char*)(addr);																					\
850 	unsigned int index;																								\
851 	char packet[MAX_PRINT_BUFF];																					\
852 																													\
853 	if ((header))																									\
854 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
855 	else																											\
856 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
857 	for (index = 0; index < (size); index++) {																		\
858 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
859 			"%02hhx ", pkt_addr[index]);																			\
860 	}																												\
861 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
862 																													\
863 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
864 } while(0)
865 #else
866 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
867 #endif
868 
869 /*
870  * Function to convert guest physical addresses to vhost virtual addresses. This
871  * is used to convert virtio buffer addresses.
872  */
873 static inline uint64_t __attribute__((always_inline))
874 gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
875 {
876 	struct virtio_memory_regions *region;
877 	uint32_t regionidx;
878 	uint64_t vhost_va = 0;
879 
880 	for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
881 		region = &dev->mem->regions[regionidx];
882 		if ((guest_pa >= region->guest_phys_address) &&
883 			(guest_pa <= region->guest_phys_address_end)) {
884 			vhost_va = region->address_offset + guest_pa;
885 			break;
886 		}
887 	}
888 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| VVA %p\n",
889 		dev->device_fh, (void*)(uintptr_t)guest_pa, (void*)(uintptr_t)vhost_va);
890 
891 	return vhost_va;
892 }
893 
894 /*
895  * Function to convert guest physical addresses to vhost physical addresses.
896  * This is used to convert virtio buffer addresses.
897  */
898 static inline uint64_t __attribute__((always_inline))
899 gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
900 	uint32_t buf_len, hpa_type *addr_type)
901 {
902 	struct virtio_memory_regions_hpa *region;
903 	uint32_t regionidx;
904 	uint64_t vhost_pa = 0;
905 
906 	*addr_type = PHYS_ADDR_INVALID;
907 
908 	for (regionidx = 0; regionidx < dev->mem->nregions_hpa; regionidx++) {
909 		region = &dev->mem->regions_hpa[regionidx];
910 		if ((guest_pa >= region->guest_phys_address) &&
911 			(guest_pa <= region->guest_phys_address_end)) {
912 			vhost_pa = region->host_phys_addr_offset + guest_pa;
913 			if (likely((guest_pa + buf_len - 1)
914 				<= region->guest_phys_address_end))
915 				*addr_type = PHYS_ADDR_CONTINUOUS;
916 			else
917 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
918 			break;
919 		}
920 	}
921 
922 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
923 		dev->device_fh, (void *)(uintptr_t)guest_pa,
924 		(void *)(uintptr_t)vhost_pa);
925 
926 	return vhost_pa;
927 }
928 
929 /*
930  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
931  * be received from the physical port or from another virtio device. A packet
932  * count is returned to indicate the number of packets that were successfully
933  * added to the RX queue. This function works when mergeable is disabled.
934  */
935 static inline uint32_t __attribute__((always_inline))
936 virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
937 {
938 	struct vhost_virtqueue *vq;
939 	struct vring_desc *desc;
940 	struct rte_mbuf *buff;
941 	/* The virtio_hdr is initialised to 0. */
942 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0,0,0,0,0,0},0};
943 	uint64_t buff_addr = 0;
944 	uint64_t buff_hdr_addr = 0;
945 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
946 	uint32_t head_idx, packet_success = 0;
947 	uint32_t retry = 0;
948 	uint16_t avail_idx, res_cur_idx;
949 	uint16_t res_base_idx, res_end_idx;
950 	uint16_t free_entries;
951 	uint8_t success = 0;
952 
953 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
954 	vq = dev->virtqueue[VIRTIO_RXQ];
955 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
956 
957 	/* As many data cores may want access to available buffers, they need to be reserved. */
958 	do {
959 		res_base_idx = vq->last_used_idx_res;
960 		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
961 
962 		free_entries = (avail_idx - res_base_idx);
963 		/* If retry is enabled and the queue is full then we wait and retry to avoid packet loss. */
964 		if (enable_retry && unlikely(count > free_entries)) {
965 			for (retry = 0; retry < burst_rx_retry_num; retry++) {
966 				rte_delay_us(burst_rx_delay_time);
967 				avail_idx =
968 					*((volatile uint16_t *)&vq->avail->idx);
969 				free_entries = (avail_idx - res_base_idx);
970 				if (count <= free_entries)
971 					break;
972 			}
973 		}
974 
975 		/*check that we have enough buffers*/
976 		if (unlikely(count > free_entries))
977 			count = free_entries;
978 
979 		if (count == 0)
980 			return 0;
981 
982 		res_end_idx = res_base_idx + count;
983 		/* vq->last_used_idx_res is atomically updated. */
984 		success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
985 									res_end_idx);
986 	} while (unlikely(success == 0));
987 	res_cur_idx = res_base_idx;
988 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);
989 
990 	/* Prefetch available ring to retrieve indexes. */
991 	rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
992 
993 	/* Retrieve all of the head indexes first to avoid caching issues. */
994 	for (head_idx = 0; head_idx < count; head_idx++)
995 		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
996 
997 	/*Prefetch descriptor index. */
998 	rte_prefetch0(&vq->desc[head[packet_success]]);
999 
1000 	while (res_cur_idx != res_end_idx) {
1001 		/* Get descriptor from available ring */
1002 		desc = &vq->desc[head[packet_success]];
1003 
1004 		buff = pkts[packet_success];
1005 
1006 		/* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
1007 		buff_addr = gpa_to_vva(dev, desc->addr);
1008 		/* Prefetch buffer address. */
1009 		rte_prefetch0((void*)(uintptr_t)buff_addr);
1010 
1011 		/* Copy virtio_hdr to packet and increment buffer address */
1012 		buff_hdr_addr = buff_addr;
1013 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1014 
1015 		/*
1016 		 * If the descriptors are chained the header and data are
1017 		 * placed in separate buffers.
1018 		 */
1019 		if (desc->flags & VRING_DESC_F_NEXT) {
1020 			desc->len = vq->vhost_hlen;
1021 			desc = &vq->desc[desc->next];
1022 			/* Buffer address translation. */
1023 			buff_addr = gpa_to_vva(dev, desc->addr);
1024 			desc->len = rte_pktmbuf_data_len(buff);
1025 		} else {
1026 			buff_addr += vq->vhost_hlen;
1027 			desc->len = packet_len;
1028 		}
1029 
1030 		/* Update used ring with desc information */
1031 		vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
1032 		vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
1033 
1034 		/* Copy mbuf data to buffer */
1035 		rte_memcpy((void *)(uintptr_t)buff_addr,
1036 			rte_pktmbuf_mtod(buff, const void *),
1037 			rte_pktmbuf_data_len(buff));
1038 		PRINT_PACKET(dev, (uintptr_t)buff_addr,
1039 			rte_pktmbuf_data_len(buff), 0);
1040 
1041 		res_cur_idx++;
1042 		packet_success++;
1043 
1044 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1045 			(const void *)&virtio_hdr, vq->vhost_hlen);
1046 
1047 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1048 
1049 		if (res_cur_idx < res_end_idx) {
1050 			/* Prefetch descriptor index. */
1051 			rte_prefetch0(&vq->desc[head[packet_success]]);
1052 		}
1053 	}
1054 
1055 	rte_compiler_barrier();
1056 
1057 	/* Wait until it's our turn to add our buffer to the used ring. */
1058 	while (unlikely(vq->last_used_idx != res_base_idx))
1059 		rte_pause();
1060 
1061 	*(volatile uint16_t *)&vq->used->idx += count;
1062 	vq->last_used_idx = res_end_idx;
1063 
1064 	/* Kick the guest if necessary. */
1065 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1066 		eventfd_write((int)vq->kickfd, 1);
1067 	return count;
1068 }
1069 
1070 static inline uint32_t __attribute__((always_inline))
1071 copy_from_mbuf_to_vring(struct virtio_net *dev,
1072 	uint16_t res_base_idx, uint16_t res_end_idx,
1073 	struct rte_mbuf *pkt)
1074 {
1075 	uint32_t vec_idx = 0;
1076 	uint32_t entry_success = 0;
1077 	struct vhost_virtqueue *vq;
1078 	/* The virtio_hdr is initialised to 0. */
1079 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
1080 		{0, 0, 0, 0, 0, 0}, 0};
1081 	uint16_t cur_idx = res_base_idx;
1082 	uint64_t vb_addr = 0;
1083 	uint64_t vb_hdr_addr = 0;
1084 	uint32_t seg_offset = 0;
1085 	uint32_t vb_offset = 0;
1086 	uint32_t seg_avail;
1087 	uint32_t vb_avail;
1088 	uint32_t cpy_len, entry_len;
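	/*
	 * seg_* values track progress through the current mbuf segment, while
	 * vb_* values track progress through the current vring buffer taken
	 * from buf_vec; entry_len counts the bytes written to the current
	 * used ring entry.
	 */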
1089 
1090 	if (pkt == NULL)
1091 		return 0;
1092 
1093 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
1094 		"End Index %d\n",
1095 		dev->device_fh, cur_idx, res_end_idx);
1096 
1097 	/*
1098 	 * Convert from gpa to vva
1099 	 * (guest physical addr -> vhost virtual addr)
1100 	 */
1101 	vq = dev->virtqueue[VIRTIO_RXQ];
1102 	vb_addr =
1103 		gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
1104 	vb_hdr_addr = vb_addr;
1105 
1106 	/* Prefetch buffer address. */
1107 	rte_prefetch0((void *)(uintptr_t)vb_addr);
1108 
1109 	virtio_hdr.num_buffers = res_end_idx - res_base_idx;
1110 
1111 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
1112 		dev->device_fh, virtio_hdr.num_buffers);
1113 
1114 	rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
1115 		(const void *)&virtio_hdr, vq->vhost_hlen);
1116 
1117 	PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);
1118 
1119 	seg_avail = rte_pktmbuf_data_len(pkt);
1120 	vb_offset = vq->vhost_hlen;
1121 	vb_avail =
1122 		vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
1123 
1124 	entry_len = vq->vhost_hlen;
1125 
1126 	if (vb_avail == 0) {
1127 		uint32_t desc_idx =
1128 			vq->buf_vec[vec_idx].desc_idx;
1129 		vq->desc[desc_idx].len = vq->vhost_hlen;
1130 
1131 		if ((vq->desc[desc_idx].flags
1132 			& VRING_DESC_F_NEXT) == 0) {
1133 			/* Update used ring with desc information */
1134 			vq->used->ring[cur_idx & (vq->size - 1)].id
1135 				= vq->buf_vec[vec_idx].desc_idx;
1136 			vq->used->ring[cur_idx & (vq->size - 1)].len
1137 				= entry_len;
1138 
1139 			entry_len = 0;
1140 			cur_idx++;
1141 			entry_success++;
1142 		}
1143 
1144 		vec_idx++;
1145 		vb_addr =
1146 			gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
1147 
1148 		/* Prefetch buffer address. */
1149 		rte_prefetch0((void *)(uintptr_t)vb_addr);
1150 		vb_offset = 0;
1151 		vb_avail = vq->buf_vec[vec_idx].buf_len;
1152 	}
1153 
1154 	cpy_len = RTE_MIN(vb_avail, seg_avail);
1155 
1156 	while (cpy_len > 0) {
1157 		/* Copy mbuf data to vring buffer */
1158 		rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
1159 			(const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset),
1160 			cpy_len);
1161 
1162 		PRINT_PACKET(dev,
1163 			(uintptr_t)(vb_addr + vb_offset),
1164 			cpy_len, 0);
1165 
1166 		seg_offset += cpy_len;
1167 		vb_offset += cpy_len;
1168 		seg_avail -= cpy_len;
1169 		vb_avail -= cpy_len;
1170 		entry_len += cpy_len;
1171 
1172 		if (seg_avail != 0) {
1173 			/*
1174 			 * The virtio buffer in this vring
1175 			 * entry has been filled, but the
1176 			 * mbuf segment is not finished yet.
1177 			 */
1178 			if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
1179 				VRING_DESC_F_NEXT) == 0) {
1180 				/* Update used ring with desc information */
1181 				vq->used->ring[cur_idx & (vq->size - 1)].id
1182 					= vq->buf_vec[vec_idx].desc_idx;
1183 				vq->used->ring[cur_idx & (vq->size - 1)].len
1184 					= entry_len;
1185 				entry_len = 0;
1186 				cur_idx++;
1187 				entry_success++;
1188 			}
1189 
1190 			vec_idx++;
1191 			vb_addr = gpa_to_vva(dev,
1192 				vq->buf_vec[vec_idx].buf_addr);
1193 			vb_offset = 0;
1194 			vb_avail = vq->buf_vec[vec_idx].buf_len;
1195 			cpy_len = RTE_MIN(vb_avail, seg_avail);
1196 		} else {
1197 			/*
1198 			 * The current segment is complete; continue to
1199 			 * check whether the whole packet is complete.
1200 			 */
1201 			pkt = pkt->next;
1202 			if (pkt != NULL) {
1203 				/*
1204 				 * There are more segments.
1205 				 */
1206 				if (vb_avail == 0) {
1207 					/*
1208 					 * The current buffer from the vring
1209 					 * is used up; fetch the next buffer
1210 					 * from buf_vec.
1211 					 */
1212 					uint32_t desc_idx =
1213 						vq->buf_vec[vec_idx].desc_idx;
1214 					vq->desc[desc_idx].len = vb_offset;
1215 
1216 					if ((vq->desc[desc_idx].flags &
1217 						VRING_DESC_F_NEXT) == 0) {
1218 						uint16_t wrapped_idx =
1219 							cur_idx & (vq->size - 1);
1220 						/*
1221 						 * Update used ring with the
1222 						 * descriptor information
1223 						 */
1224 						vq->used->ring[wrapped_idx].id
1225 							= desc_idx;
1226 						vq->used->ring[wrapped_idx].len
1227 							= entry_len;
1228 						entry_success++;
1229 						entry_len = 0;
1230 						cur_idx++;
1231 					}
1232 
1233 					/* Get next buffer from buf_vec. */
1234 					vec_idx++;
1235 					vb_addr = gpa_to_vva(dev,
1236 						vq->buf_vec[vec_idx].buf_addr);
1237 					vb_avail =
1238 						vq->buf_vec[vec_idx].buf_len;
1239 					vb_offset = 0;
1240 				}
1241 
1242 				seg_offset = 0;
1243 				seg_avail = rte_pktmbuf_data_len(pkt);
1244 				cpy_len = RTE_MIN(vb_avail, seg_avail);
1245 			} else {
1246 				/*
1247 				 * The whole packet is complete.
1248 				 */
1249 				uint32_t desc_idx =
1250 					vq->buf_vec[vec_idx].desc_idx;
1251 				vq->desc[desc_idx].len = vb_offset;
1252 
1253 				while (vq->desc[desc_idx].flags &
1254 					VRING_DESC_F_NEXT) {
1255 					desc_idx = vq->desc[desc_idx].next;
1256 					 vq->desc[desc_idx].len = 0;
1257 				}
1258 
1259 				/* Update used ring with desc information */
1260 				vq->used->ring[cur_idx & (vq->size - 1)].id
1261 					= vq->buf_vec[vec_idx].desc_idx;
1262 				vq->used->ring[cur_idx & (vq->size - 1)].len
1263 					= entry_len;
1264 				entry_len = 0;
1265 				cur_idx++;
1266 				entry_success++;
1267 				seg_avail = 0;
1268 				cpy_len = RTE_MIN(vb_avail, seg_avail);
1269 			}
1270 		}
1271 	}
1272 
1273 	return entry_success;
1274 }
1275 
1276 /*
1277  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
1278  * be received from the physical port or from another virtio device. A packet
1279  * count is returned to indicate the number of packets that were successfully
1280  * added to the RX queue. This function works for mergeable RX.
1281  */
1282 static inline uint32_t __attribute__((always_inline))
1283 virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts,
1284 	uint32_t count)
1285 {
1286 	struct vhost_virtqueue *vq;
1287 	uint32_t pkt_idx = 0, entry_success = 0;
1288 	uint32_t retry = 0;
1289 	uint16_t avail_idx, res_cur_idx;
1290 	uint16_t res_base_idx, res_end_idx;
1291 	uint8_t success = 0;
1292 
1293 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
1294 		dev->device_fh);
1295 	vq = dev->virtqueue[VIRTIO_RXQ];
1296 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1297 
1298 	if (count == 0)
1299 		return 0;
1300 
1301 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1302 		uint32_t secure_len = 0;
1303 		uint16_t need_cnt;
1304 		uint32_t vec_idx = 0;
1305 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;
1306 		uint16_t i, id;
1307 
1308 		do {
1309 			/*
1310 			 * As many data cores may want access to available
1311 			 * buffers, they need to be reserved.
1312 			 */
1313 			res_base_idx = vq->last_used_idx_res;
1314 			res_cur_idx = res_base_idx;
1315 
1316 			do {
1317 				avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1318 				if (unlikely(res_cur_idx == avail_idx)) {
1319 					/*
1320 					 * If retry is enabled and the queue is
1321 					 * full then we wait and retry to avoid
1322 					 * packet loss.
1323 					 */
1324 					if (enable_retry) {
1325 						uint8_t cont = 0;
1326 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1327 							rte_delay_us(burst_rx_delay_time);
1328 							avail_idx =
1329 								*((volatile uint16_t *)&vq->avail->idx);
1330 							if (likely(res_cur_idx != avail_idx)) {
1331 								cont = 1;
1332 								break;
1333 							}
1334 						}
1335 						if (cont == 1)
1336 							continue;
1337 					}
1338 
1339 					LOG_DEBUG(VHOST_DATA,
1340 						"(%"PRIu64") Failed "
1341 						"to get enough desc from "
1342 						"vring\n",
1343 						dev->device_fh);
1344 					return pkt_idx;
1345 				} else {
1346 					uint16_t wrapped_idx =
1347 						(res_cur_idx) & (vq->size - 1);
1348 					uint32_t idx =
1349 						vq->avail->ring[wrapped_idx];
1350 					uint8_t next_desc;
1351 
1352 					do {
1353 						next_desc = 0;
1354 						secure_len += vq->desc[idx].len;
1355 						if (vq->desc[idx].flags &
1356 							VRING_DESC_F_NEXT) {
1357 							idx = vq->desc[idx].next;
1358 							next_desc = 1;
1359 						}
1360 					} while (next_desc);
1361 
1362 					res_cur_idx++;
1363 				}
1364 			} while (pkt_len > secure_len);
1365 
1366 			/* vq->last_used_idx_res is atomically updated. */
1367 			success = rte_atomic16_cmpset(&vq->last_used_idx_res,
1368 							res_base_idx,
1369 							res_cur_idx);
1370 		} while (success == 0);
1371 
1372 		id = res_base_idx;
1373 		need_cnt = res_cur_idx - res_base_idx;
1374 
1375 		for (i = 0; i < need_cnt; i++, id++) {
1376 			uint16_t wrapped_idx = id & (vq->size - 1);
1377 			uint32_t idx = vq->avail->ring[wrapped_idx];
1378 			uint8_t next_desc;
1379 			do {
1380 				next_desc = 0;
1381 				vq->buf_vec[vec_idx].buf_addr =
1382 					vq->desc[idx].addr;
1383 				vq->buf_vec[vec_idx].buf_len =
1384 					vq->desc[idx].len;
1385 				vq->buf_vec[vec_idx].desc_idx = idx;
1386 				vec_idx++;
1387 
1388 				if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
1389 					idx = vq->desc[idx].next;
1390 					next_desc = 1;
1391 				}
1392 			} while (next_desc);
1393 		}
1394 
1395 		res_end_idx = res_cur_idx;
1396 
1397 		entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
1398 			res_end_idx, pkts[pkt_idx]);
1399 
1400 		rte_compiler_barrier();
1401 
1402 		/*
1403 		 * Wait until it's our turn to add our buffer
1404 		 * to the used ring.
1405 		 */
1406 		while (unlikely(vq->last_used_idx != res_base_idx))
1407 			rte_pause();
1408 
1409 		*(volatile uint16_t *)&vq->used->idx += entry_success;
1410 		vq->last_used_idx = res_end_idx;
1411 
1412 		/* Kick the guest if necessary. */
1413 		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1414 			eventfd_write((int)vq->kickfd, 1);
1415 	}
1416 
1417 	return count;
1418 }
1419 
1420 /*
1421  * Compares a packet destination MAC address to a device MAC address.
1422  */
1423 static inline int __attribute__((always_inline))
1424 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
1425 {
1426 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
1427 }
1428 
1429 /*
1430  * This function learns the MAC address of the device and registers it along
1431  * with a VLAN tag in a VMDQ pool.
1432  */
1433 static int
1434 link_vmdq(struct virtio_net *dev, struct rte_mbuf *m)
1435 {
1436 	struct ether_hdr *pkt_hdr;
1437 	struct virtio_net_data_ll *dev_ll;
1438 	int i, ret;
1439 
1440 	/* Learn MAC address of guest device from packet */
1441 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1442 
1443 	dev_ll = ll_root_used;
1444 
1445 	while (dev_ll != NULL) {
1446 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->dev->mac_address)) {
1447 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
1448 			return -1;
1449 		}
1450 		dev_ll = dev_ll->next;
1451 	}
1452 
1453 	for (i = 0; i < ETHER_ADDR_LEN; i++)
1454 		dev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
1455 
1456 	/* vlan_tag currently uses the device_id. */
1457 	dev->vlan_tag = vlan_tags[dev->device_fh];
1458 
1459 	/* Print out VMDQ registration info. */
1460 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
1461 		dev->device_fh,
1462 		dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
1463 		dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
1464 		dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
1465 		dev->vlan_tag);
1466 
1467 	/* Register the MAC address. */
1468 	ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
1469 	if (ret)
1470 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
1471 					dev->device_fh);
1472 
1473 	/* Enable stripping of the vlan tag as we handle routing. */
1474 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)dev->vmdq_rx_q, 1);
1475 
1476 	/* Set device as ready for RX. */
1477 	dev->ready = DEVICE_RX;
1478 
1479 	return 0;
1480 }
1481 
1482 /*
1483  * Removes the MAC address and VLAN tag from the VMDQ. Ensures that nothing is
1484  * adding buffers to the RX queue before disabling RX on the device.
1485  */
1486 static inline void
1487 unlink_vmdq(struct virtio_net *dev)
1488 {
1489 	unsigned i = 0;
1490 	unsigned rx_count;
1491 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1492 
1493 	if (dev->ready == DEVICE_RX) {
1494 		/*clear MAC and VLAN settings*/
1495 		rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
1496 		for (i = 0; i < 6; i++)
1497 			dev->mac_address.addr_bytes[i] = 0;
1498 
1499 		dev->vlan_tag = 0;
1500 
1501 		/*Clear out the receive buffers*/
1502 		rx_count = rte_eth_rx_burst(ports[0],
1503 					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1504 
1505 		while (rx_count) {
1506 			for (i = 0; i < rx_count; i++)
1507 				rte_pktmbuf_free(pkts_burst[i]);
1508 
1509 			rx_count = rte_eth_rx_burst(ports[0],
1510 					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1511 		}
1512 
1513 		dev->ready = DEVICE_MAC_LEARNING;
1514 	}
1515 }
1516 
1517 /*
1518  * Check if the packet destination MAC address is for a local device. If so then put
1519  * the packet on that device's RX queue. If not then return.
1520  */
1521 static inline unsigned __attribute__((always_inline))
1522 virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
1523 {
1524 	struct virtio_net_data_ll *dev_ll;
1525 	struct ether_hdr *pkt_hdr;
1526 	uint64_t ret = 0;
1527 
1528 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1529 
1530 	/*get the used devices list*/
1531 	dev_ll = ll_root_used;
1532 
1533 	while (dev_ll != NULL) {
1534 		if ((dev_ll->dev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1535 				          &dev_ll->dev->mac_address)) {
1536 
1537 			/* Drop the packet if the TX packet is destined for the TX device. */
1538 			if (dev_ll->dev->device_fh == dev->device_fh) {
1539 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1540 							dev_ll->dev->device_fh);
1541 				return 0;
1542 			}
1543 
1544 
1545 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", dev_ll->dev->device_fh);
1546 
1547 			if (dev_ll->dev->remove) {
1548 				/*drop the packet if the device is marked for removal*/
1549 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
1550 			} else {
1551 				uint32_t mergeable =
1552 					dev_ll->dev->features &
1553 					(1 << VIRTIO_NET_F_MRG_RXBUF);
1554 
1555 				/*send the packet to the local virtio device*/
1556 				if (likely(mergeable == 0))
1557 					ret = virtio_dev_rx(dev_ll->dev, &m, 1);
1558 				else
1559 					ret = virtio_dev_merge_rx(dev_ll->dev,
1560 						&m, 1);
1561 
1562 				if (enable_stats) {
1563 					rte_atomic64_add(
1564 					&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
1565 					1);
1566 					rte_atomic64_add(
1567 					&dev_statistics[dev_ll->dev->device_fh].rx_atomic,
1568 					ret);
1569 					dev_statistics[dev->device_fh].tx_total++;
1570 					dev_statistics[dev->device_fh].tx += ret;
1571 				}
1572 			}
1573 
1574 			return 0;
1575 		}
1576 		dev_ll = dev_ll->next;
1577 	}
1578 
1579 	return -1;
1580 }
1581 
1582 /*
1583  * This function routes the TX packet to the correct interface. This may be a local device
1584  * or the physical port.
1585  */
1586 static inline void __attribute__((always_inline))
1587 virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1588 {
1589 	struct mbuf_table *tx_q;
1590 	struct vlan_ethhdr *vlan_hdr;
1591 	struct rte_mbuf **m_table;
1592 	struct rte_mbuf *mbuf, *prev;
1593 	unsigned len, ret, offset = 0;
1594 	const uint16_t lcore_id = rte_lcore_id();
1595 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1596 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1597 
1598 	/*check if destination is local VM*/
1599 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0))
1600 		return;
1601 
1602 	if (vm2vm_mode == VM2VM_HARDWARE) {
1603 		while (dev_ll != NULL) {
1604 			if ((dev_ll->dev->ready == DEVICE_RX)
1605 				&& ether_addr_cmp(&(pkt_hdr->d_addr),
1606 				&dev_ll->dev->mac_address)) {
1607 				/*
1608 				 * Drop the packet if the TX packet is
1609 				 * destined for the TX device.
1610 				 */
1611 				if (dev_ll->dev->device_fh == dev->device_fh) {
1612 					LOG_DEBUG(VHOST_DATA,
1613 					"(%"PRIu64") TX: Source and destination"
1614 					" MAC addresses are the same. Dropping "
1615 					"packet.\n",
1616 					dev_ll->dev->device_fh);
1617 					return;
1618 				}
1619 				offset = 4;
1620 				vlan_tag =
1621 				(uint16_t)
1622 				vlan_tags[(uint16_t)dev_ll->dev->device_fh];
1623 
1624 				LOG_DEBUG(VHOST_DATA,
1625 				"(%"PRIu64") TX: pkt to local VM device id:"
1626 				"(%"PRIu64") vlan tag: %d.\n",
1627 				dev->device_fh, dev_ll->dev->device_fh,
1628 				vlan_tag);
1629 
1630 				break;
1631 			}
1632 			dev_ll = dev_ll->next;
1633 		}
1634 	}
1635 
1636 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1637 
1638 	/*Add packet to the port tx queue*/
1639 	tx_q = &lcore_tx_queue[lcore_id];
1640 	len = tx_q->len;
1641 
1642 	/* Allocate an mbuf and populate the structure. */
1643 	mbuf = rte_pktmbuf_alloc(mbuf_pool);
1644 	if (unlikely(mbuf == NULL)) {
1645 		RTE_LOG(ERR, VHOST_DATA,
1646 			"Failed to allocate memory for mbuf.\n");
1647 		return;
1648 	}
1649 
1650 	mbuf->data_len = m->data_len + VLAN_HLEN + offset;
1651 	mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
1652 	mbuf->nb_segs = m->nb_segs;
1653 
1654 	/* Copy ethernet header to mbuf. */
1655 	rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1656 		rte_pktmbuf_mtod(m, const void *),
1657 		ETH_HLEN);
1658 
1659 
1660 	/* Set up the VLAN header. Fields are converted to network byte order with htons(). */
1661 	vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
1662 	vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1663 	vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1664 	vlan_hdr->h_vlan_TCI = htons(vlan_tag);
1665 
1666 	/* Copy the remaining packet contents to the mbuf. */
1667 	rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
1668 		(const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
1669 		(m->data_len - ETH_HLEN));
1670 
1671 	/* Copy the remaining segments for the whole packet. */
1672 	prev = mbuf;
1673 	while (m->next) {
1674 		/* Allocate an mbuf and populate the structure. */
1675 		struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1676 		if (unlikely(next_mbuf == NULL)) {
1677 			rte_pktmbuf_free(mbuf);
1678 			RTE_LOG(ERR, VHOST_DATA,
1679 				"Failed to allocate memory for mbuf.\n");
1680 			return;
1681 		}
1682 
1683 		m = m->next;
1684 		prev->next = next_mbuf;
1685 		prev = next_mbuf;
1686 		next_mbuf->data_len = m->data_len;
1687 
1688 		/* Copy data to next mbuf. */
1689 		rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
1690 			rte_pktmbuf_mtod(m, const void *), m->data_len);
1691 	}
1692 
1693 	tx_q->m_table[len] = mbuf;
1694 	len++;
1695 	if (enable_stats) {
1696 		dev_statistics[dev->device_fh].tx_total++;
1697 		dev_statistics[dev->device_fh].tx++;
1698 	}
1699 
1700 	if (unlikely(len == MAX_PKT_BURST)) {
1701 		m_table = (struct rte_mbuf **)tx_q->m_table;
1702 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1703 		/* Free any buffers not handled by TX and update the port stats. */
1704 		if (unlikely(ret < len)) {
1705 			do {
1706 				rte_pktmbuf_free(m_table[ret]);
1707 			} while (++ret < len);
1708 		}
1709 
1710 		len = 0;
1711 	}
1712 
1713 	tx_q->len = len;
1714 	return;
1715 }
1716 
1717 static inline void __attribute__((always_inline))
1718 virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
1719 {
1720 	struct rte_mbuf m;
1721 	struct vhost_virtqueue *vq;
1722 	struct vring_desc *desc;
1723 	uint64_t buff_addr = 0;
1724 	uint32_t head[MAX_PKT_BURST];
1725 	uint32_t used_idx;
1726 	uint32_t i;
1727 	uint16_t free_entries, packet_success = 0;
1728 	uint16_t avail_idx;
1729 
1730 	vq = dev->virtqueue[VIRTIO_TXQ];
1731 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1732 
1733 	/* If there are no available buffers then return. */
1734 	if (vq->last_used_idx == avail_idx)
1735 		return;
1736 
1737 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1738 
1739 	/* Prefetch available ring to retrieve head indexes. */
1740 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
1741 
	/* Get the number of free entries in the ring. */
1743 	free_entries = (avail_idx - vq->last_used_idx);
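	/*
	 * Both indexes are free running uint16_t counters, so the subtraction
	 * is valid across wrap-around; e.g. avail_idx == 2 and
	 * last_used_idx == 65534 yields 4 pending entries.
	 */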
1744 
1745 	/* Limit to MAX_PKT_BURST. */
1746 	if (free_entries > MAX_PKT_BURST)
1747 		free_entries = MAX_PKT_BURST;
1748 
1749 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries);
1750 	/* Retrieve all of the head indexes first to avoid caching issues. */
1751 	for (i = 0; i < free_entries; i++)
1752 		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
1753 
1754 	/* Prefetch descriptor index. */
1755 	rte_prefetch0(&vq->desc[head[packet_success]]);
1756 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1757 
1758 	while (packet_success < free_entries) {
1759 		desc = &vq->desc[head[packet_success]];
1760 
1761 		/* Discard first buffer as it is the virtio header */
1762 		desc = &vq->desc[desc->next];
1763 
1764 		/* Buffer address translation. */
1765 		buff_addr = gpa_to_vva(dev, desc->addr);
1766 		/* Prefetch buffer address. */
1767 		rte_prefetch0((void*)(uintptr_t)buff_addr);
1768 
1769 		used_idx = vq->last_used_idx & (vq->size - 1);
1770 
1771 		if (packet_success < (free_entries - 1)) {
1772 			/* Prefetch descriptor index. */
1773 			rte_prefetch0(&vq->desc[head[packet_success+1]]);
1774 			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
1775 		}
1776 
1777 		/* Update used index buffer information. */
1778 		vq->used->ring[used_idx].id = head[packet_success];
1779 		vq->used->ring[used_idx].len = 0;
1780 
1781 		/* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
1782 		m.data_len = desc->len;
1783 		m.pkt_len = desc->len;
1784 		m.data_off = 0;
1785 
1786 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1787 
1788 		/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1789 		if (dev->ready == DEVICE_MAC_LEARNING) {
1790 			if (dev->remove || (link_vmdq(dev, &m) == -1)) {
1791 				/*discard frame if device is scheduled for removal or a duplicate MAC address is found. */
1792 				packet_success += free_entries;
1793 				vq->last_used_idx += packet_success;
1794 				break;
1795 			}
1796 		}
1797 		virtio_tx_route(dev, &m, mbuf_pool, (uint16_t)dev->device_fh);
1798 
1799 		vq->last_used_idx++;
1800 		packet_success++;
1801 	}
1802 
1803 	rte_compiler_barrier();
1804 	vq->used->idx += packet_success;
1805 	/* Kick guest if required. */
1806 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1807 		eventfd_write((int)vq->kickfd, 1);
1808 }
1809 
1810 /* This function works for TX packets with mergeable feature enabled. */
1811 static inline void __attribute__((always_inline))
1812 virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
1813 {
1814 	struct rte_mbuf *m, *prev;
1815 	struct vhost_virtqueue *vq;
1816 	struct vring_desc *desc;
1817 	uint64_t vb_addr = 0;
1818 	uint32_t head[MAX_PKT_BURST];
1819 	uint32_t used_idx;
1820 	uint32_t i;
1821 	uint16_t free_entries, entry_success = 0;
1822 	uint16_t avail_idx;
1823 	uint32_t buf_size = MBUF_SIZE - (sizeof(struct rte_mbuf)
1824 			+ RTE_PKTMBUF_HEADROOM);
1825 
1826 	vq = dev->virtqueue[VIRTIO_TXQ];
1827 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1828 
1829 	/* If there are no available buffers then return. */
1830 	if (vq->last_used_idx == avail_idx)
1831 		return;
1832 
1833 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n",
1834 		dev->device_fh);
1835 
1836 	/* Prefetch available ring to retrieve head indexes. */
1837 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
1838 
	/* Get the number of free entries in the ring. */
1840 	free_entries = (avail_idx - vq->last_used_idx);
1841 
1842 	/* Limit to MAX_PKT_BURST. */
1843 	free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
1844 
1845 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1846 		dev->device_fh, free_entries);
1847 	/* Retrieve all of the head indexes first to avoid caching issues. */
1848 	for (i = 0; i < free_entries; i++)
1849 		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
1850 
1851 	/* Prefetch descriptor index. */
1852 	rte_prefetch0(&vq->desc[head[entry_success]]);
1853 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1854 
1855 	while (entry_success < free_entries) {
1856 		uint32_t vb_avail, vb_offset;
1857 		uint32_t seg_avail, seg_offset;
1858 		uint32_t cpy_len;
1859 		uint32_t seg_num = 0;
1860 		struct rte_mbuf *cur;
1861 		uint8_t alloc_err = 0;
1862 
1863 		desc = &vq->desc[head[entry_success]];
1864 
1865 		/* Discard first buffer as it is the virtio header */
1866 		desc = &vq->desc[desc->next];
1867 
1868 		/* Buffer address translation. */
1869 		vb_addr = gpa_to_vva(dev, desc->addr);
1870 		/* Prefetch buffer address. */
1871 		rte_prefetch0((void *)(uintptr_t)vb_addr);
1872 
1873 		used_idx = vq->last_used_idx & (vq->size - 1);
1874 
1875 		if (entry_success < (free_entries - 1)) {
1876 			/* Prefetch descriptor index. */
1877 			rte_prefetch0(&vq->desc[head[entry_success+1]]);
1878 			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
1879 		}
1880 
1881 		/* Update used index buffer information. */
1882 		vq->used->ring[used_idx].id = head[entry_success];
1883 		vq->used->ring[used_idx].len = 0;
1884 
1885 		vb_offset = 0;
1886 		vb_avail = desc->len;
1887 		seg_offset = 0;
1888 		seg_avail = buf_size;
1889 		cpy_len = RTE_MIN(vb_avail, seg_avail);
1890 
1891 		PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
1892 
1893 		/* Allocate an mbuf and populate the structure. */
1894 		m = rte_pktmbuf_alloc(mbuf_pool);
1895 		if (unlikely(m == NULL)) {
1896 			RTE_LOG(ERR, VHOST_DATA,
1897 				"Failed to allocate memory for mbuf.\n");
1898 			return;
1899 		}
1900 
1901 		seg_num++;
1902 		cur = m;
1903 		prev = m;
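		/*
		 * Copy loop: vb_avail/vb_offset walk the current virtio
		 * descriptor while seg_avail/seg_offset walk the current mbuf
		 * segment. Each pass copies min(vb_avail, seg_avail) bytes,
		 * and a new mbuf segment or the next descriptor is fetched
		 * when either side runs out.
		 */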
1904 		while (cpy_len != 0) {
1905 			rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset),
1906 				(void *)((uintptr_t)(vb_addr + vb_offset)),
1907 				cpy_len);
1908 
1909 			seg_offset += cpy_len;
1910 			vb_offset += cpy_len;
1911 			vb_avail -= cpy_len;
1912 			seg_avail -= cpy_len;
1913 
1914 			if (vb_avail != 0) {
1915 				/*
1916 				 * The segment reachs to its end,
1917 				 * while the virtio buffer in TX vring has
1918 				 * more data to be copied.
1919 				 */
1920 				cur->data_len = seg_offset;
1921 				m->pkt_len += seg_offset;
1922 				/* Allocate mbuf and populate the structure. */
1923 				cur = rte_pktmbuf_alloc(mbuf_pool);
1924 				if (unlikely(cur == NULL)) {
1925 					RTE_LOG(ERR, VHOST_DATA, "Failed to "
1926 						"allocate memory for mbuf.\n");
1927 					rte_pktmbuf_free(m);
1928 					alloc_err = 1;
1929 					break;
1930 				}
1931 
1932 				seg_num++;
1933 				prev->next = cur;
1934 				prev = cur;
1935 				seg_offset = 0;
1936 				seg_avail = buf_size;
1937 			} else {
1938 				if (desc->flags & VRING_DESC_F_NEXT) {
1939 					/*
					 * There are more virtio buffers in the
					 * same vring entry that need to be
					 * copied.
1942 					 */
1943 					if (seg_avail == 0) {
1944 						/*
						 * The current segment has no
						 * room to accommodate more
						 * data.
1948 						 */
1949 						cur->data_len = seg_offset;
1950 						m->pkt_len += seg_offset;
1951 						/*
1952 						 * Allocate an mbuf and
1953 						 * populate the structure.
1954 						 */
1955 						cur = rte_pktmbuf_alloc(mbuf_pool);
1956 						if (unlikely(cur == NULL)) {
1957 							RTE_LOG(ERR,
1958 								VHOST_DATA,
1959 								"Failed to "
1960 								"allocate memory "
1961 								"for mbuf\n");
1962 							rte_pktmbuf_free(m);
1963 							alloc_err = 1;
1964 							break;
1965 						}
1966 						seg_num++;
1967 						prev->next = cur;
1968 						prev = cur;
1969 						seg_offset = 0;
1970 						seg_avail = buf_size;
1971 					}
1972 
1973 					desc = &vq->desc[desc->next];
1974 
1975 					/* Buffer address translation. */
1976 					vb_addr = gpa_to_vva(dev, desc->addr);
1977 					/* Prefetch buffer address. */
1978 					rte_prefetch0((void *)(uintptr_t)vb_addr);
1979 					vb_offset = 0;
1980 					vb_avail = desc->len;
1981 
1982 					PRINT_PACKET(dev, (uintptr_t)vb_addr,
1983 						desc->len, 0);
1984 				} else {
1985 					/* The whole packet completes. */
1986 					cur->data_len = seg_offset;
1987 					m->pkt_len += seg_offset;
1988 					vb_avail = 0;
1989 				}
1990 			}
1991 
1992 			cpy_len = RTE_MIN(vb_avail, seg_avail);
1993 		}
1994 
1995 		if (unlikely(alloc_err == 1))
1996 			break;
1997 
1998 		m->nb_segs = seg_num;
1999 
2000 		/*
2001 		 * If this is the first received packet we need to learn
2002 		 * the MAC and setup VMDQ
2003 		 */
2004 		if (dev->ready == DEVICE_MAC_LEARNING) {
2005 			if (dev->remove || (link_vmdq(dev, m) == -1)) {
2006 				/*
2007 				 * Discard frame if device is scheduled for
2008 				 * removal or a duplicate MAC address is found.
2009 				 */
2010 				entry_success = free_entries;
2011 				vq->last_used_idx += entry_success;
2012 				rte_pktmbuf_free(m);
2013 				break;
2014 			}
2015 		}
2016 
2017 		virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev->device_fh);
2018 		vq->last_used_idx++;
2019 		entry_success++;
2020 		rte_pktmbuf_free(m);
2021 	}
2022 
2023 	rte_compiler_barrier();
2024 	vq->used->idx += entry_success;
2025 	/* Kick guest if required. */
2026 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2027 		eventfd_write((int)vq->kickfd, 1);
2028 
2029 }
2030 
2031 /*
2032  * This function is called by each data core. It handles all RX/TX registered with the
2033  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
2034  * with all devices in the main linked list.
2035  */
2036 static int
2037 switch_worker(__attribute__((unused)) void *arg)
2038 {
2039 	struct rte_mempool *mbuf_pool = arg;
2040 	struct virtio_net *dev = NULL;
2041 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2042 	struct virtio_net_data_ll *dev_ll;
2043 	struct mbuf_table *tx_q;
2044 	volatile struct lcore_ll_info *lcore_ll;
2045 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
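	/*
	 * drain_tsc is the TX drain period expressed in TSC cycles. For
	 * example (illustrative values only), a 2 GHz TSC and a 100 us drain
	 * period give roughly 200,000 cycles between forced flushes.
	 */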
2046 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2047 	unsigned ret, i;
2048 	const uint16_t lcore_id = rte_lcore_id();
2049 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
2050 	uint16_t rx_count = 0;
2051 	uint32_t mergeable = 0;
2052 
2053 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2054 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2055 	prev_tsc = 0;
2056 
2057 	tx_q = &lcore_tx_queue[lcore_id];
2058 	for (i = 0; i < num_cores; i ++) {
2059 		if (lcore_ids[i] == lcore_id) {
2060 			tx_q->txq_id = i;
2061 			break;
2062 		}
2063 	}
2064 
2065 	while(1) {
2066 		cur_tsc = rte_rdtsc();
2067 		/*
2068 		 * TX burst queue drain
2069 		 */
2070 		diff_tsc = cur_tsc - prev_tsc;
2071 		if (unlikely(diff_tsc > drain_tsc)) {
2072 
2073 			if (tx_q->len) {
2074 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
2075 
2076 				/*Tx any packets in the queue*/
2077 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
2078 									   (struct rte_mbuf **)tx_q->m_table,
2079 									   (uint16_t)tx_q->len);
2080 				if (unlikely(ret < tx_q->len)) {
2081 					do {
2082 						rte_pktmbuf_free(tx_q->m_table[ret]);
2083 					} while (++ret < tx_q->len);
2084 				}
2085 
2086 				tx_q->len = 0;
2087 			}
2088 
2089 			prev_tsc = cur_tsc;
2090 
2091 		}
2092 
2093 		rte_prefetch0(lcore_ll->ll_root_used);
2094 		/*
2095 		 * Inform the configuration core that we have exited the linked list and that no devices are
2096 		 * in use if requested.
2097 		 */
2098 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2099 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2100 
2101 		/*
2102 		 * Process devices
2103 		 */
2104 		dev_ll = lcore_ll->ll_root_used;
2105 
2106 		while (dev_ll != NULL) {
2107 			/*get virtio device ID*/
2108 			dev = dev_ll->dev;
2109 			mergeable =
2110 				dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
2111 
2112 			if (dev->remove) {
2113 				dev_ll = dev_ll->next;
2114 				unlink_vmdq(dev);
2115 				dev->ready = DEVICE_SAFE_REMOVE;
2116 				continue;
2117 			}
2118 			if (likely(dev->ready == DEVICE_RX)) {
2119 				/*Handle guest RX*/
2120 				rx_count = rte_eth_rx_burst(ports[0],
2121 					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
2122 
2123 				if (rx_count) {
2124 					if (likely(mergeable == 0))
2125 						ret_count =
2126 							virtio_dev_rx(dev,
2127 							pkts_burst, rx_count);
2128 					else
2129 						ret_count =
2130 							virtio_dev_merge_rx(dev,
2131 							pkts_burst, rx_count);
2132 
2133 					if (enable_stats) {
2134 						rte_atomic64_add(
2135 						&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
2136 						rx_count);
2137 						rte_atomic64_add(
2138 						&dev_statistics[dev_ll->dev->device_fh].rx_atomic, ret_count);
2139 					}
2140 					while (likely(rx_count)) {
2141 						rx_count--;
2142 						rte_pktmbuf_free(pkts_burst[rx_count]);
2143 					}
2144 
2145 				}
2146 			}
2147 
2148 			if (!dev->remove) {
2149 				/*Handle guest TX*/
2150 				if (likely(mergeable == 0))
2151 					virtio_dev_tx(dev, mbuf_pool);
2152 				else
2153 					virtio_dev_merge_tx(dev, mbuf_pool);
2154 			}
2155 
2156 			/*move to the next device in the list*/
2157 			dev_ll = dev_ll->next;
2158 		}
2159 	}
2160 
2161 	return 0;
2162 }
2163 
2164 /*
 * This function gets the number of available ring entries for zero copy RX.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
2168  */
2169 static inline uint32_t __attribute__((always_inline))
2170 get_available_ring_num_zcp(struct virtio_net *dev)
2171 {
2172 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
2173 	uint16_t avail_idx;
2174 
2175 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2176 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
2177 }
2178 
2179 /*
 * This function gets available ring indexes for zero copy RX.
 * It retries up to 'burst_rx_retry_num' times until it gets enough indexes.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
2184  */
2185 static inline uint32_t __attribute__((always_inline))
2186 get_available_ring_index_zcp(struct virtio_net *dev,
2187 	uint16_t *res_base_idx, uint32_t count)
2188 {
2189 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
2190 	uint16_t avail_idx;
2191 	uint32_t retry = 0;
2192 	uint16_t free_entries;
2193 
2194 	*res_base_idx = vq->last_used_idx_res;
2195 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2196 	free_entries = (avail_idx - *res_base_idx);
2197 
2198 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
2199 			"avail idx: %d, "
2200 			"res base idx:%d, free entries:%d\n",
2201 			dev->device_fh, avail_idx, *res_base_idx,
2202 			free_entries);
2203 
2204 	/*
2205 	 * If retry is enabled and the queue is full then we wait
2206 	 * and retry to avoid packet loss.
2207 	 */
2208 	if (enable_retry && unlikely(count > free_entries)) {
2209 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
2210 			rte_delay_us(burst_rx_delay_time);
2211 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2212 			free_entries = (avail_idx - *res_base_idx);
2213 			if (count <= free_entries)
2214 				break;
2215 		}
2216 	}
2217 
2218 	/*check that we have enough buffers*/
2219 	if (unlikely(count > free_entries))
2220 		count = free_entries;
2221 
2222 	if (unlikely(count == 0)) {
2223 		LOG_DEBUG(VHOST_DATA,
2224 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
2225 			"avail idx: %d, res base idx:%d, free entries:%d\n",
2226 			dev->device_fh, avail_idx,
2227 			*res_base_idx, free_entries);
2228 		return 0;
2229 	}
2230 
2231 	vq->last_used_idx_res = *res_base_idx + count;
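	/*
	 * Reserve the index range [*res_base_idx, *res_base_idx + count) so
	 * that the next call hands out fresh ring entries.
	 */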
2232 
2233 	return count;
2234 }
2235 
2236 /*
 * This function puts a descriptor back onto the used list.
2238  */
2239 static inline void __attribute__((always_inline))
2240 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
2241 {
2242 	uint16_t res_cur_idx = vq->last_used_idx;
2243 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
2244 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
2245 	rte_compiler_barrier();
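	/*
	 * The barrier ensures the used ring entry above is written before the
	 * used index below becomes visible to the guest.
	 */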
2246 	*(volatile uint16_t *)&vq->used->idx += 1;
2247 	vq->last_used_idx += 1;
2248 
2249 	/* Kick the guest if necessary. */
2250 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2251 		eventfd_write((int)vq->kickfd, 1);
2252 }
2253 
2254 /*
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, and then attaches them together. The
 * offsets for buff_addr and phys_addr must be adjusted according to the PMD
 * implementation, otherwise the frame data may be put at the wrong location
 * in the mbuf.
2259  */
2260 static inline void __attribute__((always_inline))
2261 attach_rxmbuf_zcp(struct virtio_net *dev)
2262 {
2263 	uint16_t res_base_idx, desc_idx;
2264 	uint64_t buff_addr, phys_addr;
2265 	struct vhost_virtqueue *vq;
2266 	struct vring_desc *desc;
2267 	struct rte_mbuf *mbuf = NULL;
2268 	struct vpool *vpool;
2269 	hpa_type addr_type;
2270 
2271 	vpool = &vpool_array[dev->vmdq_rx_q];
2272 	vq = dev->virtqueue[VIRTIO_RXQ];
2273 
2274 	do {
2275 		if (unlikely(get_available_ring_index_zcp(dev, &res_base_idx,
2276 				1) != 1))
2277 			return;
2278 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
2279 
2280 		desc = &vq->desc[desc_idx];
2281 		if (desc->flags & VRING_DESC_F_NEXT) {
2282 			desc = &vq->desc[desc->next];
2283 			buff_addr = gpa_to_vva(dev, desc->addr);
2284 			phys_addr = gpa_to_hpa(dev, desc->addr, desc->len,
2285 					&addr_type);
2286 		} else {
2287 			buff_addr = gpa_to_vva(dev,
2288 					desc->addr + vq->vhost_hlen);
2289 			phys_addr = gpa_to_hpa(dev,
2290 					desc->addr + vq->vhost_hlen,
2291 					desc->len, &addr_type);
2292 		}
2293 
2294 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2295 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
2296 				" address found when attaching RX frame buffer"
2297 				" address!\n", dev->device_fh);
2298 			put_desc_to_used_list_zcp(vq, desc_idx);
2299 			continue;
2300 		}
2301 
2302 		/*
2303 		 * Check if the frame buffer address from guest crosses
2304 		 * sub-region or not.
2305 		 */
2306 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2307 			RTE_LOG(ERR, VHOST_DATA,
2308 				"(%"PRIu64") Frame buffer address cross "
2309 				"sub-regioin found when attaching RX frame "
2310 				"buffer address!\n",
2311 				dev->device_fh);
2312 			put_desc_to_used_list_zcp(vq, desc_idx);
2313 			continue;
2314 		}
2315 	} while (unlikely(phys_addr == 0));
2316 
2317 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
2318 	if (unlikely(mbuf == NULL)) {
2319 		LOG_DEBUG(VHOST_DATA,
2320 			"(%"PRIu64") in attach_rxmbuf_zcp: "
2321 			"ring_sc_dequeue fail.\n",
2322 			dev->device_fh);
2323 		put_desc_to_used_list_zcp(vq, desc_idx);
2324 		return;
2325 	}
2326 
2327 	if (unlikely(vpool->buf_size > desc->len)) {
2328 		LOG_DEBUG(VHOST_DATA,
2329 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
2330 			"length(%d) of descriptor idx: %d less than room "
2331 			"size required: %d\n",
2332 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
2333 		put_desc_to_used_list_zcp(vq, desc_idx);
2334 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
2335 		return;
2336 	}
2337 
2338 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
2339 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
2340 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
2341 	mbuf->data_len = desc->len;
2342 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
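	/*
	 * The mbuf now aliases the guest buffer (backed off by
	 * RTE_PKTMBUF_HEADROOM so data_off lands on the frame data), and the
	 * guest descriptor index is stashed in the headroom so it can be
	 * written back to the used ring later.
	 */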
2343 
2344 	LOG_DEBUG(VHOST_DATA,
2345 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
2346 		"descriptor idx:%d\n",
2347 		dev->device_fh, res_base_idx, desc_idx);
2348 
2349 	__rte_mbuf_raw_free(mbuf);
2350 
2351 	return;
2352 }
2353 
2354 /*
 * Detach an attached packet mbuf -
2356  *  - restore original mbuf address and length values.
2357  *  - reset pktmbuf data and data_len to their default values.
2358  *  All other fields of the given packet mbuf will be left intact.
2359  *
2360  * @param m
2361  *   The attached packet mbuf.
2362  */
2363 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
2364 {
2365 	const struct rte_mempool *mp = m->pool;
2366 	void *buf = RTE_MBUF_TO_BADDR(m);
2367 	uint32_t buf_ofs;
2368 	uint32_t buf_len = mp->elt_size - sizeof(*m);
2369 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
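	/*
	 * The data buffer sits immediately after the rte_mbuf header inside
	 * the mempool element, hence buf_len = elt_size - sizeof(*m) and the
	 * physical address offset of sizeof(*m).
	 */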
2370 
2371 	m->buf_addr = buf;
2372 	m->buf_len = (uint16_t)buf_len;
2373 
2374 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
2375 			RTE_PKTMBUF_HEADROOM : m->buf_len;
2376 	m->data_off = buf_ofs;
2377 
2378 	m->data_len = 0;
2379 }
2380 
2381 /*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them back into vpool->ring.
 * It also updates the used index and kicks the guest if necessary.
2385  */
2386 static inline uint32_t __attribute__((always_inline))
2387 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
2388 {
2389 	struct rte_mbuf *mbuf;
2390 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
2391 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
2392 	uint32_t index = 0;
2393 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
2394 
2395 	LOG_DEBUG(VHOST_DATA,
2396 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
2397 		"clean is: %d\n",
2398 		dev->device_fh, mbuf_count);
2399 	LOG_DEBUG(VHOST_DATA,
2400 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
2401 		"clean  is : %d\n",
2402 		dev->device_fh, rte_ring_count(vpool->ring));
2403 
2404 	for (index = 0; index < mbuf_count; index++) {
2405 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
2406 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
2407 			pktmbuf_detach_zcp(mbuf);
2408 		rte_ring_sp_enqueue(vpool->ring, mbuf);
2409 
2410 		/* Update used index buffer information. */
2411 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
2412 		vq->used->ring[used_idx].len = 0;
2413 
2414 		used_idx = (used_idx + 1) & (vq->size - 1);
2415 	}
2416 
2417 	LOG_DEBUG(VHOST_DATA,
2418 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
2419 		"clean is: %d\n",
2420 		dev->device_fh, rte_mempool_count(vpool->pool));
2421 	LOG_DEBUG(VHOST_DATA,
2422 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
2423 		"clean  is : %d\n",
2424 		dev->device_fh, rte_ring_count(vpool->ring));
2425 	LOG_DEBUG(VHOST_DATA,
2426 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
2427 		"vq->last_used_idx:%d\n",
2428 		dev->device_fh, vq->last_used_idx);
2429 
2430 	vq->last_used_idx += mbuf_count;
2431 
2432 	LOG_DEBUG(VHOST_DATA,
2433 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
2434 		"vq->last_used_idx:%d\n",
2435 		dev->device_fh, vq->last_used_idx);
2436 
2437 	rte_compiler_barrier();
2438 
2439 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
2440 
2441 	/* Kick guest if required. */
2442 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2443 		eventfd_write((int)vq->kickfd, 1);
2444 
2445 	return 0;
2446 }
2447 
2448 /*
 * This function is called when a virtio device is destroyed.
 * It fetches mbufs from vpool->pool, detaches them and puts them back into
 * vpool->ring.
2451  */
2452 static void mbuf_destroy_zcp(struct vpool *vpool)
2453 {
2454 	struct rte_mbuf *mbuf = NULL;
2455 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
2456 
2457 	LOG_DEBUG(VHOST_CONFIG,
2458 		"in mbuf_destroy_zcp: mbuf count in mempool before "
2459 		"mbuf_destroy_zcp is: %d\n",
2460 		mbuf_count);
2461 	LOG_DEBUG(VHOST_CONFIG,
2462 		"in mbuf_destroy_zcp: mbuf count in  ring before "
2463 		"mbuf_destroy_zcp  is : %d\n",
2464 		rte_ring_count(vpool->ring));
2465 
2466 	for (index = 0; index < mbuf_count; index++) {
2467 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
2468 		if (likely(mbuf != NULL)) {
2469 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
2470 				pktmbuf_detach_zcp(mbuf);
2471 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
2472 		}
2473 	}
2474 
2475 	LOG_DEBUG(VHOST_CONFIG,
2476 		"in mbuf_destroy_zcp: mbuf count in mempool after "
2477 		"mbuf_destroy_zcp is: %d\n",
2478 		rte_mempool_count(vpool->pool));
2479 	LOG_DEBUG(VHOST_CONFIG,
2480 		"in mbuf_destroy_zcp: mbuf count in ring after "
2481 		"mbuf_destroy_zcp is : %d\n",
2482 		rte_ring_count(vpool->ring));
2483 }
2484 
2485 /*
 * This function updates the used ring with the received descriptor
 * information and copies the virtio header for zero copy RX packets.
2487  */
2488 static inline uint32_t __attribute__((always_inline))
2489 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
2490 	uint32_t count)
2491 {
2492 	struct vhost_virtqueue *vq;
2493 	struct vring_desc *desc;
2494 	struct rte_mbuf *buff;
2495 	/* The virtio_hdr is initialised to 0. */
2496 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
2497 		= {{0, 0, 0, 0, 0, 0}, 0};
2498 	uint64_t buff_hdr_addr = 0;
2499 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
2500 	uint32_t head_idx, packet_success = 0;
2501 	uint16_t res_cur_idx;
2502 
2503 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
2504 
2505 	if (count == 0)
2506 		return 0;
2507 
2508 	vq = dev->virtqueue[VIRTIO_RXQ];
2509 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
2510 
2511 	res_cur_idx = vq->last_used_idx;
2512 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
2513 		dev->device_fh, res_cur_idx, res_cur_idx + count);
2514 
2515 	/* Retrieve all of the head indexes first to avoid caching issues. */
2516 	for (head_idx = 0; head_idx < count; head_idx++)
2517 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
2518 
2519 	/*Prefetch descriptor index. */
2520 	rte_prefetch0(&vq->desc[head[packet_success]]);
2521 
2522 	while (packet_success != count) {
2523 		/* Get descriptor from available ring */
2524 		desc = &vq->desc[head[packet_success]];
2525 
2526 		buff = pkts[packet_success];
2527 		LOG_DEBUG(VHOST_DATA,
2528 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
2529 			"pkt[%d] descriptor idx: %d\n",
2530 			dev->device_fh, packet_success,
2531 			MBUF_HEADROOM_UINT32(buff));
2532 
2533 		PRINT_PACKET(dev,
2534 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
2535 			+ RTE_PKTMBUF_HEADROOM),
2536 			rte_pktmbuf_data_len(buff), 0);
2537 
2538 		/* Buffer address translation for virtio header. */
2539 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
2540 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
2541 
2542 		/*
2543 		 * If the descriptors are chained the header and data are
2544 		 * placed in separate buffers.
2545 		 */
2546 		if (desc->flags & VRING_DESC_F_NEXT) {
2547 			desc->len = vq->vhost_hlen;
2548 			desc = &vq->desc[desc->next];
2549 			desc->len = rte_pktmbuf_data_len(buff);
2550 		} else {
2551 			desc->len = packet_len;
2552 		}
2553 
2554 		/* Update used ring with desc information */
2555 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
2556 			= head[packet_success];
2557 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
2558 			= packet_len;
2559 		res_cur_idx++;
2560 		packet_success++;
2561 
2562 		/* A header is required per buffer. */
2563 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
2564 			(const void *)&virtio_hdr, vq->vhost_hlen);
2565 
2566 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
2567 
2568 		if (likely(packet_success < count)) {
2569 			/* Prefetch descriptor index. */
2570 			rte_prefetch0(&vq->desc[head[packet_success]]);
2571 		}
2572 	}
2573 
2574 	rte_compiler_barrier();
2575 
2576 	LOG_DEBUG(VHOST_DATA,
2577 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
2578 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
2579 		dev->device_fh, vq->last_used_idx, vq->used->idx);
2580 
2581 	*(volatile uint16_t *)&vq->used->idx += count;
2582 	vq->last_used_idx += count;
2583 
2584 	LOG_DEBUG(VHOST_DATA,
2585 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
2586 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
2587 		dev->device_fh, vq->last_used_idx, vq->used->idx);
2588 
2589 	/* Kick the guest if necessary. */
2590 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2591 		eventfd_write((int)vq->kickfd, 1);
2592 
2593 	return count;
2594 }
2595 
2596 /*
2597  * This function routes the TX packet to the correct interface.
2598  * This may be a local device or the physical port.
2599  */
2600 static inline void __attribute__((always_inline))
2601 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
2602 	uint32_t desc_idx, uint8_t need_copy)
2603 {
2604 	struct mbuf_table *tx_q;
2605 	struct rte_mbuf **m_table;
2606 	struct rte_mbuf *mbuf = NULL;
2607 	unsigned len, ret, offset = 0;
2608 	struct vpool *vpool;
2609 	struct virtio_net_data_ll *dev_ll = ll_root_used;
2610 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
2611 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
2612 
2613 	/*Add packet to the port tx queue*/
2614 	tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2615 	len = tx_q->len;
2616 
2617 	/* Allocate an mbuf and populate the structure. */
2618 	vpool = &vpool_array[MAX_QUEUES + (uint16_t)dev->vmdq_rx_q];
2619 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
2620 	if (unlikely(mbuf == NULL)) {
2621 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
2622 		RTE_LOG(ERR, VHOST_DATA,
2623 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
2624 			dev->device_fh);
2625 		put_desc_to_used_list_zcp(vq, desc_idx);
2626 		return;
2627 	}
2628 
2629 	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Avoid using a VLAN tag from any VM for an external packet,
		 * e.g. vlan_tags[dev->device_fh]; otherwise it conflicts with
		 * pool selection: the MAC address marks it as an external
		 * packet that should go out to the network, while the VLAN tag
		 * marks it as a VM-to-VM packet that should be forwarded to
		 * another VM. The hardware cannot resolve this ambiguity and
		 * the packet would be lost.
2636 		 */
2637 		vlan_tag = external_pkt_default_vlan_tag;
2638 		while (dev_ll != NULL) {
2639 			if (likely(dev_ll->dev->ready == DEVICE_RX) &&
2640 				ether_addr_cmp(&(pkt_hdr->d_addr),
2641 				&dev_ll->dev->mac_address)) {
2642 
2643 				/*
2644 				 * Drop the packet if the TX packet is destined
2645 				 * for the TX device.
2646 				 */
2647 				if (unlikely(dev_ll->dev->device_fh
2648 					== dev->device_fh)) {
2649 					LOG_DEBUG(VHOST_DATA,
2650 					"(%"PRIu64") TX: Source and destination"
2651 					"MAC addresses are the same. Dropping "
2652 					"packet.\n",
2653 					dev_ll->dev->device_fh);
2654 					MBUF_HEADROOM_UINT32(mbuf)
2655 						= (uint32_t)desc_idx;
2656 					__rte_mbuf_raw_free(mbuf);
2657 					return;
2658 				}
2659 
2660 				/*
				 * Add 4 bytes to the packet length to account
				 * for the HW VLAN strip when the packet is L2
				 * switched back.
2663 				 */
2664 				offset = 4;
2665 				vlan_tag =
2666 				(uint16_t)
2667 				vlan_tags[(uint16_t)dev_ll->dev->device_fh];
2668 
2669 				LOG_DEBUG(VHOST_DATA,
2670 				"(%"PRIu64") TX: pkt to local VM device id:"
2671 				"(%"PRIu64") vlan tag: %d.\n",
2672 				dev->device_fh, dev_ll->dev->device_fh,
2673 				vlan_tag);
2674 
2675 				break;
2676 			}
2677 			dev_ll = dev_ll->next;
2678 		}
2679 	}
2680 
2681 	mbuf->nb_segs = m->nb_segs;
2682 	mbuf->next = m->next;
2683 	mbuf->data_len = m->data_len + offset;
2684 	mbuf->pkt_len = mbuf->data_len;
2685 	if (unlikely(need_copy)) {
2686 		/* Copy the packet contents to the mbuf. */
2687 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
2688 			rte_pktmbuf_mtod(m, void *),
2689 			m->data_len);
2690 	} else {
2691 		mbuf->data_off = m->data_off;
2692 		mbuf->buf_physaddr = m->buf_physaddr;
2693 		mbuf->buf_addr = m->buf_addr;
2694 	}
2695 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
2696 	mbuf->vlan_tci = vlan_tag;
2697 	mbuf->l2_len = sizeof(struct ether_hdr);
2698 	mbuf->l3_len = sizeof(struct ipv4_hdr);
2699 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
2700 
2701 	tx_q->m_table[len] = mbuf;
2702 	len++;
2703 
2704 	LOG_DEBUG(VHOST_DATA,
2705 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
2706 		dev->device_fh,
2707 		mbuf->nb_segs,
2708 		(mbuf->next == NULL) ? "null" : "non-null");
2709 
2710 	if (enable_stats) {
2711 		dev_statistics[dev->device_fh].tx_total++;
2712 		dev_statistics[dev->device_fh].tx++;
2713 	}
2714 
2715 	if (unlikely(len == MAX_PKT_BURST)) {
2716 		m_table = (struct rte_mbuf **)tx_q->m_table;
2717 		ret = rte_eth_tx_burst(ports[0],
2718 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
2719 
2720 		/*
2721 		 * Free any buffers not handled by TX and update
2722 		 * the port stats.
2723 		 */
2724 		if (unlikely(ret < len)) {
2725 			do {
2726 				rte_pktmbuf_free(m_table[ret]);
2727 			} while (++ret < len);
2728 		}
2729 
2730 		len = 0;
2731 		txmbuf_clean_zcp(dev, vpool);
2732 	}
2733 
2734 	tx_q->len = len;
2735 
2736 	return;
2737 }
2738 
2739 /*
 * This function transmits all available packets in the virtio TX queue for
 * one virtio-net device. For the first packet it learns the MAC address and
 * sets up VMDQ.
2743  */
2744 static inline void __attribute__((always_inline))
2745 virtio_dev_tx_zcp(struct virtio_net *dev)
2746 {
2747 	struct rte_mbuf m;
2748 	struct vhost_virtqueue *vq;
2749 	struct vring_desc *desc;
2750 	uint64_t buff_addr = 0, phys_addr;
2751 	uint32_t head[MAX_PKT_BURST];
2752 	uint32_t i;
2753 	uint16_t free_entries, packet_success = 0;
2754 	uint16_t avail_idx;
2755 	uint8_t need_copy = 0;
2756 	hpa_type addr_type;
2757 
2758 	vq = dev->virtqueue[VIRTIO_TXQ];
2759 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
2760 
2761 	/* If there are no available buffers then return. */
2762 	if (vq->last_used_idx_res == avail_idx)
2763 		return;
2764 
2765 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
2766 
2767 	/* Prefetch available ring to retrieve head indexes. */
2768 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
2769 
2770 	/* Get the number of free entries in the ring */
2771 	free_entries = (avail_idx - vq->last_used_idx_res);
2772 
2773 	/* Limit to MAX_PKT_BURST. */
2774 	free_entries
2775 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
2776 
2777 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
2778 		dev->device_fh, free_entries);
2779 
2780 	/* Retrieve all of the head indexes first to avoid caching issues. */
2781 	for (i = 0; i < free_entries; i++)
2782 		head[i]
2783 			= vq->avail->ring[(vq->last_used_idx_res + i)
2784 			& (vq->size - 1)];
2785 
2786 	vq->last_used_idx_res += free_entries;
2787 
2788 	/* Prefetch descriptor index. */
2789 	rte_prefetch0(&vq->desc[head[packet_success]]);
2790 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2791 
2792 	while (packet_success < free_entries) {
2793 		desc = &vq->desc[head[packet_success]];
2794 
2795 		/* Discard first buffer as it is the virtio header */
2796 		desc = &vq->desc[desc->next];
2797 
2798 		/* Buffer address translation. */
2799 		buff_addr = gpa_to_vva(dev, desc->addr);
2800 		phys_addr = gpa_to_hpa(dev, desc->addr, desc->len, &addr_type);
2801 
2802 		if (likely(packet_success < (free_entries - 1)))
2803 			/* Prefetch descriptor index. */
2804 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2805 
2806 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2807 			RTE_LOG(ERR, VHOST_DATA,
2808 				"(%"PRIu64") Invalid frame buffer address found"
2809 				"when TX packets!\n",
2810 				dev->device_fh);
2811 			packet_success++;
2812 			continue;
2813 		}
2814 
2815 		/* Prefetch buffer address. */
2816 		rte_prefetch0((void *)(uintptr_t)buff_addr);
2817 
2818 		/*
2819 		 * Setup dummy mbuf. This is copied to a real mbuf if
2820 		 * transmitted out the physical port.
2821 		 */
2822 		m.data_len = desc->len;
2823 		m.nb_segs = 1;
2824 		m.next = NULL;
2825 		m.data_off = 0;
2826 		m.buf_addr = (void *)(uintptr_t)buff_addr;
2827 		m.buf_physaddr = phys_addr;
2828 
2829 		/*
2830 		 * Check if the frame buffer address from guest crosses
2831 		 * sub-region or not.
2832 		 */
2833 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2834 			RTE_LOG(ERR, VHOST_DATA,
2835 				"(%"PRIu64") Frame buffer address cross "
2836 				"sub-regioin found when attaching TX frame "
2837 				"buffer address!\n",
2838 				dev->device_fh);
2839 			need_copy = 1;
2840 		} else
2841 			need_copy = 0;
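		/*
		 * A guest buffer crossing a host physical sub-region cannot be
		 * described by a single buf_physaddr, so it falls back to a
		 * copy in virtio_tx_route_zcp().
		 */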
2842 
2843 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2844 
2845 		/*
2846 		 * If this is the first received packet we need to learn
2847 		 * the MAC and setup VMDQ
2848 		 */
2849 		if (unlikely(dev->ready == DEVICE_MAC_LEARNING)) {
2850 			if (dev->remove || (link_vmdq(dev, &m) == -1)) {
2851 				/*
2852 				 * Discard frame if device is scheduled for
2853 				 * removal or a duplicate MAC address is found.
2854 				 */
2855 				packet_success += free_entries;
2856 				vq->last_used_idx += packet_success;
2857 				break;
2858 			}
2859 		}
2860 
2861 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2862 		packet_success++;
2863 	}
2864 }
2865 
2866 /*
2867  * This function is called by each data core. It handles all RX/TX registered
2868  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2869  * addresses are compared with all devices in the main linked list.
2870  */
2871 static int
2872 switch_worker_zcp(__attribute__((unused)) void *arg)
2873 {
2874 	struct virtio_net *dev = NULL;
2875 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2876 	struct virtio_net_data_ll *dev_ll;
2877 	struct mbuf_table *tx_q;
2878 	volatile struct lcore_ll_info *lcore_ll;
2879 	const uint64_t drain_tsc
2880 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2881 		* BURST_TX_DRAIN_US;
2882 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2883 	unsigned ret;
2884 	const uint16_t lcore_id = rte_lcore_id();
2885 	uint16_t count_in_ring, rx_count = 0;
2886 
2887 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2888 
2889 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2890 	prev_tsc = 0;
2891 
2892 	while (1) {
2893 		cur_tsc = rte_rdtsc();
2894 
2895 		/* TX burst queue drain */
2896 		diff_tsc = cur_tsc - prev_tsc;
2897 		if (unlikely(diff_tsc > drain_tsc)) {
2898 			/*
			 * Get mbufs from vpool.pool, detach them and
			 * put them back into vpool.ring.
2901 			 */
2902 			dev_ll = lcore_ll->ll_root_used;
2903 			while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2904 				/* Get virtio device ID */
2905 				dev = dev_ll->dev;
2906 
2907 				if (likely(!dev->remove)) {
2908 					tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2909 					if (tx_q->len) {
2910 						LOG_DEBUG(VHOST_DATA,
2911 						"TX queue drained after timeout"
2912 						" with burst size %u\n",
2913 						tx_q->len);
2914 
2915 						/*
2916 						 * Tx any packets in the queue
2917 						 */
2918 						ret = rte_eth_tx_burst(
2919 							ports[0],
2920 							(uint16_t)tx_q->txq_id,
2921 							(struct rte_mbuf **)
2922 							tx_q->m_table,
2923 							(uint16_t)tx_q->len);
2924 						if (unlikely(ret < tx_q->len)) {
2925 							do {
2926 								rte_pktmbuf_free(
2927 									tx_q->m_table[ret]);
2928 							} while (++ret < tx_q->len);
2929 						}
2930 						tx_q->len = 0;
2931 
2932 						txmbuf_clean_zcp(dev,
2933 							&vpool_array[MAX_QUEUES+dev->vmdq_rx_q]);
2934 					}
2935 				}
2936 				dev_ll = dev_ll->next;
2937 			}
2938 			prev_tsc = cur_tsc;
2939 		}
2940 
2941 		rte_prefetch0(lcore_ll->ll_root_used);
2942 
2943 		/*
2944 		 * Inform the configuration core that we have exited the linked
2945 		 * list and that no devices are in use if requested.
2946 		 */
2947 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2948 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2949 
2950 		/* Process devices */
2951 		dev_ll = lcore_ll->ll_root_used;
2952 
2953 		while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2954 			dev = dev_ll->dev;
2955 			if (unlikely(dev->remove)) {
2956 				dev_ll = dev_ll->next;
2957 				unlink_vmdq(dev);
2958 				dev->ready = DEVICE_SAFE_REMOVE;
2959 				continue;
2960 			}
2961 
2962 			if (likely(dev->ready == DEVICE_RX)) {
2963 				uint32_t index = dev->vmdq_rx_q;
2964 				uint16_t i;
2965 				count_in_ring
2966 				= rte_ring_count(vpool_array[index].ring);
2967 				uint16_t free_entries
2968 				= (uint16_t)get_available_ring_num_zcp(dev);
2969 
2970 				/*
2971 				 * Attach all mbufs in vpool.ring and put back
2972 				 * into vpool.pool.
2973 				 */
2974 				for (i = 0;
2975 				i < RTE_MIN(free_entries,
2976 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2977 				i++)
2978 					attach_rxmbuf_zcp(dev);
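				/*
				 * With guest buffers attached,
				 * rte_eth_rx_burst() below receives directly
				 * into guest memory; only as many buffers are
				 * attached as the guest ring, the vpool ring
				 * and MAX_PKT_BURST allow.
				 */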
2979 
2980 				/* Handle guest RX */
2981 				rx_count = rte_eth_rx_burst(ports[0],
2982 					(uint16_t)dev->vmdq_rx_q, pkts_burst,
2983 					MAX_PKT_BURST);
2984 
2985 				if (rx_count) {
2986 					ret_count = virtio_dev_rx_zcp(dev,
2987 							pkts_burst, rx_count);
2988 					if (enable_stats) {
2989 						dev_statistics[dev->device_fh].rx_total
2990 							+= rx_count;
2991 						dev_statistics[dev->device_fh].rx
2992 							+= ret_count;
2993 					}
2994 					while (likely(rx_count)) {
2995 						rx_count--;
2996 						pktmbuf_detach_zcp(
2997 							pkts_burst[rx_count]);
2998 						rte_ring_sp_enqueue(
2999 							vpool_array[index].ring,
3000 							(void *)pkts_burst[rx_count]);
3001 					}
3002 				}
3003 			}
3004 
3005 			if (likely(!dev->remove))
3006 				/* Handle guest TX */
3007 				virtio_dev_tx_zcp(dev);
3008 
3009 			/* Move to the next device in the list */
3010 			dev_ll = dev_ll->next;
3011 		}
3012 	}
3013 
3014 	return 0;
3015 }
3016 
3017 
3018 /*
3019  * Add an entry to a used linked list. A free entry must first be found
3020  * in the free linked list using get_data_ll_free_entry();
3021  */
3022 static void
3023 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
3024 	struct virtio_net_data_ll *ll_dev)
3025 {
3026 	struct virtio_net_data_ll *ll = *ll_root_addr;
3027 
3028 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
3029 	ll_dev->next = NULL;
3030 	rte_compiler_barrier();
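	/*
	 * next must be visible as NULL before the entry is linked, so a data
	 * core walking the list never follows a stale pointer.
	 */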
3031 
3032 	/* If ll == NULL then this is the first device. */
3033 	if (ll) {
3034 		/* Increment to the tail of the linked list. */
		while (ll->next != NULL)
3036 			ll = ll->next;
3037 
3038 		ll->next = ll_dev;
3039 	} else {
3040 		*ll_root_addr = ll_dev;
3041 	}
3042 }
3043 
3044 /*
3045  * Remove an entry from a used linked list. The entry must then be added to
3046  * the free linked list using put_data_ll_free_entry().
3047  */
3048 static void
3049 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
3050 	struct virtio_net_data_ll *ll_dev,
3051 	struct virtio_net_data_ll *ll_dev_last)
3052 {
3053 	struct virtio_net_data_ll *ll = *ll_root_addr;
3054 
3055 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
3056 		return;
3057 
3058 	if (ll_dev == ll)
3059 		*ll_root_addr = ll_dev->next;
3060 	else
3061 		if (likely(ll_dev_last != NULL))
3062 			ll_dev_last->next = ll_dev->next;
3063 		else
3064 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
3065 }
3066 
3067 /*
3068  * Find and return an entry from the free linked list.
3069  */
3070 static struct virtio_net_data_ll *
3071 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
3072 {
3073 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
3074 	struct virtio_net_data_ll *ll_dev;
3075 
3076 	if (ll_free == NULL)
3077 		return NULL;
3078 
3079 	ll_dev = ll_free;
3080 	*ll_root_addr = ll_free->next;
3081 
3082 	return ll_dev;
3083 }
3084 
3085 /*
3086  * Place an entry back on to the free linked list.
3087  */
3088 static void
3089 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
3090 	struct virtio_net_data_ll *ll_dev)
3091 {
3092 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
3093 
3094 	if (ll_dev == NULL)
3095 		return;
3096 
3097 	ll_dev->next = ll_free;
3098 	*ll_root_addr = ll_dev;
3099 }
3100 
3101 /*
3102  * Creates a linked list of a given size.
3103  */
3104 static struct virtio_net_data_ll *
3105 alloc_data_ll(uint32_t size)
3106 {
3107 	struct virtio_net_data_ll *ll_new;
3108 	uint32_t i;
3109 
3110 	/* Malloc and then chain the linked list. */
3111 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
3112 	if (ll_new == NULL) {
3113 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
3114 		return NULL;
3115 	}
3116 
3117 	for (i = 0; i < size - 1; i++) {
3118 		ll_new[i].dev = NULL;
3119 		ll_new[i].next = &ll_new[i+1];
3120 	}
3121 	ll_new[i].next = NULL;
3122 
3123 	return (ll_new);
3124 }
3125 
3126 /*
 * Create the main linked list along with each individual core's linked list. A used and a free list
3128  * are created to manage entries.
3129  */
3130 static int
3131 init_data_ll (void)
3132 {
3133 	int lcore;
3134 
3135 	RTE_LCORE_FOREACH_SLAVE(lcore) {
3136 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
3137 		if (lcore_info[lcore].lcore_ll == NULL) {
3138 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
3139 			return -1;
3140 		}
3141 
3142 		lcore_info[lcore].lcore_ll->device_num = 0;
3143 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
3144 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
3145 		if (num_devices % num_switching_cores)
3146 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
3147 		else
3148 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
3149 	}
3150 
3151 	/* Allocate devices up to a maximum of MAX_DEVICES. */
3152 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
3153 
3154 	return 0;
3155 }
3156 
3157 /*
3158  * Set virtqueue flags so that we do not receive interrupts.
3159  */
3160 static void
3161 set_irq_status (struct virtio_net *dev)
3162 {
3163 	dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
3164 	dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
3165 }
3166 
3167 /*
 * Remove a device from the specific data core linked list and from the main linked list. Synchronization
3169  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
3170  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
3171  */
3172 static void
3173 destroy_device (volatile struct virtio_net *dev)
3174 {
3175 	struct virtio_net_data_ll *ll_lcore_dev_cur;
3176 	struct virtio_net_data_ll *ll_main_dev_cur;
3177 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
3178 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
3179 	int lcore;
3180 
3181 	dev->flags &= ~VIRTIO_DEV_RUNNING;
3182 
3183 	/*set the remove flag. */
3184 	dev->remove = 1;
3185 
3186 	while(dev->ready != DEVICE_SAFE_REMOVE) {
3187 		rte_pause();
3188 	}
3189 
3190 	/* Search for entry to be removed from lcore ll */
3191 	ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
3192 	while (ll_lcore_dev_cur != NULL) {
3193 		if (ll_lcore_dev_cur->dev == dev) {
3194 			break;
3195 		} else {
3196 			ll_lcore_dev_last = ll_lcore_dev_cur;
3197 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
3198 		}
3199 	}
3200 
3201 	if (ll_lcore_dev_cur == NULL) {
3202 		RTE_LOG(ERR, VHOST_CONFIG,
3203 			"(%"PRIu64") Failed to find the dev to be destroy.\n",
3204 			dev->device_fh);
3205 		return;
3206 	}
3207 
3208 	/* Search for entry to be removed from main ll */
3209 	ll_main_dev_cur = ll_root_used;
3210 	ll_main_dev_last = NULL;
3211 	while (ll_main_dev_cur != NULL) {
3212 		if (ll_main_dev_cur->dev == dev) {
3213 			break;
3214 		} else {
3215 			ll_main_dev_last = ll_main_dev_cur;
3216 			ll_main_dev_cur = ll_main_dev_cur->next;
3217 		}
3218 	}
3219 
3220 	/* Remove entries from the lcore and main ll. */
3221 	rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
3222 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
3223 
3224 	/* Set the dev_removal_flag on each lcore. */
3225 	RTE_LCORE_FOREACH_SLAVE(lcore) {
3226 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
3227 	}
3228 
3229 	/*
3230 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
3231 	 * they can no longer access the device removed from the linked lists and that the devices
3232 	 * are no longer in use.
3233 	 */
3234 	RTE_LCORE_FOREACH_SLAVE(lcore) {
3235 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
3236 			rte_pause();
3237 		}
3238 	}
3239 
3240 	/* Add the entries back to the lcore and main free ll.*/
3241 	put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
3242 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
3243 
3244 	/* Decrement number of device on the lcore. */
3245 	lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;
3246 
3247 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
3248 
3249 	if (zero_copy) {
3250 		struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
3251 
3252 		/* Stop the RX queue. */
3253 		if (rte_eth_dev_rx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
3254 			LOG_DEBUG(VHOST_CONFIG,
3255 				"(%"PRIu64") In destroy_device: Failed to stop "
3256 				"rx queue:%d\n",
3257 				dev->device_fh,
3258 				dev->vmdq_rx_q);
3259 		}
3260 
3261 		LOG_DEBUG(VHOST_CONFIG,
3262 			"(%"PRIu64") in destroy_device: Start put mbuf in "
3263 			"mempool back to ring for RX queue: %d\n",
3264 			dev->device_fh, dev->vmdq_rx_q);
3265 
3266 		mbuf_destroy_zcp(vpool);
3267 
3268 		/* Stop the TX queue. */
3269 		if (rte_eth_dev_tx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
3270 			LOG_DEBUG(VHOST_CONFIG,
3271 				"(%"PRIu64") In destroy_device: Failed to "
3272 				"stop tx queue:%d\n",
3273 				dev->device_fh, dev->vmdq_rx_q);
3274 		}
3275 
3276 		vpool = &vpool_array[dev->vmdq_rx_q + MAX_QUEUES];
3277 
3278 		LOG_DEBUG(VHOST_CONFIG,
3279 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
3280 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
3281 			dev->device_fh, (dev->vmdq_rx_q + MAX_QUEUES),
3282 			dev->device_fh);
3283 
3284 		mbuf_destroy_zcp(vpool);
3285 	}
3286 
3287 }
3288 
3289 /*
3290  * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
3292  */
3293 static int
3294 new_device (struct virtio_net *dev)
3295 {
3296 	struct virtio_net_data_ll *ll_dev;
3297 	int lcore, core_add = 0;
3298 	uint32_t device_num_min = num_devices;
3299 
3300 	/* Add device to main ll */
3301 	ll_dev = get_data_ll_free_entry(&ll_root_free);
3302 	if (ll_dev == NULL) {
3303 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
3304 			"of %d devices per core has been reached\n",
3305 			dev->device_fh, num_devices);
3306 		return -1;
3307 	}
3308 	ll_dev->dev = dev;
3309 	add_data_ll_entry(&ll_root_used, ll_dev);
3310 	ll_dev->dev->vmdq_rx_q
3311 		= ll_dev->dev->device_fh * (num_queues / num_devices);
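	/*
	 * Devices are spread across the VMDQ RX queues in strides of
	 * num_queues / num_devices; e.g. (illustrative values) 128 queues and
	 * 32 devices map device_fh 3 to RX queue 12.
	 */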
3312 
3313 	if (zero_copy) {
3314 		uint32_t index = ll_dev->dev->vmdq_rx_q;
3315 		uint32_t count_in_ring, i;
3316 		struct mbuf_table *tx_q;
3317 
3318 		count_in_ring = rte_ring_count(vpool_array[index].ring);
3319 
3320 		LOG_DEBUG(VHOST_CONFIG,
3321 			"(%"PRIu64") in new_device: mbuf count in mempool "
3322 			"before attach is: %d\n",
3323 			dev->device_fh,
3324 			rte_mempool_count(vpool_array[index].pool));
3325 		LOG_DEBUG(VHOST_CONFIG,
3326 			"(%"PRIu64") in new_device: mbuf count in  ring "
3327 			"before attach  is : %d\n",
3328 			dev->device_fh, count_in_ring);
3329 
3330 		/*
		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
3332 		 */
3333 		for (i = 0; i < count_in_ring; i++)
3334 			attach_rxmbuf_zcp(dev);
3335 
3336 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
3337 			"mempool after attach is: %d\n",
3338 			dev->device_fh,
3339 			rte_mempool_count(vpool_array[index].pool));
3340 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
3341 			"ring after attach  is : %d\n",
3342 			dev->device_fh,
3343 			rte_ring_count(vpool_array[index].ring));
3344 
3345 		tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
3346 		tx_q->txq_id = dev->vmdq_rx_q;
3347 
3348 		if (rte_eth_dev_tx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
3349 			struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
3350 
3351 			LOG_DEBUG(VHOST_CONFIG,
3352 				"(%"PRIu64") In new_device: Failed to start "
3353 				"tx queue:%d\n",
3354 				dev->device_fh, dev->vmdq_rx_q);
3355 
3356 			mbuf_destroy_zcp(vpool);
3357 			return -1;
3358 		}
3359 
3360 		if (rte_eth_dev_rx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
3361 			struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
3362 
3363 			LOG_DEBUG(VHOST_CONFIG,
3364 				"(%"PRIu64") In new_device: Failed to start "
3365 				"rx queue:%d\n",
3366 				dev->device_fh, dev->vmdq_rx_q);
3367 
3368 			/* Stop the TX queue. */
3369 			if (rte_eth_dev_tx_queue_stop(ports[0],
3370 				dev->vmdq_rx_q) != 0) {
3371 				LOG_DEBUG(VHOST_CONFIG,
3372 					"(%"PRIu64") In new_device: Failed to "
3373 					"stop tx queue:%d\n",
3374 					dev->device_fh, dev->vmdq_rx_q);
3375 			}
3376 
3377 			mbuf_destroy_zcp(vpool);
3378 			return -1;
3379 		}
3380 
3381 	}
3382 
3383 	/*reset ready flag*/
3384 	dev->ready = DEVICE_MAC_LEARNING;
3385 	dev->remove = 0;
3386 
3387 	/* Find a suitable lcore to add the device. */
3388 	RTE_LCORE_FOREACH_SLAVE(lcore) {
3389 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
3390 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
3391 			core_add = lcore;
3392 		}
3393 	}
3394 	/* Add device to lcore ll */
3395 	ll_dev->dev->coreid = core_add;
3396 	ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
3397 	if (ll_dev == NULL) {
3398 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
3399 		dev->ready = DEVICE_SAFE_REMOVE;
3400 		destroy_device(dev);
3401 		return -1;
3402 	}
3403 	ll_dev->dev = dev;
3404 	add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
3405 
3406 	/* Initialize device stats */
3407 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
3408 
3409 	/* Disable notifications. */
3410 	set_irq_status(dev);
3411 	lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
3412 	dev->flags |= VIRTIO_DEV_RUNNING;
3413 
3414 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);
3415 
3416 	return 0;
3417 }
3418 
3419 /*
3420  * These callbacks allow devices to be added to the data core once their
3421  * configuration has been fully completed.
3422  */
3423 static const struct virtio_net_device_ops virtio_net_device_ops =
3424 {
3425 	.new_device =  new_device,
3426 	.destroy_device = destroy_device,
3427 };
3428 
3429 /*
3430  * This is a thread that wakes up periodically to print statistics if the
3431  * user has enabled them.
3432  */
3433 static void
3434 print_stats(void)
3435 {
3436 	struct virtio_net_data_ll *dev_ll;
3437 	uint64_t tx_dropped, rx_dropped;
3438 	uint64_t tx, tx_total, rx, rx_total;
3439 	uint32_t device_fh;
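	/* ANSI escape sequences: clear the screen and move the cursor to 1,1. */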
3440 	const char clr[] = { 27, '[', '2', 'J', '\0' };
3441 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
3442 
3443 	while (1) {
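		/* enable_stats doubles as the refresh period in seconds. */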
3444 		sleep(enable_stats);
3445 
3446 		/* Clear screen and move to top left */
3447 		printf("%s%s", clr, top_left);
3448 
3449 		printf("\nDevice statistics ====================================");
3450 
3451 		dev_ll = ll_root_used;
3452 		while (dev_ll != NULL) {
3453 			device_fh = (uint32_t)dev_ll->dev->device_fh;
3454 			tx_total = dev_statistics[device_fh].tx_total;
3455 			tx = dev_statistics[device_fh].tx;
3456 			tx_dropped = tx_total - tx;
3457 			if (zero_copy == 0) {
3458 				rx_total = rte_atomic64_read(
3459 					&dev_statistics[device_fh].rx_total_atomic);
3460 				rx = rte_atomic64_read(
3461 					&dev_statistics[device_fh].rx_atomic);
3462 			} else {
3463 				rx_total = dev_statistics[device_fh].rx_total;
3464 				rx = dev_statistics[device_fh].rx;
3465 			}
3466 			rx_dropped = rx_total - rx;
3467 
3468 			printf("\nStatistics for device %"PRIu32" ------------------------------"
3469 					"\nTX total: 		%"PRIu64""
3470 					"\nTX dropped: 		%"PRIu64""
3471 					"\nTX successful: 		%"PRIu64""
3472 					"\nRX total: 		%"PRIu64""
3473 					"\nRX dropped: 		%"PRIu64""
3474 					"\nRX successful: 		%"PRIu64"",
3475 					device_fh,
3476 					tx_total,
3477 					tx_dropped,
3478 					tx,
3479 					rx_total,
3480 					rx_dropped,
3481 					rx);
3482 
3483 			dev_ll = dev_ll->next;
3484 		}
3485 		printf("\n======================================================\n");
3486 	}
3487 }
3488 
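/*
 * Create the mempool and companion ring for one zero-copy queue. The ring
 * holds the mbufs that are not currently attached to a guest buffer.
 */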
3489 static void
3490 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
3491 	char *ring_name, uint32_t nb_mbuf)
3492 {
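	/*
	 * The data room size is passed to rte_pktmbuf_pool_init() through its
	 * opaque argument so each mbuf can hold one zero-copy descriptor plus
	 * headroom.
	 */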
3493 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
3494 	vpool_array[index].pool
3495 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
3496 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
3497 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
3498 		rte_pktmbuf_init, NULL, socket, 0);
3499 	if (vpool_array[index].pool != NULL) {
3500 		vpool_array[index].ring
3501 			= rte_ring_create(ring_name,
3502 				rte_align32pow2(nb_mbuf + 1),
3503 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
3504 		if (likely(vpool_array[index].ring != NULL)) {
3505 			LOG_DEBUG(VHOST_CONFIG,
3506 				"in setup_mempool_tbl: mbuf count in "
3507 				"mempool is: %d\n",
3508 				rte_mempool_count(vpool_array[index].pool));
3509 			LOG_DEBUG(VHOST_CONFIG,
3510 				"in setup_mempool_tbl: mbuf count in "
3511 				"ring is: %d\n",
3512 				rte_ring_count(vpool_array[index].ring));
3513 		} else {
3514 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
3515 				ring_name);
3516 		}
3517 
3518 		/* Need to account for the headroom. */
3519 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
3520 	} else {
3521 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
3522 	}
3523 }
3524 
3525 
3526 /*
3527  * Main function: performs initialisation and calls the per-lcore functions.
3528  * The CUSE device is also registered here to handle the IOCTLs.
3529  */
3530 int
3531 MAIN(int argc, char *argv[])
3532 {
3533 	struct rte_mempool *mbuf_pool = NULL;
3534 	unsigned lcore_id, core_id = 0;
3535 	unsigned nb_ports, valid_num_ports;
3536 	int ret;
3537 	uint8_t portid, queue_id = 0;
3538 	static pthread_t tid;
3539 
3540 	/* init EAL */
3541 	ret = rte_eal_init(argc, argv);
3542 	if (ret < 0)
3543 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
3544 	argc -= ret;
3545 	argv += ret;
3546 
3547 	/* parse app arguments */
3548 	ret = us_vhost_parse_args(argc, argv);
3549 	if (ret < 0)
3550 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
3551 
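	/* Record the IDs of every enabled lcore for the worker launch below. */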
3552 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
3553 		if (rte_lcore_is_enabled(lcore_id))
3554 			lcore_ids[core_id++] = lcore_id;
3555 
3556 	if (rte_lcore_count() > RTE_MAX_LCORE)
3557 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
3558 
3559 	/* Set the number of switching cores available. */
3560 	num_switching_cores = rte_lcore_count()-1;
3561 
3562 	/* Get the number of physical ports. */
3563 	nb_ports = rte_eth_dev_count();
3564 	if (nb_ports > RTE_MAX_ETHPORTS)
3565 		nb_ports = RTE_MAX_ETHPORTS;
3566 
3567 	/*
3568 	 * Update the global variable num_ports and the global array ports[],
3569 	 * and derive valid_num_ports from the number of ports on the system.
3570 	 */
3571 	valid_num_ports = check_ports_num(nb_ports);
3572 
3573 	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
3574 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
3575 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
3576 		return -1;
3577 	}
3578 
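	/*
	 * Mempool setup differs between the two datapaths: the copy path
	 * shares a single mbuf pool across all queues, while the zero-copy
	 * path creates a dedicated pool and ring per RX and per TX queue.
	 */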
3579 	if (zero_copy == 0) {
3580 		/* Create the mbuf pool. */
3581 		mbuf_pool = rte_mempool_create(
3582 				"MBUF_POOL",
3583 				NUM_MBUFS_PER_PORT
3584 				* valid_num_ports,
3585 				MBUF_SIZE, MBUF_CACHE_SIZE,
3586 				sizeof(struct rte_pktmbuf_pool_private),
3587 				rte_pktmbuf_pool_init, NULL,
3588 				rte_pktmbuf_init, NULL,
3589 				rte_socket_id(), 0);
3590 		if (mbuf_pool == NULL)
3591 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3592 
3593 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3594 			vpool_array[queue_id].pool = mbuf_pool;
3595 
3596 		if (vm2vm_mode == VM2VM_HARDWARE) {
3597 			/* Enable VT loop back to let L2 switch to do it. */
3598 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3599 			LOG_DEBUG(VHOST_CONFIG,
3600 				"Enable loop back for L2 switch in vmdq.\n");
3601 		}
3602 	} else {
3603 		uint32_t nb_mbuf;
3604 		char pool_name[RTE_MEMPOOL_NAMESIZE];
3605 		char ring_name[RTE_MEMPOOL_NAMESIZE];
3606 
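		/*
		 * Defer per-queue start: each queue is started from
		 * new_device() once a guest device has been attached to it.
		 */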
3607 		rx_conf_default.start_rx_per_q = (uint8_t)zero_copy;
3608 		rx_conf_default.rx_drop_en = 0;
3609 		tx_conf_default.start_tx_per_q = (uint8_t)zero_copy;
3610 		nb_mbuf = num_rx_descriptor
3611 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3612 			+ num_switching_cores * MAX_PKT_BURST;
3613 
3614 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3615 			snprintf(pool_name, sizeof(pool_name),
3616 				"rxmbuf_pool_%u", queue_id);
3617 			snprintf(ring_name, sizeof(ring_name),
3618 				"rxmbuf_ring_%u", queue_id);
3619 			setup_mempool_tbl(rte_socket_id(), queue_id,
3620 				pool_name, ring_name, nb_mbuf);
3621 		}
3622 
3623 		nb_mbuf = num_tx_descriptor
3624 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3625 				+ num_switching_cores * MAX_PKT_BURST;
3626 
3627 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3628 			snprintf(pool_name, sizeof(pool_name),
3629 				"txmbuf_pool_%u", queue_id);
3630 			snprintf(ring_name, sizeof(ring_name),
3631 				"txmbuf_ring_%u", queue_id);
3632 			setup_mempool_tbl(rte_socket_id(),
3633 				(queue_id + MAX_QUEUES),
3634 				pool_name, ring_name, nb_mbuf);
3635 		}
3636 
3637 		if (vm2vm_mode == VM2VM_HARDWARE) {
3638 			/* Enable VT loop back to let L2 switch to do it. */
3639 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3640 			LOG_DEBUG(VHOST_CONFIG,
3641 				"Enable loop back for L2 switch in vmdq.\n");
3642 		}
3643 	}
3644 	/* Set log level. */
3645 	rte_set_log_level(LOG_LEVEL);
3646 
3647 	/* initialize all ports */
3648 	for (portid = 0; portid < nb_ports; portid++) {
3649 		/* skip ports that are not enabled */
3650 		if ((enabled_port_mask & (1 << portid)) == 0) {
3651 			RTE_LOG(INFO, VHOST_PORT,
3652 				"Skipping disabled port %d\n", portid);
3653 			continue;
3654 		}
3655 		if (port_init(portid) != 0)
3656 			rte_exit(EXIT_FAILURE,
3657 				"Cannot initialize network ports\n");
3658 	}
3659 
3660 	/* Initialise all linked lists. */
3661 	if (init_data_ll() == -1)
3662 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3663 
3664 	/* Initialize device stats */
3665 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3666 
3667 	/* Enable stats if the user option is set. */
3668 	if (enable_stats)
3669 		pthread_create(&tid, NULL, (void *)print_stats, NULL);
3670 
3671 	/* Launch all data cores. */
3672 	if (zero_copy == 0) {
3673 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3674 			rte_eal_remote_launch(switch_worker,
3675 				mbuf_pool, lcore_id);
3676 		}
3677 	} else {
3678 		uint32_t count_in_mempool, index, i;
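		/*
		 * RX vpools occupy indexes [0, MAX_QUEUES) and TX vpools
		 * [MAX_QUEUES, 2 * MAX_QUEUES), matching setup_mempool_tbl().
		 */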
3679 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3680 			/* For all RX and TX queues. */
3681 			count_in_mempool
3682 				= rte_mempool_count(vpool_array[index].pool);
3683 
3684 			/*
3685 			 * Transfer all un-attached mbufs from vpool.pool
3686 			 * to vpool.ring.
3687 			 */
3688 			for (i = 0; i < count_in_mempool; i++) {
3689 				struct rte_mbuf *mbuf
3690 					= __rte_mbuf_raw_alloc(
3691 						vpool_array[index].pool);
3692 				rte_ring_sp_enqueue(vpool_array[index].ring,
3693 						(void *)mbuf);
3694 			}
3695 
3696 			LOG_DEBUG(VHOST_CONFIG,
3697 				"in MAIN: mbuf count in mempool initially "
3698 				"is: %d\n", count_in_mempool);
3699 			LOG_DEBUG(VHOST_CONFIG,
3700 				"in MAIN: mbuf count in ring initially is:"
3701 				" %d\n",
3702 				rte_ring_count(vpool_array[index].ring));
3703 		}
3704 
3705 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3706 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3707 				lcore_id);
3708 	}
3709 
3710 	/* Register CUSE device to handle IOCTLs. */
3711 	ret = register_cuse_device((char *)&dev_basename, dev_index, get_virtio_net_callbacks());
3712 	if (ret != 0)
3713 		rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3714 
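	/* Register the new_device/destroy_device callbacks with the virtio-net framework. */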
3715 	init_virtio_net(&virtio_net_device_ops);
3716 
3717 	/* Start CUSE session. */
3718 	start_cuse_session_loop();
3719 	return 0;
3720 
3721 }
3722 
3723