xref: /dpdk/examples/vhost/main.c (revision 7e37aef78c54a1f6e2007bd68b9e6c48d9acc8a4)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #ifndef MAX_QUEUES
57 #define MAX_QUEUES 128
58 #endif
59 
60 /* the maximum number of external ports supported */
61 #define MAX_SUP_PORTS 1
62 
63 /*
64  * Calculate the number of buffers needed per port
65  */
66 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +		\
67 							(num_switching_cores*MAX_PKT_BURST) +  			\
68 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
69 							(num_switching_cores*MBUF_CACHE_SIZE))
70 
71 #define MBUF_CACHE_SIZE	128
72 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
73 
74 /*
75  * No frame data buffer allocated from host are required for zero copy
76  * implementation, guest will allocate the frame data buffer, and vhost
77  * directly use it.
78  */
79 #define VIRTIO_DESCRIPTOR_LEN_ZCP	RTE_MBUF_DEFAULT_DATAROOM
80 #define MBUF_DATA_SIZE_ZCP		RTE_MBUF_DEFAULT_BUF_SIZE
81 #define MBUF_CACHE_SIZE_ZCP 0
82 
83 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
84 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
85 
86 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
87 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
88 
89 #define JUMBO_FRAME_MAX_SIZE    0x2600
90 
91 /* State of virtio device. */
92 #define DEVICE_MAC_LEARNING 0
93 #define DEVICE_RX			1
94 #define DEVICE_SAFE_REMOVE	2
95 
96 /* Config_core_flag status definitions. */
97 #define REQUEST_DEV_REMOVAL 1
98 #define ACK_DEV_REMOVAL 0
99 
100 /* Configurable number of RX/TX ring descriptors */
101 #define RTE_TEST_RX_DESC_DEFAULT 1024
102 #define RTE_TEST_TX_DESC_DEFAULT 512
103 
104 /*
105  * Need refine these 2 macros for legacy and DPDK based front end:
106  * Max vring avail descriptor/entries from guest - MAX_PKT_BURST
107  * And then adjust power 2.
108  */
109 /*
110  * For legacy front end, 128 descriptors,
111  * half for virtio header, another half for mbuf.
112  */
113 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
114 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
115 
116 /* Get first 4 bytes in mbuf headroom. */
117 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
118 		+ sizeof(struct rte_mbuf)))
119 
120 /* true if x is a power of 2 */
121 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
122 
123 #define INVALID_PORT_ID 0xFF
124 
125 /* Max number of devices. Limited by vmdq. */
126 #define MAX_DEVICES 64
127 
128 /* Size of buffers used for snprintfs. */
129 #define MAX_PRINT_BUFF 6072
130 
131 /* Maximum character device basename size. */
132 #define MAX_BASENAME_SZ 10
133 
134 /* Maximum long option length for option parsing. */
135 #define MAX_LONG_OPT_SZ 64
136 
137 /* Used to compare MAC addresses. */
138 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
139 
140 /* Number of descriptors per cacheline. */
141 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
142 
143 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
144 
145 /* mask of enabled ports */
146 static uint32_t enabled_port_mask = 0;
147 
148 /* Promiscuous mode */
149 static uint32_t promiscuous;
150 
151 /*Number of switching cores enabled*/
152 static uint32_t num_switching_cores = 0;
153 
154 /* number of devices/queues to support*/
155 static uint32_t num_queues = 0;
156 static uint32_t num_devices;
157 
158 /*
159  * Enable zero copy, pkts buffer will directly dma to hw descriptor,
160  * disabled on default.
161  */
162 static uint32_t zero_copy;
163 static int mergeable;
164 
165 /* Do vlan strip on host, enabled on default */
166 static uint32_t vlan_strip = 1;
167 
168 /* number of descriptors to apply*/
169 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
170 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
171 
172 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
173 #define MAX_RING_DESC 4096
174 
175 struct vpool {
176 	struct rte_mempool *pool;
177 	struct rte_ring *ring;
178 	uint32_t buf_size;
179 } vpool_array[MAX_QUEUES+MAX_QUEUES];
180 
181 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
182 typedef enum {
183 	VM2VM_DISABLED = 0,
184 	VM2VM_SOFTWARE = 1,
185 	VM2VM_HARDWARE = 2,
186 	VM2VM_LAST
187 } vm2vm_type;
188 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
189 
190 /* The type of host physical address translated from guest physical address. */
191 typedef enum {
192 	PHYS_ADDR_CONTINUOUS = 0,
193 	PHYS_ADDR_CROSS_SUBREG = 1,
194 	PHYS_ADDR_INVALID = 2,
195 	PHYS_ADDR_LAST
196 } hpa_type;
197 
198 /* Enable stats. */
199 static uint32_t enable_stats = 0;
200 /* Enable retries on RX. */
201 static uint32_t enable_retry = 1;
202 /* Specify timeout (in useconds) between retries on RX. */
203 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
204 /* Specify the number of retries on RX. */
205 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
206 
207 /* Character device basename. Can be set by user. */
208 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
209 
210 /* empty vmdq configuration structure. Filled in programatically */
211 static struct rte_eth_conf vmdq_conf_default = {
212 	.rxmode = {
213 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
214 		.split_hdr_size = 0,
215 		.header_split   = 0, /**< Header Split disabled */
216 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
217 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
218 		/*
219 		 * It is necessary for 1G NIC such as I350,
220 		 * this fixes bug of ipv4 forwarding in guest can't
221 		 * forward pakets from one virtio dev to another virtio dev.
222 		 */
223 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
224 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
225 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
226 	},
227 
228 	.txmode = {
229 		.mq_mode = ETH_MQ_TX_NONE,
230 	},
231 	.rx_adv_conf = {
232 		/*
233 		 * should be overridden separately in code with
234 		 * appropriate values
235 		 */
236 		.vmdq_rx_conf = {
237 			.nb_queue_pools = ETH_8_POOLS,
238 			.enable_default_pool = 0,
239 			.default_pool = 0,
240 			.nb_pool_maps = 0,
241 			.pool_map = {{0, 0},},
242 		},
243 	},
244 };
245 
246 static unsigned lcore_ids[RTE_MAX_LCORE];
247 static uint8_t ports[RTE_MAX_ETHPORTS];
248 static unsigned num_ports = 0; /**< The number of ports specified in command line */
249 static uint16_t num_pf_queues, num_vmdq_queues;
250 static uint16_t vmdq_pool_base, vmdq_queue_base;
251 static uint16_t queues_per_pool;
252 
253 static const uint16_t external_pkt_default_vlan_tag = 2000;
254 const uint16_t vlan_tags[] = {
255 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
256 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
257 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
258 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
259 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
260 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
261 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
262 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
263 };
264 
265 /* ethernet addresses of ports */
266 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
267 
268 /* heads for the main used and free linked lists for the data path. */
269 static struct virtio_net_data_ll *ll_root_used = NULL;
270 static struct virtio_net_data_ll *ll_root_free = NULL;
271 
272 /* Array of data core structures containing information on individual core linked lists. */
273 static struct lcore_info lcore_info[RTE_MAX_LCORE];
274 
275 /* Used for queueing bursts of TX packets. */
276 struct mbuf_table {
277 	unsigned len;
278 	unsigned txq_id;
279 	struct rte_mbuf *m_table[MAX_PKT_BURST];
280 };
281 
282 /* TX queue for each data core. */
283 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
284 
285 /* TX queue fori each virtio device for zero copy. */
286 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
287 
288 /* Vlan header struct used to insert vlan tags on TX. */
289 struct vlan_ethhdr {
290 	unsigned char   h_dest[ETH_ALEN];
291 	unsigned char   h_source[ETH_ALEN];
292 	__be16          h_vlan_proto;
293 	__be16          h_vlan_TCI;
294 	__be16          h_vlan_encapsulated_proto;
295 };
296 
297 /* IPv4 Header */
298 struct ipv4_hdr {
299 	uint8_t  version_ihl;		/**< version and header length */
300 	uint8_t  type_of_service;	/**< type of service */
301 	uint16_t total_length;		/**< length of packet */
302 	uint16_t packet_id;		/**< packet ID */
303 	uint16_t fragment_offset;	/**< fragmentation offset */
304 	uint8_t  time_to_live;		/**< time to live */
305 	uint8_t  next_proto_id;		/**< protocol ID */
306 	uint16_t hdr_checksum;		/**< header checksum */
307 	uint32_t src_addr;		/**< source address */
308 	uint32_t dst_addr;		/**< destination address */
309 } __attribute__((__packed__));
310 
311 /* Header lengths. */
312 #define VLAN_HLEN       4
313 #define VLAN_ETH_HLEN   18
314 
315 /* Per-device statistics struct */
316 struct device_statistics {
317 	uint64_t tx_total;
318 	rte_atomic64_t rx_total_atomic;
319 	uint64_t rx_total;
320 	uint64_t tx;
321 	rte_atomic64_t rx_atomic;
322 	uint64_t rx;
323 } __rte_cache_aligned;
324 struct device_statistics dev_statistics[MAX_DEVICES];
325 
326 /*
327  * Builds up the correct configuration for VMDQ VLAN pool map
328  * according to the pool & queue limits.
329  */
330 static inline int
331 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
332 {
333 	struct rte_eth_vmdq_rx_conf conf;
334 	struct rte_eth_vmdq_rx_conf *def_conf =
335 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
336 	unsigned i;
337 
338 	memset(&conf, 0, sizeof(conf));
339 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
340 	conf.nb_pool_maps = num_devices;
341 	conf.enable_loop_back = def_conf->enable_loop_back;
342 	conf.rx_mode = def_conf->rx_mode;
343 
344 	for (i = 0; i < conf.nb_pool_maps; i++) {
345 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
346 		conf.pool_map[i].pools = (1UL << i);
347 	}
348 
349 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
350 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
351 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
352 	return 0;
353 }
354 
355 /*
356  * Validate the device number according to the max pool number gotten form
357  * dev_info. If the device number is invalid, give the error message and
358  * return -1. Each device must have its own pool.
359  */
360 static inline int
361 validate_num_devices(uint32_t max_nb_devices)
362 {
363 	if (num_devices > max_nb_devices) {
364 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
365 		return -1;
366 	}
367 	return 0;
368 }
369 
370 /*
371  * Initialises a given port using global settings and with the rx buffers
372  * coming from the mbuf_pool passed as parameter
373  */
374 static inline int
375 port_init(uint8_t port)
376 {
377 	struct rte_eth_dev_info dev_info;
378 	struct rte_eth_conf port_conf;
379 	struct rte_eth_rxconf *rxconf;
380 	struct rte_eth_txconf *txconf;
381 	int16_t rx_rings, tx_rings;
382 	uint16_t rx_ring_size, tx_ring_size;
383 	int retval;
384 	uint16_t q;
385 
386 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
387 	rte_eth_dev_info_get (port, &dev_info);
388 
389 	if (dev_info.max_rx_queues > MAX_QUEUES) {
390 		rte_exit(EXIT_FAILURE,
391 			"please define MAX_QUEUES no less than %u in %s\n",
392 			dev_info.max_rx_queues, __FILE__);
393 	}
394 
395 	rxconf = &dev_info.default_rxconf;
396 	txconf = &dev_info.default_txconf;
397 	rxconf->rx_drop_en = 1;
398 
399 	/* Enable vlan offload */
400 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
401 
402 	/*
403 	 * Zero copy defers queue RX/TX start to the time when guest
404 	 * finishes its startup and packet buffers from that guest are
405 	 * available.
406 	 */
407 	if (zero_copy) {
408 		rxconf->rx_deferred_start = 1;
409 		rxconf->rx_drop_en = 0;
410 		txconf->tx_deferred_start = 1;
411 	}
412 
413 	/*configure the number of supported virtio devices based on VMDQ limits */
414 	num_devices = dev_info.max_vmdq_pools;
415 
416 	if (zero_copy) {
417 		rx_ring_size = num_rx_descriptor;
418 		tx_ring_size = num_tx_descriptor;
419 		tx_rings = dev_info.max_tx_queues;
420 	} else {
421 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
422 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
423 		tx_rings = (uint16_t)rte_lcore_count();
424 	}
425 
426 	retval = validate_num_devices(MAX_DEVICES);
427 	if (retval < 0)
428 		return retval;
429 
430 	/* Get port configuration. */
431 	retval = get_eth_conf(&port_conf, num_devices);
432 	if (retval < 0)
433 		return retval;
434 	/* NIC queues are divided into pf queues and vmdq queues.  */
435 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
436 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
437 	num_vmdq_queues = num_devices * queues_per_pool;
438 	num_queues = num_pf_queues + num_vmdq_queues;
439 	vmdq_queue_base = dev_info.vmdq_queue_base;
440 	vmdq_pool_base  = dev_info.vmdq_pool_base;
441 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
442 		num_pf_queues, num_devices, queues_per_pool);
443 
444 	if (port >= rte_eth_dev_count()) return -1;
445 
446 	rx_rings = (uint16_t)dev_info.max_rx_queues;
447 	/* Configure ethernet device. */
448 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
449 	if (retval != 0)
450 		return retval;
451 
452 	/* Setup the queues. */
453 	for (q = 0; q < rx_rings; q ++) {
454 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
455 						rte_eth_dev_socket_id(port),
456 						rxconf,
457 						vpool_array[q].pool);
458 		if (retval < 0)
459 			return retval;
460 	}
461 	for (q = 0; q < tx_rings; q ++) {
462 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
463 						rte_eth_dev_socket_id(port),
464 						txconf);
465 		if (retval < 0)
466 			return retval;
467 	}
468 
469 	/* Start the device. */
470 	retval  = rte_eth_dev_start(port);
471 	if (retval < 0) {
472 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
473 		return retval;
474 	}
475 
476 	if (promiscuous)
477 		rte_eth_promiscuous_enable(port);
478 
479 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
480 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
481 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
482 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
483 			(unsigned)port,
484 			vmdq_ports_eth_addr[port].addr_bytes[0],
485 			vmdq_ports_eth_addr[port].addr_bytes[1],
486 			vmdq_ports_eth_addr[port].addr_bytes[2],
487 			vmdq_ports_eth_addr[port].addr_bytes[3],
488 			vmdq_ports_eth_addr[port].addr_bytes[4],
489 			vmdq_ports_eth_addr[port].addr_bytes[5]);
490 
491 	return 0;
492 }
493 
494 /*
495  * Set character device basename.
496  */
497 static int
498 us_vhost_parse_basename(const char *q_arg)
499 {
500 	/* parse number string */
501 
502 	if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ)
503 		return -1;
504 	else
505 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
506 
507 	return 0;
508 }
509 
510 /*
511  * Parse the portmask provided at run time.
512  */
513 static int
514 parse_portmask(const char *portmask)
515 {
516 	char *end = NULL;
517 	unsigned long pm;
518 
519 	errno = 0;
520 
521 	/* parse hexadecimal string */
522 	pm = strtoul(portmask, &end, 16);
523 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
524 		return -1;
525 
526 	if (pm == 0)
527 		return -1;
528 
529 	return pm;
530 
531 }
532 
533 /*
534  * Parse num options at run time.
535  */
536 static int
537 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
538 {
539 	char *end = NULL;
540 	unsigned long num;
541 
542 	errno = 0;
543 
544 	/* parse unsigned int string */
545 	num = strtoul(q_arg, &end, 10);
546 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
547 		return -1;
548 
549 	if (num > max_valid_value)
550 		return -1;
551 
552 	return num;
553 
554 }
555 
556 /*
557  * Display usage
558  */
559 static void
560 us_vhost_usage(const char *prgname)
561 {
562 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
563 	"		--vm2vm [0|1|2]\n"
564 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
565 	"		--dev-basename <name>\n"
566 	"		--nb-devices ND\n"
567 	"		-p PORTMASK: Set mask for ports to be used by application\n"
568 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
569 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
570 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
571 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
572 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
573 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
574 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
575 	"		--dev-basename: The basename to be used for the character device.\n"
576 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
577 			"zero copy\n"
578 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
579 			"used only when zero copy is enabled.\n"
580 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
581 			"used only when zero copy is enabled.\n",
582 	       prgname);
583 }
584 
585 /*
586  * Parse the arguments given in the command line of the application.
587  */
588 static int
589 us_vhost_parse_args(int argc, char **argv)
590 {
591 	int opt, ret;
592 	int option_index;
593 	unsigned i;
594 	const char *prgname = argv[0];
595 	static struct option long_option[] = {
596 		{"vm2vm", required_argument, NULL, 0},
597 		{"rx-retry", required_argument, NULL, 0},
598 		{"rx-retry-delay", required_argument, NULL, 0},
599 		{"rx-retry-num", required_argument, NULL, 0},
600 		{"mergeable", required_argument, NULL, 0},
601 		{"vlan-strip", required_argument, NULL, 0},
602 		{"stats", required_argument, NULL, 0},
603 		{"dev-basename", required_argument, NULL, 0},
604 		{"zero-copy", required_argument, NULL, 0},
605 		{"rx-desc-num", required_argument, NULL, 0},
606 		{"tx-desc-num", required_argument, NULL, 0},
607 		{NULL, 0, 0, 0},
608 	};
609 
610 	/* Parse command line */
611 	while ((opt = getopt_long(argc, argv, "p:P",
612 			long_option, &option_index)) != EOF) {
613 		switch (opt) {
614 		/* Portmask */
615 		case 'p':
616 			enabled_port_mask = parse_portmask(optarg);
617 			if (enabled_port_mask == 0) {
618 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
619 				us_vhost_usage(prgname);
620 				return -1;
621 			}
622 			break;
623 
624 		case 'P':
625 			promiscuous = 1;
626 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
627 				ETH_VMDQ_ACCEPT_BROADCAST |
628 				ETH_VMDQ_ACCEPT_MULTICAST;
629 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
630 
631 			break;
632 
633 		case 0:
634 			/* Enable/disable vm2vm comms. */
635 			if (!strncmp(long_option[option_index].name, "vm2vm",
636 				MAX_LONG_OPT_SZ)) {
637 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
638 				if (ret == -1) {
639 					RTE_LOG(INFO, VHOST_CONFIG,
640 						"Invalid argument for "
641 						"vm2vm [0|1|2]\n");
642 					us_vhost_usage(prgname);
643 					return -1;
644 				} else {
645 					vm2vm_mode = (vm2vm_type)ret;
646 				}
647 			}
648 
649 			/* Enable/disable retries on RX. */
650 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
651 				ret = parse_num_opt(optarg, 1);
652 				if (ret == -1) {
653 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
654 					us_vhost_usage(prgname);
655 					return -1;
656 				} else {
657 					enable_retry = ret;
658 				}
659 			}
660 
661 			/* Specify the retries delay time (in useconds) on RX. */
662 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
663 				ret = parse_num_opt(optarg, INT32_MAX);
664 				if (ret == -1) {
665 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
666 					us_vhost_usage(prgname);
667 					return -1;
668 				} else {
669 					burst_rx_delay_time = ret;
670 				}
671 			}
672 
673 			/* Specify the retries number on RX. */
674 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
675 				ret = parse_num_opt(optarg, INT32_MAX);
676 				if (ret == -1) {
677 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
678 					us_vhost_usage(prgname);
679 					return -1;
680 				} else {
681 					burst_rx_retry_num = ret;
682 				}
683 			}
684 
685 			/* Enable/disable RX mergeable buffers. */
686 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
687 				ret = parse_num_opt(optarg, 1);
688 				if (ret == -1) {
689 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
690 					us_vhost_usage(prgname);
691 					return -1;
692 				} else {
693 					mergeable = !!ret;
694 					if (ret) {
695 						vmdq_conf_default.rxmode.jumbo_frame = 1;
696 						vmdq_conf_default.rxmode.max_rx_pkt_len
697 							= JUMBO_FRAME_MAX_SIZE;
698 					}
699 				}
700 			}
701 
702 			/* Enable/disable RX VLAN strip on host. */
703 			if (!strncmp(long_option[option_index].name,
704 				"vlan-strip", MAX_LONG_OPT_SZ)) {
705 				ret = parse_num_opt(optarg, 1);
706 				if (ret == -1) {
707 					RTE_LOG(INFO, VHOST_CONFIG,
708 						"Invalid argument for VLAN strip [0|1]\n");
709 					us_vhost_usage(prgname);
710 					return -1;
711 				} else {
712 					vlan_strip = !!ret;
713 					vmdq_conf_default.rxmode.hw_vlan_strip =
714 						vlan_strip;
715 				}
716 			}
717 
718 			/* Enable/disable stats. */
719 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
720 				ret = parse_num_opt(optarg, INT32_MAX);
721 				if (ret == -1) {
722 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
723 					us_vhost_usage(prgname);
724 					return -1;
725 				} else {
726 					enable_stats = ret;
727 				}
728 			}
729 
730 			/* Set character device basename. */
731 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
732 				if (us_vhost_parse_basename(optarg) == -1) {
733 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
734 					us_vhost_usage(prgname);
735 					return -1;
736 				}
737 			}
738 
739 			/* Enable/disable rx/tx zero copy. */
740 			if (!strncmp(long_option[option_index].name,
741 				"zero-copy", MAX_LONG_OPT_SZ)) {
742 				ret = parse_num_opt(optarg, 1);
743 				if (ret == -1) {
744 					RTE_LOG(INFO, VHOST_CONFIG,
745 						"Invalid argument"
746 						" for zero-copy [0|1]\n");
747 					us_vhost_usage(prgname);
748 					return -1;
749 				} else
750 					zero_copy = ret;
751 			}
752 
753 			/* Specify the descriptor number on RX. */
754 			if (!strncmp(long_option[option_index].name,
755 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
756 				ret = parse_num_opt(optarg, MAX_RING_DESC);
757 				if ((ret == -1) || (!POWEROF2(ret))) {
758 					RTE_LOG(INFO, VHOST_CONFIG,
759 					"Invalid argument for rx-desc-num[0-N],"
760 					"power of 2 required.\n");
761 					us_vhost_usage(prgname);
762 					return -1;
763 				} else {
764 					num_rx_descriptor = ret;
765 				}
766 			}
767 
768 			/* Specify the descriptor number on TX. */
769 			if (!strncmp(long_option[option_index].name,
770 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
771 				ret = parse_num_opt(optarg, MAX_RING_DESC);
772 				if ((ret == -1) || (!POWEROF2(ret))) {
773 					RTE_LOG(INFO, VHOST_CONFIG,
774 					"Invalid argument for tx-desc-num [0-N],"
775 					"power of 2 required.\n");
776 					us_vhost_usage(prgname);
777 					return -1;
778 				} else {
779 					num_tx_descriptor = ret;
780 				}
781 			}
782 
783 			break;
784 
785 			/* Invalid option - print options. */
786 		default:
787 			us_vhost_usage(prgname);
788 			return -1;
789 		}
790 	}
791 
792 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
793 		if (enabled_port_mask & (1 << i))
794 			ports[num_ports++] = (uint8_t)i;
795 	}
796 
797 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
798 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
799 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
800 		return -1;
801 	}
802 
803 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
804 		RTE_LOG(INFO, VHOST_PORT,
805 			"Vhost zero copy doesn't support software vm2vm,"
806 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
807 		return -1;
808 	}
809 
810 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
811 		RTE_LOG(INFO, VHOST_PORT,
812 			"Vhost zero copy doesn't support jumbo frame,"
813 			"please specify '--mergeable 0' to disable the "
814 			"mergeable feature.\n");
815 		return -1;
816 	}
817 
818 	return 0;
819 }
820 
821 /*
822  * Update the global var NUM_PORTS and array PORTS according to system ports number
823  * and return valid ports number
824  */
825 static unsigned check_ports_num(unsigned nb_ports)
826 {
827 	unsigned valid_num_ports = num_ports;
828 	unsigned portid;
829 
830 	if (num_ports > nb_ports) {
831 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
832 			num_ports, nb_ports);
833 		num_ports = nb_ports;
834 	}
835 
836 	for (portid = 0; portid < num_ports; portid ++) {
837 		if (ports[portid] >= nb_ports) {
838 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
839 				ports[portid], (nb_ports - 1));
840 			ports[portid] = INVALID_PORT_ID;
841 			valid_num_ports--;
842 		}
843 	}
844 	return valid_num_ports;
845 }
846 
847 /*
848  * Macro to print out packet contents. Wrapped in debug define so that the
849  * data path is not effected when debug is disabled.
850  */
851 #ifdef DEBUG
852 #define PRINT_PACKET(device, addr, size, header) do {																\
853 	char *pkt_addr = (char*)(addr);																					\
854 	unsigned int index;																								\
855 	char packet[MAX_PRINT_BUFF];																					\
856 																													\
857 	if ((header))																									\
858 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
859 	else																											\
860 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
861 	for (index = 0; index < (size); index++) {																		\
862 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
863 			"%02hhx ", pkt_addr[index]);																			\
864 	}																												\
865 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
866 																													\
867 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
868 } while(0)
869 #else
870 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
871 #endif
872 
873 /*
874  * Function to convert guest physical addresses to vhost physical addresses.
875  * This is used to convert virtio buffer addresses.
876  */
877 static inline uint64_t __attribute__((always_inline))
878 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
879 	uint32_t buf_len, hpa_type *addr_type)
880 {
881 	struct virtio_memory_regions_hpa *region;
882 	uint32_t regionidx;
883 	uint64_t vhost_pa = 0;
884 
885 	*addr_type = PHYS_ADDR_INVALID;
886 
887 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
888 		region = &vdev->regions_hpa[regionidx];
889 		if ((guest_pa >= region->guest_phys_address) &&
890 			(guest_pa <= region->guest_phys_address_end)) {
891 			vhost_pa = region->host_phys_addr_offset + guest_pa;
892 			if (likely((guest_pa + buf_len - 1)
893 				<= region->guest_phys_address_end))
894 				*addr_type = PHYS_ADDR_CONTINUOUS;
895 			else
896 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
897 			break;
898 		}
899 	}
900 
901 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
902 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
903 		(void *)(uintptr_t)vhost_pa);
904 
905 	return vhost_pa;
906 }
907 
908 /*
909  * Compares a packet destination MAC address to a device MAC address.
910  */
911 static inline int __attribute__((always_inline))
912 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
913 {
914 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
915 }
916 
917 /*
918  * This function learns the MAC address of the device and registers this along with a
919  * vlan tag to a VMDQ.
920  */
921 static int
922 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
923 {
924 	struct ether_hdr *pkt_hdr;
925 	struct virtio_net_data_ll *dev_ll;
926 	struct virtio_net *dev = vdev->dev;
927 	int i, ret;
928 
929 	/* Learn MAC address of guest device from packet */
930 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
931 
932 	dev_ll = ll_root_used;
933 
934 	while (dev_ll != NULL) {
935 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
936 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
937 			return -1;
938 		}
939 		dev_ll = dev_ll->next;
940 	}
941 
942 	for (i = 0; i < ETHER_ADDR_LEN; i++)
943 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
944 
945 	/* vlan_tag currently uses the device_id. */
946 	vdev->vlan_tag = vlan_tags[dev->device_fh];
947 
948 	/* Print out VMDQ registration info. */
949 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
950 		dev->device_fh,
951 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
952 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
953 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
954 		vdev->vlan_tag);
955 
956 	/* Register the MAC address. */
957 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
958 				(uint32_t)dev->device_fh + vmdq_pool_base);
959 	if (ret)
960 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
961 					dev->device_fh);
962 
963 	/* Enable stripping of the vlan tag as we handle routing. */
964 	if (vlan_strip)
965 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
966 			(uint16_t)vdev->vmdq_rx_q, 1);
967 
968 	/* Set device as ready for RX. */
969 	vdev->ready = DEVICE_RX;
970 
971 	return 0;
972 }
973 
974 /*
975  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
976  * queue before disabling RX on the device.
977  */
978 static inline void
979 unlink_vmdq(struct vhost_dev *vdev)
980 {
981 	unsigned i = 0;
982 	unsigned rx_count;
983 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
984 
985 	if (vdev->ready == DEVICE_RX) {
986 		/*clear MAC and VLAN settings*/
987 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
988 		for (i = 0; i < 6; i++)
989 			vdev->mac_address.addr_bytes[i] = 0;
990 
991 		vdev->vlan_tag = 0;
992 
993 		/*Clear out the receive buffers*/
994 		rx_count = rte_eth_rx_burst(ports[0],
995 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
996 
997 		while (rx_count) {
998 			for (i = 0; i < rx_count; i++)
999 				rte_pktmbuf_free(pkts_burst[i]);
1000 
1001 			rx_count = rte_eth_rx_burst(ports[0],
1002 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1003 		}
1004 
1005 		vdev->ready = DEVICE_MAC_LEARNING;
1006 	}
1007 }
1008 
1009 /*
1010  * Check if the packet destination MAC address is for a local device. If so then put
1011  * the packet on that devices RX queue. If not then return.
1012  */
1013 static inline int __attribute__((always_inline))
1014 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1015 {
1016 	struct virtio_net_data_ll *dev_ll;
1017 	struct ether_hdr *pkt_hdr;
1018 	uint64_t ret = 0;
1019 	struct virtio_net *dev = vdev->dev;
1020 	struct virtio_net *tdev; /* destination virito device */
1021 
1022 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1023 
1024 	/*get the used devices list*/
1025 	dev_ll = ll_root_used;
1026 
1027 	while (dev_ll != NULL) {
1028 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1029 				          &dev_ll->vdev->mac_address)) {
1030 
1031 			/* Drop the packet if the TX packet is destined for the TX device. */
1032 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1033 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1034 							dev->device_fh);
1035 				return 0;
1036 			}
1037 			tdev = dev_ll->vdev->dev;
1038 
1039 
1040 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1041 
1042 			if (unlikely(dev_ll->vdev->remove)) {
1043 				/*drop the packet if the device is marked for removal*/
1044 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1045 			} else {
1046 				/*send the packet to the local virtio device*/
1047 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1048 				if (enable_stats) {
1049 					rte_atomic64_add(
1050 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1051 					1);
1052 					rte_atomic64_add(
1053 					&dev_statistics[tdev->device_fh].rx_atomic,
1054 					ret);
1055 					dev_statistics[dev->device_fh].tx_total++;
1056 					dev_statistics[dev->device_fh].tx += ret;
1057 				}
1058 			}
1059 
1060 			return 0;
1061 		}
1062 		dev_ll = dev_ll->next;
1063 	}
1064 
1065 	return -1;
1066 }
1067 
1068 /*
1069  * Check if the destination MAC of a packet is one local VM,
1070  * and get its vlan tag, and offset if it is.
1071  */
1072 static inline int __attribute__((always_inline))
1073 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1074 	uint32_t *offset, uint16_t *vlan_tag)
1075 {
1076 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1077 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1078 
1079 	while (dev_ll != NULL) {
1080 		if ((dev_ll->vdev->ready == DEVICE_RX)
1081 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1082 		&dev_ll->vdev->mac_address)) {
1083 			/*
1084 			 * Drop the packet if the TX packet is
1085 			 * destined for the TX device.
1086 			 */
1087 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1088 				LOG_DEBUG(VHOST_DATA,
1089 				"(%"PRIu64") TX: Source and destination"
1090 				" MAC addresses are the same. Dropping "
1091 				"packet.\n",
1092 				dev_ll->vdev->dev->device_fh);
1093 				return -1;
1094 			}
1095 
1096 			/*
1097 			 * HW vlan strip will reduce the packet length
1098 			 * by minus length of vlan tag, so need restore
1099 			 * the packet length by plus it.
1100 			 */
1101 			*offset = VLAN_HLEN;
1102 			*vlan_tag =
1103 			(uint16_t)
1104 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1105 
1106 			LOG_DEBUG(VHOST_DATA,
1107 			"(%"PRIu64") TX: pkt to local VM device id:"
1108 			"(%"PRIu64") vlan tag: %d.\n",
1109 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1110 			(int)*vlan_tag);
1111 
1112 			break;
1113 		}
1114 		dev_ll = dev_ll->next;
1115 	}
1116 	return 0;
1117 }
1118 
1119 /*
1120  * This function routes the TX packet to the correct interface. This may be a local device
1121  * or the physical port.
1122  */
1123 static inline void __attribute__((always_inline))
1124 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1125 {
1126 	struct mbuf_table *tx_q;
1127 	struct rte_mbuf **m_table;
1128 	unsigned len, ret, offset = 0;
1129 	const uint16_t lcore_id = rte_lcore_id();
1130 	struct virtio_net *dev = vdev->dev;
1131 	struct ether_hdr *nh;
1132 
1133 	/*check if destination is local VM*/
1134 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1135 		rte_pktmbuf_free(m);
1136 		return;
1137 	}
1138 
1139 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1140 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1141 			rte_pktmbuf_free(m);
1142 			return;
1143 		}
1144 	}
1145 
1146 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1147 
1148 	/*Add packet to the port tx queue*/
1149 	tx_q = &lcore_tx_queue[lcore_id];
1150 	len = tx_q->len;
1151 
1152 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1153 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1154 		/* Guest has inserted the vlan tag. */
1155 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1156 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1157 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1158 			(vh->vlan_tci != vlan_tag_be))
1159 			vh->vlan_tci = vlan_tag_be;
1160 	} else {
1161 		m->ol_flags = PKT_TX_VLAN_PKT;
1162 
1163 		/*
1164 		 * Find the right seg to adjust the data len when offset is
1165 		 * bigger than tail room size.
1166 		 */
1167 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1168 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1169 				m->data_len += offset;
1170 			else {
1171 				struct rte_mbuf *seg = m;
1172 
1173 				while ((seg->next != NULL) &&
1174 					(offset > rte_pktmbuf_tailroom(seg)))
1175 					seg = seg->next;
1176 
1177 				seg->data_len += offset;
1178 			}
1179 			m->pkt_len += offset;
1180 		}
1181 
1182 		m->vlan_tci = vlan_tag;
1183 	}
1184 
1185 	tx_q->m_table[len] = m;
1186 	len++;
1187 	if (enable_stats) {
1188 		dev_statistics[dev->device_fh].tx_total++;
1189 		dev_statistics[dev->device_fh].tx++;
1190 	}
1191 
1192 	if (unlikely(len == MAX_PKT_BURST)) {
1193 		m_table = (struct rte_mbuf **)tx_q->m_table;
1194 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1195 		/* Free any buffers not handled by TX and update the port stats. */
1196 		if (unlikely(ret < len)) {
1197 			do {
1198 				rte_pktmbuf_free(m_table[ret]);
1199 			} while (++ret < len);
1200 		}
1201 
1202 		len = 0;
1203 	}
1204 
1205 	tx_q->len = len;
1206 	return;
1207 }
1208 /*
1209  * This function is called by each data core. It handles all RX/TX registered with the
1210  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1211  * with all devices in the main linked list.
1212  */
1213 static int
1214 switch_worker(__attribute__((unused)) void *arg)
1215 {
1216 	struct rte_mempool *mbuf_pool = arg;
1217 	struct virtio_net *dev = NULL;
1218 	struct vhost_dev *vdev = NULL;
1219 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1220 	struct virtio_net_data_ll *dev_ll;
1221 	struct mbuf_table *tx_q;
1222 	volatile struct lcore_ll_info *lcore_ll;
1223 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1224 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1225 	unsigned ret, i;
1226 	const uint16_t lcore_id = rte_lcore_id();
1227 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1228 	uint16_t rx_count = 0;
1229 	uint16_t tx_count;
1230 	uint32_t retry = 0;
1231 
1232 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1233 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1234 	prev_tsc = 0;
1235 
1236 	tx_q = &lcore_tx_queue[lcore_id];
1237 	for (i = 0; i < num_cores; i ++) {
1238 		if (lcore_ids[i] == lcore_id) {
1239 			tx_q->txq_id = i;
1240 			break;
1241 		}
1242 	}
1243 
1244 	while(1) {
1245 		cur_tsc = rte_rdtsc();
1246 		/*
1247 		 * TX burst queue drain
1248 		 */
1249 		diff_tsc = cur_tsc - prev_tsc;
1250 		if (unlikely(diff_tsc > drain_tsc)) {
1251 
1252 			if (tx_q->len) {
1253 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1254 
1255 				/*Tx any packets in the queue*/
1256 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1257 									   (struct rte_mbuf **)tx_q->m_table,
1258 									   (uint16_t)tx_q->len);
1259 				if (unlikely(ret < tx_q->len)) {
1260 					do {
1261 						rte_pktmbuf_free(tx_q->m_table[ret]);
1262 					} while (++ret < tx_q->len);
1263 				}
1264 
1265 				tx_q->len = 0;
1266 			}
1267 
1268 			prev_tsc = cur_tsc;
1269 
1270 		}
1271 
1272 		rte_prefetch0(lcore_ll->ll_root_used);
1273 		/*
1274 		 * Inform the configuration core that we have exited the linked list and that no devices are
1275 		 * in use if requested.
1276 		 */
1277 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1278 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1279 
1280 		/*
1281 		 * Process devices
1282 		 */
1283 		dev_ll = lcore_ll->ll_root_used;
1284 
1285 		while (dev_ll != NULL) {
1286 			/*get virtio device ID*/
1287 			vdev = dev_ll->vdev;
1288 			dev = vdev->dev;
1289 
1290 			if (unlikely(vdev->remove)) {
1291 				dev_ll = dev_ll->next;
1292 				unlink_vmdq(vdev);
1293 				vdev->ready = DEVICE_SAFE_REMOVE;
1294 				continue;
1295 			}
1296 			if (likely(vdev->ready == DEVICE_RX)) {
1297 				/*Handle guest RX*/
1298 				rx_count = rte_eth_rx_burst(ports[0],
1299 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1300 
1301 				if (rx_count) {
1302 					/*
1303 					* Retry is enabled and the queue is full then we wait and retry to avoid packet loss
1304 					* Here MAX_PKT_BURST must be less than virtio queue size
1305 					*/
1306 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1307 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1308 							rte_delay_us(burst_rx_delay_time);
1309 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1310 								break;
1311 						}
1312 					}
1313 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1314 					if (enable_stats) {
1315 						rte_atomic64_add(
1316 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1317 						rx_count);
1318 						rte_atomic64_add(
1319 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1320 					}
1321 					while (likely(rx_count)) {
1322 						rx_count--;
1323 						rte_pktmbuf_free(pkts_burst[rx_count]);
1324 					}
1325 
1326 				}
1327 			}
1328 
1329 			if (likely(!vdev->remove)) {
1330 				/* Handle guest TX*/
1331 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1332 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1333 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1334 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1335 						while (tx_count)
1336 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1337 					}
1338 				}
1339 				while (tx_count)
1340 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1341 			}
1342 
1343 			/*move to the next device in the list*/
1344 			dev_ll = dev_ll->next;
1345 		}
1346 	}
1347 
1348 	return 0;
1349 }
1350 
1351 /*
1352  * This function gets available ring number for zero copy rx.
1353  * Only one thread will call this funciton for a paticular virtio device,
1354  * so, it is designed as non-thread-safe function.
1355  */
1356 static inline uint32_t __attribute__((always_inline))
1357 get_available_ring_num_zcp(struct virtio_net *dev)
1358 {
1359 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1360 	uint16_t avail_idx;
1361 
1362 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1363 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1364 }
1365 
1366 /*
1367  * This function gets available ring index for zero copy rx,
1368  * it will retry 'burst_rx_retry_num' times till it get enough ring index.
1369  * Only one thread will call this funciton for a paticular virtio device,
1370  * so, it is designed as non-thread-safe function.
1371  */
1372 static inline uint32_t __attribute__((always_inline))
1373 get_available_ring_index_zcp(struct virtio_net *dev,
1374 	uint16_t *res_base_idx, uint32_t count)
1375 {
1376 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1377 	uint16_t avail_idx;
1378 	uint32_t retry = 0;
1379 	uint16_t free_entries;
1380 
1381 	*res_base_idx = vq->last_used_idx_res;
1382 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1383 	free_entries = (avail_idx - *res_base_idx);
1384 
1385 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1386 			"avail idx: %d, "
1387 			"res base idx:%d, free entries:%d\n",
1388 			dev->device_fh, avail_idx, *res_base_idx,
1389 			free_entries);
1390 
1391 	/*
1392 	 * If retry is enabled and the queue is full then we wait
1393 	 * and retry to avoid packet loss.
1394 	 */
1395 	if (enable_retry && unlikely(count > free_entries)) {
1396 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1397 			rte_delay_us(burst_rx_delay_time);
1398 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1399 			free_entries = (avail_idx - *res_base_idx);
1400 			if (count <= free_entries)
1401 				break;
1402 		}
1403 	}
1404 
1405 	/*check that we have enough buffers*/
1406 	if (unlikely(count > free_entries))
1407 		count = free_entries;
1408 
1409 	if (unlikely(count == 0)) {
1410 		LOG_DEBUG(VHOST_DATA,
1411 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1412 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1413 			dev->device_fh, avail_idx,
1414 			*res_base_idx, free_entries);
1415 		return 0;
1416 	}
1417 
1418 	vq->last_used_idx_res = *res_base_idx + count;
1419 
1420 	return count;
1421 }
1422 
1423 /*
1424  * This function put descriptor back to used list.
1425  */
1426 static inline void __attribute__((always_inline))
1427 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1428 {
1429 	uint16_t res_cur_idx = vq->last_used_idx;
1430 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1431 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1432 	rte_compiler_barrier();
1433 	*(volatile uint16_t *)&vq->used->idx += 1;
1434 	vq->last_used_idx += 1;
1435 
1436 	/* Kick the guest if necessary. */
1437 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1438 		eventfd_write(vq->callfd, (eventfd_t)1);
1439 }
1440 
1441 /*
1442  * This function get available descriptor from vitio vring and un-attached mbuf
1443  * from vpool->ring, and then attach them together. It needs adjust the offset
1444  * for buff_addr and phys_addr accroding to PMD implementation, otherwise the
1445  * frame data may be put to wrong location in mbuf.
1446  */
1447 static inline void __attribute__((always_inline))
1448 attach_rxmbuf_zcp(struct virtio_net *dev)
1449 {
1450 	uint16_t res_base_idx, desc_idx;
1451 	uint64_t buff_addr, phys_addr;
1452 	struct vhost_virtqueue *vq;
1453 	struct vring_desc *desc;
1454 	void *obj = NULL;
1455 	struct rte_mbuf *mbuf;
1456 	struct vpool *vpool;
1457 	hpa_type addr_type;
1458 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1459 
1460 	vpool = &vpool_array[vdev->vmdq_rx_q];
1461 	vq = dev->virtqueue[VIRTIO_RXQ];
1462 
1463 	do {
1464 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1465 				1) != 1))
1466 			return;
1467 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1468 
1469 		desc = &vq->desc[desc_idx];
1470 		if (desc->flags & VRING_DESC_F_NEXT) {
1471 			desc = &vq->desc[desc->next];
1472 			buff_addr = gpa_to_vva(dev, desc->addr);
1473 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1474 					&addr_type);
1475 		} else {
1476 			buff_addr = gpa_to_vva(dev,
1477 					desc->addr + vq->vhost_hlen);
1478 			phys_addr = gpa_to_hpa(vdev,
1479 					desc->addr + vq->vhost_hlen,
1480 					desc->len, &addr_type);
1481 		}
1482 
1483 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1484 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1485 				" address found when attaching RX frame buffer"
1486 				" address!\n", dev->device_fh);
1487 			put_desc_to_used_list_zcp(vq, desc_idx);
1488 			continue;
1489 		}
1490 
1491 		/*
1492 		 * Check if the frame buffer address from guest crosses
1493 		 * sub-region or not.
1494 		 */
1495 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1496 			RTE_LOG(ERR, VHOST_DATA,
1497 				"(%"PRIu64") Frame buffer address cross "
1498 				"sub-regioin found when attaching RX frame "
1499 				"buffer address!\n",
1500 				dev->device_fh);
1501 			put_desc_to_used_list_zcp(vq, desc_idx);
1502 			continue;
1503 		}
1504 	} while (unlikely(phys_addr == 0));
1505 
1506 	rte_ring_sc_dequeue(vpool->ring, &obj);
1507 	mbuf = obj;
1508 	if (unlikely(mbuf == NULL)) {
1509 		LOG_DEBUG(VHOST_DATA,
1510 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1511 			"ring_sc_dequeue fail.\n",
1512 			dev->device_fh);
1513 		put_desc_to_used_list_zcp(vq, desc_idx);
1514 		return;
1515 	}
1516 
1517 	if (unlikely(vpool->buf_size > desc->len)) {
1518 		LOG_DEBUG(VHOST_DATA,
1519 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1520 			"length(%d) of descriptor idx: %d less than room "
1521 			"size required: %d\n",
1522 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1523 		put_desc_to_used_list_zcp(vq, desc_idx);
1524 		rte_ring_sp_enqueue(vpool->ring, obj);
1525 		return;
1526 	}
1527 
1528 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1529 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1530 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1531 	mbuf->data_len = desc->len;
1532 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1533 
1534 	LOG_DEBUG(VHOST_DATA,
1535 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1536 		"descriptor idx:%d\n",
1537 		dev->device_fh, res_base_idx, desc_idx);
1538 
1539 	__rte_mbuf_raw_free(mbuf);
1540 
1541 	return;
1542 }
1543 
1544 /*
1545  * Detach an attched packet mbuf -
1546  *  - restore original mbuf address and length values.
1547  *  - reset pktmbuf data and data_len to their default values.
1548  *  All other fields of the given packet mbuf will be left intact.
1549  *
1550  * @param m
1551  *   The attached packet mbuf.
1552  */
1553 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1554 {
1555 	const struct rte_mempool *mp = m->pool;
1556 	void *buf = rte_mbuf_to_baddr(m);
1557 	uint32_t buf_ofs;
1558 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1559 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1560 
1561 	m->buf_addr = buf;
1562 	m->buf_len = (uint16_t)buf_len;
1563 
1564 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1565 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1566 	m->data_off = buf_ofs;
1567 
1568 	m->data_len = 0;
1569 }
1570 
1571 /*
1572  * This function is called after packets have been transimited. It fetchs mbuf
1573  * from vpool->pool, detached it and put into vpool->ring. It also update the
1574  * used index and kick the guest if necessary.
1575  */
1576 static inline uint32_t __attribute__((always_inline))
1577 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1578 {
1579 	struct rte_mbuf *mbuf;
1580 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1581 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1582 	uint32_t index = 0;
1583 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1584 
1585 	LOG_DEBUG(VHOST_DATA,
1586 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1587 		"clean is: %d\n",
1588 		dev->device_fh, mbuf_count);
1589 	LOG_DEBUG(VHOST_DATA,
1590 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1591 		"clean  is : %d\n",
1592 		dev->device_fh, rte_ring_count(vpool->ring));
1593 
1594 	for (index = 0; index < mbuf_count; index++) {
1595 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1596 		if (likely(MBUF_EXT_MEM(mbuf)))
1597 			pktmbuf_detach_zcp(mbuf);
1598 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1599 
1600 		/* Update used index buffer information. */
1601 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1602 		vq->used->ring[used_idx].len = 0;
1603 
1604 		used_idx = (used_idx + 1) & (vq->size - 1);
1605 	}
1606 
1607 	LOG_DEBUG(VHOST_DATA,
1608 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1609 		"clean is: %d\n",
1610 		dev->device_fh, rte_mempool_count(vpool->pool));
1611 	LOG_DEBUG(VHOST_DATA,
1612 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1613 		"clean  is : %d\n",
1614 		dev->device_fh, rte_ring_count(vpool->ring));
1615 	LOG_DEBUG(VHOST_DATA,
1616 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1617 		"vq->last_used_idx:%d\n",
1618 		dev->device_fh, vq->last_used_idx);
1619 
1620 	vq->last_used_idx += mbuf_count;
1621 
1622 	LOG_DEBUG(VHOST_DATA,
1623 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1624 		"vq->last_used_idx:%d\n",
1625 		dev->device_fh, vq->last_used_idx);
1626 
1627 	rte_compiler_barrier();
1628 
1629 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1630 
1631 	/* Kick guest if required. */
1632 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1633 		eventfd_write(vq->callfd, (eventfd_t)1);
1634 
1635 	return 0;
1636 }
1637 
1638 /*
1639  * This function is called when a virtio device is destroy.
1640  * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring.
1641  */
1642 static void mbuf_destroy_zcp(struct vpool *vpool)
1643 {
1644 	struct rte_mbuf *mbuf = NULL;
1645 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1646 
1647 	LOG_DEBUG(VHOST_CONFIG,
1648 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1649 		"mbuf_destroy_zcp is: %d\n",
1650 		mbuf_count);
1651 	LOG_DEBUG(VHOST_CONFIG,
1652 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1653 		"mbuf_destroy_zcp  is : %d\n",
1654 		rte_ring_count(vpool->ring));
1655 
1656 	for (index = 0; index < mbuf_count; index++) {
1657 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1658 		if (likely(mbuf != NULL)) {
1659 			if (likely(MBUF_EXT_MEM(mbuf)))
1660 				pktmbuf_detach_zcp(mbuf);
1661 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1662 		}
1663 	}
1664 
1665 	LOG_DEBUG(VHOST_CONFIG,
1666 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1667 		"mbuf_destroy_zcp is: %d\n",
1668 		rte_mempool_count(vpool->pool));
1669 	LOG_DEBUG(VHOST_CONFIG,
1670 		"in mbuf_destroy_zcp: mbuf count in ring after "
1671 		"mbuf_destroy_zcp is : %d\n",
1672 		rte_ring_count(vpool->ring));
1673 }
1674 
1675 /*
1676  * This function update the use flag and counter.
1677  */
1678 static inline uint32_t __attribute__((always_inline))
1679 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1680 	uint32_t count)
1681 {
1682 	struct vhost_virtqueue *vq;
1683 	struct vring_desc *desc;
1684 	struct rte_mbuf *buff;
1685 	/* The virtio_hdr is initialised to 0. */
1686 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1687 		= {{0, 0, 0, 0, 0, 0}, 0};
1688 	uint64_t buff_hdr_addr = 0;
1689 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1690 	uint32_t head_idx, packet_success = 0;
1691 	uint16_t res_cur_idx;
1692 
1693 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1694 
1695 	if (count == 0)
1696 		return 0;
1697 
1698 	vq = dev->virtqueue[VIRTIO_RXQ];
1699 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1700 
1701 	res_cur_idx = vq->last_used_idx;
1702 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1703 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1704 
1705 	/* Retrieve all of the head indexes first to avoid caching issues. */
1706 	for (head_idx = 0; head_idx < count; head_idx++)
1707 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1708 
1709 	/*Prefetch descriptor index. */
1710 	rte_prefetch0(&vq->desc[head[packet_success]]);
1711 
1712 	while (packet_success != count) {
1713 		/* Get descriptor from available ring */
1714 		desc = &vq->desc[head[packet_success]];
1715 
1716 		buff = pkts[packet_success];
1717 		LOG_DEBUG(VHOST_DATA,
1718 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1719 			"pkt[%d] descriptor idx: %d\n",
1720 			dev->device_fh, packet_success,
1721 			MBUF_HEADROOM_UINT32(buff));
1722 
1723 		PRINT_PACKET(dev,
1724 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1725 			+ RTE_PKTMBUF_HEADROOM),
1726 			rte_pktmbuf_data_len(buff), 0);
1727 
1728 		/* Buffer address translation for virtio header. */
1729 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1730 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1731 
1732 		/*
1733 		 * If the descriptors are chained the header and data are
1734 		 * placed in separate buffers.
1735 		 */
1736 		if (desc->flags & VRING_DESC_F_NEXT) {
1737 			desc->len = vq->vhost_hlen;
1738 			desc = &vq->desc[desc->next];
1739 			desc->len = rte_pktmbuf_data_len(buff);
1740 		} else {
1741 			desc->len = packet_len;
1742 		}
1743 
1744 		/* Update used ring with desc information */
1745 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1746 			= head[packet_success];
1747 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1748 			= packet_len;
1749 		res_cur_idx++;
1750 		packet_success++;
1751 
1752 		/* A header is required per buffer. */
1753 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1754 			(const void *)&virtio_hdr, vq->vhost_hlen);
1755 
1756 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1757 
1758 		if (likely(packet_success < count)) {
1759 			/* Prefetch descriptor index. */
1760 			rte_prefetch0(&vq->desc[head[packet_success]]);
1761 		}
1762 	}
1763 
1764 	rte_compiler_barrier();
1765 
1766 	LOG_DEBUG(VHOST_DATA,
1767 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1768 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1769 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1770 
1771 	*(volatile uint16_t *)&vq->used->idx += count;
1772 	vq->last_used_idx += count;
1773 
1774 	LOG_DEBUG(VHOST_DATA,
1775 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1776 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1777 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1778 
1779 	/* Kick the guest if necessary. */
1780 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1781 		eventfd_write(vq->callfd, (eventfd_t)1);
1782 
1783 	return count;
1784 }
1785 
1786 /*
1787  * This function routes the TX packet to the correct interface.
1788  * This may be a local device or the physical port.
1789  */
1790 static inline void __attribute__((always_inline))
1791 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1792 	uint32_t desc_idx, uint8_t need_copy)
1793 {
1794 	struct mbuf_table *tx_q;
1795 	struct rte_mbuf **m_table;
1796 	void *obj = NULL;
1797 	struct rte_mbuf *mbuf;
1798 	unsigned len, ret, offset = 0;
1799 	struct vpool *vpool;
1800 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1801 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1802 
1803 	/*Add packet to the port tx queue*/
1804 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1805 	len = tx_q->len;
1806 
1807 	/* Allocate an mbuf and populate the structure. */
1808 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1809 	rte_ring_sc_dequeue(vpool->ring, &obj);
1810 	mbuf = obj;
1811 	if (unlikely(mbuf == NULL)) {
1812 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1813 		RTE_LOG(ERR, VHOST_DATA,
1814 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1815 			dev->device_fh);
1816 		put_desc_to_used_list_zcp(vq, desc_idx);
1817 		return;
1818 	}
1819 
1820 	if (vm2vm_mode == VM2VM_HARDWARE) {
1821 		/* Avoid using a vlan tag from any vm for external pkt, such as
1822 		 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool
1823 		 * selection, MAC address determines it as an external pkt
1824 		 * which should go to network, while vlan tag determine it as
1825 		 * a vm2vm pkt should forward to another vm. Hardware confuse
1826 		 * such a ambiguous situation, so pkt will lost.
1827 		 */
1828 		vlan_tag = external_pkt_default_vlan_tag;
1829 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1830 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1831 			__rte_mbuf_raw_free(mbuf);
1832 			return;
1833 		}
1834 	}
1835 
1836 	mbuf->nb_segs = m->nb_segs;
1837 	mbuf->next = m->next;
1838 	mbuf->data_len = m->data_len + offset;
1839 	mbuf->pkt_len = mbuf->data_len;
1840 	if (unlikely(need_copy)) {
1841 		/* Copy the packet contents to the mbuf. */
1842 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1843 			rte_pktmbuf_mtod(m, void *),
1844 			m->data_len);
1845 	} else {
1846 		mbuf->data_off = m->data_off;
1847 		mbuf->buf_physaddr = m->buf_physaddr;
1848 		mbuf->buf_addr = m->buf_addr;
1849 	}
1850 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1851 	mbuf->vlan_tci = vlan_tag;
1852 	mbuf->l2_len = sizeof(struct ether_hdr);
1853 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1854 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1855 
1856 	tx_q->m_table[len] = mbuf;
1857 	len++;
1858 
1859 	LOG_DEBUG(VHOST_DATA,
1860 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1861 		dev->device_fh,
1862 		mbuf->nb_segs,
1863 		(mbuf->next == NULL) ? "null" : "non-null");
1864 
1865 	if (enable_stats) {
1866 		dev_statistics[dev->device_fh].tx_total++;
1867 		dev_statistics[dev->device_fh].tx++;
1868 	}
1869 
1870 	if (unlikely(len == MAX_PKT_BURST)) {
1871 		m_table = (struct rte_mbuf **)tx_q->m_table;
1872 		ret = rte_eth_tx_burst(ports[0],
1873 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1874 
1875 		/*
1876 		 * Free any buffers not handled by TX and update
1877 		 * the port stats.
1878 		 */
1879 		if (unlikely(ret < len)) {
1880 			do {
1881 				rte_pktmbuf_free(m_table[ret]);
1882 			} while (++ret < len);
1883 		}
1884 
1885 		len = 0;
1886 		txmbuf_clean_zcp(dev, vpool);
1887 	}
1888 
1889 	tx_q->len = len;
1890 
1891 	return;
1892 }
1893 
1894 /*
1895  * This function TX all available packets in virtio TX queue for one
1896  * virtio-net device. If it is first packet, it learns MAC address and
1897  * setup VMDQ.
1898  */
1899 static inline void __attribute__((always_inline))
1900 virtio_dev_tx_zcp(struct virtio_net *dev)
1901 {
1902 	struct rte_mbuf m;
1903 	struct vhost_virtqueue *vq;
1904 	struct vring_desc *desc;
1905 	uint64_t buff_addr = 0, phys_addr;
1906 	uint32_t head[MAX_PKT_BURST];
1907 	uint32_t i;
1908 	uint16_t free_entries, packet_success = 0;
1909 	uint16_t avail_idx;
1910 	uint8_t need_copy = 0;
1911 	hpa_type addr_type;
1912 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1913 
1914 	vq = dev->virtqueue[VIRTIO_TXQ];
1915 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1916 
1917 	/* If there are no available buffers then return. */
1918 	if (vq->last_used_idx_res == avail_idx)
1919 		return;
1920 
1921 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1922 
1923 	/* Prefetch available ring to retrieve head indexes. */
1924 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1925 
1926 	/* Get the number of free entries in the ring */
1927 	free_entries = (avail_idx - vq->last_used_idx_res);
1928 
1929 	/* Limit to MAX_PKT_BURST. */
1930 	free_entries
1931 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1932 
1933 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1934 		dev->device_fh, free_entries);
1935 
1936 	/* Retrieve all of the head indexes first to avoid caching issues. */
1937 	for (i = 0; i < free_entries; i++)
1938 		head[i]
1939 			= vq->avail->ring[(vq->last_used_idx_res + i)
1940 			& (vq->size - 1)];
1941 
1942 	vq->last_used_idx_res += free_entries;
1943 
1944 	/* Prefetch descriptor index. */
1945 	rte_prefetch0(&vq->desc[head[packet_success]]);
1946 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1947 
1948 	while (packet_success < free_entries) {
1949 		desc = &vq->desc[head[packet_success]];
1950 
1951 		/* Discard first buffer as it is the virtio header */
1952 		desc = &vq->desc[desc->next];
1953 
1954 		/* Buffer address translation. */
1955 		buff_addr = gpa_to_vva(dev, desc->addr);
1956 		/* Need check extra VLAN_HLEN size for inserting VLAN tag */
1957 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1958 			&addr_type);
1959 
1960 		if (likely(packet_success < (free_entries - 1)))
1961 			/* Prefetch descriptor index. */
1962 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1963 
1964 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1965 			RTE_LOG(ERR, VHOST_DATA,
1966 				"(%"PRIu64") Invalid frame buffer address found"
1967 				"when TX packets!\n",
1968 				dev->device_fh);
1969 			packet_success++;
1970 			continue;
1971 		}
1972 
1973 		/* Prefetch buffer address. */
1974 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1975 
1976 		/*
1977 		 * Setup dummy mbuf. This is copied to a real mbuf if
1978 		 * transmitted out the physical port.
1979 		 */
1980 		m.data_len = desc->len;
1981 		m.nb_segs = 1;
1982 		m.next = NULL;
1983 		m.data_off = 0;
1984 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1985 		m.buf_physaddr = phys_addr;
1986 
1987 		/*
1988 		 * Check if the frame buffer address from guest crosses
1989 		 * sub-region or not.
1990 		 */
1991 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1992 			RTE_LOG(ERR, VHOST_DATA,
1993 				"(%"PRIu64") Frame buffer address cross "
1994 				"sub-regioin found when attaching TX frame "
1995 				"buffer address!\n",
1996 				dev->device_fh);
1997 			need_copy = 1;
1998 		} else
1999 			need_copy = 0;
2000 
2001 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2002 
2003 		/*
2004 		 * If this is the first received packet we need to learn
2005 		 * the MAC and setup VMDQ
2006 		 */
2007 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2008 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2009 				/*
2010 				 * Discard frame if device is scheduled for
2011 				 * removal or a duplicate MAC address is found.
2012 				 */
2013 				packet_success += free_entries;
2014 				vq->last_used_idx += packet_success;
2015 				break;
2016 			}
2017 		}
2018 
2019 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2020 		packet_success++;
2021 	}
2022 }
2023 
2024 /*
2025  * This function is called by each data core. It handles all RX/TX registered
2026  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2027  * addresses are compared with all devices in the main linked list.
2028  */
2029 static int
2030 switch_worker_zcp(__attribute__((unused)) void *arg)
2031 {
2032 	struct virtio_net *dev = NULL;
2033 	struct vhost_dev  *vdev = NULL;
2034 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2035 	struct virtio_net_data_ll *dev_ll;
2036 	struct mbuf_table *tx_q;
2037 	volatile struct lcore_ll_info *lcore_ll;
2038 	const uint64_t drain_tsc
2039 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2040 		* BURST_TX_DRAIN_US;
2041 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2042 	unsigned ret;
2043 	const uint16_t lcore_id = rte_lcore_id();
2044 	uint16_t count_in_ring, rx_count = 0;
2045 
2046 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2047 
2048 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2049 	prev_tsc = 0;
2050 
2051 	while (1) {
2052 		cur_tsc = rte_rdtsc();
2053 
2054 		/* TX burst queue drain */
2055 		diff_tsc = cur_tsc - prev_tsc;
2056 		if (unlikely(diff_tsc > drain_tsc)) {
2057 			/*
2058 			 * Get mbuf from vpool.pool and detach mbuf and
2059 			 * put back into vpool.ring.
2060 			 */
2061 			dev_ll = lcore_ll->ll_root_used;
2062 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2063 				/* Get virtio device ID */
2064 				vdev = dev_ll->vdev;
2065 				dev = vdev->dev;
2066 
2067 				if (likely(!vdev->remove)) {
2068 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2069 					if (tx_q->len) {
2070 						LOG_DEBUG(VHOST_DATA,
2071 						"TX queue drained after timeout"
2072 						" with burst size %u\n",
2073 						tx_q->len);
2074 
2075 						/*
2076 						 * Tx any packets in the queue
2077 						 */
2078 						ret = rte_eth_tx_burst(
2079 							ports[0],
2080 							(uint16_t)tx_q->txq_id,
2081 							(struct rte_mbuf **)
2082 							tx_q->m_table,
2083 							(uint16_t)tx_q->len);
2084 						if (unlikely(ret < tx_q->len)) {
2085 							do {
2086 								rte_pktmbuf_free(
2087 									tx_q->m_table[ret]);
2088 							} while (++ret < tx_q->len);
2089 						}
2090 						tx_q->len = 0;
2091 
2092 						txmbuf_clean_zcp(dev,
2093 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2094 					}
2095 				}
2096 				dev_ll = dev_ll->next;
2097 			}
2098 			prev_tsc = cur_tsc;
2099 		}
2100 
2101 		rte_prefetch0(lcore_ll->ll_root_used);
2102 
2103 		/*
2104 		 * Inform the configuration core that we have exited the linked
2105 		 * list and that no devices are in use if requested.
2106 		 */
2107 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2108 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2109 
2110 		/* Process devices */
2111 		dev_ll = lcore_ll->ll_root_used;
2112 
2113 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2114 			vdev = dev_ll->vdev;
2115 			dev  = vdev->dev;
2116 			if (unlikely(vdev->remove)) {
2117 				dev_ll = dev_ll->next;
2118 				unlink_vmdq(vdev);
2119 				vdev->ready = DEVICE_SAFE_REMOVE;
2120 				continue;
2121 			}
2122 
2123 			if (likely(vdev->ready == DEVICE_RX)) {
2124 				uint32_t index = vdev->vmdq_rx_q;
2125 				uint16_t i;
2126 				count_in_ring
2127 				= rte_ring_count(vpool_array[index].ring);
2128 				uint16_t free_entries
2129 				= (uint16_t)get_available_ring_num_zcp(dev);
2130 
2131 				/*
2132 				 * Attach all mbufs in vpool.ring and put back
2133 				 * into vpool.pool.
2134 				 */
2135 				for (i = 0;
2136 				i < RTE_MIN(free_entries,
2137 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2138 				i++)
2139 					attach_rxmbuf_zcp(dev);
2140 
2141 				/* Handle guest RX */
2142 				rx_count = rte_eth_rx_burst(ports[0],
2143 					vdev->vmdq_rx_q, pkts_burst,
2144 					MAX_PKT_BURST);
2145 
2146 				if (rx_count) {
2147 					ret_count = virtio_dev_rx_zcp(dev,
2148 							pkts_burst, rx_count);
2149 					if (enable_stats) {
2150 						dev_statistics[dev->device_fh].rx_total
2151 							+= rx_count;
2152 						dev_statistics[dev->device_fh].rx
2153 							+= ret_count;
2154 					}
2155 					while (likely(rx_count)) {
2156 						rx_count--;
2157 						pktmbuf_detach_zcp(
2158 							pkts_burst[rx_count]);
2159 						rte_ring_sp_enqueue(
2160 							vpool_array[index].ring,
2161 							(void *)pkts_burst[rx_count]);
2162 					}
2163 				}
2164 			}
2165 
2166 			if (likely(!vdev->remove))
2167 				/* Handle guest TX */
2168 				virtio_dev_tx_zcp(dev);
2169 
2170 			/* Move to the next device in the list */
2171 			dev_ll = dev_ll->next;
2172 		}
2173 	}
2174 
2175 	return 0;
2176 }
2177 
2178 
2179 /*
2180  * Add an entry to a used linked list. A free entry must first be found
2181  * in the free linked list using get_data_ll_free_entry();
2182  */
2183 static void
2184 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2185 	struct virtio_net_data_ll *ll_dev)
2186 {
2187 	struct virtio_net_data_ll *ll = *ll_root_addr;
2188 
2189 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2190 	ll_dev->next = NULL;
2191 	rte_compiler_barrier();
2192 
2193 	/* If ll == NULL then this is the first device. */
2194 	if (ll) {
2195 		/* Increment to the tail of the linked list. */
2196 		while ((ll->next != NULL) )
2197 			ll = ll->next;
2198 
2199 		ll->next = ll_dev;
2200 	} else {
2201 		*ll_root_addr = ll_dev;
2202 	}
2203 }
2204 
2205 /*
2206  * Remove an entry from a used linked list. The entry must then be added to
2207  * the free linked list using put_data_ll_free_entry().
2208  */
2209 static void
2210 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2211 	struct virtio_net_data_ll *ll_dev,
2212 	struct virtio_net_data_ll *ll_dev_last)
2213 {
2214 	struct virtio_net_data_ll *ll = *ll_root_addr;
2215 
2216 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2217 		return;
2218 
2219 	if (ll_dev == ll)
2220 		*ll_root_addr = ll_dev->next;
2221 	else
2222 		if (likely(ll_dev_last != NULL))
2223 			ll_dev_last->next = ll_dev->next;
2224 		else
2225 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
2226 }
2227 
2228 /*
2229  * Find and return an entry from the free linked list.
2230  */
2231 static struct virtio_net_data_ll *
2232 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2233 {
2234 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2235 	struct virtio_net_data_ll *ll_dev;
2236 
2237 	if (ll_free == NULL)
2238 		return NULL;
2239 
2240 	ll_dev = ll_free;
2241 	*ll_root_addr = ll_free->next;
2242 
2243 	return ll_dev;
2244 }
2245 
2246 /*
2247  * Place an entry back on to the free linked list.
2248  */
2249 static void
2250 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2251 	struct virtio_net_data_ll *ll_dev)
2252 {
2253 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2254 
2255 	if (ll_dev == NULL)
2256 		return;
2257 
2258 	ll_dev->next = ll_free;
2259 	*ll_root_addr = ll_dev;
2260 }
2261 
2262 /*
2263  * Creates a linked list of a given size.
2264  */
2265 static struct virtio_net_data_ll *
2266 alloc_data_ll(uint32_t size)
2267 {
2268 	struct virtio_net_data_ll *ll_new;
2269 	uint32_t i;
2270 
2271 	/* Malloc and then chain the linked list. */
2272 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2273 	if (ll_new == NULL) {
2274 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2275 		return NULL;
2276 	}
2277 
2278 	for (i = 0; i < size - 1; i++) {
2279 		ll_new[i].vdev = NULL;
2280 		ll_new[i].next = &ll_new[i+1];
2281 	}
2282 	ll_new[i].next = NULL;
2283 
2284 	return (ll_new);
2285 }
2286 
2287 /*
2288  * Create the main linked list along with each individual cores linked list. A used and a free list
2289  * are created to manage entries.
2290  */
2291 static int
2292 init_data_ll (void)
2293 {
2294 	int lcore;
2295 
2296 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2297 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2298 		if (lcore_info[lcore].lcore_ll == NULL) {
2299 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2300 			return -1;
2301 		}
2302 
2303 		lcore_info[lcore].lcore_ll->device_num = 0;
2304 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2305 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2306 		if (num_devices % num_switching_cores)
2307 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2308 		else
2309 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2310 	}
2311 
2312 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2313 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2314 
2315 	return 0;
2316 }
2317 
2318 /*
2319  * Remove a device from the specific data core linked list and from the main linked list. Synchonization
2320  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2321  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2322  */
2323 static void
2324 destroy_device (volatile struct virtio_net *dev)
2325 {
2326 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2327 	struct virtio_net_data_ll *ll_main_dev_cur;
2328 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2329 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2330 	struct vhost_dev *vdev;
2331 	int lcore;
2332 
2333 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2334 
2335 	vdev = (struct vhost_dev *)dev->priv;
2336 	/*set the remove flag. */
2337 	vdev->remove = 1;
2338 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2339 		rte_pause();
2340 	}
2341 
2342 	/* Search for entry to be removed from lcore ll */
2343 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2344 	while (ll_lcore_dev_cur != NULL) {
2345 		if (ll_lcore_dev_cur->vdev == vdev) {
2346 			break;
2347 		} else {
2348 			ll_lcore_dev_last = ll_lcore_dev_cur;
2349 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2350 		}
2351 	}
2352 
2353 	if (ll_lcore_dev_cur == NULL) {
2354 		RTE_LOG(ERR, VHOST_CONFIG,
2355 			"(%"PRIu64") Failed to find the dev to be destroy.\n",
2356 			dev->device_fh);
2357 		return;
2358 	}
2359 
2360 	/* Search for entry to be removed from main ll */
2361 	ll_main_dev_cur = ll_root_used;
2362 	ll_main_dev_last = NULL;
2363 	while (ll_main_dev_cur != NULL) {
2364 		if (ll_main_dev_cur->vdev == vdev) {
2365 			break;
2366 		} else {
2367 			ll_main_dev_last = ll_main_dev_cur;
2368 			ll_main_dev_cur = ll_main_dev_cur->next;
2369 		}
2370 	}
2371 
2372 	/* Remove entries from the lcore and main ll. */
2373 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2374 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2375 
2376 	/* Set the dev_removal_flag on each lcore. */
2377 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2378 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2379 	}
2380 
2381 	/*
2382 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2383 	 * they can no longer access the device removed from the linked lists and that the devices
2384 	 * are no longer in use.
2385 	 */
2386 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2387 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2388 			rte_pause();
2389 		}
2390 	}
2391 
2392 	/* Add the entries back to the lcore and main free ll.*/
2393 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2394 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2395 
2396 	/* Decrement number of device on the lcore. */
2397 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2398 
2399 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2400 
2401 	if (zero_copy) {
2402 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2403 
2404 		/* Stop the RX queue. */
2405 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2406 			LOG_DEBUG(VHOST_CONFIG,
2407 				"(%"PRIu64") In destroy_device: Failed to stop "
2408 				"rx queue:%d\n",
2409 				dev->device_fh,
2410 				vdev->vmdq_rx_q);
2411 		}
2412 
2413 		LOG_DEBUG(VHOST_CONFIG,
2414 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2415 			"mempool back to ring for RX queue: %d\n",
2416 			dev->device_fh, vdev->vmdq_rx_q);
2417 
2418 		mbuf_destroy_zcp(vpool);
2419 
2420 		/* Stop the TX queue. */
2421 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2422 			LOG_DEBUG(VHOST_CONFIG,
2423 				"(%"PRIu64") In destroy_device: Failed to "
2424 				"stop tx queue:%d\n",
2425 				dev->device_fh, vdev->vmdq_rx_q);
2426 		}
2427 
2428 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2429 
2430 		LOG_DEBUG(VHOST_CONFIG,
2431 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2432 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2433 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2434 			dev->device_fh);
2435 
2436 		mbuf_destroy_zcp(vpool);
2437 		rte_free(vdev->regions_hpa);
2438 	}
2439 	rte_free(vdev);
2440 
2441 }
2442 
2443 /*
2444  * Calculate the region count of physical continous regions for one particular
2445  * region of whose vhost virtual address is continous. The particular region
2446  * start from vva_start, with size of 'size' in argument.
2447  */
2448 static uint32_t
2449 check_hpa_regions(uint64_t vva_start, uint64_t size)
2450 {
2451 	uint32_t i, nregions = 0, page_size = getpagesize();
2452 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2453 	if (vva_start % page_size) {
2454 		LOG_DEBUG(VHOST_CONFIG,
2455 			"in check_countinous: vva start(%p) mod page_size(%d) "
2456 			"has remainder\n",
2457 			(void *)(uintptr_t)vva_start, page_size);
2458 		return 0;
2459 	}
2460 	if (size % page_size) {
2461 		LOG_DEBUG(VHOST_CONFIG,
2462 			"in check_countinous: "
2463 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2464 			size, page_size);
2465 		return 0;
2466 	}
2467 	for (i = 0; i < size - page_size; i = i + page_size) {
2468 		cur_phys_addr
2469 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2470 		next_phys_addr = rte_mem_virt2phy(
2471 			(void *)(uintptr_t)(vva_start + i + page_size));
2472 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2473 			++nregions;
2474 			LOG_DEBUG(VHOST_CONFIG,
2475 				"in check_continuous: hva addr:(%p) is not "
2476 				"continuous with hva addr:(%p), diff:%d\n",
2477 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2478 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2479 				+ page_size), page_size);
2480 			LOG_DEBUG(VHOST_CONFIG,
2481 				"in check_continuous: hpa addr:(%p) is not "
2482 				"continuous with hpa addr:(%p), "
2483 				"diff:(%"PRIu64")\n",
2484 				(void *)(uintptr_t)cur_phys_addr,
2485 				(void *)(uintptr_t)next_phys_addr,
2486 				(next_phys_addr-cur_phys_addr));
2487 		}
2488 	}
2489 	return nregions;
2490 }
2491 
2492 /*
2493  * Divide each region whose vhost virtual address is continous into a few
2494  * sub-regions, make sure the physical address within each sub-region are
2495  * continous. And fill offset(to GPA) and size etc. information of each
2496  * sub-region into regions_hpa.
2497  */
2498 static uint32_t
2499 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2500 {
2501 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2502 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2503 
2504 	if (mem_region_hpa == NULL)
2505 		return 0;
2506 
2507 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2508 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2509 			virtio_memory->regions[regionidx].address_offset;
2510 		mem_region_hpa[regionidx_hpa].guest_phys_address
2511 			= virtio_memory->regions[regionidx].guest_phys_address;
2512 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2513 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2514 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2515 		LOG_DEBUG(VHOST_CONFIG,
2516 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2517 			regionidx_hpa,
2518 			(void *)(uintptr_t)
2519 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2520 		LOG_DEBUG(VHOST_CONFIG,
2521 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2522 			regionidx_hpa,
2523 			(void *)(uintptr_t)
2524 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2525 		for (i = 0, k = 0;
2526 			i < virtio_memory->regions[regionidx].memory_size -
2527 				page_size;
2528 			i += page_size) {
2529 			cur_phys_addr = rte_mem_virt2phy(
2530 					(void *)(uintptr_t)(vva_start + i));
2531 			next_phys_addr = rte_mem_virt2phy(
2532 					(void *)(uintptr_t)(vva_start +
2533 					i + page_size));
2534 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2535 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2536 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2537 					k + page_size;
2538 				mem_region_hpa[regionidx_hpa].memory_size
2539 					= k + page_size;
2540 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2541 					"phys addr end  [%d]:(%p)\n",
2542 					regionidx_hpa,
2543 					(void *)(uintptr_t)
2544 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2545 				LOG_DEBUG(VHOST_CONFIG,
2546 					"in fill_hpa_regions: guest phys addr "
2547 					"size [%d]:(%p)\n",
2548 					regionidx_hpa,
2549 					(void *)(uintptr_t)
2550 					(mem_region_hpa[regionidx_hpa].memory_size));
2551 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2552 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2553 				++regionidx_hpa;
2554 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2555 					next_phys_addr -
2556 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2557 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2558 					" phys addr start[%d]:(%p)\n",
2559 					regionidx_hpa,
2560 					(void *)(uintptr_t)
2561 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2562 				LOG_DEBUG(VHOST_CONFIG,
2563 					"in fill_hpa_regions: host  phys addr "
2564 					"start[%d]:(%p)\n",
2565 					regionidx_hpa,
2566 					(void *)(uintptr_t)
2567 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2568 				k = 0;
2569 			} else {
2570 				k += page_size;
2571 			}
2572 		}
2573 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2574 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2575 			+ k + page_size;
2576 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2577 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2578 			"[%d]:(%p)\n", regionidx_hpa,
2579 			(void *)(uintptr_t)
2580 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2581 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2582 			"[%d]:(%p)\n", regionidx_hpa,
2583 			(void *)(uintptr_t)
2584 			(mem_region_hpa[regionidx_hpa].memory_size));
2585 		++regionidx_hpa;
2586 	}
2587 	return regionidx_hpa;
2588 }
2589 
2590 /*
2591  * A new device is added to a data core. First the device is added to the main linked list
2592  * and the allocated to a specific data core.
2593  */
2594 static int
2595 new_device (struct virtio_net *dev)
2596 {
2597 	struct virtio_net_data_ll *ll_dev;
2598 	int lcore, core_add = 0;
2599 	uint32_t device_num_min = num_devices;
2600 	struct vhost_dev *vdev;
2601 	uint32_t regionidx;
2602 
2603 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2604 	if (vdev == NULL) {
2605 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2606 			dev->device_fh);
2607 		return -1;
2608 	}
2609 	vdev->dev = dev;
2610 	dev->priv = vdev;
2611 
2612 	if (zero_copy) {
2613 		vdev->nregions_hpa = dev->mem->nregions;
2614 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2615 			vdev->nregions_hpa
2616 				+= check_hpa_regions(
2617 					dev->mem->regions[regionidx].guest_phys_address
2618 					+ dev->mem->regions[regionidx].address_offset,
2619 					dev->mem->regions[regionidx].memory_size);
2620 
2621 		}
2622 
2623 		vdev->regions_hpa = rte_calloc("vhost hpa region",
2624 					       vdev->nregions_hpa,
2625 					       sizeof(struct virtio_memory_regions_hpa),
2626 					       RTE_CACHE_LINE_SIZE);
2627 		if (vdev->regions_hpa == NULL) {
2628 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2629 			rte_free(vdev);
2630 			return -1;
2631 		}
2632 
2633 
2634 		if (fill_hpa_memory_regions(
2635 			vdev->regions_hpa, dev->mem
2636 			) != vdev->nregions_hpa) {
2637 
2638 			RTE_LOG(ERR, VHOST_CONFIG,
2639 				"hpa memory regions number mismatch: "
2640 				"[%d]\n", vdev->nregions_hpa);
2641 			rte_free(vdev->regions_hpa);
2642 			rte_free(vdev);
2643 			return -1;
2644 		}
2645 	}
2646 
2647 
2648 	/* Add device to main ll */
2649 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2650 	if (ll_dev == NULL) {
2651 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2652 			"of %d devices per core has been reached\n",
2653 			dev->device_fh, num_devices);
2654 		if (vdev->regions_hpa)
2655 			rte_free(vdev->regions_hpa);
2656 		rte_free(vdev);
2657 		return -1;
2658 	}
2659 	ll_dev->vdev = vdev;
2660 	add_data_ll_entry(&ll_root_used, ll_dev);
2661 	vdev->vmdq_rx_q
2662 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2663 
2664 	if (zero_copy) {
2665 		uint32_t index = vdev->vmdq_rx_q;
2666 		uint32_t count_in_ring, i;
2667 		struct mbuf_table *tx_q;
2668 
2669 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2670 
2671 		LOG_DEBUG(VHOST_CONFIG,
2672 			"(%"PRIu64") in new_device: mbuf count in mempool "
2673 			"before attach is: %d\n",
2674 			dev->device_fh,
2675 			rte_mempool_count(vpool_array[index].pool));
2676 		LOG_DEBUG(VHOST_CONFIG,
2677 			"(%"PRIu64") in new_device: mbuf count in  ring "
2678 			"before attach  is : %d\n",
2679 			dev->device_fh, count_in_ring);
2680 
2681 		/*
2682 		 * Attach all mbufs in vpool.ring and put back intovpool.pool.
2683 		 */
2684 		for (i = 0; i < count_in_ring; i++)
2685 			attach_rxmbuf_zcp(dev);
2686 
2687 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2688 			"mempool after attach is: %d\n",
2689 			dev->device_fh,
2690 			rte_mempool_count(vpool_array[index].pool));
2691 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2692 			"ring after attach  is : %d\n",
2693 			dev->device_fh,
2694 			rte_ring_count(vpool_array[index].ring));
2695 
2696 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2697 		tx_q->txq_id = vdev->vmdq_rx_q;
2698 
2699 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2700 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2701 
2702 			LOG_DEBUG(VHOST_CONFIG,
2703 				"(%"PRIu64") In new_device: Failed to start "
2704 				"tx queue:%d\n",
2705 				dev->device_fh, vdev->vmdq_rx_q);
2706 
2707 			mbuf_destroy_zcp(vpool);
2708 			rte_free(vdev->regions_hpa);
2709 			rte_free(vdev);
2710 			return -1;
2711 		}
2712 
2713 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2714 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2715 
2716 			LOG_DEBUG(VHOST_CONFIG,
2717 				"(%"PRIu64") In new_device: Failed to start "
2718 				"rx queue:%d\n",
2719 				dev->device_fh, vdev->vmdq_rx_q);
2720 
2721 			/* Stop the TX queue. */
2722 			if (rte_eth_dev_tx_queue_stop(ports[0],
2723 				vdev->vmdq_rx_q) != 0) {
2724 				LOG_DEBUG(VHOST_CONFIG,
2725 					"(%"PRIu64") In new_device: Failed to "
2726 					"stop tx queue:%d\n",
2727 					dev->device_fh, vdev->vmdq_rx_q);
2728 			}
2729 
2730 			mbuf_destroy_zcp(vpool);
2731 			rte_free(vdev->regions_hpa);
2732 			rte_free(vdev);
2733 			return -1;
2734 		}
2735 
2736 	}
2737 
2738 	/*reset ready flag*/
2739 	vdev->ready = DEVICE_MAC_LEARNING;
2740 	vdev->remove = 0;
2741 
2742 	/* Find a suitable lcore to add the device. */
2743 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2744 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2745 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2746 			core_add = lcore;
2747 		}
2748 	}
2749 	/* Add device to lcore ll */
2750 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2751 	if (ll_dev == NULL) {
2752 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2753 		vdev->ready = DEVICE_SAFE_REMOVE;
2754 		destroy_device(dev);
2755 		rte_free(vdev->regions_hpa);
2756 		rte_free(vdev);
2757 		return -1;
2758 	}
2759 	ll_dev->vdev = vdev;
2760 	vdev->coreid = core_add;
2761 
2762 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2763 
2764 	/* Initialize device stats */
2765 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2766 
2767 	/* Disable notifications. */
2768 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2769 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2770 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2771 	dev->flags |= VIRTIO_DEV_RUNNING;
2772 
2773 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2774 
2775 	return 0;
2776 }
2777 
2778 /*
2779  * These callback allow devices to be added to the data core when configuration
2780  * has been fully complete.
2781  */
2782 static const struct virtio_net_device_ops virtio_net_device_ops =
2783 {
2784 	.new_device =  new_device,
2785 	.destroy_device = destroy_device,
2786 };
2787 
2788 /*
2789  * This is a thread will wake up after a period to print stats if the user has
2790  * enabled them.
2791  */
2792 static void
2793 print_stats(void)
2794 {
2795 	struct virtio_net_data_ll *dev_ll;
2796 	uint64_t tx_dropped, rx_dropped;
2797 	uint64_t tx, tx_total, rx, rx_total;
2798 	uint32_t device_fh;
2799 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2800 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2801 
2802 	while(1) {
2803 		sleep(enable_stats);
2804 
2805 		/* Clear screen and move to top left */
2806 		printf("%s%s", clr, top_left);
2807 
2808 		printf("\nDevice statistics ====================================");
2809 
2810 		dev_ll = ll_root_used;
2811 		while (dev_ll != NULL) {
2812 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2813 			tx_total = dev_statistics[device_fh].tx_total;
2814 			tx = dev_statistics[device_fh].tx;
2815 			tx_dropped = tx_total - tx;
2816 			if (zero_copy == 0) {
2817 				rx_total = rte_atomic64_read(
2818 					&dev_statistics[device_fh].rx_total_atomic);
2819 				rx = rte_atomic64_read(
2820 					&dev_statistics[device_fh].rx_atomic);
2821 			} else {
2822 				rx_total = dev_statistics[device_fh].rx_total;
2823 				rx = dev_statistics[device_fh].rx;
2824 			}
2825 			rx_dropped = rx_total - rx;
2826 
2827 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2828 					"\nTX total: 		%"PRIu64""
2829 					"\nTX dropped: 		%"PRIu64""
2830 					"\nTX successful: 		%"PRIu64""
2831 					"\nRX total: 		%"PRIu64""
2832 					"\nRX dropped: 		%"PRIu64""
2833 					"\nRX successful: 		%"PRIu64"",
2834 					device_fh,
2835 					tx_total,
2836 					tx_dropped,
2837 					tx,
2838 					rx_total,
2839 					rx_dropped,
2840 					rx);
2841 
2842 			dev_ll = dev_ll->next;
2843 		}
2844 		printf("\n======================================================\n");
2845 	}
2846 }
2847 
2848 static void
2849 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2850 	char *ring_name, uint32_t nb_mbuf)
2851 {
2852 	vpool_array[index].pool	= rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2853 		MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2854 	if (vpool_array[index].pool != NULL) {
2855 		vpool_array[index].ring
2856 			= rte_ring_create(ring_name,
2857 				rte_align32pow2(nb_mbuf + 1),
2858 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2859 		if (likely(vpool_array[index].ring != NULL)) {
2860 			LOG_DEBUG(VHOST_CONFIG,
2861 				"in setup_mempool_tbl: mbuf count in "
2862 				"mempool is: %d\n",
2863 				rte_mempool_count(vpool_array[index].pool));
2864 			LOG_DEBUG(VHOST_CONFIG,
2865 				"in setup_mempool_tbl: mbuf count in "
2866 				"ring   is: %d\n",
2867 				rte_ring_count(vpool_array[index].ring));
2868 		} else {
2869 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2870 				ring_name);
2871 		}
2872 
2873 		/* Need consider head room. */
2874 		vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2875 	} else {
2876 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2877 	}
2878 }
2879 
2880 /* When we receive a INT signal, unregister vhost driver */
2881 static void
2882 sigint_handler(__rte_unused int signum)
2883 {
2884 	/* Unregister vhost driver. */
2885 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2886 	if (ret != 0)
2887 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2888 	exit(0);
2889 }
2890 
2891 /*
2892  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2893  * device is also registered here to handle the IOCTLs.
2894  */
2895 int
2896 main(int argc, char *argv[])
2897 {
2898 	struct rte_mempool *mbuf_pool = NULL;
2899 	unsigned lcore_id, core_id = 0;
2900 	unsigned nb_ports, valid_num_ports;
2901 	int ret;
2902 	uint8_t portid;
2903 	uint16_t queue_id;
2904 	static pthread_t tid;
2905 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
2906 
2907 	signal(SIGINT, sigint_handler);
2908 
2909 	/* init EAL */
2910 	ret = rte_eal_init(argc, argv);
2911 	if (ret < 0)
2912 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2913 	argc -= ret;
2914 	argv += ret;
2915 
2916 	/* parse app arguments */
2917 	ret = us_vhost_parse_args(argc, argv);
2918 	if (ret < 0)
2919 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2920 
2921 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2922 		if (rte_lcore_is_enabled(lcore_id))
2923 			lcore_ids[core_id ++] = lcore_id;
2924 
2925 	if (rte_lcore_count() > RTE_MAX_LCORE)
2926 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2927 
2928 	/*set the number of swithcing cores available*/
2929 	num_switching_cores = rte_lcore_count()-1;
2930 
2931 	/* Get the number of physical ports. */
2932 	nb_ports = rte_eth_dev_count();
2933 	if (nb_ports > RTE_MAX_ETHPORTS)
2934 		nb_ports = RTE_MAX_ETHPORTS;
2935 
2936 	/*
2937 	 * Update the global var NUM_PORTS and global array PORTS
2938 	 * and get value of var VALID_NUM_PORTS according to system ports number
2939 	 */
2940 	valid_num_ports = check_ports_num(nb_ports);
2941 
2942 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2943 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
2944 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
2945 		return -1;
2946 	}
2947 
2948 	if (zero_copy == 0) {
2949 		/* Create the mbuf pool. */
2950 		mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
2951 			NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
2952 			0, MBUF_DATA_SIZE, rte_socket_id());
2953 		if (mbuf_pool == NULL)
2954 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2955 
2956 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2957 			vpool_array[queue_id].pool = mbuf_pool;
2958 
2959 		if (vm2vm_mode == VM2VM_HARDWARE) {
2960 			/* Enable VT loop back to let L2 switch to do it. */
2961 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2962 			LOG_DEBUG(VHOST_CONFIG,
2963 				"Enable loop back for L2 switch in vmdq.\n");
2964 		}
2965 	} else {
2966 		uint32_t nb_mbuf;
2967 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2968 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2969 
2970 		nb_mbuf = num_rx_descriptor
2971 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2972 			+ num_switching_cores * MAX_PKT_BURST;
2973 
2974 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2975 			snprintf(pool_name, sizeof(pool_name),
2976 				"rxmbuf_pool_%u", queue_id);
2977 			snprintf(ring_name, sizeof(ring_name),
2978 				"rxmbuf_ring_%u", queue_id);
2979 			setup_mempool_tbl(rte_socket_id(), queue_id,
2980 				pool_name, ring_name, nb_mbuf);
2981 		}
2982 
2983 		nb_mbuf = num_tx_descriptor
2984 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2985 				+ num_switching_cores * MAX_PKT_BURST;
2986 
2987 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2988 			snprintf(pool_name, sizeof(pool_name),
2989 				"txmbuf_pool_%u", queue_id);
2990 			snprintf(ring_name, sizeof(ring_name),
2991 				"txmbuf_ring_%u", queue_id);
2992 			setup_mempool_tbl(rte_socket_id(),
2993 				(queue_id + MAX_QUEUES),
2994 				pool_name, ring_name, nb_mbuf);
2995 		}
2996 
2997 		if (vm2vm_mode == VM2VM_HARDWARE) {
2998 			/* Enable VT loop back to let L2 switch to do it. */
2999 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3000 			LOG_DEBUG(VHOST_CONFIG,
3001 				"Enable loop back for L2 switch in vmdq.\n");
3002 		}
3003 	}
3004 	/* Set log level. */
3005 	rte_set_log_level(LOG_LEVEL);
3006 
3007 	/* initialize all ports */
3008 	for (portid = 0; portid < nb_ports; portid++) {
3009 		/* skip ports that are not enabled */
3010 		if ((enabled_port_mask & (1 << portid)) == 0) {
3011 			RTE_LOG(INFO, VHOST_PORT,
3012 				"Skipping disabled port %d\n", portid);
3013 			continue;
3014 		}
3015 		if (port_init(portid) != 0)
3016 			rte_exit(EXIT_FAILURE,
3017 				"Cannot initialize network ports\n");
3018 	}
3019 
3020 	/* Initialise all linked lists. */
3021 	if (init_data_ll() == -1)
3022 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3023 
3024 	/* Initialize device stats */
3025 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3026 
3027 	/* Enable stats if the user option is set. */
3028 	if (enable_stats) {
3029 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3030 		if (ret != 0)
3031 			rte_exit(EXIT_FAILURE,
3032 				"Cannot create print-stats thread\n");
3033 
3034 		/* Set thread_name for aid in debugging.  */
3035 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3036 		ret = rte_thread_setname(tid, thread_name);
3037 		if (ret != 0)
3038 			RTE_LOG(ERR, VHOST_CONFIG,
3039 				"Cannot set print-stats name\n");
3040 	}
3041 
3042 	/* Launch all data cores. */
3043 	if (zero_copy == 0) {
3044 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3045 			rte_eal_remote_launch(switch_worker,
3046 				mbuf_pool, lcore_id);
3047 		}
3048 	} else {
3049 		uint32_t count_in_mempool, index, i;
3050 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3051 			/* For all RX and TX queues. */
3052 			count_in_mempool
3053 				= rte_mempool_count(vpool_array[index].pool);
3054 
3055 			/*
3056 			 * Transfer all un-attached mbufs from vpool.pool
3057 			 * to vpoo.ring.
3058 			 */
3059 			for (i = 0; i < count_in_mempool; i++) {
3060 				struct rte_mbuf *mbuf
3061 					= __rte_mbuf_raw_alloc(
3062 						vpool_array[index].pool);
3063 				rte_ring_sp_enqueue(vpool_array[index].ring,
3064 						(void *)mbuf);
3065 			}
3066 
3067 			LOG_DEBUG(VHOST_CONFIG,
3068 				"in main: mbuf count in mempool at initial "
3069 				"is: %d\n", count_in_mempool);
3070 			LOG_DEBUG(VHOST_CONFIG,
3071 				"in main: mbuf count in  ring at initial  is :"
3072 				" %d\n",
3073 				rte_ring_count(vpool_array[index].ring));
3074 		}
3075 
3076 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3077 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3078 				lcore_id);
3079 	}
3080 
3081 	if (mergeable == 0)
3082 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3083 
3084 	/* Register vhost(cuse or user) driver to handle vhost messages. */
3085 	ret = rte_vhost_driver_register((char *)&dev_basename);
3086 	if (ret != 0)
3087 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3088 
3089 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3090 
3091 	/* Start CUSE session. */
3092 	rte_vhost_driver_session_start();
3093 	return 0;
3094 
3095 }
3096