xref: /dpdk/examples/vhost/main.c (revision 68363d85857d7ae5326f58e34e8cf0c9b1553b97)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 
56 #include "main.h"
57 
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61 
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64 
65 /*
66  * Calculate the number of buffers needed per port
67  */
68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +		\
69 							(num_switching_cores*MAX_PKT_BURST) +  			\
70 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
71 							((num_switching_cores+1)*MBUF_CACHE_SIZE))
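/*
 * Editor's note (illustrative, not from the original source): with the
 * defaults above and, say, three switching cores, the per-port estimate is
 *   128 * 1024  (RX descriptors for all VMDQ queues)        = 131072
 * +   3 * 32    (one burst in flight per switching core)    =     96
 * +   3 * 512   (TX descriptors per switching core)         =   1536
 * +   4 * 128   (mempool cache per core, plus one)          =    512
 * = 133216 mbufs.
 */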
72 
73 #define MBUF_CACHE_SIZE	128
74 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
75 
76 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
77 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
78 
79 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
80 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
81 
82 #define JUMBO_FRAME_MAX_SIZE    0x2600
83 
84 /* State of virtio device. */
85 #define DEVICE_MAC_LEARNING 0
86 #define DEVICE_RX			1
87 #define DEVICE_SAFE_REMOVE	2
88 
89 /* Config_core_flag status definitions. */
90 #define REQUEST_DEV_REMOVAL 1
91 #define ACK_DEV_REMOVAL 0
92 
93 /* Configurable number of RX/TX ring descriptors */
94 #define RTE_TEST_RX_DESC_DEFAULT 1024
95 #define RTE_TEST_TX_DESC_DEFAULT 512
96 
97 #define INVALID_PORT_ID 0xFF
98 
99 /* Max number of devices. Limited by vmdq. */
100 #define MAX_DEVICES 64
101 
102 /* Size of buffers used for snprintfs. */
103 #define MAX_PRINT_BUFF 6072
104 
105 /* Maximum character device basename size. */
106 #define MAX_BASENAME_SZ 10
107 
108 /* Maximum long option length for option parsing. */
109 #define MAX_LONG_OPT_SZ 64
110 
111 /* Used to compare MAC addresses. */
112 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
113 
114 /* Number of descriptors per cacheline. */
115 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
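/*
 * Editor's note: on a typical 64-byte cache line, with the 16-byte
 * struct vring_desc (8-byte addr, 4-byte len, 2-byte flags, 2-byte next),
 * DESC_PER_CACHELINE evaluates to 4.
 */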
116 
117 /* mask of enabled ports */
118 static uint32_t enabled_port_mask = 0;
119 
120 /* Promiscuous mode */
121 static uint32_t promiscuous;
122 
123 /*Number of switching cores enabled*/
124 static uint32_t num_switching_cores = 0;
125 
126 /* number of devices/queues to support*/
127 static uint32_t num_queues = 0;
128 static uint32_t num_devices;
129 
130 static struct rte_mempool *mbuf_pool;
131 static int mergeable;
132 
133 /* Do vlan strip on host, enabled by default */
134 static uint32_t vlan_strip = 1;
135 
136 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
137 typedef enum {
138 	VM2VM_DISABLED = 0,
139 	VM2VM_SOFTWARE = 1,
140 	VM2VM_HARDWARE = 2,
141 	VM2VM_LAST
142 } vm2vm_type;
143 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
144 
145 /* Enable stats. */
146 static uint32_t enable_stats = 0;
147 /* Enable retries on RX. */
148 static uint32_t enable_retry = 1;
149 
150 /* Disable TX checksum offload */
151 static uint32_t enable_tx_csum;
152 
153 /* Disable TSO offload */
154 static uint32_t enable_tso;
155 
156 /* Specify timeout (in microseconds) between retries on RX. */
157 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
158 /* Specify the number of retries on RX. */
159 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
160 
161 /* Character device basename. Can be set by user. */
162 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
163 
164 /* Empty VMDQ configuration structure. Filled in programmatically. */
165 static struct rte_eth_conf vmdq_conf_default = {
166 	.rxmode = {
167 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
168 		.split_hdr_size = 0,
169 		.header_split   = 0, /**< Header Split disabled */
170 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
171 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
172 		/*
173 		 * This is needed for 1G NICs such as the I350:
174 		 * it fixes a bug where IPv4 forwarding in the guest cannot
175 		 * forward packets from one virtio device to another.
176 		 */
177 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
178 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
179 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
180 	},
181 
182 	.txmode = {
183 		.mq_mode = ETH_MQ_TX_NONE,
184 	},
185 	.rx_adv_conf = {
186 		/*
187 		 * should be overridden separately in code with
188 		 * appropriate values
189 		 */
190 		.vmdq_rx_conf = {
191 			.nb_queue_pools = ETH_8_POOLS,
192 			.enable_default_pool = 0,
193 			.default_pool = 0,
194 			.nb_pool_maps = 0,
195 			.pool_map = {{0, 0},},
196 		},
197 	},
198 };
199 
200 static unsigned lcore_ids[RTE_MAX_LCORE];
201 static uint8_t ports[RTE_MAX_ETHPORTS];
202 static unsigned num_ports = 0; /**< The number of ports specified in command line */
203 static uint16_t num_pf_queues, num_vmdq_queues;
204 static uint16_t vmdq_pool_base, vmdq_queue_base;
205 static uint16_t queues_per_pool;
206 
207 const uint16_t vlan_tags[] = {
208 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
209 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
210 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
211 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
212 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
213 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
214 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
215 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
216 };
217 
218 /* ethernet addresses of ports */
219 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
220 
221 /* heads for the main used and free linked lists for the data path. */
222 static struct virtio_net_data_ll *ll_root_used = NULL;
223 static struct virtio_net_data_ll *ll_root_free = NULL;
224 
225 /* Array of data core structures containing information on individual core linked lists. */
226 static struct lcore_info lcore_info[RTE_MAX_LCORE];
227 
228 /* Used for queueing bursts of TX packets. */
229 struct mbuf_table {
230 	unsigned len;
231 	unsigned txq_id;
232 	struct rte_mbuf *m_table[MAX_PKT_BURST];
233 };
234 
235 /* TX queue for each data core. */
236 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
237 
238 /* Vlan header struct used to insert vlan tags on TX. */
239 struct vlan_ethhdr {
240 	unsigned char   h_dest[ETH_ALEN];
241 	unsigned char   h_source[ETH_ALEN];
242 	__be16          h_vlan_proto;
243 	__be16          h_vlan_TCI;
244 	__be16          h_vlan_encapsulated_proto;
245 };
246 
247 /* Header lengths. */
248 #define VLAN_HLEN       4
249 #define VLAN_ETH_HLEN   18
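/*
 * Editor's note: VLAN_ETH_HLEN is the 14-byte Ethernet header plus the
 * 4-byte 802.1Q tag defined by struct vlan_ethhdr above.
 */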
250 
251 /* Per-device statistics struct */
252 struct device_statistics {
253 	uint64_t tx_total;
254 	rte_atomic64_t rx_total_atomic;
255 	uint64_t tx;
256 	rte_atomic64_t rx_atomic;
257 } __rte_cache_aligned;
258 struct device_statistics dev_statistics[MAX_DEVICES];
259 
260 /*
261  * Builds up the correct configuration for VMDQ VLAN pool map
262  * according to the pool & queue limits.
263  */
264 static inline int
265 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
266 {
267 	struct rte_eth_vmdq_rx_conf conf;
268 	struct rte_eth_vmdq_rx_conf *def_conf =
269 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
270 	unsigned i;
271 
272 	memset(&conf, 0, sizeof(conf));
273 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
274 	conf.nb_pool_maps = num_devices;
275 	conf.enable_loop_back = def_conf->enable_loop_back;
276 	conf.rx_mode = def_conf->rx_mode;
277 
278 	for (i = 0; i < conf.nb_pool_maps; i++) {
279 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
280 		conf.pool_map[i].pools = (1UL << i);
281 	}
282 
283 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
284 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
285 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
286 	return 0;
287 }
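/*
 * Editor's note (illustrative, not from the original source): for
 * num_devices == 2 the resulting VMDQ RX configuration is
 *   nb_queue_pools = 2, nb_pool_maps = 2,
 *   pool_map[0] = { .vlan_id = 1000, .pools = 0x1 },
 *   pool_map[1] = { .vlan_id = 1001, .pools = 0x2 },
 * i.e. each guest gets its own pool, selected by its VLAN tag.
 */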
288 
289 /*
290  * Validate the number of devices against the maximum pool number obtained from
291  * dev_info. If the device number is invalid, log an error message and
292  * return -1. Each device must have its own pool.
293  */
294 static inline int
295 validate_num_devices(uint32_t max_nb_devices)
296 {
297 	if (num_devices > max_nb_devices) {
298 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
299 		return -1;
300 	}
301 	return 0;
302 }
303 
304 /*
305  * Initialises a given port using global settings and with the RX buffers
306  * coming from the global mbuf_pool.
307  */
308 static inline int
309 port_init(uint8_t port)
310 {
311 	struct rte_eth_dev_info dev_info;
312 	struct rte_eth_conf port_conf;
313 	struct rte_eth_rxconf *rxconf;
314 	struct rte_eth_txconf *txconf;
315 	int16_t rx_rings, tx_rings;
316 	uint16_t rx_ring_size, tx_ring_size;
317 	int retval;
318 	uint16_t q;
319 
320 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
321 	rte_eth_dev_info_get (port, &dev_info);
322 
323 	if (dev_info.max_rx_queues > MAX_QUEUES) {
324 		rte_exit(EXIT_FAILURE,
325 			"please define MAX_QUEUES no less than %u in %s\n",
326 			dev_info.max_rx_queues, __FILE__);
327 	}
328 
329 	rxconf = &dev_info.default_rxconf;
330 	txconf = &dev_info.default_txconf;
331 	rxconf->rx_drop_en = 1;
332 
333 	/* Enable vlan offload */
334 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
335 
336 	/*configure the number of supported virtio devices based on VMDQ limits */
337 	num_devices = dev_info.max_vmdq_pools;
338 
339 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
340 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
341 	tx_rings = (uint16_t)rte_lcore_count();
342 
343 	retval = validate_num_devices(MAX_DEVICES);
344 	if (retval < 0)
345 		return retval;
346 
347 	/* Get port configuration. */
348 	retval = get_eth_conf(&port_conf, num_devices);
349 	if (retval < 0)
350 		return retval;
351 	/* NIC queues are divided into pf queues and vmdq queues.  */
352 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
353 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
354 	num_vmdq_queues = num_devices * queues_per_pool;
355 	num_queues = num_pf_queues + num_vmdq_queues;
356 	vmdq_queue_base = dev_info.vmdq_queue_base;
357 	vmdq_pool_base  = dev_info.vmdq_pool_base;
358 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
359 		num_pf_queues, num_devices, queues_per_pool);
360 
361 	if (port >= rte_eth_dev_count()) return -1;
362 
363 	if (enable_tx_csum == 0)
364 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
365 
366 	if (enable_tso == 0) {
367 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
368 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
369 	}
370 
371 	rx_rings = (uint16_t)dev_info.max_rx_queues;
372 	/* Configure ethernet device. */
373 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
374 	if (retval != 0)
375 		return retval;
376 
377 	/* Setup the queues. */
378 	for (q = 0; q < rx_rings; q ++) {
379 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
380 						rte_eth_dev_socket_id(port),
381 						rxconf,
382 						mbuf_pool);
383 		if (retval < 0)
384 			return retval;
385 	}
386 	for (q = 0; q < tx_rings; q ++) {
387 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
388 						rte_eth_dev_socket_id(port),
389 						txconf);
390 		if (retval < 0)
391 			return retval;
392 	}
393 
394 	/* Start the device. */
395 	retval  = rte_eth_dev_start(port);
396 	if (retval < 0) {
397 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
398 		return retval;
399 	}
400 
401 	if (promiscuous)
402 		rte_eth_promiscuous_enable(port);
403 
404 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
405 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
406 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
407 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
408 			(unsigned)port,
409 			vmdq_ports_eth_addr[port].addr_bytes[0],
410 			vmdq_ports_eth_addr[port].addr_bytes[1],
411 			vmdq_ports_eth_addr[port].addr_bytes[2],
412 			vmdq_ports_eth_addr[port].addr_bytes[3],
413 			vmdq_ports_eth_addr[port].addr_bytes[4],
414 			vmdq_ports_eth_addr[port].addr_bytes[5]);
415 
416 	return 0;
417 }
418 
419 /*
420  * Set character device basename.
421  */
422 static int
423 us_vhost_parse_basename(const char *q_arg)
424 {
425 	/* Reject basenames that are too long to fit in dev_basename. */
426 
427 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
428 		return -1;
429 	else
430 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
431 
432 	return 0;
433 }
434 
435 /*
436  * Parse the portmask provided at run time.
437  */
438 static int
439 parse_portmask(const char *portmask)
440 {
441 	char *end = NULL;
442 	unsigned long pm;
443 
444 	errno = 0;
445 
446 	/* parse hexadecimal string */
447 	pm = strtoul(portmask, &end, 16);
448 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
449 		return -1;
450 
451 	if (pm == 0)
452 		return -1;
453 
454 	return pm;
455 
456 }
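/*
 * Editor's note: the portmask is a hex bitmap of port IDs, so for example
 * "-p 0x1" selects port 0 and "-p 0x3" selects ports 0 and 1 (although this
 * example only supports MAX_SUP_PORTS == 1 enabled port).
 */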
457 
458 /*
459  * Parse num options at run time.
460  */
461 static int
462 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
463 {
464 	char *end = NULL;
465 	unsigned long num;
466 
467 	errno = 0;
468 
469 	/* parse unsigned int string */
470 	num = strtoul(q_arg, &end, 10);
471 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
472 		return -1;
473 
474 	if (num > max_valid_value)
475 		return -1;
476 
477 	return num;
478 
479 }
480 
481 /*
482  * Display usage
483  */
484 static void
485 us_vhost_usage(const char *prgname)
486 {
487 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
488 	"		--vm2vm [0|1|2]\n"
489 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
490 	"		--dev-basename <name>\n"
491 	"		--nb-devices ND\n"
492 	"		-p PORTMASK: Set mask for ports to be used by application\n"
493 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
494 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
495 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if retries on rx are enabled\n"
496 	"		--rx-retry-num [0-N]: the number of retries on rx. This takes effect only if retries on rx are enabled\n"
497 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
498 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
499 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
500 	"		--dev-basename: The basename to be used for the character device.\n"
501 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
502 	"		--tso [0|1] disable/enable TCP segmentation offload (TSO).\n",
503 	       prgname);
504 }
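/*
 * Editor's note (illustrative, not from the original source): a typical
 * invocation of this example, assuming the binary is named vhost-switch and
 * using an EAL core mask suited to the local machine, might look like
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --dev-basename vhost-net \
 *                  --vm2vm 1 --mergeable 0 --stats 2
 */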
505 
506 /*
507  * Parse the arguments given in the command line of the application.
508  */
509 static int
510 us_vhost_parse_args(int argc, char **argv)
511 {
512 	int opt, ret;
513 	int option_index;
514 	unsigned i;
515 	const char *prgname = argv[0];
516 	static struct option long_option[] = {
517 		{"vm2vm", required_argument, NULL, 0},
518 		{"rx-retry", required_argument, NULL, 0},
519 		{"rx-retry-delay", required_argument, NULL, 0},
520 		{"rx-retry-num", required_argument, NULL, 0},
521 		{"mergeable", required_argument, NULL, 0},
522 		{"vlan-strip", required_argument, NULL, 0},
523 		{"stats", required_argument, NULL, 0},
524 		{"dev-basename", required_argument, NULL, 0},
525 		{"tx-csum", required_argument, NULL, 0},
526 		{"tso", required_argument, NULL, 0},
527 		{NULL, 0, 0, 0},
528 	};
529 
530 	/* Parse command line */
531 	while ((opt = getopt_long(argc, argv, "p:P",
532 			long_option, &option_index)) != EOF) {
533 		switch (opt) {
534 		/* Portmask */
535 		case 'p':
536 			enabled_port_mask = parse_portmask(optarg);
537 			if (enabled_port_mask == 0) {
538 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
539 				us_vhost_usage(prgname);
540 				return -1;
541 			}
542 			break;
543 
544 		case 'P':
545 			promiscuous = 1;
546 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
547 				ETH_VMDQ_ACCEPT_BROADCAST |
548 				ETH_VMDQ_ACCEPT_MULTICAST;
549 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
550 
551 			break;
552 
553 		case 0:
554 			/* Enable/disable vm2vm comms. */
555 			if (!strncmp(long_option[option_index].name, "vm2vm",
556 				MAX_LONG_OPT_SZ)) {
557 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
558 				if (ret == -1) {
559 					RTE_LOG(INFO, VHOST_CONFIG,
560 						"Invalid argument for "
561 						"vm2vm [0|1|2]\n");
562 					us_vhost_usage(prgname);
563 					return -1;
564 				} else {
565 					vm2vm_mode = (vm2vm_type)ret;
566 				}
567 			}
568 
569 			/* Enable/disable retries on RX. */
570 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
571 				ret = parse_num_opt(optarg, 1);
572 				if (ret == -1) {
573 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
574 					us_vhost_usage(prgname);
575 					return -1;
576 				} else {
577 					enable_retry = ret;
578 				}
579 			}
580 
581 			/* Enable/disable TX checksum offload. */
582 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
583 				ret = parse_num_opt(optarg, 1);
584 				if (ret == -1) {
585 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
586 					us_vhost_usage(prgname);
587 					return -1;
588 				} else
589 					enable_tx_csum = ret;
590 			}
591 
592 			/* Enable/disable TSO offload. */
593 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
594 				ret = parse_num_opt(optarg, 1);
595 				if (ret == -1) {
596 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
597 					us_vhost_usage(prgname);
598 					return -1;
599 				} else
600 					enable_tso = ret;
601 			}
602 
603 			/* Specify the retry delay time (in microseconds) on RX. */
604 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
605 				ret = parse_num_opt(optarg, INT32_MAX);
606 				if (ret == -1) {
607 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
608 					us_vhost_usage(prgname);
609 					return -1;
610 				} else {
611 					burst_rx_delay_time = ret;
612 				}
613 			}
614 
615 			/* Specify the number of retries on RX. */
616 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
617 				ret = parse_num_opt(optarg, INT32_MAX);
618 				if (ret == -1) {
619 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
620 					us_vhost_usage(prgname);
621 					return -1;
622 				} else {
623 					burst_rx_retry_num = ret;
624 				}
625 			}
626 
627 			/* Enable/disable RX mergeable buffers. */
628 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
629 				ret = parse_num_opt(optarg, 1);
630 				if (ret == -1) {
631 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
632 					us_vhost_usage(prgname);
633 					return -1;
634 				} else {
635 					mergeable = !!ret;
636 					if (ret) {
637 						vmdq_conf_default.rxmode.jumbo_frame = 1;
638 						vmdq_conf_default.rxmode.max_rx_pkt_len
639 							= JUMBO_FRAME_MAX_SIZE;
640 					}
641 				}
642 			}
643 
644 			/* Enable/disable RX VLAN strip on host. */
645 			if (!strncmp(long_option[option_index].name,
646 				"vlan-strip", MAX_LONG_OPT_SZ)) {
647 				ret = parse_num_opt(optarg, 1);
648 				if (ret == -1) {
649 					RTE_LOG(INFO, VHOST_CONFIG,
650 						"Invalid argument for VLAN strip [0|1]\n");
651 					us_vhost_usage(prgname);
652 					return -1;
653 				} else {
654 					vlan_strip = !!ret;
655 					vmdq_conf_default.rxmode.hw_vlan_strip =
656 						vlan_strip;
657 				}
658 			}
659 
660 			/* Enable/disable stats. */
661 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
662 				ret = parse_num_opt(optarg, INT32_MAX);
663 				if (ret == -1) {
664 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
665 					us_vhost_usage(prgname);
666 					return -1;
667 				} else {
668 					enable_stats = ret;
669 				}
670 			}
671 
672 			/* Set character device basename. */
673 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
674 				if (us_vhost_parse_basename(optarg) == -1) {
675 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
676 					us_vhost_usage(prgname);
677 					return -1;
678 				}
679 			}
680 
681 			break;
682 
683 			/* Invalid option - print options. */
684 		default:
685 			us_vhost_usage(prgname);
686 			return -1;
687 		}
688 	}
689 
690 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
691 		if (enabled_port_mask & (1 << i))
692 			ports[num_ports++] = (uint8_t)i;
693 	}
694 
695 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
696 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
697 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
698 		return -1;
699 	}
700 
701 	return 0;
702 }
703 
704 /*
705  * Update the global variable num_ports and the ports array according to the
706  * number of system ports, and return the number of valid ports.
707  */
708 static unsigned check_ports_num(unsigned nb_ports)
709 {
710 	unsigned valid_num_ports = num_ports;
711 	unsigned portid;
712 
713 	if (num_ports > nb_ports) {
714 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
715 			num_ports, nb_ports);
716 		num_ports = nb_ports;
717 	}
718 
719 	for (portid = 0; portid < num_ports; portid ++) {
720 		if (ports[portid] >= nb_ports) {
721 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
722 				ports[portid], (nb_ports - 1));
723 			ports[portid] = INVALID_PORT_ID;
724 			valid_num_ports--;
725 		}
726 	}
727 	return valid_num_ports;
728 }
729 
730 /*
731  * Compares a packet destination MAC address to a device MAC address.
732  */
733 static inline int __attribute__((always_inline))
734 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
735 {
736 	return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
737 }
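/*
 * Editor's note: both 6-byte addresses are loaded as 64-bit words, XORed,
 * and masked with MAC_ADDR_CMP (the low 48 bits on a little-endian host) so
 * that the two bytes read past each struct ether_addr are ignored.
 */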
738 
739 /*
740  * This function learns the MAC address of the device and registers it, along
741  * with a VLAN tag, with a VMDQ pool.
742  */
743 static int
744 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
745 {
746 	struct ether_hdr *pkt_hdr;
747 	struct virtio_net_data_ll *dev_ll;
748 	struct virtio_net *dev = vdev->dev;
749 	int i, ret;
750 
751 	/* Learn MAC address of guest device from packet */
752 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
753 
754 	dev_ll = ll_root_used;
755 
756 	while (dev_ll != NULL) {
757 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
758 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
759 			return -1;
760 		}
761 		dev_ll = dev_ll->next;
762 	}
763 
764 	for (i = 0; i < ETHER_ADDR_LEN; i++)
765 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
766 
767 	/* vlan_tag currently uses the device_id. */
768 	vdev->vlan_tag = vlan_tags[dev->device_fh];
769 
770 	/* Print out VMDQ registration info. */
771 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
772 		dev->device_fh,
773 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
774 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
775 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
776 		vdev->vlan_tag);
777 
778 	/* Register the MAC address. */
779 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
780 				(uint32_t)dev->device_fh + vmdq_pool_base);
781 	if (ret)
782 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
783 					dev->device_fh);
784 
785 	/* Enable stripping of the vlan tag as we handle routing. */
786 	if (vlan_strip)
787 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
788 			(uint16_t)vdev->vmdq_rx_q, 1);
789 
790 	/* Set device as ready for RX. */
791 	vdev->ready = DEVICE_RX;
792 
793 	return 0;
794 }
795 
796 /*
797  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
798  * queue before disabling RX on the device.
799  */
800 static inline void
801 unlink_vmdq(struct vhost_dev *vdev)
802 {
803 	unsigned i = 0;
804 	unsigned rx_count;
805 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
806 
807 	if (vdev->ready == DEVICE_RX) {
808 		/*clear MAC and VLAN settings*/
809 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
810 		for (i = 0; i < 6; i++)
811 			vdev->mac_address.addr_bytes[i] = 0;
812 
813 		vdev->vlan_tag = 0;
814 
815 		/*Clear out the receive buffers*/
816 		rx_count = rte_eth_rx_burst(ports[0],
817 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
818 
819 		while (rx_count) {
820 			for (i = 0; i < rx_count; i++)
821 				rte_pktmbuf_free(pkts_burst[i]);
822 
823 			rx_count = rte_eth_rx_burst(ports[0],
824 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
825 		}
826 
827 		vdev->ready = DEVICE_MAC_LEARNING;
828 	}
829 }
830 
831 /*
832  * Check if the packet destination MAC address is for a local device. If so, put
833  * the packet on that device's RX queue. If not, return.
834  */
835 static inline int __attribute__((always_inline))
836 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
837 {
838 	struct virtio_net_data_ll *dev_ll;
839 	struct ether_hdr *pkt_hdr;
840 	uint64_t ret = 0;
841 	struct virtio_net *dev = vdev->dev;
842 	struct virtio_net *tdev; /* destination virtio device */
843 
844 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
845 
846 	/*get the used devices list*/
847 	dev_ll = ll_root_used;
848 
849 	while (dev_ll != NULL) {
850 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
851 				          &dev_ll->vdev->mac_address)) {
852 
853 			/* Drop the packet if the TX packet is destined for the TX device. */
854 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
855 				RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
856 					"Source and destination MAC addresses are the same. "
857 					"Dropping packet.\n",
858 					dev->device_fh);
859 				return 0;
860 			}
861 			tdev = dev_ll->vdev->dev;
862 
863 
864 			RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
865 				"MAC address is local\n", tdev->device_fh);
866 
867 			if (unlikely(dev_ll->vdev->remove)) {
868 				/*drop the packet if the device is marked for removal*/
869 				RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
870 					"Device is marked for removal\n", tdev->device_fh);
871 			} else {
872 				/*send the packet to the local virtio device*/
873 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
874 				if (enable_stats) {
875 					rte_atomic64_add(
876 					&dev_statistics[tdev->device_fh].rx_total_atomic,
877 					1);
878 					rte_atomic64_add(
879 					&dev_statistics[tdev->device_fh].rx_atomic,
880 					ret);
881 					dev_statistics[dev->device_fh].tx_total++;
882 					dev_statistics[dev->device_fh].tx += ret;
883 				}
884 			}
885 
886 			return 0;
887 		}
888 		dev_ll = dev_ll->next;
889 	}
890 
891 	return -1;
892 }
893 
894 /*
895  * Check if the destination MAC of a packet belongs to a local VM,
896  * and if so, get its VLAN tag and the length offset.
897  */
898 static inline int __attribute__((always_inline))
899 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
900 	uint32_t *offset, uint16_t *vlan_tag)
901 {
902 	struct virtio_net_data_ll *dev_ll = ll_root_used;
903 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
904 
905 	while (dev_ll != NULL) {
906 		if ((dev_ll->vdev->ready == DEVICE_RX)
907 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
908 		&dev_ll->vdev->mac_address)) {
909 			/*
910 			 * Drop the packet if the TX packet is
911 			 * destined for the TX device.
912 			 */
913 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
914 				RTE_LOG(DEBUG, VHOST_DATA,
915 				"(%"PRIu64") TX: Source and destination"
916 				" MAC addresses are the same. Dropping "
917 				"packet.\n",
918 				dev_ll->vdev->dev->device_fh);
919 				return -1;
920 			}
921 
922 			/*
923 			 * HW VLAN strip will reduce the packet length by the
924 			 * length of the VLAN tag, so we need to restore the
925 			 * packet length by adding it back.
926 			 */
927 			*offset = VLAN_HLEN;
928 			*vlan_tag =
929 			(uint16_t)
930 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
931 
932 			RTE_LOG(DEBUG, VHOST_DATA,
933 			"(%"PRIu64") TX: pkt to local VM device id:"
934 			"(%"PRIu64") vlan tag: %d.\n",
935 			dev->device_fh, dev_ll->vdev->dev->device_fh,
936 			(int)*vlan_tag);
937 
938 			break;
939 		}
940 		dev_ll = dev_ll->next;
941 	}
942 	return 0;
943 }
944 
945 static uint16_t
946 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
947 {
948 	if (ol_flags & PKT_TX_IPV4)
949 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
950 	else /* assume ethertype == ETHER_TYPE_IPv6 */
951 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
952 }
953 
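/*
 * Editor's note: for packets with PKT_TX_TCP_SEG set, the NIC expects the IP
 * checksum field to be cleared and the TCP checksum field to hold the
 * pseudo-header checksum; the driver/hardware then fills in the rest. The
 * helper below prepares the headers accordingly, using the l2_len/l3_len
 * offsets carried on the mbuf.
 */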
954 static void virtio_tx_offload(struct rte_mbuf *m)
955 {
956 	void *l3_hdr;
957 	struct ipv4_hdr *ipv4_hdr = NULL;
958 	struct tcp_hdr *tcp_hdr = NULL;
959 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
960 
961 	l3_hdr = (char *)eth_hdr + m->l2_len;
962 
963 	if (m->ol_flags & PKT_TX_IPV4) {
964 		ipv4_hdr = l3_hdr;
965 		ipv4_hdr->hdr_checksum = 0;
966 		m->ol_flags |= PKT_TX_IP_CKSUM;
967 	}
968 
969 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
970 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
971 }
972 
973 /*
974  * This function routes the TX packet to the correct interface. This may be a local device
975  * or the physical port.
976  */
977 static inline void __attribute__((always_inline))
978 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
979 {
980 	struct mbuf_table *tx_q;
981 	struct rte_mbuf **m_table;
982 	unsigned len, ret, offset = 0;
983 	const uint16_t lcore_id = rte_lcore_id();
984 	struct virtio_net *dev = vdev->dev;
985 	struct ether_hdr *nh;
986 
987 	/*check if destination is local VM*/
988 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
989 		rte_pktmbuf_free(m);
990 		return;
991 	}
992 
993 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
994 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
995 			rte_pktmbuf_free(m);
996 			return;
997 		}
998 	}
999 
1000 	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
1001 		"MAC address is external\n", dev->device_fh);
1002 
1003 	/*Add packet to the port tx queue*/
1004 	tx_q = &lcore_tx_queue[lcore_id];
1005 	len = tx_q->len;
1006 
1007 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1008 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1009 		/* Guest has inserted the vlan tag. */
1010 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1011 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1012 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1013 			(vh->vlan_tci != vlan_tag_be))
1014 			vh->vlan_tci = vlan_tag_be;
1015 	} else {
1016 		m->ol_flags |= PKT_TX_VLAN_PKT;
1017 
1018 		/*
1019 		 * Find the right seg to adjust the data len when offset is
1020 		 * bigger than tail room size.
1021 		 */
1022 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1023 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1024 				m->data_len += offset;
1025 			else {
1026 				struct rte_mbuf *seg = m;
1027 
1028 				while ((seg->next != NULL) &&
1029 					(offset > rte_pktmbuf_tailroom(seg)))
1030 					seg = seg->next;
1031 
1032 				seg->data_len += offset;
1033 			}
1034 			m->pkt_len += offset;
1035 		}
1036 
1037 		m->vlan_tci = vlan_tag;
1038 	}
1039 
1040 	if (m->ol_flags & PKT_TX_TCP_SEG)
1041 		virtio_tx_offload(m);
1042 
1043 	tx_q->m_table[len] = m;
1044 	len++;
1045 	if (enable_stats) {
1046 		dev_statistics[dev->device_fh].tx_total++;
1047 		dev_statistics[dev->device_fh].tx++;
1048 	}
1049 
1050 	if (unlikely(len == MAX_PKT_BURST)) {
1051 		m_table = (struct rte_mbuf **)tx_q->m_table;
1052 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1053 		/* Free any buffers not handled by TX and update the port stats. */
1054 		if (unlikely(ret < len)) {
1055 			do {
1056 				rte_pktmbuf_free(m_table[ret]);
1057 			} while (++ret < len);
1058 		}
1059 
1060 		len = 0;
1061 	}
1062 
1063 	tx_q->len = len;
1064 	return;
1065 }
1066 /*
1067  * This function is called by each data core. It handles all RX/TX for the devices
1068  * registered with the core via its lcore-specific linked list. When routing guest TX,
1069  * destination MAC addresses are compared with all devices in the main linked list.
1070  */
1071 static int
1072 switch_worker(__attribute__((unused)) void *arg)
1073 {
1074 	struct virtio_net *dev = NULL;
1075 	struct vhost_dev *vdev = NULL;
1076 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1077 	struct virtio_net_data_ll *dev_ll;
1078 	struct mbuf_table *tx_q;
1079 	volatile struct lcore_ll_info *lcore_ll;
1080 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1081 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1082 	unsigned ret, i;
1083 	const uint16_t lcore_id = rte_lcore_id();
1084 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1085 	uint16_t rx_count = 0;
1086 	uint16_t tx_count;
1087 	uint32_t retry = 0;
1088 
1089 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1090 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1091 	prev_tsc = 0;
1092 
1093 	tx_q = &lcore_tx_queue[lcore_id];
1094 	for (i = 0; i < num_cores; i ++) {
1095 		if (lcore_ids[i] == lcore_id) {
1096 			tx_q->txq_id = i;
1097 			break;
1098 		}
1099 	}
1100 
1101 	while(1) {
1102 		cur_tsc = rte_rdtsc();
1103 		/*
1104 		 * TX burst queue drain
1105 		 */
1106 		diff_tsc = cur_tsc - prev_tsc;
1107 		if (unlikely(diff_tsc > drain_tsc)) {
1108 
1109 			if (tx_q->len) {
1110 				RTE_LOG(DEBUG, VHOST_DATA,
1111 					"TX queue drained after timeout with burst size %u\n",
1112 					tx_q->len);
1113 
1114 				/*Tx any packets in the queue*/
1115 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1116 									   (struct rte_mbuf **)tx_q->m_table,
1117 									   (uint16_t)tx_q->len);
1118 				if (unlikely(ret < tx_q->len)) {
1119 					do {
1120 						rte_pktmbuf_free(tx_q->m_table[ret]);
1121 					} while (++ret < tx_q->len);
1122 				}
1123 
1124 				tx_q->len = 0;
1125 			}
1126 
1127 			prev_tsc = cur_tsc;
1128 
1129 		}
1130 
1131 		rte_prefetch0(lcore_ll->ll_root_used);
1132 		/*
1133 		 * Inform the configuration core that we have exited the linked list and that no devices are
1134 		 * in use if requested.
1135 		 */
1136 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1137 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1138 
1139 		/*
1140 		 * Process devices
1141 		 */
1142 		dev_ll = lcore_ll->ll_root_used;
1143 
1144 		while (dev_ll != NULL) {
1145 			/*get virtio device ID*/
1146 			vdev = dev_ll->vdev;
1147 			dev = vdev->dev;
1148 
1149 			if (unlikely(vdev->remove)) {
1150 				dev_ll = dev_ll->next;
1151 				unlink_vmdq(vdev);
1152 				vdev->ready = DEVICE_SAFE_REMOVE;
1153 				continue;
1154 			}
1155 			if (likely(vdev->ready == DEVICE_RX)) {
1156 				/*Handle guest RX*/
1157 				rx_count = rte_eth_rx_burst(ports[0],
1158 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1159 
1160 				if (rx_count) {
1161 					/*
1162 					* If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1163 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1164 					*/
1165 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1166 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1167 							rte_delay_us(burst_rx_delay_time);
1168 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1169 								break;
1170 						}
1171 					}
1172 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1173 					if (enable_stats) {
1174 						rte_atomic64_add(
1175 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1176 						rx_count);
1177 						rte_atomic64_add(
1178 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1179 					}
1180 					while (likely(rx_count)) {
1181 						rx_count--;
1182 						rte_pktmbuf_free(pkts_burst[rx_count]);
1183 					}
1184 
1185 				}
1186 			}
1187 
1188 			if (likely(!vdev->remove)) {
1189 				/* Handle guest TX*/
1190 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1191 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1192 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1193 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1194 						while (tx_count)
1195 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1196 					}
1197 				}
1198 				for (i = 0; i < tx_count; ++i) {
1199 					virtio_tx_route(vdev, pkts_burst[i],
1200 						vlan_tags[(uint16_t)dev->device_fh]);
1201 				}
1202 			}
1203 
1204 			/*move to the next device in the list*/
1205 			dev_ll = dev_ll->next;
1206 		}
1207 	}
1208 
1209 	return 0;
1210 }
1211 
1212 /*
1213  * Add an entry to a used linked list. A free entry must first be found
1214  * in the free linked list using get_data_ll_free_entry();
1215  */
1216 static void
1217 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
1218 	struct virtio_net_data_ll *ll_dev)
1219 {
1220 	struct virtio_net_data_ll *ll = *ll_root_addr;
1221 
1222 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
1223 	ll_dev->next = NULL;
1224 	rte_compiler_barrier();
1225 
1226 	/* If ll == NULL then this is the first device. */
1227 	if (ll) {
1228 		/* Increment to the tail of the linked list. */
1229 		while ((ll->next != NULL) )
1230 			ll = ll->next;
1231 
1232 		ll->next = ll_dev;
1233 	} else {
1234 		*ll_root_addr = ll_dev;
1235 	}
1236 }
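/*
 * Editor's note: the data-path cores walk these lists without locks, so the
 * compiler barrier above ensures the new entry's next pointer is visibly
 * NULL before the entry itself is linked onto the tail of the list.
 */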
1237 
1238 /*
1239  * Remove an entry from a used linked list. The entry must then be added to
1240  * the free linked list using put_data_ll_free_entry().
1241  */
1242 static void
1243 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
1244 	struct virtio_net_data_ll *ll_dev,
1245 	struct virtio_net_data_ll *ll_dev_last)
1246 {
1247 	struct virtio_net_data_ll *ll = *ll_root_addr;
1248 
1249 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
1250 		return;
1251 
1252 	if (ll_dev == ll)
1253 		*ll_root_addr = ll_dev->next;
1254 	else
1255 		if (likely(ll_dev_last != NULL))
1256 			ll_dev_last->next = ll_dev->next;
1257 		else
1258 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
1259 }
1260 
1261 /*
1262  * Find and return an entry from the free linked list.
1263  */
1264 static struct virtio_net_data_ll *
1265 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
1266 {
1267 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
1268 	struct virtio_net_data_ll *ll_dev;
1269 
1270 	if (ll_free == NULL)
1271 		return NULL;
1272 
1273 	ll_dev = ll_free;
1274 	*ll_root_addr = ll_free->next;
1275 
1276 	return ll_dev;
1277 }
1278 
1279 /*
1280  * Place an entry back on to the free linked list.
1281  */
1282 static void
1283 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
1284 	struct virtio_net_data_ll *ll_dev)
1285 {
1286 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
1287 
1288 	if (ll_dev == NULL)
1289 		return;
1290 
1291 	ll_dev->next = ll_free;
1292 	*ll_root_addr = ll_dev;
1293 }
1294 
1295 /*
1296  * Creates a linked list of a given size.
1297  */
1298 static struct virtio_net_data_ll *
1299 alloc_data_ll(uint32_t size)
1300 {
1301 	struct virtio_net_data_ll *ll_new;
1302 	uint32_t i;
1303 
1304 	/* Malloc and then chain the linked list. */
1305 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
1306 	if (ll_new == NULL) {
1307 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
1308 		return NULL;
1309 	}
1310 
1311 	for (i = 0; i < size - 1; i++) {
1312 		ll_new[i].vdev = NULL;
1313 		ll_new[i].next = &ll_new[i+1];
1314 	}
1315 	ll_new[i].next = NULL;
1316 
1317 	return ll_new;
1318 }
1319 
1320 /*
1321  * Create the main linked list along with each individual core's linked list. A used and a free list
1322  * are created to manage entries.
1323  */
1324 static int
1325 init_data_ll (void)
1326 {
1327 	int lcore;
1328 
1329 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1330 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
1331 		if (lcore_info[lcore].lcore_ll == NULL) {
1332 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
1333 			return -1;
1334 		}
1335 
1336 		lcore_info[lcore].lcore_ll->device_num = 0;
1337 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1338 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
1339 		if (num_devices % num_switching_cores)
1340 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
1341 		else
1342 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
1343 	}
1344 
1345 	/* Allocate devices up to a maximum of MAX_DEVICES. */
1346 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
1347 
1348 	return 0;
1349 }
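/*
 * Editor's note (illustrative): the per-lcore free lists are sized by
 * ceiling division, e.g. 64 devices spread over 3 switching cores gives
 * 64 / 3 + 1 = 22 entries per core, so the per-core lists together can
 * hold at least num_devices entries.
 */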
1350 
1351 /*
1352  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
1353  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1354  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1355  */
1356 static void
1357 destroy_device (volatile struct virtio_net *dev)
1358 {
1359 	struct virtio_net_data_ll *ll_lcore_dev_cur;
1360 	struct virtio_net_data_ll *ll_main_dev_cur;
1361 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
1362 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
1363 	struct vhost_dev *vdev;
1364 	int lcore;
1365 
1366 	dev->flags &= ~VIRTIO_DEV_RUNNING;
1367 
1368 	vdev = (struct vhost_dev *)dev->priv;
1369 	/*set the remove flag. */
1370 	vdev->remove = 1;
1371 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1372 		rte_pause();
1373 	}
1374 
1375 	/* Search for entry to be removed from lcore ll */
1376 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
1377 	while (ll_lcore_dev_cur != NULL) {
1378 		if (ll_lcore_dev_cur->vdev == vdev) {
1379 			break;
1380 		} else {
1381 			ll_lcore_dev_last = ll_lcore_dev_cur;
1382 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
1383 		}
1384 	}
1385 
1386 	if (ll_lcore_dev_cur == NULL) {
1387 		RTE_LOG(ERR, VHOST_CONFIG,
1388 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
1389 			dev->device_fh);
1390 		return;
1391 	}
1392 
1393 	/* Search for entry to be removed from main ll */
1394 	ll_main_dev_cur = ll_root_used;
1395 	ll_main_dev_last = NULL;
1396 	while (ll_main_dev_cur != NULL) {
1397 		if (ll_main_dev_cur->vdev == vdev) {
1398 			break;
1399 		} else {
1400 			ll_main_dev_last = ll_main_dev_cur;
1401 			ll_main_dev_cur = ll_main_dev_cur->next;
1402 		}
1403 	}
1404 
1405 	/* Remove entries from the lcore and main ll. */
1406 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
1407 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
1408 
1409 	/* Set the dev_removal_flag on each lcore. */
1410 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1411 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
1412 	}
1413 
1414 	/*
1415 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
1416 	 * they can no longer access the device removed from the linked lists and that the devices
1417 	 * are no longer in use.
1418 	 */
1419 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1420 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
1421 			rte_pause();
1422 		}
1423 	}
1424 
1425 	/* Add the entries back to the lcore and main free ll.*/
1426 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
1427 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
1428 
1429 	/* Decrement number of device on the lcore. */
1430 	lcore_info[vdev->coreid].lcore_ll->device_num--;
1431 
1432 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
1433 
1434 	rte_free(vdev);
1435 
1436 }
1437 
1438 /*
1439  * A new device is added to a data core. First the device is added to the main linked list
1440  * and then allocated to a specific data core.
1441  */
1442 static int
1443 new_device (struct virtio_net *dev)
1444 {
1445 	struct virtio_net_data_ll *ll_dev;
1446 	int lcore, core_add = 0;
1447 	uint32_t device_num_min = num_devices;
1448 	struct vhost_dev *vdev;
1449 
1450 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1451 	if (vdev == NULL) {
1452 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
1453 			dev->device_fh);
1454 		return -1;
1455 	}
1456 	vdev->dev = dev;
1457 	dev->priv = vdev;
1458 
1459 	/* Add device to main ll */
1460 	ll_dev = get_data_ll_free_entry(&ll_root_free);
1461 	if (ll_dev == NULL) {
1462 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
1463 			"of %d devices per core has been reached\n",
1464 			dev->device_fh, num_devices);
1465 		rte_free(vdev);
1466 		return -1;
1467 	}
1468 	ll_dev->vdev = vdev;
1469 	add_data_ll_entry(&ll_root_used, ll_dev);
1470 	vdev->vmdq_rx_q
1471 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
1472 
1473 	/*reset ready flag*/
1474 	vdev->ready = DEVICE_MAC_LEARNING;
1475 	vdev->remove = 0;
1476 
1477 	/* Find a suitable lcore to add the device. */
1478 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1479 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
1480 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
1481 			core_add = lcore;
1482 		}
1483 	}
1484 	/* Add device to lcore ll */
1485 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
1486 	if (ll_dev == NULL) {
1487 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
1488 		vdev->ready = DEVICE_SAFE_REMOVE;
1489 		destroy_device(dev);
1490 		rte_free(vdev);
1491 		return -1;
1492 	}
1493 	ll_dev->vdev = vdev;
1494 	vdev->coreid = core_add;
1495 
1496 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
1497 
1498 	/* Initialize device stats */
1499 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
1500 
1501 	/* Disable notifications. */
1502 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
1503 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
1504 	lcore_info[vdev->coreid].lcore_ll->device_num++;
1505 	dev->flags |= VIRTIO_DEV_RUNNING;
1506 
1507 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
1508 
1509 	return 0;
1510 }
1511 
1512 /*
1513  * These callbacks allow devices to be added to the data core when configuration
1514  * has been fully completed.
1515  */
1516 static const struct virtio_net_device_ops virtio_net_device_ops =
1517 {
1518 	.new_device =  new_device,
1519 	.destroy_device = destroy_device,
1520 };
1521 
1522 /*
1523  * This is a thread that wakes up periodically to print stats if the user has
1524  * enabled them.
1525  */
1526 static void
1527 print_stats(void)
1528 {
1529 	struct virtio_net_data_ll *dev_ll;
1530 	uint64_t tx_dropped, rx_dropped;
1531 	uint64_t tx, tx_total, rx, rx_total;
1532 	uint32_t device_fh;
1533 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1534 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1535 
1536 	while(1) {
1537 		sleep(enable_stats);
1538 
1539 		/* Clear screen and move to top left */
1540 		printf("%s%s", clr, top_left);
1541 
1542 		printf("\nDevice statistics ====================================");
1543 
1544 		dev_ll = ll_root_used;
1545 		while (dev_ll != NULL) {
1546 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
1547 			tx_total = dev_statistics[device_fh].tx_total;
1548 			tx = dev_statistics[device_fh].tx;
1549 			tx_dropped = tx_total - tx;
1550 			rx_total = rte_atomic64_read(
1551 				&dev_statistics[device_fh].rx_total_atomic);
1552 			rx = rte_atomic64_read(
1553 				&dev_statistics[device_fh].rx_atomic);
1554 			rx_dropped = rx_total - rx;
1555 
1556 			printf("\nStatistics for device %"PRIu32" ------------------------------"
1557 					"\nTX total: 		%"PRIu64""
1558 					"\nTX dropped: 		%"PRIu64""
1559 					"\nTX successful: 		%"PRIu64""
1560 					"\nRX total: 		%"PRIu64""
1561 					"\nRX dropped: 		%"PRIu64""
1562 					"\nRX successful: 		%"PRIu64"",
1563 					device_fh,
1564 					tx_total,
1565 					tx_dropped,
1566 					tx,
1567 					rx_total,
1568 					rx_dropped,
1569 					rx);
1570 
1571 			dev_ll = dev_ll->next;
1572 		}
1573 		printf("\n======================================================\n");
1574 	}
1575 }
1576 
1577 /* When we receive a SIGINT signal, unregister the vhost driver */
1578 static void
1579 sigint_handler(__rte_unused int signum)
1580 {
1581 	/* Unregister vhost driver. */
1582 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
1583 	if (ret != 0)
1584 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
1585 	exit(0);
1586 }
1587 
1588 /*
1589  * Main function, does initialisation and calls the per-lcore functions. The CUSE
1590  * device is also registered here to handle the IOCTLs.
1591  */
1592 int
1593 main(int argc, char *argv[])
1594 {
1595 	unsigned lcore_id, core_id = 0;
1596 	unsigned nb_ports, valid_num_ports;
1597 	int ret;
1598 	uint8_t portid;
1599 	static pthread_t tid;
1600 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1601 
1602 	signal(SIGINT, sigint_handler);
1603 
1604 	/* init EAL */
1605 	ret = rte_eal_init(argc, argv);
1606 	if (ret < 0)
1607 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1608 	argc -= ret;
1609 	argv += ret;
1610 
1611 	/* parse app arguments */
1612 	ret = us_vhost_parse_args(argc, argv);
1613 	if (ret < 0)
1614 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1615 
1616 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
1617 		if (rte_lcore_is_enabled(lcore_id))
1618 			lcore_ids[core_id ++] = lcore_id;
1619 
1620 	if (rte_lcore_count() > RTE_MAX_LCORE)
1621 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1622 
1623 	/* set the number of switching cores available */
1624 	num_switching_cores = rte_lcore_count()-1;
1625 
1626 	/* Get the number of physical ports. */
1627 	nb_ports = rte_eth_dev_count();
1628 	if (nb_ports > RTE_MAX_ETHPORTS)
1629 		nb_ports = RTE_MAX_ETHPORTS;
1630 
1631 	/*
1632 	 * Update the global variable num_ports and the global ports array,
1633 	 * and get the value of valid_num_ports according to the number of system ports.
1634 	 */
1635 	valid_num_ports = check_ports_num(nb_ports);
1636 
1637 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1638 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1639 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1640 		return -1;
1641 	}
1642 
1643 	/* Create the mbuf pool. */
1644 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
1645 		NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
1646 		0, MBUF_DATA_SIZE, rte_socket_id());
1647 	if (mbuf_pool == NULL)
1648 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1649 
1650 	if (vm2vm_mode == VM2VM_HARDWARE) {
1651 		/* Enable VT loop back to let L2 switch to do it. */
1652 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1653 		RTE_LOG(DEBUG, VHOST_CONFIG,
1654 			"Enable loop back for L2 switch in vmdq.\n");
1655 	}
1656 
1657 	/* initialize all ports */
1658 	for (portid = 0; portid < nb_ports; portid++) {
1659 		/* skip ports that are not enabled */
1660 		if ((enabled_port_mask & (1 << portid)) == 0) {
1661 			RTE_LOG(INFO, VHOST_PORT,
1662 				"Skipping disabled port %d\n", portid);
1663 			continue;
1664 		}
1665 		if (port_init(portid) != 0)
1666 			rte_exit(EXIT_FAILURE,
1667 				"Cannot initialize network ports\n");
1668 	}
1669 
1670 	/* Initialise all linked lists. */
1671 	if (init_data_ll() == -1)
1672 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
1673 
1674 	/* Initialize device stats */
1675 	memset(&dev_statistics, 0, sizeof(dev_statistics));
1676 
1677 	/* Enable stats if the user option is set. */
1678 	if (enable_stats) {
1679 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1680 		if (ret != 0)
1681 			rte_exit(EXIT_FAILURE,
1682 				"Cannot create print-stats thread\n");
1683 
1684 		/* Set thread_name for aid in debugging.  */
1685 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1686 		ret = rte_thread_setname(tid, thread_name);
1687 		if (ret != 0)
1688 			RTE_LOG(ERR, VHOST_CONFIG,
1689 				"Cannot set print-stats name\n");
1690 	}
1691 
1692 	/* Launch all data cores. */
1693 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1694 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1695 
1696 	if (mergeable == 0)
1697 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
1698 
1699 	/* Register vhost(cuse or user) driver to handle vhost messages. */
1700 	ret = rte_vhost_driver_register((char *)&dev_basename);
1701 	if (ret != 0)
1702 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
1703 
1704 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
1705 
1706 	/* Start CUSE session. */
1707 	rte_vhost_driver_session_start();
1708 	return 0;
1709 
1710 }
1711