xref: /dpdk/examples/vhost/main.c (revision b79e4c00af0e7cfb8601ab0208659d226b82bd10)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_vhost.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 #include <rte_pause.h>
56 
57 #include "main.h"
58 
59 #ifndef MAX_QUEUES
60 #define MAX_QUEUES 128
61 #endif
62 
63 /* the maximum number of external ports supported */
64 #define MAX_SUP_PORTS 1
65 
66 #define MBUF_CACHE_SIZE	128
67 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
68 
69 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
70 
71 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
72 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
73 
74 #define JUMBO_FRAME_MAX_SIZE    0x2600
75 
76 /* State of virtio device. */
77 #define DEVICE_MAC_LEARNING 0
78 #define DEVICE_RX			1
79 #define DEVICE_SAFE_REMOVE	2
80 
81 /* Configurable number of RX/TX ring descriptors */
82 #define RTE_TEST_RX_DESC_DEFAULT 1024
83 #define RTE_TEST_TX_DESC_DEFAULT 512
84 
85 #define INVALID_PORT_ID 0xFF
86 
87 /* Max number of devices. Limited by vmdq. */
88 #define MAX_DEVICES 64
89 
90 /* Size of buffers used for snprintfs. */
91 #define MAX_PRINT_BUFF 6072
92 
93 /* Maximum long option length for option parsing. */
94 #define MAX_LONG_OPT_SZ 64
95 
96 /* mask of enabled ports */
97 static uint32_t enabled_port_mask = 0;
98 
99 /* Promiscuous mode */
100 static uint32_t promiscuous;
101 
102 /* Number of devices/queues to support. */
103 static uint32_t num_queues = 0;
104 static uint32_t num_devices;
105 
106 static struct rte_mempool *mbuf_pool;
107 static int mergeable;
108 
109 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
110 typedef enum {
111 	VM2VM_DISABLED = 0,
112 	VM2VM_SOFTWARE = 1,
113 	VM2VM_HARDWARE = 2,
114 	VM2VM_LAST
115 } vm2vm_type;
116 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
117 
118 /* Enable stats. */
119 static uint32_t enable_stats = 0;
120 /* Enable retries on RX. */
121 static uint32_t enable_retry = 1;
122 
123 /* Disable TX checksum offload */
124 static uint32_t enable_tx_csum;
125 
126 /* Disable TSO offload */
127 static uint32_t enable_tso;
128 
129 static int client_mode;
130 static int dequeue_zero_copy;
131 
132 static int builtin_net_driver;
133 
134 /* Specify timeout (in microseconds) between retries on RX. */
135 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
136 /* Specify the number of retries on RX. */
137 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
138 
139 /* Socket file paths. Can be set by user */
140 static char *socket_files;
141 static int nb_sockets;
142 
143 /* Empty VMDQ configuration structure. Filled in programmatically. */
144 static struct rte_eth_conf vmdq_conf_default = {
145 	.rxmode = {
146 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
147 		.split_hdr_size = 0,
148 		.header_split   = 0, /**< Header Split disabled */
149 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
150 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
151 		/*
152 		 * This is necessary for 1G NICs such as the I350;
153 		 * it fixes a bug where IPv4 forwarding in the guest
154 		 * cannot forward packets from one virtio dev to another.
155 		 */
156 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
157 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
158 		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
159 	},
160 
161 	.txmode = {
162 		.mq_mode = ETH_MQ_TX_NONE,
163 	},
164 	.rx_adv_conf = {
165 		/*
166 		 * should be overridden separately in code with
167 		 * appropriate values
168 		 */
169 		.vmdq_rx_conf = {
170 			.nb_queue_pools = ETH_8_POOLS,
171 			.enable_default_pool = 0,
172 			.default_pool = 0,
173 			.nb_pool_maps = 0,
174 			.pool_map = {{0, 0},},
175 		},
176 	},
177 };
178 
179 static unsigned lcore_ids[RTE_MAX_LCORE];
180 static uint8_t ports[RTE_MAX_ETHPORTS];
181 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
182 static uint16_t num_pf_queues, num_vmdq_queues;
183 static uint16_t vmdq_pool_base, vmdq_queue_base;
184 static uint16_t queues_per_pool;
185 
186 const uint16_t vlan_tags[] = {
187 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
188 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
189 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
190 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
191 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
192 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
193 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
194 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
195 };
196 
197 /* ethernet addresses of ports */
198 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
199 
200 static struct vhost_dev_tailq_list vhost_dev_list =
201 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
202 
203 static struct lcore_info lcore_info[RTE_MAX_LCORE];
204 
205 /* Used for queueing bursts of TX packets. */
206 struct mbuf_table {
207 	unsigned len;
208 	unsigned txq_id;
209 	struct rte_mbuf *m_table[MAX_PKT_BURST];
210 };
211 
212 /* TX queue for each data core. */
213 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
214 
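/*
 * MBUF_TABLE_DRAIN_TSC converts BURST_TX_DRAIN_US into TSC cycles.
 * Illustrative arithmetic (assuming a 2 GHz TSC, not measured here):
 * (2,000,000,000 + 999,999) / 1,000,000 * 100 is roughly 200,000 cycles
 * per 100 us drain interval.
 */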
215 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
216 				 / US_PER_S * BURST_TX_DRAIN_US)
217 #define VLAN_HLEN       4
218 
219 /*
220  * Builds up the correct configuration for VMDQ VLAN pool map
221  * according to the pool & queue limits.
222  */
223 static inline int
224 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
225 {
226 	struct rte_eth_vmdq_rx_conf conf;
227 	struct rte_eth_vmdq_rx_conf *def_conf =
228 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
229 	unsigned i;
230 
231 	memset(&conf, 0, sizeof(conf));
232 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
233 	conf.nb_pool_maps = num_devices;
234 	conf.enable_loop_back = def_conf->enable_loop_back;
235 	conf.rx_mode = def_conf->rx_mode;
236 
237 	for (i = 0; i < conf.nb_pool_maps; i++) {
238 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
239 		conf.pool_map[i].pools = (1UL << i);
240 	}
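	/*
	 * Illustrative result (assuming num_devices == 8): pool 0 gets VLAN
	 * 1000 with pool mask 0x01, pool 1 gets VLAN 1001 with mask 0x02,
	 * ..., pool 7 gets VLAN 1007 with mask 0x80.
	 */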
241 
242 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
243 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
244 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
245 	return 0;
246 }
247 
248 /*
249  * Validate the device number against the max pool number obtained from
250  * dev_info. If the device number is invalid, print an error message and
251  * return -1. Each device must have its own pool.
252  */
253 static inline int
254 validate_num_devices(uint32_t max_nb_devices)
255 {
256 	if (num_devices > max_nb_devices) {
257 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
258 		return -1;
259 	}
260 	return 0;
261 }
262 
263 /*
264  * Initialises a given port using global settings and with the rx buffers
265  * coming from the mbuf_pool passed as a parameter.
266  */
267 static inline int
268 port_init(uint8_t port)
269 {
270 	struct rte_eth_dev_info dev_info;
271 	struct rte_eth_conf port_conf;
272 	struct rte_eth_rxconf *rxconf;
273 	struct rte_eth_txconf *txconf;
274 	int16_t rx_rings, tx_rings;
275 	uint16_t rx_ring_size, tx_ring_size;
276 	int retval;
277 	uint16_t q;
278 
279 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
280 	rte_eth_dev_info_get (port, &dev_info);
281 
282 	if (dev_info.max_rx_queues > MAX_QUEUES) {
283 		rte_exit(EXIT_FAILURE,
284 			"please define MAX_QUEUES no less than %u in %s\n",
285 			dev_info.max_rx_queues, __FILE__);
286 	}
287 
288 	rxconf = &dev_info.default_rxconf;
289 	txconf = &dev_info.default_txconf;
290 	rxconf->rx_drop_en = 1;
291 
292 	/* Enable vlan offload */
293 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
294 
295 	/* Configure the number of supported virtio devices based on VMDQ limits. */
296 	num_devices = dev_info.max_vmdq_pools;
297 
298 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
299 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
300 
301 	/*
302 	 * When dequeue zero copy is enabled, guest Tx used vring will be
303 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
304 	 * (tx_ring_size here) must be small enough so that the driver will
305 	 * hit the free threshold easily and free mbufs timely. Otherwise,
306 	 * guest Tx vring would be starved.
307 	 */
308 	if (dequeue_zero_copy)
309 		tx_ring_size = 64;
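	/*
	 * Note (assumption about typical PMD defaults, not guaranteed by this
	 * code): with a common default tx_free_thresh of 32, a 64-entry TX ring
	 * makes the PMD free transmitted mbufs roughly every 32 packets, which
	 * lets the zero-copy path mark guest Tx descriptors as used promptly.
	 */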
310 
311 	tx_rings = (uint16_t)rte_lcore_count();
312 
313 	retval = validate_num_devices(MAX_DEVICES);
314 	if (retval < 0)
315 		return retval;
316 
317 	/* Get port configuration. */
318 	retval = get_eth_conf(&port_conf, num_devices);
319 	if (retval < 0)
320 		return retval;
321 	/* NIC queues are divided into pf queues and vmdq queues.  */
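	/*
	 * Illustrative example (assumed device capabilities, not from this
	 * code): a NIC reporting max_rx_queues = 128, vmdq_queue_num = 128 and
	 * max_vmdq_pools = 64 yields 0 PF queues and 2 queues per pool, i.e.
	 * 64 * 2 = 128 VMDQ queues and 128 queues in total.
	 */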
322 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
323 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
324 	num_vmdq_queues = num_devices * queues_per_pool;
325 	num_queues = num_pf_queues + num_vmdq_queues;
326 	vmdq_queue_base = dev_info.vmdq_queue_base;
327 	vmdq_pool_base  = dev_info.vmdq_pool_base;
328 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
329 		num_pf_queues, num_devices, queues_per_pool);
330 
331 	if (port >= rte_eth_dev_count()) return -1;
332 
333 	rx_rings = (uint16_t)dev_info.max_rx_queues;
334 	/* Configure ethernet device. */
335 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
336 	if (retval != 0) {
337 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
338 			port, strerror(-retval));
339 		return retval;
340 	}
341 
342 	/* Setup the queues. */
343 	for (q = 0; q < rx_rings; q ++) {
344 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
345 						rte_eth_dev_socket_id(port),
346 						rxconf,
347 						mbuf_pool);
348 		if (retval < 0) {
349 			RTE_LOG(ERR, VHOST_PORT,
350 				"Failed to setup rx queue %u of port %u: %s.\n",
351 				q, port, strerror(-retval));
352 			return retval;
353 		}
354 	}
355 	for (q = 0; q < tx_rings; q ++) {
356 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
357 						rte_eth_dev_socket_id(port),
358 						txconf);
359 		if (retval < 0) {
360 			RTE_LOG(ERR, VHOST_PORT,
361 				"Failed to setup tx queue %u of port %u: %s.\n",
362 				q, port, strerror(-retval));
363 			return retval;
364 		}
365 	}
366 
367 	/* Start the device. */
368 	retval  = rte_eth_dev_start(port);
369 	if (retval < 0) {
370 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
371 			port, strerror(-retval));
372 		return retval;
373 	}
374 
375 	if (promiscuous)
376 		rte_eth_promiscuous_enable(port);
377 
378 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
379 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
380 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
381 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
382 			(unsigned)port,
383 			vmdq_ports_eth_addr[port].addr_bytes[0],
384 			vmdq_ports_eth_addr[port].addr_bytes[1],
385 			vmdq_ports_eth_addr[port].addr_bytes[2],
386 			vmdq_ports_eth_addr[port].addr_bytes[3],
387 			vmdq_ports_eth_addr[port].addr_bytes[4],
388 			vmdq_ports_eth_addr[port].addr_bytes[5]);
389 
390 	return 0;
391 }
392 
393 /*
394  * Set socket file path.
395  */
396 static int
397 us_vhost_parse_socket_path(const char *q_arg)
398 {
399 	/* Reject paths that do not fit in a PATH_MAX buffer. */
400 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
401 		return -1;
402 
403 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
404 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
405 	nb_sockets++;
406 
407 	return 0;
408 }
409 
410 /*
411  * Parse the portmask provided at run time.
412  */
413 static int
414 parse_portmask(const char *portmask)
415 {
416 	char *end = NULL;
417 	unsigned long pm;
418 
419 	errno = 0;
420 
421 	/* parse hexadecimal string */
422 	pm = strtoul(portmask, &end, 16);
423 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
424 		return -1;
425 
426 	if (pm == 0)
427 		return -1;
428 
429 	return pm;
430 
431 }
432 
433 /*
434  * Parse num options at run time.
435  */
436 static int
437 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
438 {
439 	char *end = NULL;
440 	unsigned long num;
441 
442 	errno = 0;
443 
444 	/* parse unsigned int string */
445 	num = strtoul(q_arg, &end, 10);
446 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
447 		return -1;
448 
449 	if (num > max_valid_value)
450 		return -1;
451 
452 	return num;
453 
454 }
455 
456 /*
457  * Display usage
458  */
459 static void
460 us_vhost_usage(const char *prgname)
461 {
462 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
463 	"		--vm2vm [0|1|2]\n"
464 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
465 	"		--socket-file <path>\n"
466 	"		--nb-devices ND\n"
467 	"		-p PORTMASK: Set mask for ports to be used by application\n"
468 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
469 	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Retries are used when the destination queue is full\n"
470 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if RX retries are enabled\n"
471 	"		--rx-retry-num [0-N]: the number of retries on RX. Takes effect only if RX retries are enabled\n"
472 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
473 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
474 	"		--socket-file: The path of the socket file.\n"
475 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
476 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
477 	"		--client register a vhost-user socket as client mode.\n"
478 	"		--dequeue-zero-copy enables dequeue zero copy\n",
479 	       prgname);
480 }
481 
482 /*
483  * Parse the arguments given in the command line of the application.
484  */
485 static int
486 us_vhost_parse_args(int argc, char **argv)
487 {
488 	int opt, ret;
489 	int option_index;
490 	unsigned i;
491 	const char *prgname = argv[0];
492 	static struct option long_option[] = {
493 		{"vm2vm", required_argument, NULL, 0},
494 		{"rx-retry", required_argument, NULL, 0},
495 		{"rx-retry-delay", required_argument, NULL, 0},
496 		{"rx-retry-num", required_argument, NULL, 0},
497 		{"mergeable", required_argument, NULL, 0},
498 		{"stats", required_argument, NULL, 0},
499 		{"socket-file", required_argument, NULL, 0},
500 		{"tx-csum", required_argument, NULL, 0},
501 		{"tso", required_argument, NULL, 0},
502 		{"client", no_argument, &client_mode, 1},
503 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
504 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
505 		{NULL, 0, 0, 0},
506 	};
507 
508 	/* Parse command line */
509 	while ((opt = getopt_long(argc, argv, "p:P",
510 			long_option, &option_index)) != EOF) {
511 		switch (opt) {
512 		/* Portmask */
513 		case 'p':
514 			enabled_port_mask = parse_portmask(optarg);
515 			if (enabled_port_mask == 0) {
516 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
517 				us_vhost_usage(prgname);
518 				return -1;
519 			}
520 			break;
521 
522 		case 'P':
523 			promiscuous = 1;
524 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
525 				ETH_VMDQ_ACCEPT_BROADCAST |
526 				ETH_VMDQ_ACCEPT_MULTICAST;
527 
528 			break;
529 
530 		case 0:
531 			/* Enable/disable vm2vm comms. */
532 			if (!strncmp(long_option[option_index].name, "vm2vm",
533 				MAX_LONG_OPT_SZ)) {
534 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
535 				if (ret == -1) {
536 					RTE_LOG(INFO, VHOST_CONFIG,
537 						"Invalid argument for "
538 						"vm2vm [0|1|2]\n");
539 					us_vhost_usage(prgname);
540 					return -1;
541 				} else {
542 					vm2vm_mode = (vm2vm_type)ret;
543 				}
544 			}
545 
546 			/* Enable/disable retries on RX. */
547 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
548 				ret = parse_num_opt(optarg, 1);
549 				if (ret == -1) {
550 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
551 					us_vhost_usage(prgname);
552 					return -1;
553 				} else {
554 					enable_retry = ret;
555 				}
556 			}
557 
558 			/* Enable/disable TX checksum offload. */
559 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
560 				ret = parse_num_opt(optarg, 1);
561 				if (ret == -1) {
562 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
563 					us_vhost_usage(prgname);
564 					return -1;
565 				} else
566 					enable_tx_csum = ret;
567 			}
568 
569 			/* Enable/disable TSO offload. */
570 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
571 				ret = parse_num_opt(optarg, 1);
572 				if (ret == -1) {
573 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
574 					us_vhost_usage(prgname);
575 					return -1;
576 				} else
577 					enable_tso = ret;
578 			}
579 
580 			/* Specify the retry delay time (in microseconds) on RX. */
581 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
582 				ret = parse_num_opt(optarg, INT32_MAX);
583 				if (ret == -1) {
584 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
585 					us_vhost_usage(prgname);
586 					return -1;
587 				} else {
588 					burst_rx_delay_time = ret;
589 				}
590 			}
591 
592 			/* Specify the number of retries on RX. */
593 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
594 				ret = parse_num_opt(optarg, INT32_MAX);
595 				if (ret == -1) {
596 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
597 					us_vhost_usage(prgname);
598 					return -1;
599 				} else {
600 					burst_rx_retry_num = ret;
601 				}
602 			}
603 
604 			/* Enable/disable RX mergeable buffers. */
605 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
606 				ret = parse_num_opt(optarg, 1);
607 				if (ret == -1) {
608 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
609 					us_vhost_usage(prgname);
610 					return -1;
611 				} else {
612 					mergeable = !!ret;
613 					if (ret) {
614 						vmdq_conf_default.rxmode.jumbo_frame = 1;
615 						vmdq_conf_default.rxmode.max_rx_pkt_len
616 							= JUMBO_FRAME_MAX_SIZE;
617 					}
618 				}
619 			}
620 
621 			/* Enable/disable stats. */
622 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
623 				ret = parse_num_opt(optarg, INT32_MAX);
624 				if (ret == -1) {
625 					RTE_LOG(INFO, VHOST_CONFIG,
626 						"Invalid argument for stats [0..N]\n");
627 					us_vhost_usage(prgname);
628 					return -1;
629 				} else {
630 					enable_stats = ret;
631 				}
632 			}
633 
634 			/* Set socket file path. */
635 			if (!strncmp(long_option[option_index].name,
636 						"socket-file", MAX_LONG_OPT_SZ)) {
637 				if (us_vhost_parse_socket_path(optarg) == -1) {
638 					RTE_LOG(INFO, VHOST_CONFIG,
639 					"Invalid argument for socket name (Max %d characters)\n",
640 					PATH_MAX);
641 					us_vhost_usage(prgname);
642 					return -1;
643 				}
644 			}
645 
646 			break;
647 
648 			/* Invalid option - print options. */
649 		default:
650 			us_vhost_usage(prgname);
651 			return -1;
652 		}
653 	}
654 
655 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
656 		if (enabled_port_mask & (1 << i))
657 			ports[num_ports++] = (uint8_t)i;
658 	}
659 
660 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
661 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
662 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
663 		return -1;
664 	}
665 
666 	return 0;
667 }
668 
669 /*
670  * Update the global variable NUM_PORTS and the array PORTS according to the
671  * number of system ports, and return the number of valid ports.
672  */
673 static unsigned check_ports_num(unsigned nb_ports)
674 {
675 	unsigned valid_num_ports = num_ports;
676 	unsigned portid;
677 
678 	if (num_ports > nb_ports) {
679 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
680 			num_ports, nb_ports);
681 		num_ports = nb_ports;
682 	}
683 
684 	for (portid = 0; portid < num_ports; portid ++) {
685 		if (ports[portid] >= nb_ports) {
686 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
687 				ports[portid], (nb_ports - 1));
688 			ports[portid] = INVALID_PORT_ID;
689 			valid_num_ports--;
690 		}
691 	}
692 	return valid_num_ports;
693 }
694 
695 static __rte_always_inline struct vhost_dev *
696 find_vhost_dev(struct ether_addr *mac)
697 {
698 	struct vhost_dev *vdev;
699 
700 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
701 		if (vdev->ready == DEVICE_RX &&
702 		    is_same_ether_addr(mac, &vdev->mac_address))
703 			return vdev;
704 	}
705 
706 	return NULL;
707 }
708 
709 /*
710  * This function learns the MAC address of the device and registers it along with a
711  * VLAN tag in a VMDQ pool.
712  */
713 static int
714 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
715 {
716 	struct ether_hdr *pkt_hdr;
717 	int i, ret;
718 
719 	/* Learn MAC address of guest device from packet */
720 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
721 
722 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
723 		RTE_LOG(ERR, VHOST_DATA,
724 			"(%d) device is using a registered MAC!\n",
725 			vdev->vid);
726 		return -1;
727 	}
728 
729 	for (i = 0; i < ETHER_ADDR_LEN; i++)
730 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
731 
732 	/* vlan_tag currently uses the device_id. */
733 	vdev->vlan_tag = vlan_tags[vdev->vid];
734 
735 	/* Print out VMDQ registration info. */
736 	RTE_LOG(INFO, VHOST_DATA,
737 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
738 		vdev->vid,
739 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
740 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
741 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
742 		vdev->vlan_tag);
743 
744 	/* Register the MAC address. */
745 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
746 				(uint32_t)vdev->vid + vmdq_pool_base);
747 	if (ret)
748 		RTE_LOG(ERR, VHOST_DATA,
749 			"(%d) failed to add device MAC address to VMDQ\n",
750 			vdev->vid);
751 
752 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
753 
754 	/* Set device as ready for RX. */
755 	vdev->ready = DEVICE_RX;
756 
757 	return 0;
758 }
759 
760 /*
761  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
762  * queue before disabling RX on the device.
763  */
764 static inline void
765 unlink_vmdq(struct vhost_dev *vdev)
766 {
767 	unsigned i = 0;
768 	unsigned rx_count;
769 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
770 
771 	if (vdev->ready == DEVICE_RX) {
772 		/*clear MAC and VLAN settings*/
773 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
774 		for (i = 0; i < 6; i++)
775 			vdev->mac_address.addr_bytes[i] = 0;
776 
777 		vdev->vlan_tag = 0;
778 
779 		/*Clear out the receive buffers*/
780 		rx_count = rte_eth_rx_burst(ports[0],
781 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
782 
783 		while (rx_count) {
784 			for (i = 0; i < rx_count; i++)
785 				rte_pktmbuf_free(pkts_burst[i]);
786 
787 			rx_count = rte_eth_rx_burst(ports[0],
788 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
789 		}
790 
791 		vdev->ready = DEVICE_MAC_LEARNING;
792 	}
793 }
794 
795 static __rte_always_inline void
796 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
797 	    struct rte_mbuf *m)
798 {
799 	uint16_t ret;
800 
801 	if (builtin_net_driver) {
802 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
803 	} else {
804 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
805 	}
806 
807 	if (enable_stats) {
808 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
809 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
810 		src_vdev->stats.tx_total++;
811 		src_vdev->stats.tx += ret;
812 	}
813 }
814 
815 /*
816  * Check if the packet destination MAC address is for a local device. If so, put
817  * the packet on that device's RX queue. If not, return.
818  */
819 static __rte_always_inline int
820 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
821 {
822 	struct ether_hdr *pkt_hdr;
823 	struct vhost_dev *dst_vdev;
824 
825 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
826 
827 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
828 	if (!dst_vdev)
829 		return -1;
830 
831 	if (vdev->vid == dst_vdev->vid) {
832 		RTE_LOG_DP(DEBUG, VHOST_DATA,
833 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
834 			vdev->vid);
835 		return 0;
836 	}
837 
838 	RTE_LOG_DP(DEBUG, VHOST_DATA,
839 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
840 
841 	if (unlikely(dst_vdev->remove)) {
842 		RTE_LOG_DP(DEBUG, VHOST_DATA,
843 			"(%d) device is marked for removal\n", dst_vdev->vid);
844 		return 0;
845 	}
846 
847 	virtio_xmit(dst_vdev, vdev, m);
848 	return 0;
849 }
850 
851 /*
852  * Check if the destination MAC of a packet belongs to a local VM,
853  * and if so, get its VLAN tag and offset.
854  */
855 static __rte_always_inline int
856 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
857 	uint32_t *offset, uint16_t *vlan_tag)
858 {
859 	struct vhost_dev *dst_vdev;
860 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
861 
862 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
863 	if (!dst_vdev)
864 		return 0;
865 
866 	if (vdev->vid == dst_vdev->vid) {
867 		RTE_LOG_DP(DEBUG, VHOST_DATA,
868 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
869 			vdev->vid);
870 		return -1;
871 	}
872 
873 	/*
874 	 * HW VLAN strip reduces the packet length by the
875 	 * length of the VLAN tag, so restore the packet
876 	 * length by adding it back.
877 	 */
878 	*offset  = VLAN_HLEN;
879 	*vlan_tag = vlan_tags[vdev->vid];
880 
881 	RTE_LOG_DP(DEBUG, VHOST_DATA,
882 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
883 		vdev->vid, dst_vdev->vid, *vlan_tag);
884 
885 	return 0;
886 }
887 
888 static uint16_t
889 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
890 {
891 	if (ol_flags & PKT_TX_IPV4)
892 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
893 	else /* assume ethertype == ETHER_TYPE_IPv6 */
894 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
895 }
896 
897 static void virtio_tx_offload(struct rte_mbuf *m)
898 {
899 	void *l3_hdr;
900 	struct ipv4_hdr *ipv4_hdr = NULL;
901 	struct tcp_hdr *tcp_hdr = NULL;
902 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
903 
904 	l3_hdr = (char *)eth_hdr + m->l2_len;
905 
906 	if (m->ol_flags & PKT_TX_IPV4) {
907 		ipv4_hdr = l3_hdr;
908 		ipv4_hdr->hdr_checksum = 0;
909 		m->ol_flags |= PKT_TX_IP_CKSUM;
910 	}
911 
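	/*
	 * For TSO (PKT_TX_TCP_SEG), NICs generally expect the TCP checksum
	 * field to be pre-filled with the pseudo-header checksum; the hardware
	 * then completes the full checksum for each generated segment.
	 */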
912 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
913 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
914 }
915 
916 static inline void
917 free_pkts(struct rte_mbuf **pkts, uint16_t n)
918 {
919 	while (n--)
920 		rte_pktmbuf_free(pkts[n]);
921 }
922 
923 static __rte_always_inline void
924 do_drain_mbuf_table(struct mbuf_table *tx_q)
925 {
926 	uint16_t count;
927 
928 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
929 				 tx_q->m_table, tx_q->len);
930 	if (unlikely(count < tx_q->len))
931 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
932 
933 	tx_q->len = 0;
934 }
935 
936 /*
937  * This function routes the TX packet to the correct interface. This
938  * may be a local device or the physical port.
939  */
940 static __rte_always_inline void
941 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
942 {
943 	struct mbuf_table *tx_q;
944 	unsigned offset = 0;
945 	const uint16_t lcore_id = rte_lcore_id();
946 	struct ether_hdr *nh;
947 
948 
949 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
950 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
951 		struct vhost_dev *vdev2;
952 
953 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
954 			virtio_xmit(vdev2, vdev, m);
955 		}
956 		goto queue2nic;
957 	}
958 
959 	/*check if destination is local VM*/
960 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
961 		rte_pktmbuf_free(m);
962 		return;
963 	}
964 
965 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
966 		if (unlikely(find_local_dest(vdev, m, &offset,
967 					     &vlan_tag) != 0)) {
968 			rte_pktmbuf_free(m);
969 			return;
970 		}
971 	}
972 
973 	RTE_LOG_DP(DEBUG, VHOST_DATA,
974 		"(%d) TX: MAC address is external\n", vdev->vid);
975 
976 queue2nic:
977 
978 	/*Add packet to the port tx queue*/
979 	tx_q = &lcore_tx_queue[lcore_id];
980 
981 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
982 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
983 		/* Guest has inserted the vlan tag. */
984 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
985 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
986 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
987 			(vh->vlan_tci != vlan_tag_be))
988 			vh->vlan_tci = vlan_tag_be;
989 	} else {
990 		m->ol_flags |= PKT_TX_VLAN_PKT;
991 
992 		/*
993 		 * Find the right seg to adjust the data len when offset is
994 		 * bigger than tail room size.
995 		 */
996 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
997 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
998 				m->data_len += offset;
999 			else {
1000 				struct rte_mbuf *seg = m;
1001 
1002 				while ((seg->next != NULL) &&
1003 					(offset > rte_pktmbuf_tailroom(seg)))
1004 					seg = seg->next;
1005 
1006 				seg->data_len += offset;
1007 			}
1008 			m->pkt_len += offset;
1009 		}
1010 
1011 		m->vlan_tci = vlan_tag;
1012 	}
1013 
1014 	if (m->ol_flags & PKT_TX_TCP_SEG)
1015 		virtio_tx_offload(m);
1016 
1017 	tx_q->m_table[tx_q->len++] = m;
1018 	if (enable_stats) {
1019 		vdev->stats.tx_total++;
1020 		vdev->stats.tx++;
1021 	}
1022 
1023 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1024 		do_drain_mbuf_table(tx_q);
1025 }
1026 
1027 
1028 static __rte_always_inline void
1029 drain_mbuf_table(struct mbuf_table *tx_q)
1030 {
1031 	static uint64_t prev_tsc;
1032 	uint64_t cur_tsc;
1033 
1034 	if (tx_q->len == 0)
1035 		return;
1036 
1037 	cur_tsc = rte_rdtsc();
1038 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1039 		prev_tsc = cur_tsc;
1040 
1041 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1042 			"TX queue drained after timeout with burst size %u\n",
1043 			tx_q->len);
1044 		do_drain_mbuf_table(tx_q);
1045 	}
1046 }
1047 
1048 static __rte_always_inline void
1049 drain_eth_rx(struct vhost_dev *vdev)
1050 {
1051 	uint16_t rx_count, enqueue_count;
1052 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1053 
1054 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1055 				    pkts, MAX_PKT_BURST);
1056 	if (!rx_count)
1057 		return;
1058 
1059 	/*
1060 	 * When "enable_retry" is set, wait and retry when there are
1061 	 * not enough free slots in the queue to hold @rx_count packets,
1062 	 * to diminish packet loss.
1063 	 */
1064 	if (enable_retry &&
1065 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1066 			VIRTIO_RXQ))) {
1067 		uint32_t retry;
1068 
1069 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1070 			rte_delay_us(burst_rx_delay_time);
1071 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1072 					VIRTIO_RXQ))
1073 				break;
1074 		}
1075 	}
1076 
1077 	if (builtin_net_driver) {
1078 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1079 						pkts, rx_count);
1080 	} else {
1081 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1082 						pkts, rx_count);
1083 	}
1084 	if (enable_stats) {
1085 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1086 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1087 	}
1088 
1089 	free_pkts(pkts, rx_count);
1090 }
1091 
1092 static __rte_always_inline void
1093 drain_virtio_tx(struct vhost_dev *vdev)
1094 {
1095 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1096 	uint16_t count;
1097 	uint16_t i;
1098 
1099 	if (builtin_net_driver) {
1100 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1101 					pkts, MAX_PKT_BURST);
1102 	} else {
1103 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1104 					mbuf_pool, pkts, MAX_PKT_BURST);
1105 	}
1106 
1107 	/* setup VMDq for the first packet */
1108 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1109 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1110 			free_pkts(pkts, count);
1111 	}
1112 
1113 	for (i = 0; i < count; ++i)
1114 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1115 }
1116 
1117 /*
1118  * Main function of vhost-switch. It basically does:
1119  *
1120  * for each vhost device {
1121  *    - drain_eth_rx()
1122  *
1123  *      Which drains the host eth Rx queue linked to the vhost device,
1124  *      and delivers all packets to the guest virtio Rx ring associated with
1125  *      this vhost device.
1126  *
1127  *    - drain_virtio_tx()
1128  *
1129  *      Which drains the guest virtio Tx queue and delivers the packets
1130  *      to the target, which could be another vhost device, or the
1131  *      physical eth dev. The route is done in function "virtio_tx_route".
1132  * }
1133  */
1134 static int
1135 switch_worker(void *arg __rte_unused)
1136 {
1137 	unsigned i;
1138 	unsigned lcore_id = rte_lcore_id();
1139 	struct vhost_dev *vdev;
1140 	struct mbuf_table *tx_q;
1141 
1142 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1143 
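	/*
	 * Each worker lcore uses its own NIC TX queue; the queue index is the
	 * lcore's position in lcore_ids[], matching the tx_rings count set up
	 * in port_init(), so no locking is needed on the TX path.
	 */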
1144 	tx_q = &lcore_tx_queue[lcore_id];
1145 	for (i = 0; i < rte_lcore_count(); i++) {
1146 		if (lcore_ids[i] == lcore_id) {
1147 			tx_q->txq_id = i;
1148 			break;
1149 		}
1150 	}
1151 
1152 	while(1) {
1153 		drain_mbuf_table(tx_q);
1154 
1155 		/*
1156 		 * If requested, inform the configuration core that we have
1157 		 * finished walking the linked list and that no devices are in use.
1158 		 */
1159 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1160 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1161 
1162 		/*
1163 		 * Process vhost devices
1164 		 */
1165 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1166 			      lcore_vdev_entry) {
1167 			if (unlikely(vdev->remove)) {
1168 				unlink_vmdq(vdev);
1169 				vdev->ready = DEVICE_SAFE_REMOVE;
1170 				continue;
1171 			}
1172 
1173 			if (likely(vdev->ready == DEVICE_RX))
1174 				drain_eth_rx(vdev);
1175 
1176 			if (likely(!vdev->remove))
1177 				drain_virtio_tx(vdev);
1178 		}
1179 	}
1180 
1181 	return 0;
1182 }
1183 
1184 /*
1185  * Remove a device from the specific data core linked list and from the
1186  * main linked list. Synchronization occurs through the use of the
1187  * lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
1188  * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
1189  */
1190 static void
1191 destroy_device(int vid)
1192 {
1193 	struct vhost_dev *vdev = NULL;
1194 	int lcore;
1195 
1196 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1197 		if (vdev->vid == vid)
1198 			break;
1199 	}
1200 	if (!vdev)
1201 		return;
1202 	/*set the remove flag. */
1203 	vdev->remove = 1;
1204 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1205 		rte_pause();
1206 	}
1207 
1208 	if (builtin_net_driver)
1209 		vs_vhost_net_remove(vdev);
1210 
1211 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1212 		     lcore_vdev_entry);
1213 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1214 
1215 
1216 	/* Set the dev_removal_flag on each lcore. */
1217 	RTE_LCORE_FOREACH_SLAVE(lcore)
1218 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1219 
1220 	/*
1221 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1222 	 * we can be sure that they can no longer access the device removed
1223 	 * from the linked lists and that the devices are no longer in use.
1224 	 */
1225 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1226 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1227 			rte_pause();
1228 	}
1229 
1230 	lcore_info[vdev->coreid].device_num--;
1231 
1232 	RTE_LOG(INFO, VHOST_DATA,
1233 		"(%d) device has been removed from data core\n",
1234 		vdev->vid);
1235 
1236 	rte_free(vdev);
1237 }
1238 
1239 /*
1240  * A new device is added to a data core. First the device is added to the main linked list
1241  * and then allocated to a specific data core.
1242  */
1243 static int
1244 new_device(int vid)
1245 {
1246 	int lcore, core_add = 0;
1247 	uint32_t device_num_min = num_devices;
1248 	struct vhost_dev *vdev;
1249 
1250 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1251 	if (vdev == NULL) {
1252 		RTE_LOG(INFO, VHOST_DATA,
1253 			"(%d) couldn't allocate memory for vhost dev\n",
1254 			vid);
1255 		return -1;
1256 	}
1257 	vdev->vid = vid;
1258 
1259 	if (builtin_net_driver)
1260 		vs_vhost_net_setup(vdev);
1261 
1262 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1263 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1264 
1265 	/*reset ready flag*/
1266 	vdev->ready = DEVICE_MAC_LEARNING;
1267 	vdev->remove = 0;
1268 
1269 	/* Find a suitable lcore to add the device. */
1270 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1271 		if (lcore_info[lcore].device_num < device_num_min) {
1272 			device_num_min = lcore_info[lcore].device_num;
1273 			core_add = lcore;
1274 		}
1275 	}
1276 	vdev->coreid = core_add;
1277 
1278 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1279 			  lcore_vdev_entry);
1280 	lcore_info[vdev->coreid].device_num++;
1281 
1282 	/* Disable notifications. */
1283 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1284 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1285 
1286 	RTE_LOG(INFO, VHOST_DATA,
1287 		"(%d) device has been added to data core %d\n",
1288 		vid, vdev->coreid);
1289 
1290 	return 0;
1291 }
1292 
1293 /*
1294  * These callbacks allow devices to be added to the data core when configuration
1295  * has been fully completed.
1296  */
1297 static const struct vhost_device_ops virtio_net_device_ops =
1298 {
1299 	.new_device =  new_device,
1300 	.destroy_device = destroy_device,
1301 };
1302 
1303 /*
1304  * This is a thread that wakes up periodically to print stats if the user has
1305  * enabled them.
1306  */
1307 static void
1308 print_stats(void)
1309 {
1310 	struct vhost_dev *vdev;
1311 	uint64_t tx_dropped, rx_dropped;
1312 	uint64_t tx, tx_total, rx, rx_total;
1313 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1314 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1315 
1316 	while(1) {
1317 		sleep(enable_stats);
1318 
1319 		/* Clear screen and move to top left */
1320 		printf("%s%s\n", clr, top_left);
1321 		printf("Device statistics =================================\n");
1322 
1323 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1324 			tx_total   = vdev->stats.tx_total;
1325 			tx         = vdev->stats.tx;
1326 			tx_dropped = tx_total - tx;
1327 
1328 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1329 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1330 			rx_dropped = rx_total - rx;
1331 
1332 			printf("Statistics for device %d\n"
1333 				"-----------------------\n"
1334 				"TX total:              %" PRIu64 "\n"
1335 				"TX dropped:            %" PRIu64 "\n"
1336 				"TX successful:         %" PRIu64 "\n"
1337 				"RX total:              %" PRIu64 "\n"
1338 				"RX dropped:            %" PRIu64 "\n"
1339 				"RX successful:         %" PRIu64 "\n",
1340 				vdev->vid,
1341 				tx_total, tx_dropped, tx,
1342 				rx_total, rx_dropped, rx);
1343 		}
1344 
1345 		printf("===================================================\n");
1346 	}
1347 }
1348 
1349 static void
1350 unregister_drivers(int socket_num)
1351 {
1352 	int i, ret;
1353 
1354 	for (i = 0; i < socket_num; i++) {
1355 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1356 		if (ret != 0)
1357 			RTE_LOG(ERR, VHOST_CONFIG,
1358 				"Failed to unregister vhost driver for %s.\n",
1359 				socket_files + i * PATH_MAX);
1360 	}
1361 }
1362 
1363 /* When we receive a SIGINT, unregister the vhost driver. */
1364 static void
1365 sigint_handler(__rte_unused int signum)
1366 {
1367 	/* Unregister vhost driver. */
1368 	unregister_drivers(nb_sockets);
1369 
1370 	exit(0);
1371 }
1372 
1373 /*
1374  * While creating an mbuf pool, one key thing is to figure out how
1375  * many mbuf entries are enough for our use. Here are some
1376  * guidelines:
1377  *
1378  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1379  *
1380  * - For each switch core (a CPU core that does the packet switching), we
1381  *   also need to reserve some mbufs for receiving the packets from the
1382  *   virtio Tx queue. How many is enough depends on the usage. It's
1383  *   normally a simple calculation like the following:
1384  *
1385  *       MAX_PKT_BURST * max packet size / mbuf size
1386  *
1387  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1388  *
1389  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1390  *   mbufs for receiving the packets from the physical NIC device.
1391  *
1392  * - We also need to make sure, for each switch core, we have allocated
1393  *   enough mbufs to fill up the mbuf cache.
1394  */
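/*
 * Illustrative sizing (assumed values, not computed above): with the default
 * 2176-byte mbuf data size (128-byte headroom), an MTU of 1500 and a burst of
 * 32 packets, each switch core needs roughly
 * (1500 + 2176) * 32 / (2176 - 128) + 1024 rx descriptors, i.e. about 1081
 * mbufs, which is then raised to at least the mbuf cache size if smaller.
 */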
1395 static void
1396 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1397 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1398 {
1399 	uint32_t nr_mbufs;
1400 	uint32_t nr_mbufs_per_core;
1401 	uint32_t mtu = 1500;
1402 
1403 	if (mergeable)
1404 		mtu = 9000;
1405 	if (enable_tso)
1406 		mtu = 64 * 1024;
1407 
1408 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1409 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1410 	nr_mbufs_per_core += nr_rx_desc;
1411 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1412 
1413 	nr_mbufs  = nr_queues * nr_rx_desc;
1414 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1415 	nr_mbufs *= nr_port;
1416 
1417 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1418 					    nr_mbuf_cache, 0, mbuf_size,
1419 					    rte_socket_id());
1420 	if (mbuf_pool == NULL)
1421 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1422 }
1423 
1424 /*
1425  * Main function, does initialisation and calls the per-lcore functions.
1426  */
1427 int
1428 main(int argc, char *argv[])
1429 {
1430 	unsigned lcore_id, core_id = 0;
1431 	unsigned nb_ports, valid_num_ports;
1432 	int ret, i;
1433 	uint8_t portid;
1434 	static pthread_t tid;
1435 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1436 	uint64_t flags = 0;
1437 
1438 	signal(SIGINT, sigint_handler);
1439 
1440 	/* init EAL */
1441 	ret = rte_eal_init(argc, argv);
1442 	if (ret < 0)
1443 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1444 	argc -= ret;
1445 	argv += ret;
1446 
1447 	/* parse app arguments */
1448 	ret = us_vhost_parse_args(argc, argv);
1449 	if (ret < 0)
1450 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1451 
1452 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1453 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1454 
1455 		if (rte_lcore_is_enabled(lcore_id))
1456 			lcore_ids[core_id++] = lcore_id;
1457 	}
1458 
1459 	if (rte_lcore_count() > RTE_MAX_LCORE)
1460 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1461 
1462 	/* Get the number of physical ports. */
1463 	nb_ports = rte_eth_dev_count();
1464 
1465 	/*
1466 	 * Update the global variable NUM_PORTS and the global array PORTS,
1467 	 * and get the value of VALID_NUM_PORTS according to the number of system ports
1468 	 */
1469 	valid_num_ports = check_ports_num(nb_ports);
1470 
1471 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1472 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1473 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1474 		return -1;
1475 	}
1476 
1477 	/*
1478 	 * FIXME: here we are trying to allocate mbufs big enough for
1479 	 * @MAX_QUEUES, but the truth is we're never going to use that
1480 	 * many queues here. We probably should only do allocation for
1481 	 * those queues we are going to use.
1482 	 */
1483 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1484 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1485 
1486 	if (vm2vm_mode == VM2VM_HARDWARE) {
1487 		/* Enable VT loop back to let L2 switch to do it. */
1488 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1489 		RTE_LOG(DEBUG, VHOST_CONFIG,
1490 			"Enable loop back for L2 switch in vmdq.\n");
1491 	}
1492 
1493 	/* initialize all ports */
1494 	for (portid = 0; portid < nb_ports; portid++) {
1495 		/* skip ports that are not enabled */
1496 		if ((enabled_port_mask & (1 << portid)) == 0) {
1497 			RTE_LOG(INFO, VHOST_PORT,
1498 				"Skipping disabled port %d\n", portid);
1499 			continue;
1500 		}
1501 		if (port_init(portid) != 0)
1502 			rte_exit(EXIT_FAILURE,
1503 				"Cannot initialize network ports\n");
1504 	}
1505 
1506 	/* Enable stats if the user option is set. */
1507 	if (enable_stats) {
1508 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1509 		if (ret != 0)
1510 			rte_exit(EXIT_FAILURE,
1511 				"Cannot create print-stats thread\n");
1512 
1513 		/* Set thread_name to aid in debugging. */
1514 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1515 		ret = rte_thread_setname(tid, thread_name);
1516 		if (ret != 0)
1517 			RTE_LOG(DEBUG, VHOST_CONFIG,
1518 				"Cannot set print-stats name\n");
1519 	}
1520 
1521 	/* Launch all data cores. */
1522 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1523 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1524 
1525 	if (client_mode)
1526 		flags |= RTE_VHOST_USER_CLIENT;
1527 
1528 	if (dequeue_zero_copy)
1529 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1530 
1531 	/* Register vhost user driver to handle vhost messages. */
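	/*
	 * Features disabled below are removed from what the vhost-user backend
	 * advertises, so the guest cannot negotiate them (e.g. disabling
	 * VIRTIO_NET_F_MRG_RXBUF when --mergeable is 0).
	 */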
1532 	for (i = 0; i < nb_sockets; i++) {
1533 		char *file = socket_files + i * PATH_MAX;
1534 		ret = rte_vhost_driver_register(file, flags);
1535 		if (ret != 0) {
1536 			unregister_drivers(i);
1537 			rte_exit(EXIT_FAILURE,
1538 				"vhost driver register failure.\n");
1539 		}
1540 
1541 		if (builtin_net_driver)
1542 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1543 
1544 		if (mergeable == 0) {
1545 			rte_vhost_driver_disable_features(file,
1546 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1547 		}
1548 
1549 		if (enable_tx_csum == 0) {
1550 			rte_vhost_driver_disable_features(file,
1551 				1ULL << VIRTIO_NET_F_CSUM);
1552 		}
1553 
1554 		if (enable_tso == 0) {
1555 			rte_vhost_driver_disable_features(file,
1556 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1557 			rte_vhost_driver_disable_features(file,
1558 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1559 			rte_vhost_driver_disable_features(file,
1560 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1561 			rte_vhost_driver_disable_features(file,
1562 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1563 		}
1564 
1565 		if (promiscuous) {
1566 			rte_vhost_driver_enable_features(file,
1567 				1ULL << VIRTIO_NET_F_CTRL_RX);
1568 		}
1569 
1570 		ret = rte_vhost_driver_callback_register(file,
1571 			&virtio_net_device_ops);
1572 		if (ret != 0) {
1573 			rte_exit(EXIT_FAILURE,
1574 				"failed to register vhost driver callbacks.\n");
1575 		}
1576 
1577 		if (rte_vhost_driver_start(file) < 0) {
1578 			rte_exit(EXIT_FAILURE,
1579 				"failed to start vhost driver.\n");
1580 		}
1581 	}
1582 
1583 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1584 		rte_eal_wait_lcore(lcore_id);
1585 
1586 	return 0;
1587 
1588 }
1589