xref: /dpdk/examples/vhost/main.c (revision 4c3ea50880c90a1681c7f4c12a558cff37d01f03)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 
56 #include "main.h"
57 
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61 
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64 
65 #define MBUF_CACHE_SIZE	128
66 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
67 
68 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
69 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
70 
71 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
72 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
73 
74 #define JUMBO_FRAME_MAX_SIZE    0x2600
75 
76 /* State of virtio device. */
77 #define DEVICE_MAC_LEARNING 0
78 #define DEVICE_RX			1
79 #define DEVICE_SAFE_REMOVE	2
80 
81 /* Configurable number of RX/TX ring descriptors */
82 #define RTE_TEST_RX_DESC_DEFAULT 1024
83 #define RTE_TEST_TX_DESC_DEFAULT 512
84 
85 #define INVALID_PORT_ID 0xFF
86 
87 /* Max number of devices. Limited by vmdq. */
88 #define MAX_DEVICES 64
89 
90 /* Size of buffers used for snprintfs. */
91 #define MAX_PRINT_BUFF 6072
92 
93 /* Maximum long option length for option parsing. */
94 #define MAX_LONG_OPT_SZ 64
95 
96 /* mask of enabled ports */
97 static uint32_t enabled_port_mask = 0;
98 
99 /* Promiscuous mode */
100 static uint32_t promiscuous;
101 
102 /* number of devices/queues to support */
103 static uint32_t num_queues = 0;
104 static uint32_t num_devices;
105 
106 static struct rte_mempool *mbuf_pool;
107 static int mergeable;
108 
109 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
110 typedef enum {
111 	VM2VM_DISABLED = 0,
112 	VM2VM_SOFTWARE = 1,
113 	VM2VM_HARDWARE = 2,
114 	VM2VM_LAST
115 } vm2vm_type;
116 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
117 
118 /* Enable stats. */
119 static uint32_t enable_stats = 0;
120 /* Enable retries on RX. */
121 static uint32_t enable_retry = 1;
122 
123 /* Disable TX checksum offload */
124 static uint32_t enable_tx_csum;
125 
126 /* Disable TSO offload */
127 static uint32_t enable_tso;
128 
129 static int client_mode;
130 
131 /* Specify timeout (in microseconds) between retries on RX. */
132 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
133 /* Specify the number of retries on RX. */
134 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
135 
136 /* Socket file paths. Can be set by user */
137 static char *socket_files;
138 static int nb_sockets;
139 
140 /* Empty VMDQ configuration structure. Filled in programmatically. */
141 static struct rte_eth_conf vmdq_conf_default = {
142 	.rxmode = {
143 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
144 		.split_hdr_size = 0,
145 		.header_split   = 0, /**< Header Split disabled */
146 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
147 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
148 		/*
149 		 * VLAN strip is necessary for 1G NICs such as the I350;
150 		 * it fixes a bug where IPv4 forwarding in the guest cannot
151 		 * forward packets from one virtio device to another.
152 		 */
153 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
154 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
155 		.hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
156 	},
157 
158 	.txmode = {
159 		.mq_mode = ETH_MQ_TX_NONE,
160 	},
161 	.rx_adv_conf = {
162 		/*
163 		 * should be overridden separately in code with
164 		 * appropriate values
165 		 */
166 		.vmdq_rx_conf = {
167 			.nb_queue_pools = ETH_8_POOLS,
168 			.enable_default_pool = 0,
169 			.default_pool = 0,
170 			.nb_pool_maps = 0,
171 			.pool_map = {{0, 0},},
172 		},
173 	},
174 };
175 
176 static unsigned lcore_ids[RTE_MAX_LCORE];
177 static uint8_t ports[RTE_MAX_ETHPORTS];
178 static unsigned num_ports = 0; /**< The number of ports specified in command line */
179 static uint16_t num_pf_queues, num_vmdq_queues;
180 static uint16_t vmdq_pool_base, vmdq_queue_base;
181 static uint16_t queues_per_pool;
182 
183 const uint16_t vlan_tags[] = {
184 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
185 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
186 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
187 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
188 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
189 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
190 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
191 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
192 };
193 
194 /* ethernet addresses of ports */
195 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
196 
197 static struct vhost_dev_tailq_list vhost_dev_list =
198 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
199 
200 static struct lcore_info lcore_info[RTE_MAX_LCORE];
201 
202 /* Used for queueing bursts of TX packets. */
203 struct mbuf_table {
204 	unsigned len;
205 	unsigned txq_id;
206 	struct rte_mbuf *m_table[MAX_PKT_BURST];
207 };
208 
209 /* TX queue for each data core. */
210 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
211 
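/* Number of TSC cycles corresponding to BURST_TX_DRAIN_US (~100 us);
 * used by drain_mbuf_table() to time the periodic TX queue flush. */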
212 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
213 				 / US_PER_S * BURST_TX_DRAIN_US)
214 #define VLAN_HLEN       4
215 
216 /*
217  * Builds up the correct configuration for VMDQ VLAN pool map
218  * according to the pool & queue limits.
219  */
220 static inline int
221 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
222 {
223 	struct rte_eth_vmdq_rx_conf conf;
224 	struct rte_eth_vmdq_rx_conf *def_conf =
225 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
226 	unsigned i;
227 
228 	memset(&conf, 0, sizeof(conf));
229 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
230 	conf.nb_pool_maps = num_devices;
231 	conf.enable_loop_back = def_conf->enable_loop_back;
232 	conf.rx_mode = def_conf->rx_mode;
233 
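	/* Give each pool (i.e. each vhost device) its own VLAN tag so that
	 * VMDQ can steer incoming packets to the right pool. */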
234 	for (i = 0; i < conf.nb_pool_maps; i++) {
235 		conf.pool_map[i].vlan_id = vlan_tags[i];
236 		conf.pool_map[i].pools = (1UL << i);
237 	}
238 
239 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
240 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
241 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
242 	return 0;
243 }
244 
245 /*
246  * Validate the device number against the max pool number obtained from
247  * dev_info. If the device number is invalid, log an error message and
248  * return -1. Each device must have its own pool.
249  */
250 static inline int
251 validate_num_devices(uint32_t max_nb_devices)
252 {
253 	if (num_devices > max_nb_devices) {
254 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
255 		return -1;
256 	}
257 	return 0;
258 }
259 
260 /*
261  * Initialises a given port using global settings, with the RX buffers
262  * coming from the mbuf_pool passed as a parameter.
263  */
264 static inline int
265 port_init(uint8_t port)
266 {
267 	struct rte_eth_dev_info dev_info;
268 	struct rte_eth_conf port_conf;
269 	struct rte_eth_rxconf *rxconf;
270 	struct rte_eth_txconf *txconf;
271 	int16_t rx_rings, tx_rings;
272 	uint16_t rx_ring_size, tx_ring_size;
273 	int retval;
274 	uint16_t q;
275 
276 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
277 	rte_eth_dev_info_get (port, &dev_info);
278 
279 	if (dev_info.max_rx_queues > MAX_QUEUES) {
280 		rte_exit(EXIT_FAILURE,
281 			"please define MAX_QUEUES no less than %u in %s\n",
282 			dev_info.max_rx_queues, __FILE__);
283 	}
284 
285 	rxconf = &dev_info.default_rxconf;
286 	txconf = &dev_info.default_txconf;
287 	rxconf->rx_drop_en = 1;
288 
289 	/* Enable vlan offload */
290 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
291 
292 	/* Configure the number of supported virtio devices based on VMDQ limits. */
293 	num_devices = dev_info.max_vmdq_pools;
294 
295 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
296 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
297 	tx_rings = (uint16_t)rte_lcore_count();
298 
299 	retval = validate_num_devices(MAX_DEVICES);
300 	if (retval < 0)
301 		return retval;
302 
303 	/* Get port configuration. */
304 	retval = get_eth_conf(&port_conf, num_devices);
305 	if (retval < 0)
306 		return retval;
307 	/* NIC queues are divided into pf queues and vmdq queues.  */
308 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
309 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
310 	num_vmdq_queues = num_devices * queues_per_pool;
311 	num_queues = num_pf_queues + num_vmdq_queues;
312 	vmdq_queue_base = dev_info.vmdq_queue_base;
313 	vmdq_pool_base  = dev_info.vmdq_pool_base;
314 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
315 		num_pf_queues, num_devices, queues_per_pool);
316 
317 	if (port >= rte_eth_dev_count()) return -1;
318 
319 	if (enable_tx_csum == 0)
320 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
321 
322 	if (enable_tso == 0) {
323 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
324 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
325 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_GUEST_TSO4);
326 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_GUEST_TSO6);
327 	}
328 
329 	rx_rings = (uint16_t)dev_info.max_rx_queues;
330 	/* Configure ethernet device. */
331 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
332 	if (retval != 0) {
333 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
334 			port, strerror(-retval));
335 		return retval;
336 	}
337 
338 	/* Setup the queues. */
339 	for (q = 0; q < rx_rings; q ++) {
340 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
341 						rte_eth_dev_socket_id(port),
342 						rxconf,
343 						mbuf_pool);
344 		if (retval < 0) {
345 			RTE_LOG(ERR, VHOST_PORT,
346 				"Failed to setup rx queue %u of port %u: %s.\n",
347 				q, port, strerror(-retval));
348 			return retval;
349 		}
350 	}
351 	for (q = 0; q < tx_rings; q ++) {
352 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
353 						rte_eth_dev_socket_id(port),
354 						txconf);
355 		if (retval < 0) {
356 			RTE_LOG(ERR, VHOST_PORT,
357 				"Failed to setup tx queue %u of port %u: %s.\n",
358 				q, port, strerror(-retval));
359 			return retval;
360 		}
361 	}
362 
363 	/* Start the device. */
364 	retval  = rte_eth_dev_start(port);
365 	if (retval < 0) {
366 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
367 			port, strerror(-retval));
368 		return retval;
369 	}
370 
371 	if (promiscuous)
372 		rte_eth_promiscuous_enable(port);
373 
374 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
375 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
376 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
377 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
378 			(unsigned)port,
379 			vmdq_ports_eth_addr[port].addr_bytes[0],
380 			vmdq_ports_eth_addr[port].addr_bytes[1],
381 			vmdq_ports_eth_addr[port].addr_bytes[2],
382 			vmdq_ports_eth_addr[port].addr_bytes[3],
383 			vmdq_ports_eth_addr[port].addr_bytes[4],
384 			vmdq_ports_eth_addr[port].addr_bytes[5]);
385 
386 	return 0;
387 }
388 
389 /*
390  * Set socket file path.
391  */
392 static int
393 us_vhost_parse_socket_path(const char *q_arg)
394 {
395 	/* reject paths that do not fit within PATH_MAX bytes */
396 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
397 		return -1;
398 
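	/* socket_files is a flat array of nb_sockets slots, PATH_MAX bytes each;
	 * grow it by one slot and copy the new path into it. */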
399 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
400 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
401 	nb_sockets++;
402 
403 	return 0;
404 }
405 
406 /*
407  * Parse the portmask provided at run time.
408  */
409 static int
410 parse_portmask(const char *portmask)
411 {
412 	char *end = NULL;
413 	unsigned long pm;
414 
415 	errno = 0;
416 
417 	/* parse hexadecimal string */
418 	pm = strtoul(portmask, &end, 16);
419 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
420 		return -1;
421 
422 	if (pm == 0)
423 		return -1;
424 
425 	return pm;
426 
427 }
428 
429 /*
430  * Parse num options at run time.
431  */
432 static int
433 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
434 {
435 	char *end = NULL;
436 	unsigned long num;
437 
438 	errno = 0;
439 
440 	/* parse unsigned int string */
441 	num = strtoul(q_arg, &end, 10);
442 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
443 		return -1;
444 
445 	if (num > max_valid_value)
446 		return -1;
447 
448 	return num;
449 
450 }
451 
452 /*
453  * Display usage
454  */
455 static void
456 us_vhost_usage(const char *prgname)
457 {
458 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
459 	"		--vm2vm [0|1|2]\n"
460 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
461 	"		--socket-file <path>\n"
462 	"		--nb-devices ND\n"
463 	"		-p PORTMASK: Set mask for ports to be used by application\n"
464 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
465 	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if the destination queue is full\n"
466 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if RX retries are enabled\n"
467 	"		--rx-retry-num [0-N]: the number of retries on RX. This takes effect only if RX retries are enabled\n"
468 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
469 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
470 	"		--socket-file: The path of the socket file.\n"
471 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
472 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
473 	"		--client register a vhost-user socket as client mode.\n",
474 	       prgname);
475 }
476 
477 /*
478  * Parse the arguments given in the command line of the application.
479  */
480 static int
481 us_vhost_parse_args(int argc, char **argv)
482 {
483 	int opt, ret;
484 	int option_index;
485 	unsigned i;
486 	const char *prgname = argv[0];
487 	static struct option long_option[] = {
488 		{"vm2vm", required_argument, NULL, 0},
489 		{"rx-retry", required_argument, NULL, 0},
490 		{"rx-retry-delay", required_argument, NULL, 0},
491 		{"rx-retry-num", required_argument, NULL, 0},
492 		{"mergeable", required_argument, NULL, 0},
493 		{"stats", required_argument, NULL, 0},
494 		{"socket-file", required_argument, NULL, 0},
495 		{"tx-csum", required_argument, NULL, 0},
496 		{"tso", required_argument, NULL, 0},
497 		{"client", no_argument, &client_mode, 1},
498 		{NULL, 0, 0, 0},
499 	};
500 
501 	/* Parse command line */
502 	while ((opt = getopt_long(argc, argv, "p:P",
503 			long_option, &option_index)) != EOF) {
504 		switch (opt) {
505 		/* Portmask */
506 		case 'p':
507 			enabled_port_mask = parse_portmask(optarg);
508 			if (enabled_port_mask == 0) {
509 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
510 				us_vhost_usage(prgname);
511 				return -1;
512 			}
513 			break;
514 
515 		case 'P':
516 			promiscuous = 1;
517 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
518 				ETH_VMDQ_ACCEPT_BROADCAST |
519 				ETH_VMDQ_ACCEPT_MULTICAST;
520 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
521 
522 			break;
523 
524 		case 0:
525 			/* Enable/disable vm2vm comms. */
526 			if (!strncmp(long_option[option_index].name, "vm2vm",
527 				MAX_LONG_OPT_SZ)) {
528 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
529 				if (ret == -1) {
530 					RTE_LOG(INFO, VHOST_CONFIG,
531 						"Invalid argument for "
532 						"vm2vm [0|1|2]\n");
533 					us_vhost_usage(prgname);
534 					return -1;
535 				} else {
536 					vm2vm_mode = (vm2vm_type)ret;
537 				}
538 			}
539 
540 			/* Enable/disable retries on RX. */
541 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
542 				ret = parse_num_opt(optarg, 1);
543 				if (ret == -1) {
544 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
545 					us_vhost_usage(prgname);
546 					return -1;
547 				} else {
548 					enable_retry = ret;
549 				}
550 			}
551 
552 			/* Enable/disable TX checksum offload. */
553 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
554 				ret = parse_num_opt(optarg, 1);
555 				if (ret == -1) {
556 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
557 					us_vhost_usage(prgname);
558 					return -1;
559 				} else
560 					enable_tx_csum = ret;
561 			}
562 
563 			/* Enable/disable TSO offload. */
564 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
565 				ret = parse_num_opt(optarg, 1);
566 				if (ret == -1) {
567 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
568 					us_vhost_usage(prgname);
569 					return -1;
570 				} else
571 					enable_tso = ret;
572 			}
573 
574 			/* Specify the retries delay time (in useconds) on RX. */
575 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
576 				ret = parse_num_opt(optarg, INT32_MAX);
577 				if (ret == -1) {
578 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
579 					us_vhost_usage(prgname);
580 					return -1;
581 				} else {
582 					burst_rx_delay_time = ret;
583 				}
584 			}
585 
586 			/* Specify the retries number on RX. */
587 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
588 				ret = parse_num_opt(optarg, INT32_MAX);
589 				if (ret == -1) {
590 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
591 					us_vhost_usage(prgname);
592 					return -1;
593 				} else {
594 					burst_rx_retry_num = ret;
595 				}
596 			}
597 
598 			/* Enable/disable RX mergeable buffers. */
599 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
600 				ret = parse_num_opt(optarg, 1);
601 				if (ret == -1) {
602 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
603 					us_vhost_usage(prgname);
604 					return -1;
605 				} else {
606 					mergeable = !!ret;
607 					if (ret) {
608 						vmdq_conf_default.rxmode.jumbo_frame = 1;
609 						vmdq_conf_default.rxmode.max_rx_pkt_len
610 							= JUMBO_FRAME_MAX_SIZE;
611 					}
612 				}
613 			}
614 
615 			/* Enable/disable stats. */
616 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
617 				ret = parse_num_opt(optarg, INT32_MAX);
618 				if (ret == -1) {
619 					RTE_LOG(INFO, VHOST_CONFIG,
620 						"Invalid argument for stats [0..N]\n");
621 					us_vhost_usage(prgname);
622 					return -1;
623 				} else {
624 					enable_stats = ret;
625 				}
626 			}
627 
628 			/* Set socket file path. */
629 			if (!strncmp(long_option[option_index].name,
630 						"socket-file", MAX_LONG_OPT_SZ)) {
631 				if (us_vhost_parse_socket_path(optarg) == -1) {
632 					RTE_LOG(INFO, VHOST_CONFIG,
633 					"Invalid argument for socket name (Max %d characters)\n",
634 					PATH_MAX);
635 					us_vhost_usage(prgname);
636 					return -1;
637 				}
638 			}
639 
640 			break;
641 
642 			/* Invalid option - print options. */
643 		default:
644 			us_vhost_usage(prgname);
645 			return -1;
646 		}
647 	}
648 
649 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
650 		if (enabled_port_mask & (1 << i))
651 			ports[num_ports++] = (uint8_t)i;
652 	}
653 
654 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
655 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
656 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
657 		return -1;
658 	}
659 
660 	return 0;
661 }
662 
663 /*
664  * Update the global variable num_ports and the array ports according to the
665  * number of system ports, and return the number of valid ports.
666  */
667 static unsigned check_ports_num(unsigned nb_ports)
668 {
669 	unsigned valid_num_ports = num_ports;
670 	unsigned portid;
671 
672 	if (num_ports > nb_ports) {
673 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
674 			num_ports, nb_ports);
675 		num_ports = nb_ports;
676 	}
677 
678 	for (portid = 0; portid < num_ports; portid ++) {
679 		if (ports[portid] >= nb_ports) {
680 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
681 				ports[portid], (nb_ports - 1));
682 			ports[portid] = INVALID_PORT_ID;
683 			valid_num_ports--;
684 		}
685 	}
686 	return valid_num_ports;
687 }
688 
689 static inline struct vhost_dev *__attribute__((always_inline))
690 find_vhost_dev(struct ether_addr *mac)
691 {
692 	struct vhost_dev *vdev;
693 
694 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
695 		if (vdev->ready == DEVICE_RX &&
696 		    is_same_ether_addr(mac, &vdev->mac_address))
697 			return vdev;
698 	}
699 
700 	return NULL;
701 }
702 
703 /*
704  * This function learns the MAC address of the device and registers it, along
705  * with a VLAN tag, with the VMDQ.
706  */
707 static int
708 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
709 {
710 	struct ether_hdr *pkt_hdr;
711 	int i, ret;
712 
713 	/* Learn MAC address of guest device from packet */
714 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
715 
716 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
717 		RTE_LOG(ERR, VHOST_DATA,
718 			"(%d) device is using a registered MAC!\n",
719 			vdev->vid);
720 		return -1;
721 	}
722 
723 	for (i = 0; i < ETHER_ADDR_LEN; i++)
724 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
725 
726 	/* vlan_tag currently uses the device_id. */
727 	vdev->vlan_tag = vlan_tags[vdev->vid];
728 
729 	/* Print out VMDQ registration info. */
730 	RTE_LOG(INFO, VHOST_DATA,
731 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
732 		vdev->vid,
733 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
734 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
735 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
736 		vdev->vlan_tag);
737 
738 	/* Register the MAC address. */
739 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
740 				(uint32_t)vdev->vid + vmdq_pool_base);
741 	if (ret)
742 		RTE_LOG(ERR, VHOST_DATA,
743 			"(%d) failed to add device MAC address to VMDQ\n",
744 			vdev->vid);
745 
746 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
747 
748 	/* Set device as ready for RX. */
749 	vdev->ready = DEVICE_RX;
750 
751 	return 0;
752 }
753 
754 /*
755  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
756  * queue before disabling RX on the device.
757  */
758 static inline void
759 unlink_vmdq(struct vhost_dev *vdev)
760 {
761 	unsigned i = 0;
762 	unsigned rx_count;
763 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
764 
765 	if (vdev->ready == DEVICE_RX) {
766 		/*clear MAC and VLAN settings*/
767 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
768 		for (i = 0; i < 6; i++)
769 			vdev->mac_address.addr_bytes[i] = 0;
770 
771 		vdev->vlan_tag = 0;
772 
773 		/*Clear out the receive buffers*/
774 		rx_count = rte_eth_rx_burst(ports[0],
775 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
776 
777 		while (rx_count) {
778 			for (i = 0; i < rx_count; i++)
779 				rte_pktmbuf_free(pkts_burst[i]);
780 
781 			rx_count = rte_eth_rx_burst(ports[0],
782 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
783 		}
784 
785 		vdev->ready = DEVICE_MAC_LEARNING;
786 	}
787 }
788 
789 static inline void __attribute__((always_inline))
790 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
791 	    struct rte_mbuf *m)
792 {
793 	uint16_t ret;
794 
795 	ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
796 	if (enable_stats) {
797 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
798 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
799 		src_vdev->stats.tx_total++;
800 		src_vdev->stats.tx += ret;
801 	}
802 }
803 
804 /*
805  * Check if the packet's destination MAC address is for a local device. If so,
806  * put the packet on that device's RX queue. If not, return.
807  */
808 static inline int __attribute__((always_inline))
809 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
810 {
811 	struct ether_hdr *pkt_hdr;
812 	struct vhost_dev *dst_vdev;
813 
814 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
815 
816 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
817 	if (!dst_vdev)
818 		return -1;
819 
820 	if (vdev->vid == dst_vdev->vid) {
821 		RTE_LOG(DEBUG, VHOST_DATA,
822 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
823 			vdev->vid);
824 		return 0;
825 	}
826 
827 	RTE_LOG(DEBUG, VHOST_DATA,
828 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
829 
830 	if (unlikely(dst_vdev->remove)) {
831 		RTE_LOG(DEBUG, VHOST_DATA,
832 			"(%d) device is marked for removal\n", dst_vdev->vid);
833 		return 0;
834 	}
835 
836 	virtio_xmit(dst_vdev, vdev, m);
837 	return 0;
838 }
839 
840 /*
841  * Check if the destination MAC of a packet belongs to a local VM; if so,
842  * get its VLAN tag and the length offset to restore.
843  */
844 static inline int __attribute__((always_inline))
845 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
846 	uint32_t *offset, uint16_t *vlan_tag)
847 {
848 	struct vhost_dev *dst_vdev;
849 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
850 
851 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
852 	if (!dst_vdev)
853 		return 0;
854 
855 	if (vdev->vid == dst_vdev->vid) {
856 		RTE_LOG(DEBUG, VHOST_DATA,
857 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
858 			vdev->vid);
859 		return -1;
860 	}
861 
862 	/*
863 	 * HW VLAN strip reduces the packet length by the
864 	 * length of the VLAN tag, so restore the packet
865 	 * length by adding it back.
866 	 */
867 	*offset  = VLAN_HLEN;
868 	*vlan_tag = vlan_tags[vdev->vid];
869 
870 	RTE_LOG(DEBUG, VHOST_DATA,
871 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
872 		vdev->vid, dst_vdev->vid, *vlan_tag);
873 
874 	return 0;
875 }
876 
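/* Compute the L4 pseudo-header checksum that the NIC expects to find in the
 * TCP checksum field when TX checksum/TSO offload is requested. */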
877 static uint16_t
878 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
879 {
880 	if (ol_flags & PKT_TX_IPV4)
881 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
882 	else /* assume ethertype == ETHER_TYPE_IPv6 */
883 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
884 }
885 
886 static void virtio_tx_offload(struct rte_mbuf *m)
887 {
888 	void *l3_hdr;
889 	struct ipv4_hdr *ipv4_hdr = NULL;
890 	struct tcp_hdr *tcp_hdr = NULL;
891 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
892 
893 	l3_hdr = (char *)eth_hdr + m->l2_len;
894 
895 	if (m->ol_flags & PKT_TX_IPV4) {
896 		ipv4_hdr = l3_hdr;
897 		ipv4_hdr->hdr_checksum = 0;
898 		m->ol_flags |= PKT_TX_IP_CKSUM;
899 	}
900 
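	/* Seed the TCP checksum field with the pseudo-header checksum;
	 * the NIC fills in the rest when it performs the offload. */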
901 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
902 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
903 }
904 
905 static inline void
906 free_pkts(struct rte_mbuf **pkts, uint16_t n)
907 {
908 	while (n--)
909 		rte_pktmbuf_free(pkts[n]);
910 }
911 
912 static inline void __attribute__((always_inline))
913 do_drain_mbuf_table(struct mbuf_table *tx_q)
914 {
915 	uint16_t count;
916 
917 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
918 				 tx_q->m_table, tx_q->len);
919 	if (unlikely(count < tx_q->len))
920 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
921 
922 	tx_q->len = 0;
923 }
924 
925 /*
926  * This function routes the TX packet to the correct interface. This
927  * may be a local device or the physical port.
928  */
929 static inline void __attribute__((always_inline))
930 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
931 {
932 	struct mbuf_table *tx_q;
933 	unsigned offset = 0;
934 	const uint16_t lcore_id = rte_lcore_id();
935 	struct ether_hdr *nh;
936 
937 
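	/* Broadcast frames are replicated to every vhost device and also
	 * forwarded to the physical port below. */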
938 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
939 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
940 		struct vhost_dev *vdev2;
941 
942 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
943 			virtio_xmit(vdev2, vdev, m);
944 		}
945 		goto queue2nic;
946 	}
947 
948 	/*check if destination is local VM*/
949 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
950 		rte_pktmbuf_free(m);
951 		return;
952 	}
953 
954 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
955 		if (unlikely(find_local_dest(vdev, m, &offset,
956 					     &vlan_tag) != 0)) {
957 			rte_pktmbuf_free(m);
958 			return;
959 		}
960 	}
961 
962 	RTE_LOG(DEBUG, VHOST_DATA,
963 		"(%d) TX: MAC address is external\n", vdev->vid);
964 
965 queue2nic:
966 
967 	/*Add packet to the port tx queue*/
968 	tx_q = &lcore_tx_queue[lcore_id];
969 
970 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
971 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
972 		/* Guest has inserted the vlan tag. */
973 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
974 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
975 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
976 			(vh->vlan_tci != vlan_tag_be))
977 			vh->vlan_tci = vlan_tag_be;
978 	} else {
979 		m->ol_flags |= PKT_TX_VLAN_PKT;
980 
981 		/*
982 		 * Find the right seg to adjust the data len when offset is
983 		 * bigger than tail room size.
984 		 */
985 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
986 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
987 				m->data_len += offset;
988 			else {
989 				struct rte_mbuf *seg = m;
990 
991 				while ((seg->next != NULL) &&
992 					(offset > rte_pktmbuf_tailroom(seg)))
993 					seg = seg->next;
994 
995 				seg->data_len += offset;
996 			}
997 			m->pkt_len += offset;
998 		}
999 
1000 		m->vlan_tci = vlan_tag;
1001 	}
1002 
1003 	if (m->ol_flags & PKT_TX_TCP_SEG)
1004 		virtio_tx_offload(m);
1005 
1006 	tx_q->m_table[tx_q->len++] = m;
1007 	if (enable_stats) {
1008 		vdev->stats.tx_total++;
1009 		vdev->stats.tx++;
1010 	}
1011 
1012 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1013 		do_drain_mbuf_table(tx_q);
1014 }
1015 
1016 
1017 static inline void __attribute__((always_inline))
1018 drain_mbuf_table(struct mbuf_table *tx_q)
1019 {
1020 	static uint64_t prev_tsc;
1021 	uint64_t cur_tsc;
1022 
1023 	if (tx_q->len == 0)
1024 		return;
1025 
1026 	cur_tsc = rte_rdtsc();
1027 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1028 		prev_tsc = cur_tsc;
1029 
1030 		RTE_LOG(DEBUG, VHOST_DATA,
1031 			"TX queue drained after timeout with burst size %u\n",
1032 			tx_q->len);
1033 		do_drain_mbuf_table(tx_q);
1034 	}
1035 }
1036 
1037 static inline void __attribute__((always_inline))
1038 drain_eth_rx(struct vhost_dev *vdev)
1039 {
1040 	uint16_t rx_count, enqueue_count;
1041 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1042 
1043 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1044 				    pkts, MAX_PKT_BURST);
1045 	if (!rx_count)
1046 		return;
1047 
1048 	/*
1049 	 * When "enable_retry" is set, here we wait and retry when there
1050 	 * are not enough free slots in the queue to hold @rx_count packets,
1051 	 * to diminish packet loss.
1052 	 */
1053 	if (enable_retry &&
1054 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1055 			VIRTIO_RXQ))) {
1056 		uint32_t retry;
1057 
1058 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1059 			rte_delay_us(burst_rx_delay_time);
1060 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1061 					VIRTIO_RXQ))
1062 				break;
1063 		}
1064 	}
1065 
1066 	enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1067 						pkts, rx_count);
1068 	if (enable_stats) {
1069 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1070 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1071 	}
1072 
1073 	free_pkts(pkts, rx_count);
1074 }
1075 
1076 static inline void __attribute__((always_inline))
1077 drain_virtio_tx(struct vhost_dev *vdev)
1078 {
1079 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1080 	uint16_t count;
1081 	uint16_t i;
1082 
1083 	count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ, mbuf_pool,
1084 					pkts, MAX_PKT_BURST);
1085 
1086 	/* setup VMDq for the first packet */
1087 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1088 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1089 			free_pkts(pkts, count);
1090 	}
1091 
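	/* Route each dequeued packet: either to another vhost device or out
	 * through the physical port (see virtio_tx_route). */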
1092 	for (i = 0; i < count; ++i)
1093 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1094 }
1095 
1096 /*
1097  * Main function of vhost-switch. It basically does:
1098  *
1099  * for each vhost device {
1100  *    - drain_eth_rx()
1101  *
1102  *      Which drains the host eth Rx queue linked to the vhost device,
1103  *      and delivers all packets to the guest virtio Rx ring associated with
1104  *      this vhost device.
1105  *
1106  *    - drain_virtio_tx()
1107  *
1108  *      Which drains the guest virtio Tx queue and delivers all packets
1109  *      to the target, which could be another vhost device, or the
1110  *      physical eth dev. The routing is done in the function "virtio_tx_route".
1111  * }
1112  */
1113 static int
1114 switch_worker(void *arg __rte_unused)
1115 {
1116 	unsigned i;
1117 	unsigned lcore_id = rte_lcore_id();
1118 	struct vhost_dev *vdev;
1119 	struct mbuf_table *tx_q;
1120 
1121 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1122 
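	/* Each lcore owns one TX queue on the physical port; the queue index
	 * is the lcore's position in the lcore_ids[] array. */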
1123 	tx_q = &lcore_tx_queue[lcore_id];
1124 	for (i = 0; i < rte_lcore_count(); i++) {
1125 		if (lcore_ids[i] == lcore_id) {
1126 			tx_q->txq_id = i;
1127 			break;
1128 		}
1129 	}
1130 
1131 	while(1) {
1132 		drain_mbuf_table(tx_q);
1133 
1134 		/*
1135 		 * Inform the configuration core that we have exited the
1136 		 * linked list and that no devices are in use if requested.
1137 		 */
1138 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1139 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1140 
1141 		/*
1142 		 * Process vhost devices
1143 		 */
1144 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1145 			      lcore_vdev_entry) {
1146 			if (unlikely(vdev->remove)) {
1147 				unlink_vmdq(vdev);
1148 				vdev->ready = DEVICE_SAFE_REMOVE;
1149 				continue;
1150 			}
1151 
1152 			if (likely(vdev->ready == DEVICE_RX))
1153 				drain_eth_rx(vdev);
1154 
1155 			if (likely(!vdev->remove))
1156 				drain_virtio_tx(vdev);
1157 		}
1158 	}
1159 
1160 	return 0;
1161 }
1162 
1163 /*
1164  * Remove a device from the specific data core linked list and from the
1165  * main linked list. Synchronization occurs through the use of the
1166  * lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
1167  * of dev->remove=1, which could otherwise cause an infinite loop in the rte_pause loop.
1168  */
1169 static void
1170 destroy_device(int vid)
1171 {
1172 	struct vhost_dev *vdev = NULL;
1173 	int lcore;
1174 
1175 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1176 		if (vdev->vid == vid)
1177 			break;
1178 	}
1179 	if (!vdev)
1180 		return;
1181 	/*set the remove flag. */
1182 	vdev->remove = 1;
1183 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1184 		rte_pause();
1185 	}
1186 
1187 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1188 		     lcore_vdev_entry);
1189 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1190 
1191 
1192 	/* Set the dev_removal_flag on each lcore. */
1193 	RTE_LCORE_FOREACH_SLAVE(lcore)
1194 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1195 
1196 	/*
1197 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1198 	 * we can be sure that they can no longer access the device removed
1199 	 * from the linked lists and that the devices are no longer in use.
1200 	 */
1201 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1202 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1203 			rte_pause();
1204 	}
1205 
1206 	lcore_info[vdev->coreid].device_num--;
1207 
1208 	RTE_LOG(INFO, VHOST_DATA,
1209 		"(%d) device has been removed from data core\n",
1210 		vdev->vid);
1211 
1212 	rte_free(vdev);
1213 }
1214 
1215 /*
1216  * A new device is added to a data core. First the device is added to the main linked list
1217  * and then allocated to a specific data core.
1218  */
1219 static int
1220 new_device(int vid)
1221 {
1222 	int lcore, core_add = 0;
1223 	uint32_t device_num_min = num_devices;
1224 	struct vhost_dev *vdev;
1225 
1226 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1227 	if (vdev == NULL) {
1228 		RTE_LOG(INFO, VHOST_DATA,
1229 			"(%d) couldn't allocate memory for vhost dev\n",
1230 			vid);
1231 		return -1;
1232 	}
1233 	vdev->vid = vid;
1234 
1235 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
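	/* Each device owns one VMDQ pool; use the first queue of that pool
	 * as its RX queue on the physical port. */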
1236 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1237 
1238 	/*reset ready flag*/
1239 	vdev->ready = DEVICE_MAC_LEARNING;
1240 	vdev->remove = 0;
1241 
1242 	/* Find a suitable lcore to add the device. */
1243 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1244 		if (lcore_info[lcore].device_num < device_num_min) {
1245 			device_num_min = lcore_info[lcore].device_num;
1246 			core_add = lcore;
1247 		}
1248 	}
1249 	vdev->coreid = core_add;
1250 
1251 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1252 			  lcore_vdev_entry);
1253 	lcore_info[vdev->coreid].device_num++;
1254 
1255 	/* Disable notifications. */
1256 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1257 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1258 
1259 	RTE_LOG(INFO, VHOST_DATA,
1260 		"(%d) device has been added to data core %d\n",
1261 		vid, vdev->coreid);
1262 
1263 	return 0;
1264 }
1265 
1266 /*
1267  * These callbacks allow devices to be added to the data core when configuration
1268  * has fully completed.
1269  */
1270 static const struct virtio_net_device_ops virtio_net_device_ops =
1271 {
1272 	.new_device =  new_device,
1273 	.destroy_device = destroy_device,
1274 };
1275 
1276 /*
1277  * This thread wakes up periodically to print stats if the user has
1278  * enabled them.
1279  */
1280 static void
1281 print_stats(void)
1282 {
1283 	struct vhost_dev *vdev;
1284 	uint64_t tx_dropped, rx_dropped;
1285 	uint64_t tx, tx_total, rx, rx_total;
1286 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1287 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1288 
1289 	while(1) {
1290 		sleep(enable_stats);
1291 
1292 		/* Clear screen and move to top left */
1293 		printf("%s%s\n", clr, top_left);
1294 		printf("Device statistics =================================\n");
1295 
1296 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1297 			tx_total   = vdev->stats.tx_total;
1298 			tx         = vdev->stats.tx;
1299 			tx_dropped = tx_total - tx;
1300 
1301 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1302 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1303 			rx_dropped = rx_total - rx;
1304 
1305 			printf("Statistics for device %d\n"
1306 				"-----------------------\n"
1307 				"TX total:              %" PRIu64 "\n"
1308 				"TX dropped:            %" PRIu64 "\n"
1309 				"TX successful:         %" PRIu64 "\n"
1310 				"RX total:              %" PRIu64 "\n"
1311 				"RX dropped:            %" PRIu64 "\n"
1312 				"RX successful:         %" PRIu64 "\n",
1313 				vdev->vid,
1314 				tx_total, tx_dropped, tx,
1315 				rx_total, rx_dropped, rx);
1316 		}
1317 
1318 		printf("===================================================\n");
1319 	}
1320 }
1321 
1322 static void
1323 unregister_drivers(int socket_num)
1324 {
1325 	int i, ret;
1326 
1327 	for (i = 0; i < socket_num; i++) {
1328 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1329 		if (ret != 0)
1330 			RTE_LOG(ERR, VHOST_CONFIG,
1331 				"Fail to unregister vhost driver for %s.\n",
1332 				socket_files + i * PATH_MAX);
1333 	}
1334 }
1335 
1336 /* When we receive SIGINT, unregister the vhost driver */
1337 static void
1338 sigint_handler(__rte_unused int signum)
1339 {
1340 	/* Unregister vhost driver. */
1341 	unregister_drivers(nb_sockets);
1342 
1343 	exit(0);
1344 }
1345 
1346 /*
1347  * While creating an mbuf pool, one key thing is to figure out how
1348  * many mbuf entries are enough for our use. FYI, here are some
1349  * guidelines:
1350  *
1351  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1352  *
1353  * - For each switch core (a CPU core that does the packet switching), we
1354  *   also need to reserve some mbufs for receiving the packets from the
1355  *   virtio Tx queue. How many are enough depends on the usage. It's
1356  *   normally a simple calculation like the following:
1357  *
1358  *       MAX_PKT_BURST * max packet size / mbuf size
1359  *
1360  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1361  *
1362  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1363  *   mbufs for receiving the packets from the physical NIC device.
1364  *
1365  * - We also need to make sure, for each switch core, we have allocated
1366  *   enough mbufs to fill up the mbuf cache.
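 *
 * As a rough example of the per-core term above (using the defaults in this
 * file): with TSO enabled the max packet size is ~64KB and the mbuf data
 * room is ~2KB, so one packet spans ~32 mbufs and a burst of 32 packets
 * needs on the order of 1K mbufs per switch core.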
1367  */
1368 static void
1369 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1370 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1371 {
1372 	uint32_t nr_mbufs;
1373 	uint32_t nr_mbufs_per_core;
1374 	uint32_t mtu = 1500;
1375 
1376 	if (mergeable)
1377 		mtu = 9000;
1378 	if (enable_tso)
1379 		mtu = 64 * 1024;
1380 
1381 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1382 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1383 	nr_mbufs_per_core += nr_rx_desc;
1384 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1385 
1386 	nr_mbufs  = nr_queues * nr_rx_desc;
1387 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1388 	nr_mbufs *= nr_port;
1389 
1390 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1391 					    nr_mbuf_cache, 0, mbuf_size,
1392 					    rte_socket_id());
1393 	if (mbuf_pool == NULL)
1394 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1395 }
1396 
1397 /*
1398  * Main function, does initialisation and calls the per-lcore functions.
1399  * The vhost-user driver is also registered here to handle vhost messages.
1400  */
1401 int
1402 main(int argc, char *argv[])
1403 {
1404 	unsigned lcore_id, core_id = 0;
1405 	unsigned nb_ports, valid_num_ports;
1406 	int ret, i;
1407 	uint8_t portid;
1408 	static pthread_t tid;
1409 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1410 	uint64_t flags = 0;
1411 
1412 	signal(SIGINT, sigint_handler);
1413 
1414 	/* init EAL */
1415 	ret = rte_eal_init(argc, argv);
1416 	if (ret < 0)
1417 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1418 	argc -= ret;
1419 	argv += ret;
1420 
1421 	/* parse app arguments */
1422 	ret = us_vhost_parse_args(argc, argv);
1423 	if (ret < 0)
1424 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1425 
1426 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1427 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1428 
1429 		if (rte_lcore_is_enabled(lcore_id))
1430 			lcore_ids[core_id++] = lcore_id;
1431 	}

1432 	if (rte_lcore_count() > RTE_MAX_LCORE)
1433 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1434 
1435 	/* Get the number of physical ports. */
1436 	nb_ports = rte_eth_dev_count();
1437 
1438 	/*
1439 	 * Update the global variable num_ports and the global array ports,
1440 	 * and get the number of valid ports according to the system port count.
1441 	 */
1442 	valid_num_ports = check_ports_num(nb_ports);
1443 
1444 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1445 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1446 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1447 		return -1;
1448 	}
1449 
1450 	/*
1451 	 * FIXME: here we are trying to allocate mbufs big enough for
1452 	 * @MAX_QUEUES, but the truth is we're never going to use that
1453 	 * many queues here. We probably should only do allocation for
1454 	 * those queues we are going to use.
1455 	 */
1456 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1457 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1458 
1459 	if (vm2vm_mode == VM2VM_HARDWARE) {
1460 		/* Enable VT loopback so the NIC's internal L2 switch forwards VM2VM traffic. */
1461 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1462 		RTE_LOG(DEBUG, VHOST_CONFIG,
1463 			"Enable loop back for L2 switch in vmdq.\n");
1464 	}
1465 
1466 	/* initialize all ports */
1467 	for (portid = 0; portid < nb_ports; portid++) {
1468 		/* skip ports that are not enabled */
1469 		if ((enabled_port_mask & (1 << portid)) == 0) {
1470 			RTE_LOG(INFO, VHOST_PORT,
1471 				"Skipping disabled port %d\n", portid);
1472 			continue;
1473 		}
1474 		if (port_init(portid) != 0)
1475 			rte_exit(EXIT_FAILURE,
1476 				"Cannot initialize network ports\n");
1477 	}
1478 
1479 	/* Enable stats if the user option is set. */
1480 	if (enable_stats) {
1481 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1482 		if (ret != 0)
1483 			rte_exit(EXIT_FAILURE,
1484 				"Cannot create print-stats thread\n");
1485 
1486 		/* Set thread_name for aid in debugging.  */
1487 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1488 		ret = rte_thread_setname(tid, thread_name);
1489 		if (ret != 0)
1490 			RTE_LOG(DEBUG, VHOST_CONFIG,
1491 				"Cannot set print-stats name\n");
1492 	}
1493 
1494 	/* Launch all data cores. */
1495 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1496 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1497 
1498 	if (mergeable == 0)
1499 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
1500 
1501 	if (client_mode)
1502 		flags |= RTE_VHOST_USER_CLIENT;
1503 
1504 	/* Register vhost user driver to handle vhost messages. */
1505 	for (i = 0; i < nb_sockets; i++) {
1506 		ret = rte_vhost_driver_register
1507 				(socket_files + i * PATH_MAX, flags);
1508 		if (ret != 0) {
1509 			unregister_drivers(i);
1510 			rte_exit(EXIT_FAILURE,
1511 				"vhost driver register failure.\n");
1512 		}
1513 	}
1514 
1515 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
1516 
1517 	/* Start the vhost-user session loop. */
1518 	rte_vhost_driver_session_start();
1519 	return 0;
1520 
1521 }
1522