xref: /dpdk/examples/vhost/main.c (revision 61e99293f6591f8ac2e0051b80fe7e1ba638ea96)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Max number of devices. Limited by vmdq. */
59 #define MAX_DEVICES 64
60 
61 /* Size of buffers used for snprintfs. */
62 #define MAX_PRINT_BUFF 6072
63 
64 /* Maximum long option length for option parsing. */
65 #define MAX_LONG_OPT_SZ 64
66 
67 /* mask of enabled ports */
68 static uint32_t enabled_port_mask = 0;
69 
70 /* Promiscuous mode */
71 static uint32_t promiscuous;
72 
73 /* number of devices/queues to support*/
74 static uint32_t num_queues = 0;
75 static uint32_t num_devices;
76 
77 static struct rte_mempool *mbuf_pool;
78 static int mergeable;
79 
80 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
81 typedef enum {
82 	VM2VM_DISABLED = 0,
83 	VM2VM_SOFTWARE = 1,
84 	VM2VM_HARDWARE = 2,
85 	VM2VM_LAST
86 } vm2vm_type;
87 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
88 
89 /* Enable stats. */
90 static uint32_t enable_stats = 0;
91 /* Enable retries on RX. */
92 static uint32_t enable_retry = 1;
93 
94 /* Disable TX checksum offload */
95 static uint32_t enable_tx_csum;
96 
97 /* Disable TSO offload */
98 static uint32_t enable_tso;
99 
100 static int client_mode;
101 static int dequeue_zero_copy;
102 
103 static int builtin_net_driver;
104 
105 /* Specify timeout (in microseconds) between retries on RX. */
106 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
107 /* Specify the number of retries on RX. */
108 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
109 
110 /* Socket file paths. Can be set by user */
111 static char *socket_files;
112 static int nb_sockets;
113 
114 /* Empty VMDQ configuration structure. Filled in programmatically. */
115 static struct rte_eth_conf vmdq_conf_default = {
116 	.rxmode = {
117 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
118 		.split_hdr_size = 0,
119 		.header_split   = 0, /**< Header Split disabled */
120 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
121 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
122 		/*
123 		 * Required for 1G NICs such as the I350: it fixes a bug
124 		 * where IPv4 forwarding in the guest cannot forward packets
125 		 * from one virtio device to another virtio device.
126 		 */
127 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
128 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
129 		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
130 	},
131 
132 	.txmode = {
133 		.mq_mode = ETH_MQ_TX_NONE,
134 	},
135 	.rx_adv_conf = {
136 		/*
137 		 * should be overridden separately in code with
138 		 * appropriate values
139 		 */
140 		.vmdq_rx_conf = {
141 			.nb_queue_pools = ETH_8_POOLS,
142 			.enable_default_pool = 0,
143 			.default_pool = 0,
144 			.nb_pool_maps = 0,
145 			.pool_map = {{0, 0},},
146 		},
147 	},
148 };
149 
150 static unsigned lcore_ids[RTE_MAX_LCORE];
151 static uint16_t ports[RTE_MAX_ETHPORTS];
152 static unsigned num_ports = 0; /**< The number of ports specified in command line */
153 static uint16_t num_pf_queues, num_vmdq_queues;
154 static uint16_t vmdq_pool_base, vmdq_queue_base;
155 static uint16_t queues_per_pool;
156 
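/*
 * VLAN tags assigned to the VMDq pools: pool/device i receives traffic
 * tagged with vlan_tags[i] (see get_eth_conf() and link_vmdq()).
 */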
157 const uint16_t vlan_tags[] = {
158 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
159 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
160 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
161 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
162 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
163 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
164 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
165 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
166 };
167 
168 /* ethernet addresses of ports */
169 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
170 
171 static struct vhost_dev_tailq_list vhost_dev_list =
172 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
173 
174 static struct lcore_info lcore_info[RTE_MAX_LCORE];
175 
176 /* Used for queueing bursts of TX packets. */
177 struct mbuf_table {
178 	unsigned len;
179 	unsigned txq_id;
180 	struct rte_mbuf *m_table[MAX_PKT_BURST];
181 };
182 
183 /* TX queue for each data core. */
184 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
185 
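/*
 * Drain period for the per-core TX mbuf table, expressed in TSC cycles:
 * the TSC rate is rounded up to whole cycles per microsecond and then
 * multiplied by BURST_TX_DRAIN_US (~100 us).
 */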
186 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
187 				 / US_PER_S * BURST_TX_DRAIN_US)
188 #define VLAN_HLEN       4
189 
190 /*
191  * Builds up the correct configuration for VMDQ VLAN pool map
192  * according to the pool & queue limits.
193  */
194 static inline int
195 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
196 {
197 	struct rte_eth_vmdq_rx_conf conf;
198 	struct rte_eth_vmdq_rx_conf *def_conf =
199 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
200 	unsigned i;
201 
202 	memset(&conf, 0, sizeof(conf));
203 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
204 	conf.nb_pool_maps = num_devices;
205 	conf.enable_loop_back = def_conf->enable_loop_back;
206 	conf.rx_mode = def_conf->rx_mode;
207 
208 	for (i = 0; i < conf.nb_pool_maps; i++) {
209 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
210 		conf.pool_map[i].pools = (1UL << i);
211 	}
212 
213 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
214 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
215 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
216 	return 0;
217 }
218 
219 /*
220  * Validate the device number against the max pool number obtained from
221  * dev_info. If the device number is invalid, print an error message and
222  * return -1. Each device must have its own pool.
223  */
224 static inline int
225 validate_num_devices(uint32_t max_nb_devices)
226 {
227 	if (num_devices > max_nb_devices) {
228 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
229 		return -1;
230 	}
231 	return 0;
232 }
233 
234 /*
235  * Initialises a given port using global settings and with the rx buffers
236  * coming from the global mbuf_pool.
237  */
238 static inline int
239 port_init(uint16_t port)
240 {
241 	struct rte_eth_dev_info dev_info;
242 	struct rte_eth_conf port_conf;
243 	struct rte_eth_rxconf *rxconf;
244 	struct rte_eth_txconf *txconf;
245 	int16_t rx_rings, tx_rings;
246 	uint16_t rx_ring_size, tx_ring_size;
247 	int retval;
248 	uint16_t q;
249 
250 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
251 	rte_eth_dev_info_get (port, &dev_info);
252 
253 	rxconf = &dev_info.default_rxconf;
254 	txconf = &dev_info.default_txconf;
255 	rxconf->rx_drop_en = 1;
256 
257 	/* Enable vlan offload */
258 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
259 
260 	/*configure the number of supported virtio devices based on VMDQ limits */
261 	num_devices = dev_info.max_vmdq_pools;
262 
263 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
264 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
265 
266 	/*
267 	 * When dequeue zero copy is enabled, guest Tx used vring will be
268 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
269 	 * (tx_ring_size here) must be small enough so that the driver will
270 	 * hit the free threshold easily and free mbufs timely. Otherwise,
271 	 * guest Tx vring would be starved.
272 	 */
273 	if (dequeue_zero_copy)
274 		tx_ring_size = 64;
275 
276 	tx_rings = (uint16_t)rte_lcore_count();
277 
278 	retval = validate_num_devices(MAX_DEVICES);
279 	if (retval < 0)
280 		return retval;
281 
282 	/* Get port configuration. */
283 	retval = get_eth_conf(&port_conf, num_devices);
284 	if (retval < 0)
285 		return retval;
286 	/* NIC queues are divided into pf queues and vmdq queues.  */
287 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
288 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
289 	num_vmdq_queues = num_devices * queues_per_pool;
290 	num_queues = num_pf_queues + num_vmdq_queues;
291 	vmdq_queue_base = dev_info.vmdq_queue_base;
292 	vmdq_pool_base  = dev_info.vmdq_pool_base;
293 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
294 		num_pf_queues, num_devices, queues_per_pool);
295 
296 	if (port >= rte_eth_dev_count()) return -1;
297 
298 	rx_rings = (uint16_t)dev_info.max_rx_queues;
299 	/* Configure ethernet device. */
300 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
301 	if (retval != 0) {
302 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
303 			port, strerror(-retval));
304 		return retval;
305 	}
306 
307 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
308 		&tx_ring_size);
309 	if (retval != 0) {
310 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
311 			"for port %u: %s.\n", port, strerror(-retval));
312 		return retval;
313 	}
314 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
315 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
316 			"for Rx queues on port %u.\n", port);
317 		return -1;
318 	}
319 
320 	/* Setup the queues. */
321 	for (q = 0; q < rx_rings; q ++) {
322 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
323 						rte_eth_dev_socket_id(port),
324 						rxconf,
325 						mbuf_pool);
326 		if (retval < 0) {
327 			RTE_LOG(ERR, VHOST_PORT,
328 				"Failed to setup rx queue %u of port %u: %s.\n",
329 				q, port, strerror(-retval));
330 			return retval;
331 		}
332 	}
333 	for (q = 0; q < tx_rings; q ++) {
334 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
335 						rte_eth_dev_socket_id(port),
336 						txconf);
337 		if (retval < 0) {
338 			RTE_LOG(ERR, VHOST_PORT,
339 				"Failed to setup tx queue %u of port %u: %s.\n",
340 				q, port, strerror(-retval));
341 			return retval;
342 		}
343 	}
344 
345 	/* Start the device. */
346 	retval  = rte_eth_dev_start(port);
347 	if (retval < 0) {
348 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
349 			port, strerror(-retval));
350 		return retval;
351 	}
352 
353 	if (promiscuous)
354 		rte_eth_promiscuous_enable(port);
355 
356 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
357 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
358 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
359 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
360 			port,
361 			vmdq_ports_eth_addr[port].addr_bytes[0],
362 			vmdq_ports_eth_addr[port].addr_bytes[1],
363 			vmdq_ports_eth_addr[port].addr_bytes[2],
364 			vmdq_ports_eth_addr[port].addr_bytes[3],
365 			vmdq_ports_eth_addr[port].addr_bytes[4],
366 			vmdq_ports_eth_addr[port].addr_bytes[5]);
367 
368 	return 0;
369 }
370 
371 /*
372  * Set socket file path.
373  */
374 static int
375 us_vhost_parse_socket_path(const char *q_arg)
376 {
377 	/* reject socket paths that do not fit in PATH_MAX */
378 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
379 		return -1;
380 
381 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
382 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
383 	nb_sockets++;
384 
385 	return 0;
386 }
387 
388 /*
389  * Parse the portmask provided at run time.
390  */
391 static int
392 parse_portmask(const char *portmask)
393 {
394 	char *end = NULL;
395 	unsigned long pm;
396 
397 	errno = 0;
398 
399 	/* parse hexadecimal string */
400 	pm = strtoul(portmask, &end, 16);
401 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
402 		return -1;
403 
404 	if (pm == 0)
405 		return -1;
406 
407 	return pm;
408 
409 }
410 
411 /*
412  * Parse num options at run time.
413  */
414 static int
415 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
416 {
417 	char *end = NULL;
418 	unsigned long num;
419 
420 	errno = 0;
421 
422 	/* parse unsigned int string */
423 	num = strtoul(q_arg, &end, 10);
424 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
425 		return -1;
426 
427 	if (num > max_valid_value)
428 		return -1;
429 
430 	return num;
431 
432 }
433 
434 /*
435  * Display usage
436  */
437 static void
438 us_vhost_usage(const char *prgname)
439 {
440 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
441 	"		--vm2vm [0|1|2]\n"
442 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
443 	"		--socket-file <path>\n"
444 	"		--nb-devices ND\n"
445 	"		-p PORTMASK: Set mask for ports to be used by application\n"
446 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
447 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
448 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Effective only if RX retries are enabled\n"
449 	"		--rx-retry-num [0-N]: the number of retries on RX. Effective only if RX retries are enabled\n"
450 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
451 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
452 	"		--socket-file: The path of the socket file.\n"
453 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
454 	"		--tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
455 	"		--client register a vhost-user socket as client mode.\n"
456 	"		--dequeue-zero-copy enables dequeue zero copy\n",
457 	       prgname);
458 }
459 
460 /*
461  * Parse the arguments given in the command line of the application.
462  */
463 static int
464 us_vhost_parse_args(int argc, char **argv)
465 {
466 	int opt, ret;
467 	int option_index;
468 	unsigned i;
469 	const char *prgname = argv[0];
470 	static struct option long_option[] = {
471 		{"vm2vm", required_argument, NULL, 0},
472 		{"rx-retry", required_argument, NULL, 0},
473 		{"rx-retry-delay", required_argument, NULL, 0},
474 		{"rx-retry-num", required_argument, NULL, 0},
475 		{"mergeable", required_argument, NULL, 0},
476 		{"stats", required_argument, NULL, 0},
477 		{"socket-file", required_argument, NULL, 0},
478 		{"tx-csum", required_argument, NULL, 0},
479 		{"tso", required_argument, NULL, 0},
480 		{"client", no_argument, &client_mode, 1},
481 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
482 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
483 		{NULL, 0, 0, 0},
484 	};
485 
486 	/* Parse command line */
487 	while ((opt = getopt_long(argc, argv, "p:P",
488 			long_option, &option_index)) != EOF) {
489 		switch (opt) {
490 		/* Portmask */
491 		case 'p':
492 			enabled_port_mask = parse_portmask(optarg);
493 			if (enabled_port_mask == 0) {
494 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
495 				us_vhost_usage(prgname);
496 				return -1;
497 			}
498 			break;
499 
500 		case 'P':
501 			promiscuous = 1;
502 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
503 				ETH_VMDQ_ACCEPT_BROADCAST |
504 				ETH_VMDQ_ACCEPT_MULTICAST;
505 
506 			break;
507 
508 		case 0:
509 			/* Enable/disable vm2vm comms. */
510 			if (!strncmp(long_option[option_index].name, "vm2vm",
511 				MAX_LONG_OPT_SZ)) {
512 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
513 				if (ret == -1) {
514 					RTE_LOG(INFO, VHOST_CONFIG,
515 						"Invalid argument for "
516 						"vm2vm [0|1|2]\n");
517 					us_vhost_usage(prgname);
518 					return -1;
519 				} else {
520 					vm2vm_mode = (vm2vm_type)ret;
521 				}
522 			}
523 
524 			/* Enable/disable retries on RX. */
525 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
526 				ret = parse_num_opt(optarg, 1);
527 				if (ret == -1) {
528 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
529 					us_vhost_usage(prgname);
530 					return -1;
531 				} else {
532 					enable_retry = ret;
533 				}
534 			}
535 
536 			/* Enable/disable TX checksum offload. */
537 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
538 				ret = parse_num_opt(optarg, 1);
539 				if (ret == -1) {
540 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
541 					us_vhost_usage(prgname);
542 					return -1;
543 				} else
544 					enable_tx_csum = ret;
545 			}
546 
547 			/* Enable/disable TSO offload. */
548 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
549 				ret = parse_num_opt(optarg, 1);
550 				if (ret == -1) {
551 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
552 					us_vhost_usage(prgname);
553 					return -1;
554 				} else
555 					enable_tso = ret;
556 			}
557 
558 			/* Specify the retry delay time (in microseconds) on RX. */
559 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
560 				ret = parse_num_opt(optarg, INT32_MAX);
561 				if (ret == -1) {
562 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
563 					us_vhost_usage(prgname);
564 					return -1;
565 				} else {
566 					burst_rx_delay_time = ret;
567 				}
568 			}
569 
570 			/* Specify the number of retries on RX. */
571 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
572 				ret = parse_num_opt(optarg, INT32_MAX);
573 				if (ret == -1) {
574 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
575 					us_vhost_usage(prgname);
576 					return -1;
577 				} else {
578 					burst_rx_retry_num = ret;
579 				}
580 			}
581 
582 			/* Enable/disable RX mergeable buffers. */
583 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
584 				ret = parse_num_opt(optarg, 1);
585 				if (ret == -1) {
586 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
587 					us_vhost_usage(prgname);
588 					return -1;
589 				} else {
590 					mergeable = !!ret;
591 					if (ret) {
592 						vmdq_conf_default.rxmode.jumbo_frame = 1;
593 						vmdq_conf_default.rxmode.max_rx_pkt_len
594 							= JUMBO_FRAME_MAX_SIZE;
595 					}
596 				}
597 			}
598 
599 			/* Enable/disable stats. */
600 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
601 				ret = parse_num_opt(optarg, INT32_MAX);
602 				if (ret == -1) {
603 					RTE_LOG(INFO, VHOST_CONFIG,
604 						"Invalid argument for stats [0..N]\n");
605 					us_vhost_usage(prgname);
606 					return -1;
607 				} else {
608 					enable_stats = ret;
609 				}
610 			}
611 
612 			/* Set socket file path. */
613 			if (!strncmp(long_option[option_index].name,
614 						"socket-file", MAX_LONG_OPT_SZ)) {
615 				if (us_vhost_parse_socket_path(optarg) == -1) {
616 					RTE_LOG(INFO, VHOST_CONFIG,
617 					"Invalid argument for socket name (Max %d characters)\n",
618 					PATH_MAX);
619 					us_vhost_usage(prgname);
620 					return -1;
621 				}
622 			}
623 
624 			break;
625 
626 			/* Invalid option - print options. */
627 		default:
628 			us_vhost_usage(prgname);
629 			return -1;
630 		}
631 	}
632 
633 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
634 		if (enabled_port_mask & (1 << i))
635 			ports[num_ports++] = i;
636 	}
637 
638 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
639 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
640 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
641 		return -1;
642 	}
643 
644 	return 0;
645 }
646 
647 /*
648  * Update the global variable num_ports and the ports[] array according to the
649  * number of system ports, and return the number of valid ports.
650  */
651 static unsigned check_ports_num(unsigned nb_ports)
652 {
653 	unsigned valid_num_ports = num_ports;
654 	unsigned portid;
655 
656 	if (num_ports > nb_ports) {
657 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
658 			num_ports, nb_ports);
659 		num_ports = nb_ports;
660 	}
661 
662 	for (portid = 0; portid < num_ports; portid ++) {
663 		if (ports[portid] >= nb_ports) {
664 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
665 				ports[portid], (nb_ports - 1));
666 			ports[portid] = INVALID_PORT_ID;
667 			valid_num_ports--;
668 		}
669 	}
670 	return valid_num_ports;
671 }
672 
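/*
 * Look up a vhost device by MAC address among the devices that are ready
 * for RX; returns NULL if no such device is known.
 */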
673 static __rte_always_inline struct vhost_dev *
674 find_vhost_dev(struct ether_addr *mac)
675 {
676 	struct vhost_dev *vdev;
677 
678 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
679 		if (vdev->ready == DEVICE_RX &&
680 		    is_same_ether_addr(mac, &vdev->mac_address))
681 			return vdev;
682 	}
683 
684 	return NULL;
685 }
686 
687 /*
688  * This function learns the MAC address of the device and registers it, along
689  * with a vlan tag, with VMDQ.
690  */
691 static int
692 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
693 {
694 	struct ether_hdr *pkt_hdr;
695 	int i, ret;
696 
697 	/* Learn MAC address of guest device from packet */
698 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
699 
700 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
701 		RTE_LOG(ERR, VHOST_DATA,
702 			"(%d) device is using a registered MAC!\n",
703 			vdev->vid);
704 		return -1;
705 	}
706 
707 	for (i = 0; i < ETHER_ADDR_LEN; i++)
708 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
709 
710 	/* vlan_tag currently uses the device_id. */
711 	vdev->vlan_tag = vlan_tags[vdev->vid];
712 
713 	/* Print out VMDQ registration info. */
714 	RTE_LOG(INFO, VHOST_DATA,
715 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
716 		vdev->vid,
717 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
718 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
719 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
720 		vdev->vlan_tag);
721 
722 	/* Register the MAC address. */
723 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
724 				(uint32_t)vdev->vid + vmdq_pool_base);
725 	if (ret)
726 		RTE_LOG(ERR, VHOST_DATA,
727 			"(%d) failed to add device MAC address to VMDQ\n",
728 			vdev->vid);
729 
730 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
731 
732 	/* Set device as ready for RX. */
733 	vdev->ready = DEVICE_RX;
734 
735 	return 0;
736 }
737 
738 /*
739  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
740  * queue before disabling RX on the device.
741  */
742 static inline void
743 unlink_vmdq(struct vhost_dev *vdev)
744 {
745 	unsigned i = 0;
746 	unsigned rx_count;
747 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
748 
749 	if (vdev->ready == DEVICE_RX) {
750 		/*clear MAC and VLAN settings*/
751 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
752 		for (i = 0; i < 6; i++)
753 			vdev->mac_address.addr_bytes[i] = 0;
754 
755 		vdev->vlan_tag = 0;
756 
757 		/*Clear out the receive buffers*/
758 		rx_count = rte_eth_rx_burst(ports[0],
759 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
760 
761 		while (rx_count) {
762 			for (i = 0; i < rx_count; i++)
763 				rte_pktmbuf_free(pkts_burst[i]);
764 
765 			rx_count = rte_eth_rx_burst(ports[0],
766 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
767 		}
768 
769 		vdev->ready = DEVICE_MAC_LEARNING;
770 	}
771 }
772 
773 static __rte_always_inline void
774 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
775 	    struct rte_mbuf *m)
776 {
777 	uint16_t ret;
778 
779 	if (builtin_net_driver) {
780 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
781 	} else {
782 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
783 	}
784 
785 	if (enable_stats) {
786 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
787 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
788 		src_vdev->stats.tx_total++;
789 		src_vdev->stats.tx += ret;
790 	}
791 }
792 
793 /*
794  * Check if the packet destination MAC address is for a local device. If so then put
795  * the packet on that device's RX queue. If not then return.
796  */
797 static __rte_always_inline int
798 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
799 {
800 	struct ether_hdr *pkt_hdr;
801 	struct vhost_dev *dst_vdev;
802 
803 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
804 
805 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
806 	if (!dst_vdev)
807 		return -1;
808 
809 	if (vdev->vid == dst_vdev->vid) {
810 		RTE_LOG_DP(DEBUG, VHOST_DATA,
811 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
812 			vdev->vid);
813 		return 0;
814 	}
815 
816 	RTE_LOG_DP(DEBUG, VHOST_DATA,
817 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
818 
819 	if (unlikely(dst_vdev->remove)) {
820 		RTE_LOG_DP(DEBUG, VHOST_DATA,
821 			"(%d) device is marked for removal\n", dst_vdev->vid);
822 		return 0;
823 	}
824 
825 	virtio_xmit(dst_vdev, vdev, m);
826 	return 0;
827 }
828 
829 /*
830  * Check if the destination MAC of a packet belongs to a local VM, and if
831  * so get its vlan tag and the length offset.
832  */
833 static __rte_always_inline int
834 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
835 	uint32_t *offset, uint16_t *vlan_tag)
836 {
837 	struct vhost_dev *dst_vdev;
838 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
839 
840 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
841 	if (!dst_vdev)
842 		return 0;
843 
844 	if (vdev->vid == dst_vdev->vid) {
845 		RTE_LOG_DP(DEBUG, VHOST_DATA,
846 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
847 			vdev->vid);
848 		return -1;
849 	}
850 
851 	/*
852 	 * HW vlan strip reduces the packet length by the length
853 	 * of the vlan tag, so the packet length needs to be
854 	 * restored by adding it back.
855 	 */
856 	*offset  = VLAN_HLEN;
857 	*vlan_tag = vlan_tags[vdev->vid];
858 
859 	RTE_LOG_DP(DEBUG, VHOST_DATA,
860 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
861 		vdev->vid, dst_vdev->vid, *vlan_tag);
862 
863 	return 0;
864 }
865 
866 static uint16_t
867 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
868 {
869 	if (ol_flags & PKT_TX_IPV4)
870 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
871 	else /* assume ethertype == ETHER_TYPE_IPv6 */
872 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
873 }
874 
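/*
 * Prepare the offload fields required for TSO (PKT_TX_TCP_SEG): zero the
 * IPv4 header checksum so hardware can recompute it, and seed the TCP
 * checksum with the pseudo-header checksum as the ethdev API expects.
 */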
875 static void virtio_tx_offload(struct rte_mbuf *m)
876 {
877 	void *l3_hdr;
878 	struct ipv4_hdr *ipv4_hdr = NULL;
879 	struct tcp_hdr *tcp_hdr = NULL;
880 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
881 
882 	l3_hdr = (char *)eth_hdr + m->l2_len;
883 
884 	if (m->ol_flags & PKT_TX_IPV4) {
885 		ipv4_hdr = l3_hdr;
886 		ipv4_hdr->hdr_checksum = 0;
887 		m->ol_flags |= PKT_TX_IP_CKSUM;
888 	}
889 
890 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
891 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
892 }
893 
894 static inline void
895 free_pkts(struct rte_mbuf **pkts, uint16_t n)
896 {
897 	while (n--)
898 		rte_pktmbuf_free(pkts[n]);
899 }
900 
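/*
 * Transmit the buffered packets of a TX mbuf table on the physical port
 * and free any mbufs the NIC did not accept.
 */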
901 static __rte_always_inline void
902 do_drain_mbuf_table(struct mbuf_table *tx_q)
903 {
904 	uint16_t count;
905 
906 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
907 				 tx_q->m_table, tx_q->len);
908 	if (unlikely(count < tx_q->len))
909 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
910 
911 	tx_q->len = 0;
912 }
913 
914 /*
915  * This function routes the TX packet to the correct interface. This
916  * may be a local device or the physical port.
917  */
918 static __rte_always_inline void
919 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
920 {
921 	struct mbuf_table *tx_q;
922 	unsigned offset = 0;
923 	const uint16_t lcore_id = rte_lcore_id();
924 	struct ether_hdr *nh;
925 
926 
927 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
928 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
929 		struct vhost_dev *vdev2;
930 
931 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
932 			if (vdev2 != vdev)
933 				virtio_xmit(vdev2, vdev, m);
934 		}
935 		goto queue2nic;
936 	}
937 
938 	/*check if destination is local VM*/
939 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
940 		rte_pktmbuf_free(m);
941 		return;
942 	}
943 
944 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
945 		if (unlikely(find_local_dest(vdev, m, &offset,
946 					     &vlan_tag) != 0)) {
947 			rte_pktmbuf_free(m);
948 			return;
949 		}
950 	}
951 
952 	RTE_LOG_DP(DEBUG, VHOST_DATA,
953 		"(%d) TX: MAC address is external\n", vdev->vid);
954 
955 queue2nic:
956 
957 	/*Add packet to the port tx queue*/
958 	tx_q = &lcore_tx_queue[lcore_id];
959 
960 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
961 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
962 		/* Guest has inserted the vlan tag. */
963 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
964 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
965 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
966 			(vh->vlan_tci != vlan_tag_be))
967 			vh->vlan_tci = vlan_tag_be;
968 	} else {
969 		m->ol_flags |= PKT_TX_VLAN_PKT;
970 
971 		/*
972 		 * Find the right seg to adjust the data len when offset is
973 		 * bigger than tail room size.
974 		 */
975 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
976 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
977 				m->data_len += offset;
978 			else {
979 				struct rte_mbuf *seg = m;
980 
981 				while ((seg->next != NULL) &&
982 					(offset > rte_pktmbuf_tailroom(seg)))
983 					seg = seg->next;
984 
985 				seg->data_len += offset;
986 			}
987 			m->pkt_len += offset;
988 		}
989 
990 		m->vlan_tci = vlan_tag;
991 	}
992 
993 	if (m->ol_flags & PKT_TX_TCP_SEG)
994 		virtio_tx_offload(m);
995 
996 	tx_q->m_table[tx_q->len++] = m;
997 	if (enable_stats) {
998 		vdev->stats.tx_total++;
999 		vdev->stats.tx++;
1000 	}
1001 
1002 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1003 		do_drain_mbuf_table(tx_q);
1004 }
1005 
1006 
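/*
 * Flush the per-core TX mbuf table to the physical port if it has not been
 * drained within the last MBUF_TABLE_DRAIN_TSC cycles.
 */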
1007 static __rte_always_inline void
1008 drain_mbuf_table(struct mbuf_table *tx_q)
1009 {
1010 	static uint64_t prev_tsc;
1011 	uint64_t cur_tsc;
1012 
1013 	if (tx_q->len == 0)
1014 		return;
1015 
1016 	cur_tsc = rte_rdtsc();
1017 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1018 		prev_tsc = cur_tsc;
1019 
1020 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1021 			"TX queue drained after timeout with burst size %u\n",
1022 			tx_q->len);
1023 		do_drain_mbuf_table(tx_q);
1024 	}
1025 }
1026 
1027 static __rte_always_inline void
1028 drain_eth_rx(struct vhost_dev *vdev)
1029 {
1030 	uint16_t rx_count, enqueue_count;
1031 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1032 
1033 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1034 				    pkts, MAX_PKT_BURST);
1035 	if (!rx_count)
1036 		return;
1037 
1038 	/*
1039 	 * When "enable_retry" is set, here we wait and retry when there
1040 	 * are not enough free slots in the queue to hold @rx_count packets,
1041 	 * to diminish packet loss.
1042 	 */
1043 	if (enable_retry &&
1044 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1045 			VIRTIO_RXQ))) {
1046 		uint32_t retry;
1047 
1048 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1049 			rte_delay_us(burst_rx_delay_time);
1050 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1051 					VIRTIO_RXQ))
1052 				break;
1053 		}
1054 	}
1055 
1056 	if (builtin_net_driver) {
1057 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1058 						pkts, rx_count);
1059 	} else {
1060 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1061 						pkts, rx_count);
1062 	}
1063 	if (enable_stats) {
1064 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1065 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1066 	}
1067 
1068 	free_pkts(pkts, rx_count);
1069 }
1070 
1071 static __rte_always_inline void
1072 drain_virtio_tx(struct vhost_dev *vdev)
1073 {
1074 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1075 	uint16_t count;
1076 	uint16_t i;
1077 
1078 	if (builtin_net_driver) {
1079 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1080 					pkts, MAX_PKT_BURST);
1081 	} else {
1082 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1083 					mbuf_pool, pkts, MAX_PKT_BURST);
1084 	}
1085 
1086 	/* setup VMDq for the first packet */
1087 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1088 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) {
1089 			free_pkts(pkts, count);
			/* the mbufs were just freed; do not route them below */
			return;
		}
1090 	}
1091 
1092 	for (i = 0; i < count; ++i)
1093 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1094 }
1095 
1096 /*
1097  * Main function of vhost-switch. It basically does:
1098  *
1099  * for each vhost device {
1100  *    - drain_eth_rx()
1101  *
1102  *      Which drains the host eth Rx queue linked to the vhost device,
1103  *      and delivers all of them to the guest virtio Rx ring associated with
1104  *      this vhost device.
1105  *
1106  *    - drain_virtio_tx()
1107  *
1108  *      Which drains the guest virtio Tx queue and delivers all of them
1109  *      to the target, which could be another vhost device, or the
1110  *      physical eth dev. The route is done in function "virtio_tx_route".
1111  * }
1112  */
1113 static int
1114 switch_worker(void *arg __rte_unused)
1115 {
1116 	unsigned i;
1117 	unsigned lcore_id = rte_lcore_id();
1118 	struct vhost_dev *vdev;
1119 	struct mbuf_table *tx_q;
1120 
1121 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1122 
1123 	tx_q = &lcore_tx_queue[lcore_id];
1124 	for (i = 0; i < rte_lcore_count(); i++) {
1125 		if (lcore_ids[i] == lcore_id) {
1126 			tx_q->txq_id = i;
1127 			break;
1128 		}
1129 	}
1130 
1131 	while(1) {
1132 		drain_mbuf_table(tx_q);
1133 
1134 		/*
1135 		 * Inform the configuration core that we have exited the
1136 		 * linked list and that no devices are in use if requested.
1137 		 */
1138 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1139 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1140 
1141 		/*
1142 		 * Process vhost devices
1143 		 */
1144 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1145 			      lcore_vdev_entry) {
1146 			if (unlikely(vdev->remove)) {
1147 				unlink_vmdq(vdev);
1148 				vdev->ready = DEVICE_SAFE_REMOVE;
1149 				continue;
1150 			}
1151 
1152 			if (likely(vdev->ready == DEVICE_RX))
1153 				drain_eth_rx(vdev);
1154 
1155 			if (likely(!vdev->remove))
1156 				drain_virtio_tx(vdev);
1157 		}
1158 	}
1159 
1160 	return 0;
1161 }
1162 
1163 /*
1164  * Remove a device from the specific data core linked list and from the
1165  * main linked list. Synchronization occurs through the use of the
1166  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1167  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1168  */
1169 static void
1170 destroy_device(int vid)
1171 {
1172 	struct vhost_dev *vdev = NULL;
1173 	int lcore;
1174 
1175 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1176 		if (vdev->vid == vid)
1177 			break;
1178 	}
1179 	if (!vdev)
1180 		return;
1181 	/*set the remove flag. */
1182 	vdev->remove = 1;
1183 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1184 		rte_pause();
1185 	}
1186 
1187 	if (builtin_net_driver)
1188 		vs_vhost_net_remove(vdev);
1189 
1190 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1191 		     lcore_vdev_entry);
1192 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1193 
1194 
1195 	/* Set the dev_removal_flag on each lcore. */
1196 	RTE_LCORE_FOREACH_SLAVE(lcore)
1197 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1198 
1199 	/*
1200 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1201 	 * we can be sure that they can no longer access the device removed
1202 	 * from the linked lists and that the devices are no longer in use.
1203 	 */
1204 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1205 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1206 			rte_pause();
1207 	}
1208 
1209 	lcore_info[vdev->coreid].device_num--;
1210 
1211 	RTE_LOG(INFO, VHOST_DATA,
1212 		"(%d) device has been removed from data core\n",
1213 		vdev->vid);
1214 
1215 	rte_free(vdev);
1216 }
1217 
1218 /*
1219  * A new device is added to a data core. First the device is added to the main linked list
1220  * and then allocated to a specific data core.
1221  */
1222 static int
1223 new_device(int vid)
1224 {
1225 	int lcore, core_add = 0;
1226 	uint32_t device_num_min = num_devices;
1227 	struct vhost_dev *vdev;
1228 
1229 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1230 	if (vdev == NULL) {
1231 		RTE_LOG(INFO, VHOST_DATA,
1232 			"(%d) couldn't allocate memory for vhost dev\n",
1233 			vid);
1234 		return -1;
1235 	}
1236 	vdev->vid = vid;
1237 
1238 	if (builtin_net_driver)
1239 		vs_vhost_net_setup(vdev);
1240 
1241 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
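	/* Each new device owns the first Rx queue of its VMDq pool. */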
1242 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1243 
1244 	/*reset ready flag*/
1245 	vdev->ready = DEVICE_MAC_LEARNING;
1246 	vdev->remove = 0;
1247 
1248 	/* Find a suitable lcore to add the device. */
1249 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1250 		if (lcore_info[lcore].device_num < device_num_min) {
1251 			device_num_min = lcore_info[lcore].device_num;
1252 			core_add = lcore;
1253 		}
1254 	}
1255 	vdev->coreid = core_add;
1256 
1257 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1258 			  lcore_vdev_entry);
1259 	lcore_info[vdev->coreid].device_num++;
1260 
1261 	/* Disable notifications. */
1262 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1263 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1264 
1265 	RTE_LOG(INFO, VHOST_DATA,
1266 		"(%d) device has been added to data core %d\n",
1267 		vid, vdev->coreid);
1268 
1269 	return 0;
1270 }
1271 
1272 /*
1273  * These callbacks allow devices to be added to the data core when configuration
1274  * is fully complete.
1275  */
1276 static const struct vhost_device_ops virtio_net_device_ops =
1277 {
1278 	.new_device =  new_device,
1279 	.destroy_device = destroy_device,
1280 };
1281 
1282 /*
1283  * This is a thread that wakes up after a period to print stats if the user has
1284  * enabled them.
1285  */
1286 static void
1287 print_stats(void)
1288 {
1289 	struct vhost_dev *vdev;
1290 	uint64_t tx_dropped, rx_dropped;
1291 	uint64_t tx, tx_total, rx, rx_total;
1292 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1293 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1294 
1295 	while(1) {
1296 		sleep(enable_stats);
1297 
1298 		/* Clear screen and move to top left */
1299 		printf("%s%s\n", clr, top_left);
1300 		printf("Device statistics =================================\n");
1301 
1302 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1303 			tx_total   = vdev->stats.tx_total;
1304 			tx         = vdev->stats.tx;
1305 			tx_dropped = tx_total - tx;
1306 
1307 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1308 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1309 			rx_dropped = rx_total - rx;
1310 
1311 			printf("Statistics for device %d\n"
1312 				"-----------------------\n"
1313 				"TX total:              %" PRIu64 "\n"
1314 				"TX dropped:            %" PRIu64 "\n"
1315 				"TX successful:         %" PRIu64 "\n"
1316 				"RX total:              %" PRIu64 "\n"
1317 				"RX dropped:            %" PRIu64 "\n"
1318 				"RX successful:         %" PRIu64 "\n",
1319 				vdev->vid,
1320 				tx_total, tx_dropped, tx,
1321 				rx_total, rx_dropped, rx);
1322 		}
1323 
1324 		printf("===================================================\n");
1325 	}
1326 }
1327 
1328 static void
1329 unregister_drivers(int socket_num)
1330 {
1331 	int i, ret;
1332 
1333 	for (i = 0; i < socket_num; i++) {
1334 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1335 		if (ret != 0)
1336 			RTE_LOG(ERR, VHOST_CONFIG,
1337 				"Fail to unregister vhost driver for %s.\n",
1338 				socket_files + i * PATH_MAX);
1339 	}
1340 }
1341 
1342 /* When we receive an INT signal, unregister the vhost driver */
1343 static void
1344 sigint_handler(__rte_unused int signum)
1345 {
1346 	/* Unregister vhost driver. */
1347 	unregister_drivers(nb_sockets);
1348 
1349 	exit(0);
1350 }
1351 
1352 /*
1353  * While creating an mbuf pool, one key thing is to figure out how
1354  * many mbuf entries are enough for our use. FYI, here are some
1355  * guidelines:
1356  *
1357  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1358  *
1359  * - For each switch core (a CPU core that does the packet switching), we
1360  *   also need to make some reservation for receiving the packets from the
1361  *   virtio Tx queue. How many are enough depends on the usage. It's
1362  *   normally a simple calculation like the following:
1363  *
1364  *       MAX_PKT_BURST * max packet size / mbuf size
1365  *
1366  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1367  *
1368  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1369  *   mbufs for receiving the packets from the physical NIC device.
1370  *
1371  * - We also need to make sure that, for each switch core, we have
1372  *   allocated enough mbufs to fill up the mbuf cache.
1373  */
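/*
 * Illustrative numbers for the guidelines above (assumed defaults, not
 * taken from this file: MAX_PKT_BURST of 32, 2048-byte mbuf data room,
 * 128-byte headroom; check main.h and the build config for real values):
 * with TSO and mergeable buffers off, mtu = 1500, so each switch core
 * needs roughly (1500 + 2176) * 32 / 2048 ~= 57 mbufs for virtio Tx,
 * plus nr_rx_desc (1024 by default here) for the physical NIC, and at
 * least nr_mbuf_cache in total.
 */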
1374 static void
1375 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1376 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1377 {
1378 	uint32_t nr_mbufs;
1379 	uint32_t nr_mbufs_per_core;
1380 	uint32_t mtu = 1500;
1381 
1382 	if (mergeable)
1383 		mtu = 9000;
1384 	if (enable_tso)
1385 		mtu = 64 * 1024;
1386 
1387 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1388 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1389 	nr_mbufs_per_core += nr_rx_desc;
1390 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1391 
1392 	nr_mbufs  = nr_queues * nr_rx_desc;
1393 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1394 	nr_mbufs *= nr_port;
1395 
1396 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1397 					    nr_mbuf_cache, 0, mbuf_size,
1398 					    rte_socket_id());
1399 	if (mbuf_pool == NULL)
1400 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1401 }
1402 
1403 /*
1404  * Main function, does initialisation and calls the per-lcore functions.
1405  */
1406 int
1407 main(int argc, char *argv[])
1408 {
1409 	unsigned lcore_id, core_id = 0;
1410 	unsigned nb_ports, valid_num_ports;
1411 	int ret, i;
1412 	uint16_t portid;
1413 	static pthread_t tid;
1414 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1415 	uint64_t flags = 0;
1416 
1417 	signal(SIGINT, sigint_handler);
1418 
1419 	/* init EAL */
1420 	ret = rte_eal_init(argc, argv);
1421 	if (ret < 0)
1422 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1423 	argc -= ret;
1424 	argv += ret;
1425 
1426 	/* parse app arguments */
1427 	ret = us_vhost_parse_args(argc, argv);
1428 	if (ret < 0)
1429 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1430 
1431 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1432 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1433 
1434 		if (rte_lcore_is_enabled(lcore_id))
1435 			lcore_ids[core_id++] = lcore_id;
1436 	}
1437 
1438 	if (rte_lcore_count() > RTE_MAX_LCORE)
1439 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1440 
1441 	/* Get the number of physical ports. */
1442 	nb_ports = rte_eth_dev_count();
1443 
1444 	/*
1445 	 * Update the global variable num_ports and the global array ports[],
1446 	 * and get the value of valid_num_ports according to the number of system ports.
1447 	 */
1448 	valid_num_ports = check_ports_num(nb_ports);
1449 
1450 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1451 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1452 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1453 		return -1;
1454 	}
1455 
1456 	/*
1457 	 * FIXME: here we are trying to allocate mbufs big enough for
1458 	 * @MAX_QUEUES, but the truth is we're never going to use that
1459 	 * many queues here. We probably should only do allocation for
1460 	 * those queues we are going to use.
1461 	 */
1462 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1463 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1464 
1465 	if (vm2vm_mode == VM2VM_HARDWARE) {
1466 		/* Enable VT loop back to let L2 switch to do it. */
1467 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1468 		RTE_LOG(DEBUG, VHOST_CONFIG,
1469 			"Enable loop back for L2 switch in vmdq.\n");
1470 	}
1471 
1472 	/* initialize all ports */
1473 	for (portid = 0; portid < nb_ports; portid++) {
1474 		/* skip ports that are not enabled */
1475 		if ((enabled_port_mask & (1 << portid)) == 0) {
1476 			RTE_LOG(INFO, VHOST_PORT,
1477 				"Skipping disabled port %d\n", portid);
1478 			continue;
1479 		}
1480 		if (port_init(portid) != 0)
1481 			rte_exit(EXIT_FAILURE,
1482 				"Cannot initialize network ports\n");
1483 	}
1484 
1485 	/* Enable stats if the user option is set. */
1486 	if (enable_stats) {
1487 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1488 		if (ret != 0)
1489 			rte_exit(EXIT_FAILURE,
1490 				"Cannot create print-stats thread\n");
1491 
1492 		/* Set thread_name to aid in debugging. */
1493 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1494 		ret = rte_thread_setname(tid, thread_name);
1495 		if (ret != 0)
1496 			RTE_LOG(DEBUG, VHOST_CONFIG,
1497 				"Cannot set print-stats name\n");
1498 	}
1499 
1500 	/* Launch all data cores. */
1501 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1502 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1503 
1504 	if (client_mode)
1505 		flags |= RTE_VHOST_USER_CLIENT;
1506 
1507 	if (dequeue_zero_copy)
1508 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1509 
1510 	/* Register vhost user driver to handle vhost messages. */
1511 	for (i = 0; i < nb_sockets; i++) {
1512 		char *file = socket_files + i * PATH_MAX;
1513 		ret = rte_vhost_driver_register(file, flags);
1514 		if (ret != 0) {
1515 			unregister_drivers(i);
1516 			rte_exit(EXIT_FAILURE,
1517 				"vhost driver register failure.\n");
1518 		}
1519 
1520 		if (builtin_net_driver)
1521 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1522 
1523 		if (mergeable == 0) {
1524 			rte_vhost_driver_disable_features(file,
1525 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1526 		}
1527 
1528 		if (enable_tx_csum == 0) {
1529 			rte_vhost_driver_disable_features(file,
1530 				1ULL << VIRTIO_NET_F_CSUM);
1531 		}
1532 
1533 		if (enable_tso == 0) {
1534 			rte_vhost_driver_disable_features(file,
1535 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1536 			rte_vhost_driver_disable_features(file,
1537 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1538 			rte_vhost_driver_disable_features(file,
1539 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1540 			rte_vhost_driver_disable_features(file,
1541 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1542 		}
1543 
1544 		if (promiscuous) {
1545 			rte_vhost_driver_enable_features(file,
1546 				1ULL << VIRTIO_NET_F_CTRL_RX);
1547 		}
1548 
1549 		ret = rte_vhost_driver_callback_register(file,
1550 			&virtio_net_device_ops);
1551 		if (ret != 0) {
1552 			rte_exit(EXIT_FAILURE,
1553 				"failed to register vhost driver callbacks.\n");
1554 		}
1555 
1556 		if (rte_vhost_driver_start(file) < 0) {
1557 			rte_exit(EXIT_FAILURE,
1558 				"failed to start vhost driver.\n");
1559 		}
1560 	}
1561 
1562 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1563 		rte_eal_wait_lcore(lcore_id);
1564 
1565 	return 0;
1566 
1567 }
1568