xref: /dpdk/examples/vhost/main.c (revision 37fb306c1665efd52cdf2fc1fc99441b8e98aa9e)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
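/*
 * A device starts in DEVICE_MAC_LEARNING, moves to DEVICE_RX once its MAC
 * address has been learned and registered with VMDQ (see link_vmdq()), and
 * is set to DEVICE_SAFE_REMOVE by the data core when it is torn down.
 */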
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60 
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63 
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66 
67 /* number of devices/queues to support*/
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70 
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73 
74 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
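/*
 * VM2VM_DISABLED: unicast traffic between guests is not switched locally and
 *                 always goes out through the physical port.
 * VM2VM_SOFTWARE: destination MACs of local guests are matched in software
 *                 and packets are enqueued directly to the target virtio device.
 * VM2VM_HARDWARE: guest-to-guest delivery is left to the NIC, which loops
 *                 packets back via VMDQ (loop back is enabled in main()).
 */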
75 typedef enum {
76 	VM2VM_DISABLED = 0,
77 	VM2VM_SOFTWARE = 1,
78 	VM2VM_HARDWARE = 2,
79 	VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82 
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87 
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90 
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93 
94 static int client_mode;
95 static int dequeue_zero_copy;
96 
97 static int builtin_net_driver;
98 
99 /* Specify the timeout (in microseconds) between retries on RX. */
100 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
101 /* Specify the number of retries on RX. */
102 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
103 
104 /* Socket file paths. Can be set by user */
105 static char *socket_files;
106 static int nb_sockets;
107 
108 /* Empty VMDQ configuration structure. Filled in programmatically. */
109 static struct rte_eth_conf vmdq_conf_default = {
110 	.rxmode = {
111 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
112 		.split_hdr_size = 0,
113 		/*
114 		 * VLAN strip is necessary for 1G NICs such as the I350;
115 		 * it fixes a bug where IPv4 forwarding in the guest cannot
116 		 * forward packets from one virtio device to another.
117 		 */
118 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
119 	},
120 
121 	.txmode = {
122 		.mq_mode = ETH_MQ_TX_NONE,
123 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
124 			     DEV_TX_OFFLOAD_TCP_CKSUM |
125 			     DEV_TX_OFFLOAD_VLAN_INSERT |
126 			     DEV_TX_OFFLOAD_MULTI_SEGS |
127 			     DEV_TX_OFFLOAD_TCP_TSO),
128 	},
129 	.rx_adv_conf = {
130 		/*
131 		 * should be overridden separately in code with
132 		 * appropriate values
133 		 */
134 		.vmdq_rx_conf = {
135 			.nb_queue_pools = ETH_8_POOLS,
136 			.enable_default_pool = 0,
137 			.default_pool = 0,
138 			.nb_pool_maps = 0,
139 			.pool_map = {{0, 0},},
140 		},
141 	},
142 };
143 
144 
145 static unsigned lcore_ids[RTE_MAX_LCORE];
146 static uint16_t ports[RTE_MAX_ETHPORTS];
147 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
148 static uint16_t num_pf_queues, num_vmdq_queues;
149 static uint16_t vmdq_pool_base, vmdq_queue_base;
150 static uint16_t queues_per_pool;
151 
152 const uint16_t vlan_tags[] = {
153 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
154 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
155 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
156 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
157 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
158 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
159 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
160 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
161 };
162 
163 /* ethernet addresses of ports */
164 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
165 
166 static struct vhost_dev_tailq_list vhost_dev_list =
167 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
168 
169 static struct lcore_info lcore_info[RTE_MAX_LCORE];
170 
171 /* Used for queueing bursts of TX packets. */
172 struct mbuf_table {
173 	unsigned len;
174 	unsigned txq_id;
175 	struct rte_mbuf *m_table[MAX_PKT_BURST];
176 };
177 
178 /* TX queue for each data core. */
179 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
180 
181 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
182 				 / US_PER_S * BURST_TX_DRAIN_US)
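/*
 * Illustrative value (assumed TSC rate, not from this file): with a 2 GHz
 * TSC, MBUF_TABLE_DRAIN_TSC = ceil(2e9 / 1e6) * 100 = 200000 cycles ~= 100us.
 */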
183 #define VLAN_HLEN       4
184 
185 /*
186  * Builds up the correct configuration for VMDQ VLAN pool map
187  * according to the pool & queue limits.
188  */
189 static inline int
190 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
191 {
192 	struct rte_eth_vmdq_rx_conf conf;
193 	struct rte_eth_vmdq_rx_conf *def_conf =
194 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
195 	unsigned i;
196 
197 	memset(&conf, 0, sizeof(conf));
198 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
199 	conf.nb_pool_maps = num_devices;
200 	conf.enable_loop_back = def_conf->enable_loop_back;
201 	conf.rx_mode = def_conf->rx_mode;
202 
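	/* Give each pool/device its own VLAN tag so VMDQ can demultiplex Rx traffic. */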
203 	for (i = 0; i < conf.nb_pool_maps; i++) {
204 		conf.pool_map[i].vlan_id = vlan_tags[i];
205 		conf.pool_map[i].pools = (1UL << i);
206 	}
207 
208 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
209 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
210 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
211 	return 0;
212 }
213 
214 /*
215  * Initialises a given port using global settings, with the Rx buffers
216  * coming from the global mbuf_pool.
217  */
218 static inline int
219 port_init(uint16_t port)
220 {
221 	struct rte_eth_dev_info dev_info;
222 	struct rte_eth_conf port_conf;
223 	struct rte_eth_rxconf *rxconf;
224 	struct rte_eth_txconf *txconf;
225 	int16_t rx_rings, tx_rings;
226 	uint16_t rx_ring_size, tx_ring_size;
227 	int retval;
228 	uint16_t q;
229 
230 	/* Query device info; the reported VMDQ limits determine how many virtio devices can be supported. */
231 	retval = rte_eth_dev_info_get(port, &dev_info);
232 	if (retval != 0) {
233 		RTE_LOG(ERR, VHOST_PORT,
234 			"Error during getting device (port %u) info: %s\n",
235 			port, strerror(-retval));
236 
237 		return retval;
238 	}
239 
240 	rxconf = &dev_info.default_rxconf;
241 	txconf = &dev_info.default_txconf;
242 	rxconf->rx_drop_en = 1;
243 
244 	/*configure the number of supported virtio devices based on VMDQ limits */
245 	num_devices = dev_info.max_vmdq_pools;
246 
247 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
248 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
249 
250 	/*
251 	 * When dequeue zero copy is enabled, guest Tx used vring will be
252 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
253 	 * (tx_ring_size here) must be small enough so that the driver will
254 	 * hit the free threshold easily and free mbufs timely. Otherwise,
255 	 * guest Tx vring would be starved.
256 	 */
257 	if (dequeue_zero_copy)
258 		tx_ring_size = 64;
259 
260 	tx_rings = (uint16_t)rte_lcore_count();
261 
262 	/* Get port configuration. */
263 	retval = get_eth_conf(&port_conf, num_devices);
264 	if (retval < 0)
265 		return retval;
266 	/* NIC queues are divided into pf queues and vmdq queues.  */
267 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
268 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
269 	num_vmdq_queues = num_devices * queues_per_pool;
270 	num_queues = num_pf_queues + num_vmdq_queues;
271 	vmdq_queue_base = dev_info.vmdq_queue_base;
272 	vmdq_pool_base  = dev_info.vmdq_pool_base;
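	/*
	 * Illustrative (hypothetical) numbers: a NIC reporting max_rx_queues = 128,
	 * vmdq_queue_num = 128 and max_vmdq_pools = 64 gives queues_per_pool = 2,
	 * num_pf_queues = 0 and num_vmdq_queues = 128.
	 */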
273 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
274 		num_pf_queues, num_devices, queues_per_pool);
275 
276 	if (!rte_eth_dev_is_valid_port(port))
277 		return -1;
278 
279 	rx_rings = (uint16_t)dev_info.max_rx_queues;
280 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
281 		port_conf.txmode.offloads |=
282 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
283 	/* Configure ethernet device. */
284 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
285 	if (retval != 0) {
286 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
287 			port, strerror(-retval));
288 		return retval;
289 	}
290 
291 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
292 		&tx_ring_size);
293 	if (retval != 0) {
294 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
295 			"for port %u: %s.\n", port, strerror(-retval));
296 		return retval;
297 	}
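	/*
	 * The mbuf pool is sized for RTE_TEST_RX_DESC_DEFAULT descriptors per Rx
	 * queue (see create_mbuf_pool()), so reject a larger adjusted ring size.
	 */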
298 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
299 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
300 			"for Rx queues on port %u.\n", port);
301 		return -1;
302 	}
303 
304 	/* Setup the queues. */
305 	rxconf->offloads = port_conf.rxmode.offloads;
306 	for (q = 0; q < rx_rings; q ++) {
307 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
308 						rte_eth_dev_socket_id(port),
309 						rxconf,
310 						mbuf_pool);
311 		if (retval < 0) {
312 			RTE_LOG(ERR, VHOST_PORT,
313 				"Failed to setup rx queue %u of port %u: %s.\n",
314 				q, port, strerror(-retval));
315 			return retval;
316 		}
317 	}
318 	txconf->offloads = port_conf.txmode.offloads;
319 	for (q = 0; q < tx_rings; q ++) {
320 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
321 						rte_eth_dev_socket_id(port),
322 						txconf);
323 		if (retval < 0) {
324 			RTE_LOG(ERR, VHOST_PORT,
325 				"Failed to setup tx queue %u of port %u: %s.\n",
326 				q, port, strerror(-retval));
327 			return retval;
328 		}
329 	}
330 
331 	/* Start the device. */
332 	retval  = rte_eth_dev_start(port);
333 	if (retval < 0) {
334 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
335 			port, strerror(-retval));
336 		return retval;
337 	}
338 
339 	if (promiscuous)
340 		rte_eth_promiscuous_enable(port);
341 
342 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
343 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
344 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
345 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
346 			port,
347 			vmdq_ports_eth_addr[port].addr_bytes[0],
348 			vmdq_ports_eth_addr[port].addr_bytes[1],
349 			vmdq_ports_eth_addr[port].addr_bytes[2],
350 			vmdq_ports_eth_addr[port].addr_bytes[3],
351 			vmdq_ports_eth_addr[port].addr_bytes[4],
352 			vmdq_ports_eth_addr[port].addr_bytes[5]);
353 
354 	return 0;
355 }
356 
357 /*
358  * Set socket file path.
359  */
360 static int
361 us_vhost_parse_socket_path(const char *q_arg)
362 {
363 	char *old;
364 
365 	/* reject paths that do not fit within PATH_MAX */
366 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
367 		return -1;
368 
369 	old = socket_files;
370 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
371 	if (socket_files == NULL) {
372 		free(old);
373 		return -1;
374 	}
375 
376 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
377 	nb_sockets++;
378 
379 	return 0;
380 }
381 
382 /*
383  * Parse the portmask provided at run time.
384  */
385 static int
386 parse_portmask(const char *portmask)
387 {
388 	char *end = NULL;
389 	unsigned long pm;
390 
391 	errno = 0;
392 
393 	/* parse hexadecimal string; return 0 on error so the caller's "== 0" check catches it */
394 	pm = strtoul(portmask, &end, 16);
395 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
396 		return 0;
397 
398 	if (pm == 0)
399 		return 0;
400 
401 	return pm;
402 
403 }
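
/*
 * For example, "-p 0x1" selects port 0 and "-p 0x3" selects ports 0 and 1;
 * since MAX_SUP_PORTS is 1, only a single bit may effectively be set.
 */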
404 
405 /*
406  * Parse num options at run time.
407  */
408 static int
409 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
410 {
411 	char *end = NULL;
412 	unsigned long num;
413 
414 	errno = 0;
415 
416 	/* parse unsigned int string */
417 	num = strtoul(q_arg, &end, 10);
418 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
419 		return -1;
420 
421 	if (num > max_valid_value)
422 		return -1;
423 
424 	return num;
425 
426 }
427 
428 /*
429  * Display usage
430  */
431 static void
432 us_vhost_usage(const char *prgname)
433 {
434 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
435 	"		--vm2vm [0|1|2]\n"
436 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
437 	"		--socket-file <path>\n"
438 	"		--tx-csum [0|1] --tso [0|1]\n"
439 	"		-p PORTMASK: Set mask for ports to be used by application\n"
440 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
441 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
442 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on rx. Only takes effect if rx retries are enabled\n"
443 	"		--rx-retry-num [0-N]: number of retries on rx. Only takes effect if rx retries are enabled\n"
444 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
445 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
446 	"		--socket-file: The path of the socket file.\n"
447 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
448 	"		--tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
449 	"		--client register a vhost-user socket as client mode.\n"
450 	"		--dequeue-zero-copy enables dequeue zero copy\n",
451 	       prgname);
452 }
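
/*
 * Hypothetical invocation (binary name and EAL core/memory options are
 * examples only, not taken from this file):
 *
 *   ./vhost-switch -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *                  --mergeable 1 --stats 1
 */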
453 
454 /*
455  * Parse the arguments given in the command line of the application.
456  */
457 static int
458 us_vhost_parse_args(int argc, char **argv)
459 {
460 	int opt, ret;
461 	int option_index;
462 	unsigned i;
463 	const char *prgname = argv[0];
464 	static struct option long_option[] = {
465 		{"vm2vm", required_argument, NULL, 0},
466 		{"rx-retry", required_argument, NULL, 0},
467 		{"rx-retry-delay", required_argument, NULL, 0},
468 		{"rx-retry-num", required_argument, NULL, 0},
469 		{"mergeable", required_argument, NULL, 0},
470 		{"stats", required_argument, NULL, 0},
471 		{"socket-file", required_argument, NULL, 0},
472 		{"tx-csum", required_argument, NULL, 0},
473 		{"tso", required_argument, NULL, 0},
474 		{"client", no_argument, &client_mode, 1},
475 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
476 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
477 		{NULL, 0, 0, 0},
478 	};
479 
480 	/* Parse command line */
481 	while ((opt = getopt_long(argc, argv, "p:P",
482 			long_option, &option_index)) != EOF) {
483 		switch (opt) {
484 		/* Portmask */
485 		case 'p':
486 			enabled_port_mask = parse_portmask(optarg);
487 			if (enabled_port_mask == 0) {
488 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
489 				us_vhost_usage(prgname);
490 				return -1;
491 			}
492 			break;
493 
494 		case 'P':
495 			promiscuous = 1;
496 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
497 				ETH_VMDQ_ACCEPT_BROADCAST |
498 				ETH_VMDQ_ACCEPT_MULTICAST;
499 
500 			break;
501 
502 		case 0:
503 			/* Enable/disable vm2vm comms. */
504 			if (!strncmp(long_option[option_index].name, "vm2vm",
505 				MAX_LONG_OPT_SZ)) {
506 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
507 				if (ret == -1) {
508 					RTE_LOG(INFO, VHOST_CONFIG,
509 						"Invalid argument for "
510 						"vm2vm [0|1|2]\n");
511 					us_vhost_usage(prgname);
512 					return -1;
513 				} else {
514 					vm2vm_mode = (vm2vm_type)ret;
515 				}
516 			}
517 
518 			/* Enable/disable retries on RX. */
519 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
520 				ret = parse_num_opt(optarg, 1);
521 				if (ret == -1) {
522 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
523 					us_vhost_usage(prgname);
524 					return -1;
525 				} else {
526 					enable_retry = ret;
527 				}
528 			}
529 
530 			/* Enable/disable TX checksum offload. */
531 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
532 				ret = parse_num_opt(optarg, 1);
533 				if (ret == -1) {
534 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
535 					us_vhost_usage(prgname);
536 					return -1;
537 				} else
538 					enable_tx_csum = ret;
539 			}
540 
541 			/* Enable/disable TSO offload. */
542 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
543 				ret = parse_num_opt(optarg, 1);
544 				if (ret == -1) {
545 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
546 					us_vhost_usage(prgname);
547 					return -1;
548 				} else
549 					enable_tso = ret;
550 			}
551 
552 			/* Specify the retry delay time (in microseconds) on RX. */
553 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
554 				ret = parse_num_opt(optarg, INT32_MAX);
555 				if (ret == -1) {
556 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
557 					us_vhost_usage(prgname);
558 					return -1;
559 				} else {
560 					burst_rx_delay_time = ret;
561 				}
562 			}
563 
564 			/* Specify the number of retries on RX. */
565 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
566 				ret = parse_num_opt(optarg, INT32_MAX);
567 				if (ret == -1) {
568 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
569 					us_vhost_usage(prgname);
570 					return -1;
571 				} else {
572 					burst_rx_retry_num = ret;
573 				}
574 			}
575 
576 			/* Enable/disable RX mergeable buffers. */
577 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
578 				ret = parse_num_opt(optarg, 1);
579 				if (ret == -1) {
580 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
581 					us_vhost_usage(prgname);
582 					return -1;
583 				} else {
584 					mergeable = !!ret;
585 					if (ret) {
586 						vmdq_conf_default.rxmode.offloads |=
587 							DEV_RX_OFFLOAD_JUMBO_FRAME;
588 						vmdq_conf_default.rxmode.max_rx_pkt_len
589 							= JUMBO_FRAME_MAX_SIZE;
590 					}
591 				}
592 			}
593 
594 			/* Enable/disable stats. */
595 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
596 				ret = parse_num_opt(optarg, INT32_MAX);
597 				if (ret == -1) {
598 					RTE_LOG(INFO, VHOST_CONFIG,
599 						"Invalid argument for stats [0..N]\n");
600 					us_vhost_usage(prgname);
601 					return -1;
602 				} else {
603 					enable_stats = ret;
604 				}
605 			}
606 
607 			/* Set socket file path. */
608 			if (!strncmp(long_option[option_index].name,
609 						"socket-file", MAX_LONG_OPT_SZ)) {
610 				if (us_vhost_parse_socket_path(optarg) == -1) {
611 					RTE_LOG(INFO, VHOST_CONFIG,
612 					"Invalid argument for socket name (Max %d characters)\n",
613 					PATH_MAX);
614 					us_vhost_usage(prgname);
615 					return -1;
616 				}
617 			}
618 
619 			break;
620 
621 			/* Invalid option - print options. */
622 		default:
623 			us_vhost_usage(prgname);
624 			return -1;
625 		}
626 	}
627 
628 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
629 		if (enabled_port_mask & (1 << i))
630 			ports[num_ports++] = i;
631 	}
632 
633 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
634 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
635 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
636 		return -1;
637 	}
638 
639 	return 0;
640 }
641 
642 /*
643  * Update the global variable num_ports and the ports array according to the
644  * number of system ports, and return the number of valid ports.
645  */
646 static unsigned check_ports_num(unsigned nb_ports)
647 {
648 	unsigned valid_num_ports = num_ports;
649 	unsigned portid;
650 
651 	if (num_ports > nb_ports) {
652 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
653 			num_ports, nb_ports);
654 		num_ports = nb_ports;
655 	}
656 
657 	for (portid = 0; portid < num_ports; portid ++) {
658 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
659 			RTE_LOG(INFO, VHOST_PORT,
660 				"\nSpecified port ID(%u) is not valid\n",
661 				ports[portid]);
662 			ports[portid] = INVALID_PORT_ID;
663 			valid_num_ports--;
664 		}
665 	}
666 	return valid_num_ports;
667 }
668 
669 static __rte_always_inline struct vhost_dev *
670 find_vhost_dev(struct rte_ether_addr *mac)
671 {
672 	struct vhost_dev *vdev;
673 
674 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
675 		if (vdev->ready == DEVICE_RX &&
676 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
677 			return vdev;
678 	}
679 
680 	return NULL;
681 }
682 
683 /*
684  * This function learns the MAC address of the device and registers it, along
685  * with a VLAN tag, in the VMDQ.
686  */
687 static int
688 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
689 {
690 	struct rte_ether_hdr *pkt_hdr;
691 	int i, ret;
692 
693 	/* Learn MAC address of guest device from packet */
694 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
695 
696 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
697 		RTE_LOG(ERR, VHOST_DATA,
698 			"(%d) device is using a registered MAC!\n",
699 			vdev->vid);
700 		return -1;
701 	}
702 
703 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
704 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
705 
706 	/* vlan_tag currently uses the device_id. */
707 	vdev->vlan_tag = vlan_tags[vdev->vid];
708 
709 	/* Print out VMDQ registration info. */
710 	RTE_LOG(INFO, VHOST_DATA,
711 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
712 		vdev->vid,
713 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
714 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
715 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
716 		vdev->vlan_tag);
717 
718 	/* Register the MAC address. */
719 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
720 				(uint32_t)vdev->vid + vmdq_pool_base);
721 	if (ret)
722 		RTE_LOG(ERR, VHOST_DATA,
723 			"(%d) failed to add device MAC address to VMDQ\n",
724 			vdev->vid);
725 
726 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
727 
728 	/* Set device as ready for RX. */
729 	vdev->ready = DEVICE_RX;
730 
731 	return 0;
732 }
733 
734 /*
735  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
736  * queue before disabling RX on the device.
737  */
738 static inline void
739 unlink_vmdq(struct vhost_dev *vdev)
740 {
741 	unsigned i = 0;
742 	unsigned rx_count;
743 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
744 
745 	if (vdev->ready == DEVICE_RX) {
746 		/*clear MAC and VLAN settings*/
747 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
748 		for (i = 0; i < 6; i++)
749 			vdev->mac_address.addr_bytes[i] = 0;
750 
751 		vdev->vlan_tag = 0;
752 
753 		/*Clear out the receive buffers*/
754 		rx_count = rte_eth_rx_burst(ports[0],
755 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
756 
757 		while (rx_count) {
758 			for (i = 0; i < rx_count; i++)
759 				rte_pktmbuf_free(pkts_burst[i]);
760 
761 			rx_count = rte_eth_rx_burst(ports[0],
762 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
763 		}
764 
765 		vdev->ready = DEVICE_MAC_LEARNING;
766 	}
767 }
768 
769 static __rte_always_inline void
770 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
771 	    struct rte_mbuf *m)
772 {
773 	uint16_t ret;
774 
775 	if (builtin_net_driver) {
776 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
777 	} else {
778 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
779 	}
780 
781 	if (enable_stats) {
782 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
783 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
784 		src_vdev->stats.tx_total++;
785 		src_vdev->stats.tx += ret;
786 	}
787 }
788 
789 /*
790  * Check if the packet destination MAC address is for a local device. If so, put
791  * the packet on that device's RX queue. If not, return.
792  */
793 static __rte_always_inline int
794 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
795 {
796 	struct rte_ether_hdr *pkt_hdr;
797 	struct vhost_dev *dst_vdev;
798 
799 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
800 
801 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
802 	if (!dst_vdev)
803 		return -1;
804 
805 	if (vdev->vid == dst_vdev->vid) {
806 		RTE_LOG_DP(DEBUG, VHOST_DATA,
807 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
808 			vdev->vid);
809 		return 0;
810 	}
811 
812 	RTE_LOG_DP(DEBUG, VHOST_DATA,
813 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
814 
815 	if (unlikely(dst_vdev->remove)) {
816 		RTE_LOG_DP(DEBUG, VHOST_DATA,
817 			"(%d) device is marked for removal\n", dst_vdev->vid);
818 		return 0;
819 	}
820 
821 	virtio_xmit(dst_vdev, vdev, m);
822 	return 0;
823 }
824 
825 /*
826  * Check if the destination MAC of a packet belongs to a local VM; if it does,
827  * get its VLAN tag and the length offset.
828  */
829 static __rte_always_inline int
830 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
831 	uint32_t *offset, uint16_t *vlan_tag)
832 {
833 	struct vhost_dev *dst_vdev;
834 	struct rte_ether_hdr *pkt_hdr =
835 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
836 
837 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
838 	if (!dst_vdev)
839 		return 0;
840 
841 	if (vdev->vid == dst_vdev->vid) {
842 		RTE_LOG_DP(DEBUG, VHOST_DATA,
843 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
844 			vdev->vid);
845 		return -1;
846 	}
847 
848 	/*
849 	 * HW VLAN strip reduces the packet length by the
850 	 * length of the VLAN tag, so restore the packet
851 	 * length by adding it back.
852 	 */
853 	*offset  = VLAN_HLEN;
854 	*vlan_tag = vlan_tags[vdev->vid];
855 
856 	RTE_LOG_DP(DEBUG, VHOST_DATA,
857 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
858 		vdev->vid, dst_vdev->vid, *vlan_tag);
859 
860 	return 0;
861 }
862 
863 static uint16_t
864 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
865 {
866 	if (ol_flags & PKT_TX_IPV4)
867 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
868 	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
869 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
870 }
871 
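/*
 * Prepare a TSO packet for NIC offload: request IPv4 header checksum offload
 * and seed the TCP checksum with the pseudo-header sum.
 */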
872 static void virtio_tx_offload(struct rte_mbuf *m)
873 {
874 	void *l3_hdr;
875 	struct rte_ipv4_hdr *ipv4_hdr = NULL;
876 	struct rte_tcp_hdr *tcp_hdr = NULL;
877 	struct rte_ether_hdr *eth_hdr =
878 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
879 
880 	l3_hdr = (char *)eth_hdr + m->l2_len;
881 
882 	if (m->ol_flags & PKT_TX_IPV4) {
883 		ipv4_hdr = l3_hdr;
884 		ipv4_hdr->hdr_checksum = 0;
885 		m->ol_flags |= PKT_TX_IP_CKSUM;
886 	}
887 
888 	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
889 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
890 }
891 
892 static inline void
893 free_pkts(struct rte_mbuf **pkts, uint16_t n)
894 {
895 	while (n--)
896 		rte_pktmbuf_free(pkts[n]);
897 }
898 
899 static __rte_always_inline void
900 do_drain_mbuf_table(struct mbuf_table *tx_q)
901 {
902 	uint16_t count;
903 
904 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
905 				 tx_q->m_table, tx_q->len);
906 	if (unlikely(count < tx_q->len))
907 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
908 
909 	tx_q->len = 0;
910 }
911 
912 /*
913  * This function routes the TX packet to the correct interface. This
914  * may be a local device or the physical port.
915  */
916 static __rte_always_inline void
917 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
918 {
919 	struct mbuf_table *tx_q;
920 	unsigned offset = 0;
921 	const uint16_t lcore_id = rte_lcore_id();
922 	struct rte_ether_hdr *nh;
923 
924 
925 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
926 	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
927 		struct vhost_dev *vdev2;
928 
929 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
930 			if (vdev2 != vdev)
931 				virtio_xmit(vdev2, vdev, m);
932 		}
933 		goto queue2nic;
934 	}
935 
936 	/*check if destination is local VM*/
937 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
938 		rte_pktmbuf_free(m);
939 		return;
940 	}
941 
942 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
943 		if (unlikely(find_local_dest(vdev, m, &offset,
944 					     &vlan_tag) != 0)) {
945 			rte_pktmbuf_free(m);
946 			return;
947 		}
948 	}
949 
950 	RTE_LOG_DP(DEBUG, VHOST_DATA,
951 		"(%d) TX: MAC address is external\n", vdev->vid);
952 
953 queue2nic:
954 
955 	/*Add packet to the port tx queue*/
956 	tx_q = &lcore_tx_queue[lcore_id];
957 
958 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
959 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
960 		/* Guest has inserted the vlan tag. */
961 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
962 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
963 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
964 			(vh->vlan_tci != vlan_tag_be))
965 			vh->vlan_tci = vlan_tag_be;
966 	} else {
967 		m->ol_flags |= PKT_TX_VLAN_PKT;
968 
969 		/*
970 		 * Find the right seg to adjust the data len when offset is
971 		 * bigger than tail room size.
972 		 */
973 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
974 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
975 				m->data_len += offset;
976 			else {
977 				struct rte_mbuf *seg = m;
978 
979 				while ((seg->next != NULL) &&
980 					(offset > rte_pktmbuf_tailroom(seg)))
981 					seg = seg->next;
982 
983 				seg->data_len += offset;
984 			}
985 			m->pkt_len += offset;
986 		}
987 
988 		m->vlan_tci = vlan_tag;
989 	}
990 
991 	if (m->ol_flags & PKT_TX_TCP_SEG)
992 		virtio_tx_offload(m);
993 
994 	tx_q->m_table[tx_q->len++] = m;
995 	if (enable_stats) {
996 		vdev->stats.tx_total++;
997 		vdev->stats.tx++;
998 	}
999 
1000 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1001 		do_drain_mbuf_table(tx_q);
1002 }
1003 
1004 
1005 static __rte_always_inline void
1006 drain_mbuf_table(struct mbuf_table *tx_q)
1007 {
1008 	static uint64_t prev_tsc;
1009 	uint64_t cur_tsc;
1010 
1011 	if (tx_q->len == 0)
1012 		return;
1013 
1014 	cur_tsc = rte_rdtsc();
1015 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1016 		prev_tsc = cur_tsc;
1017 
1018 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1019 			"TX queue drained after timeout with burst size %u\n",
1020 			tx_q->len);
1021 		do_drain_mbuf_table(tx_q);
1022 	}
1023 }
1024 
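/*
 * Drain the physical port's VMDQ Rx queue that belongs to this vhost device
 * and enqueue the received packets into the guest's virtio Rx ring.
 */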
1025 static __rte_always_inline void
1026 drain_eth_rx(struct vhost_dev *vdev)
1027 {
1028 	uint16_t rx_count, enqueue_count;
1029 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1030 
1031 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1032 				    pkts, MAX_PKT_BURST);
1033 	if (!rx_count)
1034 		return;
1035 
1036 	/*
1037 	 * When "enable_retry" is set, wait and retry when there are not
1038 	 * enough free slots in the queue to hold @rx_count packets,
1039 	 * to reduce packet loss.
1040 	 */
1041 	if (enable_retry &&
1042 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1043 			VIRTIO_RXQ))) {
1044 		uint32_t retry;
1045 
1046 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1047 			rte_delay_us(burst_rx_delay_time);
1048 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1049 					VIRTIO_RXQ))
1050 				break;
1051 		}
1052 	}
1053 
1054 	if (builtin_net_driver) {
1055 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1056 						pkts, rx_count);
1057 	} else {
1058 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1059 						pkts, rx_count);
1060 	}
1061 	if (enable_stats) {
1062 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1063 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1064 	}
1065 
1066 	free_pkts(pkts, rx_count);
1067 }
1068 
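/*
 * Drain the guest's virtio Tx queue and route each dequeued packet, either to
 * another vhost device or out through the physical port.
 */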
1069 static __rte_always_inline void
1070 drain_virtio_tx(struct vhost_dev *vdev)
1071 {
1072 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1073 	uint16_t count;
1074 	uint16_t i;
1075 
1076 	if (builtin_net_driver) {
1077 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1078 					pkts, MAX_PKT_BURST);
1079 	} else {
1080 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1081 					mbuf_pool, pkts, MAX_PKT_BURST);
1082 	}
1083 
1084 	/* setup VMDq for the first packet */
1085 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1086 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1087 			free_pkts(pkts, count);
1088 	}
1089 
1090 	for (i = 0; i < count; ++i)
1091 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1092 }
1093 
1094 /*
1095  * Main function of vhost-switch. It basically does:
1096  *
1097  * for each vhost device {
1098  *    - drain_eth_rx()
1099  *
1100  *      Which drains the host eth Rx queue linked to the vhost device
1101  *      and delivers all packets to the guest virtio Rx ring associated
1102  *      with this vhost device.
1103  *
1104  *    - drain_virtio_tx()
1105  *
1106  *      Which drains the guest virtio Tx queue and delivers the packets
1107  *      to the target, which could be another vhost device or the
1108  *      physical eth dev. The routing is done in function "virtio_tx_route".
1109  * }
1110  */
1111 static int
1112 switch_worker(void *arg __rte_unused)
1113 {
1114 	unsigned i;
1115 	unsigned lcore_id = rte_lcore_id();
1116 	struct vhost_dev *vdev;
1117 	struct mbuf_table *tx_q;
1118 
1119 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1120 
1121 	tx_q = &lcore_tx_queue[lcore_id];
1122 	for (i = 0; i < rte_lcore_count(); i++) {
1123 		if (lcore_ids[i] == lcore_id) {
1124 			tx_q->txq_id = i;
1125 			break;
1126 		}
1127 	}
1128 
1129 	while(1) {
1130 		drain_mbuf_table(tx_q);
1131 
1132 		/*
1133 		 * Inform the configuration core that we have exited the
1134 		 * linked list and that no devices are in use if requested.
1135 		 */
1136 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1137 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1138 
1139 		/*
1140 		 * Process vhost devices
1141 		 */
1142 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1143 			      lcore_vdev_entry) {
1144 			if (unlikely(vdev->remove)) {
1145 				unlink_vmdq(vdev);
1146 				vdev->ready = DEVICE_SAFE_REMOVE;
1147 				continue;
1148 			}
1149 
1150 			if (likely(vdev->ready == DEVICE_RX))
1151 				drain_eth_rx(vdev);
1152 
1153 			if (likely(!vdev->remove))
1154 				drain_virtio_tx(vdev);
1155 		}
1156 	}
1157 
1158 	return 0;
1159 }
1160 
1161 /*
1162  * Remove a device from the specific data core linked list and from the
1163  * main linked list. Synchronization occurs through the use of the
1164  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1165  * of dev->remove=1, which could cause an infinite loop in the rte_pause loop.
1166  */
1167 static void
1168 destroy_device(int vid)
1169 {
1170 	struct vhost_dev *vdev = NULL;
1171 	int lcore;
1172 
1173 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1174 		if (vdev->vid == vid)
1175 			break;
1176 	}
1177 	if (!vdev)
1178 		return;
1179 	/*set the remove flag. */
1180 	vdev->remove = 1;
1181 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1182 		rte_pause();
1183 	}
1184 
1185 	if (builtin_net_driver)
1186 		vs_vhost_net_remove(vdev);
1187 
1188 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1189 		     lcore_vdev_entry);
1190 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1191 
1192 
1193 	/* Set the dev_removal_flag on each lcore. */
1194 	RTE_LCORE_FOREACH_SLAVE(lcore)
1195 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1196 
1197 	/*
1198 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1199 	 * we can be sure that they can no longer access the device removed
1200 	 * from the linked lists and that the devices are no longer in use.
1201 	 */
1202 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1203 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1204 			rte_pause();
1205 	}
1206 
1207 	lcore_info[vdev->coreid].device_num--;
1208 
1209 	RTE_LOG(INFO, VHOST_DATA,
1210 		"(%d) device has been removed from data core\n",
1211 		vdev->vid);
1212 
1213 	rte_free(vdev);
1214 }
1215 
1216 /*
1217  * A new device is added to a data core. First the device is added to the main linked list
1218  * and then allocated to a specific data core.
1219  */
1220 static int
1221 new_device(int vid)
1222 {
1223 	int lcore, core_add = 0;
1224 	uint32_t device_num_min = num_devices;
1225 	struct vhost_dev *vdev;
1226 
1227 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1228 	if (vdev == NULL) {
1229 		RTE_LOG(INFO, VHOST_DATA,
1230 			"(%d) couldn't allocate memory for vhost dev\n",
1231 			vid);
1232 		return -1;
1233 	}
1234 	vdev->vid = vid;
1235 
1236 	if (builtin_net_driver)
1237 		vs_vhost_net_setup(vdev);
1238 
1239 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
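	/* Map this device to the first Rx queue of its own VMDQ pool. */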
1240 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1241 
1242 	/*reset ready flag*/
1243 	vdev->ready = DEVICE_MAC_LEARNING;
1244 	vdev->remove = 0;
1245 
1246 	/* Find a suitable lcore to add the device. */
1247 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1248 		if (lcore_info[lcore].device_num < device_num_min) {
1249 			device_num_min = lcore_info[lcore].device_num;
1250 			core_add = lcore;
1251 		}
1252 	}
1253 	vdev->coreid = core_add;
1254 
1255 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1256 			  lcore_vdev_entry);
1257 	lcore_info[vdev->coreid].device_num++;
1258 
1259 	/* Disable notifications. */
1260 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1261 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1262 
1263 	RTE_LOG(INFO, VHOST_DATA,
1264 		"(%d) device has been added to data core %d\n",
1265 		vid, vdev->coreid);
1266 
1267 	return 0;
1268 }
1269 
1270 /*
1271  * These callbacks allow devices to be added to the data cores when
1272  * configuration has fully completed.
1273  */
1274 static const struct vhost_device_ops virtio_net_device_ops =
1275 {
1276 	.new_device =  new_device,
1277 	.destroy_device = destroy_device,
1278 };
1279 
1280 /*
1281  * This thread wakes up periodically to print stats if the user has
1282  * enabled them.
1283  */
1284 static void *
1285 print_stats(__rte_unused void *arg)
1286 {
1287 	struct vhost_dev *vdev;
1288 	uint64_t tx_dropped, rx_dropped;
1289 	uint64_t tx, tx_total, rx, rx_total;
1290 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1291 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1292 
1293 	while(1) {
1294 		sleep(enable_stats);
1295 
1296 		/* Clear screen and move to top left */
1297 		printf("%s%s\n", clr, top_left);
1298 		printf("Device statistics =================================\n");
1299 
1300 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1301 			tx_total   = vdev->stats.tx_total;
1302 			tx         = vdev->stats.tx;
1303 			tx_dropped = tx_total - tx;
1304 
1305 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1306 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1307 			rx_dropped = rx_total - rx;
1308 
1309 			printf("Statistics for device %d\n"
1310 				"-----------------------\n"
1311 				"TX total:              %" PRIu64 "\n"
1312 				"TX dropped:            %" PRIu64 "\n"
1313 				"TX successful:         %" PRIu64 "\n"
1314 				"RX total:              %" PRIu64 "\n"
1315 				"RX dropped:            %" PRIu64 "\n"
1316 				"RX successful:         %" PRIu64 "\n",
1317 				vdev->vid,
1318 				tx_total, tx_dropped, tx,
1319 				rx_total, rx_dropped, rx);
1320 		}
1321 
1322 		printf("===================================================\n");
1323 	}
1324 
1325 	return NULL;
1326 }
1327 
1328 static void
1329 unregister_drivers(int socket_num)
1330 {
1331 	int i, ret;
1332 
1333 	for (i = 0; i < socket_num; i++) {
1334 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1335 		if (ret != 0)
1336 			RTE_LOG(ERR, VHOST_CONFIG,
1337 				"Failed to unregister vhost driver for %s.\n",
1338 				socket_files + i * PATH_MAX);
1339 	}
1340 }
1341 
1342 /* When we receive an INT signal, unregister the vhost driver. */
1343 static void
1344 sigint_handler(__rte_unused int signum)
1345 {
1346 	/* Unregister vhost driver. */
1347 	unregister_drivers(nb_sockets);
1348 
1349 	exit(0);
1350 }
1351 
1352 /*
1353  * While creating an mbuf pool, one key thing is to figure out how
1354  * many mbuf entries are enough for our use. FYI, here are some
1355  * guidelines:
1356  *
1357  * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1358  *
1359  * - For each switch core (a CPU core that does the packet switching), we
1360  *   also need to make some reservation for receiving packets from the
1361  *   virtio Tx queue. How many is enough depends on the usage. It's
1362  *   normally a simple calculation like the following:
1363  *
1364  *       MAX_PKT_BURST * max packet size / mbuf size
1365  *
1366  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1367  *
1368  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1369  *   mbufs for receiving packets from the physical NIC device.
1370  *
1371  * - We also need to make sure, for each switch core, we have allocated
1372  *   enough mbufs to fill up the mbuf cache.
1373  */
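/*
 * Illustrative sizing (assumed values, not from this file): with mergeable
 * buffers enabled (mtu = 9000), mbuf_size = RTE_MBUF_DEFAULT_BUF_SIZE (2176),
 * RTE_PKTMBUF_HEADROOM = 128, MAX_PKT_BURST = 32 and nr_rx_desc = 1024,
 * nr_mbufs_per_core = (9000 + 2176) * 32 / (2176 - 128) + 1024 = 1198.
 */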
1374 static void
1375 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1376 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1377 {
1378 	uint32_t nr_mbufs;
1379 	uint32_t nr_mbufs_per_core;
1380 	uint32_t mtu = 1500;
1381 
1382 	if (mergeable)
1383 		mtu = 9000;
1384 	if (enable_tso)
1385 		mtu = 64 * 1024;
1386 
1387 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1388 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1389 	nr_mbufs_per_core += nr_rx_desc;
1390 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1391 
1392 	nr_mbufs  = nr_queues * nr_rx_desc;
1393 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1394 	nr_mbufs *= nr_port;
1395 
1396 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1397 					    nr_mbuf_cache, 0, mbuf_size,
1398 					    rte_socket_id());
1399 	if (mbuf_pool == NULL)
1400 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1401 }
1402 
1403 /*
1404  * Main function, does initialisation and calls the per-lcore functions.
1405  */
1406 int
1407 main(int argc, char *argv[])
1408 {
1409 	unsigned lcore_id, core_id = 0;
1410 	unsigned nb_ports, valid_num_ports;
1411 	int ret, i;
1412 	uint16_t portid;
1413 	static pthread_t tid;
1414 	uint64_t flags = 0;
1415 
1416 	signal(SIGINT, sigint_handler);
1417 
1418 	/* init EAL */
1419 	ret = rte_eal_init(argc, argv);
1420 	if (ret < 0)
1421 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1422 	argc -= ret;
1423 	argv += ret;
1424 
1425 	/* parse app arguments */
1426 	ret = us_vhost_parse_args(argc, argv);
1427 	if (ret < 0)
1428 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1429 
1430 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1431 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1432 
1433 		if (rte_lcore_is_enabled(lcore_id))
1434 			lcore_ids[core_id++] = lcore_id;
1435 	}
1436 
1437 	if (rte_lcore_count() > RTE_MAX_LCORE)
1438 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1439 
1440 	/* Get the number of physical ports. */
1441 	nb_ports = rte_eth_dev_count_avail();
1442 
1443 	/*
1444 	 * Update the global variable num_ports and the global array ports, and
1445 	 * get the number of valid ports according to the number of system ports.
1446 	 */
1447 	valid_num_ports = check_ports_num(nb_ports);
1448 
1449 	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1450 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1451 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1452 		return -1;
1453 	}
1454 
1455 	/*
1456 	 * FIXME: here we are trying to allocate mbufs big enough for
1457 	 * @MAX_QUEUES, but the truth is we're never going to use that
1458 	 * many queues here. We probably should only do allocation for
1459 	 * those queues we are going to use.
1460 	 */
1461 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1462 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1463 
1464 	if (vm2vm_mode == VM2VM_HARDWARE) {
1465 		/* Enable VT loop back to let L2 switch to do it. */
1466 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1467 		RTE_LOG(DEBUG, VHOST_CONFIG,
1468 			"Enable loop back for L2 switch in vmdq.\n");
1469 	}
1470 
1471 	/* initialize all ports */
1472 	RTE_ETH_FOREACH_DEV(portid) {
1473 		/* skip ports that are not enabled */
1474 		if ((enabled_port_mask & (1 << portid)) == 0) {
1475 			RTE_LOG(INFO, VHOST_PORT,
1476 				"Skipping disabled port %d\n", portid);
1477 			continue;
1478 		}
1479 		if (port_init(portid) != 0)
1480 			rte_exit(EXIT_FAILURE,
1481 				"Cannot initialize network ports\n");
1482 	}
1483 
1484 	/* Enable stats if the user option is set. */
1485 	if (enable_stats) {
1486 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1487 					print_stats, NULL);
1488 		if (ret < 0)
1489 			rte_exit(EXIT_FAILURE,
1490 				"Cannot create print-stats thread\n");
1491 	}
1492 
1493 	/* Launch all data cores. */
1494 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1495 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1496 
1497 	if (client_mode)
1498 		flags |= RTE_VHOST_USER_CLIENT;
1499 
1500 	if (dequeue_zero_copy)
1501 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1502 
1503 	/* Register vhost user driver to handle vhost messages. */
1504 	for (i = 0; i < nb_sockets; i++) {
1505 		char *file = socket_files + i * PATH_MAX;
1506 		ret = rte_vhost_driver_register(file, flags);
1507 		if (ret != 0) {
1508 			unregister_drivers(i);
1509 			rte_exit(EXIT_FAILURE,
1510 				"vhost driver register failure.\n");
1511 		}
1512 
1513 		if (builtin_net_driver)
1514 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1515 
1516 		if (mergeable == 0) {
1517 			rte_vhost_driver_disable_features(file,
1518 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1519 		}
1520 
1521 		if (enable_tx_csum == 0) {
1522 			rte_vhost_driver_disable_features(file,
1523 				1ULL << VIRTIO_NET_F_CSUM);
1524 		}
1525 
1526 		if (enable_tso == 0) {
1527 			rte_vhost_driver_disable_features(file,
1528 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1529 			rte_vhost_driver_disable_features(file,
1530 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1531 			rte_vhost_driver_disable_features(file,
1532 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1533 			rte_vhost_driver_disable_features(file,
1534 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1535 		}
1536 
1537 		if (promiscuous) {
1538 			rte_vhost_driver_enable_features(file,
1539 				1ULL << VIRTIO_NET_F_CTRL_RX);
1540 		}
1541 
1542 		ret = rte_vhost_driver_callback_register(file,
1543 			&virtio_net_device_ops);
1544 		if (ret != 0) {
1545 			rte_exit(EXIT_FAILURE,
1546 				"failed to register vhost driver callbacks.\n");
1547 		}
1548 
1549 		if (rte_vhost_driver_start(file) < 0) {
1550 			rte_exit(EXIT_FAILURE,
1551 				"failed to start vhost driver.\n");
1552 		}
1553 	}
1554 
1555 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1556 		rte_eal_wait_lcore(lcore_id);
1557 
1558 	return 0;
1559 
1560 }
1561