xref: /dpdk/examples/vhost/main.c (revision f5057be340e44f3edc0fe90fa875eb89a4c49b4f)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
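/* 0x2600 = 9728 bytes: room for a 9000-byte jumbo frame plus headers. */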
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60 
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63 
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66 
67 /* number of devices/queues to support */
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70 
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73 
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76 	VM2VM_DISABLED = 0,
77 	VM2VM_SOFTWARE = 1,
78 	VM2VM_HARDWARE = 2,
79 	VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82 
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87 
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90 
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93 
94 static int client_mode;
95 
96 static int builtin_net_driver;
97 
98 /* Specify the timeout (in microseconds) between retries on RX. */
99 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
100 /* Specify the number of retries on RX. */
101 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
102 
103 /* Socket file paths. Can be set by user */
104 static char *socket_files;
105 static int nb_sockets;
106 
107 /* Empty VMDQ configuration structure. Filled in programmatically. */
108 static struct rte_eth_conf vmdq_conf_default = {
109 	.rxmode = {
110 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
111 		.split_hdr_size = 0,
112 		/*
113 		 * VLAN strip is necessary for 1G NICs such as I350;
114 		 * it fixes a bug where IPv4 forwarding in the guest could not
115 		 * forward packets from one virtio dev to another virtio dev.
116 		 */
117 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
118 	},
119 
120 	.txmode = {
121 		.mq_mode = ETH_MQ_TX_NONE,
122 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
123 			     DEV_TX_OFFLOAD_TCP_CKSUM |
124 			     DEV_TX_OFFLOAD_VLAN_INSERT |
125 			     DEV_TX_OFFLOAD_MULTI_SEGS |
126 			     DEV_TX_OFFLOAD_TCP_TSO),
127 	},
128 	.rx_adv_conf = {
129 		/*
130 		 * should be overridden separately in code with
131 		 * appropriate values
132 		 */
133 		.vmdq_rx_conf = {
134 			.nb_queue_pools = ETH_8_POOLS,
135 			.enable_default_pool = 0,
136 			.default_pool = 0,
137 			.nb_pool_maps = 0,
138 			.pool_map = {{0, 0},},
139 		},
140 	},
141 };
142 
143 
144 static unsigned lcore_ids[RTE_MAX_LCORE];
145 static uint16_t ports[RTE_MAX_ETHPORTS];
146 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
147 static uint16_t num_pf_queues, num_vmdq_queues;
148 static uint16_t vmdq_pool_base, vmdq_queue_base;
149 static uint16_t queues_per_pool;
150 
151 const uint16_t vlan_tags[] = {
152 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
153 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
154 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
155 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
156 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
157 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
158 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
159 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
160 };
161 
162 /* ethernet addresses of ports */
163 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
164 
165 static struct vhost_dev_tailq_list vhost_dev_list =
166 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
167 
168 static struct lcore_info lcore_info[RTE_MAX_LCORE];
169 
170 /* Used for queueing bursts of TX packets. */
171 struct mbuf_table {
172 	unsigned len;
173 	unsigned txq_id;
174 	struct rte_mbuf *m_table[MAX_PKT_BURST];
175 };
176 
177 /* TX queue for each data core. */
178 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
179 
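/*
 * TX drain period expressed in TSC cycles: TSC ticks per microsecond
 * (rounded up) multiplied by BURST_TX_DRAIN_US, i.e. roughly 100us.
 */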
180 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
181 				 / US_PER_S * BURST_TX_DRAIN_US)
182 #define VLAN_HLEN       4
183 
184 /*
185  * Builds up the correct configuration for VMDQ VLAN pool map
186  * according to the pool & queue limits.
187  */
188 static inline int
189 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
190 {
191 	struct rte_eth_vmdq_rx_conf conf;
192 	struct rte_eth_vmdq_rx_conf *def_conf =
193 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
194 	unsigned i;
195 
196 	memset(&conf, 0, sizeof(conf));
197 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
198 	conf.nb_pool_maps = num_devices;
199 	conf.enable_loop_back = def_conf->enable_loop_back;
200 	conf.rx_mode = def_conf->rx_mode;
201 
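	/*
	 * Give each VMDQ pool its own VLAN: device i receives frames tagged
	 * with vlan_tags[i] into pool i (e.g. VLAN 1000 maps to pool 0).
	 */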
202 	for (i = 0; i < conf.nb_pool_maps; i++) {
203 		conf.pool_map[i].vlan_id = vlan_tags[i];
204 		conf.pool_map[i].pools = (1UL << i);
205 	}
206 
207 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
208 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
209 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
210 	return 0;
211 }
212 
213 /*
214  * Initialises a given port using global settings and with the rx buffers
215  * coming from the mbuf_pool passed as parameter
216  */
217 static inline int
218 port_init(uint16_t port)
219 {
220 	struct rte_eth_dev_info dev_info;
221 	struct rte_eth_conf port_conf;
222 	struct rte_eth_rxconf *rxconf;
223 	struct rte_eth_txconf *txconf;
224 	int16_t rx_rings, tx_rings;
225 	uint16_t rx_ring_size, tx_ring_size;
226 	int retval;
227 	uint16_t q;
228 
229 	/* The max pool number from dev_info will be used to validate the pool number specified on the command line. */
230 	retval = rte_eth_dev_info_get(port, &dev_info);
231 	if (retval != 0) {
232 		RTE_LOG(ERR, VHOST_PORT,
233 			"Error during getting device (port %u) info: %s\n",
234 			port, strerror(-retval));
235 
236 		return retval;
237 	}
238 
239 	rxconf = &dev_info.default_rxconf;
240 	txconf = &dev_info.default_txconf;
241 	rxconf->rx_drop_en = 1;
242 
243 	/* Configure the number of supported virtio devices based on VMDQ limits. */
244 	num_devices = dev_info.max_vmdq_pools;
245 
246 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
247 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
248 
249 	tx_rings = (uint16_t)rte_lcore_count();
250 
251 	/* Get port configuration. */
252 	retval = get_eth_conf(&port_conf, num_devices);
253 	if (retval < 0)
254 		return retval;
255 	/* NIC queues are divided into pf queues and vmdq queues.  */
256 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
257 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
258 	num_vmdq_queues = num_devices * queues_per_pool;
259 	num_queues = num_pf_queues + num_vmdq_queues;
260 	vmdq_queue_base = dev_info.vmdq_queue_base;
261 	vmdq_pool_base  = dev_info.vmdq_pool_base;
262 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
263 		num_pf_queues, num_devices, queues_per_pool);
264 
265 	if (!rte_eth_dev_is_valid_port(port))
266 		return -1;
267 
268 	rx_rings = (uint16_t)dev_info.max_rx_queues;
269 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
270 		port_conf.txmode.offloads |=
271 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
272 	/* Configure ethernet device. */
273 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
274 	if (retval != 0) {
275 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
276 			port, strerror(-retval));
277 		return retval;
278 	}
279 
280 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
281 		&tx_ring_size);
282 	if (retval != 0) {
283 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
284 			"for port %u: %s.\n", port, strerror(-retval));
285 		return retval;
286 	}
287 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
288 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
289 			"for Rx queues on port %u.\n", port);
290 		return -1;
291 	}
292 
293 	/* Setup the queues. */
294 	rxconf->offloads = port_conf.rxmode.offloads;
295 	for (q = 0; q < rx_rings; q ++) {
296 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
297 						rte_eth_dev_socket_id(port),
298 						rxconf,
299 						mbuf_pool);
300 		if (retval < 0) {
301 			RTE_LOG(ERR, VHOST_PORT,
302 				"Failed to setup rx queue %u of port %u: %s.\n",
303 				q, port, strerror(-retval));
304 			return retval;
305 		}
306 	}
307 	txconf->offloads = port_conf.txmode.offloads;
308 	for (q = 0; q < tx_rings; q ++) {
309 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
310 						rte_eth_dev_socket_id(port),
311 						txconf);
312 		if (retval < 0) {
313 			RTE_LOG(ERR, VHOST_PORT,
314 				"Failed to setup tx queue %u of port %u: %s.\n",
315 				q, port, strerror(-retval));
316 			return retval;
317 		}
318 	}
319 
320 	/* Start the device. */
321 	retval  = rte_eth_dev_start(port);
322 	if (retval < 0) {
323 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
324 			port, strerror(-retval));
325 		return retval;
326 	}
327 
328 	if (promiscuous) {
329 		retval = rte_eth_promiscuous_enable(port);
330 		if (retval != 0) {
331 			RTE_LOG(ERR, VHOST_PORT,
332 				"Failed to enable promiscuous mode on port %u: %s\n",
333 				port, rte_strerror(-retval));
334 			return retval;
335 		}
336 	}
337 
338 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
339 	if (retval < 0) {
340 		RTE_LOG(ERR, VHOST_PORT,
341 			"Failed to get MAC address on port %u: %s\n",
342 			port, rte_strerror(-retval));
343 		return retval;
344 	}
345 
346 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
347 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
348 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
349 			port,
350 			vmdq_ports_eth_addr[port].addr_bytes[0],
351 			vmdq_ports_eth_addr[port].addr_bytes[1],
352 			vmdq_ports_eth_addr[port].addr_bytes[2],
353 			vmdq_ports_eth_addr[port].addr_bytes[3],
354 			vmdq_ports_eth_addr[port].addr_bytes[4],
355 			vmdq_ports_eth_addr[port].addr_bytes[5]);
356 
357 	return 0;
358 }
359 
360 /*
361  * Set socket file path.
362  */
363 static int
364 us_vhost_parse_socket_path(const char *q_arg)
365 {
366 	char *old;
367 
368 	/* parse number string */
369 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
370 		return -1;
371 
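	/* Socket paths are stored in fixed PATH_MAX-sized slots of socket_files. */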
372 	old = socket_files;
373 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
374 	if (socket_files == NULL) {
375 		free(old);
376 		return -1;
377 	}
378 
379 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
380 	nb_sockets++;
381 
382 	return 0;
383 }
384 
385 /*
386  * Parse the portmask provided at run time.
387  */
388 static int
389 parse_portmask(const char *portmask)
390 {
391 	char *end = NULL;
392 	unsigned long pm;
393 
394 	errno = 0;
395 
396 	/* parse hexadecimal string */
397 	pm = strtoul(portmask, &end, 16);
398 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
399 		return 0;
400 
401 	return pm;
402 
403 }
404 
405 /*
406  * Parse num options at run time.
407  */
408 static int
409 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
410 {
411 	char *end = NULL;
412 	unsigned long num;
413 
414 	errno = 0;
415 
416 	/* parse unsigned int string */
417 	num = strtoul(q_arg, &end, 10);
418 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
419 		return -1;
420 
421 	if (num > max_valid_value)
422 		return -1;
423 
424 	return num;
425 
426 }
427 
428 /*
429  * Display usage
430  */
431 static void
432 us_vhost_usage(const char *prgname)
433 {
434 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
435 	"		--vm2vm [0|1|2]\n"
436 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
437 	"		--socket-file <path>\n"
438 	"		--nb-devices ND\n"
439 	"		-p PORTMASK: Set mask for ports to be used by application\n"
440 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
441 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
442 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if RX retries are enabled\n"
443 	"		--rx-retry-num [0-N]: the number of retries on RX. Takes effect only if RX retries are enabled\n"
444 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
445 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
446 	"		--socket-file: The path of the socket file.\n"
447 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
448 	"		--tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
449 	"		--client register a vhost-user socket as client mode.\n",
450 	       prgname);
451 }
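
/*
 * Example invocation (illustrative only; the binary name and EAL options
 * depend on the build and platform):
 *
 *     ./vhost-switch -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 --stats 2
 */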
452 
453 /*
454  * Parse the arguments given in the command line of the application.
455  */
456 static int
457 us_vhost_parse_args(int argc, char **argv)
458 {
459 	int opt, ret;
460 	int option_index;
461 	unsigned i;
462 	const char *prgname = argv[0];
463 	static struct option long_option[] = {
464 		{"vm2vm", required_argument, NULL, 0},
465 		{"rx-retry", required_argument, NULL, 0},
466 		{"rx-retry-delay", required_argument, NULL, 0},
467 		{"rx-retry-num", required_argument, NULL, 0},
468 		{"mergeable", required_argument, NULL, 0},
469 		{"stats", required_argument, NULL, 0},
470 		{"socket-file", required_argument, NULL, 0},
471 		{"tx-csum", required_argument, NULL, 0},
472 		{"tso", required_argument, NULL, 0},
473 		{"client", no_argument, &client_mode, 1},
474 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
475 		{NULL, 0, 0, 0},
476 	};
477 
478 	/* Parse command line */
479 	while ((opt = getopt_long(argc, argv, "p:P",
480 			long_option, &option_index)) != EOF) {
481 		switch (opt) {
482 		/* Portmask */
483 		case 'p':
484 			enabled_port_mask = parse_portmask(optarg);
485 			if (enabled_port_mask == 0) {
486 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
487 				us_vhost_usage(prgname);
488 				return -1;
489 			}
490 			break;
491 
492 		case 'P':
493 			promiscuous = 1;
494 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
495 				ETH_VMDQ_ACCEPT_BROADCAST |
496 				ETH_VMDQ_ACCEPT_MULTICAST;
497 
498 			break;
499 
500 		case 0:
501 			/* Enable/disable vm2vm comms. */
502 			if (!strncmp(long_option[option_index].name, "vm2vm",
503 				MAX_LONG_OPT_SZ)) {
504 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
505 				if (ret == -1) {
506 					RTE_LOG(INFO, VHOST_CONFIG,
507 						"Invalid argument for "
508 						"vm2vm [0|1|2]\n");
509 					us_vhost_usage(prgname);
510 					return -1;
511 				} else {
512 					vm2vm_mode = (vm2vm_type)ret;
513 				}
514 			}
515 
516 			/* Enable/disable retries on RX. */
517 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
518 				ret = parse_num_opt(optarg, 1);
519 				if (ret == -1) {
520 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
521 					us_vhost_usage(prgname);
522 					return -1;
523 				} else {
524 					enable_retry = ret;
525 				}
526 			}
527 
528 			/* Enable/disable TX checksum offload. */
529 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
530 				ret = parse_num_opt(optarg, 1);
531 				if (ret == -1) {
532 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
533 					us_vhost_usage(prgname);
534 					return -1;
535 				} else
536 					enable_tx_csum = ret;
537 			}
538 
539 			/* Enable/disable TSO offload. */
540 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
541 				ret = parse_num_opt(optarg, 1);
542 				if (ret == -1) {
543 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
544 					us_vhost_usage(prgname);
545 					return -1;
546 				} else
547 					enable_tso = ret;
548 			}
549 
550 			/* Specify the retry delay time (in microseconds) on RX. */
551 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
552 				ret = parse_num_opt(optarg, INT32_MAX);
553 				if (ret == -1) {
554 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
555 					us_vhost_usage(prgname);
556 					return -1;
557 				} else {
558 					burst_rx_delay_time = ret;
559 				}
560 			}
561 
562 			/* Specify the number of retries on RX. */
563 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
564 				ret = parse_num_opt(optarg, INT32_MAX);
565 				if (ret == -1) {
566 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
567 					us_vhost_usage(prgname);
568 					return -1;
569 				} else {
570 					burst_rx_retry_num = ret;
571 				}
572 			}
573 
574 			/* Enable/disable RX mergeable buffers. */
575 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
576 				ret = parse_num_opt(optarg, 1);
577 				if (ret == -1) {
578 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
579 					us_vhost_usage(prgname);
580 					return -1;
581 				} else {
582 					mergeable = !!ret;
583 					if (ret) {
584 						vmdq_conf_default.rxmode.offloads |=
585 							DEV_RX_OFFLOAD_JUMBO_FRAME;
586 						vmdq_conf_default.rxmode.max_rx_pkt_len
587 							= JUMBO_FRAME_MAX_SIZE;
588 					}
589 				}
590 			}
591 
592 			/* Enable/disable stats. */
593 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
594 				ret = parse_num_opt(optarg, INT32_MAX);
595 				if (ret == -1) {
596 					RTE_LOG(INFO, VHOST_CONFIG,
597 						"Invalid argument for stats [0..N]\n");
598 					us_vhost_usage(prgname);
599 					return -1;
600 				} else {
601 					enable_stats = ret;
602 				}
603 			}
604 
605 			/* Set socket file path. */
606 			if (!strncmp(long_option[option_index].name,
607 						"socket-file", MAX_LONG_OPT_SZ)) {
608 				if (us_vhost_parse_socket_path(optarg) == -1) {
609 					RTE_LOG(INFO, VHOST_CONFIG,
610 					"Invalid argument for socket name (Max %d characters)\n",
611 					PATH_MAX);
612 					us_vhost_usage(prgname);
613 					return -1;
614 				}
615 			}
616 
617 			break;
618 
619 			/* Invalid option - print options. */
620 		default:
621 			us_vhost_usage(prgname);
622 			return -1;
623 		}
624 	}
625 
626 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
627 		if (enabled_port_mask & (1 << i))
628 			ports[num_ports++] = i;
629 	}
630 
631 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
632 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
633 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
634 		return -1;
635 	}
636 
637 	return 0;
638 }
639 
640 /*
641  * Update the global variable num_ports and the array ports according to the
642  * number of system ports, and return the number of valid ports.
643  */
644 static unsigned check_ports_num(unsigned nb_ports)
645 {
646 	unsigned valid_num_ports = num_ports;
647 	unsigned portid;
648 
649 	if (num_ports > nb_ports) {
650 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
651 			num_ports, nb_ports);
652 		num_ports = nb_ports;
653 	}
654 
655 	for (portid = 0; portid < num_ports; portid ++) {
656 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
657 			RTE_LOG(INFO, VHOST_PORT,
658 				"\nSpecified port ID(%u) is not valid\n",
659 				ports[portid]);
660 			ports[portid] = INVALID_PORT_ID;
661 			valid_num_ports--;
662 		}
663 	}
664 	return valid_num_ports;
665 }
666 
667 static __rte_always_inline struct vhost_dev *
668 find_vhost_dev(struct rte_ether_addr *mac)
669 {
670 	struct vhost_dev *vdev;
671 
672 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
673 		if (vdev->ready == DEVICE_RX &&
674 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
675 			return vdev;
676 	}
677 
678 	return NULL;
679 }
680 
681 /*
682  * This function learns the MAC address of the device and registers it, along
683  * with a VLAN tag, with a VMDQ pool.
684  */
685 static int
686 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
687 {
688 	struct rte_ether_hdr *pkt_hdr;
689 	int i, ret;
690 
691 	/* Learn MAC address of guest device from packet */
692 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
693 
694 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
695 		RTE_LOG(ERR, VHOST_DATA,
696 			"(%d) device is using a registered MAC!\n",
697 			vdev->vid);
698 		return -1;
699 	}
700 
701 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
702 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
703 
704 	/* vlan_tag currently uses the device_id. */
705 	vdev->vlan_tag = vlan_tags[vdev->vid];
706 
707 	/* Print out VMDQ registration info. */
708 	RTE_LOG(INFO, VHOST_DATA,
709 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
710 		vdev->vid,
711 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
712 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
713 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
714 		vdev->vlan_tag);
715 
716 	/* Register the MAC address. */
717 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
718 				(uint32_t)vdev->vid + vmdq_pool_base);
719 	if (ret)
720 		RTE_LOG(ERR, VHOST_DATA,
721 			"(%d) failed to add device MAC address to VMDQ\n",
722 			vdev->vid);
723 
724 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
725 
726 	/* Set device as ready for RX. */
727 	vdev->ready = DEVICE_RX;
728 
729 	return 0;
730 }
731 
732 /*
733  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
734  * queue before disabling RX on the device.
735  */
736 static inline void
737 unlink_vmdq(struct vhost_dev *vdev)
738 {
739 	unsigned i = 0;
740 	unsigned rx_count;
741 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
742 
743 	if (vdev->ready == DEVICE_RX) {
744 		/* Clear MAC and VLAN settings. */
745 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
746 		for (i = 0; i < 6; i++)
747 			vdev->mac_address.addr_bytes[i] = 0;
748 
749 		vdev->vlan_tag = 0;
750 
751 		/* Clear out the receive buffers. */
752 		rx_count = rte_eth_rx_burst(ports[0],
753 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
754 
755 		while (rx_count) {
756 			for (i = 0; i < rx_count; i++)
757 				rte_pktmbuf_free(pkts_burst[i]);
758 
759 			rx_count = rte_eth_rx_burst(ports[0],
760 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
761 		}
762 
763 		vdev->ready = DEVICE_MAC_LEARNING;
764 	}
765 }
766 
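/*
 * Enqueue a single packet into the RX ring of the destination vhost device
 * (the VM2VM path) and, when stats are enabled, account it on both the
 * destination (RX) and source (TX) device counters.
 */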
767 static __rte_always_inline void
768 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
769 	    struct rte_mbuf *m)
770 {
771 	uint16_t ret;
772 
773 	if (builtin_net_driver) {
774 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
775 	} else {
776 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
777 	}
778 
779 	if (enable_stats) {
780 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
781 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
782 		src_vdev->stats.tx_total++;
783 		src_vdev->stats.tx += ret;
784 	}
785 }
786 
787 /*
788  * Check if the packet destination MAC address is for a local device. If so, put
789  * the packet on that device's RX queue. If not, return.
790  */
791 static __rte_always_inline int
792 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
793 {
794 	struct rte_ether_hdr *pkt_hdr;
795 	struct vhost_dev *dst_vdev;
796 
797 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
798 
799 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
800 	if (!dst_vdev)
801 		return -1;
802 
803 	if (vdev->vid == dst_vdev->vid) {
804 		RTE_LOG_DP(DEBUG, VHOST_DATA,
805 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
806 			vdev->vid);
807 		return 0;
808 	}
809 
810 	RTE_LOG_DP(DEBUG, VHOST_DATA,
811 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
812 
813 	if (unlikely(dst_vdev->remove)) {
814 		RTE_LOG_DP(DEBUG, VHOST_DATA,
815 			"(%d) device is marked for removal\n", dst_vdev->vid);
816 		return 0;
817 	}
818 
819 	virtio_xmit(dst_vdev, vdev, m);
820 	return 0;
821 }
822 
823 /*
824  * Check if the destination MAC of a packet belongs to a local VM; if it does,
825  * get its VLAN tag and the length offset.
826  */
827 static __rte_always_inline int
828 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
829 	uint32_t *offset, uint16_t *vlan_tag)
830 {
831 	struct vhost_dev *dst_vdev;
832 	struct rte_ether_hdr *pkt_hdr =
833 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
834 
835 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
836 	if (!dst_vdev)
837 		return 0;
838 
839 	if (vdev->vid == dst_vdev->vid) {
840 		RTE_LOG_DP(DEBUG, VHOST_DATA,
841 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
842 			vdev->vid);
843 		return -1;
844 	}
845 
846 	/*
847 	 * HW VLAN strip will reduce the packet length by the
848 	 * length of the VLAN tag, so we need to restore the
849 	 * packet length by adding it back.
850 	 */
851 	*offset  = VLAN_HLEN;
852 	*vlan_tag = vlan_tags[vdev->vid];
853 
854 	RTE_LOG_DP(DEBUG, VHOST_DATA,
855 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
856 		vdev->vid, dst_vdev->vid, *vlan_tag);
857 
858 	return 0;
859 }
860 
861 static uint16_t
862 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
863 {
864 	if (ol_flags & PKT_TX_IPV4)
865 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
866 	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
867 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
868 }
869 
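/*
 * Prepare a TSO packet for the NIC: clear the IPv4 header checksum (the
 * hardware recomputes it), request IP checksum offload and seed the TCP
 * checksum field with the pseudo-header checksum.
 */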
870 static void virtio_tx_offload(struct rte_mbuf *m)
871 {
872 	void *l3_hdr;
873 	struct rte_ipv4_hdr *ipv4_hdr = NULL;
874 	struct rte_tcp_hdr *tcp_hdr = NULL;
875 	struct rte_ether_hdr *eth_hdr =
876 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
877 
878 	l3_hdr = (char *)eth_hdr + m->l2_len;
879 
880 	if (m->ol_flags & PKT_TX_IPV4) {
881 		ipv4_hdr = l3_hdr;
882 		ipv4_hdr->hdr_checksum = 0;
883 		m->ol_flags |= PKT_TX_IP_CKSUM;
884 	}
885 
886 	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
887 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
888 }
889 
890 static inline void
891 free_pkts(struct rte_mbuf **pkts, uint16_t n)
892 {
893 	while (n--)
894 		rte_pktmbuf_free(pkts[n]);
895 }
896 
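/*
 * Flush the buffered packets of an lcore TX queue to physical port 0;
 * packets the NIC did not accept are freed.
 */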
897 static __rte_always_inline void
898 do_drain_mbuf_table(struct mbuf_table *tx_q)
899 {
900 	uint16_t count;
901 
902 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
903 				 tx_q->m_table, tx_q->len);
904 	if (unlikely(count < tx_q->len))
905 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
906 
907 	tx_q->len = 0;
908 }
909 
910 /*
911  * This function routes the TX packet to the correct interface. This
912  * may be a local device or the physical port.
913  */
914 static __rte_always_inline void
915 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
916 {
917 	struct mbuf_table *tx_q;
918 	unsigned offset = 0;
919 	const uint16_t lcore_id = rte_lcore_id();
920 	struct rte_ether_hdr *nh;
921 
922 
923 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
924 	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
925 		struct vhost_dev *vdev2;
926 
927 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
928 			if (vdev2 != vdev)
929 				virtio_xmit(vdev2, vdev, m);
930 		}
931 		goto queue2nic;
932 	}
933 
934 	/* Check if the destination is a local VM. */
935 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
936 		rte_pktmbuf_free(m);
937 		return;
938 	}
939 
940 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
941 		if (unlikely(find_local_dest(vdev, m, &offset,
942 					     &vlan_tag) != 0)) {
943 			rte_pktmbuf_free(m);
944 			return;
945 		}
946 	}
947 
948 	RTE_LOG_DP(DEBUG, VHOST_DATA,
949 		"(%d) TX: MAC address is external\n", vdev->vid);
950 
951 queue2nic:
952 
953 	/* Add the packet to the port TX queue. */
954 	tx_q = &lcore_tx_queue[lcore_id];
955 
956 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
957 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
958 		/* Guest has inserted the vlan tag. */
959 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
960 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
961 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
962 			(vh->vlan_tci != vlan_tag_be))
963 			vh->vlan_tci = vlan_tag_be;
964 	} else {
965 		m->ol_flags |= PKT_TX_VLAN_PKT;
966 
967 		/*
968 		 * Find the right seg to adjust the data len when offset is
969 		 * bigger than tail room size.
970 		 */
971 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
972 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
973 				m->data_len += offset;
974 			else {
975 				struct rte_mbuf *seg = m;
976 
977 				while ((seg->next != NULL) &&
978 					(offset > rte_pktmbuf_tailroom(seg)))
979 					seg = seg->next;
980 
981 				seg->data_len += offset;
982 			}
983 			m->pkt_len += offset;
984 		}
985 
986 		m->vlan_tci = vlan_tag;
987 	}
988 
989 	if (m->ol_flags & PKT_TX_TCP_SEG)
990 		virtio_tx_offload(m);
991 
992 	tx_q->m_table[tx_q->len++] = m;
993 	if (enable_stats) {
994 		vdev->stats.tx_total++;
995 		vdev->stats.tx++;
996 	}
997 
998 	if (unlikely(tx_q->len == MAX_PKT_BURST))
999 		do_drain_mbuf_table(tx_q);
1000 }
1001 
1002 
1003 static __rte_always_inline void
1004 drain_mbuf_table(struct mbuf_table *tx_q)
1005 {
1006 	static uint64_t prev_tsc;
1007 	uint64_t cur_tsc;
1008 
1009 	if (tx_q->len == 0)
1010 		return;
1011 
1012 	cur_tsc = rte_rdtsc();
1013 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1014 		prev_tsc = cur_tsc;
1015 
1016 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1017 			"TX queue drained after timeout with burst size %u\n",
1018 			tx_q->len);
1019 		do_drain_mbuf_table(tx_q);
1020 	}
1021 }
1022 
1023 static __rte_always_inline void
1024 drain_eth_rx(struct vhost_dev *vdev)
1025 {
1026 	uint16_t rx_count, enqueue_count;
1027 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1028 
1029 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1030 				    pkts, MAX_PKT_BURST);
1031 	if (!rx_count)
1032 		return;
1033 
1034 	/*
1035 	 * When "enable_retry" is set, wait and retry when there are
1036 	 * not enough free slots in the queue to hold @rx_count packets,
1037 	 * to reduce packet loss.
1038 	 */
1039 	if (enable_retry &&
1040 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1041 			VIRTIO_RXQ))) {
1042 		uint32_t retry;
1043 
1044 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1045 			rte_delay_us(burst_rx_delay_time);
1046 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1047 					VIRTIO_RXQ))
1048 				break;
1049 		}
1050 	}
1051 
1052 	if (builtin_net_driver) {
1053 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1054 						pkts, rx_count);
1055 	} else {
1056 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1057 						pkts, rx_count);
1058 	}
1059 	if (enable_stats) {
1060 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1061 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1062 	}
1063 
1064 	free_pkts(pkts, rx_count);
1065 }
1066 
1067 static __rte_always_inline void
1068 drain_virtio_tx(struct vhost_dev *vdev)
1069 {
1070 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1071 	uint16_t count;
1072 	uint16_t i;
1073 
1074 	if (builtin_net_driver) {
1075 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1076 					pkts, MAX_PKT_BURST);
1077 	} else {
1078 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1079 					mbuf_pool, pkts, MAX_PKT_BURST);
1080 	}
1081 
1082 	/* setup VMDq for the first packet */
1083 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1084 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1085 			free_pkts(pkts, count);
1086 	}
1087 
1088 	for (i = 0; i < count; ++i)
1089 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1090 }
1091 
1092 /*
1093  * Main function of vhost-switch. It basically does:
1094  *
1095  * for each vhost device {
1096  *    - drain_eth_rx()
1097  *
1098  *      Which drains the host eth Rx queue linked to the vhost device
1099  *      and delivers all of the packets to the guest virtio Rx ring
1100  *      associated with this vhost device.
1101  *
1102  *    - drain_virtio_tx()
1103  *
1104  *      Which drains the guest virtio Tx queue and delivers all of the
1105  *      packets to the target, which could be another vhost device or the
1106  *      physical eth dev. The routing is done in the function "virtio_tx_route".
1107  * }
1108  */
1109 static int
1110 switch_worker(void *arg __rte_unused)
1111 {
1112 	unsigned i;
1113 	unsigned lcore_id = rte_lcore_id();
1114 	struct vhost_dev *vdev;
1115 	struct mbuf_table *tx_q;
1116 
1117 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1118 
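	/*
	 * Each lcore owns one NIC TX queue; its index within the enabled-core
	 * list (lcore_ids[]) is used as the TX queue id.
	 */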
1119 	tx_q = &lcore_tx_queue[lcore_id];
1120 	for (i = 0; i < rte_lcore_count(); i++) {
1121 		if (lcore_ids[i] == lcore_id) {
1122 			tx_q->txq_id = i;
1123 			break;
1124 		}
1125 	}
1126 
1127 	while(1) {
1128 		drain_mbuf_table(tx_q);
1129 
1130 		/*
1131 		 * If requested, inform the configuration core that we are no longer
1132 		 * walking the device list and that no devices are in use.
1133 		 */
1134 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1135 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1136 
1137 		/*
1138 		 * Process vhost devices
1139 		 */
1140 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1141 			      lcore_vdev_entry) {
1142 			if (unlikely(vdev->remove)) {
1143 				unlink_vmdq(vdev);
1144 				vdev->ready = DEVICE_SAFE_REMOVE;
1145 				continue;
1146 			}
1147 
1148 			if (likely(vdev->ready == DEVICE_RX))
1149 				drain_eth_rx(vdev);
1150 
1151 			if (likely(!vdev->remove))
1152 				drain_virtio_tx(vdev);
1153 		}
1154 	}
1155 
1156 	return 0;
1157 }
1158 
1159 /*
1160  * Remove a device from the specific data core linked list and from the
1161  * main linked list. Synchronization occurs through the use of the
1162  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1163  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1164  */
1165 static void
1166 destroy_device(int vid)
1167 {
1168 	struct vhost_dev *vdev = NULL;
1169 	int lcore;
1170 
1171 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1172 		if (vdev->vid == vid)
1173 			break;
1174 	}
1175 	if (!vdev)
1176 		return;
1177 	/* Set the remove flag. */
1178 	vdev->remove = 1;
1179 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1180 		rte_pause();
1181 	}
1182 
1183 	if (builtin_net_driver)
1184 		vs_vhost_net_remove(vdev);
1185 
1186 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1187 		     lcore_vdev_entry);
1188 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1189 
1190 
1191 	/* Set the dev_removal_flag on each lcore. */
1192 	RTE_LCORE_FOREACH_SLAVE(lcore)
1193 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1194 
1195 	/*
1196 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1197 	 * we can be sure that they can no longer access the device removed
1198 	 * from the linked lists and that the devices are no longer in use.
1199 	 */
1200 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1201 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1202 			rte_pause();
1203 	}
1204 
1205 	lcore_info[vdev->coreid].device_num--;
1206 
1207 	RTE_LOG(INFO, VHOST_DATA,
1208 		"(%d) device has been removed from data core\n",
1209 		vdev->vid);
1210 
1211 	rte_free(vdev);
1212 }
1213 
1214 /*
1215  * A new device is added to a data core. First the device is added to the main linked list
1216  * and then allocated to a specific data core.
1217  */
1218 static int
1219 new_device(int vid)
1220 {
1221 	int lcore, core_add = 0;
1222 	uint32_t device_num_min = num_devices;
1223 	struct vhost_dev *vdev;
1224 
1225 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1226 	if (vdev == NULL) {
1227 		RTE_LOG(INFO, VHOST_DATA,
1228 			"(%d) couldn't allocate memory for vhost dev\n",
1229 			vid);
1230 		return -1;
1231 	}
1232 	vdev->vid = vid;
1233 
1234 	if (builtin_net_driver)
1235 		vs_vhost_net_setup(vdev);
1236 
1237 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
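	/*
	 * Use the first RX queue of the VMDQ pool assigned to this device:
	 * the queues of pool "vid" start at vmdq_queue_base + vid * queues_per_pool.
	 */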
1238 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1239 
1240 	/* Reset the ready flag. */
1241 	vdev->ready = DEVICE_MAC_LEARNING;
1242 	vdev->remove = 0;
1243 
1244 	/* Find a suitable lcore to add the device. */
1245 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1246 		if (lcore_info[lcore].device_num < device_num_min) {
1247 			device_num_min = lcore_info[lcore].device_num;
1248 			core_add = lcore;
1249 		}
1250 	}
1251 	vdev->coreid = core_add;
1252 
1253 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1254 			  lcore_vdev_entry);
1255 	lcore_info[vdev->coreid].device_num++;
1256 
1257 	/* Disable notifications. */
1258 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1259 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1260 
1261 	RTE_LOG(INFO, VHOST_DATA,
1262 		"(%d) device has been added to data core %d\n",
1263 		vid, vdev->coreid);
1264 
1265 	return 0;
1266 }
1267 
1268 /*
1269  * These callbacks allow devices to be added to the data core when configuration
1270  * has been fully completed.
1271  */
1272 static const struct vhost_device_ops virtio_net_device_ops =
1273 {
1274 	.new_device =  new_device,
1275 	.destroy_device = destroy_device,
1276 };
1277 
1278 /*
1279  * This is a thread that wakes up periodically to print stats if the user has
1280  * enabled them.
1281  */
1282 static void *
1283 print_stats(__rte_unused void *arg)
1284 {
1285 	struct vhost_dev *vdev;
1286 	uint64_t tx_dropped, rx_dropped;
1287 	uint64_t tx, tx_total, rx, rx_total;
1288 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1289 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1290 
1291 	while(1) {
1292 		sleep(enable_stats);
1293 
1294 		/* Clear screen and move to top left */
1295 		printf("%s%s\n", clr, top_left);
1296 		printf("Device statistics =================================\n");
1297 
1298 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1299 			tx_total   = vdev->stats.tx_total;
1300 			tx         = vdev->stats.tx;
1301 			tx_dropped = tx_total - tx;
1302 
1303 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1304 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1305 			rx_dropped = rx_total - rx;
1306 
1307 			printf("Statistics for device %d\n"
1308 				"-----------------------\n"
1309 				"TX total:              %" PRIu64 "\n"
1310 				"TX dropped:            %" PRIu64 "\n"
1311 				"TX successful:         %" PRIu64 "\n"
1312 				"RX total:              %" PRIu64 "\n"
1313 				"RX dropped:            %" PRIu64 "\n"
1314 				"RX successful:         %" PRIu64 "\n",
1315 				vdev->vid,
1316 				tx_total, tx_dropped, tx,
1317 				rx_total, rx_dropped, rx);
1318 		}
1319 
1320 		printf("===================================================\n");
1321 
1322 		fflush(stdout);
1323 	}
1324 
1325 	return NULL;
1326 }
1327 
1328 static void
1329 unregister_drivers(int socket_num)
1330 {
1331 	int i, ret;
1332 
1333 	for (i = 0; i < socket_num; i++) {
1334 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1335 		if (ret != 0)
1336 			RTE_LOG(ERR, VHOST_CONFIG,
1337 				"Fail to unregister vhost driver for %s.\n",
1338 				socket_files + i * PATH_MAX);
1339 	}
1340 }
1341 
1342 /* When we receive an INT signal, unregister the vhost driver. */
1343 static void
1344 sigint_handler(__rte_unused int signum)
1345 {
1346 	/* Unregister vhost driver. */
1347 	unregister_drivers(nb_sockets);
1348 
1349 	exit(0);
1350 }
1351 
1352 /*
1353  * While creating an mbuf pool, one key thing is to figure out how
1354  * many mbuf entries are enough for our use. FYI, here are some
1355  * guidelines:
1356  *
1357  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1358  *
1359  * - For each switch core (a CPU core that does the packet switching), we
1360  *   also need to make some reservation for receiving the packets from the
1361  *   virtio Tx queue. How many is enough depends on the usage. It's normally
1362  *   a simple calculation like the following:
1363  *
1364  *       MAX_PKT_BURST * max packet size / mbuf size
1365  *
1366  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1367  *
1368  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1369  *   mbufs for receiving the packets from the physical NIC device.
1370  *
1371  * - We also need to make sure that, for each switch core, we have
1372  *   allocated enough mbufs to fill up the mbuf cache.
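 *
 *   For example (an illustrative calculation, assuming MAX_PKT_BURST is 32
 *   and the default mbuf data size of 2048 bytes plus 128 bytes headroom):
 *   with mergeable buffers and TSO disabled (mtu = 1500), nr_rx_desc = 1024
 *   and a 128-entry mbuf cache, nr_mbufs_per_core comes out to roughly
 *   (1500 + 2176) * 32 / 2048 + 1024 ~= 1081 mbufs per switch core.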
1373  */
1374 static void
1375 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1376 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1377 {
1378 	uint32_t nr_mbufs;
1379 	uint32_t nr_mbufs_per_core;
1380 	uint32_t mtu = 1500;
1381 
1382 	if (mergeable)
1383 		mtu = 9000;
1384 	if (enable_tso)
1385 		mtu = 64 * 1024;
1386 
1387 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1388 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1389 	nr_mbufs_per_core += nr_rx_desc;
1390 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1391 
1392 	nr_mbufs  = nr_queues * nr_rx_desc;
1393 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1394 	nr_mbufs *= nr_port;
1395 
1396 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1397 					    nr_mbuf_cache, 0, mbuf_size,
1398 					    rte_socket_id());
1399 	if (mbuf_pool == NULL)
1400 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1401 }
1402 
1403 /*
1404  * Main function, does initialisation and calls the per-lcore functions.
1405  */
1406 int
1407 main(int argc, char *argv[])
1408 {
1409 	unsigned lcore_id, core_id = 0;
1410 	unsigned nb_ports, valid_num_ports;
1411 	int ret, i;
1412 	uint16_t portid;
1413 	static pthread_t tid;
1414 	uint64_t flags = 0;
1415 
1416 	signal(SIGINT, sigint_handler);
1417 
1418 	/* init EAL */
1419 	ret = rte_eal_init(argc, argv);
1420 	if (ret < 0)
1421 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1422 	argc -= ret;
1423 	argv += ret;
1424 
1425 	/* parse app arguments */
1426 	ret = us_vhost_parse_args(argc, argv);
1427 	if (ret < 0)
1428 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1429 
1430 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1431 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1432 
1433 		if (rte_lcore_is_enabled(lcore_id))
1434 			lcore_ids[core_id++] = lcore_id;
1435 	}
1436 
1437 	if (rte_lcore_count() > RTE_MAX_LCORE)
1438 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1439 
1440 	/* Get the number of physical ports. */
1441 	nb_ports = rte_eth_dev_count_avail();
1442 
1443 	/*
1444 	 * Update the global variable num_ports and the global array ports,
1445 	 * and get valid_num_ports according to the number of system ports.
1446 	 */
1447 	valid_num_ports = check_ports_num(nb_ports);
1448 
1449 	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1450 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1451 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1452 		return -1;
1453 	}
1454 
1455 	/*
1456 	 * FIXME: here we are trying to allocate mbufs big enough for
1457 	 * @MAX_QUEUES, but the truth is we're never going to use that
1458 	 * many queues here. We probably should only do allocation for
1459 	 * those queues we are going to use.
1460 	 */
1461 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1462 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1463 
1464 	if (vm2vm_mode == VM2VM_HARDWARE) {
1465 		/* Enable VT loopback so the NIC's L2 switch can forward VM2VM traffic. */
1466 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1467 		RTE_LOG(DEBUG, VHOST_CONFIG,
1468 			"Enable loop back for L2 switch in vmdq.\n");
1469 	}
1470 
1471 	/* initialize all ports */
1472 	RTE_ETH_FOREACH_DEV(portid) {
1473 		/* skip ports that are not enabled */
1474 		if ((enabled_port_mask & (1 << portid)) == 0) {
1475 			RTE_LOG(INFO, VHOST_PORT,
1476 				"Skipping disabled port %d\n", portid);
1477 			continue;
1478 		}
1479 		if (port_init(portid) != 0)
1480 			rte_exit(EXIT_FAILURE,
1481 				"Cannot initialize network ports\n");
1482 	}
1483 
1484 	/* Enable stats if the user option is set. */
1485 	if (enable_stats) {
1486 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1487 					print_stats, NULL);
1488 		if (ret < 0)
1489 			rte_exit(EXIT_FAILURE,
1490 				"Cannot create print-stats thread\n");
1491 	}
1492 
1493 	/* Launch all data cores. */
1494 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1495 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1496 
1497 	if (client_mode)
1498 		flags |= RTE_VHOST_USER_CLIENT;
1499 
1500 	/* Register vhost user driver to handle vhost messages. */
1501 	for (i = 0; i < nb_sockets; i++) {
1502 		char *file = socket_files + i * PATH_MAX;
1503 		ret = rte_vhost_driver_register(file, flags);
1504 		if (ret != 0) {
1505 			unregister_drivers(i);
1506 			rte_exit(EXIT_FAILURE,
1507 				"vhost driver register failure.\n");
1508 		}
1509 
1510 		if (builtin_net_driver)
1511 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1512 
1513 		if (mergeable == 0) {
1514 			rte_vhost_driver_disable_features(file,
1515 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1516 		}
1517 
1518 		if (enable_tx_csum == 0) {
1519 			rte_vhost_driver_disable_features(file,
1520 				1ULL << VIRTIO_NET_F_CSUM);
1521 		}
1522 
1523 		if (enable_tso == 0) {
1524 			rte_vhost_driver_disable_features(file,
1525 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1526 			rte_vhost_driver_disable_features(file,
1527 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1528 			rte_vhost_driver_disable_features(file,
1529 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1530 			rte_vhost_driver_disable_features(file,
1531 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1532 		}
1533 
1534 		if (promiscuous) {
1535 			rte_vhost_driver_enable_features(file,
1536 				1ULL << VIRTIO_NET_F_CTRL_RX);
1537 		}
1538 
1539 		ret = rte_vhost_driver_callback_register(file,
1540 			&virtio_net_device_ops);
1541 		if (ret != 0) {
1542 			rte_exit(EXIT_FAILURE,
1543 				"failed to register vhost driver callbacks.\n");
1544 		}
1545 
1546 		if (rte_vhost_driver_start(file) < 0) {
1547 			rte_exit(EXIT_FAILURE,
1548 				"failed to start vhost driver.\n");
1549 		}
1550 	}
1551 
1552 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1553 		rte_eal_wait_lcore(lcore_id);
1554 
1555 	return 0;
1556 
1557 }
1558