xref: /dpdk/examples/vhost/main.c (revision 5dba3b9c4c131b88a78bcecfef39db23ebc47873)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Max number of devices. Limited by vmdq. */
59 #define MAX_DEVICES 64
60 
61 /* Size of buffers used for snprintfs. */
62 #define MAX_PRINT_BUFF 6072
63 
64 /* Maximum long option length for option parsing. */
65 #define MAX_LONG_OPT_SZ 64
66 
67 /* mask of enabled ports */
68 static uint32_t enabled_port_mask = 0;
69 
70 /* Promiscuous mode */
71 static uint32_t promiscuous;
72 
73 /* number of devices/queues to support */
74 static uint32_t num_queues = 0;
75 static uint32_t num_devices;
76 
77 static struct rte_mempool *mbuf_pool;
78 static int mergeable;
79 
80 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
81 typedef enum {
82 	VM2VM_DISABLED = 0,
83 	VM2VM_SOFTWARE = 1,
84 	VM2VM_HARDWARE = 2,
85 	VM2VM_LAST
86 } vm2vm_type;
87 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
88 
89 /* Enable stats. */
90 static uint32_t enable_stats = 0;
91 /* Enable retries on RX. */
92 static uint32_t enable_retry = 1;
93 
94 /* Disable TX checksum offload */
95 static uint32_t enable_tx_csum;
96 
97 /* Disable TSO offload */
98 static uint32_t enable_tso;
99 
100 static int client_mode;
101 static int dequeue_zero_copy;
102 
103 static int builtin_net_driver;
104 
105 /* Specify the timeout (in microseconds) between retries on RX. */
106 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
107 /* Specify the number of retries on RX. */
108 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
109 
110 /* Socket file paths. Can be set by user */
111 static char *socket_files;
112 static int nb_sockets;
113 
114 /* empty VMDQ configuration structure. Filled in programmatically */
115 static struct rte_eth_conf vmdq_conf_default = {
116 	.rxmode = {
117 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
118 		.split_hdr_size = 0,
119 		.header_split   = 0, /**< Header Split disabled */
120 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
121 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
122 		/*
123 		 * VLAN stripping is necessary for 1G NICs such as the I350;
124 		 * it fixes a bug where IPv4 forwarding in the guest cannot
125 		 * forward packets from one virtio device to another.
126 		 */
127 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
128 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
129 		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
130 	},
131 
132 	.txmode = {
133 		.mq_mode = ETH_MQ_TX_NONE,
134 	},
135 	.rx_adv_conf = {
136 		/*
137 		 * should be overridden separately in code with
138 		 * appropriate values
139 		 */
140 		.vmdq_rx_conf = {
141 			.nb_queue_pools = ETH_8_POOLS,
142 			.enable_default_pool = 0,
143 			.default_pool = 0,
144 			.nb_pool_maps = 0,
145 			.pool_map = {{0, 0},},
146 		},
147 	},
148 };
149 
150 static unsigned lcore_ids[RTE_MAX_LCORE];
151 static uint16_t ports[RTE_MAX_ETHPORTS];
152 static unsigned num_ports = 0; /**< The number of ports specified in command line */
153 static uint16_t num_pf_queues, num_vmdq_queues;
154 static uint16_t vmdq_pool_base, vmdq_queue_base;
155 static uint16_t queues_per_pool;
156 
157 const uint16_t vlan_tags[] = {
158 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
159 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
160 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
161 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
162 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
163 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
164 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
165 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
166 };
167 
168 /* ethernet addresses of ports */
169 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
170 
171 static struct vhost_dev_tailq_list vhost_dev_list =
172 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
173 
174 static struct lcore_info lcore_info[RTE_MAX_LCORE];
175 
176 /* Used for queueing bursts of TX packets. */
177 struct mbuf_table {
178 	unsigned len;
179 	unsigned txq_id;
180 	struct rte_mbuf *m_table[MAX_PKT_BURST];
181 };
182 
183 /* TX queue for each data core. */
184 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
185 
186 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
187 				 / US_PER_S * BURST_TX_DRAIN_US)
188 #define VLAN_HLEN       4
189 
190 /*
191  * Builds up the correct configuration for VMDQ VLAN pool map
192  * according to the pool & queue limits.
193  */
194 static inline int
195 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
196 {
197 	struct rte_eth_vmdq_rx_conf conf;
198 	struct rte_eth_vmdq_rx_conf *def_conf =
199 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
200 	unsigned i;
201 
202 	memset(&conf, 0, sizeof(conf));
203 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
204 	conf.nb_pool_maps = num_devices;
205 	conf.enable_loop_back = def_conf->enable_loop_back;
206 	conf.rx_mode = def_conf->rx_mode;
207 
208 	for (i = 0; i < conf.nb_pool_maps; i++) {
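	/*
	 * Map one VLAN tag to each pool: pool i receives frames tagged with
	 * vlan_tags[i], giving every virtio device a dedicated VMDQ pool.
	 */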
209 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
210 		conf.pool_map[i].pools = (1UL << i);
211 	}
212 
213 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
214 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
215 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
216 	return 0;
217 }
218 
219 /*
220  * Validate the device number against the max pool number obtained from
221  * dev_info. If the device number is invalid, print an error message and
222  * return -1. Each device must have its own pool.
223  */
224 static inline int
225 validate_num_devices(uint32_t max_nb_devices)
226 {
227 	if (num_devices > max_nb_devices) {
228 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
229 		return -1;
230 	}
231 	return 0;
232 }
233 
234 /*
235  * Initialises a given port using global settings and with the rx buffers
236  * coming from the mbuf_pool passed as parameter
237  */
238 static inline int
239 port_init(uint16_t port)
240 {
241 	struct rte_eth_dev_info dev_info;
242 	struct rte_eth_conf port_conf;
243 	struct rte_eth_rxconf *rxconf;
244 	struct rte_eth_txconf *txconf;
245 	int16_t rx_rings, tx_rings;
246 	uint16_t rx_ring_size, tx_ring_size;
247 	int retval;
248 	uint16_t q;
249 
250 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
251 	rte_eth_dev_info_get(port, &dev_info);
252 
253 	if (dev_info.max_rx_queues > MAX_QUEUES) {
254 		rte_exit(EXIT_FAILURE,
255 			"please define MAX_QUEUES no less than %u in %s\n",
256 			dev_info.max_rx_queues, __FILE__);
257 	}
258 
259 	rxconf = &dev_info.default_rxconf;
260 	txconf = &dev_info.default_txconf;
261 	rxconf->rx_drop_en = 1;
262 
263 	/* Enable vlan offload */
264 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
265 
266 	/* Configure the number of supported virtio devices based on VMDQ limits. */
267 	num_devices = dev_info.max_vmdq_pools;
268 
269 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
270 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
271 
272 	/*
273 	 * When dequeue zero copy is enabled, guest Tx used vring will be
274 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
275 	 * (tx_ring_size here) must be small enough so that the driver will
276 	 * hit the free threshold easily and free mbufs timely. Otherwise,
277 	 * guest Tx vring would be starved.
278 	 */
279 	if (dequeue_zero_copy)
280 		tx_ring_size = 64;
281 
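	/* One TX queue per lcore, so each switching core can transmit without locking. */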
282 	tx_rings = (uint16_t)rte_lcore_count();
283 
284 	retval = validate_num_devices(MAX_DEVICES);
285 	if (retval < 0)
286 		return retval;
287 
288 	/* Get port configuration. */
289 	retval = get_eth_conf(&port_conf, num_devices);
290 	if (retval < 0)
291 		return retval;
292 	/* NIC queues are divided into pf queues and vmdq queues.  */
293 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
294 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
295 	num_vmdq_queues = num_devices * queues_per_pool;
296 	num_queues = num_pf_queues + num_vmdq_queues;
297 	vmdq_queue_base = dev_info.vmdq_queue_base;
298 	vmdq_pool_base  = dev_info.vmdq_pool_base;
299 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
300 		num_pf_queues, num_devices, queues_per_pool);
301 
302 	if (port >= rte_eth_dev_count()) return -1;
303 
304 	rx_rings = (uint16_t)dev_info.max_rx_queues;
305 	/* Configure ethernet device. */
306 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
307 	if (retval != 0) {
308 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
309 			port, strerror(-retval));
310 		return retval;
311 	}
312 
313 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
314 		&tx_ring_size);
315 	if (retval != 0) {
316 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
317 			"for port %u: %s.\n", port, strerror(-retval));
318 		return retval;
319 	}
320 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
321 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
322 			"for Rx queues on port %u.\n", port);
323 		return -1;
324 	}
325 
326 	/* Setup the queues. */
327 	for (q = 0; q < rx_rings; q ++) {
328 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
329 						rte_eth_dev_socket_id(port),
330 						rxconf,
331 						mbuf_pool);
332 		if (retval < 0) {
333 			RTE_LOG(ERR, VHOST_PORT,
334 				"Failed to setup rx queue %u of port %u: %s.\n",
335 				q, port, strerror(-retval));
336 			return retval;
337 		}
338 	}
339 	for (q = 0; q < tx_rings; q ++) {
340 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
341 						rte_eth_dev_socket_id(port),
342 						txconf);
343 		if (retval < 0) {
344 			RTE_LOG(ERR, VHOST_PORT,
345 				"Failed to setup tx queue %u of port %u: %s.\n",
346 				q, port, strerror(-retval));
347 			return retval;
348 		}
349 	}
350 
351 	/* Start the device. */
352 	retval  = rte_eth_dev_start(port);
353 	if (retval < 0) {
354 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
355 			port, strerror(-retval));
356 		return retval;
357 	}
358 
359 	if (promiscuous)
360 		rte_eth_promiscuous_enable(port);
361 
362 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
363 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
364 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
365 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
366 			port,
367 			vmdq_ports_eth_addr[port].addr_bytes[0],
368 			vmdq_ports_eth_addr[port].addr_bytes[1],
369 			vmdq_ports_eth_addr[port].addr_bytes[2],
370 			vmdq_ports_eth_addr[port].addr_bytes[3],
371 			vmdq_ports_eth_addr[port].addr_bytes[4],
372 			vmdq_ports_eth_addr[port].addr_bytes[5]);
373 
374 	return 0;
375 }
376 
377 /*
378  * Set socket file path.
379  */
380 static int
381 us_vhost_parse_socket_path(const char *q_arg)
382 {
383 	/* Reject socket paths that do not fit within PATH_MAX. */
384 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
385 		return -1;
386 
387 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL)
		return -1;
388 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
389 	nb_sockets++;
390 
391 	return 0;
392 }
393 
394 /*
395  * Parse the portmask provided at run time.
396  */
397 static int
398 parse_portmask(const char *portmask)
399 {
400 	char *end = NULL;
401 	unsigned long pm;
402 
403 	errno = 0;
404 
405 	/* parse hexadecimal string */
406 	pm = strtoul(portmask, &end, 16);
407 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
408 		return -1;
409 
410 	if (pm == 0)
411 		return -1;
412 
413 	return pm;
414 
415 }
416 
417 /*
418  * Parse num options at run time.
419  */
420 static int
421 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
422 {
423 	char *end = NULL;
424 	unsigned long num;
425 
426 	errno = 0;
427 
428 	/* parse unsigned int string */
429 	num = strtoul(q_arg, &end, 10);
430 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
431 		return -1;
432 
433 	if (num > max_valid_value)
434 		return -1;
435 
436 	return num;
437 
438 }
439 
440 /*
441  * Display usage
442  */
443 static void
444 us_vhost_usage(const char *prgname)
445 {
446 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
447 	"		--vm2vm [0|1|2]\n"
448 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
449 	"		--socket-file <path>\n"
450 	"		--nb-devices ND\n"
451 	"		-p PORTMASK: Set mask for ports to be used by application\n"
452 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
453 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
454 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
455 	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
456 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
457 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
458 	"		--socket-file: The path of the socket file.\n"
459 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
460 	"		--tso [0|1] disable/enable TCP segment offload.\n"
461 	"		--client register a vhost-user socket as client mode.\n"
462 	"		--dequeue-zero-copy enables dequeue zero copy\n",
463 	       prgname);
464 }
465 
466 /*
467  * Parse the arguments given in the command line of the application.
468  */
469 static int
470 us_vhost_parse_args(int argc, char **argv)
471 {
472 	int opt, ret;
473 	int option_index;
474 	unsigned i;
475 	const char *prgname = argv[0];
476 	static struct option long_option[] = {
477 		{"vm2vm", required_argument, NULL, 0},
478 		{"rx-retry", required_argument, NULL, 0},
479 		{"rx-retry-delay", required_argument, NULL, 0},
480 		{"rx-retry-num", required_argument, NULL, 0},
481 		{"mergeable", required_argument, NULL, 0},
482 		{"stats", required_argument, NULL, 0},
483 		{"socket-file", required_argument, NULL, 0},
484 		{"tx-csum", required_argument, NULL, 0},
485 		{"tso", required_argument, NULL, 0},
486 		{"client", no_argument, &client_mode, 1},
487 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
488 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
489 		{NULL, 0, 0, 0},
490 	};
491 
492 	/* Parse command line */
493 	while ((opt = getopt_long(argc, argv, "p:P",
494 			long_option, &option_index)) != EOF) {
495 		switch (opt) {
496 		/* Portmask */
497 		case 'p':
498 			enabled_port_mask = parse_portmask(optarg);
499 			if (enabled_port_mask == 0) {
500 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
501 				us_vhost_usage(prgname);
502 				return -1;
503 			}
504 			break;
505 
506 		case 'P':
507 			promiscuous = 1;
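			/* In promiscuous mode, also accept broadcast and multicast frames in each VMDQ pool. */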
508 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
509 				ETH_VMDQ_ACCEPT_BROADCAST |
510 				ETH_VMDQ_ACCEPT_MULTICAST;
511 
512 			break;
513 
514 		case 0:
515 			/* Enable/disable vm2vm comms. */
516 			if (!strncmp(long_option[option_index].name, "vm2vm",
517 				MAX_LONG_OPT_SZ)) {
518 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
519 				if (ret == -1) {
520 					RTE_LOG(INFO, VHOST_CONFIG,
521 						"Invalid argument for "
522 						"vm2vm [0|1|2]\n");
523 					us_vhost_usage(prgname);
524 					return -1;
525 				} else {
526 					vm2vm_mode = (vm2vm_type)ret;
527 				}
528 			}
529 
530 			/* Enable/disable retries on RX. */
531 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
532 				ret = parse_num_opt(optarg, 1);
533 				if (ret == -1) {
534 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
535 					us_vhost_usage(prgname);
536 					return -1;
537 				} else {
538 					enable_retry = ret;
539 				}
540 			}
541 
542 			/* Enable/disable TX checksum offload. */
543 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
544 				ret = parse_num_opt(optarg, 1);
545 				if (ret == -1) {
546 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
547 					us_vhost_usage(prgname);
548 					return -1;
549 				} else
550 					enable_tx_csum = ret;
551 			}
552 
553 			/* Enable/disable TSO offload. */
554 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
555 				ret = parse_num_opt(optarg, 1);
556 				if (ret == -1) {
557 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
558 					us_vhost_usage(prgname);
559 					return -1;
560 				} else
561 					enable_tso = ret;
562 			}
563 
564 			/* Specify the retry delay time (in microseconds) on RX. */
565 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
566 				ret = parse_num_opt(optarg, INT32_MAX);
567 				if (ret == -1) {
568 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
569 					us_vhost_usage(prgname);
570 					return -1;
571 				} else {
572 					burst_rx_delay_time = ret;
573 				}
574 			}
575 
576 			/* Specify the retries number on RX. */
577 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
578 				ret = parse_num_opt(optarg, INT32_MAX);
579 				if (ret == -1) {
580 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
581 					us_vhost_usage(prgname);
582 					return -1;
583 				} else {
584 					burst_rx_retry_num = ret;
585 				}
586 			}
587 
588 			/* Enable/disable RX mergeable buffers. */
589 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
590 				ret = parse_num_opt(optarg, 1);
591 				if (ret == -1) {
592 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
593 					us_vhost_usage(prgname);
594 					return -1;
595 				} else {
596 					mergeable = !!ret;
597 					if (ret) {
598 						vmdq_conf_default.rxmode.jumbo_frame = 1;
599 						vmdq_conf_default.rxmode.max_rx_pkt_len
600 							= JUMBO_FRAME_MAX_SIZE;
601 					}
602 				}
603 			}
604 
605 			/* Enable/disable stats. */
606 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
607 				ret = parse_num_opt(optarg, INT32_MAX);
608 				if (ret == -1) {
609 					RTE_LOG(INFO, VHOST_CONFIG,
610 						"Invalid argument for stats [0..N]\n");
611 					us_vhost_usage(prgname);
612 					return -1;
613 				} else {
614 					enable_stats = ret;
615 				}
616 			}
617 
618 			/* Set socket file path. */
619 			if (!strncmp(long_option[option_index].name,
620 						"socket-file", MAX_LONG_OPT_SZ)) {
621 				if (us_vhost_parse_socket_path(optarg) == -1) {
622 					RTE_LOG(INFO, VHOST_CONFIG,
623 					"Invalid argument for socket name (Max %d characters)\n",
624 					PATH_MAX);
625 					us_vhost_usage(prgname);
626 					return -1;
627 				}
628 			}
629 
630 			break;
631 
632 			/* Invalid option - print options. */
633 		default:
634 			us_vhost_usage(prgname);
635 			return -1;
636 		}
637 	}
638 
639 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
640 		if (enabled_port_mask & (1 << i))
641 			ports[num_ports++] = i;
642 	}
643 
644 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
645 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
646 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
647 		return -1;
648 	}
649 
650 	return 0;
651 }
652 
653 /*
654  * Update the global variable num_ports and the array ports according to the
655  * number of ports in the system, and return the number of valid ports.
656  */
657 static unsigned check_ports_num(unsigned nb_ports)
658 {
659 	unsigned valid_num_ports = num_ports;
660 	unsigned portid;
661 
662 	if (num_ports > nb_ports) {
663 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
664 			num_ports, nb_ports);
665 		num_ports = nb_ports;
666 	}
667 
668 	for (portid = 0; portid < num_ports; portid ++) {
669 		if (ports[portid] >= nb_ports) {
670 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
671 				ports[portid], (nb_ports - 1));
672 			ports[portid] = INVALID_PORT_ID;
673 			valid_num_ports--;
674 		}
675 	}
676 	return valid_num_ports;
677 }
678 
679 static __rte_always_inline struct vhost_dev *
680 find_vhost_dev(struct ether_addr *mac)
681 {
682 	struct vhost_dev *vdev;
683 
684 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
685 		if (vdev->ready == DEVICE_RX &&
686 		    is_same_ether_addr(mac, &vdev->mac_address))
687 			return vdev;
688 	}
689 
690 	return NULL;
691 }
692 
693 /*
694  * This function learns the MAC address of the device and registers this along with a
695  * vlan tag to a VMDQ.
696  */
697 static int
698 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
699 {
700 	struct ether_hdr *pkt_hdr;
701 	int i, ret;
702 
703 	/* Learn MAC address of guest device from packet */
704 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
705 
706 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
707 		RTE_LOG(ERR, VHOST_DATA,
708 			"(%d) device is using a registered MAC!\n",
709 			vdev->vid);
710 		return -1;
711 	}
712 
713 	for (i = 0; i < ETHER_ADDR_LEN; i++)
714 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
715 
716 	/* vlan_tag currently uses the device_id. */
717 	vdev->vlan_tag = vlan_tags[vdev->vid];
718 
719 	/* Print out VMDQ registration info. */
720 	RTE_LOG(INFO, VHOST_DATA,
721 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
722 		vdev->vid,
723 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
724 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
725 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
726 		vdev->vlan_tag);
727 
728 	/* Register the MAC address. */
729 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
730 				(uint32_t)vdev->vid + vmdq_pool_base);
731 	if (ret)
732 		RTE_LOG(ERR, VHOST_DATA,
733 			"(%d) failed to add device MAC address to VMDQ\n",
734 			vdev->vid);
735 
736 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
737 
738 	/* Set device as ready for RX. */
739 	vdev->ready = DEVICE_RX;
740 
741 	return 0;
742 }
743 
744 /*
745  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
746  * queue before disabling RX on the device.
747  */
748 static inline void
749 unlink_vmdq(struct vhost_dev *vdev)
750 {
751 	unsigned i = 0;
752 	unsigned rx_count;
753 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
754 
755 	if (vdev->ready == DEVICE_RX) {
756 		/*clear MAC and VLAN settings*/
757 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
758 		for (i = 0; i < 6; i++)
759 			vdev->mac_address.addr_bytes[i] = 0;
760 
761 		vdev->vlan_tag = 0;
762 
763 		/*Clear out the receive buffers*/
764 		rx_count = rte_eth_rx_burst(ports[0],
765 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
766 
767 		while (rx_count) {
768 			for (i = 0; i < rx_count; i++)
769 				rte_pktmbuf_free(pkts_burst[i]);
770 
771 			rx_count = rte_eth_rx_burst(ports[0],
772 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
773 		}
774 
775 		vdev->ready = DEVICE_MAC_LEARNING;
776 	}
777 }
778 
779 static __rte_always_inline void
780 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
781 	    struct rte_mbuf *m)
782 {
783 	uint16_t ret;
784 
785 	if (builtin_net_driver) {
786 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
787 	} else {
788 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
789 	}
790 
791 	if (enable_stats) {
792 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
793 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
794 		src_vdev->stats.tx_total++;
795 		src_vdev->stats.tx += ret;
796 	}
797 }
798 
799 /*
800  * Check if the packet destination MAC address is for a local device. If so, put
801  * the packet on that device's RX queue. If not, return.
802  */
803 static __rte_always_inline int
804 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
805 {
806 	struct ether_hdr *pkt_hdr;
807 	struct vhost_dev *dst_vdev;
808 
809 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
810 
811 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
812 	if (!dst_vdev)
813 		return -1;
814 
815 	if (vdev->vid == dst_vdev->vid) {
816 		RTE_LOG_DP(DEBUG, VHOST_DATA,
817 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
818 			vdev->vid);
819 		return 0;
820 	}
821 
822 	RTE_LOG_DP(DEBUG, VHOST_DATA,
823 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
824 
825 	if (unlikely(dst_vdev->remove)) {
826 		RTE_LOG_DP(DEBUG, VHOST_DATA,
827 			"(%d) device is marked for removal\n", dst_vdev->vid);
828 		return 0;
829 	}
830 
831 	virtio_xmit(dst_vdev, vdev, m);
832 	return 0;
833 }
834 
835 /*
836  * Check if the destination MAC of a packet belongs to a local VM,
837  * and if so get its vlan tag and the length offset to restore.
838  */
839 static __rte_always_inline int
840 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
841 	uint32_t *offset, uint16_t *vlan_tag)
842 {
843 	struct vhost_dev *dst_vdev;
844 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
845 
846 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
847 	if (!dst_vdev)
848 		return 0;
849 
850 	if (vdev->vid == dst_vdev->vid) {
851 		RTE_LOG_DP(DEBUG, VHOST_DATA,
852 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
853 			vdev->vid);
854 		return -1;
855 	}
856 
857 	/*
858 	 * HW vlan strip reduces the packet length by the length of the
859 	 * vlan tag, so the packet length needs to be restored by adding
860 	 * it back.
861 	 */
862 	*offset  = VLAN_HLEN;
863 	*vlan_tag = vlan_tags[vdev->vid];
864 
865 	RTE_LOG_DP(DEBUG, VHOST_DATA,
866 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
867 		vdev->vid, dst_vdev->vid, *vlan_tag);
868 
869 	return 0;
870 }
871 
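/*
 * Compute the pseudo-header checksum that the NIC expects to find in the
 * TCP checksum field when checksum/TSO offload is requested.
 */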
872 static uint16_t
873 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
874 {
875 	if (ol_flags & PKT_TX_IPV4)
876 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
877 	else /* assume ethertype == ETHER_TYPE_IPv6 */
878 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
879 }
880 
881 static void virtio_tx_offload(struct rte_mbuf *m)
882 {
883 	void *l3_hdr;
884 	struct ipv4_hdr *ipv4_hdr = NULL;
885 	struct tcp_hdr *tcp_hdr = NULL;
886 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
887 
888 	l3_hdr = (char *)eth_hdr + m->l2_len;
889 
890 	if (m->ol_flags & PKT_TX_IPV4) {
891 		ipv4_hdr = l3_hdr;
892 		ipv4_hdr->hdr_checksum = 0;
893 		m->ol_flags |= PKT_TX_IP_CKSUM;
894 	}
895 
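	/* For TSO, the TCP checksum field must hold the pseudo-header checksum. */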
896 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
897 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
898 }
899 
900 static inline void
901 free_pkts(struct rte_mbuf **pkts, uint16_t n)
902 {
903 	while (n--)
904 		rte_pktmbuf_free(pkts[n]);
905 }
906 
907 static __rte_always_inline void
908 do_drain_mbuf_table(struct mbuf_table *tx_q)
909 {
910 	uint16_t count;
911 
912 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
913 				 tx_q->m_table, tx_q->len);
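	/* Free any packets the NIC did not accept to avoid leaking mbufs. */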
914 	if (unlikely(count < tx_q->len))
915 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
916 
917 	tx_q->len = 0;
918 }
919 
920 /*
921  * This function routes the TX packet to the correct interface. This
922  * may be a local device or the physical port.
923  */
924 static __rte_always_inline void
925 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
926 {
927 	struct mbuf_table *tx_q;
928 	unsigned offset = 0;
929 	const uint16_t lcore_id = rte_lcore_id();
930 	struct ether_hdr *nh;
931 
932 
933 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
934 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
935 		struct vhost_dev *vdev2;
936 
937 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
938 			if (vdev2 != vdev)
939 				virtio_xmit(vdev2, vdev, m);
940 		}
941 		goto queue2nic;
942 	}
943 
944 	/*check if destination is local VM*/
945 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
946 		rte_pktmbuf_free(m);
947 		return;
948 	}
949 
950 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
951 		if (unlikely(find_local_dest(vdev, m, &offset,
952 					     &vlan_tag) != 0)) {
953 			rte_pktmbuf_free(m);
954 			return;
955 		}
956 	}
957 
958 	RTE_LOG_DP(DEBUG, VHOST_DATA,
959 		"(%d) TX: MAC address is external\n", vdev->vid);
960 
961 queue2nic:
962 
963 	/*Add packet to the port tx queue*/
964 	tx_q = &lcore_tx_queue[lcore_id];
965 
966 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
967 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
968 		/* Guest has inserted the vlan tag. */
969 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
970 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
971 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
972 			(vh->vlan_tci != vlan_tag_be))
973 			vh->vlan_tci = vlan_tag_be;
974 	} else {
975 		m->ol_flags |= PKT_TX_VLAN_PKT;
976 
977 		/*
978 		 * Find the right seg to adjust the data len when offset is
979 		 * bigger than tail room size.
980 		 */
981 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
982 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
983 				m->data_len += offset;
984 			else {
985 				struct rte_mbuf *seg = m;
986 
987 				while ((seg->next != NULL) &&
988 					(offset > rte_pktmbuf_tailroom(seg)))
989 					seg = seg->next;
990 
991 				seg->data_len += offset;
992 			}
993 			m->pkt_len += offset;
994 		}
995 
996 		m->vlan_tci = vlan_tag;
997 	}
998 
999 	if (m->ol_flags & PKT_TX_TCP_SEG)
1000 		virtio_tx_offload(m);
1001 
1002 	tx_q->m_table[tx_q->len++] = m;
1003 	if (enable_stats) {
1004 		vdev->stats.tx_total++;
1005 		vdev->stats.tx++;
1006 	}
1007 
1008 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1009 		do_drain_mbuf_table(tx_q);
1010 }
1011 
1012 
1013 static __rte_always_inline void
1014 drain_mbuf_table(struct mbuf_table *tx_q)
1015 {
1016 	static uint64_t prev_tsc;
1017 	uint64_t cur_tsc;
1018 
1019 	if (tx_q->len == 0)
1020 		return;
1021 
1022 	cur_tsc = rte_rdtsc();
1023 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1024 		prev_tsc = cur_tsc;
1025 
1026 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1027 			"TX queue drained after timeout with burst size %u\n",
1028 			tx_q->len);
1029 		do_drain_mbuf_table(tx_q);
1030 	}
1031 }
1032 
1033 static __rte_always_inline void
1034 drain_eth_rx(struct vhost_dev *vdev)
1035 {
1036 	uint16_t rx_count, enqueue_count;
1037 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1038 
1039 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1040 				    pkts, MAX_PKT_BURST);
1041 	if (!rx_count)
1042 		return;
1043 
1044 	/*
1045 	 * When "enable_retry" is set, here we wait and retry when there
1046 	 * are not enough free slots in the queue to hold @rx_count packets,
1047 	 * to diminish packet loss.
1048 	 */
1049 	if (enable_retry &&
1050 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1051 			VIRTIO_RXQ))) {
1052 		uint32_t retry;
1053 
1054 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1055 			rte_delay_us(burst_rx_delay_time);
1056 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1057 					VIRTIO_RXQ))
1058 				break;
1059 		}
1060 	}
1061 
1062 	if (builtin_net_driver) {
1063 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1064 						pkts, rx_count);
1065 	} else {
1066 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1067 						pkts, rx_count);
1068 	}
1069 	if (enable_stats) {
1070 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1071 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1072 	}
1073 
1074 	free_pkts(pkts, rx_count);
1075 }
1076 
1077 static __rte_always_inline void
1078 drain_virtio_tx(struct vhost_dev *vdev)
1079 {
1080 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1081 	uint16_t count;
1082 	uint16_t i;
1083 
1084 	if (builtin_net_driver) {
1085 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1086 					pkts, MAX_PKT_BURST);
1087 	} else {
1088 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1089 					mbuf_pool, pkts, MAX_PKT_BURST);
1090 	}
1091 
1092 	/* setup VMDq for the first packet */
1093 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1094 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1095 			free_pkts(pkts, count);
1096 	}
1097 
1098 	for (i = 0; i < count; ++i)
1099 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1100 }
1101 
1102 /*
1103  * Main function of vhost-switch. It basically does:
1104  *
1105  * for each vhost device {
1106  *    - drain_eth_rx()
1107  *
1108  *      Which drains the host eth Rx queue linked to the vhost device,
1109  *      and delivers all of them to the guest virtio Rx ring associated
1110  *      with this vhost device.
1111  *
1112  *    - drain_virtio_tx()
1113  *
1114  *      Which drains the guest virtio Tx queue and delivers all of them
1115  *      to the target, which could be another vhost device, or the
1116  *      physical eth dev. The route is done in function "virtio_tx_route".
1117  * }
1118  */
1119 static int
1120 switch_worker(void *arg __rte_unused)
1121 {
1122 	unsigned i;
1123 	unsigned lcore_id = rte_lcore_id();
1124 	struct vhost_dev *vdev;
1125 	struct mbuf_table *tx_q;
1126 
1127 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1128 
1129 	tx_q = &lcore_tx_queue[lcore_id];
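	/* The i-th enabled lcore owns TX queue i; find this lcore's index. */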
1130 	for (i = 0; i < rte_lcore_count(); i++) {
1131 		if (lcore_ids[i] == lcore_id) {
1132 			tx_q->txq_id = i;
1133 			break;
1134 		}
1135 	}
1136 
1137 	while(1) {
1138 		drain_mbuf_table(tx_q);
1139 
1140 		/*
1141 		 * Inform the configuration core that we have exited the
1142 		 * linked list and that no devices are in use if requested.
1143 		 */
1144 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1145 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1146 
1147 		/*
1148 		 * Process vhost devices
1149 		 */
1150 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1151 			      lcore_vdev_entry) {
1152 			if (unlikely(vdev->remove)) {
1153 				unlink_vmdq(vdev);
1154 				vdev->ready = DEVICE_SAFE_REMOVE;
1155 				continue;
1156 			}
1157 
1158 			if (likely(vdev->ready == DEVICE_RX))
1159 				drain_eth_rx(vdev);
1160 
1161 			if (likely(!vdev->remove))
1162 				drain_virtio_tx(vdev);
1163 		}
1164 	}
1165 
1166 	return 0;
1167 }
1168 
1169 /*
1170  * Remove a device from the specific data core linked list and from the
1171  * main linked list. Synchronization occurs through the use of the
1172  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1173  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1174  */
1175 static void
1176 destroy_device(int vid)
1177 {
1178 	struct vhost_dev *vdev = NULL;
1179 	int lcore;
1180 
1181 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1182 		if (vdev->vid == vid)
1183 			break;
1184 	}
1185 	if (!vdev)
1186 		return;
1187 	/*set the remove flag. */
1188 	vdev->remove = 1;
1189 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1190 		rte_pause();
1191 	}
1192 
1193 	if (builtin_net_driver)
1194 		vs_vhost_net_remove(vdev);
1195 
1196 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1197 		     lcore_vdev_entry);
1198 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1199 
1200 
1201 	/* Set the dev_removal_flag on each lcore. */
1202 	RTE_LCORE_FOREACH_SLAVE(lcore)
1203 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1204 
1205 	/*
1206 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1207 	 * we can be sure that they can no longer access the device removed
1208 	 * from the linked lists and that the devices are no longer in use.
1209 	 */
1210 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1211 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1212 			rte_pause();
1213 	}
1214 
1215 	lcore_info[vdev->coreid].device_num--;
1216 
1217 	RTE_LOG(INFO, VHOST_DATA,
1218 		"(%d) device has been removed from data core\n",
1219 		vdev->vid);
1220 
1221 	rte_free(vdev);
1222 }
1223 
1224 /*
1225  * A new device is added to a data core. First the device is added to the main linked list
1226  * and then allocated to a specific data core.
1227  */
1228 static int
1229 new_device(int vid)
1230 {
1231 	int lcore, core_add = 0;
1232 	uint32_t device_num_min = num_devices;
1233 	struct vhost_dev *vdev;
1234 
1235 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1236 	if (vdev == NULL) {
1237 		RTE_LOG(INFO, VHOST_DATA,
1238 			"(%d) couldn't allocate memory for vhost dev\n",
1239 			vid);
1240 		return -1;
1241 	}
1242 	vdev->vid = vid;
1243 
1244 	if (builtin_net_driver)
1245 		vs_vhost_net_setup(vdev);
1246 
1247 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
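	/* Each device gets its own VMDQ pool; use the first RX queue of that pool. */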
1248 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1249 
1250 	/*reset ready flag*/
1251 	vdev->ready = DEVICE_MAC_LEARNING;
1252 	vdev->remove = 0;
1253 
1254 	/* Find a suitable lcore to add the device. */
1255 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1256 		if (lcore_info[lcore].device_num < device_num_min) {
1257 			device_num_min = lcore_info[lcore].device_num;
1258 			core_add = lcore;
1259 		}
1260 	}
1261 	vdev->coreid = core_add;
1262 
1263 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1264 			  lcore_vdev_entry);
1265 	lcore_info[vdev->coreid].device_num++;
1266 
1267 	/* Disable notifications. */
1268 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1269 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1270 
1271 	RTE_LOG(INFO, VHOST_DATA,
1272 		"(%d) device has been added to data core %d\n",
1273 		vid, vdev->coreid);
1274 
1275 	return 0;
1276 }
1277 
1278 /*
1279  * These callbacks allow devices to be added to the data core when configuration
1280  * has been fully completed.
1281  */
1282 static const struct vhost_device_ops virtio_net_device_ops =
1283 {
1284 	.new_device =  new_device,
1285 	.destroy_device = destroy_device,
1286 };
1287 
1288 /*
1289  * This is a thread that wakes up after a period to print stats if the user
1290  * has enabled them.
1291  */
1292 static void
1293 print_stats(void)
1294 {
1295 	struct vhost_dev *vdev;
1296 	uint64_t tx_dropped, rx_dropped;
1297 	uint64_t tx, tx_total, rx, rx_total;
1298 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1299 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1300 
1301 	while(1) {
1302 		sleep(enable_stats);
1303 
1304 		/* Clear screen and move to top left */
1305 		printf("%s%s\n", clr, top_left);
1306 		printf("Device statistics =================================\n");
1307 
1308 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1309 			tx_total   = vdev->stats.tx_total;
1310 			tx         = vdev->stats.tx;
1311 			tx_dropped = tx_total - tx;
1312 
1313 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1314 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1315 			rx_dropped = rx_total - rx;
1316 
1317 			printf("Statistics for device %d\n"
1318 				"-----------------------\n"
1319 				"TX total:              %" PRIu64 "\n"
1320 				"TX dropped:            %" PRIu64 "\n"
1321 				"TX successful:         %" PRIu64 "\n"
1322 				"RX total:              %" PRIu64 "\n"
1323 				"RX dropped:            %" PRIu64 "\n"
1324 				"RX successful:         %" PRIu64 "\n",
1325 				vdev->vid,
1326 				tx_total, tx_dropped, tx,
1327 				rx_total, rx_dropped, rx);
1328 		}
1329 
1330 		printf("===================================================\n");
1331 	}
1332 }
1333 
1334 static void
1335 unregister_drivers(int socket_num)
1336 {
1337 	int i, ret;
1338 
1339 	for (i = 0; i < socket_num; i++) {
1340 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1341 		if (ret != 0)
1342 			RTE_LOG(ERR, VHOST_CONFIG,
1343 				"Failed to unregister vhost driver for %s.\n",
1344 				socket_files + i * PATH_MAX);
1345 	}
1346 }
1347 
1348 /* When we receive an INT signal, unregister the vhost driver */
1349 static void
1350 sigint_handler(__rte_unused int signum)
1351 {
1352 	/* Unregister vhost driver. */
1353 	unregister_drivers(nb_sockets);
1354 
1355 	exit(0);
1356 }
1357 
1358 /*
1359  * While creating an mbuf pool, one key thing is to figure out how
1360  * many mbuf entries are enough for our use. FYI, here are some
1361  * guidelines:
1362  *
1363  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1364  *
1365  * - For each switch core (a CPU core that does the packet switching),
1366  *   we also need to reserve some mbufs for receiving the packets from
1367  *   the virtio Tx queue. How many are enough depends on the usage. It's
1368  *   normally a simple calculation like the following:
1369  *
1370  *       MAX_PKT_BURST * max packet size / mbuf size
1371  *
1372  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1373  *
1374  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1375  *   mbufs for receiving the packets from the physical NIC device.
1376  *
1377  * - We also need to make sure, for each switch core, we have allocated
1378  *   enough mbufs to fill up the mbuf cache.
1379  */
1380 static void
1381 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1382 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1383 {
1384 	uint32_t nr_mbufs;
1385 	uint32_t nr_mbufs_per_core;
1386 	uint32_t mtu = 1500;
1387 
1388 	if (mergeable)
1389 		mtu = 9000;
1390 	if (enable_tso)
1391 		mtu = 64 * 1024;
1392 
1393 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1394 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1395 	nr_mbufs_per_core += nr_rx_desc;
1396 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1397 
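	/* Per-queue RX descriptors plus the per-core reservation, scaled by the number of ports. */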
1398 	nr_mbufs  = nr_queues * nr_rx_desc;
1399 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1400 	nr_mbufs *= nr_port;
1401 
1402 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1403 					    nr_mbuf_cache, 0, mbuf_size,
1404 					    rte_socket_id());
1405 	if (mbuf_pool == NULL)
1406 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1407 }
1408 
1409 /*
1410  * Main function, does initialisation and calls the per-lcore functions.
1411  */
1412 int
1413 main(int argc, char *argv[])
1414 {
1415 	unsigned lcore_id, core_id = 0;
1416 	unsigned nb_ports, valid_num_ports;
1417 	int ret, i;
1418 	uint16_t portid;
1419 	static pthread_t tid;
1420 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1421 	uint64_t flags = 0;
1422 
1423 	signal(SIGINT, sigint_handler);
1424 
1425 	/* init EAL */
1426 	ret = rte_eal_init(argc, argv);
1427 	if (ret < 0)
1428 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1429 	argc -= ret;
1430 	argv += ret;
1431 
1432 	/* parse app arguments */
1433 	ret = us_vhost_parse_args(argc, argv);
1434 	if (ret < 0)
1435 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1436 
1437 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1438 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1439 
1440 		if (rte_lcore_is_enabled(lcore_id))
1441 			lcore_ids[core_id++] = lcore_id;
1442 	}
1443 
1444 	if (rte_lcore_count() > RTE_MAX_LCORE)
1445 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1446 
1447 	/* Get the number of physical ports. */
1448 	nb_ports = rte_eth_dev_count();
1449 
1450 	/*
1451 	 * Update the global variable num_ports and the global array ports, and
1452 	 * get the value of valid_num_ports according to the number of system ports.
1453 	 */
1454 	valid_num_ports = check_ports_num(nb_ports);
1455 
1456 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1457 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1458 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1459 		return -1;
1460 	}
1461 
1462 	/*
1463 	 * FIXME: here we are trying to allocate mbufs big enough for
1464 	 * @MAX_QUEUES, but the truth is we're never going to use that
1465 	 * many queues here. We probably should only do allocation for
1466 	 * those queues we are going to use.
1467 	 */
1468 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1469 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1470 
1471 	if (vm2vm_mode == VM2VM_HARDWARE) {
1472 		/* Enable VT loop back to let L2 switch to do it. */
1473 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1474 		RTE_LOG(DEBUG, VHOST_CONFIG,
1475 			"Enable loop back for L2 switch in vmdq.\n");
1476 	}
1477 
1478 	/* initialize all ports */
1479 	for (portid = 0; portid < nb_ports; portid++) {
1480 		/* skip ports that are not enabled */
1481 		if ((enabled_port_mask & (1 << portid)) == 0) {
1482 			RTE_LOG(INFO, VHOST_PORT,
1483 				"Skipping disabled port %d\n", portid);
1484 			continue;
1485 		}
1486 		if (port_init(portid) != 0)
1487 			rte_exit(EXIT_FAILURE,
1488 				"Cannot initialize network ports\n");
1489 	}
1490 
1491 	/* Enable stats if the user option is set. */
1492 	if (enable_stats) {
1493 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1494 		if (ret != 0)
1495 			rte_exit(EXIT_FAILURE,
1496 				"Cannot create print-stats thread\n");
1497 
1498 		/* Set thread_name for aid in debugging.  */
1499 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1500 		ret = rte_thread_setname(tid, thread_name);
1501 		if (ret != 0)
1502 			RTE_LOG(DEBUG, VHOST_CONFIG,
1503 				"Cannot set print-stats name\n");
1504 	}
1505 
1506 	/* Launch all data cores. */
1507 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1508 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1509 
1510 	if (client_mode)
1511 		flags |= RTE_VHOST_USER_CLIENT;
1512 
1513 	if (dequeue_zero_copy)
1514 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1515 
1516 	/* Register vhost user driver to handle vhost messages. */
1517 	for (i = 0; i < nb_sockets; i++) {
1518 		char *file = socket_files + i * PATH_MAX;
1519 		ret = rte_vhost_driver_register(file, flags);
1520 		if (ret != 0) {
1521 			unregister_drivers(i);
1522 			rte_exit(EXIT_FAILURE,
1523 				"vhost driver register failure.\n");
1524 		}
1525 
1526 		if (builtin_net_driver)
1527 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1528 
1529 		if (mergeable == 0) {
1530 			rte_vhost_driver_disable_features(file,
1531 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1532 		}
1533 
1534 		if (enable_tx_csum == 0) {
1535 			rte_vhost_driver_disable_features(file,
1536 				1ULL << VIRTIO_NET_F_CSUM);
1537 		}
1538 
1539 		if (enable_tso == 0) {
1540 			rte_vhost_driver_disable_features(file,
1541 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1542 			rte_vhost_driver_disable_features(file,
1543 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1544 			rte_vhost_driver_disable_features(file,
1545 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1546 			rte_vhost_driver_disable_features(file,
1547 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1548 		}
1549 
1550 		if (promiscuous) {
1551 			rte_vhost_driver_enable_features(file,
1552 				1ULL << VIRTIO_NET_F_CTRL_RX);
1553 		}
1554 
1555 		ret = rte_vhost_driver_callback_register(file,
1556 			&virtio_net_device_ops);
1557 		if (ret != 0) {
1558 			rte_exit(EXIT_FAILURE,
1559 				"failed to register vhost driver callbacks.\n");
1560 		}
1561 
1562 		if (rte_vhost_driver_start(file) < 0) {
1563 			rte_exit(EXIT_FAILURE,
1564 				"failed to start vhost driver.\n");
1565 		}
1566 	}
1567 
1568 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1569 		rte_eal_wait_lcore(lcore_id);
1570 
1571 	return 0;
1572 
1573 }
1574