xref: /dpdk/examples/vhost/main.c (revision 8b9bd0efe0b6920a08e28eebacf2bb916bdf5653)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Max number of devices. Limited by vmdq. */
59 #define MAX_DEVICES 64
60 
61 /* Size of buffers used for snprintfs. */
62 #define MAX_PRINT_BUFF 6072
63 
64 /* Maximum long option length for option parsing. */
65 #define MAX_LONG_OPT_SZ 64
66 
67 /* mask of enabled ports */
68 static uint32_t enabled_port_mask = 0;
69 
70 /* Promiscuous mode */
71 static uint32_t promiscuous;
72 
73 /* number of devices/queues to support*/
74 static uint32_t num_queues = 0;
75 static uint32_t num_devices;
76 
77 static struct rte_mempool *mbuf_pool;
78 static int mergeable;
79 
80 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
81 typedef enum {
82 	VM2VM_DISABLED = 0,
83 	VM2VM_SOFTWARE = 1,
84 	VM2VM_HARDWARE = 2,
85 	VM2VM_LAST
86 } vm2vm_type;
87 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
88 
89 /* Enable stats. */
90 static uint32_t enable_stats = 0;
91 /* Enable retries on RX. */
92 static uint32_t enable_retry = 1;
93 
94 /* Disable TX checksum offload */
95 static uint32_t enable_tx_csum;
96 
97 /* Disable TSO offload */
98 static uint32_t enable_tso;
99 
100 static int client_mode;
101 static int dequeue_zero_copy;
102 
103 static int builtin_net_driver;
104 
105 /* Specify timeout (in useconds) between retries on RX. */
106 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
107 /* Specify the number of retries on RX. */
108 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
109 
110 /* Socket file paths. Can be set by user */
111 static char *socket_files;
112 static int nb_sockets;
113 
114 /* empty vmdq configuration structure. Filled in programatically */
115 static struct rte_eth_conf vmdq_conf_default = {
116 	.rxmode = {
117 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
118 		.split_hdr_size = 0,
119 		.ignore_offload_bitfield = 1,
120 		/*
121 		 * VLAN strip is necessary for 1G NIC such as I350,
122 		 * this fixes bug of ipv4 forwarding in guest can't
123 		 * forward pakets from one virtio dev to another virtio dev.
124 		 */
125 		.offloads = (DEV_RX_OFFLOAD_CRC_STRIP |
126 			     DEV_RX_OFFLOAD_VLAN_STRIP),
127 	},
128 
129 	.txmode = {
130 		.mq_mode = ETH_MQ_TX_NONE,
131 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
132 			     DEV_TX_OFFLOAD_TCP_CKSUM |
133 			     DEV_TX_OFFLOAD_VLAN_INSERT |
134 			     DEV_TX_OFFLOAD_MULTI_SEGS |
135 			     DEV_TX_OFFLOAD_TCP_TSO),
136 	},
137 	.rx_adv_conf = {
138 		/*
139 		 * should be overridden separately in code with
140 		 * appropriate values
141 		 */
142 		.vmdq_rx_conf = {
143 			.nb_queue_pools = ETH_8_POOLS,
144 			.enable_default_pool = 0,
145 			.default_pool = 0,
146 			.nb_pool_maps = 0,
147 			.pool_map = {{0, 0},},
148 		},
149 	},
150 };
151 
152 
153 static unsigned lcore_ids[RTE_MAX_LCORE];
154 static uint16_t ports[RTE_MAX_ETHPORTS];
155 static unsigned num_ports = 0; /**< The number of ports specified in command line */
156 static uint16_t num_pf_queues, num_vmdq_queues;
157 static uint16_t vmdq_pool_base, vmdq_queue_base;
158 static uint16_t queues_per_pool;
159 
160 const uint16_t vlan_tags[] = {
161 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
162 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
163 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
164 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
165 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
166 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
167 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
168 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
169 };
170 
171 /* ethernet addresses of ports */
172 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
173 
174 static struct vhost_dev_tailq_list vhost_dev_list =
175 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
176 
177 static struct lcore_info lcore_info[RTE_MAX_LCORE];
178 
179 /* Used for queueing bursts of TX packets. */
180 struct mbuf_table {
181 	unsigned len;
182 	unsigned txq_id;
183 	struct rte_mbuf *m_table[MAX_PKT_BURST];
184 };
185 
186 /* TX queue for each data core. */
187 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
188 
189 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
190 				 / US_PER_S * BURST_TX_DRAIN_US)
191 #define VLAN_HLEN       4
192 
193 /*
194  * Builds up the correct configuration for VMDQ VLAN pool map
195  * according to the pool & queue limits.
196  */
197 static inline int
198 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
199 {
200 	struct rte_eth_vmdq_rx_conf conf;
201 	struct rte_eth_vmdq_rx_conf *def_conf =
202 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
203 	unsigned i;
204 
205 	memset(&conf, 0, sizeof(conf));
206 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
207 	conf.nb_pool_maps = num_devices;
208 	conf.enable_loop_back = def_conf->enable_loop_back;
209 	conf.rx_mode = def_conf->rx_mode;
210 
211 	for (i = 0; i < conf.nb_pool_maps; i++) {
212 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
213 		conf.pool_map[i].pools = (1UL << i);
214 	}
215 
216 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
217 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
218 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
219 	return 0;
220 }
221 
222 /*
223  * Validate the device number according to the max pool number gotten form
224  * dev_info. If the device number is invalid, give the error message and
225  * return -1. Each device must have its own pool.
226  */
227 static inline int
228 validate_num_devices(uint32_t max_nb_devices)
229 {
230 	if (num_devices > max_nb_devices) {
231 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
232 		return -1;
233 	}
234 	return 0;
235 }
236 
237 /*
238  * Initialises a given port using global settings and with the rx buffers
239  * coming from the mbuf_pool passed as parameter
240  */
241 static inline int
242 port_init(uint16_t port)
243 {
244 	struct rte_eth_dev_info dev_info;
245 	struct rte_eth_conf port_conf;
246 	struct rte_eth_rxconf *rxconf;
247 	struct rte_eth_txconf *txconf;
248 	int16_t rx_rings, tx_rings;
249 	uint16_t rx_ring_size, tx_ring_size;
250 	int retval;
251 	uint16_t q;
252 
253 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
254 	rte_eth_dev_info_get (port, &dev_info);
255 
256 	rxconf = &dev_info.default_rxconf;
257 	txconf = &dev_info.default_txconf;
258 	rxconf->rx_drop_en = 1;
259 	txconf->txq_flags = ETH_TXQ_FLAGS_IGNORE;
260 
261 	/*configure the number of supported virtio devices based on VMDQ limits */
262 	num_devices = dev_info.max_vmdq_pools;
263 
264 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
265 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
266 
267 	/*
268 	 * When dequeue zero copy is enabled, guest Tx used vring will be
269 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
270 	 * (tx_ring_size here) must be small enough so that the driver will
271 	 * hit the free threshold easily and free mbufs timely. Otherwise,
272 	 * guest Tx vring would be starved.
273 	 */
274 	if (dequeue_zero_copy)
275 		tx_ring_size = 64;
276 
277 	tx_rings = (uint16_t)rte_lcore_count();
278 
279 	retval = validate_num_devices(MAX_DEVICES);
280 	if (retval < 0)
281 		return retval;
282 
283 	/* Get port configuration. */
284 	retval = get_eth_conf(&port_conf, num_devices);
285 	if (retval < 0)
286 		return retval;
287 	/* NIC queues are divided into pf queues and vmdq queues.  */
288 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
289 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
290 	num_vmdq_queues = num_devices * queues_per_pool;
291 	num_queues = num_pf_queues + num_vmdq_queues;
292 	vmdq_queue_base = dev_info.vmdq_queue_base;
293 	vmdq_pool_base  = dev_info.vmdq_pool_base;
294 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
295 		num_pf_queues, num_devices, queues_per_pool);
296 
297 	if (port >= rte_eth_dev_count()) return -1;
298 
299 	rx_rings = (uint16_t)dev_info.max_rx_queues;
300 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
301 		port_conf.txmode.offloads |=
302 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
303 	/* Configure ethernet device. */
304 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
305 	if (retval != 0) {
306 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
307 			port, strerror(-retval));
308 		return retval;
309 	}
310 
311 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
312 		&tx_ring_size);
313 	if (retval != 0) {
314 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
315 			"for port %u: %s.\n", port, strerror(-retval));
316 		return retval;
317 	}
318 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
319 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
320 			"for Rx queues on port %u.\n", port);
321 		return -1;
322 	}
323 
324 	/* Setup the queues. */
325 	rxconf->offloads = port_conf.rxmode.offloads;
326 	for (q = 0; q < rx_rings; q ++) {
327 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
328 						rte_eth_dev_socket_id(port),
329 						rxconf,
330 						mbuf_pool);
331 		if (retval < 0) {
332 			RTE_LOG(ERR, VHOST_PORT,
333 				"Failed to setup rx queue %u of port %u: %s.\n",
334 				q, port, strerror(-retval));
335 			return retval;
336 		}
337 	}
338 	txconf->offloads = port_conf.txmode.offloads;
339 	for (q = 0; q < tx_rings; q ++) {
340 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
341 						rte_eth_dev_socket_id(port),
342 						txconf);
343 		if (retval < 0) {
344 			RTE_LOG(ERR, VHOST_PORT,
345 				"Failed to setup tx queue %u of port %u: %s.\n",
346 				q, port, strerror(-retval));
347 			return retval;
348 		}
349 	}
350 
351 	/* Start the device. */
352 	retval  = rte_eth_dev_start(port);
353 	if (retval < 0) {
354 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
355 			port, strerror(-retval));
356 		return retval;
357 	}
358 
359 	if (promiscuous)
360 		rte_eth_promiscuous_enable(port);
361 
362 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
363 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
364 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
365 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
366 			port,
367 			vmdq_ports_eth_addr[port].addr_bytes[0],
368 			vmdq_ports_eth_addr[port].addr_bytes[1],
369 			vmdq_ports_eth_addr[port].addr_bytes[2],
370 			vmdq_ports_eth_addr[port].addr_bytes[3],
371 			vmdq_ports_eth_addr[port].addr_bytes[4],
372 			vmdq_ports_eth_addr[port].addr_bytes[5]);
373 
374 	return 0;
375 }
376 
377 /*
378  * Set socket file path.
379  */
380 static int
381 us_vhost_parse_socket_path(const char *q_arg)
382 {
383 	/* parse number string */
384 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
385 		return -1;
386 
387 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
388 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
389 	nb_sockets++;
390 
391 	return 0;
392 }
393 
394 /*
395  * Parse the portmask provided at run time.
396  */
397 static int
398 parse_portmask(const char *portmask)
399 {
400 	char *end = NULL;
401 	unsigned long pm;
402 
403 	errno = 0;
404 
405 	/* parse hexadecimal string */
406 	pm = strtoul(portmask, &end, 16);
407 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
408 		return -1;
409 
410 	if (pm == 0)
411 		return -1;
412 
413 	return pm;
414 
415 }
416 
417 /*
418  * Parse num options at run time.
419  */
420 static int
421 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
422 {
423 	char *end = NULL;
424 	unsigned long num;
425 
426 	errno = 0;
427 
428 	/* parse unsigned int string */
429 	num = strtoul(q_arg, &end, 10);
430 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
431 		return -1;
432 
433 	if (num > max_valid_value)
434 		return -1;
435 
436 	return num;
437 
438 }
439 
440 /*
441  * Display usage
442  */
443 static void
444 us_vhost_usage(const char *prgname)
445 {
446 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
447 	"		--vm2vm [0|1|2]\n"
448 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
449 	"		--socket-file <path>\n"
450 	"		--nb-devices ND\n"
451 	"		-p PORTMASK: Set mask for ports to be used by application\n"
452 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
453 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
454 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
455 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
456 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
457 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
458 	"		--socket-file: The path of the socket file.\n"
459 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
460 	"		--tso [0|1] disable/enable TCP segment offload.\n"
461 	"		--client register a vhost-user socket as client mode.\n"
462 	"		--dequeue-zero-copy enables dequeue zero copy\n",
463 	       prgname);
464 }
465 
466 /*
467  * Parse the arguments given in the command line of the application.
468  */
469 static int
470 us_vhost_parse_args(int argc, char **argv)
471 {
472 	int opt, ret;
473 	int option_index;
474 	unsigned i;
475 	const char *prgname = argv[0];
476 	static struct option long_option[] = {
477 		{"vm2vm", required_argument, NULL, 0},
478 		{"rx-retry", required_argument, NULL, 0},
479 		{"rx-retry-delay", required_argument, NULL, 0},
480 		{"rx-retry-num", required_argument, NULL, 0},
481 		{"mergeable", required_argument, NULL, 0},
482 		{"stats", required_argument, NULL, 0},
483 		{"socket-file", required_argument, NULL, 0},
484 		{"tx-csum", required_argument, NULL, 0},
485 		{"tso", required_argument, NULL, 0},
486 		{"client", no_argument, &client_mode, 1},
487 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
488 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
489 		{NULL, 0, 0, 0},
490 	};
491 
492 	/* Parse command line */
493 	while ((opt = getopt_long(argc, argv, "p:P",
494 			long_option, &option_index)) != EOF) {
495 		switch (opt) {
496 		/* Portmask */
497 		case 'p':
498 			enabled_port_mask = parse_portmask(optarg);
499 			if (enabled_port_mask == 0) {
500 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
501 				us_vhost_usage(prgname);
502 				return -1;
503 			}
504 			break;
505 
506 		case 'P':
507 			promiscuous = 1;
508 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
509 				ETH_VMDQ_ACCEPT_BROADCAST |
510 				ETH_VMDQ_ACCEPT_MULTICAST;
511 
512 			break;
513 
514 		case 0:
515 			/* Enable/disable vm2vm comms. */
516 			if (!strncmp(long_option[option_index].name, "vm2vm",
517 				MAX_LONG_OPT_SZ)) {
518 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
519 				if (ret == -1) {
520 					RTE_LOG(INFO, VHOST_CONFIG,
521 						"Invalid argument for "
522 						"vm2vm [0|1|2]\n");
523 					us_vhost_usage(prgname);
524 					return -1;
525 				} else {
526 					vm2vm_mode = (vm2vm_type)ret;
527 				}
528 			}
529 
530 			/* Enable/disable retries on RX. */
531 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
532 				ret = parse_num_opt(optarg, 1);
533 				if (ret == -1) {
534 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
535 					us_vhost_usage(prgname);
536 					return -1;
537 				} else {
538 					enable_retry = ret;
539 				}
540 			}
541 
542 			/* Enable/disable TX checksum offload. */
543 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
544 				ret = parse_num_opt(optarg, 1);
545 				if (ret == -1) {
546 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
547 					us_vhost_usage(prgname);
548 					return -1;
549 				} else
550 					enable_tx_csum = ret;
551 			}
552 
553 			/* Enable/disable TSO offload. */
554 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
555 				ret = parse_num_opt(optarg, 1);
556 				if (ret == -1) {
557 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
558 					us_vhost_usage(prgname);
559 					return -1;
560 				} else
561 					enable_tso = ret;
562 			}
563 
564 			/* Specify the retries delay time (in useconds) on RX. */
565 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
566 				ret = parse_num_opt(optarg, INT32_MAX);
567 				if (ret == -1) {
568 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
569 					us_vhost_usage(prgname);
570 					return -1;
571 				} else {
572 					burst_rx_delay_time = ret;
573 				}
574 			}
575 
576 			/* Specify the retries number on RX. */
577 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
578 				ret = parse_num_opt(optarg, INT32_MAX);
579 				if (ret == -1) {
580 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
581 					us_vhost_usage(prgname);
582 					return -1;
583 				} else {
584 					burst_rx_retry_num = ret;
585 				}
586 			}
587 
588 			/* Enable/disable RX mergeable buffers. */
589 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
590 				ret = parse_num_opt(optarg, 1);
591 				if (ret == -1) {
592 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
593 					us_vhost_usage(prgname);
594 					return -1;
595 				} else {
596 					mergeable = !!ret;
597 					if (ret) {
598 						vmdq_conf_default.rxmode.offloads |=
599 							DEV_RX_OFFLOAD_JUMBO_FRAME;
600 						vmdq_conf_default.rxmode.max_rx_pkt_len
601 							= JUMBO_FRAME_MAX_SIZE;
602 					}
603 				}
604 			}
605 
606 			/* Enable/disable stats. */
607 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
608 				ret = parse_num_opt(optarg, INT32_MAX);
609 				if (ret == -1) {
610 					RTE_LOG(INFO, VHOST_CONFIG,
611 						"Invalid argument for stats [0..N]\n");
612 					us_vhost_usage(prgname);
613 					return -1;
614 				} else {
615 					enable_stats = ret;
616 				}
617 			}
618 
619 			/* Set socket file path. */
620 			if (!strncmp(long_option[option_index].name,
621 						"socket-file", MAX_LONG_OPT_SZ)) {
622 				if (us_vhost_parse_socket_path(optarg) == -1) {
623 					RTE_LOG(INFO, VHOST_CONFIG,
624 					"Invalid argument for socket name (Max %d characters)\n",
625 					PATH_MAX);
626 					us_vhost_usage(prgname);
627 					return -1;
628 				}
629 			}
630 
631 			break;
632 
633 			/* Invalid option - print options. */
634 		default:
635 			us_vhost_usage(prgname);
636 			return -1;
637 		}
638 	}
639 
640 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
641 		if (enabled_port_mask & (1 << i))
642 			ports[num_ports++] = i;
643 	}
644 
645 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
646 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
647 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
648 		return -1;
649 	}
650 
651 	return 0;
652 }
653 
654 /*
655  * Update the global var NUM_PORTS and array PORTS according to system ports number
656  * and return valid ports number
657  */
658 static unsigned check_ports_num(unsigned nb_ports)
659 {
660 	unsigned valid_num_ports = num_ports;
661 	unsigned portid;
662 
663 	if (num_ports > nb_ports) {
664 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
665 			num_ports, nb_ports);
666 		num_ports = nb_ports;
667 	}
668 
669 	for (portid = 0; portid < num_ports; portid ++) {
670 		if (ports[portid] >= nb_ports) {
671 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
672 				ports[portid], (nb_ports - 1));
673 			ports[portid] = INVALID_PORT_ID;
674 			valid_num_ports--;
675 		}
676 	}
677 	return valid_num_ports;
678 }
679 
680 static __rte_always_inline struct vhost_dev *
681 find_vhost_dev(struct ether_addr *mac)
682 {
683 	struct vhost_dev *vdev;
684 
685 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
686 		if (vdev->ready == DEVICE_RX &&
687 		    is_same_ether_addr(mac, &vdev->mac_address))
688 			return vdev;
689 	}
690 
691 	return NULL;
692 }
693 
694 /*
695  * This function learns the MAC address of the device and registers this along with a
696  * vlan tag to a VMDQ.
697  */
698 static int
699 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
700 {
701 	struct ether_hdr *pkt_hdr;
702 	int i, ret;
703 
704 	/* Learn MAC address of guest device from packet */
705 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
706 
707 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
708 		RTE_LOG(ERR, VHOST_DATA,
709 			"(%d) device is using a registered MAC!\n",
710 			vdev->vid);
711 		return -1;
712 	}
713 
714 	for (i = 0; i < ETHER_ADDR_LEN; i++)
715 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
716 
717 	/* vlan_tag currently uses the device_id. */
718 	vdev->vlan_tag = vlan_tags[vdev->vid];
719 
720 	/* Print out VMDQ registration info. */
721 	RTE_LOG(INFO, VHOST_DATA,
722 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
723 		vdev->vid,
724 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
725 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
726 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
727 		vdev->vlan_tag);
728 
729 	/* Register the MAC address. */
730 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
731 				(uint32_t)vdev->vid + vmdq_pool_base);
732 	if (ret)
733 		RTE_LOG(ERR, VHOST_DATA,
734 			"(%d) failed to add device MAC address to VMDQ\n",
735 			vdev->vid);
736 
737 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
738 
739 	/* Set device as ready for RX. */
740 	vdev->ready = DEVICE_RX;
741 
742 	return 0;
743 }
744 
745 /*
746  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
747  * queue before disabling RX on the device.
748  */
749 static inline void
750 unlink_vmdq(struct vhost_dev *vdev)
751 {
752 	unsigned i = 0;
753 	unsigned rx_count;
754 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
755 
756 	if (vdev->ready == DEVICE_RX) {
757 		/*clear MAC and VLAN settings*/
758 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
759 		for (i = 0; i < 6; i++)
760 			vdev->mac_address.addr_bytes[i] = 0;
761 
762 		vdev->vlan_tag = 0;
763 
764 		/*Clear out the receive buffers*/
765 		rx_count = rte_eth_rx_burst(ports[0],
766 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
767 
768 		while (rx_count) {
769 			for (i = 0; i < rx_count; i++)
770 				rte_pktmbuf_free(pkts_burst[i]);
771 
772 			rx_count = rte_eth_rx_burst(ports[0],
773 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
774 		}
775 
776 		vdev->ready = DEVICE_MAC_LEARNING;
777 	}
778 }
779 
780 static __rte_always_inline void
781 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
782 	    struct rte_mbuf *m)
783 {
784 	uint16_t ret;
785 
786 	if (builtin_net_driver) {
787 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
788 	} else {
789 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
790 	}
791 
792 	if (enable_stats) {
793 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
794 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
795 		src_vdev->stats.tx_total++;
796 		src_vdev->stats.tx += ret;
797 	}
798 }
799 
800 /*
801  * Check if the packet destination MAC address is for a local device. If so then put
802  * the packet on that devices RX queue. If not then return.
803  */
804 static __rte_always_inline int
805 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
806 {
807 	struct ether_hdr *pkt_hdr;
808 	struct vhost_dev *dst_vdev;
809 
810 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
811 
812 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
813 	if (!dst_vdev)
814 		return -1;
815 
816 	if (vdev->vid == dst_vdev->vid) {
817 		RTE_LOG_DP(DEBUG, VHOST_DATA,
818 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
819 			vdev->vid);
820 		return 0;
821 	}
822 
823 	RTE_LOG_DP(DEBUG, VHOST_DATA,
824 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
825 
826 	if (unlikely(dst_vdev->remove)) {
827 		RTE_LOG_DP(DEBUG, VHOST_DATA,
828 			"(%d) device is marked for removal\n", dst_vdev->vid);
829 		return 0;
830 	}
831 
832 	virtio_xmit(dst_vdev, vdev, m);
833 	return 0;
834 }
835 
836 /*
837  * Check if the destination MAC of a packet is one local VM,
838  * and get its vlan tag, and offset if it is.
839  */
840 static __rte_always_inline int
841 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
842 	uint32_t *offset, uint16_t *vlan_tag)
843 {
844 	struct vhost_dev *dst_vdev;
845 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
846 
847 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
848 	if (!dst_vdev)
849 		return 0;
850 
851 	if (vdev->vid == dst_vdev->vid) {
852 		RTE_LOG_DP(DEBUG, VHOST_DATA,
853 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
854 			vdev->vid);
855 		return -1;
856 	}
857 
858 	/*
859 	 * HW vlan strip will reduce the packet length
860 	 * by minus length of vlan tag, so need restore
861 	 * the packet length by plus it.
862 	 */
863 	*offset  = VLAN_HLEN;
864 	*vlan_tag = vlan_tags[vdev->vid];
865 
866 	RTE_LOG_DP(DEBUG, VHOST_DATA,
867 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
868 		vdev->vid, dst_vdev->vid, *vlan_tag);
869 
870 	return 0;
871 }
872 
873 static uint16_t
874 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
875 {
876 	if (ol_flags & PKT_TX_IPV4)
877 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
878 	else /* assume ethertype == ETHER_TYPE_IPv6 */
879 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
880 }
881 
882 static void virtio_tx_offload(struct rte_mbuf *m)
883 {
884 	void *l3_hdr;
885 	struct ipv4_hdr *ipv4_hdr = NULL;
886 	struct tcp_hdr *tcp_hdr = NULL;
887 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
888 
889 	l3_hdr = (char *)eth_hdr + m->l2_len;
890 
891 	if (m->ol_flags & PKT_TX_IPV4) {
892 		ipv4_hdr = l3_hdr;
893 		ipv4_hdr->hdr_checksum = 0;
894 		m->ol_flags |= PKT_TX_IP_CKSUM;
895 	}
896 
897 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
898 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
899 }
900 
901 static inline void
902 free_pkts(struct rte_mbuf **pkts, uint16_t n)
903 {
904 	while (n--)
905 		rte_pktmbuf_free(pkts[n]);
906 }
907 
908 static __rte_always_inline void
909 do_drain_mbuf_table(struct mbuf_table *tx_q)
910 {
911 	uint16_t count;
912 
913 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
914 				 tx_q->m_table, tx_q->len);
915 	if (unlikely(count < tx_q->len))
916 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
917 
918 	tx_q->len = 0;
919 }
920 
921 /*
922  * This function routes the TX packet to the correct interface. This
923  * may be a local device or the physical port.
924  */
925 static __rte_always_inline void
926 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
927 {
928 	struct mbuf_table *tx_q;
929 	unsigned offset = 0;
930 	const uint16_t lcore_id = rte_lcore_id();
931 	struct ether_hdr *nh;
932 
933 
934 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
935 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
936 		struct vhost_dev *vdev2;
937 
938 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
939 			if (vdev2 != vdev)
940 				virtio_xmit(vdev2, vdev, m);
941 		}
942 		goto queue2nic;
943 	}
944 
945 	/*check if destination is local VM*/
946 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
947 		rte_pktmbuf_free(m);
948 		return;
949 	}
950 
951 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
952 		if (unlikely(find_local_dest(vdev, m, &offset,
953 					     &vlan_tag) != 0)) {
954 			rte_pktmbuf_free(m);
955 			return;
956 		}
957 	}
958 
959 	RTE_LOG_DP(DEBUG, VHOST_DATA,
960 		"(%d) TX: MAC address is external\n", vdev->vid);
961 
962 queue2nic:
963 
964 	/*Add packet to the port tx queue*/
965 	tx_q = &lcore_tx_queue[lcore_id];
966 
967 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
968 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
969 		/* Guest has inserted the vlan tag. */
970 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
971 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
972 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
973 			(vh->vlan_tci != vlan_tag_be))
974 			vh->vlan_tci = vlan_tag_be;
975 	} else {
976 		m->ol_flags |= PKT_TX_VLAN_PKT;
977 
978 		/*
979 		 * Find the right seg to adjust the data len when offset is
980 		 * bigger than tail room size.
981 		 */
982 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
983 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
984 				m->data_len += offset;
985 			else {
986 				struct rte_mbuf *seg = m;
987 
988 				while ((seg->next != NULL) &&
989 					(offset > rte_pktmbuf_tailroom(seg)))
990 					seg = seg->next;
991 
992 				seg->data_len += offset;
993 			}
994 			m->pkt_len += offset;
995 		}
996 
997 		m->vlan_tci = vlan_tag;
998 	}
999 
1000 	if (m->ol_flags & PKT_TX_TCP_SEG)
1001 		virtio_tx_offload(m);
1002 
1003 	tx_q->m_table[tx_q->len++] = m;
1004 	if (enable_stats) {
1005 		vdev->stats.tx_total++;
1006 		vdev->stats.tx++;
1007 	}
1008 
1009 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1010 		do_drain_mbuf_table(tx_q);
1011 }
1012 
1013 
1014 static __rte_always_inline void
1015 drain_mbuf_table(struct mbuf_table *tx_q)
1016 {
1017 	static uint64_t prev_tsc;
1018 	uint64_t cur_tsc;
1019 
1020 	if (tx_q->len == 0)
1021 		return;
1022 
1023 	cur_tsc = rte_rdtsc();
1024 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1025 		prev_tsc = cur_tsc;
1026 
1027 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1028 			"TX queue drained after timeout with burst size %u\n",
1029 			tx_q->len);
1030 		do_drain_mbuf_table(tx_q);
1031 	}
1032 }
1033 
1034 static __rte_always_inline void
1035 drain_eth_rx(struct vhost_dev *vdev)
1036 {
1037 	uint16_t rx_count, enqueue_count;
1038 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1039 
1040 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1041 				    pkts, MAX_PKT_BURST);
1042 	if (!rx_count)
1043 		return;
1044 
1045 	/*
1046 	 * When "enable_retry" is set, here we wait and retry when there
1047 	 * is no enough free slots in the queue to hold @rx_count packets,
1048 	 * to diminish packet loss.
1049 	 */
1050 	if (enable_retry &&
1051 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1052 			VIRTIO_RXQ))) {
1053 		uint32_t retry;
1054 
1055 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1056 			rte_delay_us(burst_rx_delay_time);
1057 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1058 					VIRTIO_RXQ))
1059 				break;
1060 		}
1061 	}
1062 
1063 	if (builtin_net_driver) {
1064 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1065 						pkts, rx_count);
1066 	} else {
1067 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1068 						pkts, rx_count);
1069 	}
1070 	if (enable_stats) {
1071 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1072 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1073 	}
1074 
1075 	free_pkts(pkts, rx_count);
1076 }
1077 
1078 static __rte_always_inline void
1079 drain_virtio_tx(struct vhost_dev *vdev)
1080 {
1081 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1082 	uint16_t count;
1083 	uint16_t i;
1084 
1085 	if (builtin_net_driver) {
1086 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1087 					pkts, MAX_PKT_BURST);
1088 	} else {
1089 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1090 					mbuf_pool, pkts, MAX_PKT_BURST);
1091 	}
1092 
1093 	/* setup VMDq for the first packet */
1094 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1095 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1096 			free_pkts(pkts, count);
1097 	}
1098 
1099 	for (i = 0; i < count; ++i)
1100 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1101 }
1102 
1103 /*
1104  * Main function of vhost-switch. It basically does:
1105  *
1106  * for each vhost device {
1107  *    - drain_eth_rx()
1108  *
1109  *      Which drains the host eth Rx queue linked to the vhost device,
1110  *      and deliver all of them to guest virito Rx ring associated with
1111  *      this vhost device.
1112  *
1113  *    - drain_virtio_tx()
1114  *
1115  *      Which drains the guest virtio Tx queue and deliver all of them
1116  *      to the target, which could be another vhost device, or the
1117  *      physical eth dev. The route is done in function "virtio_tx_route".
1118  * }
1119  */
1120 static int
1121 switch_worker(void *arg __rte_unused)
1122 {
1123 	unsigned i;
1124 	unsigned lcore_id = rte_lcore_id();
1125 	struct vhost_dev *vdev;
1126 	struct mbuf_table *tx_q;
1127 
1128 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1129 
1130 	tx_q = &lcore_tx_queue[lcore_id];
1131 	for (i = 0; i < rte_lcore_count(); i++) {
1132 		if (lcore_ids[i] == lcore_id) {
1133 			tx_q->txq_id = i;
1134 			break;
1135 		}
1136 	}
1137 
1138 	while(1) {
1139 		drain_mbuf_table(tx_q);
1140 
1141 		/*
1142 		 * Inform the configuration core that we have exited the
1143 		 * linked list and that no devices are in use if requested.
1144 		 */
1145 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1146 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1147 
1148 		/*
1149 		 * Process vhost devices
1150 		 */
1151 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1152 			      lcore_vdev_entry) {
1153 			if (unlikely(vdev->remove)) {
1154 				unlink_vmdq(vdev);
1155 				vdev->ready = DEVICE_SAFE_REMOVE;
1156 				continue;
1157 			}
1158 
1159 			if (likely(vdev->ready == DEVICE_RX))
1160 				drain_eth_rx(vdev);
1161 
1162 			if (likely(!vdev->remove))
1163 				drain_virtio_tx(vdev);
1164 		}
1165 	}
1166 
1167 	return 0;
1168 }
1169 
1170 /*
1171  * Remove a device from the specific data core linked list and from the
1172  * main linked list. Synchonization  occurs through the use of the
1173  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1174  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1175  */
1176 static void
1177 destroy_device(int vid)
1178 {
1179 	struct vhost_dev *vdev = NULL;
1180 	int lcore;
1181 
1182 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1183 		if (vdev->vid == vid)
1184 			break;
1185 	}
1186 	if (!vdev)
1187 		return;
1188 	/*set the remove flag. */
1189 	vdev->remove = 1;
1190 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1191 		rte_pause();
1192 	}
1193 
1194 	if (builtin_net_driver)
1195 		vs_vhost_net_remove(vdev);
1196 
1197 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1198 		     lcore_vdev_entry);
1199 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1200 
1201 
1202 	/* Set the dev_removal_flag on each lcore. */
1203 	RTE_LCORE_FOREACH_SLAVE(lcore)
1204 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1205 
1206 	/*
1207 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1208 	 * we can be sure that they can no longer access the device removed
1209 	 * from the linked lists and that the devices are no longer in use.
1210 	 */
1211 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1212 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1213 			rte_pause();
1214 	}
1215 
1216 	lcore_info[vdev->coreid].device_num--;
1217 
1218 	RTE_LOG(INFO, VHOST_DATA,
1219 		"(%d) device has been removed from data core\n",
1220 		vdev->vid);
1221 
1222 	rte_free(vdev);
1223 }
1224 
1225 /*
1226  * A new device is added to a data core. First the device is added to the main linked list
1227  * and the allocated to a specific data core.
1228  */
1229 static int
1230 new_device(int vid)
1231 {
1232 	int lcore, core_add = 0;
1233 	uint32_t device_num_min = num_devices;
1234 	struct vhost_dev *vdev;
1235 
1236 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1237 	if (vdev == NULL) {
1238 		RTE_LOG(INFO, VHOST_DATA,
1239 			"(%d) couldn't allocate memory for vhost dev\n",
1240 			vid);
1241 		return -1;
1242 	}
1243 	vdev->vid = vid;
1244 
1245 	if (builtin_net_driver)
1246 		vs_vhost_net_setup(vdev);
1247 
1248 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1249 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1250 
1251 	/*reset ready flag*/
1252 	vdev->ready = DEVICE_MAC_LEARNING;
1253 	vdev->remove = 0;
1254 
1255 	/* Find a suitable lcore to add the device. */
1256 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1257 		if (lcore_info[lcore].device_num < device_num_min) {
1258 			device_num_min = lcore_info[lcore].device_num;
1259 			core_add = lcore;
1260 		}
1261 	}
1262 	vdev->coreid = core_add;
1263 
1264 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1265 			  lcore_vdev_entry);
1266 	lcore_info[vdev->coreid].device_num++;
1267 
1268 	/* Disable notifications. */
1269 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1270 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1271 
1272 	RTE_LOG(INFO, VHOST_DATA,
1273 		"(%d) device has been added to data core %d\n",
1274 		vid, vdev->coreid);
1275 
1276 	return 0;
1277 }
1278 
1279 /*
1280  * These callback allow devices to be added to the data core when configuration
1281  * has been fully complete.
1282  */
1283 static const struct vhost_device_ops virtio_net_device_ops =
1284 {
1285 	.new_device =  new_device,
1286 	.destroy_device = destroy_device,
1287 };
1288 
1289 /*
1290  * This is a thread will wake up after a period to print stats if the user has
1291  * enabled them.
1292  */
1293 static void
1294 print_stats(void)
1295 {
1296 	struct vhost_dev *vdev;
1297 	uint64_t tx_dropped, rx_dropped;
1298 	uint64_t tx, tx_total, rx, rx_total;
1299 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1300 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1301 
1302 	while(1) {
1303 		sleep(enable_stats);
1304 
1305 		/* Clear screen and move to top left */
1306 		printf("%s%s\n", clr, top_left);
1307 		printf("Device statistics =================================\n");
1308 
1309 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1310 			tx_total   = vdev->stats.tx_total;
1311 			tx         = vdev->stats.tx;
1312 			tx_dropped = tx_total - tx;
1313 
1314 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1315 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1316 			rx_dropped = rx_total - rx;
1317 
1318 			printf("Statistics for device %d\n"
1319 				"-----------------------\n"
1320 				"TX total:              %" PRIu64 "\n"
1321 				"TX dropped:            %" PRIu64 "\n"
1322 				"TX successful:         %" PRIu64 "\n"
1323 				"RX total:              %" PRIu64 "\n"
1324 				"RX dropped:            %" PRIu64 "\n"
1325 				"RX successful:         %" PRIu64 "\n",
1326 				vdev->vid,
1327 				tx_total, tx_dropped, tx,
1328 				rx_total, rx_dropped, rx);
1329 		}
1330 
1331 		printf("===================================================\n");
1332 	}
1333 }
1334 
1335 static void
1336 unregister_drivers(int socket_num)
1337 {
1338 	int i, ret;
1339 
1340 	for (i = 0; i < socket_num; i++) {
1341 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1342 		if (ret != 0)
1343 			RTE_LOG(ERR, VHOST_CONFIG,
1344 				"Fail to unregister vhost driver for %s.\n",
1345 				socket_files + i * PATH_MAX);
1346 	}
1347 }
1348 
1349 /* When we receive a INT signal, unregister vhost driver */
1350 static void
1351 sigint_handler(__rte_unused int signum)
1352 {
1353 	/* Unregister vhost driver. */
1354 	unregister_drivers(nb_sockets);
1355 
1356 	exit(0);
1357 }
1358 
1359 /*
1360  * While creating an mbuf pool, one key thing is to figure out how
1361  * many mbuf entries is enough for our use. FYI, here are some
1362  * guidelines:
1363  *
1364  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1365  *
1366  * - For each switch core (A CPU core does the packet switch), we need
1367  *   also make some reservation for receiving the packets from virtio
1368  *   Tx queue. How many is enough depends on the usage. It's normally
1369  *   a simple calculation like following:
1370  *
1371  *       MAX_PKT_BURST * max packet size / mbuf size
1372  *
1373  *   So, we definitely need allocate more mbufs when TSO is enabled.
1374  *
1375  * - Similarly, for each switching core, we should serve @nr_rx_desc
1376  *   mbufs for receiving the packets from physical NIC device.
1377  *
1378  * - We also need make sure, for each switch core, we have allocated
1379  *   enough mbufs to fill up the mbuf cache.
1380  */
1381 static void
1382 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1383 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1384 {
1385 	uint32_t nr_mbufs;
1386 	uint32_t nr_mbufs_per_core;
1387 	uint32_t mtu = 1500;
1388 
1389 	if (mergeable)
1390 		mtu = 9000;
1391 	if (enable_tso)
1392 		mtu = 64 * 1024;
1393 
1394 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1395 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1396 	nr_mbufs_per_core += nr_rx_desc;
1397 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1398 
1399 	nr_mbufs  = nr_queues * nr_rx_desc;
1400 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1401 	nr_mbufs *= nr_port;
1402 
1403 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1404 					    nr_mbuf_cache, 0, mbuf_size,
1405 					    rte_socket_id());
1406 	if (mbuf_pool == NULL)
1407 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1408 }
1409 
1410 /*
1411  * Main function, does initialisation and calls the per-lcore functions.
1412  */
1413 int
1414 main(int argc, char *argv[])
1415 {
1416 	unsigned lcore_id, core_id = 0;
1417 	unsigned nb_ports, valid_num_ports;
1418 	int ret, i;
1419 	uint16_t portid;
1420 	static pthread_t tid;
1421 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1422 	uint64_t flags = 0;
1423 
1424 	signal(SIGINT, sigint_handler);
1425 
1426 	/* init EAL */
1427 	ret = rte_eal_init(argc, argv);
1428 	if (ret < 0)
1429 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1430 	argc -= ret;
1431 	argv += ret;
1432 
1433 	/* parse app arguments */
1434 	ret = us_vhost_parse_args(argc, argv);
1435 	if (ret < 0)
1436 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1437 
1438 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1439 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1440 
1441 		if (rte_lcore_is_enabled(lcore_id))
1442 			lcore_ids[core_id++] = lcore_id;
1443 	}
1444 
1445 	if (rte_lcore_count() > RTE_MAX_LCORE)
1446 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1447 
1448 	/* Get the number of physical ports. */
1449 	nb_ports = rte_eth_dev_count();
1450 
1451 	/*
1452 	 * Update the global var NUM_PORTS and global array PORTS
1453 	 * and get value of var VALID_NUM_PORTS according to system ports number
1454 	 */
1455 	valid_num_ports = check_ports_num(nb_ports);
1456 
1457 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1458 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1459 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
1460 		return -1;
1461 	}
1462 
1463 	/*
1464 	 * FIXME: here we are trying to allocate mbufs big enough for
1465 	 * @MAX_QUEUES, but the truth is we're never going to use that
1466 	 * many queues here. We probably should only do allocation for
1467 	 * those queues we are going to use.
1468 	 */
1469 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1470 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1471 
1472 	if (vm2vm_mode == VM2VM_HARDWARE) {
1473 		/* Enable VT loop back to let L2 switch to do it. */
1474 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1475 		RTE_LOG(DEBUG, VHOST_CONFIG,
1476 			"Enable loop back for L2 switch in vmdq.\n");
1477 	}
1478 
1479 	/* initialize all ports */
1480 	for (portid = 0; portid < nb_ports; portid++) {
1481 		/* skip ports that are not enabled */
1482 		if ((enabled_port_mask & (1 << portid)) == 0) {
1483 			RTE_LOG(INFO, VHOST_PORT,
1484 				"Skipping disabled port %d\n", portid);
1485 			continue;
1486 		}
1487 		if (port_init(portid) != 0)
1488 			rte_exit(EXIT_FAILURE,
1489 				"Cannot initialize network ports\n");
1490 	}
1491 
1492 	/* Enable stats if the user option is set. */
1493 	if (enable_stats) {
1494 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1495 		if (ret != 0)
1496 			rte_exit(EXIT_FAILURE,
1497 				"Cannot create print-stats thread\n");
1498 
1499 		/* Set thread_name for aid in debugging.  */
1500 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1501 		ret = rte_thread_setname(tid, thread_name);
1502 		if (ret != 0)
1503 			RTE_LOG(DEBUG, VHOST_CONFIG,
1504 				"Cannot set print-stats name\n");
1505 	}
1506 
1507 	/* Launch all data cores. */
1508 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1509 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1510 
1511 	if (client_mode)
1512 		flags |= RTE_VHOST_USER_CLIENT;
1513 
1514 	if (dequeue_zero_copy)
1515 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1516 
1517 	/* Register vhost user driver to handle vhost messages. */
1518 	for (i = 0; i < nb_sockets; i++) {
1519 		char *file = socket_files + i * PATH_MAX;
1520 		ret = rte_vhost_driver_register(file, flags);
1521 		if (ret != 0) {
1522 			unregister_drivers(i);
1523 			rte_exit(EXIT_FAILURE,
1524 				"vhost driver register failure.\n");
1525 		}
1526 
1527 		if (builtin_net_driver)
1528 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1529 
1530 		if (mergeable == 0) {
1531 			rte_vhost_driver_disable_features(file,
1532 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1533 		}
1534 
1535 		if (enable_tx_csum == 0) {
1536 			rte_vhost_driver_disable_features(file,
1537 				1ULL << VIRTIO_NET_F_CSUM);
1538 		}
1539 
1540 		if (enable_tso == 0) {
1541 			rte_vhost_driver_disable_features(file,
1542 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1543 			rte_vhost_driver_disable_features(file,
1544 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1545 			rte_vhost_driver_disable_features(file,
1546 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1547 			rte_vhost_driver_disable_features(file,
1548 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1549 		}
1550 
1551 		if (promiscuous) {
1552 			rte_vhost_driver_enable_features(file,
1553 				1ULL << VIRTIO_NET_F_CTRL_RX);
1554 		}
1555 
1556 		ret = rte_vhost_driver_callback_register(file,
1557 			&virtio_net_device_ops);
1558 		if (ret != 0) {
1559 			rte_exit(EXIT_FAILURE,
1560 				"failed to register vhost driver callbacks.\n");
1561 		}
1562 
1563 		if (rte_vhost_driver_start(file) < 0) {
1564 			rte_exit(EXIT_FAILURE,
1565 				"failed to start vhost driver.\n");
1566 		}
1567 	}
1568 
1569 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1570 		rte_eal_wait_lcore(lcore_id);
1571 
1572 	return 0;
1573 
1574 }
1575