xref: /dpdk/examples/vhost/main.c (revision fcee050aa1d74b3e65ea349f401728ece7cbdc50)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_vhost.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 #include <rte_pause.h>
56 
57 #include "main.h"
58 
59 #ifndef MAX_QUEUES
60 #define MAX_QUEUES 128
61 #endif
62 
63 /* the maximum number of external ports supported */
64 #define MAX_SUP_PORTS 1
65 
66 #define MBUF_CACHE_SIZE	128
67 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
68 
69 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
70 
71 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
72 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
73 
74 #define JUMBO_FRAME_MAX_SIZE    0x2600
75 
76 /* State of virtio device. */
77 #define DEVICE_MAC_LEARNING 0
78 #define DEVICE_RX			1
79 #define DEVICE_SAFE_REMOVE	2
80 
81 /* Configurable number of RX/TX ring descriptors */
82 #define RTE_TEST_RX_DESC_DEFAULT 1024
83 #define RTE_TEST_TX_DESC_DEFAULT 512
84 
85 #define INVALID_PORT_ID 0xFF
86 
87 /* Max number of devices. Limited by vmdq. */
88 #define MAX_DEVICES 64
89 
90 /* Size of buffers used for snprintfs. */
91 #define MAX_PRINT_BUFF 6072
92 
93 /* Maximum long option length for option parsing. */
94 #define MAX_LONG_OPT_SZ 64
95 
96 /* mask of enabled ports */
97 static uint32_t enabled_port_mask = 0;
98 
99 /* Promiscuous mode */
100 static uint32_t promiscuous;
101 
102 /* Number of devices/queues to support. */
103 static uint32_t num_queues = 0;
104 static uint32_t num_devices;
105 
106 static struct rte_mempool *mbuf_pool;
107 static int mergeable;
108 
109 /* Enable VM2VM communication. If this is disabled then the MAC address comparison is skipped. */
110 typedef enum {
111 	VM2VM_DISABLED = 0,
112 	VM2VM_SOFTWARE = 1,
113 	VM2VM_HARDWARE = 2,
114 	VM2VM_LAST
115 } vm2vm_type;
116 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
117 
118 /* Enable stats. */
119 static uint32_t enable_stats = 0;
120 /* Enable retries on RX. */
121 static uint32_t enable_retry = 1;
122 
123 /* Enable TX checksum offload (disabled by default). */
124 static uint32_t enable_tx_csum;
125 
126 /* Enable TSO offload (disabled by default). */
127 static uint32_t enable_tso;
128 
129 static int client_mode;
130 static int dequeue_zero_copy;
131 
132 static int builtin_net_driver;
133 
134 /* Specify the timeout (in microseconds) between retries on RX. */
135 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
136 /* Specify the number of retries on RX. */
137 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
138 
139 /* Socket file paths. Can be set by user */
140 static char *socket_files;
141 static int nb_sockets;
142 
143 /* Empty VMDQ configuration structure. Filled in programmatically. */
144 static struct rte_eth_conf vmdq_conf_default = {
145 	.rxmode = {
146 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
147 		.split_hdr_size = 0,
148 		.header_split   = 0, /**< Header Split disabled */
149 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
150 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
151 		/*
152 		 * This is necessary for 1G NICs such as the I350;
153 		 * it fixes a bug where IPv4 forwarding in the guest cannot
154 		 * forward packets from one virtio dev to another virtio dev.
155 		 */
156 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
157 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
158 		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
159 	},
160 
161 	.txmode = {
162 		.mq_mode = ETH_MQ_TX_NONE,
163 	},
164 	.rx_adv_conf = {
165 		/*
166 		 * should be overridden separately in code with
167 		 * appropriate values
168 		 */
169 		.vmdq_rx_conf = {
170 			.nb_queue_pools = ETH_8_POOLS,
171 			.enable_default_pool = 0,
172 			.default_pool = 0,
173 			.nb_pool_maps = 0,
174 			.pool_map = {{0, 0},},
175 		},
176 	},
177 };
178 
179 static unsigned lcore_ids[RTE_MAX_LCORE];
180 static uint8_t ports[RTE_MAX_ETHPORTS];
181 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
182 static uint16_t num_pf_queues, num_vmdq_queues;
183 static uint16_t vmdq_pool_base, vmdq_queue_base;
184 static uint16_t queues_per_pool;
185 
186 const uint16_t vlan_tags[] = {
187 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
188 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
189 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
190 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
191 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
192 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
193 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
194 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
195 };
196 
197 /* ethernet addresses of ports */
198 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
199 
200 static struct vhost_dev_tailq_list vhost_dev_list =
201 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
202 
203 static struct lcore_info lcore_info[RTE_MAX_LCORE];
204 
205 /* Used for queueing bursts of TX packets. */
206 struct mbuf_table {
207 	unsigned len;
208 	unsigned txq_id;
209 	struct rte_mbuf *m_table[MAX_PKT_BURST];
210 };
211 
212 /* TX queue for each data core. */
213 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
214 
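/* Number of TSC ticks in roughly BURST_TX_DRAIN_US microseconds; used to time periodic TX drains. */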
215 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
216 				 / US_PER_S * BURST_TX_DRAIN_US)
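/* Size of an 802.1Q VLAN tag, in bytes. */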
217 #define VLAN_HLEN       4
218 
219 /*
220  * Builds up the correct configuration for VMDQ VLAN pool map
221  * according to the pool & queue limits.
222  */
223 static inline int
224 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
225 {
226 	struct rte_eth_vmdq_rx_conf conf;
227 	struct rte_eth_vmdq_rx_conf *def_conf =
228 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
229 	unsigned i;
230 
231 	memset(&conf, 0, sizeof(conf));
232 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
233 	conf.nb_pool_maps = num_devices;
234 	conf.enable_loop_back = def_conf->enable_loop_back;
235 	conf.rx_mode = def_conf->rx_mode;
236 
237 	for (i = 0; i < conf.nb_pool_maps; i++) {
238 		conf.pool_map[i].vlan_id = vlan_tags[i];
239 		conf.pool_map[i].pools = (1UL << i);
240 	}
241 
242 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
243 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
244 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
245 	return 0;
246 }
247 
248 /*
249  * Validate the device number against the max pool number obtained from
250  * dev_info. If the device number is invalid, log an error message and
251  * return -1. Each device must have its own pool.
252  */
253 static inline int
254 validate_num_devices(uint32_t max_nb_devices)
255 {
256 	if (num_devices > max_nb_devices) {
257 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
258 		return -1;
259 	}
260 	return 0;
261 }
262 
263 /*
264  * Initialises a given port using global settings, with the RX buffers
265  * coming from the mbuf_pool passed as a parameter.
266  */
267 static inline int
268 port_init(uint8_t port)
269 {
270 	struct rte_eth_dev_info dev_info;
271 	struct rte_eth_conf port_conf;
272 	struct rte_eth_rxconf *rxconf;
273 	struct rte_eth_txconf *txconf;
274 	int16_t rx_rings, tx_rings;
275 	uint16_t rx_ring_size, tx_ring_size;
276 	int retval;
277 	uint16_t q;
278 
279 	/* The max pool number from dev_info will be used to validate the pool number specified on the command line. */
280 	rte_eth_dev_info_get(port, &dev_info);
281 
282 	if (dev_info.max_rx_queues > MAX_QUEUES) {
283 		rte_exit(EXIT_FAILURE,
284 			"please define MAX_QUEUES no less than %u in %s\n",
285 			dev_info.max_rx_queues, __FILE__);
286 	}
287 
288 	rxconf = &dev_info.default_rxconf;
289 	txconf = &dev_info.default_txconf;
290 	rxconf->rx_drop_en = 1;
291 
292 	/* Enable vlan offload */
293 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
294 
295 	/* Configure the number of supported virtio devices based on VMDQ limits. */
296 	num_devices = dev_info.max_vmdq_pools;
297 
298 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
299 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
300 
301 	/*
302 	 * When dequeue zero copy is enabled, guest Tx used vring will be
303 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
304 	 * (tx_ring_size here) must be small enough so that the driver will
305 	 * hit the free threshold easily and free mbufs timely. Otherwise,
306 	 * guest Tx vring would be starved.
307 	 */
308 	if (dequeue_zero_copy)
309 		tx_ring_size = 64;
310 
311 	tx_rings = (uint16_t)rte_lcore_count();
312 
313 	retval = validate_num_devices(MAX_DEVICES);
314 	if (retval < 0)
315 		return retval;
316 
317 	/* Get port configuration. */
318 	retval = get_eth_conf(&port_conf, num_devices);
319 	if (retval < 0)
320 		return retval;
321 	/* NIC queues are divided into pf queues and vmdq queues.  */
322 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
323 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
324 	num_vmdq_queues = num_devices * queues_per_pool;
325 	num_queues = num_pf_queues + num_vmdq_queues;
326 	vmdq_queue_base = dev_info.vmdq_queue_base;
327 	vmdq_pool_base  = dev_info.vmdq_pool_base;
328 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
329 		num_pf_queues, num_devices, queues_per_pool);
330 
331 	if (port >= rte_eth_dev_count()) return -1;
332 
333 	rx_rings = (uint16_t)dev_info.max_rx_queues;
334 	/* Configure ethernet device. */
335 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
336 	if (retval != 0) {
337 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
338 			port, strerror(-retval));
339 		return retval;
340 	}
341 
342 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
343 		&tx_ring_size);
344 	if (retval != 0) {
345 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
346 			"for port %u: %s.\n", port, strerror(-retval));
347 		return retval;
348 	}
349 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
350 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
351 			"for Rx queues on port %u.\n", port);
352 		return -1;
353 	}
354 
355 	/* Setup the queues. */
356 	for (q = 0; q < rx_rings; q++) {
357 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
358 						rte_eth_dev_socket_id(port),
359 						rxconf,
360 						mbuf_pool);
361 		if (retval < 0) {
362 			RTE_LOG(ERR, VHOST_PORT,
363 				"Failed to setup rx queue %u of port %u: %s.\n",
364 				q, port, strerror(-retval));
365 			return retval;
366 		}
367 	}
368 	for (q = 0; q < tx_rings; q++) {
369 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
370 						rte_eth_dev_socket_id(port),
371 						txconf);
372 		if (retval < 0) {
373 			RTE_LOG(ERR, VHOST_PORT,
374 				"Failed to setup tx queue %u of port %u: %s.\n",
375 				q, port, strerror(-retval));
376 			return retval;
377 		}
378 	}
379 
380 	/* Start the device. */
381 	retval  = rte_eth_dev_start(port);
382 	if (retval < 0) {
383 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
384 			port, strerror(-retval));
385 		return retval;
386 	}
387 
388 	if (promiscuous)
389 		rte_eth_promiscuous_enable(port);
390 
391 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
392 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
393 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
394 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
395 			(unsigned)port,
396 			vmdq_ports_eth_addr[port].addr_bytes[0],
397 			vmdq_ports_eth_addr[port].addr_bytes[1],
398 			vmdq_ports_eth_addr[port].addr_bytes[2],
399 			vmdq_ports_eth_addr[port].addr_bytes[3],
400 			vmdq_ports_eth_addr[port].addr_bytes[4],
401 			vmdq_ports_eth_addr[port].addr_bytes[5]);
402 
403 	return 0;
404 }
405 
406 /*
407  * Set socket file path.
408  */
409 static int
410 us_vhost_parse_socket_path(const char *q_arg)
411 {
412 	/* Reject paths that do not fit (including the NUL terminator) within PATH_MAX. */
413 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
414 		return -1;
415 
416 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL)
		return -1;
417 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
418 	nb_sockets++;
419 
420 	return 0;
421 }
422 
423 /*
424  * Parse the portmask provided at run time.
425  */
426 static int
427 parse_portmask(const char *portmask)
428 {
429 	char *end = NULL;
430 	unsigned long pm;
431 
432 	errno = 0;
433 
434 	/* parse hexadecimal string */
435 	pm = strtoul(portmask, &end, 16);
436 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
437 		return -1;
438 
439 	if (pm == 0)
440 		return -1;
441 
442 	return pm;
443 
444 }
445 
446 /*
447  * Parse num options at run time.
448  */
449 static int
450 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
451 {
452 	char *end = NULL;
453 	unsigned long num;
454 
455 	errno = 0;
456 
457 	/* parse unsigned int string */
458 	num = strtoul(q_arg, &end, 10);
459 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
460 		return -1;
461 
462 	if (num > max_valid_value)
463 		return -1;
464 
465 	return num;
466 
467 }
468 
469 /*
470  * Display usage
471  */
472 static void
473 us_vhost_usage(const char *prgname)
474 {
475 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
476 	"		--vm2vm [0|1|2]\n"
477 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
478 	"		--socket-file <path>\n"
479 	"		--nb-devices ND\n"
480 	"		-p PORTMASK: Set mask for ports to be used by application\n"
481 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
482 	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
483 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if retries on RX are enabled\n"
484 	"		--rx-retry-num [0-N]: the number of retries on RX. This takes effect only if retries on RX are enabled\n"
485 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
486 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
487 	"		--socket-file: The path of the socket file.\n"
488 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
489 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
490 	"		--client register a vhost-user socket in client mode.\n"
491 	"		--dequeue-zero-copy enables dequeue zero copy\n",
492 	       prgname);
493 }
494 
495 /*
496  * Parse the arguments given in the command line of the application.
497  */
498 static int
499 us_vhost_parse_args(int argc, char **argv)
500 {
501 	int opt, ret;
502 	int option_index;
503 	unsigned i;
504 	const char *prgname = argv[0];
505 	static struct option long_option[] = {
506 		{"vm2vm", required_argument, NULL, 0},
507 		{"rx-retry", required_argument, NULL, 0},
508 		{"rx-retry-delay", required_argument, NULL, 0},
509 		{"rx-retry-num", required_argument, NULL, 0},
510 		{"mergeable", required_argument, NULL, 0},
511 		{"stats", required_argument, NULL, 0},
512 		{"socket-file", required_argument, NULL, 0},
513 		{"tx-csum", required_argument, NULL, 0},
514 		{"tso", required_argument, NULL, 0},
515 		{"client", no_argument, &client_mode, 1},
516 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
517 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
518 		{NULL, 0, 0, 0},
519 	};
520 
521 	/* Parse command line */
522 	while ((opt = getopt_long(argc, argv, "p:P",
523 			long_option, &option_index)) != EOF) {
524 		switch (opt) {
525 		/* Portmask */
526 		case 'p':
527 			enabled_port_mask = parse_portmask(optarg);
528 			if (enabled_port_mask == 0) {
529 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
530 				us_vhost_usage(prgname);
531 				return -1;
532 			}
533 			break;
534 
535 		case 'P':
536 			promiscuous = 1;
537 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
538 				ETH_VMDQ_ACCEPT_BROADCAST |
539 				ETH_VMDQ_ACCEPT_MULTICAST;
540 
541 			break;
542 
543 		case 0:
544 			/* Enable/disable vm2vm comms. */
545 			if (!strncmp(long_option[option_index].name, "vm2vm",
546 				MAX_LONG_OPT_SZ)) {
547 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
548 				if (ret == -1) {
549 					RTE_LOG(INFO, VHOST_CONFIG,
550 						"Invalid argument for "
551 						"vm2vm [0|1|2]\n");
552 					us_vhost_usage(prgname);
553 					return -1;
554 				} else {
555 					vm2vm_mode = (vm2vm_type)ret;
556 				}
557 			}
558 
559 			/* Enable/disable retries on RX. */
560 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
561 				ret = parse_num_opt(optarg, 1);
562 				if (ret == -1) {
563 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
564 					us_vhost_usage(prgname);
565 					return -1;
566 				} else {
567 					enable_retry = ret;
568 				}
569 			}
570 
571 			/* Enable/disable TX checksum offload. */
572 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
573 				ret = parse_num_opt(optarg, 1);
574 				if (ret == -1) {
575 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
576 					us_vhost_usage(prgname);
577 					return -1;
578 				} else
579 					enable_tx_csum = ret;
580 			}
581 
582 			/* Enable/disable TSO offload. */
583 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
584 				ret = parse_num_opt(optarg, 1);
585 				if (ret == -1) {
586 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
587 					us_vhost_usage(prgname);
588 					return -1;
589 				} else
590 					enable_tso = ret;
591 			}
592 
593 			/* Specify the delay time (in microseconds) between retries on RX. */
594 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
595 				ret = parse_num_opt(optarg, INT32_MAX);
596 				if (ret == -1) {
597 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
598 					us_vhost_usage(prgname);
599 					return -1;
600 				} else {
601 					burst_rx_delay_time = ret;
602 				}
603 			}
604 
605 			/* Specify the number of retries on RX. */
606 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
607 				ret = parse_num_opt(optarg, INT32_MAX);
608 				if (ret == -1) {
609 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
610 					us_vhost_usage(prgname);
611 					return -1;
612 				} else {
613 					burst_rx_retry_num = ret;
614 				}
615 			}
616 
617 			/* Enable/disable RX mergeable buffers. */
618 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
619 				ret = parse_num_opt(optarg, 1);
620 				if (ret == -1) {
621 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
622 					us_vhost_usage(prgname);
623 					return -1;
624 				} else {
625 					mergeable = !!ret;
626 					if (ret) {
627 						vmdq_conf_default.rxmode.jumbo_frame = 1;
628 						vmdq_conf_default.rxmode.max_rx_pkt_len
629 							= JUMBO_FRAME_MAX_SIZE;
630 					}
631 				}
632 			}
633 
634 			/* Enable/disable stats. */
635 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
636 				ret = parse_num_opt(optarg, INT32_MAX);
637 				if (ret == -1) {
638 					RTE_LOG(INFO, VHOST_CONFIG,
639 						"Invalid argument for stats [0..N]\n");
640 					us_vhost_usage(prgname);
641 					return -1;
642 				} else {
643 					enable_stats = ret;
644 				}
645 			}
646 
647 			/* Set socket file path. */
648 			if (!strncmp(long_option[option_index].name,
649 						"socket-file", MAX_LONG_OPT_SZ)) {
650 				if (us_vhost_parse_socket_path(optarg) == -1) {
651 					RTE_LOG(INFO, VHOST_CONFIG,
652 					"Invalid argument for socket name (Max %d characters)\n",
653 					PATH_MAX);
654 					us_vhost_usage(prgname);
655 					return -1;
656 				}
657 			}
658 
659 			break;
660 
661 			/* Invalid option - print options. */
662 		default:
663 			us_vhost_usage(prgname);
664 			return -1;
665 		}
666 	}
667 
668 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
669 		if (enabled_port_mask & (1 << i))
670 			ports[num_ports++] = (uint8_t)i;
671 	}
672 
673 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
674 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
675 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
676 		return -1;
677 	}
678 
679 	return 0;
680 }
681 
682 /*
683  * Update the global variable num_ports and the array ports[] according to the
684  * number of ports in the system, and return the number of valid ports.
685  */
686 static unsigned check_ports_num(unsigned nb_ports)
687 {
688 	unsigned valid_num_ports = num_ports;
689 	unsigned portid;
690 
691 	if (num_ports > nb_ports) {
692 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
693 			num_ports, nb_ports);
694 		num_ports = nb_ports;
695 	}
696 
697 	for (portid = 0; portid < num_ports; portid ++) {
698 		if (ports[portid] >= nb_ports) {
699 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
700 				ports[portid], (nb_ports - 1));
701 			ports[portid] = INVALID_PORT_ID;
702 			valid_num_ports--;
703 		}
704 	}
705 	return valid_num_ports;
706 }
707 
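/* Look up the vhost device, if any, that has learnt the given MAC address and is ready for RX. */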
708 static __rte_always_inline struct vhost_dev *
709 find_vhost_dev(struct ether_addr *mac)
710 {
711 	struct vhost_dev *vdev;
712 
713 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
714 		if (vdev->ready == DEVICE_RX &&
715 		    is_same_ether_addr(mac, &vdev->mac_address))
716 			return vdev;
717 	}
718 
719 	return NULL;
720 }
721 
722 /*
723  * This function learns the MAC address of the device and registers it, along with a
724  * VLAN tag, with a VMDQ pool.
725  */
726 static int
727 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
728 {
729 	struct ether_hdr *pkt_hdr;
730 	int i, ret;
731 
732 	/* Learn MAC address of guest device from packet */
733 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
734 
735 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
736 		RTE_LOG(ERR, VHOST_DATA,
737 			"(%d) device is using a registered MAC!\n",
738 			vdev->vid);
739 		return -1;
740 	}
741 
742 	for (i = 0; i < ETHER_ADDR_LEN; i++)
743 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
744 
745 	/* vlan_tag currently uses the device_id. */
746 	vdev->vlan_tag = vlan_tags[vdev->vid];
747 
748 	/* Print out VMDQ registration info. */
749 	RTE_LOG(INFO, VHOST_DATA,
750 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
751 		vdev->vid,
752 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
753 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
754 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
755 		vdev->vlan_tag);
756 
757 	/* Register the MAC address. */
758 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
759 				(uint32_t)vdev->vid + vmdq_pool_base);
760 	if (ret)
761 		RTE_LOG(ERR, VHOST_DATA,
762 			"(%d) failed to add device MAC address to VMDQ\n",
763 			vdev->vid);
764 
765 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
766 
767 	/* Set device as ready for RX. */
768 	vdev->ready = DEVICE_RX;
769 
770 	return 0;
771 }
772 
773 /*
774  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
775  * queue before disabling RX on the device.
776  */
777 static inline void
778 unlink_vmdq(struct vhost_dev *vdev)
779 {
780 	unsigned i = 0;
781 	unsigned rx_count;
782 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
783 
784 	if (vdev->ready == DEVICE_RX) {
785 		/* Clear MAC and VLAN settings. */
786 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
787 		for (i = 0; i < 6; i++)
788 			vdev->mac_address.addr_bytes[i] = 0;
789 
790 		vdev->vlan_tag = 0;
791 
792 		/* Clear out the receive buffers. */
793 		rx_count = rte_eth_rx_burst(ports[0],
794 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
795 
796 		while (rx_count) {
797 			for (i = 0; i < rx_count; i++)
798 				rte_pktmbuf_free(pkts_burst[i]);
799 
800 			rx_count = rte_eth_rx_burst(ports[0],
801 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
802 		}
803 
804 		vdev->ready = DEVICE_MAC_LEARNING;
805 	}
806 }
807 
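/* Enqueue a single packet to the destination vhost device's RX ring, updating the per-device stats if enabled. */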
808 static __rte_always_inline void
809 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
810 	    struct rte_mbuf *m)
811 {
812 	uint16_t ret;
813 
814 	if (builtin_net_driver) {
815 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
816 	} else {
817 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
818 	}
819 
820 	if (enable_stats) {
821 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
822 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
823 		src_vdev->stats.tx_total++;
824 		src_vdev->stats.tx += ret;
825 	}
826 }
827 
828 /*
829  * Check if the packet destination MAC address is for a local device. If so, put
830  * the packet on that device's RX queue. If not, return.
831  */
832 static __rte_always_inline int
833 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
834 {
835 	struct ether_hdr *pkt_hdr;
836 	struct vhost_dev *dst_vdev;
837 
838 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
839 
840 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
841 	if (!dst_vdev)
842 		return -1;
843 
844 	if (vdev->vid == dst_vdev->vid) {
845 		RTE_LOG_DP(DEBUG, VHOST_DATA,
846 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
847 			vdev->vid);
848 		return 0;
849 	}
850 
851 	RTE_LOG_DP(DEBUG, VHOST_DATA,
852 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
853 
854 	if (unlikely(dst_vdev->remove)) {
855 		RTE_LOG_DP(DEBUG, VHOST_DATA,
856 			"(%d) device is marked for removal\n", dst_vdev->vid);
857 		return 0;
858 	}
859 
860 	virtio_xmit(dst_vdev, vdev, m);
861 	return 0;
862 }
863 
864 /*
865  * Check if the destination MAC of a packet belongs to a local VM,
866  * and if so get its VLAN tag and length offset.
867  */
868 static __rte_always_inline int
869 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
870 	uint32_t *offset, uint16_t *vlan_tag)
871 {
872 	struct vhost_dev *dst_vdev;
873 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
874 
875 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
876 	if (!dst_vdev)
877 		return 0;
878 
879 	if (vdev->vid == dst_vdev->vid) {
880 		RTE_LOG_DP(DEBUG, VHOST_DATA,
881 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
882 			vdev->vid);
883 		return -1;
884 	}
885 
886 	/*
887 	 * HW VLAN strip reduces the packet length by the
888 	 * length of the VLAN tag, so restore the packet
889 	 * length by adding it back.
890 	 */
891 	*offset  = VLAN_HLEN;
892 	*vlan_tag = vlan_tags[vdev->vid];
893 
894 	RTE_LOG_DP(DEBUG, VHOST_DATA,
895 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
896 		vdev->vid, dst_vdev->vid, *vlan_tag);
897 
898 	return 0;
899 }
900 
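/* Return the pseudo-header checksum for the L3 header, as required when the L4 checksum is computed by hardware. */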
901 static uint16_t
902 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
903 {
904 	if (ol_flags & PKT_TX_IPV4)
905 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
906 	else /* assume ethertype == ETHER_TYPE_IPv6 */
907 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
908 }
909 
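/* Fix up a packet marked for TSO: request IP checksum offload for IPv4 and seed the TCP checksum with the pseudo-header checksum. */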
910 static void virtio_tx_offload(struct rte_mbuf *m)
911 {
912 	void *l3_hdr;
913 	struct ipv4_hdr *ipv4_hdr = NULL;
914 	struct tcp_hdr *tcp_hdr = NULL;
915 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
916 
917 	l3_hdr = (char *)eth_hdr + m->l2_len;
918 
919 	if (m->ol_flags & PKT_TX_IPV4) {
920 		ipv4_hdr = l3_hdr;
921 		ipv4_hdr->hdr_checksum = 0;
922 		m->ol_flags |= PKT_TX_IP_CKSUM;
923 	}
924 
925 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
926 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
927 }
928 
929 static inline void
930 free_pkts(struct rte_mbuf **pkts, uint16_t n)
931 {
932 	while (n--)
933 		rte_pktmbuf_free(pkts[n]);
934 }
935 
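/* Flush the per-core TX buffer to the physical port and free any packets the NIC did not accept. */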
936 static __rte_always_inline void
937 do_drain_mbuf_table(struct mbuf_table *tx_q)
938 {
939 	uint16_t count;
940 
941 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
942 				 tx_q->m_table, tx_q->len);
943 	if (unlikely(count < tx_q->len))
944 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
945 
946 	tx_q->len = 0;
947 }
948 
949 /*
950  * This function routes the TX packet to the correct interface. This
951  * may be a local device or the physical port.
952  */
953 static __rte_always_inline void
954 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
955 {
956 	struct mbuf_table *tx_q;
957 	unsigned offset = 0;
958 	const uint16_t lcore_id = rte_lcore_id();
959 	struct ether_hdr *nh;
960 
961 
962 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
963 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
964 		struct vhost_dev *vdev2;
965 
966 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
967 			virtio_xmit(vdev2, vdev, m);
968 		}
969 		goto queue2nic;
970 	}
971 
972 	/* Check if the destination is a local VM. */
973 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
974 		rte_pktmbuf_free(m);
975 		return;
976 	}
977 
978 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
979 		if (unlikely(find_local_dest(vdev, m, &offset,
980 					     &vlan_tag) != 0)) {
981 			rte_pktmbuf_free(m);
982 			return;
983 		}
984 	}
985 
986 	RTE_LOG_DP(DEBUG, VHOST_DATA,
987 		"(%d) TX: MAC address is external\n", vdev->vid);
988 
989 queue2nic:
990 
991 	/* Add the packet to the port TX queue. */
992 	tx_q = &lcore_tx_queue[lcore_id];
993 
994 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
995 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
996 		/* Guest has inserted the vlan tag. */
997 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
998 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
999 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1000 			(vh->vlan_tci != vlan_tag_be))
1001 			vh->vlan_tci = vlan_tag_be;
1002 	} else {
1003 		m->ol_flags |= PKT_TX_VLAN_PKT;
1004 
1005 		/*
1006 		 * Find the right seg to adjust the data len when offset is
1007 		 * bigger than tail room size.
1008 		 */
1009 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1010 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1011 				m->data_len += offset;
1012 			else {
1013 				struct rte_mbuf *seg = m;
1014 
1015 				while ((seg->next != NULL) &&
1016 					(offset > rte_pktmbuf_tailroom(seg)))
1017 					seg = seg->next;
1018 
1019 				seg->data_len += offset;
1020 			}
1021 			m->pkt_len += offset;
1022 		}
1023 
1024 		m->vlan_tci = vlan_tag;
1025 	}
1026 
1027 	if (m->ol_flags & PKT_TX_TCP_SEG)
1028 		virtio_tx_offload(m);
1029 
1030 	tx_q->m_table[tx_q->len++] = m;
1031 	if (enable_stats) {
1032 		vdev->stats.tx_total++;
1033 		vdev->stats.tx++;
1034 	}
1035 
1036 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1037 		do_drain_mbuf_table(tx_q);
1038 }
1039 
1040 
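/* Flush the per-core TX buffer if it has not been drained for MBUF_TABLE_DRAIN_TSC TSC ticks. */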
1041 static __rte_always_inline void
1042 drain_mbuf_table(struct mbuf_table *tx_q)
1043 {
1044 	static uint64_t prev_tsc;
1045 	uint64_t cur_tsc;
1046 
1047 	if (tx_q->len == 0)
1048 		return;
1049 
1050 	cur_tsc = rte_rdtsc();
1051 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1052 		prev_tsc = cur_tsc;
1053 
1054 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1055 			"TX queue drained after timeout with burst size %u\n",
1056 			tx_q->len);
1057 		do_drain_mbuf_table(tx_q);
1058 	}
1059 }
1060 
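/* Drain the VMDQ RX queue bound to this vhost device and enqueue the packets into the guest's virtio RX ring; the mbufs are freed afterwards since the enqueue path copies the packets into guest buffers. */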
1061 static __rte_always_inline void
1062 drain_eth_rx(struct vhost_dev *vdev)
1063 {
1064 	uint16_t rx_count, enqueue_count;
1065 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1066 
1067 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1068 				    pkts, MAX_PKT_BURST);
1069 	if (!rx_count)
1070 		return;
1071 
1072 	/*
1073 	 * When "enable_retry" is set, we wait and retry when there are
1074 	 * not enough free slots in the queue to hold @rx_count packets,
1075 	 * to reduce packet loss.
1076 	 */
1077 	if (enable_retry &&
1078 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1079 			VIRTIO_RXQ))) {
1080 		uint32_t retry;
1081 
1082 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1083 			rte_delay_us(burst_rx_delay_time);
1084 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1085 					VIRTIO_RXQ))
1086 				break;
1087 		}
1088 	}
1089 
1090 	if (builtin_net_driver) {
1091 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1092 						pkts, rx_count);
1093 	} else {
1094 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1095 						pkts, rx_count);
1096 	}
1097 	if (enable_stats) {
1098 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1099 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1100 	}
1101 
1102 	free_pkts(pkts, rx_count);
1103 }
1104 
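/* Dequeue a burst from the guest's virtio TX ring, learn the MAC from the first packet if needed, and route each packet. */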
1105 static __rte_always_inline void
1106 drain_virtio_tx(struct vhost_dev *vdev)
1107 {
1108 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1109 	uint16_t count;
1110 	uint16_t i;
1111 
1112 	if (builtin_net_driver) {
1113 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1114 					pkts, MAX_PKT_BURST);
1115 	} else {
1116 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1117 					mbuf_pool, pkts, MAX_PKT_BURST);
1118 	}
1119 
1120 	/* setup VMDq for the first packet */
1121 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1122 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1123 			free_pkts(pkts, count);
1124 	}
1125 
1126 	for (i = 0; i < count; ++i)
1127 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1128 }
1129 
1130 /*
1131  * Main function of vhost-switch. It basically does:
1132  *
1133  * for each vhost device {
1134  *    - drain_eth_rx()
1135  *
1136  *      Which drains the host eth Rx queue linked to the vhost device,
1137  *      and deliver all of them to guest virito Rx ring associated with
1138  *      this vhost device.
1139  *
1140  *    - drain_virtio_tx()
1141  *
1142  *      Which drains the guest virtio Tx queue and delivers all the packets
1143  *      to the target, which could be another vhost device, or the
1144  *      physical eth dev. The route is done in function "virtio_tx_route".
1145  * }
1146  */
1147 static int
1148 switch_worker(void *arg __rte_unused)
1149 {
1150 	unsigned i;
1151 	unsigned lcore_id = rte_lcore_id();
1152 	struct vhost_dev *vdev;
1153 	struct mbuf_table *tx_q;
1154 
1155 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1156 
1157 	tx_q = &lcore_tx_queue[lcore_id];
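	/* Each enabled lcore owns one TX queue on the physical port; use this lcore's index in lcore_ids[] as the queue id. */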
1158 	for (i = 0; i < rte_lcore_count(); i++) {
1159 		if (lcore_ids[i] == lcore_id) {
1160 			tx_q->txq_id = i;
1161 			break;
1162 		}
1163 	}
1164 
1165 	while(1) {
1166 		drain_mbuf_table(tx_q);
1167 
1168 		/*
1169 		 * If requested, inform the configuration core that we have
1170 		 * exited the linked list and that no devices are in use.
1171 		 */
1172 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1173 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1174 
1175 		/*
1176 		 * Process vhost devices
1177 		 */
1178 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1179 			      lcore_vdev_entry) {
1180 			if (unlikely(vdev->remove)) {
1181 				unlink_vmdq(vdev);
1182 				vdev->ready = DEVICE_SAFE_REMOVE;
1183 				continue;
1184 			}
1185 
1186 			if (likely(vdev->ready == DEVICE_RX))
1187 				drain_eth_rx(vdev);
1188 
1189 			if (likely(!vdev->remove))
1190 				drain_virtio_tx(vdev);
1191 		}
1192 	}
1193 
1194 	return 0;
1195 }
1196 
1197 /*
1198  * Remove a device from the specific data core linked list and from the
1199  * main linked list. Synchronization occurs through the use of the
1200  * lcore dev_removal_flag. The device is made volatile here to avoid reordering
1201  * of dev->remove = 1, which could cause an infinite loop in the rte_pause loop.
1202  */
1203 static void
1204 destroy_device(int vid)
1205 {
1206 	struct vhost_dev *vdev = NULL;
1207 	int lcore;
1208 
1209 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1210 		if (vdev->vid == vid)
1211 			break;
1212 	}
1213 	if (!vdev)
1214 		return;
1215 	/* Set the remove flag. */
1216 	vdev->remove = 1;
1217 	while (vdev->ready != DEVICE_SAFE_REMOVE) {
1218 		rte_pause();
1219 	}
1220 
1221 	if (builtin_net_driver)
1222 		vs_vhost_net_remove(vdev);
1223 
1224 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1225 		     lcore_vdev_entry);
1226 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1227 
1228 
1229 	/* Set the dev_removal_flag on each lcore. */
1230 	RTE_LCORE_FOREACH_SLAVE(lcore)
1231 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1232 
1233 	/*
1234 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1235 	 * we can be sure that they can no longer access the device removed
1236 	 * from the linked lists and that the devices are no longer in use.
1237 	 */
1238 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1239 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1240 			rte_pause();
1241 	}
1242 
1243 	lcore_info[vdev->coreid].device_num--;
1244 
1245 	RTE_LOG(INFO, VHOST_DATA,
1246 		"(%d) device has been removed from data core\n",
1247 		vdev->vid);
1248 
1249 	rte_free(vdev);
1250 }
1251 
1252 /*
1253  * A new device is added to a data core. First the device is added to the main linked list
1254  * and then allocated to a specific data core.
1255  */
1256 static int
1257 new_device(int vid)
1258 {
1259 	int lcore, core_add = 0;
1260 	uint32_t device_num_min = num_devices;
1261 	struct vhost_dev *vdev;
1262 
1263 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1264 	if (vdev == NULL) {
1265 		RTE_LOG(INFO, VHOST_DATA,
1266 			"(%d) couldn't allocate memory for vhost dev\n",
1267 			vid);
1268 		return -1;
1269 	}
1270 	vdev->vid = vid;
1271 
1272 	if (builtin_net_driver)
1273 		vs_vhost_net_setup(vdev);
1274 
1275 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
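	/* Each device gets its own VMDQ pool; use the first RX queue of that pool. */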
1276 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1277 
1278 	/*reset ready flag*/
1279 	vdev->ready = DEVICE_MAC_LEARNING;
1280 	vdev->remove = 0;
1281 
1282 	/* Find a suitable lcore to add the device. */
1283 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1284 		if (lcore_info[lcore].device_num < device_num_min) {
1285 			device_num_min = lcore_info[lcore].device_num;
1286 			core_add = lcore;
1287 		}
1288 	}
1289 	vdev->coreid = core_add;
1290 
1291 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1292 			  lcore_vdev_entry);
1293 	lcore_info[vdev->coreid].device_num++;
1294 
1295 	/* Disable notifications. */
1296 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1297 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1298 
1299 	RTE_LOG(INFO, VHOST_DATA,
1300 		"(%d) device has been added to data core %d\n",
1301 		vid, vdev->coreid);
1302 
1303 	return 0;
1304 }
1305 
1306 /*
1307  * These callbacks allow devices to be added to a data core when configuration
1308  * has been fully completed.
1309  */
1310 static const struct vhost_device_ops virtio_net_device_ops =
1311 {
1312 	.new_device =  new_device,
1313 	.destroy_device = destroy_device,
1314 };
1315 
1316 /*
1317  * This is a thread that wakes up periodically to print stats if the user has
1318  * enabled them.
1319  */
1320 static void
1321 print_stats(void)
1322 {
1323 	struct vhost_dev *vdev;
1324 	uint64_t tx_dropped, rx_dropped;
1325 	uint64_t tx, tx_total, rx, rx_total;
1326 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1327 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1328 
1329 	while(1) {
1330 		sleep(enable_stats);
1331 
1332 		/* Clear screen and move to top left */
1333 		printf("%s%s\n", clr, top_left);
1334 		printf("Device statistics =================================\n");
1335 
1336 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1337 			tx_total   = vdev->stats.tx_total;
1338 			tx         = vdev->stats.tx;
1339 			tx_dropped = tx_total - tx;
1340 
1341 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1342 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1343 			rx_dropped = rx_total - rx;
1344 
1345 			printf("Statistics for device %d\n"
1346 				"-----------------------\n"
1347 				"TX total:              %" PRIu64 "\n"
1348 				"TX dropped:            %" PRIu64 "\n"
1349 				"TX successful:         %" PRIu64 "\n"
1350 				"RX total:              %" PRIu64 "\n"
1351 				"RX dropped:            %" PRIu64 "\n"
1352 				"RX successful:         %" PRIu64 "\n",
1353 				vdev->vid,
1354 				tx_total, tx_dropped, tx,
1355 				rx_total, rx_dropped, rx);
1356 		}
1357 
1358 		printf("===================================================\n");
1359 	}
1360 }
1361 
1362 static void
1363 unregister_drivers(int socket_num)
1364 {
1365 	int i, ret;
1366 
1367 	for (i = 0; i < socket_num; i++) {
1368 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1369 		if (ret != 0)
1370 			RTE_LOG(ERR, VHOST_CONFIG,
1371 				"Fail to unregister vhost driver for %s.\n",
1372 				socket_files + i * PATH_MAX);
1373 	}
1374 }
1375 
1376 /* When we receive a SIGINT, unregister the vhost driver. */
1377 static void
1378 sigint_handler(__rte_unused int signum)
1379 {
1380 	/* Unregister vhost driver. */
1381 	unregister_drivers(nb_sockets);
1382 
1383 	exit(0);
1384 }
1385 
1386 /*
1387  * While creating an mbuf pool, one key thing is to figure out how
1388  * many mbuf entries are enough for our use. FYI, here are some
1389  * guidelines:
1390  *
1391  * - Each rx queue reserves @nr_rx_desc mbufs at queue setup stage
1392  *
1393  * - For each switch core (a CPU core that does the packet switching), we
1394  *   also need to reserve some mbufs for receiving the packets from the
1395  *   virtio Tx queue. How many are enough depends on the usage. It's
1396  *   normally a simple calculation like the following:
1397  *
1398  *       MAX_PKT_BURST * max packet size / mbuf size
1399  *
1400  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1401  *
1402  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1403  *   mbufs for receiving the packets from the physical NIC device.
1404  *
1405  * - We also need to make sure that, for each switch core, we have
1406  *   allocated enough mbufs to fill up the mbuf cache.
1407  */
1408 static void
1409 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1410 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1411 {
1412 	uint32_t nr_mbufs;
1413 	uint32_t nr_mbufs_per_core;
1414 	uint32_t mtu = 1500;
1415 
1416 	if (mergeable)
1417 		mtu = 9000;
1418 	if (enable_tso)
1419 		mtu = 64 * 1024;
1420 
1421 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1422 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1423 	nr_mbufs_per_core += nr_rx_desc;
1424 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1425 
1426 	nr_mbufs  = nr_queues * nr_rx_desc;
1427 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1428 	nr_mbufs *= nr_port;
1429 
1430 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1431 					    nr_mbuf_cache, 0, mbuf_size,
1432 					    rte_socket_id());
1433 	if (mbuf_pool == NULL)
1434 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1435 }
1436 
1437 /*
1438  * Main function, does initialisation and calls the per-lcore functions.
1439  */
1440 int
1441 main(int argc, char *argv[])
1442 {
1443 	unsigned lcore_id, core_id = 0;
1444 	unsigned nb_ports, valid_num_ports;
1445 	int ret, i;
1446 	uint8_t portid;
1447 	static pthread_t tid;
1448 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1449 	uint64_t flags = 0;
1450 
1451 	signal(SIGINT, sigint_handler);
1452 
1453 	/* init EAL */
1454 	ret = rte_eal_init(argc, argv);
1455 	if (ret < 0)
1456 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1457 	argc -= ret;
1458 	argv += ret;
1459 
1460 	/* parse app arguments */
1461 	ret = us_vhost_parse_args(argc, argv);
1462 	if (ret < 0)
1463 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1464 
1465 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1466 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1467 
1468 		if (rte_lcore_is_enabled(lcore_id))
1469 			lcore_ids[core_id++] = lcore_id;
1470 	}
1471 
1472 	if (rte_lcore_count() > RTE_MAX_LCORE)
1473 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1474 
1475 	/* Get the number of physical ports. */
1476 	nb_ports = rte_eth_dev_count();
1477 
1478 	/*
1479 	 * Update the global variable num_ports and the global array ports[],
1480 	 * and get the number of valid ports according to the number of system ports.
1481 	 */
1482 	valid_num_ports = check_ports_num(nb_ports);
1483 
1484 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1485 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1486 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1487 		return -1;
1488 	}
1489 
1490 	/*
1491 	 * FIXME: here we are trying to allocate mbufs big enough for
1492 	 * @MAX_QUEUES, but the truth is we're never going to use that
1493 	 * many queues here. We probably should only do allocation for
1494 	 * those queues we are going to use.
1495 	 */
1496 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1497 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1498 
1499 	if (vm2vm_mode == VM2VM_HARDWARE) {
1500 		/* Enable VT loopback so that the L2 switching is done by the NIC. */
1501 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1502 		RTE_LOG(DEBUG, VHOST_CONFIG,
1503 			"Enable loop back for L2 switch in vmdq.\n");
1504 	}
1505 
1506 	/* initialize all ports */
1507 	for (portid = 0; portid < nb_ports; portid++) {
1508 		/* skip ports that are not enabled */
1509 		if ((enabled_port_mask & (1 << portid)) == 0) {
1510 			RTE_LOG(INFO, VHOST_PORT,
1511 				"Skipping disabled port %d\n", portid);
1512 			continue;
1513 		}
1514 		if (port_init(portid) != 0)
1515 			rte_exit(EXIT_FAILURE,
1516 				"Cannot initialize network ports\n");
1517 	}
1518 
1519 	/* Enable stats if the user option is set. */
1520 	if (enable_stats) {
1521 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1522 		if (ret != 0)
1523 			rte_exit(EXIT_FAILURE,
1524 				"Cannot create print-stats thread\n");
1525 
1526 		/* Set thread_name to aid in debugging. */
1527 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1528 		ret = rte_thread_setname(tid, thread_name);
1529 		if (ret != 0)
1530 			RTE_LOG(DEBUG, VHOST_CONFIG,
1531 				"Cannot set print-stats name\n");
1532 	}
1533 
1534 	/* Launch all data cores. */
1535 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1536 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1537 
1538 	if (client_mode)
1539 		flags |= RTE_VHOST_USER_CLIENT;
1540 
1541 	if (dequeue_zero_copy)
1542 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1543 
1544 	/* Register vhost user driver to handle vhost messages. */
1545 	for (i = 0; i < nb_sockets; i++) {
1546 		char *file = socket_files + i * PATH_MAX;
1547 		ret = rte_vhost_driver_register(file, flags);
1548 		if (ret != 0) {
1549 			unregister_drivers(i);
1550 			rte_exit(EXIT_FAILURE,
1551 				"vhost driver register failure.\n");
1552 		}
1553 
1554 		if (builtin_net_driver)
1555 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1556 
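		/* Trim the virtio features offered on this socket according to the command-line options. */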
1557 		if (mergeable == 0) {
1558 			rte_vhost_driver_disable_features(file,
1559 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1560 		}
1561 
1562 		if (enable_tx_csum == 0) {
1563 			rte_vhost_driver_disable_features(file,
1564 				1ULL << VIRTIO_NET_F_CSUM);
1565 		}
1566 
1567 		if (enable_tso == 0) {
1568 			rte_vhost_driver_disable_features(file,
1569 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1570 			rte_vhost_driver_disable_features(file,
1571 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1572 			rte_vhost_driver_disable_features(file,
1573 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1574 			rte_vhost_driver_disable_features(file,
1575 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1576 		}
1577 
1578 		if (promiscuous) {
1579 			rte_vhost_driver_enable_features(file,
1580 				1ULL << VIRTIO_NET_F_CTRL_RX);
1581 		}
1582 
1583 		ret = rte_vhost_driver_callback_register(file,
1584 			&virtio_net_device_ops);
1585 		if (ret != 0) {
1586 			rte_exit(EXIT_FAILURE,
1587 				"failed to register vhost driver callbacks.\n");
1588 		}
1589 
1590 		if (rte_vhost_driver_start(file) < 0) {
1591 			rte_exit(EXIT_FAILURE,
1592 				"failed to start vhost driver.\n");
1593 		}
1594 	}
1595 
1596 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1597 		rte_eal_wait_lcore(lcore_id);
1598 
1599 	return 0;
1600 
1601 }
1602