xref: /dpdk/examples/vhost/main.c (revision a798beb47c8e40f03261ee869a00578d2c44c824)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_vhost.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 
56 #include "main.h"
57 
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61 
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64 
65 #define MBUF_CACHE_SIZE	128
66 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
67 
68 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
69 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
70 
71 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
72 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
73 
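/* Max RX packet length (0x2600 = 9728 bytes) used when mergeable buffers
 * turn on jumbo frame support. */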
74 #define JUMBO_FRAME_MAX_SIZE    0x2600
75 
76 /* State of virtio device. */
77 #define DEVICE_MAC_LEARNING 0
78 #define DEVICE_RX			1
79 #define DEVICE_SAFE_REMOVE	2
80 
81 /* Configurable number of RX/TX ring descriptors */
82 #define RTE_TEST_RX_DESC_DEFAULT 1024
83 #define RTE_TEST_TX_DESC_DEFAULT 512
84 
85 #define INVALID_PORT_ID 0xFF
86 
87 /* Max number of devices. Limited by vmdq. */
88 #define MAX_DEVICES 64
89 
90 /* Size of buffers used for snprintfs. */
91 #define MAX_PRINT_BUFF 6072
92 
93 /* Maximum long option length for option parsing. */
94 #define MAX_LONG_OPT_SZ 64
95 
96 /* mask of enabled ports */
97 static uint32_t enabled_port_mask = 0;
98 
99 /* Promiscuous mode */
100 static uint32_t promiscuous;
101 
102 /* number of devices/queues to support */
103 static uint32_t num_queues = 0;
104 static uint32_t num_devices;
105 
106 static struct rte_mempool *mbuf_pool;
107 static int mergeable;
108 
109 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
110 typedef enum {
111 	VM2VM_DISABLED = 0,
112 	VM2VM_SOFTWARE = 1,
113 	VM2VM_HARDWARE = 2,
114 	VM2VM_LAST
115 } vm2vm_type;
116 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
117 
118 /* Enable stats. */
119 static uint32_t enable_stats = 0;
120 /* Enable retries on RX. */
121 static uint32_t enable_retry = 1;
122 
123 /* Disable TX checksum offload */
124 static uint32_t enable_tx_csum;
125 
126 /* Disable TSO offload */
127 static uint32_t enable_tso;
128 
129 static int client_mode;
130 static int dequeue_zero_copy;
131 
132 /* Specify timeout (in useconds) between retries on RX. */
133 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
134 /* Specify the number of retries on RX. */
135 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
136 
137 /* Socket file paths. Can be set by user */
138 static char *socket_files;
139 static int nb_sockets;
140 
141 /* Empty VMDQ configuration structure. Filled in programmatically. */
142 static struct rte_eth_conf vmdq_conf_default = {
143 	.rxmode = {
144 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
145 		.split_hdr_size = 0,
146 		.header_split   = 0, /**< Header Split disabled */
147 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
148 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
149 		/*
 150 		 * This is necessary for 1G NICs such as the I350; it fixes a
 151 		 * bug where IPv4 forwarding in the guest cannot forward
 152 		 * packets from one virtio device to another virtio device.
153 		 */
154 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
155 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
156 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
157 	},
158 
159 	.txmode = {
160 		.mq_mode = ETH_MQ_TX_NONE,
161 	},
162 	.rx_adv_conf = {
163 		/*
164 		 * should be overridden separately in code with
165 		 * appropriate values
166 		 */
167 		.vmdq_rx_conf = {
168 			.nb_queue_pools = ETH_8_POOLS,
169 			.enable_default_pool = 0,
170 			.default_pool = 0,
171 			.nb_pool_maps = 0,
172 			.pool_map = {{0, 0},},
173 		},
174 	},
175 };
176 
177 static unsigned lcore_ids[RTE_MAX_LCORE];
178 static uint8_t ports[RTE_MAX_ETHPORTS];
179 static unsigned num_ports = 0; /**< The number of ports specified in command line */
180 static uint16_t num_pf_queues, num_vmdq_queues;
181 static uint16_t vmdq_pool_base, vmdq_queue_base;
182 static uint16_t queues_per_pool;
183 
184 const uint16_t vlan_tags[] = {
185 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
186 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
187 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
188 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
189 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
190 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
191 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
192 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
193 };
194 
195 /* ethernet addresses of ports */
196 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
197 
198 static struct vhost_dev_tailq_list vhost_dev_list =
199 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
200 
201 static struct lcore_info lcore_info[RTE_MAX_LCORE];
202 
203 /* Used for queueing bursts of TX packets. */
204 struct mbuf_table {
205 	unsigned len;
206 	unsigned txq_id;
207 	struct rte_mbuf *m_table[MAX_PKT_BURST];
208 };
209 
210 /* TX queue for each data core. */
211 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
212 
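/*
 * Number of TSC cycles in BURST_TX_DRAIN_US microseconds; a partially filled
 * TX mbuf table is flushed once this interval has elapsed.
 */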
213 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
214 				 / US_PER_S * BURST_TX_DRAIN_US)
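/* Size of an 802.1Q VLAN header (TPID + TCI) in bytes; added back to the
 * packet length after HW VLAN stripping (see find_local_dest()). */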
215 #define VLAN_HLEN       4
216 
217 /*
218  * Builds up the correct configuration for VMDQ VLAN pool map
219  * according to the pool & queue limits.
220  */
221 static inline int
222 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
223 {
224 	struct rte_eth_vmdq_rx_conf conf;
225 	struct rte_eth_vmdq_rx_conf *def_conf =
226 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
227 	unsigned i;
228 
229 	memset(&conf, 0, sizeof(conf));
230 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
231 	conf.nb_pool_maps = num_devices;
232 	conf.enable_loop_back = def_conf->enable_loop_back;
233 	conf.rx_mode = def_conf->rx_mode;
234 
235 	for (i = 0; i < conf.nb_pool_maps; i++) {
236 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
237 		conf.pool_map[i].pools = (1UL << i);
238 	}
239 
240 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
241 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
242 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
243 	return 0;
244 }
245 
246 /*
 247  * Validate the device number against the max pool number obtained from
 248  * dev_info. If the device number is invalid, print an error message and
249  * return -1. Each device must have its own pool.
250  */
251 static inline int
252 validate_num_devices(uint32_t max_nb_devices)
253 {
254 	if (num_devices > max_nb_devices) {
255 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
256 		return -1;
257 	}
258 	return 0;
259 }
260 
261 /*
262  * Initialises a given port using global settings and with the rx buffers
 263  * coming from the mbuf_pool passed as a parameter.
264  */
265 static inline int
266 port_init(uint8_t port)
267 {
268 	struct rte_eth_dev_info dev_info;
269 	struct rte_eth_conf port_conf;
270 	struct rte_eth_rxconf *rxconf;
271 	struct rte_eth_txconf *txconf;
272 	int16_t rx_rings, tx_rings;
273 	uint16_t rx_ring_size, tx_ring_size;
274 	int retval;
275 	uint16_t q;
276 
277 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
278 	rte_eth_dev_info_get (port, &dev_info);
279 
280 	if (dev_info.max_rx_queues > MAX_QUEUES) {
281 		rte_exit(EXIT_FAILURE,
282 			"please define MAX_QUEUES no less than %u in %s\n",
283 			dev_info.max_rx_queues, __FILE__);
284 	}
285 
286 	rxconf = &dev_info.default_rxconf;
287 	txconf = &dev_info.default_txconf;
288 	rxconf->rx_drop_en = 1;
289 
290 	/* Enable vlan offload */
291 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
292 
293 	/* Configure the number of supported virtio devices based on VMDQ limits. */
294 	num_devices = dev_info.max_vmdq_pools;
295 
296 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
297 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
298 
299 	/*
300 	 * When dequeue zero copy is enabled, guest Tx used vring will be
301 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
302 	 * (tx_ring_size here) must be small enough so that the driver will
303 	 * hit the free threshold easily and free mbufs timely. Otherwise,
304 	 * guest Tx vring would be starved.
305 	 */
306 	if (dequeue_zero_copy)
307 		tx_ring_size = 64;
308 
309 	tx_rings = (uint16_t)rte_lcore_count();
310 
311 	retval = validate_num_devices(MAX_DEVICES);
312 	if (retval < 0)
313 		return retval;
314 
315 	/* Get port configuration. */
316 	retval = get_eth_conf(&port_conf, num_devices);
317 	if (retval < 0)
318 		return retval;
319 	/* NIC queues are divided into pf queues and vmdq queues.  */
320 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
321 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
322 	num_vmdq_queues = num_devices * queues_per_pool;
323 	num_queues = num_pf_queues + num_vmdq_queues;
324 	vmdq_queue_base = dev_info.vmdq_queue_base;
325 	vmdq_pool_base  = dev_info.vmdq_pool_base;
326 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
327 		num_pf_queues, num_devices, queues_per_pool);
328 
329 	if (port >= rte_eth_dev_count()) return -1;
330 
331 	rx_rings = (uint16_t)dev_info.max_rx_queues;
332 	/* Configure ethernet device. */
333 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
334 	if (retval != 0) {
335 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
336 			port, strerror(-retval));
337 		return retval;
338 	}
339 
340 	/* Setup the queues. */
341 	for (q = 0; q < rx_rings; q ++) {
342 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
343 						rte_eth_dev_socket_id(port),
344 						rxconf,
345 						mbuf_pool);
346 		if (retval < 0) {
347 			RTE_LOG(ERR, VHOST_PORT,
348 				"Failed to setup rx queue %u of port %u: %s.\n",
349 				q, port, strerror(-retval));
350 			return retval;
351 		}
352 	}
353 	for (q = 0; q < tx_rings; q ++) {
354 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
355 						rte_eth_dev_socket_id(port),
356 						txconf);
357 		if (retval < 0) {
358 			RTE_LOG(ERR, VHOST_PORT,
359 				"Failed to setup tx queue %u of port %u: %s.\n",
360 				q, port, strerror(-retval));
361 			return retval;
362 		}
363 	}
364 
365 	/* Start the device. */
366 	retval  = rte_eth_dev_start(port);
367 	if (retval < 0) {
368 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
369 			port, strerror(-retval));
370 		return retval;
371 	}
372 
373 	if (promiscuous)
374 		rte_eth_promiscuous_enable(port);
375 
376 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
377 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
378 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
379 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
380 			(unsigned)port,
381 			vmdq_ports_eth_addr[port].addr_bytes[0],
382 			vmdq_ports_eth_addr[port].addr_bytes[1],
383 			vmdq_ports_eth_addr[port].addr_bytes[2],
384 			vmdq_ports_eth_addr[port].addr_bytes[3],
385 			vmdq_ports_eth_addr[port].addr_bytes[4],
386 			vmdq_ports_eth_addr[port].addr_bytes[5]);
387 
388 	return 0;
389 }
390 
391 /*
392  * Set socket file path.
393  */
394 static int
395 us_vhost_parse_socket_path(const char *q_arg)
396 {
397 	/* Reject paths that would not fit in a PATH_MAX sized buffer. */
398 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
399 		return -1;
400 
401 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL)
		return -1;
402 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
403 	nb_sockets++;
404 
405 	return 0;
406 }
407 
408 /*
409  * Parse the portmask provided at run time.
410  */
411 static int
412 parse_portmask(const char *portmask)
413 {
414 	char *end = NULL;
415 	unsigned long pm;
416 
417 	errno = 0;
418 
419 	/* parse hexadecimal string */
420 	pm = strtoul(portmask, &end, 16);
421 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
422 		return -1;
423 
424 	if (pm == 0)
425 		return -1;
426 
427 	return pm;
428 
429 }
430 
431 /*
 432  * Parse a numeric option at run time.
433  */
434 static int
435 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
436 {
437 	char *end = NULL;
438 	unsigned long num;
439 
440 	errno = 0;
441 
442 	/* parse unsigned int string */
443 	num = strtoul(q_arg, &end, 10);
444 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
445 		return -1;
446 
447 	if (num > max_valid_value)
448 		return -1;
449 
450 	return num;
451 
452 }
453 
454 /*
455  * Display usage
456  */
457 static void
458 us_vhost_usage(const char *prgname)
459 {
460 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
461 	"		--vm2vm [0|1|2]\n"
462 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
463 	"		--socket-file <path>\n"
465 	"		-p PORTMASK: Set mask for ports to be used by application\n"
466 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
467 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
468 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only effective if rx retries are enabled\n"
469 	"		--rx-retry-num [0-N]: the number of retries on rx. Only effective if rx retries are enabled\n"
470 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
471 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
472 	"		--socket-file: The path of the socket file.\n"
473 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
474 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
475 	"		--client register a vhost-user socket as client mode.\n"
476 	"		--dequeue-zero-copy enables dequeue zero copy\n",
477 	       prgname);
478 }
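/*
 * Illustrative invocation only; the binary name, EAL core/memory options and
 * the socket path below are placeholders, not values mandated by this file:
 *
 *   ./vhost-switch -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *       --stats 1 --mergeable 1
 */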
479 
480 /*
481  * Parse the arguments given in the command line of the application.
482  */
483 static int
484 us_vhost_parse_args(int argc, char **argv)
485 {
486 	int opt, ret;
487 	int option_index;
488 	unsigned i;
489 	const char *prgname = argv[0];
490 	static struct option long_option[] = {
491 		{"vm2vm", required_argument, NULL, 0},
492 		{"rx-retry", required_argument, NULL, 0},
493 		{"rx-retry-delay", required_argument, NULL, 0},
494 		{"rx-retry-num", required_argument, NULL, 0},
495 		{"mergeable", required_argument, NULL, 0},
496 		{"stats", required_argument, NULL, 0},
497 		{"socket-file", required_argument, NULL, 0},
498 		{"tx-csum", required_argument, NULL, 0},
499 		{"tso", required_argument, NULL, 0},
500 		{"client", no_argument, &client_mode, 1},
501 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
502 		{NULL, 0, 0, 0},
503 	};
504 
505 	/* Parse command line */
506 	while ((opt = getopt_long(argc, argv, "p:P",
507 			long_option, &option_index)) != EOF) {
508 		switch (opt) {
509 		/* Portmask */
510 		case 'p':
511 			ret = parse_portmask(optarg);
512 			if (ret < 0) {
513 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
514 				us_vhost_usage(prgname);
515 				return -1;
516 			}
			enabled_port_mask = ret;
517 			break;
518 
519 		case 'P':
520 			promiscuous = 1;
521 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
522 				ETH_VMDQ_ACCEPT_BROADCAST |
523 				ETH_VMDQ_ACCEPT_MULTICAST;
524 
525 			break;
526 
527 		case 0:
528 			/* Enable/disable vm2vm comms. */
529 			if (!strncmp(long_option[option_index].name, "vm2vm",
530 				MAX_LONG_OPT_SZ)) {
531 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
532 				if (ret == -1) {
533 					RTE_LOG(INFO, VHOST_CONFIG,
534 						"Invalid argument for "
535 						"vm2vm [0|1|2]\n");
536 					us_vhost_usage(prgname);
537 					return -1;
538 				} else {
539 					vm2vm_mode = (vm2vm_type)ret;
540 				}
541 			}
542 
543 			/* Enable/disable retries on RX. */
544 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
545 				ret = parse_num_opt(optarg, 1);
546 				if (ret == -1) {
547 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
548 					us_vhost_usage(prgname);
549 					return -1;
550 				} else {
551 					enable_retry = ret;
552 				}
553 			}
554 
555 			/* Enable/disable TX checksum offload. */
556 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
557 				ret = parse_num_opt(optarg, 1);
558 				if (ret == -1) {
559 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
560 					us_vhost_usage(prgname);
561 					return -1;
562 				} else
563 					enable_tx_csum = ret;
564 			}
565 
566 			/* Enable/disable TSO offload. */
567 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
568 				ret = parse_num_opt(optarg, 1);
569 				if (ret == -1) {
570 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
571 					us_vhost_usage(prgname);
572 					return -1;
573 				} else
574 					enable_tso = ret;
575 			}
576 
577 			/* Specify the retry delay time (in microseconds) on RX. */
578 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
579 				ret = parse_num_opt(optarg, INT32_MAX);
580 				if (ret == -1) {
581 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
582 					us_vhost_usage(prgname);
583 					return -1;
584 				} else {
585 					burst_rx_delay_time = ret;
586 				}
587 			}
588 
589 			/* Specify the number of retries on RX. */
590 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
591 				ret = parse_num_opt(optarg, INT32_MAX);
592 				if (ret == -1) {
593 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
594 					us_vhost_usage(prgname);
595 					return -1;
596 				} else {
597 					burst_rx_retry_num = ret;
598 				}
599 			}
600 
601 			/* Enable/disable RX mergeable buffers. */
602 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
603 				ret = parse_num_opt(optarg, 1);
604 				if (ret == -1) {
605 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
606 					us_vhost_usage(prgname);
607 					return -1;
608 				} else {
609 					mergeable = !!ret;
610 					if (ret) {
611 						vmdq_conf_default.rxmode.jumbo_frame = 1;
612 						vmdq_conf_default.rxmode.max_rx_pkt_len
613 							= JUMBO_FRAME_MAX_SIZE;
614 					}
615 				}
616 			}
617 
618 			/* Enable/disable stats. */
619 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
620 				ret = parse_num_opt(optarg, INT32_MAX);
621 				if (ret == -1) {
622 					RTE_LOG(INFO, VHOST_CONFIG,
623 						"Invalid argument for stats [0..N]\n");
624 					us_vhost_usage(prgname);
625 					return -1;
626 				} else {
627 					enable_stats = ret;
628 				}
629 			}
630 
631 			/* Set socket file path. */
632 			if (!strncmp(long_option[option_index].name,
633 						"socket-file", MAX_LONG_OPT_SZ)) {
634 				if (us_vhost_parse_socket_path(optarg) == -1) {
635 					RTE_LOG(INFO, VHOST_CONFIG,
636 					"Invalid argument for socket name (Max %d characters)\n",
637 					PATH_MAX);
638 					us_vhost_usage(prgname);
639 					return -1;
640 				}
641 			}
642 
643 			break;
644 
645 			/* Invalid option - print options. */
646 		default:
647 			us_vhost_usage(prgname);
648 			return -1;
649 		}
650 	}
651 
652 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
653 		if (enabled_port_mask & (1 << i))
654 			ports[num_ports++] = (uint8_t)i;
655 	}
656 
657 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
658 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
659 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
660 		return -1;
661 	}
662 
663 	return 0;
664 }
665 
666 /*
 667  * Update the global variable num_ports and the array ports[] according to the
 668  * number of ports in the system, and return the number of valid ports.
669  */
670 static unsigned check_ports_num(unsigned nb_ports)
671 {
672 	unsigned valid_num_ports = num_ports;
673 	unsigned portid;
674 
675 	if (num_ports > nb_ports) {
676 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
677 			num_ports, nb_ports);
678 		num_ports = nb_ports;
679 	}
680 
681 	for (portid = 0; portid < num_ports; portid ++) {
682 		if (ports[portid] >= nb_ports) {
683 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
684 				ports[portid], (nb_ports - 1));
685 			ports[portid] = INVALID_PORT_ID;
686 			valid_num_ports--;
687 		}
688 	}
689 	return valid_num_ports;
690 }
691 
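/*
 * Look up a vhost device by MAC address among the devices that are ready
 * for RX; returns NULL if no match is found.
 */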
692 static inline struct vhost_dev *__attribute__((always_inline))
693 find_vhost_dev(struct ether_addr *mac)
694 {
695 	struct vhost_dev *vdev;
696 
697 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
698 		if (vdev->ready == DEVICE_RX &&
699 		    is_same_ether_addr(mac, &vdev->mac_address))
700 			return vdev;
701 	}
702 
703 	return NULL;
704 }
705 
706 /*
 707  * This function learns the MAC address of the device and registers it, along
 708  * with a VLAN tag, with a VMDQ pool.
709  */
710 static int
711 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
712 {
713 	struct ether_hdr *pkt_hdr;
714 	int i, ret;
715 
716 	/* Learn MAC address of guest device from packet */
717 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
718 
719 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
720 		RTE_LOG(ERR, VHOST_DATA,
721 			"(%d) device is using a registered MAC!\n",
722 			vdev->vid);
723 		return -1;
724 	}
725 
726 	for (i = 0; i < ETHER_ADDR_LEN; i++)
727 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
728 
729 	/* vlan_tag currently uses the device_id. */
730 	vdev->vlan_tag = vlan_tags[vdev->vid];
731 
732 	/* Print out VMDQ registration info. */
733 	RTE_LOG(INFO, VHOST_DATA,
734 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
735 		vdev->vid,
736 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
737 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
738 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
739 		vdev->vlan_tag);
740 
741 	/* Register the MAC address. */
742 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
743 				(uint32_t)vdev->vid + vmdq_pool_base);
744 	if (ret)
745 		RTE_LOG(ERR, VHOST_DATA,
746 			"(%d) failed to add device MAC address to VMDQ\n",
747 			vdev->vid);
748 
749 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
750 
751 	/* Set device as ready for RX. */
752 	vdev->ready = DEVICE_RX;
753 
754 	return 0;
755 }
756 
757 /*
758  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
759  * queue before disabling RX on the device.
760  */
761 static inline void
762 unlink_vmdq(struct vhost_dev *vdev)
763 {
764 	unsigned i = 0;
765 	unsigned rx_count;
766 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
767 
768 	if (vdev->ready == DEVICE_RX) {
769 		/*clear MAC and VLAN settings*/
770 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
771 		for (i = 0; i < 6; i++)
772 			vdev->mac_address.addr_bytes[i] = 0;
773 
774 		vdev->vlan_tag = 0;
775 
776 		/*Clear out the receive buffers*/
777 		rx_count = rte_eth_rx_burst(ports[0],
778 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
779 
780 		while (rx_count) {
781 			for (i = 0; i < rx_count; i++)
782 				rte_pktmbuf_free(pkts_burst[i]);
783 
784 			rx_count = rte_eth_rx_burst(ports[0],
785 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
786 		}
787 
788 		vdev->ready = DEVICE_MAC_LEARNING;
789 	}
790 }
791 
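/*
 * Enqueue a single mbuf to the destination vhost device's RX virtqueue and
 * update the per-device statistics when stats are enabled.
 */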
792 static inline void __attribute__((always_inline))
793 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
794 	    struct rte_mbuf *m)
795 {
796 	uint16_t ret;
797 
798 	ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
799 	if (enable_stats) {
800 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
801 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
802 		src_vdev->stats.tx_total++;
803 		src_vdev->stats.tx += ret;
804 	}
805 }
806 
807 /*
 808  * Check if the packet destination MAC address is for a local device. If so,
 809  * put the packet on that device's RX queue. If not, return.
810  */
811 static inline int __attribute__((always_inline))
812 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
813 {
814 	struct ether_hdr *pkt_hdr;
815 	struct vhost_dev *dst_vdev;
816 
817 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
818 
819 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
820 	if (!dst_vdev)
821 		return -1;
822 
823 	if (vdev->vid == dst_vdev->vid) {
824 		RTE_LOG_DP(DEBUG, VHOST_DATA,
825 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
826 			vdev->vid);
827 		return 0;
828 	}
829 
830 	RTE_LOG_DP(DEBUG, VHOST_DATA,
831 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
832 
833 	if (unlikely(dst_vdev->remove)) {
834 		RTE_LOG_DP(DEBUG, VHOST_DATA,
835 			"(%d) device is marked for removal\n", dst_vdev->vid);
836 		return 0;
837 	}
838 
839 	virtio_xmit(dst_vdev, vdev, m);
840 	return 0;
841 }
842 
843 /*
 844  * Check if the destination MAC of a packet belongs to a local VM and, if so,
 845  * get its VLAN tag and the length offset to apply.
846  */
847 static inline int __attribute__((always_inline))
848 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
849 	uint32_t *offset, uint16_t *vlan_tag)
850 {
851 	struct vhost_dev *dst_vdev;
852 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
853 
854 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
855 	if (!dst_vdev)
856 		return 0;
857 
858 	if (vdev->vid == dst_vdev->vid) {
859 		RTE_LOG_DP(DEBUG, VHOST_DATA,
860 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
861 			vdev->vid);
862 		return -1;
863 	}
864 
865 	/*
 866 	 * HW VLAN strip will reduce the packet length by the length of the
 867 	 * VLAN tag, so we need to restore the packet length by adding it
 868 	 * back.
869 	 */
870 	*offset  = VLAN_HLEN;
871 	*vlan_tag = vlan_tags[vdev->vid];
872 
873 	RTE_LOG_DP(DEBUG, VHOST_DATA,
874 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
875 		vdev->vid, dst_vdev->vid, *vlan_tag);
876 
877 	return 0;
878 }
879 
880 static uint16_t
881 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
882 {
883 	if (ol_flags & PKT_TX_IPV4)
884 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
885 	else /* assume ethertype == ETHER_TYPE_IPv6 */
886 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
887 }
888 
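/*
 * Prepare a TSO packet coming from the guest for transmission on the NIC:
 * zero the IPv4 header checksum (recomputed by hardware) and seed the TCP
 * checksum with the pseudo-header checksum, as hardware offload expects.
 */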
889 static void virtio_tx_offload(struct rte_mbuf *m)
890 {
891 	void *l3_hdr;
892 	struct ipv4_hdr *ipv4_hdr = NULL;
893 	struct tcp_hdr *tcp_hdr = NULL;
894 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
895 
896 	l3_hdr = (char *)eth_hdr + m->l2_len;
897 
898 	if (m->ol_flags & PKT_TX_IPV4) {
899 		ipv4_hdr = l3_hdr;
900 		ipv4_hdr->hdr_checksum = 0;
901 		m->ol_flags |= PKT_TX_IP_CKSUM;
902 	}
903 
904 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
905 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
906 }
907 
908 static inline void
909 free_pkts(struct rte_mbuf **pkts, uint16_t n)
910 {
911 	while (n--)
912 		rte_pktmbuf_free(pkts[n]);
913 }
914 
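/*
 * Flush the per-core TX mbuf table to the physical port; any packets the
 * NIC did not accept are freed.
 */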
915 static inline void __attribute__((always_inline))
916 do_drain_mbuf_table(struct mbuf_table *tx_q)
917 {
918 	uint16_t count;
919 
920 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
921 				 tx_q->m_table, tx_q->len);
922 	if (unlikely(count < tx_q->len))
923 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
924 
925 	tx_q->len = 0;
926 }
927 
928 /*
929  * This function routes the TX packet to the correct interface. This
930  * may be a local device or the physical port.
931  */
932 static inline void __attribute__((always_inline))
933 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
934 {
935 	struct mbuf_table *tx_q;
936 	unsigned offset = 0;
937 	const uint16_t lcore_id = rte_lcore_id();
938 	struct ether_hdr *nh;
939 
940 
941 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
942 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
943 		struct vhost_dev *vdev2;
944 
945 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
946 			virtio_xmit(vdev2, vdev, m);
947 		}
948 		goto queue2nic;
949 	}
950 
951 	/* Check if the destination is a local VM. */
952 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
953 		rte_pktmbuf_free(m);
954 		return;
955 	}
956 
957 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
958 		if (unlikely(find_local_dest(vdev, m, &offset,
959 					     &vlan_tag) != 0)) {
960 			rte_pktmbuf_free(m);
961 			return;
962 		}
963 	}
964 
965 	RTE_LOG_DP(DEBUG, VHOST_DATA,
966 		"(%d) TX: MAC address is external\n", vdev->vid);
967 
968 queue2nic:
969 
970 	/* Add packet to the port TX queue. */
971 	tx_q = &lcore_tx_queue[lcore_id];
972 
973 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
974 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
975 		/* Guest has inserted the vlan tag. */
976 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
977 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
978 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
979 			(vh->vlan_tci != vlan_tag_be))
980 			vh->vlan_tci = vlan_tag_be;
981 	} else {
982 		m->ol_flags |= PKT_TX_VLAN_PKT;
983 
984 		/*
985 		 * Find the right seg to adjust the data len when offset is
986 		 * bigger than tail room size.
987 		 */
988 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
989 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
990 				m->data_len += offset;
991 			else {
992 				struct rte_mbuf *seg = m;
993 
994 				while ((seg->next != NULL) &&
995 					(offset > rte_pktmbuf_tailroom(seg)))
996 					seg = seg->next;
997 
998 				seg->data_len += offset;
999 			}
1000 			m->pkt_len += offset;
1001 		}
1002 
1003 		m->vlan_tci = vlan_tag;
1004 	}
1005 
1006 	if (m->ol_flags & PKT_TX_TCP_SEG)
1007 		virtio_tx_offload(m);
1008 
1009 	tx_q->m_table[tx_q->len++] = m;
1010 	if (enable_stats) {
1011 		vdev->stats.tx_total++;
1012 		vdev->stats.tx++;
1013 	}
1014 
1015 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1016 		do_drain_mbuf_table(tx_q);
1017 }
1018 
1019 
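/*
 * Flush a partially filled TX mbuf table if more than MBUF_TABLE_DRAIN_TSC
 * cycles have passed since the last flush.
 */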
1020 static inline void __attribute__((always_inline))
1021 drain_mbuf_table(struct mbuf_table *tx_q)
1022 {
1023 	static uint64_t prev_tsc;
1024 	uint64_t cur_tsc;
1025 
1026 	if (tx_q->len == 0)
1027 		return;
1028 
1029 	cur_tsc = rte_rdtsc();
1030 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1031 		prev_tsc = cur_tsc;
1032 
1033 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1034 			"TX queue drained after timeout with burst size %u\n",
1035 			tx_q->len);
1036 		do_drain_mbuf_table(tx_q);
1037 	}
1038 }
1039 
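/*
 * Receive a burst from the physical port's VMDQ queue bound to this vhost
 * device and enqueue it to the guest RX virtqueue, optionally waiting and
 * retrying while the virtqueue is short of free slots.
 */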
1040 static inline void __attribute__((always_inline))
1041 drain_eth_rx(struct vhost_dev *vdev)
1042 {
1043 	uint16_t rx_count, enqueue_count;
1044 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1045 
1046 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1047 				    pkts, MAX_PKT_BURST);
1048 	if (!rx_count)
1049 		return;
1050 
1051 	/*
 1052 	 * When "enable_retry" is set, wait and retry when there are not
 1053 	 * enough free slots in the queue to hold @rx_count packets, to
 1054 	 * reduce packet loss.
1055 	 */
1056 	if (enable_retry &&
1057 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1058 			VIRTIO_RXQ))) {
1059 		uint32_t retry;
1060 
1061 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1062 			rte_delay_us(burst_rx_delay_time);
1063 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1064 					VIRTIO_RXQ))
1065 				break;
1066 		}
1067 	}
1068 
1069 	enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1070 						pkts, rx_count);
1071 	if (enable_stats) {
1072 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1073 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1074 	}
1075 
1076 	free_pkts(pkts, rx_count);
1077 }
1078 
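/*
 * Dequeue a burst from the guest TX virtqueue; learn the MAC/VLAN binding
 * from the first packet if needed, then route each packet to a local vhost
 * device or out of the physical port.
 */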
1079 static inline void __attribute__((always_inline))
1080 drain_virtio_tx(struct vhost_dev *vdev)
1081 {
1082 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1083 	uint16_t count;
1084 	uint16_t i;
1085 
1086 	count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ, mbuf_pool,
1087 					pkts, MAX_PKT_BURST);
1088 
1089 	/* setup VMDq for the first packet */
1090 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1091 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1092 			free_pkts(pkts, count);
1093 	}
1094 
1095 	for (i = 0; i < count; ++i)
1096 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1097 }
1098 
1099 /*
1100  * Main function of vhost-switch. It basically does:
1101  *
1102  * for each vhost device {
1103  *    - drain_eth_rx()
1104  *
 1105  *      Which drains the host eth Rx queue linked to the vhost device
 1106  *      and delivers all of the packets to the guest virtio Rx ring
 1107  *      associated with this vhost device.
1108  *
1109  *    - drain_virtio_tx()
1110  *
 1111  *      Which drains the guest virtio Tx queue and delivers all of them
1112  *      to the target, which could be another vhost device, or the
1113  *      physical eth dev. The route is done in function "virtio_tx_route".
1114  * }
1115  */
1116 static int
1117 switch_worker(void *arg __rte_unused)
1118 {
1119 	unsigned i;
1120 	unsigned lcore_id = rte_lcore_id();
1121 	struct vhost_dev *vdev;
1122 	struct mbuf_table *tx_q;
1123 
1124 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1125 
1126 	tx_q = &lcore_tx_queue[lcore_id];
1127 	for (i = 0; i < rte_lcore_count(); i++) {
1128 		if (lcore_ids[i] == lcore_id) {
1129 			tx_q->txq_id = i;
1130 			break;
1131 		}
1132 	}
1133 
1134 	while(1) {
1135 		drain_mbuf_table(tx_q);
1136 
1137 		/*
 1138 		 * If requested, inform the configuration core that we have
 1139 		 * exited the linked list and that no devices are in use.
1140 		 */
1141 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1142 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1143 
1144 		/*
1145 		 * Process vhost devices
1146 		 */
1147 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1148 			      lcore_vdev_entry) {
1149 			if (unlikely(vdev->remove)) {
1150 				unlink_vmdq(vdev);
1151 				vdev->ready = DEVICE_SAFE_REMOVE;
1152 				continue;
1153 			}
1154 
1155 			if (likely(vdev->ready == DEVICE_RX))
1156 				drain_eth_rx(vdev);
1157 
1158 			if (likely(!vdev->remove))
1159 				drain_virtio_tx(vdev);
1160 		}
1161 	}
1162 
1163 	return 0;
1164 }
1165 
1166 /*
1167  * Remove a device from the specific data core linked list and from the
 1168  * main linked list. Synchronization occurs through the use of the
1169  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1170  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1171  */
1172 static void
1173 destroy_device(int vid)
1174 {
1175 	struct vhost_dev *vdev = NULL;
1176 	int lcore;
1177 
1178 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1179 		if (vdev->vid == vid)
1180 			break;
1181 	}
1182 	if (!vdev)
1183 		return;
1184 	/* Set the remove flag. */
1185 	vdev->remove = 1;
1186 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1187 		rte_pause();
1188 	}
1189 
1190 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1191 		     lcore_vdev_entry);
1192 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1193 
1194 
1195 	/* Set the dev_removal_flag on each lcore. */
1196 	RTE_LCORE_FOREACH_SLAVE(lcore)
1197 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1198 
1199 	/*
1200 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1201 	 * we can be sure that they can no longer access the device removed
1202 	 * from the linked lists and that the devices are no longer in use.
1203 	 */
1204 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1205 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1206 			rte_pause();
1207 	}
1208 
1209 	lcore_info[vdev->coreid].device_num--;
1210 
1211 	RTE_LOG(INFO, VHOST_DATA,
1212 		"(%d) device has been removed from data core\n",
1213 		vdev->vid);
1214 
1215 	rte_free(vdev);
1216 }
1217 
1218 /*
 1219  * A new device is added to a data core. First the device is added to the main
 1220  * linked list and then allocated to a specific data core.
1221  */
1222 static int
1223 new_device(int vid)
1224 {
1225 	int lcore, core_add = 0;
1226 	uint32_t device_num_min = num_devices;
1227 	struct vhost_dev *vdev;
1228 
1229 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1230 	if (vdev == NULL) {
1231 		RTE_LOG(INFO, VHOST_DATA,
1232 			"(%d) couldn't allocate memory for vhost dev\n",
1233 			vid);
1234 		return -1;
1235 	}
1236 	vdev->vid = vid;
1237 
1238 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1239 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1240 
1241 	/*reset ready flag*/
1242 	vdev->ready = DEVICE_MAC_LEARNING;
1243 	vdev->remove = 0;
1244 
1245 	/* Find a suitable lcore to add the device. */
1246 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1247 		if (lcore_info[lcore].device_num < device_num_min) {
1248 			device_num_min = lcore_info[lcore].device_num;
1249 			core_add = lcore;
1250 		}
1251 	}
1252 	vdev->coreid = core_add;
1253 
1254 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1255 			  lcore_vdev_entry);
1256 	lcore_info[vdev->coreid].device_num++;
1257 
1258 	/* Disable notifications. */
1259 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1260 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1261 
1262 	RTE_LOG(INFO, VHOST_DATA,
1263 		"(%d) device has been added to data core %d\n",
1264 		vid, vdev->coreid);
1265 
1266 	return 0;
1267 }
1268 
1269 /*
 1270  * These callbacks allow devices to be added to the data core when
 1271  * configuration has been fully completed.
1272  */
1273 static const struct vhost_device_ops virtio_net_device_ops =
1274 {
1275 	.new_device =  new_device,
1276 	.destroy_device = destroy_device,
1277 };
1278 
1279 /*
 1280  * This thread will wake up periodically to print stats if the user has
1281  * enabled them.
1282  */
1283 static void
1284 print_stats(void)
1285 {
1286 	struct vhost_dev *vdev;
1287 	uint64_t tx_dropped, rx_dropped;
1288 	uint64_t tx, tx_total, rx, rx_total;
1289 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1290 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1291 
1292 	while(1) {
1293 		sleep(enable_stats);
1294 
1295 		/* Clear screen and move to top left */
1296 		printf("%s%s\n", clr, top_left);
1297 		printf("Device statistics =================================\n");
1298 
1299 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1300 			tx_total   = vdev->stats.tx_total;
1301 			tx         = vdev->stats.tx;
1302 			tx_dropped = tx_total - tx;
1303 
1304 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1305 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1306 			rx_dropped = rx_total - rx;
1307 
1308 			printf("Statistics for device %d\n"
1309 				"-----------------------\n"
1310 				"TX total:              %" PRIu64 "\n"
1311 				"TX dropped:            %" PRIu64 "\n"
1312 				"TX successful:         %" PRIu64 "\n"
1313 				"RX total:              %" PRIu64 "\n"
1314 				"RX dropped:            %" PRIu64 "\n"
1315 				"RX successful:         %" PRIu64 "\n",
1316 				vdev->vid,
1317 				tx_total, tx_dropped, tx,
1318 				rx_total, rx_dropped, rx);
1319 		}
1320 
1321 		printf("===================================================\n");
1322 	}
1323 }
1324 
1325 static void
1326 unregister_drivers(int socket_num)
1327 {
1328 	int i, ret;
1329 
1330 	for (i = 0; i < socket_num; i++) {
1331 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1332 		if (ret != 0)
1333 			RTE_LOG(ERR, VHOST_CONFIG,
1334 				"Fail to unregister vhost driver for %s.\n",
1335 				socket_files + i * PATH_MAX);
1336 	}
1337 }
1338 
1339 /* When we receive an INT signal, unregister the vhost driver. */
1340 static void
1341 sigint_handler(__rte_unused int signum)
1342 {
1343 	/* Unregister vhost driver. */
1344 	unregister_drivers(nb_sockets);
1345 
1346 	exit(0);
1347 }
1348 
1349 /*
 1350  * While creating an mbuf pool, one key thing is to figure out how
 1351  * many mbuf entries are enough for our use. FYI, here are some
 1352  * guidelines:
 1353  *
 1354  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 1355  *
 1356  * - For each switch core (a CPU core that does the packet switching), we
 1357  *   also need to make some reservation for receiving the packets from the
 1358  *   virtio Tx queue. How many is enough depends on the usage. It's normally
 1359  *   a simple calculation like the following:
 1360  *
 1361  *       MAX_PKT_BURST * max packet size / mbuf size
 1362  *
 1363  *   So, we definitely need to allocate more mbufs when TSO is enabled.
 1364  *
 1365  * - Similarly, for each switching core, we should reserve @nr_rx_desc
 1366  *   mbufs for receiving the packets from the physical NIC device.
 1367  *
 1368  * - We also need to make sure, for each switch core, we have allocated
 1369  *   enough mbufs to fill up the mbuf cache.
1370  */
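/*
 * A rough worked example with the defaults in this file (illustrative
 * numbers only): with mergeable buffers enabled (mtu = 9000) and assuming
 * the common 2 KB data room plus headroom for RTE_MBUF_DEFAULT_BUF_SIZE,
 * the per-core burst term works out to roughly
 *
 *     (9000 + 2176) * 32 / 2048 ~= 175 mbufs
 *
 * which is small compared to the 1024 mbufs reserved per RX queue by the
 * default descriptor count.
 */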
1371 static void
1372 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1373 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1374 {
1375 	uint32_t nr_mbufs;
1376 	uint32_t nr_mbufs_per_core;
1377 	uint32_t mtu = 1500;
1378 
1379 	if (mergeable)
1380 		mtu = 9000;
1381 	if (enable_tso)
1382 		mtu = 64 * 1024;
1383 
1384 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1385 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1386 	nr_mbufs_per_core += nr_rx_desc;
1387 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1388 
1389 	nr_mbufs  = nr_queues * nr_rx_desc;
1390 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1391 	nr_mbufs *= nr_port;
1392 
1393 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1394 					    nr_mbuf_cache, 0, mbuf_size,
1395 					    rte_socket_id());
1396 	if (mbuf_pool == NULL)
1397 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1398 }
1399 
1400 /*
1401  * Main function, does initialisation and calls the per-lcore functions.
1402  */
1403 int
1404 main(int argc, char *argv[])
1405 {
1406 	unsigned lcore_id, core_id = 0;
1407 	unsigned nb_ports, valid_num_ports;
1408 	int ret, i;
1409 	uint8_t portid;
1410 	static pthread_t tid;
1411 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1412 	uint64_t flags = 0;
1413 
1414 	signal(SIGINT, sigint_handler);
1415 
1416 	/* init EAL */
1417 	ret = rte_eal_init(argc, argv);
1418 	if (ret < 0)
1419 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1420 	argc -= ret;
1421 	argv += ret;
1422 
1423 	/* parse app arguments */
1424 	ret = us_vhost_parse_args(argc, argv);
1425 	if (ret < 0)
1426 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1427 
1428 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1429 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1430 
1431 		if (rte_lcore_is_enabled(lcore_id))
1432 			lcore_ids[core_id++] = lcore_id;
1433 	}
1434 
1435 	if (rte_lcore_count() > RTE_MAX_LCORE)
1436 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1437 
1438 	/* Get the number of physical ports. */
1439 	nb_ports = rte_eth_dev_count();
1440 
1441 	/*
 1442 	 * Update the global variable num_ports and the global array ports[],
 1443 	 * and get the number of valid ports according to the system port count.
1444 	 */
1445 	valid_num_ports = check_ports_num(nb_ports);
1446 
1447 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1448 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1449 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1450 		return -1;
1451 	}
1452 
1453 	/*
1454 	 * FIXME: here we are trying to allocate mbufs big enough for
1455 	 * @MAX_QUEUES, but the truth is we're never going to use that
1456 	 * many queues here. We probably should only do allocation for
1457 	 * those queues we are going to use.
1458 	 */
1459 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1460 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1461 
1462 	if (vm2vm_mode == VM2VM_HARDWARE) {
1463 		/* Enable VT loop back to let L2 switch to do it. */
1464 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1465 		RTE_LOG(DEBUG, VHOST_CONFIG,
1466 			"Enable loop back for L2 switch in vmdq.\n");
1467 	}
1468 
1469 	/* initialize all ports */
1470 	for (portid = 0; portid < nb_ports; portid++) {
1471 		/* skip ports that are not enabled */
1472 		if ((enabled_port_mask & (1 << portid)) == 0) {
1473 			RTE_LOG(INFO, VHOST_PORT,
1474 				"Skipping disabled port %d\n", portid);
1475 			continue;
1476 		}
1477 		if (port_init(portid) != 0)
1478 			rte_exit(EXIT_FAILURE,
1479 				"Cannot initialize network ports\n");
1480 	}
1481 
1482 	/* Enable stats if the user option is set. */
1483 	if (enable_stats) {
1484 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1485 		if (ret != 0)
1486 			rte_exit(EXIT_FAILURE,
1487 				"Cannot create print-stats thread\n");
1488 
1489 		/* Set thread_name to aid in debugging. */
1490 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1491 		ret = rte_thread_setname(tid, thread_name);
1492 		if (ret != 0)
1493 			RTE_LOG(DEBUG, VHOST_CONFIG,
1494 				"Cannot set print-stats name\n");
1495 	}
1496 
1497 	/* Launch all data cores. */
1498 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1499 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1500 
1501 	if (client_mode)
1502 		flags |= RTE_VHOST_USER_CLIENT;
1503 
1504 	if (dequeue_zero_copy)
1505 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1506 
1507 	/* Register vhost user driver to handle vhost messages. */
1508 	for (i = 0; i < nb_sockets; i++) {
1509 		char *file = socket_files + i * PATH_MAX;
1510 		ret = rte_vhost_driver_register(file, flags);
1511 		if (ret != 0) {
1512 			unregister_drivers(i);
1513 			rte_exit(EXIT_FAILURE,
1514 				"vhost driver register failure.\n");
1515 		}
1516 		if (mergeable == 0) {
1517 			rte_vhost_driver_disable_features(file,
1518 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1519 		}
1520 
1521 		if (enable_tx_csum == 0) {
1522 			rte_vhost_driver_disable_features(file,
1523 				1ULL << VIRTIO_NET_F_CSUM);
1524 		}
1525 
1526 		if (enable_tso == 0) {
1527 			rte_vhost_driver_disable_features(file,
1528 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1529 			rte_vhost_driver_disable_features(file,
1530 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1531 			rte_vhost_driver_disable_features(file,
1532 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1533 			rte_vhost_driver_disable_features(file,
1534 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1535 		}
1536 
1537 		if (promiscuous) {
1538 			rte_vhost_driver_enable_features(file,
1539 				1ULL << VIRTIO_NET_F_CTRL_RX);
1540 		}
1541 
1542 		ret = rte_vhost_driver_callback_register(file,
1543 			&virtio_net_device_ops);
1544 		if (ret != 0) {
1545 			rte_exit(EXIT_FAILURE,
1546 				"failed to register vhost driver callbacks.\n");
1547 		}
1548 
1549 		if (rte_vhost_driver_start(file) < 0) {
1550 			rte_exit(EXIT_FAILURE,
1551 				"failed to start vhost driver.\n");
1552 		}
1553 	}
1554 
1555 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1556 		rte_eal_wait_lcore(lcore_id);
1557 
1558 	return 0;
1559 
1560 }
1561