xref: /dpdk/examples/vhost/main.c (revision 4e30ead5e7ca886535e2b30632b2948d2aac1681)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_vhost.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 
56 #include "main.h"
57 
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61 
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64 
65 #define MBUF_CACHE_SIZE	128
66 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
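/*
 * Note (editorial): RTE_MBUF_DEFAULT_BUF_SIZE already accounts for
 * RTE_PKTMBUF_HEADROOM, which is why create_mbuf_pool() below subtracts the
 * headroom again when estimating how many mbufs a burst of large packets uses.
 */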
67 
68 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
69 
70 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
71 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
72 
73 #define JUMBO_FRAME_MAX_SIZE    0x2600
74 
75 /* State of virtio device. */
76 #define DEVICE_MAC_LEARNING 0
77 #define DEVICE_RX			1
78 #define DEVICE_SAFE_REMOVE	2
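/*
 * Device lifecycle in this example: a vhost device starts in
 * DEVICE_MAC_LEARNING, moves to DEVICE_RX once its MAC address has been
 * learnt and bound to a VMDQ pool (see link_vmdq()), and is put into
 * DEVICE_SAFE_REMOVE while it is being torn down (see switch_worker()
 * and destroy_device()).
 */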
79 
80 /* Configurable number of RX/TX ring descriptors */
81 #define RTE_TEST_RX_DESC_DEFAULT 1024
82 #define RTE_TEST_TX_DESC_DEFAULT 512
83 
84 #define INVALID_PORT_ID 0xFF
85 
86 /* Max number of devices. Limited by vmdq. */
87 #define MAX_DEVICES 64
88 
89 /* Size of buffers used for snprintfs. */
90 #define MAX_PRINT_BUFF 6072
91 
92 /* Maximum long option length for option parsing. */
93 #define MAX_LONG_OPT_SZ 64
94 
95 /* mask of enabled ports */
96 static uint32_t enabled_port_mask = 0;
97 
98 /* Promiscuous mode */
99 static uint32_t promiscuous;
100 
101 /* number of devices/queues to support */
102 static uint32_t num_queues = 0;
103 static uint32_t num_devices;
104 
105 static struct rte_mempool *mbuf_pool;
106 static int mergeable;
107 
108 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
109 typedef enum {
110 	VM2VM_DISABLED = 0,
111 	VM2VM_SOFTWARE = 1,
112 	VM2VM_HARDWARE = 2,
113 	VM2VM_LAST
114 } vm2vm_type;
115 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
116 
117 /* Enable stats. */
118 static uint32_t enable_stats = 0;
119 /* Enable retries on RX. */
120 static uint32_t enable_retry = 1;
121 
122 /* Disable TX checksum offload */
123 static uint32_t enable_tx_csum;
124 
125 /* Disable TSO offload */
126 static uint32_t enable_tso;
127 
128 static int client_mode;
129 static int dequeue_zero_copy;
130 
131 static int builtin_net_driver;
132 
133 /* Specify timeout (in microseconds) between retries on RX. */
134 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
135 /* Specify the number of retries on RX. */
136 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
137 
138 /* Socket file paths. Can be set by user */
139 static char *socket_files;
140 static int nb_sockets;
141 
142 /* Empty VMDQ configuration structure. Filled in programmatically. */
143 static struct rte_eth_conf vmdq_conf_default = {
144 	.rxmode = {
145 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
146 		.split_hdr_size = 0,
147 		.header_split   = 0, /**< Header Split disabled */
148 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
149 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
150 		/*
151 		 * VLAN strip is necessary for 1G NICs such as the I350;
152 		 * it fixes a bug where IPv4 forwarding in the guest could not
153 		 * forward packets from one virtio dev to another virtio dev.
154 		 */
155 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
156 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
157 		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
158 	},
159 
160 	.txmode = {
161 		.mq_mode = ETH_MQ_TX_NONE,
162 	},
163 	.rx_adv_conf = {
164 		/*
165 		 * should be overridden separately in code with
166 		 * appropriate values
167 		 */
168 		.vmdq_rx_conf = {
169 			.nb_queue_pools = ETH_8_POOLS,
170 			.enable_default_pool = 0,
171 			.default_pool = 0,
172 			.nb_pool_maps = 0,
173 			.pool_map = {{0, 0},},
174 		},
175 	},
176 };
177 
178 static unsigned lcore_ids[RTE_MAX_LCORE];
179 static uint8_t ports[RTE_MAX_ETHPORTS];
180 static unsigned num_ports = 0; /**< The number of ports specified in command line */
181 static uint16_t num_pf_queues, num_vmdq_queues;
182 static uint16_t vmdq_pool_base, vmdq_queue_base;
183 static uint16_t queues_per_pool;
184 
185 const uint16_t vlan_tags[] = {
186 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
187 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
188 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
189 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
190 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
191 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
192 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
193 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
194 };
195 
196 /* ethernet addresses of ports */
197 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
198 
199 static struct vhost_dev_tailq_list vhost_dev_list =
200 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
201 
202 static struct lcore_info lcore_info[RTE_MAX_LCORE];
203 
204 /* Used for queueing bursts of TX packets. */
205 struct mbuf_table {
206 	unsigned len;
207 	unsigned txq_id;
208 	struct rte_mbuf *m_table[MAX_PKT_BURST];
209 };
210 
211 /* TX queue for each data core. */
212 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
213 
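/*
 * MBUF_TABLE_DRAIN_TSC converts BURST_TX_DRAIN_US into TSC cycles:
 * (TSC cycles per microsecond, rounded up) * drain interval in us.
 * As a purely illustrative example, assuming a 2.0 GHz TSC this works
 * out to 2000 cycles/us * 100 us = 200000 cycles between forced drains.
 */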
214 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
215 				 / US_PER_S * BURST_TX_DRAIN_US)
216 #define VLAN_HLEN       4
217 
218 /*
219  * Builds up the correct configuration for VMDQ VLAN pool map
220  * according to the pool & queue limits.
221  */
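/*
 * Illustrative example: with num_devices == 8, the loop below produces
 * pool_map[i] = { .vlan_id = vlan_tags[i], .pools = 1UL << i } for i = 0..7,
 * i.e. VLAN 1000 maps to pool 0, VLAN 1001 to pool 1, and so on.
 */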
222 static inline int
223 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
224 {
225 	struct rte_eth_vmdq_rx_conf conf;
226 	struct rte_eth_vmdq_rx_conf *def_conf =
227 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
228 	unsigned i;
229 
230 	memset(&conf, 0, sizeof(conf));
231 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
232 	conf.nb_pool_maps = num_devices;
233 	conf.enable_loop_back = def_conf->enable_loop_back;
234 	conf.rx_mode = def_conf->rx_mode;
235 
236 	for (i = 0; i < conf.nb_pool_maps; i++) {
237 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
238 		conf.pool_map[i].pools = (1UL << i);
239 	}
240 
241 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
242 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
243 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
244 	return 0;
245 }
246 
247 /*
248  * Validate the device number against the max pool number obtained from
249  * dev_info. If the device number is invalid, log an error message and
250  * return -1. Each device must have its own pool.
251  */
252 static inline int
253 validate_num_devices(uint32_t max_nb_devices)
254 {
255 	if (num_devices > max_nb_devices) {
256 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
257 		return -1;
258 	}
259 	return 0;
260 }
261 
262 /*
263  * Initialises a given port using global settings, with the RX buffers
264  * coming from the global mbuf_pool.
265  */
266 static inline int
267 port_init(uint8_t port)
268 {
269 	struct rte_eth_dev_info dev_info;
270 	struct rte_eth_conf port_conf;
271 	struct rte_eth_rxconf *rxconf;
272 	struct rte_eth_txconf *txconf;
273 	int16_t rx_rings, tx_rings;
274 	uint16_t rx_ring_size, tx_ring_size;
275 	int retval;
276 	uint16_t q;
277 
278 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
279 	rte_eth_dev_info_get (port, &dev_info);
280 
281 	if (dev_info.max_rx_queues > MAX_QUEUES) {
282 		rte_exit(EXIT_FAILURE,
283 			"please define MAX_QUEUES no less than %u in %s\n",
284 			dev_info.max_rx_queues, __FILE__);
285 	}
286 
287 	rxconf = &dev_info.default_rxconf;
288 	txconf = &dev_info.default_txconf;
289 	rxconf->rx_drop_en = 1;
290 
291 	/* Enable vlan offload */
292 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
293 
294 	/* configure the number of supported virtio devices based on VMDQ limits */
295 	num_devices = dev_info.max_vmdq_pools;
296 
297 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
298 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
299 
300 	/*
301 	 * When dequeue zero copy is enabled, guest Tx used vring will be
302 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
303 	 * (tx_ring_size here) must be small enough so that the driver will
304 	 * hit the free threshold easily and free mbufs timely. Otherwise,
305 	 * guest Tx vring would be starved.
306 	 */
307 	if (dequeue_zero_copy)
308 		tx_ring_size = 64;
309 
310 	tx_rings = (uint16_t)rte_lcore_count();
311 
312 	retval = validate_num_devices(MAX_DEVICES);
313 	if (retval < 0)
314 		return retval;
315 
316 	/* Get port configuration. */
317 	retval = get_eth_conf(&port_conf, num_devices);
318 	if (retval < 0)
319 		return retval;
320 	/* NIC queues are divided into pf queues and vmdq queues.  */
321 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
322 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
323 	num_vmdq_queues = num_devices * queues_per_pool;
324 	num_queues = num_pf_queues + num_vmdq_queues;
325 	vmdq_queue_base = dev_info.vmdq_queue_base;
326 	vmdq_pool_base  = dev_info.vmdq_pool_base;
327 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
328 		num_pf_queues, num_devices, queues_per_pool);
329 
330 	if (port >= rte_eth_dev_count()) return -1;
331 
332 	rx_rings = (uint16_t)dev_info.max_rx_queues;
333 	/* Configure ethernet device. */
334 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
335 	if (retval != 0) {
336 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
337 			port, strerror(-retval));
338 		return retval;
339 	}
340 
341 	/* Setup the queues. */
342 	for (q = 0; q < rx_rings; q ++) {
343 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
344 						rte_eth_dev_socket_id(port),
345 						rxconf,
346 						mbuf_pool);
347 		if (retval < 0) {
348 			RTE_LOG(ERR, VHOST_PORT,
349 				"Failed to setup rx queue %u of port %u: %s.\n",
350 				q, port, strerror(-retval));
351 			return retval;
352 		}
353 	}
354 	for (q = 0; q < tx_rings; q ++) {
355 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
356 						rte_eth_dev_socket_id(port),
357 						txconf);
358 		if (retval < 0) {
359 			RTE_LOG(ERR, VHOST_PORT,
360 				"Failed to setup tx queue %u of port %u: %s.\n",
361 				q, port, strerror(-retval));
362 			return retval;
363 		}
364 	}
365 
366 	/* Start the device. */
367 	retval  = rte_eth_dev_start(port);
368 	if (retval < 0) {
369 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
370 			port, strerror(-retval));
371 		return retval;
372 	}
373 
374 	if (promiscuous)
375 		rte_eth_promiscuous_enable(port);
376 
377 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
378 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
379 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
380 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
381 			(unsigned)port,
382 			vmdq_ports_eth_addr[port].addr_bytes[0],
383 			vmdq_ports_eth_addr[port].addr_bytes[1],
384 			vmdq_ports_eth_addr[port].addr_bytes[2],
385 			vmdq_ports_eth_addr[port].addr_bytes[3],
386 			vmdq_ports_eth_addr[port].addr_bytes[4],
387 			vmdq_ports_eth_addr[port].addr_bytes[5]);
388 
389 	return 0;
390 }
391 
392 /*
393  * Set socket file path.
394  */
395 static int
396 us_vhost_parse_socket_path(const char *q_arg)
397 {
398 	/* reject paths that are too long to fit in PATH_MAX */
399 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
400 		return -1;
401 
402 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
403 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
404 	nb_sockets++;
405 
406 	return 0;
407 }
408 
409 /*
410  * Parse the portmask provided at run time.
411  */
412 static int
413 parse_portmask(const char *portmask)
414 {
415 	char *end = NULL;
416 	unsigned long pm;
417 
418 	errno = 0;
419 
420 	/* parse hexadecimal string */
421 	pm = strtoul(portmask, &end, 16);
422 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
423 		return -1;
424 
425 	if (pm == 0)
426 		return -1;
427 
428 	return pm;
429 
430 }
431 
432 /*
433  * Parse num options at run time.
434  */
435 static int
436 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
437 {
438 	char *end = NULL;
439 	unsigned long num;
440 
441 	errno = 0;
442 
443 	/* parse unsigned int string */
444 	num = strtoul(q_arg, &end, 10);
445 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
446 		return -1;
447 
448 	if (num > max_valid_value)
449 		return -1;
450 
451 	return num;
452 
453 }
454 
455 /*
456  * Display usage
457  */
458 static void
459 us_vhost_usage(const char *prgname)
460 {
461 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
462 	"		--vm2vm [0|1|2]\n"
463 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
464 	"		--socket-file <path>\n"
465 	"		--nb-devices ND\n"
466 	"		-p PORTMASK: Set mask for ports to be used by application\n"
467 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
468 	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Retry if the destination queue is full\n"
469 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
470 	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
471 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
472 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
473 	"		--socket-file: The path of the socket file.\n"
474 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
475 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
476 	"		--client register a vhost-user socket in client mode.\n"
477 	"		--dequeue-zero-copy enables dequeue zero copy\n",
478 	       prgname);
479 }
480 
481 /*
482  * Parse the arguments given in the command line of the application.
483  */
484 static int
485 us_vhost_parse_args(int argc, char **argv)
486 {
487 	int opt, ret;
488 	int option_index;
489 	unsigned i;
490 	const char *prgname = argv[0];
491 	static struct option long_option[] = {
492 		{"vm2vm", required_argument, NULL, 0},
493 		{"rx-retry", required_argument, NULL, 0},
494 		{"rx-retry-delay", required_argument, NULL, 0},
495 		{"rx-retry-num", required_argument, NULL, 0},
496 		{"mergeable", required_argument, NULL, 0},
497 		{"stats", required_argument, NULL, 0},
498 		{"socket-file", required_argument, NULL, 0},
499 		{"tx-csum", required_argument, NULL, 0},
500 		{"tso", required_argument, NULL, 0},
501 		{"client", no_argument, &client_mode, 1},
502 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
503 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
504 		{NULL, 0, 0, 0},
505 	};
506 
507 	/* Parse command line */
508 	while ((opt = getopt_long(argc, argv, "p:P",
509 			long_option, &option_index)) != EOF) {
510 		switch (opt) {
511 		/* Portmask */
512 		case 'p':
513 			enabled_port_mask = parse_portmask(optarg);
514 			if (enabled_port_mask == 0) {
515 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
516 				us_vhost_usage(prgname);
517 				return -1;
518 			}
519 			break;
520 
521 		case 'P':
522 			promiscuous = 1;
523 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
524 				ETH_VMDQ_ACCEPT_BROADCAST |
525 				ETH_VMDQ_ACCEPT_MULTICAST;
526 
527 			break;
528 
529 		case 0:
530 			/* Enable/disable vm2vm comms. */
531 			if (!strncmp(long_option[option_index].name, "vm2vm",
532 				MAX_LONG_OPT_SZ)) {
533 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
534 				if (ret == -1) {
535 					RTE_LOG(INFO, VHOST_CONFIG,
536 						"Invalid argument for "
537 						"vm2vm [0|1|2]\n");
538 					us_vhost_usage(prgname);
539 					return -1;
540 				} else {
541 					vm2vm_mode = (vm2vm_type)ret;
542 				}
543 			}
544 
545 			/* Enable/disable retries on RX. */
546 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
547 				ret = parse_num_opt(optarg, 1);
548 				if (ret == -1) {
549 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
550 					us_vhost_usage(prgname);
551 					return -1;
552 				} else {
553 					enable_retry = ret;
554 				}
555 			}
556 
557 			/* Enable/disable TX checksum offload. */
558 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
559 				ret = parse_num_opt(optarg, 1);
560 				if (ret == -1) {
561 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
562 					us_vhost_usage(prgname);
563 					return -1;
564 				} else
565 					enable_tx_csum = ret;
566 			}
567 
568 			/* Enable/disable TSO offload. */
569 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
570 				ret = parse_num_opt(optarg, 1);
571 				if (ret == -1) {
572 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
573 					us_vhost_usage(prgname);
574 					return -1;
575 				} else
576 					enable_tso = ret;
577 			}
578 
579 			/* Specify the retry delay time (in microseconds) on RX. */
580 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
581 				ret = parse_num_opt(optarg, INT32_MAX);
582 				if (ret == -1) {
583 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
584 					us_vhost_usage(prgname);
585 					return -1;
586 				} else {
587 					burst_rx_delay_time = ret;
588 				}
589 			}
590 
591 			/* Specify the number of retries on RX. */
592 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
593 				ret = parse_num_opt(optarg, INT32_MAX);
594 				if (ret == -1) {
595 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
596 					us_vhost_usage(prgname);
597 					return -1;
598 				} else {
599 					burst_rx_retry_num = ret;
600 				}
601 			}
602 
603 			/* Enable/disable RX mergeable buffers. */
604 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
605 				ret = parse_num_opt(optarg, 1);
606 				if (ret == -1) {
607 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
608 					us_vhost_usage(prgname);
609 					return -1;
610 				} else {
611 					mergeable = !!ret;
612 					if (ret) {
613 						vmdq_conf_default.rxmode.jumbo_frame = 1;
614 						vmdq_conf_default.rxmode.max_rx_pkt_len
615 							= JUMBO_FRAME_MAX_SIZE;
616 					}
617 				}
618 			}
619 
620 			/* Enable/disable stats. */
621 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
622 				ret = parse_num_opt(optarg, INT32_MAX);
623 				if (ret == -1) {
624 					RTE_LOG(INFO, VHOST_CONFIG,
625 						"Invalid argument for stats [0..N]\n");
626 					us_vhost_usage(prgname);
627 					return -1;
628 				} else {
629 					enable_stats = ret;
630 				}
631 			}
632 
633 			/* Set socket file path. */
634 			if (!strncmp(long_option[option_index].name,
635 						"socket-file", MAX_LONG_OPT_SZ)) {
636 				if (us_vhost_parse_socket_path(optarg) == -1) {
637 					RTE_LOG(INFO, VHOST_CONFIG,
638 					"Invalid argument for socket name (Max %d characters)\n",
639 					PATH_MAX);
640 					us_vhost_usage(prgname);
641 					return -1;
642 				}
643 			}
644 
645 			break;
646 
647 			/* Invalid option - print options. */
648 		default:
649 			us_vhost_usage(prgname);
650 			return -1;
651 		}
652 	}
653 
654 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
655 		if (enabled_port_mask & (1 << i))
656 			ports[num_ports++] = (uint8_t)i;
657 	}
658 
659 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
660 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
661 			"but at most %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
662 		return -1;
663 	}
664 
665 	return 0;
666 }
667 
668 /*
669  * Update the global variable num_ports and the array ports according to the
670  * number of system ports, and return the number of valid ports.
671  */
672 static unsigned check_ports_num(unsigned nb_ports)
673 {
674 	unsigned valid_num_ports = num_ports;
675 	unsigned portid;
676 
677 	if (num_ports > nb_ports) {
678 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
679 			num_ports, nb_ports);
680 		num_ports = nb_ports;
681 	}
682 
683 	for (portid = 0; portid < num_ports; portid ++) {
684 		if (ports[portid] >= nb_ports) {
685 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
686 				ports[portid], (nb_ports - 1));
687 			ports[portid] = INVALID_PORT_ID;
688 			valid_num_ports--;
689 		}
690 	}
691 	return valid_num_ports;
692 }
693 
694 static inline struct vhost_dev *__attribute__((always_inline))
695 find_vhost_dev(struct ether_addr *mac)
696 {
697 	struct vhost_dev *vdev;
698 
699 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
700 		if (vdev->ready == DEVICE_RX &&
701 		    is_same_ether_addr(mac, &vdev->mac_address))
702 			return vdev;
703 	}
704 
705 	return NULL;
706 }
707 
708 /*
709  * This function learns the MAC address of the device and registers it, along
710  * with a VLAN tag, with a VMDQ pool.
711  */
712 static int
713 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
714 {
715 	struct ether_hdr *pkt_hdr;
716 	int i, ret;
717 
718 	/* Learn MAC address of guest device from packet */
719 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
720 
721 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
722 		RTE_LOG(ERR, VHOST_DATA,
723 			"(%d) device is using a registered MAC!\n",
724 			vdev->vid);
725 		return -1;
726 	}
727 
728 	for (i = 0; i < ETHER_ADDR_LEN; i++)
729 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
730 
731 	/* vlan_tag currently uses the device_id. */
732 	vdev->vlan_tag = vlan_tags[vdev->vid];
733 
734 	/* Print out VMDQ registration info. */
735 	RTE_LOG(INFO, VHOST_DATA,
736 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
737 		vdev->vid,
738 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
739 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
740 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
741 		vdev->vlan_tag);
742 
743 	/* Register the MAC address. */
744 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
745 				(uint32_t)vdev->vid + vmdq_pool_base);
746 	if (ret)
747 		RTE_LOG(ERR, VHOST_DATA,
748 			"(%d) failed to add device MAC address to VMDQ\n",
749 			vdev->vid);
750 
751 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
752 
753 	/* Set device as ready for RX. */
754 	vdev->ready = DEVICE_RX;
755 
756 	return 0;
757 }
758 
759 /*
760  * Removes the MAC address and VLAN tag from the VMDQ. Drains the RX queue to
761  * ensure nothing is still adding buffers before RX is disabled on the device.
762  */
763 static inline void
764 unlink_vmdq(struct vhost_dev *vdev)
765 {
766 	unsigned i = 0;
767 	unsigned rx_count;
768 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
769 
770 	if (vdev->ready == DEVICE_RX) {
771 		/*clear MAC and VLAN settings*/
772 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
773 		for (i = 0; i < 6; i++)
774 			vdev->mac_address.addr_bytes[i] = 0;
775 
776 		vdev->vlan_tag = 0;
777 
778 		/*Clear out the receive buffers*/
779 		rx_count = rte_eth_rx_burst(ports[0],
780 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
781 
782 		while (rx_count) {
783 			for (i = 0; i < rx_count; i++)
784 				rte_pktmbuf_free(pkts_burst[i]);
785 
786 			rx_count = rte_eth_rx_burst(ports[0],
787 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
788 		}
789 
790 		vdev->ready = DEVICE_MAC_LEARNING;
791 	}
792 }
793 
794 static inline void __attribute__((always_inline))
795 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
796 	    struct rte_mbuf *m)
797 {
798 	uint16_t ret;
799 
800 	if (builtin_net_driver) {
801 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
802 	} else {
803 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
804 	}
805 
806 	if (enable_stats) {
807 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
808 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
809 		src_vdev->stats.tx_total++;
810 		src_vdev->stats.tx += ret;
811 	}
812 }
813 
814 /*
815  * Check if the packet destination MAC address is for a local device. If so, put
816  * the packet on that device's RX queue. If not, return.
817  */
818 static inline int __attribute__((always_inline))
819 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
820 {
821 	struct ether_hdr *pkt_hdr;
822 	struct vhost_dev *dst_vdev;
823 
824 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
825 
826 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
827 	if (!dst_vdev)
828 		return -1;
829 
830 	if (vdev->vid == dst_vdev->vid) {
831 		RTE_LOG_DP(DEBUG, VHOST_DATA,
832 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
833 			vdev->vid);
834 		return 0;
835 	}
836 
837 	RTE_LOG_DP(DEBUG, VHOST_DATA,
838 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
839 
840 	if (unlikely(dst_vdev->remove)) {
841 		RTE_LOG_DP(DEBUG, VHOST_DATA,
842 			"(%d) device is marked for removal\n", dst_vdev->vid);
843 		return 0;
844 	}
845 
846 	virtio_xmit(dst_vdev, vdev, m);
847 	return 0;
848 }
849 
850 /*
851  * Check if the destination MAC of a packet belongs to a local VM,
852  * and if so, get its VLAN tag and the length offset.
853  */
854 static inline int __attribute__((always_inline))
855 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
856 	uint32_t *offset, uint16_t *vlan_tag)
857 {
858 	struct vhost_dev *dst_vdev;
859 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
860 
861 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
862 	if (!dst_vdev)
863 		return 0;
864 
865 	if (vdev->vid == dst_vdev->vid) {
866 		RTE_LOG_DP(DEBUG, VHOST_DATA,
867 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
868 			vdev->vid);
869 		return -1;
870 	}
871 
872 	/*
873 	 * HW VLAN strip reduces the packet length by the
874 	 * length of the VLAN tag, so the packet length needs
875 	 * to be restored by adding it back.
876 	 */
877 	*offset  = VLAN_HLEN;
878 	*vlan_tag = vlan_tags[vdev->vid];
879 
880 	RTE_LOG_DP(DEBUG, VHOST_DATA,
881 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
882 		vdev->vid, dst_vdev->vid, *vlan_tag);
883 
884 	return 0;
885 }
886 
887 static uint16_t
888 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
889 {
890 	if (ol_flags & PKT_TX_IPV4)
891 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
892 	else /* assume ethertype == ETHER_TYPE_IPv6 */
893 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
894 }
895 
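/*
 * Prepare a TSO packet for the NIC: for IPv4, clear the IP checksum and
 * request IP checksum offload, then seed the TCP checksum with the
 * pseudo-header checksum, which is what hardware L4 checksum/TSO offload
 * expects to find in the header.
 */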
896 static void virtio_tx_offload(struct rte_mbuf *m)
897 {
898 	void *l3_hdr;
899 	struct ipv4_hdr *ipv4_hdr = NULL;
900 	struct tcp_hdr *tcp_hdr = NULL;
901 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
902 
903 	l3_hdr = (char *)eth_hdr + m->l2_len;
904 
905 	if (m->ol_flags & PKT_TX_IPV4) {
906 		ipv4_hdr = l3_hdr;
907 		ipv4_hdr->hdr_checksum = 0;
908 		m->ol_flags |= PKT_TX_IP_CKSUM;
909 	}
910 
911 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
912 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
913 }
914 
915 static inline void
916 free_pkts(struct rte_mbuf **pkts, uint16_t n)
917 {
918 	while (n--)
919 		rte_pktmbuf_free(pkts[n]);
920 }
921 
922 static inline void __attribute__((always_inline))
923 do_drain_mbuf_table(struct mbuf_table *tx_q)
924 {
925 	uint16_t count;
926 
927 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
928 				 tx_q->m_table, tx_q->len);
929 	if (unlikely(count < tx_q->len))
930 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
931 
932 	tx_q->len = 0;
933 }
934 
935 /*
936  * This function routes the TX packet to the correct interface. This
937  * may be a local device or the physical port.
938  */
939 static inline void __attribute__((always_inline))
940 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
941 {
942 	struct mbuf_table *tx_q;
943 	unsigned offset = 0;
944 	const uint16_t lcore_id = rte_lcore_id();
945 	struct ether_hdr *nh;
946 
947 
948 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
949 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
950 		struct vhost_dev *vdev2;
951 
952 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
953 			virtio_xmit(vdev2, vdev, m);
954 		}
955 		goto queue2nic;
956 	}
957 
958 	/*check if destination is local VM*/
959 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
960 		rte_pktmbuf_free(m);
961 		return;
962 	}
963 
964 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
965 		if (unlikely(find_local_dest(vdev, m, &offset,
966 					     &vlan_tag) != 0)) {
967 			rte_pktmbuf_free(m);
968 			return;
969 		}
970 	}
971 
972 	RTE_LOG_DP(DEBUG, VHOST_DATA,
973 		"(%d) TX: MAC address is external\n", vdev->vid);
974 
975 queue2nic:
976 
977 	/*Add packet to the port tx queue*/
978 	tx_q = &lcore_tx_queue[lcore_id];
979 
980 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
981 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
982 		/* Guest has inserted the vlan tag. */
983 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
984 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
985 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
986 			(vh->vlan_tci != vlan_tag_be))
987 			vh->vlan_tci = vlan_tag_be;
988 	} else {
989 		m->ol_flags |= PKT_TX_VLAN_PKT;
990 
991 		/*
992 		 * Find the right seg to adjust the data len when offset is
993 		 * bigger than tail room size.
994 		 */
995 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
996 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
997 				m->data_len += offset;
998 			else {
999 				struct rte_mbuf *seg = m;
1000 
1001 				while ((seg->next != NULL) &&
1002 					(offset > rte_pktmbuf_tailroom(seg)))
1003 					seg = seg->next;
1004 
1005 				seg->data_len += offset;
1006 			}
1007 			m->pkt_len += offset;
1008 		}
1009 
1010 		m->vlan_tci = vlan_tag;
1011 	}
1012 
1013 	if (m->ol_flags & PKT_TX_TCP_SEG)
1014 		virtio_tx_offload(m);
1015 
1016 	tx_q->m_table[tx_q->len++] = m;
1017 	if (enable_stats) {
1018 		vdev->stats.tx_total++;
1019 		vdev->stats.tx++;
1020 	}
1021 
1022 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1023 		do_drain_mbuf_table(tx_q);
1024 }
1025 
1026 
1027 static inline void __attribute__((always_inline))
1028 drain_mbuf_table(struct mbuf_table *tx_q)
1029 {
1030 	static uint64_t prev_tsc;
1031 	uint64_t cur_tsc;
1032 
1033 	if (tx_q->len == 0)
1034 		return;
1035 
1036 	cur_tsc = rte_rdtsc();
1037 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1038 		prev_tsc = cur_tsc;
1039 
1040 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1041 			"TX queue drained after timeout with burst size %u\n",
1042 			tx_q->len);
1043 		do_drain_mbuf_table(tx_q);
1044 	}
1045 }
1046 
1047 static inline void __attribute__((always_inline))
1048 drain_eth_rx(struct vhost_dev *vdev)
1049 {
1050 	uint16_t rx_count, enqueue_count;
1051 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1052 
1053 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1054 				    pkts, MAX_PKT_BURST);
1055 	if (!rx_count)
1056 		return;
1057 
1058 	/*
1059 	 * When "enable_retry" is set, wait and retry when there are
1060 	 * not enough free slots in the queue to hold @rx_count packets,
1061 	 * to reduce packet loss.
1062 	 */
1063 	if (enable_retry &&
1064 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1065 			VIRTIO_RXQ))) {
1066 		uint32_t retry;
1067 
1068 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1069 			rte_delay_us(burst_rx_delay_time);
1070 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1071 					VIRTIO_RXQ))
1072 				break;
1073 		}
1074 	}
1075 
1076 	if (builtin_net_driver) {
1077 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1078 						pkts, rx_count);
1079 	} else {
1080 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1081 						pkts, rx_count);
1082 	}
1083 	if (enable_stats) {
1084 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1085 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1086 	}
1087 
1088 	free_pkts(pkts, rx_count);
1089 }
1090 
1091 static inline void __attribute__((always_inline))
1092 drain_virtio_tx(struct vhost_dev *vdev)
1093 {
1094 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1095 	uint16_t count;
1096 	uint16_t i;
1097 
1098 	if (builtin_net_driver) {
1099 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1100 					pkts, MAX_PKT_BURST);
1101 	} else {
1102 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1103 					mbuf_pool, pkts, MAX_PKT_BURST);
1104 	}
1105 
1106 	/* setup VMDq for the first packet */
1107 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1108 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1109 			free_pkts(pkts, count);
1110 	}
1111 
1112 	for (i = 0; i < count; ++i)
1113 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1114 }
1115 
1116 /*
1117  * Main function of vhost-switch. It basically does:
1118  *
1119  * for each vhost device {
1120  *    - drain_eth_rx()
1121  *
1122  *      Which drains the host eth Rx queue linked to the vhost device,
1123  *      and delivers all of the packets to the guest virtio Rx ring
1124  *      associated with this vhost device.
1125  *
1126  *    - drain_virtio_tx()
1127  *
1128  *      Which drains the guest virtio Tx queue and delivers all of the
1129  *      packets to the target, which could be another vhost device, or the
1130  *      physical eth dev. The routing is done in function "virtio_tx_route".
1131  * }
1132  */
1133 static int
1134 switch_worker(void *arg __rte_unused)
1135 {
1136 	unsigned i;
1137 	unsigned lcore_id = rte_lcore_id();
1138 	struct vhost_dev *vdev;
1139 	struct mbuf_table *tx_q;
1140 
1141 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1142 
1143 	tx_q = &lcore_tx_queue[lcore_id];
1144 	for (i = 0; i < rte_lcore_count(); i++) {
1145 		if (lcore_ids[i] == lcore_id) {
1146 			tx_q->txq_id = i;
1147 			break;
1148 		}
1149 	}
1150 
1151 	while(1) {
1152 		drain_mbuf_table(tx_q);
1153 
1154 		/*
1155 		 * Inform the configuration core that we have exited the
1156 		 * linked list and that no devices are in use if requested.
1157 		 */
1158 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1159 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1160 
1161 		/*
1162 		 * Process vhost devices
1163 		 */
1164 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1165 			      lcore_vdev_entry) {
1166 			if (unlikely(vdev->remove)) {
1167 				unlink_vmdq(vdev);
1168 				vdev->ready = DEVICE_SAFE_REMOVE;
1169 				continue;
1170 			}
1171 
1172 			if (likely(vdev->ready == DEVICE_RX))
1173 				drain_eth_rx(vdev);
1174 
1175 			if (likely(!vdev->remove))
1176 				drain_virtio_tx(vdev);
1177 		}
1178 	}
1179 
1180 	return 0;
1181 }
1182 
1183 /*
1184  * Remove a device from the specific data core linked list and from the
1185  * main linked list. Synchronization occurs through the use of the
1186  * lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
1187  * of dev->remove=1, which could cause an infinite loop in the rte_pause loop.
1188  */
1189 static void
1190 destroy_device(int vid)
1191 {
1192 	struct vhost_dev *vdev = NULL;
1193 	int lcore;
1194 
1195 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1196 		if (vdev->vid == vid)
1197 			break;
1198 	}
1199 	if (!vdev)
1200 		return;
1201 	/*set the remove flag. */
1202 	vdev->remove = 1;
1203 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1204 		rte_pause();
1205 	}
1206 
1207 	if (builtin_net_driver)
1208 		vs_vhost_net_remove(vdev);
1209 
1210 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1211 		     lcore_vdev_entry);
1212 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1213 
1214 
1215 	/* Set the dev_removal_flag on each lcore. */
1216 	RTE_LCORE_FOREACH_SLAVE(lcore)
1217 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1218 
1219 	/*
1220 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1221 	 * we can be sure that they can no longer access the device removed
1222 	 * from the linked lists and that the devices are no longer in use.
1223 	 */
1224 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1225 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1226 			rte_pause();
1227 	}
1228 
1229 	lcore_info[vdev->coreid].device_num--;
1230 
1231 	RTE_LOG(INFO, VHOST_DATA,
1232 		"(%d) device has been removed from data core\n",
1233 		vdev->vid);
1234 
1235 	rte_free(vdev);
1236 }
1237 
1238 /*
1239  * A new device is added to a data core. First the device is added to the main
1240  * linked list and then allocated to a specific data core.
1241  */
1242 static int
1243 new_device(int vid)
1244 {
1245 	int lcore, core_add = 0;
1246 	uint32_t device_num_min = num_devices;
1247 	struct vhost_dev *vdev;
1248 
1249 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1250 	if (vdev == NULL) {
1251 		RTE_LOG(INFO, VHOST_DATA,
1252 			"(%d) couldn't allocate memory for vhost dev\n",
1253 			vid);
1254 		return -1;
1255 	}
1256 	vdev->vid = vid;
1257 
1258 	if (builtin_net_driver)
1259 		vs_vhost_net_setup(vdev);
1260 
1261 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1262 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1263 
1264 	/*reset ready flag*/
1265 	vdev->ready = DEVICE_MAC_LEARNING;
1266 	vdev->remove = 0;
1267 
1268 	/* Find a suitable lcore to add the device. */
1269 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1270 		if (lcore_info[lcore].device_num < device_num_min) {
1271 			device_num_min = lcore_info[lcore].device_num;
1272 			core_add = lcore;
1273 		}
1274 	}
1275 	vdev->coreid = core_add;
1276 
1277 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1278 			  lcore_vdev_entry);
1279 	lcore_info[vdev->coreid].device_num++;
1280 
1281 	/* Disable notifications. */
1282 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1283 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1284 
1285 	RTE_LOG(INFO, VHOST_DATA,
1286 		"(%d) device has been added to data core %d\n",
1287 		vid, vdev->coreid);
1288 
1289 	return 0;
1290 }
1291 
1292 /*
1293  * These callbacks allow devices to be added to a data core when configuration
1294  * has been fully completed.
1295  */
1296 static const struct vhost_device_ops virtio_net_device_ops =
1297 {
1298 	.new_device =  new_device,
1299 	.destroy_device = destroy_device,
1300 };
1301 
1302 /*
1303  * This thread wakes up periodically to print stats if the user has
1304  * enabled them.
1305  */
1306 static void
1307 print_stats(void)
1308 {
1309 	struct vhost_dev *vdev;
1310 	uint64_t tx_dropped, rx_dropped;
1311 	uint64_t tx, tx_total, rx, rx_total;
1312 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1313 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1314 
1315 	while(1) {
1316 		sleep(enable_stats);
1317 
1318 		/* Clear screen and move to top left */
1319 		printf("%s%s\n", clr, top_left);
1320 		printf("Device statistics =================================\n");
1321 
1322 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1323 			tx_total   = vdev->stats.tx_total;
1324 			tx         = vdev->stats.tx;
1325 			tx_dropped = tx_total - tx;
1326 
1327 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1328 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1329 			rx_dropped = rx_total - rx;
1330 
1331 			printf("Statistics for device %d\n"
1332 				"-----------------------\n"
1333 				"TX total:              %" PRIu64 "\n"
1334 				"TX dropped:            %" PRIu64 "\n"
1335 				"TX successful:         %" PRIu64 "\n"
1336 				"RX total:              %" PRIu64 "\n"
1337 				"RX dropped:            %" PRIu64 "\n"
1338 				"RX successful:         %" PRIu64 "\n",
1339 				vdev->vid,
1340 				tx_total, tx_dropped, tx,
1341 				rx_total, rx_dropped, rx);
1342 		}
1343 
1344 		printf("===================================================\n");
1345 	}
1346 }
1347 
1348 static void
1349 unregister_drivers(int socket_num)
1350 {
1351 	int i, ret;
1352 
1353 	for (i = 0; i < socket_num; i++) {
1354 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1355 		if (ret != 0)
1356 			RTE_LOG(ERR, VHOST_CONFIG,
1357 				"Fail to unregister vhost driver for %s.\n",
1358 				socket_files + i * PATH_MAX);
1359 	}
1360 }
1361 
1362 /* When we receive an INT signal, unregister the vhost driver */
1363 static void
1364 sigint_handler(__rte_unused int signum)
1365 {
1366 	/* Unregister vhost driver. */
1367 	unregister_drivers(nb_sockets);
1368 
1369 	exit(0);
1370 }
1371 
1372 /*
1373  * While creating an mbuf pool, one key thing is to figure out how
1374  * many mbuf entries are enough for our use. Here are some
1375  * guidelines:
1376  *
1377  * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1378  *
1379  * - For each switch core (a CPU core that does the packet switching),
1380  *   we also need to reserve some mbufs for receiving packets from the
1381  *   virtio Tx queue. How many are enough depends on the usage. It is
1382  *   normally a simple calculation like the following:
1383  *
1384  *       MAX_PKT_BURST * max packet size / mbuf size
1385  *
1386  *   So we definitely need to allocate more mbufs when TSO is enabled.
1387  *
1388  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1389  *   mbufs for receiving packets from the physical NIC device.
1390  *
1391  * - We also need to make sure that, for each switch core, we have
1392  *   allocated enough mbufs to fill up the mbuf cache.
1393  */
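/*
 * A rough worked example of the sizing above, with purely illustrative
 * numbers (assuming MAX_PKT_BURST, defined in main.h, is 32, the default
 * mbuf data size, a 1500 byte MTU, 1024 Rx descriptors and 128 queues as
 * passed from main()):
 *
 *   per switch core: (1500 + mbuf_size) * 32 / (mbuf_size - headroom)
 *                    + 1024 Rx descriptors  ~= a bit over 1K mbufs
 *   per port:        128 queues * 1024 descriptors = 128K mbufs
 *
 * so the reservation for the (mostly unused) VMDQ queues dominates,
 * which is what the FIXME in main() is about.
 */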
1394 static void
1395 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1396 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1397 {
1398 	uint32_t nr_mbufs;
1399 	uint32_t nr_mbufs_per_core;
1400 	uint32_t mtu = 1500;
1401 
1402 	if (mergeable)
1403 		mtu = 9000;
1404 	if (enable_tso)
1405 		mtu = 64 * 1024;
1406 
1407 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1408 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1409 	nr_mbufs_per_core += nr_rx_desc;
1410 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1411 
1412 	nr_mbufs  = nr_queues * nr_rx_desc;
1413 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1414 	nr_mbufs *= nr_port;
1415 
1416 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1417 					    nr_mbuf_cache, 0, mbuf_size,
1418 					    rte_socket_id());
1419 	if (mbuf_pool == NULL)
1420 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1421 }
1422 
1423 /*
1424  * Main function, does initialisation and calls the per-lcore functions.
1425  */
1426 int
1427 main(int argc, char *argv[])
1428 {
1429 	unsigned lcore_id, core_id = 0;
1430 	unsigned nb_ports, valid_num_ports;
1431 	int ret, i;
1432 	uint8_t portid;
1433 	static pthread_t tid;
1434 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1435 	uint64_t flags = 0;
1436 
1437 	signal(SIGINT, sigint_handler);
1438 
1439 	/* init EAL */
1440 	ret = rte_eal_init(argc, argv);
1441 	if (ret < 0)
1442 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1443 	argc -= ret;
1444 	argv += ret;
1445 
1446 	/* parse app arguments */
1447 	ret = us_vhost_parse_args(argc, argv);
1448 	if (ret < 0)
1449 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1450 
1451 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1452 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1453 
1454 		if (rte_lcore_is_enabled(lcore_id))
1455 			lcore_ids[core_id++] = lcore_id;
1456 	}
1457 
1458 	if (rte_lcore_count() > RTE_MAX_LCORE)
1459 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1460 
1461 	/* Get the number of physical ports. */
1462 	nb_ports = rte_eth_dev_count();
1463 
1464 	/*
1465 	 * Update the global variable num_ports and the global array ports,
1466 	 * and get the number of valid ports according to the number of system ports
1467 	 */
1468 	valid_num_ports = check_ports_num(nb_ports);
1469 
1470 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1471 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1472 			"but at most %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
1473 		return -1;
1474 	}
1475 
1476 	/*
1477 	 * FIXME: here we are trying to allocate mbufs big enough for
1478 	 * @MAX_QUEUES, but the truth is we're never going to use that
1479 	 * many queues here. We probably should only do allocation for
1480 	 * those queues we are going to use.
1481 	 */
1482 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1483 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1484 
1485 	if (vm2vm_mode == VM2VM_HARDWARE) {
1486 		/* Enable VT loopback so the NIC's L2 switch does the VM2VM forwarding. */
1487 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1488 		RTE_LOG(DEBUG, VHOST_CONFIG,
1489 			"Enable loop back for L2 switch in vmdq.\n");
1490 	}
1491 
1492 	/* initialize all ports */
1493 	for (portid = 0; portid < nb_ports; portid++) {
1494 		/* skip ports that are not enabled */
1495 		if ((enabled_port_mask & (1 << portid)) == 0) {
1496 			RTE_LOG(INFO, VHOST_PORT,
1497 				"Skipping disabled port %d\n", portid);
1498 			continue;
1499 		}
1500 		if (port_init(portid) != 0)
1501 			rte_exit(EXIT_FAILURE,
1502 				"Cannot initialize network ports\n");
1503 	}
1504 
1505 	/* Enable stats if the user option is set. */
1506 	if (enable_stats) {
1507 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1508 		if (ret != 0)
1509 			rte_exit(EXIT_FAILURE,
1510 				"Cannot create print-stats thread\n");
1511 
1512 		/* Set thread_name to aid in debugging. */
1513 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1514 		ret = rte_thread_setname(tid, thread_name);
1515 		if (ret != 0)
1516 			RTE_LOG(DEBUG, VHOST_CONFIG,
1517 				"Cannot set print-stats name\n");
1518 	}
1519 
1520 	/* Launch all data cores. */
1521 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1522 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1523 
1524 	if (client_mode)
1525 		flags |= RTE_VHOST_USER_CLIENT;
1526 
1527 	if (dequeue_zero_copy)
1528 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1529 
1530 	/* Register vhost user driver to handle vhost messages. */
1531 	for (i = 0; i < nb_sockets; i++) {
1532 		char *file = socket_files + i * PATH_MAX;
1533 		ret = rte_vhost_driver_register(file, flags);
1534 		if (ret != 0) {
1535 			unregister_drivers(i);
1536 			rte_exit(EXIT_FAILURE,
1537 				"vhost driver register failure.\n");
1538 		}
1539 
1540 		if (builtin_net_driver)
1541 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1542 
1543 		if (mergeable == 0) {
1544 			rte_vhost_driver_disable_features(file,
1545 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1546 		}
1547 
1548 		if (enable_tx_csum == 0) {
1549 			rte_vhost_driver_disable_features(file,
1550 				1ULL << VIRTIO_NET_F_CSUM);
1551 		}
1552 
1553 		if (enable_tso == 0) {
1554 			rte_vhost_driver_disable_features(file,
1555 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1556 			rte_vhost_driver_disable_features(file,
1557 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1558 			rte_vhost_driver_disable_features(file,
1559 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1560 			rte_vhost_driver_disable_features(file,
1561 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1562 		}
1563 
1564 		if (promiscuous) {
1565 			rte_vhost_driver_enable_features(file,
1566 				1ULL << VIRTIO_NET_F_CTRL_RX);
1567 		}
1568 
1569 		ret = rte_vhost_driver_callback_register(file,
1570 			&virtio_net_device_ops);
1571 		if (ret != 0) {
1572 			rte_exit(EXIT_FAILURE,
1573 				"failed to register vhost driver callbacks.\n");
1574 		}
1575 
1576 		if (rte_vhost_driver_start(file) < 0) {
1577 			rte_exit(EXIT_FAILURE,
1578 				"failed to start vhost driver.\n");
1579 		}
1580 	}
1581 
1582 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1583 		rte_eal_wait_lcore(lcore_id);
1584 
1585 	return 0;
1586 
1587 }
1588