xref: /dpdk/examples/vhost/main.c (revision b218a1bf36d764c54ebf422b0799d5ac4f79310a)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 
56 #include "main.h"
57 
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61 
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64 
65 #define MBUF_CACHE_SIZE	128
66 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
67 
68 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
69 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
70 
71 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
72 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
73 
74 #define JUMBO_FRAME_MAX_SIZE    0x2600
75 
76 /* State of virtio device. */
77 #define DEVICE_MAC_LEARNING 0
78 #define DEVICE_RX			1
79 #define DEVICE_SAFE_REMOVE	2
80 
81 /* Configurable number of RX/TX ring descriptors */
82 #define RTE_TEST_RX_DESC_DEFAULT 1024
83 #define RTE_TEST_TX_DESC_DEFAULT 512
84 
85 #define INVALID_PORT_ID 0xFF
86 
87 /* Max number of devices. Limited by vmdq. */
88 #define MAX_DEVICES 64
89 
90 /* Size of buffers used for snprintfs. */
91 #define MAX_PRINT_BUFF 6072
92 
93 /* Maximum character device basename size. */
94 #define MAX_BASENAME_SZ 10
95 
96 /* Maximum long option length for option parsing. */
97 #define MAX_LONG_OPT_SZ 64
98 
99 /* mask of enabled ports */
100 static uint32_t enabled_port_mask = 0;
101 
102 /* Promiscuous mode */
103 static uint32_t promiscuous;
104 
105 /* number of devices/queues to support*/
106 static uint32_t num_queues = 0;
107 static uint32_t num_devices;
108 
109 static struct rte_mempool *mbuf_pool;
110 static int mergeable;
111 
112 /* Do VLAN strip on host, enabled by default */
113 static uint32_t vlan_strip = 1;
114 
115 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
116 typedef enum {
117 	VM2VM_DISABLED = 0,
118 	VM2VM_SOFTWARE = 1,
119 	VM2VM_HARDWARE = 2,
120 	VM2VM_LAST
121 } vm2vm_type;
122 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
123 
124 /* Enable stats. */
125 static uint32_t enable_stats = 0;
126 /* Enable retries on RX. */
127 static uint32_t enable_retry = 1;
128 
129 /* Disable TX checksum offload */
130 static uint32_t enable_tx_csum;
131 
132 /* Disable TSO offload */
133 static uint32_t enable_tso;
134 
135 /* Specify timeout (in microseconds) between retries on RX. */
136 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
137 /* Specify the number of retries on RX. */
138 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
139 
140 /* Character device basename. Can be set by user. */
141 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
142 
143 /* Empty VMDQ configuration structure. Filled in programmatically. */
144 static struct rte_eth_conf vmdq_conf_default = {
145 	.rxmode = {
146 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
147 		.split_hdr_size = 0,
148 		.header_split   = 0, /**< Header Split disabled */
149 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
150 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
151 		/*
152 		 * Required for 1G NICs such as the I350; fixes a bug where
153 		 * IPv4 forwarding in the guest could not forward packets
154 		 * from one virtio dev to another virtio dev.
155 		 */
156 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
157 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
158 		.hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
159 	},
160 
161 	.txmode = {
162 		.mq_mode = ETH_MQ_TX_NONE,
163 	},
164 	.rx_adv_conf = {
165 		/*
166 		 * should be overridden separately in code with
167 		 * appropriate values
168 		 */
169 		.vmdq_rx_conf = {
170 			.nb_queue_pools = ETH_8_POOLS,
171 			.enable_default_pool = 0,
172 			.default_pool = 0,
173 			.nb_pool_maps = 0,
174 			.pool_map = {{0, 0},},
175 		},
176 	},
177 };
178 
179 static unsigned lcore_ids[RTE_MAX_LCORE];
180 static uint8_t ports[RTE_MAX_ETHPORTS];
181 static unsigned num_ports = 0; /**< The number of ports specified in command line */
182 static uint16_t num_pf_queues, num_vmdq_queues;
183 static uint16_t vmdq_pool_base, vmdq_queue_base;
184 static uint16_t queues_per_pool;
185 
186 const uint16_t vlan_tags[] = {
187 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
188 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
189 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
190 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
191 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
192 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
193 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
194 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
195 };
196 
197 /* ethernet addresses of ports */
198 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
199 
200 static struct vhost_dev_tailq_list vhost_dev_list =
201 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
202 
203 static struct lcore_info lcore_info[RTE_MAX_LCORE];
204 
205 /* Used for queueing bursts of TX packets. */
206 struct mbuf_table {
207 	unsigned len;
208 	unsigned txq_id;
209 	struct rte_mbuf *m_table[MAX_PKT_BURST];
210 };
211 
212 /* TX queue for each data core. */
213 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
214 
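/*
 * TX drain period in TSC cycles: cycles per microsecond (rounded up)
 * multiplied by BURST_TX_DRAIN_US; e.g. at a 2 GHz TSC this is roughly
 * 200000 cycles for the ~100us drain interval.
 */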
215 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
216 				 / US_PER_S * BURST_TX_DRAIN_US)
217 #define VLAN_HLEN       4
218 
219 /*
220  * Builds up the correct configuration for VMDQ VLAN pool map
221  * according to the pool & queue limits.
222  */
223 static inline int
224 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
225 {
226 	struct rte_eth_vmdq_rx_conf conf;
227 	struct rte_eth_vmdq_rx_conf *def_conf =
228 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
229 	unsigned i;
230 
231 	memset(&conf, 0, sizeof(conf));
232 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
233 	conf.nb_pool_maps = num_devices;
234 	conf.enable_loop_back = def_conf->enable_loop_back;
235 	conf.rx_mode = def_conf->rx_mode;
236 
237 	for (i = 0; i < conf.nb_pool_maps; i++) {
238 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
239 		conf.pool_map[i].pools = (1UL << i);
240 	}
241 
242 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
243 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
244 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
245 	return 0;
246 }
247 
248 /*
249  * Validate the device number against the max pool number obtained from
250  * dev_info. If the device number is invalid, print an error message and
251  * return -1. Each device must have its own pool.
252  */
253 static inline int
254 validate_num_devices(uint32_t max_nb_devices)
255 {
256 	if (num_devices > max_nb_devices) {
257 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
258 		return -1;
259 	}
260 	return 0;
261 }
262 
263 /*
264  * Initialises a given port using global settings, with the RX buffers
265  * coming from the mbuf_pool passed as a parameter.
266  */
267 static inline int
268 port_init(uint8_t port)
269 {
270 	struct rte_eth_dev_info dev_info;
271 	struct rte_eth_conf port_conf;
272 	struct rte_eth_rxconf *rxconf;
273 	struct rte_eth_txconf *txconf;
274 	int16_t rx_rings, tx_rings;
275 	uint16_t rx_ring_size, tx_ring_size;
276 	int retval;
277 	uint16_t q;
278 
279 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
280 	rte_eth_dev_info_get (port, &dev_info);
281 
282 	if (dev_info.max_rx_queues > MAX_QUEUES) {
283 		rte_exit(EXIT_FAILURE,
284 			"please define MAX_QUEUES no less than %u in %s\n",
285 			dev_info.max_rx_queues, __FILE__);
286 	}
287 
288 	rxconf = &dev_info.default_rxconf;
289 	txconf = &dev_info.default_txconf;
290 	rxconf->rx_drop_en = 1;
291 
292 	/* Enable vlan offload */
293 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
294 
295 	/*configure the number of supported virtio devices based on VMDQ limits */
296 	num_devices = dev_info.max_vmdq_pools;
297 
298 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
299 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
300 	tx_rings = (uint16_t)rte_lcore_count();
301 
302 	retval = validate_num_devices(MAX_DEVICES);
303 	if (retval < 0)
304 		return retval;
305 
306 	/* Get port configuration. */
307 	retval = get_eth_conf(&port_conf, num_devices);
308 	if (retval < 0)
309 		return retval;
310 	/* NIC queues are divided into pf queues and vmdq queues.  */
311 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
312 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
313 	num_vmdq_queues = num_devices * queues_per_pool;
314 	num_queues = num_pf_queues + num_vmdq_queues;
315 	vmdq_queue_base = dev_info.vmdq_queue_base;
316 	vmdq_pool_base  = dev_info.vmdq_pool_base;
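	/*
	 * Illustrative example (actual numbers depend on the NIC): if the
	 * device reports 128 RX queues, 128 VMDQ queues and 64 VMDQ pools,
	 * then num_pf_queues = 0, queues_per_pool = 2 and, with 64 devices,
	 * num_vmdq_queues = 128.
	 */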
317 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
318 		num_pf_queues, num_devices, queues_per_pool);
319 
320 	if (port >= rte_eth_dev_count()) return -1;
321 
322 	if (enable_tx_csum == 0)
323 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
324 
325 	if (enable_tso == 0) {
326 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
327 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
328 	}
329 
330 	rx_rings = (uint16_t)dev_info.max_rx_queues;
331 	/* Configure ethernet device. */
332 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
333 	if (retval != 0)
334 		return retval;
335 
336 	/* Setup the queues. */
337 	for (q = 0; q < rx_rings; q ++) {
338 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
339 						rte_eth_dev_socket_id(port),
340 						rxconf,
341 						mbuf_pool);
342 		if (retval < 0)
343 			return retval;
344 	}
345 	for (q = 0; q < tx_rings; q ++) {
346 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
347 						rte_eth_dev_socket_id(port),
348 						txconf);
349 		if (retval < 0)
350 			return retval;
351 	}
352 
353 	/* Start the device. */
354 	retval  = rte_eth_dev_start(port);
355 	if (retval < 0) {
356 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
357 		return retval;
358 	}
359 
360 	if (promiscuous)
361 		rte_eth_promiscuous_enable(port);
362 
363 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
364 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
365 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
366 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
367 			(unsigned)port,
368 			vmdq_ports_eth_addr[port].addr_bytes[0],
369 			vmdq_ports_eth_addr[port].addr_bytes[1],
370 			vmdq_ports_eth_addr[port].addr_bytes[2],
371 			vmdq_ports_eth_addr[port].addr_bytes[3],
372 			vmdq_ports_eth_addr[port].addr_bytes[4],
373 			vmdq_ports_eth_addr[port].addr_bytes[5]);
374 
375 	return 0;
376 }
377 
378 /*
379  * Set character device basename.
380  */
381 static int
382 us_vhost_parse_basename(const char *q_arg)
383 {
384 	/* check that the basename length is within limits */
385 
386 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
387 		return -1;
388 	else
389 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
390 
391 	return 0;
392 }
393 
394 /*
395  * Parse the portmask provided at run time.
396  */
397 static int
398 parse_portmask(const char *portmask)
399 {
400 	char *end = NULL;
401 	unsigned long pm;
402 
403 	errno = 0;
404 
405 	/* parse hexadecimal string */
406 	pm = strtoul(portmask, &end, 16);
407 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
408 		return -1;
409 
410 	if (pm == 0)
411 		return -1;
412 
413 	return pm;
414 
415 }
416 
417 /*
418  * Parse num options at run time.
419  */
420 static int
421 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
422 {
423 	char *end = NULL;
424 	unsigned long num;
425 
426 	errno = 0;
427 
428 	/* parse unsigned int string */
429 	num = strtoul(q_arg, &end, 10);
430 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
431 		return -1;
432 
433 	if (num > max_valid_value)
434 		return -1;
435 
436 	return num;
437 
438 }
439 
440 /*
441  * Display usage
442  */
443 static void
444 us_vhost_usage(const char *prgname)
445 {
446 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
447 	"		--vm2vm [0|1|2]\n"
448 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
449 	"		--dev-basename <name>\n"
450 	"		--nb-devices ND\n"
451 	"		-p PORTMASK: Set mask for ports to be used by application\n"
452 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
453 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
454 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
455 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
456 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
457 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
458 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
459 	"		--dev-basename: The basename to be used for the character device.\n"
460 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
461 	"		--tso [0|1] disable/enable TCP segment offload.\n",
462 	       prgname);
463 }
464 
465 /*
466  * Parse the arguments given in the command line of the application.
467  */
468 static int
469 us_vhost_parse_args(int argc, char **argv)
470 {
471 	int opt, ret;
472 	int option_index;
473 	unsigned i;
474 	const char *prgname = argv[0];
475 	static struct option long_option[] = {
476 		{"vm2vm", required_argument, NULL, 0},
477 		{"rx-retry", required_argument, NULL, 0},
478 		{"rx-retry-delay", required_argument, NULL, 0},
479 		{"rx-retry-num", required_argument, NULL, 0},
480 		{"mergeable", required_argument, NULL, 0},
481 		{"vlan-strip", required_argument, NULL, 0},
482 		{"stats", required_argument, NULL, 0},
483 		{"dev-basename", required_argument, NULL, 0},
484 		{"tx-csum", required_argument, NULL, 0},
485 		{"tso", required_argument, NULL, 0},
486 		{NULL, 0, 0, 0},
487 	};
488 
489 	/* Parse command line */
490 	while ((opt = getopt_long(argc, argv, "p:P",
491 			long_option, &option_index)) != EOF) {
492 		switch (opt) {
493 		/* Portmask */
494 		case 'p':
495 			enabled_port_mask = parse_portmask(optarg);
496 			if (enabled_port_mask == 0) {
497 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
498 				us_vhost_usage(prgname);
499 				return -1;
500 			}
501 			break;
502 
503 		case 'P':
504 			promiscuous = 1;
505 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
506 				ETH_VMDQ_ACCEPT_BROADCAST |
507 				ETH_VMDQ_ACCEPT_MULTICAST;
508 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
509 
510 			break;
511 
512 		case 0:
513 			/* Enable/disable vm2vm comms. */
514 			if (!strncmp(long_option[option_index].name, "vm2vm",
515 				MAX_LONG_OPT_SZ)) {
516 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
517 				if (ret == -1) {
518 					RTE_LOG(INFO, VHOST_CONFIG,
519 						"Invalid argument for "
520 						"vm2vm [0|1|2]\n");
521 					us_vhost_usage(prgname);
522 					return -1;
523 				} else {
524 					vm2vm_mode = (vm2vm_type)ret;
525 				}
526 			}
527 
528 			/* Enable/disable retries on RX. */
529 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
530 				ret = parse_num_opt(optarg, 1);
531 				if (ret == -1) {
532 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
533 					us_vhost_usage(prgname);
534 					return -1;
535 				} else {
536 					enable_retry = ret;
537 				}
538 			}
539 
540 			/* Enable/disable TX checksum offload. */
541 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
542 				ret = parse_num_opt(optarg, 1);
543 				if (ret == -1) {
544 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
545 					us_vhost_usage(prgname);
546 					return -1;
547 				} else
548 					enable_tx_csum = ret;
549 			}
550 
551 			/* Enable/disable TSO offload. */
552 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
553 				ret = parse_num_opt(optarg, 1);
554 				if (ret == -1) {
555 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
556 					us_vhost_usage(prgname);
557 					return -1;
558 				} else
559 					enable_tso = ret;
560 			}
561 
562 			/* Specify the retries delay time (in microseconds) on RX. */
563 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
564 				ret = parse_num_opt(optarg, INT32_MAX);
565 				if (ret == -1) {
566 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
567 					us_vhost_usage(prgname);
568 					return -1;
569 				} else {
570 					burst_rx_delay_time = ret;
571 				}
572 			}
573 
574 			/* Specify the retries number on RX. */
575 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
576 				ret = parse_num_opt(optarg, INT32_MAX);
577 				if (ret == -1) {
578 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
579 					us_vhost_usage(prgname);
580 					return -1;
581 				} else {
582 					burst_rx_retry_num = ret;
583 				}
584 			}
585 
586 			/* Enable/disable RX mergeable buffers. */
587 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
588 				ret = parse_num_opt(optarg, 1);
589 				if (ret == -1) {
590 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
591 					us_vhost_usage(prgname);
592 					return -1;
593 				} else {
594 					mergeable = !!ret;
595 					if (ret) {
596 						vmdq_conf_default.rxmode.jumbo_frame = 1;
597 						vmdq_conf_default.rxmode.max_rx_pkt_len
598 							= JUMBO_FRAME_MAX_SIZE;
599 					}
600 				}
601 			}
602 
603 			/* Enable/disable RX VLAN strip on host. */
604 			if (!strncmp(long_option[option_index].name,
605 				"vlan-strip", MAX_LONG_OPT_SZ)) {
606 				ret = parse_num_opt(optarg, 1);
607 				if (ret == -1) {
608 					RTE_LOG(INFO, VHOST_CONFIG,
609 						"Invalid argument for VLAN strip [0|1]\n");
610 					us_vhost_usage(prgname);
611 					return -1;
612 				} else {
613 					vlan_strip = !!ret;
614 					vmdq_conf_default.rxmode.hw_vlan_strip =
615 						vlan_strip;
616 				}
617 			}
618 
619 			/* Enable/disable stats. */
620 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
621 				ret = parse_num_opt(optarg, INT32_MAX);
622 				if (ret == -1) {
623 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
624 					us_vhost_usage(prgname);
625 					return -1;
626 				} else {
627 					enable_stats = ret;
628 				}
629 			}
630 
631 			/* Set character device basename. */
632 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
633 				if (us_vhost_parse_basename(optarg) == -1) {
634 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
635 					us_vhost_usage(prgname);
636 					return -1;
637 				}
638 			}
639 
640 			break;
641 
642 			/* Invalid option - print options. */
643 		default:
644 			us_vhost_usage(prgname);
645 			return -1;
646 		}
647 	}
648 
649 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
650 		if (enabled_port_mask & (1 << i))
651 			ports[num_ports++] = (uint8_t)i;
652 	}
653 
654 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
655 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
656 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
657 		return -1;
658 	}
659 
660 	return 0;
661 }
662 
663 /*
664  * Update the global var NUM_PORTS and array PORTS according to the number of
665  * system ports, and return the number of valid ports.
666  */
667 static unsigned check_ports_num(unsigned nb_ports)
668 {
669 	unsigned valid_num_ports = num_ports;
670 	unsigned portid;
671 
672 	if (num_ports > nb_ports) {
673 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
674 			num_ports, nb_ports);
675 		num_ports = nb_ports;
676 	}
677 
678 	for (portid = 0; portid < num_ports; portid ++) {
679 		if (ports[portid] >= nb_ports) {
680 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
681 				ports[portid], (nb_ports - 1));
682 			ports[portid] = INVALID_PORT_ID;
683 			valid_num_ports--;
684 		}
685 	}
686 	return valid_num_ports;
687 }
688 
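/* Look up a ready (DEVICE_RX) vhost device by its learned MAC address. */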
689 static inline struct vhost_dev *__attribute__((always_inline))
690 find_vhost_dev(struct ether_addr *mac)
691 {
692 	struct vhost_dev *vdev;
693 
694 	TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
695 		if (vdev->ready == DEVICE_RX &&
696 		    is_same_ether_addr(mac, &vdev->mac_address))
697 			return vdev;
698 	}
699 
700 	return NULL;
701 }
702 
703 /*
704  * This function learns the MAC address of the device and registers it, along
705  * with a VLAN tag, with a VMDQ pool.
706  */
707 static int
708 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
709 {
710 	struct ether_hdr *pkt_hdr;
711 	struct virtio_net *dev = vdev->dev;
712 	int i, ret;
713 
714 	/* Learn MAC address of guest device from packet */
715 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
716 
717 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
718 		RTE_LOG(ERR, VHOST_DATA,
719 			"Device (%" PRIu64 ") is using a registered MAC!\n",
720 			dev->device_fh);
721 		return -1;
722 	}
723 
724 	for (i = 0; i < ETHER_ADDR_LEN; i++)
725 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
726 
727 	/* vlan_tag currently uses the device_id. */
728 	vdev->vlan_tag = vlan_tags[dev->device_fh];
729 
730 	/* Print out VMDQ registration info. */
731 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
732 		dev->device_fh,
733 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
734 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
735 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
736 		vdev->vlan_tag);
737 
738 	/* Register the MAC address. */
739 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
740 				(uint32_t)dev->device_fh + vmdq_pool_base);
741 	if (ret)
742 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
743 					dev->device_fh);
744 
745 	/* Enable stripping of the vlan tag as we handle routing. */
746 	if (vlan_strip)
747 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
748 			(uint16_t)vdev->vmdq_rx_q, 1);
749 
750 	/* Set device as ready for RX. */
751 	vdev->ready = DEVICE_RX;
752 
753 	return 0;
754 }
755 
756 /*
757  * Removes the MAC address and VLAN tag from the VMDQ. Ensures that nothing is
758  * adding buffers to the RX queue before disabling RX on the device.
759  */
760 static inline void
761 unlink_vmdq(struct vhost_dev *vdev)
762 {
763 	unsigned i = 0;
764 	unsigned rx_count;
765 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
766 
767 	if (vdev->ready == DEVICE_RX) {
768 		/*clear MAC and VLAN settings*/
769 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
770 		for (i = 0; i < 6; i++)
771 			vdev->mac_address.addr_bytes[i] = 0;
772 
773 		vdev->vlan_tag = 0;
774 
775 		/*Clear out the receive buffers*/
776 		rx_count = rte_eth_rx_burst(ports[0],
777 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
778 
779 		while (rx_count) {
780 			for (i = 0; i < rx_count; i++)
781 				rte_pktmbuf_free(pkts_burst[i]);
782 
783 			rx_count = rte_eth_rx_burst(ports[0],
784 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
785 		}
786 
787 		vdev->ready = DEVICE_MAC_LEARNING;
788 	}
789 }
790 
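/*
 * Enqueue a single packet into the RX virtqueue of the destination vhost
 * device (VM2VM path) and update per-device stats when enabled.
 */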
791 static inline void __attribute__((always_inline))
792 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
793 	    struct rte_mbuf *m)
794 {
795 	uint16_t ret;
796 
797 	ret = rte_vhost_enqueue_burst(dst_vdev->dev, VIRTIO_RXQ, &m, 1);
798 	if (enable_stats) {
799 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
800 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
801 		src_vdev->stats.tx_total++;
802 		src_vdev->stats.tx += ret;
803 	}
804 }
805 
806 /*
807  * Check if the packet destination MAC address is for a local device. If so,
808  * put the packet on that device's RX queue. If not, return.
809  */
810 static inline int __attribute__((always_inline))
811 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
812 {
813 	struct ether_hdr *pkt_hdr;
814 	struct vhost_dev *dst_vdev;
815 	uint64_t fh;
816 
817 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
818 
819 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
820 	if (!dst_vdev)
821 		return -1;
822 
823 	fh = dst_vdev->dev->device_fh;
824 	if (fh == vdev->dev->device_fh) {
825 		RTE_LOG(DEBUG, VHOST_DATA,
826 			"(%" PRIu64 ") TX: src and dst MAC are the same. "
827 			"Dropping packet.\n", fh);
828 		return 0;
829 	}
830 
831 	RTE_LOG(DEBUG, VHOST_DATA,
832 		"(%" PRIu64 ") TX: MAC address is local\n", fh);
833 
834 	if (unlikely(dst_vdev->remove)) {
835 		RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
836 			"Device is marked for removal\n", fh);
837 		return 0;
838 	}
839 
840 	virtio_xmit(dst_vdev, vdev, m);
841 	return 0;
842 }
843 
844 /*
845  * Check if the destination MAC of a packet belongs to a local VM, and if so
846  * get its VLAN tag and offset.
847  */
848 static inline int __attribute__((always_inline))
849 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
850 	uint32_t *offset, uint16_t *vlan_tag)
851 {
852 	struct vhost_dev *dst_vdev;
853 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
854 
855 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
856 	if (!dst_vdev)
857 		return 0;
858 
859 	if (dst_vdev->dev->device_fh == dev->device_fh) {
860 		RTE_LOG(DEBUG, VHOST_DATA,
861 			"(%" PRIu64 ") TX: src and dst MAC are the same. "
862 			"Dropping packet.\n", dst_vdev->dev->device_fh);
863 		return -1;
864 	}
865 
866 	/*
867 	 * HW VLAN strip will reduce the packet length by the length
868 	 * of the VLAN tag, so we need to restore the packet length
869 	 * by adding it back.
870 	 */
871 	*offset  = VLAN_HLEN;
872 	*vlan_tag = vlan_tags[(uint16_t)dst_vdev->dev->device_fh];
873 
874 	RTE_LOG(DEBUG, VHOST_DATA,
875 		"(%" PRIu64 ") TX: pkt to local VM device id: (%" PRIu64 ") "
876 		"vlan tag: %u.\n",
877 		dev->device_fh, dst_vdev->dev->device_fh, *vlan_tag);
878 
879 	return 0;
880 }
881 
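/* Return the L4 pseudo-header checksum for the IPv4 or IPv6 header at l3_hdr. */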
882 static uint16_t
883 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
884 {
885 	if (ol_flags & PKT_TX_IPV4)
886 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
887 	else /* assume ethertype == ETHER_TYPE_IPv6 */
888 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
889 }
890 
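/*
 * Prepare a TSO packet for NIC offload: request IP checksum offload for IPv4
 * and seed the TCP checksum field with the pseudo-header sum.
 */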
891 static void virtio_tx_offload(struct rte_mbuf *m)
892 {
893 	void *l3_hdr;
894 	struct ipv4_hdr *ipv4_hdr = NULL;
895 	struct tcp_hdr *tcp_hdr = NULL;
896 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
897 
898 	l3_hdr = (char *)eth_hdr + m->l2_len;
899 
900 	if (m->ol_flags & PKT_TX_IPV4) {
901 		ipv4_hdr = l3_hdr;
902 		ipv4_hdr->hdr_checksum = 0;
903 		m->ol_flags |= PKT_TX_IP_CKSUM;
904 	}
905 
906 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
907 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
908 }
909 
910 static inline void
911 free_pkts(struct rte_mbuf **pkts, uint16_t n)
912 {
913 	while (n--)
914 		rte_pktmbuf_free(pkts[n]);
915 }
916 
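/*
 * Flush the lcore's queued TX packets to the physical port; packets the NIC
 * did not accept are freed.
 */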
917 static inline void __attribute__((always_inline))
918 do_drain_mbuf_table(struct mbuf_table *tx_q)
919 {
920 	uint16_t count;
921 
922 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
923 				 tx_q->m_table, tx_q->len);
924 	if (unlikely(count < tx_q->len))
925 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
926 
927 	tx_q->len = 0;
928 }
929 
930 /*
931  * This function routes the TX packet to the correct interface. This
932  * may be a local device or the physical port.
933  */
934 static inline void __attribute__((always_inline))
935 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
936 {
937 	struct mbuf_table *tx_q;
938 	unsigned offset = 0;
939 	const uint16_t lcore_id = rte_lcore_id();
940 	struct virtio_net *dev = vdev->dev;
941 	struct ether_hdr *nh;
942 
943 
944 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
945 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
946 		struct vhost_dev *vdev2;
947 
948 		TAILQ_FOREACH(vdev2, &vhost_dev_list, next) {
949 			virtio_xmit(vdev2, vdev, m);
950 		}
951 		goto queue2nic;
952 	}
953 
954 	/*check if destination is local VM*/
955 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
956 		rte_pktmbuf_free(m);
957 		return;
958 	}
959 
960 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
961 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
962 			rte_pktmbuf_free(m);
963 			return;
964 		}
965 	}
966 
967 	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
968 		"MAC address is external\n", dev->device_fh);
969 
970 queue2nic:
971 
972 	/*Add packet to the port tx queue*/
973 	tx_q = &lcore_tx_queue[lcore_id];
974 
975 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
976 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
977 		/* Guest has inserted the vlan tag. */
978 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
979 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
980 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
981 			(vh->vlan_tci != vlan_tag_be))
982 			vh->vlan_tci = vlan_tag_be;
983 	} else {
984 		m->ol_flags |= PKT_TX_VLAN_PKT;
985 
986 		/*
987 		 * Find the right seg to adjust the data len when offset is
988 		 * bigger than tail room size.
989 		 */
990 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
991 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
992 				m->data_len += offset;
993 			else {
994 				struct rte_mbuf *seg = m;
995 
996 				while ((seg->next != NULL) &&
997 					(offset > rte_pktmbuf_tailroom(seg)))
998 					seg = seg->next;
999 
1000 				seg->data_len += offset;
1001 			}
1002 			m->pkt_len += offset;
1003 		}
1004 
1005 		m->vlan_tci = vlan_tag;
1006 	}
1007 
1008 	if (m->ol_flags & PKT_TX_TCP_SEG)
1009 		virtio_tx_offload(m);
1010 
1011 	tx_q->m_table[tx_q->len++] = m;
1012 	if (enable_stats) {
1013 		vdev->stats.tx_total++;
1014 		vdev->stats.tx++;
1015 	}
1016 
1017 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1018 		do_drain_mbuf_table(tx_q);
1019 }
1020 
1021 
1022 static inline void __attribute__((always_inline))
1023 drain_mbuf_table(struct mbuf_table *tx_q)
1024 {
1025 	static uint64_t prev_tsc;
1026 	uint64_t cur_tsc;
1027 
1028 	if (tx_q->len == 0)
1029 		return;
1030 
1031 	cur_tsc = rte_rdtsc();
1032 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1033 		prev_tsc = cur_tsc;
1034 
1035 		RTE_LOG(DEBUG, VHOST_DATA,
1036 			"TX queue drained after timeout with burst size %u\n",
1037 			tx_q->len);
1038 		do_drain_mbuf_table(tx_q);
1039 	}
1040 }
1041 
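/*
 * Receive a burst from the VMDQ RX queue bound to this vhost device and
 * enqueue it into the guest RX virtqueue, optionally waiting and retrying
 * while the virtqueue has too few free entries.
 */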
1042 static inline void __attribute__((always_inline))
1043 drain_eth_rx(struct vhost_dev *vdev)
1044 {
1045 	uint16_t rx_count, enqueue_count;
1046 	struct virtio_net *dev = vdev->dev;
1047 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1048 
1049 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1050 				    pkts, MAX_PKT_BURST);
1051 	if (!rx_count)
1052 		return;
1053 
1054 	/*
1055 	 * When "enable_retry" is set, wait and retry when there are not
1056 	 * enough free slots in the queue to hold @rx_count packets,
1057 	 * to diminish packet loss.
1058 	 */
1059 	if (enable_retry &&
1060 	    unlikely(rx_count > rte_vring_available_entries(dev,
1061 			VIRTIO_RXQ))) {
1062 		uint32_t retry;
1063 
1064 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1065 			rte_delay_us(burst_rx_delay_time);
1066 			if (rx_count <= rte_vring_available_entries(dev,
1067 					VIRTIO_RXQ))
1068 				break;
1069 		}
1070 	}
1071 
1072 	enqueue_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ,
1073 						pkts, rx_count);
1074 	if (enable_stats) {
1075 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1076 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1077 	}
1078 
1079 	free_pkts(pkts, rx_count);
1080 }
1081 
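/*
 * Dequeue a burst from the guest TX virtqueue; the first packet triggers MAC
 * learning and VMDQ binding, then each packet is routed via virtio_tx_route().
 */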
1082 static inline void __attribute__((always_inline))
1083 drain_virtio_tx(struct vhost_dev *vdev)
1084 {
1085 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1086 	uint16_t count;
1087 	uint16_t i;
1088 
1089 	count = rte_vhost_dequeue_burst(vdev->dev, VIRTIO_TXQ, mbuf_pool,
1090 					pkts, MAX_PKT_BURST);
1091 
1092 	/* setup VMDq for the first packet */
1093 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1094 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1095 			free_pkts(pkts, count);
1096 	}
1097 
1098 	for (i = 0; i < count; ++i) {
1099 		virtio_tx_route(vdev, pkts[i],
1100 			vlan_tags[(uint16_t)vdev->dev->device_fh]);
1101 	}
1102 }
1103 
1104 /*
1105  * Main function of vhost-switch. It basically does:
1106  *
1107  * for each vhost device {
1108  *    - drain_eth_rx()
1109  *
1110  *      Which drains the host eth Rx queue linked to the vhost device,
1111  *      and delivers all of them to the guest virtio Rx ring associated with
1112  *      this vhost device.
1113  *
1114  *    - drain_virtio_tx()
1115  *
1116  *      Which drains the guest virtio Tx queue and delivers all of them
1117  *      to the target, which could be another vhost device, or the
1118  *      physical eth dev. The route is done in function "virtio_tx_route".
1119  * }
1120  */
1121 static int
1122 switch_worker(void *arg __rte_unused)
1123 {
1124 	unsigned i;
1125 	unsigned lcore_id = rte_lcore_id();
1126 	struct vhost_dev *vdev;
1127 	struct mbuf_table *tx_q;
1128 
1129 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1130 
1131 	tx_q = &lcore_tx_queue[lcore_id];
1132 	for (i = 0; i < rte_lcore_count(); i++) {
1133 		if (lcore_ids[i] == lcore_id) {
1134 			tx_q->txq_id = i;
1135 			break;
1136 		}
1137 	}
1138 
1139 	while(1) {
1140 		drain_mbuf_table(tx_q);
1141 
1142 		/*
1143 		 * If requested, inform the configuration core that we have exited
1144 		 * the linked list and that no devices are in use.
1145 		 */
1146 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1147 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1148 
1149 		/*
1150 		 * Process vhost devices
1151 		 */
1152 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, next) {
1153 			if (unlikely(vdev->remove)) {
1154 				unlink_vmdq(vdev);
1155 				vdev->ready = DEVICE_SAFE_REMOVE;
1156 				continue;
1157 			}
1158 
1159 			if (likely(vdev->ready == DEVICE_RX))
1160 				drain_eth_rx(vdev);
1161 
1162 			if (likely(!vdev->remove))
1163 				drain_virtio_tx(vdev);
1164 		}
1165 	}
1166 
1167 	return 0;
1168 }
1169 
1170 /*
1171  * Remove a device from the specific data core linked list and from the
1172  * main linked list. Synchronization occurs through the use of the
1173  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1174  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1175  */
1176 static void
1177 destroy_device (volatile struct virtio_net *dev)
1178 {
1179 	struct vhost_dev *vdev;
1180 	int lcore;
1181 
1182 	dev->flags &= ~VIRTIO_DEV_RUNNING;
1183 
1184 	vdev = (struct vhost_dev *)dev->priv;
1185 	/*set the remove flag. */
1186 	vdev->remove = 1;
1187 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1188 		rte_pause();
1189 	}
1190 
1191 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, next);
1192 	TAILQ_REMOVE(&vhost_dev_list, vdev, next);
1193 
1194 	/* Set the dev_removal_flag on each lcore. */
1195 	RTE_LCORE_FOREACH_SLAVE(lcore)
1196 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1197 
1198 	/*
1199 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1200 	 * we can be sure that they can no longer access the device removed
1201 	 * from the linked lists and that the devices are no longer in use.
1202 	 */
1203 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1204 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1205 			rte_pause();
1206 	}
1207 
1208 	lcore_info[vdev->coreid].device_num--;
1209 
1210 	RTE_LOG(INFO, VHOST_DATA,
1211 		"(%" PRIu64 ") Device has been removed from data core\n",
1212 		dev->device_fh);
1213 
1214 	rte_free(vdev);
1215 }
1216 
1217 /*
1218  * A new device is added to a data core. First the device is added to the main linked list
1219  * and then allocated to a specific data core.
1220  */
1221 static int
1222 new_device (struct virtio_net *dev)
1223 {
1224 	int lcore, core_add = 0;
1225 	uint32_t device_num_min = num_devices;
1226 	struct vhost_dev *vdev;
1227 
1228 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1229 	if (vdev == NULL) {
1230 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
1231 			dev->device_fh);
1232 		return -1;
1233 	}
1234 	vdev->dev = dev;
1235 	dev->priv = vdev;
1236 
1237 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, next);
1238 	vdev->vmdq_rx_q
1239 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
1240 
1241 	/*reset ready flag*/
1242 	vdev->ready = DEVICE_MAC_LEARNING;
1243 	vdev->remove = 0;
1244 
1245 	/* Find a suitable lcore to add the device. */
1246 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1247 		if (lcore_info[lcore].device_num < device_num_min) {
1248 			device_num_min = lcore_info[lcore].device_num;
1249 			core_add = lcore;
1250 		}
1251 	}
1252 	vdev->coreid = core_add;
1253 
1254 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, next);
1255 	lcore_info[vdev->coreid].device_num++;
1256 
1257 	/* Disable notifications. */
1258 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
1259 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
1260 	dev->flags |= VIRTIO_DEV_RUNNING;
1261 
1262 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
1263 
1264 	return 0;
1265 }
1266 
1267 /*
1268  * These callbacks allow devices to be added to the data core when configuration
1269  * has been fully completed.
1270  */
1271 static const struct virtio_net_device_ops virtio_net_device_ops =
1272 {
1273 	.new_device =  new_device,
1274 	.destroy_device = destroy_device,
1275 };
1276 
1277 /*
1278  * This thread wakes up periodically to print stats if the user has
1279  * enabled them.
1280  */
1281 static void
1282 print_stats(void)
1283 {
1284 	struct vhost_dev *vdev;
1285 	uint64_t tx_dropped, rx_dropped;
1286 	uint64_t tx, tx_total, rx, rx_total;
1287 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1288 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1289 
1290 	while(1) {
1291 		sleep(enable_stats);
1292 
1293 		/* Clear screen and move to top left */
1294 		printf("%s%s\n", clr, top_left);
1295 		printf("Device statistics =================================\n");
1296 
1297 		TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
1298 			tx_total   = vdev->stats.tx_total;
1299 			tx         = vdev->stats.tx;
1300 			tx_dropped = tx_total - tx;
1301 
1302 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1303 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1304 			rx_dropped = rx_total - rx;
1305 
1306 			printf("Statistics for device %" PRIu64 "\n"
1307 				"-----------------------\n"
1308 				"TX total:              %" PRIu64 "\n"
1309 				"TX dropped:            %" PRIu64 "\n"
1310 				"TX successful:         %" PRIu64 "\n"
1311 				"RX total:              %" PRIu64 "\n"
1312 				"RX dropped:            %" PRIu64 "\n"
1313 				"RX successful:         %" PRIu64 "\n",
1314 				vdev->dev->device_fh,
1315 				tx_total, tx_dropped, tx,
1316 				rx_total, rx_dropped, rx);
1317 		}
1318 
1319 		printf("===================================================\n");
1320 	}
1321 }
1322 
1323 /* When we receive an INT signal, unregister the vhost driver */
1324 static void
1325 sigint_handler(__rte_unused int signum)
1326 {
1327 	/* Unregister vhost driver. */
1328 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
1329 	if (ret != 0)
1330 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
1331 	exit(0);
1332 }
1333 
1334 /*
1335  * While creating an mbuf pool, one key thing is to figure out how
1336  * many mbuf entries are enough for our use. FYI, here are some
1337  * guidelines:
1338  *
1339  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1340  *
1341  * - For each switch core (a CPU core that does the packet switching), we
1342  *   also need to make some reservation for receiving the packets from the
1343  *   virtio Tx queue. How many is enough depends on the usage. It's normally
1344  *   a simple calculation like the following:
1345  *
1346  *       MAX_PKT_BURST * max packet size / mbuf size
1347  *
1348  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1349  *
1350  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1351  *   mbufs for receiving the packets from the physical NIC device.
1352  *
1353  * - We also need to make sure, for each switch core, we have allocated
1354  *   enough mbufs to fill up the mbuf cache.
1355  */
1356 static void
1357 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1358 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1359 {
1360 	uint32_t nr_mbufs;
1361 	uint32_t nr_mbufs_per_core;
1362 	uint32_t mtu = 1500;
1363 
1364 	if (mergeable)
1365 		mtu = 9000;
1366 	if (enable_tso)
1367 		mtu = 64 * 1024;
1368 
1369 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1370 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1371 	nr_mbufs_per_core += nr_rx_desc;
1372 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
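	/*
	 * Illustrative sizing, assuming the default ~2KB mbuf data room, a
	 * 1500-byte MTU and a burst of 32: the per-burst term above is only a
	 * few dozen mbufs, so nr_mbufs_per_core is dominated by nr_rx_desc.
	 */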
1373 
1374 	nr_mbufs  = nr_queues * nr_rx_desc;
1375 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1376 	nr_mbufs *= nr_port;
1377 
1378 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1379 					    nr_mbuf_cache, 0, mbuf_size,
1380 					    rte_socket_id());
1381 	if (mbuf_pool == NULL)
1382 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1383 }
1384 
1385 /*
1386  * Main function, does initialisation and calls the per-lcore functions. The CUSE
1387  * device is also registered here to handle the IOCTLs.
1388  */
1389 int
1390 main(int argc, char *argv[])
1391 {
1392 	unsigned lcore_id, core_id = 0;
1393 	unsigned nb_ports, valid_num_ports;
1394 	int ret;
1395 	uint8_t portid;
1396 	static pthread_t tid;
1397 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1398 
1399 	signal(SIGINT, sigint_handler);
1400 
1401 	/* init EAL */
1402 	ret = rte_eal_init(argc, argv);
1403 	if (ret < 0)
1404 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1405 	argc -= ret;
1406 	argv += ret;
1407 
1408 	/* parse app arguments */
1409 	ret = us_vhost_parse_args(argc, argv);
1410 	if (ret < 0)
1411 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1412 
1413 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) {
1414 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1415 		if (rte_lcore_is_enabled(lcore_id))
1416 			lcore_ids[core_id ++] = lcore_id;
1417 	}
1418 
1419 	if (rte_lcore_count() > RTE_MAX_LCORE)
1420 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1421 
1422 	/* Get the number of physical ports. */
1423 	nb_ports = rte_eth_dev_count();
1424 
1425 	/*
1426 	 * Update the global var NUM_PORTS and global array PORTS
1427 	 * and get the value of var VALID_NUM_PORTS according to the number of system ports
1428 	 */
1429 	valid_num_ports = check_ports_num(nb_ports);
1430 
1431 	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1432 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1433 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1434 		return -1;
1435 	}
1436 
1437 	/*
1438 	 * FIXME: here we are trying to allocate mbufs big enough for
1439 	 * @MAX_QUEUES, but the truth is we're never going to use that
1440 	 * many queues here. We probably should only do allocation for
1441 	 * those queues we are going to use.
1442 	 */
1443 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1444 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1445 
1446 	if (vm2vm_mode == VM2VM_HARDWARE) {
1447 		/* Enable VT loop back to let L2 switch to do it. */
1448 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1449 		RTE_LOG(DEBUG, VHOST_CONFIG,
1450 			"Enable loop back for L2 switch in vmdq.\n");
1451 	}
1452 
1453 	/* initialize all ports */
1454 	for (portid = 0; portid < nb_ports; portid++) {
1455 		/* skip ports that are not enabled */
1456 		if ((enabled_port_mask & (1 << portid)) == 0) {
1457 			RTE_LOG(INFO, VHOST_PORT,
1458 				"Skipping disabled port %d\n", portid);
1459 			continue;
1460 		}
1461 		if (port_init(portid) != 0)
1462 			rte_exit(EXIT_FAILURE,
1463 				"Cannot initialize network ports\n");
1464 	}
1465 
1466 	/* Enable stats if the user option is set. */
1467 	if (enable_stats) {
1468 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1469 		if (ret != 0)
1470 			rte_exit(EXIT_FAILURE,
1471 				"Cannot create print-stats thread\n");
1472 
1473 		/* Set thread_name for aid in debugging.  */
1474 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1475 		ret = rte_thread_setname(tid, thread_name);
1476 		if (ret != 0)
1477 			RTE_LOG(ERR, VHOST_CONFIG,
1478 				"Cannot set print-stats name\n");
1479 	}
1480 
1481 	/* Launch all data cores. */
1482 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1483 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1484 
1485 	if (mergeable == 0)
1486 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
1487 
1488 	/* Register vhost(cuse or user) driver to handle vhost messages. */
1489 	ret = rte_vhost_driver_register((char *)&dev_basename);
1490 	if (ret != 0)
1491 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
1492 
1493 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
1494 
1495 	/* Start CUSE session. */
1496 	rte_vhost_driver_session_start();
1497 	return 0;
1498 
1499 }
1500