xref: /dpdk/examples/vhost/main.c (revision 45657a5c6861a7bea4041eb2d0e7d7179904336c)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 
56 #include "main.h"
57 
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61 
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64 
65 /*
66  * Calculate the number of buffers needed per port
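 * (a rough upper bound): RX descriptors for every queue of the port, one
 * in-flight burst per switching core, TX descriptors per switching core,
 * and the per-lcore mempool cache for each switching core plus one.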
67  */
68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +		\
69 							(num_switching_cores*MAX_PKT_BURST) +  			\
70 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
71 							((num_switching_cores+1)*MBUF_CACHE_SIZE))
72 
73 #define MBUF_CACHE_SIZE	128
74 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
75 
76 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
77 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
78 
79 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
80 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
81 
82 #define JUMBO_FRAME_MAX_SIZE    0x2600
83 
84 /* State of virtio device. */
85 #define DEVICE_MAC_LEARNING 0
86 #define DEVICE_RX			1
87 #define DEVICE_SAFE_REMOVE	2
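
/*
 * Lifecycle: a device starts in DEVICE_MAC_LEARNING, switches to DEVICE_RX
 * once its MAC has been learned from the first packet it transmits and bound
 * to a VMDQ pool, and is set to DEVICE_SAFE_REMOVE once it has been unlinked
 * and may be freed by the configuration core.
 */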
88 
89 /* Configurable number of RX/TX ring descriptors */
90 #define RTE_TEST_RX_DESC_DEFAULT 1024
91 #define RTE_TEST_TX_DESC_DEFAULT 512
92 
93 #define INVALID_PORT_ID 0xFF
94 
95 /* Max number of devices. Limited by vmdq. */
96 #define MAX_DEVICES 64
97 
98 /* Size of buffers used for snprintfs. */
99 #define MAX_PRINT_BUFF 6072
100 
101 /* Maximum character device basename size. */
102 #define MAX_BASENAME_SZ 10
103 
104 /* Maximum long option length for option parsing. */
105 #define MAX_LONG_OPT_SZ 64
106 
107 /* Used to compare MAC addresses. */
108 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
109 
110 /* mask of enabled ports */
111 static uint32_t enabled_port_mask = 0;
112 
113 /* Promiscuous mode */
114 static uint32_t promiscuous;
115 
116 /*Number of switching cores enabled*/
117 static uint32_t num_switching_cores = 0;
118 
119 /* number of devices/queues to support*/
120 static uint32_t num_queues = 0;
121 static uint32_t num_devices;
122 
123 static struct rte_mempool *mbuf_pool;
124 static int mergeable;
125 
126 /* Do vlan strip on host, enabled on default */
127 static uint32_t vlan_strip = 1;
128 
129 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
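/*
 * With VM2VM_SOFTWARE, guest-to-guest traffic is forwarded by this
 * application via rte_vhost_enqueue_burst(); with VM2VM_HARDWARE it is sent
 * to the NIC carrying the destination VM's VLAN tag and switched back by the
 * VMDQ loop back feature.
 */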
130 typedef enum {
131 	VM2VM_DISABLED = 0,
132 	VM2VM_SOFTWARE = 1,
133 	VM2VM_HARDWARE = 2,
134 	VM2VM_LAST
135 } vm2vm_type;
136 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
137 
138 /* Enable stats. */
139 static uint32_t enable_stats = 0;
140 /* Enable retries on RX. */
141 static uint32_t enable_retry = 1;
142 
143 /* Disable TX checksum offload */
144 static uint32_t enable_tx_csum;
145 
146 /* Disable TSO offload */
147 static uint32_t enable_tso;
148 
149 /* Specify timeout (in useconds) between retries on RX. */
150 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
151 /* Specify the number of retries on RX. */
152 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
153 
154 /* Character device basename. Can be set by user. */
155 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
156 
157 /* Empty VMDQ configuration structure. Filled in programmatically. */
158 static struct rte_eth_conf vmdq_conf_default = {
159 	.rxmode = {
160 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
161 		.split_hdr_size = 0,
162 		.header_split   = 0, /**< Header Split disabled */
163 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
164 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
165 		/*
166 		 * Required for 1G NICs such as the I350; without it, IPv4
167 		 * forwarding in the guest cannot forward packets from one
168 		 * virtio device to another.
169 		 */
170 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
171 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
172 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
173 	},
174 
175 	.txmode = {
176 		.mq_mode = ETH_MQ_TX_NONE,
177 	},
178 	.rx_adv_conf = {
179 		/*
180 		 * should be overridden separately in code with
181 		 * appropriate values
182 		 */
183 		.vmdq_rx_conf = {
184 			.nb_queue_pools = ETH_8_POOLS,
185 			.enable_default_pool = 0,
186 			.default_pool = 0,
187 			.nb_pool_maps = 0,
188 			.pool_map = {{0, 0},},
189 		},
190 	},
191 };
192 
193 static unsigned lcore_ids[RTE_MAX_LCORE];
194 static uint8_t ports[RTE_MAX_ETHPORTS];
195 static unsigned num_ports = 0; /**< The number of ports specified in command line */
196 static uint16_t num_pf_queues, num_vmdq_queues;
197 static uint16_t vmdq_pool_base, vmdq_queue_base;
198 static uint16_t queues_per_pool;
199 
200 const uint16_t vlan_tags[] = {
201 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
202 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
203 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
204 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
205 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
206 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
207 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
208 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
209 };
210 
211 /* ethernet addresses of ports */
212 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
213 
214 static struct vhost_dev_tailq_list vhost_dev_list =
215 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
216 
217 static struct lcore_info lcore_info[RTE_MAX_LCORE];
218 
219 /* Used for queueing bursts of TX packets. */
220 struct mbuf_table {
221 	unsigned len;
222 	unsigned txq_id;
223 	struct rte_mbuf *m_table[MAX_PKT_BURST];
224 };
225 
226 /* TX queue for each data core. */
227 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
228 
229 #define VLAN_HLEN       4
230 
231 /* Per-device statistics struct */
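/*
 * The RX counters are atomic because any core may bump them while doing a
 * VM2VM local enqueue to this device; the TX counters are only updated by
 * the core the device is bound to.
 */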
232 struct device_statistics {
233 	uint64_t tx_total;
234 	rte_atomic64_t rx_total_atomic;
235 	uint64_t tx;
236 	rte_atomic64_t rx_atomic;
237 } __rte_cache_aligned;
238 struct device_statistics dev_statistics[MAX_DEVICES];
239 
240 /*
241  * Builds up the correct configuration for VMDQ VLAN pool map
242  * according to the pool & queue limits.
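 * Each pool i is mapped one-to-one to vlan_tags[i], so pool 0 receives
 * VLAN 1000, pool 1 VLAN 1001, and so on.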
243  */
244 static inline int
245 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
246 {
247 	struct rte_eth_vmdq_rx_conf conf;
248 	struct rte_eth_vmdq_rx_conf *def_conf =
249 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
250 	unsigned i;
251 
252 	memset(&conf, 0, sizeof(conf));
253 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
254 	conf.nb_pool_maps = num_devices;
255 	conf.enable_loop_back = def_conf->enable_loop_back;
256 	conf.rx_mode = def_conf->rx_mode;
257 
258 	for (i = 0; i < conf.nb_pool_maps; i++) {
259 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
260 		conf.pool_map[i].pools = (1UL << i);
261 	}
262 
263 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
264 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
265 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
266 	return 0;
267 }
268 
269 /*
270  * Validate the device number against the max pool number obtained from
271  * dev_info. If the device number is invalid, print an error message and
272  * return -1. Each device must have its own pool.
273  */
274 static inline int
275 validate_num_devices(uint32_t max_nb_devices)
276 {
277 	if (num_devices > max_nb_devices) {
278 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
279 		return -1;
280 	}
281 	return 0;
282 }
283 
284 /*
285  * Initialises a given port using global settings, with the RX buffers
286  * coming from the global mbuf_pool.
287  */
288 static inline int
289 port_init(uint8_t port)
290 {
291 	struct rte_eth_dev_info dev_info;
292 	struct rte_eth_conf port_conf;
293 	struct rte_eth_rxconf *rxconf;
294 	struct rte_eth_txconf *txconf;
295 	int16_t rx_rings, tx_rings;
296 	uint16_t rx_ring_size, tx_ring_size;
297 	int retval;
298 	uint16_t q;
299 
300 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
301 	rte_eth_dev_info_get (port, &dev_info);
302 
303 	if (dev_info.max_rx_queues > MAX_QUEUES) {
304 		rte_exit(EXIT_FAILURE,
305 			"please define MAX_QUEUES no less than %u in %s\n",
306 			dev_info.max_rx_queues, __FILE__);
307 	}
308 
309 	rxconf = &dev_info.default_rxconf;
310 	txconf = &dev_info.default_txconf;
311 	rxconf->rx_drop_en = 1;
312 
313 	/* Enable vlan offload */
314 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
315 
316 	/*configure the number of supported virtio devices based on VMDQ limits */
317 	num_devices = dev_info.max_vmdq_pools;
318 
319 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
320 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
321 	tx_rings = (uint16_t)rte_lcore_count();
322 
323 	retval = validate_num_devices(MAX_DEVICES);
324 	if (retval < 0)
325 		return retval;
326 
327 	/* Get port configuration. */
328 	retval = get_eth_conf(&port_conf, num_devices);
329 	if (retval < 0)
330 		return retval;
331 	/* NIC queues are divided into pf queues and vmdq queues.  */
332 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
333 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
334 	num_vmdq_queues = num_devices * queues_per_pool;
335 	num_queues = num_pf_queues + num_vmdq_queues;
336 	vmdq_queue_base = dev_info.vmdq_queue_base;
337 	vmdq_pool_base  = dev_info.vmdq_pool_base;
338 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
339 		num_pf_queues, num_devices, queues_per_pool);
340 
341 	if (port >= rte_eth_dev_count()) return -1;
342 
343 	if (enable_tx_csum == 0)
344 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
345 
346 	if (enable_tso == 0) {
347 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
348 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
349 	}
350 
351 	rx_rings = (uint16_t)dev_info.max_rx_queues;
352 	/* Configure ethernet device. */
353 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
354 	if (retval != 0)
355 		return retval;
356 
357 	/* Setup the queues. */
358 	for (q = 0; q < rx_rings; q ++) {
359 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
360 						rte_eth_dev_socket_id(port),
361 						rxconf,
362 						mbuf_pool);
363 		if (retval < 0)
364 			return retval;
365 	}
366 	for (q = 0; q < tx_rings; q ++) {
367 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
368 						rte_eth_dev_socket_id(port),
369 						txconf);
370 		if (retval < 0)
371 			return retval;
372 	}
373 
374 	/* Start the device. */
375 	retval  = rte_eth_dev_start(port);
376 	if (retval < 0) {
377 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
378 		return retval;
379 	}
380 
381 	if (promiscuous)
382 		rte_eth_promiscuous_enable(port);
383 
384 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
385 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
386 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
387 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
388 			(unsigned)port,
389 			vmdq_ports_eth_addr[port].addr_bytes[0],
390 			vmdq_ports_eth_addr[port].addr_bytes[1],
391 			vmdq_ports_eth_addr[port].addr_bytes[2],
392 			vmdq_ports_eth_addr[port].addr_bytes[3],
393 			vmdq_ports_eth_addr[port].addr_bytes[4],
394 			vmdq_ports_eth_addr[port].addr_bytes[5]);
395 
396 	return 0;
397 }
398 
399 /*
400  * Set character device basename.
401  */
402 static int
403 us_vhost_parse_basename(const char *q_arg)
404 {
405 	/* Reject basenames that do not fit in dev_basename (including the NUL). */
406 
407 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
408 		return -1;
409 	else
410 		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
411 
412 	return 0;
413 }
414 
415 /*
416  * Parse the portmask provided at run time.
417  */
418 static int
419 parse_portmask(const char *portmask)
420 {
421 	char *end = NULL;
422 	unsigned long pm;
423 
424 	errno = 0;
425 
426 	/* parse hexadecimal string */
427 	pm = strtoul(portmask, &end, 16);
428 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
429 		return -1;
430 
431 	if (pm == 0)
432 		return -1;
433 
434 	return pm;
435 
436 }
437 
438 /*
439  * Parse num options at run time.
440  */
441 static int
442 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
443 {
444 	char *end = NULL;
445 	unsigned long num;
446 
447 	errno = 0;
448 
449 	/* parse unsigned int string */
450 	num = strtoul(q_arg, &end, 10);
451 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
452 		return -1;
453 
454 	if (num > max_valid_value)
455 		return -1;
456 
457 	return num;
458 
459 }
460 
461 /*
462  * Display usage
463  */
464 static void
465 us_vhost_usage(const char *prgname)
466 {
467 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
468 	"		--vm2vm [0|1|2]\n"
469 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
470 	"		--dev-basename <name>\n"
471 	"		--nb-devices ND\n"
472 	"		-p PORTMASK: Set mask for ports to be used by application\n"
473 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
474 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
475 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
476 	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
477 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
478 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
479 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
480 	"		--dev-basename: The basename to be used for the character device.\n"
481 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
482 	"		--tso [0|1] disable/enable TCP segmentation offload (TSO).\n",
483 	       prgname);
484 }
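
/*
 * Illustrative invocation (binary name and EAL options depend on the build
 * and platform; the example app is typically built as vhost-switch):
 *
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --dev-basename vhost-net --stats 2
 *
 * i.e. use port 0 only, keep the default "vhost-net" character device
 * basename and print statistics every 2 seconds.
 */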
485 
486 /*
487  * Parse the arguments given in the command line of the application.
488  */
489 static int
490 us_vhost_parse_args(int argc, char **argv)
491 {
492 	int opt, ret;
493 	int option_index;
494 	unsigned i;
495 	const char *prgname = argv[0];
496 	static struct option long_option[] = {
497 		{"vm2vm", required_argument, NULL, 0},
498 		{"rx-retry", required_argument, NULL, 0},
499 		{"rx-retry-delay", required_argument, NULL, 0},
500 		{"rx-retry-num", required_argument, NULL, 0},
501 		{"mergeable", required_argument, NULL, 0},
502 		{"vlan-strip", required_argument, NULL, 0},
503 		{"stats", required_argument, NULL, 0},
504 		{"dev-basename", required_argument, NULL, 0},
505 		{"tx-csum", required_argument, NULL, 0},
506 		{"tso", required_argument, NULL, 0},
507 		{NULL, 0, 0, 0},
508 	};
509 
510 	/* Parse command line */
511 	while ((opt = getopt_long(argc, argv, "p:P",
512 			long_option, &option_index)) != EOF) {
513 		switch (opt) {
514 		/* Portmask */
515 		case 'p':
516 			enabled_port_mask = parse_portmask(optarg);
517 			if (enabled_port_mask == 0) {
518 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
519 				us_vhost_usage(prgname);
520 				return -1;
521 			}
522 			break;
523 
524 		case 'P':
525 			promiscuous = 1;
526 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
527 				ETH_VMDQ_ACCEPT_BROADCAST |
528 				ETH_VMDQ_ACCEPT_MULTICAST;
529 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
530 
531 			break;
532 
533 		case 0:
534 			/* Enable/disable vm2vm comms. */
535 			if (!strncmp(long_option[option_index].name, "vm2vm",
536 				MAX_LONG_OPT_SZ)) {
537 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
538 				if (ret == -1) {
539 					RTE_LOG(INFO, VHOST_CONFIG,
540 						"Invalid argument for "
541 						"vm2vm [0|1|2]\n");
542 					us_vhost_usage(prgname);
543 					return -1;
544 				} else {
545 					vm2vm_mode = (vm2vm_type)ret;
546 				}
547 			}
548 
549 			/* Enable/disable retries on RX. */
550 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
551 				ret = parse_num_opt(optarg, 1);
552 				if (ret == -1) {
553 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
554 					us_vhost_usage(prgname);
555 					return -1;
556 				} else {
557 					enable_retry = ret;
558 				}
559 			}
560 
561 			/* Enable/disable TX checksum offload. */
562 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
563 				ret = parse_num_opt(optarg, 1);
564 				if (ret == -1) {
565 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
566 					us_vhost_usage(prgname);
567 					return -1;
568 				} else
569 					enable_tx_csum = ret;
570 			}
571 
572 			/* Enable/disable TSO offload. */
573 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
574 				ret = parse_num_opt(optarg, 1);
575 				if (ret == -1) {
576 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
577 					us_vhost_usage(prgname);
578 					return -1;
579 				} else
580 					enable_tso = ret;
581 			}
582 
583 			/* Specify the retries delay time (in useconds) on RX. */
584 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
585 				ret = parse_num_opt(optarg, INT32_MAX);
586 				if (ret == -1) {
587 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
588 					us_vhost_usage(prgname);
589 					return -1;
590 				} else {
591 					burst_rx_delay_time = ret;
592 				}
593 			}
594 
595 			/* Specify the retries number on RX. */
596 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
597 				ret = parse_num_opt(optarg, INT32_MAX);
598 				if (ret == -1) {
599 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
600 					us_vhost_usage(prgname);
601 					return -1;
602 				} else {
603 					burst_rx_retry_num = ret;
604 				}
605 			}
606 
607 			/* Enable/disable RX mergeable buffers. */
608 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
609 				ret = parse_num_opt(optarg, 1);
610 				if (ret == -1) {
611 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
612 					us_vhost_usage(prgname);
613 					return -1;
614 				} else {
615 					mergeable = !!ret;
616 					if (ret) {
617 						vmdq_conf_default.rxmode.jumbo_frame = 1;
618 						vmdq_conf_default.rxmode.max_rx_pkt_len
619 							= JUMBO_FRAME_MAX_SIZE;
620 					}
621 				}
622 			}
623 
624 			/* Enable/disable RX VLAN strip on host. */
625 			if (!strncmp(long_option[option_index].name,
626 				"vlan-strip", MAX_LONG_OPT_SZ)) {
627 				ret = parse_num_opt(optarg, 1);
628 				if (ret == -1) {
629 					RTE_LOG(INFO, VHOST_CONFIG,
630 						"Invalid argument for VLAN strip [0|1]\n");
631 					us_vhost_usage(prgname);
632 					return -1;
633 				} else {
634 					vlan_strip = !!ret;
635 					vmdq_conf_default.rxmode.hw_vlan_strip =
636 						vlan_strip;
637 				}
638 			}
639 
640 			/* Enable/disable stats. */
641 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
642 				ret = parse_num_opt(optarg, INT32_MAX);
643 				if (ret == -1) {
644 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
645 					us_vhost_usage(prgname);
646 					return -1;
647 				} else {
648 					enable_stats = ret;
649 				}
650 			}
651 
652 			/* Set character device basename. */
653 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
654 				if (us_vhost_parse_basename(optarg) == -1) {
655 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
656 					us_vhost_usage(prgname);
657 					return -1;
658 				}
659 			}
660 
661 			break;
662 
663 			/* Invalid option - print options. */
664 		default:
665 			us_vhost_usage(prgname);
666 			return -1;
667 		}
668 	}
669 
670 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
671 		if (enabled_port_mask & (1 << i))
672 			ports[num_ports++] = (uint8_t)i;
673 	}
674 
675 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
676 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
677 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
678 		return -1;
679 	}
680 
681 	return 0;
682 }
683 
684 /*
685  * Update the global variable num_ports and the array ports according to the
686  * number of system ports, and return the number of valid ports.
687  */
688 static unsigned check_ports_num(unsigned nb_ports)
689 {
690 	unsigned valid_num_ports = num_ports;
691 	unsigned portid;
692 
693 	if (num_ports > nb_ports) {
694 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
695 			num_ports, nb_ports);
696 		num_ports = nb_ports;
697 	}
698 
699 	for (portid = 0; portid < num_ports; portid ++) {
700 		if (ports[portid] >= nb_ports) {
701 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
702 				ports[portid], (nb_ports - 1));
703 			ports[portid] = INVALID_PORT_ID;
704 			valid_num_ports--;
705 		}
706 	}
707 	return valid_num_ports;
708 }
709 
710 /*
711  * Compares a packet destination MAC address to a device MAC address.
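 * Both addresses are read as 64-bit words and XORed; MAC_ADDR_CMP masks the
 * result down to the 48 bits of the MAC address so the extra bytes pulled in
 * by the 64-bit loads are ignored.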
712  */
713 static inline int __attribute__((always_inline))
714 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
715 {
716 	return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
717 }
718 
719 static inline struct vhost_dev *__attribute__((always_inline))
720 find_vhost_dev(struct ether_addr *mac)
721 {
722 	struct vhost_dev *vdev;
723 
724 	TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
725 		if (vdev->ready == DEVICE_RX &&
726 		    ether_addr_cmp(mac, &vdev->mac_address))
727 			return vdev;
728 	}
729 
730 	return NULL;
731 }
732 
733 /*
734  * This function learns the MAC address of the device and registers it,
735  * along with a VLAN tag, with a VMDQ pool.
736  */
737 static int
738 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
739 {
740 	struct ether_hdr *pkt_hdr;
741 	struct virtio_net *dev = vdev->dev;
742 	int i, ret;
743 
744 	/* Learn MAC address of guest device from packet */
745 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
746 
747 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
748 		RTE_LOG(ERR, VHOST_DATA,
749 			"Device (%" PRIu64 ") is using a registered MAC!\n",
750 			dev->device_fh);
751 		return -1;
752 	}
753 
754 	for (i = 0; i < ETHER_ADDR_LEN; i++)
755 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
756 
757 	/* vlan_tag currently uses the device_id. */
758 	vdev->vlan_tag = vlan_tags[dev->device_fh];
759 
760 	/* Print out VMDQ registration info. */
761 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
762 		dev->device_fh,
763 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
764 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
765 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
766 		vdev->vlan_tag);
767 
768 	/* Register the MAC address. */
769 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
770 				(uint32_t)dev->device_fh + vmdq_pool_base);
771 	if (ret)
772 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
773 					dev->device_fh);
774 
775 	/* Enable stripping of the vlan tag as we handle routing. */
776 	if (vlan_strip)
777 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
778 			(uint16_t)vdev->vmdq_rx_q, 1);
779 
780 	/* Set device as ready for RX. */
781 	vdev->ready = DEVICE_RX;
782 
783 	return 0;
784 }
785 
786 /*
787  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
788  * queue before disabling RX on the device.
789  */
790 static inline void
791 unlink_vmdq(struct vhost_dev *vdev)
792 {
793 	unsigned i = 0;
794 	unsigned rx_count;
795 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
796 
797 	if (vdev->ready == DEVICE_RX) {
798 		/*clear MAC and VLAN settings*/
799 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
800 		for (i = 0; i < 6; i++)
801 			vdev->mac_address.addr_bytes[i] = 0;
802 
803 		vdev->vlan_tag = 0;
804 
805 		/*Clear out the receive buffers*/
806 		rx_count = rte_eth_rx_burst(ports[0],
807 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
808 
809 		while (rx_count) {
810 			for (i = 0; i < rx_count; i++)
811 				rte_pktmbuf_free(pkts_burst[i]);
812 
813 			rx_count = rte_eth_rx_burst(ports[0],
814 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
815 		}
816 
817 		vdev->ready = DEVICE_MAC_LEARNING;
818 	}
819 }
820 
821 /*
822  * Check if the packet destination MAC address is for a local device. If so, put
823  * the packet on that device's RX queue. If not, return.
824  */
825 static inline int __attribute__((always_inline))
826 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
827 {
828 	struct ether_hdr *pkt_hdr;
829 	uint64_t ret = 0;
830 	struct vhost_dev *dst_vdev;
831 	uint64_t fh;
832 
833 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
834 
835 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
836 	if (!dst_vdev)
837 		return -1;
838 
839 	fh = dst_vdev->dev->device_fh;
840 	if (fh == vdev->dev->device_fh) {
841 		RTE_LOG(DEBUG, VHOST_DATA,
842 			"(%" PRIu64 ") TX: src and dst MAC is same. "
843 			"Dropping packet.\n", fh);
844 		return 0;
845 	}
846 
847 	RTE_LOG(DEBUG, VHOST_DATA,
848 		"(%" PRIu64 ") TX: MAC address is local\n", fh);
849 
850 	if (unlikely(dst_vdev->remove)) {
851 		RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
852 			"Device is marked for removal\n", fh);
853 		return 0;
854 	}
855 
856 	/* send the packet to the local virtio device */
857 	ret = rte_vhost_enqueue_burst(dst_vdev->dev, VIRTIO_RXQ, &m, 1);
858 	if (enable_stats) {
859 		rte_atomic64_inc(&dev_statistics[fh].rx_total_atomic);
860 		rte_atomic64_add(&dev_statistics[fh].rx_atomic, ret);
861 		dev_statistics[vdev->dev->device_fh].tx_total++;
862 		dev_statistics[vdev->dev->device_fh].tx += ret;
863 	}
864 
865 	return 0;
866 }
867 
868 /*
869  * Check if the destination MAC of a packet belongs to a local VM; if it does,
870  * return its VLAN tag and the length offset to apply.
871  */
872 static inline int __attribute__((always_inline))
873 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
874 	uint32_t *offset, uint16_t *vlan_tag)
875 {
876 	struct vhost_dev *dst_vdev;
877 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
878 
879 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
880 	if (!dst_vdev)
881 		return 0;
882 
883 	if (dst_vdev->dev->device_fh == dev->device_fh) {
884 		RTE_LOG(DEBUG, VHOST_DATA,
885 			"(%" PRIu64 ") TX: src and dst MAC is same. "
886 			" Dropping packet.\n", dst_vdev->dev->device_fh);
887 		return -1;
888 	}
889 
890 	/*
891 	 * HW VLAN strip will reduce the packet length by the
892 	 * length of the VLAN tag, so we need to restore the
893 	 * packet length by adding it back.
894 	 */
895 	*offset  = VLAN_HLEN;
896 	*vlan_tag = vlan_tags[(uint16_t)dst_vdev->dev->device_fh];
897 
898 	RTE_LOG(DEBUG, VHOST_DATA,
899 		"(%" PRIu64 ") TX: pkt to local VM device id: (%" PRIu64 ") "
900 		"vlan tag: %u.\n",
901 		dev->device_fh, dst_vdev->dev->device_fh, *vlan_tag);
902 
903 	return 0;
904 }
905 
906 static uint16_t
907 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
908 {
909 	if (ol_flags & PKT_TX_IPV4)
910 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
911 	else /* assume ethertype == ETHER_TYPE_IPv6 */
912 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
913 }
914 
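/*
 * Prepare a TSO packet for hardware offload: clear the IPv4 header checksum
 * and request IP checksum offload, then seed the TCP checksum with the
 * pseudo-header sum so the NIC can finish segmentation and checksumming.
 */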
915 static void virtio_tx_offload(struct rte_mbuf *m)
916 {
917 	void *l3_hdr;
918 	struct ipv4_hdr *ipv4_hdr = NULL;
919 	struct tcp_hdr *tcp_hdr = NULL;
920 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
921 
922 	l3_hdr = (char *)eth_hdr + m->l2_len;
923 
924 	if (m->ol_flags & PKT_TX_IPV4) {
925 		ipv4_hdr = l3_hdr;
926 		ipv4_hdr->hdr_checksum = 0;
927 		m->ol_flags |= PKT_TX_IP_CKSUM;
928 	}
929 
930 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
931 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
932 }
933 
934 /*
935  * This function routes the TX packet to the correct interface. This may be a local device
936  * or the physical port.
937  */
938 static inline void __attribute__((always_inline))
939 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
940 {
941 	struct mbuf_table *tx_q;
942 	struct rte_mbuf **m_table;
943 	unsigned len, ret, offset = 0;
944 	const uint16_t lcore_id = rte_lcore_id();
945 	struct virtio_net *dev = vdev->dev;
946 	struct ether_hdr *nh;
947 
948 	/*check if destination is local VM*/
949 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
950 		rte_pktmbuf_free(m);
951 		return;
952 	}
953 
954 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
955 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
956 			rte_pktmbuf_free(m);
957 			return;
958 		}
959 	}
960 
961 	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
962 		"MAC address is external\n", dev->device_fh);
963 
964 	/*Add packet to the port tx queue*/
965 	tx_q = &lcore_tx_queue[lcore_id];
966 	len = tx_q->len;
967 
968 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
969 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
970 		/* Guest has inserted the vlan tag. */
971 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
972 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
973 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
974 			(vh->vlan_tci != vlan_tag_be))
975 			vh->vlan_tci = vlan_tag_be;
976 	} else {
977 		m->ol_flags |= PKT_TX_VLAN_PKT;
978 
979 		/*
980 		 * Find the right seg to adjust the data len when offset is
981 		 * bigger than tail room size.
982 		 */
983 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
984 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
985 				m->data_len += offset;
986 			else {
987 				struct rte_mbuf *seg = m;
988 
989 				while ((seg->next != NULL) &&
990 					(offset > rte_pktmbuf_tailroom(seg)))
991 					seg = seg->next;
992 
993 				seg->data_len += offset;
994 			}
995 			m->pkt_len += offset;
996 		}
997 
998 		m->vlan_tci = vlan_tag;
999 	}
1000 
1001 	if (m->ol_flags & PKT_TX_TCP_SEG)
1002 		virtio_tx_offload(m);
1003 
1004 	tx_q->m_table[len] = m;
1005 	len++;
1006 	if (enable_stats) {
1007 		dev_statistics[dev->device_fh].tx_total++;
1008 		dev_statistics[dev->device_fh].tx++;
1009 	}
1010 
1011 	if (unlikely(len == MAX_PKT_BURST)) {
1012 		m_table = (struct rte_mbuf **)tx_q->m_table;
1013 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1014 		/* Free any buffers not handled by TX and update the port stats. */
1015 		if (unlikely(ret < len)) {
1016 			do {
1017 				rte_pktmbuf_free(m_table[ret]);
1018 			} while (++ret < len);
1019 		}
1020 
1021 		len = 0;
1022 	}
1023 
1024 	tx_q->len = len;
1025 	return;
1026 }
1027 /*
1028  * This function is called by each data core. It handles all RX/TX registered with the
1029  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1030  * with all devices in the main linked list.
1031  */
1032 static int
1033 switch_worker(__attribute__((unused)) void *arg)
1034 {
1035 	struct virtio_net *dev = NULL;
1036 	struct vhost_dev *vdev = NULL;
1037 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1038 	struct mbuf_table *tx_q;
1039 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1040 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1041 	unsigned ret, i;
1042 	const uint16_t lcore_id = rte_lcore_id();
1043 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1044 	uint16_t rx_count = 0;
1045 	uint16_t tx_count;
1046 	uint32_t retry = 0;
1047 
1048 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1049 	prev_tsc = 0;
1050 
1051 	tx_q = &lcore_tx_queue[lcore_id];
1052 	for (i = 0; i < num_cores; i ++) {
1053 		if (lcore_ids[i] == lcore_id) {
1054 			tx_q->txq_id = i;
1055 			break;
1056 		}
1057 	}
1058 
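	/*
	 * Main worker loop: periodically drain this core's TX queue (after
	 * BURST_TX_DRAIN_US without a full burst), acknowledge any pending
	 * device-removal request, then for each device on this core move packets
	 * from its VMDQ RX queue into the guest and route packets coming out of
	 * the guest to a local VM or the physical port.
	 */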
1059 	while(1) {
1060 		cur_tsc = rte_rdtsc();
1061 		/*
1062 		 * TX burst queue drain
1063 		 */
1064 		diff_tsc = cur_tsc - prev_tsc;
1065 		if (unlikely(diff_tsc > drain_tsc)) {
1066 
1067 			if (tx_q->len) {
1068 				RTE_LOG(DEBUG, VHOST_DATA,
1069 					"TX queue drained after timeout with burst size %u\n",
1070 					tx_q->len);
1071 
1072 				/*Tx any packets in the queue*/
1073 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1074 									   (struct rte_mbuf **)tx_q->m_table,
1075 									   (uint16_t)tx_q->len);
1076 				if (unlikely(ret < tx_q->len)) {
1077 					do {
1078 						rte_pktmbuf_free(tx_q->m_table[ret]);
1079 					} while (++ret < tx_q->len);
1080 				}
1081 
1082 				tx_q->len = 0;
1083 			}
1084 
1085 			prev_tsc = cur_tsc;
1086 
1087 		}
1088 
1089 		/*
1090 		 * Inform the configuration core that we have exited the
1091 		 * linked list and that no devices are in use if requested.
1092 		 */
1093 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1094 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1095 
1096 		/*
1097 		 * Process devices
1098 		 */
1099 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, next) {
1100 			uint64_t fh;
1101 
1102 			dev = vdev->dev;
1103 			fh  = dev->device_fh;
1104 
1105 			if (unlikely(vdev->remove)) {
1106 				unlink_vmdq(vdev);
1107 				vdev->ready = DEVICE_SAFE_REMOVE;
1108 				continue;
1109 			}
1110 
1111 			if (likely(vdev->ready == DEVICE_RX)) {
1112 				/*Handle guest RX*/
1113 				rx_count = rte_eth_rx_burst(ports[0],
1114 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1115 
1116 				if (rx_count) {
1117 					/*
1118 					* If retry is enabled and the queue is full, wait and retry to avoid packet loss.
1119 					* Note that MAX_PKT_BURST must be less than the virtio queue size.
1120 					*/
1121 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1122 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1123 							rte_delay_us(burst_rx_delay_time);
1124 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1125 								break;
1126 						}
1127 					}
1128 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1129 					if (enable_stats) {
1130 						rte_atomic64_add(
1131 							&dev_statistics[fh].rx_total_atomic,
1132 							rx_count);
1133 						rte_atomic64_add(
1134 							&dev_statistics[fh].rx_atomic,
1135 							ret_count);
1136 					}
1137 					while (likely(rx_count)) {
1138 						rx_count--;
1139 						rte_pktmbuf_free(pkts_burst[rx_count]);
1140 					}
1141 
1142 				}
1143 			}
1144 
1145 			if (likely(!vdev->remove)) {
1146 				/* Handle guest TX*/
1147 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1148 				/* If this is the first received packet we need to learn the MAC and set up VMDQ */
1149 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1150 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1151 						while (tx_count)
1152 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1153 					}
1154 				}
1155 				for (i = 0; i < tx_count; ++i) {
1156 					virtio_tx_route(vdev, pkts_burst[i],
1157 						vlan_tags[(uint16_t)dev->device_fh]);
1158 				}
1159 			}
1160 		}
1161 	}
1162 
1163 	return 0;
1164 }
1165 
1166 /*
1167  * Remove a device from the specific data core linked list and from the
1168  * main linked list. Synchronization occurs through the use of the
1169  * lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
1170  * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
1171  */
1172 static void
1173 destroy_device (volatile struct virtio_net *dev)
1174 {
1175 	struct vhost_dev *vdev;
1176 	int lcore;
1177 
1178 	dev->flags &= ~VIRTIO_DEV_RUNNING;
1179 
1180 	vdev = (struct vhost_dev *)dev->priv;
1181 	/*set the remove flag. */
1182 	vdev->remove = 1;
1183 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1184 		rte_pause();
1185 	}
1186 
1187 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, next);
1188 	TAILQ_REMOVE(&vhost_dev_list, vdev, next);
1189 
1190 	/* Set the dev_removal_flag on each lcore. */
1191 	RTE_LCORE_FOREACH_SLAVE(lcore)
1192 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1193 
1194 	/*
1195 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1196 	 * we can be sure that they can no longer access the device removed
1197 	 * from the linked lists and that the devices are no longer in use.
1198 	 */
1199 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1200 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1201 			rte_pause();
1202 	}
1203 
1204 	lcore_info[vdev->coreid].device_num--;
1205 
1206 	RTE_LOG(INFO, VHOST_DATA,
1207 		"(%" PRIu64 ") Device has been removed from data core\n",
1208 		dev->device_fh);
1209 
1210 	rte_free(vdev);
1211 }
1212 
1213 /*
1214  * A new device is added to a data core. First the device is added to the main linked list
1215  * and then allocated to a specific data core.
1216  */
1217 static int
1218 new_device (struct virtio_net *dev)
1219 {
1220 	int lcore, core_add = 0;
1221 	uint32_t device_num_min = num_devices;
1222 	struct vhost_dev *vdev;
1223 
1224 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1225 	if (vdev == NULL) {
1226 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
1227 			dev->device_fh);
1228 		return -1;
1229 	}
1230 	vdev->dev = dev;
1231 	dev->priv = vdev;
1232 
1233 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, next);
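	/*
	 * Each device owns the first queue of its VMDQ pool:
	 * queue index = vmdq_queue_base + device_fh * queues_per_pool.
	 */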
1234 	vdev->vmdq_rx_q
1235 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
1236 
1237 	/*reset ready flag*/
1238 	vdev->ready = DEVICE_MAC_LEARNING;
1239 	vdev->remove = 0;
1240 
1241 	/* Find a suitable lcore to add the device. */
1242 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1243 		if (lcore_info[lcore].device_num < device_num_min) {
1244 			device_num_min = lcore_info[lcore].device_num;
1245 			core_add = lcore;
1246 		}
1247 	}
1248 	vdev->coreid = core_add;
1249 
1250 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, next);
1251 	lcore_info[vdev->coreid].device_num++;
1252 
1253 	/* Initialize device stats */
1254 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
1255 
1256 	/* Disable notifications. */
1257 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
1258 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
1259 	dev->flags |= VIRTIO_DEV_RUNNING;
1260 
1261 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
1262 
1263 	return 0;
1264 }
1265 
1266 /*
1267  * These callbacks allow devices to be added to the data core when configuration
1268  * has fully completed.
1269  */
1270 static const struct virtio_net_device_ops virtio_net_device_ops =
1271 {
1272 	.new_device =  new_device,
1273 	.destroy_device = destroy_device,
1274 };
1275 
1276 /*
1277  * This is a thread that will wake up after a period to print stats if the user has
1278  * enabled them.
1279  */
1280 static void
1281 print_stats(void)
1282 {
1283 	struct vhost_dev *vdev;
1284 	uint64_t tx_dropped, rx_dropped;
1285 	uint64_t tx, tx_total, rx, rx_total;
1286 	uint32_t device_fh;
1287 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1288 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1289 
1290 	while(1) {
1291 		sleep(enable_stats);
1292 
1293 		/* Clear screen and move to top left */
1294 		printf("%s%s", clr, top_left);
1295 
1296 		printf("\nDevice statistics ====================================");
1297 
1298 		TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
1299 			device_fh = vdev->dev->device_fh;
1300 			tx_total = dev_statistics[device_fh].tx_total;
1301 			tx = dev_statistics[device_fh].tx;
1302 			tx_dropped = tx_total - tx;
1303 			rx_total = rte_atomic64_read(
1304 				&dev_statistics[device_fh].rx_total_atomic);
1305 			rx = rte_atomic64_read(
1306 				&dev_statistics[device_fh].rx_atomic);
1307 			rx_dropped = rx_total - rx;
1308 
1309 			printf("\nStatistics for device %"PRIu32" ------------------------------"
1310 					"\nTX total: 		%"PRIu64""
1311 					"\nTX dropped: 		%"PRIu64""
1312 					"\nTX successful: 		%"PRIu64""
1313 					"\nRX total: 		%"PRIu64""
1314 					"\nRX dropped: 		%"PRIu64""
1315 					"\nRX successful: 		%"PRIu64"",
1316 					device_fh,
1317 					tx_total,
1318 					tx_dropped,
1319 					tx,
1320 					rx_total,
1321 					rx_dropped,
1322 					rx);
1323 		}
1324 		printf("\n======================================================\n");
1325 	}
1326 }
1327 
1328 /* When we receive an INT signal, unregister the vhost driver */
1329 static void
1330 sigint_handler(__rte_unused int signum)
1331 {
1332 	/* Unregister vhost driver. */
1333 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
1334 	if (ret != 0)
1335 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
1336 	exit(0);
1337 }
1338 
1339 /*
1340  * Main function, does initialisation and calls the per-lcore functions. The CUSE
1341  * device is also registered here to handle the IOCTLs.
1342  */
1343 int
1344 main(int argc, char *argv[])
1345 {
1346 	unsigned lcore_id, core_id = 0;
1347 	unsigned nb_ports, valid_num_ports;
1348 	int ret;
1349 	uint8_t portid;
1350 	static pthread_t tid;
1351 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1352 
1353 	signal(SIGINT, sigint_handler);
1354 
1355 	/* init EAL */
1356 	ret = rte_eal_init(argc, argv);
1357 	if (ret < 0)
1358 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1359 	argc -= ret;
1360 	argv += ret;
1361 
1362 	/* parse app arguments */
1363 	ret = us_vhost_parse_args(argc, argv);
1364 	if (ret < 0)
1365 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1366 
1367 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) {
1368 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1369 
1370 		if (rte_lcore_is_enabled(lcore_id))
1371 			lcore_ids[core_id ++] = lcore_id;
1372 	}

1373 	if (rte_lcore_count() > RTE_MAX_LCORE)
1374 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1375 
1376 	/*set the number of switching cores available*/
1377 	num_switching_cores = rte_lcore_count()-1;
1378 
1379 	/* Get the number of physical ports. */
1380 	nb_ports = rte_eth_dev_count();
1381 	if (nb_ports > RTE_MAX_ETHPORTS)
1382 		nb_ports = RTE_MAX_ETHPORTS;
1383 
1384 	/*
1385 	 * Update the global variable num_ports and the global array ports,
1386 	 * and get the number of valid ports according to the system port count.
1387 	 */
1388 	valid_num_ports = check_ports_num(nb_ports);
1389 
1390 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1391 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1392 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1393 		return -1;
1394 	}
1395 
1396 	/* Create the mbuf pool. */
1397 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
1398 		NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
1399 		0, MBUF_DATA_SIZE, rte_socket_id());
1400 	if (mbuf_pool == NULL)
1401 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1402 
1403 	if (vm2vm_mode == VM2VM_HARDWARE) {
1404 		/* Enable VT loop back to let L2 switch to do it. */
1405 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1406 		RTE_LOG(DEBUG, VHOST_CONFIG,
1407 			"Enable loop back for L2 switch in vmdq.\n");
1408 	}
1409 
1410 	/* initialize all ports */
1411 	for (portid = 0; portid < nb_ports; portid++) {
1412 		/* skip ports that are not enabled */
1413 		if ((enabled_port_mask & (1 << portid)) == 0) {
1414 			RTE_LOG(INFO, VHOST_PORT,
1415 				"Skipping disabled port %d\n", portid);
1416 			continue;
1417 		}
1418 		if (port_init(portid) != 0)
1419 			rte_exit(EXIT_FAILURE,
1420 				"Cannot initialize network ports\n");
1421 	}
1422 
1423 	/* Initialize device stats */
1424 	memset(&dev_statistics, 0, sizeof(dev_statistics));
1425 
1426 	/* Enable stats if the user option is set. */
1427 	if (enable_stats) {
1428 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1429 		if (ret != 0)
1430 			rte_exit(EXIT_FAILURE,
1431 				"Cannot create print-stats thread\n");
1432 
1433 		/* Set thread_name for aid in debugging.  */
1434 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1435 		ret = rte_thread_setname(tid, thread_name);
1436 		if (ret != 0)
1437 			RTE_LOG(ERR, VHOST_CONFIG,
1438 				"Cannot set print-stats name\n");
1439 	}
1440 
1441 	/* Launch all data cores. */
1442 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1443 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1444 
1445 	if (mergeable == 0)
1446 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
1447 
1448 	/* Register the vhost (CUSE or user) driver to handle vhost messages. */
1449 	ret = rte_vhost_driver_register((char *)&dev_basename);
1450 	if (ret != 0)
1451 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
1452 
1453 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
1454 
1455 	/* Start CUSE session. */
1456 	rte_vhost_driver_session_start();
1457 	return 0;
1458 
1459 }
1460