xref: /dpdk/examples/vhost/main.c (revision 9c5ef51207c62d3a2edb01cbd2a81ea61f5d37bd)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 
56 #include "main.h"
57 
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61 
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64 
65 /*
66  * Calculate the number of buffers needed per port
67  */
68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +		\
69 							(num_switching_cores*MAX_PKT_BURST) +  			\
70 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
71 							((num_switching_cores+1)*MBUF_CACHE_SIZE))
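/*
 * The per-port mbuf budget above accounts for, in order: one mbuf per RX
 * descriptor of every queue, one burst of packets in flight per switching
 * core, one TX descriptor ring's worth per switching core, and mempool
 * cache headroom for each switching core plus the master core.
 */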
72 
73 #define MBUF_CACHE_SIZE	128
74 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
75 
76 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
77 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
78 
79 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
80 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
81 
82 #define JUMBO_FRAME_MAX_SIZE    0x2600
83 
84 /* State of virtio device. */
85 #define DEVICE_MAC_LEARNING 0
86 #define DEVICE_RX			1
87 #define DEVICE_SAFE_REMOVE	2
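/*
 * Device lifecycle: a new device starts in DEVICE_MAC_LEARNING until its MAC
 * is learned from its first TX packet (link_vmdq()), then moves to DEVICE_RX;
 * on removal it is drained (unlink_vmdq()) and marked DEVICE_SAFE_REMOVE so
 * that destroy_device() can free it.
 */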
88 
89 /* Configurable number of RX/TX ring descriptors */
90 #define RTE_TEST_RX_DESC_DEFAULT 1024
91 #define RTE_TEST_TX_DESC_DEFAULT 512
92 
93 #define INVALID_PORT_ID 0xFF
94 
95 /* Max number of devices. Limited by vmdq. */
96 #define MAX_DEVICES 64
97 
98 /* Size of buffers used for snprintfs. */
99 #define MAX_PRINT_BUFF 6072
100 
101 /* Maximum character device basename size. */
102 #define MAX_BASENAME_SZ 10
103 
104 /* Maximum long option length for option parsing. */
105 #define MAX_LONG_OPT_SZ 64
106 
107 /* mask of enabled ports */
108 static uint32_t enabled_port_mask = 0;
109 
110 /* Promiscuous mode */
111 static uint32_t promiscuous;
112 
113 /*Number of switching cores enabled*/
114 static uint32_t num_switching_cores = 0;
115 
116 /* number of devices/queues to support*/
117 static uint32_t num_queues = 0;
118 static uint32_t num_devices;
119 
120 static struct rte_mempool *mbuf_pool;
121 static int mergeable;
122 
123 /* Do VLAN strip on host, enabled by default */
124 static uint32_t vlan_strip = 1;
125 
126 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
127 typedef enum {
128 	VM2VM_DISABLED = 0,
129 	VM2VM_SOFTWARE = 1,
130 	VM2VM_HARDWARE = 2,
131 	VM2VM_LAST
132 } vm2vm_type;
133 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
134 
135 /* Enable stats. */
136 static uint32_t enable_stats = 0;
137 /* Enable retries on RX. */
138 static uint32_t enable_retry = 1;
139 
140 /* Disable TX checksum offload */
141 static uint32_t enable_tx_csum;
142 
143 /* Disable TSO offload */
144 static uint32_t enable_tso;
145 
146 /* Specify timeout (in microseconds) between retries on RX. */
147 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
148 /* Specify the number of retries on RX. */
149 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
150 
151 /* Character device basename. Can be set by user. */
152 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
153 
154 /* Empty VMDQ configuration structure. Filled in programmatically. */
155 static struct rte_eth_conf vmdq_conf_default = {
156 	.rxmode = {
157 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
158 		.split_hdr_size = 0,
159 		.header_split   = 0, /**< Header Split disabled */
160 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
161 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
162 		/*
163 		 * VLAN strip is necessary for 1G NICs such as the I350;
164 		 * without it, IPv4 forwarding in the guest cannot forward
165 		 * packets from one virtio dev to another virtio dev.
166 		 */
167 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
168 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
169 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
170 	},
171 
172 	.txmode = {
173 		.mq_mode = ETH_MQ_TX_NONE,
174 	},
175 	.rx_adv_conf = {
176 		/*
177 		 * should be overridden separately in code with
178 		 * appropriate values
179 		 */
180 		.vmdq_rx_conf = {
181 			.nb_queue_pools = ETH_8_POOLS,
182 			.enable_default_pool = 0,
183 			.default_pool = 0,
184 			.nb_pool_maps = 0,
185 			.pool_map = {{0, 0},},
186 		},
187 	},
188 };
189 
190 static unsigned lcore_ids[RTE_MAX_LCORE];
191 static uint8_t ports[RTE_MAX_ETHPORTS];
192 static unsigned num_ports = 0; /**< The number of ports specified in command line */
193 static uint16_t num_pf_queues, num_vmdq_queues;
194 static uint16_t vmdq_pool_base, vmdq_queue_base;
195 static uint16_t queues_per_pool;
196 
197 const uint16_t vlan_tags[] = {
198 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
199 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
200 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
201 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
202 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
203 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
204 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
205 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
206 };
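/*
 * One VLAN tag per VMDQ pool; a guest device is assigned the tag indexed by
 * its device_fh (see link_vmdq() and virtio_tx_route()).
 */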
207 
208 /* ethernet addresses of ports */
209 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
210 
211 static struct vhost_dev_tailq_list vhost_dev_list =
212 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
213 
214 static struct lcore_info lcore_info[RTE_MAX_LCORE];
215 
216 /* Used for queueing bursts of TX packets. */
217 struct mbuf_table {
218 	unsigned len;
219 	unsigned txq_id;
220 	struct rte_mbuf *m_table[MAX_PKT_BURST];
221 };
222 
223 /* TX queue for each data core. */
224 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
225 
226 #define VLAN_HLEN       4
227 
228 /* Per-device statistics struct */
229 struct device_statistics {
230 	uint64_t tx_total;
231 	rte_atomic64_t rx_total_atomic;
232 	uint64_t tx;
233 	rte_atomic64_t rx_atomic;
234 } __rte_cache_aligned;
235 struct device_statistics dev_statistics[MAX_DEVICES];
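/*
 * The RX counters are atomic because a device's RX statistics may be updated
 * by whichever core handles the transmitting device (see virtio_xmit()),
 * while the TX counters are only touched by the core that owns the device.
 */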
236 
237 /*
238  * Builds up the correct configuration for VMDQ VLAN pool map
239  * according to the pool & queue limits.
240  */
241 static inline int
242 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
243 {
244 	struct rte_eth_vmdq_rx_conf conf;
245 	struct rte_eth_vmdq_rx_conf *def_conf =
246 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
247 	unsigned i;
248 
249 	memset(&conf, 0, sizeof(conf));
250 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
251 	conf.nb_pool_maps = num_devices;
252 	conf.enable_loop_back = def_conf->enable_loop_back;
253 	conf.rx_mode = def_conf->rx_mode;
254 
255 	for (i = 0; i < conf.nb_pool_maps; i++) {
256 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
257 		conf.pool_map[i].pools = (1UL << i);
258 	}
259 
260 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
261 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
262 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
263 	return 0;
264 }
265 
266 /*
267  * Validate the device number against the max pool number obtained from
268  * dev_info. If the device number is invalid, give the error message and
269  * return -1. Each device must have its own pool.
270  */
271 static inline int
272 validate_num_devices(uint32_t max_nb_devices)
273 {
274 	if (num_devices > max_nb_devices) {
275 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
276 		return -1;
277 	}
278 	return 0;
279 }
280 
281 /*
282  * Initialises a given port using global settings and with the rx buffers
283  * coming from the mbuf_pool passed as parameter
284  */
285 static inline int
286 port_init(uint8_t port)
287 {
288 	struct rte_eth_dev_info dev_info;
289 	struct rte_eth_conf port_conf;
290 	struct rte_eth_rxconf *rxconf;
291 	struct rte_eth_txconf *txconf;
292 	int16_t rx_rings, tx_rings;
293 	uint16_t rx_ring_size, tx_ring_size;
294 	int retval;
295 	uint16_t q;
296 
297 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
298 	rte_eth_dev_info_get (port, &dev_info);
299 
300 	if (dev_info.max_rx_queues > MAX_QUEUES) {
301 		rte_exit(EXIT_FAILURE,
302 			"please define MAX_QUEUES no less than %u in %s\n",
303 			dev_info.max_rx_queues, __FILE__);
304 	}
305 
306 	rxconf = &dev_info.default_rxconf;
307 	txconf = &dev_info.default_txconf;
308 	rxconf->rx_drop_en = 1;
309 
310 	/* Enable vlan offload */
311 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
312 
313 	/*configure the number of supported virtio devices based on VMDQ limits */
314 	num_devices = dev_info.max_vmdq_pools;
315 
316 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
317 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
318 	tx_rings = (uint16_t)rte_lcore_count();
319 
320 	retval = validate_num_devices(MAX_DEVICES);
321 	if (retval < 0)
322 		return retval;
323 
324 	/* Get port configuration. */
325 	retval = get_eth_conf(&port_conf, num_devices);
326 	if (retval < 0)
327 		return retval;
328 	/* NIC queues are divided into pf queues and vmdq queues.  */
329 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
330 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
331 	num_vmdq_queues = num_devices * queues_per_pool;
332 	num_queues = num_pf_queues + num_vmdq_queues;
333 	vmdq_queue_base = dev_info.vmdq_queue_base;
334 	vmdq_pool_base  = dev_info.vmdq_pool_base;
335 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
336 		num_pf_queues, num_devices, queues_per_pool);
337 
338 	if (port >= rte_eth_dev_count()) return -1;
339 
340 	if (enable_tx_csum == 0)
341 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
342 
343 	if (enable_tso == 0) {
344 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
345 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
346 	}
347 
348 	rx_rings = (uint16_t)dev_info.max_rx_queues;
349 	/* Configure ethernet device. */
350 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
351 	if (retval != 0)
352 		return retval;
353 
354 	/* Setup the queues. */
355 	for (q = 0; q < rx_rings; q ++) {
356 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
357 						rte_eth_dev_socket_id(port),
358 						rxconf,
359 						mbuf_pool);
360 		if (retval < 0)
361 			return retval;
362 	}
363 	for (q = 0; q < tx_rings; q ++) {
364 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
365 						rte_eth_dev_socket_id(port),
366 						txconf);
367 		if (retval < 0)
368 			return retval;
369 	}
370 
371 	/* Start the device. */
372 	retval  = rte_eth_dev_start(port);
373 	if (retval < 0) {
374 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
375 		return retval;
376 	}
377 
378 	if (promiscuous)
379 		rte_eth_promiscuous_enable(port);
380 
381 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
382 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
383 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
384 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
385 			(unsigned)port,
386 			vmdq_ports_eth_addr[port].addr_bytes[0],
387 			vmdq_ports_eth_addr[port].addr_bytes[1],
388 			vmdq_ports_eth_addr[port].addr_bytes[2],
389 			vmdq_ports_eth_addr[port].addr_bytes[3],
390 			vmdq_ports_eth_addr[port].addr_bytes[4],
391 			vmdq_ports_eth_addr[port].addr_bytes[5]);
392 
393 	return 0;
394 }
395 
396 /*
397  * Set character device basename.
398  */
399 static int
400 us_vhost_parse_basename(const char *q_arg)
401 {
402 	/* Reject basenames that do not fit in dev_basename (incl. the '\0'). */
403 
404 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
405 		return -1;
406 	else
407 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
408 
409 	return 0;
410 }
411 
412 /*
413  * Parse the portmask provided at run time.
414  */
415 static int
416 parse_portmask(const char *portmask)
417 {
418 	char *end = NULL;
419 	unsigned long pm;
420 
421 	errno = 0;
422 
423 	/* parse hexadecimal string */
424 	pm = strtoul(portmask, &end, 16);
425 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
426 		return -1;
427 
428 	if (pm == 0)
429 		return -1;
430 
431 	return pm;
432 
433 }
434 
435 /*
436  * Parse num options at run time.
437  */
438 static int
439 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
440 {
441 	char *end = NULL;
442 	unsigned long num;
443 
444 	errno = 0;
445 
446 	/* parse unsigned int string */
447 	num = strtoul(q_arg, &end, 10);
448 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
449 		return -1;
450 
451 	if (num > max_valid_value)
452 		return -1;
453 
454 	return num;
455 
456 }
457 
458 /*
459  * Display usage
460  */
461 static void
462 us_vhost_usage(const char *prgname)
463 {
464 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
465 	"		--vm2vm [0|1|2]\n"
466 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
467 	"		--dev-basename <name>\n"
468 	"		--nb-devices ND\n"
469 	"		-p PORTMASK: Set mask for ports to be used by application\n"
470 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
471 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retries if the destination queue is full\n"
472 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if retries on rx are enabled\n"
473 	"		--rx-retry-num [0-N]: the number of retries on rx. Takes effect only if retries on rx are enabled\n"
474 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
475 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
476 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
477 	"		--dev-basename: The basename to be used for the character device.\n"
478 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
479 	"		--tso [0|1] disable/enable TCP segment offload.\n",
480 	       prgname);
481 }
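/*
 * Illustrative invocation (core mask, memory channels and port mask are
 * system dependent, and the binary name assumes the example is built as
 * vhost-switch):
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --dev-basename vhost-net --stats 2
 */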
482 
483 /*
484  * Parse the arguments given in the command line of the application.
485  */
486 static int
487 us_vhost_parse_args(int argc, char **argv)
488 {
489 	int opt, ret;
490 	int option_index;
491 	unsigned i;
492 	const char *prgname = argv[0];
493 	static struct option long_option[] = {
494 		{"vm2vm", required_argument, NULL, 0},
495 		{"rx-retry", required_argument, NULL, 0},
496 		{"rx-retry-delay", required_argument, NULL, 0},
497 		{"rx-retry-num", required_argument, NULL, 0},
498 		{"mergeable", required_argument, NULL, 0},
499 		{"vlan-strip", required_argument, NULL, 0},
500 		{"stats", required_argument, NULL, 0},
501 		{"dev-basename", required_argument, NULL, 0},
502 		{"tx-csum", required_argument, NULL, 0},
503 		{"tso", required_argument, NULL, 0},
504 		{NULL, 0, 0, 0},
505 	};
506 
507 	/* Parse command line */
508 	while ((opt = getopt_long(argc, argv, "p:P",
509 			long_option, &option_index)) != EOF) {
510 		switch (opt) {
511 		/* Portmask */
512 		case 'p':
513 			enabled_port_mask = parse_portmask(optarg);
514 			if (enabled_port_mask == 0) {
515 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
516 				us_vhost_usage(prgname);
517 				return -1;
518 			}
519 			break;
520 
521 		case 'P':
522 			promiscuous = 1;
523 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
524 				ETH_VMDQ_ACCEPT_BROADCAST |
525 				ETH_VMDQ_ACCEPT_MULTICAST;
526 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
527 
528 			break;
529 
530 		case 0:
531 			/* Enable/disable vm2vm comms. */
532 			if (!strncmp(long_option[option_index].name, "vm2vm",
533 				MAX_LONG_OPT_SZ)) {
534 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
535 				if (ret == -1) {
536 					RTE_LOG(INFO, VHOST_CONFIG,
537 						"Invalid argument for "
538 						"vm2vm [0|1|2]\n");
539 					us_vhost_usage(prgname);
540 					return -1;
541 				} else {
542 					vm2vm_mode = (vm2vm_type)ret;
543 				}
544 			}
545 
546 			/* Enable/disable retries on RX. */
547 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
548 				ret = parse_num_opt(optarg, 1);
549 				if (ret == -1) {
550 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
551 					us_vhost_usage(prgname);
552 					return -1;
553 				} else {
554 					enable_retry = ret;
555 				}
556 			}
557 
558 			/* Enable/disable TX checksum offload. */
559 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
560 				ret = parse_num_opt(optarg, 1);
561 				if (ret == -1) {
562 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
563 					us_vhost_usage(prgname);
564 					return -1;
565 				} else
566 					enable_tx_csum = ret;
567 			}
568 
569 			/* Enable/disable TSO offload. */
570 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
571 				ret = parse_num_opt(optarg, 1);
572 				if (ret == -1) {
573 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
574 					us_vhost_usage(prgname);
575 					return -1;
576 				} else
577 					enable_tso = ret;
578 			}
579 
580 			/* Specify the retry delay time (in microseconds) on RX. */
581 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
582 				ret = parse_num_opt(optarg, INT32_MAX);
583 				if (ret == -1) {
584 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
585 					us_vhost_usage(prgname);
586 					return -1;
587 				} else {
588 					burst_rx_delay_time = ret;
589 				}
590 			}
591 
592 			/* Specify the retries number on RX. */
593 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
594 				ret = parse_num_opt(optarg, INT32_MAX);
595 				if (ret == -1) {
596 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
597 					us_vhost_usage(prgname);
598 					return -1;
599 				} else {
600 					burst_rx_retry_num = ret;
601 				}
602 			}
603 
604 			/* Enable/disable RX mergeable buffers. */
605 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
606 				ret = parse_num_opt(optarg, 1);
607 				if (ret == -1) {
608 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
609 					us_vhost_usage(prgname);
610 					return -1;
611 				} else {
612 					mergeable = !!ret;
613 					if (ret) {
614 						vmdq_conf_default.rxmode.jumbo_frame = 1;
615 						vmdq_conf_default.rxmode.max_rx_pkt_len
616 							= JUMBO_FRAME_MAX_SIZE;
617 					}
618 				}
619 			}
620 
621 			/* Enable/disable RX VLAN strip on host. */
622 			if (!strncmp(long_option[option_index].name,
623 				"vlan-strip", MAX_LONG_OPT_SZ)) {
624 				ret = parse_num_opt(optarg, 1);
625 				if (ret == -1) {
626 					RTE_LOG(INFO, VHOST_CONFIG,
627 						"Invalid argument for VLAN strip [0|1]\n");
628 					us_vhost_usage(prgname);
629 					return -1;
630 				} else {
631 					vlan_strip = !!ret;
632 					vmdq_conf_default.rxmode.hw_vlan_strip =
633 						vlan_strip;
634 				}
635 			}
636 
637 			/* Enable/disable stats. */
638 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
639 				ret = parse_num_opt(optarg, INT32_MAX);
640 				if (ret == -1) {
641 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
642 					us_vhost_usage(prgname);
643 					return -1;
644 				} else {
645 					enable_stats = ret;
646 				}
647 			}
648 
649 			/* Set character device basename. */
650 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
651 				if (us_vhost_parse_basename(optarg) == -1) {
652 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
653 					us_vhost_usage(prgname);
654 					return -1;
655 				}
656 			}
657 
658 			break;
659 
660 			/* Invalid option - print options. */
661 		default:
662 			us_vhost_usage(prgname);
663 			return -1;
664 		}
665 	}
666 
667 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
668 		if (enabled_port_mask & (1 << i))
669 			ports[num_ports++] = (uint8_t)i;
670 	}
671 
672 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
673 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
674 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
675 		return -1;
676 	}
677 
678 	return 0;
679 }
680 
681 /*
682  * Update the global variable num_ports and the array ports according to the
683  * number of system ports, and return the number of valid ports.
684  */
685 static unsigned check_ports_num(unsigned nb_ports)
686 {
687 	unsigned valid_num_ports = num_ports;
688 	unsigned portid;
689 
690 	if (num_ports > nb_ports) {
691 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
692 			num_ports, nb_ports);
693 		num_ports = nb_ports;
694 	}
695 
696 	for (portid = 0; portid < num_ports; portid ++) {
697 		if (ports[portid] >= nb_ports) {
698 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
699 				ports[portid], (nb_ports - 1));
700 			ports[portid] = INVALID_PORT_ID;
701 			valid_num_ports--;
702 		}
703 	}
704 	return valid_num_ports;
705 }
706 
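/*
 * Look up a vhost device by MAC address with a linear scan of the global
 * device list; only devices that have completed MAC learning (DEVICE_RX)
 * are matched.
 */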
707 static inline struct vhost_dev *__attribute__((always_inline))
708 find_vhost_dev(struct ether_addr *mac)
709 {
710 	struct vhost_dev *vdev;
711 
712 	TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
713 		if (vdev->ready == DEVICE_RX &&
714 		    is_same_ether_addr(mac, &vdev->mac_address))
715 			return vdev;
716 	}
717 
718 	return NULL;
719 }
720 
721 /*
722  * This function learns the MAC address of the device and registers it, along with a
723  * VLAN tag, in a VMDQ pool.
724  */
725 static int
726 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
727 {
728 	struct ether_hdr *pkt_hdr;
729 	struct virtio_net *dev = vdev->dev;
730 	int i, ret;
731 
732 	/* Learn MAC address of guest device from packet */
733 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
734 
735 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
736 		RTE_LOG(ERR, VHOST_DATA,
737 			"Device (%" PRIu64 ") is using a registered MAC!\n",
738 			dev->device_fh);
739 		return -1;
740 	}
741 
742 	for (i = 0; i < ETHER_ADDR_LEN; i++)
743 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
744 
745 	/* vlan_tag currently uses the device_id. */
746 	vdev->vlan_tag = vlan_tags[dev->device_fh];
747 
748 	/* Print out VMDQ registration info. */
749 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
750 		dev->device_fh,
751 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
752 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
753 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
754 		vdev->vlan_tag);
755 
756 	/* Register the MAC address. */
757 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
758 				(uint32_t)dev->device_fh + vmdq_pool_base);
759 	if (ret)
760 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
761 					dev->device_fh);
762 
763 	/* Enable stripping of the vlan tag as we handle routing. */
764 	if (vlan_strip)
765 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
766 			(uint16_t)vdev->vmdq_rx_q, 1);
767 
768 	/* Set device as ready for RX. */
769 	vdev->ready = DEVICE_RX;
770 
771 	return 0;
772 }
773 
774 /*
775  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
776  * queue before disabling RX on the device.
777  */
778 static inline void
779 unlink_vmdq(struct vhost_dev *vdev)
780 {
781 	unsigned i = 0;
782 	unsigned rx_count;
783 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
784 
785 	if (vdev->ready == DEVICE_RX) {
786 		/*clear MAC and VLAN settings*/
787 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
788 		for (i = 0; i < 6; i++)
789 			vdev->mac_address.addr_bytes[i] = 0;
790 
791 		vdev->vlan_tag = 0;
792 
793 		/*Clear out the receive buffers*/
794 		rx_count = rte_eth_rx_burst(ports[0],
795 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
796 
797 		while (rx_count) {
798 			for (i = 0; i < rx_count; i++)
799 				rte_pktmbuf_free(pkts_burst[i]);
800 
801 			rx_count = rte_eth_rx_burst(ports[0],
802 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
803 		}
804 
805 		vdev->ready = DEVICE_MAC_LEARNING;
806 	}
807 }
808 
809 static inline void __attribute__((always_inline))
810 virtio_xmit(struct virtio_net *dst_dev, struct virtio_net *src_dev,
811 	    struct rte_mbuf *m)
812 {
813 	uint16_t ret;
814 
815 	ret = rte_vhost_enqueue_burst(dst_dev, VIRTIO_RXQ, &m, 1);
816 	if (enable_stats) {
817 		rte_atomic64_inc(&dev_statistics[dst_dev->device_fh].rx_total_atomic);
818 		rte_atomic64_add(&dev_statistics[dst_dev->device_fh].rx_atomic, ret);
819 		dev_statistics[src_dev->device_fh].tx_total++;
820 		dev_statistics[src_dev->device_fh].tx += ret;
821 	}
822 }
823 
824 /*
825  * Check if the packet destination MAC address is for a local device. If so then put
826  * the packet on that device's RX queue. If not then return.
827  */
828 static inline int __attribute__((always_inline))
829 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
830 {
831 	struct ether_hdr *pkt_hdr;
832 	struct vhost_dev *dst_vdev;
833 	uint64_t fh;
834 
835 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
836 
837 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
838 	if (!dst_vdev)
839 		return -1;
840 
841 	fh = dst_vdev->dev->device_fh;
842 	if (fh == vdev->dev->device_fh) {
843 		RTE_LOG(DEBUG, VHOST_DATA,
844 			"(%" PRIu64 ") TX: src and dst MAC is same. "
845 			"Dropping packet.\n", fh);
846 		return 0;
847 	}
848 
849 	RTE_LOG(DEBUG, VHOST_DATA,
850 		"(%" PRIu64 ") TX: MAC address is local\n", fh);
851 
852 	if (unlikely(dst_vdev->remove)) {
853 		RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
854 			"Device is marked for removal\n", fh);
855 		return 0;
856 	}
857 
858 	virtio_xmit(dst_vdev->dev, vdev->dev, m);
859 	return 0;
860 }
861 
862 /*
863  * Check if the destination MAC of a packet belongs to a local VM,
864  * and if so get its VLAN tag and the length offset.
865  */
866 static inline int __attribute__((always_inline))
867 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
868 	uint32_t *offset, uint16_t *vlan_tag)
869 {
870 	struct vhost_dev *dst_vdev;
871 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
872 
873 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
874 	if (!dst_vdev)
875 		return 0;
876 
877 	if (dst_vdev->dev->device_fh == dev->device_fh) {
878 		RTE_LOG(DEBUG, VHOST_DATA,
879 			"(%" PRIu64 ") TX: src and dst MAC is same. "
880 			" Dropping packet.\n", dst_vdev->dev->device_fh);
881 		return -1;
882 	}
883 
884 	/*
885 	 * HW VLAN strip will reduce the packet length by the
886 	 * length of the VLAN tag, so restore the packet length
887 	 * by adding it back.
888 	 */
889 	*offset  = VLAN_HLEN;
890 	*vlan_tag = vlan_tags[(uint16_t)dst_vdev->dev->device_fh];
891 
892 	RTE_LOG(DEBUG, VHOST_DATA,
893 		"(%" PRIu64 ") TX: pkt to local VM device id: (%" PRIu64 ") "
894 		"vlan tag: %u.\n",
895 		dev->device_fh, dst_vdev->dev->device_fh, *vlan_tag);
896 
897 	return 0;
898 }
899 
900 static uint16_t
901 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
902 {
903 	if (ol_flags & PKT_TX_IPV4)
904 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
905 	else /* assume ethertype == ETHER_TYPE_IPv6 */
906 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
907 }
908 
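/*
 * Prepare a TSO packet for the NIC: clear the IPv4 header checksum (to be
 * recomputed by hardware) and seed the TCP checksum with the pseudo-header
 * sum so the hardware can complete the segmentation offload.
 */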
909 static void virtio_tx_offload(struct rte_mbuf *m)
910 {
911 	void *l3_hdr;
912 	struct ipv4_hdr *ipv4_hdr = NULL;
913 	struct tcp_hdr *tcp_hdr = NULL;
914 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
915 
916 	l3_hdr = (char *)eth_hdr + m->l2_len;
917 
918 	if (m->ol_flags & PKT_TX_IPV4) {
919 		ipv4_hdr = l3_hdr;
920 		ipv4_hdr->hdr_checksum = 0;
921 		m->ol_flags |= PKT_TX_IP_CKSUM;
922 	}
923 
924 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
925 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
926 }
927 
928 /*
929  * This function routes the TX packet to the correct interface. This may be a local device
930  * or the physical port.
931  */
932 static inline void __attribute__((always_inline))
933 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
934 {
935 	struct mbuf_table *tx_q;
936 	struct rte_mbuf **m_table;
937 	unsigned len, ret, offset = 0;
938 	const uint16_t lcore_id = rte_lcore_id();
939 	struct virtio_net *dev = vdev->dev;
940 	struct ether_hdr *nh;
941 
942 
943 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
944 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
945 		struct vhost_dev *vdev2;
946 
947 		TAILQ_FOREACH(vdev2, &vhost_dev_list, next) {
948 			virtio_xmit(vdev2->dev, vdev->dev, m);
949 		}
950 		goto queue2nic;
951 	}
952 
953 	/*check if destination is local VM*/
954 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
955 		rte_pktmbuf_free(m);
956 		return;
957 	}
958 
959 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
960 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
961 			rte_pktmbuf_free(m);
962 			return;
963 		}
964 	}
965 
966 	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
967 		"MAC address is external\n", dev->device_fh);
968 
969 queue2nic:
970 
971 	/*Add packet to the port tx queue*/
972 	tx_q = &lcore_tx_queue[lcore_id];
973 	len = tx_q->len;
974 
975 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
976 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
977 		/* Guest has inserted the vlan tag. */
978 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
979 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
980 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
981 			(vh->vlan_tci != vlan_tag_be))
982 			vh->vlan_tci = vlan_tag_be;
983 	} else {
984 		m->ol_flags |= PKT_TX_VLAN_PKT;
985 
986 		/*
987 		 * Find the right seg to adjust the data len when offset is
988 		 * bigger than tail room size.
989 		 */
990 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
991 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
992 				m->data_len += offset;
993 			else {
994 				struct rte_mbuf *seg = m;
995 
996 				while ((seg->next != NULL) &&
997 					(offset > rte_pktmbuf_tailroom(seg)))
998 					seg = seg->next;
999 
1000 				seg->data_len += offset;
1001 			}
1002 			m->pkt_len += offset;
1003 		}
1004 
1005 		m->vlan_tci = vlan_tag;
1006 	}
1007 
1008 	if (m->ol_flags & PKT_TX_TCP_SEG)
1009 		virtio_tx_offload(m);
1010 
1011 	tx_q->m_table[len] = m;
1012 	len++;
1013 	if (enable_stats) {
1014 		dev_statistics[dev->device_fh].tx_total++;
1015 		dev_statistics[dev->device_fh].tx++;
1016 	}
1017 
1018 	if (unlikely(len == MAX_PKT_BURST)) {
1019 		m_table = (struct rte_mbuf **)tx_q->m_table;
1020 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1021 		/* Free any buffers not handled by TX and update the port stats. */
1022 		if (unlikely(ret < len)) {
1023 			do {
1024 				rte_pktmbuf_free(m_table[ret]);
1025 			} while (++ret < len);
1026 		}
1027 
1028 		len = 0;
1029 	}
1030 
1031 	tx_q->len = len;
1032 	return;
1033 }
1034 /*
1035  * This function is called by each data core. It handles all RX/TX registered with the
1036  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1037  * with all devices in the main linked list.
1038  */
1039 static int
1040 switch_worker(__attribute__((unused)) void *arg)
1041 {
1042 	struct virtio_net *dev = NULL;
1043 	struct vhost_dev *vdev = NULL;
1044 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1045 	struct mbuf_table *tx_q;
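	/*
	 * Convert the TX drain interval from microseconds to TSC cycles so
	 * queued packets are flushed even when a burst never fills up.
	 */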
1046 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1047 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1048 	unsigned ret, i;
1049 	const uint16_t lcore_id = rte_lcore_id();
1050 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1051 	uint16_t rx_count = 0;
1052 	uint16_t tx_count;
1053 	uint32_t retry = 0;
1054 
1055 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1056 	prev_tsc = 0;
1057 
1058 	tx_q = &lcore_tx_queue[lcore_id];
1059 	for (i = 0; i < num_cores; i ++) {
1060 		if (lcore_ids[i] == lcore_id) {
1061 			tx_q->txq_id = i;
1062 			break;
1063 		}
1064 	}
1065 
1066 	while(1) {
1067 		cur_tsc = rte_rdtsc();
1068 		/*
1069 		 * TX burst queue drain
1070 		 */
1071 		diff_tsc = cur_tsc - prev_tsc;
1072 		if (unlikely(diff_tsc > drain_tsc)) {
1073 
1074 			if (tx_q->len) {
1075 				RTE_LOG(DEBUG, VHOST_DATA,
1076 					"TX queue drained after timeout with burst size %u\n",
1077 					tx_q->len);
1078 
1079 				/*Tx any packets in the queue*/
1080 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1081 									   (struct rte_mbuf **)tx_q->m_table,
1082 									   (uint16_t)tx_q->len);
1083 				if (unlikely(ret < tx_q->len)) {
1084 					do {
1085 						rte_pktmbuf_free(tx_q->m_table[ret]);
1086 					} while (++ret < tx_q->len);
1087 				}
1088 
1089 				tx_q->len = 0;
1090 			}
1091 
1092 			prev_tsc = cur_tsc;
1093 
1094 		}
1095 
1096 		/*
1097 		 * Inform the configuration core that we have exited the
1098 		 * linked list and that no devices are in use if requested.
1099 		 */
1100 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1101 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1102 
1103 		/*
1104 		 * Process devices
1105 		 */
1106 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, next) {
1107 			uint64_t fh;
1108 
1109 			dev = vdev->dev;
1110 			fh  = dev->device_fh;
1111 
1112 			if (unlikely(vdev->remove)) {
1113 				unlink_vmdq(vdev);
1114 				vdev->ready = DEVICE_SAFE_REMOVE;
1115 				continue;
1116 			}
1117 
1118 			if (likely(vdev->ready == DEVICE_RX)) {
1119 				/*Handle guest RX*/
1120 				rx_count = rte_eth_rx_burst(ports[0],
1121 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1122 
1123 				if (rx_count) {
1124 					/*
1125 					 * If retry is enabled and the queue is full, then we wait and retry to avoid packet loss.
1126 					 * Here MAX_PKT_BURST must be less than the virtio queue size.
1127 					 */
1128 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1129 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1130 							rte_delay_us(burst_rx_delay_time);
1131 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1132 								break;
1133 						}
1134 					}
1135 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1136 					if (enable_stats) {
1137 						rte_atomic64_add(
1138 							&dev_statistics[fh].rx_total_atomic,
1139 							rx_count);
1140 						rte_atomic64_add(
1141 							&dev_statistics[fh].rx_atomic,
1142 							ret_count);
1143 					}
1144 					while (likely(rx_count)) {
1145 						rx_count--;
1146 						rte_pktmbuf_free(pkts_burst[rx_count]);
1147 					}
1148 
1149 				}
1150 			}
1151 
1152 			if (likely(!vdev->remove)) {
1153 				/* Handle guest TX*/
1154 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1155 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1156 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1157 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1158 						while (tx_count)
1159 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1160 					}
1161 				}
1162 				for (i = 0; i < tx_count; ++i) {
1163 					virtio_tx_route(vdev, pkts_burst[i],
1164 						vlan_tags[(uint16_t)dev->device_fh]);
1165 				}
1166 			}
1167 		}
1168 	}
1169 
1170 	return 0;
1171 }
1172 
1173 /*
1174  * Remove a device from the specific data core linked list and from the
1175  * main linked list. Synchronization occurs through the use of the
1176  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1177  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1178  */
1179 static void
1180 destroy_device (volatile struct virtio_net *dev)
1181 {
1182 	struct vhost_dev *vdev;
1183 	int lcore;
1184 
1185 	dev->flags &= ~VIRTIO_DEV_RUNNING;
1186 
1187 	vdev = (struct vhost_dev *)dev->priv;
1188 	/*set the remove flag. */
1189 	vdev->remove = 1;
1190 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1191 		rte_pause();
1192 	}
1193 
1194 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, next);
1195 	TAILQ_REMOVE(&vhost_dev_list, vdev, next);
1196 
1197 	/* Set the dev_removal_flag on each lcore. */
1198 	RTE_LCORE_FOREACH_SLAVE(lcore)
1199 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1200 
1201 	/*
1202 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1203 	 * we can be sure that they can no longer access the device removed
1204 	 * from the linked lists and that the devices are no longer in use.
1205 	 */
1206 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1207 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1208 			rte_pause();
1209 	}
1210 
1211 	lcore_info[vdev->coreid].device_num--;
1212 
1213 	RTE_LOG(INFO, VHOST_DATA,
1214 		"(%" PRIu64 ") Device has been removed from data core\n",
1215 		dev->device_fh);
1216 
1217 	rte_free(vdev);
1218 }
1219 
1220 /*
1221  * A new device is added to a data core. First the device is added to the main linked list
1222  * and then allocated to a specific data core.
1223  */
1224 static int
1225 new_device (struct virtio_net *dev)
1226 {
1227 	int lcore, core_add = 0;
1228 	uint32_t device_num_min = num_devices;
1229 	struct vhost_dev *vdev;
1230 
1231 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1232 	if (vdev == NULL) {
1233 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
1234 			dev->device_fh);
1235 		return -1;
1236 	}
1237 	vdev->dev = dev;
1238 	dev->priv = vdev;
1239 
1240 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, next);
1241 	vdev->vmdq_rx_q
1242 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
1243 
1244 	/*reset ready flag*/
1245 	vdev->ready = DEVICE_MAC_LEARNING;
1246 	vdev->remove = 0;
1247 
1248 	/* Find a suitable lcore to add the device. */
1249 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1250 		if (lcore_info[lcore].device_num < device_num_min) {
1251 			device_num_min = lcore_info[lcore].device_num;
1252 			core_add = lcore;
1253 		}
1254 	}
1255 	vdev->coreid = core_add;
1256 
1257 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, next);
1258 	lcore_info[vdev->coreid].device_num++;
1259 
1260 	/* Initialize device stats */
1261 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
1262 
1263 	/* Disable notifications. */
1264 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
1265 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
1266 	dev->flags |= VIRTIO_DEV_RUNNING;
1267 
1268 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
1269 
1270 	return 0;
1271 }
1272 
1273 /*
1274  * These callbacks allow devices to be added to the data core when configuration
1275  * has been fully completed.
1276  */
1277 static const struct virtio_net_device_ops virtio_net_device_ops =
1278 {
1279 	.new_device =  new_device,
1280 	.destroy_device = destroy_device,
1281 };
1282 
1283 /*
1284  * This is a thread that wakes up periodically to print stats if the user has
1285  * enabled them.
1286  */
1287 static void
1288 print_stats(void)
1289 {
1290 	struct vhost_dev *vdev;
1291 	uint64_t tx_dropped, rx_dropped;
1292 	uint64_t tx, tx_total, rx, rx_total;
1293 	uint32_t device_fh;
1294 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1295 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1296 
1297 	while(1) {
1298 		sleep(enable_stats);
1299 
1300 		/* Clear screen and move to top left */
1301 		printf("%s%s", clr, top_left);
1302 
1303 		printf("\nDevice statistics ====================================");
1304 
1305 		TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
1306 			device_fh = vdev->dev->device_fh;
1307 			tx_total = dev_statistics[device_fh].tx_total;
1308 			tx = dev_statistics[device_fh].tx;
1309 			tx_dropped = tx_total - tx;
1310 			rx_total = rte_atomic64_read(
1311 				&dev_statistics[device_fh].rx_total_atomic);
1312 			rx = rte_atomic64_read(
1313 				&dev_statistics[device_fh].rx_atomic);
1314 			rx_dropped = rx_total - rx;
1315 
1316 			printf("\nStatistics for device %"PRIu32" ------------------------------"
1317 					"\nTX total: 		%"PRIu64""
1318 					"\nTX dropped: 		%"PRIu64""
1319 					"\nTX successful: 		%"PRIu64""
1320 					"\nRX total: 		%"PRIu64""
1321 					"\nRX dropped: 		%"PRIu64""
1322 					"\nRX successful: 		%"PRIu64"",
1323 					device_fh,
1324 					tx_total,
1325 					tx_dropped,
1326 					tx,
1327 					rx_total,
1328 					rx_dropped,
1329 					rx);
1330 		}
1331 		printf("\n======================================================\n");
1332 	}
1333 }
1334 
1335 /* When we receive a SIGINT, unregister the vhost driver */
1336 static void
1337 sigint_handler(__rte_unused int signum)
1338 {
1339 	/* Unregister vhost driver. */
1340 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
1341 	if (ret != 0)
1342 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
1343 	exit(0);
1344 }
1345 
1346 /*
1347  * Main function, does initialisation and calls the per-lcore functions. The CUSE
1348  * device is also registered here to handle the IOCTLs.
1349  */
1350 int
1351 main(int argc, char *argv[])
1352 {
1353 	unsigned lcore_id, core_id = 0;
1354 	unsigned nb_ports, valid_num_ports;
1355 	int ret;
1356 	uint8_t portid;
1357 	static pthread_t tid;
1358 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1359 
1360 	signal(SIGINT, sigint_handler);
1361 
1362 	/* init EAL */
1363 	ret = rte_eal_init(argc, argv);
1364 	if (ret < 0)
1365 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1366 	argc -= ret;
1367 	argv += ret;
1368 
1369 	/* parse app arguments */
1370 	ret = us_vhost_parse_args(argc, argv);
1371 	if (ret < 0)
1372 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1373 
1374 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1375 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1376 		if (rte_lcore_is_enabled(lcore_id))
1377 			lcore_ids[core_id++] = lcore_id;
1378 	}
1379 
1380 	if (rte_lcore_count() > RTE_MAX_LCORE)
1381 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1382 
1383 	/* Set the number of switching cores available */
1384 	num_switching_cores = rte_lcore_count()-1;
1385 
1386 	/* Get the number of physical ports. */
1387 	nb_ports = rte_eth_dev_count();
1388 	if (nb_ports > RTE_MAX_ETHPORTS)
1389 		nb_ports = RTE_MAX_ETHPORTS;
1390 
1391 	/*
1392 	 * Update the global variable num_ports and the global array ports,
1393 	 * and get the number of valid ports according to the system port count.
1394 	 */
1395 	valid_num_ports = check_ports_num(nb_ports);
1396 
1397 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1398 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1399 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1400 		return -1;
1401 	}
1402 
1403 	/* Create the mbuf pool. */
1404 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
1405 		NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
1406 		0, MBUF_DATA_SIZE, rte_socket_id());
1407 	if (mbuf_pool == NULL)
1408 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1409 
1410 	if (vm2vm_mode == VM2VM_HARDWARE) {
1411 		/* Enable VT loop back so the hardware L2 switch handles VM2VM traffic. */
1412 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1413 		RTE_LOG(DEBUG, VHOST_CONFIG,
1414 			"Enable loop back for L2 switch in vmdq.\n");
1415 	}
1416 
1417 	/* initialize all ports */
1418 	for (portid = 0; portid < nb_ports; portid++) {
1419 		/* skip ports that are not enabled */
1420 		if ((enabled_port_mask & (1 << portid)) == 0) {
1421 			RTE_LOG(INFO, VHOST_PORT,
1422 				"Skipping disabled port %d\n", portid);
1423 			continue;
1424 		}
1425 		if (port_init(portid) != 0)
1426 			rte_exit(EXIT_FAILURE,
1427 				"Cannot initialize network ports\n");
1428 	}
1429 
1430 	/* Initialize device stats */
1431 	memset(&dev_statistics, 0, sizeof(dev_statistics));
1432 
1433 	/* Enable stats if the user option is set. */
1434 	if (enable_stats) {
1435 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1436 		if (ret != 0)
1437 			rte_exit(EXIT_FAILURE,
1438 				"Cannot create print-stats thread\n");
1439 
1440 		/* Set thread_name for aid in debugging.  */
1441 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1442 		ret = rte_thread_setname(tid, thread_name);
1443 		if (ret != 0)
1444 			RTE_LOG(ERR, VHOST_CONFIG,
1445 				"Cannot set print-stats name\n");
1446 	}
1447 
1448 	/* Launch all data cores. */
1449 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1450 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1451 
1452 	if (mergeable == 0)
1453 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
1454 
1455 	/* Register vhost(cuse or user) driver to handle vhost messages. */
1456 	ret = rte_vhost_driver_register((char *)&dev_basename);
1457 	if (ret != 0)
1458 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
1459 
1460 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
1461 
1462 	/* Start CUSE session. */
1463 	rte_vhost_driver_session_start();
1464 	return 0;
1465 
1466 }
1467