xref: /dpdk/examples/vhost/main.c (revision 273ecdbc06a2fc09b0ac58b77b918a3f677c47a6)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 
56 #include "main.h"
57 
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61 
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64 
65 #define MBUF_CACHE_SIZE	128
66 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
67 
68 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
69 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
70 
71 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
72 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
73 
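/* Max RX frame length used when mergeable buffers are enabled: 0x2600 = 9728 bytes. */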
74 #define JUMBO_FRAME_MAX_SIZE    0x2600
75 
76 /* State of virtio device. */
77 #define DEVICE_MAC_LEARNING 0
78 #define DEVICE_RX			1
79 #define DEVICE_SAFE_REMOVE	2
80 
81 /* Configurable number of RX/TX ring descriptors */
82 #define RTE_TEST_RX_DESC_DEFAULT 1024
83 #define RTE_TEST_TX_DESC_DEFAULT 512
84 
85 #define INVALID_PORT_ID 0xFF
86 
87 /* Max number of devices. Limited by vmdq. */
88 #define MAX_DEVICES 64
89 
90 /* Size of buffers used for snprintfs. */
91 #define MAX_PRINT_BUFF 6072
92 
93 /* Maximum character device basename size. */
94 #define MAX_BASENAME_SZ 10
95 
96 /* Maximum long option length for option parsing. */
97 #define MAX_LONG_OPT_SZ 64
98 
99 /* mask of enabled ports */
100 static uint32_t enabled_port_mask = 0;
101 
102 /* Promiscuous mode */
103 static uint32_t promiscuous;
104 
105 /* number of devices/queues to support*/
106 static uint32_t num_queues = 0;
107 static uint32_t num_devices;
108 
109 static struct rte_mempool *mbuf_pool;
110 static int mergeable;
111 
112 /* Do VLAN strip on host, enabled by default */
113 static uint32_t vlan_strip = 1;
114 
115 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
116 typedef enum {
117 	VM2VM_DISABLED = 0,
118 	VM2VM_SOFTWARE = 1,
119 	VM2VM_HARDWARE = 2,
120 	VM2VM_LAST
121 } vm2vm_type;
122 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
123 
124 /* Enable stats. */
125 static uint32_t enable_stats = 0;
126 /* Enable retries on RX. */
127 static uint32_t enable_retry = 1;
128 
129 /* Disable TX checksum offload */
130 static uint32_t enable_tx_csum;
131 
132 /* Disable TSO offload */
133 static uint32_t enable_tso;
134 
135 /* Specify timeout (in microseconds) between retries on RX. */
136 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
137 /* Specify the number of retries on RX. */
138 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
139 
140 /* Character device basename. Can be set by user. */
141 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
142 
143 /* Empty VMDQ configuration structure. Filled in programmatically. */
144 static struct rte_eth_conf vmdq_conf_default = {
145 	.rxmode = {
146 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
147 		.split_hdr_size = 0,
148 		.header_split   = 0, /**< Header Split disabled */
149 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
150 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
151 		/*
152 		 * This is necessary for 1G NICs such as the I350; it fixes a bug
153 		 * where IPv4 forwarding in the guest cannot forward packets from
154 		 * one virtio dev to another virtio dev.
155 		 */
156 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
157 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
158 		.hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
159 	},
160 
161 	.txmode = {
162 		.mq_mode = ETH_MQ_TX_NONE,
163 	},
164 	.rx_adv_conf = {
165 		/*
166 		 * should be overridden separately in code with
167 		 * appropriate values
168 		 */
169 		.vmdq_rx_conf = {
170 			.nb_queue_pools = ETH_8_POOLS,
171 			.enable_default_pool = 0,
172 			.default_pool = 0,
173 			.nb_pool_maps = 0,
174 			.pool_map = {{0, 0},},
175 		},
176 	},
177 };
178 
179 static unsigned lcore_ids[RTE_MAX_LCORE];
180 static uint8_t ports[RTE_MAX_ETHPORTS];
181 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
182 static uint16_t num_pf_queues, num_vmdq_queues;
183 static uint16_t vmdq_pool_base, vmdq_queue_base;
184 static uint16_t queues_per_pool;
185 
186 const uint16_t vlan_tags[] = {
187 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
188 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
189 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
190 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
191 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
192 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
193 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
194 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
195 };
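/*
 * Note: device/pool N is assigned vlan_tags[N] = 1000 + N. The same tag is
 * used both for the VMDQ pool mapping in get_eth_conf() and as the VLAN tag
 * inserted when that device's traffic is forwarded to the physical port.
 */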
196 
197 /* ethernet addresses of ports */
198 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
199 
200 static struct vhost_dev_tailq_list vhost_dev_list =
201 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
202 
203 static struct lcore_info lcore_info[RTE_MAX_LCORE];
204 
205 /* Used for queueing bursts of TX packets. */
206 struct mbuf_table {
207 	unsigned len;
208 	unsigned txq_id;
209 	struct rte_mbuf *m_table[MAX_PKT_BURST];
210 };
211 
212 /* TX queue for each data core. */
213 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
214 
215 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
216 				 / US_PER_S * BURST_TX_DRAIN_US)
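/*
 * Example (illustrative): with a 2 GHz TSC, MBUF_TABLE_DRAIN_TSC evaluates to
 * ceil(2e9 / 1e6) * 100 = 200000 cycles, i.e. roughly 100us between forced
 * drains of a partially filled TX burst table.
 */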
217 #define VLAN_HLEN       4
218 
219 /* Per-device statistics struct */
220 struct device_statistics {
221 	uint64_t tx_total;
222 	rte_atomic64_t rx_total_atomic;
223 	uint64_t tx;
224 	rte_atomic64_t rx_atomic;
225 } __rte_cache_aligned;
226 struct device_statistics dev_statistics[MAX_DEVICES];
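/*
 * The RX counters are atomic because, with VM2VM software switching, any
 * worker core may enqueue into another core's device and bump its RX stats;
 * the TX counters are only updated by the core that owns the source device.
 */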
227 
228 /*
229  * Builds up the correct configuration for VMDQ VLAN pool map
230  * according to the pool & queue limits.
231  */
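/*
 * For example (illustrative): with num_devices = 8, the resulting map is
 * VLAN 1000 -> pool 0, VLAN 1001 -> pool 1, ..., VLAN 1007 -> pool 7,
 * i.e. one dedicated VMDQ pool per virtio device.
 */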
232 static inline int
233 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
234 {
235 	struct rte_eth_vmdq_rx_conf conf;
236 	struct rte_eth_vmdq_rx_conf *def_conf =
237 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
238 	unsigned i;
239 
240 	memset(&conf, 0, sizeof(conf));
241 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
242 	conf.nb_pool_maps = num_devices;
243 	conf.enable_loop_back = def_conf->enable_loop_back;
244 	conf.rx_mode = def_conf->rx_mode;
245 
246 	for (i = 0; i < conf.nb_pool_maps; i++) {
247 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
248 		conf.pool_map[i].pools = (1UL << i);
249 	}
250 
251 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
252 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
253 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
254 	return 0;
255 }
256 
257 /*
258  * Validate the device number against the max pool number obtained from
259  * dev_info. If the device number is invalid, log an error message and
260  * return -1. Each device must have its own pool.
261  */
262 static inline int
263 validate_num_devices(uint32_t max_nb_devices)
264 {
265 	if (num_devices > max_nb_devices) {
266 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
267 		return -1;
268 	}
269 	return 0;
270 }
271 
272 /*
273  * Initialises a given port using global settings and with the rx buffers
274  * coming from the mbuf_pool passed as parameter
275  */
276 static inline int
277 port_init(uint8_t port)
278 {
279 	struct rte_eth_dev_info dev_info;
280 	struct rte_eth_conf port_conf;
281 	struct rte_eth_rxconf *rxconf;
282 	struct rte_eth_txconf *txconf;
283 	int16_t rx_rings, tx_rings;
284 	uint16_t rx_ring_size, tx_ring_size;
285 	int retval;
286 	uint16_t q;
287 
288 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
289 	rte_eth_dev_info_get (port, &dev_info);
290 
291 	if (dev_info.max_rx_queues > MAX_QUEUES) {
292 		rte_exit(EXIT_FAILURE,
293 			"please define MAX_QUEUES no less than %u in %s\n",
294 			dev_info.max_rx_queues, __FILE__);
295 	}
296 
297 	rxconf = &dev_info.default_rxconf;
298 	txconf = &dev_info.default_txconf;
299 	rxconf->rx_drop_en = 1;
300 
301 	/* Enable vlan offload */
302 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
303 
304 	/*configure the number of supported virtio devices based on VMDQ limits */
305 	num_devices = dev_info.max_vmdq_pools;
306 
307 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
308 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
309 	tx_rings = (uint16_t)rte_lcore_count();
310 
311 	retval = validate_num_devices(MAX_DEVICES);
312 	if (retval < 0)
313 		return retval;
314 
315 	/* Get port configuration. */
316 	retval = get_eth_conf(&port_conf, num_devices);
317 	if (retval < 0)
318 		return retval;
319 	/* NIC queues are divided into pf queues and vmdq queues.  */
320 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
321 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
322 	num_vmdq_queues = num_devices * queues_per_pool;
323 	num_queues = num_pf_queues + num_vmdq_queues;
324 	vmdq_queue_base = dev_info.vmdq_queue_base;
325 	vmdq_pool_base  = dev_info.vmdq_pool_base;
326 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
327 		num_pf_queues, num_devices, queues_per_pool);
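	/*
	 * Illustrative example (actual values depend on the NIC): if dev_info
	 * reports 128 RX queues, 128 VMDQ queues and 64 VMDQ pools, then
	 * queues_per_pool = 2, num_vmdq_queues = 128 and num_pf_queues = 0.
	 */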
328 
329 	if (port >= rte_eth_dev_count()) return -1;
330 
331 	if (enable_tx_csum == 0)
332 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
333 
334 	if (enable_tso == 0) {
335 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
336 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
337 	}
338 
339 	rx_rings = (uint16_t)dev_info.max_rx_queues;
340 	/* Configure ethernet device. */
341 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
342 	if (retval != 0)
343 		return retval;
344 
345 	/* Setup the queues. */
346 	for (q = 0; q < rx_rings; q ++) {
347 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
348 						rte_eth_dev_socket_id(port),
349 						rxconf,
350 						mbuf_pool);
351 		if (retval < 0)
352 			return retval;
353 	}
354 	for (q = 0; q < tx_rings; q ++) {
355 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
356 						rte_eth_dev_socket_id(port),
357 						txconf);
358 		if (retval < 0)
359 			return retval;
360 	}
361 
362 	/* Start the device. */
363 	retval  = rte_eth_dev_start(port);
364 	if (retval < 0) {
365 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
366 		return retval;
367 	}
368 
369 	if (promiscuous)
370 		rte_eth_promiscuous_enable(port);
371 
372 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
373 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
374 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
375 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
376 			(unsigned)port,
377 			vmdq_ports_eth_addr[port].addr_bytes[0],
378 			vmdq_ports_eth_addr[port].addr_bytes[1],
379 			vmdq_ports_eth_addr[port].addr_bytes[2],
380 			vmdq_ports_eth_addr[port].addr_bytes[3],
381 			vmdq_ports_eth_addr[port].addr_bytes[4],
382 			vmdq_ports_eth_addr[port].addr_bytes[5]);
383 
384 	return 0;
385 }
386 
387 /*
388  * Set character device basename.
389  */
390 static int
391 us_vhost_parse_basename(const char *q_arg)
392 {
393 	/* Copy the basename, rejecting names that do not fit in the buffer. */
394 
395 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
396 		return -1;
397 	else
398 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
399 
400 	return 0;
401 }
402 
403 /*
404  * Parse the portmask provided at run time.
405  */
406 static int
407 parse_portmask(const char *portmask)
408 {
409 	char *end = NULL;
410 	unsigned long pm;
411 
412 	errno = 0;
413 
414 	/* parse hexadecimal string */
415 	pm = strtoul(portmask, &end, 16);
416 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
417 		return -1;
418 
419 	if (pm == 0)
420 		return -1;
421 
422 	return pm;
423 
424 }
425 
426 /*
427  * Parse num options at run time.
428  */
429 static int
430 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
431 {
432 	char *end = NULL;
433 	unsigned long num;
434 
435 	errno = 0;
436 
437 	/* parse unsigned int string */
438 	num = strtoul(q_arg, &end, 10);
439 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
440 		return -1;
441 
442 	if (num > max_valid_value)
443 		return -1;
444 
445 	return num;
446 
447 }
448 
449 /*
450  * Display usage
451  */
452 static void
453 us_vhost_usage(const char *prgname)
454 {
455 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
456 	"		--vm2vm [0|1|2]\n"
457 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
458 	"		--dev-basename <name>\n"
459 	"		--nb-devices ND\n"
460 	"		-p PORTMASK: Set mask for ports to be used by application\n"
461 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
462 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
463 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if rx retries are enabled\n"
464 	"		--rx-retry-num [0-N]: the number of retries on rx. Takes effect only if rx retries are enabled\n"
465 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
466 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
467 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
468 	"		--dev-basename: The basename to be used for the character device.\n"
469 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
470 	"		--tso [0|1] disable/enable TCP segmentation offload (TSO).\n",
471 	       prgname);
472 }
473 
474 /*
475  * Parse the arguments given in the command line of the application.
476  */
477 static int
478 us_vhost_parse_args(int argc, char **argv)
479 {
480 	int opt, ret;
481 	int option_index;
482 	unsigned i;
483 	const char *prgname = argv[0];
484 	static struct option long_option[] = {
485 		{"vm2vm", required_argument, NULL, 0},
486 		{"rx-retry", required_argument, NULL, 0},
487 		{"rx-retry-delay", required_argument, NULL, 0},
488 		{"rx-retry-num", required_argument, NULL, 0},
489 		{"mergeable", required_argument, NULL, 0},
490 		{"vlan-strip", required_argument, NULL, 0},
491 		{"stats", required_argument, NULL, 0},
492 		{"dev-basename", required_argument, NULL, 0},
493 		{"tx-csum", required_argument, NULL, 0},
494 		{"tso", required_argument, NULL, 0},
495 		{NULL, 0, 0, 0},
496 	};
497 
498 	/* Parse command line */
499 	while ((opt = getopt_long(argc, argv, "p:P",
500 			long_option, &option_index)) != EOF) {
501 		switch (opt) {
502 		/* Portmask */
503 		case 'p':
504 			enabled_port_mask = parse_portmask(optarg);
505 			if (enabled_port_mask == 0) {
506 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
507 				us_vhost_usage(prgname);
508 				return -1;
509 			}
510 			break;
511 
512 		case 'P':
513 			promiscuous = 1;
514 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
515 				ETH_VMDQ_ACCEPT_BROADCAST |
516 				ETH_VMDQ_ACCEPT_MULTICAST;
517 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
518 
519 			break;
520 
521 		case 0:
522 			/* Enable/disable vm2vm comms. */
523 			if (!strncmp(long_option[option_index].name, "vm2vm",
524 				MAX_LONG_OPT_SZ)) {
525 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
526 				if (ret == -1) {
527 					RTE_LOG(INFO, VHOST_CONFIG,
528 						"Invalid argument for "
529 						"vm2vm [0|1|2]\n");
530 					us_vhost_usage(prgname);
531 					return -1;
532 				} else {
533 					vm2vm_mode = (vm2vm_type)ret;
534 				}
535 			}
536 
537 			/* Enable/disable retries on RX. */
538 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
539 				ret = parse_num_opt(optarg, 1);
540 				if (ret == -1) {
541 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
542 					us_vhost_usage(prgname);
543 					return -1;
544 				} else {
545 					enable_retry = ret;
546 				}
547 			}
548 
549 			/* Enable/disable TX checksum offload. */
550 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
551 				ret = parse_num_opt(optarg, 1);
552 				if (ret == -1) {
553 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
554 					us_vhost_usage(prgname);
555 					return -1;
556 				} else
557 					enable_tx_csum = ret;
558 			}
559 
560 			/* Enable/disable TSO offload. */
561 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
562 				ret = parse_num_opt(optarg, 1);
563 				if (ret == -1) {
564 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
565 					us_vhost_usage(prgname);
566 					return -1;
567 				} else
568 					enable_tso = ret;
569 			}
570 
571 			/* Specify the retry delay time (in microseconds) on RX. */
572 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
573 				ret = parse_num_opt(optarg, INT32_MAX);
574 				if (ret == -1) {
575 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
576 					us_vhost_usage(prgname);
577 					return -1;
578 				} else {
579 					burst_rx_delay_time = ret;
580 				}
581 			}
582 
583 			/* Specify the number of retries on RX. */
584 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
585 				ret = parse_num_opt(optarg, INT32_MAX);
586 				if (ret == -1) {
587 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
588 					us_vhost_usage(prgname);
589 					return -1;
590 				} else {
591 					burst_rx_retry_num = ret;
592 				}
593 			}
594 
595 			/* Enable/disable RX mergeable buffers. */
596 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
597 				ret = parse_num_opt(optarg, 1);
598 				if (ret == -1) {
599 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
600 					us_vhost_usage(prgname);
601 					return -1;
602 				} else {
603 					mergeable = !!ret;
604 					if (ret) {
605 						vmdq_conf_default.rxmode.jumbo_frame = 1;
606 						vmdq_conf_default.rxmode.max_rx_pkt_len
607 							= JUMBO_FRAME_MAX_SIZE;
608 					}
609 				}
610 			}
611 
612 			/* Enable/disable RX VLAN strip on host. */
613 			if (!strncmp(long_option[option_index].name,
614 				"vlan-strip", MAX_LONG_OPT_SZ)) {
615 				ret = parse_num_opt(optarg, 1);
616 				if (ret == -1) {
617 					RTE_LOG(INFO, VHOST_CONFIG,
618 						"Invalid argument for VLAN strip [0|1]\n");
619 					us_vhost_usage(prgname);
620 					return -1;
621 				} else {
622 					vlan_strip = !!ret;
623 					vmdq_conf_default.rxmode.hw_vlan_strip =
624 						vlan_strip;
625 				}
626 			}
627 
628 			/* Enable/disable stats. */
629 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
630 				ret = parse_num_opt(optarg, INT32_MAX);
631 				if (ret == -1) {
632 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
633 					us_vhost_usage(prgname);
634 					return -1;
635 				} else {
636 					enable_stats = ret;
637 				}
638 			}
639 
640 			/* Set character device basename. */
641 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
642 				if (us_vhost_parse_basename(optarg) == -1) {
643 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
644 					us_vhost_usage(prgname);
645 					return -1;
646 				}
647 			}
648 
649 			break;
650 
651 			/* Invalid option - print options. */
652 		default:
653 			us_vhost_usage(prgname);
654 			return -1;
655 		}
656 	}
657 
658 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
659 		if (enabled_port_mask & (1 << i))
660 			ports[num_ports++] = (uint8_t)i;
661 	}
662 
663 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
664 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
665 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
666 		return -1;
667 	}
668 
669 	return 0;
670 }
671 
672 /*
673  * Update the global var NUM_PORTS and array PORTS according to the number of
674  * ports in the system, and return the number of valid ports.
675  */
676 static unsigned check_ports_num(unsigned nb_ports)
677 {
678 	unsigned valid_num_ports = num_ports;
679 	unsigned portid;
680 
681 	if (num_ports > nb_ports) {
682 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
683 			num_ports, nb_ports);
684 		num_ports = nb_ports;
685 	}
686 
687 	for (portid = 0; portid < num_ports; portid ++) {
688 		if (ports[portid] >= nb_ports) {
689 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
690 				ports[portid], (nb_ports - 1));
691 			ports[portid] = INVALID_PORT_ID;
692 			valid_num_ports--;
693 		}
694 	}
695 	return valid_num_ports;
696 }
697 
698 static inline struct vhost_dev *__attribute__((always_inline))
699 find_vhost_dev(struct ether_addr *mac)
700 {
701 	struct vhost_dev *vdev;
702 
703 	TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
704 		if (vdev->ready == DEVICE_RX &&
705 		    is_same_ether_addr(mac, &vdev->mac_address))
706 			return vdev;
707 	}
708 
709 	return NULL;
710 }
711 
712 /*
713  * This function learns the MAC address of the device and registers it, along
714  * with a VLAN tag, to a VMDQ pool.
715  */
716 static int
717 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
718 {
719 	struct ether_hdr *pkt_hdr;
720 	struct virtio_net *dev = vdev->dev;
721 	int i, ret;
722 
723 	/* Learn MAC address of guest device from packet */
724 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
725 
726 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
727 		RTE_LOG(ERR, VHOST_DATA,
728 			"Device (%" PRIu64 ") is using a registered MAC!\n",
729 			dev->device_fh);
730 		return -1;
731 	}
732 
733 	for (i = 0; i < ETHER_ADDR_LEN; i++)
734 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
735 
736 	/* vlan_tag is currently derived from the device_fh. */
737 	vdev->vlan_tag = vlan_tags[dev->device_fh];
738 
739 	/* Print out VMDQ registration info. */
740 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
741 		dev->device_fh,
742 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
743 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
744 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
745 		vdev->vlan_tag);
746 
747 	/* Register the MAC address. */
748 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
749 				(uint32_t)dev->device_fh + vmdq_pool_base);
750 	if (ret)
751 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
752 					dev->device_fh);
753 
754 	/* Enable stripping of the vlan tag as we handle routing. */
755 	if (vlan_strip)
756 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
757 			(uint16_t)vdev->vmdq_rx_q, 1);
758 
759 	/* Set device as ready for RX. */
760 	vdev->ready = DEVICE_RX;
761 
762 	return 0;
763 }
764 
765 /*
766  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
767  * queue before disabling RX on the device.
768  */
769 static inline void
770 unlink_vmdq(struct vhost_dev *vdev)
771 {
772 	unsigned i = 0;
773 	unsigned rx_count;
774 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
775 
776 	if (vdev->ready == DEVICE_RX) {
777 		/*clear MAC and VLAN settings*/
778 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
779 		for (i = 0; i < 6; i++)
780 			vdev->mac_address.addr_bytes[i] = 0;
781 
782 		vdev->vlan_tag = 0;
783 
784 		/*Clear out the receive buffers*/
785 		rx_count = rte_eth_rx_burst(ports[0],
786 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
787 
788 		while (rx_count) {
789 			for (i = 0; i < rx_count; i++)
790 				rte_pktmbuf_free(pkts_burst[i]);
791 
792 			rx_count = rte_eth_rx_burst(ports[0],
793 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
794 		}
795 
796 		vdev->ready = DEVICE_MAC_LEARNING;
797 	}
798 }
799 
800 static inline void __attribute__((always_inline))
801 virtio_xmit(struct virtio_net *dst_dev, struct virtio_net *src_dev,
802 	    struct rte_mbuf *m)
803 {
804 	uint16_t ret;
805 
806 	ret = rte_vhost_enqueue_burst(dst_dev, VIRTIO_RXQ, &m, 1);
807 	if (enable_stats) {
808 		rte_atomic64_inc(&dev_statistics[dst_dev->device_fh].rx_total_atomic);
809 		rte_atomic64_add(&dev_statistics[dst_dev->device_fh].rx_atomic, ret);
810 		dev_statistics[src_dev->device_fh].tx_total++;
811 		dev_statistics[src_dev->device_fh].tx += ret;
812 	}
813 }
814 
815 /*
816  * Check if the packet destination MAC address is for a local device. If so, put
817  * the packet on that device's RX queue. If not, return.
818  */
819 static inline int __attribute__((always_inline))
820 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
821 {
822 	struct ether_hdr *pkt_hdr;
823 	struct vhost_dev *dst_vdev;
824 	uint64_t fh;
825 
826 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
827 
828 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
829 	if (!dst_vdev)
830 		return -1;
831 
832 	fh = dst_vdev->dev->device_fh;
833 	if (fh == vdev->dev->device_fh) {
834 		RTE_LOG(DEBUG, VHOST_DATA,
835 			"(%" PRIu64 ") TX: src and dst MAC is same. "
836 			"Dropping packet.\n", fh);
837 		return 0;
838 	}
839 
840 	RTE_LOG(DEBUG, VHOST_DATA,
841 		"(%" PRIu64 ") TX: MAC address is local\n", fh);
842 
843 	if (unlikely(dst_vdev->remove)) {
844 		RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
845 			"Device is marked for removal\n", fh);
846 		return 0;
847 	}
848 
849 	virtio_xmit(dst_vdev->dev, vdev->dev, m);
850 	return 0;
851 }
852 
853 /*
854  * Check if the destination MAC of a packet belongs to a local VM, and if so,
855  * get its VLAN tag and the length offset.
856  */
857 static inline int __attribute__((always_inline))
858 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
859 	uint32_t *offset, uint16_t *vlan_tag)
860 {
861 	struct vhost_dev *dst_vdev;
862 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
863 
864 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
865 	if (!dst_vdev)
866 		return 0;
867 
868 	if (dst_vdev->dev->device_fh == dev->device_fh) {
869 		RTE_LOG(DEBUG, VHOST_DATA,
870 			"(%" PRIu64 ") TX: src and dst MAC is same. "
871 			" Dropping packet.\n", dst_vdev->dev->device_fh);
872 		return -1;
873 	}
874 
875 	/*
876 	 * HW VLAN strip will reduce the packet length by the length of the
877 	 * VLAN tag, so the packet length needs to be restored by adding it
878 	 * back.
879 	 */
880 	*offset  = VLAN_HLEN;
881 	*vlan_tag = vlan_tags[(uint16_t)dst_vdev->dev->device_fh];
882 
883 	RTE_LOG(DEBUG, VHOST_DATA,
884 		"(%" PRIu64 ") TX: pkt to local VM device id: (%" PRIu64 ") "
885 		"vlan tag: %u.\n",
886 		dev->device_fh, dst_vdev->dev->device_fh, *vlan_tag);
887 
888 	return 0;
889 }
890 
891 static uint16_t
892 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
893 {
894 	if (ol_flags & PKT_TX_IPV4)
895 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
896 	else /* assume ethertype == ETHER_TYPE_IPv6 */
897 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
898 }
899 
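/*
 * Translate the virtio-net TSO request into the TX offload form the PMD
 * expects: zero the IPv4 header checksum (the NIC recomputes it) and seed the
 * TCP checksum field with the pseudo-header checksum before handing the
 * packet to the hardware for segmentation.
 */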
900 static void virtio_tx_offload(struct rte_mbuf *m)
901 {
902 	void *l3_hdr;
903 	struct ipv4_hdr *ipv4_hdr = NULL;
904 	struct tcp_hdr *tcp_hdr = NULL;
905 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
906 
907 	l3_hdr = (char *)eth_hdr + m->l2_len;
908 
909 	if (m->ol_flags & PKT_TX_IPV4) {
910 		ipv4_hdr = l3_hdr;
911 		ipv4_hdr->hdr_checksum = 0;
912 		m->ol_flags |= PKT_TX_IP_CKSUM;
913 	}
914 
915 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
916 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
917 }
918 
919 static inline void
920 free_pkts(struct rte_mbuf **pkts, uint16_t n)
921 {
922 	while (n--)
923 		rte_pktmbuf_free(pkts[n]);
924 }
925 
926 static inline void __attribute__((always_inline))
927 do_drain_mbuf_table(struct mbuf_table *tx_q)
928 {
929 	uint16_t count;
930 
931 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
932 				 tx_q->m_table, tx_q->len);
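	/* Any packets the NIC did not accept in this burst are dropped (freed), not retried. */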
933 	if (unlikely(count < tx_q->len))
934 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
935 
936 	tx_q->len = 0;
937 }
938 
939 /*
940  * This function routes the TX packet to the correct interface. This
941  * may be a local device or the physical port.
942  */
943 static inline void __attribute__((always_inline))
944 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
945 {
946 	struct mbuf_table *tx_q;
947 	unsigned offset = 0;
948 	const uint16_t lcore_id = rte_lcore_id();
949 	struct virtio_net *dev = vdev->dev;
950 	struct ether_hdr *nh;
951 
952 
953 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
954 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
955 		struct vhost_dev *vdev2;
956 
957 		TAILQ_FOREACH(vdev2, &vhost_dev_list, next) {
958 			virtio_xmit(vdev2->dev, vdev->dev, m);
959 		}
960 		goto queue2nic;
961 	}
962 
963 	/*check if destination is local VM*/
964 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
965 		rte_pktmbuf_free(m);
966 		return;
967 	}
968 
969 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
970 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
971 			rte_pktmbuf_free(m);
972 			return;
973 		}
974 	}
975 
976 	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
977 		"MAC address is external\n", dev->device_fh);
978 
979 queue2nic:
980 
981 	/*Add packet to the port tx queue*/
982 	tx_q = &lcore_tx_queue[lcore_id];
983 
984 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
985 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
986 		/* Guest has inserted the vlan tag. */
987 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
988 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
989 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
990 			(vh->vlan_tci != vlan_tag_be))
991 			vh->vlan_tci = vlan_tag_be;
992 	} else {
993 		m->ol_flags |= PKT_TX_VLAN_PKT;
994 
995 		/*
996 		 * Find the right seg to adjust the data len when offset is
997 		 * bigger than tail room size.
998 		 */
999 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1000 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1001 				m->data_len += offset;
1002 			else {
1003 				struct rte_mbuf *seg = m;
1004 
1005 				while ((seg->next != NULL) &&
1006 					(offset > rte_pktmbuf_tailroom(seg)))
1007 					seg = seg->next;
1008 
1009 				seg->data_len += offset;
1010 			}
1011 			m->pkt_len += offset;
1012 		}
1013 
1014 		m->vlan_tci = vlan_tag;
1015 	}
1016 
1017 	if (m->ol_flags & PKT_TX_TCP_SEG)
1018 		virtio_tx_offload(m);
1019 
1020 	tx_q->m_table[tx_q->len++] = m;
1021 	if (enable_stats) {
1022 		dev_statistics[dev->device_fh].tx_total++;
1023 		dev_statistics[dev->device_fh].tx++;
1024 	}
1025 
1026 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1027 		do_drain_mbuf_table(tx_q);
1028 }
1029 
1030 
1031 static inline void __attribute__((always_inline))
1032 drain_mbuf_table(struct mbuf_table *tx_q)
1033 {
1034 	static uint64_t prev_tsc;
1035 	uint64_t cur_tsc;
1036 
1037 	if (tx_q->len == 0)
1038 		return;
1039 
1040 	cur_tsc = rte_rdtsc();
1041 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1042 		prev_tsc = cur_tsc;
1043 
1044 		RTE_LOG(DEBUG, VHOST_DATA,
1045 			"TX queue drained after timeout with burst size %u\n",
1046 			tx_q->len);
1047 		do_drain_mbuf_table(tx_q);
1048 	}
1049 }
1050 
1051 static inline void __attribute__((always_inline))
1052 drain_eth_rx(struct vhost_dev *vdev)
1053 {
1054 	uint16_t rx_count, enqueue_count;
1055 	struct virtio_net *dev = vdev->dev;
1056 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1057 
1058 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1059 				    pkts, MAX_PKT_BURST);
1060 	if (!rx_count)
1061 		return;
1062 
1063 	/*
1064 	 * When "enable_retry" is set, wait and retry when there are not
1065 	 * enough free slots in the queue to hold @rx_count packets, to
1066 	 * reduce packet loss.
1067 	 */
1068 	if (enable_retry &&
1069 	    unlikely(rx_count > rte_vring_available_entries(dev,
1070 			VIRTIO_RXQ))) {
1071 		uint32_t retry;
1072 
1073 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1074 			rte_delay_us(burst_rx_delay_time);
1075 			if (rx_count <= rte_vring_available_entries(dev,
1076 					VIRTIO_RXQ))
1077 				break;
1078 		}
1079 	}
1080 
1081 	enqueue_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ,
1082 						pkts, rx_count);
1083 	if (enable_stats) {
1084 		uint64_t fh = dev->device_fh;
1085 
1086 		rte_atomic64_add(&dev_statistics[fh].rx_total_atomic, rx_count);
1087 		rte_atomic64_add(&dev_statistics[fh].rx_atomic, enqueue_count);
1088 	}
1089 
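	/*
	 * rte_vhost_enqueue_burst() copies the packet data into the guest's
	 * virtio ring, so every received mbuf can be freed here, whether or
	 * not it was actually enqueued.
	 */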
1090 	free_pkts(pkts, rx_count);
1091 }
1092 
1093 static inline void __attribute__((always_inline))
1094 drain_virtio_tx(struct vhost_dev *vdev)
1095 {
1096 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1097 	uint16_t count;
1098 	uint16_t i;
1099 
1100 	count = rte_vhost_dequeue_burst(vdev->dev, VIRTIO_TXQ, mbuf_pool,
1101 					pkts, MAX_PKT_BURST);
1102 
1103 	/* setup VMDq for the first packet */
1104 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1105 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1106 			free_pkts(pkts, count);
1107 	}
1108 
1109 	for (i = 0; i < count; ++i) {
1110 		virtio_tx_route(vdev, pkts[i],
1111 			vlan_tags[(uint16_t)vdev->dev->device_fh]);
1112 	}
1113 }
1114 
1115 /*
1116  * Main function of vhost-switch. It basically does:
1117  *
1118  * for each vhost device {
1119  *    - drain_eth_rx()
1120  *
1121  *      Which drains the host eth Rx queue linked to the vhost device,
1122  *      and delivers all of them to the guest virtio Rx ring associated
1123  *      with this vhost device.
1124  *
1125  *    - drain_virtio_tx()
1126  *
1127  *      Which drains the guest virtio Tx queue and delivers all of them
1128  *      to the target, which could be another vhost device, or the
1129  *      physical eth dev. The route is done in function "virtio_tx_route".
1130  * }
1131  */
1132 static int
1133 switch_worker(void *arg __rte_unused)
1134 {
1135 	unsigned i;
1136 	unsigned lcore_id = rte_lcore_id();
1137 	struct vhost_dev *vdev;
1138 	struct mbuf_table *tx_q;
1139 
1140 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1141 
1142 	tx_q = &lcore_tx_queue[lcore_id];
1143 	for (i = 0; i < rte_lcore_count(); i++) {
1144 		if (lcore_ids[i] == lcore_id) {
1145 			tx_q->txq_id = i;
1146 			break;
1147 		}
1148 	}
1149 
1150 	while(1) {
1151 		drain_mbuf_table(tx_q);
1152 
1153 		/*
1154 		 * Inform the configuration core that we have exited the
1155 		 * linked list and that no devices are in use if requested.
1156 		 */
1157 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1158 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1159 
1160 		/*
1161 		 * Process vhost devices
1162 		 */
1163 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, next) {
1164 			if (unlikely(vdev->remove)) {
1165 				unlink_vmdq(vdev);
1166 				vdev->ready = DEVICE_SAFE_REMOVE;
1167 				continue;
1168 			}
1169 
1170 			if (likely(vdev->ready == DEVICE_RX))
1171 				drain_eth_rx(vdev);
1172 
1173 			if (likely(!vdev->remove))
1174 				drain_virtio_tx(vdev);
1175 		}
1176 	}
1177 
1178 	return 0;
1179 }
1180 
1181 /*
1182  * Remove a device from the specific data core linked list and from the
1183  * main linked list. Synchronization occurs through the use of the
1184  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1185  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1186  */
1187 static void
1188 destroy_device (volatile struct virtio_net *dev)
1189 {
1190 	struct vhost_dev *vdev;
1191 	int lcore;
1192 
1193 	dev->flags &= ~VIRTIO_DEV_RUNNING;
1194 
1195 	vdev = (struct vhost_dev *)dev->priv;
1196 	/*set the remove flag. */
1197 	vdev->remove = 1;
1198 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1199 		rte_pause();
1200 	}
1201 
1202 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, next);
1203 	TAILQ_REMOVE(&vhost_dev_list, vdev, next);
1204 
1205 	/* Set the dev_removal_flag on each lcore. */
1206 	RTE_LCORE_FOREACH_SLAVE(lcore)
1207 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1208 
1209 	/*
1210 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1211 	 * we can be sure that they can no longer access the device removed
1212 	 * from the linked lists and that the devices are no longer in use.
1213 	 */
1214 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1215 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1216 			rte_pause();
1217 	}
1218 
1219 	lcore_info[vdev->coreid].device_num--;
1220 
1221 	RTE_LOG(INFO, VHOST_DATA,
1222 		"(%" PRIu64 ") Device has been removed from data core\n",
1223 		dev->device_fh);
1224 
1225 	rte_free(vdev);
1226 }
1227 
1228 /*
1229  * A new device is added to a data core. First the device is added to the main linked list
1230  * and then allocated to a specific data core.
1231  */
1232 static int
1233 new_device (struct virtio_net *dev)
1234 {
1235 	int lcore, core_add = 0;
1236 	uint32_t device_num_min = num_devices;
1237 	struct vhost_dev *vdev;
1238 
1239 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1240 	if (vdev == NULL) {
1241 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
1242 			dev->device_fh);
1243 		return -1;
1244 	}
1245 	vdev->dev = dev;
1246 	dev->priv = vdev;
1247 
1248 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, next);
1249 	vdev->vmdq_rx_q
1250 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
1251 
1252 	/*reset ready flag*/
1253 	vdev->ready = DEVICE_MAC_LEARNING;
1254 	vdev->remove = 0;
1255 
1256 	/* Find a suitable lcore to add the device. */
1257 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1258 		if (lcore_info[lcore].device_num < device_num_min) {
1259 			device_num_min = lcore_info[lcore].device_num;
1260 			core_add = lcore;
1261 		}
1262 	}
1263 	vdev->coreid = core_add;
1264 
1265 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, next);
1266 	lcore_info[vdev->coreid].device_num++;
1267 
1268 	/* Initialize device stats */
1269 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
1270 
1271 	/* Disable notifications. */
1272 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
1273 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
1274 	dev->flags |= VIRTIO_DEV_RUNNING;
1275 
1276 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
1277 
1278 	return 0;
1279 }
1280 
1281 /*
1282  * These callbacks allow devices to be added to the data core when
1283  * configuration has been fully completed.
1284  */
1285 static const struct virtio_net_device_ops virtio_net_device_ops =
1286 {
1287 	.new_device =  new_device,
1288 	.destroy_device = destroy_device,
1289 };
1290 
1291 /*
1292  * This thread wakes up periodically to print stats if the user has
1293  * enabled them.
1294  */
1295 static void
1296 print_stats(void)
1297 {
1298 	struct vhost_dev *vdev;
1299 	uint64_t tx_dropped, rx_dropped;
1300 	uint64_t tx, tx_total, rx, rx_total;
1301 	uint32_t device_fh;
1302 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1303 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1304 
1305 	while(1) {
1306 		sleep(enable_stats);
1307 
1308 		/* Clear screen and move to top left */
1309 		printf("%s%s", clr, top_left);
1310 
1311 		printf("\nDevice statistics ====================================");
1312 
1313 		TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
1314 			device_fh = vdev->dev->device_fh;
1315 			tx_total = dev_statistics[device_fh].tx_total;
1316 			tx = dev_statistics[device_fh].tx;
1317 			tx_dropped = tx_total - tx;
1318 			rx_total = rte_atomic64_read(
1319 				&dev_statistics[device_fh].rx_total_atomic);
1320 			rx = rte_atomic64_read(
1321 				&dev_statistics[device_fh].rx_atomic);
1322 			rx_dropped = rx_total - rx;
1323 
1324 			printf("\nStatistics for device %"PRIu32" ------------------------------"
1325 					"\nTX total: 		%"PRIu64""
1326 					"\nTX dropped: 		%"PRIu64""
1327 					"\nTX successful: 		%"PRIu64""
1328 					"\nRX total: 		%"PRIu64""
1329 					"\nRX dropped: 		%"PRIu64""
1330 					"\nRX successful: 		%"PRIu64"",
1331 					device_fh,
1332 					tx_total,
1333 					tx_dropped,
1334 					tx,
1335 					rx_total,
1336 					rx_dropped,
1337 					rx);
1338 		}
1339 		printf("\n======================================================\n");
1340 	}
1341 }
1342 
1343 /* When we receive an INT signal, unregister the vhost driver. */
1344 static void
1345 sigint_handler(__rte_unused int signum)
1346 {
1347 	/* Unregister vhost driver. */
1348 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
1349 	if (ret != 0)
1350 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
1351 	exit(0);
1352 }
1353 
1354 /*
1355  * While creating an mbuf pool, one key thing is to figure out how
1356  * many mbuf entries are enough for our use. FYI, here are some
1357  * guidelines:
1358  *
1359  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1360  *
1361  * - For each switch core (a CPU core that does the packet switching), we
1362  *   also need to make some reservation for receiving the packets from the
1363  *   virtio Tx queue. How many are enough depends on the usage. It's
1364  *   normally a simple calculation like the following:
1365  *
1366  *       MAX_PKT_BURST * max packet size / mbuf size
1367  *
1368  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1369  *
1370  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1371  *   mbufs for receiving the packets from the physical NIC device.
1372  *
1373  * - We also need to make sure, for each switch core, we have allocated
1374  *   enough mbufs to fill up the mbuf cache.
1375  */
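/*
 * Rough illustration of the guideline above (not exact): with TSO enabled the
 * max packet size is 64KB, so a switch core needs on the order of
 * MAX_PKT_BURST * 64KB / 2KB = 32 * 32 = 1024 extra mbufs just to receive one
 * full burst from a virtio Tx queue.
 */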
1376 static void
1377 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1378 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1379 {
1380 	uint32_t nr_mbufs;
1381 	uint32_t nr_mbufs_per_core;
1382 	uint32_t mtu = 1500;
1383 
1384 	if (mergeable)
1385 		mtu = 9000;
1386 	if (enable_tso)
1387 		mtu = 64 * 1024;
1388 
1389 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1390 			(mbuf_size - RTE_PKTMBUF_HEADROOM) * MAX_PKT_BURST;
1391 	nr_mbufs_per_core += nr_rx_desc;
1392 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1393 
1394 	nr_mbufs  = nr_queues * nr_rx_desc;
1395 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1396 	nr_mbufs *= nr_port;
1397 
1398 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1399 					    nr_mbuf_cache, 0, mbuf_size,
1400 					    rte_socket_id());
1401 	if (mbuf_pool == NULL)
1402 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1403 }
1404 
1405 /*
1406  * Main function, does initialisation and calls the per-lcore functions. The CUSE
1407  * device is also registered here to handle the IOCTLs.
1408  */
1409 int
1410 main(int argc, char *argv[])
1411 {
1412 	unsigned lcore_id, core_id = 0;
1413 	unsigned nb_ports, valid_num_ports;
1414 	int ret;
1415 	uint8_t portid;
1416 	static pthread_t tid;
1417 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1418 
1419 	signal(SIGINT, sigint_handler);
1420 
1421 	/* init EAL */
1422 	ret = rte_eal_init(argc, argv);
1423 	if (ret < 0)
1424 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1425 	argc -= ret;
1426 	argv += ret;
1427 
1428 	/* parse app arguments */
1429 	ret = us_vhost_parse_args(argc, argv);
1430 	if (ret < 0)
1431 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1432 
1433 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1434 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1435 
1436 		if (rte_lcore_is_enabled(lcore_id))
1437 			lcore_ids[core_id++] = lcore_id;
1438 	}
1439 	if (rte_lcore_count() > RTE_MAX_LCORE)
1440 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1441 
1442 	/* Get the number of physical ports. */
1443 	nb_ports = rte_eth_dev_count();
1444 	if (nb_ports > RTE_MAX_ETHPORTS)
1445 		nb_ports = RTE_MAX_ETHPORTS;
1446 
1447 	/*
1448 	 * Update the global var NUM_PORTS and global array PORTS
1449 	 * and get the value of VALID_NUM_PORTS according to the number of system ports.
1450 	 */
1451 	valid_num_ports = check_ports_num(nb_ports);
1452 
1453 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1454 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1455 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
1456 		return -1;
1457 	}
1458 
1459 	/*
1460 	 * FIXME: here we are trying to allocate mbufs big enough for
1461 	 * @MAX_QUEUES, but the truth is we're never going to use that
1462 	 * many queues here. We probably should only do allocation for
1463 	 * those queues we are going to use.
1464 	 */
1465 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1466 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1467 
1468 	if (vm2vm_mode == VM2VM_HARDWARE) {
1469 		/* Enable VT loop back to let L2 switch to do it. */
1470 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1471 		RTE_LOG(DEBUG, VHOST_CONFIG,
1472 			"Enable loop back for L2 switch in vmdq.\n");
1473 	}
1474 
1475 	/* initialize all ports */
1476 	for (portid = 0; portid < nb_ports; portid++) {
1477 		/* skip ports that are not enabled */
1478 		if ((enabled_port_mask & (1 << portid)) == 0) {
1479 			RTE_LOG(INFO, VHOST_PORT,
1480 				"Skipping disabled port %d\n", portid);
1481 			continue;
1482 		}
1483 		if (port_init(portid) != 0)
1484 			rte_exit(EXIT_FAILURE,
1485 				"Cannot initialize network ports\n");
1486 	}
1487 
1488 	/* Initialize device stats */
1489 	memset(&dev_statistics, 0, sizeof(dev_statistics));
1490 
1491 	/* Enable stats if the user option is set. */
1492 	if (enable_stats) {
1493 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1494 		if (ret != 0)
1495 			rte_exit(EXIT_FAILURE,
1496 				"Cannot create print-stats thread\n");
1497 
1498 		/* Set thread_name for aid in debugging.  */
1499 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1500 		ret = rte_thread_setname(tid, thread_name);
1501 		if (ret != 0)
1502 			RTE_LOG(ERR, VHOST_CONFIG,
1503 				"Cannot set print-stats name\n");
1504 	}
1505 
1506 	/* Launch all data cores. */
1507 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1508 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1509 
1510 	if (mergeable == 0)
1511 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
1512 
1513 	/* Register vhost(cuse or user) driver to handle vhost messages. */
1514 	ret = rte_vhost_driver_register((char *)&dev_basename);
1515 	if (ret != 0)
1516 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
1517 
1518 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
1519 
1520 	/* Start CUSE session. */
1521 	rte_vhost_driver_session_start();
1522 	return 0;
1523 
1524 }
1525