/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>
#include <rte_ip.h>
#include <rte_tcp.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
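/*
 * Note: RTE_MBUF_DEFAULT_BUF_SIZE is RTE_MBUF_DEFAULT_DATAROOM (2048 bytes)
 * plus RTE_PKTMBUF_HEADROOM, so one mbuf holds a full standard-MTU frame.
 */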

#define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4		/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX			1
#define DEVICE_SAFE_REMOVE	2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;
static int dequeue_zero_copy;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty VMDq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1G NICs such as I350,
		 * and fixes the bug where IPv4 forwarding in the guest
		 * cannot forward packets from one virtio dev to another
		 * virtio dev.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

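/*
 * Drain period in TSC cycles: (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
 * is the number of TSC cycles per microsecond (rounded up), multiplied by
 * the drain interval BURST_TX_DRAIN_US.
 */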
#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
#define VLAN_HLEN       4

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

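	/*
	 * Map VLAN tag vlan_tags[i] to pool i: bit i of the pools mask
	 * selects the VMDQ pool, so each virtio device receives exactly
	 * the traffic tagged with its own VLAN id.
	 */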
	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, log an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/* configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	/*
	 * When dequeue zero copy is enabled, guest Tx used vring will be
	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
	 * (tx_ring_size here) must be small enough so that the driver will
	 * hit the free threshold easily and free mbufs timely. Otherwise,
	 * guest Tx vring would be starved.
	 */
	if (dequeue_zero_copy)
		tx_ring_size = 64;

	tx_rings = (uint16_t)rte_lcore_count();

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base  = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	if (enable_tx_csum == 0)
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

	if (enable_tso == 0) {
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_GUEST_TSO4);
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_GUEST_TSO6);
	}

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	/* Reject paths that do not fit (including the NUL) in PATH_MAX. */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL)
		return -1;

	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 * Returns the mask, or 0 on error (an empty portmask is invalid anyway).
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only when RX retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. Takes effect only when RX retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
	"		--client register a vhost-user socket as client mode.\n"
	"		--dequeue-zero-copy enables dequeue zero copy\n",
	       prgname);
}
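
/*
 * An illustrative invocation (core mask, port mask and socket path are
 * example values only):
 *
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --socket-file /tmp/sock0 --stats 1
 */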

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"socket-file", required_argument, NULL, 0},
		{"tx-csum", required_argument, NULL, 0},
		{"tso", required_argument, NULL, 0},
		{"client", no_argument, &client_mode, 1},
		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Enable/disable TX checksum offload. */
			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tx_csum = ret;
			}

			/* Enable/disable TSO offload. */
			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tso = ret;
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set socket file path. */
			if (!strncmp(long_option[option_index].name,
						"socket-file", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_socket_path(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global vars num_ports and ports according to the number of
 * system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static inline struct vhost_dev *__attribute__((always_inline))
find_vhost_dev(struct ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers it along
 * with a VLAN tag in the VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
		vdev->vid,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and VLAN tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void __attribute__((always_inline))
virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	if (enable_stats) {
		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	virtio_xmit(dst_vdev, vdev, m);
	return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM; if so,
 * return its VLAN tag and the length offset needed to undo VLAN stripping.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip reduces the packet length by the length of the
	 * vlan tag, so the packet length needs to be restored by adding
	 * the tag length back.
	 */
	*offset  = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == ETHER_TYPE_IPv6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

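/*
 * Prepare a packet flagged with PKT_TX_TCP_SEG for hardware TSO: clear the
 * IPv4 header checksum so the NIC can recompute it, and seed the TCP
 * checksum with the pseudo-header checksum, as the DPDK TX offload
 * convention requires; the NIC fills in the final checksum per segment.
 */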
static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct ipv4_hdr *ipv4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;
	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static inline void __attribute__((always_inline))
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct ether_hdr *nh;

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}

static inline void __attribute__((always_inline))
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

static inline void __attribute__((always_inline))
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);
	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, we wait and retry when there are
	 * not enough free slots in the queue to hold @rx_count packets,
	 * to reduce packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	if (enable_stats) {
		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
	}

	free_pkts(pkts, rx_count);
}

static inline void __attribute__((always_inline))
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) {
			/* Must not route the packets after freeing them. */
			free_pkts(pkts, count);
			return;
		}
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of those packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of those
 *      packets to the target, which could be another vhost device, or
 *      the physical eth dev. The routing is done in the function
 *      "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
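	/*
	 * Each lcore owns one NIC TX queue (port_init() set tx_rings to
	 * rte_lcore_count()); use this lcore's position in lcore_ids as
	 * its queue index.
	 */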
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);

		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. The device is made volatile here to avoid
 * re-ordering of dev->remove=1, which can cause an infinite loop in the
 * rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
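	/*
	 * Assign the device the first RX queue of its own VMDQ pool
	 * (pool number vid), offset by vmdq_queue_base in the queue space.
	 */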
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops =
{
	.new_device = new_device,
	.destroy_device = destroy_device,
};

/*
 * This thread wakes up periodically to print statistics if the user has
 * enabled them.
 */
static void
print_stats(void)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total   = vdev->stats.tx_total;
			tx         = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total:              %" PRIu64 "\n"
				"TX dropped:            %" PRIu64 "\n"
				"TX successful:         %" PRIu64 "\n"
				"RX total:              %" PRIu64 "\n"
				"RX dropped:            %" PRIu64 "\n"
				"RX successful:         %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");
	}
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Failed to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. Here are some guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching), we
 *   also need to reserve some mbufs for receiving the packets from the
 *   virtio Tx queue. How many is enough depends on the usage. It's
 *   normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, that we have
 *   allocated enough mbufs to fill up the mbuf cache.
 */
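/*
 * A rough worked example (assuming the default MBUF_DATA_SIZE of 2176
 * bytes and nr_rx_desc = 1024): with TSO enabled, mtu = 64K, so
 * nr_mbufs_per_core = (65536 + 2176) * 32 / (2176 - 128) + 1024 = 2082.
 * With one port, one switch core and MAX_QUEUES = 128, the pool then
 * holds 128 * 1024 + 2082 mbufs.
 */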
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs  = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint8_t portid;
	static pthread_t tid;
	char thread_name[RTE_MAX_THREAD_NAME_LEN];
	uint64_t flags = 0;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count();

	/*
	 * Update the global vars num_ports and ports according to the
	 * number of system ports, and get the number of valid ports.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let L2 switch to do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	for (portid = 0; portid < nb_ports; portid++) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
		if (ret != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");

		/* Set thread_name for aid in debugging. */
		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
		ret = rte_thread_setname(tid, thread_name);
		if (ret != 0)
			RTE_LOG(DEBUG, VHOST_CONFIG,
				"Cannot set print-stats name\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (mergeable == 0)
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	if (dequeue_zero_copy)
		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		ret = rte_vhost_driver_register(socket_files + i * PATH_MAX,
						flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}
	}

	rte_vhost_driver_callback_register(&virtio_net_device_ops);

	rte_vhost_driver_session_start();
	return 0;
}