xref: /dpdk/examples/vhost/main.c (revision c39d1e082a4b426e915074ce30eb6f410ee2654a)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60 
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63 
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66 
67 /* Number of devices/queues to support */
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70 
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73 
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76 	VM2VM_DISABLED = 0,
77 	VM2VM_SOFTWARE = 1,
78 	VM2VM_HARDWARE = 2,
79 	VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82 
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87 
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90 
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93 
94 static int client_mode;
95 static int dequeue_zero_copy;
96 
97 static int builtin_net_driver;
98 
99 /* Specify timeout (in microseconds) between retries on RX. */
100 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
101 /* Specify the number of retries on RX. */
102 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
103 
104 /* Socket file paths. Can be set by user */
105 static char *socket_files;
106 static int nb_sockets;
107 
108 /* Empty VMDQ configuration structure. Filled in programmatically. */
109 static struct rte_eth_conf vmdq_conf_default = {
110 	.rxmode = {
111 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
112 		.split_hdr_size = 0,
113 		/*
114 		 * VLAN strip is necessary for 1G NICs such as the I350;
115 		 * it fixes a bug where IPv4 forwarding in the guest cannot
116 		 * forward packets from one virtio device to another.
117 		 */
118 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
119 	},
120 
121 	.txmode = {
122 		.mq_mode = ETH_MQ_TX_NONE,
123 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
124 			     DEV_TX_OFFLOAD_TCP_CKSUM |
125 			     DEV_TX_OFFLOAD_VLAN_INSERT |
126 			     DEV_TX_OFFLOAD_MULTI_SEGS |
127 			     DEV_TX_OFFLOAD_TCP_TSO),
128 	},
129 	.rx_adv_conf = {
130 		/*
131 		 * should be overridden separately in code with
132 		 * appropriate values
133 		 */
134 		.vmdq_rx_conf = {
135 			.nb_queue_pools = ETH_8_POOLS,
136 			.enable_default_pool = 0,
137 			.default_pool = 0,
138 			.nb_pool_maps = 0,
139 			.pool_map = {{0, 0},},
140 		},
141 	},
142 };
143 
144 
145 static unsigned lcore_ids[RTE_MAX_LCORE];
146 static uint16_t ports[RTE_MAX_ETHPORTS];
147 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
148 static uint16_t num_pf_queues, num_vmdq_queues;
149 static uint16_t vmdq_pool_base, vmdq_queue_base;
150 static uint16_t queues_per_pool;
151 
152 const uint16_t vlan_tags[] = {
153 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
154 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
155 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
156 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
157 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
158 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
159 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
160 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
161 };
162 
163 /* ethernet addresses of ports */
164 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
165 
166 static struct vhost_dev_tailq_list vhost_dev_list =
167 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
168 
169 static struct lcore_info lcore_info[RTE_MAX_LCORE];
170 
171 /* Used for queueing bursts of TX packets. */
172 struct mbuf_table {
173 	unsigned len;
174 	unsigned txq_id;
175 	struct rte_mbuf *m_table[MAX_PKT_BURST];
176 };
177 
178 /* TX queue for each data core. */
179 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
180 
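/* Drain interval for partially filled TX bursts, expressed in TSC cycles:
 * BURST_TX_DRAIN_US microseconds converted using the measured TSC frequency. */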
181 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
182 				 / US_PER_S * BURST_TX_DRAIN_US)
183 #define VLAN_HLEN       4
184 
185 /*
186  * Builds up the correct configuration for VMDQ VLAN pool map
187  * according to the pool & queue limits.
188  */
189 static inline int
190 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
191 {
192 	struct rte_eth_vmdq_rx_conf conf;
193 	struct rte_eth_vmdq_rx_conf *def_conf =
194 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
195 	unsigned i;
196 
197 	memset(&conf, 0, sizeof(conf));
198 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
199 	conf.nb_pool_maps = num_devices;
200 	conf.enable_loop_back = def_conf->enable_loop_back;
201 	conf.rx_mode = def_conf->rx_mode;
202 
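	/* Map one VLAN tag to each VMDQ pool: traffic tagged with vlan_tags[i]
	 * is steered to pool i, which this example dedicates to virtio device i. */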
203 	for (i = 0; i < conf.nb_pool_maps; i++) {
204 		conf.pool_map[i].vlan_id = vlan_tags[i];
205 		conf.pool_map[i].pools = (1UL << i);
206 	}
207 
208 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
209 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
210 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
211 	return 0;
212 }
213 
214 /*
215  * Initialises a given port using global settings, with the RX buffers
216  * coming from the mbuf_pool passed as a parameter.
217  */
218 static inline int
219 port_init(uint16_t port)
220 {
221 	struct rte_eth_dev_info dev_info;
222 	struct rte_eth_conf port_conf;
223 	struct rte_eth_rxconf *rxconf;
224 	struct rte_eth_txconf *txconf;
225 	int16_t rx_rings, tx_rings;
226 	uint16_t rx_ring_size, tx_ring_size;
227 	int retval;
228 	uint16_t q;
229 
230 	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
231 	retval = rte_eth_dev_info_get(port, &dev_info);
232 	if (retval != 0) {
233 		RTE_LOG(ERR, VHOST_PORT,
234 			"Error during getting device (port %u) info: %s\n",
235 			port, strerror(-retval));
236 
237 		return retval;
238 	}
239 
240 	rxconf = &dev_info.default_rxconf;
241 	txconf = &dev_info.default_txconf;
242 	rxconf->rx_drop_en = 1;
243 
244 	/* Configure the number of supported virtio devices based on VMDQ limits. */
245 	num_devices = dev_info.max_vmdq_pools;
246 
247 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
248 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
249 
250 	/*
251 	 * When dequeue zero copy is enabled, guest Tx used vring will be
252 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
253 	 * (tx_ring_size here) must be small enough so that the driver will
254 	 * hit the free threshold easily and free mbufs timely. Otherwise,
255 	 * guest Tx vring would be starved.
256 	 */
257 	if (dequeue_zero_copy)
258 		tx_ring_size = 64;
259 
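	/* One physical TX queue per lcore, so each worker core can transmit
	 * without locking; see the per-lcore mbuf_table above. */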
260 	tx_rings = (uint16_t)rte_lcore_count();
261 
262 	/* Get port configuration. */
263 	retval = get_eth_conf(&port_conf, num_devices);
264 	if (retval < 0)
265 		return retval;
266 	/* NIC queues are divided into pf queues and vmdq queues.  */
267 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
268 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
269 	num_vmdq_queues = num_devices * queues_per_pool;
270 	num_queues = num_pf_queues + num_vmdq_queues;
271 	vmdq_queue_base = dev_info.vmdq_queue_base;
272 	vmdq_pool_base  = dev_info.vmdq_pool_base;
273 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
274 		num_pf_queues, num_devices, queues_per_pool);
275 
276 	if (!rte_eth_dev_is_valid_port(port))
277 		return -1;
278 
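	/* Set up every RX queue the device reports (the PF queues plus all
	 * VMDQ pool queues computed above). */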
279 	rx_rings = (uint16_t)dev_info.max_rx_queues;
280 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
281 		port_conf.txmode.offloads |=
282 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
283 	/* Configure ethernet device. */
284 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
285 	if (retval != 0) {
286 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
287 			port, strerror(-retval));
288 		return retval;
289 	}
290 
291 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
292 		&tx_ring_size);
293 	if (retval != 0) {
294 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
295 			"for port %u: %s.\n", port, strerror(-retval));
296 		return retval;
297 	}
298 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
299 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
300 			"for Rx queues on port %u.\n", port);
301 		return -1;
302 	}
303 
304 	/* Setup the queues. */
305 	rxconf->offloads = port_conf.rxmode.offloads;
306 	for (q = 0; q < rx_rings; q++) {
307 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
308 						rte_eth_dev_socket_id(port),
309 						rxconf,
310 						mbuf_pool);
311 		if (retval < 0) {
312 			RTE_LOG(ERR, VHOST_PORT,
313 				"Failed to setup rx queue %u of port %u: %s.\n",
314 				q, port, strerror(-retval));
315 			return retval;
316 		}
317 	}
318 	txconf->offloads = port_conf.txmode.offloads;
319 	for (q = 0; q < tx_rings; q++) {
320 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
321 						rte_eth_dev_socket_id(port),
322 						txconf);
323 		if (retval < 0) {
324 			RTE_LOG(ERR, VHOST_PORT,
325 				"Failed to setup tx queue %u of port %u: %s.\n",
326 				q, port, strerror(-retval));
327 			return retval;
328 		}
329 	}
330 
331 	/* Start the device. */
332 	retval  = rte_eth_dev_start(port);
333 	if (retval < 0) {
334 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
335 			port, strerror(-retval));
336 		return retval;
337 	}
338 
339 	if (promiscuous) {
340 		retval = rte_eth_promiscuous_enable(port);
341 		if (retval != 0) {
342 			RTE_LOG(ERR, VHOST_PORT,
343 				"Failed to enable promiscuous mode on port %u: %s\n",
344 				port, rte_strerror(-retval));
345 			return retval;
346 		}
347 	}
348 
349 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
350 	if (retval < 0) {
351 		RTE_LOG(ERR, VHOST_PORT,
352 			"Failed to get MAC address on port %u: %s\n",
353 			port, rte_strerror(-retval));
354 		return retval;
355 	}
356 
357 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
358 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
359 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
360 			port,
361 			vmdq_ports_eth_addr[port].addr_bytes[0],
362 			vmdq_ports_eth_addr[port].addr_bytes[1],
363 			vmdq_ports_eth_addr[port].addr_bytes[2],
364 			vmdq_ports_eth_addr[port].addr_bytes[3],
365 			vmdq_ports_eth_addr[port].addr_bytes[4],
366 			vmdq_ports_eth_addr[port].addr_bytes[5]);
367 
368 	return 0;
369 }
370 
371 /*
372  * Set socket file path.
373  */
374 static int
375 us_vhost_parse_socket_path(const char *q_arg)
376 {
377 	char *old;
378 
379 	/* Reject paths that are too long to fit in a PATH_MAX buffer. */
380 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
381 		return -1;
382 
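	/* socket_files is a flat array of fixed PATH_MAX-sized slots; grow it by
	 * one slot and keep the old pointer so it can be freed if realloc fails. */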
383 	old = socket_files;
384 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
385 	if (socket_files == NULL) {
386 		free(old);
387 		return -1;
388 	}
389 
390 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
391 	nb_sockets++;
392 
393 	return 0;
394 }
395 
396 /*
397  * Parse the portmask provided at run time.
398  */
399 static int
400 parse_portmask(const char *portmask)
401 {
402 	char *end = NULL;
403 	unsigned long pm;
404 
405 	errno = 0;
406 
407 	/* parse hexadecimal string */
408 	pm = strtoul(portmask, &end, 16);
409 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
410 		return -1;
411 
412 	if (pm == 0)
413 		return -1;
414 
415 	return pm;
416 
417 }
418 
419 /*
420  * Parse num options at run time.
421  */
422 static int
423 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
424 {
425 	char *end = NULL;
426 	unsigned long num;
427 
428 	errno = 0;
429 
430 	/* parse unsigned int string */
431 	num = strtoul(q_arg, &end, 10);
432 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
433 		return -1;
434 
435 	if (num > max_valid_value)
436 		return -1;
437 
438 	return num;
439 
440 }
441 
442 /*
443  * Display usage
444  */
445 static void
446 us_vhost_usage(const char *prgname)
447 {
448 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
449 	"		--vm2vm [0|1|2]\n"
450 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
451 	"		--socket-file <path>\n"
452 	"		--nb-devices ND\n"
453 	"		-p PORTMASK: Set mask for ports to be used by application\n"
454 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
455 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
456 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if RX retries are enabled\n"
457 	"		--rx-retry-num [0-N]: the number of retries on RX. Takes effect only if RX retries are enabled\n"
458 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
459 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
460 	"		--socket-file: The path of the socket file.\n"
461 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
462 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
463 	"		--client register a vhost-user socket as client mode.\n"
464 	"		--dequeue-zero-copy enables dequeue zero copy\n",
465 	       prgname);
466 }
467 
468 /*
469  * Parse the arguments given in the command line of the application.
470  */
471 static int
472 us_vhost_parse_args(int argc, char **argv)
473 {
474 	int opt, ret;
475 	int option_index;
476 	unsigned i;
477 	const char *prgname = argv[0];
478 	static struct option long_option[] = {
479 		{"vm2vm", required_argument, NULL, 0},
480 		{"rx-retry", required_argument, NULL, 0},
481 		{"rx-retry-delay", required_argument, NULL, 0},
482 		{"rx-retry-num", required_argument, NULL, 0},
483 		{"mergeable", required_argument, NULL, 0},
484 		{"stats", required_argument, NULL, 0},
485 		{"socket-file", required_argument, NULL, 0},
486 		{"tx-csum", required_argument, NULL, 0},
487 		{"tso", required_argument, NULL, 0},
488 		{"client", no_argument, &client_mode, 1},
489 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
490 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
491 		{NULL, 0, 0, 0},
492 	};
493 
494 	/* Parse command line */
495 	while ((opt = getopt_long(argc, argv, "p:P",
496 			long_option, &option_index)) != EOF) {
497 		switch (opt) {
498 		/* Portmask */
499 		case 'p':
500 			enabled_port_mask = parse_portmask(optarg);
501 			if (enabled_port_mask == 0) {
502 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
503 				us_vhost_usage(prgname);
504 				return -1;
505 			}
506 			break;
507 
508 		case 'P':
509 			promiscuous = 1;
510 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
511 				ETH_VMDQ_ACCEPT_BROADCAST |
512 				ETH_VMDQ_ACCEPT_MULTICAST;
513 
514 			break;
515 
516 		case 0:
517 			/* Enable/disable vm2vm comms. */
518 			if (!strncmp(long_option[option_index].name, "vm2vm",
519 				MAX_LONG_OPT_SZ)) {
520 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
521 				if (ret == -1) {
522 					RTE_LOG(INFO, VHOST_CONFIG,
523 						"Invalid argument for "
524 						"vm2vm [0|1|2]\n");
525 					us_vhost_usage(prgname);
526 					return -1;
527 				} else {
528 					vm2vm_mode = (vm2vm_type)ret;
529 				}
530 			}
531 
532 			/* Enable/disable retries on RX. */
533 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
534 				ret = parse_num_opt(optarg, 1);
535 				if (ret == -1) {
536 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
537 					us_vhost_usage(prgname);
538 					return -1;
539 				} else {
540 					enable_retry = ret;
541 				}
542 			}
543 
544 			/* Enable/disable TX checksum offload. */
545 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
546 				ret = parse_num_opt(optarg, 1);
547 				if (ret == -1) {
548 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
549 					us_vhost_usage(prgname);
550 					return -1;
551 				} else
552 					enable_tx_csum = ret;
553 			}
554 
555 			/* Enable/disable TSO offload. */
556 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
557 				ret = parse_num_opt(optarg, 1);
558 				if (ret == -1) {
559 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
560 					us_vhost_usage(prgname);
561 					return -1;
562 				} else
563 					enable_tso = ret;
564 			}
565 
566 			/* Specify the retry delay time (in microseconds) on RX. */
567 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
568 				ret = parse_num_opt(optarg, INT32_MAX);
569 				if (ret == -1) {
570 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
571 					us_vhost_usage(prgname);
572 					return -1;
573 				} else {
574 					burst_rx_delay_time = ret;
575 				}
576 			}
577 
578 			/* Specify the retries number on RX. */
579 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
580 				ret = parse_num_opt(optarg, INT32_MAX);
581 				if (ret == -1) {
582 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
583 					us_vhost_usage(prgname);
584 					return -1;
585 				} else {
586 					burst_rx_retry_num = ret;
587 				}
588 			}
589 
590 			/* Enable/disable RX mergeable buffers. */
591 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
592 				ret = parse_num_opt(optarg, 1);
593 				if (ret == -1) {
594 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
595 					us_vhost_usage(prgname);
596 					return -1;
597 				} else {
598 					mergeable = !!ret;
599 					if (ret) {
600 						vmdq_conf_default.rxmode.offloads |=
601 							DEV_RX_OFFLOAD_JUMBO_FRAME;
602 						vmdq_conf_default.rxmode.max_rx_pkt_len
603 							= JUMBO_FRAME_MAX_SIZE;
604 					}
605 				}
606 			}
607 
608 			/* Enable/disable stats. */
609 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
610 				ret = parse_num_opt(optarg, INT32_MAX);
611 				if (ret == -1) {
612 					RTE_LOG(INFO, VHOST_CONFIG,
613 						"Invalid argument for stats [0..N]\n");
614 					us_vhost_usage(prgname);
615 					return -1;
616 				} else {
617 					enable_stats = ret;
618 				}
619 			}
620 
621 			/* Set socket file path. */
622 			if (!strncmp(long_option[option_index].name,
623 						"socket-file", MAX_LONG_OPT_SZ)) {
624 				if (us_vhost_parse_socket_path(optarg) == -1) {
625 					RTE_LOG(INFO, VHOST_CONFIG,
626 					"Invalid argument for socket name (Max %d characters)\n",
627 					PATH_MAX);
628 					us_vhost_usage(prgname);
629 					return -1;
630 				}
631 			}
632 
633 			break;
634 
635 			/* Invalid option - print options. */
636 		default:
637 			us_vhost_usage(prgname);
638 			return -1;
639 		}
640 	}
641 
642 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
643 		if (enabled_port_mask & (1 << i))
644 			ports[num_ports++] = i;
645 	}
646 
647 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
648 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
649 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
650 		return -1;
651 	}
652 
653 	return 0;
654 }
655 
656 /*
657  * Update the global variable num_ports and the ports[] array according to the
658  * number of ports in the system, and return the number of valid ports.
659  */
660 static unsigned check_ports_num(unsigned nb_ports)
661 {
662 	unsigned valid_num_ports = num_ports;
663 	unsigned portid;
664 
665 	if (num_ports > nb_ports) {
666 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
667 			num_ports, nb_ports);
668 		num_ports = nb_ports;
669 	}
670 
671 	for (portid = 0; portid < num_ports; portid++) {
672 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
673 			RTE_LOG(INFO, VHOST_PORT,
674 				"\nSpecified port ID(%u) is not valid\n",
675 				ports[portid]);
676 			ports[portid] = INVALID_PORT_ID;
677 			valid_num_ports--;
678 		}
679 	}
680 	return valid_num_ports;
681 }
682 
683 static __rte_always_inline struct vhost_dev *
684 find_vhost_dev(struct rte_ether_addr *mac)
685 {
686 	struct vhost_dev *vdev;
687 
688 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
689 		if (vdev->ready == DEVICE_RX &&
690 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
691 			return vdev;
692 	}
693 
694 	return NULL;
695 }
696 
697 /*
698  * This function learns the MAC address of the device and registers it, along with a
699  * VLAN tag, in the VMDQ.
700  */
701 static int
702 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
703 {
704 	struct rte_ether_hdr *pkt_hdr;
705 	int i, ret;
706 
707 	/* Learn MAC address of guest device from packet */
708 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
709 
710 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
711 		RTE_LOG(ERR, VHOST_DATA,
712 			"(%d) device is using a registered MAC!\n",
713 			vdev->vid);
714 		return -1;
715 	}
716 
717 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
718 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
719 
720 	/* vlan_tag currently uses the device_id. */
721 	vdev->vlan_tag = vlan_tags[vdev->vid];
722 
723 	/* Print out VMDQ registration info. */
724 	RTE_LOG(INFO, VHOST_DATA,
725 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
726 		vdev->vid,
727 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
728 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
729 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
730 		vdev->vlan_tag);
731 
732 	/* Register the MAC address. */
733 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
734 				(uint32_t)vdev->vid + vmdq_pool_base);
735 	if (ret)
736 		RTE_LOG(ERR, VHOST_DATA,
737 			"(%d) failed to add device MAC address to VMDQ\n",
738 			vdev->vid);
739 
740 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
741 
742 	/* Set device as ready for RX. */
743 	vdev->ready = DEVICE_RX;
744 
745 	return 0;
746 }
747 
748 /*
749  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
750  * queue before disabling RX on the device.
751  */
752 static inline void
753 unlink_vmdq(struct vhost_dev *vdev)
754 {
755 	unsigned i = 0;
756 	unsigned rx_count;
757 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
758 
759 	if (vdev->ready == DEVICE_RX) {
760 		/* Clear MAC and VLAN settings. */
761 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
762 		for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
763 			vdev->mac_address.addr_bytes[i] = 0;
764 
765 		vdev->vlan_tag = 0;
766 
767 		/* Clear out the receive buffers. */
768 		rx_count = rte_eth_rx_burst(ports[0],
769 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
770 
771 		while (rx_count) {
772 			for (i = 0; i < rx_count; i++)
773 				rte_pktmbuf_free(pkts_burst[i]);
774 
775 			rx_count = rte_eth_rx_burst(ports[0],
776 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
777 		}
778 
779 		vdev->ready = DEVICE_MAC_LEARNING;
780 	}
781 }
782 
783 static __rte_always_inline void
784 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
785 	    struct rte_mbuf *m)
786 {
787 	uint16_t ret;
788 
789 	if (builtin_net_driver) {
790 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
791 	} else {
792 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
793 	}
794 
795 	if (enable_stats) {
796 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
797 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
798 		src_vdev->stats.tx_total++;
799 		src_vdev->stats.tx += ret;
800 	}
801 }
802 
803 /*
804  * Check if the packet's destination MAC address is for a local device. If so, put
805  * the packet on that device's RX queue; otherwise return.
806  */
807 static __rte_always_inline int
808 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
809 {
810 	struct rte_ether_hdr *pkt_hdr;
811 	struct vhost_dev *dst_vdev;
812 
813 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
814 
815 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
816 	if (!dst_vdev)
817 		return -1;
818 
819 	if (vdev->vid == dst_vdev->vid) {
820 		RTE_LOG_DP(DEBUG, VHOST_DATA,
821 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
822 			vdev->vid);
823 		return 0;
824 	}
825 
826 	RTE_LOG_DP(DEBUG, VHOST_DATA,
827 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
828 
829 	if (unlikely(dst_vdev->remove)) {
830 		RTE_LOG_DP(DEBUG, VHOST_DATA,
831 			"(%d) device is marked for removal\n", dst_vdev->vid);
832 		return 0;
833 	}
834 
835 	virtio_xmit(dst_vdev, vdev, m);
836 	return 0;
837 }
838 
839 /*
840  * Check if the destination MAC of a packet belongs to a local VM;
841  * if it does, get its VLAN tag and the length offset.
842  */
843 static __rte_always_inline int
844 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
845 	uint32_t *offset, uint16_t *vlan_tag)
846 {
847 	struct vhost_dev *dst_vdev;
848 	struct rte_ether_hdr *pkt_hdr =
849 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
850 
851 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
852 	if (!dst_vdev)
853 		return 0;
854 
855 	if (vdev->vid == dst_vdev->vid) {
856 		RTE_LOG_DP(DEBUG, VHOST_DATA,
857 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
858 			vdev->vid);
859 		return -1;
860 	}
861 
862 	/*
863 	 * HW VLAN stripping reduces the packet length by the
864 	 * length of the VLAN tag, so restore the packet length
865 	 * by adding it back.
866 	 */
867 	*offset  = VLAN_HLEN;
868 	*vlan_tag = vlan_tags[vdev->vid];
869 
870 	RTE_LOG_DP(DEBUG, VHOST_DATA,
871 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
872 		vdev->vid, dst_vdev->vid, *vlan_tag);
873 
874 	return 0;
875 }
876 
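/* Return the L4 pseudo-header checksum, choosing the IPv4 or IPv6 variant
 * based on the mbuf offload flags. */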
877 static uint16_t
878 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
879 {
880 	if (ol_flags & PKT_TX_IPV4)
881 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
882 	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
883 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
884 }
885 
886 static void virtio_tx_offload(struct rte_mbuf *m)
887 {
888 	void *l3_hdr;
889 	struct rte_ipv4_hdr *ipv4_hdr = NULL;
890 	struct rte_tcp_hdr *tcp_hdr = NULL;
891 	struct rte_ether_hdr *eth_hdr =
892 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
893 
894 	l3_hdr = (char *)eth_hdr + m->l2_len;
895 
896 	if (m->ol_flags & PKT_TX_IPV4) {
897 		ipv4_hdr = l3_hdr;
898 		ipv4_hdr->hdr_checksum = 0;
899 		m->ol_flags |= PKT_TX_IP_CKSUM;
900 	}
901 
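	/* For TSO, the TCP checksum field must be seeded with the pseudo-header
	 * checksum; the device performing the segmentation completes it. */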
902 	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
903 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
904 }
905 
906 static inline void
907 free_pkts(struct rte_mbuf **pkts, uint16_t n)
908 {
909 	while (n--)
910 		rte_pktmbuf_free(pkts[n]);
911 }
912 
913 static __rte_always_inline void
914 do_drain_mbuf_table(struct mbuf_table *tx_q)
915 {
916 	uint16_t count;
917 
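	/* rte_eth_tx_burst() returns how many packets were actually queued for
	 * transmission; free the ones the port could not accept. */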
918 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
919 				 tx_q->m_table, tx_q->len);
920 	if (unlikely(count < tx_q->len))
921 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
922 
923 	tx_q->len = 0;
924 }
925 
926 /*
927  * This function routes the TX packet to the correct interface. This
928  * may be a local device or the physical port.
929  */
930 static __rte_always_inline void
931 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
932 {
933 	struct mbuf_table *tx_q;
934 	unsigned offset = 0;
935 	const uint16_t lcore_id = rte_lcore_id();
936 	struct rte_ether_hdr *nh;
937 
938 
939 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
940 	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
941 		struct vhost_dev *vdev2;
942 
943 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
944 			if (vdev2 != vdev)
945 				virtio_xmit(vdev2, vdev, m);
946 		}
947 		goto queue2nic;
948 	}
949 
950 	/* Check if the destination is a local VM. */
951 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
952 		rte_pktmbuf_free(m);
953 		return;
954 	}
955 
956 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
957 		if (unlikely(find_local_dest(vdev, m, &offset,
958 					     &vlan_tag) != 0)) {
959 			rte_pktmbuf_free(m);
960 			return;
961 		}
962 	}
963 
964 	RTE_LOG_DP(DEBUG, VHOST_DATA,
965 		"(%d) TX: MAC address is external\n", vdev->vid);
966 
967 queue2nic:
968 
969 	/* Add the packet to the port's TX queue. */
970 	tx_q = &lcore_tx_queue[lcore_id];
971 
972 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
973 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
974 		/* Guest has inserted the vlan tag. */
975 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
976 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
977 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
978 			(vh->vlan_tci != vlan_tag_be))
979 			vh->vlan_tci = vlan_tag_be;
980 	} else {
981 		m->ol_flags |= PKT_TX_VLAN_PKT;
982 
983 		/*
984 		 * Find the right segment to adjust the data length when the
985 		 * offset is bigger than the tailroom size.
986 		 */
987 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
988 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
989 				m->data_len += offset;
990 			else {
991 				struct rte_mbuf *seg = m;
992 
993 				while ((seg->next != NULL) &&
994 					(offset > rte_pktmbuf_tailroom(seg)))
995 					seg = seg->next;
996 
997 				seg->data_len += offset;
998 			}
999 			m->pkt_len += offset;
1000 		}
1001 
1002 		m->vlan_tci = vlan_tag;
1003 	}
1004 
1005 	if (m->ol_flags & PKT_TX_TCP_SEG)
1006 		virtio_tx_offload(m);
1007 
1008 	tx_q->m_table[tx_q->len++] = m;
1009 	if (enable_stats) {
1010 		vdev->stats.tx_total++;
1011 		vdev->stats.tx++;
1012 	}
1013 
1014 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1015 		do_drain_mbuf_table(tx_q);
1016 }
1017 
1018 
1019 static __rte_always_inline void
1020 drain_mbuf_table(struct mbuf_table *tx_q)
1021 {
1022 	static uint64_t prev_tsc;
1023 	uint64_t cur_tsc;
1024 
1025 	if (tx_q->len == 0)
1026 		return;
1027 
1028 	cur_tsc = rte_rdtsc();
1029 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1030 		prev_tsc = cur_tsc;
1031 
1032 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1033 			"TX queue drained after timeout with burst size %u\n",
1034 			tx_q->len);
1035 		do_drain_mbuf_table(tx_q);
1036 	}
1037 }
1038 
1039 static __rte_always_inline void
1040 drain_eth_rx(struct vhost_dev *vdev)
1041 {
1042 	uint16_t rx_count, enqueue_count;
1043 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1044 
1045 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1046 				    pkts, MAX_PKT_BURST);
1047 	if (!rx_count)
1048 		return;
1049 
1050 	/*
1051 	 * When "enable_retry" is set, wait and retry when there are not
1052 	 * enough free slots in the queue to hold @rx_count packets,
1053 	 * to diminish packet loss.
1054 	 */
1055 	if (enable_retry &&
1056 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1057 			VIRTIO_RXQ))) {
1058 		uint32_t retry;
1059 
1060 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1061 			rte_delay_us(burst_rx_delay_time);
1062 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1063 					VIRTIO_RXQ))
1064 				break;
1065 		}
1066 	}
1067 
1068 	if (builtin_net_driver) {
1069 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1070 						pkts, rx_count);
1071 	} else {
1072 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1073 						pkts, rx_count);
1074 	}
1075 	if (enable_stats) {
1076 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1077 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1078 	}
1079 
1080 	free_pkts(pkts, rx_count);
1081 }
1082 
1083 static __rte_always_inline void
1084 drain_virtio_tx(struct vhost_dev *vdev)
1085 {
1086 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1087 	uint16_t count;
1088 	uint16_t i;
1089 
1090 	if (builtin_net_driver) {
1091 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1092 					pkts, MAX_PKT_BURST);
1093 	} else {
1094 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1095 					mbuf_pool, pkts, MAX_PKT_BURST);
1096 	}
1097 
1098 	/* setup VMDq for the first packet */
1099 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1100 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1101 			free_pkts(pkts, count);
1102 	}
1103 
1104 	for (i = 0; i < count; ++i)
1105 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1106 }
1107 
1108 /*
1109  * Main function of vhost-switch. It basically does:
1110  *
1111  * for each vhost device {
1112  *    - drain_eth_rx()
1113  *
1114  *      Which drains the host eth Rx queue linked to the vhost device,
1115  *      Drains the host eth Rx queue linked to the vhost device
1116  *      and delivers all of the packets to the guest virtio Rx ring
1117  *      associated with this vhost device.
1118  *    - drain_virtio_tx()
1119  *
1120  *      Drains the guest virtio Tx queue and delivers all of the packets
1121  *      to the target, which could be another vhost device or the
1122  *      physical eth dev. The routing is done in function "virtio_tx_route".
1123  * }
1124  */
1125 static int
1126 switch_worker(void *arg __rte_unused)
1127 {
1128 	unsigned i;
1129 	unsigned lcore_id = rte_lcore_id();
1130 	struct vhost_dev *vdev;
1131 	struct mbuf_table *tx_q;
1132 
1133 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1134 
1135 	tx_q = &lcore_tx_queue[lcore_id];
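	/* Use this lcore's index within lcore_ids[] as its dedicated TX queue id
	 * on the physical port (one TX queue was configured per lcore). */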
1136 	for (i = 0; i < rte_lcore_count(); i++) {
1137 		if (lcore_ids[i] == lcore_id) {
1138 			tx_q->txq_id = i;
1139 			break;
1140 		}
1141 	}
1142 
1143 	while (1) {
1144 		drain_mbuf_table(tx_q);
1145 
1146 		/*
1147 		 * If a device removal was requested, acknowledge that this core
1148 		 * has left the linked list and is no longer using any device.
1149 		 */
1150 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1151 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1152 
1153 		/*
1154 		 * Process vhost devices
1155 		 */
1156 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1157 			      lcore_vdev_entry) {
1158 			if (unlikely(vdev->remove)) {
1159 				unlink_vmdq(vdev);
1160 				vdev->ready = DEVICE_SAFE_REMOVE;
1161 				continue;
1162 			}
1163 
1164 			if (likely(vdev->ready == DEVICE_RX))
1165 				drain_eth_rx(vdev);
1166 
1167 			if (likely(!vdev->remove))
1168 				drain_virtio_tx(vdev);
1169 		}
1170 	}
1171 
1172 	return 0;
1173 }
1174 
1175 /*
1176  * Remove a device from the specific data core linked list and from the
1177  * main linked list. Synchronization occurs through the use of the
1178  * lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
1179  * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
1180  */
1181 static void
1182 destroy_device(int vid)
1183 {
1184 	struct vhost_dev *vdev = NULL;
1185 	int lcore;
1186 
1187 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1188 		if (vdev->vid == vid)
1189 			break;
1190 	}
1191 	if (!vdev)
1192 		return;
1193 	/* Set the remove flag. */
1194 	vdev->remove = 1;
1195 	while (vdev->ready != DEVICE_SAFE_REMOVE) {
1196 		rte_pause();
1197 	}
1198 
1199 	if (builtin_net_driver)
1200 		vs_vhost_net_remove(vdev);
1201 
1202 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1203 		     lcore_vdev_entry);
1204 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1205 
1206 
1207 	/* Set the dev_removal_flag on each lcore. */
1208 	RTE_LCORE_FOREACH_SLAVE(lcore)
1209 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1210 
1211 	/*
1212 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1213 	 * we can be sure that they can no longer access the device removed
1214 	 * from the linked lists and that the devices are no longer in use.
1215 	 */
1216 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1217 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1218 			rte_pause();
1219 	}
1220 
1221 	lcore_info[vdev->coreid].device_num--;
1222 
1223 	RTE_LOG(INFO, VHOST_DATA,
1224 		"(%d) device has been removed from data core\n",
1225 		vdev->vid);
1226 
1227 	rte_free(vdev);
1228 }
1229 
1230 /*
1231  * A new device is added to a data core. First the device is added to the main linked list
1232  * and then allocated to a specific data core.
1233  */
1234 static int
1235 new_device(int vid)
1236 {
1237 	int lcore, core_add = 0;
1238 	uint32_t device_num_min = num_devices;
1239 	struct vhost_dev *vdev;
1240 
1241 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1242 	if (vdev == NULL) {
1243 		RTE_LOG(INFO, VHOST_DATA,
1244 			"(%d) couldn't allocate memory for vhost dev\n",
1245 			vid);
1246 		return -1;
1247 	}
1248 	vdev->vid = vid;
1249 
1250 	if (builtin_net_driver)
1251 		vs_vhost_net_setup(vdev);
1252 
1253 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
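	/* Each vhost device is served by its own VMDQ pool; take the first RX
	 * queue of pool 'vid'. */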
1254 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1255 
1256 	/* Reset the ready flag. */
1257 	vdev->ready = DEVICE_MAC_LEARNING;
1258 	vdev->remove = 0;
1259 
1260 	/* Find a suitable lcore to add the device. */
1261 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1262 		if (lcore_info[lcore].device_num < device_num_min) {
1263 			device_num_min = lcore_info[lcore].device_num;
1264 			core_add = lcore;
1265 		}
1266 	}
1267 	vdev->coreid = core_add;
1268 
1269 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1270 			  lcore_vdev_entry);
1271 	lcore_info[vdev->coreid].device_num++;
1272 
1273 	/* Disable notifications. */
1274 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1275 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1276 
1277 	RTE_LOG(INFO, VHOST_DATA,
1278 		"(%d) device has been added to data core %d\n",
1279 		vid, vdev->coreid);
1280 
1281 	return 0;
1282 }
1283 
1284 /*
1285  * These callbacks allow devices to be added to the data core when configuration
1286  * has been fully completed.
1287  */
1288 static const struct vhost_device_ops virtio_net_device_ops =
1289 {
1290 	.new_device =  new_device,
1291 	.destroy_device = destroy_device,
1292 };
1293 
1294 /*
1295  * This thread wakes up periodically to print stats if the user has
1296  * enabled them.
1297  */
1298 static void *
1299 print_stats(__rte_unused void *arg)
1300 {
1301 	struct vhost_dev *vdev;
1302 	uint64_t tx_dropped, rx_dropped;
1303 	uint64_t tx, tx_total, rx, rx_total;
1304 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1305 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1306 
1307 	while (1) {
1308 		sleep(enable_stats);
1309 
1310 		/* Clear screen and move to top left */
1311 		printf("%s%s\n", clr, top_left);
1312 		printf("Device statistics =================================\n");
1313 
1314 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1315 			tx_total   = vdev->stats.tx_total;
1316 			tx         = vdev->stats.tx;
1317 			tx_dropped = tx_total - tx;
1318 
1319 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1320 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1321 			rx_dropped = rx_total - rx;
1322 
1323 			printf("Statistics for device %d\n"
1324 				"-----------------------\n"
1325 				"TX total:              %" PRIu64 "\n"
1326 				"TX dropped:            %" PRIu64 "\n"
1327 				"TX successful:         %" PRIu64 "\n"
1328 				"RX total:              %" PRIu64 "\n"
1329 				"RX dropped:            %" PRIu64 "\n"
1330 				"RX successful:         %" PRIu64 "\n",
1331 				vdev->vid,
1332 				tx_total, tx_dropped, tx,
1333 				rx_total, rx_dropped, rx);
1334 		}
1335 
1336 		printf("===================================================\n");
1337 	}
1338 
1339 	return NULL;
1340 }
1341 
1342 static void
1343 unregister_drivers(int socket_num)
1344 {
1345 	int i, ret;
1346 
1347 	for (i = 0; i < socket_num; i++) {
1348 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1349 		if (ret != 0)
1350 			RTE_LOG(ERR, VHOST_CONFIG,
1351 				"Failed to unregister vhost driver for %s.\n",
1352 				socket_files + i * PATH_MAX);
1353 	}
1354 }
1355 
1356 /* When we receive a SIGINT, unregister the vhost driver. */
1357 static void
1358 sigint_handler(__rte_unused int signum)
1359 {
1360 	/* Unregister vhost driver. */
1361 	unregister_drivers(nb_sockets);
1362 
1363 	exit(0);
1364 }
1365 
1366 /*
1367  * While creating an mbuf pool, one key thing is to figure out how
1368  * many mbuf entries are enough for our use. FYI, here are some
1369  * guidelines:
1370  *
1371  * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup time.
1372  *
1373  * - For each switch core (a CPU core that does the packet switching), we
1374  *   also need to reserve some mbufs for receiving packets from the virtio
1375  *   Tx queue. How many are enough depends on the usage; it's normally
1376  *   a simple calculation like the following:
1377  *
1378  *       MAX_PKT_BURST * max packet size / mbuf size
1379  *
1380  *   So we definitely need to allocate more mbufs when TSO is enabled.
1381  *
1382  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1383  *   mbufs for receiving packets from the physical NIC device.
1384  *
1385  * - We also need to make sure that, for each switch core, we have
1386  *   allocated enough mbufs to fill up the mbuf cache.
1387  */
1388 static void
1389 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1390 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1391 {
1392 	uint32_t nr_mbufs;
1393 	uint32_t nr_mbufs_per_core;
1394 	uint32_t mtu = 1500;
1395 
1396 	if (mergeable)
1397 		mtu = 9000;
1398 	if (enable_tso)
1399 		mtu = 64 * 1024;
1400 
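	/* Rough upper bound on the mbufs a switch core needs to receive one
	 * MAX_PKT_BURST of MTU-sized packets, with each packet chained across
	 * mbufs holding (mbuf_size - RTE_PKTMBUF_HEADROOM) bytes of data. */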
1401 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1402 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1403 	nr_mbufs_per_core += nr_rx_desc;
1404 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1405 
1406 	nr_mbufs  = nr_queues * nr_rx_desc;
1407 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1408 	nr_mbufs *= nr_port;
1409 
1410 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1411 					    nr_mbuf_cache, 0, mbuf_size,
1412 					    rte_socket_id());
1413 	if (mbuf_pool == NULL)
1414 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1415 }
1416 
1417 /*
1418  * Main function, does initialisation and calls the per-lcore functions.
1419  */
1420 int
1421 main(int argc, char *argv[])
1422 {
1423 	unsigned lcore_id, core_id = 0;
1424 	unsigned nb_ports, valid_num_ports;
1425 	int ret, i;
1426 	uint16_t portid;
1427 	static pthread_t tid;
1428 	uint64_t flags = 0;
1429 
1430 	signal(SIGINT, sigint_handler);
1431 
1432 	/* init EAL */
1433 	ret = rte_eal_init(argc, argv);
1434 	if (ret < 0)
1435 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1436 	argc -= ret;
1437 	argv += ret;
1438 
1439 	/* parse app arguments */
1440 	ret = us_vhost_parse_args(argc, argv);
1441 	if (ret < 0)
1442 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1443 
1444 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1445 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1446 
1447 		if (rte_lcore_is_enabled(lcore_id))
1448 			lcore_ids[core_id++] = lcore_id;
1449 	}
1450 
1451 	if (rte_lcore_count() > RTE_MAX_LCORE)
1452 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1453 
1454 	/* Get the number of physical ports. */
1455 	nb_ports = rte_eth_dev_count_avail();
1456 
1457 	/*
1458 	 * Update the global variable num_ports and the global ports[] array,
1459 	 * and get the value of valid_num_ports according to the number of system ports.
1460 	 */
1461 	valid_num_ports = check_ports_num(nb_ports);
1462 
1463 	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1464 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1465 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1466 		return -1;
1467 	}
1468 
1469 	/*
1470 	 * FIXME: here we are trying to allocate mbufs big enough for
1471 	 * @MAX_QUEUES, but the truth is we're never going to use that
1472 	 * many queues here. We probably should only do allocation for
1473 	 * those queues we are going to use.
1474 	 */
1475 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1476 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1477 
1478 	if (vm2vm_mode == VM2VM_HARDWARE) {
1479 		/* Enable VT loop back to let L2 switch to do it. */
1480 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1481 		RTE_LOG(DEBUG, VHOST_CONFIG,
1482 			"Enable loop back for L2 switch in vmdq.\n");
1483 	}
1484 
1485 	/* initialize all ports */
1486 	RTE_ETH_FOREACH_DEV(portid) {
1487 		/* skip ports that are not enabled */
1488 		if ((enabled_port_mask & (1 << portid)) == 0) {
1489 			RTE_LOG(INFO, VHOST_PORT,
1490 				"Skipping disabled port %d\n", portid);
1491 			continue;
1492 		}
1493 		if (port_init(portid) != 0)
1494 			rte_exit(EXIT_FAILURE,
1495 				"Cannot initialize network ports\n");
1496 	}
1497 
1498 	/* Enable stats if the user option is set. */
1499 	if (enable_stats) {
1500 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1501 					print_stats, NULL);
1502 		if (ret < 0)
1503 			rte_exit(EXIT_FAILURE,
1504 				"Cannot create print-stats thread\n");
1505 	}
1506 
1507 	/* Launch all data cores. */
1508 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1509 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1510 
1511 	if (client_mode)
1512 		flags |= RTE_VHOST_USER_CLIENT;
1513 
1514 	if (dequeue_zero_copy)
1515 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1516 
1517 	/* Register vhost user driver to handle vhost messages. */
1518 	for (i = 0; i < nb_sockets; i++) {
1519 		char *file = socket_files + i * PATH_MAX;
1520 		ret = rte_vhost_driver_register(file, flags);
1521 		if (ret != 0) {
1522 			unregister_drivers(i);
1523 			rte_exit(EXIT_FAILURE,
1524 				"vhost driver register failure.\n");
1525 		}
1526 
1527 		if (builtin_net_driver)
1528 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1529 
1530 		if (mergeable == 0) {
1531 			rte_vhost_driver_disable_features(file,
1532 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1533 		}
1534 
1535 		if (enable_tx_csum == 0) {
1536 			rte_vhost_driver_disable_features(file,
1537 				1ULL << VIRTIO_NET_F_CSUM);
1538 		}
1539 
1540 		if (enable_tso == 0) {
1541 			rte_vhost_driver_disable_features(file,
1542 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1543 			rte_vhost_driver_disable_features(file,
1544 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1545 			rte_vhost_driver_disable_features(file,
1546 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1547 			rte_vhost_driver_disable_features(file,
1548 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1549 		}
1550 
1551 		if (promiscuous) {
1552 			rte_vhost_driver_enable_features(file,
1553 				1ULL << VIRTIO_NET_F_CTRL_RX);
1554 		}
1555 
1556 		ret = rte_vhost_driver_callback_register(file,
1557 			&virtio_net_device_ops);
1558 		if (ret != 0) {
1559 			rte_exit(EXIT_FAILURE,
1560 				"failed to register vhost driver callbacks.\n");
1561 		}
1562 
1563 		if (rte_vhost_driver_start(file) < 0) {
1564 			rte_exit(EXIT_FAILURE,
1565 				"failed to start vhost driver.\n");
1566 		}
1567 	}
1568 
1569 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1570 		rte_eal_wait_lcore(lcore_id);
1571 
1572 	return 0;
1573 
1574 }
1575