1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Max number of devices. Limited by vmdq. */
59 #define MAX_DEVICES 64
60 
61 /* Maximum long option length for option parsing. */
62 #define MAX_LONG_OPT_SZ 64
63 
64 /* mask of enabled ports */
65 static uint32_t enabled_port_mask = 0;
66 
67 /* Promiscuous mode */
68 static uint32_t promiscuous;
69 
70 /* number of devices/queues to support */
71 static uint32_t num_queues = 0;
72 static uint32_t num_devices;
73 
74 static struct rte_mempool *mbuf_pool;
75 static int mergeable;
76 
77 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
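/*
 * The three modes, summarized briefly as used in this example (see
 * virtio_tx_route() for the actual routing logic):
 *   VM2VM_DISABLED: all guest traffic is sent out through the physical port.
 *   VM2VM_SOFTWARE: the switch core compares destination MACs and delivers
 *                   VM-to-VM traffic directly between vhost devices.
 *   VM2VM_HARDWARE: VM-to-VM traffic is tagged with the destination VLAN and
 *                   looped back by the NIC (VMDq loopback is enabled in main()).
 */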
78 typedef enum {
79 	VM2VM_DISABLED = 0,
80 	VM2VM_SOFTWARE = 1,
81 	VM2VM_HARDWARE = 2,
82 	VM2VM_LAST
83 } vm2vm_type;
84 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
85 
86 /* Enable stats. */
87 static uint32_t enable_stats = 0;
88 /* Enable retries on RX. */
89 static uint32_t enable_retry = 1;
90 
91 /* Disable TX checksum offload */
92 static uint32_t enable_tx_csum;
93 
94 /* Disable TSO offload */
95 static uint32_t enable_tso;
96 
97 static int client_mode;
98 static int dequeue_zero_copy;
99 
100 static int builtin_net_driver;
101 
102 /* Specify timeout (in microseconds) between retries on RX. */
103 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
104 /* Specify the number of retries on RX. */
105 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
106 
107 /* Socket file paths. Can be set by user */
108 static char *socket_files;
109 static int nb_sockets;
110 
111 /* Empty VMDq configuration structure. Filled in programmatically. */
112 static struct rte_eth_conf vmdq_conf_default = {
113 	.rxmode = {
114 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
115 		.split_hdr_size = 0,
116 		/*
117 		 * VLAN stripping is necessary for 1G NICs such as the I350;
118 		 * it fixes a bug where IPv4 forwarding in the guest could not
119 		 * forward packets from one virtio dev to another virtio dev.
120 		 */
121 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
122 	},
123 
124 	.txmode = {
125 		.mq_mode = ETH_MQ_TX_NONE,
126 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
127 			     DEV_TX_OFFLOAD_TCP_CKSUM |
128 			     DEV_TX_OFFLOAD_VLAN_INSERT |
129 			     DEV_TX_OFFLOAD_MULTI_SEGS |
130 			     DEV_TX_OFFLOAD_TCP_TSO),
131 	},
132 	.rx_adv_conf = {
133 		/*
134 		 * should be overridden separately in code with
135 		 * appropriate values
136 		 */
137 		.vmdq_rx_conf = {
138 			.nb_queue_pools = ETH_8_POOLS,
139 			.enable_default_pool = 0,
140 			.default_pool = 0,
141 			.nb_pool_maps = 0,
142 			.pool_map = {{0, 0},},
143 		},
144 	},
145 };
146 
147 
148 static unsigned lcore_ids[RTE_MAX_LCORE];
149 static uint16_t ports[RTE_MAX_ETHPORTS];
150 static unsigned num_ports = 0; /**< The number of ports specified in command line */
151 static uint16_t num_pf_queues, num_vmdq_queues;
152 static uint16_t vmdq_pool_base, vmdq_queue_base;
153 static uint16_t queues_per_pool;
154 
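/*
 * Per-device VLAN tags: the device with id N is assigned vlan_tags[N]
 * (see link_vmdq()). The table has 64 entries, matching MAX_DEVICES.
 */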
155 const uint16_t vlan_tags[] = {
156 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
157 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
158 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
159 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
160 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
161 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
162 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
163 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
164 };
165 
166 /* ethernet addresses of ports */
167 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
168 
169 static struct vhost_dev_tailq_list vhost_dev_list =
170 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
171 
172 static struct lcore_info lcore_info[RTE_MAX_LCORE];
173 
174 /* Used for queueing bursts of TX packets. */
175 struct mbuf_table {
176 	unsigned len;
177 	unsigned txq_id;
178 	struct rte_mbuf *m_table[MAX_PKT_BURST];
179 };
180 
181 /* TX queue for each data core. */
182 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
183 
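/*
 * Number of TSC cycles corresponding to BURST_TX_DRAIN_US: the TSC frequency
 * is converted to cycles-per-microsecond with a ceiling division, then scaled
 * by the drain interval. As a rough illustration (assuming a 2 GHz TSC, which
 * is not guaranteed), this works out to about 2000 * 100 = 200000 cycles.
 */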
184 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
185 				 / US_PER_S * BURST_TX_DRAIN_US)
186 #define VLAN_HLEN       4
187 
188 /*
189  * Builds up the correct configuration for VMDQ VLAN pool map
190  * according to the pool & queue limits.
191  */
192 static inline int
193 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
194 {
195 	struct rte_eth_vmdq_rx_conf conf;
196 	struct rte_eth_vmdq_rx_conf *def_conf =
197 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
198 	unsigned i;
199 
200 	memset(&conf, 0, sizeof(conf));
201 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
202 	conf.nb_pool_maps = num_devices;
203 	conf.enable_loop_back = def_conf->enable_loop_back;
204 	conf.rx_mode = def_conf->rx_mode;
205 
206 	for (i = 0; i < conf.nb_pool_maps; i++) {
207 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
208 		conf.pool_map[i].pools = (1UL << i);
209 	}
210 
211 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
212 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
213 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
214 	return 0;
215 }
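/*
 * For illustration, with num_devices == 8 the map built by get_eth_conf()
 * above looks like:
 *   pool_map[0] = { .vlan_id = 1000, .pools = 0x01 }
 *   pool_map[1] = { .vlan_id = 1001, .pools = 0x02 }
 *   ...
 * i.e. each VMDq pool accepts exactly one VLAN tag, and each vhost device is
 * later given its own pool/VLAN pair.
 */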
216 
217 /*
218  * Validate the device number against the max pool number obtained from
219  * dev_info. If the device number is invalid, print an error message and
220  * return -1. Each device must have its own pool.
221  */
222 static inline int
223 validate_num_devices(uint32_t max_nb_devices)
224 {
225 	if (num_devices > max_nb_devices) {
226 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
227 		return -1;
228 	}
229 	return 0;
230 }
231 
232 /*
233  * Initialises a given port using global settings and with the rx buffers
234  * coming from the mbuf_pool passed as parameter
235  */
236 static inline int
237 port_init(uint16_t port)
238 {
239 	struct rte_eth_dev_info dev_info;
240 	struct rte_eth_conf port_conf;
241 	struct rte_eth_rxconf *rxconf;
242 	struct rte_eth_txconf *txconf;
243 	int16_t rx_rings, tx_rings;
244 	uint16_t rx_ring_size, tx_ring_size;
245 	int retval;
246 	uint16_t q;
247 
248 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
249 	rte_eth_dev_info_get (port, &dev_info);
250 
251 	rxconf = &dev_info.default_rxconf;
252 	txconf = &dev_info.default_txconf;
253 	rxconf->rx_drop_en = 1;
254 
255 	/* Configure the number of supported virtio devices based on VMDQ limits. */
256 	num_devices = dev_info.max_vmdq_pools;
257 
258 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
259 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
260 
261 	/*
262 	 * When dequeue zero copy is enabled, guest Tx used vring will be
263 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
264 	 * (tx_ring_size here) must be small enough so that the driver will
265 	 * hit the free threshold easily and free mbufs timely. Otherwise,
266 	 * guest Tx vring would be starved.
267 	 */
268 	if (dequeue_zero_copy)
269 		tx_ring_size = 64;
270 
271 	tx_rings = (uint16_t)rte_lcore_count();
272 
273 	retval = validate_num_devices(MAX_DEVICES);
274 	if (retval < 0)
275 		return retval;
276 
277 	/* Get port configuration. */
278 	retval = get_eth_conf(&port_conf, num_devices);
279 	if (retval < 0)
280 		return retval;
281 	/* NIC queues are divided into pf queues and vmdq queues.  */
282 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
283 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
284 	num_vmdq_queues = num_devices * queues_per_pool;
285 	num_queues = num_pf_queues + num_vmdq_queues;
286 	vmdq_queue_base = dev_info.vmdq_queue_base;
287 	vmdq_pool_base  = dev_info.vmdq_pool_base;
288 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
289 		num_pf_queues, num_devices, queues_per_pool);
290 
291 	if (!rte_eth_dev_is_valid_port(port))
292 		return -1;
293 
294 	rx_rings = (uint16_t)dev_info.max_rx_queues;
295 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
296 		port_conf.txmode.offloads |=
297 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
298 	/* Configure ethernet device. */
299 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
300 	if (retval != 0) {
301 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
302 			port, strerror(-retval));
303 		return retval;
304 	}
305 
306 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
307 		&tx_ring_size);
308 	if (retval != 0) {
309 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
310 			"for port %u: %s.\n", port, strerror(-retval));
311 		return retval;
312 	}
313 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
314 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
315 			"for Rx queues on port %u.\n", port);
316 		return -1;
317 	}
318 
319 	/* Setup the queues. */
320 	rxconf->offloads = port_conf.rxmode.offloads;
321 	for (q = 0; q < rx_rings; q ++) {
322 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
323 						rte_eth_dev_socket_id(port),
324 						rxconf,
325 						mbuf_pool);
326 		if (retval < 0) {
327 			RTE_LOG(ERR, VHOST_PORT,
328 				"Failed to setup rx queue %u of port %u: %s.\n",
329 				q, port, strerror(-retval));
330 			return retval;
331 		}
332 	}
333 	txconf->offloads = port_conf.txmode.offloads;
334 	for (q = 0; q < tx_rings; q ++) {
335 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
336 						rte_eth_dev_socket_id(port),
337 						txconf);
338 		if (retval < 0) {
339 			RTE_LOG(ERR, VHOST_PORT,
340 				"Failed to setup tx queue %u of port %u: %s.\n",
341 				q, port, strerror(-retval));
342 			return retval;
343 		}
344 	}
345 
346 	/* Start the device. */
347 	retval  = rte_eth_dev_start(port);
348 	if (retval < 0) {
349 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
350 			port, strerror(-retval));
351 		return retval;
352 	}
353 
354 	if (promiscuous)
355 		rte_eth_promiscuous_enable(port);
356 
357 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
358 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
359 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
360 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
361 			port,
362 			vmdq_ports_eth_addr[port].addr_bytes[0],
363 			vmdq_ports_eth_addr[port].addr_bytes[1],
364 			vmdq_ports_eth_addr[port].addr_bytes[2],
365 			vmdq_ports_eth_addr[port].addr_bytes[3],
366 			vmdq_ports_eth_addr[port].addr_bytes[4],
367 			vmdq_ports_eth_addr[port].addr_bytes[5]);
368 
369 	return 0;
370 }
371 
372 /*
373  * Set socket file path.
374  */
375 static int
376 us_vhost_parse_socket_path(const char *q_arg)
377 {
378 	/* reject socket paths that do not fit in PATH_MAX */
379 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
380 		return -1;
381 
382 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL)
		return -1;
383 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
384 	nb_sockets++;
385 
386 	return 0;
387 }
388 
389 /*
390  * Parse the portmask provided at run time.
391  */
392 static int
393 parse_portmask(const char *portmask)
394 {
395 	char *end = NULL;
396 	unsigned long pm;
397 
398 	errno = 0;
399 
400 	/* parse hexadecimal string */
401 	pm = strtoul(portmask, &end, 16);
402 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
403 		return -1;
404 
405 	if (pm == 0)
406 		return -1;
407 
408 	return pm;
409 
410 }
411 
412 /*
413  * Parse num options at run time.
414  */
415 static int
416 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
417 {
418 	char *end = NULL;
419 	unsigned long num;
420 
421 	errno = 0;
422 
423 	/* parse unsigned int string */
424 	num = strtoul(q_arg, &end, 10);
425 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
426 		return -1;
427 
428 	if (num > max_valid_value)
429 		return -1;
430 
431 	return num;
432 
433 }
434 
435 /*
436  * Display usage
437  */
438 static void
439 us_vhost_usage(const char *prgname)
440 {
441 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
442 	"		--vm2vm [0|1|2]\n"
443 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
444 	"		--socket-file <path>\n"
445 	"		--nb-devices ND\n"
446 	"		-p PORTMASK: Set mask for ports to be used by application\n"
447 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
448 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
449 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if rx retries are enabled\n"
450 	"		--rx-retry-num [0-N]: the number of retries on rx. Only takes effect if rx retries are enabled\n"
451 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
452 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
453 	"		--socket-file: The path of the socket file.\n"
454 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
455 	"		--tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
456 	"		--client register a vhost-user socket as client mode.\n"
457 	"		--dequeue-zero-copy enables dequeue zero copy\n",
458 	       prgname);
459 }
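/*
 * Example invocation (illustrative only; the EAL core/memory options, the
 * binary name and the socket path depend on the build and target system):
 *
 *   ./vhost-switch -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *       --stats 1 --mergeable 1
 */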
460 
461 /*
462  * Parse the arguments given in the command line of the application.
463  */
464 static int
465 us_vhost_parse_args(int argc, char **argv)
466 {
467 	int opt, ret;
468 	int option_index;
469 	unsigned i;
470 	const char *prgname = argv[0];
471 	static struct option long_option[] = {
472 		{"vm2vm", required_argument, NULL, 0},
473 		{"rx-retry", required_argument, NULL, 0},
474 		{"rx-retry-delay", required_argument, NULL, 0},
475 		{"rx-retry-num", required_argument, NULL, 0},
476 		{"mergeable", required_argument, NULL, 0},
477 		{"stats", required_argument, NULL, 0},
478 		{"socket-file", required_argument, NULL, 0},
479 		{"tx-csum", required_argument, NULL, 0},
480 		{"tso", required_argument, NULL, 0},
481 		{"client", no_argument, &client_mode, 1},
482 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
483 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
484 		{NULL, 0, 0, 0},
485 	};
486 
487 	/* Parse command line */
488 	while ((opt = getopt_long(argc, argv, "p:P",
489 			long_option, &option_index)) != EOF) {
490 		switch (opt) {
491 		/* Portmask */
492 		case 'p':
493 			ret = parse_portmask(optarg);
494 			if (ret == -1) {
495 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
496 				us_vhost_usage(prgname);
497 				return -1;
498 			}
			enabled_port_mask = ret;
499 			break;
500 
501 		case 'P':
502 			promiscuous = 1;
503 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
504 				ETH_VMDQ_ACCEPT_BROADCAST |
505 				ETH_VMDQ_ACCEPT_MULTICAST;
506 
507 			break;
508 
509 		case 0:
510 			/* Enable/disable vm2vm comms. */
511 			if (!strncmp(long_option[option_index].name, "vm2vm",
512 				MAX_LONG_OPT_SZ)) {
513 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
514 				if (ret == -1) {
515 					RTE_LOG(INFO, VHOST_CONFIG,
516 						"Invalid argument for "
517 						"vm2vm [0|1|2]\n");
518 					us_vhost_usage(prgname);
519 					return -1;
520 				} else {
521 					vm2vm_mode = (vm2vm_type)ret;
522 				}
523 			}
524 
525 			/* Enable/disable retries on RX. */
526 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
527 				ret = parse_num_opt(optarg, 1);
528 				if (ret == -1) {
529 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
530 					us_vhost_usage(prgname);
531 					return -1;
532 				} else {
533 					enable_retry = ret;
534 				}
535 			}
536 
537 			/* Enable/disable TX checksum offload. */
538 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
539 				ret = parse_num_opt(optarg, 1);
540 				if (ret == -1) {
541 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
542 					us_vhost_usage(prgname);
543 					return -1;
544 				} else
545 					enable_tx_csum = ret;
546 			}
547 
548 			/* Enable/disable TSO offload. */
549 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
550 				ret = parse_num_opt(optarg, 1);
551 				if (ret == -1) {
552 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
553 					us_vhost_usage(prgname);
554 					return -1;
555 				} else
556 					enable_tso = ret;
557 			}
558 
559 			/* Specify the retry delay time (in microseconds) on RX. */
560 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
561 				ret = parse_num_opt(optarg, INT32_MAX);
562 				if (ret == -1) {
563 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
564 					us_vhost_usage(prgname);
565 					return -1;
566 				} else {
567 					burst_rx_delay_time = ret;
568 				}
569 			}
570 
571 			/* Specify the number of retries on RX. */
572 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
573 				ret = parse_num_opt(optarg, INT32_MAX);
574 				if (ret == -1) {
575 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
576 					us_vhost_usage(prgname);
577 					return -1;
578 				} else {
579 					burst_rx_retry_num = ret;
580 				}
581 			}
582 
583 			/* Enable/disable RX mergeable buffers. */
584 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
585 				ret = parse_num_opt(optarg, 1);
586 				if (ret == -1) {
587 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
588 					us_vhost_usage(prgname);
589 					return -1;
590 				} else {
591 					mergeable = !!ret;
592 					if (ret) {
593 						vmdq_conf_default.rxmode.offloads |=
594 							DEV_RX_OFFLOAD_JUMBO_FRAME;
595 						vmdq_conf_default.rxmode.max_rx_pkt_len
596 							= JUMBO_FRAME_MAX_SIZE;
597 					}
598 				}
599 			}
600 
601 			/* Enable/disable stats. */
602 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
603 				ret = parse_num_opt(optarg, INT32_MAX);
604 				if (ret == -1) {
605 					RTE_LOG(INFO, VHOST_CONFIG,
606 						"Invalid argument for stats [0..N]\n");
607 					us_vhost_usage(prgname);
608 					return -1;
609 				} else {
610 					enable_stats = ret;
611 				}
612 			}
613 
614 			/* Set socket file path. */
615 			if (!strncmp(long_option[option_index].name,
616 						"socket-file", MAX_LONG_OPT_SZ)) {
617 				if (us_vhost_parse_socket_path(optarg) == -1) {
618 					RTE_LOG(INFO, VHOST_CONFIG,
619 					"Invalid argument for socket name (Max %d characters)\n",
620 					PATH_MAX);
621 					us_vhost_usage(prgname);
622 					return -1;
623 				}
624 			}
625 
626 			break;
627 
628 			/* Invalid option - print options. */
629 		default:
630 			us_vhost_usage(prgname);
631 			return -1;
632 		}
633 	}
634 
635 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
636 		if (enabled_port_mask & (1 << i))
637 			ports[num_ports++] = i;
638 	}
639 
640 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
641 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
642 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
643 		return -1;
644 	}
645 
646 	return 0;
647 }
648 
649 /*
650  * Update the global var NUM_PORTS and array PORTS according to the number of
651  * ports in the system, and return the number of valid ports.
652  */
653 static unsigned check_ports_num(unsigned nb_ports)
654 {
655 	unsigned valid_num_ports = num_ports;
656 	unsigned portid;
657 
658 	if (num_ports > nb_ports) {
659 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
660 			num_ports, nb_ports);
661 		num_ports = nb_ports;
662 	}
663 
664 	for (portid = 0; portid < num_ports; portid ++) {
665 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
666 			RTE_LOG(INFO, VHOST_PORT,
667 				"\nSpecified port ID(%u) is not valid\n",
668 				ports[portid]);
669 			ports[portid] = INVALID_PORT_ID;
670 			valid_num_ports--;
671 		}
672 	}
673 	return valid_num_ports;
674 }
675 
676 static __rte_always_inline struct vhost_dev *
677 find_vhost_dev(struct ether_addr *mac)
678 {
679 	struct vhost_dev *vdev;
680 
681 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
682 		if (vdev->ready == DEVICE_RX &&
683 		    is_same_ether_addr(mac, &vdev->mac_address))
684 			return vdev;
685 	}
686 
687 	return NULL;
688 }
689 
690 /*
691  * This function learns the MAC address of the device and registers it, along
692  * with a VLAN tag, with a VMDq pool.
693  */
694 static int
695 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
696 {
697 	struct ether_hdr *pkt_hdr;
698 	int i, ret;
699 
700 	/* Learn MAC address of guest device from packet */
701 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
702 
703 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
704 		RTE_LOG(ERR, VHOST_DATA,
705 			"(%d) device is using a registered MAC!\n",
706 			vdev->vid);
707 		return -1;
708 	}
709 
710 	for (i = 0; i < ETHER_ADDR_LEN; i++)
711 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
712 
713 	/* vlan_tag currently uses the device_id. */
714 	vdev->vlan_tag = vlan_tags[vdev->vid];
715 
716 	/* Print out VMDQ registration info. */
717 	RTE_LOG(INFO, VHOST_DATA,
718 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
719 		vdev->vid,
720 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
721 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
722 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
723 		vdev->vlan_tag);
724 
725 	/* Register the MAC address. */
726 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
727 				(uint32_t)vdev->vid + vmdq_pool_base);
728 	if (ret)
729 		RTE_LOG(ERR, VHOST_DATA,
730 			"(%d) failed to add device MAC address to VMDQ\n",
731 			vdev->vid);
732 
733 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
734 
735 	/* Set device as ready for RX. */
736 	vdev->ready = DEVICE_RX;
737 
738 	return 0;
739 }
740 
741 /*
742  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
743  * queue before disabling RX on the device.
744  */
745 static inline void
746 unlink_vmdq(struct vhost_dev *vdev)
747 {
748 	unsigned i = 0;
749 	unsigned rx_count;
750 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
751 
752 	if (vdev->ready == DEVICE_RX) {
753 		/*clear MAC and VLAN settings*/
754 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
755 		for (i = 0; i < 6; i++)
756 			vdev->mac_address.addr_bytes[i] = 0;
757 
758 		vdev->vlan_tag = 0;
759 
760 		/*Clear out the receive buffers*/
761 		rx_count = rte_eth_rx_burst(ports[0],
762 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
763 
764 		while (rx_count) {
765 			for (i = 0; i < rx_count; i++)
766 				rte_pktmbuf_free(pkts_burst[i]);
767 
768 			rx_count = rte_eth_rx_burst(ports[0],
769 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
770 		}
771 
772 		vdev->ready = DEVICE_MAC_LEARNING;
773 	}
774 }
775 
776 static __rte_always_inline void
777 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
778 	    struct rte_mbuf *m)
779 {
780 	uint16_t ret;
781 
782 	if (builtin_net_driver) {
783 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
784 	} else {
785 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
786 	}
787 
788 	if (enable_stats) {
789 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
790 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
791 		src_vdev->stats.tx_total++;
792 		src_vdev->stats.tx += ret;
793 	}
794 }
795 
796 /*
797  * Check if the packet destination MAC address is for a local device. If so then put
798  * the packet on that device's RX queue. If not then return.
799  */
800 static __rte_always_inline int
801 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
802 {
803 	struct ether_hdr *pkt_hdr;
804 	struct vhost_dev *dst_vdev;
805 
806 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
807 
808 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
809 	if (!dst_vdev)
810 		return -1;
811 
812 	if (vdev->vid == dst_vdev->vid) {
813 		RTE_LOG_DP(DEBUG, VHOST_DATA,
814 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
815 			vdev->vid);
816 		return 0;
817 	}
818 
819 	RTE_LOG_DP(DEBUG, VHOST_DATA,
820 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
821 
822 	if (unlikely(dst_vdev->remove)) {
823 		RTE_LOG_DP(DEBUG, VHOST_DATA,
824 			"(%d) device is marked for removal\n", dst_vdev->vid);
825 		return 0;
826 	}
827 
828 	virtio_xmit(dst_vdev, vdev, m);
829 	return 0;
830 }
831 
832 /*
833  * Check if the destination MAC of a packet belongs to a local VM; if it
834  * does, get its VLAN tag and the length offset.
835  */
836 static __rte_always_inline int
837 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
838 	uint32_t *offset, uint16_t *vlan_tag)
839 {
840 	struct vhost_dev *dst_vdev;
841 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
842 
843 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
844 	if (!dst_vdev)
845 		return 0;
846 
847 	if (vdev->vid == dst_vdev->vid) {
848 		RTE_LOG_DP(DEBUG, VHOST_DATA,
849 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
850 			vdev->vid);
851 		return -1;
852 	}
853 
854 	/*
855 	 * HW VLAN stripping reduces the packet length by the length of
856 	 * the VLAN tag, so the packet length needs to be restored by
857 	 * adding it back.
858 	 */
859 	*offset  = VLAN_HLEN;
860 	*vlan_tag = vlan_tags[vdev->vid];
861 
862 	RTE_LOG_DP(DEBUG, VHOST_DATA,
863 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
864 		vdev->vid, dst_vdev->vid, *vlan_tag);
865 
866 	return 0;
867 }
868 
869 static uint16_t
870 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
871 {
872 	if (ol_flags & PKT_TX_IPV4)
873 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
874 	else /* assume ethertype == ETHER_TYPE_IPv6 */
875 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
876 }
877 
878 static void virtio_tx_offload(struct rte_mbuf *m)
879 {
880 	void *l3_hdr;
881 	struct ipv4_hdr *ipv4_hdr = NULL;
882 	struct tcp_hdr *tcp_hdr = NULL;
883 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
884 
885 	l3_hdr = (char *)eth_hdr + m->l2_len;
886 
887 	if (m->ol_flags & PKT_TX_IPV4) {
888 		ipv4_hdr = l3_hdr;
889 		ipv4_hdr->hdr_checksum = 0;
890 		m->ol_flags |= PKT_TX_IP_CKSUM;
891 	}
892 
893 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
894 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
895 }
896 
897 static inline void
898 free_pkts(struct rte_mbuf **pkts, uint16_t n)
899 {
900 	while (n--)
901 		rte_pktmbuf_free(pkts[n]);
902 }
903 
904 static __rte_always_inline void
905 do_drain_mbuf_table(struct mbuf_table *tx_q)
906 {
907 	uint16_t count;
908 
909 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
910 				 tx_q->m_table, tx_q->len);
911 	if (unlikely(count < tx_q->len))
912 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
913 
914 	tx_q->len = 0;
915 }
916 
917 /*
918  * This function routes the TX packet to the correct interface. This
919  * may be a local device or the physical port.
920  */
921 static __rte_always_inline void
922 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
923 {
924 	struct mbuf_table *tx_q;
925 	unsigned offset = 0;
926 	const uint16_t lcore_id = rte_lcore_id();
927 	struct ether_hdr *nh;
928 
929 
930 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
931 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
932 		struct vhost_dev *vdev2;
933 
934 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
935 			if (vdev2 != vdev)
936 				virtio_xmit(vdev2, vdev, m);
937 		}
938 		goto queue2nic;
939 	}
940 
941 	/*check if destination is local VM*/
942 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
943 		rte_pktmbuf_free(m);
944 		return;
945 	}
946 
947 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
948 		if (unlikely(find_local_dest(vdev, m, &offset,
949 					     &vlan_tag) != 0)) {
950 			rte_pktmbuf_free(m);
951 			return;
952 		}
953 	}
954 
955 	RTE_LOG_DP(DEBUG, VHOST_DATA,
956 		"(%d) TX: MAC address is external\n", vdev->vid);
957 
958 queue2nic:
959 
960 	/*Add packet to the port tx queue*/
961 	tx_q = &lcore_tx_queue[lcore_id];
962 
963 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
964 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
965 		/* Guest has inserted the vlan tag. */
966 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
967 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
968 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
969 			(vh->vlan_tci != vlan_tag_be))
970 			vh->vlan_tci = vlan_tag_be;
971 	} else {
972 		m->ol_flags |= PKT_TX_VLAN_PKT;
973 
974 		/*
975 		 * Find the right seg to adjust the data len when offset is
976 		 * bigger than tail room size.
977 		 */
978 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
979 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
980 				m->data_len += offset;
981 			else {
982 				struct rte_mbuf *seg = m;
983 
984 				while ((seg->next != NULL) &&
985 					(offset > rte_pktmbuf_tailroom(seg)))
986 					seg = seg->next;
987 
988 				seg->data_len += offset;
989 			}
990 			m->pkt_len += offset;
991 		}
992 
993 		m->vlan_tci = vlan_tag;
994 	}
995 
996 	if (m->ol_flags & PKT_TX_TCP_SEG)
997 		virtio_tx_offload(m);
998 
999 	tx_q->m_table[tx_q->len++] = m;
1000 	if (enable_stats) {
1001 		vdev->stats.tx_total++;
1002 		vdev->stats.tx++;
1003 	}
1004 
1005 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1006 		do_drain_mbuf_table(tx_q);
1007 }
1008 
1009 
1010 static __rte_always_inline void
1011 drain_mbuf_table(struct mbuf_table *tx_q)
1012 {
1013 	static uint64_t prev_tsc;
1014 	uint64_t cur_tsc;
1015 
1016 	if (tx_q->len == 0)
1017 		return;
1018 
1019 	cur_tsc = rte_rdtsc();
1020 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1021 		prev_tsc = cur_tsc;
1022 
1023 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1024 			"TX queue drained after timeout with burst size %u\n",
1025 			tx_q->len);
1026 		do_drain_mbuf_table(tx_q);
1027 	}
1028 }
1029 
1030 static __rte_always_inline void
1031 drain_eth_rx(struct vhost_dev *vdev)
1032 {
1033 	uint16_t rx_count, enqueue_count;
1034 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1035 
1036 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1037 				    pkts, MAX_PKT_BURST);
1038 	if (!rx_count)
1039 		return;
1040 
1041 	/*
1042 	 * When "enable_retry" is set, we wait and retry when there are not
1043 	 * enough free slots in the queue to hold @rx_count packets, to
1044 	 * diminish packet loss.
1045 	 */
1046 	if (enable_retry &&
1047 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1048 			VIRTIO_RXQ))) {
1049 		uint32_t retry;
1050 
1051 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1052 			rte_delay_us(burst_rx_delay_time);
1053 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1054 					VIRTIO_RXQ))
1055 				break;
1056 		}
1057 	}
1058 
1059 	if (builtin_net_driver) {
1060 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1061 						pkts, rx_count);
1062 	} else {
1063 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1064 						pkts, rx_count);
1065 	}
1066 	if (enable_stats) {
1067 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1068 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1069 	}
1070 
1071 	free_pkts(pkts, rx_count);
1072 }
1073 
1074 static __rte_always_inline void
1075 drain_virtio_tx(struct vhost_dev *vdev)
1076 {
1077 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1078 	uint16_t count;
1079 	uint16_t i;
1080 
1081 	if (builtin_net_driver) {
1082 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1083 					pkts, MAX_PKT_BURST);
1084 	} else {
1085 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1086 					mbuf_pool, pkts, MAX_PKT_BURST);
1087 	}
1088 
1089 	/* setup VMDq for the first packet */
1090 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1091 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1092 			free_pkts(pkts, count);
1093 	}
1094 
1095 	for (i = 0; i < count; ++i)
1096 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1097 }
1098 
1099 /*
1100  * Main function of vhost-switch. It basically does:
1101  *
1102  * for each vhost device {
1103  *    - drain_eth_rx()
1104  *
1105  *      Which drains the host eth Rx queue linked to the vhost device,
1106  *      and delivers all of them to the guest virtio Rx ring associated with
1107  *      this vhost device.
1108  *
1109  *    - drain_virtio_tx()
1110  *
1111  *      Which drains the guest virtio Tx queue and delivers all of them
1112  *      to the target, which could be another vhost device, or the
1113  *      physical eth dev. The route is done in function "virtio_tx_route".
1114  * }
1115  */
1116 static int
1117 switch_worker(void *arg __rte_unused)
1118 {
1119 	unsigned i;
1120 	unsigned lcore_id = rte_lcore_id();
1121 	struct vhost_dev *vdev;
1122 	struct mbuf_table *tx_q;
1123 
1124 	RTE_LOG(INFO, VHOST_DATA, "Processing on core %u started\n", lcore_id);
1125 
1126 	tx_q = &lcore_tx_queue[lcore_id];
1127 	for (i = 0; i < rte_lcore_count(); i++) {
1128 		if (lcore_ids[i] == lcore_id) {
1129 			tx_q->txq_id = i;
1130 			break;
1131 		}
1132 	}
1133 
1134 	while(1) {
1135 		drain_mbuf_table(tx_q);
1136 
1137 		/*
1138 		 * Inform the configuration core that we have exited the
1139 		 * linked list and that no devices are in use if requested.
1140 		 */
1141 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1142 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1143 
1144 		/*
1145 		 * Process vhost devices
1146 		 */
1147 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1148 			      lcore_vdev_entry) {
1149 			if (unlikely(vdev->remove)) {
1150 				unlink_vmdq(vdev);
1151 				vdev->ready = DEVICE_SAFE_REMOVE;
1152 				continue;
1153 			}
1154 
1155 			if (likely(vdev->ready == DEVICE_RX))
1156 				drain_eth_rx(vdev);
1157 
1158 			if (likely(!vdev->remove))
1159 				drain_virtio_tx(vdev);
1160 		}
1161 	}
1162 
1163 	return 0;
1164 }
1165 
1166 /*
1167  * Remove a device from the specific data core linked list and from the
1168  * main linked list. Synchronization occurs through the use of the
1169  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1170  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1171  */
1172 static void
1173 destroy_device(int vid)
1174 {
1175 	struct vhost_dev *vdev = NULL;
1176 	int lcore;
1177 
1178 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1179 		if (vdev->vid == vid)
1180 			break;
1181 	}
1182 	if (!vdev)
1183 		return;
1184 	/*set the remove flag. */
1185 	vdev->remove = 1;
1186 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1187 		rte_pause();
1188 	}
1189 
1190 	if (builtin_net_driver)
1191 		vs_vhost_net_remove(vdev);
1192 
1193 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1194 		     lcore_vdev_entry);
1195 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1196 
1197 
1198 	/* Set the dev_removal_flag on each lcore. */
1199 	RTE_LCORE_FOREACH_SLAVE(lcore)
1200 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1201 
1202 	/*
1203 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1204 	 * we can be sure that they can no longer access the device removed
1205 	 * from the linked lists and that the devices are no longer in use.
1206 	 */
1207 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1208 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1209 			rte_pause();
1210 	}
1211 
1212 	lcore_info[vdev->coreid].device_num--;
1213 
1214 	RTE_LOG(INFO, VHOST_DATA,
1215 		"(%d) device has been removed from data core\n",
1216 		vdev->vid);
1217 
1218 	rte_free(vdev);
1219 }
1220 
1221 /*
1222  * A new device is added to a data core. First the device is added to the main linked list
1223  * and then allocated to a specific data core.
1224  */
1225 static int
1226 new_device(int vid)
1227 {
1228 	int lcore, core_add = 0;
1229 	uint32_t device_num_min = num_devices;
1230 	struct vhost_dev *vdev;
1231 
1232 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1233 	if (vdev == NULL) {
1234 		RTE_LOG(INFO, VHOST_DATA,
1235 			"(%d) couldn't allocate memory for vhost dev\n",
1236 			vid);
1237 		return -1;
1238 	}
1239 	vdev->vid = vid;
1240 
1241 	if (builtin_net_driver)
1242 		vs_vhost_net_setup(vdev);
1243 
1244 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1245 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1246 
1247 	/*reset ready flag*/
1248 	vdev->ready = DEVICE_MAC_LEARNING;
1249 	vdev->remove = 0;
1250 
1251 	/* Find a suitable lcore to add the device. */
1252 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1253 		if (lcore_info[lcore].device_num < device_num_min) {
1254 			device_num_min = lcore_info[lcore].device_num;
1255 			core_add = lcore;
1256 		}
1257 	}
1258 	vdev->coreid = core_add;
1259 
1260 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1261 			  lcore_vdev_entry);
1262 	lcore_info[vdev->coreid].device_num++;
1263 
1264 	/* Disable notifications. */
1265 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1266 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1267 
1268 	RTE_LOG(INFO, VHOST_DATA,
1269 		"(%d) device has been added to data core %d\n",
1270 		vid, vdev->coreid);
1271 
1272 	return 0;
1273 }
1274 
1275 /*
1276  * These callbacks allow devices to be added to the data core when configuration
1277  * has fully completed.
1278  */
1279 static const struct vhost_device_ops virtio_net_device_ops =
1280 {
1281 	.new_device =  new_device,
1282 	.destroy_device = destroy_device,
1283 };
1284 
1285 /*
1286  * This is a thread that wakes up periodically to print stats if the user has
1287  * enabled them.
1288  */
1289 static void *
1290 print_stats(__rte_unused void *arg)
1291 {
1292 	struct vhost_dev *vdev;
1293 	uint64_t tx_dropped, rx_dropped;
1294 	uint64_t tx, tx_total, rx, rx_total;
1295 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1296 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1297 
1298 	while(1) {
1299 		sleep(enable_stats);
1300 
1301 		/* Clear screen and move to top left */
1302 		printf("%s%s\n", clr, top_left);
1303 		printf("Device statistics =================================\n");
1304 
1305 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1306 			tx_total   = vdev->stats.tx_total;
1307 			tx         = vdev->stats.tx;
1308 			tx_dropped = tx_total - tx;
1309 
1310 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1311 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1312 			rx_dropped = rx_total - rx;
1313 
1314 			printf("Statistics for device %d\n"
1315 				"-----------------------\n"
1316 				"TX total:              %" PRIu64 "\n"
1317 				"TX dropped:            %" PRIu64 "\n"
1318 				"TX successful:         %" PRIu64 "\n"
1319 				"RX total:              %" PRIu64 "\n"
1320 				"RX dropped:            %" PRIu64 "\n"
1321 				"RX successful:         %" PRIu64 "\n",
1322 				vdev->vid,
1323 				tx_total, tx_dropped, tx,
1324 				rx_total, rx_dropped, rx);
1325 		}
1326 
1327 		printf("===================================================\n");
1328 	}
1329 
1330 	return NULL;
1331 }
1332 
1333 static void
1334 unregister_drivers(int socket_num)
1335 {
1336 	int i, ret;
1337 
1338 	for (i = 0; i < socket_num; i++) {
1339 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1340 		if (ret != 0)
1341 			RTE_LOG(ERR, VHOST_CONFIG,
1342 				"Fail to unregister vhost driver for %s.\n",
1343 				socket_files + i * PATH_MAX);
1344 	}
1345 }
1346 
1347 /* When we receive a SIGINT signal, unregister the vhost driver */
1348 static void
1349 sigint_handler(__rte_unused int signum)
1350 {
1351 	/* Unregister vhost driver. */
1352 	unregister_drivers(nb_sockets);
1353 
1354 	exit(0);
1355 }
1356 
1357 /*
1358  * While creating an mbuf pool, one key thing is to figure out how
1359  * many mbuf entries are enough for our use. FYI, here are some
1360  * guidelines:
1361  *
1362  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1363  *
1364  * - For each switch core (a CPU core that does the packet switching), we
1365  *   also need to make some reservation for receiving the packets from the
1366  *   virtio Tx queue. How many is enough depends on the usage. It's normally
1367  *   a simple calculation like the following:
1368  *
1369  *       MAX_PKT_BURST * max packet size / mbuf size
1370  *
1371  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1372  *
1373  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1374  *   mbufs for receiving the packets from the physical NIC device.
1375  *
1376  * - We also need to make sure that, for each switch core, we have allocated
1377  *   enough mbufs to fill up the mbuf cache.
1378  */
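/*
 * A rough worked example of the per-core reservation computed in
 * create_mbuf_pool() below (numbers are only illustrative; they assume
 * MAX_PKT_BURST == 32 and the default 2176-byte MBUF_DATA_SIZE): with TSO
 * enabled (mtu = 64K), nr_mbufs_per_core evaluates to roughly
 * (65536 + 2176) * 32 / (2176 - 128) ~= 1058 mbufs, plus the nr_rx_desc
 * (1024 by default) reserved for the NIC Rx ring.
 */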
1379 static void
1380 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1381 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1382 {
1383 	uint32_t nr_mbufs;
1384 	uint32_t nr_mbufs_per_core;
1385 	uint32_t mtu = 1500;
1386 
1387 	if (mergeable)
1388 		mtu = 9000;
1389 	if (enable_tso)
1390 		mtu = 64 * 1024;
1391 
1392 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1393 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1394 	nr_mbufs_per_core += nr_rx_desc;
1395 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1396 
1397 	nr_mbufs  = nr_queues * nr_rx_desc;
1398 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1399 	nr_mbufs *= nr_port;
1400 
1401 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1402 					    nr_mbuf_cache, 0, mbuf_size,
1403 					    rte_socket_id());
1404 	if (mbuf_pool == NULL)
1405 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1406 }
1407 
1408 /*
1409  * Main function, does initialisation and calls the per-lcore functions.
1410  */
1411 int
1412 main(int argc, char *argv[])
1413 {
1414 	unsigned lcore_id, core_id = 0;
1415 	unsigned nb_ports, valid_num_ports;
1416 	int ret, i;
1417 	uint16_t portid;
1418 	static pthread_t tid;
1419 	uint64_t flags = 0;
1420 
1421 	signal(SIGINT, sigint_handler);
1422 
1423 	/* init EAL */
1424 	ret = rte_eal_init(argc, argv);
1425 	if (ret < 0)
1426 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1427 	argc -= ret;
1428 	argv += ret;
1429 
1430 	/* parse app arguments */
1431 	ret = us_vhost_parse_args(argc, argv);
1432 	if (ret < 0)
1433 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1434 
1435 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1436 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1437 
1438 		if (rte_lcore_is_enabled(lcore_id))
1439 			lcore_ids[core_id++] = lcore_id;
1440 	}
1441 
1442 	if (rte_lcore_count() > RTE_MAX_LCORE)
1443 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1444 
1445 	/* Get the number of physical ports. */
1446 	nb_ports = rte_eth_dev_count_avail();
1447 
1448 	/*
1449 	 * Update the global var NUM_PORTS and global array PORTS,
1450 	 * and get the value of VALID_NUM_PORTS according to the system port count.
1451 	 */
1452 	valid_num_ports = check_ports_num(nb_ports);
1453 
1454 	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1455 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1456 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1457 		return -1;
1458 	}
1459 
1460 	/*
1461 	 * FIXME: here we are trying to allocate mbufs big enough for
1462 	 * @MAX_QUEUES, but the truth is we're never going to use that
1463 	 * many queues here. We probably should only do allocation for
1464 	 * those queues we are going to use.
1465 	 */
1466 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1467 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1468 
1469 	if (vm2vm_mode == VM2VM_HARDWARE) {
1470 		/* Enable VT loop back to let L2 switch to do it. */
1471 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1472 		RTE_LOG(DEBUG, VHOST_CONFIG,
1473 			"Enable loop back for L2 switch in vmdq.\n");
1474 	}
1475 
1476 	/* initialize all ports */
1477 	RTE_ETH_FOREACH_DEV(portid) {
1478 		/* skip ports that are not enabled */
1479 		if ((enabled_port_mask & (1 << portid)) == 0) {
1480 			RTE_LOG(INFO, VHOST_PORT,
1481 				"Skipping disabled port %d\n", portid);
1482 			continue;
1483 		}
1484 		if (port_init(portid) != 0)
1485 			rte_exit(EXIT_FAILURE,
1486 				"Cannot initialize network ports\n");
1487 	}
1488 
1489 	/* Enable stats if the user option is set. */
1490 	if (enable_stats) {
1491 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1492 					print_stats, NULL);
1493 		if (ret < 0)
1494 			rte_exit(EXIT_FAILURE,
1495 				"Cannot create print-stats thread\n");
1496 	}
1497 
1498 	/* Launch all data cores. */
1499 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1500 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1501 
1502 	if (client_mode)
1503 		flags |= RTE_VHOST_USER_CLIENT;
1504 
1505 	if (dequeue_zero_copy)
1506 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1507 
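	/*
	 * Note: each registered socket acts as a vhost-user server unless
	 * --client is given. A guest is typically attached with QEMU options
	 * along the lines of (illustrative only; the VM memory must be a
	 * shared memory-backend-file for vhost-user to work):
	 *   -chardev socket,id=char0,path=/tmp/sock0
	 *   -netdev type=vhost-user,id=net0,chardev=char0
	 *   -device virtio-net-pci,netdev=net0
	 */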
1508 	/* Register vhost user driver to handle vhost messages. */
1509 	for (i = 0; i < nb_sockets; i++) {
1510 		char *file = socket_files + i * PATH_MAX;
1511 		ret = rte_vhost_driver_register(file, flags);
1512 		if (ret != 0) {
1513 			unregister_drivers(i);
1514 			rte_exit(EXIT_FAILURE,
1515 				"vhost driver register failure.\n");
1516 		}
1517 
1518 		if (builtin_net_driver)
1519 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1520 
1521 		if (mergeable == 0) {
1522 			rte_vhost_driver_disable_features(file,
1523 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1524 		}
1525 
1526 		if (enable_tx_csum == 0) {
1527 			rte_vhost_driver_disable_features(file,
1528 				1ULL << VIRTIO_NET_F_CSUM);
1529 		}
1530 
1531 		if (enable_tso == 0) {
1532 			rte_vhost_driver_disable_features(file,
1533 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1534 			rte_vhost_driver_disable_features(file,
1535 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1536 			rte_vhost_driver_disable_features(file,
1537 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1538 			rte_vhost_driver_disable_features(file,
1539 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1540 		}
1541 
1542 		if (promiscuous) {
1543 			rte_vhost_driver_enable_features(file,
1544 				1ULL << VIRTIO_NET_F_CTRL_RX);
1545 		}
1546 
1547 		ret = rte_vhost_driver_callback_register(file,
1548 			&virtio_net_device_ops);
1549 		if (ret != 0) {
1550 			rte_exit(EXIT_FAILURE,
1551 				"failed to register vhost driver callbacks.\n");
1552 		}
1553 
1554 		if (rte_vhost_driver_start(file) < 0) {
1555 			rte_exit(EXIT_FAILURE,
1556 				"failed to start vhost driver.\n");
1557 		}
1558 	}
1559 
1560 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1561 		rte_eal_wait_lcore(lcore_id);
1562 
1563 	return 0;
1564 
1565 }
1566