xref: /dpdk/examples/vhost/main.c (revision bc8e32473cc3978d763a1387eaa8244bcf75e77d)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "ioat.h"
29 #include "main.h"
30 
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34 
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37 
38 #define MBUF_CACHE_SIZE	128
39 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
40 
41 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
42 
43 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
45 
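/* 0x2600 = 9728 bytes: room for a 9000-byte jumbo frame plus headers; used as
 * max_rx_pkt_len when --mergeable is enabled.
 */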
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47 
48 /* State of virtio device. */
49 #define DEVICE_MAC_LEARNING 0
50 #define DEVICE_RX			1
51 #define DEVICE_SAFE_REMOVE	2
52 
53 /* Configurable number of RX/TX ring descriptors */
54 #define RTE_TEST_RX_DESC_DEFAULT 1024
55 #define RTE_TEST_TX_DESC_DEFAULT 512
56 
57 #define INVALID_PORT_ID 0xFF
58 
59 /* Maximum long option length for option parsing. */
60 #define MAX_LONG_OPT_SZ 64
61 
62 /* mask of enabled ports */
63 static uint32_t enabled_port_mask = 0;
64 
65 /* Promiscuous mode */
66 static uint32_t promiscuous;
67 
68 /* number of devices/queues to support */
69 static uint32_t num_queues = 0;
70 static uint32_t num_devices;
71 
72 static struct rte_mempool *mbuf_pool;
73 static int mergeable;
74 
75 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
76 typedef enum {
77 	VM2VM_DISABLED = 0,
78 	VM2VM_SOFTWARE = 1,
79 	VM2VM_HARDWARE = 2,
80 	VM2VM_LAST
81 } vm2vm_type;
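/*
 * VM2VM_SOFTWARE: this application switches VM-to-VM traffic itself via a MAC
 * lookup; VM2VM_HARDWARE: the NIC's VMDq loopback does the switching instead
 * (see the "Enable VT loop back" setup in main()).
 */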
82 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
83 
84 /* Enable stats. */
85 static uint32_t enable_stats = 0;
86 /* Enable retries on RX. */
87 static uint32_t enable_retry = 1;
88 
89 /* Disable TX checksum offload */
90 static uint32_t enable_tx_csum;
91 
92 /* Disable TSO offload */
93 static uint32_t enable_tso;
94 
95 static int client_mode;
96 
97 static int builtin_net_driver;
98 
99 static int async_vhost_driver;
100 
101 static char dma_type[MAX_LONG_OPT_SZ];
102 
103 /* Specify timeout (in microseconds) between retries on RX. */
104 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
105 /* Specify the number of retries on RX. */
106 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
107 
108 /* Socket file paths. Can be set by user */
109 static char *socket_files;
110 static int nb_sockets;
111 
112 /* empty vmdq configuration structure. Filled in programmatically */
113 static struct rte_eth_conf vmdq_conf_default = {
114 	.rxmode = {
115 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
116 		.split_hdr_size = 0,
117 		/*
118 		 * VLAN strip is necessary for 1G NICs such as I350;
119 		 * it fixes a bug where IPv4 forwarding in the guest can't
120 		 * forward packets from one virtio dev to another virtio dev.
121 		 */
122 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
123 	},
124 
125 	.txmode = {
126 		.mq_mode = ETH_MQ_TX_NONE,
127 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
128 			     DEV_TX_OFFLOAD_TCP_CKSUM |
129 			     DEV_TX_OFFLOAD_VLAN_INSERT |
130 			     DEV_TX_OFFLOAD_MULTI_SEGS |
131 			     DEV_TX_OFFLOAD_TCP_TSO),
132 	},
133 	.rx_adv_conf = {
134 		/*
135 		 * should be overridden separately in code with
136 		 * appropriate values
137 		 */
138 		.vmdq_rx_conf = {
139 			.nb_queue_pools = ETH_8_POOLS,
140 			.enable_default_pool = 0,
141 			.default_pool = 0,
142 			.nb_pool_maps = 0,
143 			.pool_map = {{0, 0},},
144 		},
145 	},
146 };
147 
148 
149 static unsigned lcore_ids[RTE_MAX_LCORE];
150 static uint16_t ports[RTE_MAX_ETHPORTS];
151 static unsigned num_ports = 0; /**< The number of ports specified in command line */
152 static uint16_t num_pf_queues, num_vmdq_queues;
153 static uint16_t vmdq_pool_base, vmdq_queue_base;
154 static uint16_t queues_per_pool;
155 
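/*
 * One VLAN tag per VMDQ pool: the vhost device with id vid uses
 * vlan_tags[vid], and get_eth_conf() maps pool i to vlan_tags[i].
 */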
156 const uint16_t vlan_tags[] = {
157 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
158 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
159 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
160 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
161 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
162 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
163 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
164 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
165 };
166 
167 /* ethernet addresses of ports */
168 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
169 
170 static struct vhost_dev_tailq_list vhost_dev_list =
171 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
172 
173 static struct lcore_info lcore_info[RTE_MAX_LCORE];
174 
175 /* Used for queueing bursts of TX packets. */
176 struct mbuf_table {
177 	unsigned len;
178 	unsigned txq_id;
179 	struct rte_mbuf *m_table[MAX_PKT_BURST];
180 };
181 
182 /* TX queue for each data core. */
183 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
184 
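/*
 * TX drain period expressed in TSC cycles: cycles per microsecond
 * (rounded up) multiplied by BURST_TX_DRAIN_US.
 */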
185 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
186 				 / US_PER_S * BURST_TX_DRAIN_US)
187 #define VLAN_HLEN       4
188 
189 static inline int
190 open_dma(const char *value)
191 {
192 	if (strncmp(dma_type, "ioat", 4) == 0)
193 		return open_ioat(value);
194 
195 	return -1;
196 }
197 
198 /*
199  * Builds up the correct configuration for VMDQ VLAN pool map
200  * according to the pool & queue limits.
201  */
202 static inline int
203 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
204 {
205 	struct rte_eth_vmdq_rx_conf conf;
206 	struct rte_eth_vmdq_rx_conf *def_conf =
207 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
208 	unsigned i;
209 
210 	memset(&conf, 0, sizeof(conf));
211 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
212 	conf.nb_pool_maps = num_devices;
213 	conf.enable_loop_back = def_conf->enable_loop_back;
214 	conf.rx_mode = def_conf->rx_mode;
215 
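	/*
	 * Give each pool its own VLAN: pool i accepts vlan_tags[i] and is
	 * selected by the (1 << i) pool bitmask.
	 */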
216 	for (i = 0; i < conf.nb_pool_maps; i++) {
217 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
218 		conf.pool_map[i].pools = (1UL << i);
219 	}
220 
221 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
222 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
223 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
224 	return 0;
225 }
226 
227 /*
228  * Initialises a given port using global settings and with the rx buffers
229  * coming from the mbuf_pool passed as parameter
230  */
231 static inline int
232 port_init(uint16_t port)
233 {
234 	struct rte_eth_dev_info dev_info;
235 	struct rte_eth_conf port_conf;
236 	struct rte_eth_rxconf *rxconf;
237 	struct rte_eth_txconf *txconf;
238 	int16_t rx_rings, tx_rings;
239 	uint16_t rx_ring_size, tx_ring_size;
240 	int retval;
241 	uint16_t q;
242 
243 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
244 	retval = rte_eth_dev_info_get(port, &dev_info);
245 	if (retval != 0) {
246 		RTE_LOG(ERR, VHOST_PORT,
247 			"Error during getting device (port %u) info: %s\n",
248 			port, strerror(-retval));
249 
250 		return retval;
251 	}
252 
253 	rxconf = &dev_info.default_rxconf;
254 	txconf = &dev_info.default_txconf;
255 	rxconf->rx_drop_en = 1;
256 
257 	/* Configure the number of supported virtio devices based on VMDQ limits. */
258 	num_devices = dev_info.max_vmdq_pools;
259 
260 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
261 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
262 
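	/* One NIC TX queue per lcore so each switch_worker() can transmit without locking. */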
263 	tx_rings = (uint16_t)rte_lcore_count();
264 
265 	/* Get port configuration. */
266 	retval = get_eth_conf(&port_conf, num_devices);
267 	if (retval < 0)
268 		return retval;
269 	/* NIC queues are divided into pf queues and vmdq queues.  */
270 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
271 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
272 	num_vmdq_queues = num_devices * queues_per_pool;
273 	num_queues = num_pf_queues + num_vmdq_queues;
274 	vmdq_queue_base = dev_info.vmdq_queue_base;
275 	vmdq_pool_base  = dev_info.vmdq_pool_base;
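	/*
	 * Pool n (the vhost device with vid n) receives on NIC queue
	 * vmdq_queue_base + n * queues_per_pool; see new_device().
	 */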
276 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
277 		num_pf_queues, num_devices, queues_per_pool);
278 
279 	if (!rte_eth_dev_is_valid_port(port))
280 		return -1;
281 
282 	rx_rings = (uint16_t)dev_info.max_rx_queues;
283 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
284 		port_conf.txmode.offloads |=
285 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
286 	/* Configure ethernet device. */
287 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
288 	if (retval != 0) {
289 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
290 			port, strerror(-retval));
291 		return retval;
292 	}
293 
294 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
295 		&tx_ring_size);
296 	if (retval != 0) {
297 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
298 			"for port %u: %s.\n", port, strerror(-retval));
299 		return retval;
300 	}
301 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
302 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
303 			"for Rx queues on port %u.\n", port);
304 		return -1;
305 	}
306 
307 	/* Setup the queues. */
308 	rxconf->offloads = port_conf.rxmode.offloads;
309 	for (q = 0; q < rx_rings; q ++) {
310 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
311 						rte_eth_dev_socket_id(port),
312 						rxconf,
313 						mbuf_pool);
314 		if (retval < 0) {
315 			RTE_LOG(ERR, VHOST_PORT,
316 				"Failed to setup rx queue %u of port %u: %s.\n",
317 				q, port, strerror(-retval));
318 			return retval;
319 		}
320 	}
321 	txconf->offloads = port_conf.txmode.offloads;
322 	for (q = 0; q < tx_rings; q ++) {
323 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
324 						rte_eth_dev_socket_id(port),
325 						txconf);
326 		if (retval < 0) {
327 			RTE_LOG(ERR, VHOST_PORT,
328 				"Failed to setup tx queue %u of port %u: %s.\n",
329 				q, port, strerror(-retval));
330 			return retval;
331 		}
332 	}
333 
334 	/* Start the device. */
335 	retval  = rte_eth_dev_start(port);
336 	if (retval < 0) {
337 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
338 			port, strerror(-retval));
339 		return retval;
340 	}
341 
342 	if (promiscuous) {
343 		retval = rte_eth_promiscuous_enable(port);
344 		if (retval != 0) {
345 			RTE_LOG(ERR, VHOST_PORT,
346 				"Failed to enable promiscuous mode on port %u: %s\n",
347 				port, rte_strerror(-retval));
348 			return retval;
349 		}
350 	}
351 
352 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
353 	if (retval < 0) {
354 		RTE_LOG(ERR, VHOST_PORT,
355 			"Failed to get MAC address on port %u: %s\n",
356 			port, rte_strerror(-retval));
357 		return retval;
358 	}
359 
360 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
361 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
362 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
363 			port,
364 			vmdq_ports_eth_addr[port].addr_bytes[0],
365 			vmdq_ports_eth_addr[port].addr_bytes[1],
366 			vmdq_ports_eth_addr[port].addr_bytes[2],
367 			vmdq_ports_eth_addr[port].addr_bytes[3],
368 			vmdq_ports_eth_addr[port].addr_bytes[4],
369 			vmdq_ports_eth_addr[port].addr_bytes[5]);
370 
371 	return 0;
372 }
373 
374 /*
375  * Set socket file path.
376  */
377 static int
378 us_vhost_parse_socket_path(const char *q_arg)
379 {
380 	char *old;
381 
382 	/* Reject socket paths that are too long. */
383 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
384 		return -1;
385 
386 	old = socket_files;
387 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
388 	if (socket_files == NULL) {
389 		free(old);
390 		return -1;
391 	}
392 
393 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
394 	nb_sockets++;
395 
396 	return 0;
397 }
398 
399 /*
400  * Parse the portmask provided at run time.
401  */
402 static int
403 parse_portmask(const char *portmask)
404 {
405 	char *end = NULL;
406 	unsigned long pm;
407 
408 	errno = 0;
409 
410 	/* parse hexadecimal string */
411 	pm = strtoul(portmask, &end, 16);
412 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
413 		return 0;
414 
415 	return pm;
416 
417 }
418 
419 /*
420  * Parse num options at run time.
421  */
422 static int
423 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
424 {
425 	char *end = NULL;
426 	unsigned long num;
427 
428 	errno = 0;
429 
430 	/* parse unsigned int string */
431 	num = strtoul(q_arg, &end, 10);
432 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
433 		return -1;
434 
435 	if (num > max_valid_value)
436 		return -1;
437 
438 	return num;
439 
440 }
441 
442 /*
443  * Display usage
444  */
445 static void
446 us_vhost_usage(const char *prgname)
447 {
448 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
449 	"		--vm2vm [0|1|2]\n"
450 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
451 	"		--socket-file <path>\n"
452 	"		--nb-devices ND\n"
453 	"		-p PORTMASK: Set mask for ports to be used by application\n"
454 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
455 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
456 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
457 	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
458 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
459 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
460 	"		--socket-file: The path of the socket file.\n"
461 	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
462 	"		--tso [0|1]: disable/enable TCP segmentation offload.\n"
463 	"		--client: register vhost-user sockets in client mode.\n"
464 	"		--dma-type: DMA type for the vhost async driver, e.g. \"ioat\".\n"
465 	"		--dmas: register a DMA channel for a specific vhost device.\n",
466 	       prgname);
467 }
468 
469 /*
470  * Parse the arguments given in the command line of the application.
471  */
472 static int
473 us_vhost_parse_args(int argc, char **argv)
474 {
475 	int opt, ret;
476 	int option_index;
477 	unsigned i;
478 	const char *prgname = argv[0];
479 	static struct option long_option[] = {
480 		{"vm2vm", required_argument, NULL, 0},
481 		{"rx-retry", required_argument, NULL, 0},
482 		{"rx-retry-delay", required_argument, NULL, 0},
483 		{"rx-retry-num", required_argument, NULL, 0},
484 		{"mergeable", required_argument, NULL, 0},
485 		{"stats", required_argument, NULL, 0},
486 		{"socket-file", required_argument, NULL, 0},
487 		{"tx-csum", required_argument, NULL, 0},
488 		{"tso", required_argument, NULL, 0},
489 		{"client", no_argument, &client_mode, 1},
490 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
491 		{"dma-type", required_argument, NULL, 0},
492 		{"dmas", required_argument, NULL, 0},
493 		{NULL, 0, 0, 0},
494 	};
495 
496 	/* Parse command line */
497 	while ((opt = getopt_long(argc, argv, "p:P",
498 			long_option, &option_index)) != EOF) {
499 		switch (opt) {
500 		/* Portmask */
501 		case 'p':
502 			enabled_port_mask = parse_portmask(optarg);
503 			if (enabled_port_mask == 0) {
504 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
505 				us_vhost_usage(prgname);
506 				return -1;
507 			}
508 			break;
509 
510 		case 'P':
511 			promiscuous = 1;
512 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
513 				ETH_VMDQ_ACCEPT_BROADCAST |
514 				ETH_VMDQ_ACCEPT_MULTICAST;
515 
516 			break;
517 
518 		case 0:
519 			/* Enable/disable vm2vm comms. */
520 			if (!strncmp(long_option[option_index].name, "vm2vm",
521 				MAX_LONG_OPT_SZ)) {
522 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
523 				if (ret == -1) {
524 					RTE_LOG(INFO, VHOST_CONFIG,
525 						"Invalid argument for "
526 						"vm2vm [0|1|2]\n");
527 					us_vhost_usage(prgname);
528 					return -1;
529 				} else {
530 					vm2vm_mode = (vm2vm_type)ret;
531 				}
532 			}
533 
534 			/* Enable/disable retries on RX. */
535 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
536 				ret = parse_num_opt(optarg, 1);
537 				if (ret == -1) {
538 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
539 					us_vhost_usage(prgname);
540 					return -1;
541 				} else {
542 					enable_retry = ret;
543 				}
544 			}
545 
546 			/* Enable/disable TX checksum offload. */
547 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
548 				ret = parse_num_opt(optarg, 1);
549 				if (ret == -1) {
550 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
551 					us_vhost_usage(prgname);
552 					return -1;
553 				} else
554 					enable_tx_csum = ret;
555 			}
556 
557 			/* Enable/disable TSO offload. */
558 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
559 				ret = parse_num_opt(optarg, 1);
560 				if (ret == -1) {
561 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
562 					us_vhost_usage(prgname);
563 					return -1;
564 				} else
565 					enable_tso = ret;
566 			}
567 
568 			/* Specify the retry delay time (in microseconds) on RX. */
569 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
570 				ret = parse_num_opt(optarg, INT32_MAX);
571 				if (ret == -1) {
572 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
573 					us_vhost_usage(prgname);
574 					return -1;
575 				} else {
576 					burst_rx_delay_time = ret;
577 				}
578 			}
579 
580 			/* Specify the number of retries on RX. */
581 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
582 				ret = parse_num_opt(optarg, INT32_MAX);
583 				if (ret == -1) {
584 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
585 					us_vhost_usage(prgname);
586 					return -1;
587 				} else {
588 					burst_rx_retry_num = ret;
589 				}
590 			}
591 
592 			/* Enable/disable RX mergeable buffers. */
593 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
594 				ret = parse_num_opt(optarg, 1);
595 				if (ret == -1) {
596 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
597 					us_vhost_usage(prgname);
598 					return -1;
599 				} else {
600 					mergeable = !!ret;
601 					if (ret) {
602 						vmdq_conf_default.rxmode.offloads |=
603 							DEV_RX_OFFLOAD_JUMBO_FRAME;
604 						vmdq_conf_default.rxmode.max_rx_pkt_len
605 							= JUMBO_FRAME_MAX_SIZE;
606 					}
607 				}
608 			}
609 
610 			/* Enable/disable stats. */
611 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
612 				ret = parse_num_opt(optarg, INT32_MAX);
613 				if (ret == -1) {
614 					RTE_LOG(INFO, VHOST_CONFIG,
615 						"Invalid argument for stats [0..N]\n");
616 					us_vhost_usage(prgname);
617 					return -1;
618 				} else {
619 					enable_stats = ret;
620 				}
621 			}
622 
623 			/* Set socket file path. */
624 			if (!strncmp(long_option[option_index].name,
625 						"socket-file", MAX_LONG_OPT_SZ)) {
626 				if (us_vhost_parse_socket_path(optarg) == -1) {
627 					RTE_LOG(INFO, VHOST_CONFIG,
628 					"Invalid argument for socket name (Max %d characters)\n",
629 					PATH_MAX);
630 					us_vhost_usage(prgname);
631 					return -1;
632 				}
633 			}
634 
635 			if (!strncmp(long_option[option_index].name,
636 						"dma-type", MAX_LONG_OPT_SZ)) {
637 				if (strlen(optarg) >= MAX_LONG_OPT_SZ) {
638 					RTE_LOG(INFO, VHOST_CONFIG,
639 						"Wrong DMA type\n");
640 					us_vhost_usage(prgname);
641 					return -1;
642 				}
643 				strcpy(dma_type, optarg);
644 			}
645 
646 			if (!strncmp(long_option[option_index].name,
647 						"dmas", MAX_LONG_OPT_SZ)) {
648 				if (open_dma(optarg) == -1) {
649 					RTE_LOG(INFO, VHOST_CONFIG,
650 						"Wrong DMA args\n");
651 					us_vhost_usage(prgname);
652 					return -1;
653 				}
654 				async_vhost_driver = 1;
655 			}
656 
657 			break;
658 
659 			/* Invalid option - print options. */
660 		default:
661 			us_vhost_usage(prgname);
662 			return -1;
663 		}
664 	}
665 
666 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
667 		if (enabled_port_mask & (1 << i))
668 			ports[num_ports++] = i;
669 	}
670 
671 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
672 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
673 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
674 		return -1;
675 	}
676 
677 	return 0;
678 }
679 
680 /*
681  * Update the global variable num_ports and the array ports[] according to
682  * the number of system ports, and return the number of valid ports.
683  */
684 static unsigned check_ports_num(unsigned nb_ports)
685 {
686 	unsigned valid_num_ports = num_ports;
687 	unsigned portid;
688 
689 	if (num_ports > nb_ports) {
690 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
691 			num_ports, nb_ports);
692 		num_ports = nb_ports;
693 	}
694 
695 	for (portid = 0; portid < num_ports; portid ++) {
696 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
697 			RTE_LOG(INFO, VHOST_PORT,
698 				"\nSpecified port ID(%u) is not valid\n",
699 				ports[portid]);
700 			ports[portid] = INVALID_PORT_ID;
701 			valid_num_ports--;
702 		}
703 	}
704 	return valid_num_ports;
705 }
706 
707 static __rte_always_inline struct vhost_dev *
708 find_vhost_dev(struct rte_ether_addr *mac)
709 {
710 	struct vhost_dev *vdev;
711 
712 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
713 		if (vdev->ready == DEVICE_RX &&
714 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
715 			return vdev;
716 	}
717 
718 	return NULL;
719 }
720 
721 /*
722  * This function learns the MAC address of the device and registers it, along
723  * with a vlan tag, with a VMDq pool.
724  */
725 static int
726 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
727 {
728 	struct rte_ether_hdr *pkt_hdr;
729 	int i, ret;
730 
731 	/* Learn MAC address of guest device from packet */
732 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
733 
734 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
735 		RTE_LOG(ERR, VHOST_DATA,
736 			"(%d) device is using a registered MAC!\n",
737 			vdev->vid);
738 		return -1;
739 	}
740 
741 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
742 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
743 
744 	/* vlan_tag currently uses the device_id. */
745 	vdev->vlan_tag = vlan_tags[vdev->vid];
746 
747 	/* Print out VMDQ registration info. */
748 	RTE_LOG(INFO, VHOST_DATA,
749 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
750 		vdev->vid,
751 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
752 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
753 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
754 		vdev->vlan_tag);
755 
756 	/* Register the MAC address. */
757 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
758 				(uint32_t)vdev->vid + vmdq_pool_base);
759 	if (ret)
760 		RTE_LOG(ERR, VHOST_DATA,
761 			"(%d) failed to add device MAC address to VMDQ\n",
762 			vdev->vid);
763 
764 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
765 
766 	/* Set device as ready for RX. */
767 	vdev->ready = DEVICE_RX;
768 
769 	return 0;
770 }
771 
772 /*
773  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
774  * queue before disabling RX on the device.
775  */
776 static inline void
777 unlink_vmdq(struct vhost_dev *vdev)
778 {
779 	unsigned i = 0;
780 	unsigned rx_count;
781 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
782 
783 	if (vdev->ready == DEVICE_RX) {
784 		/*clear MAC and VLAN settings*/
785 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
786 		for (i = 0; i < 6; i++)
787 			vdev->mac_address.addr_bytes[i] = 0;
788 
789 		vdev->vlan_tag = 0;
790 
791 		/*Clear out the receive buffers*/
792 		rx_count = rte_eth_rx_burst(ports[0],
793 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
794 
795 		while (rx_count) {
796 			for (i = 0; i < rx_count; i++)
797 				rte_pktmbuf_free(pkts_burst[i]);
798 
799 			rx_count = rte_eth_rx_burst(ports[0],
800 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
801 		}
802 
803 		vdev->ready = DEVICE_MAC_LEARNING;
804 	}
805 }
806 
807 static __rte_always_inline void
808 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
809 	    struct rte_mbuf *m)
810 {
811 	uint16_t ret;
812 	struct rte_mbuf *m_cpl[1];
813 
814 	if (builtin_net_driver) {
815 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
816 	} else if (async_vhost_driver) {
817 		ret = rte_vhost_submit_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ,
818 						&m, 1);
819 
820 		if (likely(ret))
821 			dst_vdev->nr_async_pkts++;
822 
823 		while (likely(dst_vdev->nr_async_pkts)) {
824 			if (rte_vhost_poll_enqueue_completed(dst_vdev->vid,
825 					VIRTIO_RXQ, m_cpl, 1))
826 				dst_vdev->nr_async_pkts--;
827 		}
828 	} else {
829 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
830 	}
831 
832 	if (enable_stats) {
833 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
834 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
835 		src_vdev->stats.tx_total++;
836 		src_vdev->stats.tx += ret;
837 	}
838 }
839 
840 /*
841  * Check if the packet destination MAC address is for a local device. If so then put
842  * the packet on that device's RX queue. If not then return.
843  */
844 static __rte_always_inline int
845 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
846 {
847 	struct rte_ether_hdr *pkt_hdr;
848 	struct vhost_dev *dst_vdev;
849 
850 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
851 
852 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
853 	if (!dst_vdev)
854 		return -1;
855 
856 	if (vdev->vid == dst_vdev->vid) {
857 		RTE_LOG_DP(DEBUG, VHOST_DATA,
858 			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
859 			vdev->vid);
860 		return 0;
861 	}
862 
863 	RTE_LOG_DP(DEBUG, VHOST_DATA,
864 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
865 
866 	if (unlikely(dst_vdev->remove)) {
867 		RTE_LOG_DP(DEBUG, VHOST_DATA,
868 			"(%d) device is marked for removal\n", dst_vdev->vid);
869 		return 0;
870 	}
871 
872 	virtio_xmit(dst_vdev, vdev, m);
873 	return 0;
874 }
875 
876 /*
877  * Check if the destination MAC of a packet belongs to a local VM; if so,
878  * return its vlan tag and the length offset to apply.
879  */
880 static __rte_always_inline int
881 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
882 	uint32_t *offset, uint16_t *vlan_tag)
883 {
884 	struct vhost_dev *dst_vdev;
885 	struct rte_ether_hdr *pkt_hdr =
886 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
887 
888 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
889 	if (!dst_vdev)
890 		return 0;
891 
892 	if (vdev->vid == dst_vdev->vid) {
893 		RTE_LOG_DP(DEBUG, VHOST_DATA,
894 			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
895 			vdev->vid);
896 		return -1;
897 	}
898 
899 	/*
900 	 * HW vlan strip reduces the packet length by the length of
901 	 * the vlan tag, so the packet length needs to be restored
902 	 * by adding it back.
903 	 */
904 	*offset  = VLAN_HLEN;
905 	*vlan_tag = vlan_tags[vdev->vid];
906 
907 	RTE_LOG_DP(DEBUG, VHOST_DATA,
908 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
909 		vdev->vid, dst_vdev->vid, *vlan_tag);
910 
911 	return 0;
912 }
913 
914 static uint16_t
915 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
916 {
917 	if (ol_flags & PKT_TX_IPV4)
918 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
919 	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
920 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
921 }
922 
923 static void virtio_tx_offload(struct rte_mbuf *m)
924 {
925 	void *l3_hdr;
926 	struct rte_ipv4_hdr *ipv4_hdr = NULL;
927 	struct rte_tcp_hdr *tcp_hdr = NULL;
928 	struct rte_ether_hdr *eth_hdr =
929 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
930 
931 	l3_hdr = (char *)eth_hdr + m->l2_len;
932 
933 	if (m->ol_flags & PKT_TX_IPV4) {
934 		ipv4_hdr = l3_hdr;
935 		ipv4_hdr->hdr_checksum = 0;
936 		m->ol_flags |= PKT_TX_IP_CKSUM;
937 	}
938 
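	/*
	 * For TSO the NIC expects the TCP checksum field to be pre-filled with
	 * the pseudo-header checksum; the hardware then completes the checksum
	 * for every segment it generates.
	 */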
939 	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
940 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
941 }
942 
943 static inline void
944 free_pkts(struct rte_mbuf **pkts, uint16_t n)
945 {
946 	while (n--)
947 		rte_pktmbuf_free(pkts[n]);
948 }
949 
950 static __rte_always_inline void
951 do_drain_mbuf_table(struct mbuf_table *tx_q)
952 {
953 	uint16_t count;
954 
955 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
956 				 tx_q->m_table, tx_q->len);
957 	if (unlikely(count < tx_q->len))
958 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
959 
960 	tx_q->len = 0;
961 }
962 
963 /*
964  * This function routes the TX packet to the correct interface. This
965  * may be a local device or the physical port.
966  */
967 static __rte_always_inline void
968 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
969 {
970 	struct mbuf_table *tx_q;
971 	unsigned offset = 0;
972 	const uint16_t lcore_id = rte_lcore_id();
973 	struct rte_ether_hdr *nh;
974 
975 
976 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
977 	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
978 		struct vhost_dev *vdev2;
979 
980 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
981 			if (vdev2 != vdev)
982 				virtio_xmit(vdev2, vdev, m);
983 		}
984 		goto queue2nic;
985 	}
986 
987 	/*check if destination is local VM*/
988 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
989 		rte_pktmbuf_free(m);
990 		return;
991 	}
992 
993 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
994 		if (unlikely(find_local_dest(vdev, m, &offset,
995 					     &vlan_tag) != 0)) {
996 			rte_pktmbuf_free(m);
997 			return;
998 		}
999 	}
1000 
1001 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1002 		"(%d) TX: MAC address is external\n", vdev->vid);
1003 
1004 queue2nic:
1005 
1006 	/*Add packet to the port tx queue*/
1007 	tx_q = &lcore_tx_queue[lcore_id];
1008 
1009 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1010 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1011 		/* Guest has inserted the vlan tag. */
1012 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1013 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1014 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1015 			(vh->vlan_tci != vlan_tag_be))
1016 			vh->vlan_tci = vlan_tag_be;
1017 	} else {
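		/* PKT_TX_VLAN_PKT asks the NIC to insert the 802.1Q tag from m->vlan_tci (set below). */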
1018 		m->ol_flags |= PKT_TX_VLAN_PKT;
1019 
1020 		/*
1021 		 * Find the right seg to adjust the data len when offset is
1022 		 * bigger than tail room size.
1023 		 */
1024 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1025 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1026 				m->data_len += offset;
1027 			else {
1028 				struct rte_mbuf *seg = m;
1029 
1030 				while ((seg->next != NULL) &&
1031 					(offset > rte_pktmbuf_tailroom(seg)))
1032 					seg = seg->next;
1033 
1034 				seg->data_len += offset;
1035 			}
1036 			m->pkt_len += offset;
1037 		}
1038 
1039 		m->vlan_tci = vlan_tag;
1040 	}
1041 
1042 	if (m->ol_flags & PKT_TX_TCP_SEG)
1043 		virtio_tx_offload(m);
1044 
1045 	tx_q->m_table[tx_q->len++] = m;
1046 	if (enable_stats) {
1047 		vdev->stats.tx_total++;
1048 		vdev->stats.tx++;
1049 	}
1050 
1051 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1052 		do_drain_mbuf_table(tx_q);
1053 }
1054 
1055 
1056 static __rte_always_inline void
1057 drain_mbuf_table(struct mbuf_table *tx_q)
1058 {
1059 	static uint64_t prev_tsc;
1060 	uint64_t cur_tsc;
1061 
1062 	if (tx_q->len == 0)
1063 		return;
1064 
1065 	cur_tsc = rte_rdtsc();
1066 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1067 		prev_tsc = cur_tsc;
1068 
1069 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1070 			"TX queue drained after timeout with burst size %u\n",
1071 			tx_q->len);
1072 		do_drain_mbuf_table(tx_q);
1073 	}
1074 }
1075 
1076 static __rte_always_inline void
1077 complete_async_pkts(struct vhost_dev *vdev, uint16_t qid)
1078 {
1079 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1080 	uint16_t complete_count;
1081 
1082 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1083 						qid, p_cpl, MAX_PKT_BURST);
1084 	vdev->nr_async_pkts -= complete_count;
1085 	if (complete_count)
1086 		free_pkts(p_cpl, complete_count);
1087 }
1088 
1089 static __rte_always_inline void
1090 drain_eth_rx(struct vhost_dev *vdev)
1091 {
1092 	uint16_t rx_count, enqueue_count;
1093 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1094 
1095 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1096 				    pkts, MAX_PKT_BURST);
1097 
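	/*
	 * Drain completions of previously submitted async copies before
	 * enqueueing a new burst (a no-op when the async path is not used).
	 */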
1098 	while (likely(vdev->nr_async_pkts))
1099 		complete_async_pkts(vdev, VIRTIO_RXQ);
1100 
1101 	if (!rx_count)
1102 		return;
1103 
1104 	/*
1105 	 * When "enable_retry" is set, wait and retry when there are not
1106 	 * enough free slots in the queue to hold @rx_count packets, to
1107 	 * diminish packet loss.
1108 	 */
1109 	if (enable_retry &&
1110 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1111 			VIRTIO_RXQ))) {
1112 		uint32_t retry;
1113 
1114 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1115 			rte_delay_us(burst_rx_delay_time);
1116 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1117 					VIRTIO_RXQ))
1118 				break;
1119 		}
1120 	}
1121 
1122 	if (builtin_net_driver) {
1123 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1124 						pkts, rx_count);
1125 	} else if (async_vhost_driver) {
1126 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1127 					VIRTIO_RXQ, pkts, rx_count);
1128 		vdev->nr_async_pkts += enqueue_count;
1129 	} else {
1130 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1131 						pkts, rx_count);
1132 	}
1133 
1134 	if (enable_stats) {
1135 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1136 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1137 	}
1138 
1139 	if (!async_vhost_driver)
1140 		free_pkts(pkts, rx_count);
1141 }
1142 
1143 static __rte_always_inline void
1144 drain_virtio_tx(struct vhost_dev *vdev)
1145 {
1146 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1147 	uint16_t count;
1148 	uint16_t i;
1149 
1150 	if (builtin_net_driver) {
1151 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1152 					pkts, MAX_PKT_BURST);
1153 	} else {
1154 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1155 					mbuf_pool, pkts, MAX_PKT_BURST);
1156 	}
1157 
1158 	/* setup VMDq for the first packet */
1159 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1160 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1161 			free_pkts(pkts, count);
1162 	}
1163 
1164 	for (i = 0; i < count; ++i)
1165 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1166 }
1167 
1168 /*
1169  * Main function of vhost-switch. It basically does:
1170  *
1171  * for each vhost device {
1172  *    - drain_eth_rx()
1173  *
1174  *      Which drains the host eth Rx queue linked to the vhost device,
1175  *      and delivers all of them to the guest virtio Rx ring associated with
1176  *      this vhost device.
1177  *
1178  *    - drain_virtio_tx()
1179  *
1180  *      Which drains the guest virtio Tx queue and delivers all of them
1181  *      to the target, which could be another vhost device, or the
1182  *      physical eth dev. The route is done in function "virtio_tx_route".
1183  * }
1184  */
1185 static int
1186 switch_worker(void *arg __rte_unused)
1187 {
1188 	unsigned i;
1189 	unsigned lcore_id = rte_lcore_id();
1190 	struct vhost_dev *vdev;
1191 	struct mbuf_table *tx_q;
1192 
1193 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1194 
1195 	tx_q = &lcore_tx_queue[lcore_id];
1196 	for (i = 0; i < rte_lcore_count(); i++) {
1197 		if (lcore_ids[i] == lcore_id) {
1198 			tx_q->txq_id = i;
1199 			break;
1200 		}
1201 	}
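	/*
	 * tx_q->txq_id is now this lcore's index in lcore_ids[], matching the
	 * one-TX-queue-per-lcore setup done in port_init().
	 */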
1202 
1203 	while(1) {
1204 		drain_mbuf_table(tx_q);
1205 
1206 		/*
1207 		 * Inform the configuration core that we have exited the
1208 		 * linked list and that no devices are in use if requested.
1209 		 */
1210 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1211 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1212 
1213 		/*
1214 		 * Process vhost devices
1215 		 */
1216 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1217 			      lcore_vdev_entry) {
1218 			if (unlikely(vdev->remove)) {
1219 				unlink_vmdq(vdev);
1220 				vdev->ready = DEVICE_SAFE_REMOVE;
1221 				continue;
1222 			}
1223 
1224 			if (likely(vdev->ready == DEVICE_RX))
1225 				drain_eth_rx(vdev);
1226 
1227 			if (likely(!vdev->remove))
1228 				drain_virtio_tx(vdev);
1229 		}
1230 	}
1231 
1232 	return 0;
1233 }
1234 
1235 /*
1236  * Remove a device from the specific data core linked list and from the
1237  * main linked list. Synchronization occurs through the use of the
1238  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1239  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1240  */
1241 static void
1242 destroy_device(int vid)
1243 {
1244 	struct vhost_dev *vdev = NULL;
1245 	int lcore;
1246 
1247 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1248 		if (vdev->vid == vid)
1249 			break;
1250 	}
1251 	if (!vdev)
1252 		return;
1253 	/*set the remove flag. */
1254 	vdev->remove = 1;
1255 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1256 		rte_pause();
1257 	}
1258 
1259 	if (builtin_net_driver)
1260 		vs_vhost_net_remove(vdev);
1261 
1262 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1263 		     lcore_vdev_entry);
1264 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1265 
1266 
1267 	/* Set the dev_removal_flag on each lcore. */
1268 	RTE_LCORE_FOREACH_WORKER(lcore)
1269 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1270 
1271 	/*
1272 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1273 	 * we can be sure that they can no longer access the device removed
1274 	 * from the linked lists and that the devices are no longer in use.
1275 	 */
1276 	RTE_LCORE_FOREACH_WORKER(lcore) {
1277 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1278 			rte_pause();
1279 	}
1280 
1281 	lcore_info[vdev->coreid].device_num--;
1282 
1283 	RTE_LOG(INFO, VHOST_DATA,
1284 		"(%d) device has been removed from data core\n",
1285 		vdev->vid);
1286 
1287 	if (async_vhost_driver)
1288 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1289 
1290 	rte_free(vdev);
1291 }
1292 
1293 /*
1294  * A new device is added to a data core. First the device is added to the main linked list
1295  * and then allocated to a specific data core.
1296  */
1297 static int
1298 new_device(int vid)
1299 {
1300 	int lcore, core_add = 0;
1301 	uint32_t device_num_min = num_devices;
1302 	struct vhost_dev *vdev;
1303 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1304 	if (vdev == NULL) {
1305 		RTE_LOG(INFO, VHOST_DATA,
1306 			"(%d) couldn't allocate memory for vhost dev\n",
1307 			vid);
1308 		return -1;
1309 	}
1310 	vdev->vid = vid;
1311 
1312 	if (builtin_net_driver)
1313 		vs_vhost_net_setup(vdev);
1314 
1315 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1316 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1317 
1318 	/*reset ready flag*/
1319 	vdev->ready = DEVICE_MAC_LEARNING;
1320 	vdev->remove = 0;
1321 
1322 	/* Find a suitable lcore to add the device. */
1323 	RTE_LCORE_FOREACH_WORKER(lcore) {
1324 		if (lcore_info[lcore].device_num < device_num_min) {
1325 			device_num_min = lcore_info[lcore].device_num;
1326 			core_add = lcore;
1327 		}
1328 	}
1329 	vdev->coreid = core_add;
1330 
1331 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1332 			  lcore_vdev_entry);
1333 	lcore_info[vdev->coreid].device_num++;
1334 
1335 	/* Disable notifications. */
1336 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1337 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1338 
1339 	RTE_LOG(INFO, VHOST_DATA,
1340 		"(%d) device has been added to data core %d\n",
1341 		vid, vdev->coreid);
1342 
1343 	if (async_vhost_driver) {
1344 		struct rte_vhost_async_features f;
1345 		struct rte_vhost_async_channel_ops channel_ops;
1346 		if (strncmp(dma_type, "ioat", 4) == 0) {
1347 			channel_ops.transfer_data = ioat_transfer_data_cb;
1348 			channel_ops.check_completed_copies =
1349 				ioat_check_completed_copies_cb;
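			/*
			 * The ioat channel reports completions in submission order;
			 * f.async_threshold sets the packet size (in bytes) below which
			 * the copy is done by the CPU instead of the DMA engine.
			 */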
1350 			f.async_inorder = 1;
1351 			f.async_threshold = 256;
1352 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1353 				f.intval, &channel_ops);
1354 		}
1355 	}
1356 
1357 	return 0;
1358 }
1359 
1360 /*
1361  * These callbacks allow devices to be added to the data core when configuration
1362  * has been fully completed.
1363  */
1364 static const struct vhost_device_ops virtio_net_device_ops =
1365 {
1366 	.new_device =  new_device,
1367 	.destroy_device = destroy_device,
1368 };
1369 
1370 /*
1371  * This is a thread that will wake up after a period to print stats if the user has
1372  * enabled them.
1373  */
1374 static void *
1375 print_stats(__rte_unused void *arg)
1376 {
1377 	struct vhost_dev *vdev;
1378 	uint64_t tx_dropped, rx_dropped;
1379 	uint64_t tx, tx_total, rx, rx_total;
1380 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1381 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1382 
1383 	while(1) {
1384 		sleep(enable_stats);
1385 
1386 		/* Clear screen and move to top left */
1387 		printf("%s%s\n", clr, top_left);
1388 		printf("Device statistics =================================\n");
1389 
1390 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1391 			tx_total   = vdev->stats.tx_total;
1392 			tx         = vdev->stats.tx;
1393 			tx_dropped = tx_total - tx;
1394 
1395 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1396 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1397 			rx_dropped = rx_total - rx;
1398 
1399 			printf("Statistics for device %d\n"
1400 				"-----------------------\n"
1401 				"TX total:              %" PRIu64 "\n"
1402 				"TX dropped:            %" PRIu64 "\n"
1403 				"TX successful:         %" PRIu64 "\n"
1404 				"RX total:              %" PRIu64 "\n"
1405 				"RX dropped:            %" PRIu64 "\n"
1406 				"RX successful:         %" PRIu64 "\n",
1407 				vdev->vid,
1408 				tx_total, tx_dropped, tx,
1409 				rx_total, rx_dropped, rx);
1410 		}
1411 
1412 		printf("===================================================\n");
1413 
1414 		fflush(stdout);
1415 	}
1416 
1417 	return NULL;
1418 }
1419 
1420 static void
1421 unregister_drivers(int socket_num)
1422 {
1423 	int i, ret;
1424 
1425 	for (i = 0; i < socket_num; i++) {
1426 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1427 		if (ret != 0)
1428 			RTE_LOG(ERR, VHOST_CONFIG,
1429 				"Failed to unregister vhost driver for %s.\n",
1430 				socket_files + i * PATH_MAX);
1431 	}
1432 }
1433 
1434 /* When we receive an INT signal, unregister the vhost driver. */
1435 static void
1436 sigint_handler(__rte_unused int signum)
1437 {
1438 	/* Unregister vhost driver. */
1439 	unregister_drivers(nb_sockets);
1440 
1441 	exit(0);
1442 }
1443 
1444 /*
1445  * While creating an mbuf pool, one key thing is to figure out how
1446  * many mbuf entries are enough for our use. FYI, here are some
1447  * guidelines:
1448  *
1449  * - Each rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1450  *
1451  * - For each switch core (a CPU core that does the packet switching), we
1452  *   also need to reserve some mbufs for receiving packets from the virtio
1453  *   Tx queue. How many are enough depends on the usage. It's normally
1454  *   a simple calculation like the following:
1455  *
1456  *       MAX_PKT_BURST * max packet size / mbuf size
1457  *
1458  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1459  *
1460  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1461  *   mbufs for receiving packets from the physical NIC device.
1462  *
1463  * - We also need to make sure that, for each switch core, we have
1464  *   allocated enough mbufs to fill up the mbuf cache.
1465  */
1466 static void
1467 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1468 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1469 {
1470 	uint32_t nr_mbufs;
1471 	uint32_t nr_mbufs_per_core;
1472 	uint32_t mtu = 1500;
1473 
1474 	if (mergeable)
1475 		mtu = 9000;
1476 	if (enable_tso)
1477 		mtu = 64 * 1024;
1478 
1479 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1480 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1481 	nr_mbufs_per_core += nr_rx_desc;
1482 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1483 
1484 	nr_mbufs  = nr_queues * nr_rx_desc;
1485 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1486 	nr_mbufs *= nr_port;
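	/*
	 * Rough example with the defaults used here, assuming MAX_PKT_BURST is
	 * 32 and the standard 128-byte mbuf headroom: the per-core term is
	 * (1500 + 2176) * 32 / 2048 ~= 57 mbufs plus nr_rx_desc (1024), and the
	 * total is 128 queues * 1024 descs plus ~1081 per switch core, per port.
	 */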
1487 
1488 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1489 					    nr_mbuf_cache, 0, mbuf_size,
1490 					    rte_socket_id());
1491 	if (mbuf_pool == NULL)
1492 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1493 }
1494 
1495 /*
1496  * Main function, does initialisation and calls the per-lcore functions.
1497  */
1498 int
1499 main(int argc, char *argv[])
1500 {
1501 	unsigned lcore_id, core_id = 0;
1502 	unsigned nb_ports, valid_num_ports;
1503 	int ret, i;
1504 	uint16_t portid;
1505 	static pthread_t tid;
1506 	uint64_t flags = 0;
1507 
1508 	signal(SIGINT, sigint_handler);
1509 
1510 	/* init EAL */
1511 	ret = rte_eal_init(argc, argv);
1512 	if (ret < 0)
1513 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1514 	argc -= ret;
1515 	argv += ret;
1516 
1517 	/* parse app arguments */
1518 	ret = us_vhost_parse_args(argc, argv);
1519 	if (ret < 0)
1520 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1521 
1522 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1523 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1524 
1525 		if (rte_lcore_is_enabled(lcore_id))
1526 			lcore_ids[core_id++] = lcore_id;
1527 	}
1528 
1529 	if (rte_lcore_count() > RTE_MAX_LCORE)
1530 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1531 
1532 	/* Get the number of physical ports. */
1533 	nb_ports = rte_eth_dev_count_avail();
1534 
1535 	/*
1536 	 * Update the global variable num_ports and the global array ports[],
1537 	 * and get the number of valid ports according to the system port count.
1538 	 */
1539 	valid_num_ports = check_ports_num(nb_ports);
1540 
1541 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1542 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1543 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1544 		return -1;
1545 	}
1546 
1547 	/*
1548 	 * FIXME: here we are trying to allocate mbufs big enough for
1549 	 * @MAX_QUEUES, but the truth is we're never going to use that
1550 	 * many queues here. We probably should only do allocation for
1551 	 * those queues we are going to use.
1552 	 */
1553 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1554 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1555 
1556 	if (vm2vm_mode == VM2VM_HARDWARE) {
1557 		/* Enable VT loop back to let L2 switch to do it. */
1558 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1559 		RTE_LOG(DEBUG, VHOST_CONFIG,
1560 			"Enable loop back for L2 switch in vmdq.\n");
1561 	}
1562 
1563 	/* initialize all ports */
1564 	RTE_ETH_FOREACH_DEV(portid) {
1565 		/* skip ports that are not enabled */
1566 		if ((enabled_port_mask & (1 << portid)) == 0) {
1567 			RTE_LOG(INFO, VHOST_PORT,
1568 				"Skipping disabled port %d\n", portid);
1569 			continue;
1570 		}
1571 		if (port_init(portid) != 0)
1572 			rte_exit(EXIT_FAILURE,
1573 				"Cannot initialize network ports\n");
1574 	}
1575 
1576 	/* Enable stats if the user option is set. */
1577 	if (enable_stats) {
1578 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1579 					print_stats, NULL);
1580 		if (ret < 0)
1581 			rte_exit(EXIT_FAILURE,
1582 				"Cannot create print-stats thread\n");
1583 	}
1584 
1585 	/* Launch all data cores. */
1586 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1587 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1588 
1589 	if (client_mode)
1590 		flags |= RTE_VHOST_USER_CLIENT;
1591 
1592 	/* Register vhost user driver to handle vhost messages. */
1593 	for (i = 0; i < nb_sockets; i++) {
1594 		char *file = socket_files + i * PATH_MAX;
1595 		if (async_vhost_driver)
1596 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1597 
1598 		ret = rte_vhost_driver_register(file, flags);
1599 		if (ret != 0) {
1600 			unregister_drivers(i);
1601 			rte_exit(EXIT_FAILURE,
1602 				"vhost driver register failure.\n");
1603 		}
1604 
1605 		if (builtin_net_driver)
1606 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1607 
1608 		if (mergeable == 0) {
1609 			rte_vhost_driver_disable_features(file,
1610 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1611 		}
1612 
1613 		if (enable_tx_csum == 0) {
1614 			rte_vhost_driver_disable_features(file,
1615 				1ULL << VIRTIO_NET_F_CSUM);
1616 		}
1617 
1618 		if (enable_tso == 0) {
1619 			rte_vhost_driver_disable_features(file,
1620 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1621 			rte_vhost_driver_disable_features(file,
1622 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1623 			rte_vhost_driver_disable_features(file,
1624 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1625 			rte_vhost_driver_disable_features(file,
1626 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1627 		}
1628 
1629 		if (promiscuous) {
1630 			rte_vhost_driver_enable_features(file,
1631 				1ULL << VIRTIO_NET_F_CTRL_RX);
1632 		}
1633 
1634 		ret = rte_vhost_driver_callback_register(file,
1635 			&virtio_net_device_ops);
1636 		if (ret != 0) {
1637 			rte_exit(EXIT_FAILURE,
1638 				"failed to register vhost driver callbacks.\n");
1639 		}
1640 
1641 		if (rte_vhost_driver_start(file) < 0) {
1642 			rte_exit(EXIT_FAILURE,
1643 				"failed to start vhost driver.\n");
1644 		}
1645 	}
1646 
1647 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1648 		rte_eal_wait_lcore(lcore_id);
1649 
1650 	return 0;
1651 
1652 }
1653