xref: /dpdk/examples/vhost/main.c (revision c2341bb6713dcaa43113db6f8ee3dd40ae57aba7)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "ioat.h"
29 #include "main.h"
30 
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34 
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37 
38 #define MBUF_CACHE_SIZE	128
39 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
40 
41 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
42 
43 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
45 
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47 
48 /* State of virtio device. */
49 #define DEVICE_MAC_LEARNING 0
50 #define DEVICE_RX			1
51 #define DEVICE_SAFE_REMOVE	2
52 
53 /* Configurable number of RX/TX ring descriptors */
54 #define RTE_TEST_RX_DESC_DEFAULT 1024
55 #define RTE_TEST_TX_DESC_DEFAULT 512
56 
57 #define INVALID_PORT_ID 0xFF
58 
59 /* Maximum long option length for option parsing. */
60 #define MAX_LONG_OPT_SZ 64
61 
62 /* mask of enabled ports */
63 static uint32_t enabled_port_mask = 0;
64 
65 /* Promiscuous mode */
66 static uint32_t promiscuous;
67 
68 /* number of devices/queues to support*/
69 static uint32_t num_queues = 0;
70 static uint32_t num_devices;
71 
72 static struct rte_mempool *mbuf_pool;
73 static int mergeable;
74 
75 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
76 typedef enum {
77 	VM2VM_DISABLED = 0,
78 	VM2VM_SOFTWARE = 1,
79 	VM2VM_HARDWARE = 2,
80 	VM2VM_LAST
81 } vm2vm_type;
82 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
83 
84 /* Enable stats. */
85 static uint32_t enable_stats = 0;
86 /* Enable retries on RX. */
87 static uint32_t enable_retry = 1;
88 
89 /* Disable TX checksum offload */
90 static uint32_t enable_tx_csum;
91 
92 /* Disable TSO offload */
93 static uint32_t enable_tso;
94 
95 static int client_mode;
96 
97 static int builtin_net_driver;
98 
99 static int async_vhost_driver;
100 
101 static char dma_type[MAX_LONG_OPT_SZ];
102 
103 /* Specify timeout (in useconds) between retries on RX. */
104 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
105 /* Specify the number of retries on RX. */
106 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
107 
108 /* Socket file paths. Can be set by user */
109 static char *socket_files;
110 static int nb_sockets;
111 
112 /* Empty VMDQ configuration structure. Filled in programmatically. */
113 static struct rte_eth_conf vmdq_conf_default = {
114 	.rxmode = {
115 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
116 		.split_hdr_size = 0,
117 		/*
118 		 * VLAN strip is necessary for 1G NICs such as I350;
119 		 * it fixes the bug where IPv4 forwarding in the guest can't
120 		 * forward packets from one virtio dev to another virtio dev.
121 		 */
122 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
123 	},
124 
125 	.txmode = {
126 		.mq_mode = ETH_MQ_TX_NONE,
127 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
128 			     DEV_TX_OFFLOAD_TCP_CKSUM |
129 			     DEV_TX_OFFLOAD_VLAN_INSERT |
130 			     DEV_TX_OFFLOAD_MULTI_SEGS |
131 			     DEV_TX_OFFLOAD_TCP_TSO),
132 	},
133 	.rx_adv_conf = {
134 		/*
135 		 * should be overridden separately in code with
136 		 * appropriate values
137 		 */
138 		.vmdq_rx_conf = {
139 			.nb_queue_pools = ETH_8_POOLS,
140 			.enable_default_pool = 0,
141 			.default_pool = 0,
142 			.nb_pool_maps = 0,
143 			.pool_map = {{0, 0},},
144 		},
145 	},
146 };
147 
148 
149 static unsigned lcore_ids[RTE_MAX_LCORE];
150 static uint16_t ports[RTE_MAX_ETHPORTS];
151 static unsigned num_ports = 0; /**< The number of ports specified in command line */
152 static uint16_t num_pf_queues, num_vmdq_queues;
153 static uint16_t vmdq_pool_base, vmdq_queue_base;
154 static uint16_t queues_per_pool;
155 
156 const uint16_t vlan_tags[] = {
157 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
158 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
159 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
160 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
161 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
162 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
163 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
164 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
165 };
166 
167 /* ethernet addresses of ports */
168 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
169 
170 static struct vhost_dev_tailq_list vhost_dev_list =
171 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
172 
173 static struct lcore_info lcore_info[RTE_MAX_LCORE];
174 
175 /* Used for queueing bursts of TX packets. */
176 struct mbuf_table {
177 	unsigned len;
178 	unsigned txq_id;
179 	struct rte_mbuf *m_table[MAX_PKT_BURST];
180 };
181 
182 /* TX queue for each data core. */
183 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
184 
185 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
186 				 / US_PER_S * BURST_TX_DRAIN_US)
187 #define VLAN_HLEN       4
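
/*
 * Illustrative arithmetic (the 2.0 GHz TSC is an assumed figure): with
 * rte_get_tsc_hz() == 2,000,000,000, MBUF_TABLE_DRAIN_TSC works out to
 * 2,000,000,000 / 1,000,000 * 100 = 200,000 cycles, i.e. the per-lcore
 * TX table is force-drained roughly every BURST_TX_DRAIN_US (~100) us.
 */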
188 
189 static inline int
190 open_dma(const char *value)
191 {
192 	if (strncmp(dma_type, "ioat", 4) == 0)
193 		return open_ioat(value);
194 
195 	return -1;
196 }
197 
198 /*
199  * Builds up the correct configuration for VMDQ VLAN pool map
200  * according to the pool & queue limits.
201  */
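/*
 * For illustration (values taken from vlan_tags[] above, not extra
 * configuration): with num_devices == 2, the map built below contains two
 * entries, vlan_id 1000 -> pool bitmask 0x1 and vlan_id 1001 -> 0x2, so each
 * VMDQ pool receives traffic for exactly one VLAN tag.
 */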
202 static inline int
203 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
204 {
205 	struct rte_eth_vmdq_rx_conf conf;
206 	struct rte_eth_vmdq_rx_conf *def_conf =
207 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
208 	unsigned i;
209 
210 	memset(&conf, 0, sizeof(conf));
211 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
212 	conf.nb_pool_maps = num_devices;
213 	conf.enable_loop_back = def_conf->enable_loop_back;
214 	conf.rx_mode = def_conf->rx_mode;
215 
216 	for (i = 0; i < conf.nb_pool_maps; i++) {
217 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
218 		conf.pool_map[i].pools = (1UL << i);
219 	}
220 
221 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
222 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
223 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
224 	return 0;
225 }
226 
227 /*
228  * Initialises a given port using global settings and with the rx buffers
229  * coming from the mbuf_pool passed as parameter
230  */
231 static inline int
232 port_init(uint16_t port)
233 {
234 	struct rte_eth_dev_info dev_info;
235 	struct rte_eth_conf port_conf;
236 	struct rte_eth_rxconf *rxconf;
237 	struct rte_eth_txconf *txconf;
238 	int16_t rx_rings, tx_rings;
239 	uint16_t rx_ring_size, tx_ring_size;
240 	int retval;
241 	uint16_t q;
242 
243 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
244 	retval = rte_eth_dev_info_get(port, &dev_info);
245 	if (retval != 0) {
246 		RTE_LOG(ERR, VHOST_PORT,
247 			"Error during getting device (port %u) info: %s\n",
248 			port, strerror(-retval));
249 
250 		return retval;
251 	}
252 
253 	rxconf = &dev_info.default_rxconf;
254 	txconf = &dev_info.default_txconf;
255 	rxconf->rx_drop_en = 1;
256 
257 	/*configure the number of supported virtio devices based on VMDQ limits */
258 	num_devices = dev_info.max_vmdq_pools;
259 
260 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
261 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
262 
263 	tx_rings = (uint16_t)rte_lcore_count();
264 
265 	/* Get port configuration. */
266 	retval = get_eth_conf(&port_conf, num_devices);
267 	if (retval < 0)
268 		return retval;
269 	/* NIC queues are divided into pf queues and vmdq queues.  */
270 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
271 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
272 	num_vmdq_queues = num_devices * queues_per_pool;
273 	num_queues = num_pf_queues + num_vmdq_queues;
274 	vmdq_queue_base = dev_info.vmdq_queue_base;
275 	vmdq_pool_base  = dev_info.vmdq_pool_base;
276 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
277 		num_pf_queues, num_devices, queues_per_pool);
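	/*
	 * Illustrative figures only (actual values depend on the NIC and its
	 * configuration): if the device reported max_rx_queues = 128,
	 * vmdq_queue_num = 128 and max_vmdq_pools = 64, then num_pf_queues = 0,
	 * queues_per_pool = 2 and num_vmdq_queues = 2 * num_devices.
	 */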
278 
279 	if (!rte_eth_dev_is_valid_port(port))
280 		return -1;
281 
282 	rx_rings = (uint16_t)dev_info.max_rx_queues;
283 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
284 		port_conf.txmode.offloads |=
285 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
286 	/* Configure ethernet device. */
287 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
288 	if (retval != 0) {
289 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
290 			port, strerror(-retval));
291 		return retval;
292 	}
293 
294 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
295 		&tx_ring_size);
296 	if (retval != 0) {
297 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
298 			"for port %u: %s.\n", port, strerror(-retval));
299 		return retval;
300 	}
301 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
302 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
303 			"for Rx queues on port %u.\n", port);
304 		return -1;
305 	}
306 
307 	/* Setup the queues. */
308 	rxconf->offloads = port_conf.rxmode.offloads;
309 	for (q = 0; q < rx_rings; q ++) {
310 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
311 						rte_eth_dev_socket_id(port),
312 						rxconf,
313 						mbuf_pool);
314 		if (retval < 0) {
315 			RTE_LOG(ERR, VHOST_PORT,
316 				"Failed to setup rx queue %u of port %u: %s.\n",
317 				q, port, strerror(-retval));
318 			return retval;
319 		}
320 	}
321 	txconf->offloads = port_conf.txmode.offloads;
322 	for (q = 0; q < tx_rings; q ++) {
323 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
324 						rte_eth_dev_socket_id(port),
325 						txconf);
326 		if (retval < 0) {
327 			RTE_LOG(ERR, VHOST_PORT,
328 				"Failed to setup tx queue %u of port %u: %s.\n",
329 				q, port, strerror(-retval));
330 			return retval;
331 		}
332 	}
333 
334 	/* Start the device. */
335 	retval  = rte_eth_dev_start(port);
336 	if (retval < 0) {
337 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
338 			port, strerror(-retval));
339 		return retval;
340 	}
341 
342 	if (promiscuous) {
343 		retval = rte_eth_promiscuous_enable(port);
344 		if (retval != 0) {
345 			RTE_LOG(ERR, VHOST_PORT,
346 				"Failed to enable promiscuous mode on port %u: %s\n",
347 				port, rte_strerror(-retval));
348 			return retval;
349 		}
350 	}
351 
352 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
353 	if (retval < 0) {
354 		RTE_LOG(ERR, VHOST_PORT,
355 			"Failed to get MAC address on port %u: %s\n",
356 			port, rte_strerror(-retval));
357 		return retval;
358 	}
359 
360 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
361 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
362 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
363 			port,
364 			vmdq_ports_eth_addr[port].addr_bytes[0],
365 			vmdq_ports_eth_addr[port].addr_bytes[1],
366 			vmdq_ports_eth_addr[port].addr_bytes[2],
367 			vmdq_ports_eth_addr[port].addr_bytes[3],
368 			vmdq_ports_eth_addr[port].addr_bytes[4],
369 			vmdq_ports_eth_addr[port].addr_bytes[5]);
370 
371 	return 0;
372 }
373 
374 /*
375  * Set socket file path.
376  */
377 static int
378 us_vhost_parse_socket_path(const char *q_arg)
379 {
380 	char *old;
381 
382 	/* check that the socket path length fits within PATH_MAX */
383 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
384 		return -1;
385 
386 	old = socket_files;
387 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
388 	if (socket_files == NULL) {
389 		free(old);
390 		return -1;
391 	}
392 
393 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
394 	nb_sockets++;
395 
396 	return 0;
397 }
398 
399 /*
400  * Parse the portmask provided at run time.
401  */
402 static int
403 parse_portmask(const char *portmask)
404 {
405 	char *end = NULL;
406 	unsigned long pm;
407 
408 	errno = 0;
409 
410 	/* parse hexadecimal string */
411 	pm = strtoul(portmask, &end, 16);
412 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
413 		return 0;
414 
415 	return pm;
416 
417 }
418 
419 /*
420  * Parse num options at run time.
421  */
422 static int
423 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
424 {
425 	char *end = NULL;
426 	unsigned long num;
427 
428 	errno = 0;
429 
430 	/* parse unsigned int string */
431 	num = strtoul(q_arg, &end, 10);
432 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
433 		return -1;
434 
435 	if (num > max_valid_value)
436 		return -1;
437 
438 	return num;
439 
440 }
441 
442 /*
443  * Display usage
444  */
445 static void
446 us_vhost_usage(const char *prgname)
447 {
448 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
449 	"		--vm2vm [0|1|2]\n"
450 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
451 	"		--socket-file <path>\n"
452 	"		--nb-devices ND\n"
453 	"		-p PORTMASK: Set mask for ports to be used by application\n"
454 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
455 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
456 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
457 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
458 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
459 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
460 	"		--socket-file: The path of the socket file.\n"
461 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
462 	"		--tso [0|1] disable/enable TCP segment offload.\n"
463 	"		--client register a vhost-user socket as client mode.\n"
464 	"		--dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
465 	"		--dmas register dma channel for specific vhost device.\n",
466 	       prgname);
467 }
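
/*
 * An illustrative invocation (binary name, core list and socket path are
 * assumptions, not mandated by this file); EAL options come before "--",
 * the options documented above come after it:
 *
 *   ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/vhost-net.sock \
 *                --client --mergeable 1 --stats 2
 */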
468 
469 /*
470  * Parse the arguments given in the command line of the application.
471  */
472 static int
473 us_vhost_parse_args(int argc, char **argv)
474 {
475 	int opt, ret;
476 	int option_index;
477 	unsigned i;
478 	const char *prgname = argv[0];
479 	static struct option long_option[] = {
480 		{"vm2vm", required_argument, NULL, 0},
481 		{"rx-retry", required_argument, NULL, 0},
482 		{"rx-retry-delay", required_argument, NULL, 0},
483 		{"rx-retry-num", required_argument, NULL, 0},
484 		{"mergeable", required_argument, NULL, 0},
485 		{"stats", required_argument, NULL, 0},
486 		{"socket-file", required_argument, NULL, 0},
487 		{"tx-csum", required_argument, NULL, 0},
488 		{"tso", required_argument, NULL, 0},
489 		{"client", no_argument, &client_mode, 1},
490 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
491 		{"dma-type", required_argument, NULL, 0},
492 		{"dmas", required_argument, NULL, 0},
493 		{NULL, 0, 0, 0},
494 	};
495 
496 	/* Parse command line */
497 	while ((opt = getopt_long(argc, argv, "p:P",
498 			long_option, &option_index)) != EOF) {
499 		switch (opt) {
500 		/* Portmask */
501 		case 'p':
502 			enabled_port_mask = parse_portmask(optarg);
503 			if (enabled_port_mask == 0) {
504 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
505 				us_vhost_usage(prgname);
506 				return -1;
507 			}
508 			break;
509 
510 		case 'P':
511 			promiscuous = 1;
512 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
513 				ETH_VMDQ_ACCEPT_BROADCAST |
514 				ETH_VMDQ_ACCEPT_MULTICAST;
515 
516 			break;
517 
518 		case 0:
519 			/* Enable/disable vm2vm comms. */
520 			if (!strncmp(long_option[option_index].name, "vm2vm",
521 				MAX_LONG_OPT_SZ)) {
522 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
523 				if (ret == -1) {
524 					RTE_LOG(INFO, VHOST_CONFIG,
525 						"Invalid argument for "
526 						"vm2vm [0|1|2]\n");
527 					us_vhost_usage(prgname);
528 					return -1;
529 				} else {
530 					vm2vm_mode = (vm2vm_type)ret;
531 				}
532 			}
533 
534 			/* Enable/disable retries on RX. */
535 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
536 				ret = parse_num_opt(optarg, 1);
537 				if (ret == -1) {
538 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
539 					us_vhost_usage(prgname);
540 					return -1;
541 				} else {
542 					enable_retry = ret;
543 				}
544 			}
545 
546 			/* Enable/disable TX checksum offload. */
547 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
548 				ret = parse_num_opt(optarg, 1);
549 				if (ret == -1) {
550 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
551 					us_vhost_usage(prgname);
552 					return -1;
553 				} else
554 					enable_tx_csum = ret;
555 			}
556 
557 			/* Enable/disable TSO offload. */
558 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
559 				ret = parse_num_opt(optarg, 1);
560 				if (ret == -1) {
561 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
562 					us_vhost_usage(prgname);
563 					return -1;
564 				} else
565 					enable_tso = ret;
566 			}
567 
568 			/* Specify the retry delay time (in microseconds) on RX. */
569 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
570 				ret = parse_num_opt(optarg, INT32_MAX);
571 				if (ret == -1) {
572 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
573 					us_vhost_usage(prgname);
574 					return -1;
575 				} else {
576 					burst_rx_delay_time = ret;
577 				}
578 			}
579 
580 			/* Specify the number of retries on RX. */
581 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
582 				ret = parse_num_opt(optarg, INT32_MAX);
583 				if (ret == -1) {
584 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
585 					us_vhost_usage(prgname);
586 					return -1;
587 				} else {
588 					burst_rx_retry_num = ret;
589 				}
590 			}
591 
592 			/* Enable/disable RX mergeable buffers. */
593 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
594 				ret = parse_num_opt(optarg, 1);
595 				if (ret == -1) {
596 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
597 					us_vhost_usage(prgname);
598 					return -1;
599 				} else {
600 					mergeable = !!ret;
601 					if (ret) {
602 						vmdq_conf_default.rxmode.offloads |=
603 							DEV_RX_OFFLOAD_JUMBO_FRAME;
604 						vmdq_conf_default.rxmode.max_rx_pkt_len
605 							= JUMBO_FRAME_MAX_SIZE;
606 					}
607 				}
608 			}
609 
610 			/* Enable/disable stats. */
611 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
612 				ret = parse_num_opt(optarg, INT32_MAX);
613 				if (ret == -1) {
614 					RTE_LOG(INFO, VHOST_CONFIG,
615 						"Invalid argument for stats [0..N]\n");
616 					us_vhost_usage(prgname);
617 					return -1;
618 				} else {
619 					enable_stats = ret;
620 				}
621 			}
622 
623 			/* Set socket file path. */
624 			if (!strncmp(long_option[option_index].name,
625 						"socket-file", MAX_LONG_OPT_SZ)) {
626 				if (us_vhost_parse_socket_path(optarg) == -1) {
627 					RTE_LOG(INFO, VHOST_CONFIG,
628 					"Invalid argument for socket name (Max %d characters)\n",
629 					PATH_MAX);
630 					us_vhost_usage(prgname);
631 					return -1;
632 				}
633 			}
634 
635 			if (!strncmp(long_option[option_index].name,
636 						"dma-type", MAX_LONG_OPT_SZ)) {
637 				strcpy(dma_type, optarg);
638 			}
639 
640 			if (!strncmp(long_option[option_index].name,
641 						"dmas", MAX_LONG_OPT_SZ)) {
642 				if (open_dma(optarg) == -1) {
643 					RTE_LOG(INFO, VHOST_CONFIG,
644 						"Wrong DMA args\n");
645 					us_vhost_usage(prgname);
646 					return -1;
647 				}
648 				async_vhost_driver = 1;
649 			}
650 
651 			break;
652 
653 			/* Invalid option - print options. */
654 		default:
655 			us_vhost_usage(prgname);
656 			return -1;
657 		}
658 	}
659 
660 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
661 		if (enabled_port_mask & (1 << i))
662 			ports[num_ports++] = i;
663 	}
664 
665 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
666 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
667 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
668 		return -1;
669 	}
670 
671 	return 0;
672 }
673 
674 /*
675  * Update the global var NUM_PORTS and the array PORTS according to the number
676  * of system ports, and return the number of valid ports
677  */
678 static unsigned check_ports_num(unsigned nb_ports)
679 {
680 	unsigned valid_num_ports = num_ports;
681 	unsigned portid;
682 
683 	if (num_ports > nb_ports) {
684 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
685 			num_ports, nb_ports);
686 		num_ports = nb_ports;
687 	}
688 
689 	for (portid = 0; portid < num_ports; portid ++) {
690 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
691 			RTE_LOG(INFO, VHOST_PORT,
692 				"\nSpecified port ID(%u) is not valid\n",
693 				ports[portid]);
694 			ports[portid] = INVALID_PORT_ID;
695 			valid_num_ports--;
696 		}
697 	}
698 	return valid_num_ports;
699 }
700 
701 static __rte_always_inline struct vhost_dev *
702 find_vhost_dev(struct rte_ether_addr *mac)
703 {
704 	struct vhost_dev *vdev;
705 
706 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
707 		if (vdev->ready == DEVICE_RX &&
708 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
709 			return vdev;
710 	}
711 
712 	return NULL;
713 }
714 
715 /*
716  * This function learns the MAC address of the device and registers it, along
717  * with a VLAN tag, with the VMDQ.
718  */
719 static int
720 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
721 {
722 	struct rte_ether_hdr *pkt_hdr;
723 	int i, ret;
724 
725 	/* Learn MAC address of guest device from packet */
726 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
727 
728 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
729 		RTE_LOG(ERR, VHOST_DATA,
730 			"(%d) device is using a registered MAC!\n",
731 			vdev->vid);
732 		return -1;
733 	}
734 
735 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
736 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
737 
738 	/* vlan_tag currently uses the device_id. */
739 	vdev->vlan_tag = vlan_tags[vdev->vid];
740 
741 	/* Print out VMDQ registration info. */
742 	RTE_LOG(INFO, VHOST_DATA,
743 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
744 		vdev->vid,
745 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
746 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
747 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
748 		vdev->vlan_tag);
749 
750 	/* Register the MAC address. */
751 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
752 				(uint32_t)vdev->vid + vmdq_pool_base);
753 	if (ret)
754 		RTE_LOG(ERR, VHOST_DATA,
755 			"(%d) failed to add device MAC address to VMDQ\n",
756 			vdev->vid);
757 
758 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
759 
760 	/* Set device as ready for RX. */
761 	vdev->ready = DEVICE_RX;
762 
763 	return 0;
764 }
765 
766 /*
767  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
768  * queue before disabling RX on the device.
769  */
770 static inline void
771 unlink_vmdq(struct vhost_dev *vdev)
772 {
773 	unsigned i = 0;
774 	unsigned rx_count;
775 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
776 
777 	if (vdev->ready == DEVICE_RX) {
778 		/*clear MAC and VLAN settings*/
779 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
780 		for (i = 0; i < 6; i++)
781 			vdev->mac_address.addr_bytes[i] = 0;
782 
783 		vdev->vlan_tag = 0;
784 
785 		/*Clear out the receive buffers*/
786 		rx_count = rte_eth_rx_burst(ports[0],
787 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
788 
789 		while (rx_count) {
790 			for (i = 0; i < rx_count; i++)
791 				rte_pktmbuf_free(pkts_burst[i]);
792 
793 			rx_count = rte_eth_rx_burst(ports[0],
794 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
795 		}
796 
797 		vdev->ready = DEVICE_MAC_LEARNING;
798 	}
799 }
800 
801 static __rte_always_inline void
802 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
803 	    struct rte_mbuf *m)
804 {
805 	uint16_t ret;
806 	struct rte_mbuf *m_cpl[1];
807 
808 	if (builtin_net_driver) {
809 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
810 	} else if (async_vhost_driver) {
811 		ret = rte_vhost_submit_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ,
812 						&m, 1);
813 
814 		if (likely(ret))
815 			dst_vdev->nr_async_pkts++;
816 
817 		while (likely(dst_vdev->nr_async_pkts)) {
818 			if (rte_vhost_poll_enqueue_completed(dst_vdev->vid,
819 					VIRTIO_RXQ, m_cpl, 1))
820 				dst_vdev->nr_async_pkts--;
821 		}
822 	} else {
823 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
824 	}
825 
826 	if (enable_stats) {
827 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
828 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
829 		src_vdev->stats.tx_total++;
830 		src_vdev->stats.tx += ret;
831 	}
832 }
833 
834 /*
835  * Check if the packet destination MAC address is for a local device. If so, put
836  * the packet on that device's RX queue. If not, return.
837  */
838 static __rte_always_inline int
839 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
840 {
841 	struct rte_ether_hdr *pkt_hdr;
842 	struct vhost_dev *dst_vdev;
843 
844 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
845 
846 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
847 	if (!dst_vdev)
848 		return -1;
849 
850 	if (vdev->vid == dst_vdev->vid) {
851 		RTE_LOG_DP(DEBUG, VHOST_DATA,
852 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
853 			vdev->vid);
854 		return 0;
855 	}
856 
857 	RTE_LOG_DP(DEBUG, VHOST_DATA,
858 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
859 
860 	if (unlikely(dst_vdev->remove)) {
861 		RTE_LOG_DP(DEBUG, VHOST_DATA,
862 			"(%d) device is marked for removal\n", dst_vdev->vid);
863 		return 0;
864 	}
865 
866 	virtio_xmit(dst_vdev, vdev, m);
867 	return 0;
868 }
869 
870 /*
871  * Check if the destination MAC of a packet belongs to a local VM,
872  * and if it does, get its VLAN tag and the length offset.
873  */
874 static __rte_always_inline int
875 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
876 	uint32_t *offset, uint16_t *vlan_tag)
877 {
878 	struct vhost_dev *dst_vdev;
879 	struct rte_ether_hdr *pkt_hdr =
880 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
881 
882 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
883 	if (!dst_vdev)
884 		return 0;
885 
886 	if (vdev->vid == dst_vdev->vid) {
887 		RTE_LOG_DP(DEBUG, VHOST_DATA,
888 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
889 			vdev->vid);
890 		return -1;
891 	}
892 
893 	/*
894 	 * HW VLAN strip will reduce the packet length
895 	 * by the length of the VLAN tag, so we need to restore
896 	 * the packet length by adding it back.
897 	 */
898 	*offset  = VLAN_HLEN;
899 	*vlan_tag = vlan_tags[vdev->vid];
900 
901 	RTE_LOG_DP(DEBUG, VHOST_DATA,
902 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
903 		vdev->vid, dst_vdev->vid, *vlan_tag);
904 
905 	return 0;
906 }
907 
908 static uint16_t
909 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
910 {
911 	if (ol_flags & PKT_TX_IPV4)
912 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
913 	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
914 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
915 }
916 
917 static void virtio_tx_offload(struct rte_mbuf *m)
918 {
919 	void *l3_hdr;
920 	struct rte_ipv4_hdr *ipv4_hdr = NULL;
921 	struct rte_tcp_hdr *tcp_hdr = NULL;
922 	struct rte_ether_hdr *eth_hdr =
923 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
924 
925 	l3_hdr = (char *)eth_hdr + m->l2_len;
926 
927 	if (m->ol_flags & PKT_TX_IPV4) {
928 		ipv4_hdr = l3_hdr;
929 		ipv4_hdr->hdr_checksum = 0;
930 		m->ol_flags |= PKT_TX_IP_CKSUM;
931 	}
932 
933 	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
934 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
935 }
936 
937 static inline void
938 free_pkts(struct rte_mbuf **pkts, uint16_t n)
939 {
940 	while (n--)
941 		rte_pktmbuf_free(pkts[n]);
942 }
943 
944 static __rte_always_inline void
945 do_drain_mbuf_table(struct mbuf_table *tx_q)
946 {
947 	uint16_t count;
948 
949 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
950 				 tx_q->m_table, tx_q->len);
951 	if (unlikely(count < tx_q->len))
952 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
953 
954 	tx_q->len = 0;
955 }
956 
957 /*
958  * This function routes the TX packet to the correct interface. This
959  * may be a local device or the physical port.
960  */
961 static __rte_always_inline void
962 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
963 {
964 	struct mbuf_table *tx_q;
965 	unsigned offset = 0;
966 	const uint16_t lcore_id = rte_lcore_id();
967 	struct rte_ether_hdr *nh;
968 
969 
970 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
971 	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
972 		struct vhost_dev *vdev2;
973 
974 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
975 			if (vdev2 != vdev)
976 				virtio_xmit(vdev2, vdev, m);
977 		}
978 		goto queue2nic;
979 	}
980 
981 	/*check if destination is local VM*/
982 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
983 		rte_pktmbuf_free(m);
984 		return;
985 	}
986 
987 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
988 		if (unlikely(find_local_dest(vdev, m, &offset,
989 					     &vlan_tag) != 0)) {
990 			rte_pktmbuf_free(m);
991 			return;
992 		}
993 	}
994 
995 	RTE_LOG_DP(DEBUG, VHOST_DATA,
996 		"(%d) TX: MAC address is external\n", vdev->vid);
997 
998 queue2nic:
999 
1000 	/*Add packet to the port tx queue*/
1001 	tx_q = &lcore_tx_queue[lcore_id];
1002 
1003 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1004 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1005 		/* Guest has inserted the vlan tag. */
1006 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1007 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1008 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1009 			(vh->vlan_tci != vlan_tag_be))
1010 			vh->vlan_tci = vlan_tag_be;
1011 	} else {
1012 		m->ol_flags |= PKT_TX_VLAN_PKT;
1013 
1014 		/*
1015 		 * Find the right seg to adjust the data len when offset is
1016 		 * bigger than tail room size.
1017 		 */
1018 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1019 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1020 				m->data_len += offset;
1021 			else {
1022 				struct rte_mbuf *seg = m;
1023 
1024 				while ((seg->next != NULL) &&
1025 					(offset > rte_pktmbuf_tailroom(seg)))
1026 					seg = seg->next;
1027 
1028 				seg->data_len += offset;
1029 			}
1030 			m->pkt_len += offset;
1031 		}
1032 
1033 		m->vlan_tci = vlan_tag;
1034 	}
1035 
1036 	if (m->ol_flags & PKT_TX_TCP_SEG)
1037 		virtio_tx_offload(m);
1038 
1039 	tx_q->m_table[tx_q->len++] = m;
1040 	if (enable_stats) {
1041 		vdev->stats.tx_total++;
1042 		vdev->stats.tx++;
1043 	}
1044 
1045 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1046 		do_drain_mbuf_table(tx_q);
1047 }
1048 
1049 
1050 static __rte_always_inline void
1051 drain_mbuf_table(struct mbuf_table *tx_q)
1052 {
1053 	static uint64_t prev_tsc;
1054 	uint64_t cur_tsc;
1055 
1056 	if (tx_q->len == 0)
1057 		return;
1058 
1059 	cur_tsc = rte_rdtsc();
1060 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1061 		prev_tsc = cur_tsc;
1062 
1063 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1064 			"TX queue drained after timeout with burst size %u\n",
1065 			tx_q->len);
1066 		do_drain_mbuf_table(tx_q);
1067 	}
1068 }
1069 
1070 static __rte_always_inline void
1071 complete_async_pkts(struct vhost_dev *vdev, uint16_t qid)
1072 {
1073 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1074 	uint16_t complete_count;
1075 
1076 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1077 						qid, p_cpl, MAX_PKT_BURST);
1078 	vdev->nr_async_pkts -= complete_count;
1079 	if (complete_count)
1080 		free_pkts(p_cpl, complete_count);
1081 }
1082 
1083 static __rte_always_inline void
1084 drain_eth_rx(struct vhost_dev *vdev)
1085 {
1086 	uint16_t rx_count, enqueue_count;
1087 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1088 
1089 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1090 				    pkts, MAX_PKT_BURST);
1091 
1092 	while (likely(vdev->nr_async_pkts))
1093 		complete_async_pkts(vdev, VIRTIO_RXQ);
1094 
1095 	if (!rx_count)
1096 		return;
1097 
1098 	/*
1099 	 * When "enable_retry" is set, here we wait and retry when there
1100 	 * is no enough free slots in the queue to hold @rx_count packets,
1101 	 * to diminish packet loss.
1102 	 */
1103 	if (enable_retry &&
1104 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1105 			VIRTIO_RXQ))) {
1106 		uint32_t retry;
1107 
1108 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1109 			rte_delay_us(burst_rx_delay_time);
1110 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1111 					VIRTIO_RXQ))
1112 				break;
1113 		}
1114 	}
1115 
1116 	if (builtin_net_driver) {
1117 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1118 						pkts, rx_count);
1119 	} else if (async_vhost_driver) {
1120 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1121 					VIRTIO_RXQ, pkts, rx_count);
1122 		vdev->nr_async_pkts += enqueue_count;
1123 	} else {
1124 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1125 						pkts, rx_count);
1126 	}
1127 
1128 	if (enable_stats) {
1129 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1130 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1131 	}
1132 
1133 	if (!async_vhost_driver)
1134 		free_pkts(pkts, rx_count);
1135 }
1136 
1137 static __rte_always_inline void
1138 drain_virtio_tx(struct vhost_dev *vdev)
1139 {
1140 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1141 	uint16_t count;
1142 	uint16_t i;
1143 
1144 	if (builtin_net_driver) {
1145 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1146 					pkts, MAX_PKT_BURST);
1147 	} else {
1148 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1149 					mbuf_pool, pkts, MAX_PKT_BURST);
1150 	}
1151 
1152 	/* setup VMDq for the first packet */
1153 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1154 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1155 			free_pkts(pkts, count);
1156 	}
1157 
1158 	for (i = 0; i < count; ++i)
1159 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1160 }
1161 
1162 /*
1163  * Main function of vhost-switch. It basically does:
1164  *
1165  * for each vhost device {
1166  *    - drain_eth_rx()
1167  *
1168  *      Which drains the host eth Rx queue linked to the vhost device,
1169  *      and deliver all of them to guest virito Rx ring associated with
1170  *      this vhost device.
1171  *
1172  *    - drain_virtio_tx()
1173  *
1174  *      Which drains the guest virtio Tx queue and delivers all of them
1175  *      to the target, which could be another vhost device, or the
1176  *      physical eth dev. The route is done in function "virtio_tx_route".
1177  * }
1178  */
1179 static int
1180 switch_worker(void *arg __rte_unused)
1181 {
1182 	unsigned i;
1183 	unsigned lcore_id = rte_lcore_id();
1184 	struct vhost_dev *vdev;
1185 	struct mbuf_table *tx_q;
1186 
1187 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1188 
1189 	tx_q = &lcore_tx_queue[lcore_id];
1190 	for (i = 0; i < rte_lcore_count(); i++) {
1191 		if (lcore_ids[i] == lcore_id) {
1192 			tx_q->txq_id = i;
1193 			break;
1194 		}
1195 	}
1196 
1197 	while(1) {
1198 		drain_mbuf_table(tx_q);
1199 
1200 		/*
1201 		 * Inform the configuration core that we have exited the
1202 		 * linked list and that no devices are in use if requested.
1203 		 */
1204 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1205 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1206 
1207 		/*
1208 		 * Process vhost devices
1209 		 */
1210 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1211 			      lcore_vdev_entry) {
1212 			if (unlikely(vdev->remove)) {
1213 				unlink_vmdq(vdev);
1214 				vdev->ready = DEVICE_SAFE_REMOVE;
1215 				continue;
1216 			}
1217 
1218 			if (likely(vdev->ready == DEVICE_RX))
1219 				drain_eth_rx(vdev);
1220 
1221 			if (likely(!vdev->remove))
1222 				drain_virtio_tx(vdev);
1223 		}
1224 	}
1225 
1226 	return 0;
1227 }
1228 
1229 /*
1230  * Remove a device from the specific data core linked list and from the
1231  * main linked list. Synchronization occurs through the use of the
1232  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1233  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1234  */
1235 static void
1236 destroy_device(int vid)
1237 {
1238 	struct vhost_dev *vdev = NULL;
1239 	int lcore;
1240 
1241 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1242 		if (vdev->vid == vid)
1243 			break;
1244 	}
1245 	if (!vdev)
1246 		return;
1247 	/*set the remove flag. */
1248 	vdev->remove = 1;
1249 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1250 		rte_pause();
1251 	}
1252 
1253 	if (builtin_net_driver)
1254 		vs_vhost_net_remove(vdev);
1255 
1256 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1257 		     lcore_vdev_entry);
1258 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1259 
1260 
1261 	/* Set the dev_removal_flag on each lcore. */
1262 	RTE_LCORE_FOREACH_WORKER(lcore)
1263 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1264 
1265 	/*
1266 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1267 	 * we can be sure that they can no longer access the device removed
1268 	 * from the linked lists and that the devices are no longer in use.
1269 	 */
1270 	RTE_LCORE_FOREACH_WORKER(lcore) {
1271 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1272 			rte_pause();
1273 	}
1274 
1275 	lcore_info[vdev->coreid].device_num--;
1276 
1277 	RTE_LOG(INFO, VHOST_DATA,
1278 		"(%d) device has been removed from data core\n",
1279 		vdev->vid);
1280 
1281 	if (async_vhost_driver)
1282 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1283 
1284 	rte_free(vdev);
1285 }
1286 
1287 /*
1288  * A new device is added to a data core. First the device is added to the main linked list
1289  * and then allocated to a specific data core.
1290  */
1291 static int
1292 new_device(int vid)
1293 {
1294 	int lcore, core_add = 0;
1295 	uint32_t device_num_min = num_devices;
1296 	struct vhost_dev *vdev;
1297 
1298 	struct rte_vhost_async_channel_ops channel_ops = {
1299 		.transfer_data = ioat_transfer_data_cb,
1300 		.check_completed_copies = ioat_check_completed_copies_cb
1301 	};
1302 	struct rte_vhost_async_features f;
1303 
1304 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1305 	if (vdev == NULL) {
1306 		RTE_LOG(INFO, VHOST_DATA,
1307 			"(%d) couldn't allocate memory for vhost dev\n",
1308 			vid);
1309 		return -1;
1310 	}
1311 	vdev->vid = vid;
1312 
1313 	if (builtin_net_driver)
1314 		vs_vhost_net_setup(vdev);
1315 
1316 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1317 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1318 
1319 	/*reset ready flag*/
1320 	vdev->ready = DEVICE_MAC_LEARNING;
1321 	vdev->remove = 0;
1322 
1323 	/* Find a suitable lcore to add the device. */
1324 	RTE_LCORE_FOREACH_WORKER(lcore) {
1325 		if (lcore_info[lcore].device_num < device_num_min) {
1326 			device_num_min = lcore_info[lcore].device_num;
1327 			core_add = lcore;
1328 		}
1329 	}
1330 	vdev->coreid = core_add;
1331 
1332 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1333 			  lcore_vdev_entry);
1334 	lcore_info[vdev->coreid].device_num++;
1335 
1336 	/* Disable notifications. */
1337 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1338 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1339 
1340 	RTE_LOG(INFO, VHOST_DATA,
1341 		"(%d) device has been added to data core %d\n",
1342 		vid, vdev->coreid);
1343 
1344 	if (async_vhost_driver) {
1345 		f.async_inorder = 1;
1346 		f.async_threshold = 256;
1347 		return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1348 			f.intval, &channel_ops);
1349 	}
1350 
1351 	return 0;
1352 }
1353 
1354 /*
1355  * These callbacks allow devices to be added to the data core when configuration
1356  * has been fully completed.
1357  */
1358 static const struct vhost_device_ops virtio_net_device_ops =
1359 {
1360 	.new_device =  new_device,
1361 	.destroy_device = destroy_device,
1362 };
1363 
1364 /*
1365  * This is a thread that wakes up after a period to print stats if the user has
1366  * enabled them.
1367  */
1368 static void *
1369 print_stats(__rte_unused void *arg)
1370 {
1371 	struct vhost_dev *vdev;
1372 	uint64_t tx_dropped, rx_dropped;
1373 	uint64_t tx, tx_total, rx, rx_total;
1374 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1375 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1376 
1377 	while(1) {
1378 		sleep(enable_stats);
1379 
1380 		/* Clear screen and move to top left */
1381 		printf("%s%s\n", clr, top_left);
1382 		printf("Device statistics =================================\n");
1383 
1384 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1385 			tx_total   = vdev->stats.tx_total;
1386 			tx         = vdev->stats.tx;
1387 			tx_dropped = tx_total - tx;
1388 
1389 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1390 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1391 			rx_dropped = rx_total - rx;
1392 
1393 			printf("Statistics for device %d\n"
1394 				"-----------------------\n"
1395 				"TX total:              %" PRIu64 "\n"
1396 				"TX dropped:            %" PRIu64 "\n"
1397 				"TX successful:         %" PRIu64 "\n"
1398 				"RX total:              %" PRIu64 "\n"
1399 				"RX dropped:            %" PRIu64 "\n"
1400 				"RX successful:         %" PRIu64 "\n",
1401 				vdev->vid,
1402 				tx_total, tx_dropped, tx,
1403 				rx_total, rx_dropped, rx);
1404 		}
1405 
1406 		printf("===================================================\n");
1407 
1408 		fflush(stdout);
1409 	}
1410 
1411 	return NULL;
1412 }
1413 
1414 static void
1415 unregister_drivers(int socket_num)
1416 {
1417 	int i, ret;
1418 
1419 	for (i = 0; i < socket_num; i++) {
1420 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1421 		if (ret != 0)
1422 			RTE_LOG(ERR, VHOST_CONFIG,
1423 				"Fail to unregister vhost driver for %s.\n",
1424 				socket_files + i * PATH_MAX);
1425 	}
1426 }
1427 
1428 /* When we receive an INT signal, unregister the vhost driver */
1429 static void
1430 sigint_handler(__rte_unused int signum)
1431 {
1432 	/* Unregister vhost driver. */
1433 	unregister_drivers(nb_sockets);
1434 
1435 	exit(0);
1436 }
1437 
1438 /*
1439  * While creating an mbuf pool, one key thing is to figure out how
1440  * many mbuf entries are enough for our use. Here are some
1441  * guidelines:
1442  *
1443  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1444  *
1445  * - For each switch core (a CPU core that does the packet switching), we
1446  *   also need to make some reservation for receiving the packets from the
1447  *   virtio Tx queue. How many is enough depends on the usage. It's normally
1448  *   a simple calculation like the following:
1449  *
1450  *       MAX_PKT_BURST * max packet size / mbuf size
1451  *
1452  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1453  *
1454  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1455  *   mbufs for receiving the packets from the physical NIC device.
1456  *
1457  * - We also need to make sure that, for each switch core, we have allocated
1458  *   enough mbufs to fill up the mbuf cache.
1459  */
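/*
 * A rough worked example of the sizing above, assuming MAX_PKT_BURST is 32 (as
 * in main.h) and the defaults used in main(): mbuf_size =
 * RTE_MBUF_DEFAULT_BUF_SIZE (2048 + a 128-byte RTE_PKTMBUF_HEADROOM),
 * nr_rx_desc = 1024, mtu = 1500 (no mergeable buffers, no TSO). Then
 *
 *     nr_mbufs_per_core = (1500 + 2176) * 32 / (2176 - 128) + 1024 ~= 1081
 *
 * and for one port, MAX_QUEUES (128) queues and three switch cores the pool
 * ends up around 128 * 1024 + 3 * 1081 ~= 134K mbufs.
 */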
1460 static void
1461 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1462 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1463 {
1464 	uint32_t nr_mbufs;
1465 	uint32_t nr_mbufs_per_core;
1466 	uint32_t mtu = 1500;
1467 
1468 	if (mergeable)
1469 		mtu = 9000;
1470 	if (enable_tso)
1471 		mtu = 64 * 1024;
1472 
1473 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1474 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1475 	nr_mbufs_per_core += nr_rx_desc;
1476 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1477 
1478 	nr_mbufs  = nr_queues * nr_rx_desc;
1479 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1480 	nr_mbufs *= nr_port;
1481 
1482 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1483 					    nr_mbuf_cache, 0, mbuf_size,
1484 					    rte_socket_id());
1485 	if (mbuf_pool == NULL)
1486 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1487 }
1488 
1489 /*
1490  * Main function, does initialisation and calls the per-lcore functions.
1491  */
1492 int
1493 main(int argc, char *argv[])
1494 {
1495 	unsigned lcore_id, core_id = 0;
1496 	unsigned nb_ports, valid_num_ports;
1497 	int ret, i;
1498 	uint16_t portid;
1499 	static pthread_t tid;
1500 	uint64_t flags = 0;
1501 
1502 	signal(SIGINT, sigint_handler);
1503 
1504 	/* init EAL */
1505 	ret = rte_eal_init(argc, argv);
1506 	if (ret < 0)
1507 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1508 	argc -= ret;
1509 	argv += ret;
1510 
1511 	/* parse app arguments */
1512 	ret = us_vhost_parse_args(argc, argv);
1513 	if (ret < 0)
1514 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1515 
1516 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1517 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1518 
1519 		if (rte_lcore_is_enabled(lcore_id))
1520 			lcore_ids[core_id++] = lcore_id;
1521 	}
1522 
1523 	if (rte_lcore_count() > RTE_MAX_LCORE)
1524 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1525 
1526 	/* Get the number of physical ports. */
1527 	nb_ports = rte_eth_dev_count_avail();
1528 
1529 	/*
1530 	 * Update the global var NUM_PORTS and the global array PORTS, and get the
1531 	 * value of VALID_NUM_PORTS according to the number of system ports
1532 	 */
1533 	valid_num_ports = check_ports_num(nb_ports);
1534 
1535 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1536 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1537 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1538 		return -1;
1539 	}
1540 
1541 	/*
1542 	 * FIXME: here we are trying to allocate mbufs big enough for
1543 	 * @MAX_QUEUES, but the truth is we're never going to use that
1544 	 * many queues here. We probably should only do allocation for
1545 	 * those queues we are going to use.
1546 	 */
1547 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1548 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1549 
1550 	if (vm2vm_mode == VM2VM_HARDWARE) {
1551 		/* Enable VT loop back to let L2 switch to do it. */
1552 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1553 		RTE_LOG(DEBUG, VHOST_CONFIG,
1554 			"Enable loop back for L2 switch in vmdq.\n");
1555 	}
1556 
1557 	/* initialize all ports */
1558 	RTE_ETH_FOREACH_DEV(portid) {
1559 		/* skip ports that are not enabled */
1560 		if ((enabled_port_mask & (1 << portid)) == 0) {
1561 			RTE_LOG(INFO, VHOST_PORT,
1562 				"Skipping disabled port %d\n", portid);
1563 			continue;
1564 		}
1565 		if (port_init(portid) != 0)
1566 			rte_exit(EXIT_FAILURE,
1567 				"Cannot initialize network ports\n");
1568 	}
1569 
1570 	/* Enable stats if the user option is set. */
1571 	if (enable_stats) {
1572 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1573 					print_stats, NULL);
1574 		if (ret < 0)
1575 			rte_exit(EXIT_FAILURE,
1576 				"Cannot create print-stats thread\n");
1577 	}
1578 
1579 	/* Launch all data cores. */
1580 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1581 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1582 
1583 	if (client_mode)
1584 		flags |= RTE_VHOST_USER_CLIENT;
1585 
1586 	/* Register vhost user driver to handle vhost messages. */
1587 	for (i = 0; i < nb_sockets; i++) {
1588 		char *file = socket_files + i * PATH_MAX;
1589 		if (async_vhost_driver)
1590 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1591 
1592 		ret = rte_vhost_driver_register(file, flags);
1593 		if (ret != 0) {
1594 			unregister_drivers(i);
1595 			rte_exit(EXIT_FAILURE,
1596 				"vhost driver register failure.\n");
1597 		}
1598 
1599 		if (builtin_net_driver)
1600 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1601 
1602 		if (mergeable == 0) {
1603 			rte_vhost_driver_disable_features(file,
1604 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1605 		}
1606 
1607 		if (enable_tx_csum == 0) {
1608 			rte_vhost_driver_disable_features(file,
1609 				1ULL << VIRTIO_NET_F_CSUM);
1610 		}
1611 
1612 		if (enable_tso == 0) {
1613 			rte_vhost_driver_disable_features(file,
1614 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1615 			rte_vhost_driver_disable_features(file,
1616 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1617 			rte_vhost_driver_disable_features(file,
1618 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1619 			rte_vhost_driver_disable_features(file,
1620 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1621 		}
1622 
1623 		if (promiscuous) {
1624 			rte_vhost_driver_enable_features(file,
1625 				1ULL << VIRTIO_NET_F_CTRL_RX);
1626 		}
1627 
1628 		ret = rte_vhost_driver_callback_register(file,
1629 			&virtio_net_device_ops);
1630 		if (ret != 0) {
1631 			rte_exit(EXIT_FAILURE,
1632 				"failed to register vhost driver callbacks.\n");
1633 		}
1634 
1635 		if (rte_vhost_driver_start(file) < 0) {
1636 			rte_exit(EXIT_FAILURE,
1637 				"failed to start vhost driver.\n");
1638 		}
1639 	}
1640 
1641 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1642 		rte_eal_wait_lcore(lcore_id);
1643 
1644 	return 0;
1645 
1646 }
1647