xref: /dpdk/examples/vhost/main.c (revision 59f3a8acbcdbafeebe816a26d76dfb06e6450f31)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "ioat.h"
29 #include "main.h"
30 
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34 
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37 
38 #define MBUF_CACHE_SIZE	128
39 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
40 
41 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
42 
43 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
45 
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
48 
49 /* State of virtio device. */
50 #define DEVICE_MAC_LEARNING 0
51 #define DEVICE_RX			1
52 #define DEVICE_SAFE_REMOVE	2
53 
54 /* Configurable number of RX/TX ring descriptors */
55 #define RTE_TEST_RX_DESC_DEFAULT 1024
56 #define RTE_TEST_TX_DESC_DEFAULT 512
57 
58 #define INVALID_PORT_ID 0xFF
59 
60 /* mask of enabled ports */
61 static uint32_t enabled_port_mask = 0;
62 
63 /* Promiscuous mode */
64 static uint32_t promiscuous;
65 
66 /* number of devices/queues to support*/
67 static uint32_t num_queues = 0;
68 static uint32_t num_devices;
69 
70 static struct rte_mempool *mbuf_pool;
71 static int mergeable;
72 
73 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
74 typedef enum {
75 	VM2VM_DISABLED = 0,
76 	VM2VM_SOFTWARE = 1,
77 	VM2VM_HARDWARE = 2,
78 	VM2VM_LAST
79 } vm2vm_type;
80 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
81 
82 /* Enable stats. */
83 static uint32_t enable_stats = 0;
84 /* Enable retries on RX. */
85 static uint32_t enable_retry = 1;
86 
87 /* Disable TX checksum offload */
88 static uint32_t enable_tx_csum;
89 
90 /* Disable TSO offload */
91 static uint32_t enable_tso;
92 
93 static int client_mode;
94 
95 static int builtin_net_driver;
96 
97 static int async_vhost_driver;
98 
99 static char *dma_type;
100 
101 /* Specify the timeout (in microseconds) between retries on RX. */
102 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
103 /* Specify the number of retries on RX. */
104 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
105 
106 /* Socket file paths. Can be set by user */
107 static char *socket_files;
108 static int nb_sockets;
109 
110 /* Empty VMDQ configuration structure. Filled in programmatically */
111 static struct rte_eth_conf vmdq_conf_default = {
112 	.rxmode = {
113 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
114 		.split_hdr_size = 0,
115 		/*
116 		 * VLAN strip is necessary for 1G NICs such as the I350;
117 		 * it fixes a bug where IPv4 forwarding in the guest cannot
118 		 * forward packets from one virtio dev to another virtio dev.
119 		 */
120 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
121 	},
122 
123 	.txmode = {
124 		.mq_mode = ETH_MQ_TX_NONE,
125 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
126 			     DEV_TX_OFFLOAD_TCP_CKSUM |
127 			     DEV_TX_OFFLOAD_VLAN_INSERT |
128 			     DEV_TX_OFFLOAD_MULTI_SEGS |
129 			     DEV_TX_OFFLOAD_TCP_TSO),
130 	},
131 	.rx_adv_conf = {
132 		/*
133 		 * should be overridden separately in code with
134 		 * appropriate values
135 		 */
136 		.vmdq_rx_conf = {
137 			.nb_queue_pools = ETH_8_POOLS,
138 			.enable_default_pool = 0,
139 			.default_pool = 0,
140 			.nb_pool_maps = 0,
141 			.pool_map = {{0, 0},},
142 		},
143 	},
144 };
145 
146 
147 static unsigned lcore_ids[RTE_MAX_LCORE];
148 static uint16_t ports[RTE_MAX_ETHPORTS];
149 static unsigned num_ports = 0; /**< The number of ports specified in command line */
150 static uint16_t num_pf_queues, num_vmdq_queues;
151 static uint16_t vmdq_pool_base, vmdq_queue_base;
152 static uint16_t queues_per_pool;
153 
154 const uint16_t vlan_tags[] = {
155 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
156 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
157 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
158 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
159 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
160 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
161 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
162 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
163 };
164 
165 /* ethernet addresses of ports */
166 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
167 
168 static struct vhost_dev_tailq_list vhost_dev_list =
169 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
170 
171 static struct lcore_info lcore_info[RTE_MAX_LCORE];
172 
173 /* Used for queueing bursts of TX packets. */
174 struct mbuf_table {
175 	unsigned len;
176 	unsigned txq_id;
177 	struct rte_mbuf *m_table[MAX_PKT_BURST];
178 };
179 
180 struct vhost_bufftable {
181 	uint32_t len;
182 	uint64_t pre_tsc;
183 	struct rte_mbuf *m_table[MAX_PKT_BURST];
184 };
185 
186 /* TX queue for each data core. */
187 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
188 
189 /*
190  * Vhost TX buffer for each data core.
191  * Every data core maintains a TX buffer for every vhost device,
192  * which is used to enqueue packets in batches for higher performance.
193  */
194 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
195 
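/*
 * Drain period of the TX mbuf tables, expressed in TSC cycles: the
 * cycles-per-microsecond count is rounded up before being scaled by
 * BURST_TX_DRAIN_US so the period never truncates to zero.
 */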
196 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
197 				 / US_PER_S * BURST_TX_DRAIN_US)
198 #define VLAN_HLEN       4
199 
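/*
 * Open the DMA channels described by the --dmas argument; only the "ioat"
 * dma-type is supported here.
 */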
200 static inline int
201 open_dma(const char *value)
202 {
203 	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
204 		return open_ioat(value);
205 
206 	return -1;
207 }
208 
209 /*
210  * Builds up the correct configuration for VMDQ VLAN pool map
211  * according to the pool & queue limits.
212  */
213 static inline int
214 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
215 {
216 	struct rte_eth_vmdq_rx_conf conf;
217 	struct rte_eth_vmdq_rx_conf *def_conf =
218 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
219 	unsigned i;
220 
221 	memset(&conf, 0, sizeof(conf));
222 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
223 	conf.nb_pool_maps = num_devices;
224 	conf.enable_loop_back = def_conf->enable_loop_back;
225 	conf.rx_mode = def_conf->rx_mode;
226 
227 	for (i = 0; i < conf.nb_pool_maps; i++) {
228 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
229 		conf.pool_map[i].pools = (1UL << i);
230 	}
231 
232 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
233 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
234 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
235 	return 0;
236 }
237 
238 /*
239  * Initialises a given port using global settings, with the RX buffers
240  * coming from the global mbuf_pool.
241  */
242 static inline int
243 port_init(uint16_t port)
244 {
245 	struct rte_eth_dev_info dev_info;
246 	struct rte_eth_conf port_conf;
247 	struct rte_eth_rxconf *rxconf;
248 	struct rte_eth_txconf *txconf;
249 	int16_t rx_rings, tx_rings;
250 	uint16_t rx_ring_size, tx_ring_size;
251 	int retval;
252 	uint16_t q;
253 
254 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
255 	retval = rte_eth_dev_info_get(port, &dev_info);
256 	if (retval != 0) {
257 		RTE_LOG(ERR, VHOST_PORT,
258 			"Error during getting device (port %u) info: %s\n",
259 			port, strerror(-retval));
260 
261 		return retval;
262 	}
263 
264 	rxconf = &dev_info.default_rxconf;
265 	txconf = &dev_info.default_txconf;
266 	rxconf->rx_drop_en = 1;
267 
268 	/* Configure the number of supported virtio devices based on VMDQ limits */
269 	num_devices = dev_info.max_vmdq_pools;
270 
271 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
272 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
273 
274 	tx_rings = (uint16_t)rte_lcore_count();
275 
276 	/* Get port configuration. */
277 	retval = get_eth_conf(&port_conf, num_devices);
278 	if (retval < 0)
279 		return retval;
280 	/* NIC queues are divided into pf queues and vmdq queues.  */
281 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
282 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
283 	num_vmdq_queues = num_devices * queues_per_pool;
284 	num_queues = num_pf_queues + num_vmdq_queues;
285 	vmdq_queue_base = dev_info.vmdq_queue_base;
286 	vmdq_pool_base  = dev_info.vmdq_pool_base;
287 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
288 		num_pf_queues, num_devices, queues_per_pool);
289 
290 	if (!rte_eth_dev_is_valid_port(port))
291 		return -1;
292 
293 	rx_rings = (uint16_t)dev_info.max_rx_queues;
294 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
295 		port_conf.txmode.offloads |=
296 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
297 	/* Configure ethernet device. */
298 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
299 	if (retval != 0) {
300 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
301 			port, strerror(-retval));
302 		return retval;
303 	}
304 
305 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
306 		&tx_ring_size);
307 	if (retval != 0) {
308 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
309 			"for port %u: %s.\n", port, strerror(-retval));
310 		return retval;
311 	}
312 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
313 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
314 			"for Rx queues on port %u.\n", port);
315 		return -1;
316 	}
317 
318 	/* Setup the queues. */
319 	rxconf->offloads = port_conf.rxmode.offloads;
320 	for (q = 0; q < rx_rings; q ++) {
321 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
322 						rte_eth_dev_socket_id(port),
323 						rxconf,
324 						mbuf_pool);
325 		if (retval < 0) {
326 			RTE_LOG(ERR, VHOST_PORT,
327 				"Failed to setup rx queue %u of port %u: %s.\n",
328 				q, port, strerror(-retval));
329 			return retval;
330 		}
331 	}
332 	txconf->offloads = port_conf.txmode.offloads;
333 	for (q = 0; q < tx_rings; q ++) {
334 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
335 						rte_eth_dev_socket_id(port),
336 						txconf);
337 		if (retval < 0) {
338 			RTE_LOG(ERR, VHOST_PORT,
339 				"Failed to setup tx queue %u of port %u: %s.\n",
340 				q, port, strerror(-retval));
341 			return retval;
342 		}
343 	}
344 
345 	/* Start the device. */
346 	retval  = rte_eth_dev_start(port);
347 	if (retval < 0) {
348 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
349 			port, strerror(-retval));
350 		return retval;
351 	}
352 
353 	if (promiscuous) {
354 		retval = rte_eth_promiscuous_enable(port);
355 		if (retval != 0) {
356 			RTE_LOG(ERR, VHOST_PORT,
357 				"Failed to enable promiscuous mode on port %u: %s\n",
358 				port, rte_strerror(-retval));
359 			return retval;
360 		}
361 	}
362 
363 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
364 	if (retval < 0) {
365 		RTE_LOG(ERR, VHOST_PORT,
366 			"Failed to get MAC address on port %u: %s\n",
367 			port, rte_strerror(-retval));
368 		return retval;
369 	}
370 
371 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
372 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
373 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
374 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
375 
376 	return 0;
377 }
378 
379 /*
380  * Set socket file path.
381  */
382 static int
383 us_vhost_parse_socket_path(const char *q_arg)
384 {
385 	char *old;
386 
387 	/* Reject socket paths that do not fit in PATH_MAX */
388 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
389 		return -1;
390 
391 	old = socket_files;
392 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
393 	if (socket_files == NULL) {
394 		free(old);
395 		return -1;
396 	}
397 
398 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
399 	nb_sockets++;
400 
401 	return 0;
402 }
403 
404 /*
405  * Parse the portmask provided at run time.
406  */
407 static int
408 parse_portmask(const char *portmask)
409 {
410 	char *end = NULL;
411 	unsigned long pm;
412 
413 	errno = 0;
414 
415 	/* parse hexadecimal string */
416 	pm = strtoul(portmask, &end, 16);
417 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
418 		return 0;
419 
420 	return pm;
421 
422 }
423 
424 /*
425  * Parse num options at run time.
426  */
427 static int
428 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
429 {
430 	char *end = NULL;
431 	unsigned long num;
432 
433 	errno = 0;
434 
435 	/* parse unsigned int string */
436 	num = strtoul(q_arg, &end, 10);
437 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
438 		return -1;
439 
440 	if (num > max_valid_value)
441 		return -1;
442 
443 	return num;
444 
445 }
446 
447 /*
448  * Display usage
449  */
450 static void
451 us_vhost_usage(const char *prgname)
452 {
453 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
454 	"		--vm2vm [0|1|2]\n"
455 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
456 	"		--socket-file <path>\n"
457 	"		--nb-devices ND\n"
458 	"		-p PORTMASK: Set mask for ports to be used by application\n"
459 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
460 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
461 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
462 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
463 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
464 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
465 	"		--socket-file: The path of the socket file.\n"
466 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
467 	"		--tso [0|1] disable/enable TCP segment offload.\n"
468 	"		--client register a vhost-user socket as client mode.\n"
469 	"		--dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
470 	"		--dmas register dma channel for specific vhost device.\n",
471 	       prgname);
472 }
473 
474 enum {
475 #define OPT_VM2VM               "vm2vm"
476 	OPT_VM2VM_NUM = 256,
477 #define OPT_RX_RETRY            "rx-retry"
478 	OPT_RX_RETRY_NUM,
479 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
480 	OPT_RX_RETRY_DELAY_NUM,
481 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
482 	OPT_RX_RETRY_NUMB_NUM,
483 #define OPT_MERGEABLE           "mergeable"
484 	OPT_MERGEABLE_NUM,
485 #define OPT_STATS               "stats"
486 	OPT_STATS_NUM,
487 #define OPT_SOCKET_FILE         "socket-file"
488 	OPT_SOCKET_FILE_NUM,
489 #define OPT_TX_CSUM             "tx-csum"
490 	OPT_TX_CSUM_NUM,
491 #define OPT_TSO                 "tso"
492 	OPT_TSO_NUM,
493 #define OPT_CLIENT              "client"
494 	OPT_CLIENT_NUM,
495 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
496 	OPT_BUILTIN_NET_DRIVER_NUM,
497 #define OPT_DMA_TYPE            "dma-type"
498 	OPT_DMA_TYPE_NUM,
499 #define OPT_DMAS                "dmas"
500 	OPT_DMAS_NUM,
501 };
502 
503 /*
504  * Parse the arguments given in the command line of the application.
505  */
506 static int
507 us_vhost_parse_args(int argc, char **argv)
508 {
509 	int opt, ret;
510 	int option_index;
511 	unsigned i;
512 	const char *prgname = argv[0];
513 	static struct option long_option[] = {
514 		{OPT_VM2VM, required_argument,
515 				NULL, OPT_VM2VM_NUM},
516 		{OPT_RX_RETRY, required_argument,
517 				NULL, OPT_RX_RETRY_NUM},
518 		{OPT_RX_RETRY_DELAY, required_argument,
519 				NULL, OPT_RX_RETRY_DELAY_NUM},
520 		{OPT_RX_RETRY_NUMB, required_argument,
521 				NULL, OPT_RX_RETRY_NUMB_NUM},
522 		{OPT_MERGEABLE, required_argument,
523 				NULL, OPT_MERGEABLE_NUM},
524 		{OPT_STATS, required_argument,
525 				NULL, OPT_STATS_NUM},
526 		{OPT_SOCKET_FILE, required_argument,
527 				NULL, OPT_SOCKET_FILE_NUM},
528 		{OPT_TX_CSUM, required_argument,
529 				NULL, OPT_TX_CSUM_NUM},
530 		{OPT_TSO, required_argument,
531 				NULL, OPT_TSO_NUM},
532 		{OPT_CLIENT, no_argument,
533 				NULL, OPT_CLIENT_NUM},
534 		{OPT_BUILTIN_NET_DRIVER, no_argument,
535 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
536 		{OPT_DMA_TYPE, required_argument,
537 				NULL, OPT_DMA_TYPE_NUM},
538 		{OPT_DMAS, required_argument,
539 				NULL, OPT_DMAS_NUM},
540 		{NULL, 0, 0, 0},
541 	};
542 
543 	/* Parse command line */
544 	while ((opt = getopt_long(argc, argv, "p:P",
545 			long_option, &option_index)) != EOF) {
546 		switch (opt) {
547 		/* Portmask */
548 		case 'p':
549 			enabled_port_mask = parse_portmask(optarg);
550 			if (enabled_port_mask == 0) {
551 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
552 				us_vhost_usage(prgname);
553 				return -1;
554 			}
555 			break;
556 
557 		case 'P':
558 			promiscuous = 1;
559 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
560 				ETH_VMDQ_ACCEPT_BROADCAST |
561 				ETH_VMDQ_ACCEPT_MULTICAST;
562 			break;
563 
564 		case OPT_VM2VM_NUM:
565 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
566 			if (ret == -1) {
567 				RTE_LOG(INFO, VHOST_CONFIG,
568 					"Invalid argument for "
569 					"vm2vm [0|1|2]\n");
570 				us_vhost_usage(prgname);
571 				return -1;
572 			}
573 			vm2vm_mode = (vm2vm_type)ret;
574 			break;
575 
576 		case OPT_RX_RETRY_NUM:
577 			ret = parse_num_opt(optarg, 1);
578 			if (ret == -1) {
579 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
580 				us_vhost_usage(prgname);
581 				return -1;
582 			}
583 			enable_retry = ret;
584 			break;
585 
586 		case OPT_TX_CSUM_NUM:
587 			ret = parse_num_opt(optarg, 1);
588 			if (ret == -1) {
589 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
590 				us_vhost_usage(prgname);
591 				return -1;
592 			}
593 			enable_tx_csum = ret;
594 			break;
595 
596 		case OPT_TSO_NUM:
597 			ret = parse_num_opt(optarg, 1);
598 			if (ret == -1) {
599 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
600 				us_vhost_usage(prgname);
601 				return -1;
602 			}
603 			enable_tso = ret;
604 			break;
605 
606 		case OPT_RX_RETRY_DELAY_NUM:
607 			ret = parse_num_opt(optarg, INT32_MAX);
608 			if (ret == -1) {
609 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
610 				us_vhost_usage(prgname);
611 				return -1;
612 			}
613 			burst_rx_delay_time = ret;
614 			break;
615 
616 		case OPT_RX_RETRY_NUMB_NUM:
617 			ret = parse_num_opt(optarg, INT32_MAX);
618 			if (ret == -1) {
619 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
620 				us_vhost_usage(prgname);
621 				return -1;
622 			}
623 			burst_rx_retry_num = ret;
624 			break;
625 
626 		case OPT_MERGEABLE_NUM:
627 			ret = parse_num_opt(optarg, 1);
628 			if (ret == -1) {
629 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
630 				us_vhost_usage(prgname);
631 				return -1;
632 			}
633 			mergeable = !!ret;
634 			if (ret)
635 				vmdq_conf_default.rxmode.mtu = MAX_MTU;
636 			break;
637 
638 		case OPT_STATS_NUM:
639 			ret = parse_num_opt(optarg, INT32_MAX);
640 			if (ret == -1) {
641 				RTE_LOG(INFO, VHOST_CONFIG,
642 					"Invalid argument for stats [0..N]\n");
643 				us_vhost_usage(prgname);
644 				return -1;
645 			}
646 			enable_stats = ret;
647 			break;
648 
649 		/* Set socket file path. */
650 		case OPT_SOCKET_FILE_NUM:
651 			if (us_vhost_parse_socket_path(optarg) == -1) {
652 				RTE_LOG(INFO, VHOST_CONFIG,
653 				"Invalid argument for socket name (Max %d characters)\n",
654 				PATH_MAX);
655 				us_vhost_usage(prgname);
656 				return -1;
657 			}
658 			break;
659 
660 		case OPT_DMA_TYPE_NUM:
661 			dma_type = optarg;
662 			break;
663 
664 		case OPT_DMAS_NUM:
665 			if (open_dma(optarg) == -1) {
666 				RTE_LOG(INFO, VHOST_CONFIG,
667 					"Wrong DMA args\n");
668 				us_vhost_usage(prgname);
669 				return -1;
670 			}
671 			async_vhost_driver = 1;
672 			break;
673 
674 		case OPT_CLIENT_NUM:
675 			client_mode = 1;
676 			break;
677 
678 		case OPT_BUILTIN_NET_DRIVER_NUM:
679 			builtin_net_driver = 1;
680 			break;
681 
682 		/* Invalid option - print options. */
683 		default:
684 			us_vhost_usage(prgname);
685 			return -1;
686 		}
687 	}
688 
689 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
690 		if (enabled_port_mask & (1 << i))
691 			ports[num_ports++] = i;
692 	}
693 
694 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
695 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
696 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
697 		return -1;
698 	}
699 
700 	return 0;
701 }
702 
703 /*
704  * Update the global variable num_ports and the array ports according to
705  * the number of system ports, and return the number of valid ports
706  */
707 static unsigned check_ports_num(unsigned nb_ports)
708 {
709 	unsigned valid_num_ports = num_ports;
710 	unsigned portid;
711 
712 	if (num_ports > nb_ports) {
713 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
714 			num_ports, nb_ports);
715 		num_ports = nb_ports;
716 	}
717 
718 	for (portid = 0; portid < num_ports; portid ++) {
719 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
720 			RTE_LOG(INFO, VHOST_PORT,
721 				"\nSpecified port ID(%u) is not valid\n",
722 				ports[portid]);
723 			ports[portid] = INVALID_PORT_ID;
724 			valid_num_ports--;
725 		}
726 	}
727 	return valid_num_ports;
728 }
729 
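/*
 * Look up a vhost device that is ready for RX by its MAC address.
 * Returns NULL if no matching device has been registered.
 */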
730 static __rte_always_inline struct vhost_dev *
731 find_vhost_dev(struct rte_ether_addr *mac)
732 {
733 	struct vhost_dev *vdev;
734 
735 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
736 		if (vdev->ready == DEVICE_RX &&
737 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
738 			return vdev;
739 	}
740 
741 	return NULL;
742 }
743 
744 /*
745  * This function learns the MAC address of the device and registers this along with a
746  * vlan tag to a VMDQ.
747  */
748 static int
749 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
750 {
751 	struct rte_ether_hdr *pkt_hdr;
752 	int i, ret;
753 
754 	/* Learn MAC address of guest device from packet */
755 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
756 
757 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
758 		RTE_LOG(ERR, VHOST_DATA,
759 			"(%d) device is using a registered MAC!\n",
760 			vdev->vid);
761 		return -1;
762 	}
763 
764 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
765 		vdev->mac_address.addr_bytes[i] =
766 			pkt_hdr->src_addr.addr_bytes[i];
767 
768 	/* vlan_tag currently uses the device_id. */
769 	vdev->vlan_tag = vlan_tags[vdev->vid];
770 
771 	/* Print out VMDQ registration info. */
772 	RTE_LOG(INFO, VHOST_DATA,
773 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
774 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
775 		vdev->vlan_tag);
776 
777 	/* Register the MAC address. */
778 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
779 				(uint32_t)vdev->vid + vmdq_pool_base);
780 	if (ret)
781 		RTE_LOG(ERR, VHOST_DATA,
782 			"(%d) failed to add device MAC address to VMDQ\n",
783 			vdev->vid);
784 
785 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
786 
787 	/* Set device as ready for RX. */
788 	vdev->ready = DEVICE_RX;
789 
790 	return 0;
791 }
792 
793 /*
794  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
795  * queue before disabling RX on the device.
796  */
797 static inline void
798 unlink_vmdq(struct vhost_dev *vdev)
799 {
800 	unsigned i = 0;
801 	unsigned rx_count;
802 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
803 
804 	if (vdev->ready == DEVICE_RX) {
805 		/* Clear MAC and VLAN settings */
806 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
807 		for (i = 0; i < 6; i++)
808 			vdev->mac_address.addr_bytes[i] = 0;
809 
810 		vdev->vlan_tag = 0;
811 
812 		/* Clear out the receive buffers */
813 		rx_count = rte_eth_rx_burst(ports[0],
814 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
815 
816 		while (rx_count) {
817 			for (i = 0; i < rx_count; i++)
818 				rte_pktmbuf_free(pkts_burst[i]);
819 
820 			rx_count = rte_eth_rx_burst(ports[0],
821 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
822 		}
823 
824 		vdev->ready = DEVICE_MAC_LEARNING;
825 	}
826 }
827 
828 static inline void
829 free_pkts(struct rte_mbuf **pkts, uint16_t n)
830 {
831 	while (n--)
832 		rte_pktmbuf_free(pkts[n]);
833 }
834 
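/*
 * Poll for async enqueue copies that have completed, free the corresponding
 * mbufs and decrement the device's in-flight packet counter.
 */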
835 static __rte_always_inline void
836 complete_async_pkts(struct vhost_dev *vdev)
837 {
838 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
839 	uint16_t complete_count;
840 
841 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
842 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
843 	if (complete_count) {
844 		free_pkts(p_cpl, complete_count);
845 		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
846 	}
847 
848 }
849 
850 static __rte_always_inline void
851 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
852 	    struct rte_mbuf *m)
853 {
854 	uint16_t ret;
855 
856 	if (builtin_net_driver) {
857 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
858 	} else {
859 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
860 	}
861 
862 	if (enable_stats) {
863 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
864 				__ATOMIC_SEQ_CST);
865 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
866 				__ATOMIC_SEQ_CST);
867 		src_vdev->stats.tx_total++;
868 		src_vdev->stats.tx += ret;
869 	}
870 }
871 
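/*
 * Flush the per-lcore TX buffer accumulated for this vhost device,
 * enqueueing the batched packets through the builtin, async or sync path.
 */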
872 static __rte_always_inline void
873 drain_vhost(struct vhost_dev *vdev)
874 {
875 	uint16_t ret;
876 	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
877 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
878 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
879 
880 	if (builtin_net_driver) {
881 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
882 	} else if (async_vhost_driver) {
883 		uint16_t enqueue_fail = 0;
884 
885 		complete_async_pkts(vdev);
886 		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit);
887 		__atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
888 
889 		enqueue_fail = nr_xmit - ret;
890 		if (enqueue_fail)
891 			free_pkts(&m[ret], nr_xmit - ret);
892 	} else {
893 		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
894 						m, nr_xmit);
895 	}
896 
897 	if (enable_stats) {
898 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
899 				__ATOMIC_SEQ_CST);
900 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
901 				__ATOMIC_SEQ_CST);
902 	}
903 
904 	if (!async_vhost_driver)
905 		free_pkts(m, nr_xmit);
906 }
907 
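/*
 * Walk all vhost devices and flush any per-lcore TX buffer that has not been
 * drained within MBUF_TABLE_DRAIN_TSC cycles.
 */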
908 static __rte_always_inline void
909 drain_vhost_table(void)
910 {
911 	uint16_t lcore_id = rte_lcore_id();
912 	struct vhost_bufftable *vhost_txq;
913 	struct vhost_dev *vdev;
914 	uint64_t cur_tsc;
915 
916 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
917 		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
918 						+ vdev->vid];
919 
920 		cur_tsc = rte_rdtsc();
921 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
922 				> MBUF_TABLE_DRAIN_TSC)) {
923 			RTE_LOG_DP(DEBUG, VHOST_DATA,
924 				"Vhost TX queue drained after timeout with burst size %u\n",
925 				vhost_txq->len);
926 			drain_vhost(vdev);
927 			vhost_txq->len = 0;
928 			vhost_txq->pre_tsc = cur_tsc;
929 		}
930 	}
931 }
932 
933 /*
934  * Check if the packet destination MAC address is for a local device. If so then put
935  * the packet on that device's RX queue. If not then return.
936  */
937 static __rte_always_inline int
938 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
939 {
940 	struct rte_ether_hdr *pkt_hdr;
941 	struct vhost_dev *dst_vdev;
942 	struct vhost_bufftable *vhost_txq;
943 	uint16_t lcore_id = rte_lcore_id();
944 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
945 
946 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
947 	if (!dst_vdev)
948 		return -1;
949 
950 	if (vdev->vid == dst_vdev->vid) {
951 		RTE_LOG_DP(DEBUG, VHOST_DATA,
952 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
953 			vdev->vid);
954 		return 0;
955 	}
956 
957 	RTE_LOG_DP(DEBUG, VHOST_DATA,
958 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
959 
960 	if (unlikely(dst_vdev->remove)) {
961 		RTE_LOG_DP(DEBUG, VHOST_DATA,
962 			"(%d) device is marked for removal\n", dst_vdev->vid);
963 		return 0;
964 	}
965 
966 	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
967 	vhost_txq->m_table[vhost_txq->len++] = m;
968 
969 	if (enable_stats) {
970 		vdev->stats.tx_total++;
971 		vdev->stats.tx++;
972 	}
973 
974 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
975 		drain_vhost(dst_vdev);
976 		vhost_txq->len = 0;
977 		vhost_txq->pre_tsc = rte_rdtsc();
978 	}
979 	return 0;
980 }
981 
982 /*
983  * Check if the destination MAC of a packet belongs to a local VM
984  * and, if so, get its VLAN tag and the packet length offset.
985  */
986 static __rte_always_inline int
987 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
988 	uint32_t *offset, uint16_t *vlan_tag)
989 {
990 	struct vhost_dev *dst_vdev;
991 	struct rte_ether_hdr *pkt_hdr =
992 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
993 
994 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
995 	if (!dst_vdev)
996 		return 0;
997 
998 	if (vdev->vid == dst_vdev->vid) {
999 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1000 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1001 			vdev->vid);
1002 		return -1;
1003 	}
1004 
1005 	/*
1006 	 * HW VLAN strip reduces the packet length by the length of
1007 	 * the VLAN tag, so the packet length needs to be restored
1008 	 * by adding it back.
1009 	 */
1010 	*offset  = VLAN_HLEN;
1011 	*vlan_tag = vlan_tags[vdev->vid];
1012 
1013 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1014 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1015 		vdev->vid, dst_vdev->vid, *vlan_tag);
1016 
1017 	return 0;
1018 }
1019 
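/*
 * Prepare TSO/checksum offload metadata for an LRO packet that is about to
 * be sent to the physical port: parse the packet type, fill in the header
 * lengths and compute the pseudo-header checksum.
 */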
1020 static void virtio_tx_offload(struct rte_mbuf *m)
1021 {
1022 	struct rte_net_hdr_lens hdr_lens;
1023 	struct rte_ipv4_hdr *ipv4_hdr;
1024 	struct rte_tcp_hdr *tcp_hdr;
1025 	uint32_t ptype;
1026 	void *l3_hdr;
1027 
1028 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1029 	m->l2_len = hdr_lens.l2_len;
1030 	m->l3_len = hdr_lens.l3_len;
1031 	m->l4_len = hdr_lens.l4_len;
1032 
1033 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1034 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1035 		m->l2_len + m->l3_len);
1036 
1037 	m->ol_flags |= PKT_TX_TCP_SEG;
1038 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1039 		m->ol_flags |= PKT_TX_IPV4;
1040 		m->ol_flags |= PKT_TX_IP_CKSUM;
1041 		ipv4_hdr = l3_hdr;
1042 		ipv4_hdr->hdr_checksum = 0;
1043 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1044 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1045 		m->ol_flags |= PKT_TX_IPV6;
1046 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1047 	}
1048 }
1049 
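/*
 * Transmit the per-lcore mbuf table on the physical port and free any
 * packets the NIC did not accept.
 */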
1050 static __rte_always_inline void
1051 do_drain_mbuf_table(struct mbuf_table *tx_q)
1052 {
1053 	uint16_t count;
1054 
1055 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1056 				 tx_q->m_table, tx_q->len);
1057 	if (unlikely(count < tx_q->len))
1058 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1059 
1060 	tx_q->len = 0;
1061 }
1062 
1063 /*
1064  * This function routes the TX packet to the correct interface. This
1065  * may be a local device or the physical port.
1066  */
1067 static __rte_always_inline void
1068 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1069 {
1070 	struct mbuf_table *tx_q;
1071 	unsigned offset = 0;
1072 	const uint16_t lcore_id = rte_lcore_id();
1073 	struct rte_ether_hdr *nh;
1074 
1075 
1076 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1077 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1078 		struct vhost_dev *vdev2;
1079 
1080 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1081 			if (vdev2 != vdev)
1082 				sync_virtio_xmit(vdev2, vdev, m);
1083 		}
1084 		goto queue2nic;
1085 	}
1086 
1087 	/* Check if the destination is a local VM */
1088 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1089 		return;
1090 
1091 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1092 		if (unlikely(find_local_dest(vdev, m, &offset,
1093 					     &vlan_tag) != 0)) {
1094 			rte_pktmbuf_free(m);
1095 			return;
1096 		}
1097 	}
1098 
1099 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1100 		"(%d) TX: MAC address is external\n", vdev->vid);
1101 
1102 queue2nic:
1103 
1104 	/* Add the packet to the port TX queue */
1105 	tx_q = &lcore_tx_queue[lcore_id];
1106 
1107 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1108 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1109 		/* Guest has inserted the vlan tag. */
1110 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1111 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1112 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1113 			(vh->vlan_tci != vlan_tag_be))
1114 			vh->vlan_tci = vlan_tag_be;
1115 	} else {
1116 		m->ol_flags |= PKT_TX_VLAN_PKT;
1117 
1118 		/*
1119 		 * Find the right seg to adjust the data len when offset is
1120 		 * bigger than tail room size.
1121 		 */
1122 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1123 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1124 				m->data_len += offset;
1125 			else {
1126 				struct rte_mbuf *seg = m;
1127 
1128 				while ((seg->next != NULL) &&
1129 					(offset > rte_pktmbuf_tailroom(seg)))
1130 					seg = seg->next;
1131 
1132 				seg->data_len += offset;
1133 			}
1134 			m->pkt_len += offset;
1135 		}
1136 
1137 		m->vlan_tci = vlan_tag;
1138 	}
1139 
1140 	if (m->ol_flags & PKT_RX_LRO)
1141 		virtio_tx_offload(m);
1142 
1143 	tx_q->m_table[tx_q->len++] = m;
1144 	if (enable_stats) {
1145 		vdev->stats.tx_total++;
1146 		vdev->stats.tx++;
1147 	}
1148 
1149 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1150 		do_drain_mbuf_table(tx_q);
1151 }
1152 
1153 
1154 static __rte_always_inline void
1155 drain_mbuf_table(struct mbuf_table *tx_q)
1156 {
1157 	static uint64_t prev_tsc;
1158 	uint64_t cur_tsc;
1159 
1160 	if (tx_q->len == 0)
1161 		return;
1162 
1163 	cur_tsc = rte_rdtsc();
1164 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1165 		prev_tsc = cur_tsc;
1166 
1167 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1168 			"TX queue drained after timeout with burst size %u\n",
1169 			tx_q->len);
1170 		do_drain_mbuf_table(tx_q);
1171 	}
1172 }
1173 
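/*
 * Receive a burst from the physical port's VMDQ queue bound to this vhost
 * device and enqueue it into the guest's virtio RX ring, optionally retrying
 * while the ring is short of free entries.
 */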
1174 static __rte_always_inline void
1175 drain_eth_rx(struct vhost_dev *vdev)
1176 {
1177 	uint16_t rx_count, enqueue_count;
1178 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1179 
1180 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1181 				    pkts, MAX_PKT_BURST);
1182 
1183 	if (!rx_count)
1184 		return;
1185 
1186 	/*
1187 	 * When "enable_retry" is set, here we wait and retry when there
1188 	 * is no enough free slots in the queue to hold @rx_count packets,
1189 	 * to diminish packet loss.
1190 	 */
1191 	if (enable_retry &&
1192 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1193 			VIRTIO_RXQ))) {
1194 		uint32_t retry;
1195 
1196 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1197 			rte_delay_us(burst_rx_delay_time);
1198 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1199 					VIRTIO_RXQ))
1200 				break;
1201 		}
1202 	}
1203 
1204 	if (builtin_net_driver) {
1205 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1206 						pkts, rx_count);
1207 	} else if (async_vhost_driver) {
1208 		uint16_t enqueue_fail = 0;
1209 
1210 		complete_async_pkts(vdev);
1211 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1212 					VIRTIO_RXQ, pkts, rx_count);
1213 		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1214 
1215 		enqueue_fail = rx_count - enqueue_count;
1216 		if (enqueue_fail)
1217 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1218 
1219 	} else {
1220 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1221 						pkts, rx_count);
1222 	}
1223 
1224 	if (enable_stats) {
1225 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1226 				__ATOMIC_SEQ_CST);
1227 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1228 				__ATOMIC_SEQ_CST);
1229 	}
1230 
1231 	if (!async_vhost_driver)
1232 		free_pkts(pkts, rx_count);
1233 }
1234 
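/*
 * Dequeue a burst from the guest's virtio TX ring, learn the MAC/VLAN from
 * the first packet if needed, and route each packet via virtio_tx_route().
 */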
1235 static __rte_always_inline void
1236 drain_virtio_tx(struct vhost_dev *vdev)
1237 {
1238 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1239 	uint16_t count;
1240 	uint16_t i;
1241 
1242 	if (builtin_net_driver) {
1243 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1244 					pkts, MAX_PKT_BURST);
1245 	} else {
1246 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1247 					mbuf_pool, pkts, MAX_PKT_BURST);
1248 	}
1249 
1250 	/* setup VMDq for the first packet */
1251 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1252 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1253 			free_pkts(pkts, count);
1254 	}
1255 
1256 	for (i = 0; i < count; ++i)
1257 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1258 }
1259 
1260 /*
1261  * Main function of vhost-switch. It basically does:
1262  *
1263  * for each vhost device {
1264  *    - drain_eth_rx()
1265  *
1266  *      Which drains the host eth Rx queue linked to the vhost device
1267  *      and delivers all of the packets to the guest virtio Rx ring
1268  *      associated with this vhost device.
1269  *
1270  *    - drain_virtio_tx()
1271  *
1272  *      Which drains the guest virtio Tx queue and delivers all of the
1273  *      packets to the target, which could be another vhost device or the
1274  *      physical eth dev. The routing is done in function "virtio_tx_route".
1275  * }
1276  */
1277 static int
1278 switch_worker(void *arg __rte_unused)
1279 {
1280 	unsigned i;
1281 	unsigned lcore_id = rte_lcore_id();
1282 	struct vhost_dev *vdev;
1283 	struct mbuf_table *tx_q;
1284 
1285 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1286 
1287 	tx_q = &lcore_tx_queue[lcore_id];
1288 	for (i = 0; i < rte_lcore_count(); i++) {
1289 		if (lcore_ids[i] == lcore_id) {
1290 			tx_q->txq_id = i;
1291 			break;
1292 		}
1293 	}
1294 
1295 	while(1) {
1296 		drain_mbuf_table(tx_q);
1297 		drain_vhost_table();
1298 		/*
1299 		 * Inform the configuration core that we have exited the
1300 		 * linked list and that no devices are in use if requested.
1301 		 */
1302 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1303 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1304 
1305 		/*
1306 		 * Process vhost devices
1307 		 */
1308 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1309 			      lcore_vdev_entry) {
1310 			if (unlikely(vdev->remove)) {
1311 				unlink_vmdq(vdev);
1312 				vdev->ready = DEVICE_SAFE_REMOVE;
1313 				continue;
1314 			}
1315 
1316 			if (likely(vdev->ready == DEVICE_RX))
1317 				drain_eth_rx(vdev);
1318 
1319 			if (likely(!vdev->remove))
1320 				drain_virtio_tx(vdev);
1321 		}
1322 	}
1323 
1324 	return 0;
1325 }
1326 
1327 /*
1328  * Remove a device from the specific data core linked list and from the
1329  * main linked list. Synchronization occurs through the use of the
1330  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1331  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1332  */
1333 static void
1334 destroy_device(int vid)
1335 {
1336 	struct vhost_dev *vdev = NULL;
1337 	int lcore;
1338 	uint16_t i;
1339 
1340 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1341 		if (vdev->vid == vid)
1342 			break;
1343 	}
1344 	if (!vdev)
1345 		return;
1346 	/* Set the remove flag. */
1347 	vdev->remove = 1;
1348 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1349 		rte_pause();
1350 	}
1351 
1352 	for (i = 0; i < RTE_MAX_LCORE; i++)
1353 		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1354 
1355 	if (builtin_net_driver)
1356 		vs_vhost_net_remove(vdev);
1357 
1358 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1359 		     lcore_vdev_entry);
1360 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1361 
1362 
1363 	/* Set the dev_removal_flag on each lcore. */
1364 	RTE_LCORE_FOREACH_WORKER(lcore)
1365 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1366 
1367 	/*
1368 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1369 	 * we can be sure that they can no longer access the device removed
1370 	 * from the linked lists and that the devices are no longer in use.
1371 	 */
1372 	RTE_LCORE_FOREACH_WORKER(lcore) {
1373 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1374 			rte_pause();
1375 	}
1376 
1377 	lcore_info[vdev->coreid].device_num--;
1378 
1379 	RTE_LOG(INFO, VHOST_DATA,
1380 		"(%d) device has been removed from data core\n",
1381 		vdev->vid);
1382 
1383 	if (async_vhost_driver) {
1384 		uint16_t n_pkt = 0;
1385 		struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1386 
1387 		while (vdev->pkts_inflight) {
1388 			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1389 						m_cpl, vdev->pkts_inflight);
1390 			free_pkts(m_cpl, n_pkt);
1391 			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1392 		}
1393 
1394 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1395 	}
1396 
1397 	rte_free(vdev);
1398 }
1399 
1400 /*
1401  * A new device is added to a data core. First the device is added to the main linked list
1402  * and then allocated to a specific data core.
1403  */
1404 static int
1405 new_device(int vid)
1406 {
1407 	int lcore, core_add = 0;
1408 	uint16_t i;
1409 	uint32_t device_num_min = num_devices;
1410 	struct vhost_dev *vdev;
1411 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1412 	if (vdev == NULL) {
1413 		RTE_LOG(INFO, VHOST_DATA,
1414 			"(%d) couldn't allocate memory for vhost dev\n",
1415 			vid);
1416 		return -1;
1417 	}
1418 	vdev->vid = vid;
1419 
1420 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1421 		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1422 			= rte_zmalloc("vhost bufftable",
1423 				sizeof(struct vhost_bufftable),
1424 				RTE_CACHE_LINE_SIZE);
1425 
1426 		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1427 			RTE_LOG(INFO, VHOST_DATA,
1428 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1429 			return -1;
1430 		}
1431 	}
1432 
1433 	if (builtin_net_driver)
1434 		vs_vhost_net_setup(vdev);
1435 
1436 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1437 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1438 
1439 	/* Reset the ready flag */
1440 	vdev->ready = DEVICE_MAC_LEARNING;
1441 	vdev->remove = 0;
1442 
1443 	/* Find a suitable lcore to add the device. */
1444 	RTE_LCORE_FOREACH_WORKER(lcore) {
1445 		if (lcore_info[lcore].device_num < device_num_min) {
1446 			device_num_min = lcore_info[lcore].device_num;
1447 			core_add = lcore;
1448 		}
1449 	}
1450 	vdev->coreid = core_add;
1451 
1452 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1453 			  lcore_vdev_entry);
1454 	lcore_info[vdev->coreid].device_num++;
1455 
1456 	/* Disable notifications. */
1457 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1458 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1459 
1460 	RTE_LOG(INFO, VHOST_DATA,
1461 		"(%d) device has been added to data core %d\n",
1462 		vid, vdev->coreid);
1463 
1464 	if (async_vhost_driver) {
1465 		struct rte_vhost_async_config config = {0};
1466 		struct rte_vhost_async_channel_ops channel_ops;
1467 
1468 		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1469 			channel_ops.transfer_data = ioat_transfer_data_cb;
1470 			channel_ops.check_completed_copies =
1471 				ioat_check_completed_copies_cb;
1472 
1473 			config.features = RTE_VHOST_ASYNC_INORDER;
1474 
1475 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1476 				config, &channel_ops);
1477 		}
1478 	}
1479 
1480 	return 0;
1481 }
1482 
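/*
 * Vring state change callback: when the RX vring of an async device is
 * disabled, drain all in-flight async copies before returning.
 */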
1483 static int
1484 vring_state_changed(int vid, uint16_t queue_id, int enable)
1485 {
1486 	struct vhost_dev *vdev = NULL;
1487 
1488 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1489 		if (vdev->vid == vid)
1490 			break;
1491 	}
1492 	if (!vdev)
1493 		return -1;
1494 
1495 	if (queue_id != VIRTIO_RXQ)
1496 		return 0;
1497 
1498 	if (async_vhost_driver) {
1499 		if (!enable) {
1500 			uint16_t n_pkt = 0;
1501 			struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1502 
1503 			while (vdev->pkts_inflight) {
1504 				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1505 							m_cpl, vdev->pkts_inflight);
1506 				free_pkts(m_cpl, n_pkt);
1507 				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1508 			}
1509 		}
1510 	}
1511 
1512 	return 0;
1513 }
1514 
1515 /*
1516  * These callbacks allow devices to be added to the data core when
1517  * configuration has been fully completed.
1518  */
1519 static const struct vhost_device_ops virtio_net_device_ops =
1520 {
1521 	.new_device =  new_device,
1522 	.destroy_device = destroy_device,
1523 	.vring_state_changed = vring_state_changed,
1524 };
1525 
1526 /*
1527  * This thread wakes up periodically to print stats if the user has
1528  * enabled them.
1529  */
1530 static void *
1531 print_stats(__rte_unused void *arg)
1532 {
1533 	struct vhost_dev *vdev;
1534 	uint64_t tx_dropped, rx_dropped;
1535 	uint64_t tx, tx_total, rx, rx_total;
1536 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1537 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1538 
1539 	while(1) {
1540 		sleep(enable_stats);
1541 
1542 		/* Clear screen and move to top left */
1543 		printf("%s%s\n", clr, top_left);
1544 		printf("Device statistics =================================\n");
1545 
1546 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1547 			tx_total   = vdev->stats.tx_total;
1548 			tx         = vdev->stats.tx;
1549 			tx_dropped = tx_total - tx;
1550 
1551 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1552 				__ATOMIC_SEQ_CST);
1553 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1554 				__ATOMIC_SEQ_CST);
1555 			rx_dropped = rx_total - rx;
1556 
1557 			printf("Statistics for device %d\n"
1558 				"-----------------------\n"
1559 				"TX total:              %" PRIu64 "\n"
1560 				"TX dropped:            %" PRIu64 "\n"
1561 				"TX successful:         %" PRIu64 "\n"
1562 				"RX total:              %" PRIu64 "\n"
1563 				"RX dropped:            %" PRIu64 "\n"
1564 				"RX successful:         %" PRIu64 "\n",
1565 				vdev->vid,
1566 				tx_total, tx_dropped, tx,
1567 				rx_total, rx_dropped, rx);
1568 		}
1569 
1570 		printf("===================================================\n");
1571 
1572 		fflush(stdout);
1573 	}
1574 
1575 	return NULL;
1576 }
1577 
1578 static void
1579 unregister_drivers(int socket_num)
1580 {
1581 	int i, ret;
1582 
1583 	for (i = 0; i < socket_num; i++) {
1584 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1585 		if (ret != 0)
1586 			RTE_LOG(ERR, VHOST_CONFIG,
1587 				"Fail to unregister vhost driver for %s.\n",
1588 				socket_files + i * PATH_MAX);
1589 	}
1590 }
1591 
1592 /* When we receive a SIGINT signal, unregister the vhost driver */
1593 static void
1594 sigint_handler(__rte_unused int signum)
1595 {
1596 	/* Unregister vhost driver. */
1597 	unregister_drivers(nb_sockets);
1598 
1599 	exit(0);
1600 }
1601 
1602 /*
1603  * While creating an mbuf pool, one key thing is to figure out how
1604  * many mbuf entries are enough for our use. FYI, here are some
1605  * guidelines:
1606  *
1607  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1608  *
1609  * - For each switch core (a CPU core that does the packet switching), we
1610  *   also need to reserve some mbufs for receiving the packets from the
1611  *   virtio Tx queue. How many are enough depends on the usage. It's
1612  *   normally a simple calculation like the following:
1613  *
1614  *       MAX_PKT_BURST * max packet size / mbuf size
1615  *
1616  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1617  *
1618  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1619  *   mbufs for receiving the packets from the physical NIC device.
1620  *
1621  * - We also need to make sure, for each switch core, we have allocated
1622  *   enough mbufs to fill up the mbuf cache.
1623  */
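/*
 * A rough worked example, assuming the common defaults of MAX_PKT_BURST = 32,
 * MBUF_DATA_SIZE = 2176 and RTE_PKTMBUF_HEADROOM = 128: with TSO enabled
 * (mtu = 64KB), nr_mbufs_per_core is about (65536 + 2176) * 32 / 2048 ~= 1058
 * mbufs, plus nr_rx_desc (1024 by default) for the NIC RX path, i.e. roughly
 * 2K mbufs per switch core before the per-queue descriptor reservation is
 * added on top.
 */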
1624 static void
1625 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1626 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1627 {
1628 	uint32_t nr_mbufs;
1629 	uint32_t nr_mbufs_per_core;
1630 	uint32_t mtu = 1500;
1631 
1632 	if (mergeable)
1633 		mtu = 9000;
1634 	if (enable_tso)
1635 		mtu = 64 * 1024;
1636 
1637 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1638 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1639 	nr_mbufs_per_core += nr_rx_desc;
1640 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1641 
1642 	nr_mbufs  = nr_queues * nr_rx_desc;
1643 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1644 	nr_mbufs *= nr_port;
1645 
1646 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1647 					    nr_mbuf_cache, 0, mbuf_size,
1648 					    rte_socket_id());
1649 	if (mbuf_pool == NULL)
1650 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1651 }
1652 
1653 /*
1654  * Main function, does initialisation and calls the per-lcore functions.
1655  */
1656 int
1657 main(int argc, char *argv[])
1658 {
1659 	unsigned lcore_id, core_id = 0;
1660 	unsigned nb_ports, valid_num_ports;
1661 	int ret, i;
1662 	uint16_t portid;
1663 	static pthread_t tid;
1664 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1665 
1666 	signal(SIGINT, sigint_handler);
1667 
1668 	/* init EAL */
1669 	ret = rte_eal_init(argc, argv);
1670 	if (ret < 0)
1671 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1672 	argc -= ret;
1673 	argv += ret;
1674 
1675 	/* parse app arguments */
1676 	ret = us_vhost_parse_args(argc, argv);
1677 	if (ret < 0)
1678 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1679 
1680 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1681 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1682 
1683 		if (rte_lcore_is_enabled(lcore_id))
1684 			lcore_ids[core_id++] = lcore_id;
1685 	}
1686 
1687 	if (rte_lcore_count() > RTE_MAX_LCORE)
1688 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1689 
1690 	/* Get the number of physical ports. */
1691 	nb_ports = rte_eth_dev_count_avail();
1692 
1693 	/*
1694 	 * Update the global variable num_ports and the global array ports,
1695 	 * and get the value of valid_num_ports according to the number of system ports
1696 	 */
1697 	valid_num_ports = check_ports_num(nb_ports);
1698 
1699 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1700 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1701 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1702 		return -1;
1703 	}
1704 
1705 	/*
1706 	 * FIXME: here we are trying to allocate mbufs big enough for
1707 	 * @MAX_QUEUES, but the truth is we're never going to use that
1708 	 * many queues here. We probably should only do allocation for
1709 	 * those queues we are going to use.
1710 	 */
1711 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1712 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1713 
1714 	if (vm2vm_mode == VM2VM_HARDWARE) {
1715 		/* Enable VT loopback so that the NIC's L2 switch does the switching. */
1716 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1717 		RTE_LOG(DEBUG, VHOST_CONFIG,
1718 			"Enable loop back for L2 switch in vmdq.\n");
1719 	}
1720 
1721 	/* initialize all ports */
1722 	RTE_ETH_FOREACH_DEV(portid) {
1723 		/* skip ports that are not enabled */
1724 		if ((enabled_port_mask & (1 << portid)) == 0) {
1725 			RTE_LOG(INFO, VHOST_PORT,
1726 				"Skipping disabled port %d\n", portid);
1727 			continue;
1728 		}
1729 		if (port_init(portid) != 0)
1730 			rte_exit(EXIT_FAILURE,
1731 				"Cannot initialize network ports\n");
1732 	}
1733 
1734 	/* Enable stats if the user option is set. */
1735 	if (enable_stats) {
1736 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1737 					print_stats, NULL);
1738 		if (ret < 0)
1739 			rte_exit(EXIT_FAILURE,
1740 				"Cannot create print-stats thread\n");
1741 	}
1742 
1743 	/* Launch all data cores. */
1744 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1745 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1746 
1747 	if (client_mode)
1748 		flags |= RTE_VHOST_USER_CLIENT;
1749 
1750 	/* Register vhost user driver to handle vhost messages. */
1751 	for (i = 0; i < nb_sockets; i++) {
1752 		char *file = socket_files + i * PATH_MAX;
1753 
1754 		if (async_vhost_driver)
1755 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1756 
1757 		ret = rte_vhost_driver_register(file, flags);
1758 		if (ret != 0) {
1759 			unregister_drivers(i);
1760 			rte_exit(EXIT_FAILURE,
1761 				"vhost driver register failure.\n");
1762 		}
1763 
1764 		if (builtin_net_driver)
1765 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1766 
1767 		if (mergeable == 0) {
1768 			rte_vhost_driver_disable_features(file,
1769 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1770 		}
1771 
1772 		if (enable_tx_csum == 0) {
1773 			rte_vhost_driver_disable_features(file,
1774 				1ULL << VIRTIO_NET_F_CSUM);
1775 		}
1776 
1777 		if (enable_tso == 0) {
1778 			rte_vhost_driver_disable_features(file,
1779 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1780 			rte_vhost_driver_disable_features(file,
1781 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1782 			rte_vhost_driver_disable_features(file,
1783 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1784 			rte_vhost_driver_disable_features(file,
1785 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1786 		}
1787 
1788 		if (promiscuous) {
1789 			rte_vhost_driver_enable_features(file,
1790 				1ULL << VIRTIO_NET_F_CTRL_RX);
1791 		}
1792 
1793 		ret = rte_vhost_driver_callback_register(file,
1794 			&virtio_net_device_ops);
1795 		if (ret != 0) {
1796 			rte_exit(EXIT_FAILURE,
1797 				"failed to register vhost driver callbacks.\n");
1798 		}
1799 
1800 		if (rte_vhost_driver_start(file) < 0) {
1801 			rte_exit(EXIT_FAILURE,
1802 				"failed to start vhost driver.\n");
1803 		}
1804 	}
1805 
1806 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1807 		rte_eal_wait_lcore(lcore_id);
1808 
1809 	/* clean up the EAL */
1810 	rte_eal_cleanup();
1811 
1812 	return 0;
1813 }
1814