xref: /dpdk/examples/vhost/main.c (revision 0dff3f26d6faad4e51f75e5245f0387ee9bb0c6d)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "ioat.h"
29 #include "main.h"
30 
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34 
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37 
38 #define MBUF_CACHE_SIZE	128
39 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
40 
41 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
42 
43 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
45 
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
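/*
 * Worked example: with JUMBO_FRAME_MAX_SIZE = 0x2600 (9728 bytes),
 * RTE_ETHER_HDR_LEN = 14 and RTE_ETHER_CRC_LEN = 4, MAX_MTU evaluates
 * to 9728 - 18 = 9710 bytes.
 */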
48 
49 /* State of virtio device. */
50 #define DEVICE_MAC_LEARNING 0
51 #define DEVICE_RX			1
52 #define DEVICE_SAFE_REMOVE	2
53 
54 /* Configurable number of RX/TX ring descriptors */
55 #define RTE_TEST_RX_DESC_DEFAULT 1024
56 #define RTE_TEST_TX_DESC_DEFAULT 512
57 
58 #define INVALID_PORT_ID 0xFF
59 
60 /* mask of enabled ports */
61 static uint32_t enabled_port_mask = 0;
62 
63 /* Promiscuous mode */
64 static uint32_t promiscuous;
65 
66 /* number of devices/queues to support*/
67 static uint32_t num_queues = 0;
68 static uint32_t num_devices;
69 
70 static struct rte_mempool *mbuf_pool;
71 static int mergeable;
72 
73 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
74 typedef enum {
75 	VM2VM_DISABLED = 0,
76 	VM2VM_SOFTWARE = 1,
77 	VM2VM_HARDWARE = 2,
78 	VM2VM_LAST
79 } vm2vm_type;
80 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
81 
82 /* Enable stats. */
83 static uint32_t enable_stats = 0;
84 /* Enable retries on RX. */
85 static uint32_t enable_retry = 1;
86 
87 /* Disable TX checksum offload */
88 static uint32_t enable_tx_csum;
89 
90 /* Disable TSO offload */
91 static uint32_t enable_tso;
92 
93 static int client_mode;
94 
95 static int builtin_net_driver;
96 
97 static int async_vhost_driver;
98 
99 static char *dma_type;
100 
101 /* Specify the timeout (in microseconds) between retries on RX. */
102 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
103 /* Specify the number of retries on RX. */
104 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
105 
106 /* Socket file paths. Can be set by user */
107 static char *socket_files;
108 static int nb_sockets;
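/*
 * Note: socket_files is a single flat buffer holding nb_sockets entries of
 * PATH_MAX bytes each; entry i starts at socket_files + i * PATH_MAX
 * (see us_vhost_parse_socket_path() and unregister_drivers()).
 */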
109 
110 /* empty VMDq configuration structure. Filled in programmatically */
111 static struct rte_eth_conf vmdq_conf_default = {
112 	.rxmode = {
113 		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
114 		.split_hdr_size = 0,
115 		/*
116 		 * VLAN strip is necessary for 1G NICs such as the I350;
117 		 * it fixes a bug where IPv4 forwarding in the guest cannot
118 		 * forward packets from one virtio device to another.
119 		 */
120 		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
121 	},
122 
123 	.txmode = {
124 		.mq_mode = RTE_ETH_MQ_TX_NONE,
125 		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
126 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
127 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
128 			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
129 			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
130 	},
131 	.rx_adv_conf = {
132 		/*
133 		 * should be overridden separately in code with
134 		 * appropriate values
135 		 */
136 		.vmdq_rx_conf = {
137 			.nb_queue_pools = RTE_ETH_8_POOLS,
138 			.enable_default_pool = 0,
139 			.default_pool = 0,
140 			.nb_pool_maps = 0,
141 			.pool_map = {{0, 0},},
142 		},
143 	},
144 };
145 
146 
147 static unsigned lcore_ids[RTE_MAX_LCORE];
148 static uint16_t ports[RTE_MAX_ETHPORTS];
149 static unsigned num_ports = 0; /**< The number of ports specified in command line */
150 static uint16_t num_pf_queues, num_vmdq_queues;
151 static uint16_t vmdq_pool_base, vmdq_queue_base;
152 static uint16_t queues_per_pool;
153 
154 const uint16_t vlan_tags[] = {
155 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
156 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
157 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
158 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
159 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
160 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
161 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
162 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
163 };
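/*
 * Each vhost device with id "vid" is assigned vlan_tags[vid]; get_eth_conf()
 * maps entry i of this table to VMDQ pool i, so each device's traffic is
 * steered to its own pool by VLAN id.
 */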
164 
165 /* ethernet addresses of ports */
166 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
167 
168 static struct vhost_dev_tailq_list vhost_dev_list =
169 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
170 
171 static struct lcore_info lcore_info[RTE_MAX_LCORE];
172 
173 /* Used for queueing bursts of TX packets. */
174 struct mbuf_table {
175 	unsigned len;
176 	unsigned txq_id;
177 	struct rte_mbuf *m_table[MAX_PKT_BURST];
178 };
179 
180 struct vhost_bufftable {
181 	uint32_t len;
182 	uint64_t pre_tsc;
183 	struct rte_mbuf *m_table[MAX_PKT_BURST];
184 };
185 
186 /* TX queue for each data core. */
187 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
188 
189 /*
190  * Vhost TX buffer for each data core.
191  * Every data core maintains a TX buffer for every vhost device,
192  * which is used for batch pkts enqueue for higher performance.
193  */
194 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
195 
196 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
197 				 / US_PER_S * BURST_TX_DRAIN_US)
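/*
 * Illustrative calculation (assuming a 2.5 GHz TSC): MBUF_TABLE_DRAIN_TSC is
 * roughly (2,500,000,000 / 1,000,000) * 100 = 250,000 cycles, i.e. buffered
 * packets are flushed once they have been queued for more than ~100 us.
 */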
198 
199 static inline int
200 open_dma(const char *value)
201 {
202 	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
203 		return open_ioat(value);
204 
205 	return -1;
206 }
207 
208 /*
209  * Builds up the correct configuration for VMDQ VLAN pool map
210  * according to the pool & queue limits.
211  */
212 static inline int
213 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
214 {
215 	struct rte_eth_vmdq_rx_conf conf;
216 	struct rte_eth_vmdq_rx_conf *def_conf =
217 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
218 	unsigned i;
219 
220 	memset(&conf, 0, sizeof(conf));
221 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
222 	conf.nb_pool_maps = num_devices;
223 	conf.enable_loop_back = def_conf->enable_loop_back;
224 	conf.rx_mode = def_conf->rx_mode;
225 
226 	for (i = 0; i < conf.nb_pool_maps; i++) {
227 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
228 		conf.pool_map[i].pools = (1UL << i);
229 	}
230 
231 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
232 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
233 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
234 	return 0;
235 }
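/*
 * Example (a sketch assuming num_devices == 8): the loop above produces the
 * pool map {vlan 1000 -> pool 0, vlan 1001 -> pool 1, ..., vlan 1007 ->
 * pool 7}, with nb_queue_pools set to 8 (RTE_ETH_8_POOLS).
 */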
236 
237 /*
238  * Initialises a given port using global settings, with the RX buffers
239  * coming from the global mbuf_pool.
240  */
241 static inline int
242 port_init(uint16_t port)
243 {
244 	struct rte_eth_dev_info dev_info;
245 	struct rte_eth_conf port_conf;
246 	struct rte_eth_rxconf *rxconf;
247 	struct rte_eth_txconf *txconf;
248 	int16_t rx_rings, tx_rings;
249 	uint16_t rx_ring_size, tx_ring_size;
250 	int retval;
251 	uint16_t q;
252 
253 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
254 	retval = rte_eth_dev_info_get(port, &dev_info);
255 	if (retval != 0) {
256 		RTE_LOG(ERR, VHOST_PORT,
257 			"Error during getting device (port %u) info: %s\n",
258 			port, strerror(-retval));
259 
260 		return retval;
261 	}
262 
263 	rxconf = &dev_info.default_rxconf;
264 	txconf = &dev_info.default_txconf;
265 	rxconf->rx_drop_en = 1;
266 
267 	/* configure the number of supported virtio devices based on VMDQ limits */
268 	num_devices = dev_info.max_vmdq_pools;
269 
270 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
271 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
272 
273 	tx_rings = (uint16_t)rte_lcore_count();
274 
275 	if (mergeable) {
276 		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
277 			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
278 		else
279 			vmdq_conf_default.rxmode.mtu = MAX_MTU;
280 	}
281 
282 	/* Get port configuration. */
283 	retval = get_eth_conf(&port_conf, num_devices);
284 	if (retval < 0)
285 		return retval;
286 	/* NIC queues are divided into pf queues and vmdq queues.  */
287 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
288 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
289 	num_vmdq_queues = num_devices * queues_per_pool;
290 	num_queues = num_pf_queues + num_vmdq_queues;
291 	vmdq_queue_base = dev_info.vmdq_queue_base;
292 	vmdq_pool_base  = dev_info.vmdq_pool_base;
293 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
294 		num_pf_queues, num_devices, queues_per_pool);
295 
296 	if (!rte_eth_dev_is_valid_port(port))
297 		return -1;
298 
299 	rx_rings = (uint16_t)dev_info.max_rx_queues;
300 	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
301 		port_conf.txmode.offloads |=
302 			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
303 	/* Configure ethernet device. */
304 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
305 	if (retval != 0) {
306 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
307 			port, strerror(-retval));
308 		return retval;
309 	}
310 
311 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
312 		&tx_ring_size);
313 	if (retval != 0) {
314 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
315 			"for port %u: %s.\n", port, strerror(-retval));
316 		return retval;
317 	}
318 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
319 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
320 			"for Rx queues on port %u.\n", port);
321 		return -1;
322 	}
323 
324 	/* Setup the queues. */
325 	rxconf->offloads = port_conf.rxmode.offloads;
326 	for (q = 0; q < rx_rings; q ++) {
327 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
328 						rte_eth_dev_socket_id(port),
329 						rxconf,
330 						mbuf_pool);
331 		if (retval < 0) {
332 			RTE_LOG(ERR, VHOST_PORT,
333 				"Failed to setup rx queue %u of port %u: %s.\n",
334 				q, port, strerror(-retval));
335 			return retval;
336 		}
337 	}
338 	txconf->offloads = port_conf.txmode.offloads;
339 	for (q = 0; q < tx_rings; q ++) {
340 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
341 						rte_eth_dev_socket_id(port),
342 						txconf);
343 		if (retval < 0) {
344 			RTE_LOG(ERR, VHOST_PORT,
345 				"Failed to setup tx queue %u of port %u: %s.\n",
346 				q, port, strerror(-retval));
347 			return retval;
348 		}
349 	}
350 
351 	/* Start the device. */
352 	retval  = rte_eth_dev_start(port);
353 	if (retval < 0) {
354 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
355 			port, strerror(-retval));
356 		return retval;
357 	}
358 
359 	if (promiscuous) {
360 		retval = rte_eth_promiscuous_enable(port);
361 		if (retval != 0) {
362 			RTE_LOG(ERR, VHOST_PORT,
363 				"Failed to enable promiscuous mode on port %u: %s\n",
364 				port, rte_strerror(-retval));
365 			return retval;
366 		}
367 	}
368 
369 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
370 	if (retval < 0) {
371 		RTE_LOG(ERR, VHOST_PORT,
372 			"Failed to get MAC address on port %u: %s\n",
373 			port, rte_strerror(-retval));
374 		return retval;
375 	}
376 
377 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
378 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
379 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
380 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
381 
382 	return 0;
383 }
384 
385 /*
386  * Set socket file path.
387  */
388 static int
389 us_vhost_parse_socket_path(const char *q_arg)
390 {
391 	char *old;
392 
393 	/* reject socket paths that do not fit in PATH_MAX */
394 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
395 		return -1;
396 
397 	old = socket_files;
398 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
399 	if (socket_files == NULL) {
400 		free(old);
401 		return -1;
402 	}
403 
404 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
405 	nb_sockets++;
406 
407 	return 0;
408 }
409 
410 /*
411  * Parse the portmask provided at run time.
412  */
413 static int
414 parse_portmask(const char *portmask)
415 {
416 	char *end = NULL;
417 	unsigned long pm;
418 
419 	errno = 0;
420 
421 	/* parse hexadecimal string */
422 	pm = strtoul(portmask, &end, 16);
423 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
424 		return 0;
425 
426 	return pm;
427 
428 }
429 
430 /*
431  * Parse num options at run time.
432  */
433 static int
434 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
435 {
436 	char *end = NULL;
437 	unsigned long num;
438 
439 	errno = 0;
440 
441 	/* parse unsigned int string */
442 	num = strtoul(q_arg, &end, 10);
443 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
444 		return -1;
445 
446 	if (num > max_valid_value)
447 		return -1;
448 
449 	return num;
450 
451 }
452 
453 /*
454  * Display usage
455  */
456 static void
457 us_vhost_usage(const char *prgname)
458 {
459 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
460 	"		--vm2vm [0|1|2]\n"
461 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
462 	"		--socket-file <path>\n"
463 	"		--nb-devices ND\n"
464 	"		-p PORTMASK: Set mask for ports to be used by application\n"
465 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
466 	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
467 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
468 	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
469 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
470 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
471 	"		--socket-file: The path of the socket file.\n"
472 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
473 	"		--tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
474 	"		--client register a vhost-user socket as client mode.\n"
475 	"		--dma-type register the DMA type for your vhost async driver. Only \"ioat\" is supported for now.\n"
476 	"		--dmas register DMA channels for specific vhost devices.\n",
477 	       prgname);
478 }
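/*
 * Example invocation (illustrative only; the binary name and the EAL core/
 * memory options depend on the local build and platform):
 *
 *   ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 \
 *       --socket-file /tmp/vhost-net0.sock --vm2vm 1 --stats 2
 */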
479 
480 enum {
481 #define OPT_VM2VM               "vm2vm"
482 	OPT_VM2VM_NUM = 256,
483 #define OPT_RX_RETRY            "rx-retry"
484 	OPT_RX_RETRY_NUM,
485 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
486 	OPT_RX_RETRY_DELAY_NUM,
487 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
488 	OPT_RX_RETRY_NUMB_NUM,
489 #define OPT_MERGEABLE           "mergeable"
490 	OPT_MERGEABLE_NUM,
491 #define OPT_STATS               "stats"
492 	OPT_STATS_NUM,
493 #define OPT_SOCKET_FILE         "socket-file"
494 	OPT_SOCKET_FILE_NUM,
495 #define OPT_TX_CSUM             "tx-csum"
496 	OPT_TX_CSUM_NUM,
497 #define OPT_TSO                 "tso"
498 	OPT_TSO_NUM,
499 #define OPT_CLIENT              "client"
500 	OPT_CLIENT_NUM,
501 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
502 	OPT_BUILTIN_NET_DRIVER_NUM,
503 #define OPT_DMA_TYPE            "dma-type"
504 	OPT_DMA_TYPE_NUM,
505 #define OPT_DMAS                "dmas"
506 	OPT_DMAS_NUM,
507 };
508 
509 /*
510  * Parse the arguments given in the command line of the application.
511  */
512 static int
513 us_vhost_parse_args(int argc, char **argv)
514 {
515 	int opt, ret;
516 	int option_index;
517 	unsigned i;
518 	const char *prgname = argv[0];
519 	static struct option long_option[] = {
520 		{OPT_VM2VM, required_argument,
521 				NULL, OPT_VM2VM_NUM},
522 		{OPT_RX_RETRY, required_argument,
523 				NULL, OPT_RX_RETRY_NUM},
524 		{OPT_RX_RETRY_DELAY, required_argument,
525 				NULL, OPT_RX_RETRY_DELAY_NUM},
526 		{OPT_RX_RETRY_NUMB, required_argument,
527 				NULL, OPT_RX_RETRY_NUMB_NUM},
528 		{OPT_MERGEABLE, required_argument,
529 				NULL, OPT_MERGEABLE_NUM},
530 		{OPT_STATS, required_argument,
531 				NULL, OPT_STATS_NUM},
532 		{OPT_SOCKET_FILE, required_argument,
533 				NULL, OPT_SOCKET_FILE_NUM},
534 		{OPT_TX_CSUM, required_argument,
535 				NULL, OPT_TX_CSUM_NUM},
536 		{OPT_TSO, required_argument,
537 				NULL, OPT_TSO_NUM},
538 		{OPT_CLIENT, no_argument,
539 				NULL, OPT_CLIENT_NUM},
540 		{OPT_BUILTIN_NET_DRIVER, no_argument,
541 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
542 		{OPT_DMA_TYPE, required_argument,
543 				NULL, OPT_DMA_TYPE_NUM},
544 		{OPT_DMAS, required_argument,
545 				NULL, OPT_DMAS_NUM},
546 		{NULL, 0, 0, 0},
547 	};
548 
549 	/* Parse command line */
550 	while ((opt = getopt_long(argc, argv, "p:P",
551 			long_option, &option_index)) != EOF) {
552 		switch (opt) {
553 		/* Portmask */
554 		case 'p':
555 			enabled_port_mask = parse_portmask(optarg);
556 			if (enabled_port_mask == 0) {
557 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
558 				us_vhost_usage(prgname);
559 				return -1;
560 			}
561 			break;
562 
563 		case 'P':
564 			promiscuous = 1;
565 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
566 				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
567 				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
568 			break;
569 
570 		case OPT_VM2VM_NUM:
571 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
572 			if (ret == -1) {
573 				RTE_LOG(INFO, VHOST_CONFIG,
574 					"Invalid argument for "
575 					"vm2vm [0|1|2]\n");
576 				us_vhost_usage(prgname);
577 				return -1;
578 			}
579 			vm2vm_mode = (vm2vm_type)ret;
580 			break;
581 
582 		case OPT_RX_RETRY_NUM:
583 			ret = parse_num_opt(optarg, 1);
584 			if (ret == -1) {
585 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
586 				us_vhost_usage(prgname);
587 				return -1;
588 			}
589 			enable_retry = ret;
590 			break;
591 
592 		case OPT_TX_CSUM_NUM:
593 			ret = parse_num_opt(optarg, 1);
594 			if (ret == -1) {
595 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
596 				us_vhost_usage(prgname);
597 				return -1;
598 			}
599 			enable_tx_csum = ret;
600 			break;
601 
602 		case OPT_TSO_NUM:
603 			ret = parse_num_opt(optarg, 1);
604 			if (ret == -1) {
605 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
606 				us_vhost_usage(prgname);
607 				return -1;
608 			}
609 			enable_tso = ret;
610 			break;
611 
612 		case OPT_RX_RETRY_DELAY_NUM:
613 			ret = parse_num_opt(optarg, INT32_MAX);
614 			if (ret == -1) {
615 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
616 				us_vhost_usage(prgname);
617 				return -1;
618 			}
619 			burst_rx_delay_time = ret;
620 			break;
621 
622 		case OPT_RX_RETRY_NUMB_NUM:
623 			ret = parse_num_opt(optarg, INT32_MAX);
624 			if (ret == -1) {
625 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
626 				us_vhost_usage(prgname);
627 				return -1;
628 			}
629 			burst_rx_retry_num = ret;
630 			break;
631 
632 		case OPT_MERGEABLE_NUM:
633 			ret = parse_num_opt(optarg, 1);
634 			if (ret == -1) {
635 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
636 				us_vhost_usage(prgname);
637 				return -1;
638 			}
639 			mergeable = !!ret;
640 			break;
641 
642 		case OPT_STATS_NUM:
643 			ret = parse_num_opt(optarg, INT32_MAX);
644 			if (ret == -1) {
645 				RTE_LOG(INFO, VHOST_CONFIG,
646 					"Invalid argument for stats [0..N]\n");
647 				us_vhost_usage(prgname);
648 				return -1;
649 			}
650 			enable_stats = ret;
651 			break;
652 
653 		/* Set socket file path. */
654 		case OPT_SOCKET_FILE_NUM:
655 			if (us_vhost_parse_socket_path(optarg) == -1) {
656 				RTE_LOG(INFO, VHOST_CONFIG,
657 				"Invalid argument for socket name (Max %d characters)\n",
658 				PATH_MAX);
659 				us_vhost_usage(prgname);
660 				return -1;
661 			}
662 			break;
663 
664 		case OPT_DMA_TYPE_NUM:
665 			dma_type = optarg;
666 			break;
667 
668 		case OPT_DMAS_NUM:
669 			if (open_dma(optarg) == -1) {
670 				RTE_LOG(INFO, VHOST_CONFIG,
671 					"Wrong DMA args\n");
672 				us_vhost_usage(prgname);
673 				return -1;
674 			}
675 			async_vhost_driver = 1;
676 			break;
677 
678 		case OPT_CLIENT_NUM:
679 			client_mode = 1;
680 			break;
681 
682 		case OPT_BUILTIN_NET_DRIVER_NUM:
683 			builtin_net_driver = 1;
684 			break;
685 
686 		/* Invalid option - print options. */
687 		default:
688 			us_vhost_usage(prgname);
689 			return -1;
690 		}
691 	}
692 
693 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
694 		if (enabled_port_mask & (1 << i))
695 			ports[num_ports++] = i;
696 	}
697 
698 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
699 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
700 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
701 		return -1;
702 	}
703 
704 	return 0;
705 }
706 
707 /*
708  * Update the global variable num_ports and the ports[] array according to the
709  * number of ports present in the system, and return the number of valid ports.
710  */
711 static unsigned check_ports_num(unsigned nb_ports)
712 {
713 	unsigned valid_num_ports = num_ports;
714 	unsigned portid;
715 
716 	if (num_ports > nb_ports) {
717 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
718 			num_ports, nb_ports);
719 		num_ports = nb_ports;
720 	}
721 
722 	for (portid = 0; portid < num_ports; portid ++) {
723 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
724 			RTE_LOG(INFO, VHOST_PORT,
725 				"\nSpecified port ID(%u) is not valid\n",
726 				ports[portid]);
727 			ports[portid] = INVALID_PORT_ID;
728 			valid_num_ports--;
729 		}
730 	}
731 	return valid_num_ports;
732 }
733 
734 static __rte_always_inline struct vhost_dev *
735 find_vhost_dev(struct rte_ether_addr *mac)
736 {
737 	struct vhost_dev *vdev;
738 
739 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
740 		if (vdev->ready == DEVICE_RX &&
741 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
742 			return vdev;
743 	}
744 
745 	return NULL;
746 }
747 
748 /*
749  * This function learns the MAC address of the device and registers it, along
750  * with a VLAN tag, with a VMDQ pool.
751  */
752 static int
753 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
754 {
755 	struct rte_ether_hdr *pkt_hdr;
756 	int i, ret;
757 
758 	/* Learn MAC address of guest device from packet */
759 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
760 
761 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
762 		RTE_LOG(ERR, VHOST_DATA,
763 			"(%d) device is using a registered MAC!\n",
764 			vdev->vid);
765 		return -1;
766 	}
767 
768 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
769 		vdev->mac_address.addr_bytes[i] =
770 			pkt_hdr->src_addr.addr_bytes[i];
771 
772 	/* vlan_tag currently uses the device_id. */
773 	vdev->vlan_tag = vlan_tags[vdev->vid];
774 
775 	/* Print out VMDQ registration info. */
776 	RTE_LOG(INFO, VHOST_DATA,
777 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
778 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
779 		vdev->vlan_tag);
780 
781 	/* Register the MAC address. */
782 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
783 				(uint32_t)vdev->vid + vmdq_pool_base);
784 	if (ret)
785 		RTE_LOG(ERR, VHOST_DATA,
786 			"(%d) failed to add device MAC address to VMDQ\n",
787 			vdev->vid);
788 
789 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
790 
791 	/* Set device as ready for RX. */
792 	vdev->ready = DEVICE_RX;
793 
794 	return 0;
795 }
796 
797 /*
798  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
799  * queue before disabling RX on the device.
800  */
801 static inline void
802 unlink_vmdq(struct vhost_dev *vdev)
803 {
804 	unsigned i = 0;
805 	unsigned rx_count;
806 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
807 
808 	if (vdev->ready == DEVICE_RX) {
809 		/*clear MAC and VLAN settings*/
810 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
811 		for (i = 0; i < 6; i++)
812 			vdev->mac_address.addr_bytes[i] = 0;
813 
814 		vdev->vlan_tag = 0;
815 
816 		/*Clear out the receive buffers*/
817 		rx_count = rte_eth_rx_burst(ports[0],
818 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
819 
820 		while (rx_count) {
821 			for (i = 0; i < rx_count; i++)
822 				rte_pktmbuf_free(pkts_burst[i]);
823 
824 			rx_count = rte_eth_rx_burst(ports[0],
825 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
826 		}
827 
828 		vdev->ready = DEVICE_MAC_LEARNING;
829 	}
830 }
831 
832 static inline void
833 free_pkts(struct rte_mbuf **pkts, uint16_t n)
834 {
835 	while (n--)
836 		rte_pktmbuf_free(pkts[n]);
837 }
838 
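/*
 * Poll the async channel for enqueue copies the DMA engine has completed,
 * free the finished mbufs and decrement the device's in-flight counter.
 */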
839 static __rte_always_inline void
840 complete_async_pkts(struct vhost_dev *vdev)
841 {
842 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
843 	uint16_t complete_count;
844 
845 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
846 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
847 	if (complete_count) {
848 		free_pkts(p_cpl, complete_count);
849 		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
850 	}
851 
852 }
853 
854 static __rte_always_inline void
855 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
856 	    struct rte_mbuf *m)
857 {
858 	uint16_t ret;
859 
860 	if (builtin_net_driver) {
861 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
862 	} else {
863 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
864 	}
865 
866 	if (enable_stats) {
867 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
868 				__ATOMIC_SEQ_CST);
869 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
870 				__ATOMIC_SEQ_CST);
871 		src_vdev->stats.tx_total++;
872 		src_vdev->stats.tx += ret;
873 	}
874 }
875 
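/*
 * Flush the per-lcore TX buffer of @vdev through one of three enqueue paths:
 * the builtin net driver, the async (DMA-assisted) driver, or the regular
 * synchronous rte_vhost_enqueue_burst().
 */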
876 static __rte_always_inline void
877 drain_vhost(struct vhost_dev *vdev)
878 {
879 	uint16_t ret;
880 	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
881 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
882 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
883 
884 	if (builtin_net_driver) {
885 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
886 	} else if (async_vhost_driver) {
887 		uint16_t enqueue_fail = 0;
888 
889 		complete_async_pkts(vdev);
890 		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit);
891 		__atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
892 
893 		enqueue_fail = nr_xmit - ret;
894 		if (enqueue_fail)
895 			free_pkts(&m[ret], nr_xmit - ret);
896 	} else {
897 		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
898 						m, nr_xmit);
899 	}
900 
901 	if (enable_stats) {
902 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
903 				__ATOMIC_SEQ_CST);
904 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
905 				__ATOMIC_SEQ_CST);
906 	}
907 
908 	if (!async_vhost_driver)
909 		free_pkts(m, nr_xmit);
910 }
911 
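/*
 * Walk all vhost devices and flush any per-lcore TX buffer whose packets
 * have been pending for longer than MBUF_TABLE_DRAIN_TSC.
 */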
912 static __rte_always_inline void
913 drain_vhost_table(void)
914 {
915 	uint16_t lcore_id = rte_lcore_id();
916 	struct vhost_bufftable *vhost_txq;
917 	struct vhost_dev *vdev;
918 	uint64_t cur_tsc;
919 
920 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
921 		if (unlikely(vdev->remove == 1))
922 			continue;
923 
924 		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
925 						+ vdev->vid];
926 
927 		cur_tsc = rte_rdtsc();
928 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
929 				> MBUF_TABLE_DRAIN_TSC)) {
930 			RTE_LOG_DP(DEBUG, VHOST_DATA,
931 				"Vhost TX queue drained after timeout with burst size %u\n",
932 				vhost_txq->len);
933 			drain_vhost(vdev);
934 			vhost_txq->len = 0;
935 			vhost_txq->pre_tsc = cur_tsc;
936 		}
937 	}
938 }
939 
940 /*
941  * Check if the packet destination MAC address is for a local device. If so, put
942  * the packet on that device's RX queue. If not, return.
943  */
944 static __rte_always_inline int
945 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
946 {
947 	struct rte_ether_hdr *pkt_hdr;
948 	struct vhost_dev *dst_vdev;
949 	struct vhost_bufftable *vhost_txq;
950 	uint16_t lcore_id = rte_lcore_id();
951 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
952 
953 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
954 	if (!dst_vdev)
955 		return -1;
956 
957 	if (vdev->vid == dst_vdev->vid) {
958 		RTE_LOG_DP(DEBUG, VHOST_DATA,
959 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
960 			vdev->vid);
961 		return 0;
962 	}
963 
964 	RTE_LOG_DP(DEBUG, VHOST_DATA,
965 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
966 
967 	if (unlikely(dst_vdev->remove)) {
968 		RTE_LOG_DP(DEBUG, VHOST_DATA,
969 			"(%d) device is marked for removal\n", dst_vdev->vid);
970 		return 0;
971 	}
972 
973 	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
974 	vhost_txq->m_table[vhost_txq->len++] = m;
975 
976 	if (enable_stats) {
977 		vdev->stats.tx_total++;
978 		vdev->stats.tx++;
979 	}
980 
981 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
982 		drain_vhost(dst_vdev);
983 		vhost_txq->len = 0;
984 		vhost_txq->pre_tsc = rte_rdtsc();
985 	}
986 	return 0;
987 }
988 
989 /*
990  * Check if the destination MAC of a packet belongs to a local VM; if it does,
991  * get its VLAN tag and the length offset.
992  */
993 static __rte_always_inline int
994 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
995 	uint32_t *offset, uint16_t *vlan_tag)
996 {
997 	struct vhost_dev *dst_vdev;
998 	struct rte_ether_hdr *pkt_hdr =
999 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1000 
1001 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1002 	if (!dst_vdev)
1003 		return 0;
1004 
1005 	if (vdev->vid == dst_vdev->vid) {
1006 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1007 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1008 			vdev->vid);
1009 		return -1;
1010 	}
1011 
1012 	/*
1013 	 * HW VLAN strip reduces the packet length by the
1014 	 * length of the VLAN tag, so restore the packet
1015 	 * length by adding it back.
1016 	 */
1017 	*offset  = RTE_VLAN_HLEN;
1018 	*vlan_tag = vlan_tags[vdev->vid];
1019 
1020 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1021 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1022 		vdev->vid, dst_vdev->vid, *vlan_tag);
1023 
1024 	return 0;
1025 }
1026 
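/*
 * Prepare a large (LRO) packet for transmission on the physical port by
 * requesting TSO: parse the header lengths, set the TSO offload flags and
 * seed the TCP checksum with the pseudo-header checksum.
 */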
1027 static void virtio_tx_offload(struct rte_mbuf *m)
1028 {
1029 	struct rte_net_hdr_lens hdr_lens;
1030 	struct rte_ipv4_hdr *ipv4_hdr;
1031 	struct rte_tcp_hdr *tcp_hdr;
1032 	uint32_t ptype;
1033 	void *l3_hdr;
1034 
1035 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1036 	m->l2_len = hdr_lens.l2_len;
1037 	m->l3_len = hdr_lens.l3_len;
1038 	m->l4_len = hdr_lens.l4_len;
1039 
1040 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1041 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1042 		m->l2_len + m->l3_len);
1043 
1044 	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1045 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1046 		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1047 		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1048 		ipv4_hdr = l3_hdr;
1049 		ipv4_hdr->hdr_checksum = 0;
1050 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1051 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1052 		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1053 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1054 	}
1055 }
1056 
1057 static __rte_always_inline void
1058 do_drain_mbuf_table(struct mbuf_table *tx_q)
1059 {
1060 	uint16_t count;
1061 
1062 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1063 				 tx_q->m_table, tx_q->len);
1064 	if (unlikely(count < tx_q->len))
1065 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1066 
1067 	tx_q->len = 0;
1068 }
1069 
1070 /*
1071  * This function routes the TX packet to the correct interface. This
1072  * may be a local device or the physical port.
1073  */
1074 static __rte_always_inline void
1075 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1076 {
1077 	struct mbuf_table *tx_q;
1078 	unsigned offset = 0;
1079 	const uint16_t lcore_id = rte_lcore_id();
1080 	struct rte_ether_hdr *nh;
1081 
1082 
1083 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1084 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1085 		struct vhost_dev *vdev2;
1086 
1087 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1088 			if (vdev2 != vdev)
1089 				sync_virtio_xmit(vdev2, vdev, m);
1090 		}
1091 		goto queue2nic;
1092 	}
1093 
1094 	/*check if destination is local VM*/
1095 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1096 		return;
1097 
1098 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1099 		if (unlikely(find_local_dest(vdev, m, &offset,
1100 					     &vlan_tag) != 0)) {
1101 			rte_pktmbuf_free(m);
1102 			return;
1103 		}
1104 	}
1105 
1106 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1107 		"(%d) TX: MAC address is external\n", vdev->vid);
1108 
1109 queue2nic:
1110 
1111 	/*Add packet to the port tx queue*/
1112 	tx_q = &lcore_tx_queue[lcore_id];
1113 
1114 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1115 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1116 		/* Guest has inserted the vlan tag. */
1117 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1118 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1119 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1120 			(vh->vlan_tci != vlan_tag_be))
1121 			vh->vlan_tci = vlan_tag_be;
1122 	} else {
1123 		m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1124 
1125 		/*
1126 		 * Find the right seg to adjust the data len when offset is
1127 		 * bigger than tail room size.
1128 		 */
1129 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1130 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1131 				m->data_len += offset;
1132 			else {
1133 				struct rte_mbuf *seg = m;
1134 
1135 				while ((seg->next != NULL) &&
1136 					(offset > rte_pktmbuf_tailroom(seg)))
1137 					seg = seg->next;
1138 
1139 				seg->data_len += offset;
1140 			}
1141 			m->pkt_len += offset;
1142 		}
1143 
1144 		m->vlan_tci = vlan_tag;
1145 	}
1146 
1147 	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1148 		virtio_tx_offload(m);
1149 
1150 	tx_q->m_table[tx_q->len++] = m;
1151 	if (enable_stats) {
1152 		vdev->stats.tx_total++;
1153 		vdev->stats.tx++;
1154 	}
1155 
1156 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1157 		do_drain_mbuf_table(tx_q);
1158 }
1159 
1160 
1161 static __rte_always_inline void
1162 drain_mbuf_table(struct mbuf_table *tx_q)
1163 {
1164 	static uint64_t prev_tsc;
1165 	uint64_t cur_tsc;
1166 
1167 	if (tx_q->len == 0)
1168 		return;
1169 
1170 	cur_tsc = rte_rdtsc();
1171 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1172 		prev_tsc = cur_tsc;
1173 
1174 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1175 			"TX queue drained after timeout with burst size %u\n",
1176 			tx_q->len);
1177 		do_drain_mbuf_table(tx_q);
1178 	}
1179 }
1180 
1181 static __rte_always_inline void
1182 drain_eth_rx(struct vhost_dev *vdev)
1183 {
1184 	uint16_t rx_count, enqueue_count;
1185 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1186 
1187 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1188 				    pkts, MAX_PKT_BURST);
1189 
1190 	if (!rx_count)
1191 		return;
1192 
1193 	/*
1194 	 * When "enable_retry" is set, wait and retry when there are
1195 	 * not enough free slots in the queue to hold @rx_count packets,
1196 	 * to reduce packet loss.
1197 	 */
1198 	if (enable_retry &&
1199 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1200 			VIRTIO_RXQ))) {
1201 		uint32_t retry;
1202 
1203 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1204 			rte_delay_us(burst_rx_delay_time);
1205 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1206 					VIRTIO_RXQ))
1207 				break;
1208 		}
1209 	}
1210 
1211 	if (builtin_net_driver) {
1212 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1213 						pkts, rx_count);
1214 	} else if (async_vhost_driver) {
1215 		uint16_t enqueue_fail = 0;
1216 
1217 		complete_async_pkts(vdev);
1218 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1219 					VIRTIO_RXQ, pkts, rx_count);
1220 		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1221 
1222 		enqueue_fail = rx_count - enqueue_count;
1223 		if (enqueue_fail)
1224 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1225 
1226 	} else {
1227 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1228 						pkts, rx_count);
1229 	}
1230 
1231 	if (enable_stats) {
1232 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1233 				__ATOMIC_SEQ_CST);
1234 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1235 				__ATOMIC_SEQ_CST);
1236 	}
1237 
1238 	if (!async_vhost_driver)
1239 		free_pkts(pkts, rx_count);
1240 }
1241 
1242 static __rte_always_inline void
1243 drain_virtio_tx(struct vhost_dev *vdev)
1244 {
1245 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1246 	uint16_t count;
1247 	uint16_t i;
1248 
1249 	if (builtin_net_driver) {
1250 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1251 					pkts, MAX_PKT_BURST);
1252 	} else {
1253 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1254 					mbuf_pool, pkts, MAX_PKT_BURST);
1255 	}
1256 
1257 	/* setup VMDq for the first packet */
1258 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1259 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1260 			free_pkts(pkts, count);
1261 	}
1262 
1263 	for (i = 0; i < count; ++i)
1264 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1265 }
1266 
1267 /*
1268  * Main function of vhost-switch. It basically does:
1269  *
1270  * for each vhost device {
1271  *    - drain_eth_rx()
1272  *
1273  *      Which drains the host eth Rx queue linked to the vhost device,
1274  *      and delivers all of them to the guest virtio Rx ring associated
1275  *      with this vhost device.
1276  *
1277  *    - drain_virtio_tx()
1278  *
1279  *      Which drains the guest virtio Tx queue and delivers all of them
1280  *      to the target, which could be another vhost device, or the
1281  *      physical eth dev. The route is done in function "virtio_tx_route".
1282  * }
1283  */
1284 static int
1285 switch_worker(void *arg __rte_unused)
1286 {
1287 	unsigned i;
1288 	unsigned lcore_id = rte_lcore_id();
1289 	struct vhost_dev *vdev;
1290 	struct mbuf_table *tx_q;
1291 
1292 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1293 
1294 	tx_q = &lcore_tx_queue[lcore_id];
1295 	for (i = 0; i < rte_lcore_count(); i++) {
1296 		if (lcore_ids[i] == lcore_id) {
1297 			tx_q->txq_id = i;
1298 			break;
1299 		}
1300 	}
1301 
1302 	while(1) {
1303 		drain_mbuf_table(tx_q);
1304 		drain_vhost_table();
1305 		/*
1306 		 * Inform the configuration core that we have exited the
1307 		 * linked list and that no devices are in use if requested.
1308 		 */
1309 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1310 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1311 
1312 		/*
1313 		 * Process vhost devices
1314 		 */
1315 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1316 			      lcore_vdev_entry) {
1317 			if (unlikely(vdev->remove)) {
1318 				unlink_vmdq(vdev);
1319 				vdev->ready = DEVICE_SAFE_REMOVE;
1320 				continue;
1321 			}
1322 
1323 			if (likely(vdev->ready == DEVICE_RX))
1324 				drain_eth_rx(vdev);
1325 
1326 			if (likely(!vdev->remove))
1327 				drain_virtio_tx(vdev);
1328 		}
1329 	}
1330 
1331 	return 0;
1332 }
1333 
1334 /*
1335  * Remove a device from the specific data core linked list and from the
1336  * main linked list. Synchronization occurs through the use of the
1337  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1338  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1339  */
1340 static void
1341 destroy_device(int vid)
1342 {
1343 	struct vhost_dev *vdev = NULL;
1344 	int lcore;
1345 	uint16_t i;
1346 
1347 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1348 		if (vdev->vid == vid)
1349 			break;
1350 	}
1351 	if (!vdev)
1352 		return;
1353 	/*set the remove flag. */
1354 	vdev->remove = 1;
1355 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1356 		rte_pause();
1357 	}
1358 
1359 	for (i = 0; i < RTE_MAX_LCORE; i++)
1360 		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1361 
1362 	if (builtin_net_driver)
1363 		vs_vhost_net_remove(vdev);
1364 
1365 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1366 		     lcore_vdev_entry);
1367 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1368 
1369 
1370 	/* Set the dev_removal_flag on each lcore. */
1371 	RTE_LCORE_FOREACH_WORKER(lcore)
1372 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1373 
1374 	/*
1375 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1376 	 * we can be sure that they can no longer access the device removed
1377 	 * from the linked lists and that the devices are no longer in use.
1378 	 */
1379 	RTE_LCORE_FOREACH_WORKER(lcore) {
1380 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1381 			rte_pause();
1382 	}
1383 
1384 	lcore_info[vdev->coreid].device_num--;
1385 
1386 	RTE_LOG(INFO, VHOST_DATA,
1387 		"(%d) device has been removed from data core\n",
1388 		vdev->vid);
1389 
1390 	if (async_vhost_driver) {
1391 		uint16_t n_pkt = 0;
1392 		struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1393 
1394 		while (vdev->pkts_inflight) {
1395 			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1396 						m_cpl, vdev->pkts_inflight);
1397 			free_pkts(m_cpl, n_pkt);
1398 			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1399 		}
1400 
1401 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1402 	}
1403 
1404 	rte_free(vdev);
1405 }
1406 
1407 /*
1408  * A new device is added to a data core. First the device is added to the main linked list
1409  * and then allocated to a specific data core.
1410  */
1411 static int
1412 new_device(int vid)
1413 {
1414 	int lcore, core_add = 0;
1415 	uint16_t i;
1416 	uint32_t device_num_min = num_devices;
1417 	struct vhost_dev *vdev;
1418 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1419 	if (vdev == NULL) {
1420 		RTE_LOG(INFO, VHOST_DATA,
1421 			"(%d) couldn't allocate memory for vhost dev\n",
1422 			vid);
1423 		return -1;
1424 	}
1425 	vdev->vid = vid;
1426 
1427 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1428 		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1429 			= rte_zmalloc("vhost bufftable",
1430 				sizeof(struct vhost_bufftable),
1431 				RTE_CACHE_LINE_SIZE);
1432 
1433 		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1434 			RTE_LOG(INFO, VHOST_DATA,
1435 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1436 			return -1;
1437 		}
1438 	}
1439 
1440 	if (builtin_net_driver)
1441 		vs_vhost_net_setup(vdev);
1442 
1443 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
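	/*
	 * Each device owns one VMDQ pool of queues_per_pool queues; pools are
	 * laid out contiguously after vmdq_queue_base, so the first RX queue
	 * of this device's pool is vid * queues_per_pool + vmdq_queue_base.
	 */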
1444 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1445 
1446 	/*reset ready flag*/
1447 	vdev->ready = DEVICE_MAC_LEARNING;
1448 	vdev->remove = 0;
1449 
1450 	/* Find a suitable lcore to add the device. */
1451 	RTE_LCORE_FOREACH_WORKER(lcore) {
1452 		if (lcore_info[lcore].device_num < device_num_min) {
1453 			device_num_min = lcore_info[lcore].device_num;
1454 			core_add = lcore;
1455 		}
1456 	}
1457 	vdev->coreid = core_add;
1458 
1459 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1460 			  lcore_vdev_entry);
1461 	lcore_info[vdev->coreid].device_num++;
1462 
1463 	/* Disable notifications. */
1464 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1465 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1466 
1467 	RTE_LOG(INFO, VHOST_DATA,
1468 		"(%d) device has been added to data core %d\n",
1469 		vid, vdev->coreid);
1470 
1471 	if (async_vhost_driver) {
1472 		struct rte_vhost_async_config config = {0};
1473 		struct rte_vhost_async_channel_ops channel_ops;
1474 
1475 		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1476 			channel_ops.transfer_data = ioat_transfer_data_cb;
1477 			channel_ops.check_completed_copies =
1478 				ioat_check_completed_copies_cb;
1479 
1480 			config.features = RTE_VHOST_ASYNC_INORDER;
1481 
1482 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1483 				config, &channel_ops);
1484 		}
1485 	}
1486 
1487 	return 0;
1488 }
1489 
1490 static int
1491 vring_state_changed(int vid, uint16_t queue_id, int enable)
1492 {
1493 	struct vhost_dev *vdev = NULL;
1494 
1495 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1496 		if (vdev->vid == vid)
1497 			break;
1498 	}
1499 	if (!vdev)
1500 		return -1;
1501 
1502 	if (queue_id != VIRTIO_RXQ)
1503 		return 0;
1504 
1505 	if (async_vhost_driver) {
1506 		if (!enable) {
1507 			uint16_t n_pkt = 0;
1508 			struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1509 
1510 			while (vdev->pkts_inflight) {
1511 				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1512 							m_cpl, vdev->pkts_inflight);
1513 				free_pkts(m_cpl, n_pkt);
1514 				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1515 			}
1516 		}
1517 	}
1518 
1519 	return 0;
1520 }
1521 
1522 /*
1523  * These callbacks allow devices to be added to the data core once
1524  * configuration has been fully completed.
1525  */
1526 static const struct rte_vhost_device_ops virtio_net_device_ops =
1527 {
1528 	.new_device =  new_device,
1529 	.destroy_device = destroy_device,
1530 	.vring_state_changed = vring_state_changed,
1531 };
1532 
1533 /*
1534  * This thread wakes up periodically to print statistics if the user has
1535  * enabled them.
1536  */
1537 static void *
1538 print_stats(__rte_unused void *arg)
1539 {
1540 	struct vhost_dev *vdev;
1541 	uint64_t tx_dropped, rx_dropped;
1542 	uint64_t tx, tx_total, rx, rx_total;
1543 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1544 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1545 
1546 	while(1) {
1547 		sleep(enable_stats);
1548 
1549 		/* Clear screen and move to top left */
1550 		printf("%s%s\n", clr, top_left);
1551 		printf("Device statistics =================================\n");
1552 
1553 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1554 			tx_total   = vdev->stats.tx_total;
1555 			tx         = vdev->stats.tx;
1556 			tx_dropped = tx_total - tx;
1557 
1558 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1559 				__ATOMIC_SEQ_CST);
1560 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1561 				__ATOMIC_SEQ_CST);
1562 			rx_dropped = rx_total - rx;
1563 
1564 			printf("Statistics for device %d\n"
1565 				"-----------------------\n"
1566 				"TX total:              %" PRIu64 "\n"
1567 				"TX dropped:            %" PRIu64 "\n"
1568 				"TX successful:         %" PRIu64 "\n"
1569 				"RX total:              %" PRIu64 "\n"
1570 				"RX dropped:            %" PRIu64 "\n"
1571 				"RX successful:         %" PRIu64 "\n",
1572 				vdev->vid,
1573 				tx_total, tx_dropped, tx,
1574 				rx_total, rx_dropped, rx);
1575 		}
1576 
1577 		printf("===================================================\n");
1578 
1579 		fflush(stdout);
1580 	}
1581 
1582 	return NULL;
1583 }
1584 
1585 static void
1586 unregister_drivers(int socket_num)
1587 {
1588 	int i, ret;
1589 
1590 	for (i = 0; i < socket_num; i++) {
1591 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1592 		if (ret != 0)
1593 			RTE_LOG(ERR, VHOST_CONFIG,
1594 				"Fail to unregister vhost driver for %s.\n",
1595 				socket_files + i * PATH_MAX);
1596 	}
1597 }
1598 
1599 /* When we receive a SIGINT signal, unregister the vhost driver */
1600 static void
1601 sigint_handler(__rte_unused int signum)
1602 {
1603 	/* Unregister vhost driver. */
1604 	unregister_drivers(nb_sockets);
1605 
1606 	exit(0);
1607 }
1608 
1609 /*
1610  * While creating an mbuf pool, one key thing is to figure out how
1611  * many mbuf entries are enough for our use. FYI, here are some
1612  * guidelines:
1613  *
1614  * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1615  *
1616  * - For each switch core (a CPU core that does the packet switching),
1617  *   we also need to make some reservation for receiving the packets
1618  *   from the virtio Tx queue. How many are enough depends on the usage.
1619  *   It's normally a simple calculation like the following:
1620  *
1621  *       MAX_PKT_BURST * max packet size / mbuf size
1622  *
1623  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1624  *
1625  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1626  *   mbufs for receiving the packets from the physical NIC device.
1627  *
1628  * - We also need to make sure that, for each switch core, we have
1629  *   allocated enough mbufs to fill up the mbuf cache.
1630  */
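/*
 * Rough worked example (a sketch only; it assumes mergeable buffers are on,
 * so mtu = 9000, mbuf_size = RTE_MBUF_DEFAULT_BUF_SIZE = 2176 bytes with a
 * 128-byte headroom, and MAX_PKT_BURST = 32): the per-core term below is
 * (9000 + 2176) * 32 / (2176 - 128) ~= 174 mbufs; nr_rx_desc is then added
 * per core, the total is multiplied by the number of switch cores,
 * nr_queues * nr_rx_desc is added on top, and the sum is scaled by the
 * number of ports.
 */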
1631 static void
1632 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1633 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1634 {
1635 	uint32_t nr_mbufs;
1636 	uint32_t nr_mbufs_per_core;
1637 	uint32_t mtu = 1500;
1638 
1639 	if (mergeable)
1640 		mtu = 9000;
1641 	if (enable_tso)
1642 		mtu = 64 * 1024;
1643 
1644 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1645 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1646 	nr_mbufs_per_core += nr_rx_desc;
1647 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1648 
1649 	nr_mbufs  = nr_queues * nr_rx_desc;
1650 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1651 	nr_mbufs *= nr_port;
1652 
1653 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1654 					    nr_mbuf_cache, 0, mbuf_size,
1655 					    rte_socket_id());
1656 	if (mbuf_pool == NULL)
1657 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1658 }
1659 
1660 /*
1661  * Main function, does initialisation and calls the per-lcore functions.
1662  */
1663 int
1664 main(int argc, char *argv[])
1665 {
1666 	unsigned lcore_id, core_id = 0;
1667 	unsigned nb_ports, valid_num_ports;
1668 	int ret, i;
1669 	uint16_t portid;
1670 	static pthread_t tid;
1671 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1672 
1673 	signal(SIGINT, sigint_handler);
1674 
1675 	/* init EAL */
1676 	ret = rte_eal_init(argc, argv);
1677 	if (ret < 0)
1678 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1679 	argc -= ret;
1680 	argv += ret;
1681 
1682 	/* parse app arguments */
1683 	ret = us_vhost_parse_args(argc, argv);
1684 	if (ret < 0)
1685 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1686 
1687 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1688 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1689 
1690 		if (rte_lcore_is_enabled(lcore_id))
1691 			lcore_ids[core_id++] = lcore_id;
1692 	}
1693 
1694 	if (rte_lcore_count() > RTE_MAX_LCORE)
1695 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1696 
1697 	/* Get the number of physical ports. */
1698 	nb_ports = rte_eth_dev_count_avail();
1699 
1700 	/*
1701 	 * Update the global variable num_ports and the global array ports[],
1702 	 * and get the number of valid ports according to the system port count.
1703 	 */
1704 	valid_num_ports = check_ports_num(nb_ports);
1705 
1706 	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1707 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1708 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1709 		return -1;
1710 	}
1711 
1712 	/*
1713 	 * FIXME: here we are trying to allocate mbufs big enough for
1714 	 * @MAX_QUEUES, but the truth is we're never going to use that
1715 	 * many queues here. We probably should only do allocation for
1716 	 * those queues we are going to use.
1717 	 */
1718 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1719 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1720 
1721 	if (vm2vm_mode == VM2VM_HARDWARE) {
1722 		/* Enable VT loop back to let L2 switch to do it. */
1723 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1724 		RTE_LOG(DEBUG, VHOST_CONFIG,
1725 			"Enable loop back for L2 switch in vmdq.\n");
1726 	}
1727 
1728 	/* initialize all ports */
1729 	RTE_ETH_FOREACH_DEV(portid) {
1730 		/* skip ports that are not enabled */
1731 		if ((enabled_port_mask & (1 << portid)) == 0) {
1732 			RTE_LOG(INFO, VHOST_PORT,
1733 				"Skipping disabled port %d\n", portid);
1734 			continue;
1735 		}
1736 		if (port_init(portid) != 0)
1737 			rte_exit(EXIT_FAILURE,
1738 				"Cannot initialize network ports\n");
1739 	}
1740 
1741 	/* Enable stats if the user option is set. */
1742 	if (enable_stats) {
1743 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1744 					print_stats, NULL);
1745 		if (ret < 0)
1746 			rte_exit(EXIT_FAILURE,
1747 				"Cannot create print-stats thread\n");
1748 	}
1749 
1750 	/* Launch all data cores. */
1751 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1752 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1753 
1754 	if (client_mode)
1755 		flags |= RTE_VHOST_USER_CLIENT;
1756 
1757 	/* Register vhost user driver to handle vhost messages. */
1758 	for (i = 0; i < nb_sockets; i++) {
1759 		char *file = socket_files + i * PATH_MAX;
1760 
1761 		if (async_vhost_driver)
1762 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1763 
1764 		ret = rte_vhost_driver_register(file, flags);
1765 		if (ret != 0) {
1766 			unregister_drivers(i);
1767 			rte_exit(EXIT_FAILURE,
1768 				"vhost driver register failure.\n");
1769 		}
1770 
1771 		if (builtin_net_driver)
1772 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1773 
1774 		if (mergeable == 0) {
1775 			rte_vhost_driver_disable_features(file,
1776 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1777 		}
1778 
1779 		if (enable_tx_csum == 0) {
1780 			rte_vhost_driver_disable_features(file,
1781 				1ULL << VIRTIO_NET_F_CSUM);
1782 		}
1783 
1784 		if (enable_tso == 0) {
1785 			rte_vhost_driver_disable_features(file,
1786 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1787 			rte_vhost_driver_disable_features(file,
1788 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1789 			rte_vhost_driver_disable_features(file,
1790 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1791 			rte_vhost_driver_disable_features(file,
1792 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1793 		}
1794 
1795 		if (promiscuous) {
1796 			rte_vhost_driver_enable_features(file,
1797 				1ULL << VIRTIO_NET_F_CTRL_RX);
1798 		}
1799 
1800 		ret = rte_vhost_driver_callback_register(file,
1801 			&virtio_net_device_ops);
1802 		if (ret != 0) {
1803 			rte_exit(EXIT_FAILURE,
1804 				"failed to register vhost driver callbacks.\n");
1805 		}
1806 
1807 		if (rte_vhost_driver_start(file) < 0) {
1808 			rte_exit(EXIT_FAILURE,
1809 				"failed to start vhost driver.\n");
1810 		}
1811 	}
1812 
1813 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1814 		rte_eal_wait_lcore(lcore_id);
1815 
1816 	/* clean up the EAL */
1817 	rte_eal_cleanup();
1818 
1819 	return 0;
1820 }
1821