xref: /dpdk/examples/vhost/main.c (revision a7db3afce75346832059d8bfe54a8f81945fb213)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "ioat.h"
29 #include "main.h"
30 
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34 
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37 
38 #define MBUF_CACHE_SIZE	128
39 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
40 
41 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
42 
43 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
45 
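/* 0x2600 = 9728 bytes: room for a ~9000-byte jumbo frame plus Ethernet/VLAN headers. */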
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47 
48 /* State of virtio device. */
49 #define DEVICE_MAC_LEARNING 0
50 #define DEVICE_RX			1
51 #define DEVICE_SAFE_REMOVE	2
52 
53 /* Configurable number of RX/TX ring descriptors */
54 #define RTE_TEST_RX_DESC_DEFAULT 1024
55 #define RTE_TEST_TX_DESC_DEFAULT 512
56 
57 #define INVALID_PORT_ID 0xFF
58 
59 /* mask of enabled ports */
60 static uint32_t enabled_port_mask = 0;
61 
62 /* Promiscuous mode */
63 static uint32_t promiscuous;
64 
65 /* number of devices/queues to support */
66 static uint32_t num_queues = 0;
67 static uint32_t num_devices;
68 
69 static struct rte_mempool *mbuf_pool;
70 static int mergeable;
71 
72 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
73 typedef enum {
74 	VM2VM_DISABLED = 0,
75 	VM2VM_SOFTWARE = 1,
76 	VM2VM_HARDWARE = 2,
77 	VM2VM_LAST
78 } vm2vm_type;
79 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
80 
81 /* Enable stats. */
82 static uint32_t enable_stats = 0;
83 /* Enable retries on RX. */
84 static uint32_t enable_retry = 1;
85 
86 /* Disable TX checksum offload */
87 static uint32_t enable_tx_csum;
88 
89 /* Disable TSO offload */
90 static uint32_t enable_tso;
91 
92 static int client_mode;
93 
94 static int builtin_net_driver;
95 
96 static int async_vhost_driver;
97 
98 static char *dma_type;
99 
100 /* Specify the timeout (in microseconds) between retries on RX. */
101 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
102 /* Specify the number of retries on RX. */
103 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
104 
105 /* Socket file paths. Can be set by user */
106 static char *socket_files;
107 static int nb_sockets;
108 
109 /* Empty VMDQ configuration structure. Filled in programmatically. */
110 static struct rte_eth_conf vmdq_conf_default = {
111 	.rxmode = {
112 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
113 		.split_hdr_size = 0,
114 		/*
115 		 * VLAN stripping is necessary for 1G NICs such as I350;
116 		 * it fixes a bug where IPv4 forwarding in the guest cannot
117 		 * forward packets from one virtio dev to another virtio dev.
118 		 */
119 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
120 	},
121 
122 	.txmode = {
123 		.mq_mode = ETH_MQ_TX_NONE,
124 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
125 			     DEV_TX_OFFLOAD_TCP_CKSUM |
126 			     DEV_TX_OFFLOAD_VLAN_INSERT |
127 			     DEV_TX_OFFLOAD_MULTI_SEGS |
128 			     DEV_TX_OFFLOAD_TCP_TSO),
129 	},
130 	.rx_adv_conf = {
131 		/*
132 		 * should be overridden separately in code with
133 		 * appropriate values
134 		 */
135 		.vmdq_rx_conf = {
136 			.nb_queue_pools = ETH_8_POOLS,
137 			.enable_default_pool = 0,
138 			.default_pool = 0,
139 			.nb_pool_maps = 0,
140 			.pool_map = {{0, 0},},
141 		},
142 	},
143 };
144 
145 
146 static unsigned lcore_ids[RTE_MAX_LCORE];
147 static uint16_t ports[RTE_MAX_ETHPORTS];
148 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
149 static uint16_t num_pf_queues, num_vmdq_queues;
150 static uint16_t vmdq_pool_base, vmdq_queue_base;
151 static uint16_t queues_per_pool;
152 
153 const uint16_t vlan_tags[] = {
154 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
155 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
156 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
157 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
158 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
159 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
160 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
161 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
162 };
163 
164 /* ethernet addresses of ports */
165 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
166 
167 static struct vhost_dev_tailq_list vhost_dev_list =
168 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
169 
170 static struct lcore_info lcore_info[RTE_MAX_LCORE];
171 
172 /* Used for queueing bursts of TX packets. */
173 struct mbuf_table {
174 	unsigned len;
175 	unsigned txq_id;
176 	struct rte_mbuf *m_table[MAX_PKT_BURST];
177 };
178 
179 struct vhost_bufftable {
180 	uint32_t len;
181 	uint64_t pre_tsc;
182 	struct rte_mbuf *m_table[MAX_PKT_BURST];
183 };
184 
185 /* TX queue for each data core. */
186 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
187 
188 /*
189  * Vhost TX buffer for each data core.
190  * Every data core maintains a TX buffer for every vhost device,
191  * which is used to batch packet enqueues for higher performance.
192  */
193 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
194 
195 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
196 				 / US_PER_S * BURST_TX_DRAIN_US)
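
/*
 * A worked example of the drain interval above, assuming an illustrative
 * TSC frequency of 2.0 GHz (not taken from this file):
 *
 *     (2,000,000,000 + 1,000,000 - 1) / 1,000,000 = 2000 cycles per us
 *     2000 * BURST_TX_DRAIN_US (100)              = ~200,000 TSC cycles
 *
 * i.e. a partially filled TX burst is flushed roughly every 100 us.
 */
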
197 #define VLAN_HLEN       4
198 
199 static inline int
200 open_dma(const char *value)
201 {
202 	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
203 		return open_ioat(value);
204 
205 	return -1;
206 }
207 
208 /*
209  * Builds up the correct configuration for VMDQ VLAN pool map
210  * according to the pool & queue limits.
211  */
212 static inline int
213 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
214 {
215 	struct rte_eth_vmdq_rx_conf conf;
216 	struct rte_eth_vmdq_rx_conf *def_conf =
217 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
218 	unsigned i;
219 
220 	memset(&conf, 0, sizeof(conf));
221 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
222 	conf.nb_pool_maps = num_devices;
223 	conf.enable_loop_back = def_conf->enable_loop_back;
224 	conf.rx_mode = def_conf->rx_mode;
225 
226 	for (i = 0; i < conf.nb_pool_maps; i++) {
227 		conf.pool_map[i].vlan_id = vlan_tags[i];
228 		conf.pool_map[i].pools = (1UL << i);
229 	}
230 
231 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
232 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
233 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
234 	return 0;
235 }
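
/*
 * Illustration of the mapping built above (values are examples only):
 * with num_devices == 4, pool_map[0] = {vlan_id 1000, pools 1 << 0},
 * pool_map[1] = {1001, 1 << 1}, pool_map[2] = {1002, 1 << 2} and
 * pool_map[3] = {1003, 1 << 3}, so each VMDQ pool receives exactly one
 * VLAN from the vlan_tags[] table.
 */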
236 
237 /*
238  * Initialises a given port using global settings, with the RX buffers
239  * coming from the mbuf_pool passed as a parameter.
240  */
241 static inline int
242 port_init(uint16_t port)
243 {
244 	struct rte_eth_dev_info dev_info;
245 	struct rte_eth_conf port_conf;
246 	struct rte_eth_rxconf *rxconf;
247 	struct rte_eth_txconf *txconf;
248 	int16_t rx_rings, tx_rings;
249 	uint16_t rx_ring_size, tx_ring_size;
250 	int retval;
251 	uint16_t q;
252 
253 	/* The max pool number from dev_info will be used to validate the pool number specified on the command line */
254 	retval = rte_eth_dev_info_get(port, &dev_info);
255 	if (retval != 0) {
256 		RTE_LOG(ERR, VHOST_PORT,
257 			"Error during getting device (port %u) info: %s\n",
258 			port, strerror(-retval));
259 
260 		return retval;
261 	}
262 
263 	rxconf = &dev_info.default_rxconf;
264 	txconf = &dev_info.default_txconf;
265 	rxconf->rx_drop_en = 1;
266 
267 	/* Configure the number of supported virtio devices based on VMDQ limits */
268 	num_devices = dev_info.max_vmdq_pools;
269 
270 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
271 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
272 
273 	tx_rings = (uint16_t)rte_lcore_count();
274 
275 	/* Get port configuration. */
276 	retval = get_eth_conf(&port_conf, num_devices);
277 	if (retval < 0)
278 		return retval;
279 	/* NIC queues are divided into pf queues and vmdq queues.  */
280 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
281 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
282 	num_vmdq_queues = num_devices * queues_per_pool;
283 	num_queues = num_pf_queues + num_vmdq_queues;
284 	vmdq_queue_base = dev_info.vmdq_queue_base;
285 	vmdq_pool_base  = dev_info.vmdq_pool_base;
286 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
287 		num_pf_queues, num_devices, queues_per_pool);
288 
289 	if (!rte_eth_dev_is_valid_port(port))
290 		return -1;
291 
292 	rx_rings = (uint16_t)dev_info.max_rx_queues;
293 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
294 		port_conf.txmode.offloads |=
295 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
296 	/* Configure ethernet device. */
297 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
298 	if (retval != 0) {
299 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
300 			port, strerror(-retval));
301 		return retval;
302 	}
303 
304 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
305 		&tx_ring_size);
306 	if (retval != 0) {
307 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
308 			"for port %u: %s.\n", port, strerror(-retval));
309 		return retval;
310 	}
311 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
312 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
313 			"for Rx queues on port %u.\n", port);
314 		return -1;
315 	}
316 
317 	/* Setup the queues. */
318 	rxconf->offloads = port_conf.rxmode.offloads;
319 	for (q = 0; q < rx_rings; q++) {
320 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
321 						rte_eth_dev_socket_id(port),
322 						rxconf,
323 						mbuf_pool);
324 		if (retval < 0) {
325 			RTE_LOG(ERR, VHOST_PORT,
326 				"Failed to setup rx queue %u of port %u: %s.\n",
327 				q, port, strerror(-retval));
328 			return retval;
329 		}
330 	}
331 	txconf->offloads = port_conf.txmode.offloads;
332 	for (q = 0; q < tx_rings; q++) {
333 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
334 						rte_eth_dev_socket_id(port),
335 						txconf);
336 		if (retval < 0) {
337 			RTE_LOG(ERR, VHOST_PORT,
338 				"Failed to setup tx queue %u of port %u: %s.\n",
339 				q, port, strerror(-retval));
340 			return retval;
341 		}
342 	}
343 
344 	/* Start the device. */
345 	retval  = rte_eth_dev_start(port);
346 	if (retval < 0) {
347 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
348 			port, strerror(-retval));
349 		return retval;
350 	}
351 
352 	if (promiscuous) {
353 		retval = rte_eth_promiscuous_enable(port);
354 		if (retval != 0) {
355 			RTE_LOG(ERR, VHOST_PORT,
356 				"Failed to enable promiscuous mode on port %u: %s\n",
357 				port, rte_strerror(-retval));
358 			return retval;
359 		}
360 	}
361 
362 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
363 	if (retval < 0) {
364 		RTE_LOG(ERR, VHOST_PORT,
365 			"Failed to get MAC address on port %u: %s\n",
366 			port, rte_strerror(-retval));
367 		return retval;
368 	}
369 
370 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
371 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
372 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
373 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
374 
375 	return 0;
376 }
377 
378 /*
379  * Set socket file path.
380  */
381 static int
382 us_vhost_parse_socket_path(const char *q_arg)
383 {
384 	char *old;
385 
386 	/* reject paths that do not fit within PATH_MAX */
387 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
388 		return -1;
389 
390 	old = socket_files;
391 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
392 	if (socket_files == NULL) {
393 		free(old);
394 		return -1;
395 	}
396 
397 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
398 	nb_sockets++;
399 
400 	return 0;
401 }
402 
403 /*
404  * Parse the portmask provided at run time.
405  */
406 static int
407 parse_portmask(const char *portmask)
408 {
409 	char *end = NULL;
410 	unsigned long pm;
411 
412 	errno = 0;
413 
414 	/* parse hexadecimal string */
415 	pm = strtoul(portmask, &end, 16);
416 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
417 		return 0;
418 
419 	return pm;
420 
421 }
422 
423 /*
424  * Parse num options at run time.
425  */
426 static int
427 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
428 {
429 	char *end = NULL;
430 	unsigned long num;
431 
432 	errno = 0;
433 
434 	/* parse unsigned int string */
435 	num = strtoul(q_arg, &end, 10);
436 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
437 		return -1;
438 
439 	if (num > max_valid_value)
440 		return -1;
441 
442 	return num;
443 
444 }
445 
446 /*
447  * Display usage
448  */
449 static void
450 us_vhost_usage(const char *prgname)
451 {
452 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
453 	"		--vm2vm [0|1|2]\n"
454 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
455 	"		--socket-file <path>\n"
456 	"		--nb-devices ND\n"
457 	"		-p PORTMASK: Set mask for ports to be used by application\n"
458 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
459 	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if the destination queue is full\n"
460 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on RX are enabled\n"
461 	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if retries on RX are enabled\n"
462 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
463 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
464 	"		--socket-file: The path of the socket file.\n"
465 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
466 	"		--tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
467 	"		--client register a vhost-user socket in client mode.\n"
468 	"		--dma-type register the DMA type for the vhost async driver. Only \"ioat\" is supported for now.\n"
469 	"		--dmas register a DMA channel for a specific vhost device.\n"
470 	       prgname);
471 }
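
/*
 * A hypothetical invocation (binary name, core list, socket path and DMA BDF
 * are placeholders, not taken from this file):
 *
 *     ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --mergeable 1 --stats 2 \
 *         --socket-file /tmp/vhost-net0.sock --client \
 *         --dma-type ioat --dmas [txd0@00:04.0]
 *
 * EAL options come before "--"; everything after it is parsed by
 * us_vhost_parse_args() below.
 */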
472 
473 enum {
474 #define OPT_VM2VM               "vm2vm"
475 	OPT_VM2VM_NUM = 256,
476 #define OPT_RX_RETRY            "rx-retry"
477 	OPT_RX_RETRY_NUM,
478 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
479 	OPT_RX_RETRY_DELAY_NUM,
480 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
481 	OPT_RX_RETRY_NUMB_NUM,
482 #define OPT_MERGEABLE           "mergeable"
483 	OPT_MERGEABLE_NUM,
484 #define OPT_STATS               "stats"
485 	OPT_STATS_NUM,
486 #define OPT_SOCKET_FILE         "socket-file"
487 	OPT_SOCKET_FILE_NUM,
488 #define OPT_TX_CSUM             "tx-csum"
489 	OPT_TX_CSUM_NUM,
490 #define OPT_TSO                 "tso"
491 	OPT_TSO_NUM,
492 #define OPT_CLIENT              "client"
493 	OPT_CLIENT_NUM,
494 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
495 	OPT_BUILTIN_NET_DRIVER_NUM,
496 #define OPT_DMA_TYPE            "dma-type"
497 	OPT_DMA_TYPE_NUM,
498 #define OPT_DMAS                "dmas"
499 	OPT_DMAS_NUM,
500 };
501 
502 /*
503  * Parse the arguments given in the command line of the application.
504  */
505 static int
506 us_vhost_parse_args(int argc, char **argv)
507 {
508 	int opt, ret;
509 	int option_index;
510 	unsigned i;
511 	const char *prgname = argv[0];
512 	static struct option long_option[] = {
513 		{OPT_VM2VM, required_argument,
514 				NULL, OPT_VM2VM_NUM},
515 		{OPT_RX_RETRY, required_argument,
516 				NULL, OPT_RX_RETRY_NUM},
517 		{OPT_RX_RETRY_DELAY, required_argument,
518 				NULL, OPT_RX_RETRY_DELAY_NUM},
519 		{OPT_RX_RETRY_NUMB, required_argument,
520 				NULL, OPT_RX_RETRY_NUMB_NUM},
521 		{OPT_MERGEABLE, required_argument,
522 				NULL, OPT_MERGEABLE_NUM},
523 		{OPT_STATS, required_argument,
524 				NULL, OPT_STATS_NUM},
525 		{OPT_SOCKET_FILE, required_argument,
526 				NULL, OPT_SOCKET_FILE_NUM},
527 		{OPT_TX_CSUM, required_argument,
528 				NULL, OPT_TX_CSUM_NUM},
529 		{OPT_TSO, required_argument,
530 				NULL, OPT_TSO_NUM},
531 		{OPT_CLIENT, no_argument,
532 				NULL, OPT_CLIENT_NUM},
533 		{OPT_BUILTIN_NET_DRIVER, no_argument,
534 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
535 		{OPT_DMA_TYPE, required_argument,
536 				NULL, OPT_DMA_TYPE_NUM},
537 		{OPT_DMAS, required_argument,
538 				NULL, OPT_DMAS_NUM},
539 		{NULL, 0, 0, 0},
540 	};
541 
542 	/* Parse command line */
543 	while ((opt = getopt_long(argc, argv, "p:P",
544 			long_option, &option_index)) != EOF) {
545 		switch (opt) {
546 		/* Portmask */
547 		case 'p':
548 			enabled_port_mask = parse_portmask(optarg);
549 			if (enabled_port_mask == 0) {
550 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
551 				us_vhost_usage(prgname);
552 				return -1;
553 			}
554 			break;
555 
556 		case 'P':
557 			promiscuous = 1;
558 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
559 				ETH_VMDQ_ACCEPT_BROADCAST |
560 				ETH_VMDQ_ACCEPT_MULTICAST;
561 			break;
562 
563 		case OPT_VM2VM_NUM:
564 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
565 			if (ret == -1) {
566 				RTE_LOG(INFO, VHOST_CONFIG,
567 					"Invalid argument for "
568 					"vm2vm [0|1|2]\n");
569 				us_vhost_usage(prgname);
570 				return -1;
571 			}
572 			vm2vm_mode = (vm2vm_type)ret;
573 			break;
574 
575 		case OPT_RX_RETRY_NUM:
576 			ret = parse_num_opt(optarg, 1);
577 			if (ret == -1) {
578 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
579 				us_vhost_usage(prgname);
580 				return -1;
581 			}
582 			enable_retry = ret;
583 			break;
584 
585 		case OPT_TX_CSUM_NUM:
586 			ret = parse_num_opt(optarg, 1);
587 			if (ret == -1) {
588 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
589 				us_vhost_usage(prgname);
590 				return -1;
591 			}
592 			enable_tx_csum = ret;
593 			break;
594 
595 		case OPT_TSO_NUM:
596 			ret = parse_num_opt(optarg, 1);
597 			if (ret == -1) {
598 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
599 				us_vhost_usage(prgname);
600 				return -1;
601 			}
602 			enable_tso = ret;
603 			break;
604 
605 		case OPT_RX_RETRY_DELAY_NUM:
606 			ret = parse_num_opt(optarg, INT32_MAX);
607 			if (ret == -1) {
608 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
609 				us_vhost_usage(prgname);
610 				return -1;
611 			}
612 			burst_rx_delay_time = ret;
613 			break;
614 
615 		case OPT_RX_RETRY_NUMB_NUM:
616 			ret = parse_num_opt(optarg, INT32_MAX);
617 			if (ret == -1) {
618 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
619 				us_vhost_usage(prgname);
620 				return -1;
621 			}
622 			burst_rx_retry_num = ret;
623 			break;
624 
625 		case OPT_MERGEABLE_NUM:
626 			ret = parse_num_opt(optarg, 1);
627 			if (ret == -1) {
628 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
629 				us_vhost_usage(prgname);
630 				return -1;
631 			}
632 			mergeable = !!ret;
633 			if (ret) {
634 				vmdq_conf_default.rxmode.offloads |=
635 					DEV_RX_OFFLOAD_JUMBO_FRAME;
636 				vmdq_conf_default.rxmode.max_rx_pkt_len
637 					= JUMBO_FRAME_MAX_SIZE;
638 			}
639 			break;
640 
641 		case OPT_STATS_NUM:
642 			ret = parse_num_opt(optarg, INT32_MAX);
643 			if (ret == -1) {
644 				RTE_LOG(INFO, VHOST_CONFIG,
645 					"Invalid argument for stats [0..N]\n");
646 				us_vhost_usage(prgname);
647 				return -1;
648 			}
649 			enable_stats = ret;
650 			break;
651 
652 		/* Set socket file path. */
653 		case OPT_SOCKET_FILE_NUM:
654 			if (us_vhost_parse_socket_path(optarg) == -1) {
655 				RTE_LOG(INFO, VHOST_CONFIG,
656 				"Invalid argument for socket name (Max %d characters)\n",
657 				PATH_MAX);
658 				us_vhost_usage(prgname);
659 				return -1;
660 			}
661 			break;
662 
663 		case OPT_DMA_TYPE_NUM:
664 			dma_type = optarg;
665 			break;
666 
667 		case OPT_DMAS_NUM:
668 			if (open_dma(optarg) == -1) {
669 				RTE_LOG(INFO, VHOST_CONFIG,
670 					"Wrong DMA args\n");
671 				us_vhost_usage(prgname);
672 				return -1;
673 			}
674 			async_vhost_driver = 1;
675 			break;
676 
677 		case OPT_CLIENT_NUM:
678 			client_mode = 1;
679 			break;
680 
681 		case OPT_BUILTIN_NET_DRIVER_NUM:
682 			builtin_net_driver = 1;
683 			break;
684 
685 		/* Invalid option - print options. */
686 		default:
687 			us_vhost_usage(prgname);
688 			return -1;
689 		}
690 	}
691 
692 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
693 		if (enabled_port_mask & (1 << i))
694 			ports[num_ports++] = i;
695 	}
696 
697 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
698 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
699 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
700 		return -1;
701 	}
702 
703 	return 0;
704 }
705 
706 /*
707  * Update the global var NUM_PORTS and array PORTS according to the number of
708  * system ports, and return the number of valid ports.
709  */
710 static unsigned check_ports_num(unsigned nb_ports)
711 {
712 	unsigned valid_num_ports = num_ports;
713 	unsigned portid;
714 
715 	if (num_ports > nb_ports) {
716 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
717 			num_ports, nb_ports);
718 		num_ports = nb_ports;
719 	}
720 
721 	for (portid = 0; portid < num_ports; portid++) {
722 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
723 			RTE_LOG(INFO, VHOST_PORT,
724 				"\nSpecified port ID(%u) is not valid\n",
725 				ports[portid]);
726 			ports[portid] = INVALID_PORT_ID;
727 			valid_num_ports--;
728 		}
729 	}
730 	return valid_num_ports;
731 }
732 
733 static __rte_always_inline struct vhost_dev *
734 find_vhost_dev(struct rte_ether_addr *mac)
735 {
736 	struct vhost_dev *vdev;
737 
738 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
739 		if (vdev->ready == DEVICE_RX &&
740 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
741 			return vdev;
742 	}
743 
744 	return NULL;
745 }
746 
747 /*
748  * This function learns the MAC address of the device and registers it, along with a
749  * VLAN tag, with a VMDQ pool.
750  */
751 static int
752 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
753 {
754 	struct rte_ether_hdr *pkt_hdr;
755 	int i, ret;
756 
757 	/* Learn MAC address of guest device from packet */
758 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
759 
760 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
761 		RTE_LOG(ERR, VHOST_DATA,
762 			"(%d) device is using a registered MAC!\n",
763 			vdev->vid);
764 		return -1;
765 	}
766 
767 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
768 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
769 
770 	/* vlan_tag currently uses the device_id. */
771 	vdev->vlan_tag = vlan_tags[vdev->vid];
772 
773 	/* Print out VMDQ registration info. */
774 	RTE_LOG(INFO, VHOST_DATA,
775 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
776 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
777 		vdev->vlan_tag);
778 
779 	/* Register the MAC address. */
780 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
781 				(uint32_t)vdev->vid + vmdq_pool_base);
782 	if (ret)
783 		RTE_LOG(ERR, VHOST_DATA,
784 			"(%d) failed to add device MAC address to VMDQ\n",
785 			vdev->vid);
786 
787 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
788 
789 	/* Set device as ready for RX. */
790 	vdev->ready = DEVICE_RX;
791 
792 	return 0;
793 }
794 
795 /*
796  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
797  * queue before disabling RX on the device.
798  */
799 static inline void
800 unlink_vmdq(struct vhost_dev *vdev)
801 {
802 	unsigned i = 0;
803 	unsigned rx_count;
804 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
805 
806 	if (vdev->ready == DEVICE_RX) {
807 		/* Clear MAC and VLAN settings */
808 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
809 		for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
810 			vdev->mac_address.addr_bytes[i] = 0;
811 
812 		vdev->vlan_tag = 0;
813 
814 		/* Clear out the receive buffers */
815 		rx_count = rte_eth_rx_burst(ports[0],
816 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
817 
818 		while (rx_count) {
819 			for (i = 0; i < rx_count; i++)
820 				rte_pktmbuf_free(pkts_burst[i]);
821 
822 			rx_count = rte_eth_rx_burst(ports[0],
823 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
824 		}
825 
826 		vdev->ready = DEVICE_MAC_LEARNING;
827 	}
828 }
829 
830 static inline void
831 free_pkts(struct rte_mbuf **pkts, uint16_t n)
832 {
833 	while (n--)
834 		rte_pktmbuf_free(pkts[n]);
835 }
836 
837 static __rte_always_inline void
838 complete_async_pkts(struct vhost_dev *vdev)
839 {
840 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
841 	uint16_t complete_count;
842 
843 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
844 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
845 	if (complete_count) {
846 		free_pkts(p_cpl, complete_count);
847 		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
848 	}
849 
850 }
851 
852 static __rte_always_inline void
853 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
854 	    struct rte_mbuf *m)
855 {
856 	uint16_t ret;
857 
858 	if (builtin_net_driver) {
859 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
860 	} else {
861 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
862 	}
863 
864 	if (enable_stats) {
865 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
866 				__ATOMIC_SEQ_CST);
867 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
868 				__ATOMIC_SEQ_CST);
869 		src_vdev->stats.tx_total++;
870 		src_vdev->stats.tx += ret;
871 	}
872 }
873 
874 static __rte_always_inline void
875 drain_vhost(struct vhost_dev *vdev)
876 {
877 	uint16_t ret;
878 	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
879 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
880 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
881 
882 	if (builtin_net_driver) {
883 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
884 	} else if (async_vhost_driver) {
885 		uint32_t cpu_cpl_nr = 0;
886 		uint16_t enqueue_fail = 0;
887 		struct rte_mbuf *m_cpu_cpl[nr_xmit];
888 
889 		complete_async_pkts(vdev);
890 		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
891 					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
892 		__atomic_add_fetch(&vdev->pkts_inflight, ret - cpu_cpl_nr, __ATOMIC_SEQ_CST);
893 
894 		if (cpu_cpl_nr)
895 			free_pkts(m_cpu_cpl, cpu_cpl_nr);
896 
897 		enqueue_fail = nr_xmit - ret;
898 		if (enqueue_fail)
899 			free_pkts(&m[ret], nr_xmit - ret);
900 	} else {
901 		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
902 						m, nr_xmit);
903 	}
904 
905 	if (enable_stats) {
906 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
907 				__ATOMIC_SEQ_CST);
908 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
909 				__ATOMIC_SEQ_CST);
910 	}
911 
912 	if (!async_vhost_driver)
913 		free_pkts(m, nr_xmit);
914 }
915 
916 static __rte_always_inline void
917 drain_vhost_table(void)
918 {
919 	uint16_t lcore_id = rte_lcore_id();
920 	struct vhost_bufftable *vhost_txq;
921 	struct vhost_dev *vdev;
922 	uint64_t cur_tsc;
923 
924 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
925 		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
926 						+ vdev->vid];
927 
928 		cur_tsc = rte_rdtsc();
929 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
930 				> MBUF_TABLE_DRAIN_TSC)) {
931 			RTE_LOG_DP(DEBUG, VHOST_DATA,
932 				"Vhost TX queue drained after timeout with burst size %u\n",
933 				vhost_txq->len);
934 			drain_vhost(vdev);
935 			vhost_txq->len = 0;
936 			vhost_txq->pre_tsc = cur_tsc;
937 		}
938 	}
939 }
940 
941 /*
942  * Check if the packet's destination MAC address is for a local device. If so, put
943  * the packet on that device's RX queue. If not, return.
944  */
945 static __rte_always_inline int
946 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
947 {
948 	struct rte_ether_hdr *pkt_hdr;
949 	struct vhost_dev *dst_vdev;
950 	struct vhost_bufftable *vhost_txq;
951 	uint16_t lcore_id = rte_lcore_id();
952 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
953 
954 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
955 	if (!dst_vdev)
956 		return -1;
957 
958 	if (vdev->vid == dst_vdev->vid) {
959 		RTE_LOG_DP(DEBUG, VHOST_DATA,
960 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
961 			vdev->vid);
962 		return 0;
963 	}
964 
965 	RTE_LOG_DP(DEBUG, VHOST_DATA,
966 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
967 
968 	if (unlikely(dst_vdev->remove)) {
969 		RTE_LOG_DP(DEBUG, VHOST_DATA,
970 			"(%d) device is marked for removal\n", dst_vdev->vid);
971 		return 0;
972 	}
973 
974 	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
975 	vhost_txq->m_table[vhost_txq->len++] = m;
976 
977 	if (enable_stats) {
978 		vdev->stats.tx_total++;
979 		vdev->stats.tx++;
980 	}
981 
982 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
983 		drain_vhost(dst_vdev);
984 		vhost_txq->len = 0;
985 		vhost_txq->pre_tsc = rte_rdtsc();
986 	}
987 	return 0;
988 }
989 
990 /*
991  * Check if the destination MAC of a packet belongs to a local VM,
992  * and if it does, get its VLAN tag and offset.
993  */
994 static __rte_always_inline int
995 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
996 	uint32_t *offset, uint16_t *vlan_tag)
997 {
998 	struct vhost_dev *dst_vdev;
999 	struct rte_ether_hdr *pkt_hdr =
1000 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1001 
1002 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
1003 	if (!dst_vdev)
1004 		return 0;
1005 
1006 	if (vdev->vid == dst_vdev->vid) {
1007 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1008 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1009 			vdev->vid);
1010 		return -1;
1011 	}
1012 
1013 	/*
1014 	 * HW VLAN stripping reduces the packet length by the
1015 	 * length of the VLAN tag, so restore the packet length
1016 	 * by adding it back.
1017 	 */
1018 	*offset  = VLAN_HLEN;
1019 	*vlan_tag = vlan_tags[vdev->vid];
1020 
1021 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1022 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1023 		vdev->vid, dst_vdev->vid, *vlan_tag);
1024 
1025 	return 0;
1026 }
1027 
1028 static void virtio_tx_offload(struct rte_mbuf *m)
1029 {
1030 	struct rte_net_hdr_lens hdr_lens;
1031 	struct rte_ipv4_hdr *ipv4_hdr;
1032 	struct rte_tcp_hdr *tcp_hdr;
1033 	uint32_t ptype;
1034 	void *l3_hdr;
1035 
1036 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1037 	m->l2_len = hdr_lens.l2_len;
1038 	m->l3_len = hdr_lens.l3_len;
1039 	m->l4_len = hdr_lens.l4_len;
1040 
1041 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1042 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1043 		m->l2_len + m->l3_len);
1044 
1045 	m->ol_flags |= PKT_TX_TCP_SEG;
1046 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1047 		m->ol_flags |= PKT_TX_IPV4;
1048 		m->ol_flags |= PKT_TX_IP_CKSUM;
1049 		ipv4_hdr = l3_hdr;
1050 		ipv4_hdr->hdr_checksum = 0;
1051 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1052 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1053 		m->ol_flags |= PKT_TX_IPV6;
1054 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1055 	}
1056 }
1057 
1058 static __rte_always_inline void
1059 do_drain_mbuf_table(struct mbuf_table *tx_q)
1060 {
1061 	uint16_t count;
1062 
1063 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1064 				 tx_q->m_table, tx_q->len);
1065 	if (unlikely(count < tx_q->len))
1066 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1067 
1068 	tx_q->len = 0;
1069 }
1070 
1071 /*
1072  * This function routes the TX packet to the correct interface. This
1073  * may be a local device or the physical port.
1074  */
1075 static __rte_always_inline void
1076 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1077 {
1078 	struct mbuf_table *tx_q;
1079 	unsigned offset = 0;
1080 	const uint16_t lcore_id = rte_lcore_id();
1081 	struct rte_ether_hdr *nh;
1082 
1083 
1084 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1085 	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1086 		struct vhost_dev *vdev2;
1087 
1088 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1089 			if (vdev2 != vdev)
1090 				sync_virtio_xmit(vdev2, vdev, m);
1091 		}
1092 		goto queue2nic;
1093 	}
1094 
1095 	/* Check if the destination is a local VM */
1096 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1097 		return;
1098 
1099 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1100 		if (unlikely(find_local_dest(vdev, m, &offset,
1101 					     &vlan_tag) != 0)) {
1102 			rte_pktmbuf_free(m);
1103 			return;
1104 		}
1105 	}
1106 
1107 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1108 		"(%d) TX: MAC address is external\n", vdev->vid);
1109 
1110 queue2nic:
1111 
1112 	/* Add the packet to the port TX queue */
1113 	tx_q = &lcore_tx_queue[lcore_id];
1114 
1115 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1116 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1117 		/* Guest has inserted the vlan tag. */
1118 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1119 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1120 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1121 			(vh->vlan_tci != vlan_tag_be))
1122 			vh->vlan_tci = vlan_tag_be;
1123 	} else {
1124 		m->ol_flags |= PKT_TX_VLAN_PKT;
1125 
1126 		/*
1127 		 * Find the right segment in which to adjust the data length when
1128 		 * the offset is bigger than the tailroom size.
1129 		 */
1130 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1131 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1132 				m->data_len += offset;
1133 			else {
1134 				struct rte_mbuf *seg = m;
1135 
1136 				while ((seg->next != NULL) &&
1137 					(offset > rte_pktmbuf_tailroom(seg)))
1138 					seg = seg->next;
1139 
1140 				seg->data_len += offset;
1141 			}
1142 			m->pkt_len += offset;
1143 		}
1144 
1145 		m->vlan_tci = vlan_tag;
1146 	}
1147 
1148 	if (m->ol_flags & PKT_RX_LRO)
1149 		virtio_tx_offload(m);
1150 
1151 	tx_q->m_table[tx_q->len++] = m;
1152 	if (enable_stats) {
1153 		vdev->stats.tx_total++;
1154 		vdev->stats.tx++;
1155 	}
1156 
1157 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1158 		do_drain_mbuf_table(tx_q);
1159 }
1160 
1161 
1162 static __rte_always_inline void
1163 drain_mbuf_table(struct mbuf_table *tx_q)
1164 {
1165 	static uint64_t prev_tsc;
1166 	uint64_t cur_tsc;
1167 
1168 	if (tx_q->len == 0)
1169 		return;
1170 
1171 	cur_tsc = rte_rdtsc();
1172 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1173 		prev_tsc = cur_tsc;
1174 
1175 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1176 			"TX queue drained after timeout with burst size %u\n",
1177 			tx_q->len);
1178 		do_drain_mbuf_table(tx_q);
1179 	}
1180 }
1181 
1182 static __rte_always_inline void
1183 drain_eth_rx(struct vhost_dev *vdev)
1184 {
1185 	uint16_t rx_count, enqueue_count;
1186 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1187 
1188 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1189 				    pkts, MAX_PKT_BURST);
1190 
1191 	if (!rx_count)
1192 		return;
1193 
1194 	/*
1195 	 * When "enable_retry" is set, wait and retry when there are not
1196 	 * enough free slots in the queue to hold @rx_count packets,
1197 	 * to reduce packet loss.
1198 	 */
1199 	if (enable_retry &&
1200 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1201 			VIRTIO_RXQ))) {
1202 		uint32_t retry;
1203 
1204 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1205 			rte_delay_us(burst_rx_delay_time);
1206 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1207 					VIRTIO_RXQ))
1208 				break;
1209 		}
1210 	}
1211 
1212 	if (builtin_net_driver) {
1213 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1214 						pkts, rx_count);
1215 	} else if (async_vhost_driver) {
1216 		uint32_t cpu_cpl_nr = 0;
1217 		uint16_t enqueue_fail = 0;
1218 		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
1219 
1220 		complete_async_pkts(vdev);
1221 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1222 					VIRTIO_RXQ, pkts, rx_count,
1223 					m_cpu_cpl, &cpu_cpl_nr);
1224 		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count - cpu_cpl_nr,
1225 					__ATOMIC_SEQ_CST);
1226 
1227 		if (cpu_cpl_nr)
1228 			free_pkts(m_cpu_cpl, cpu_cpl_nr);
1229 
1230 		enqueue_fail = rx_count - enqueue_count;
1231 		if (enqueue_fail)
1232 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1233 
1234 	} else {
1235 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1236 						pkts, rx_count);
1237 	}
1238 
1239 	if (enable_stats) {
1240 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1241 				__ATOMIC_SEQ_CST);
1242 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1243 				__ATOMIC_SEQ_CST);
1244 	}
1245 
1246 	if (!async_vhost_driver)
1247 		free_pkts(pkts, rx_count);
1248 }
1249 
1250 static __rte_always_inline void
1251 drain_virtio_tx(struct vhost_dev *vdev)
1252 {
1253 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1254 	uint16_t count;
1255 	uint16_t i;
1256 
1257 	if (builtin_net_driver) {
1258 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1259 					pkts, MAX_PKT_BURST);
1260 	} else {
1261 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1262 					mbuf_pool, pkts, MAX_PKT_BURST);
1263 	}
1264 
1265 	/* setup VMDq for the first packet */
1266 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1267 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1268 			free_pkts(pkts, count);
1269 	}
1270 
1271 	for (i = 0; i < count; ++i)
1272 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1273 }
1274 
1275 /*
1276  * Main function of vhost-switch. It basically does:
1277  *
1278  * for each vhost device {
1279  *    - drain_eth_rx()
1280  *
1281  *      Which drains the host eth Rx queue linked to the vhost device
1282  *      and delivers the packets to the guest virtio Rx ring associated
1283  *      with this vhost device.
1284  *
1285  *    - drain_virtio_tx()
1286  *
1287  *      Which drains the guest virtio Tx queue and delivers the packets
1288  *      to the target, which could be another vhost device or the
1289  *      physical eth dev. The routing is done in the function "virtio_tx_route".
1290  * }
1291  */
1292 static int
1293 switch_worker(void *arg __rte_unused)
1294 {
1295 	unsigned i;
1296 	unsigned lcore_id = rte_lcore_id();
1297 	struct vhost_dev *vdev;
1298 	struct mbuf_table *tx_q;
1299 
1300 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1301 
1302 	tx_q = &lcore_tx_queue[lcore_id];
1303 	for (i = 0; i < rte_lcore_count(); i++) {
1304 		if (lcore_ids[i] == lcore_id) {
1305 			tx_q->txq_id = i;
1306 			break;
1307 		}
1308 	}
1309 
1310 	while (1) {
1311 		drain_mbuf_table(tx_q);
1312 		drain_vhost_table();
1313 		/*
1314 		 * Inform the configuration core that we have exited the
1315 		 * linked list and that no devices are in use if requested.
1316 		 */
1317 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1318 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1319 
1320 		/*
1321 		 * Process vhost devices
1322 		 */
1323 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1324 			      lcore_vdev_entry) {
1325 			if (unlikely(vdev->remove)) {
1326 				unlink_vmdq(vdev);
1327 				vdev->ready = DEVICE_SAFE_REMOVE;
1328 				continue;
1329 			}
1330 
1331 			if (likely(vdev->ready == DEVICE_RX))
1332 				drain_eth_rx(vdev);
1333 
1334 			if (likely(!vdev->remove))
1335 				drain_virtio_tx(vdev);
1336 		}
1337 	}
1338 
1339 	return 0;
1340 }
1341 
1342 /*
1343  * Remove a device from the specific data core linked list and from the
1344  * main linked list. Synchronization occurs through the use of the
1345  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1346  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1347  */
1348 static void
1349 destroy_device(int vid)
1350 {
1351 	struct vhost_dev *vdev = NULL;
1352 	int lcore;
1353 	uint16_t i;
1354 
1355 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1356 		if (vdev->vid == vid)
1357 			break;
1358 	}
1359 	if (!vdev)
1360 		return;
1361 	/* Set the remove flag. */
1362 	vdev->remove = 1;
1363 	while (vdev->ready != DEVICE_SAFE_REMOVE) {
1364 		rte_pause();
1365 	}
1366 
1367 	for (i = 0; i < RTE_MAX_LCORE; i++)
1368 		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1369 
1370 	if (builtin_net_driver)
1371 		vs_vhost_net_remove(vdev);
1372 
1373 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1374 		     lcore_vdev_entry);
1375 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1376 
1377 
1378 	/* Set the dev_removal_flag on each lcore. */
1379 	RTE_LCORE_FOREACH_WORKER(lcore)
1380 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1381 
1382 	/*
1383 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1384 	 * we can be sure that they can no longer access the device removed
1385 	 * from the linked lists and that the devices are no longer in use.
1386 	 */
1387 	RTE_LCORE_FOREACH_WORKER(lcore) {
1388 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1389 			rte_pause();
1390 	}
1391 
1392 	lcore_info[vdev->coreid].device_num--;
1393 
1394 	RTE_LOG(INFO, VHOST_DATA,
1395 		"(%d) device has been removed from data core\n",
1396 		vdev->vid);
1397 
1398 	if (async_vhost_driver) {
1399 		uint16_t n_pkt = 0;
1400 		struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1401 
1402 		while (vdev->pkts_inflight) {
1403 			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1404 						m_cpl, vdev->pkts_inflight);
1405 			free_pkts(m_cpl, n_pkt);
1406 			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1407 		}
1408 
1409 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1410 	}
1411 
1412 	rte_free(vdev);
1413 }
1414 
1415 /*
1416  * A new device is added to a data core. First the device is added to the main linked list
1417  * and then allocated to a specific data core.
1418  */
1419 static int
1420 new_device(int vid)
1421 {
1422 	int lcore, core_add = 0;
1423 	uint16_t i;
1424 	uint32_t device_num_min = num_devices;
1425 	struct vhost_dev *vdev;
1426 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1427 	if (vdev == NULL) {
1428 		RTE_LOG(INFO, VHOST_DATA,
1429 			"(%d) couldn't allocate memory for vhost dev\n",
1430 			vid);
1431 		return -1;
1432 	}
1433 	vdev->vid = vid;
1434 
1435 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1436 		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1437 			= rte_zmalloc("vhost bufftable",
1438 				sizeof(struct vhost_bufftable),
1439 				RTE_CACHE_LINE_SIZE);
1440 
1441 		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1442 			RTE_LOG(INFO, VHOST_DATA,
1443 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1444 			return -1;
1445 		}
1446 	}
1447 
1448 	if (builtin_net_driver)
1449 		vs_vhost_net_setup(vdev);
1450 
1451 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1452 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1453 
1454 	/* Reset the ready flag */
1455 	vdev->ready = DEVICE_MAC_LEARNING;
1456 	vdev->remove = 0;
1457 
1458 	/* Find a suitable lcore to add the device. */
1459 	RTE_LCORE_FOREACH_WORKER(lcore) {
1460 		if (lcore_info[lcore].device_num < device_num_min) {
1461 			device_num_min = lcore_info[lcore].device_num;
1462 			core_add = lcore;
1463 		}
1464 	}
1465 	vdev->coreid = core_add;
1466 
1467 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1468 			  lcore_vdev_entry);
1469 	lcore_info[vdev->coreid].device_num++;
1470 
1471 	/* Disable notifications. */
1472 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1473 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1474 
1475 	RTE_LOG(INFO, VHOST_DATA,
1476 		"(%d) device has been added to data core %d\n",
1477 		vid, vdev->coreid);
1478 
1479 	if (async_vhost_driver) {
1480 		struct rte_vhost_async_config config = {0};
1481 		struct rte_vhost_async_channel_ops channel_ops;
1482 
1483 		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1484 			channel_ops.transfer_data = ioat_transfer_data_cb;
1485 			channel_ops.check_completed_copies =
1486 				ioat_check_completed_copies_cb;
1487 
1488 			config.features = RTE_VHOST_ASYNC_INORDER;
1489 			config.async_threshold = 256;
1490 
1491 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1492 				config, &channel_ops);
1493 		}
1494 	}
1495 
1496 	return 0;
1497 }
1498 
1499 static int
1500 vring_state_changed(int vid, uint16_t queue_id, int enable)
1501 {
1502 	struct vhost_dev *vdev = NULL;
1503 
1504 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1505 		if (vdev->vid == vid)
1506 			break;
1507 	}
1508 	if (!vdev)
1509 		return -1;
1510 
1511 	if (queue_id != VIRTIO_RXQ)
1512 		return 0;
1513 
1514 	if (async_vhost_driver) {
1515 		if (!enable) {
1516 			uint16_t n_pkt = 0;
1517 			struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1518 
1519 			while (vdev->pkts_inflight) {
1520 				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1521 							m_cpl, vdev->pkts_inflight);
1522 				free_pkts(m_cpl, n_pkt);
1523 				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1524 			}
1525 		}
1526 	}
1527 
1528 	return 0;
1529 }
1530 
1531 /*
1532  * These callbacks allow devices to be added to the data core when configuration
1533  * has fully completed.
1534  */
1535 static const struct vhost_device_ops virtio_net_device_ops =
1536 {
1537 	.new_device =  new_device,
1538 	.destroy_device = destroy_device,
1539 	.vring_state_changed = vring_state_changed,
1540 };
1541 
1542 /*
1543  * This thread wakes up periodically to print stats if the user has
1544  * enabled them.
1545  */
1546 static void *
1547 print_stats(__rte_unused void *arg)
1548 {
1549 	struct vhost_dev *vdev;
1550 	uint64_t tx_dropped, rx_dropped;
1551 	uint64_t tx, tx_total, rx, rx_total;
1552 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1553 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1554 
1555 	while (1) {
1556 		sleep(enable_stats);
1557 
1558 		/* Clear screen and move to top left */
1559 		printf("%s%s\n", clr, top_left);
1560 		printf("Device statistics =================================\n");
1561 
1562 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1563 			tx_total   = vdev->stats.tx_total;
1564 			tx         = vdev->stats.tx;
1565 			tx_dropped = tx_total - tx;
1566 
1567 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1568 				__ATOMIC_SEQ_CST);
1569 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1570 				__ATOMIC_SEQ_CST);
1571 			rx_dropped = rx_total - rx;
1572 
1573 			printf("Statistics for device %d\n"
1574 				"-----------------------\n"
1575 				"TX total:              %" PRIu64 "\n"
1576 				"TX dropped:            %" PRIu64 "\n"
1577 				"TX successful:         %" PRIu64 "\n"
1578 				"RX total:              %" PRIu64 "\n"
1579 				"RX dropped:            %" PRIu64 "\n"
1580 				"RX successful:         %" PRIu64 "\n",
1581 				vdev->vid,
1582 				tx_total, tx_dropped, tx,
1583 				rx_total, rx_dropped, rx);
1584 		}
1585 
1586 		printf("===================================================\n");
1587 
1588 		fflush(stdout);
1589 	}
1590 
1591 	return NULL;
1592 }
1593 
1594 static void
1595 unregister_drivers(int socket_num)
1596 {
1597 	int i, ret;
1598 
1599 	for (i = 0; i < socket_num; i++) {
1600 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1601 		if (ret != 0)
1602 			RTE_LOG(ERR, VHOST_CONFIG,
1603 				"Fail to unregister vhost driver for %s.\n",
1604 				socket_files + i * PATH_MAX);
1605 	}
1606 }
1607 
1608 /* When we receive an INT signal, unregister the vhost driver */
1609 static void
1610 sigint_handler(__rte_unused int signum)
1611 {
1612 	/* Unregister vhost driver. */
1613 	unregister_drivers(nb_sockets);
1614 
1615 	exit(0);
1616 }
1617 
1618 /*
1619  * While creating an mbuf pool, one key thing is to figure out how
1620  * many mbuf entries are enough for our use. FYI, here are some
1621  * guidelines:
1622  *
1623  * - Each rx queue reserves @nr_rx_desc mbufs at queue setup stage
1624  *
1625  * - For each switch core (a CPU core that does the packet switching),
1626  *   we also need to reserve some mbufs for receiving the packets from
1627  *   the virtio Tx queue. How many are enough depends on the usage. It's
1628  *   normally a simple calculation like the following:
1629  *
1630  *       MAX_PKT_BURST * max packet size / mbuf size
1631  *
1632  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1633  *
1634  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1635  *   mbufs for receiving the packets from the physical NIC device.
1636  *
1637  * - We also need to make sure that, for each switch core, we have
1638  *   allocated enough mbufs to fill up the mbuf cache.
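 *
 * A rough worked example (illustrative numbers only, assuming the default
 * 128-byte mbuf headroom, MBUF_DATA_SIZE of 2176, MAX_PKT_BURST of 32 and
 * a 1500-byte MTU, following the formula in create_mbuf_pool() below):
 *
 *     per switch core: (1500 + 2176) * 32 / (2176 - 128) = 57 mbufs
 *     plus nr_rx_desc (1024)                             = 1081 mbufs
 *
 * which, scaled by the number of switch cores, is added to
 * nr_queues * nr_rx_desc and finally multiplied by the number of ports.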
1639  */
1640 static void
1641 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1642 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1643 {
1644 	uint32_t nr_mbufs;
1645 	uint32_t nr_mbufs_per_core;
1646 	uint32_t mtu = 1500;
1647 
1648 	if (mergeable)
1649 		mtu = 9000;
1650 	if (enable_tso)
1651 		mtu = 64 * 1024;
1652 
1653 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1654 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1655 	nr_mbufs_per_core += nr_rx_desc;
1656 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1657 
1658 	nr_mbufs  = nr_queues * nr_rx_desc;
1659 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1660 	nr_mbufs *= nr_port;
1661 
1662 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1663 					    nr_mbuf_cache, 0, mbuf_size,
1664 					    rte_socket_id());
1665 	if (mbuf_pool == NULL)
1666 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1667 }
1668 
1669 /*
1670  * Main function, does initialisation and calls the per-lcore functions.
1671  */
1672 int
1673 main(int argc, char *argv[])
1674 {
1675 	unsigned lcore_id, core_id = 0;
1676 	unsigned nb_ports, valid_num_ports;
1677 	int ret, i;
1678 	uint16_t portid;
1679 	static pthread_t tid;
1680 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1681 
1682 	signal(SIGINT, sigint_handler);
1683 
1684 	/* init EAL */
1685 	ret = rte_eal_init(argc, argv);
1686 	if (ret < 0)
1687 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1688 	argc -= ret;
1689 	argv += ret;
1690 
1691 	/* parse app arguments */
1692 	ret = us_vhost_parse_args(argc, argv);
1693 	if (ret < 0)
1694 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1695 
1696 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1697 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1698 
1699 		if (rte_lcore_is_enabled(lcore_id))
1700 			lcore_ids[core_id++] = lcore_id;
1701 	}
1702 
1703 	if (rte_lcore_count() > RTE_MAX_LCORE)
1704 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1705 
1706 	/* Get the number of physical ports. */
1707 	nb_ports = rte_eth_dev_count_avail();
1708 
1709 	/*
1710 	 * Update the global var NUM_PORTS and the global array PORTS,
1711 	 * and get the value of VALID_NUM_PORTS according to the number of system ports.
1712 	 */
1713 	valid_num_ports = check_ports_num(nb_ports);
1714 
1715 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1716 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1717 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1718 		return -1;
1719 	}
1720 
1721 	/*
1722 	 * FIXME: here we are trying to allocate mbufs big enough for
1723 	 * @MAX_QUEUES, but the truth is we're never going to use that
1724 	 * many queues here. We probably should only do allocation for
1725 	 * those queues we are going to use.
1726 	 */
1727 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1728 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1729 
1730 	if (vm2vm_mode == VM2VM_HARDWARE) {
1731 		/* Enable VT loopback so the NIC's L2 switch does the VM2VM forwarding. */
1732 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1733 		RTE_LOG(DEBUG, VHOST_CONFIG,
1734 			"Enable loop back for L2 switch in vmdq.\n");
1735 	}
1736 
1737 	/* initialize all ports */
1738 	RTE_ETH_FOREACH_DEV(portid) {
1739 		/* skip ports that are not enabled */
1740 		if ((enabled_port_mask & (1 << portid)) == 0) {
1741 			RTE_LOG(INFO, VHOST_PORT,
1742 				"Skipping disabled port %d\n", portid);
1743 			continue;
1744 		}
1745 		if (port_init(portid) != 0)
1746 			rte_exit(EXIT_FAILURE,
1747 				"Cannot initialize network ports\n");
1748 	}
1749 
1750 	/* Enable stats if the user option is set. */
1751 	if (enable_stats) {
1752 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1753 					print_stats, NULL);
1754 		if (ret < 0)
1755 			rte_exit(EXIT_FAILURE,
1756 				"Cannot create print-stats thread\n");
1757 	}
1758 
1759 	/* Launch all data cores. */
1760 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1761 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1762 
1763 	if (client_mode)
1764 		flags |= RTE_VHOST_USER_CLIENT;
1765 
1766 	/* Register vhost user driver to handle vhost messages. */
1767 	for (i = 0; i < nb_sockets; i++) {
1768 		char *file = socket_files + i * PATH_MAX;
1769 
1770 		if (async_vhost_driver)
1771 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1772 
1773 		ret = rte_vhost_driver_register(file, flags);
1774 		if (ret != 0) {
1775 			unregister_drivers(i);
1776 			rte_exit(EXIT_FAILURE,
1777 				"vhost driver register failure.\n");
1778 		}
1779 
1780 		if (builtin_net_driver)
1781 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1782 
1783 		if (mergeable == 0) {
1784 			rte_vhost_driver_disable_features(file,
1785 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1786 		}
1787 
1788 		if (enable_tx_csum == 0) {
1789 			rte_vhost_driver_disable_features(file,
1790 				1ULL << VIRTIO_NET_F_CSUM);
1791 		}
1792 
1793 		if (enable_tso == 0) {
1794 			rte_vhost_driver_disable_features(file,
1795 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1796 			rte_vhost_driver_disable_features(file,
1797 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1798 			rte_vhost_driver_disable_features(file,
1799 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1800 			rte_vhost_driver_disable_features(file,
1801 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1802 		}
1803 
1804 		if (promiscuous) {
1805 			rte_vhost_driver_enable_features(file,
1806 				1ULL << VIRTIO_NET_F_CTRL_RX);
1807 		}
1808 
1809 		ret = rte_vhost_driver_callback_register(file,
1810 			&virtio_net_device_ops);
1811 		if (ret != 0) {
1812 			rte_exit(EXIT_FAILURE,
1813 				"failed to register vhost driver callbacks.\n");
1814 		}
1815 
1816 		if (rte_vhost_driver_start(file) < 0) {
1817 			rte_exit(EXIT_FAILURE,
1818 				"failed to start vhost driver.\n");
1819 		}
1820 	}
1821 
1822 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1823 		rte_eal_wait_lcore(lcore_id);
1824 
1825 	/* clean up the EAL */
1826 	rte_eal_cleanup();
1827 
1828 	return 0;
1829 }
1830