xref: /dpdk/examples/vhost/main.c (revision 2490bb897182f57de80fd924dd3ae48dda819b8c)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "ioat.h"
29 #include "main.h"
30 
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34 
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37 
38 #define MBUF_CACHE_SIZE	128
39 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
40 
41 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
42 
43 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
45 
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
48 
49 /* State of virtio device. */
50 #define DEVICE_MAC_LEARNING 0
51 #define DEVICE_RX			1
52 #define DEVICE_SAFE_REMOVE	2
53 
54 /* Configurable number of RX/TX ring descriptors */
55 #define RTE_TEST_RX_DESC_DEFAULT 1024
56 #define RTE_TEST_TX_DESC_DEFAULT 512
57 
58 #define INVALID_PORT_ID 0xFF
59 
60 /* mask of enabled ports */
61 static uint32_t enabled_port_mask = 0;
62 
63 /* Promiscuous mode */
64 static uint32_t promiscuous;
65 
66 /* number of devices/queues to support*/
67 static uint32_t num_queues = 0;
68 static uint32_t num_devices;
69 
70 static struct rte_mempool *mbuf_pool;
71 static int mergeable;
72 
73 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
74 typedef enum {
75 	VM2VM_DISABLED = 0,
76 	VM2VM_SOFTWARE = 1,
77 	VM2VM_HARDWARE = 2,
78 	VM2VM_LAST
79 } vm2vm_type;
80 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
81 
82 /* Enable stats. */
83 static uint32_t enable_stats = 0;
84 /* Enable retries on RX. */
85 static uint32_t enable_retry = 1;
86 
87 /* Disable TX checksum offload */
88 static uint32_t enable_tx_csum;
89 
90 /* Disable TSO offload */
91 static uint32_t enable_tso;
92 
93 static int client_mode;
94 
95 static int builtin_net_driver;
96 
97 static int async_vhost_driver;
98 
99 static char *dma_type;
100 
101 /* Specify timeout (in microseconds) between retries on RX. */
102 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
103 /* Specify the number of retries on RX. */
104 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
105 
106 /* Socket file paths. Can be set by user */
107 static char *socket_files;
108 static int nb_sockets;
109 
110 /* Empty VMDQ configuration structure. Filled in programmatically. */
111 static struct rte_eth_conf vmdq_conf_default = {
112 	.rxmode = {
113 		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
114 		.split_hdr_size = 0,
115 		/*
116 		 * VLAN strip is necessary for 1G NICs such as I350;
117 		 * it fixes a bug where IPv4 forwarding in the guest cannot
118 		 * forward packets from one virtio dev to another virtio dev.
119 		 */
120 		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
121 	},
122 
123 	.txmode = {
124 		.mq_mode = RTE_ETH_MQ_TX_NONE,
125 		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
126 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
127 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
128 			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
129 			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
130 	},
131 	.rx_adv_conf = {
132 		/*
133 		 * should be overridden separately in code with
134 		 * appropriate values
135 		 */
136 		.vmdq_rx_conf = {
137 			.nb_queue_pools = RTE_ETH_8_POOLS,
138 			.enable_default_pool = 0,
139 			.default_pool = 0,
140 			.nb_pool_maps = 0,
141 			.pool_map = {{0, 0},},
142 		},
143 	},
144 };
145 
146 
147 static unsigned lcore_ids[RTE_MAX_LCORE];
148 static uint16_t ports[RTE_MAX_ETHPORTS];
149 static unsigned num_ports = 0; /**< The number of ports specified in command line */
150 static uint16_t num_pf_queues, num_vmdq_queues;
151 static uint16_t vmdq_pool_base, vmdq_queue_base;
152 static uint16_t queues_per_pool;
153 
154 const uint16_t vlan_tags[] = {
155 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
156 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
157 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
158 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
159 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
160 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
161 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
162 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
163 };
164 
165 /* ethernet addresses of ports */
166 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
167 
168 static struct vhost_dev_tailq_list vhost_dev_list =
169 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
170 
171 static struct lcore_info lcore_info[RTE_MAX_LCORE];
172 
173 /* Used for queueing bursts of TX packets. */
174 struct mbuf_table {
175 	unsigned len;
176 	unsigned txq_id;
177 	struct rte_mbuf *m_table[MAX_PKT_BURST];
178 };
179 
180 struct vhost_bufftable {
181 	uint32_t len;
182 	uint64_t pre_tsc;
183 	struct rte_mbuf *m_table[MAX_PKT_BURST];
184 };
185 
186 /* TX queue for each data core. */
187 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
188 
189 /*
190  * Vhost TX buffer for each data core.
191  * Every data core maintains a TX buffer for every vhost device,
192  * which is used to batch packet enqueues for higher performance.
193  */
194 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
195 
196 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
197 				 / US_PER_S * BURST_TX_DRAIN_US)
198 #define VLAN_HLEN       4
199 
200 static inline int
201 open_dma(const char *value)
202 {
203 	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
204 		return open_ioat(value);
205 
206 	return -1;
207 }
208 
209 /*
210  * Builds up the correct configuration for VMDQ VLAN pool map
211  * according to the pool & queue limits.
212  */
213 static inline int
214 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
215 {
216 	struct rte_eth_vmdq_rx_conf conf;
217 	struct rte_eth_vmdq_rx_conf *def_conf =
218 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
219 	unsigned i;
220 
221 	memset(&conf, 0, sizeof(conf));
222 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
223 	conf.nb_pool_maps = num_devices;
224 	conf.enable_loop_back = def_conf->enable_loop_back;
225 	conf.rx_mode = def_conf->rx_mode;
226 
227 	for (i = 0; i < conf.nb_pool_maps; i++) {
228 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
229 		conf.pool_map[i].pools = (1UL << i);
230 	}
231 
232 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
233 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
234 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
235 	return 0;
236 }
237 
238 /*
239  * Initialises a given port using global settings and with the rx buffers
240  * coming from the mbuf_pool passed as a parameter.
241  */
242 static inline int
243 port_init(uint16_t port)
244 {
245 	struct rte_eth_dev_info dev_info;
246 	struct rte_eth_conf port_conf;
247 	struct rte_eth_rxconf *rxconf;
248 	struct rte_eth_txconf *txconf;
249 	int16_t rx_rings, tx_rings;
250 	uint16_t rx_ring_size, tx_ring_size;
251 	int retval;
252 	uint16_t q;
253 
254 	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
255 	retval = rte_eth_dev_info_get(port, &dev_info);
256 	if (retval != 0) {
257 		RTE_LOG(ERR, VHOST_PORT,
258 			"Error during getting device (port %u) info: %s\n",
259 			port, strerror(-retval));
260 
261 		return retval;
262 	}
263 
264 	rxconf = &dev_info.default_rxconf;
265 	txconf = &dev_info.default_txconf;
266 	rxconf->rx_drop_en = 1;
267 
268 	/* Configure the number of supported virtio devices based on VMDQ limits. */
269 	num_devices = dev_info.max_vmdq_pools;
270 
271 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
272 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
273 
274 	tx_rings = (uint16_t)rte_lcore_count();
275 
276 	if (mergeable) {
277 		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
278 			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
279 		else
280 			vmdq_conf_default.rxmode.mtu = MAX_MTU;
281 	}
282 
283 	/* Get port configuration. */
284 	retval = get_eth_conf(&port_conf, num_devices);
285 	if (retval < 0)
286 		return retval;
287 	/* NIC queues are divided into pf queues and vmdq queues.  */
288 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
289 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
290 	num_vmdq_queues = num_devices * queues_per_pool;
291 	num_queues = num_pf_queues + num_vmdq_queues;
292 	vmdq_queue_base = dev_info.vmdq_queue_base;
293 	vmdq_pool_base  = dev_info.vmdq_pool_base;
294 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
295 		num_pf_queues, num_devices, queues_per_pool);
296 
297 	if (!rte_eth_dev_is_valid_port(port))
298 		return -1;
299 
300 	rx_rings = (uint16_t)dev_info.max_rx_queues;
301 	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
302 		port_conf.txmode.offloads |=
303 			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
304 	/* Configure ethernet device. */
305 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
306 	if (retval != 0) {
307 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
308 			port, strerror(-retval));
309 		return retval;
310 	}
311 
312 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
313 		&tx_ring_size);
314 	if (retval != 0) {
315 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
316 			"for port %u: %s.\n", port, strerror(-retval));
317 		return retval;
318 	}
319 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
320 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
321 			"for Rx queues on port %u.\n", port);
322 		return -1;
323 	}
324 
325 	/* Setup the queues. */
326 	rxconf->offloads = port_conf.rxmode.offloads;
327 	for (q = 0; q < rx_rings; q++) {
328 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
329 						rte_eth_dev_socket_id(port),
330 						rxconf,
331 						mbuf_pool);
332 		if (retval < 0) {
333 			RTE_LOG(ERR, VHOST_PORT,
334 				"Failed to setup rx queue %u of port %u: %s.\n",
335 				q, port, strerror(-retval));
336 			return retval;
337 		}
338 	}
339 	txconf->offloads = port_conf.txmode.offloads;
340 	for (q = 0; q < tx_rings; q++) {
341 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
342 						rte_eth_dev_socket_id(port),
343 						txconf);
344 		if (retval < 0) {
345 			RTE_LOG(ERR, VHOST_PORT,
346 				"Failed to setup tx queue %u of port %u: %s.\n",
347 				q, port, strerror(-retval));
348 			return retval;
349 		}
350 	}
351 
352 	/* Start the device. */
353 	retval  = rte_eth_dev_start(port);
354 	if (retval < 0) {
355 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
356 			port, strerror(-retval));
357 		return retval;
358 	}
359 
360 	if (promiscuous) {
361 		retval = rte_eth_promiscuous_enable(port);
362 		if (retval != 0) {
363 			RTE_LOG(ERR, VHOST_PORT,
364 				"Failed to enable promiscuous mode on port %u: %s\n",
365 				port, rte_strerror(-retval));
366 			return retval;
367 		}
368 	}
369 
370 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
371 	if (retval < 0) {
372 		RTE_LOG(ERR, VHOST_PORT,
373 			"Failed to get MAC address on port %u: %s\n",
374 			port, rte_strerror(-retval));
375 		return retval;
376 	}
377 
378 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
379 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
380 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
381 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
382 
383 	return 0;
384 }
385 
386 /*
387  * Set socket file path.
388  */
389 static int
390 us_vhost_parse_socket_path(const char *q_arg)
391 {
392 	char *old;
393 
394 	/* Reject socket paths that do not fit in PATH_MAX bytes. */
395 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
396 		return -1;
397 
398 	old = socket_files;
399 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
400 	if (socket_files == NULL) {
401 		free(old);
402 		return -1;
403 	}
404 
405 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
406 	nb_sockets++;
407 
408 	return 0;
409 }
410 
411 /*
412  * Parse the portmask provided at run time.
413  */
414 static int
415 parse_portmask(const char *portmask)
416 {
417 	char *end = NULL;
418 	unsigned long pm;
419 
420 	errno = 0;
421 
422 	/* parse hexadecimal string */
423 	pm = strtoul(portmask, &end, 16);
424 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
425 		return 0;
426 
427 	return pm;
428 
429 }
430 
431 /*
432  * Parse numeric options at run time.
433  */
434 static int
435 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
436 {
437 	char *end = NULL;
438 	unsigned long num;
439 
440 	errno = 0;
441 
442 	/* parse unsigned int string */
443 	num = strtoul(q_arg, &end, 10);
444 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
445 		return -1;
446 
447 	if (num > max_valid_value)
448 		return -1;
449 
450 	return num;
451 
452 }
453 
454 /*
455  * Display usage
456  */
457 static void
458 us_vhost_usage(const char *prgname)
459 {
460 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
461 	"		--vm2vm [0|1|2]\n"
462 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
463 	"		--socket-file <path>\n"
464 	"		--nb-devices ND\n"
465 	"		-p PORTMASK: Set mask for ports to be used by application\n"
466 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
467 	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
468 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
469 	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
470 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
471 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
472 	"		--socket-file: The path of the socket file.\n"
473 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
474 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
475 	"		--client register a vhost-user socket in client mode.\n"
476 	"		--dma-type register the DMA type for the vhost async driver, e.g. \"ioat\".\n"
477 	"		--dmas register a DMA channel for a specific vhost device.\n",
478 	       prgname);
479 }
480 
481 enum {
482 #define OPT_VM2VM               "vm2vm"
483 	OPT_VM2VM_NUM = 256,
484 #define OPT_RX_RETRY            "rx-retry"
485 	OPT_RX_RETRY_NUM,
486 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
487 	OPT_RX_RETRY_DELAY_NUM,
488 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
489 	OPT_RX_RETRY_NUMB_NUM,
490 #define OPT_MERGEABLE           "mergeable"
491 	OPT_MERGEABLE_NUM,
492 #define OPT_STATS               "stats"
493 	OPT_STATS_NUM,
494 #define OPT_SOCKET_FILE         "socket-file"
495 	OPT_SOCKET_FILE_NUM,
496 #define OPT_TX_CSUM             "tx-csum"
497 	OPT_TX_CSUM_NUM,
498 #define OPT_TSO                 "tso"
499 	OPT_TSO_NUM,
500 #define OPT_CLIENT              "client"
501 	OPT_CLIENT_NUM,
502 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
503 	OPT_BUILTIN_NET_DRIVER_NUM,
504 #define OPT_DMA_TYPE            "dma-type"
505 	OPT_DMA_TYPE_NUM,
506 #define OPT_DMAS                "dmas"
507 	OPT_DMAS_NUM,
508 };
509 
510 /*
511  * Parse the arguments given in the command line of the application.
512  */
513 static int
514 us_vhost_parse_args(int argc, char **argv)
515 {
516 	int opt, ret;
517 	int option_index;
518 	unsigned i;
519 	const char *prgname = argv[0];
520 	static struct option long_option[] = {
521 		{OPT_VM2VM, required_argument,
522 				NULL, OPT_VM2VM_NUM},
523 		{OPT_RX_RETRY, required_argument,
524 				NULL, OPT_RX_RETRY_NUM},
525 		{OPT_RX_RETRY_DELAY, required_argument,
526 				NULL, OPT_RX_RETRY_DELAY_NUM},
527 		{OPT_RX_RETRY_NUMB, required_argument,
528 				NULL, OPT_RX_RETRY_NUMB_NUM},
529 		{OPT_MERGEABLE, required_argument,
530 				NULL, OPT_MERGEABLE_NUM},
531 		{OPT_STATS, required_argument,
532 				NULL, OPT_STATS_NUM},
533 		{OPT_SOCKET_FILE, required_argument,
534 				NULL, OPT_SOCKET_FILE_NUM},
535 		{OPT_TX_CSUM, required_argument,
536 				NULL, OPT_TX_CSUM_NUM},
537 		{OPT_TSO, required_argument,
538 				NULL, OPT_TSO_NUM},
539 		{OPT_CLIENT, no_argument,
540 				NULL, OPT_CLIENT_NUM},
541 		{OPT_BUILTIN_NET_DRIVER, no_argument,
542 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
543 		{OPT_DMA_TYPE, required_argument,
544 				NULL, OPT_DMA_TYPE_NUM},
545 		{OPT_DMAS, required_argument,
546 				NULL, OPT_DMAS_NUM},
547 		{NULL, 0, 0, 0},
548 	};
549 
550 	/* Parse command line */
551 	while ((opt = getopt_long(argc, argv, "p:P",
552 			long_option, &option_index)) != EOF) {
553 		switch (opt) {
554 		/* Portmask */
555 		case 'p':
556 			enabled_port_mask = parse_portmask(optarg);
557 			if (enabled_port_mask == 0) {
558 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
559 				us_vhost_usage(prgname);
560 				return -1;
561 			}
562 			break;
563 
564 		case 'P':
565 			promiscuous = 1;
566 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
567 				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
568 				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
569 			break;
570 
571 		case OPT_VM2VM_NUM:
572 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
573 			if (ret == -1) {
574 				RTE_LOG(INFO, VHOST_CONFIG,
575 					"Invalid argument for "
576 					"vm2vm [0|1|2]\n");
577 				us_vhost_usage(prgname);
578 				return -1;
579 			}
580 			vm2vm_mode = (vm2vm_type)ret;
581 			break;
582 
583 		case OPT_RX_RETRY_NUM:
584 			ret = parse_num_opt(optarg, 1);
585 			if (ret == -1) {
586 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
587 				us_vhost_usage(prgname);
588 				return -1;
589 			}
590 			enable_retry = ret;
591 			break;
592 
593 		case OPT_TX_CSUM_NUM:
594 			ret = parse_num_opt(optarg, 1);
595 			if (ret == -1) {
596 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
597 				us_vhost_usage(prgname);
598 				return -1;
599 			}
600 			enable_tx_csum = ret;
601 			break;
602 
603 		case OPT_TSO_NUM:
604 			ret = parse_num_opt(optarg, 1);
605 			if (ret == -1) {
606 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
607 				us_vhost_usage(prgname);
608 				return -1;
609 			}
610 			enable_tso = ret;
611 			break;
612 
613 		case OPT_RX_RETRY_DELAY_NUM:
614 			ret = parse_num_opt(optarg, INT32_MAX);
615 			if (ret == -1) {
616 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
617 				us_vhost_usage(prgname);
618 				return -1;
619 			}
620 			burst_rx_delay_time = ret;
621 			break;
622 
623 		case OPT_RX_RETRY_NUMB_NUM:
624 			ret = parse_num_opt(optarg, INT32_MAX);
625 			if (ret == -1) {
626 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
627 				us_vhost_usage(prgname);
628 				return -1;
629 			}
630 			burst_rx_retry_num = ret;
631 			break;
632 
633 		case OPT_MERGEABLE_NUM:
634 			ret = parse_num_opt(optarg, 1);
635 			if (ret == -1) {
636 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
637 				us_vhost_usage(prgname);
638 				return -1;
639 			}
640 			mergeable = !!ret;
641 			break;
642 
643 		case OPT_STATS_NUM:
644 			ret = parse_num_opt(optarg, INT32_MAX);
645 			if (ret == -1) {
646 				RTE_LOG(INFO, VHOST_CONFIG,
647 					"Invalid argument for stats [0..N]\n");
648 				us_vhost_usage(prgname);
649 				return -1;
650 			}
651 			enable_stats = ret;
652 			break;
653 
654 		/* Set socket file path. */
655 		case OPT_SOCKET_FILE_NUM:
656 			if (us_vhost_parse_socket_path(optarg) == -1) {
657 				RTE_LOG(INFO, VHOST_CONFIG,
658 				"Invalid argument for socket name (Max %d characters)\n",
659 				PATH_MAX);
660 				us_vhost_usage(prgname);
661 				return -1;
662 			}
663 			break;
664 
665 		case OPT_DMA_TYPE_NUM:
666 			dma_type = optarg;
667 			break;
668 
669 		case OPT_DMAS_NUM:
670 			if (open_dma(optarg) == -1) {
671 				RTE_LOG(INFO, VHOST_CONFIG,
672 					"Wrong DMA args\n");
673 				us_vhost_usage(prgname);
674 				return -1;
675 			}
676 			async_vhost_driver = 1;
677 			break;
678 
679 		case OPT_CLIENT_NUM:
680 			client_mode = 1;
681 			break;
682 
683 		case OPT_BUILTIN_NET_DRIVER_NUM:
684 			builtin_net_driver = 1;
685 			break;
686 
687 		/* Invalid option - print options. */
688 		default:
689 			us_vhost_usage(prgname);
690 			return -1;
691 		}
692 	}
693 
694 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
695 		if (enabled_port_mask & (1 << i))
696 			ports[num_ports++] = i;
697 	}
698 
699 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
700 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
701 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
702 		return -1;
703 	}
704 
705 	return 0;
706 }
707 
708 /*
709  * Update the global variable num_ports and the ports array according to the
710  * number of ports in the system, and return the number of valid ports.
711  */
712 static unsigned check_ports_num(unsigned nb_ports)
713 {
714 	unsigned valid_num_ports = num_ports;
715 	unsigned portid;
716 
717 	if (num_ports > nb_ports) {
718 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
719 			num_ports, nb_ports);
720 		num_ports = nb_ports;
721 	}
722 
723 	for (portid = 0; portid < num_ports; portid ++) {
724 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
725 			RTE_LOG(INFO, VHOST_PORT,
726 				"\nSpecified port ID(%u) is not valid\n",
727 				ports[portid]);
728 			ports[portid] = INVALID_PORT_ID;
729 			valid_num_ports--;
730 		}
731 	}
732 	return valid_num_ports;
733 }
734 
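/*
 * Look up a vhost device by MAC address among the devices that are ready
 * for RX (i.e. whose MAC address has already been learned).
 */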
735 static __rte_always_inline struct vhost_dev *
736 find_vhost_dev(struct rte_ether_addr *mac)
737 {
738 	struct vhost_dev *vdev;
739 
740 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
741 		if (vdev->ready == DEVICE_RX &&
742 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
743 			return vdev;
744 	}
745 
746 	return NULL;
747 }
748 
749 /*
750  * This function learns the MAC address of the device and registers it, along
751  * with a VLAN tag, with a VMDQ pool.
752  */
753 static int
754 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
755 {
756 	struct rte_ether_hdr *pkt_hdr;
757 	int i, ret;
758 
759 	/* Learn MAC address of guest device from packet */
760 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
761 
762 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
763 		RTE_LOG(ERR, VHOST_DATA,
764 			"(%d) device is using a registered MAC!\n",
765 			vdev->vid);
766 		return -1;
767 	}
768 
769 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
770 		vdev->mac_address.addr_bytes[i] =
771 			pkt_hdr->src_addr.addr_bytes[i];
772 
773 	/* vlan_tag currently uses the device_id. */
774 	vdev->vlan_tag = vlan_tags[vdev->vid];
775 
776 	/* Print out VMDQ registration info. */
777 	RTE_LOG(INFO, VHOST_DATA,
778 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
779 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
780 		vdev->vlan_tag);
781 
782 	/* Register the MAC address. */
783 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
784 				(uint32_t)vdev->vid + vmdq_pool_base);
785 	if (ret)
786 		RTE_LOG(ERR, VHOST_DATA,
787 			"(%d) failed to add device MAC address to VMDQ\n",
788 			vdev->vid);
789 
790 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
791 
792 	/* Set device as ready for RX. */
793 	vdev->ready = DEVICE_RX;
794 
795 	return 0;
796 }
797 
798 /*
799  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
800  * queue before disabling RX on the device.
801  */
802 static inline void
803 unlink_vmdq(struct vhost_dev *vdev)
804 {
805 	unsigned i = 0;
806 	unsigned rx_count;
807 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
808 
809 	if (vdev->ready == DEVICE_RX) {
810 		/* Clear MAC and VLAN settings. */
811 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
812 		for (i = 0; i < 6; i++)
813 			vdev->mac_address.addr_bytes[i] = 0;
814 
815 		vdev->vlan_tag = 0;
816 
817 		/*Clear out the receive buffers*/
818 		rx_count = rte_eth_rx_burst(ports[0],
819 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
820 
821 		while (rx_count) {
822 			for (i = 0; i < rx_count; i++)
823 				rte_pktmbuf_free(pkts_burst[i]);
824 
825 			rx_count = rte_eth_rx_burst(ports[0],
826 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
827 		}
828 
829 		vdev->ready = DEVICE_MAC_LEARNING;
830 	}
831 }
832 
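/* Free a burst of mbufs back to their mempool. */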
833 static inline void
834 free_pkts(struct rte_mbuf **pkts, uint16_t n)
835 {
836 	while (n--)
837 		rte_pktmbuf_free(pkts[n]);
838 }
839 
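/*
 * Poll the async channel for completed enqueue copies on the RX queue,
 * free the completed mbufs and decrement the in-flight packet counter.
 */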
840 static __rte_always_inline void
841 complete_async_pkts(struct vhost_dev *vdev)
842 {
843 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
844 	uint16_t complete_count;
845 
846 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
847 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
848 	if (complete_count) {
849 		free_pkts(p_cpl, complete_count);
850 		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
851 	}
852 
853 }
854 
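/*
 * Synchronously enqueue a single packet to the RX queue of the destination
 * vhost device (VM2VM forwarding) and update the per-device statistics.
 */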
855 static __rte_always_inline void
856 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
857 	    struct rte_mbuf *m)
858 {
859 	uint16_t ret;
860 
861 	if (builtin_net_driver) {
862 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
863 	} else {
864 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
865 	}
866 
867 	if (enable_stats) {
868 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
869 				__ATOMIC_SEQ_CST);
870 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
871 				__ATOMIC_SEQ_CST);
872 		src_vdev->stats.tx_total++;
873 		src_vdev->stats.tx += ret;
874 	}
875 }
876 
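/*
 * Flush the per-lcore TX buffer of the given vhost device: enqueue the
 * buffered packets to its RX queue through the builtin net driver, the
 * async data path or the sync enqueue API, and update the statistics.
 */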
877 static __rte_always_inline void
878 drain_vhost(struct vhost_dev *vdev)
879 {
880 	uint16_t ret;
881 	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
882 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
883 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
884 
885 	if (builtin_net_driver) {
886 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
887 	} else if (async_vhost_driver) {
888 		uint16_t enqueue_fail = 0;
889 
890 		complete_async_pkts(vdev);
891 		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit);
892 		__atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
893 
894 		enqueue_fail = nr_xmit - ret;
895 		if (enqueue_fail)
896 			free_pkts(&m[ret], nr_xmit - ret);
897 	} else {
898 		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
899 						m, nr_xmit);
900 	}
901 
902 	if (enable_stats) {
903 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
904 				__ATOMIC_SEQ_CST);
905 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
906 				__ATOMIC_SEQ_CST);
907 	}
908 
909 	if (!async_vhost_driver)
910 		free_pkts(m, nr_xmit);
911 }
912 
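/*
 * Walk all vhost devices and drain the per-lcore TX buffers whose packets
 * have been pending for longer than MBUF_TABLE_DRAIN_TSC.
 */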
913 static __rte_always_inline void
914 drain_vhost_table(void)
915 {
916 	uint16_t lcore_id = rte_lcore_id();
917 	struct vhost_bufftable *vhost_txq;
918 	struct vhost_dev *vdev;
919 	uint64_t cur_tsc;
920 
921 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
922 		if (unlikely(vdev->remove == 1))
923 			continue;
924 
925 		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
926 						+ vdev->vid];
927 
928 		cur_tsc = rte_rdtsc();
929 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
930 				> MBUF_TABLE_DRAIN_TSC)) {
931 			RTE_LOG_DP(DEBUG, VHOST_DATA,
932 				"Vhost TX queue drained after timeout with burst size %u\n",
933 				vhost_txq->len);
934 			drain_vhost(vdev);
935 			vhost_txq->len = 0;
936 			vhost_txq->pre_tsc = cur_tsc;
937 		}
938 	}
939 }
940 
941 /*
942  * Check if the packet destination MAC address is for a local device. If so then put
943  * the packet on that device's RX queue. If not then return.
944  */
945 static __rte_always_inline int
946 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
947 {
948 	struct rte_ether_hdr *pkt_hdr;
949 	struct vhost_dev *dst_vdev;
950 	struct vhost_bufftable *vhost_txq;
951 	uint16_t lcore_id = rte_lcore_id();
952 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
953 
954 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
955 	if (!dst_vdev)
956 		return -1;
957 
958 	if (vdev->vid == dst_vdev->vid) {
959 		RTE_LOG_DP(DEBUG, VHOST_DATA,
960 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
961 			vdev->vid);
962 		return 0;
963 	}
964 
965 	RTE_LOG_DP(DEBUG, VHOST_DATA,
966 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
967 
968 	if (unlikely(dst_vdev->remove)) {
969 		RTE_LOG_DP(DEBUG, VHOST_DATA,
970 			"(%d) device is marked for removal\n", dst_vdev->vid);
971 		return 0;
972 	}
973 
974 	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
975 	vhost_txq->m_table[vhost_txq->len++] = m;
976 
977 	if (enable_stats) {
978 		vdev->stats.tx_total++;
979 		vdev->stats.tx++;
980 	}
981 
982 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
983 		drain_vhost(dst_vdev);
984 		vhost_txq->len = 0;
985 		vhost_txq->pre_tsc = rte_rdtsc();
986 	}
987 	return 0;
988 }
989 
990 /*
991  * Check if the destination MAC of a packet belongs to a local VM;
992  * if it does, return its VLAN tag and the length offset.
993  */
994 static __rte_always_inline int
995 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
996 	uint32_t *offset, uint16_t *vlan_tag)
997 {
998 	struct vhost_dev *dst_vdev;
999 	struct rte_ether_hdr *pkt_hdr =
1000 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1001 
1002 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1003 	if (!dst_vdev)
1004 		return 0;
1005 
1006 	if (vdev->vid == dst_vdev->vid) {
1007 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1008 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1009 			vdev->vid);
1010 		return -1;
1011 	}
1012 
1013 	/*
1014 	 * HW VLAN strip will reduce the packet length by the
1015 	 * length of the VLAN tag, so we need to restore the
1016 	 * packet length by adding it back.
1017 	 */
1018 	*offset  = VLAN_HLEN;
1019 	*vlan_tag = vlan_tags[vdev->vid];
1020 
1021 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1022 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1023 		vdev->vid, dst_vdev->vid, *vlan_tag);
1024 
1025 	return 0;
1026 }
1027 
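/*
 * Prepare a packet received with LRO for TSO transmission on the physical
 * port: parse the headers, set the TSO/checksum offload flags and fill in
 * the pseudo-header checksum.
 */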
1028 static void virtio_tx_offload(struct rte_mbuf *m)
1029 {
1030 	struct rte_net_hdr_lens hdr_lens;
1031 	struct rte_ipv4_hdr *ipv4_hdr;
1032 	struct rte_tcp_hdr *tcp_hdr;
1033 	uint32_t ptype;
1034 	void *l3_hdr;
1035 
1036 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1037 	m->l2_len = hdr_lens.l2_len;
1038 	m->l3_len = hdr_lens.l3_len;
1039 	m->l4_len = hdr_lens.l4_len;
1040 
1041 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1042 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1043 		m->l2_len + m->l3_len);
1044 
1045 	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1046 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1047 		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1048 		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1049 		ipv4_hdr = l3_hdr;
1050 		ipv4_hdr->hdr_checksum = 0;
1051 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1052 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1053 		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1054 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1055 	}
1056 }
1057 
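/*
 * Transmit the buffered packets of a physical port TX queue and free any
 * packets that could not be sent.
 */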
1058 static __rte_always_inline void
1059 do_drain_mbuf_table(struct mbuf_table *tx_q)
1060 {
1061 	uint16_t count;
1062 
1063 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1064 				 tx_q->m_table, tx_q->len);
1065 	if (unlikely(count < tx_q->len))
1066 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1067 
1068 	tx_q->len = 0;
1069 }
1070 
1071 /*
1072  * This function routes the TX packet to the correct interface. This
1073  * may be a local device or the physical port.
1074  */
1075 static __rte_always_inline void
1076 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1077 {
1078 	struct mbuf_table *tx_q;
1079 	unsigned offset = 0;
1080 	const uint16_t lcore_id = rte_lcore_id();
1081 	struct rte_ether_hdr *nh;
1082 
1083 
1084 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1085 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1086 		struct vhost_dev *vdev2;
1087 
1088 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1089 			if (vdev2 != vdev)
1090 				sync_virtio_xmit(vdev2, vdev, m);
1091 		}
1092 		goto queue2nic;
1093 	}
1094 
1095 	/*check if destination is local VM*/
1096 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1097 		return;
1098 
1099 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1100 		if (unlikely(find_local_dest(vdev, m, &offset,
1101 					     &vlan_tag) != 0)) {
1102 			rte_pktmbuf_free(m);
1103 			return;
1104 		}
1105 	}
1106 
1107 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1108 		"(%d) TX: MAC address is external\n", vdev->vid);
1109 
1110 queue2nic:
1111 
1112 	/*Add packet to the port tx queue*/
1113 	tx_q = &lcore_tx_queue[lcore_id];
1114 
1115 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1116 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1117 		/* Guest has inserted the vlan tag. */
1118 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1119 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1120 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1121 			(vh->vlan_tci != vlan_tag_be))
1122 			vh->vlan_tci = vlan_tag_be;
1123 	} else {
1124 		m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1125 
1126 		/*
1127 		 * Find the right seg to adjust the data len when offset is
1128 		 * bigger than tail room size.
1129 		 */
1130 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1131 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1132 				m->data_len += offset;
1133 			else {
1134 				struct rte_mbuf *seg = m;
1135 
1136 				while ((seg->next != NULL) &&
1137 					(offset > rte_pktmbuf_tailroom(seg)))
1138 					seg = seg->next;
1139 
1140 				seg->data_len += offset;
1141 			}
1142 			m->pkt_len += offset;
1143 		}
1144 
1145 		m->vlan_tci = vlan_tag;
1146 	}
1147 
1148 	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1149 		virtio_tx_offload(m);
1150 
1151 	tx_q->m_table[tx_q->len++] = m;
1152 	if (enable_stats) {
1153 		vdev->stats.tx_total++;
1154 		vdev->stats.tx++;
1155 	}
1156 
1157 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1158 		do_drain_mbuf_table(tx_q);
1159 }
1160 
1161 
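/*
 * Drain the physical port TX queue if its packets have been pending for
 * longer than MBUF_TABLE_DRAIN_TSC.
 */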
1162 static __rte_always_inline void
1163 drain_mbuf_table(struct mbuf_table *tx_q)
1164 {
1165 	static uint64_t prev_tsc;
1166 	uint64_t cur_tsc;
1167 
1168 	if (tx_q->len == 0)
1169 		return;
1170 
1171 	cur_tsc = rte_rdtsc();
1172 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1173 		prev_tsc = cur_tsc;
1174 
1175 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1176 			"TX queue drained after timeout with burst size %u\n",
1177 			tx_q->len);
1178 		do_drain_mbuf_table(tx_q);
1179 	}
1180 }
1181 
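/*
 * Receive a burst of packets from the physical port and enqueue them to
 * the RX queue of the given vhost device, optionally retrying while the
 * guest ring does not have enough free entries.
 */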
1182 static __rte_always_inline void
1183 drain_eth_rx(struct vhost_dev *vdev)
1184 {
1185 	uint16_t rx_count, enqueue_count;
1186 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1187 
1188 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1189 				    pkts, MAX_PKT_BURST);
1190 
1191 	if (!rx_count)
1192 		return;
1193 
1194 	/*
1195 	 * When "enable_retry" is set, wait and retry when there are
1196 	 * not enough free slots in the queue to hold @rx_count packets,
1197 	 * to reduce packet loss.
1198 	 */
1199 	if (enable_retry &&
1200 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1201 			VIRTIO_RXQ))) {
1202 		uint32_t retry;
1203 
1204 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1205 			rte_delay_us(burst_rx_delay_time);
1206 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1207 					VIRTIO_RXQ))
1208 				break;
1209 		}
1210 	}
1211 
1212 	if (builtin_net_driver) {
1213 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1214 						pkts, rx_count);
1215 	} else if (async_vhost_driver) {
1216 		uint16_t enqueue_fail = 0;
1217 
1218 		complete_async_pkts(vdev);
1219 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1220 					VIRTIO_RXQ, pkts, rx_count);
1221 		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1222 
1223 		enqueue_fail = rx_count - enqueue_count;
1224 		if (enqueue_fail)
1225 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1226 
1227 	} else {
1228 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1229 						pkts, rx_count);
1230 	}
1231 
1232 	if (enable_stats) {
1233 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1234 				__ATOMIC_SEQ_CST);
1235 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1236 				__ATOMIC_SEQ_CST);
1237 	}
1238 
1239 	if (!async_vhost_driver)
1240 		free_pkts(pkts, rx_count);
1241 }
1242 
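/*
 * Dequeue a burst of packets from the guest virtio TX queue, set up VMDQ
 * for the first packet if needed, and route each packet to its destination
 * (another vhost device or the physical port).
 */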
1243 static __rte_always_inline void
1244 drain_virtio_tx(struct vhost_dev *vdev)
1245 {
1246 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1247 	uint16_t count;
1248 	uint16_t i;
1249 
1250 	if (builtin_net_driver) {
1251 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1252 					pkts, MAX_PKT_BURST);
1253 	} else {
1254 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1255 					mbuf_pool, pkts, MAX_PKT_BURST);
1256 	}
1257 
1258 	/* setup VMDq for the first packet */
1259 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1260 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1261 			free_pkts(pkts, count);
1262 	}
1263 
1264 	for (i = 0; i < count; ++i)
1265 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1266 }
1267 
1268 /*
1269  * Main function of vhost-switch. It basically does:
1270  *
1271  * for each vhost device {
1272  *    - drain_eth_rx()
1273  *
1274  *      Which drains the host eth Rx queue linked to the vhost device
1275  *      and delivers all of the packets to the guest virtio Rx ring
1276  *      associated with this vhost device.
1277  *
1278  *    - drain_virtio_tx()
1279  *
1280  *      Which drains the guest virtio Tx queue and delivers the packets
1281  *      to the target, which could be another vhost device or the
1282  *      physical eth dev. The routing is done in virtio_tx_route().
1283  * }
1284  */
1285 static int
1286 switch_worker(void *arg __rte_unused)
1287 {
1288 	unsigned i;
1289 	unsigned lcore_id = rte_lcore_id();
1290 	struct vhost_dev *vdev;
1291 	struct mbuf_table *tx_q;
1292 
1293 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1294 
1295 	tx_q = &lcore_tx_queue[lcore_id];
1296 	for (i = 0; i < rte_lcore_count(); i++) {
1297 		if (lcore_ids[i] == lcore_id) {
1298 			tx_q->txq_id = i;
1299 			break;
1300 		}
1301 	}
1302 
1303 	while(1) {
1304 		drain_mbuf_table(tx_q);
1305 		drain_vhost_table();
1306 		/*
1307 		 * Inform the configuration core that we have exited the
1308 		 * linked list and that no devices are in use if requested.
1309 		 */
1310 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1311 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1312 
1313 		/*
1314 		 * Process vhost devices
1315 		 */
1316 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1317 			      lcore_vdev_entry) {
1318 			if (unlikely(vdev->remove)) {
1319 				unlink_vmdq(vdev);
1320 				vdev->ready = DEVICE_SAFE_REMOVE;
1321 				continue;
1322 			}
1323 
1324 			if (likely(vdev->ready == DEVICE_RX))
1325 				drain_eth_rx(vdev);
1326 
1327 			if (likely(!vdev->remove))
1328 				drain_virtio_tx(vdev);
1329 		}
1330 	}
1331 
1332 	return 0;
1333 }
1334 
1335 /*
1336  * Remove a device from the specific data core linked list and from the
1337  * main linked list. Synchronization occurs through the use of the
1338  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1339  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1340  */
1341 static void
1342 destroy_device(int vid)
1343 {
1344 	struct vhost_dev *vdev = NULL;
1345 	int lcore;
1346 	uint16_t i;
1347 
1348 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1349 		if (vdev->vid == vid)
1350 			break;
1351 	}
1352 	if (!vdev)
1353 		return;
1354 	/*set the remove flag. */
1355 	vdev->remove = 1;
1356 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1357 		rte_pause();
1358 	}
1359 
1360 	for (i = 0; i < RTE_MAX_LCORE; i++)
1361 		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1362 
1363 	if (builtin_net_driver)
1364 		vs_vhost_net_remove(vdev);
1365 
1366 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1367 		     lcore_vdev_entry);
1368 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1369 
1370 
1371 	/* Set the dev_removal_flag on each lcore. */
1372 	RTE_LCORE_FOREACH_WORKER(lcore)
1373 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1374 
1375 	/*
1376 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1377 	 * we can be sure that they can no longer access the device removed
1378 	 * from the linked lists and that the devices are no longer in use.
1379 	 */
1380 	RTE_LCORE_FOREACH_WORKER(lcore) {
1381 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1382 			rte_pause();
1383 	}
1384 
1385 	lcore_info[vdev->coreid].device_num--;
1386 
1387 	RTE_LOG(INFO, VHOST_DATA,
1388 		"(%d) device has been removed from data core\n",
1389 		vdev->vid);
1390 
1391 	if (async_vhost_driver) {
1392 		uint16_t n_pkt = 0;
1393 		struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1394 
1395 		while (vdev->pkts_inflight) {
1396 			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1397 						m_cpl, vdev->pkts_inflight);
1398 			free_pkts(m_cpl, n_pkt);
1399 			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1400 		}
1401 
1402 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1403 	}
1404 
1405 	rte_free(vdev);
1406 }
1407 
1408 /*
1409  * A new device is added to a data core. First the device is added to the main linked list
1410  * and then allocated to a specific data core.
1411  */
1412 static int
1413 new_device(int vid)
1414 {
1415 	int lcore, core_add = 0;
1416 	uint16_t i;
1417 	uint32_t device_num_min = num_devices;
1418 	struct vhost_dev *vdev;
1419 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1420 	if (vdev == NULL) {
1421 		RTE_LOG(INFO, VHOST_DATA,
1422 			"(%d) couldn't allocate memory for vhost dev\n",
1423 			vid);
1424 		return -1;
1425 	}
1426 	vdev->vid = vid;
1427 
1428 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1429 		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1430 			= rte_zmalloc("vhost bufftable",
1431 				sizeof(struct vhost_bufftable),
1432 				RTE_CACHE_LINE_SIZE);
1433 
1434 		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1435 			RTE_LOG(INFO, VHOST_DATA,
1436 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1437 			return -1;
1438 		}
1439 	}
1440 
1441 	if (builtin_net_driver)
1442 		vs_vhost_net_setup(vdev);
1443 
1444 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1445 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1446 
1447 	/*reset ready flag*/
1448 	vdev->ready = DEVICE_MAC_LEARNING;
1449 	vdev->remove = 0;
1450 
1451 	/* Find a suitable lcore to add the device. */
1452 	RTE_LCORE_FOREACH_WORKER(lcore) {
1453 		if (lcore_info[lcore].device_num < device_num_min) {
1454 			device_num_min = lcore_info[lcore].device_num;
1455 			core_add = lcore;
1456 		}
1457 	}
1458 	vdev->coreid = core_add;
1459 
1460 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1461 			  lcore_vdev_entry);
1462 	lcore_info[vdev->coreid].device_num++;
1463 
1464 	/* Disable notifications. */
1465 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1466 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1467 
1468 	RTE_LOG(INFO, VHOST_DATA,
1469 		"(%d) device has been added to data core %d\n",
1470 		vid, vdev->coreid);
1471 
1472 	if (async_vhost_driver) {
1473 		struct rte_vhost_async_config config = {0};
1474 		struct rte_vhost_async_channel_ops channel_ops;
1475 
1476 		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1477 			channel_ops.transfer_data = ioat_transfer_data_cb;
1478 			channel_ops.check_completed_copies =
1479 				ioat_check_completed_copies_cb;
1480 
1481 			config.features = RTE_VHOST_ASYNC_INORDER;
1482 
1483 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1484 				config, &channel_ops);
1485 		}
1486 	}
1487 
1488 	return 0;
1489 }
1490 
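/*
 * Vring state change callback: when the RX queue of an async vhost device
 * is disabled, drain and free its in-flight packets.
 */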
1491 static int
1492 vring_state_changed(int vid, uint16_t queue_id, int enable)
1493 {
1494 	struct vhost_dev *vdev = NULL;
1495 
1496 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1497 		if (vdev->vid == vid)
1498 			break;
1499 	}
1500 	if (!vdev)
1501 		return -1;
1502 
1503 	if (queue_id != VIRTIO_RXQ)
1504 		return 0;
1505 
1506 	if (async_vhost_driver) {
1507 		if (!enable) {
1508 			uint16_t n_pkt = 0;
1509 			struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1510 
1511 			while (vdev->pkts_inflight) {
1512 				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1513 							m_cpl, vdev->pkts_inflight);
1514 				free_pkts(m_cpl, n_pkt);
1515 				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1516 			}
1517 		}
1518 	}
1519 
1520 	return 0;
1521 }
1522 
1523 /*
1524  * These callbacks allow devices to be added to the data core when configuration
1525  * has been fully completed.
1526  */
1527 static const struct rte_vhost_device_ops virtio_net_device_ops =
1528 {
1529 	.new_device =  new_device,
1530 	.destroy_device = destroy_device,
1531 	.vring_state_changed = vring_state_changed,
1532 };
1533 
1534 /*
1535  * This is a thread that wakes up periodically to print stats if the user has
1536  * enabled them.
1537  */
1538 static void *
1539 print_stats(__rte_unused void *arg)
1540 {
1541 	struct vhost_dev *vdev;
1542 	uint64_t tx_dropped, rx_dropped;
1543 	uint64_t tx, tx_total, rx, rx_total;
1544 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1545 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1546 
1547 	while(1) {
1548 		sleep(enable_stats);
1549 
1550 		/* Clear screen and move to top left */
1551 		printf("%s%s\n", clr, top_left);
1552 		printf("Device statistics =================================\n");
1553 
1554 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1555 			tx_total   = vdev->stats.tx_total;
1556 			tx         = vdev->stats.tx;
1557 			tx_dropped = tx_total - tx;
1558 
1559 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1560 				__ATOMIC_SEQ_CST);
1561 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1562 				__ATOMIC_SEQ_CST);
1563 			rx_dropped = rx_total - rx;
1564 
1565 			printf("Statistics for device %d\n"
1566 				"-----------------------\n"
1567 				"TX total:              %" PRIu64 "\n"
1568 				"TX dropped:            %" PRIu64 "\n"
1569 				"TX successful:         %" PRIu64 "\n"
1570 				"RX total:              %" PRIu64 "\n"
1571 				"RX dropped:            %" PRIu64 "\n"
1572 				"RX successful:         %" PRIu64 "\n",
1573 				vdev->vid,
1574 				tx_total, tx_dropped, tx,
1575 				rx_total, rx_dropped, rx);
1576 		}
1577 
1578 		printf("===================================================\n");
1579 
1580 		fflush(stdout);
1581 	}
1582 
1583 	return NULL;
1584 }
1585 
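/* Unregister the vhost-user sockets registered so far. */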
1586 static void
1587 unregister_drivers(int socket_num)
1588 {
1589 	int i, ret;
1590 
1591 	for (i = 0; i < socket_num; i++) {
1592 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1593 		if (ret != 0)
1594 			RTE_LOG(ERR, VHOST_CONFIG,
1595 				"Failed to unregister vhost driver for %s.\n",
1596 				socket_files + i * PATH_MAX);
1597 	}
1598 }
1599 
1600 /* When we receive a SIGINT signal, unregister the vhost driver. */
1601 static void
1602 sigint_handler(__rte_unused int signum)
1603 {
1604 	/* Unregister vhost driver. */
1605 	unregister_drivers(nb_sockets);
1606 
1607 	exit(0);
1608 }
1609 
1610 /*
1611  * While creating an mbuf pool, one key thing is to figure out how
1612  * many mbuf entries are enough for our use. Here are some
1613  * guidelines:
1614  *
1615  * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1616  *
1617  * - For each switch core (a CPU core that does the packet switching),
1618  *   we also need to make some reservation for receiving the packets
1619  *   from the virtio Tx queue. How many is enough depends on the usage.
1620  *   It's normally a simple calculation like the following:
1621  *
1622  *       MAX_PKT_BURST * max packet size / mbuf size
1623  *
1624  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1625  *
1626  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1627  *   mbufs for receiving the packets from the physical NIC device.
1628  *
1629  * - We also need to make sure that, for each switch core, we have
1630  *   allocated enough mbufs to fill up the mbuf cache.
1631  */
1632 static void
1633 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1634 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1635 {
1636 	uint32_t nr_mbufs;
1637 	uint32_t nr_mbufs_per_core;
1638 	uint32_t mtu = 1500;
1639 
1640 	if (mergeable)
1641 		mtu = 9000;
1642 	if (enable_tso)
1643 		mtu = 64 * 1024;
1644 
1645 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1646 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1647 	nr_mbufs_per_core += nr_rx_desc;
1648 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1649 
1650 	nr_mbufs  = nr_queues * nr_rx_desc;
1651 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1652 	nr_mbufs *= nr_port;
1653 
1654 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1655 					    nr_mbuf_cache, 0, mbuf_size,
1656 					    rte_socket_id());
1657 	if (mbuf_pool == NULL)
1658 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1659 }
1660 
1661 /*
1662  * Main function, does initialisation and calls the per-lcore functions.
1663  */
1664 int
1665 main(int argc, char *argv[])
1666 {
1667 	unsigned lcore_id, core_id = 0;
1668 	unsigned nb_ports, valid_num_ports;
1669 	int ret, i;
1670 	uint16_t portid;
1671 	static pthread_t tid;
1672 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1673 
1674 	signal(SIGINT, sigint_handler);
1675 
1676 	/* init EAL */
1677 	ret = rte_eal_init(argc, argv);
1678 	if (ret < 0)
1679 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1680 	argc -= ret;
1681 	argv += ret;
1682 
1683 	/* parse app arguments */
1684 	ret = us_vhost_parse_args(argc, argv);
1685 	if (ret < 0)
1686 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1687 
1688 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1689 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1690 
1691 		if (rte_lcore_is_enabled(lcore_id))
1692 			lcore_ids[core_id++] = lcore_id;
1693 	}
1694 
1695 	if (rte_lcore_count() > RTE_MAX_LCORE)
1696 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1697 
1698 	/* Get the number of physical ports. */
1699 	nb_ports = rte_eth_dev_count_avail();
1700 
1701 	/*
1702 	 * Update the global variable num_ports and the global ports array,
1703 	 * and get the number of valid ports according to the system port count.
1704 	 */
1705 	valid_num_ports = check_ports_num(nb_ports);
1706 
1707 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1708 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1709 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1710 		return -1;
1711 	}
1712 
1713 	/*
1714 	 * FIXME: here we are trying to allocate mbufs big enough for
1715 	 * @MAX_QUEUES, but the truth is we're never going to use that
1716 	 * many queues here. We probably should only do allocation for
1717 	 * those queues we are going to use.
1718 	 */
1719 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1720 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1721 
1722 	if (vm2vm_mode == VM2VM_HARDWARE) {
1723 		/* Enable VT loop back to let L2 switch to do it. */
1724 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1725 		RTE_LOG(DEBUG, VHOST_CONFIG,
1726 			"Enable loop back for L2 switch in vmdq.\n");
1727 	}
1728 
1729 	/* initialize all ports */
1730 	RTE_ETH_FOREACH_DEV(portid) {
1731 		/* skip ports that are not enabled */
1732 		if ((enabled_port_mask & (1 << portid)) == 0) {
1733 			RTE_LOG(INFO, VHOST_PORT,
1734 				"Skipping disabled port %d\n", portid);
1735 			continue;
1736 		}
1737 		if (port_init(portid) != 0)
1738 			rte_exit(EXIT_FAILURE,
1739 				"Cannot initialize network ports\n");
1740 	}
1741 
1742 	/* Enable stats if the user option is set. */
1743 	if (enable_stats) {
1744 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1745 					print_stats, NULL);
1746 		if (ret < 0)
1747 			rte_exit(EXIT_FAILURE,
1748 				"Cannot create print-stats thread\n");
1749 	}
1750 
1751 	/* Launch all data cores. */
1752 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1753 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1754 
1755 	if (client_mode)
1756 		flags |= RTE_VHOST_USER_CLIENT;
1757 
1758 	/* Register vhost user driver to handle vhost messages. */
1759 	for (i = 0; i < nb_sockets; i++) {
1760 		char *file = socket_files + i * PATH_MAX;
1761 
1762 		if (async_vhost_driver)
1763 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1764 
1765 		ret = rte_vhost_driver_register(file, flags);
1766 		if (ret != 0) {
1767 			unregister_drivers(i);
1768 			rte_exit(EXIT_FAILURE,
1769 				"vhost driver register failure.\n");
1770 		}
1771 
1772 		if (builtin_net_driver)
1773 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1774 
1775 		if (mergeable == 0) {
1776 			rte_vhost_driver_disable_features(file,
1777 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1778 		}
1779 
1780 		if (enable_tx_csum == 0) {
1781 			rte_vhost_driver_disable_features(file,
1782 				1ULL << VIRTIO_NET_F_CSUM);
1783 		}
1784 
1785 		if (enable_tso == 0) {
1786 			rte_vhost_driver_disable_features(file,
1787 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1788 			rte_vhost_driver_disable_features(file,
1789 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1790 			rte_vhost_driver_disable_features(file,
1791 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1792 			rte_vhost_driver_disable_features(file,
1793 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1794 		}
1795 
1796 		if (promiscuous) {
1797 			rte_vhost_driver_enable_features(file,
1798 				1ULL << VIRTIO_NET_F_CTRL_RX);
1799 		}
1800 
1801 		ret = rte_vhost_driver_callback_register(file,
1802 			&virtio_net_device_ops);
1803 		if (ret != 0) {
1804 			rte_exit(EXIT_FAILURE,
1805 				"failed to register vhost driver callbacks.\n");
1806 		}
1807 
1808 		if (rte_vhost_driver_start(file) < 0) {
1809 			rte_exit(EXIT_FAILURE,
1810 				"failed to start vhost driver.\n");
1811 		}
1812 	}
1813 
1814 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1815 		rte_eal_wait_lcore(lcore_id);
1816 
1817 	/* clean up the EAL */
1818 	rte_eal_cleanup();
1819 
1820 	return 0;
1821 }
1822