xref: /dpdk/examples/vhost/main.c (revision b9f23beee0094afb930f3b15c1ac8f2d45b2584f)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "ioat.h"
29 #include "main.h"
30 
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34 
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37 
38 #define MBUF_CACHE_SIZE	128
39 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
40 
41 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
42 
43 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
45 
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47 
48 /* State of virtio device. */
49 #define DEVICE_MAC_LEARNING 0
50 #define DEVICE_RX			1
51 #define DEVICE_SAFE_REMOVE	2
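/*
 * Illustrative state flow (as implemented below): a vhost device starts in
 * DEVICE_MAC_LEARNING, is promoted to DEVICE_RX once link_vmdq() has learnt
 * its MAC address, and is set to DEVICE_SAFE_REMOVE by the data core when the
 * device is being removed (unlink_vmdq() drops it back to DEVICE_MAC_LEARNING
 * first).
 */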
52 
53 /* Configurable number of RX/TX ring descriptors */
54 #define RTE_TEST_RX_DESC_DEFAULT 1024
55 #define RTE_TEST_TX_DESC_DEFAULT 512
56 
57 #define INVALID_PORT_ID 0xFF
58 
59 /* mask of enabled ports */
60 static uint32_t enabled_port_mask = 0;
61 
62 /* Promiscuous mode */
63 static uint32_t promiscuous;
64 
65 /* number of devices/queues to support */
66 static uint32_t num_queues = 0;
67 static uint32_t num_devices;
68 
69 static struct rte_mempool *mbuf_pool;
70 static int mergeable;
71 
72 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
73 typedef enum {
74 	VM2VM_DISABLED = 0,
75 	VM2VM_SOFTWARE = 1,
76 	VM2VM_HARDWARE = 2,
77 	VM2VM_LAST
78 } vm2vm_type;
79 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
80 
81 /* Enable stats. */
82 static uint32_t enable_stats = 0;
83 /* Enable retries on RX. */
84 static uint32_t enable_retry = 1;
85 
86 /* Enable TX checksum offload (disabled by default) */
87 static uint32_t enable_tx_csum;
88 
89 /* Enable TSO offload (disabled by default) */
90 static uint32_t enable_tso;
91 
92 static int client_mode;
93 
94 static int builtin_net_driver;
95 
96 static int async_vhost_driver;
97 
98 static char *dma_type;
99 
100 /* Specify the timeout (in microseconds) between retries on RX. */
101 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
102 /* Specify the number of retries on RX. */
103 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
104 
105 /* Socket file paths. Can be set by user */
106 static char *socket_files;
107 static int nb_sockets;
108 
109 /* empty vmdq configuration structure. Filled in programmatically */
110 static struct rte_eth_conf vmdq_conf_default = {
111 	.rxmode = {
112 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
113 		.split_hdr_size = 0,
114 		/*
115 		 * VLAN strip is necessary for 1G NICs such as the I350; it fixes
116 		 * a bug where IPv4 forwarding in the guest could not forward
117 		 * packets from one virtio dev to another virtio dev.
118 		 */
119 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
120 	},
121 
122 	.txmode = {
123 		.mq_mode = ETH_MQ_TX_NONE,
124 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
125 			     DEV_TX_OFFLOAD_TCP_CKSUM |
126 			     DEV_TX_OFFLOAD_VLAN_INSERT |
127 			     DEV_TX_OFFLOAD_MULTI_SEGS |
128 			     DEV_TX_OFFLOAD_TCP_TSO),
129 	},
130 	.rx_adv_conf = {
131 		/*
132 		 * should be overridden separately in code with
133 		 * appropriate values
134 		 */
135 		.vmdq_rx_conf = {
136 			.nb_queue_pools = ETH_8_POOLS,
137 			.enable_default_pool = 0,
138 			.default_pool = 0,
139 			.nb_pool_maps = 0,
140 			.pool_map = {{0, 0},},
141 		},
142 	},
143 };
144 
145 
146 static unsigned lcore_ids[RTE_MAX_LCORE];
147 static uint16_t ports[RTE_MAX_ETHPORTS];
148 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
149 static uint16_t num_pf_queues, num_vmdq_queues;
150 static uint16_t vmdq_pool_base, vmdq_queue_base;
151 static uint16_t queues_per_pool;
152 
153 const uint16_t vlan_tags[] = {
154 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
155 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
156 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
157 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
158 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
159 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
160 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
161 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
162 };
163 
164 /* ethernet addresses of ports */
165 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
166 
167 static struct vhost_dev_tailq_list vhost_dev_list =
168 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
169 
170 static struct lcore_info lcore_info[RTE_MAX_LCORE];
171 
172 /* Used for queueing bursts of TX packets. */
173 struct mbuf_table {
174 	unsigned len;
175 	unsigned txq_id;
176 	struct rte_mbuf *m_table[MAX_PKT_BURST];
177 };
178 
179 struct vhost_bufftable {
180 	uint32_t len;
181 	uint64_t pre_tsc;
182 	struct rte_mbuf *m_table[MAX_PKT_BURST];
183 };
184 
185 /* TX queue for each data core. */
186 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
187 
188 /*
189  * Vhost TX buffer for each data core.
190  * Every data core maintains a TX buffer for every vhost device,
191  * which is used for batch pkts enqueue for higher performance.
192  */
193 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
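/*
 * Illustrative indexing: the TX buffer used for vhost device @vid on lcore
 * @lcore_id is vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + vid], as done in
 * drain_vhost(), drain_vhost_table() and virtio_tx_local() below.
 */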
194 
195 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
196 				 / US_PER_S * BURST_TX_DRAIN_US)
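/*
 * Example (illustrative, assuming a 2 GHz TSC): MBUF_TABLE_DRAIN_TSC is
 * 2,000,000,000 / 1,000,000 * 100 = 200,000 cycles, i.e. queued TX packets
 * are flushed roughly every 100 us even under light load.
 */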
197 #define VLAN_HLEN       4
198 
199 static inline int
200 open_dma(const char *value)
201 {
202 	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
203 		return open_ioat(value);
204 
205 	return -1;
206 }
207 
208 /*
209  * Builds up the correct configuration for VMDQ VLAN pool map
210  * according to the pool & queue limits.
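 *
 * For example, with num_devices == 8, pool 0 is paired with VLAN 1000,
 * pool 1 with VLAN 1001, ..., pool 7 with VLAN 1007; i.e. pool i is mapped
 * to vlan_tags[i] and enabled through the bit mask (1UL << i).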
211  */
212 static inline int
213 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
214 {
215 	struct rte_eth_vmdq_rx_conf conf;
216 	struct rte_eth_vmdq_rx_conf *def_conf =
217 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
218 	unsigned i;
219 
220 	memset(&conf, 0, sizeof(conf));
221 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
222 	conf.nb_pool_maps = num_devices;
223 	conf.enable_loop_back = def_conf->enable_loop_back;
224 	conf.rx_mode = def_conf->rx_mode;
225 
226 	for (i = 0; i < conf.nb_pool_maps; i++) {
227 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
228 		conf.pool_map[i].pools = (1UL << i);
229 	}
230 
231 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
232 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
233 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
234 	return 0;
235 }
236 
237 /*
238  * Initialises a given port using global settings and with the rx buffers
239  * coming from the mbuf_pool passed as parameter
240  */
241 static inline int
242 port_init(uint16_t port)
243 {
244 	struct rte_eth_dev_info dev_info;
245 	struct rte_eth_conf port_conf;
246 	struct rte_eth_rxconf *rxconf;
247 	struct rte_eth_txconf *txconf;
248 	int16_t rx_rings, tx_rings;
249 	uint16_t rx_ring_size, tx_ring_size;
250 	int retval;
251 	uint16_t q;
252 
253 	/* The max pool number from dev_info is used to validate the number of pools specified on the command line */
254 	retval = rte_eth_dev_info_get(port, &dev_info);
255 	if (retval != 0) {
256 		RTE_LOG(ERR, VHOST_PORT,
257 			"Error during getting device (port %u) info: %s\n",
258 			port, strerror(-retval));
259 
260 		return retval;
261 	}
262 
263 	rxconf = &dev_info.default_rxconf;
264 	txconf = &dev_info.default_txconf;
265 	rxconf->rx_drop_en = 1;
266 
267 	/* Configure the number of supported virtio devices based on VMDQ limits. */
268 	num_devices = dev_info.max_vmdq_pools;
269 
270 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
271 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
272 
273 	tx_rings = (uint16_t)rte_lcore_count();
274 
275 	/* Get port configuration. */
276 	retval = get_eth_conf(&port_conf, num_devices);
277 	if (retval < 0)
278 		return retval;
279 	/* NIC queues are divided into pf queues and vmdq queues.  */
280 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
281 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
282 	num_vmdq_queues = num_devices * queues_per_pool;
283 	num_queues = num_pf_queues + num_vmdq_queues;
284 	vmdq_queue_base = dev_info.vmdq_queue_base;
285 	vmdq_pool_base  = dev_info.vmdq_pool_base;
286 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
287 		num_pf_queues, num_devices, queues_per_pool);
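	/*
	 * Illustrative example (the numbers depend on the NIC): a device
	 * reporting max_rx_queues = 128, vmdq_queue_num = 64 and
	 * max_vmdq_pools = 32 gives num_pf_queues = 64, queues_per_pool = 2
	 * and, with all 32 pools used as devices, num_vmdq_queues = 64.
	 */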
288 
289 	if (!rte_eth_dev_is_valid_port(port))
290 		return -1;
291 
292 	rx_rings = (uint16_t)dev_info.max_rx_queues;
293 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
294 		port_conf.txmode.offloads |=
295 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
296 	/* Configure ethernet device. */
297 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
298 	if (retval != 0) {
299 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
300 			port, strerror(-retval));
301 		return retval;
302 	}
303 
304 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
305 		&tx_ring_size);
306 	if (retval != 0) {
307 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
308 			"for port %u: %s.\n", port, strerror(-retval));
309 		return retval;
310 	}
311 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
312 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
313 			"for Rx queues on port %u.\n", port);
314 		return -1;
315 	}
316 
317 	/* Setup the queues. */
318 	rxconf->offloads = port_conf.rxmode.offloads;
319 	for (q = 0; q < rx_rings; q ++) {
320 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
321 						rte_eth_dev_socket_id(port),
322 						rxconf,
323 						mbuf_pool);
324 		if (retval < 0) {
325 			RTE_LOG(ERR, VHOST_PORT,
326 				"Failed to setup rx queue %u of port %u: %s.\n",
327 				q, port, strerror(-retval));
328 			return retval;
329 		}
330 	}
331 	txconf->offloads = port_conf.txmode.offloads;
332 	for (q = 0; q < tx_rings; q ++) {
333 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
334 						rte_eth_dev_socket_id(port),
335 						txconf);
336 		if (retval < 0) {
337 			RTE_LOG(ERR, VHOST_PORT,
338 				"Failed to setup tx queue %u of port %u: %s.\n",
339 				q, port, strerror(-retval));
340 			return retval;
341 		}
342 	}
343 
344 	/* Start the device. */
345 	retval  = rte_eth_dev_start(port);
346 	if (retval < 0) {
347 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
348 			port, strerror(-retval));
349 		return retval;
350 	}
351 
352 	if (promiscuous) {
353 		retval = rte_eth_promiscuous_enable(port);
354 		if (retval != 0) {
355 			RTE_LOG(ERR, VHOST_PORT,
356 				"Failed to enable promiscuous mode on port %u: %s\n",
357 				port, rte_strerror(-retval));
358 			return retval;
359 		}
360 	}
361 
362 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
363 	if (retval < 0) {
364 		RTE_LOG(ERR, VHOST_PORT,
365 			"Failed to get MAC address on port %u: %s\n",
366 			port, rte_strerror(-retval));
367 		return retval;
368 	}
369 
370 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
371 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
372 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
373 			port,
374 			vmdq_ports_eth_addr[port].addr_bytes[0],
375 			vmdq_ports_eth_addr[port].addr_bytes[1],
376 			vmdq_ports_eth_addr[port].addr_bytes[2],
377 			vmdq_ports_eth_addr[port].addr_bytes[3],
378 			vmdq_ports_eth_addr[port].addr_bytes[4],
379 			vmdq_ports_eth_addr[port].addr_bytes[5]);
380 
381 	return 0;
382 }
383 
384 /*
385  * Set socket file path.
386  */
387 static int
388 us_vhost_parse_socket_path(const char *q_arg)
389 {
390 	char *old;
391 
392 	/* Reject paths that do not fit within PATH_MAX. */
393 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
394 		return -1;
395 
396 	old = socket_files;
397 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
398 	if (socket_files == NULL) {
399 		free(old);
400 		return -1;
401 	}
402 
403 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
404 	nb_sockets++;
405 
406 	return 0;
407 }
408 
409 /*
410  * Parse the portmask provided at run time.
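 *
 * For example, "-p 0x1" selects port 0 only and "-p 0x3" selects ports 0
 * and 1.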
411  */
412 static int
413 parse_portmask(const char *portmask)
414 {
415 	char *end = NULL;
416 	unsigned long pm;
417 
418 	errno = 0;
419 
420 	/* parse hexadecimal string */
421 	pm = strtoul(portmask, &end, 16);
422 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
423 		return 0;
424 
425 	return pm;
426 
427 }
428 
429 /*
430  * Parse num options at run time.
431  */
432 static int
433 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
434 {
435 	char *end = NULL;
436 	unsigned long num;
437 
438 	errno = 0;
439 
440 	/* parse unsigned int string */
441 	num = strtoul(q_arg, &end, 10);
442 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
443 		return -1;
444 
445 	if (num > max_valid_value)
446 		return -1;
447 
448 	return num;
449 
450 }
451 
452 /*
453  * Display usage
454  */
455 static void
456 us_vhost_usage(const char *prgname)
457 {
458 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
459 	"		--vm2vm [0|1|2]\n"
460 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
461 	"		--socket-file <path>\n"
462 	"		--nb-devices ND\n"
463 	"		-p PORTMASK: Set mask for ports to be used by application\n"
464 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
465 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if the destination queue is full\n"
466 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
467 	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
468 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
469 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
470 	"		--socket-file: The path of the socket file.\n"
471 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
472 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
473 	"		--client register a vhost-user socket in client mode.\n"
474 	"		--dma-type register the DMA type for the vhost async driver; only \"ioat\" is supported for now.\n"
475 	"		--dmas register a DMA channel for a specific vhost device.\n",
476 	       prgname);
477 }
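/*
 * Illustrative invocation (the binary name and EAL options depend on the
 * build and platform; adjust to your setup):
 *   ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/vhost-net.sock \
 *       --mergeable 1 --stats 2 --client
 */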
478 
479 enum {
480 #define OPT_VM2VM               "vm2vm"
481 	OPT_VM2VM_NUM = 256,
482 #define OPT_RX_RETRY            "rx-retry"
483 	OPT_RX_RETRY_NUM,
484 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
485 	OPT_RX_RETRY_DELAY_NUM,
486 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
487 	OPT_RX_RETRY_NUMB_NUM,
488 #define OPT_MERGEABLE           "mergeable"
489 	OPT_MERGEABLE_NUM,
490 #define OPT_STATS               "stats"
491 	OPT_STATS_NUM,
492 #define OPT_SOCKET_FILE         "socket-file"
493 	OPT_SOCKET_FILE_NUM,
494 #define OPT_TX_CSUM             "tx-csum"
495 	OPT_TX_CSUM_NUM,
496 #define OPT_TSO                 "tso"
497 	OPT_TSO_NUM,
498 #define OPT_CLIENT              "client"
499 	OPT_CLIENT_NUM,
500 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
501 	OPT_BUILTIN_NET_DRIVER_NUM,
502 #define OPT_DMA_TYPE            "dma-type"
503 	OPT_DMA_TYPE_NUM,
504 #define OPT_DMAS                "dmas"
505 	OPT_DMAS_NUM,
506 };
507 
508 /*
509  * Parse the arguments given in the command line of the application.
510  */
511 static int
512 us_vhost_parse_args(int argc, char **argv)
513 {
514 	int opt, ret;
515 	int option_index;
516 	unsigned i;
517 	const char *prgname = argv[0];
518 	static struct option long_option[] = {
519 		{OPT_VM2VM, required_argument,
520 				NULL, OPT_VM2VM_NUM},
521 		{OPT_RX_RETRY, required_argument,
522 				NULL, OPT_RX_RETRY_NUM},
523 		{OPT_RX_RETRY_DELAY, required_argument,
524 				NULL, OPT_RX_RETRY_DELAY_NUM},
525 		{OPT_RX_RETRY_NUMB, required_argument,
526 				NULL, OPT_RX_RETRY_NUMB_NUM},
527 		{OPT_MERGEABLE, required_argument,
528 				NULL, OPT_MERGEABLE_NUM},
529 		{OPT_STATS, required_argument,
530 				NULL, OPT_STATS_NUM},
531 		{OPT_SOCKET_FILE, required_argument,
532 				NULL, OPT_SOCKET_FILE_NUM},
533 		{OPT_TX_CSUM, required_argument,
534 				NULL, OPT_TX_CSUM_NUM},
535 		{OPT_TSO, required_argument,
536 				NULL, OPT_TSO_NUM},
537 		{OPT_CLIENT, no_argument,
538 				NULL, OPT_CLIENT_NUM},
539 		{OPT_BUILTIN_NET_DRIVER, no_argument,
540 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
541 		{OPT_DMA_TYPE, required_argument,
542 				NULL, OPT_DMA_TYPE_NUM},
543 		{OPT_DMAS, required_argument,
544 				NULL, OPT_DMAS_NUM},
545 		{NULL, 0, 0, 0},
546 	};
547 
548 	/* Parse command line */
549 	while ((opt = getopt_long(argc, argv, "p:P",
550 			long_option, &option_index)) != EOF) {
551 		switch (opt) {
552 		/* Portmask */
553 		case 'p':
554 			enabled_port_mask = parse_portmask(optarg);
555 			if (enabled_port_mask == 0) {
556 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
557 				us_vhost_usage(prgname);
558 				return -1;
559 			}
560 			break;
561 
562 		case 'P':
563 			promiscuous = 1;
564 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
565 				ETH_VMDQ_ACCEPT_BROADCAST |
566 				ETH_VMDQ_ACCEPT_MULTICAST;
567 			break;
568 
569 		case OPT_VM2VM_NUM:
570 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
571 			if (ret == -1) {
572 				RTE_LOG(INFO, VHOST_CONFIG,
573 					"Invalid argument for "
574 					"vm2vm [0|1|2]\n");
575 				us_vhost_usage(prgname);
576 				return -1;
577 			}
578 			vm2vm_mode = (vm2vm_type)ret;
579 			break;
580 
581 		case OPT_RX_RETRY_NUM:
582 			ret = parse_num_opt(optarg, 1);
583 			if (ret == -1) {
584 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
585 				us_vhost_usage(prgname);
586 				return -1;
587 			}
588 			enable_retry = ret;
589 			break;
590 
591 		case OPT_TX_CSUM_NUM:
592 			ret = parse_num_opt(optarg, 1);
593 			if (ret == -1) {
594 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
595 				us_vhost_usage(prgname);
596 				return -1;
597 			}
598 			enable_tx_csum = ret;
599 			break;
600 
601 		case OPT_TSO_NUM:
602 			ret = parse_num_opt(optarg, 1);
603 			if (ret == -1) {
604 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
605 				us_vhost_usage(prgname);
606 				return -1;
607 			}
608 			enable_tso = ret;
609 			break;
610 
611 		case OPT_RX_RETRY_DELAY_NUM:
612 			ret = parse_num_opt(optarg, INT32_MAX);
613 			if (ret == -1) {
614 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
615 				us_vhost_usage(prgname);
616 				return -1;
617 			}
618 			burst_rx_delay_time = ret;
619 			break;
620 
621 		case OPT_RX_RETRY_NUMB_NUM:
622 			ret = parse_num_opt(optarg, INT32_MAX);
623 			if (ret == -1) {
624 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
625 				us_vhost_usage(prgname);
626 				return -1;
627 			}
628 			burst_rx_retry_num = ret;
629 			break;
630 
631 		case OPT_MERGEABLE_NUM:
632 			ret = parse_num_opt(optarg, 1);
633 			if (ret == -1) {
634 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
635 				us_vhost_usage(prgname);
636 				return -1;
637 			}
638 			mergeable = !!ret;
639 			if (ret) {
640 				vmdq_conf_default.rxmode.offloads |=
641 					DEV_RX_OFFLOAD_JUMBO_FRAME;
642 				vmdq_conf_default.rxmode.max_rx_pkt_len
643 					= JUMBO_FRAME_MAX_SIZE;
644 			}
645 			break;
646 
647 		case OPT_STATS_NUM:
648 			ret = parse_num_opt(optarg, INT32_MAX);
649 			if (ret == -1) {
650 				RTE_LOG(INFO, VHOST_CONFIG,
651 					"Invalid argument for stats [0..N]\n");
652 				us_vhost_usage(prgname);
653 				return -1;
654 			}
655 			enable_stats = ret;
656 			break;
657 
658 		/* Set socket file path. */
659 		case OPT_SOCKET_FILE_NUM:
660 			if (us_vhost_parse_socket_path(optarg) == -1) {
661 				RTE_LOG(INFO, VHOST_CONFIG,
662 				"Invalid argument for socket name (Max %d characters)\n",
663 				PATH_MAX);
664 				us_vhost_usage(prgname);
665 				return -1;
666 			}
667 			break;
668 
669 		case OPT_DMA_TYPE_NUM:
670 			dma_type = optarg;
671 			break;
672 
673 		case OPT_DMAS_NUM:
674 			if (open_dma(optarg) == -1) {
675 				RTE_LOG(INFO, VHOST_CONFIG,
676 					"Wrong DMA args\n");
677 				us_vhost_usage(prgname);
678 				return -1;
679 			}
680 			async_vhost_driver = 1;
681 			break;
682 
683 		case OPT_CLIENT_NUM:
684 			client_mode = 1;
685 			break;
686 
687 		case OPT_BUILTIN_NET_DRIVER_NUM:
688 			builtin_net_driver = 1;
689 			break;
690 
691 		/* Invalid option - print options. */
692 		default:
693 			us_vhost_usage(prgname);
694 			return -1;
695 		}
696 	}
697 
698 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
699 		if (enabled_port_mask & (1 << i))
700 			ports[num_ports++] = i;
701 	}
702 
703 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
704 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
705 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
706 		return -1;
707 	}
708 
709 	return 0;
710 }
711 
712 /*
713  * Update the global var NUM_PORTS and the array PORTS according to the
714  * number of system ports, and return the number of valid ports.
715  */
716 static unsigned check_ports_num(unsigned nb_ports)
717 {
718 	unsigned valid_num_ports = num_ports;
719 	unsigned portid;
720 
721 	if (num_ports > nb_ports) {
722 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
723 			num_ports, nb_ports);
724 		num_ports = nb_ports;
725 	}
726 
727 	for (portid = 0; portid < num_ports; portid ++) {
728 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
729 			RTE_LOG(INFO, VHOST_PORT,
730 				"\nSpecified port ID(%u) is not valid\n",
731 				ports[portid]);
732 			ports[portid] = INVALID_PORT_ID;
733 			valid_num_ports--;
734 		}
735 	}
736 	return valid_num_ports;
737 }
738 
739 static __rte_always_inline struct vhost_dev *
740 find_vhost_dev(struct rte_ether_addr *mac)
741 {
742 	struct vhost_dev *vdev;
743 
744 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
745 		if (vdev->ready == DEVICE_RX &&
746 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
747 			return vdev;
748 	}
749 
750 	return NULL;
751 }
752 
753 /*
754  * This function learns the MAC address of the device and registers it, along
755  * with a VLAN tag, with the VMDQ.
756  */
757 static int
758 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
759 {
760 	struct rte_ether_hdr *pkt_hdr;
761 	int i, ret;
762 
763 	/* Learn MAC address of guest device from packet */
764 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
765 
766 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
767 		RTE_LOG(ERR, VHOST_DATA,
768 			"(%d) device is using a registered MAC!\n",
769 			vdev->vid);
770 		return -1;
771 	}
772 
773 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
774 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
775 
776 	/* vlan_tag currently uses the device_id. */
777 	vdev->vlan_tag = vlan_tags[vdev->vid];
778 
779 	/* Print out VMDQ registration info. */
780 	RTE_LOG(INFO, VHOST_DATA,
781 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
782 		vdev->vid,
783 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
784 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
785 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
786 		vdev->vlan_tag);
787 
788 	/* Register the MAC address. */
789 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
790 				(uint32_t)vdev->vid + vmdq_pool_base);
791 	if (ret)
792 		RTE_LOG(ERR, VHOST_DATA,
793 			"(%d) failed to add device MAC address to VMDQ\n",
794 			vdev->vid);
795 
796 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
797 
798 	/* Set device as ready for RX. */
799 	vdev->ready = DEVICE_RX;
800 
801 	return 0;
802 }
803 
804 /*
805  * Removes the MAC address and VLAN tag from the VMDQ. Ensures that nothing is adding
806  * buffers to the RX queue before disabling RX on the device.
807  */
808 static inline void
809 unlink_vmdq(struct vhost_dev *vdev)
810 {
811 	unsigned i = 0;
812 	unsigned rx_count;
813 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
814 
815 	if (vdev->ready == DEVICE_RX) {
816 		/* Clear MAC and VLAN settings. */
817 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
818 		for (i = 0; i < 6; i++)
819 			vdev->mac_address.addr_bytes[i] = 0;
820 
821 		vdev->vlan_tag = 0;
822 
823 		/* Clear out the receive buffers. */
824 		rx_count = rte_eth_rx_burst(ports[0],
825 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
826 
827 		while (rx_count) {
828 			for (i = 0; i < rx_count; i++)
829 				rte_pktmbuf_free(pkts_burst[i]);
830 
831 			rx_count = rte_eth_rx_burst(ports[0],
832 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
833 		}
834 
835 		vdev->ready = DEVICE_MAC_LEARNING;
836 	}
837 }
838 
839 static inline void
840 free_pkts(struct rte_mbuf **pkts, uint16_t n)
841 {
842 	while (n--)
843 		rte_pktmbuf_free(pkts[n]);
844 }
845 
846 static __rte_always_inline void
847 complete_async_pkts(struct vhost_dev *vdev)
848 {
849 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
850 	uint16_t complete_count;
851 
852 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
853 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
854 	if (complete_count) {
855 		free_pkts(p_cpl, complete_count);
856 		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
857 	}
858 
859 }
860 
861 static __rte_always_inline void
862 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
863 	    struct rte_mbuf *m)
864 {
865 	uint16_t ret;
866 
867 	if (builtin_net_driver) {
868 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
869 	} else {
870 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
871 	}
872 
873 	if (enable_stats) {
874 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
875 				__ATOMIC_SEQ_CST);
876 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
877 				__ATOMIC_SEQ_CST);
878 		src_vdev->stats.tx_total++;
879 		src_vdev->stats.tx += ret;
880 	}
881 }
882 
883 static __rte_always_inline void
884 drain_vhost(struct vhost_dev *vdev)
885 {
886 	uint16_t ret;
887 	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
888 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
889 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
890 
891 	if (builtin_net_driver) {
892 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
893 	} else if (async_vhost_driver) {
894 		uint32_t cpu_cpl_nr = 0;
895 		uint16_t enqueue_fail = 0;
896 		struct rte_mbuf *m_cpu_cpl[nr_xmit];
897 
898 		complete_async_pkts(vdev);
899 		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
900 					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
901 		__atomic_add_fetch(&vdev->pkts_inflight, ret - cpu_cpl_nr, __ATOMIC_SEQ_CST);
902 
903 		if (cpu_cpl_nr)
904 			free_pkts(m_cpu_cpl, cpu_cpl_nr);
905 
906 		enqueue_fail = nr_xmit - ret;
907 		if (enqueue_fail)
908 			free_pkts(&m[ret], nr_xmit - ret);
909 	} else {
910 		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
911 						m, nr_xmit);
912 	}
913 
914 	if (enable_stats) {
915 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
916 				__ATOMIC_SEQ_CST);
917 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
918 				__ATOMIC_SEQ_CST);
919 	}
920 
921 	if (!async_vhost_driver)
922 		free_pkts(m, nr_xmit);
923 }
924 
925 static __rte_always_inline void
926 drain_vhost_table(void)
927 {
928 	uint16_t lcore_id = rte_lcore_id();
929 	struct vhost_bufftable *vhost_txq;
930 	struct vhost_dev *vdev;
931 	uint64_t cur_tsc;
932 
933 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
934 		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
935 						+ vdev->vid];
936 
937 		cur_tsc = rte_rdtsc();
938 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
939 				> MBUF_TABLE_DRAIN_TSC)) {
940 			RTE_LOG_DP(DEBUG, VHOST_DATA,
941 				"Vhost TX queue drained after timeout with burst size %u\n",
942 				vhost_txq->len);
943 			drain_vhost(vdev);
944 			vhost_txq->len = 0;
945 			vhost_txq->pre_tsc = cur_tsc;
946 		}
947 	}
948 }
949 
950 /*
951  * Check if the packet destination MAC address is for a local device. If so then put
952  * the packet on that device's RX queue. If not then return.
953  */
954 static __rte_always_inline int
955 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
956 {
957 	struct rte_ether_hdr *pkt_hdr;
958 	struct vhost_dev *dst_vdev;
959 	struct vhost_bufftable *vhost_txq;
960 	uint16_t lcore_id = rte_lcore_id();
961 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
962 
963 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
964 	if (!dst_vdev)
965 		return -1;
966 
967 	if (vdev->vid == dst_vdev->vid) {
968 		RTE_LOG_DP(DEBUG, VHOST_DATA,
969 			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
970 			vdev->vid);
971 		return 0;
972 	}
973 
974 	RTE_LOG_DP(DEBUG, VHOST_DATA,
975 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
976 
977 	if (unlikely(dst_vdev->remove)) {
978 		RTE_LOG_DP(DEBUG, VHOST_DATA,
979 			"(%d) device is marked for removal\n", dst_vdev->vid);
980 		return 0;
981 	}
982 
983 	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
984 	vhost_txq->m_table[vhost_txq->len++] = m;
985 
986 	if (enable_stats) {
987 		vdev->stats.tx_total++;
988 		vdev->stats.tx++;
989 	}
990 
991 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
992 		drain_vhost(dst_vdev);
993 		vhost_txq->len = 0;
994 		vhost_txq->pre_tsc = rte_rdtsc();
995 	}
996 	return 0;
997 }
998 
999 /*
1000  * Check if the destination MAC of a packet belongs to a local VM and,
1001  * if so, get its VLAN tag and offset.
1002  */
1003 static __rte_always_inline int
1004 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1005 	uint32_t *offset, uint16_t *vlan_tag)
1006 {
1007 	struct vhost_dev *dst_vdev;
1008 	struct rte_ether_hdr *pkt_hdr =
1009 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1010 
1011 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
1012 	if (!dst_vdev)
1013 		return 0;
1014 
1015 	if (vdev->vid == dst_vdev->vid) {
1016 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1017 			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
1018 			vdev->vid);
1019 		return -1;
1020 	}
1021 
1022 	/*
1023 	 * HW VLAN strip reduces the packet length by the length
1024 	 * of the VLAN tag, so the packet length needs to be
1025 	 * restored by adding it back.
1026 	 */
1027 	*offset  = VLAN_HLEN;
1028 	*vlan_tag = vlan_tags[vdev->vid];
1029 
1030 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1031 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1032 		vdev->vid, dst_vdev->vid, *vlan_tag);
1033 
1034 	return 0;
1035 }
1036 
1037 static void virtio_tx_offload(struct rte_mbuf *m)
1038 {
1039 	struct rte_net_hdr_lens hdr_lens;
1040 	struct rte_ipv4_hdr *ipv4_hdr;
1041 	struct rte_tcp_hdr *tcp_hdr;
1042 	uint32_t ptype;
1043 	void *l3_hdr;
1044 
1045 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1046 	m->l2_len = hdr_lens.l2_len;
1047 	m->l3_len = hdr_lens.l3_len;
1048 	m->l4_len = hdr_lens.l4_len;
1049 
1050 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1051 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1052 		m->l2_len + m->l3_len);
1053 
1054 	m->ol_flags |= PKT_TX_TCP_SEG;
1055 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1056 		m->ol_flags |= PKT_TX_IPV4;
1057 		m->ol_flags |= PKT_TX_IP_CKSUM;
1058 		ipv4_hdr = l3_hdr;
1059 		ipv4_hdr->hdr_checksum = 0;
1060 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1061 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1062 		m->ol_flags |= PKT_TX_IPV6;
1063 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1064 	}
1065 }
1066 
1067 static __rte_always_inline void
1068 do_drain_mbuf_table(struct mbuf_table *tx_q)
1069 {
1070 	uint16_t count;
1071 
1072 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1073 				 tx_q->m_table, tx_q->len);
1074 	if (unlikely(count < tx_q->len))
1075 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1076 
1077 	tx_q->len = 0;
1078 }
1079 
1080 /*
1081  * This function routes the TX packet to the correct interface. This
1082  * may be a local device or the physical port.
1083  */
1084 static __rte_always_inline void
1085 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1086 {
1087 	struct mbuf_table *tx_q;
1088 	unsigned offset = 0;
1089 	const uint16_t lcore_id = rte_lcore_id();
1090 	struct rte_ether_hdr *nh;
1091 
1092 
1093 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1094 	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1095 		struct vhost_dev *vdev2;
1096 
1097 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1098 			if (vdev2 != vdev)
1099 				sync_virtio_xmit(vdev2, vdev, m);
1100 		}
1101 		goto queue2nic;
1102 	}
1103 
1104 	/* Check if the destination is a local VM. */
1105 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1106 		return;
1107 
1108 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1109 		if (unlikely(find_local_dest(vdev, m, &offset,
1110 					     &vlan_tag) != 0)) {
1111 			rte_pktmbuf_free(m);
1112 			return;
1113 		}
1114 	}
1115 
1116 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1117 		"(%d) TX: MAC address is external\n", vdev->vid);
1118 
1119 queue2nic:
1120 
1121 	/* Add the packet to the port TX queue. */
1122 	tx_q = &lcore_tx_queue[lcore_id];
1123 
1124 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1125 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1126 		/* Guest has inserted the vlan tag. */
1127 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1128 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1129 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1130 			(vh->vlan_tci != vlan_tag_be))
1131 			vh->vlan_tci = vlan_tag_be;
1132 	} else {
1133 		m->ol_flags |= PKT_TX_VLAN_PKT;
1134 
1135 		/*
1136 		 * Find the right seg to adjust the data len when offset is
1137 		 * bigger than tail room size.
1138 		 */
1139 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1140 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1141 				m->data_len += offset;
1142 			else {
1143 				struct rte_mbuf *seg = m;
1144 
1145 				while ((seg->next != NULL) &&
1146 					(offset > rte_pktmbuf_tailroom(seg)))
1147 					seg = seg->next;
1148 
1149 				seg->data_len += offset;
1150 			}
1151 			m->pkt_len += offset;
1152 		}
1153 
1154 		m->vlan_tci = vlan_tag;
1155 	}
1156 
1157 	if (m->ol_flags & PKT_RX_LRO)
1158 		virtio_tx_offload(m);
1159 
1160 	tx_q->m_table[tx_q->len++] = m;
1161 	if (enable_stats) {
1162 		vdev->stats.tx_total++;
1163 		vdev->stats.tx++;
1164 	}
1165 
1166 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1167 		do_drain_mbuf_table(tx_q);
1168 }
1169 
1170 
1171 static __rte_always_inline void
1172 drain_mbuf_table(struct mbuf_table *tx_q)
1173 {
1174 	static uint64_t prev_tsc;
1175 	uint64_t cur_tsc;
1176 
1177 	if (tx_q->len == 0)
1178 		return;
1179 
1180 	cur_tsc = rte_rdtsc();
1181 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1182 		prev_tsc = cur_tsc;
1183 
1184 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1185 			"TX queue drained after timeout with burst size %u\n",
1186 			tx_q->len);
1187 		do_drain_mbuf_table(tx_q);
1188 	}
1189 }
1190 
1191 static __rte_always_inline void
1192 drain_eth_rx(struct vhost_dev *vdev)
1193 {
1194 	uint16_t rx_count, enqueue_count;
1195 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1196 
1197 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1198 				    pkts, MAX_PKT_BURST);
1199 
1200 	if (!rx_count)
1201 		return;
1202 
1203 	/*
1204 	 * When "enable_retry" is set, here we wait and retry when there
1205 	 * are not enough free slots in the queue to hold @rx_count packets,
1206 	 * to diminish packet loss.
1207 	 */
1208 	if (enable_retry &&
1209 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1210 			VIRTIO_RXQ))) {
1211 		uint32_t retry;
1212 
1213 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1214 			rte_delay_us(burst_rx_delay_time);
1215 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1216 					VIRTIO_RXQ))
1217 				break;
1218 		}
1219 	}
1220 
1221 	if (builtin_net_driver) {
1222 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1223 						pkts, rx_count);
1224 	} else if (async_vhost_driver) {
1225 		uint32_t cpu_cpl_nr = 0;
1226 		uint16_t enqueue_fail = 0;
1227 		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
1228 
1229 		complete_async_pkts(vdev);
1230 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1231 					VIRTIO_RXQ, pkts, rx_count,
1232 					m_cpu_cpl, &cpu_cpl_nr);
1233 		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count - cpu_cpl_nr,
1234 					__ATOMIC_SEQ_CST);
1235 
1236 		if (cpu_cpl_nr)
1237 			free_pkts(m_cpu_cpl, cpu_cpl_nr);
1238 
1239 		enqueue_fail = rx_count - enqueue_count;
1240 		if (enqueue_fail)
1241 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1242 
1243 	} else {
1244 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1245 						pkts, rx_count);
1246 	}
1247 
1248 	if (enable_stats) {
1249 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1250 				__ATOMIC_SEQ_CST);
1251 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1252 				__ATOMIC_SEQ_CST);
1253 	}
1254 
1255 	if (!async_vhost_driver)
1256 		free_pkts(pkts, rx_count);
1257 }
1258 
1259 static __rte_always_inline void
1260 drain_virtio_tx(struct vhost_dev *vdev)
1261 {
1262 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1263 	uint16_t count;
1264 	uint16_t i;
1265 
1266 	if (builtin_net_driver) {
1267 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1268 					pkts, MAX_PKT_BURST);
1269 	} else {
1270 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1271 					mbuf_pool, pkts, MAX_PKT_BURST);
1272 	}
1273 
1274 	/* setup VMDq for the first packet */
1275 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1276 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1277 			free_pkts(pkts, count);
1278 	}
1279 
1280 	for (i = 0; i < count; ++i)
1281 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1282 }
1283 
1284 /*
1285  * Main function of vhost-switch. It basically does:
1286  *
1287  * for each vhost device {
1288  *    - drain_eth_rx()
1289  *
1290  *      Which drains the host eth Rx queue linked to the vhost device,
1291  *      and deliver all of them to guest virito Rx ring associated with
1292  *      and delivers all of the packets to the guest virtio Rx ring
1293  *      associated with this vhost device.
1294  *    - drain_virtio_tx()
1295  *
1296  *      Which drains the guest virtio Tx queue and deliver all of them
1297  *      Which drains the guest virtio Tx queue and delivers all of the packets
1298  *      physical eth dev. The route is done in function "virtio_tx_route".
1299  * }
1300  */
1301 static int
1302 switch_worker(void *arg __rte_unused)
1303 {
1304 	unsigned i;
1305 	unsigned lcore_id = rte_lcore_id();
1306 	struct vhost_dev *vdev;
1307 	struct mbuf_table *tx_q;
1308 
1309 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1310 
1311 	tx_q = &lcore_tx_queue[lcore_id];
1312 	for (i = 0; i < rte_lcore_count(); i++) {
1313 		if (lcore_ids[i] == lcore_id) {
1314 			tx_q->txq_id = i;
1315 			break;
1316 		}
1317 	}
1318 
1319 	while(1) {
1320 		drain_mbuf_table(tx_q);
1321 		drain_vhost_table();
1322 		/*
1323 		 * Inform the configuration core that we have exited the
1324 		 * linked list and that no devices are in use if requested.
1325 		 */
1326 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1327 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1328 
1329 		/*
1330 		 * Process vhost devices
1331 		 */
1332 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1333 			      lcore_vdev_entry) {
1334 			if (unlikely(vdev->remove)) {
1335 				unlink_vmdq(vdev);
1336 				vdev->ready = DEVICE_SAFE_REMOVE;
1337 				continue;
1338 			}
1339 
1340 			if (likely(vdev->ready == DEVICE_RX))
1341 				drain_eth_rx(vdev);
1342 
1343 			if (likely(!vdev->remove))
1344 				drain_virtio_tx(vdev);
1345 		}
1346 	}
1347 
1348 	return 0;
1349 }
1350 
1351 /*
1352  * Remove a device from the specific data core linked list and from the
1353  * main linked list. Synchronization occurs through the use of the
1354  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1355  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1356  */
1357 static void
1358 destroy_device(int vid)
1359 {
1360 	struct vhost_dev *vdev = NULL;
1361 	int lcore;
1362 	uint16_t i;
1363 
1364 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1365 		if (vdev->vid == vid)
1366 			break;
1367 	}
1368 	if (!vdev)
1369 		return;
1370 	/* Set the remove flag. */
1371 	vdev->remove = 1;
1372 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1373 		rte_pause();
1374 	}
1375 
1376 	for (i = 0; i < RTE_MAX_LCORE; i++)
1377 		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1378 
1379 	if (builtin_net_driver)
1380 		vs_vhost_net_remove(vdev);
1381 
1382 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1383 		     lcore_vdev_entry);
1384 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1385 
1386 
1387 	/* Set the dev_removal_flag on each lcore. */
1388 	RTE_LCORE_FOREACH_WORKER(lcore)
1389 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1390 
1391 	/*
1392 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1393 	 * we can be sure that they can no longer access the device removed
1394 	 * from the linked lists and that the devices are no longer in use.
1395 	 */
1396 	RTE_LCORE_FOREACH_WORKER(lcore) {
1397 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1398 			rte_pause();
1399 	}
1400 
1401 	lcore_info[vdev->coreid].device_num--;
1402 
1403 	RTE_LOG(INFO, VHOST_DATA,
1404 		"(%d) device has been removed from data core\n",
1405 		vdev->vid);
1406 
1407 	if (async_vhost_driver) {
1408 		uint16_t n_pkt = 0;
1409 		struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1410 
1411 		while (vdev->pkts_inflight) {
1412 			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1413 						m_cpl, vdev->pkts_inflight);
1414 			free_pkts(m_cpl, n_pkt);
1415 			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1416 		}
1417 
1418 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1419 	}
1420 
1421 	rte_free(vdev);
1422 }
1423 
1424 /*
1425  * A new device is added to a data core. First the device is added to the main linked list
1426  * and then allocated to a specific data core.
1427  */
1428 static int
1429 new_device(int vid)
1430 {
1431 	int lcore, core_add = 0;
1432 	uint16_t i;
1433 	uint32_t device_num_min = num_devices;
1434 	struct vhost_dev *vdev;
1435 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1436 	if (vdev == NULL) {
1437 		RTE_LOG(INFO, VHOST_DATA,
1438 			"(%d) couldn't allocate memory for vhost dev\n",
1439 			vid);
1440 		return -1;
1441 	}
1442 	vdev->vid = vid;
1443 
1444 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1445 		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1446 			= rte_zmalloc("vhost bufftable",
1447 				sizeof(struct vhost_bufftable),
1448 				RTE_CACHE_LINE_SIZE);
1449 
1450 		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1451 			RTE_LOG(INFO, VHOST_DATA,
1452 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1453 			return -1;
1454 		}
1455 	}
1456 
1457 	if (builtin_net_driver)
1458 		vs_vhost_net_setup(vdev);
1459 
1460 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1461 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1462 
1463 	/* Reset the ready flag. */
1464 	vdev->ready = DEVICE_MAC_LEARNING;
1465 	vdev->remove = 0;
1466 
1467 	/* Find a suitable lcore to add the device. */
1468 	RTE_LCORE_FOREACH_WORKER(lcore) {
1469 		if (lcore_info[lcore].device_num < device_num_min) {
1470 			device_num_min = lcore_info[lcore].device_num;
1471 			core_add = lcore;
1472 		}
1473 	}
1474 	vdev->coreid = core_add;
1475 
1476 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1477 			  lcore_vdev_entry);
1478 	lcore_info[vdev->coreid].device_num++;
1479 
1480 	/* Disable notifications. */
1481 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1482 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1483 
1484 	RTE_LOG(INFO, VHOST_DATA,
1485 		"(%d) device has been added to data core %d\n",
1486 		vid, vdev->coreid);
1487 
1488 	if (async_vhost_driver) {
1489 		struct rte_vhost_async_config config = {0};
1490 		struct rte_vhost_async_channel_ops channel_ops;
1491 
1492 		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1493 			channel_ops.transfer_data = ioat_transfer_data_cb;
1494 			channel_ops.check_completed_copies =
1495 				ioat_check_completed_copies_cb;
1496 
1497 			config.features = RTE_VHOST_ASYNC_INORDER;
1498 			config.async_threshold = 256;
1499 
1500 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1501 				config, &channel_ops);
1502 		}
1503 	}
1504 
1505 	return 0;
1506 }
1507 
1508 static int
1509 vring_state_changed(int vid, uint16_t queue_id, int enable)
1510 {
1511 	struct vhost_dev *vdev = NULL;
1512 
1513 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1514 		if (vdev->vid == vid)
1515 			break;
1516 	}
1517 	if (!vdev)
1518 		return -1;
1519 
1520 	if (queue_id != VIRTIO_RXQ)
1521 		return 0;
1522 
1523 	if (async_vhost_driver) {
1524 		if (!enable) {
1525 			uint16_t n_pkt = 0;
1526 			struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1527 
1528 			while (vdev->pkts_inflight) {
1529 				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1530 							m_cpl, vdev->pkts_inflight);
1531 				free_pkts(m_cpl, n_pkt);
1532 				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1533 			}
1534 		}
1535 	}
1536 
1537 	return 0;
1538 }
1539 
1540 /*
1541  * These callbacks allow devices to be added to the data core once
1542  * configuration is fully complete.
1543  */
1544 static const struct vhost_device_ops virtio_net_device_ops =
1545 {
1546 	.new_device =  new_device,
1547 	.destroy_device = destroy_device,
1548 	.vring_state_changed = vring_state_changed,
1549 };
1550 
1551 /*
1552  * This is a thread that wakes up periodically to print stats if the user has
1553  * enabled them.
1554  */
1555 static void *
1556 print_stats(__rte_unused void *arg)
1557 {
1558 	struct vhost_dev *vdev;
1559 	uint64_t tx_dropped, rx_dropped;
1560 	uint64_t tx, tx_total, rx, rx_total;
1561 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1562 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1563 
1564 	while(1) {
1565 		sleep(enable_stats);
1566 
1567 		/* Clear screen and move to top left */
1568 		printf("%s%s\n", clr, top_left);
1569 		printf("Device statistics =================================\n");
1570 
1571 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1572 			tx_total   = vdev->stats.tx_total;
1573 			tx         = vdev->stats.tx;
1574 			tx_dropped = tx_total - tx;
1575 
1576 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1577 				__ATOMIC_SEQ_CST);
1578 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1579 				__ATOMIC_SEQ_CST);
1580 			rx_dropped = rx_total - rx;
1581 
1582 			printf("Statistics for device %d\n"
1583 				"-----------------------\n"
1584 				"TX total:              %" PRIu64 "\n"
1585 				"TX dropped:            %" PRIu64 "\n"
1586 				"TX successful:         %" PRIu64 "\n"
1587 				"RX total:              %" PRIu64 "\n"
1588 				"RX dropped:            %" PRIu64 "\n"
1589 				"RX successful:         %" PRIu64 "\n",
1590 				vdev->vid,
1591 				tx_total, tx_dropped, tx,
1592 				rx_total, rx_dropped, rx);
1593 		}
1594 
1595 		printf("===================================================\n");
1596 
1597 		fflush(stdout);
1598 	}
1599 
1600 	return NULL;
1601 }
1602 
1603 static void
1604 unregister_drivers(int socket_num)
1605 {
1606 	int i, ret;
1607 
1608 	for (i = 0; i < socket_num; i++) {
1609 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1610 		if (ret != 0)
1611 			RTE_LOG(ERR, VHOST_CONFIG,
1612 				"Failed to unregister vhost driver for %s.\n",
1613 				socket_files + i * PATH_MAX);
1614 	}
1615 }
1616 
1617 /* When we receive an INT signal, unregister the vhost driver. */
1618 static void
1619 sigint_handler(__rte_unused int signum)
1620 {
1621 	/* Unregister vhost driver. */
1622 	unregister_drivers(nb_sockets);
1623 
1624 	exit(0);
1625 }
1626 
1627 /*
1628  * While creating an mbuf pool, one key thing is to figure out how
1629  * many mbuf entries are enough for our use. FYI, here are some
1630  * guidelines:
1631  *
1632  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1633  *
1634  * - For each switch core (a CPU core that does the packet switching), we
1635  *   also need to make some reservation for receiving the packets from the
1636  *   virtio Tx queue. How many is enough depends on the usage. It's
1637  *   normally a simple calculation like the following:
1638  *
1639  *       MAX_PKT_BURST * max packet size / mbuf size
1640  *
1641  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1642  *
1643  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1644  *   mbufs for receiving the packets from the physical NIC device.
1645  *
1646  * - We also need to make sure that, for each switch core, we have allocated
1647  *   enough mbufs to fill up the mbuf cache.
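 *
 * Worked example (illustrative, assuming MAX_PKT_BURST is 32 and the
 * default mbuf data size of 2176 bytes with 128 bytes of headroom): with
 * mergeable buffers enabled (mtu = 9000), nr_mbufs_per_core comes to
 * (9000 + 2176) * 32 / (2176 - 128) ~= 174 mbufs, plus @nr_rx_desc, before
 * the cache-size floor is applied.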
1648  */
1649 static void
1650 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1651 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1652 {
1653 	uint32_t nr_mbufs;
1654 	uint32_t nr_mbufs_per_core;
1655 	uint32_t mtu = 1500;
1656 
1657 	if (mergeable)
1658 		mtu = 9000;
1659 	if (enable_tso)
1660 		mtu = 64 * 1024;
1661 
1662 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1663 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1664 	nr_mbufs_per_core += nr_rx_desc;
1665 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1666 
1667 	nr_mbufs  = nr_queues * nr_rx_desc;
1668 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1669 	nr_mbufs *= nr_port;
1670 
1671 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1672 					    nr_mbuf_cache, 0, mbuf_size,
1673 					    rte_socket_id());
1674 	if (mbuf_pool == NULL)
1675 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1676 }
1677 
1678 /*
1679  * Main function, does initialisation and calls the per-lcore functions.
1680  */
1681 int
1682 main(int argc, char *argv[])
1683 {
1684 	unsigned lcore_id, core_id = 0;
1685 	unsigned nb_ports, valid_num_ports;
1686 	int ret, i;
1687 	uint16_t portid;
1688 	static pthread_t tid;
1689 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1690 
1691 	signal(SIGINT, sigint_handler);
1692 
1693 	/* init EAL */
1694 	ret = rte_eal_init(argc, argv);
1695 	if (ret < 0)
1696 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1697 	argc -= ret;
1698 	argv += ret;
1699 
1700 	/* parse app arguments */
1701 	ret = us_vhost_parse_args(argc, argv);
1702 	if (ret < 0)
1703 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1704 
1705 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1706 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1707 
1708 		if (rte_lcore_is_enabled(lcore_id))
1709 			lcore_ids[core_id++] = lcore_id;
1710 	}
1711 
1712 	if (rte_lcore_count() > RTE_MAX_LCORE)
1713 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1714 
1715 	/* Get the number of physical ports. */
1716 	nb_ports = rte_eth_dev_count_avail();
1717 
1718 	/*
1719 	 * Update the global var NUM_PORTS and the global array PORTS,
1720 	 * and get the value of VALID_NUM_PORTS according to the number of system ports.
1721 	 */
1722 	valid_num_ports = check_ports_num(nb_ports);
1723 
1724 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1725 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1726 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1727 		return -1;
1728 	}
1729 
1730 	/*
1731 	 * FIXME: here we are trying to allocate enough mbufs for @MAX_QUEUES,
1732 	 * but the truth is we're never going to use that many queues here.
1733 	 * We should probably only do the allocation for those queues we are
1734 	 * actually going to use.
1735 	 */
1736 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1737 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1738 
1739 	if (vm2vm_mode == VM2VM_HARDWARE) {
1740 		/* Enable VT loop back to let L2 switch to do it. */
1741 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1742 		RTE_LOG(DEBUG, VHOST_CONFIG,
1743 			"Enable loop back for L2 switch in vmdq.\n");
1744 	}
1745 
1746 	/* initialize all ports */
1747 	RTE_ETH_FOREACH_DEV(portid) {
1748 		/* skip ports that are not enabled */
1749 		if ((enabled_port_mask & (1 << portid)) == 0) {
1750 			RTE_LOG(INFO, VHOST_PORT,
1751 				"Skipping disabled port %d\n", portid);
1752 			continue;
1753 		}
1754 		if (port_init(portid) != 0)
1755 			rte_exit(EXIT_FAILURE,
1756 				"Cannot initialize network ports\n");
1757 	}
1758 
1759 	/* Enable stats if the user option is set. */
1760 	if (enable_stats) {
1761 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1762 					print_stats, NULL);
1763 		if (ret < 0)
1764 			rte_exit(EXIT_FAILURE,
1765 				"Cannot create print-stats thread\n");
1766 	}
1767 
1768 	/* Launch all data cores. */
1769 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1770 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1771 
1772 	if (client_mode)
1773 		flags |= RTE_VHOST_USER_CLIENT;
1774 
1775 	/* Register vhost user driver to handle vhost messages. */
1776 	for (i = 0; i < nb_sockets; i++) {
1777 		char *file = socket_files + i * PATH_MAX;
1778 
1779 		if (async_vhost_driver)
1780 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1781 
1782 		ret = rte_vhost_driver_register(file, flags);
1783 		if (ret != 0) {
1784 			unregister_drivers(i);
1785 			rte_exit(EXIT_FAILURE,
1786 				"vhost driver register failure.\n");
1787 		}
1788 
1789 		if (builtin_net_driver)
1790 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1791 
1792 		if (mergeable == 0) {
1793 			rte_vhost_driver_disable_features(file,
1794 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1795 		}
1796 
1797 		if (enable_tx_csum == 0) {
1798 			rte_vhost_driver_disable_features(file,
1799 				1ULL << VIRTIO_NET_F_CSUM);
1800 		}
1801 
1802 		if (enable_tso == 0) {
1803 			rte_vhost_driver_disable_features(file,
1804 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1805 			rte_vhost_driver_disable_features(file,
1806 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1807 			rte_vhost_driver_disable_features(file,
1808 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1809 			rte_vhost_driver_disable_features(file,
1810 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1811 		}
1812 
1813 		if (promiscuous) {
1814 			rte_vhost_driver_enable_features(file,
1815 				1ULL << VIRTIO_NET_F_CTRL_RX);
1816 		}
1817 
1818 		ret = rte_vhost_driver_callback_register(file,
1819 			&virtio_net_device_ops);
1820 		if (ret != 0) {
1821 			rte_exit(EXIT_FAILURE,
1822 				"failed to register vhost driver callbacks.\n");
1823 		}
1824 
1825 		if (rte_vhost_driver_start(file) < 0) {
1826 			rte_exit(EXIT_FAILURE,
1827 				"failed to start vhost driver.\n");
1828 		}
1829 	}
1830 
1831 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1832 		rte_eal_wait_lcore(lcore_id);
1833 
1834 	/* clean up the EAL */
1835 	rte_eal_cleanup();
1836 
1837 	return 0;
1838 }
1839