xref: /dpdk/examples/vhost/main.c (revision 6c02043e9967a9d8f6e8c058256e257efe1d6d1a)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_vhost.h>
23 #include <rte_ip.h>
24 #include <rte_tcp.h>
25 #include <rte_pause.h>
26 
27 #include "ioat.h"
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60 
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63 
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66 
67 /* number of devices/queues to support */
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70 
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73 
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76 	VM2VM_DISABLED = 0,
77 	VM2VM_SOFTWARE = 1,
78 	VM2VM_HARDWARE = 2,
79 	VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82 
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87 
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90 
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93 
94 static int client_mode;
95 
96 static int builtin_net_driver;
97 
98 static int async_vhost_driver;
99 
100 static char dma_type[MAX_LONG_OPT_SZ];
101 
102 /* Specify the timeout (in microseconds) between retries on RX. */
103 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
104 /* Specify the number of retries on RX. */
105 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
106 
107 /* Socket file paths. Can be set by user */
108 static char *socket_files;
109 static int nb_sockets;
110 
111 /* empty vmdq configuration structure. Filled in programmatically */
112 static struct rte_eth_conf vmdq_conf_default = {
113 	.rxmode = {
114 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
115 		.split_hdr_size = 0,
116 		/*
117 		 * VLAN strip is necessary for 1G NICs such as I350;
118 		 * it fixes a bug where IPv4 forwarding in the guest can't
119 		 * forward packets from one virtio dev to another virtio dev.
120 		 */
121 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
122 	},
123 
124 	.txmode = {
125 		.mq_mode = ETH_MQ_TX_NONE,
126 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
127 			     DEV_TX_OFFLOAD_TCP_CKSUM |
128 			     DEV_TX_OFFLOAD_VLAN_INSERT |
129 			     DEV_TX_OFFLOAD_MULTI_SEGS |
130 			     DEV_TX_OFFLOAD_TCP_TSO),
131 	},
132 	.rx_adv_conf = {
133 		/*
134 		 * should be overridden separately in code with
135 		 * appropriate values
136 		 */
137 		.vmdq_rx_conf = {
138 			.nb_queue_pools = ETH_8_POOLS,
139 			.enable_default_pool = 0,
140 			.default_pool = 0,
141 			.nb_pool_maps = 0,
142 			.pool_map = {{0, 0},},
143 		},
144 	},
145 };
146 
147 
148 static unsigned lcore_ids[RTE_MAX_LCORE];
149 static uint16_t ports[RTE_MAX_ETHPORTS];
150 static unsigned num_ports = 0; /**< The number of ports specified in command line */
151 static uint16_t num_pf_queues, num_vmdq_queues;
152 static uint16_t vmdq_pool_base, vmdq_queue_base;
153 static uint16_t queues_per_pool;
154 
155 const uint16_t vlan_tags[] = {
156 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
157 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
158 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
159 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
160 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
161 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
162 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
163 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
164 };
165 
166 /* ethernet addresses of ports */
167 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
168 
169 static struct vhost_dev_tailq_list vhost_dev_list =
170 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
171 
172 static struct lcore_info lcore_info[RTE_MAX_LCORE];
173 
174 /* Used for queueing bursts of TX packets. */
175 struct mbuf_table {
176 	unsigned len;
177 	unsigned txq_id;
178 	struct rte_mbuf *m_table[MAX_PKT_BURST];
179 };
180 
181 struct vhost_bufftable {
182 	uint32_t len;
183 	uint64_t pre_tsc;
184 	struct rte_mbuf *m_table[MAX_PKT_BURST];
185 };
186 
187 /* TX queue for each data core. */
188 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
189 
190 /*
191  * Vhost TX buffer for each data core.
192  * Every data core maintains a TX buffer for every vhost device,
193  * which is used to batch packet enqueues for higher performance.
194  */
195 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
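/*
 * For illustration: the flat table above is indexed the same way the
 * look-ups in drain_vhost() and virtio_tx_local() below compute it, i.e.
 * the buffer for lcore L and vhost device V lives at
 *
 *     vhost_txbuff[L * MAX_VHOST_DEVICE + V]
 *
 * so each (lcore, device) pair gets its own burst buffer.
 */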
196 
197 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
198 				 / US_PER_S * BURST_TX_DRAIN_US)
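/*
 * For illustration: MBUF_TABLE_DRAIN_TSC converts BURST_TX_DRAIN_US into TSC
 * cycles. Assuming, for example, a 2.0 GHz TSC, this gives
 * ((2e9 + 1e6 - 1) / 1e6) * 100 = 200000 cycles, i.e. the TX mbuf table is
 * drained roughly every 100 us even when bursts stay below MAX_PKT_BURST.
 */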
199 #define VLAN_HLEN       4
200 
201 static inline int
202 open_dma(const char *value)
203 {
204 	if (strncmp(dma_type, "ioat", 4) == 0)
205 		return open_ioat(value);
206 
207 	return -1;
208 }
209 
210 /*
211  * Builds up the correct configuration for VMDQ VLAN pool map
212  * according to the pool & queue limits.
213  */
214 static inline int
215 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
216 {
217 	struct rte_eth_vmdq_rx_conf conf;
218 	struct rte_eth_vmdq_rx_conf *def_conf =
219 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
220 	unsigned i;
221 
222 	memset(&conf, 0, sizeof(conf));
223 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
224 	conf.nb_pool_maps = num_devices;
225 	conf.enable_loop_back = def_conf->enable_loop_back;
226 	conf.rx_mode = def_conf->rx_mode;
227 
228 	for (i = 0; i < conf.nb_pool_maps; i++) {
229 		conf.pool_map[i].vlan_id = vlan_tags[i];
230 		conf.pool_map[i].pools = (1UL << i);
231 	}
232 
233 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
234 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
235 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
236 	return 0;
237 }
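
/*
 * Example of the pool map that get_eth_conf() builds, assuming for
 * illustration that num_devices == 8:
 *
 *     pool_map[0] = { .vlan_id = 1000, .pools = 0x01 }
 *     pool_map[1] = { .vlan_id = 1001, .pools = 0x02 }
 *     ...
 *     pool_map[7] = { .vlan_id = 1007, .pools = 0x80 }
 *
 * i.e. VMDQ pool i only receives traffic tagged with vlan_tags[i].
 */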
238 
239 /*
240  * Initialises a given port using global settings and with the rx buffers
241  * coming from the mbuf_pool passed as a parameter
242  */
243 static inline int
244 port_init(uint16_t port)
245 {
246 	struct rte_eth_dev_info dev_info;
247 	struct rte_eth_conf port_conf;
248 	struct rte_eth_rxconf *rxconf;
249 	struct rte_eth_txconf *txconf;
250 	int16_t rx_rings, tx_rings;
251 	uint16_t rx_ring_size, tx_ring_size;
252 	int retval;
253 	uint16_t q;
254 
255 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
256 	retval = rte_eth_dev_info_get(port, &dev_info);
257 	if (retval != 0) {
258 		RTE_LOG(ERR, VHOST_PORT,
259 			"Error during getting device (port %u) info: %s\n",
260 			port, strerror(-retval));
261 
262 		return retval;
263 	}
264 
265 	rxconf = &dev_info.default_rxconf;
266 	txconf = &dev_info.default_txconf;
267 	rxconf->rx_drop_en = 1;
268 
269 	/* configure the number of supported virtio devices based on VMDQ limits */
270 	num_devices = dev_info.max_vmdq_pools;
271 
272 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
273 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
274 
275 	tx_rings = (uint16_t)rte_lcore_count();
276 
277 	/* Get port configuration. */
278 	retval = get_eth_conf(&port_conf, num_devices);
279 	if (retval < 0)
280 		return retval;
281 	/* NIC queues are divided into pf queues and vmdq queues.  */
282 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
283 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
284 	num_vmdq_queues = num_devices * queues_per_pool;
285 	num_queues = num_pf_queues + num_vmdq_queues;
286 	vmdq_queue_base = dev_info.vmdq_queue_base;
287 	vmdq_pool_base  = dev_info.vmdq_pool_base;
288 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
289 		num_pf_queues, num_devices, queues_per_pool);
290 
291 	if (!rte_eth_dev_is_valid_port(port))
292 		return -1;
293 
294 	rx_rings = (uint16_t)dev_info.max_rx_queues;
295 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
296 		port_conf.txmode.offloads |=
297 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
298 	/* Configure ethernet device. */
299 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
300 	if (retval != 0) {
301 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
302 			port, strerror(-retval));
303 		return retval;
304 	}
305 
306 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
307 		&tx_ring_size);
308 	if (retval != 0) {
309 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
310 			"for port %u: %s.\n", port, strerror(-retval));
311 		return retval;
312 	}
313 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
314 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
315 			"for Rx queues on port %u.\n", port);
316 		return -1;
317 	}
318 
319 	/* Setup the queues. */
320 	rxconf->offloads = port_conf.rxmode.offloads;
321 	for (q = 0; q < rx_rings; q++) {
322 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
323 						rte_eth_dev_socket_id(port),
324 						rxconf,
325 						mbuf_pool);
326 		if (retval < 0) {
327 			RTE_LOG(ERR, VHOST_PORT,
328 				"Failed to setup rx queue %u of port %u: %s.\n",
329 				q, port, strerror(-retval));
330 			return retval;
331 		}
332 	}
333 	txconf->offloads = port_conf.txmode.offloads;
334 	for (q = 0; q < tx_rings; q++) {
335 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
336 						rte_eth_dev_socket_id(port),
337 						txconf);
338 		if (retval < 0) {
339 			RTE_LOG(ERR, VHOST_PORT,
340 				"Failed to setup tx queue %u of port %u: %s.\n",
341 				q, port, strerror(-retval));
342 			return retval;
343 		}
344 	}
345 
346 	/* Start the device. */
347 	retval  = rte_eth_dev_start(port);
348 	if (retval < 0) {
349 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
350 			port, strerror(-retval));
351 		return retval;
352 	}
353 
354 	if (promiscuous) {
355 		retval = rte_eth_promiscuous_enable(port);
356 		if (retval != 0) {
357 			RTE_LOG(ERR, VHOST_PORT,
358 				"Failed to enable promiscuous mode on port %u: %s\n",
359 				port, rte_strerror(-retval));
360 			return retval;
361 		}
362 	}
363 
364 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
365 	if (retval < 0) {
366 		RTE_LOG(ERR, VHOST_PORT,
367 			"Failed to get MAC address on port %u: %s\n",
368 			port, rte_strerror(-retval));
369 		return retval;
370 	}
371 
372 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
373 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
374 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
375 			port,
376 			vmdq_ports_eth_addr[port].addr_bytes[0],
377 			vmdq_ports_eth_addr[port].addr_bytes[1],
378 			vmdq_ports_eth_addr[port].addr_bytes[2],
379 			vmdq_ports_eth_addr[port].addr_bytes[3],
380 			vmdq_ports_eth_addr[port].addr_bytes[4],
381 			vmdq_ports_eth_addr[port].addr_bytes[5]);
382 
383 	return 0;
384 }
385 
386 /*
387  * Set socket file path.
388  */
389 static int
390 us_vhost_parse_socket_path(const char *q_arg)
391 {
392 	char *old;
393 
394 	/* reject socket paths that do not fit in PATH_MAX */
395 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
396 		return -1;
397 
398 	old = socket_files;
399 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
400 	if (socket_files == NULL) {
401 		free(old);
402 		return -1;
403 	}
404 
405 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
406 	nb_sockets++;
407 
408 	return 0;
409 }
410 
411 /*
412  * Parse the portmask provided at run time.
413  */
414 static int
415 parse_portmask(const char *portmask)
416 {
417 	char *end = NULL;
418 	unsigned long pm;
419 
420 	errno = 0;
421 
422 	/* parse hexadecimal string */
423 	pm = strtoul(portmask, &end, 16);
424 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
425 		return 0;
426 
427 	return pm;
428 
429 }
430 
431 /*
432  * Parse num options at run time.
433  */
434 static int
435 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
436 {
437 	char *end = NULL;
438 	unsigned long num;
439 
440 	errno = 0;
441 
442 	/* parse unsigned int string */
443 	num = strtoul(q_arg, &end, 10);
444 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
445 		return -1;
446 
447 	if (num > max_valid_value)
448 		return -1;
449 
450 	return num;
451 
452 }
453 
454 /*
455  * Display usage
456  */
457 static void
458 us_vhost_usage(const char *prgname)
459 {
460 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
461 	"		--vm2vm [0|1|2]\n"
462 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
463 	"		--socket-file <path>\n"
464 	"		--nb-devices ND\n"
465 	"		-p PORTMASK: Set mask for ports to be used by application\n"
466 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
467 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
468 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if retries on rx are enabled\n"
469 	"		--rx-retry-num [0-N]: the number of retries on rx. This takes effect only if retries on rx are enabled\n"
470 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
471 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
472 	"		--socket-file: The path of the socket file.\n"
473 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
474 	"		--tso [0|1] disable/enable TCP segment offload.\n"
475 	"		--client register a vhost-user socket in client mode.\n"
476 	"		--dma-type register the DMA type for the vhost async driver. For example \"ioat\" for now.\n"
477 	"		--dmas register a DMA channel for a specific vhost device.\n",
478 	       prgname);
479 }
480 
481 /*
482  * Parse the arguments given in the command line of the application.
483  */
484 static int
485 us_vhost_parse_args(int argc, char **argv)
486 {
487 	int opt, ret;
488 	int option_index;
489 	unsigned i;
490 	const char *prgname = argv[0];
491 	static struct option long_option[] = {
492 		{"vm2vm", required_argument, NULL, 0},
493 		{"rx-retry", required_argument, NULL, 0},
494 		{"rx-retry-delay", required_argument, NULL, 0},
495 		{"rx-retry-num", required_argument, NULL, 0},
496 		{"mergeable", required_argument, NULL, 0},
497 		{"stats", required_argument, NULL, 0},
498 		{"socket-file", required_argument, NULL, 0},
499 		{"tx-csum", required_argument, NULL, 0},
500 		{"tso", required_argument, NULL, 0},
501 		{"client", no_argument, &client_mode, 1},
502 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
503 		{"dma-type", required_argument, NULL, 0},
504 		{"dmas", required_argument, NULL, 0},
505 		{NULL, 0, 0, 0},
506 	};
507 
508 	/* Parse command line */
509 	while ((opt = getopt_long(argc, argv, "p:P",
510 			long_option, &option_index)) != EOF) {
511 		switch (opt) {
512 		/* Portmask */
513 		case 'p':
514 			enabled_port_mask = parse_portmask(optarg);
515 			if (enabled_port_mask == 0) {
516 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
517 				us_vhost_usage(prgname);
518 				return -1;
519 			}
520 			break;
521 
522 		case 'P':
523 			promiscuous = 1;
524 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
525 				ETH_VMDQ_ACCEPT_BROADCAST |
526 				ETH_VMDQ_ACCEPT_MULTICAST;
527 
528 			break;
529 
530 		case 0:
531 			/* Enable/disable vm2vm comms. */
532 			if (!strncmp(long_option[option_index].name, "vm2vm",
533 				MAX_LONG_OPT_SZ)) {
534 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
535 				if (ret == -1) {
536 					RTE_LOG(INFO, VHOST_CONFIG,
537 						"Invalid argument for "
538 						"vm2vm [0|1|2]\n");
539 					us_vhost_usage(prgname);
540 					return -1;
541 				} else {
542 					vm2vm_mode = (vm2vm_type)ret;
543 				}
544 			}
545 
546 			/* Enable/disable retries on RX. */
547 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
548 				ret = parse_num_opt(optarg, 1);
549 				if (ret == -1) {
550 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
551 					us_vhost_usage(prgname);
552 					return -1;
553 				} else {
554 					enable_retry = ret;
555 				}
556 			}
557 
558 			/* Enable/disable TX checksum offload. */
559 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
560 				ret = parse_num_opt(optarg, 1);
561 				if (ret == -1) {
562 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
563 					us_vhost_usage(prgname);
564 					return -1;
565 				} else
566 					enable_tx_csum = ret;
567 			}
568 
569 			/* Enable/disable TSO offload. */
570 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
571 				ret = parse_num_opt(optarg, 1);
572 				if (ret == -1) {
573 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
574 					us_vhost_usage(prgname);
575 					return -1;
576 				} else
577 					enable_tso = ret;
578 			}
579 
580 			/* Specify the retry delay time (in microseconds) on RX. */
581 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
582 				ret = parse_num_opt(optarg, INT32_MAX);
583 				if (ret == -1) {
584 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
585 					us_vhost_usage(prgname);
586 					return -1;
587 				} else {
588 					burst_rx_delay_time = ret;
589 				}
590 			}
591 
592 			/* Specify the retries number on RX. */
593 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
594 				ret = parse_num_opt(optarg, INT32_MAX);
595 				if (ret == -1) {
596 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
597 					us_vhost_usage(prgname);
598 					return -1;
599 				} else {
600 					burst_rx_retry_num = ret;
601 				}
602 			}
603 
604 			/* Enable/disable RX mergeable buffers. */
605 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
606 				ret = parse_num_opt(optarg, 1);
607 				if (ret == -1) {
608 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
609 					us_vhost_usage(prgname);
610 					return -1;
611 				} else {
612 					mergeable = !!ret;
613 					if (ret) {
614 						vmdq_conf_default.rxmode.offloads |=
615 							DEV_RX_OFFLOAD_JUMBO_FRAME;
616 						vmdq_conf_default.rxmode.max_rx_pkt_len
617 							= JUMBO_FRAME_MAX_SIZE;
618 					}
619 				}
620 			}
621 
622 			/* Enable/disable stats. */
623 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
624 				ret = parse_num_opt(optarg, INT32_MAX);
625 				if (ret == -1) {
626 					RTE_LOG(INFO, VHOST_CONFIG,
627 						"Invalid argument for stats [0..N]\n");
628 					us_vhost_usage(prgname);
629 					return -1;
630 				} else {
631 					enable_stats = ret;
632 				}
633 			}
634 
635 			/* Set socket file path. */
636 			if (!strncmp(long_option[option_index].name,
637 						"socket-file", MAX_LONG_OPT_SZ)) {
638 				if (us_vhost_parse_socket_path(optarg) == -1) {
639 					RTE_LOG(INFO, VHOST_CONFIG,
640 					"Invalid argument for socket name (Max %d characters)\n",
641 					PATH_MAX);
642 					us_vhost_usage(prgname);
643 					return -1;
644 				}
645 			}
646 
647 			if (!strncmp(long_option[option_index].name,
648 						"dma-type", MAX_LONG_OPT_SZ)) {
649 				if (strlen(optarg) >= MAX_LONG_OPT_SZ) {
650 					RTE_LOG(INFO, VHOST_CONFIG,
651 						"Wrong DMA type\n");
652 					us_vhost_usage(prgname);
653 					return -1;
654 				}
655 				strcpy(dma_type, optarg);
656 			}
657 
658 			if (!strncmp(long_option[option_index].name,
659 						"dmas", MAX_LONG_OPT_SZ)) {
660 				if (open_dma(optarg) == -1) {
661 					RTE_LOG(INFO, VHOST_CONFIG,
662 						"Wrong DMA args\n");
663 					us_vhost_usage(prgname);
664 					return -1;
665 				}
666 				async_vhost_driver = 1;
667 			}
668 
669 			break;
670 
671 			/* Invalid option - print options. */
672 		default:
673 			us_vhost_usage(prgname);
674 			return -1;
675 		}
676 	}
677 
678 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
679 		if (enabled_port_mask & (1 << i))
680 			ports[num_ports++] = i;
681 	}
682 
683 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
684 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
685 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
686 		return -1;
687 	}
688 
689 	return 0;
690 }
691 
692 /*
693  * Update the global var NUM_PORTS and the array PORTS according to the number of
694  * system ports, and return the number of valid ports.
695  */
696 static unsigned check_ports_num(unsigned nb_ports)
697 {
698 	unsigned valid_num_ports = num_ports;
699 	unsigned portid;
700 
701 	if (num_ports > nb_ports) {
702 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
703 			num_ports, nb_ports);
704 		num_ports = nb_ports;
705 	}
706 
707 	for (portid = 0; portid < num_ports; portid++) {
708 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
709 			RTE_LOG(INFO, VHOST_PORT,
710 				"\nSpecified port ID(%u) is not valid\n",
711 				ports[portid]);
712 			ports[portid] = INVALID_PORT_ID;
713 			valid_num_ports--;
714 		}
715 	}
716 	return valid_num_ports;
717 }
718 
719 static __rte_always_inline struct vhost_dev *
720 find_vhost_dev(struct rte_ether_addr *mac)
721 {
722 	struct vhost_dev *vdev;
723 
724 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
725 		if (vdev->ready == DEVICE_RX &&
726 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
727 			return vdev;
728 	}
729 
730 	return NULL;
731 }
732 
733 /*
734  * This function learns the MAC address of the device and registers it along with a
735  * VLAN tag in the VMDQ pool.
736  */
737 static int
738 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
739 {
740 	struct rte_ether_hdr *pkt_hdr;
741 	int i, ret;
742 
743 	/* Learn MAC address of guest device from packet */
744 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
745 
746 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
747 		RTE_LOG(ERR, VHOST_DATA,
748 			"(%d) device is using a registered MAC!\n",
749 			vdev->vid);
750 		return -1;
751 	}
752 
753 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
754 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
755 
756 	/* vlan_tag currently uses the device_id. */
757 	vdev->vlan_tag = vlan_tags[vdev->vid];
758 
759 	/* Print out VMDQ registration info. */
760 	RTE_LOG(INFO, VHOST_DATA,
761 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
762 		vdev->vid,
763 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
764 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
765 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
766 		vdev->vlan_tag);
767 
768 	/* Register the MAC address. */
769 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
770 				(uint32_t)vdev->vid + vmdq_pool_base);
771 	if (ret)
772 		RTE_LOG(ERR, VHOST_DATA,
773 			"(%d) failed to add device MAC address to VMDQ\n",
774 			vdev->vid);
775 
776 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
777 
778 	/* Set device as ready for RX. */
779 	vdev->ready = DEVICE_RX;
780 
781 	return 0;
782 }
783 
784 /*
785  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
786  * queue before disabling RX on the device.
787  */
788 static inline void
789 unlink_vmdq(struct vhost_dev *vdev)
790 {
791 	unsigned i = 0;
792 	unsigned rx_count;
793 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
794 
795 	if (vdev->ready == DEVICE_RX) {
796 		/*clear MAC and VLAN settings*/
797 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
798 		for (i = 0; i < 6; i++)
799 			vdev->mac_address.addr_bytes[i] = 0;
800 
801 		vdev->vlan_tag = 0;
802 
803 		/*Clear out the receive buffers*/
804 		rx_count = rte_eth_rx_burst(ports[0],
805 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
806 
807 		while (rx_count) {
808 			for (i = 0; i < rx_count; i++)
809 				rte_pktmbuf_free(pkts_burst[i]);
810 
811 			rx_count = rte_eth_rx_burst(ports[0],
812 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
813 		}
814 
815 		vdev->ready = DEVICE_MAC_LEARNING;
816 	}
817 }
818 
819 static inline void
820 free_pkts(struct rte_mbuf **pkts, uint16_t n)
821 {
822 	while (n--)
823 		rte_pktmbuf_free(pkts[n]);
824 }
825 
826 static __rte_always_inline void
827 complete_async_pkts(struct vhost_dev *vdev)
828 {
829 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
830 	uint16_t complete_count;
831 
832 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
833 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
834 	if (complete_count) {
835 		__atomic_sub_fetch(&vdev->nr_async_pkts, complete_count,
836 			__ATOMIC_SEQ_CST);
837 		free_pkts(p_cpl, complete_count);
838 	}
839 }
840 
841 static __rte_always_inline void
842 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
843 	    struct rte_mbuf *m)
844 {
845 	uint16_t ret;
846 
847 	if (builtin_net_driver) {
848 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
849 	} else {
850 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
851 	}
852 
853 	if (enable_stats) {
854 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
855 				__ATOMIC_SEQ_CST);
856 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
857 				__ATOMIC_SEQ_CST);
858 		src_vdev->stats.tx_total++;
859 		src_vdev->stats.tx += ret;
860 	}
861 }
862 
863 static __rte_always_inline void
864 drain_vhost(struct vhost_dev *vdev)
865 {
866 	uint16_t ret;
867 	uint64_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
868 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
869 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
870 
871 	if (builtin_net_driver) {
872 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
873 	} else if (async_vhost_driver) {
874 		uint32_t cpu_cpl_nr = 0;
875 		uint16_t enqueue_fail = 0;
876 		struct rte_mbuf *m_cpu_cpl[nr_xmit];
877 
878 		complete_async_pkts(vdev);
879 		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
880 					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
881 		__atomic_add_fetch(&vdev->nr_async_pkts, ret - cpu_cpl_nr,
882 				__ATOMIC_SEQ_CST);
883 
884 		if (cpu_cpl_nr)
885 			free_pkts(m_cpu_cpl, cpu_cpl_nr);
886 
887 		enqueue_fail = nr_xmit - ret;
888 		if (enqueue_fail)
889 			free_pkts(&m[ret], nr_xmit - ret);
890 	} else {
891 		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
892 						m, nr_xmit);
893 	}
894 
895 	if (enable_stats) {
896 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
897 				__ATOMIC_SEQ_CST);
898 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
899 				__ATOMIC_SEQ_CST);
900 	}
901 
902 	if (!async_vhost_driver)
903 		free_pkts(m, nr_xmit);
904 }
905 
906 static __rte_always_inline void
907 drain_vhost_table(void)
908 {
909 	uint16_t lcore_id = rte_lcore_id();
910 	struct vhost_bufftable *vhost_txq;
911 	struct vhost_dev *vdev;
912 	uint64_t cur_tsc;
913 
914 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
915 		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
916 						+ vdev->vid];
917 
918 		cur_tsc = rte_rdtsc();
919 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
920 				> MBUF_TABLE_DRAIN_TSC)) {
921 			RTE_LOG_DP(DEBUG, VHOST_DATA,
922 				"Vhost TX queue drained after timeout with burst size %u\n",
923 				vhost_txq->len);
924 			drain_vhost(vdev);
925 			vhost_txq->len = 0;
926 			vhost_txq->pre_tsc = cur_tsc;
927 		}
928 	}
929 }
930 
931 /*
932  * Check if the packet destination MAC address is for a local device. If so then put
933  * the packet on that device's RX queue. If not then return.
934  */
935 static __rte_always_inline int
936 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
937 {
938 	struct rte_ether_hdr *pkt_hdr;
939 	struct vhost_dev *dst_vdev;
940 	struct vhost_bufftable *vhost_txq;
941 	uint16_t lcore_id = rte_lcore_id();
942 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
943 
944 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
945 	if (!dst_vdev)
946 		return -1;
947 
948 	if (vdev->vid == dst_vdev->vid) {
949 		RTE_LOG_DP(DEBUG, VHOST_DATA,
950 			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
951 			vdev->vid);
952 		return 0;
953 	}
954 
955 	RTE_LOG_DP(DEBUG, VHOST_DATA,
956 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
957 
958 	if (unlikely(dst_vdev->remove)) {
959 		RTE_LOG_DP(DEBUG, VHOST_DATA,
960 			"(%d) device is marked for removal\n", dst_vdev->vid);
961 		return 0;
962 	}
963 
964 	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
965 	vhost_txq->m_table[vhost_txq->len++] = m;
966 
967 	if (enable_stats) {
968 		vdev->stats.tx_total++;
969 		vdev->stats.tx++;
970 	}
971 
972 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
973 		drain_vhost(dst_vdev);
974 		vhost_txq->len = 0;
975 		vhost_txq->pre_tsc = rte_rdtsc();
976 	}
977 	return 0;
978 }
979 
980 /*
981  * Check if the destination MAC of a packet belongs to a local VM,
982  * and if so, get its VLAN tag and the length offset.
983  */
984 static __rte_always_inline int
985 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
986 	uint32_t *offset, uint16_t *vlan_tag)
987 {
988 	struct vhost_dev *dst_vdev;
989 	struct rte_ether_hdr *pkt_hdr =
990 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
991 
992 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
993 	if (!dst_vdev)
994 		return 0;
995 
996 	if (vdev->vid == dst_vdev->vid) {
997 		RTE_LOG_DP(DEBUG, VHOST_DATA,
998 			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
999 			vdev->vid);
1000 		return -1;
1001 	}
1002 
1003 	/*
1004 	 * HW VLAN strip will reduce the packet length
1005 	 * by the length of the VLAN tag, so we need to
1006 	 * restore the packet length by adding it back.
1007 	 */
1008 	*offset  = VLAN_HLEN;
1009 	*vlan_tag = vlan_tags[vdev->vid];
1010 
1011 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1012 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1013 		vdev->vid, dst_vdev->vid, *vlan_tag);
1014 
1015 	return 0;
1016 }
1017 
1018 static uint16_t
1019 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1020 {
1021 	if (ol_flags & PKT_TX_IPV4)
1022 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1023 	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1024 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1025 }
1026 
1027 static void virtio_tx_offload(struct rte_mbuf *m)
1028 {
1029 	void *l3_hdr;
1030 	struct rte_ipv4_hdr *ipv4_hdr = NULL;
1031 	struct rte_tcp_hdr *tcp_hdr = NULL;
1032 	struct rte_ether_hdr *eth_hdr =
1033 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1034 
1035 	l3_hdr = (char *)eth_hdr + m->l2_len;
1036 
1037 	if (m->ol_flags & PKT_TX_IPV4) {
1038 		ipv4_hdr = l3_hdr;
1039 		ipv4_hdr->hdr_checksum = 0;
1040 		m->ol_flags |= PKT_TX_IP_CKSUM;
1041 	}
1042 
1043 	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
1044 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1045 }
1046 
1047 static __rte_always_inline void
1048 do_drain_mbuf_table(struct mbuf_table *tx_q)
1049 {
1050 	uint16_t count;
1051 
1052 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1053 				 tx_q->m_table, tx_q->len);
1054 	if (unlikely(count < tx_q->len))
1055 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1056 
1057 	tx_q->len = 0;
1058 }
1059 
1060 /*
1061  * This function routes the TX packet to the correct interface. This
1062  * may be a local device or the physical port.
1063  */
1064 static __rte_always_inline void
1065 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1066 {
1067 	struct mbuf_table *tx_q;
1068 	unsigned offset = 0;
1069 	const uint16_t lcore_id = rte_lcore_id();
1070 	struct rte_ether_hdr *nh;
1071 
1072 
1073 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1074 	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1075 		struct vhost_dev *vdev2;
1076 
1077 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1078 			if (vdev2 != vdev)
1079 				sync_virtio_xmit(vdev2, vdev, m);
1080 		}
1081 		goto queue2nic;
1082 	}
1083 
1084 	/*check if destination is local VM*/
1085 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1086 		return;
1087 
1088 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1089 		if (unlikely(find_local_dest(vdev, m, &offset,
1090 					     &vlan_tag) != 0)) {
1091 			rte_pktmbuf_free(m);
1092 			return;
1093 		}
1094 	}
1095 
1096 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1097 		"(%d) TX: MAC address is external\n", vdev->vid);
1098 
1099 queue2nic:
1100 
1101 	/*Add packet to the port tx queue*/
1102 	tx_q = &lcore_tx_queue[lcore_id];
1103 
1104 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1105 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1106 		/* Guest has inserted the vlan tag. */
1107 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1108 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1109 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1110 			(vh->vlan_tci != vlan_tag_be))
1111 			vh->vlan_tci = vlan_tag_be;
1112 	} else {
1113 		m->ol_flags |= PKT_TX_VLAN_PKT;
1114 
1115 		/*
1116 		 * Find the right seg to adjust the data len when offset is
1117 		 * bigger than tail room size.
1118 		 */
1119 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1120 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1121 				m->data_len += offset;
1122 			else {
1123 				struct rte_mbuf *seg = m;
1124 
1125 				while ((seg->next != NULL) &&
1126 					(offset > rte_pktmbuf_tailroom(seg)))
1127 					seg = seg->next;
1128 
1129 				seg->data_len += offset;
1130 			}
1131 			m->pkt_len += offset;
1132 		}
1133 
1134 		m->vlan_tci = vlan_tag;
1135 	}
1136 
1137 	if (m->ol_flags & PKT_TX_TCP_SEG)
1138 		virtio_tx_offload(m);
1139 
1140 	tx_q->m_table[tx_q->len++] = m;
1141 	if (enable_stats) {
1142 		vdev->stats.tx_total++;
1143 		vdev->stats.tx++;
1144 	}
1145 
1146 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1147 		do_drain_mbuf_table(tx_q);
1148 }
1149 
1150 
1151 static __rte_always_inline void
1152 drain_mbuf_table(struct mbuf_table *tx_q)
1153 {
1154 	static uint64_t prev_tsc;
1155 	uint64_t cur_tsc;
1156 
1157 	if (tx_q->len == 0)
1158 		return;
1159 
1160 	cur_tsc = rte_rdtsc();
1161 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1162 		prev_tsc = cur_tsc;
1163 
1164 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1165 			"TX queue drained after timeout with burst size %u\n",
1166 			tx_q->len);
1167 		do_drain_mbuf_table(tx_q);
1168 	}
1169 }
1170 
1171 static __rte_always_inline void
1172 drain_eth_rx(struct vhost_dev *vdev)
1173 {
1174 	uint16_t rx_count, enqueue_count;
1175 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1176 
1177 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1178 				    pkts, MAX_PKT_BURST);
1179 
1180 	if (!rx_count)
1181 		return;
1182 
1183 	/*
1184 	 * When "enable_retry" is set, we wait and retry when there
1185 	 * are not enough free slots in the queue to hold @rx_count packets,
1186 	 * to diminish packet loss.
1187 	 */
1188 	if (enable_retry &&
1189 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1190 			VIRTIO_RXQ))) {
1191 		uint32_t retry;
1192 
1193 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1194 			rte_delay_us(burst_rx_delay_time);
1195 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1196 					VIRTIO_RXQ))
1197 				break;
1198 		}
1199 	}
1200 
1201 	if (builtin_net_driver) {
1202 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1203 						pkts, rx_count);
1204 	} else if (async_vhost_driver) {
1205 		uint32_t cpu_cpl_nr = 0;
1206 		uint16_t enqueue_fail = 0;
1207 		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
1208 
1209 		complete_async_pkts(vdev);
1210 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1211 					VIRTIO_RXQ, pkts, rx_count,
1212 					m_cpu_cpl, &cpu_cpl_nr);
1213 		__atomic_add_fetch(&vdev->nr_async_pkts,
1214 					enqueue_count - cpu_cpl_nr,
1215 					__ATOMIC_SEQ_CST);
1216 		if (cpu_cpl_nr)
1217 			free_pkts(m_cpu_cpl, cpu_cpl_nr);
1218 
1219 		enqueue_fail = rx_count - enqueue_count;
1220 		if (enqueue_fail)
1221 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1222 
1223 	} else {
1224 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1225 						pkts, rx_count);
1226 	}
1227 
1228 	if (enable_stats) {
1229 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1230 				__ATOMIC_SEQ_CST);
1231 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1232 				__ATOMIC_SEQ_CST);
1233 	}
1234 
1235 	if (!async_vhost_driver)
1236 		free_pkts(pkts, rx_count);
1237 }
1238 
1239 static __rte_always_inline void
1240 drain_virtio_tx(struct vhost_dev *vdev)
1241 {
1242 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1243 	uint16_t count;
1244 	uint16_t i;
1245 
1246 	if (builtin_net_driver) {
1247 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1248 					pkts, MAX_PKT_BURST);
1249 	} else {
1250 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1251 					mbuf_pool, pkts, MAX_PKT_BURST);
1252 	}
1253 
1254 	/* setup VMDq for the first packet */
1255 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1256 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1257 			free_pkts(pkts, count);
1258 	}
1259 
1260 	for (i = 0; i < count; ++i)
1261 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1262 }
1263 
1264 /*
1265  * Main function of vhost-switch. It basically does:
1266  *
1267  * for each vhost device {
1268  *    - drain_eth_rx()
1269  *
1270  *      Which drains the host eth Rx queue linked to the vhost device
1271  *      and delivers all of the packets to the guest virtio Rx ring
1272  *      associated with this vhost device.
1273  *
1274  *    - drain_virtio_tx()
1275  *
1276  *      Which drains the guest virtio Tx queue and delivers all of the
1277  *      packets to the target, which could be another vhost device or the
1278  *      physical eth dev. The routing is done in function "virtio_tx_route".
1279  * }
1280  */
1281 static int
1282 switch_worker(void *arg __rte_unused)
1283 {
1284 	unsigned i;
1285 	unsigned lcore_id = rte_lcore_id();
1286 	struct vhost_dev *vdev;
1287 	struct mbuf_table *tx_q;
1288 
1289 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1290 
1291 	tx_q = &lcore_tx_queue[lcore_id];
1292 	for (i = 0; i < rte_lcore_count(); i++) {
1293 		if (lcore_ids[i] == lcore_id) {
1294 			tx_q->txq_id = i;
1295 			break;
1296 		}
1297 	}
1298 
1299 	while(1) {
1300 		drain_mbuf_table(tx_q);
1301 		drain_vhost_table();
1302 		/*
1303 		 * Inform the configuration core that we have exited the
1304 		 * linked list and that no devices are in use if requested.
1305 		 */
1306 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1307 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1308 
1309 		/*
1310 		 * Process vhost devices
1311 		 */
1312 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1313 			      lcore_vdev_entry) {
1314 			if (unlikely(vdev->remove)) {
1315 				unlink_vmdq(vdev);
1316 				vdev->ready = DEVICE_SAFE_REMOVE;
1317 				continue;
1318 			}
1319 
1320 			if (likely(vdev->ready == DEVICE_RX))
1321 				drain_eth_rx(vdev);
1322 
1323 			if (likely(!vdev->remove))
1324 				drain_virtio_tx(vdev);
1325 		}
1326 	}
1327 
1328 	return 0;
1329 }
1330 
1331 /*
1332  * Remove a device from the specific data core linked list and from the
1333  * main linked list. Synchronization occurs through the use of the
1334  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1335  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1336  */
1337 static void
1338 destroy_device(int vid)
1339 {
1340 	struct vhost_dev *vdev = NULL;
1341 	int lcore;
1342 	uint16_t i;
1343 
1344 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1345 		if (vdev->vid == vid)
1346 			break;
1347 	}
1348 	if (!vdev)
1349 		return;
1350 	/*set the remove flag. */
1351 	vdev->remove = 1;
1352 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1353 		rte_pause();
1354 	}
1355 
1356 	for (i = 0; i < RTE_MAX_LCORE; i++)
1357 		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1358 
1359 	if (builtin_net_driver)
1360 		vs_vhost_net_remove(vdev);
1361 
1362 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1363 		     lcore_vdev_entry);
1364 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1365 
1366 
1367 	/* Set the dev_removal_flag on each lcore. */
1368 	RTE_LCORE_FOREACH_WORKER(lcore)
1369 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1370 
1371 	/*
1372 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1373 	 * we can be sure that they can no longer access the device removed
1374 	 * from the linked lists and that the devices are no longer in use.
1375 	 */
1376 	RTE_LCORE_FOREACH_WORKER(lcore) {
1377 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1378 			rte_pause();
1379 	}
1380 
1381 	lcore_info[vdev->coreid].device_num--;
1382 
1383 	RTE_LOG(INFO, VHOST_DATA,
1384 		"(%d) device has been removed from data core\n",
1385 		vdev->vid);
1386 
1387 	if (async_vhost_driver)
1388 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1389 
1390 	rte_free(vdev);
1391 }
1392 
1393 /*
1394  * A new device is added to a data core. First the device is added to the main linked list
1395  * and then allocated to a specific data core.
1396  */
1397 static int
1398 new_device(int vid)
1399 {
1400 	int lcore, core_add = 0;
1401 	uint16_t i;
1402 	uint32_t device_num_min = num_devices;
1403 	struct vhost_dev *vdev;
1404 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1405 	if (vdev == NULL) {
1406 		RTE_LOG(INFO, VHOST_DATA,
1407 			"(%d) couldn't allocate memory for vhost dev\n",
1408 			vid);
1409 		return -1;
1410 	}
1411 	vdev->vid = vid;
1412 
1413 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1414 		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1415 			= rte_zmalloc("vhost bufftable",
1416 				sizeof(struct vhost_bufftable),
1417 				RTE_CACHE_LINE_SIZE);
1418 
1419 		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1420 			RTE_LOG(INFO, VHOST_DATA,
1421 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1422 			return -1;
1423 		}
1424 	}
1425 
1426 	if (builtin_net_driver)
1427 		vs_vhost_net_setup(vdev);
1428 
1429 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1430 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1431 
1432 	/*reset ready flag*/
1433 	vdev->ready = DEVICE_MAC_LEARNING;
1434 	vdev->remove = 0;
1435 
1436 	/* Find a suitable lcore to add the device. */
1437 	RTE_LCORE_FOREACH_WORKER(lcore) {
1438 		if (lcore_info[lcore].device_num < device_num_min) {
1439 			device_num_min = lcore_info[lcore].device_num;
1440 			core_add = lcore;
1441 		}
1442 	}
1443 	vdev->coreid = core_add;
1444 
1445 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1446 			  lcore_vdev_entry);
1447 	lcore_info[vdev->coreid].device_num++;
1448 
1449 	/* Disable notifications. */
1450 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1451 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1452 
1453 	RTE_LOG(INFO, VHOST_DATA,
1454 		"(%d) device has been added to data core %d\n",
1455 		vid, vdev->coreid);
1456 
1457 	if (async_vhost_driver) {
1458 		struct rte_vhost_async_features f;
1459 		struct rte_vhost_async_channel_ops channel_ops;
1460 
1461 		if (strncmp(dma_type, "ioat", 4) == 0) {
1462 			channel_ops.transfer_data = ioat_transfer_data_cb;
1463 			channel_ops.check_completed_copies =
1464 				ioat_check_completed_copies_cb;
1465 
1466 			f.async_inorder = 1;
1467 			f.async_threshold = 256;
1468 
1469 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1470 				f.intval, &channel_ops);
1471 		}
1472 	}
1473 
1474 	return 0;
1475 }
1476 
1477 /*
1478  * These callbacks allow devices to be added to the data core when configuration
1479  * has been fully completed.
1480  */
1481 static const struct vhost_device_ops virtio_net_device_ops =
1482 {
1483 	.new_device =  new_device,
1484 	.destroy_device = destroy_device,
1485 };
1486 
1487 /*
1488  * This is a thread that wakes up after a period to print stats if the user has
1489  * enabled them.
1490  */
1491 static void *
1492 print_stats(__rte_unused void *arg)
1493 {
1494 	struct vhost_dev *vdev;
1495 	uint64_t tx_dropped, rx_dropped;
1496 	uint64_t tx, tx_total, rx, rx_total;
1497 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1498 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1499 
1500 	while(1) {
1501 		sleep(enable_stats);
1502 
1503 		/* Clear screen and move to top left */
1504 		printf("%s%s\n", clr, top_left);
1505 		printf("Device statistics =================================\n");
1506 
1507 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1508 			tx_total   = vdev->stats.tx_total;
1509 			tx         = vdev->stats.tx;
1510 			tx_dropped = tx_total - tx;
1511 
1512 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1513 				__ATOMIC_SEQ_CST);
1514 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1515 				__ATOMIC_SEQ_CST);
1516 			rx_dropped = rx_total - rx;
1517 
1518 			printf("Statistics for device %d\n"
1519 				"-----------------------\n"
1520 				"TX total:              %" PRIu64 "\n"
1521 				"TX dropped:            %" PRIu64 "\n"
1522 				"TX successful:         %" PRIu64 "\n"
1523 				"RX total:              %" PRIu64 "\n"
1524 				"RX dropped:            %" PRIu64 "\n"
1525 				"RX successful:         %" PRIu64 "\n",
1526 				vdev->vid,
1527 				tx_total, tx_dropped, tx,
1528 				rx_total, rx_dropped, rx);
1529 		}
1530 
1531 		printf("===================================================\n");
1532 
1533 		fflush(stdout);
1534 	}
1535 
1536 	return NULL;
1537 }
1538 
1539 static void
1540 unregister_drivers(int socket_num)
1541 {
1542 	int i, ret;
1543 
1544 	for (i = 0; i < socket_num; i++) {
1545 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1546 		if (ret != 0)
1547 			RTE_LOG(ERR, VHOST_CONFIG,
1548 				"Failed to unregister vhost driver for %s.\n",
1549 				socket_files + i * PATH_MAX);
1550 	}
1551 }
1552 
1553 /* When we receive an INT signal, unregister the vhost driver */
1554 static void
1555 sigint_handler(__rte_unused int signum)
1556 {
1557 	/* Unregister vhost driver. */
1558 	unregister_drivers(nb_sockets);
1559 
1560 	exit(0);
1561 }
1562 
1563 /*
1564  * While creating an mbuf pool, one key thing is to figure out how
1565  * many mbuf entries are enough for our use. FYI, here are some
1566  * guidelines:
1567  *
1568  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1569  *
1570  * - For each switch core (a CPU core that does the packet switching), we
1571  *   also need to make some reservation for receiving the packets from the
1572  *   virtio Tx queue. How many is enough depends on the usage. It's normally
1573  *   a simple calculation like the following:
1574  *
1575  *       MAX_PKT_BURST * max packet size / mbuf size
1576  *
1577  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1578  *
1579  * - Similarly, for each switching core, we should serve @nr_rx_desc
1580  *   mbufs for receiving the packets from physical NIC device.
1581  *
1582  * - We also need to make sure, for each switch core, we have allocated
1583  *   enough mbufs to fill up the mbuf cache.
1584  */
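/*
 * A worked example of the sizing done in create_mbuf_pool() below. The exact
 * figures depend on build-time constants; MAX_PKT_BURST and
 * RTE_PKTMBUF_HEADROOM are assumed to be 32 and 128 here:
 *
 *   mergeable => mtu = 9000, mbuf_size = RTE_MBUF_DEFAULT_BUF_SIZE = 2176
 *   nr_mbufs_per_core  = (9000 + 2176) * 32 / (2176 - 128) = 174
 *   nr_mbufs_per_core += 1024 (nr_rx_desc)                 = 1198
 *   nr_mbufs = (MAX_QUEUES * 1024 + 1198 * nr_switch_core) per port.
 */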
1585 static void
1586 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1587 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1588 {
1589 	uint32_t nr_mbufs;
1590 	uint32_t nr_mbufs_per_core;
1591 	uint32_t mtu = 1500;
1592 
1593 	if (mergeable)
1594 		mtu = 9000;
1595 	if (enable_tso)
1596 		mtu = 64 * 1024;
1597 
1598 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1599 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1600 	nr_mbufs_per_core += nr_rx_desc;
1601 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1602 
1603 	nr_mbufs  = nr_queues * nr_rx_desc;
1604 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1605 	nr_mbufs *= nr_port;
1606 
1607 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1608 					    nr_mbuf_cache, 0, mbuf_size,
1609 					    rte_socket_id());
1610 	if (mbuf_pool == NULL)
1611 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1612 }
1613 
1614 /*
1615  * Main function, does initialisation and calls the per-lcore functions.
1616  */
1617 int
1618 main(int argc, char *argv[])
1619 {
1620 	unsigned lcore_id, core_id = 0;
1621 	unsigned nb_ports, valid_num_ports;
1622 	int ret, i;
1623 	uint16_t portid;
1624 	static pthread_t tid;
1625 	uint64_t flags = 0;
1626 
1627 	signal(SIGINT, sigint_handler);
1628 
1629 	/* init EAL */
1630 	ret = rte_eal_init(argc, argv);
1631 	if (ret < 0)
1632 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1633 	argc -= ret;
1634 	argv += ret;
1635 
1636 	/* parse app arguments */
1637 	ret = us_vhost_parse_args(argc, argv);
1638 	if (ret < 0)
1639 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1640 
1641 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1642 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1643 
1644 		if (rte_lcore_is_enabled(lcore_id))
1645 			lcore_ids[core_id++] = lcore_id;
1646 	}
1647 
1648 	if (rte_lcore_count() > RTE_MAX_LCORE)
1649 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1650 
1651 	/* Get the number of physical ports. */
1652 	nb_ports = rte_eth_dev_count_avail();
1653 
1654 	/*
1655 	 * Update the global var NUM_PORTS and the global array PORTS,
1656 	 * and get the value of VALID_NUM_PORTS according to the number of system ports.
1657 	 */
1658 	valid_num_ports = check_ports_num(nb_ports);
1659 
1660 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1661 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1662 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1663 		return -1;
1664 	}
1665 
1666 	/*
1667 	 * FIXME: here we are trying to allocate mbufs big enough for
1668 	 * @MAX_QUEUES, but the truth is we're never going to use that
1669 	 * many queues here. We probably should only do allocation for
1670 	 * those queues we are going to use.
1671 	 */
1672 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1673 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1674 
1675 	if (vm2vm_mode == VM2VM_HARDWARE) {
1676 		/* Enable VT loop back to let L2 switch to do it. */
1677 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1678 		RTE_LOG(DEBUG, VHOST_CONFIG,
1679 			"Enable loop back for L2 switch in vmdq.\n");
1680 	}
1681 
1682 	/* initialize all ports */
1683 	RTE_ETH_FOREACH_DEV(portid) {
1684 		/* skip ports that are not enabled */
1685 		if ((enabled_port_mask & (1 << portid)) == 0) {
1686 			RTE_LOG(INFO, VHOST_PORT,
1687 				"Skipping disabled port %d\n", portid);
1688 			continue;
1689 		}
1690 		if (port_init(portid) != 0)
1691 			rte_exit(EXIT_FAILURE,
1692 				"Cannot initialize network ports\n");
1693 	}
1694 
1695 	/* Enable stats if the user option is set. */
1696 	if (enable_stats) {
1697 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1698 					print_stats, NULL);
1699 		if (ret < 0)
1700 			rte_exit(EXIT_FAILURE,
1701 				"Cannot create print-stats thread\n");
1702 	}
1703 
1704 	/* Launch all data cores. */
1705 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1706 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1707 
1708 	if (client_mode)
1709 		flags |= RTE_VHOST_USER_CLIENT;
1710 
1711 	/* Register vhost user driver to handle vhost messages. */
1712 	for (i = 0; i < nb_sockets; i++) {
1713 		char *file = socket_files + i * PATH_MAX;
1714 
1715 		if (async_vhost_driver)
1716 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1717 
1718 		ret = rte_vhost_driver_register(file, flags);
1719 		if (ret != 0) {
1720 			unregister_drivers(i);
1721 			rte_exit(EXIT_FAILURE,
1722 				"vhost driver register failure.\n");
1723 		}
1724 
1725 		if (builtin_net_driver)
1726 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1727 
1728 		if (mergeable == 0) {
1729 			rte_vhost_driver_disable_features(file,
1730 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1731 		}
1732 
1733 		if (enable_tx_csum == 0) {
1734 			rte_vhost_driver_disable_features(file,
1735 				1ULL << VIRTIO_NET_F_CSUM);
1736 		}
1737 
1738 		if (enable_tso == 0) {
1739 			rte_vhost_driver_disable_features(file,
1740 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1741 			rte_vhost_driver_disable_features(file,
1742 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1743 			rte_vhost_driver_disable_features(file,
1744 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1745 			rte_vhost_driver_disable_features(file,
1746 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1747 		}
1748 
1749 		if (promiscuous) {
1750 			rte_vhost_driver_enable_features(file,
1751 				1ULL << VIRTIO_NET_F_CTRL_RX);
1752 		}
1753 
1754 		ret = rte_vhost_driver_callback_register(file,
1755 			&virtio_net_device_ops);
1756 		if (ret != 0) {
1757 			rte_exit(EXIT_FAILURE,
1758 				"failed to register vhost driver callbacks.\n");
1759 		}
1760 
1761 		if (rte_vhost_driver_start(file) < 0) {
1762 			rte_exit(EXIT_FAILURE,
1763 				"failed to start vhost driver.\n");
1764 		}
1765 	}
1766 
1767 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1768 		rte_eal_wait_lcore(lcore_id);
1769 
1770 	return 0;
1771 
1772 }
1773