xref: /dpdk/examples/vhost/main.c (revision 89813a522e68076e6f50ec18b075fa57cc5ae937)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_vhost.h>
23 #include <rte_ip.h>
24 #include <rte_tcp.h>
25 #include <rte_pause.h>
26 
27 #include "ioat.h"
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60 
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63 
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66 
67 /* number of devices/queues to support*/
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70 
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73 
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76 	VM2VM_DISABLED = 0,
77 	VM2VM_SOFTWARE = 1,
78 	VM2VM_HARDWARE = 2,
79 	VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82 
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87 
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90 
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93 
94 static int client_mode;
95 
96 static int builtin_net_driver;
97 
98 static int async_vhost_driver;
99 
100 static char dma_type[MAX_LONG_OPT_SZ];
101 
102 /* Specify timeout (in microseconds) between retries on RX. */
103 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
104 /* Specify the number of retries on RX. */
105 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
106 
107 /* Socket file paths. Can be set by user */
108 static char *socket_files;
109 static int nb_sockets;
110 
111 /* empty vmdq configuration structure. Filled in programmatically */
112 static struct rte_eth_conf vmdq_conf_default = {
113 	.rxmode = {
114 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
115 		.split_hdr_size = 0,
116 		/*
117 		 * VLAN strip is necessary for 1G NICs such as I350;
118 		 * it fixes a bug where IPv4 forwarding in the guest cannot
119 		 * forward packets from one virtio dev to another virtio dev.
120 		 */
121 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
122 	},
123 
124 	.txmode = {
125 		.mq_mode = ETH_MQ_TX_NONE,
126 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
127 			     DEV_TX_OFFLOAD_TCP_CKSUM |
128 			     DEV_TX_OFFLOAD_VLAN_INSERT |
129 			     DEV_TX_OFFLOAD_MULTI_SEGS |
130 			     DEV_TX_OFFLOAD_TCP_TSO),
131 	},
132 	.rx_adv_conf = {
133 		/*
134 		 * should be overridden separately in code with
135 		 * appropriate values
136 		 */
137 		.vmdq_rx_conf = {
138 			.nb_queue_pools = ETH_8_POOLS,
139 			.enable_default_pool = 0,
140 			.default_pool = 0,
141 			.nb_pool_maps = 0,
142 			.pool_map = {{0, 0},},
143 		},
144 	},
145 };
146 
147 
148 static unsigned lcore_ids[RTE_MAX_LCORE];
149 static uint16_t ports[RTE_MAX_ETHPORTS];
150 static unsigned num_ports = 0; /**< The number of ports specified in command line */
151 static uint16_t num_pf_queues, num_vmdq_queues;
152 static uint16_t vmdq_pool_base, vmdq_queue_base;
153 static uint16_t queues_per_pool;
154 
155 const uint16_t vlan_tags[] = {
156 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
157 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
158 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
159 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
160 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
161 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
162 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
163 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
164 };
165 
166 /* ethernet addresses of ports */
167 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
168 
169 static struct vhost_dev_tailq_list vhost_dev_list =
170 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
171 
172 static struct lcore_info lcore_info[RTE_MAX_LCORE];
173 
174 /* Used for queueing bursts of TX packets. */
175 struct mbuf_table {
176 	unsigned len;
177 	unsigned txq_id;
178 	struct rte_mbuf *m_table[MAX_PKT_BURST];
179 };
180 
181 struct vhost_bufftable {
182 	uint32_t len;
183 	uint64_t pre_tsc;
184 	struct rte_mbuf *m_table[MAX_PKT_BURST];
185 };
186 
187 /* TX queue for each data core. */
188 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
189 
190 /*
191  * Vhost TX buffer for each data core.
192  * Every data core maintains a TX buffer for every vhost device,
193  * which is used for batch pkts enqueue for higher performance.
194  */
195 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
196 
197 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
198 				 / US_PER_S * BURST_TX_DRAIN_US)
199 #define VLAN_HLEN       4
200 
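/*
 * Dispatch the --dmas argument to the DMA-type-specific setup routine.
 * Only the "ioat" DMA type is handled here; anything else is rejected.
 */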
201 static inline int
202 open_dma(const char *value)
203 {
204 	if (strncmp(dma_type, "ioat", 4) == 0)
205 		return open_ioat(value);
206 
207 	return -1;
208 }
209 
210 /*
211  * Builds up the correct configuration for VMDQ VLAN pool map
212  * according to the pool & queue limits.
213  */
214 static inline int
215 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
216 {
217 	struct rte_eth_vmdq_rx_conf conf;
218 	struct rte_eth_vmdq_rx_conf *def_conf =
219 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
220 	unsigned i;
221 
222 	memset(&conf, 0, sizeof(conf));
223 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
224 	conf.nb_pool_maps = num_devices;
225 	conf.enable_loop_back = def_conf->enable_loop_back;
226 	conf.rx_mode = def_conf->rx_mode;
227 
228 	for (i = 0; i < conf.nb_pool_maps; i++) {
229 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
230 		conf.pool_map[i].pools = (1UL << i);
231 	}
232 
233 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
234 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
235 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
236 	return 0;
237 }
238 
239 /*
240  * Initialises a given port using global settings, with the Rx buffers
241  * coming from the global mbuf_pool
242  */
243 static inline int
244 port_init(uint16_t port)
245 {
246 	struct rte_eth_dev_info dev_info;
247 	struct rte_eth_conf port_conf;
248 	struct rte_eth_rxconf *rxconf;
249 	struct rte_eth_txconf *txconf;
250 	int16_t rx_rings, tx_rings;
251 	uint16_t rx_ring_size, tx_ring_size;
252 	int retval;
253 	uint16_t q;
254 
255 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
256 	retval = rte_eth_dev_info_get(port, &dev_info);
257 	if (retval != 0) {
258 		RTE_LOG(ERR, VHOST_PORT,
259 			"Error during getting device (port %u) info: %s\n",
260 			port, strerror(-retval));
261 
262 		return retval;
263 	}
264 
265 	rxconf = &dev_info.default_rxconf;
266 	txconf = &dev_info.default_txconf;
267 	rxconf->rx_drop_en = 1;
268 
269 	/*configure the number of supported virtio devices based on VMDQ limits */
270 	num_devices = dev_info.max_vmdq_pools;
271 
272 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
273 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
274 
275 	tx_rings = (uint16_t)rte_lcore_count();
276 
277 	/* Get port configuration. */
278 	retval = get_eth_conf(&port_conf, num_devices);
279 	if (retval < 0)
280 		return retval;
281 	/* NIC queues are divided into pf queues and vmdq queues.  */
282 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
283 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
284 	num_vmdq_queues = num_devices * queues_per_pool;
285 	num_queues = num_pf_queues + num_vmdq_queues;
286 	vmdq_queue_base = dev_info.vmdq_queue_base;
287 	vmdq_pool_base  = dev_info.vmdq_pool_base;
288 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
289 		num_pf_queues, num_devices, queues_per_pool);
290 
291 	if (!rte_eth_dev_is_valid_port(port))
292 		return -1;
293 
294 	rx_rings = (uint16_t)dev_info.max_rx_queues;
295 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
296 		port_conf.txmode.offloads |=
297 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
298 	/* Configure ethernet device. */
299 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
300 	if (retval != 0) {
301 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
302 			port, strerror(-retval));
303 		return retval;
304 	}
305 
306 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
307 		&tx_ring_size);
308 	if (retval != 0) {
309 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
310 			"for port %u: %s.\n", port, strerror(-retval));
311 		return retval;
312 	}
313 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
314 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
315 			"for Rx queues on port %u.\n", port);
316 		return -1;
317 	}
318 
319 	/* Setup the queues. */
320 	rxconf->offloads = port_conf.rxmode.offloads;
321 	for (q = 0; q < rx_rings; q ++) {
322 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
323 						rte_eth_dev_socket_id(port),
324 						rxconf,
325 						mbuf_pool);
326 		if (retval < 0) {
327 			RTE_LOG(ERR, VHOST_PORT,
328 				"Failed to setup rx queue %u of port %u: %s.\n",
329 				q, port, strerror(-retval));
330 			return retval;
331 		}
332 	}
333 	txconf->offloads = port_conf.txmode.offloads;
334 	for (q = 0; q < tx_rings; q ++) {
335 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
336 						rte_eth_dev_socket_id(port),
337 						txconf);
338 		if (retval < 0) {
339 			RTE_LOG(ERR, VHOST_PORT,
340 				"Failed to setup tx queue %u of port %u: %s.\n",
341 				q, port, strerror(-retval));
342 			return retval;
343 		}
344 	}
345 
346 	/* Start the device. */
347 	retval  = rte_eth_dev_start(port);
348 	if (retval < 0) {
349 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
350 			port, strerror(-retval));
351 		return retval;
352 	}
353 
354 	if (promiscuous) {
355 		retval = rte_eth_promiscuous_enable(port);
356 		if (retval != 0) {
357 			RTE_LOG(ERR, VHOST_PORT,
358 				"Failed to enable promiscuous mode on port %u: %s\n",
359 				port, rte_strerror(-retval));
360 			return retval;
361 		}
362 	}
363 
364 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
365 	if (retval < 0) {
366 		RTE_LOG(ERR, VHOST_PORT,
367 			"Failed to get MAC address on port %u: %s\n",
368 			port, rte_strerror(-retval));
369 		return retval;
370 	}
371 
372 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
373 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
374 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
375 			port,
376 			vmdq_ports_eth_addr[port].addr_bytes[0],
377 			vmdq_ports_eth_addr[port].addr_bytes[1],
378 			vmdq_ports_eth_addr[port].addr_bytes[2],
379 			vmdq_ports_eth_addr[port].addr_bytes[3],
380 			vmdq_ports_eth_addr[port].addr_bytes[4],
381 			vmdq_ports_eth_addr[port].addr_bytes[5]);
382 
383 	return 0;
384 }
385 
386 /*
387  * Add a vhost-user socket file path to the list of socket files.
388  */
389 static int
390 us_vhost_parse_socket_path(const char *q_arg)
391 {
392 	char *old;
393 
394 	/* reject paths that do not fit in PATH_MAX */
395 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
396 		return -1;
397 
398 	old = socket_files;
399 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
400 	if (socket_files == NULL) {
401 		free(old);
402 		return -1;
403 	}
404 
405 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
406 	nb_sockets++;
407 
408 	return 0;
409 }
410 
411 /*
412  * Parse the portmask provided at run time.
413  */
414 static int
415 parse_portmask(const char *portmask)
416 {
417 	char *end = NULL;
418 	unsigned long pm;
419 
420 	errno = 0;
421 
422 	/* parse hexadecimal string */
423 	pm = strtoul(portmask, &end, 16);
424 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
425 		return 0;
426 
427 	return pm;
428 
429 }
430 
431 /*
432  * Parse num options at run time.
433  */
434 static int
435 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
436 {
437 	char *end = NULL;
438 	unsigned long num;
439 
440 	errno = 0;
441 
442 	/* parse unsigned int string */
443 	num = strtoul(q_arg, &end, 10);
444 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
445 		return -1;
446 
447 	if (num > max_valid_value)
448 		return -1;
449 
450 	return num;
451 
452 }
453 
454 /*
455  * Display usage
456  */
457 static void
458 us_vhost_usage(const char *prgname)
459 {
460 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
461 	"		--vm2vm [0|1|2]\n"
462 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
463 	"		--socket-file <path>\n"
464 	"		--nb-devices ND\n"
465 	"		-p PORTMASK: Set mask for ports to be used by application\n"
466 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
467 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retries if the destination queue is full\n"
468 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on RX are enabled\n"
469 	"		--rx-retry-num [0-N]: the number of retries on RX. This only takes effect if retries on RX are enabled\n"
470 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
471 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
472 	"		--socket-file: The path of the socket file.\n"
473 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
474 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
475 	"		--client register a vhost-user socket as client mode.\n"
476 	"		--dma-type register the DMA type for the vhost async driver, e.g. \"ioat\" (the only type supported for now).\n"
477 	"		--dmas register a DMA channel for a specific vhost device.\n",
478 	       prgname);
479 }
480 
481 enum {
482 #define OPT_VM2VM               "vm2vm"
483 	OPT_VM2VM_NUM = 256,
484 #define OPT_RX_RETRY            "rx-retry"
485 	OPT_RX_RETRY_NUM,
486 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
487 	OPT_RX_RETRY_DELAY_NUM,
488 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
489 	OPT_RX_RETRY_NUMB_NUM,
490 #define OPT_MERGEABLE           "mergeable"
491 	OPT_MERGEABLE_NUM,
492 #define OPT_STATS               "stats"
493 	OPT_STATS_NUM,
494 #define OPT_SOCKET_FILE         "socket-file"
495 	OPT_SOCKET_FILE_NUM,
496 #define OPT_TX_CSUM             "tx-csum"
497 	OPT_TX_CSUM_NUM,
498 #define OPT_TSO                 "tso"
499 	OPT_TSO_NUM,
500 #define OPT_CLIENT              "client"
501 	OPT_CLIENT_NUM,
502 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
503 	OPT_BUILTIN_NET_DRIVER_NUM,
504 #define OPT_DMA_TYPE            "dma-type"
505 	OPT_DMA_TYPE_NUM,
506 #define OPT_DMAS                "dmas"
507 	OPT_DMAS_NUM,
508 };
509 
510 /*
511  * Parse the arguments given in the command line of the application.
512  */
513 static int
514 us_vhost_parse_args(int argc, char **argv)
515 {
516 	int opt, ret;
517 	int option_index;
518 	unsigned i;
519 	const char *prgname = argv[0];
520 	static struct option long_option[] = {
521 		{OPT_VM2VM, required_argument,
522 				NULL, OPT_VM2VM_NUM},
523 		{OPT_RX_RETRY, required_argument,
524 				NULL, OPT_RX_RETRY_NUM},
525 		{OPT_RX_RETRY_DELAY, required_argument,
526 				NULL, OPT_RX_RETRY_DELAY_NUM},
527 		{OPT_RX_RETRY_NUMB, required_argument,
528 				NULL, OPT_RX_RETRY_NUMB_NUM},
529 		{OPT_MERGEABLE, required_argument,
530 				NULL, OPT_MERGEABLE_NUM},
531 		{OPT_STATS, required_argument,
532 				NULL, OPT_STATS_NUM},
533 		{OPT_SOCKET_FILE, required_argument,
534 				NULL, OPT_SOCKET_FILE_NUM},
535 		{OPT_TX_CSUM, required_argument,
536 				NULL, OPT_TX_CSUM_NUM},
537 		{OPT_TSO, required_argument,
538 				NULL, OPT_TSO_NUM},
539 		{OPT_CLIENT, no_argument,
540 				NULL, OPT_CLIENT_NUM},
541 		{OPT_BUILTIN_NET_DRIVER, no_argument,
542 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
543 		{OPT_DMA_TYPE, required_argument,
544 				NULL, OPT_DMA_TYPE_NUM},
545 		{OPT_DMAS, required_argument,
546 				NULL, OPT_DMAS_NUM},
547 		{NULL, 0, 0, 0},
548 	};
549 
550 	/* Parse command line */
551 	while ((opt = getopt_long(argc, argv, "p:P",
552 			long_option, &option_index)) != EOF) {
553 		switch (opt) {
554 		/* Portmask */
555 		case 'p':
556 			enabled_port_mask = parse_portmask(optarg);
557 			if (enabled_port_mask == 0) {
558 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
559 				us_vhost_usage(prgname);
560 				return -1;
561 			}
562 			break;
563 
564 		case 'P':
565 			promiscuous = 1;
566 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
567 				ETH_VMDQ_ACCEPT_BROADCAST |
568 				ETH_VMDQ_ACCEPT_MULTICAST;
569 			break;
570 
571 		case OPT_VM2VM_NUM:
572 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
573 			if (ret == -1) {
574 				RTE_LOG(INFO, VHOST_CONFIG,
575 					"Invalid argument for "
576 					"vm2vm [0|1|2]\n");
577 				us_vhost_usage(prgname);
578 				return -1;
579 			}
580 			vm2vm_mode = (vm2vm_type)ret;
581 			break;
582 
583 		case OPT_RX_RETRY_NUM:
584 			ret = parse_num_opt(optarg, 1);
585 			if (ret == -1) {
586 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
587 				us_vhost_usage(prgname);
588 				return -1;
589 			}
590 			enable_retry = ret;
591 			break;
592 
593 		case OPT_TX_CSUM_NUM:
594 			ret = parse_num_opt(optarg, 1);
595 			if (ret == -1) {
596 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
597 				us_vhost_usage(prgname);
598 				return -1;
599 			}
600 			enable_tx_csum = ret;
601 			break;
602 
603 		case OPT_TSO_NUM:
604 			ret = parse_num_opt(optarg, 1);
605 			if (ret == -1) {
606 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
607 				us_vhost_usage(prgname);
608 				return -1;
609 			}
610 			enable_tso = ret;
611 			break;
612 
613 		case OPT_RX_RETRY_DELAY_NUM:
614 			ret = parse_num_opt(optarg, INT32_MAX);
615 			if (ret == -1) {
616 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
617 				us_vhost_usage(prgname);
618 				return -1;
619 			}
620 			burst_rx_delay_time = ret;
621 			break;
622 
623 		case OPT_RX_RETRY_NUMB_NUM:
624 			ret = parse_num_opt(optarg, INT32_MAX);
625 			if (ret == -1) {
626 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
627 				us_vhost_usage(prgname);
628 				return -1;
629 			}
630 			burst_rx_retry_num = ret;
631 			break;
632 
633 		case OPT_MERGEABLE_NUM:
634 			ret = parse_num_opt(optarg, 1);
635 			if (ret == -1) {
636 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
637 				us_vhost_usage(prgname);
638 				return -1;
639 			}
640 			mergeable = !!ret;
641 			if (ret) {
642 				vmdq_conf_default.rxmode.offloads |=
643 					DEV_RX_OFFLOAD_JUMBO_FRAME;
644 				vmdq_conf_default.rxmode.max_rx_pkt_len
645 					= JUMBO_FRAME_MAX_SIZE;
646 			}
647 			break;
648 
649 		case OPT_STATS_NUM:
650 			ret = parse_num_opt(optarg, INT32_MAX);
651 			if (ret == -1) {
652 				RTE_LOG(INFO, VHOST_CONFIG,
653 					"Invalid argument for stats [0..N]\n");
654 				us_vhost_usage(prgname);
655 				return -1;
656 			}
657 			enable_stats = ret;
658 			break;
659 
660 		/* Set socket file path. */
661 		case OPT_SOCKET_FILE_NUM:
662 			if (us_vhost_parse_socket_path(optarg) == -1) {
663 				RTE_LOG(INFO, VHOST_CONFIG,
664 				"Invalid argument for socket name (Max %d characters)\n",
665 				PATH_MAX);
666 				us_vhost_usage(prgname);
667 				return -1;
668 			}
669 			break;
670 
671 		case OPT_DMA_TYPE_NUM:
672 			strlcpy(dma_type, optarg, MAX_LONG_OPT_SZ);
673 			break;
674 
675 		case OPT_DMAS_NUM:
676 			if (open_dma(optarg) == -1) {
677 				RTE_LOG(INFO, VHOST_CONFIG,
678 					"Wrong DMA args\n");
679 				us_vhost_usage(prgname);
680 				return -1;
681 			}
682 			async_vhost_driver = 1;
683 			break;
684 
685 		case OPT_CLIENT_NUM:
686 			client_mode = 1;
687 			break;
688 
689 		case OPT_BUILTIN_NET_DRIVER_NUM:
690 			builtin_net_driver = 1;
691 			break;
692 
693 		/* Invalid option - print options. */
694 		default:
695 			us_vhost_usage(prgname);
696 			return -1;
697 		}
698 	}
699 
700 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
701 		if (enabled_port_mask & (1 << i))
702 			ports[num_ports++] = i;
703 	}
704 
705 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
706 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
707 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
708 		return -1;
709 	}
710 
711 	return 0;
712 }
713 
714 /*
715  * Update the global var NUM_PORTS and array PORTS according to the number of
716  * system ports, and return the number of valid ports
717  */
718 static unsigned check_ports_num(unsigned nb_ports)
719 {
720 	unsigned valid_num_ports = num_ports;
721 	unsigned portid;
722 
723 	if (num_ports > nb_ports) {
724 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
725 			num_ports, nb_ports);
726 		num_ports = nb_ports;
727 	}
728 
729 	for (portid = 0; portid < num_ports; portid ++) {
730 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
731 			RTE_LOG(INFO, VHOST_PORT,
732 				"\nSpecified port ID(%u) is not valid\n",
733 				ports[portid]);
734 			ports[portid] = INVALID_PORT_ID;
735 			valid_num_ports--;
736 		}
737 	}
738 	return valid_num_ports;
739 }
740 
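/*
 * Look up a vhost device by MAC address. Only devices that are ready
 * for RX (i.e. whose MAC has already been learned) are considered.
 */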
741 static __rte_always_inline struct vhost_dev *
742 find_vhost_dev(struct rte_ether_addr *mac)
743 {
744 	struct vhost_dev *vdev;
745 
746 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
747 		if (vdev->ready == DEVICE_RX &&
748 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
749 			return vdev;
750 	}
751 
752 	return NULL;
753 }
754 
755 /*
756  * This function learns the MAC address of the device and registers it along
757  * with a VLAN tag in a VMDQ pool.
758  */
759 static int
760 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
761 {
762 	struct rte_ether_hdr *pkt_hdr;
763 	int i, ret;
764 
765 	/* Learn MAC address of guest device from packet */
766 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
767 
768 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
769 		RTE_LOG(ERR, VHOST_DATA,
770 			"(%d) device is using a registered MAC!\n",
771 			vdev->vid);
772 		return -1;
773 	}
774 
775 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
776 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
777 
778 	/* vlan_tag currently uses the device_id. */
779 	vdev->vlan_tag = vlan_tags[vdev->vid];
780 
781 	/* Print out VMDQ registration info. */
782 	RTE_LOG(INFO, VHOST_DATA,
783 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
784 		vdev->vid,
785 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
786 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
787 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
788 		vdev->vlan_tag);
789 
790 	/* Register the MAC address. */
791 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
792 				(uint32_t)vdev->vid + vmdq_pool_base);
793 	if (ret)
794 		RTE_LOG(ERR, VHOST_DATA,
795 			"(%d) failed to add device MAC address to VMDQ\n",
796 			vdev->vid);
797 
798 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
799 
800 	/* Set device as ready for RX. */
801 	vdev->ready = DEVICE_RX;
802 
803 	return 0;
804 }
805 
806 /*
807  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
808  * queue before disabling RX on the device.
809  */
810 static inline void
811 unlink_vmdq(struct vhost_dev *vdev)
812 {
813 	unsigned i = 0;
814 	unsigned rx_count;
815 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
816 
817 	if (vdev->ready == DEVICE_RX) {
818 		/*clear MAC and VLAN settings*/
819 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
820 		for (i = 0; i < 6; i++)
821 			vdev->mac_address.addr_bytes[i] = 0;
822 
823 		vdev->vlan_tag = 0;
824 
825 		/*Clear out the receive buffers*/
826 		rx_count = rte_eth_rx_burst(ports[0],
827 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
828 
829 		while (rx_count) {
830 			for (i = 0; i < rx_count; i++)
831 				rte_pktmbuf_free(pkts_burst[i]);
832 
833 			rx_count = rte_eth_rx_burst(ports[0],
834 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
835 		}
836 
837 		vdev->ready = DEVICE_MAC_LEARNING;
838 	}
839 }
840 
841 static inline void
842 free_pkts(struct rte_mbuf **pkts, uint16_t n)
843 {
844 	while (n--)
845 		rte_pktmbuf_free(pkts[n]);
846 }
847 
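/*
 * Poll the async channel for enqueue completions on the virtio RX queue
 * and free the mbufs whose copies have finished.
 */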
848 static __rte_always_inline void
849 complete_async_pkts(struct vhost_dev *vdev)
850 {
851 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
852 	uint16_t complete_count;
853 
854 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
855 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
856 	if (complete_count)
857 		free_pkts(p_cpl, complete_count);
858 }
859 
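/*
 * Synchronously enqueue a single packet into the virtio RX queue of
 * another vhost device (used here for broadcast traffic between guests),
 * updating per-device statistics when enabled.
 */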
860 static __rte_always_inline void
861 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
862 	    struct rte_mbuf *m)
863 {
864 	uint16_t ret;
865 
866 	if (builtin_net_driver) {
867 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
868 	} else {
869 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
870 	}
871 
872 	if (enable_stats) {
873 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
874 				__ATOMIC_SEQ_CST);
875 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
876 				__ATOMIC_SEQ_CST);
877 		src_vdev->stats.tx_total++;
878 		src_vdev->stats.tx += ret;
879 	}
880 }
881 
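/*
 * Flush the per-lcore TX buffer of a vhost device: enqueue all buffered
 * packets into the device's virtio RX queue, using the builtin net
 * driver, the async data path or the standard sync API as configured.
 */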
882 static __rte_always_inline void
883 drain_vhost(struct vhost_dev *vdev)
884 {
885 	uint16_t ret;
886 	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
887 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
888 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
889 
890 	if (builtin_net_driver) {
891 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
892 	} else if (async_vhost_driver) {
893 		uint32_t cpu_cpl_nr = 0;
894 		uint16_t enqueue_fail = 0;
895 		struct rte_mbuf *m_cpu_cpl[nr_xmit];
896 
897 		complete_async_pkts(vdev);
898 		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
899 					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
900 
901 		if (cpu_cpl_nr)
902 			free_pkts(m_cpu_cpl, cpu_cpl_nr);
903 
904 		enqueue_fail = nr_xmit - ret;
905 		if (enqueue_fail)
906 			free_pkts(&m[ret], nr_xmit - ret);
907 	} else {
908 		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
909 						m, nr_xmit);
910 	}
911 
912 	if (enable_stats) {
913 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
914 				__ATOMIC_SEQ_CST);
915 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
916 				__ATOMIC_SEQ_CST);
917 	}
918 
919 	if (!async_vhost_driver)
920 		free_pkts(m, nr_xmit);
921 }
922 
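/*
 * Walk all vhost devices and flush this lcore's TX buffer for any device
 * that has not been drained within the drain timeout (MBUF_TABLE_DRAIN_TSC).
 */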
923 static __rte_always_inline void
924 drain_vhost_table(void)
925 {
926 	uint16_t lcore_id = rte_lcore_id();
927 	struct vhost_bufftable *vhost_txq;
928 	struct vhost_dev *vdev;
929 	uint64_t cur_tsc;
930 
931 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
932 		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
933 						+ vdev->vid];
934 
935 		cur_tsc = rte_rdtsc();
936 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
937 				> MBUF_TABLE_DRAIN_TSC)) {
938 			RTE_LOG_DP(DEBUG, VHOST_DATA,
939 				"Vhost TX queue drained after timeout with burst size %u\n",
940 				vhost_txq->len);
941 			drain_vhost(vdev);
942 			vhost_txq->len = 0;
943 			vhost_txq->pre_tsc = cur_tsc;
944 		}
945 	}
946 }
947 
948 /*
949  * Check if the packet destination MAC address is for a local device. If so then put
950  * the packet on that device's RX queue. If not then return.
951  */
952 static __rte_always_inline int
953 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
954 {
955 	struct rte_ether_hdr *pkt_hdr;
956 	struct vhost_dev *dst_vdev;
957 	struct vhost_bufftable *vhost_txq;
958 	uint16_t lcore_id = rte_lcore_id();
959 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
960 
961 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
962 	if (!dst_vdev)
963 		return -1;
964 
965 	if (vdev->vid == dst_vdev->vid) {
966 		RTE_LOG_DP(DEBUG, VHOST_DATA,
967 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
968 			vdev->vid);
969 		return 0;
970 	}
971 
972 	RTE_LOG_DP(DEBUG, VHOST_DATA,
973 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
974 
975 	if (unlikely(dst_vdev->remove)) {
976 		RTE_LOG_DP(DEBUG, VHOST_DATA,
977 			"(%d) device is marked for removal\n", dst_vdev->vid);
978 		return 0;
979 	}
980 
981 	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
982 	vhost_txq->m_table[vhost_txq->len++] = m;
983 
984 	if (enable_stats) {
985 		vdev->stats.tx_total++;
986 		vdev->stats.tx++;
987 	}
988 
989 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
990 		drain_vhost(dst_vdev);
991 		vhost_txq->len = 0;
992 		vhost_txq->pre_tsc = rte_rdtsc();
993 	}
994 	return 0;
995 }
996 
997 /*
998  * Check if the destination MAC of a packet belongs to a local VM,
999  * and if so get its vlan tag and length offset.
1000  */
1001 static __rte_always_inline int
1002 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1003 	uint32_t *offset, uint16_t *vlan_tag)
1004 {
1005 	struct vhost_dev *dst_vdev;
1006 	struct rte_ether_hdr *pkt_hdr =
1007 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1008 
1009 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
1010 	if (!dst_vdev)
1011 		return 0;
1012 
1013 	if (vdev->vid == dst_vdev->vid) {
1014 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1015 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1016 			vdev->vid);
1017 		return -1;
1018 	}
1019 
1020 	/*
1021 	 * HW vlan strip reduces the packet length by the size of
1022 	 * the vlan tag, so the original length needs to be restored
1023 	 * by adding the tag length back.
1024 	 */
1025 	*offset  = VLAN_HLEN;
1026 	*vlan_tag = vlan_tags[vdev->vid];
1027 
1028 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1029 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1030 		vdev->vid, dst_vdev->vid, *vlan_tag);
1031 
1032 	return 0;
1033 }
1034 
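/* Compute the pseudo-header checksum for an IPv4 or IPv6 L3 header. */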
1035 static uint16_t
1036 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1037 {
1038 	if (ol_flags & PKT_TX_IPV4)
1039 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1040 	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1041 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1042 }
1043 
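/*
 * Prepare a TSO packet for transmission on the physical port: zero the
 * IPv4 checksum and request IP checksum offload, then seed the TCP
 * checksum with the pseudo-header checksum.
 */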
1044 static void virtio_tx_offload(struct rte_mbuf *m)
1045 {
1046 	void *l3_hdr;
1047 	struct rte_ipv4_hdr *ipv4_hdr = NULL;
1048 	struct rte_tcp_hdr *tcp_hdr = NULL;
1049 	struct rte_ether_hdr *eth_hdr =
1050 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1051 
1052 	l3_hdr = (char *)eth_hdr + m->l2_len;
1053 
1054 	if (m->ol_flags & PKT_TX_IPV4) {
1055 		ipv4_hdr = l3_hdr;
1056 		ipv4_hdr->hdr_checksum = 0;
1057 		m->ol_flags |= PKT_TX_IP_CKSUM;
1058 	}
1059 
1060 	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
1061 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1062 }
1063 
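/*
 * Transmit the buffered packets of a per-lcore TX queue on the physical
 * port and free any packets the NIC could not accept.
 */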
1064 static __rte_always_inline void
1065 do_drain_mbuf_table(struct mbuf_table *tx_q)
1066 {
1067 	uint16_t count;
1068 
1069 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1070 				 tx_q->m_table, tx_q->len);
1071 	if (unlikely(count < tx_q->len))
1072 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1073 
1074 	tx_q->len = 0;
1075 }
1076 
1077 /*
1078  * This function routes the TX packet to the correct interface. This
1079  * may be a local device or the physical port.
1080  */
1081 static __rte_always_inline void
1082 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1083 {
1084 	struct mbuf_table *tx_q;
1085 	unsigned offset = 0;
1086 	const uint16_t lcore_id = rte_lcore_id();
1087 	struct rte_ether_hdr *nh;
1088 
1089 
1090 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1091 	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1092 		struct vhost_dev *vdev2;
1093 
1094 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1095 			if (vdev2 != vdev)
1096 				sync_virtio_xmit(vdev2, vdev, m);
1097 		}
1098 		goto queue2nic;
1099 	}
1100 
1101 	/*check if destination is local VM*/
1102 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1103 		return;
1104 
1105 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1106 		if (unlikely(find_local_dest(vdev, m, &offset,
1107 					     &vlan_tag) != 0)) {
1108 			rte_pktmbuf_free(m);
1109 			return;
1110 		}
1111 	}
1112 
1113 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1114 		"(%d) TX: MAC address is external\n", vdev->vid);
1115 
1116 queue2nic:
1117 
1118 	/*Add packet to the port tx queue*/
1119 	tx_q = &lcore_tx_queue[lcore_id];
1120 
1121 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1122 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1123 		/* Guest has inserted the vlan tag. */
1124 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1125 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1126 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1127 			(vh->vlan_tci != vlan_tag_be))
1128 			vh->vlan_tci = vlan_tag_be;
1129 	} else {
1130 		m->ol_flags |= PKT_TX_VLAN_PKT;
1131 
1132 		/*
1133 		 * Find the right seg to adjust the data len when offset is
1134 		 * bigger than tail room size.
1135 		 */
1136 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1137 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1138 				m->data_len += offset;
1139 			else {
1140 				struct rte_mbuf *seg = m;
1141 
1142 				while ((seg->next != NULL) &&
1143 					(offset > rte_pktmbuf_tailroom(seg)))
1144 					seg = seg->next;
1145 
1146 				seg->data_len += offset;
1147 			}
1148 			m->pkt_len += offset;
1149 		}
1150 
1151 		m->vlan_tci = vlan_tag;
1152 	}
1153 
1154 	if (m->ol_flags & PKT_TX_TCP_SEG)
1155 		virtio_tx_offload(m);
1156 
1157 	tx_q->m_table[tx_q->len++] = m;
1158 	if (enable_stats) {
1159 		vdev->stats.tx_total++;
1160 		vdev->stats.tx++;
1161 	}
1162 
1163 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1164 		do_drain_mbuf_table(tx_q);
1165 }
1166 
1167 
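/*
 * Flush the per-lcore physical-port TX queue if it has not been drained
 * within the drain timeout (MBUF_TABLE_DRAIN_TSC).
 */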
1168 static __rte_always_inline void
1169 drain_mbuf_table(struct mbuf_table *tx_q)
1170 {
1171 	static uint64_t prev_tsc;
1172 	uint64_t cur_tsc;
1173 
1174 	if (tx_q->len == 0)
1175 		return;
1176 
1177 	cur_tsc = rte_rdtsc();
1178 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1179 		prev_tsc = cur_tsc;
1180 
1181 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1182 			"TX queue drained after timeout with burst size %u\n",
1183 			tx_q->len);
1184 		do_drain_mbuf_table(tx_q);
1185 	}
1186 }
1187 
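/*
 * Receive a burst of packets from the VMDQ RX queue bound to this vhost
 * device and enqueue them into the device's virtio RX queue, optionally
 * waiting and retrying when the ring is short on free entries.
 */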
1188 static __rte_always_inline void
1189 drain_eth_rx(struct vhost_dev *vdev)
1190 {
1191 	uint16_t rx_count, enqueue_count;
1192 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1193 
1194 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1195 				    pkts, MAX_PKT_BURST);
1196 
1197 	if (!rx_count)
1198 		return;
1199 
1200 	/*
1201 	 * When "enable_retry" is set, here we wait and retry when there
1202 	 * are not enough free slots in the queue to hold @rx_count packets,
1203 	 * to diminish packet loss.
1204 	 */
1205 	if (enable_retry &&
1206 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1207 			VIRTIO_RXQ))) {
1208 		uint32_t retry;
1209 
1210 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1211 			rte_delay_us(burst_rx_delay_time);
1212 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1213 					VIRTIO_RXQ))
1214 				break;
1215 		}
1216 	}
1217 
1218 	if (builtin_net_driver) {
1219 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1220 						pkts, rx_count);
1221 	} else if (async_vhost_driver) {
1222 		uint32_t cpu_cpl_nr = 0;
1223 		uint16_t enqueue_fail = 0;
1224 		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
1225 
1226 		complete_async_pkts(vdev);
1227 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1228 					VIRTIO_RXQ, pkts, rx_count,
1229 					m_cpu_cpl, &cpu_cpl_nr);
1230 		if (cpu_cpl_nr)
1231 			free_pkts(m_cpu_cpl, cpu_cpl_nr);
1232 
1233 		enqueue_fail = rx_count - enqueue_count;
1234 		if (enqueue_fail)
1235 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1236 
1237 	} else {
1238 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1239 						pkts, rx_count);
1240 	}
1241 
1242 	if (enable_stats) {
1243 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1244 				__ATOMIC_SEQ_CST);
1245 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1246 				__ATOMIC_SEQ_CST);
1247 	}
1248 
1249 	if (!async_vhost_driver)
1250 		free_pkts(pkts, rx_count);
1251 }
1252 
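/*
 * Dequeue a burst of packets from the guest's virtio TX queue, learn the
 * MAC address from the first packet if needed, and route each packet to
 * its destination (another vhost device or the physical port).
 */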
1253 static __rte_always_inline void
1254 drain_virtio_tx(struct vhost_dev *vdev)
1255 {
1256 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1257 	uint16_t count;
1258 	uint16_t i;
1259 
1260 	if (builtin_net_driver) {
1261 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1262 					pkts, MAX_PKT_BURST);
1263 	} else {
1264 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1265 					mbuf_pool, pkts, MAX_PKT_BURST);
1266 	}
1267 
1268 	/* setup VMDq for the first packet */
1269 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1270 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1271 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) {
1272 			free_pkts(pkts, count);
			/* the packets were freed above; don't route them below */
			return;
		}
1273 	}
1274 	for (i = 0; i < count; ++i)
1275 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1276 }
1277 
1278 /*
1279  * Main function of vhost-switch. It basically does:
1280  *
1281  * for each vhost device {
1282  *    - drain_eth_rx()
1283  *
1284  *      Which drains the host eth Rx queue linked to the vhost device,
1285  *      and deliver all of them to guest virito Rx ring associated with
1286  *      and delivers all of them to the guest virtio Rx ring associated with
1287  *
1288  *    - drain_virtio_tx()
1289  *
1290  *      Which drains the guest virtio Tx queue and delivers all of them
1291  *      to the target, which could be another vhost device, or the
1292  *      physical eth dev. The route is done in function "virtio_tx_route".
1293  * }
1294  */
1295 static int
1296 switch_worker(void *arg __rte_unused)
1297 {
1298 	unsigned i;
1299 	unsigned lcore_id = rte_lcore_id();
1300 	struct vhost_dev *vdev;
1301 	struct mbuf_table *tx_q;
1302 
1303 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1304 
1305 	tx_q = &lcore_tx_queue[lcore_id];
1306 	for (i = 0; i < rte_lcore_count(); i++) {
1307 		if (lcore_ids[i] == lcore_id) {
1308 			tx_q->txq_id = i;
1309 			break;
1310 		}
1311 	}
1312 
1313 	while(1) {
1314 		drain_mbuf_table(tx_q);
1315 		drain_vhost_table();
1316 		/*
1317 		 * Inform the configuration core that we have exited the
1318 		 * linked list and that no devices are in use if requested.
1319 		 */
1320 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1321 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1322 
1323 		/*
1324 		 * Process vhost devices
1325 		 */
1326 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1327 			      lcore_vdev_entry) {
1328 			if (unlikely(vdev->remove)) {
1329 				unlink_vmdq(vdev);
1330 				vdev->ready = DEVICE_SAFE_REMOVE;
1331 				continue;
1332 			}
1333 
1334 			if (likely(vdev->ready == DEVICE_RX))
1335 				drain_eth_rx(vdev);
1336 
1337 			if (likely(!vdev->remove))
1338 				drain_virtio_tx(vdev);
1339 		}
1340 	}
1341 
1342 	return 0;
1343 }
1344 
1345 /*
1346  * Remove a device from the specific data core linked list and from the
1347  * main linked list. Synchronization occurs through the use of the
1348  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1349  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1350  */
1351 static void
1352 destroy_device(int vid)
1353 {
1354 	struct vhost_dev *vdev = NULL;
1355 	int lcore;
1356 	uint16_t i;
1357 
1358 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1359 		if (vdev->vid == vid)
1360 			break;
1361 	}
1362 	if (!vdev)
1363 		return;
1364 	/*set the remove flag. */
1365 	vdev->remove = 1;
1366 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1367 		rte_pause();
1368 	}
1369 
1370 	for (i = 0; i < RTE_MAX_LCORE; i++)
1371 		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1372 
1373 	if (builtin_net_driver)
1374 		vs_vhost_net_remove(vdev);
1375 
1376 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1377 		     lcore_vdev_entry);
1378 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1379 
1380 
1381 	/* Set the dev_removal_flag on each lcore. */
1382 	RTE_LCORE_FOREACH_WORKER(lcore)
1383 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1384 
1385 	/*
1386 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1387 	 * we can be sure that they can no longer access the device removed
1388 	 * from the linked lists and that the devices are no longer in use.
1389 	 */
1390 	RTE_LCORE_FOREACH_WORKER(lcore) {
1391 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1392 			rte_pause();
1393 	}
1394 
1395 	lcore_info[vdev->coreid].device_num--;
1396 
1397 	RTE_LOG(INFO, VHOST_DATA,
1398 		"(%d) device has been removed from data core\n",
1399 		vdev->vid);
1400 
1401 	if (async_vhost_driver)
1402 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1403 
1404 	rte_free(vdev);
1405 }
1406 
1407 /*
1408  * A new device is added to a data core. First the device is added to the main linked list
1409  * and then allocated to a specific data core.
1410  */
1411 static int
1412 new_device(int vid)
1413 {
1414 	int lcore, core_add = 0;
1415 	uint16_t i;
1416 	uint32_t device_num_min = num_devices;
1417 	struct vhost_dev *vdev;
1418 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1419 	if (vdev == NULL) {
1420 		RTE_LOG(INFO, VHOST_DATA,
1421 			"(%d) couldn't allocate memory for vhost dev\n",
1422 			vid);
1423 		return -1;
1424 	}
1425 	vdev->vid = vid;
1426 
1427 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1428 		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1429 			= rte_zmalloc("vhost bufftable",
1430 				sizeof(struct vhost_bufftable),
1431 				RTE_CACHE_LINE_SIZE);
1432 
1433 		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1434 			RTE_LOG(INFO, VHOST_DATA,
1435 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1436 			return -1;
1437 		}
1438 	}
1439 
1440 	if (builtin_net_driver)
1441 		vs_vhost_net_setup(vdev);
1442 
1443 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1444 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1445 
1446 	/*reset ready flag*/
1447 	vdev->ready = DEVICE_MAC_LEARNING;
1448 	vdev->remove = 0;
1449 
1450 	/* Find a suitable lcore to add the device. */
1451 	RTE_LCORE_FOREACH_WORKER(lcore) {
1452 		if (lcore_info[lcore].device_num < device_num_min) {
1453 			device_num_min = lcore_info[lcore].device_num;
1454 			core_add = lcore;
1455 		}
1456 	}
1457 	vdev->coreid = core_add;
1458 
1459 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1460 			  lcore_vdev_entry);
1461 	lcore_info[vdev->coreid].device_num++;
1462 
1463 	/* Disable notifications. */
1464 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1465 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1466 
1467 	RTE_LOG(INFO, VHOST_DATA,
1468 		"(%d) device has been added to data core %d\n",
1469 		vid, vdev->coreid);
1470 
1471 	if (async_vhost_driver) {
1472 		struct rte_vhost_async_features f;
1473 		struct rte_vhost_async_channel_ops channel_ops;
1474 
1475 		if (strncmp(dma_type, "ioat", 4) == 0) {
1476 			channel_ops.transfer_data = ioat_transfer_data_cb;
1477 			channel_ops.check_completed_copies =
1478 				ioat_check_completed_copies_cb;
1479 
1480 			f.async_inorder = 1;
1481 			f.async_threshold = 256;
1482 
1483 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1484 				f.intval, &channel_ops);
1485 		}
1486 	}
1487 
1488 	return 0;
1489 }
1490 
1491 /*
1492  * These callbacks allow devices to be added to the data core when configuration
1493  * is fully complete.
1494  */
1495 static const struct vhost_device_ops virtio_net_device_ops =
1496 {
1497 	.new_device =  new_device,
1498 	.destroy_device = destroy_device,
1499 };
1500 
1501 /*
1502  * This is a thread that will wake up after a period to print stats if the user has
1503  * enabled them.
1504  */
1505 static void *
1506 print_stats(__rte_unused void *arg)
1507 {
1508 	struct vhost_dev *vdev;
1509 	uint64_t tx_dropped, rx_dropped;
1510 	uint64_t tx, tx_total, rx, rx_total;
1511 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1512 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1513 
1514 	while(1) {
1515 		sleep(enable_stats);
1516 
1517 		/* Clear screen and move to top left */
1518 		printf("%s%s\n", clr, top_left);
1519 		printf("Device statistics =================================\n");
1520 
1521 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1522 			tx_total   = vdev->stats.tx_total;
1523 			tx         = vdev->stats.tx;
1524 			tx_dropped = tx_total - tx;
1525 
1526 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1527 				__ATOMIC_SEQ_CST);
1528 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1529 				__ATOMIC_SEQ_CST);
1530 			rx_dropped = rx_total - rx;
1531 
1532 			printf("Statistics for device %d\n"
1533 				"-----------------------\n"
1534 				"TX total:              %" PRIu64 "\n"
1535 				"TX dropped:            %" PRIu64 "\n"
1536 				"TX successful:         %" PRIu64 "\n"
1537 				"RX total:              %" PRIu64 "\n"
1538 				"RX dropped:            %" PRIu64 "\n"
1539 				"RX successful:         %" PRIu64 "\n",
1540 				vdev->vid,
1541 				tx_total, tx_dropped, tx,
1542 				rx_total, rx_dropped, rx);
1543 		}
1544 
1545 		printf("===================================================\n");
1546 
1547 		fflush(stdout);
1548 	}
1549 
1550 	return NULL;
1551 }
1552 
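/* Unregister the vhost driver for each configured socket file. */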
1553 static void
1554 unregister_drivers(int socket_num)
1555 {
1556 	int i, ret;
1557 
1558 	for (i = 0; i < socket_num; i++) {
1559 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1560 		if (ret != 0)
1561 			RTE_LOG(ERR, VHOST_CONFIG,
1562 				"Fail to unregister vhost driver for %s.\n",
1563 				socket_files + i * PATH_MAX);
1564 	}
1565 }
1566 
1567 /* When we receive an INT signal, unregister the vhost driver */
1568 static void
1569 sigint_handler(__rte_unused int signum)
1570 {
1571 	/* Unregister vhost driver. */
1572 	unregister_drivers(nb_sockets);
1573 
1574 	exit(0);
1575 }
1576 
1577 /*
1578  * While creating an mbuf pool, one key thing is to figure out how
1579  * many mbuf entries are enough for our use. FYI, here are some
1580  * guidelines:
1581  *
1582  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1583  *
1584  * - For each switch core (a CPU core that does the packet switching), we
1585  *   also need to reserve some mbufs for receiving the packets from the
1586  *   virtio Tx queue. How many are enough depends on the usage. It's normally
1587  *   a simple calculation like the following:
1588  *
1589  *       MAX_PKT_BURST * max packet size / mbuf size
1590  *
1591  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1592  *
1593  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1594  *   mbufs for receiving the packets from the physical NIC device.
1595  *
1596  * - We also need to make sure, for each switch core, we have allocated
1597  *   enough mbufs to fill up the mbuf cache.
1598  */
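/*
 * Rough sketch of the per-core calculation above, assuming MAX_PKT_BURST
 * is 32 and the default mbuf buffer size of 2176 bytes (2048 bytes of
 * data room plus 128 bytes of headroom): with mergeable buffers
 * (mtu = 9000) this gives (9000 + 2176) * 32 / (2176 - 128) ~= 175 mbufs
 * per switch core for the virtio Tx side, plus @nr_rx_desc mbufs for the
 * NIC Rx side (and never less than the mbuf cache size).
 */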
1599 static void
1600 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1601 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1602 {
1603 	uint32_t nr_mbufs;
1604 	uint32_t nr_mbufs_per_core;
1605 	uint32_t mtu = 1500;
1606 
1607 	if (mergeable)
1608 		mtu = 9000;
1609 	if (enable_tso)
1610 		mtu = 64 * 1024;
1611 
1612 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1613 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1614 	nr_mbufs_per_core += nr_rx_desc;
1615 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1616 
1617 	nr_mbufs  = nr_queues * nr_rx_desc;
1618 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1619 	nr_mbufs *= nr_port;
1620 
1621 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1622 					    nr_mbuf_cache, 0, mbuf_size,
1623 					    rte_socket_id());
1624 	if (mbuf_pool == NULL)
1625 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1626 }
1627 
1628 /*
1629  * Main function, does initialisation and calls the per-lcore functions.
1630  */
1631 int
1632 main(int argc, char *argv[])
1633 {
1634 	unsigned lcore_id, core_id = 0;
1635 	unsigned nb_ports, valid_num_ports;
1636 	int ret, i;
1637 	uint16_t portid;
1638 	static pthread_t tid;
1639 	uint64_t flags = 0;
1640 
1641 	signal(SIGINT, sigint_handler);
1642 
1643 	/* init EAL */
1644 	ret = rte_eal_init(argc, argv);
1645 	if (ret < 0)
1646 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1647 	argc -= ret;
1648 	argv += ret;
1649 
1650 	/* parse app arguments */
1651 	ret = us_vhost_parse_args(argc, argv);
1652 	if (ret < 0)
1653 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1654 
1655 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1656 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1657 
1658 		if (rte_lcore_is_enabled(lcore_id))
1659 			lcore_ids[core_id++] = lcore_id;
1660 	}
1661 
1662 	if (rte_lcore_count() > RTE_MAX_LCORE)
1663 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1664 
1665 	/* Get the number of physical ports. */
1666 	nb_ports = rte_eth_dev_count_avail();
1667 
1668 	/*
1669 	 * Update the global var NUM_PORTS and the global array PORTS,
1670 	 * and get the value of VALID_NUM_PORTS according to the number of system ports
1671 	 */
1672 	valid_num_ports = check_ports_num(nb_ports);
1673 
1674 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1675 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1676 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1677 		return -1;
1678 	}
1679 
1680 	/*
1681 	 * FIXME: here we are trying to allocate mbufs big enough for
1682 	 * @MAX_QUEUES, but the truth is we're never going to use that
1683 	 * many queues here. We probably should only do allocation for
1684 	 * those queues we are going to use.
1685 	 */
1686 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1687 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1688 
1689 	if (vm2vm_mode == VM2VM_HARDWARE) {
1690 		/* Enable VT loop back to let L2 switch to do it. */
1691 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1692 		RTE_LOG(DEBUG, VHOST_CONFIG,
1693 			"Enable loop back for L2 switch in vmdq.\n");
1694 	}
1695 
1696 	/* initialize all ports */
1697 	RTE_ETH_FOREACH_DEV(portid) {
1698 		/* skip ports that are not enabled */
1699 		if ((enabled_port_mask & (1 << portid)) == 0) {
1700 			RTE_LOG(INFO, VHOST_PORT,
1701 				"Skipping disabled port %d\n", portid);
1702 			continue;
1703 		}
1704 		if (port_init(portid) != 0)
1705 			rte_exit(EXIT_FAILURE,
1706 				"Cannot initialize network ports\n");
1707 	}
1708 
1709 	/* Enable stats if the user option is set. */
1710 	if (enable_stats) {
1711 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1712 					print_stats, NULL);
1713 		if (ret < 0)
1714 			rte_exit(EXIT_FAILURE,
1715 				"Cannot create print-stats thread\n");
1716 	}
1717 
1718 	/* Launch all data cores. */
1719 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1720 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1721 
1722 	if (client_mode)
1723 		flags |= RTE_VHOST_USER_CLIENT;
1724 
1725 	/* Register vhost user driver to handle vhost messages. */
1726 	for (i = 0; i < nb_sockets; i++) {
1727 		char *file = socket_files + i * PATH_MAX;
1728 
1729 		if (async_vhost_driver)
1730 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1731 
1732 		ret = rte_vhost_driver_register(file, flags);
1733 		if (ret != 0) {
1734 			unregister_drivers(i);
1735 			rte_exit(EXIT_FAILURE,
1736 				"vhost driver register failure.\n");
1737 		}
1738 
1739 		if (builtin_net_driver)
1740 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1741 
1742 		if (mergeable == 0) {
1743 			rte_vhost_driver_disable_features(file,
1744 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1745 		}
1746 
1747 		if (enable_tx_csum == 0) {
1748 			rte_vhost_driver_disable_features(file,
1749 				1ULL << VIRTIO_NET_F_CSUM);
1750 		}
1751 
1752 		if (enable_tso == 0) {
1753 			rte_vhost_driver_disable_features(file,
1754 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1755 			rte_vhost_driver_disable_features(file,
1756 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1757 			rte_vhost_driver_disable_features(file,
1758 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1759 			rte_vhost_driver_disable_features(file,
1760 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1761 		}
1762 
1763 		if (promiscuous) {
1764 			rte_vhost_driver_enable_features(file,
1765 				1ULL << VIRTIO_NET_F_CTRL_RX);
1766 		}
1767 
1768 		ret = rte_vhost_driver_callback_register(file,
1769 			&virtio_net_device_ops);
1770 		if (ret != 0) {
1771 			rte_exit(EXIT_FAILURE,
1772 				"failed to register vhost driver callbacks.\n");
1773 		}
1774 
1775 		if (rte_vhost_driver_start(file) < 0) {
1776 			rte_exit(EXIT_FAILURE,
1777 				"failed to start vhost driver.\n");
1778 		}
1779 	}
1780 
1781 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1782 		rte_eal_wait_lcore(lcore_id);
1783 
1784 	return 0;
1785 
1786 }
1787