xref: /dpdk/examples/vhost/main.c (revision 47afdbbe56554444f16e721bc872c262e245fb97)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_vhost.h>
23 #include <rte_ip.h>
24 #include <rte_tcp.h>
25 #include <rte_pause.h>
26 
27 #include "ioat.h"
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* mask of enabled ports */
59 static uint32_t enabled_port_mask = 0;
60 
61 /* Promiscuous mode */
62 static uint32_t promiscuous;
63 
64 /* number of devices/queues to support */
65 static uint32_t num_queues = 0;
66 static uint32_t num_devices;
67 
68 static struct rte_mempool *mbuf_pool;
69 static int mergeable;
70 
71 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
72 typedef enum {
73 	VM2VM_DISABLED = 0,
74 	VM2VM_SOFTWARE = 1,
75 	VM2VM_HARDWARE = 2,
76 	VM2VM_LAST
77 } vm2vm_type;
78 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
79 
80 /* Enable stats. */
81 static uint32_t enable_stats = 0;
82 /* Enable retries on RX. */
83 static uint32_t enable_retry = 1;
84 
85 /* Disable TX checksum offload */
86 static uint32_t enable_tx_csum;
87 
88 /* Disable TSO offload */
89 static uint32_t enable_tso;
90 
91 static int client_mode;
92 
93 static int builtin_net_driver;
94 
95 static int async_vhost_driver;
96 
97 static char *dma_type;
98 
99 /* Specify timeout (in microseconds) between retries on RX. */
100 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
101 /* Specify the number of retries on RX. */
102 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
103 
104 /* Socket file paths. Can be set by user */
105 static char *socket_files;
106 static int nb_sockets;
107 
108 /* empty vmdq configuration structure. Filled in programmatically */
109 static struct rte_eth_conf vmdq_conf_default = {
110 	.rxmode = {
111 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
112 		.split_hdr_size = 0,
113 		/*
114 		 * VLAN strip is necessary for 1G NICs such as the I350;
115 		 * it fixes a bug where IPv4 forwarding in the guest cannot
116 		 * forward packets from one virtio dev to another virtio dev.
117 		 */
118 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
119 	},
120 
121 	.txmode = {
122 		.mq_mode = ETH_MQ_TX_NONE,
123 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
124 			     DEV_TX_OFFLOAD_TCP_CKSUM |
125 			     DEV_TX_OFFLOAD_VLAN_INSERT |
126 			     DEV_TX_OFFLOAD_MULTI_SEGS |
127 			     DEV_TX_OFFLOAD_TCP_TSO),
128 	},
129 	.rx_adv_conf = {
130 		/*
131 		 * should be overridden separately in code with
132 		 * appropriate values
133 		 */
134 		.vmdq_rx_conf = {
135 			.nb_queue_pools = ETH_8_POOLS,
136 			.enable_default_pool = 0,
137 			.default_pool = 0,
138 			.nb_pool_maps = 0,
139 			.pool_map = {{0, 0},},
140 		},
141 	},
142 };
143 
144 
145 static unsigned lcore_ids[RTE_MAX_LCORE];
146 static uint16_t ports[RTE_MAX_ETHPORTS];
147 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
148 static uint16_t num_pf_queues, num_vmdq_queues;
149 static uint16_t vmdq_pool_base, vmdq_queue_base;
150 static uint16_t queues_per_pool;
151 
152 const uint16_t vlan_tags[] = {
153 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
154 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
155 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
156 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
157 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
158 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
159 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
160 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
161 };
162 
163 /* ethernet addresses of ports */
164 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
165 
166 static struct vhost_dev_tailq_list vhost_dev_list =
167 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
168 
169 static struct lcore_info lcore_info[RTE_MAX_LCORE];
170 
171 /* Used for queueing bursts of TX packets. */
172 struct mbuf_table {
173 	unsigned len;
174 	unsigned txq_id;
175 	struct rte_mbuf *m_table[MAX_PKT_BURST];
176 };
177 
178 struct vhost_bufftable {
179 	uint32_t len;
180 	uint64_t pre_tsc;
181 	struct rte_mbuf *m_table[MAX_PKT_BURST];
182 };
183 
184 /* TX queue for each data core. */
185 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
186 
187 /*
188  * Vhost TX buffer for each data core.
189  * Every data core maintains a TX buffer for every vhost device,
190  * which is used for batch pkts enqueue for higher performance.
191  */
192 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
193 
194 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
195 				 / US_PER_S * BURST_TX_DRAIN_US)
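/*
 * Illustrative note (not part of the original source): MBUF_TABLE_DRAIN_TSC
 * converts BURST_TX_DRAIN_US into TSC cycles. Assuming, for example, a TSC
 * running at 2 GHz, (2000000000 + 1000000 - 1) / 1000000 = 2000 cycles per
 * microsecond, so the drain interval is 2000 * 100 = 200000 cycles, i.e.
 * roughly 100 us between forced drains of a partially filled TX table.
 */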
196 #define VLAN_HLEN       4
197 
198 static inline int
199 open_dma(const char *value)
200 {
201 	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
202 		return open_ioat(value);
203 
204 	return -1;
205 }
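/*
 * Illustrative note (not part of the original source): the string handed to
 * open_dma() is the value of the --dmas option and is parsed by open_ioat()
 * in ioat.c. Per the vhost sample documentation it typically looks like
 * "[txd0@00:04.0,txd1@00:04.1]", i.e. one IOAT channel (identified by its
 * PCI address) bound to the enqueue path of each vhost device. The exact
 * syntax is defined by open_ioat(), so treat this value only as an example.
 */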
206 
207 /*
208  * Builds up the correct configuration for VMDQ VLAN pool map
209  * according to the pool & queue limits.
210  */
211 static inline int
212 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
213 {
214 	struct rte_eth_vmdq_rx_conf conf;
215 	struct rte_eth_vmdq_rx_conf *def_conf =
216 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
217 	unsigned i;
218 
219 	memset(&conf, 0, sizeof(conf));
220 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
221 	conf.nb_pool_maps = num_devices;
222 	conf.enable_loop_back = def_conf->enable_loop_back;
223 	conf.rx_mode = def_conf->rx_mode;
224 
225 	for (i = 0; i < conf.nb_pool_maps; i++) {
226 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
227 		conf.pool_map[i].pools = (1UL << i);
228 	}
229 
230 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
231 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
232 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
233 	return 0;
234 }
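/*
 * Illustrative note (not part of the original source): with num_devices = 2,
 * the loop in get_eth_conf() above produces
 *
 *     conf.pool_map[0] = { .vlan_id = 1000, .pools = 0x1 }
 *     conf.pool_map[1] = { .vlan_id = 1001, .pools = 0x2 }
 *
 * i.e. VLAN 1000 is steered to VMDQ pool 0 and VLAN 1001 to pool 1,
 * following the vlan_tags[] table defined earlier in this file.
 */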
235 
236 /*
237  * Initialises a given port using global settings and with the rx buffers
238  * coming from the mbuf_pool passed as a parameter.
239  */
240 static inline int
241 port_init(uint16_t port)
242 {
243 	struct rte_eth_dev_info dev_info;
244 	struct rte_eth_conf port_conf;
245 	struct rte_eth_rxconf *rxconf;
246 	struct rte_eth_txconf *txconf;
247 	int16_t rx_rings, tx_rings;
248 	uint16_t rx_ring_size, tx_ring_size;
249 	int retval;
250 	uint16_t q;
251 
252 	/* The max pool number from dev_info is used to validate the pool number specified on the command line */
253 	retval = rte_eth_dev_info_get(port, &dev_info);
254 	if (retval != 0) {
255 		RTE_LOG(ERR, VHOST_PORT,
256 			"Error during getting device (port %u) info: %s\n",
257 			port, strerror(-retval));
258 
259 		return retval;
260 	}
261 
262 	rxconf = &dev_info.default_rxconf;
263 	txconf = &dev_info.default_txconf;
264 	rxconf->rx_drop_en = 1;
265 
266 	/*configure the number of supported virtio devices based on VMDQ limits */
267 	num_devices = dev_info.max_vmdq_pools;
268 
269 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
270 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
271 
272 	tx_rings = (uint16_t)rte_lcore_count();
273 
274 	/* Get port configuration. */
275 	retval = get_eth_conf(&port_conf, num_devices);
276 	if (retval < 0)
277 		return retval;
278 	/* NIC queues are divided into pf queues and vmdq queues.  */
279 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
280 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
281 	num_vmdq_queues = num_devices * queues_per_pool;
282 	num_queues = num_pf_queues + num_vmdq_queues;
283 	vmdq_queue_base = dev_info.vmdq_queue_base;
284 	vmdq_pool_base  = dev_info.vmdq_pool_base;
285 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
286 		num_pf_queues, num_devices, queues_per_pool);
287 
288 	if (!rte_eth_dev_is_valid_port(port))
289 		return -1;
290 
291 	rx_rings = (uint16_t)dev_info.max_rx_queues;
292 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
293 		port_conf.txmode.offloads |=
294 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
295 	/* Configure ethernet device. */
296 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
297 	if (retval != 0) {
298 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
299 			port, strerror(-retval));
300 		return retval;
301 	}
302 
303 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
304 		&tx_ring_size);
305 	if (retval != 0) {
306 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
307 			"for port %u: %s.\n", port, strerror(-retval));
308 		return retval;
309 	}
310 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
311 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
312 			"for Rx queues on port %u.\n", port);
313 		return -1;
314 	}
315 
316 	/* Setup the queues. */
317 	rxconf->offloads = port_conf.rxmode.offloads;
318 	for (q = 0; q < rx_rings; q ++) {
319 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
320 						rte_eth_dev_socket_id(port),
321 						rxconf,
322 						mbuf_pool);
323 		if (retval < 0) {
324 			RTE_LOG(ERR, VHOST_PORT,
325 				"Failed to setup rx queue %u of port %u: %s.\n",
326 				q, port, strerror(-retval));
327 			return retval;
328 		}
329 	}
330 	txconf->offloads = port_conf.txmode.offloads;
331 	for (q = 0; q < tx_rings; q ++) {
332 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
333 						rte_eth_dev_socket_id(port),
334 						txconf);
335 		if (retval < 0) {
336 			RTE_LOG(ERR, VHOST_PORT,
337 				"Failed to setup tx queue %u of port %u: %s.\n",
338 				q, port, strerror(-retval));
339 			return retval;
340 		}
341 	}
342 
343 	/* Start the device. */
344 	retval  = rte_eth_dev_start(port);
345 	if (retval < 0) {
346 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
347 			port, strerror(-retval));
348 		return retval;
349 	}
350 
351 	if (promiscuous) {
352 		retval = rte_eth_promiscuous_enable(port);
353 		if (retval != 0) {
354 			RTE_LOG(ERR, VHOST_PORT,
355 				"Failed to enable promiscuous mode on port %u: %s\n",
356 				port, rte_strerror(-retval));
357 			return retval;
358 		}
359 	}
360 
361 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
362 	if (retval < 0) {
363 		RTE_LOG(ERR, VHOST_PORT,
364 			"Failed to get MAC address on port %u: %s\n",
365 			port, rte_strerror(-retval));
366 		return retval;
367 	}
368 
369 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
370 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
371 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
372 			port,
373 			vmdq_ports_eth_addr[port].addr_bytes[0],
374 			vmdq_ports_eth_addr[port].addr_bytes[1],
375 			vmdq_ports_eth_addr[port].addr_bytes[2],
376 			vmdq_ports_eth_addr[port].addr_bytes[3],
377 			vmdq_ports_eth_addr[port].addr_bytes[4],
378 			vmdq_ports_eth_addr[port].addr_bytes[5]);
379 
380 	return 0;
381 }
382 
383 /*
384  * Set socket file path.
385  */
386 static int
387 us_vhost_parse_socket_path(const char *q_arg)
388 {
389 	char *old;
390 
391 	/* reject socket paths that don't fit in PATH_MAX */
392 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
393 		return -1;
394 
395 	old = socket_files;
396 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
397 	if (socket_files == NULL) {
398 		free(old);
399 		return -1;
400 	}
401 
402 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
403 	nb_sockets++;
404 
405 	return 0;
406 }
407 
408 /*
409  * Parse the portmask provided at run time.
410  */
411 static int
412 parse_portmask(const char *portmask)
413 {
414 	char *end = NULL;
415 	unsigned long pm;
416 
417 	errno = 0;
418 
419 	/* parse hexadecimal string */
420 	pm = strtoul(portmask, &end, 16);
421 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
422 		return 0;
423 
424 	return pm;
425 
426 }
427 
428 /*
429  * Parse num options at run time.
430  */
431 static int
432 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
433 {
434 	char *end = NULL;
435 	unsigned long num;
436 
437 	errno = 0;
438 
439 	/* parse unsigned int string */
440 	num = strtoul(q_arg, &end, 10);
441 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
442 		return -1;
443 
444 	if (num > max_valid_value)
445 		return -1;
446 
447 	return num;
448 
449 }
450 
451 /*
452  * Display usage
453  */
454 static void
455 us_vhost_usage(const char *prgname)
456 {
457 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
458 	"		--vm2vm [0|1|2]\n"
459 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
460 	"		--socket-file <path>\n"
461 	"		--nb-devices ND\n"
462 	"		-p PORTMASK: Set mask for ports to be used by application\n"
463 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
464 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
465 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
466 	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
467 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
468 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
469 	"		--socket-file: The path of the socket file.\n"
470 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
471 	"		--tso [0|1] disable/enable TCP segment offload.\n"
472 	"		--client register a vhost-user socket as client mode.\n"
473 	"		--dma-type register the DMA type for your vhost async driver. Only \"ioat\" is supported for now.\n"
474 	"		--dmas register a DMA channel for a specific vhost device.\n",
475 	       prgname);
476 }
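/*
 * Illustrative invocation (not part of the original source; the EAL options
 * and the socket path are placeholders to be adapted to the target system):
 *
 *     ./dpdk-vhost -l 1-3 -n 4 -- \
 *         -p 0x1 --vm2vm 1 --mergeable 1 --stats 2 \
 *         --socket-file /tmp/vhost-net0.sock
 *
 * This enables port 0, software VM2VM forwarding, mergeable RX buffers,
 * stats printed every 2 seconds, and a single vhost-user socket.
 */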
477 
478 enum {
479 #define OPT_VM2VM               "vm2vm"
480 	OPT_VM2VM_NUM = 256,
481 #define OPT_RX_RETRY            "rx-retry"
482 	OPT_RX_RETRY_NUM,
483 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
484 	OPT_RX_RETRY_DELAY_NUM,
485 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
486 	OPT_RX_RETRY_NUMB_NUM,
487 #define OPT_MERGEABLE           "mergeable"
488 	OPT_MERGEABLE_NUM,
489 #define OPT_STATS               "stats"
490 	OPT_STATS_NUM,
491 #define OPT_SOCKET_FILE         "socket-file"
492 	OPT_SOCKET_FILE_NUM,
493 #define OPT_TX_CSUM             "tx-csum"
494 	OPT_TX_CSUM_NUM,
495 #define OPT_TSO                 "tso"
496 	OPT_TSO_NUM,
497 #define OPT_CLIENT              "client"
498 	OPT_CLIENT_NUM,
499 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
500 	OPT_BUILTIN_NET_DRIVER_NUM,
501 #define OPT_DMA_TYPE            "dma-type"
502 	OPT_DMA_TYPE_NUM,
503 #define OPT_DMAS                "dmas"
504 	OPT_DMAS_NUM,
505 };
506 
507 /*
508  * Parse the arguments given in the command line of the application.
509  */
510 static int
511 us_vhost_parse_args(int argc, char **argv)
512 {
513 	int opt, ret;
514 	int option_index;
515 	unsigned i;
516 	const char *prgname = argv[0];
517 	static struct option long_option[] = {
518 		{OPT_VM2VM, required_argument,
519 				NULL, OPT_VM2VM_NUM},
520 		{OPT_RX_RETRY, required_argument,
521 				NULL, OPT_RX_RETRY_NUM},
522 		{OPT_RX_RETRY_DELAY, required_argument,
523 				NULL, OPT_RX_RETRY_DELAY_NUM},
524 		{OPT_RX_RETRY_NUMB, required_argument,
525 				NULL, OPT_RX_RETRY_NUMB_NUM},
526 		{OPT_MERGEABLE, required_argument,
527 				NULL, OPT_MERGEABLE_NUM},
528 		{OPT_STATS, required_argument,
529 				NULL, OPT_STATS_NUM},
530 		{OPT_SOCKET_FILE, required_argument,
531 				NULL, OPT_SOCKET_FILE_NUM},
532 		{OPT_TX_CSUM, required_argument,
533 				NULL, OPT_TX_CSUM_NUM},
534 		{OPT_TSO, required_argument,
535 				NULL, OPT_TSO_NUM},
536 		{OPT_CLIENT, no_argument,
537 				NULL, OPT_CLIENT_NUM},
538 		{OPT_BUILTIN_NET_DRIVER, no_argument,
539 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
540 		{OPT_DMA_TYPE, required_argument,
541 				NULL, OPT_DMA_TYPE_NUM},
542 		{OPT_DMAS, required_argument,
543 				NULL, OPT_DMAS_NUM},
544 		{NULL, 0, 0, 0},
545 	};
546 
547 	/* Parse command line */
548 	while ((opt = getopt_long(argc, argv, "p:P",
549 			long_option, &option_index)) != EOF) {
550 		switch (opt) {
551 		/* Portmask */
552 		case 'p':
553 			enabled_port_mask = parse_portmask(optarg);
554 			if (enabled_port_mask == 0) {
555 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
556 				us_vhost_usage(prgname);
557 				return -1;
558 			}
559 			break;
560 
561 		case 'P':
562 			promiscuous = 1;
563 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
564 				ETH_VMDQ_ACCEPT_BROADCAST |
565 				ETH_VMDQ_ACCEPT_MULTICAST;
566 			break;
567 
568 		case OPT_VM2VM_NUM:
569 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
570 			if (ret == -1) {
571 				RTE_LOG(INFO, VHOST_CONFIG,
572 					"Invalid argument for "
573 					"vm2vm [0|1|2]\n");
574 				us_vhost_usage(prgname);
575 				return -1;
576 			}
577 			vm2vm_mode = (vm2vm_type)ret;
578 			break;
579 
580 		case OPT_RX_RETRY_NUM:
581 			ret = parse_num_opt(optarg, 1);
582 			if (ret == -1) {
583 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
584 				us_vhost_usage(prgname);
585 				return -1;
586 			}
587 			enable_retry = ret;
588 			break;
589 
590 		case OPT_TX_CSUM_NUM:
591 			ret = parse_num_opt(optarg, 1);
592 			if (ret == -1) {
593 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
594 				us_vhost_usage(prgname);
595 				return -1;
596 			}
597 			enable_tx_csum = ret;
598 			break;
599 
600 		case OPT_TSO_NUM:
601 			ret = parse_num_opt(optarg, 1);
602 			if (ret == -1) {
603 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
604 				us_vhost_usage(prgname);
605 				return -1;
606 			}
607 			enable_tso = ret;
608 			break;
609 
610 		case OPT_RX_RETRY_DELAY_NUM:
611 			ret = parse_num_opt(optarg, INT32_MAX);
612 			if (ret == -1) {
613 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
614 				us_vhost_usage(prgname);
615 				return -1;
616 			}
617 			burst_rx_delay_time = ret;
618 			break;
619 
620 		case OPT_RX_RETRY_NUMB_NUM:
621 			ret = parse_num_opt(optarg, INT32_MAX);
622 			if (ret == -1) {
623 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
624 				us_vhost_usage(prgname);
625 				return -1;
626 			}
627 			burst_rx_retry_num = ret;
628 			break;
629 
630 		case OPT_MERGEABLE_NUM:
631 			ret = parse_num_opt(optarg, 1);
632 			if (ret == -1) {
633 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
634 				us_vhost_usage(prgname);
635 				return -1;
636 			}
637 			mergeable = !!ret;
638 			if (ret) {
639 				vmdq_conf_default.rxmode.offloads |=
640 					DEV_RX_OFFLOAD_JUMBO_FRAME;
641 				vmdq_conf_default.rxmode.max_rx_pkt_len
642 					= JUMBO_FRAME_MAX_SIZE;
643 			}
644 			break;
645 
646 		case OPT_STATS_NUM:
647 			ret = parse_num_opt(optarg, INT32_MAX);
648 			if (ret == -1) {
649 				RTE_LOG(INFO, VHOST_CONFIG,
650 					"Invalid argument for stats [0..N]\n");
651 				us_vhost_usage(prgname);
652 				return -1;
653 			}
654 			enable_stats = ret;
655 			break;
656 
657 		/* Set socket file path. */
658 		case OPT_SOCKET_FILE_NUM:
659 			if (us_vhost_parse_socket_path(optarg) == -1) {
660 				RTE_LOG(INFO, VHOST_CONFIG,
661 				"Invalid argument for socket name (Max %d characters)\n",
662 				PATH_MAX);
663 				us_vhost_usage(prgname);
664 				return -1;
665 			}
666 			break;
667 
668 		case OPT_DMA_TYPE_NUM:
669 			dma_type = optarg;
670 			break;
671 
672 		case OPT_DMAS_NUM:
673 			if (open_dma(optarg) == -1) {
674 				RTE_LOG(INFO, VHOST_CONFIG,
675 					"Wrong DMA args\n");
676 				us_vhost_usage(prgname);
677 				return -1;
678 			}
679 			async_vhost_driver = 1;
680 			break;
681 
682 		case OPT_CLIENT_NUM:
683 			client_mode = 1;
684 			break;
685 
686 		case OPT_BUILTIN_NET_DRIVER_NUM:
687 			builtin_net_driver = 1;
688 			break;
689 
690 		/* Invalid option - print options. */
691 		default:
692 			us_vhost_usage(prgname);
693 			return -1;
694 		}
695 	}
696 
697 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
698 		if (enabled_port_mask & (1 << i))
699 			ports[num_ports++] = i;
700 	}
701 
702 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
703 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
704 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
705 		return -1;
706 	}
707 
708 	return 0;
709 }
710 
711 /*
712  * Update the global variable num_ports and the array ports according to the
713  * number of ports on the system, and return the number of valid ports.
714  */
715 static unsigned check_ports_num(unsigned nb_ports)
716 {
717 	unsigned valid_num_ports = num_ports;
718 	unsigned portid;
719 
720 	if (num_ports > nb_ports) {
721 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
722 			num_ports, nb_ports);
723 		num_ports = nb_ports;
724 	}
725 
726 	for (portid = 0; portid < num_ports; portid ++) {
727 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
728 			RTE_LOG(INFO, VHOST_PORT,
729 				"\nSpecified port ID(%u) is not valid\n",
730 				ports[portid]);
731 			ports[portid] = INVALID_PORT_ID;
732 			valid_num_ports--;
733 		}
734 	}
735 	return valid_num_ports;
736 }
737 
738 static __rte_always_inline struct vhost_dev *
739 find_vhost_dev(struct rte_ether_addr *mac)
740 {
741 	struct vhost_dev *vdev;
742 
743 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
744 		if (vdev->ready == DEVICE_RX &&
745 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
746 			return vdev;
747 	}
748 
749 	return NULL;
750 }
751 
752 /*
753  * This function learns the MAC address of the device and registers it, along
754  * with a VLAN tag, with a VMDQ pool.
755  */
756 static int
757 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
758 {
759 	struct rte_ether_hdr *pkt_hdr;
760 	int i, ret;
761 
762 	/* Learn MAC address of guest device from packet */
763 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
764 
765 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
766 		RTE_LOG(ERR, VHOST_DATA,
767 			"(%d) device is using a registered MAC!\n",
768 			vdev->vid);
769 		return -1;
770 	}
771 
772 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
773 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
774 
775 	/* vlan_tag currently uses the device_id. */
776 	vdev->vlan_tag = vlan_tags[vdev->vid];
777 
778 	/* Print out VMDQ registration info. */
779 	RTE_LOG(INFO, VHOST_DATA,
780 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
781 		vdev->vid,
782 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
783 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
784 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
785 		vdev->vlan_tag);
786 
787 	/* Register the MAC address. */
788 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
789 				(uint32_t)vdev->vid + vmdq_pool_base);
790 	if (ret)
791 		RTE_LOG(ERR, VHOST_DATA,
792 			"(%d) failed to add device MAC address to VMDQ\n",
793 			vdev->vid);
794 
795 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
796 
797 	/* Set device as ready for RX. */
798 	vdev->ready = DEVICE_RX;
799 
800 	return 0;
801 }
802 
803 /*
804  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
805  * queue before disabling RX on the device.
806  */
807 static inline void
808 unlink_vmdq(struct vhost_dev *vdev)
809 {
810 	unsigned i = 0;
811 	unsigned rx_count;
812 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
813 
814 	if (vdev->ready == DEVICE_RX) {
815 		/*clear MAC and VLAN settings*/
816 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
817 		for (i = 0; i < 6; i++)
818 			vdev->mac_address.addr_bytes[i] = 0;
819 
820 		vdev->vlan_tag = 0;
821 
822 		/*Clear out the receive buffers*/
823 		rx_count = rte_eth_rx_burst(ports[0],
824 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
825 
826 		while (rx_count) {
827 			for (i = 0; i < rx_count; i++)
828 				rte_pktmbuf_free(pkts_burst[i]);
829 
830 			rx_count = rte_eth_rx_burst(ports[0],
831 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
832 		}
833 
834 		vdev->ready = DEVICE_MAC_LEARNING;
835 	}
836 }
837 
838 static inline void
839 free_pkts(struct rte_mbuf **pkts, uint16_t n)
840 {
841 	while (n--)
842 		rte_pktmbuf_free(pkts[n]);
843 }
844 
845 static __rte_always_inline void
846 complete_async_pkts(struct vhost_dev *vdev)
847 {
848 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
849 	uint16_t complete_count;
850 
851 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
852 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
853 	if (complete_count)
854 		free_pkts(p_cpl, complete_count);
855 }
856 
857 static __rte_always_inline void
858 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
859 	    struct rte_mbuf *m)
860 {
861 	uint16_t ret;
862 
863 	if (builtin_net_driver) {
864 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
865 	} else {
866 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
867 	}
868 
869 	if (enable_stats) {
870 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
871 				__ATOMIC_SEQ_CST);
872 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
873 				__ATOMIC_SEQ_CST);
874 		src_vdev->stats.tx_total++;
875 		src_vdev->stats.tx += ret;
876 	}
877 }
878 
879 static __rte_always_inline void
880 drain_vhost(struct vhost_dev *vdev)
881 {
882 	uint16_t ret;
883 	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
884 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
885 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
886 
887 	if (builtin_net_driver) {
888 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
889 	} else if (async_vhost_driver) {
890 		uint32_t cpu_cpl_nr = 0;
891 		uint16_t enqueue_fail = 0;
892 		struct rte_mbuf *m_cpu_cpl[nr_xmit];
893 
894 		complete_async_pkts(vdev);
895 		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
896 					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
897 
898 		if (cpu_cpl_nr)
899 			free_pkts(m_cpu_cpl, cpu_cpl_nr);
900 
901 		enqueue_fail = nr_xmit - ret;
902 		if (enqueue_fail)
903 			free_pkts(&m[ret], nr_xmit - ret);
904 	} else {
905 		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
906 						m, nr_xmit);
907 	}
908 
909 	if (enable_stats) {
910 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
911 				__ATOMIC_SEQ_CST);
912 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
913 				__ATOMIC_SEQ_CST);
914 	}
915 
916 	if (!async_vhost_driver)
917 		free_pkts(m, nr_xmit);
918 }
919 
920 static __rte_always_inline void
921 drain_vhost_table(void)
922 {
923 	uint16_t lcore_id = rte_lcore_id();
924 	struct vhost_bufftable *vhost_txq;
925 	struct vhost_dev *vdev;
926 	uint64_t cur_tsc;
927 
928 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
929 		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
930 						+ vdev->vid];
931 
932 		cur_tsc = rte_rdtsc();
933 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
934 				> MBUF_TABLE_DRAIN_TSC)) {
935 			RTE_LOG_DP(DEBUG, VHOST_DATA,
936 				"Vhost TX queue drained after timeout with burst size %u\n",
937 				vhost_txq->len);
938 			drain_vhost(vdev);
939 			vhost_txq->len = 0;
940 			vhost_txq->pre_tsc = cur_tsc;
941 		}
942 	}
943 }
944 
945 /*
946  * Check if the packet destination MAC address is for a local device. If so, put
947  * the packet on that device's RX queue. If not, return.
948  */
949 static __rte_always_inline int
950 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
951 {
952 	struct rte_ether_hdr *pkt_hdr;
953 	struct vhost_dev *dst_vdev;
954 	struct vhost_bufftable *vhost_txq;
955 	uint16_t lcore_id = rte_lcore_id();
956 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
957 
958 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
959 	if (!dst_vdev)
960 		return -1;
961 
962 	if (vdev->vid == dst_vdev->vid) {
963 		RTE_LOG_DP(DEBUG, VHOST_DATA,
964 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
965 			vdev->vid);
966 		return 0;
967 	}
968 
969 	RTE_LOG_DP(DEBUG, VHOST_DATA,
970 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
971 
972 	if (unlikely(dst_vdev->remove)) {
973 		RTE_LOG_DP(DEBUG, VHOST_DATA,
974 			"(%d) device is marked for removal\n", dst_vdev->vid);
975 		return 0;
976 	}
977 
978 	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
979 	vhost_txq->m_table[vhost_txq->len++] = m;
980 
981 	if (enable_stats) {
982 		vdev->stats.tx_total++;
983 		vdev->stats.tx++;
984 	}
985 
986 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
987 		drain_vhost(dst_vdev);
988 		vhost_txq->len = 0;
989 		vhost_txq->pre_tsc = rte_rdtsc();
990 	}
991 	return 0;
992 }
993 
994 /*
995  * Check if the destination MAC of a packet belongs to a local VM;
996  * if it does, get its VLAN tag and offset.
997  */
998 static __rte_always_inline int
999 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1000 	uint32_t *offset, uint16_t *vlan_tag)
1001 {
1002 	struct vhost_dev *dst_vdev;
1003 	struct rte_ether_hdr *pkt_hdr =
1004 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1005 
1006 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
1007 	if (!dst_vdev)
1008 		return 0;
1009 
1010 	if (vdev->vid == dst_vdev->vid) {
1011 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1012 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1013 			vdev->vid);
1014 		return -1;
1015 	}
1016 
1017 	/*
1018 	 * HW VLAN strip reduces the packet length by the
1019 	 * length of the VLAN tag, so restore the packet
1020 	 * length by adding it back.
1021 	 */
1022 	*offset  = VLAN_HLEN;
1023 	*vlan_tag = vlan_tags[vdev->vid];
1024 
1025 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1026 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1027 		vdev->vid, dst_vdev->vid, *vlan_tag);
1028 
1029 	return 0;
1030 }
1031 
1032 static uint16_t
1033 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1034 {
1035 	if (ol_flags & PKT_TX_IPV4)
1036 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1037 	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1038 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1039 }
1040 
1041 static void virtio_tx_offload(struct rte_mbuf *m)
1042 {
1043 	void *l3_hdr;
1044 	struct rte_ipv4_hdr *ipv4_hdr = NULL;
1045 	struct rte_tcp_hdr *tcp_hdr = NULL;
1046 	struct rte_ether_hdr *eth_hdr =
1047 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1048 
1049 	l3_hdr = (char *)eth_hdr + m->l2_len;
1050 
1051 	if (m->ol_flags & PKT_TX_IPV4) {
1052 		ipv4_hdr = l3_hdr;
1053 		ipv4_hdr->hdr_checksum = 0;
1054 		m->ol_flags |= PKT_TX_IP_CKSUM;
1055 	}
1056 
1057 	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
1058 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1059 }
1060 
1061 static __rte_always_inline void
1062 do_drain_mbuf_table(struct mbuf_table *tx_q)
1063 {
1064 	uint16_t count;
1065 
1066 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1067 				 tx_q->m_table, tx_q->len);
1068 	if (unlikely(count < tx_q->len))
1069 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1070 
1071 	tx_q->len = 0;
1072 }
1073 
1074 /*
1075  * This function routes the TX packet to the correct interface. This
1076  * may be a local device or the physical port.
1077  */
1078 static __rte_always_inline void
1079 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1080 {
1081 	struct mbuf_table *tx_q;
1082 	unsigned offset = 0;
1083 	const uint16_t lcore_id = rte_lcore_id();
1084 	struct rte_ether_hdr *nh;
1085 
1086 
1087 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1088 	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1089 		struct vhost_dev *vdev2;
1090 
1091 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1092 			if (vdev2 != vdev)
1093 				sync_virtio_xmit(vdev2, vdev, m);
1094 		}
1095 		goto queue2nic;
1096 	}
1097 
1098 	/*check if destination is local VM*/
1099 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1100 		return;
1101 
1102 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1103 		if (unlikely(find_local_dest(vdev, m, &offset,
1104 					     &vlan_tag) != 0)) {
1105 			rte_pktmbuf_free(m);
1106 			return;
1107 		}
1108 	}
1109 
1110 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1111 		"(%d) TX: MAC address is external\n", vdev->vid);
1112 
1113 queue2nic:
1114 
1115 	/*Add packet to the port tx queue*/
1116 	tx_q = &lcore_tx_queue[lcore_id];
1117 
1118 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1119 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1120 		/* Guest has inserted the vlan tag. */
1121 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1122 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1123 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1124 			(vh->vlan_tci != vlan_tag_be))
1125 			vh->vlan_tci = vlan_tag_be;
1126 	} else {
1127 		m->ol_flags |= PKT_TX_VLAN_PKT;
1128 
1129 		/*
1130 		 * Find the right seg to adjust the data len when offset is
1131 		 * bigger than tail room size.
1132 		 */
1133 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1134 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1135 				m->data_len += offset;
1136 			else {
1137 				struct rte_mbuf *seg = m;
1138 
1139 				while ((seg->next != NULL) &&
1140 					(offset > rte_pktmbuf_tailroom(seg)))
1141 					seg = seg->next;
1142 
1143 				seg->data_len += offset;
1144 			}
1145 			m->pkt_len += offset;
1146 		}
1147 
1148 		m->vlan_tci = vlan_tag;
1149 	}
1150 
1151 	if (m->ol_flags & PKT_TX_TCP_SEG)
1152 		virtio_tx_offload(m);
1153 
1154 	tx_q->m_table[tx_q->len++] = m;
1155 	if (enable_stats) {
1156 		vdev->stats.tx_total++;
1157 		vdev->stats.tx++;
1158 	}
1159 
1160 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1161 		do_drain_mbuf_table(tx_q);
1162 }
1163 
1164 
1165 static __rte_always_inline void
1166 drain_mbuf_table(struct mbuf_table *tx_q)
1167 {
1168 	static uint64_t prev_tsc;
1169 	uint64_t cur_tsc;
1170 
1171 	if (tx_q->len == 0)
1172 		return;
1173 
1174 	cur_tsc = rte_rdtsc();
1175 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1176 		prev_tsc = cur_tsc;
1177 
1178 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1179 			"TX queue drained after timeout with burst size %u\n",
1180 			tx_q->len);
1181 		do_drain_mbuf_table(tx_q);
1182 	}
1183 }
1184 
1185 static __rte_always_inline void
1186 drain_eth_rx(struct vhost_dev *vdev)
1187 {
1188 	uint16_t rx_count, enqueue_count;
1189 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1190 
1191 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1192 				    pkts, MAX_PKT_BURST);
1193 
1194 	if (!rx_count)
1195 		return;
1196 
1197 	/*
1198 	 * When "enable_retry" is set, we wait and retry when there
1199 	 * are not enough free slots in the queue to hold @rx_count
1200 	 * packets, to diminish packet loss.
1201 	 */
1202 	if (enable_retry &&
1203 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1204 			VIRTIO_RXQ))) {
1205 		uint32_t retry;
1206 
1207 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1208 			rte_delay_us(burst_rx_delay_time);
1209 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1210 					VIRTIO_RXQ))
1211 				break;
1212 		}
1213 	}
1214 
1215 	if (builtin_net_driver) {
1216 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1217 						pkts, rx_count);
1218 	} else if (async_vhost_driver) {
1219 		uint32_t cpu_cpl_nr = 0;
1220 		uint16_t enqueue_fail = 0;
1221 		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
1222 
1223 		complete_async_pkts(vdev);
1224 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1225 					VIRTIO_RXQ, pkts, rx_count,
1226 					m_cpu_cpl, &cpu_cpl_nr);
1227 		if (cpu_cpl_nr)
1228 			free_pkts(m_cpu_cpl, cpu_cpl_nr);
1229 
1230 		enqueue_fail = rx_count - enqueue_count;
1231 		if (enqueue_fail)
1232 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1233 
1234 	} else {
1235 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1236 						pkts, rx_count);
1237 	}
1238 
1239 	if (enable_stats) {
1240 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1241 				__ATOMIC_SEQ_CST);
1242 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1243 				__ATOMIC_SEQ_CST);
1244 	}
1245 
1246 	if (!async_vhost_driver)
1247 		free_pkts(pkts, rx_count);
1248 }
1249 
1250 static __rte_always_inline void
1251 drain_virtio_tx(struct vhost_dev *vdev)
1252 {
1253 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1254 	uint16_t count;
1255 	uint16_t i;
1256 
1257 	if (builtin_net_driver) {
1258 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1259 					pkts, MAX_PKT_BURST);
1260 	} else {
1261 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1262 					mbuf_pool, pkts, MAX_PKT_BURST);
1263 	}
1264 
1265 	/* setup VMDq for the first packet; drop the burst if that fails */
1266 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count &&
1267 			(vdev->remove || link_vmdq(vdev, pkts[0]) == -1)) {
1268 		free_pkts(pkts, count);
1269 		return;
1270 	}
1271 	for (i = 0; i < count; ++i)
1272 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1273 }
1274 
1275 /*
1276  * Main function of vhost-switch. It basically does:
1277  *
1278  * for each vhost device {
1279  *    - drain_eth_rx()
1280  *
1281  *      Which drains the host eth Rx queue linked to the vhost device
1282  *      and delivers all of the packets to the guest virtio Rx ring
1283  *      associated with this vhost device.
1284  *
1285  *    - drain_virtio_tx()
1286  *
1287  *      Which drains the guest virtio Tx queue and delivers all of the
1288  *      packets to the target, which could be another vhost device, or the
1289  *      physical eth dev. The route is done in function "virtio_tx_route".
1290  * }
1291  */
1292 static int
1293 switch_worker(void *arg __rte_unused)
1294 {
1295 	unsigned i;
1296 	unsigned lcore_id = rte_lcore_id();
1297 	struct vhost_dev *vdev;
1298 	struct mbuf_table *tx_q;
1299 
1300 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1301 
1302 	tx_q = &lcore_tx_queue[lcore_id];
1303 	for (i = 0; i < rte_lcore_count(); i++) {
1304 		if (lcore_ids[i] == lcore_id) {
1305 			tx_q->txq_id = i;
1306 			break;
1307 		}
1308 	}
1309 
1310 	while(1) {
1311 		drain_mbuf_table(tx_q);
1312 		drain_vhost_table();
1313 		/*
1314 		 * Inform the configuration core that we have exited the
1315 		 * linked list and that no devices are in use if requested.
1316 		 */
1317 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1318 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1319 
1320 		/*
1321 		 * Process vhost devices
1322 		 */
1323 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1324 			      lcore_vdev_entry) {
1325 			if (unlikely(vdev->remove)) {
1326 				unlink_vmdq(vdev);
1327 				vdev->ready = DEVICE_SAFE_REMOVE;
1328 				continue;
1329 			}
1330 
1331 			if (likely(vdev->ready == DEVICE_RX))
1332 				drain_eth_rx(vdev);
1333 
1334 			if (likely(!vdev->remove))
1335 				drain_virtio_tx(vdev);
1336 		}
1337 	}
1338 
1339 	return 0;
1340 }
1341 
1342 /*
1343  * Remove a device from the specific data core linked list and from the
1344  * main linked list. Synchronization occurs through the use of the
1345  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1346  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1347  */
1348 static void
1349 destroy_device(int vid)
1350 {
1351 	struct vhost_dev *vdev = NULL;
1352 	int lcore;
1353 	uint16_t i;
1354 
1355 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1356 		if (vdev->vid == vid)
1357 			break;
1358 	}
1359 	if (!vdev)
1360 		return;
1361 	/*set the remove flag. */
1362 	vdev->remove = 1;
1363 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1364 		rte_pause();
1365 	}
1366 
1367 	for (i = 0; i < RTE_MAX_LCORE; i++)
1368 		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1369 
1370 	if (builtin_net_driver)
1371 		vs_vhost_net_remove(vdev);
1372 
1373 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1374 		     lcore_vdev_entry);
1375 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1376 
1377 
1378 	/* Set the dev_removal_flag on each lcore. */
1379 	RTE_LCORE_FOREACH_WORKER(lcore)
1380 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1381 
1382 	/*
1383 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1384 	 * we can be sure that they can no longer access the device removed
1385 	 * from the linked lists and that the devices are no longer in use.
1386 	 */
1387 	RTE_LCORE_FOREACH_WORKER(lcore) {
1388 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1389 			rte_pause();
1390 	}
1391 
1392 	lcore_info[vdev->coreid].device_num--;
1393 
1394 	RTE_LOG(INFO, VHOST_DATA,
1395 		"(%d) device has been removed from data core\n",
1396 		vdev->vid);
1397 
1398 	if (async_vhost_driver)
1399 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1400 
1401 	rte_free(vdev);
1402 }
1403 
1404 /*
1405  * A new device is added to a data core. First the device is added to the main linked list
1406  * and then allocated to a specific data core.
1407  */
1408 static int
1409 new_device(int vid)
1410 {
1411 	int lcore, core_add = 0;
1412 	uint16_t i;
1413 	uint32_t device_num_min = num_devices;
1414 	struct vhost_dev *vdev;
1415 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1416 	if (vdev == NULL) {
1417 		RTE_LOG(INFO, VHOST_DATA,
1418 			"(%d) couldn't allocate memory for vhost dev\n",
1419 			vid);
1420 		return -1;
1421 	}
1422 	vdev->vid = vid;
1423 
1424 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1425 		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1426 			= rte_zmalloc("vhost bufftable",
1427 				sizeof(struct vhost_bufftable),
1428 				RTE_CACHE_LINE_SIZE);
1429 
1430 		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1431 			RTE_LOG(INFO, VHOST_DATA,
1432 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1433 			return -1;
1434 		}
1435 	}
1436 
1437 	if (builtin_net_driver)
1438 		vs_vhost_net_setup(vdev);
1439 
1440 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1441 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1442 
1443 	/*reset ready flag*/
1444 	vdev->ready = DEVICE_MAC_LEARNING;
1445 	vdev->remove = 0;
1446 
1447 	/* Find a suitable lcore to add the device. */
1448 	RTE_LCORE_FOREACH_WORKER(lcore) {
1449 		if (lcore_info[lcore].device_num < device_num_min) {
1450 			device_num_min = lcore_info[lcore].device_num;
1451 			core_add = lcore;
1452 		}
1453 	}
1454 	vdev->coreid = core_add;
1455 
1456 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1457 			  lcore_vdev_entry);
1458 	lcore_info[vdev->coreid].device_num++;
1459 
1460 	/* Disable notifications. */
1461 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1462 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1463 
1464 	RTE_LOG(INFO, VHOST_DATA,
1465 		"(%d) device has been added to data core %d\n",
1466 		vid, vdev->coreid);
1467 
1468 	if (async_vhost_driver) {
1469 		struct rte_vhost_async_features f;
1470 		struct rte_vhost_async_channel_ops channel_ops;
1471 
1472 		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1473 			channel_ops.transfer_data = ioat_transfer_data_cb;
1474 			channel_ops.check_completed_copies =
1475 				ioat_check_completed_copies_cb;
1476 
1477 			f.async_inorder = 1;
1478 			f.async_threshold = 256;
1479 
1480 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1481 				f.intval, &channel_ops);
1482 		}
1483 	}
1484 
1485 	return 0;
1486 }
1487 
1488 /*
1489  * These callbacks allow devices to be added to the data core when configuration
1490  * has fully completed.
1491  */
1492 static const struct vhost_device_ops virtio_net_device_ops =
1493 {
1494 	.new_device =  new_device,
1495 	.destroy_device = destroy_device,
1496 };
1497 
1498 /*
1499  * This is a thread that wakes up after a period to print stats if the user has
1500  * enabled them.
1501  */
1502 static void *
1503 print_stats(__rte_unused void *arg)
1504 {
1505 	struct vhost_dev *vdev;
1506 	uint64_t tx_dropped, rx_dropped;
1507 	uint64_t tx, tx_total, rx, rx_total;
1508 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1509 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1510 
1511 	while(1) {
1512 		sleep(enable_stats);
1513 
1514 		/* Clear screen and move to top left */
1515 		printf("%s%s\n", clr, top_left);
1516 		printf("Device statistics =================================\n");
1517 
1518 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1519 			tx_total   = vdev->stats.tx_total;
1520 			tx         = vdev->stats.tx;
1521 			tx_dropped = tx_total - tx;
1522 
1523 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1524 				__ATOMIC_SEQ_CST);
1525 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1526 				__ATOMIC_SEQ_CST);
1527 			rx_dropped = rx_total - rx;
1528 
1529 			printf("Statistics for device %d\n"
1530 				"-----------------------\n"
1531 				"TX total:              %" PRIu64 "\n"
1532 				"TX dropped:            %" PRIu64 "\n"
1533 				"TX successful:         %" PRIu64 "\n"
1534 				"RX total:              %" PRIu64 "\n"
1535 				"RX dropped:            %" PRIu64 "\n"
1536 				"RX successful:         %" PRIu64 "\n",
1537 				vdev->vid,
1538 				tx_total, tx_dropped, tx,
1539 				rx_total, rx_dropped, rx);
1540 		}
1541 
1542 		printf("===================================================\n");
1543 
1544 		fflush(stdout);
1545 	}
1546 
1547 	return NULL;
1548 }
1549 
1550 static void
1551 unregister_drivers(int socket_num)
1552 {
1553 	int i, ret;
1554 
1555 	for (i = 0; i < socket_num; i++) {
1556 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1557 		if (ret != 0)
1558 			RTE_LOG(ERR, VHOST_CONFIG,
1559 				"Fail to unregister vhost driver for %s.\n",
1560 				socket_files + i * PATH_MAX);
1561 	}
1562 }
1563 
1564 /* When we receive an INT signal, unregister the vhost driver */
1565 static void
1566 sigint_handler(__rte_unused int signum)
1567 {
1568 	/* Unregister vhost driver. */
1569 	unregister_drivers(nb_sockets);
1570 
1571 	exit(0);
1572 }
1573 
1574 /*
1575  * While creating an mbuf pool, one key thing is to figure out how
1576  * many mbuf entries are enough for our use. FYI, here are some
1577  * guidelines:
1578  *
1579  * - Each rx queue reserves @nr_rx_desc mbufs at queue setup stage
1580  *
1581  * - For each switch core (a CPU core that does the packet switching),
1582  *   we also need to reserve some mbufs for receiving the packets from
1583  *   the virtio Tx queue. How many are enough depends on the usage.
1584  *   It's normally a simple calculation like the following:
1585  *
1586  *       MAX_PKT_BURST * max packet size / mbuf size
1587  *
1588  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1589  *
1590  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1591  *   mbufs for receiving the packets from the physical NIC device.
1592  *
1593  * - We also need to make sure, for each switch core, that we have
1594  *   allocated enough mbufs to fill up the mbuf cache.
1595  */
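/*
 * Worked example (not part of the original source), assuming the default
 * MBUF_DATA_SIZE of 2176 bytes (2048 B data room + 128 B RTE_PKTMBUF_HEADROOM),
 * MAX_PKT_BURST of 32, mergeable and TSO disabled (mtu = 1500),
 * nr_rx_desc = 1024 and nr_mbuf_cache = 128:
 *
 *     nr_mbufs_per_core  = (1500 + 2176) * 32 / (2176 - 128)  ->   57
 *     nr_mbufs_per_core += 1024                               -> 1081
 *     nr_mbufs_per_core  = RTE_MAX(1081, 128)                 -> 1081
 *
 * so each switch core adds roughly 1K mbufs on top of the per-queue
 * nr_queues * nr_rx_desc reservation made in create_mbuf_pool() below.
 */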
1596 static void
1597 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1598 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1599 {
1600 	uint32_t nr_mbufs;
1601 	uint32_t nr_mbufs_per_core;
1602 	uint32_t mtu = 1500;
1603 
1604 	if (mergeable)
1605 		mtu = 9000;
1606 	if (enable_tso)
1607 		mtu = 64 * 1024;
1608 
1609 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1610 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1611 	nr_mbufs_per_core += nr_rx_desc;
1612 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1613 
1614 	nr_mbufs  = nr_queues * nr_rx_desc;
1615 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1616 	nr_mbufs *= nr_port;
1617 
1618 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1619 					    nr_mbuf_cache, 0, mbuf_size,
1620 					    rte_socket_id());
1621 	if (mbuf_pool == NULL)
1622 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1623 }
1624 
1625 /*
1626  * Main function, does initialisation and calls the per-lcore functions.
1627  */
1628 int
1629 main(int argc, char *argv[])
1630 {
1631 	unsigned lcore_id, core_id = 0;
1632 	unsigned nb_ports, valid_num_ports;
1633 	int ret, i;
1634 	uint16_t portid;
1635 	static pthread_t tid;
1636 	uint64_t flags = 0;
1637 
1638 	signal(SIGINT, sigint_handler);
1639 
1640 	/* init EAL */
1641 	ret = rte_eal_init(argc, argv);
1642 	if (ret < 0)
1643 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1644 	argc -= ret;
1645 	argv += ret;
1646 
1647 	/* parse app arguments */
1648 	ret = us_vhost_parse_args(argc, argv);
1649 	if (ret < 0)
1650 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1651 
1652 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1653 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1654 
1655 		if (rte_lcore_is_enabled(lcore_id))
1656 			lcore_ids[core_id++] = lcore_id;
1657 	}
1658 
1659 	if (rte_lcore_count() > RTE_MAX_LCORE)
1660 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1661 
1662 	/* Get the number of physical ports. */
1663 	nb_ports = rte_eth_dev_count_avail();
1664 
1665 	/*
1666 	 * Update the global variable num_ports and the global array ports,
1667 	 * and get the number of valid ports according to the system port count.
1668 	 */
1669 	valid_num_ports = check_ports_num(nb_ports);
1670 
1671 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1672 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1673 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1674 		return -1;
1675 	}
1676 
1677 	/*
1678 	 * FIXME: here we are trying to allocate mbufs big enough for
1679 	 * @MAX_QUEUES, but the truth is we're never going to use that
1680 	 * many queues here. We probably should only do allocation for
1681 	 * those queues we are going to use.
1682 	 */
1683 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1684 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1685 
1686 	if (vm2vm_mode == VM2VM_HARDWARE) {
1687 		/* Enable VT loop back to let L2 switch to do it. */
1688 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1689 		RTE_LOG(DEBUG, VHOST_CONFIG,
1690 			"Enable loop back for L2 switch in vmdq.\n");
1691 	}
1692 
1693 	/* initialize all ports */
1694 	RTE_ETH_FOREACH_DEV(portid) {
1695 		/* skip ports that are not enabled */
1696 		if ((enabled_port_mask & (1 << portid)) == 0) {
1697 			RTE_LOG(INFO, VHOST_PORT,
1698 				"Skipping disabled port %d\n", portid);
1699 			continue;
1700 		}
1701 		if (port_init(portid) != 0)
1702 			rte_exit(EXIT_FAILURE,
1703 				"Cannot initialize network ports\n");
1704 	}
1705 
1706 	/* Enable stats if the user option is set. */
1707 	if (enable_stats) {
1708 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1709 					print_stats, NULL);
1710 		if (ret < 0)
1711 			rte_exit(EXIT_FAILURE,
1712 				"Cannot create print-stats thread\n");
1713 	}
1714 
1715 	/* Launch all data cores. */
1716 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1717 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1718 
1719 	if (client_mode)
1720 		flags |= RTE_VHOST_USER_CLIENT;
1721 
1722 	/* Register vhost user driver to handle vhost messages. */
1723 	for (i = 0; i < nb_sockets; i++) {
1724 		char *file = socket_files + i * PATH_MAX;
1725 
1726 		if (async_vhost_driver)
1727 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1728 
1729 		ret = rte_vhost_driver_register(file, flags);
1730 		if (ret != 0) {
1731 			unregister_drivers(i);
1732 			rte_exit(EXIT_FAILURE,
1733 				"vhost driver register failure.\n");
1734 		}
1735 
1736 		if (builtin_net_driver)
1737 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1738 
1739 		if (mergeable == 0) {
1740 			rte_vhost_driver_disable_features(file,
1741 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1742 		}
1743 
1744 		if (enable_tx_csum == 0) {
1745 			rte_vhost_driver_disable_features(file,
1746 				1ULL << VIRTIO_NET_F_CSUM);
1747 		}
1748 
1749 		if (enable_tso == 0) {
1750 			rte_vhost_driver_disable_features(file,
1751 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1752 			rte_vhost_driver_disable_features(file,
1753 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1754 			rte_vhost_driver_disable_features(file,
1755 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1756 			rte_vhost_driver_disable_features(file,
1757 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1758 		}
1759 
1760 		if (promiscuous) {
1761 			rte_vhost_driver_enable_features(file,
1762 				1ULL << VIRTIO_NET_F_CTRL_RX);
1763 		}
1764 
1765 		ret = rte_vhost_driver_callback_register(file,
1766 			&virtio_net_device_ops);
1767 		if (ret != 0) {
1768 			rte_exit(EXIT_FAILURE,
1769 				"failed to register vhost driver callbacks.\n");
1770 		}
1771 
1772 		if (rte_vhost_driver_start(file) < 0) {
1773 			rte_exit(EXIT_FAILURE,
1774 				"failed to start vhost driver.\n");
1775 		}
1776 	}
1777 
1778 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1779 		rte_eal_wait_lcore(lcore_id);
1780 
1781 	/* clean up the EAL */
1782 	rte_eal_cleanup();
1783 
1784 	return 0;
1785 }
1786