xref: /dpdk/examples/vhost/main.c (revision ee6e451f14103b0396ae6a0087bdd33c67a79e12)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_vhost.h>
23 #include <rte_ip.h>
24 #include <rte_tcp.h>
25 #include <rte_pause.h>
26 
27 #include "ioat.h"
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60 
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63 
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66 
67 /* number of devices/queues to support*/
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70 
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73 
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76 	VM2VM_DISABLED = 0,
77 	VM2VM_SOFTWARE = 1,
78 	VM2VM_HARDWARE = 2,
79 	VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
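/*
 * In VM2VM_SOFTWARE mode the switch core looks up the destination MAC in
 * software (see virtio_tx_local()) and enqueues the packet directly to the
 * target vhost device.  In VM2VM_HARDWARE mode the packet is instead handed
 * to the physical port and the NIC's VMDQ loopback (enabled in main()) is
 * relied upon to switch it back to the destination pool.
 */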
82 
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87 
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90 
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93 
94 static int client_mode;
95 
96 static int builtin_net_driver;
97 
98 static int async_vhost_driver;
99 
100 static char dma_type[MAX_LONG_OPT_SZ];
101 
102 /* Specify timeout (in microseconds) between retries on RX. */
103 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
104 /* Specify the number of retries on RX. */
105 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
106 
107 /* Socket file paths. Can be set by user */
108 static char *socket_files;
109 static int nb_sockets;
110 
111 /* empty vmdq configuration structure. Filled in programmatically */
112 static struct rte_eth_conf vmdq_conf_default = {
113 	.rxmode = {
114 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
115 		.split_hdr_size = 0,
116 		/*
117 		 * VLAN strip is necessary for 1G NICs such as the I350;
118 		 * it fixes a bug where IPv4 forwarding in the guest cannot
119 		 * forward packets from one virtio dev to another virtio dev.
120 		 */
121 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
122 	},
123 
124 	.txmode = {
125 		.mq_mode = ETH_MQ_TX_NONE,
126 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
127 			     DEV_TX_OFFLOAD_TCP_CKSUM |
128 			     DEV_TX_OFFLOAD_VLAN_INSERT |
129 			     DEV_TX_OFFLOAD_MULTI_SEGS |
130 			     DEV_TX_OFFLOAD_TCP_TSO),
131 	},
132 	.rx_adv_conf = {
133 		/*
134 		 * should be overridden separately in code with
135 		 * appropriate values
136 		 */
137 		.vmdq_rx_conf = {
138 			.nb_queue_pools = ETH_8_POOLS,
139 			.enable_default_pool = 0,
140 			.default_pool = 0,
141 			.nb_pool_maps = 0,
142 			.pool_map = {{0, 0},},
143 		},
144 	},
145 };
146 
147 
148 static unsigned lcore_ids[RTE_MAX_LCORE];
149 static uint16_t ports[RTE_MAX_ETHPORTS];
150 static unsigned num_ports = 0; /**< The number of ports specified in command line */
151 static uint16_t num_pf_queues, num_vmdq_queues;
152 static uint16_t vmdq_pool_base, vmdq_queue_base;
153 static uint16_t queues_per_pool;
154 
155 const uint16_t vlan_tags[] = {
156 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
157 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
158 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
159 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
160 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
161 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
162 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
163 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
164 };
165 
166 /* ethernet addresses of ports */
167 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
168 
169 static struct vhost_dev_tailq_list vhost_dev_list =
170 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
171 
172 static struct lcore_info lcore_info[RTE_MAX_LCORE];
173 
174 /* Used for queueing bursts of TX packets. */
175 struct mbuf_table {
176 	unsigned len;
177 	unsigned txq_id;
178 	struct rte_mbuf *m_table[MAX_PKT_BURST];
179 };
180 
181 struct vhost_bufftable {
182 	uint32_t len;
183 	uint64_t pre_tsc;
184 	struct rte_mbuf *m_table[MAX_PKT_BURST];
185 };
186 
187 /* TX queue for each data core. */
188 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
189 
190 /*
191  * Vhost TX buffer for each data core.
192  * Every data core maintains a TX buffer for every vhost device,
193  * which is used to batch packet enqueues for higher performance.
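 *
 * The buffer for vhost device "vid" on lcore "lcore_id" lives at the flat
 * index lcore_id * MAX_VHOST_DEVICE + vid; drain_vhost_table() flushes any
 * buffer whose packets have waited longer than MBUF_TABLE_DRAIN_TSC.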
194  */
195 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
196 
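/*
 * MBUF_TABLE_DRAIN_TSC converts BURST_TX_DRAIN_US into TSC cycles: the
 * per-microsecond cycle count is rounded up, then multiplied by the drain
 * interval.  As an illustration only (the real value depends on the TSC
 * frequency), a 2 GHz TSC gives ((2000000000 + 999999) / 1000000) * 100 =
 * 200000 cycles, i.e. roughly 100 us.
 */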
197 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
198 				 / US_PER_S * BURST_TX_DRAIN_US)
199 #define VLAN_HLEN       4
200 
201 static inline int
202 open_dma(const char *value)
203 {
204 	if (strncmp(dma_type, "ioat", 4) == 0)
205 		return open_ioat(value);
206 
207 	return -1;
208 }
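
/*
 * open_dma() just dispatches on the --dma-type string; only "ioat" is
 * handled, by open_ioat() in ioat.c.  The --dmas value is parsed entirely by
 * that backend, so the exact syntax is an assumption here, but it typically
 * names a Tx queue and a DMA device per entry, e.g.
 * --dmas [txd0@0000:00:04.0,txd1@0000:00:04.1].
 */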
209 
210 /*
211  * Builds up the correct configuration for VMDQ VLAN pool map
212  * according to the pool & queue limits.
213  */
214 static inline int
215 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
216 {
217 	struct rte_eth_vmdq_rx_conf conf;
218 	struct rte_eth_vmdq_rx_conf *def_conf =
219 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
220 	unsigned i;
221 
222 	memset(&conf, 0, sizeof(conf));
223 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
224 	conf.nb_pool_maps = num_devices;
225 	conf.enable_loop_back = def_conf->enable_loop_back;
226 	conf.rx_mode = def_conf->rx_mode;
227 
228 	for (i = 0; i < conf.nb_pool_maps; i++) {
229 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
230 		conf.pool_map[i].pools = (1UL << i);
231 	}
232 
233 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
234 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
235 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
236 	return 0;
237 }
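
/*
 * With the mapping built in get_eth_conf(), VMDQ pool i receives traffic
 * tagged with vlan_tags[i]: pool 0 gets VLAN 1000 (pools bitmask 1 << 0),
 * pool 1 gets VLAN 1001 (1 << 1), and so on, one pool per virtio device.
 */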
238 
239 /*
240  * Initialises a given port using global settings, with the Rx buffers
241  * coming from the global mbuf_pool.
242  */
243 static inline int
244 port_init(uint16_t port)
245 {
246 	struct rte_eth_dev_info dev_info;
247 	struct rte_eth_conf port_conf;
248 	struct rte_eth_rxconf *rxconf;
249 	struct rte_eth_txconf *txconf;
250 	int16_t rx_rings, tx_rings;
251 	uint16_t rx_ring_size, tx_ring_size;
252 	int retval;
253 	uint16_t q;
254 
255 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
256 	retval = rte_eth_dev_info_get(port, &dev_info);
257 	if (retval != 0) {
258 		RTE_LOG(ERR, VHOST_PORT,
259 			"Error during getting device (port %u) info: %s\n",
260 			port, strerror(-retval));
261 
262 		return retval;
263 	}
264 
265 	rxconf = &dev_info.default_rxconf;
266 	txconf = &dev_info.default_txconf;
267 	rxconf->rx_drop_en = 1;
268 
269 	/*configure the number of supported virtio devices based on VMDQ limits */
270 	num_devices = dev_info.max_vmdq_pools;
271 
272 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
273 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
274 
275 	tx_rings = (uint16_t)rte_lcore_count();
276 
277 	/* Get port configuration. */
278 	retval = get_eth_conf(&port_conf, num_devices);
279 	if (retval < 0)
280 		return retval;
281 	/* NIC queues are divided into pf queues and vmdq queues.  */
282 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
283 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
284 	num_vmdq_queues = num_devices * queues_per_pool;
285 	num_queues = num_pf_queues + num_vmdq_queues;
286 	vmdq_queue_base = dev_info.vmdq_queue_base;
287 	vmdq_pool_base  = dev_info.vmdq_pool_base;
288 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
289 		num_pf_queues, num_devices, queues_per_pool);
290 
291 	if (!rte_eth_dev_is_valid_port(port))
292 		return -1;
293 
294 	rx_rings = (uint16_t)dev_info.max_rx_queues;
295 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
296 		port_conf.txmode.offloads |=
297 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
298 	/* Configure ethernet device. */
299 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
300 	if (retval != 0) {
301 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
302 			port, strerror(-retval));
303 		return retval;
304 	}
305 
306 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
307 		&tx_ring_size);
308 	if (retval != 0) {
309 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
310 			"for port %u: %s.\n", port, strerror(-retval));
311 		return retval;
312 	}
313 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
314 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
315 			"for Rx queues on port %u.\n", port);
316 		return -1;
317 	}
318 
319 	/* Setup the queues. */
320 	rxconf->offloads = port_conf.rxmode.offloads;
321 	for (q = 0; q < rx_rings; q ++) {
322 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
323 						rte_eth_dev_socket_id(port),
324 						rxconf,
325 						mbuf_pool);
326 		if (retval < 0) {
327 			RTE_LOG(ERR, VHOST_PORT,
328 				"Failed to setup rx queue %u of port %u: %s.\n",
329 				q, port, strerror(-retval));
330 			return retval;
331 		}
332 	}
333 	txconf->offloads = port_conf.txmode.offloads;
334 	for (q = 0; q < tx_rings; q ++) {
335 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
336 						rte_eth_dev_socket_id(port),
337 						txconf);
338 		if (retval < 0) {
339 			RTE_LOG(ERR, VHOST_PORT,
340 				"Failed to setup tx queue %u of port %u: %s.\n",
341 				q, port, strerror(-retval));
342 			return retval;
343 		}
344 	}
345 
346 	/* Start the device. */
347 	retval  = rte_eth_dev_start(port);
348 	if (retval < 0) {
349 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
350 			port, strerror(-retval));
351 		return retval;
352 	}
353 
354 	if (promiscuous) {
355 		retval = rte_eth_promiscuous_enable(port);
356 		if (retval != 0) {
357 			RTE_LOG(ERR, VHOST_PORT,
358 				"Failed to enable promiscuous mode on port %u: %s\n",
359 				port, rte_strerror(-retval));
360 			return retval;
361 		}
362 	}
363 
364 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
365 	if (retval < 0) {
366 		RTE_LOG(ERR, VHOST_PORT,
367 			"Failed to get MAC address on port %u: %s\n",
368 			port, rte_strerror(-retval));
369 		return retval;
370 	}
371 
372 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
373 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
374 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
375 			port,
376 			vmdq_ports_eth_addr[port].addr_bytes[0],
377 			vmdq_ports_eth_addr[port].addr_bytes[1],
378 			vmdq_ports_eth_addr[port].addr_bytes[2],
379 			vmdq_ports_eth_addr[port].addr_bytes[3],
380 			vmdq_ports_eth_addr[port].addr_bytes[4],
381 			vmdq_ports_eth_addr[port].addr_bytes[5]);
382 
383 	return 0;
384 }
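
/*
 * After port_init(), the VMDQ Rx queue used for vhost device "vid" is
 * vmdq_queue_base + vid * queues_per_pool (see new_device()).  As a purely
 * illustrative example, a NIC reporting 64 VMDQ pools and 128 VMDQ queues
 * gives queues_per_pool = 2, so device 3 would read from queue
 * vmdq_queue_base + 6.
 */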
385 
386 /*
387  * Set socket file path.
388  */
389 static int
390 us_vhost_parse_socket_path(const char *q_arg)
391 {
392 	char *old;
393 
394 	/* Reject paths that are too long to store */
395 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
396 		return -1;
397 
398 	old = socket_files;
399 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
400 	if (socket_files == NULL) {
401 		free(old);
402 		return -1;
403 	}
404 
405 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
406 	nb_sockets++;
407 
408 	return 0;
409 }
410 
411 /*
412  * Parse the portmask provided at run time.
413  */
414 static int
415 parse_portmask(const char *portmask)
416 {
417 	char *end = NULL;
418 	unsigned long pm;
419 
420 	errno = 0;
421 
422 	/* parse hexadecimal string */
423 	pm = strtoul(portmask, &end, 16);
424 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
425 		return 0;
426 
427 	return pm;
428 
429 }
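
/*
 * parse_portmask() interprets its argument as hex, so "-p 0x1" enables port
 * 0 only and "-p 0x3" enables ports 0 and 1 (note this application accepts
 * at most MAX_SUP_PORTS enabled ports).
 */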
430 
431 /*
432  * Parse num options at run time.
433  */
434 static int
435 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
436 {
437 	char *end = NULL;
438 	unsigned long num;
439 
440 	errno = 0;
441 
442 	/* parse unsigned int string */
443 	num = strtoul(q_arg, &end, 10);
444 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
445 		return -1;
446 
447 	if (num > max_valid_value)
448 		return -1;
449 
450 	return num;
451 
452 }
453 
454 /*
455  * Display usage
456  */
457 static void
458 us_vhost_usage(const char *prgname)
459 {
460 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
461 	"		--vm2vm [0|1|2]\n"
462 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
463 	"		--socket-file <path>\n"
464 	"		--nb-devices ND\n"
465 	"		-p PORTMASK: Set mask for ports to be used by application\n"
466 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
467 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
468 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if retries on RX are enabled\n"
469 	"		--rx-retry-num [0-N]: the number of retries on RX. This takes effect only if retries on RX are enabled\n"
470 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
471 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
472 	"		--socket-file: The path of the socket file.\n"
473 	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
474 	"		--tso [0|1]: disable/enable TCP segmentation offload (TSO).\n"
475 	"		--client: register the vhost-user socket in client mode.\n"
476 	"		--dma-type: register the DMA type for the vhost async driver. Only \"ioat\" is supported for now.\n"
477 	"		--dmas: register DMA channels for specific vhost devices.\n",
478 	       prgname);
479 }
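
/*
 * A minimal invocation, shown purely as an illustration (the binary name and
 * EAL core/memory options depend on the build and system), might look like:
 *
 *   ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 --stats 1
 */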
480 
481 /*
482  * Parse the arguments given in the command line of the application.
483  */
484 static int
485 us_vhost_parse_args(int argc, char **argv)
486 {
487 	int opt, ret;
488 	int option_index;
489 	unsigned i;
490 	const char *prgname = argv[0];
491 	static struct option long_option[] = {
492 		{"vm2vm", required_argument, NULL, 0},
493 		{"rx-retry", required_argument, NULL, 0},
494 		{"rx-retry-delay", required_argument, NULL, 0},
495 		{"rx-retry-num", required_argument, NULL, 0},
496 		{"mergeable", required_argument, NULL, 0},
497 		{"stats", required_argument, NULL, 0},
498 		{"socket-file", required_argument, NULL, 0},
499 		{"tx-csum", required_argument, NULL, 0},
500 		{"tso", required_argument, NULL, 0},
501 		{"client", no_argument, &client_mode, 1},
502 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
503 		{"dma-type", required_argument, NULL, 0},
504 		{"dmas", required_argument, NULL, 0},
505 		{NULL, 0, 0, 0},
506 	};
507 
508 	/* Parse command line */
509 	while ((opt = getopt_long(argc, argv, "p:P",
510 			long_option, &option_index)) != EOF) {
511 		switch (opt) {
512 		/* Portmask */
513 		case 'p':
514 			enabled_port_mask = parse_portmask(optarg);
515 			if (enabled_port_mask == 0) {
516 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
517 				us_vhost_usage(prgname);
518 				return -1;
519 			}
520 			break;
521 
522 		case 'P':
523 			promiscuous = 1;
524 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
525 				ETH_VMDQ_ACCEPT_BROADCAST |
526 				ETH_VMDQ_ACCEPT_MULTICAST;
527 
528 			break;
529 
530 		case 0:
531 			/* Enable/disable vm2vm comms. */
532 			if (!strncmp(long_option[option_index].name, "vm2vm",
533 				MAX_LONG_OPT_SZ)) {
534 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
535 				if (ret == -1) {
536 					RTE_LOG(INFO, VHOST_CONFIG,
537 						"Invalid argument for "
538 						"vm2vm [0|1|2]\n");
539 					us_vhost_usage(prgname);
540 					return -1;
541 				} else {
542 					vm2vm_mode = (vm2vm_type)ret;
543 				}
544 			}
545 
546 			/* Enable/disable retries on RX. */
547 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
548 				ret = parse_num_opt(optarg, 1);
549 				if (ret == -1) {
550 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
551 					us_vhost_usage(prgname);
552 					return -1;
553 				} else {
554 					enable_retry = ret;
555 				}
556 			}
557 
558 			/* Enable/disable TX checksum offload. */
559 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
560 				ret = parse_num_opt(optarg, 1);
561 				if (ret == -1) {
562 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
563 					us_vhost_usage(prgname);
564 					return -1;
565 				} else
566 					enable_tx_csum = ret;
567 			}
568 
569 			/* Enable/disable TSO offload. */
570 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
571 				ret = parse_num_opt(optarg, 1);
572 				if (ret == -1) {
573 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
574 					us_vhost_usage(prgname);
575 					return -1;
576 				} else
577 					enable_tso = ret;
578 			}
579 
580 			/* Specify the retry delay time (in microseconds) on RX. */
581 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
582 				ret = parse_num_opt(optarg, INT32_MAX);
583 				if (ret == -1) {
584 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
585 					us_vhost_usage(prgname);
586 					return -1;
587 				} else {
588 					burst_rx_delay_time = ret;
589 				}
590 			}
591 
592 			/* Specify the number of retries on RX. */
593 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
594 				ret = parse_num_opt(optarg, INT32_MAX);
595 				if (ret == -1) {
596 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
597 					us_vhost_usage(prgname);
598 					return -1;
599 				} else {
600 					burst_rx_retry_num = ret;
601 				}
602 			}
603 
604 			/* Enable/disable RX mergeable buffers. */
605 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
606 				ret = parse_num_opt(optarg, 1);
607 				if (ret == -1) {
608 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
609 					us_vhost_usage(prgname);
610 					return -1;
611 				} else {
612 					mergeable = !!ret;
613 					if (ret) {
614 						vmdq_conf_default.rxmode.offloads |=
615 							DEV_RX_OFFLOAD_JUMBO_FRAME;
616 						vmdq_conf_default.rxmode.max_rx_pkt_len
617 							= JUMBO_FRAME_MAX_SIZE;
618 					}
619 				}
620 			}
621 
622 			/* Enable/disable stats. */
623 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
624 				ret = parse_num_opt(optarg, INT32_MAX);
625 				if (ret == -1) {
626 					RTE_LOG(INFO, VHOST_CONFIG,
627 						"Invalid argument for stats [0..N]\n");
628 					us_vhost_usage(prgname);
629 					return -1;
630 				} else {
631 					enable_stats = ret;
632 				}
633 			}
634 
635 			/* Set socket file path. */
636 			if (!strncmp(long_option[option_index].name,
637 						"socket-file", MAX_LONG_OPT_SZ)) {
638 				if (us_vhost_parse_socket_path(optarg) == -1) {
639 					RTE_LOG(INFO, VHOST_CONFIG,
640 					"Invalid argument for socket name (Max %d characters)\n",
641 					PATH_MAX);
642 					us_vhost_usage(prgname);
643 					return -1;
644 				}
645 			}
646 
647 			if (!strncmp(long_option[option_index].name,
648 						"dma-type", MAX_LONG_OPT_SZ)) {
649 				if (strlen(optarg) >= MAX_LONG_OPT_SZ) {
650 					RTE_LOG(INFO, VHOST_CONFIG,
651 						"Wrong DMA type\n");
652 					us_vhost_usage(prgname);
653 					return -1;
654 				}
655 				strcpy(dma_type, optarg);
656 			}
657 
658 			if (!strncmp(long_option[option_index].name,
659 						"dmas", MAX_LONG_OPT_SZ)) {
660 				if (open_dma(optarg) == -1) {
661 					RTE_LOG(INFO, VHOST_CONFIG,
662 						"Wrong DMA args\n");
663 					us_vhost_usage(prgname);
664 					return -1;
665 				}
666 				async_vhost_driver = 1;
667 			}
668 
669 			break;
670 
671 			/* Invalid option - print options. */
672 		default:
673 			us_vhost_usage(prgname);
674 			return -1;
675 		}
676 	}
677 
678 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
679 		if (enabled_port_mask & (1 << i))
680 			ports[num_ports++] = i;
681 	}
682 
683 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
684 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
685 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
686 		return -1;
687 	}
688 
689 	return 0;
690 }
691 
692 /*
693  * Update the global variable num_ports and the array ports according to the
694  * number of ports in the system, and return the number of valid ports.
695  */
696 static unsigned check_ports_num(unsigned nb_ports)
697 {
698 	unsigned valid_num_ports = num_ports;
699 	unsigned portid;
700 
701 	if (num_ports > nb_ports) {
702 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
703 			num_ports, nb_ports);
704 		num_ports = nb_ports;
705 	}
706 
707 	for (portid = 0; portid < num_ports; portid ++) {
708 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
709 			RTE_LOG(INFO, VHOST_PORT,
710 				"\nSpecified port ID(%u) is not valid\n",
711 				ports[portid]);
712 			ports[portid] = INVALID_PORT_ID;
713 			valid_num_ports--;
714 		}
715 	}
716 	return valid_num_ports;
717 }
718 
719 static __rte_always_inline struct vhost_dev *
720 find_vhost_dev(struct rte_ether_addr *mac)
721 {
722 	struct vhost_dev *vdev;
723 
724 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
725 		if (vdev->ready == DEVICE_RX &&
726 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
727 			return vdev;
728 	}
729 
730 	return NULL;
731 }
732 
733 /*
734  * This function learns the MAC address of the device and registers it, along
735  * with a VLAN tag, with a VMDQ pool.
736  */
737 static int
738 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
739 {
740 	struct rte_ether_hdr *pkt_hdr;
741 	int i, ret;
742 
743 	/* Learn MAC address of guest device from packet */
744 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
745 
746 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
747 		RTE_LOG(ERR, VHOST_DATA,
748 			"(%d) device is using a registered MAC!\n",
749 			vdev->vid);
750 		return -1;
751 	}
752 
753 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
754 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
755 
756 	/* vlan_tag currently uses the device_id. */
757 	vdev->vlan_tag = vlan_tags[vdev->vid];
758 
759 	/* Print out VMDQ registration info. */
760 	RTE_LOG(INFO, VHOST_DATA,
761 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
762 		vdev->vid,
763 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
764 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
765 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
766 		vdev->vlan_tag);
767 
768 	/* Register the MAC address. */
769 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
770 				(uint32_t)vdev->vid + vmdq_pool_base);
771 	if (ret)
772 		RTE_LOG(ERR, VHOST_DATA,
773 			"(%d) failed to add device MAC address to VMDQ\n",
774 			vdev->vid);
775 
776 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
777 
778 	/* Set device as ready for RX. */
779 	vdev->ready = DEVICE_RX;
780 
781 	return 0;
782 }
783 
784 /*
785  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
786  * queue before disabling RX on the device.
787  */
788 static inline void
789 unlink_vmdq(struct vhost_dev *vdev)
790 {
791 	unsigned i = 0;
792 	unsigned rx_count;
793 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
794 
795 	if (vdev->ready == DEVICE_RX) {
796 		/*clear MAC and VLAN settings*/
797 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
798 		for (i = 0; i < 6; i++)
799 			vdev->mac_address.addr_bytes[i] = 0;
800 
801 		vdev->vlan_tag = 0;
802 
803 		/*Clear out the receive buffers*/
804 		rx_count = rte_eth_rx_burst(ports[0],
805 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
806 
807 		while (rx_count) {
808 			for (i = 0; i < rx_count; i++)
809 				rte_pktmbuf_free(pkts_burst[i]);
810 
811 			rx_count = rte_eth_rx_burst(ports[0],
812 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
813 		}
814 
815 		vdev->ready = DEVICE_MAC_LEARNING;
816 	}
817 }
818 
819 static inline void
820 free_pkts(struct rte_mbuf **pkts, uint16_t n)
821 {
822 	while (n--)
823 		rte_pktmbuf_free(pkts[n]);
824 }
825 
826 static __rte_always_inline void
827 complete_async_pkts(struct vhost_dev *vdev)
828 {
829 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
830 	uint16_t complete_count;
831 
832 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
833 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
834 	if (complete_count)
835 		free_pkts(p_cpl, complete_count);
836 }
837 
838 static __rte_always_inline void
839 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
840 	    struct rte_mbuf *m)
841 {
842 	uint16_t ret;
843 
844 	if (builtin_net_driver) {
845 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
846 	} else {
847 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
848 	}
849 
850 	if (enable_stats) {
851 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
852 				__ATOMIC_SEQ_CST);
853 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
854 				__ATOMIC_SEQ_CST);
855 		src_vdev->stats.tx_total++;
856 		src_vdev->stats.tx += ret;
857 	}
858 }
859 
860 static __rte_always_inline void
861 drain_vhost(struct vhost_dev *vdev)
862 {
863 	uint16_t ret;
864 	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
865 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
866 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
867 
868 	if (builtin_net_driver) {
869 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
870 	} else if (async_vhost_driver) {
871 		uint32_t cpu_cpl_nr = 0;
872 		uint16_t enqueue_fail = 0;
873 		struct rte_mbuf *m_cpu_cpl[nr_xmit];
874 
875 		complete_async_pkts(vdev);
876 		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
877 					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
878 
879 		if (cpu_cpl_nr)
880 			free_pkts(m_cpu_cpl, cpu_cpl_nr);
881 
882 		enqueue_fail = nr_xmit - ret;
883 		if (enqueue_fail)
884 			free_pkts(&m[ret], nr_xmit - ret);
885 	} else {
886 		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
887 						m, nr_xmit);
888 	}
889 
890 	if (enable_stats) {
891 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
892 				__ATOMIC_SEQ_CST);
893 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
894 				__ATOMIC_SEQ_CST);
895 	}
896 
897 	if (!async_vhost_driver)
898 		free_pkts(m, nr_xmit);
899 }
900 
901 static __rte_always_inline void
902 drain_vhost_table(void)
903 {
904 	uint16_t lcore_id = rte_lcore_id();
905 	struct vhost_bufftable *vhost_txq;
906 	struct vhost_dev *vdev;
907 	uint64_t cur_tsc;
908 
909 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
910 		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
911 						+ vdev->vid];
912 
913 		cur_tsc = rte_rdtsc();
914 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
915 				> MBUF_TABLE_DRAIN_TSC)) {
916 			RTE_LOG_DP(DEBUG, VHOST_DATA,
917 				"Vhost TX queue drained after timeout with burst size %u\n",
918 				vhost_txq->len);
919 			drain_vhost(vdev);
920 			vhost_txq->len = 0;
921 			vhost_txq->pre_tsc = cur_tsc;
922 		}
923 	}
924 }
925 
926 /*
927  * Check if the packet destination MAC address is for a local device. If so then put
928  * the packet on that device's RX queue. If not then return.
929  */
930 static __rte_always_inline int
931 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
932 {
933 	struct rte_ether_hdr *pkt_hdr;
934 	struct vhost_dev *dst_vdev;
935 	struct vhost_bufftable *vhost_txq;
936 	uint16_t lcore_id = rte_lcore_id();
937 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
938 
939 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
940 	if (!dst_vdev)
941 		return -1;
942 
943 	if (vdev->vid == dst_vdev->vid) {
944 		RTE_LOG_DP(DEBUG, VHOST_DATA,
945 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
946 			vdev->vid);
947 		return 0;
948 	}
949 
950 	RTE_LOG_DP(DEBUG, VHOST_DATA,
951 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
952 
953 	if (unlikely(dst_vdev->remove)) {
954 		RTE_LOG_DP(DEBUG, VHOST_DATA,
955 			"(%d) device is marked for removal\n", dst_vdev->vid);
956 		return 0;
957 	}
958 
959 	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
960 	vhost_txq->m_table[vhost_txq->len++] = m;
961 
962 	if (enable_stats) {
963 		vdev->stats.tx_total++;
964 		vdev->stats.tx++;
965 	}
966 
967 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
968 		drain_vhost(dst_vdev);
969 		vhost_txq->len = 0;
970 		vhost_txq->pre_tsc = rte_rdtsc();
971 	}
972 	return 0;
973 }
974 
975 /*
976  * Check if the destination MAC of a packet belongs to a local VM; if it does,
977  * get its VLAN tag and the length offset.
978  */
979 static __rte_always_inline int
980 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
981 	uint32_t *offset, uint16_t *vlan_tag)
982 {
983 	struct vhost_dev *dst_vdev;
984 	struct rte_ether_hdr *pkt_hdr =
985 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
986 
987 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
988 	if (!dst_vdev)
989 		return 0;
990 
991 	if (vdev->vid == dst_vdev->vid) {
992 		RTE_LOG_DP(DEBUG, VHOST_DATA,
993 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
994 			vdev->vid);
995 		return -1;
996 	}
997 
998 	/*
999 	 * HW VLAN stripping reduces the packet length by the
1000 	 * length of the VLAN tag, so the packet length needs to
1001 	 * be restored by adding it back.
1002 	 */
1003 	*offset  = VLAN_HLEN;
1004 	*vlan_tag = vlan_tags[vdev->vid];
1005 
1006 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1007 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1008 		vdev->vid, dst_vdev->vid, *vlan_tag);
1009 
1010 	return 0;
1011 }
1012 
1013 static uint16_t
1014 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1015 {
1016 	if (ol_flags & PKT_TX_IPV4)
1017 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1018 	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1019 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1020 }
1021 
1022 static void virtio_tx_offload(struct rte_mbuf *m)
1023 {
1024 	void *l3_hdr;
1025 	struct rte_ipv4_hdr *ipv4_hdr = NULL;
1026 	struct rte_tcp_hdr *tcp_hdr = NULL;
1027 	struct rte_ether_hdr *eth_hdr =
1028 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1029 
1030 	l3_hdr = (char *)eth_hdr + m->l2_len;
1031 
1032 	if (m->ol_flags & PKT_TX_IPV4) {
1033 		ipv4_hdr = l3_hdr;
1034 		ipv4_hdr->hdr_checksum = 0;
1035 		m->ol_flags |= PKT_TX_IP_CKSUM;
1036 	}
1037 
1038 	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
1039 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1040 }
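
/*
 * Note that virtio_tx_offload() follows the usual DPDK convention for TCP
 * offloads: the TCP checksum field is pre-filled with the pseudo-header
 * checksum (get_psd_sum()), and the NIC completes the real checksum when it
 * performs the segmentation.
 */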
1041 
1042 static __rte_always_inline void
1043 do_drain_mbuf_table(struct mbuf_table *tx_q)
1044 {
1045 	uint16_t count;
1046 
1047 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1048 				 tx_q->m_table, tx_q->len);
1049 	if (unlikely(count < tx_q->len))
1050 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1051 
1052 	tx_q->len = 0;
1053 }
1054 
1055 /*
1056  * This function routes the TX packet to the correct interface. This
1057  * may be a local device or the physical port.
1058  */
1059 static __rte_always_inline void
1060 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1061 {
1062 	struct mbuf_table *tx_q;
1063 	unsigned offset = 0;
1064 	const uint16_t lcore_id = rte_lcore_id();
1065 	struct rte_ether_hdr *nh;
1066 
1067 
1068 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1069 	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1070 		struct vhost_dev *vdev2;
1071 
1072 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1073 			if (vdev2 != vdev)
1074 				sync_virtio_xmit(vdev2, vdev, m);
1075 		}
1076 		goto queue2nic;
1077 	}
1078 
1079 	/*check if destination is local VM*/
1080 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1081 		return;
1082 
1083 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1084 		if (unlikely(find_local_dest(vdev, m, &offset,
1085 					     &vlan_tag) != 0)) {
1086 			rte_pktmbuf_free(m);
1087 			return;
1088 		}
1089 	}
1090 
1091 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1092 		"(%d) TX: MAC address is external\n", vdev->vid);
1093 
1094 queue2nic:
1095 
1096 	/*Add packet to the port tx queue*/
1097 	tx_q = &lcore_tx_queue[lcore_id];
1098 
1099 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1100 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1101 		/* Guest has inserted the vlan tag. */
1102 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1103 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1104 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1105 			(vh->vlan_tci != vlan_tag_be))
1106 			vh->vlan_tci = vlan_tag_be;
1107 	} else {
1108 		m->ol_flags |= PKT_TX_VLAN_PKT;
1109 
1110 		/*
1111 		 * Find the right seg to adjust the data len when offset is
1112 		 * bigger than tail room size.
1113 		 */
1114 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1115 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1116 				m->data_len += offset;
1117 			else {
1118 				struct rte_mbuf *seg = m;
1119 
1120 				while ((seg->next != NULL) &&
1121 					(offset > rte_pktmbuf_tailroom(seg)))
1122 					seg = seg->next;
1123 
1124 				seg->data_len += offset;
1125 			}
1126 			m->pkt_len += offset;
1127 		}
1128 
1129 		m->vlan_tci = vlan_tag;
1130 	}
1131 
1132 	if (m->ol_flags & PKT_TX_TCP_SEG)
1133 		virtio_tx_offload(m);
1134 
1135 	tx_q->m_table[tx_q->len++] = m;
1136 	if (enable_stats) {
1137 		vdev->stats.tx_total++;
1138 		vdev->stats.tx++;
1139 	}
1140 
1141 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1142 		do_drain_mbuf_table(tx_q);
1143 }
1144 
1145 
1146 static __rte_always_inline void
1147 drain_mbuf_table(struct mbuf_table *tx_q)
1148 {
1149 	static uint64_t prev_tsc;
1150 	uint64_t cur_tsc;
1151 
1152 	if (tx_q->len == 0)
1153 		return;
1154 
1155 	cur_tsc = rte_rdtsc();
1156 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1157 		prev_tsc = cur_tsc;
1158 
1159 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1160 			"TX queue drained after timeout with burst size %u\n",
1161 			tx_q->len);
1162 		do_drain_mbuf_table(tx_q);
1163 	}
1164 }
1165 
1166 static __rte_always_inline void
1167 drain_eth_rx(struct vhost_dev *vdev)
1168 {
1169 	uint16_t rx_count, enqueue_count;
1170 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1171 
1172 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1173 				    pkts, MAX_PKT_BURST);
1174 
1175 	if (!rx_count)
1176 		return;
1177 
1178 	/*
1179 	 * When "enable_retry" is set, we wait and retry when there
1180 	 * are not enough free slots in the queue to hold @rx_count
1181 	 * packets, to diminish packet loss.
1182 	 */
1183 	if (enable_retry &&
1184 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1185 			VIRTIO_RXQ))) {
1186 		uint32_t retry;
1187 
1188 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1189 			rte_delay_us(burst_rx_delay_time);
1190 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1191 					VIRTIO_RXQ))
1192 				break;
1193 		}
1194 	}
1195 
1196 	if (builtin_net_driver) {
1197 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1198 						pkts, rx_count);
1199 	} else if (async_vhost_driver) {
1200 		uint32_t cpu_cpl_nr = 0;
1201 		uint16_t enqueue_fail = 0;
1202 		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
1203 
1204 		complete_async_pkts(vdev);
1205 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1206 					VIRTIO_RXQ, pkts, rx_count,
1207 					m_cpu_cpl, &cpu_cpl_nr);
1208 		if (cpu_cpl_nr)
1209 			free_pkts(m_cpu_cpl, cpu_cpl_nr);
1210 
1211 		enqueue_fail = rx_count - enqueue_count;
1212 		if (enqueue_fail)
1213 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1214 
1215 	} else {
1216 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1217 						pkts, rx_count);
1218 	}
1219 
1220 	if (enable_stats) {
1221 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1222 				__ATOMIC_SEQ_CST);
1223 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1224 				__ATOMIC_SEQ_CST);
1225 	}
1226 
1227 	if (!async_vhost_driver)
1228 		free_pkts(pkts, rx_count);
1229 }
1230 
1231 static __rte_always_inline void
1232 drain_virtio_tx(struct vhost_dev *vdev)
1233 {
1234 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1235 	uint16_t count;
1236 	uint16_t i;
1237 
1238 	if (builtin_net_driver) {
1239 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1240 					pkts, MAX_PKT_BURST);
1241 	} else {
1242 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1243 					mbuf_pool, pkts, MAX_PKT_BURST);
1244 	}
1245 
1246 	/* setup VMDq for the first packet */
1247 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1248 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1249 			free_pkts(pkts, count);
1250 	}
1251 
1252 	for (i = 0; i < count; ++i)
1253 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1254 }
1255 
1256 /*
1257  * Main function of vhost-switch. It basically does:
1258  *
1259  * for each vhost device {
1260  *    - drain_eth_rx()
1261  *
1262  *      Which drains the host eth Rx queue linked to the vhost device,
1263  *      and delivers all of them to the guest virtio Rx ring associated with
1264  *      this vhost device.
1265  *
1266  *    - drain_virtio_tx()
1267  *
1268  *      Which drains the guest virtio Tx queue and delivers all of them
1269  *      to the target, which could be another vhost device, or the
1270  *      physical eth dev. The routing is done in the function "virtio_tx_route".
1271  * }
1272  */
1273 static int
1274 switch_worker(void *arg __rte_unused)
1275 {
1276 	unsigned i;
1277 	unsigned lcore_id = rte_lcore_id();
1278 	struct vhost_dev *vdev;
1279 	struct mbuf_table *tx_q;
1280 
1281 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1282 
1283 	tx_q = &lcore_tx_queue[lcore_id];
1284 	for (i = 0; i < rte_lcore_count(); i++) {
1285 		if (lcore_ids[i] == lcore_id) {
1286 			tx_q->txq_id = i;
1287 			break;
1288 		}
1289 	}
1290 
1291 	while(1) {
1292 		drain_mbuf_table(tx_q);
1293 		drain_vhost_table();
1294 		/*
1295 		 * Inform the configuration core that we have exited the
1296 		 * linked list and that no devices are in use if requested.
1297 		 */
1298 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1299 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1300 
1301 		/*
1302 		 * Process vhost devices
1303 		 */
1304 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1305 			      lcore_vdev_entry) {
1306 			if (unlikely(vdev->remove)) {
1307 				unlink_vmdq(vdev);
1308 				vdev->ready = DEVICE_SAFE_REMOVE;
1309 				continue;
1310 			}
1311 
1312 			if (likely(vdev->ready == DEVICE_RX))
1313 				drain_eth_rx(vdev);
1314 
1315 			if (likely(!vdev->remove))
1316 				drain_virtio_tx(vdev);
1317 		}
1318 	}
1319 
1320 	return 0;
1321 }
1322 
1323 /*
1324  * Remove a device from the specific data core linked list and from the
1325  * main linked list. Synchronization occurs through the use of the
1326  * lcore dev_removal_flag. The device is made volatile here to avoid
1327  * re-ordering of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
1328  */
1329 static void
1330 destroy_device(int vid)
1331 {
1332 	struct vhost_dev *vdev = NULL;
1333 	int lcore;
1334 	uint16_t i;
1335 
1336 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1337 		if (vdev->vid == vid)
1338 			break;
1339 	}
1340 	if (!vdev)
1341 		return;
1342 	/*set the remove flag. */
1343 	vdev->remove = 1;
1344 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1345 		rte_pause();
1346 	}
1347 
1348 	for (i = 0; i < RTE_MAX_LCORE; i++)
1349 		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1350 
1351 	if (builtin_net_driver)
1352 		vs_vhost_net_remove(vdev);
1353 
1354 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1355 		     lcore_vdev_entry);
1356 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1357 
1358 
1359 	/* Set the dev_removal_flag on each lcore. */
1360 	RTE_LCORE_FOREACH_WORKER(lcore)
1361 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1362 
1363 	/*
1364 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1365 	 * we can be sure that they can no longer access the device removed
1366 	 * from the linked lists and that the devices are no longer in use.
1367 	 */
1368 	RTE_LCORE_FOREACH_WORKER(lcore) {
1369 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1370 			rte_pause();
1371 	}
1372 
1373 	lcore_info[vdev->coreid].device_num--;
1374 
1375 	RTE_LOG(INFO, VHOST_DATA,
1376 		"(%d) device has been removed from data core\n",
1377 		vdev->vid);
1378 
1379 	if (async_vhost_driver)
1380 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1381 
1382 	rte_free(vdev);
1383 }
1384 
1385 /*
1386  * A new device is added to a data core. First the device is added to the main linked list
1387  * and then allocated to a specific data core.
1388  */
1389 static int
1390 new_device(int vid)
1391 {
1392 	int lcore, core_add = 0;
1393 	uint16_t i;
1394 	uint32_t device_num_min = num_devices;
1395 	struct vhost_dev *vdev;
1396 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1397 	if (vdev == NULL) {
1398 		RTE_LOG(INFO, VHOST_DATA,
1399 			"(%d) couldn't allocate memory for vhost dev\n",
1400 			vid);
1401 		return -1;
1402 	}
1403 	vdev->vid = vid;
1404 
1405 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1406 		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1407 			= rte_zmalloc("vhost bufftable",
1408 				sizeof(struct vhost_bufftable),
1409 				RTE_CACHE_LINE_SIZE);
1410 
1411 		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1412 			RTE_LOG(INFO, VHOST_DATA,
1413 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1414 			return -1;
1415 		}
1416 	}
1417 
1418 	if (builtin_net_driver)
1419 		vs_vhost_net_setup(vdev);
1420 
1421 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1422 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1423 
1424 	/*reset ready flag*/
1425 	vdev->ready = DEVICE_MAC_LEARNING;
1426 	vdev->remove = 0;
1427 
1428 	/* Find a suitable lcore to add the device. */
1429 	RTE_LCORE_FOREACH_WORKER(lcore) {
1430 		if (lcore_info[lcore].device_num < device_num_min) {
1431 			device_num_min = lcore_info[lcore].device_num;
1432 			core_add = lcore;
1433 		}
1434 	}
1435 	vdev->coreid = core_add;
1436 
1437 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1438 			  lcore_vdev_entry);
1439 	lcore_info[vdev->coreid].device_num++;
1440 
1441 	/* Disable notifications. */
1442 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1443 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1444 
1445 	RTE_LOG(INFO, VHOST_DATA,
1446 		"(%d) device has been added to data core %d\n",
1447 		vid, vdev->coreid);
1448 
1449 	if (async_vhost_driver) {
1450 		struct rte_vhost_async_features f;
1451 		struct rte_vhost_async_channel_ops channel_ops;
1452 
1453 		if (strncmp(dma_type, "ioat", 4) == 0) {
1454 			channel_ops.transfer_data = ioat_transfer_data_cb;
1455 			channel_ops.check_completed_copies =
1456 				ioat_check_completed_copies_cb;
1457 
1458 			f.async_inorder = 1;
1459 			f.async_threshold = 256;
1460 
1461 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1462 				f.intval, &channel_ops);
1463 		}
1464 	}
1465 
1466 	return 0;
1467 }
1468 
1469 /*
1470  * These callbacks allow devices to be added to the data core when configuration
1471  * has fully completed.
1472  */
1473 static const struct vhost_device_ops virtio_net_device_ops =
1474 {
1475 	.new_device =  new_device,
1476 	.destroy_device = destroy_device,
1477 };
1478 
1479 /*
1480  * This thread wakes up periodically to print stats if the user has
1481  * enabled them.
1482  */
1483 static void *
1484 print_stats(__rte_unused void *arg)
1485 {
1486 	struct vhost_dev *vdev;
1487 	uint64_t tx_dropped, rx_dropped;
1488 	uint64_t tx, tx_total, rx, rx_total;
1489 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1490 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1491 
1492 	while(1) {
1493 		sleep(enable_stats);
1494 
1495 		/* Clear screen and move to top left */
1496 		printf("%s%s\n", clr, top_left);
1497 		printf("Device statistics =================================\n");
1498 
1499 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1500 			tx_total   = vdev->stats.tx_total;
1501 			tx         = vdev->stats.tx;
1502 			tx_dropped = tx_total - tx;
1503 
1504 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1505 				__ATOMIC_SEQ_CST);
1506 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1507 				__ATOMIC_SEQ_CST);
1508 			rx_dropped = rx_total - rx;
1509 
1510 			printf("Statistics for device %d\n"
1511 				"-----------------------\n"
1512 				"TX total:              %" PRIu64 "\n"
1513 				"TX dropped:            %" PRIu64 "\n"
1514 				"TX successful:         %" PRIu64 "\n"
1515 				"RX total:              %" PRIu64 "\n"
1516 				"RX dropped:            %" PRIu64 "\n"
1517 				"RX successful:         %" PRIu64 "\n",
1518 				vdev->vid,
1519 				tx_total, tx_dropped, tx,
1520 				rx_total, rx_dropped, rx);
1521 		}
1522 
1523 		printf("===================================================\n");
1524 
1525 		fflush(stdout);
1526 	}
1527 
1528 	return NULL;
1529 }
1530 
1531 static void
1532 unregister_drivers(int socket_num)
1533 {
1534 	int i, ret;
1535 
1536 	for (i = 0; i < socket_num; i++) {
1537 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1538 		if (ret != 0)
1539 			RTE_LOG(ERR, VHOST_CONFIG,
1540 				"Fail to unregister vhost driver for %s.\n",
1541 				socket_files + i * PATH_MAX);
1542 	}
1543 }
1544 
1545 /* When we receive an INT signal, unregister the vhost driver */
1546 static void
1547 sigint_handler(__rte_unused int signum)
1548 {
1549 	/* Unregister vhost driver. */
1550 	unregister_drivers(nb_sockets);
1551 
1552 	exit(0);
1553 }
1554 
1555 /*
1556  * While creating an mbuf pool, one key thing is to figure out how
1557  * many mbuf entries are enough for our use. FYI, here are some
1558  * guidelines:
1559  *
1560  * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1561  *
1562  * - For each switch core (a CPU core that does the packet switching), we
1563  *   also need to reserve some mbufs for receiving packets from the virtio
1564  *   Tx queue. How many are enough depends on the usage. It's normally
1565  *   a simple calculation like the following:
1566  *
1567  *       MAX_PKT_BURST * max packet size / mbuf size
1568  *
1569  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1570  *
1571  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1572  *   mbufs for receiving packets from the physical NIC device.
1573  *
1574  * - We also need to make sure that, for each switch core, we have allocated
1575  *   enough mbufs to fill up the mbuf cache.
1576  */
1577 static void
1578 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1579 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1580 {
1581 	uint32_t nr_mbufs;
1582 	uint32_t nr_mbufs_per_core;
1583 	uint32_t mtu = 1500;
1584 
1585 	if (mergeable)
1586 		mtu = 9000;
1587 	if (enable_tso)
1588 		mtu = 64 * 1024;
1589 
1590 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1591 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1592 	nr_mbufs_per_core += nr_rx_desc;
1593 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1594 
1595 	nr_mbufs  = nr_queues * nr_rx_desc;
1596 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1597 	nr_mbufs *= nr_port;
1598 
1599 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1600 					    nr_mbuf_cache, 0, mbuf_size,
1601 					    rte_socket_id());
1602 	if (mbuf_pool == NULL)
1603 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1604 }
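
/*
 * Rough worked example of the sizing above, assuming MAX_PKT_BURST is 32 and
 * the default 2176-byte mbuf with 128 bytes of headroom (values depend on
 * build-time constants, so treat the numbers as illustrative only): with
 * mtu = 1500, nr_mbufs_per_core = (1500 + 2176) * 32 / (2176 - 128) + 1024
 * = 57 + 1024 = 1081, which is then kept at least as large as the mbuf cache.
 */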
1605 
1606 /*
1607  * Main function, does initialisation and calls the per-lcore functions.
1608  */
1609 int
1610 main(int argc, char *argv[])
1611 {
1612 	unsigned lcore_id, core_id = 0;
1613 	unsigned nb_ports, valid_num_ports;
1614 	int ret, i;
1615 	uint16_t portid;
1616 	static pthread_t tid;
1617 	uint64_t flags = 0;
1618 
1619 	signal(SIGINT, sigint_handler);
1620 
1621 	/* init EAL */
1622 	ret = rte_eal_init(argc, argv);
1623 	if (ret < 0)
1624 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1625 	argc -= ret;
1626 	argv += ret;
1627 
1628 	/* parse app arguments */
1629 	ret = us_vhost_parse_args(argc, argv);
1630 	if (ret < 0)
1631 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1632 
1633 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1634 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1635 
1636 		if (rte_lcore_is_enabled(lcore_id))
1637 			lcore_ids[core_id++] = lcore_id;
1638 	}
1639 
1640 	if (rte_lcore_count() > RTE_MAX_LCORE)
1641 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1642 
1643 	/* Get the number of physical ports. */
1644 	nb_ports = rte_eth_dev_count_avail();
1645 
1646 	/*
1647 	 * Update the global variable num_ports and the global array ports, and
1648 	 * get the number of valid ports according to the system's port count.
1649 	 */
1650 	valid_num_ports = check_ports_num(nb_ports);
1651 
1652 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1653 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1654 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
1655 		return -1;
1656 	}
1657 
1658 	/*
1659 	 * FIXME: here we are trying to allocate mbufs big enough for
1660 	 * @MAX_QUEUES, but the truth is we're never going to use that
1661 	 * many queues here. We probably should only do allocation for
1662 	 * those queues we are going to use.
1663 	 */
1664 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1665 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1666 
1667 	if (vm2vm_mode == VM2VM_HARDWARE) {
1668 		/* Enable VT loop back to let L2 switch to do it. */
1669 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1670 		RTE_LOG(DEBUG, VHOST_CONFIG,
1671 			"Enable loop back for L2 switch in vmdq.\n");
1672 	}
1673 
1674 	/* initialize all ports */
1675 	RTE_ETH_FOREACH_DEV(portid) {
1676 		/* skip ports that are not enabled */
1677 		if ((enabled_port_mask & (1 << portid)) == 0) {
1678 			RTE_LOG(INFO, VHOST_PORT,
1679 				"Skipping disabled port %d\n", portid);
1680 			continue;
1681 		}
1682 		if (port_init(portid) != 0)
1683 			rte_exit(EXIT_FAILURE,
1684 				"Cannot initialize network ports\n");
1685 	}
1686 
1687 	/* Enable stats if the user option is set. */
1688 	if (enable_stats) {
1689 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1690 					print_stats, NULL);
1691 		if (ret < 0)
1692 			rte_exit(EXIT_FAILURE,
1693 				"Cannot create print-stats thread\n");
1694 	}
1695 
1696 	/* Launch all data cores. */
1697 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1698 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1699 
1700 	if (client_mode)
1701 		flags |= RTE_VHOST_USER_CLIENT;
1702 
1703 	/* Register vhost user driver to handle vhost messages. */
1704 	for (i = 0; i < nb_sockets; i++) {
1705 		char *file = socket_files + i * PATH_MAX;
1706 
1707 		if (async_vhost_driver)
1708 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1709 
1710 		ret = rte_vhost_driver_register(file, flags);
1711 		if (ret != 0) {
1712 			unregister_drivers(i);
1713 			rte_exit(EXIT_FAILURE,
1714 				"vhost driver register failure.\n");
1715 		}
1716 
1717 		if (builtin_net_driver)
1718 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1719 
1720 		if (mergeable == 0) {
1721 			rte_vhost_driver_disable_features(file,
1722 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1723 		}
1724 
1725 		if (enable_tx_csum == 0) {
1726 			rte_vhost_driver_disable_features(file,
1727 				1ULL << VIRTIO_NET_F_CSUM);
1728 		}
1729 
1730 		if (enable_tso == 0) {
1731 			rte_vhost_driver_disable_features(file,
1732 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1733 			rte_vhost_driver_disable_features(file,
1734 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1735 			rte_vhost_driver_disable_features(file,
1736 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1737 			rte_vhost_driver_disable_features(file,
1738 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1739 		}
1740 
1741 		if (promiscuous) {
1742 			rte_vhost_driver_enable_features(file,
1743 				1ULL << VIRTIO_NET_F_CTRL_RX);
1744 		}
1745 
1746 		ret = rte_vhost_driver_callback_register(file,
1747 			&virtio_net_device_ops);
1748 		if (ret != 0) {
1749 			rte_exit(EXIT_FAILURE,
1750 				"failed to register vhost driver callbacks.\n");
1751 		}
1752 
1753 		if (rte_vhost_driver_start(file) < 0) {
1754 			rte_exit(EXIT_FAILURE,
1755 				"failed to start vhost driver.\n");
1756 		}
1757 	}
1758 
1759 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1760 		rte_eal_wait_lcore(lcore_id);
1761 
1762 	return 0;
1763 
1764 }
1765