xref: /dpdk/examples/vhost/main.c (revision 1b7b24389cee5baa421d334048782e3e99e7dec5)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "ioat.h"
29 #include "main.h"
30 
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34 
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37 
38 #define MBUF_CACHE_SIZE	128
39 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
40 
41 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
42 
43 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
45 
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47 
48 /* State of virtio device. */
49 #define DEVICE_MAC_LEARNING 0
50 #define DEVICE_RX			1
51 #define DEVICE_SAFE_REMOVE	2
52 
53 /* Configurable number of RX/TX ring descriptors */
54 #define RTE_TEST_RX_DESC_DEFAULT 1024
55 #define RTE_TEST_TX_DESC_DEFAULT 512
56 
57 #define INVALID_PORT_ID 0xFF
58 
59 /* Maximum long option length for option parsing. */
60 #define MAX_LONG_OPT_SZ 64
61 
62 /* mask of enabled ports */
63 static uint32_t enabled_port_mask = 0;
64 
65 /* Promiscuous mode */
66 static uint32_t promiscuous;
67 
68 /* number of devices/queues to support */
69 static uint32_t num_queues = 0;
70 static uint32_t num_devices;
71 
72 static struct rte_mempool *mbuf_pool;
73 static int mergeable;
74 
75 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
76 typedef enum {
77 	VM2VM_DISABLED = 0,
78 	VM2VM_SOFTWARE = 1,
79 	VM2VM_HARDWARE = 2,
80 	VM2VM_LAST
81 } vm2vm_type;
82 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
83 
84 /* Enable stats. */
85 static uint32_t enable_stats = 0;
86 /* Enable retries on RX. */
87 static uint32_t enable_retry = 1;
88 
89 /* Disable TX checksum offload */
90 static uint32_t enable_tx_csum;
91 
92 /* Disable TSO offload */
93 static uint32_t enable_tso;
94 
95 static int client_mode;
96 
97 static int builtin_net_driver;
98 
99 static int async_vhost_driver;
100 
101 static char dma_type[MAX_LONG_OPT_SZ];
102 
103 /* Specify timeout (in microseconds) between retries on RX. */
104 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
105 /* Specify the number of retries on RX. */
106 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
107 
108 /* Socket file paths. Can be set by user */
109 static char *socket_files;
110 static int nb_sockets;
111 
112 /* Empty VMDQ configuration structure. Filled in programmatically. */
113 static struct rte_eth_conf vmdq_conf_default = {
114 	.rxmode = {
115 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
116 		.split_hdr_size = 0,
117 		/*
118 		 * VLAN stripping is necessary for 1G NICs such as the I350;
119 		 * it fixes a bug where IPv4 forwarding in the guest cannot
120 		 * forward packets from one virtio device to another.
121 		 */
122 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
123 	},
124 
125 	.txmode = {
126 		.mq_mode = ETH_MQ_TX_NONE,
127 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
128 			     DEV_TX_OFFLOAD_TCP_CKSUM |
129 			     DEV_TX_OFFLOAD_VLAN_INSERT |
130 			     DEV_TX_OFFLOAD_MULTI_SEGS |
131 			     DEV_TX_OFFLOAD_TCP_TSO),
132 	},
133 	.rx_adv_conf = {
134 		/*
135 		 * should be overridden separately in code with
136 		 * appropriate values
137 		 */
138 		.vmdq_rx_conf = {
139 			.nb_queue_pools = ETH_8_POOLS,
140 			.enable_default_pool = 0,
141 			.default_pool = 0,
142 			.nb_pool_maps = 0,
143 			.pool_map = {{0, 0},},
144 		},
145 	},
146 };
147 
148 
149 static unsigned lcore_ids[RTE_MAX_LCORE];
150 static uint16_t ports[RTE_MAX_ETHPORTS];
151 static unsigned num_ports = 0; /**< The number of ports specified in command line */
152 static uint16_t num_pf_queues, num_vmdq_queues;
153 static uint16_t vmdq_pool_base, vmdq_queue_base;
154 static uint16_t queues_per_pool;
155 
156 const uint16_t vlan_tags[] = {
157 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
158 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
159 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
160 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
161 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
162 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
163 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
164 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
165 };
166 
167 /* ethernet addresses of ports */
168 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
169 
170 static struct vhost_dev_tailq_list vhost_dev_list =
171 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
172 
173 static struct lcore_info lcore_info[RTE_MAX_LCORE];
174 
175 /* Used for queueing bursts of TX packets. */
176 struct mbuf_table {
177 	unsigned len;
178 	unsigned txq_id;
179 	struct rte_mbuf *m_table[MAX_PKT_BURST];
180 };
181 
182 /* TX queue for each data core. */
183 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
184 
185 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
186 				 / US_PER_S * BURST_TX_DRAIN_US)
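/*
 * Illustrative example (assuming a 2.5 GHz TSC): MBUF_TABLE_DRAIN_TSC works
 * out to roughly 2500000000 / 1000000 * 100 = 250000 cycles, i.e. the lcore
 * Tx table is drained about every 100 us.
 */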
187 #define VLAN_HLEN       4
188 
189 static inline int
190 open_dma(const char *value)
191 {
192 	if (strncmp(dma_type, "ioat", 4) == 0)
193 		return open_ioat(value);
194 
195 	return -1;
196 }
197 
198 /*
199  * Builds up the correct configuration for VMDQ VLAN pool map
200  * according to the pool & queue limits.
201  */
202 static inline int
203 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
204 {
205 	struct rte_eth_vmdq_rx_conf conf;
206 	struct rte_eth_vmdq_rx_conf *def_conf =
207 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
208 	unsigned i;
209 
210 	memset(&conf, 0, sizeof(conf));
211 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
212 	conf.nb_pool_maps = num_devices;
213 	conf.enable_loop_back = def_conf->enable_loop_back;
214 	conf.rx_mode = def_conf->rx_mode;
215 
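	/* Map each VLAN tag 1:1 to a VMDQ pool: traffic tagged vlan_tags[i] is steered to pool i (one pool per virtio device). */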
216 	for (i = 0; i < conf.nb_pool_maps; i++) {
217 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
218 		conf.pool_map[i].pools = (1UL << i);
219 	}
220 
221 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
222 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
223 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
224 	return 0;
225 }
226 
227 /*
228  * Initialises a given port using global settings, with the Rx buffers
229  * coming from the global mbuf_pool.
230  */
231 static inline int
232 port_init(uint16_t port)
233 {
234 	struct rte_eth_dev_info dev_info;
235 	struct rte_eth_conf port_conf;
236 	struct rte_eth_rxconf *rxconf;
237 	struct rte_eth_txconf *txconf;
238 	int16_t rx_rings, tx_rings;
239 	uint16_t rx_ring_size, tx_ring_size;
240 	int retval;
241 	uint16_t q;
242 
243 	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
244 	retval = rte_eth_dev_info_get(port, &dev_info);
245 	if (retval != 0) {
246 		RTE_LOG(ERR, VHOST_PORT,
247 			"Error during getting device (port %u) info: %s\n",
248 			port, strerror(-retval));
249 
250 		return retval;
251 	}
252 
253 	rxconf = &dev_info.default_rxconf;
254 	txconf = &dev_info.default_txconf;
255 	rxconf->rx_drop_en = 1;
256 
257 	/* Configure the number of supported virtio devices based on VMDQ limits. */
258 	num_devices = dev_info.max_vmdq_pools;
259 
260 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
261 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
262 
263 	tx_rings = (uint16_t)rte_lcore_count();
264 
265 	/* Get port configuration. */
266 	retval = get_eth_conf(&port_conf, num_devices);
267 	if (retval < 0)
268 		return retval;
269 	/* NIC queues are divided into pf queues and vmdq queues.  */
270 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
271 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
272 	num_vmdq_queues = num_devices * queues_per_pool;
273 	num_queues = num_pf_queues + num_vmdq_queues;
274 	vmdq_queue_base = dev_info.vmdq_queue_base;
275 	vmdq_pool_base  = dev_info.vmdq_pool_base;
276 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
277 		num_pf_queues, num_devices, queues_per_pool);
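	/*
	 * Illustrative example (not from a specific NIC): if the device reports
	 * 128 Rx queues of which 64 are VMDQ queues spread over 32 pools, then
	 * num_pf_queues = 64, queues_per_pool = 2 and num_vmdq_queues = num_devices * 2.
	 */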
278 
279 	if (!rte_eth_dev_is_valid_port(port))
280 		return -1;
281 
282 	rx_rings = (uint16_t)dev_info.max_rx_queues;
283 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
284 		port_conf.txmode.offloads |=
285 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
286 	/* Configure ethernet device. */
287 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
288 	if (retval != 0) {
289 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
290 			port, strerror(-retval));
291 		return retval;
292 	}
293 
294 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
295 		&tx_ring_size);
296 	if (retval != 0) {
297 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
298 			"for port %u: %s.\n", port, strerror(-retval));
299 		return retval;
300 	}
301 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
302 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
303 			"for Rx queues on port %u.\n", port);
304 		return -1;
305 	}
306 
307 	/* Setup the queues. */
308 	rxconf->offloads = port_conf.rxmode.offloads;
309 	for (q = 0; q < rx_rings; q ++) {
310 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
311 						rte_eth_dev_socket_id(port),
312 						rxconf,
313 						mbuf_pool);
314 		if (retval < 0) {
315 			RTE_LOG(ERR, VHOST_PORT,
316 				"Failed to setup rx queue %u of port %u: %s.\n",
317 				q, port, strerror(-retval));
318 			return retval;
319 		}
320 	}
321 	txconf->offloads = port_conf.txmode.offloads;
322 	for (q = 0; q < tx_rings; q ++) {
323 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
324 						rte_eth_dev_socket_id(port),
325 						txconf);
326 		if (retval < 0) {
327 			RTE_LOG(ERR, VHOST_PORT,
328 				"Failed to setup tx queue %u of port %u: %s.\n",
329 				q, port, strerror(-retval));
330 			return retval;
331 		}
332 	}
333 
334 	/* Start the device. */
335 	retval  = rte_eth_dev_start(port);
336 	if (retval < 0) {
337 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
338 			port, strerror(-retval));
339 		return retval;
340 	}
341 
342 	if (promiscuous) {
343 		retval = rte_eth_promiscuous_enable(port);
344 		if (retval != 0) {
345 			RTE_LOG(ERR, VHOST_PORT,
346 				"Failed to enable promiscuous mode on port %u: %s\n",
347 				port, rte_strerror(-retval));
348 			return retval;
349 		}
350 	}
351 
352 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
353 	if (retval < 0) {
354 		RTE_LOG(ERR, VHOST_PORT,
355 			"Failed to get MAC address on port %u: %s\n",
356 			port, rte_strerror(-retval));
357 		return retval;
358 	}
359 
360 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
361 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
362 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
363 			port,
364 			vmdq_ports_eth_addr[port].addr_bytes[0],
365 			vmdq_ports_eth_addr[port].addr_bytes[1],
366 			vmdq_ports_eth_addr[port].addr_bytes[2],
367 			vmdq_ports_eth_addr[port].addr_bytes[3],
368 			vmdq_ports_eth_addr[port].addr_bytes[4],
369 			vmdq_ports_eth_addr[port].addr_bytes[5]);
370 
371 	return 0;
372 }
373 
374 /*
375  * Set socket file path.
376  */
377 static int
378 us_vhost_parse_socket_path(const char *q_arg)
379 {
380 	char *old;
381 
382 	/* parse number string */
383 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
384 		return -1;
385 
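	/* Grow the flat array of fixed-size PATH_MAX slots by one and append the new path. */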
386 	old = socket_files;
387 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
388 	if (socket_files == NULL) {
389 		free(old);
390 		return -1;
391 	}
392 
393 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
394 	nb_sockets++;
395 
396 	return 0;
397 }
398 
399 /*
400  * Parse the portmask provided at run time.
401  */
402 static int
403 parse_portmask(const char *portmask)
404 {
405 	char *end = NULL;
406 	unsigned long pm;
407 
408 	errno = 0;
409 
410 	/* parse hexadecimal string */
411 	pm = strtoul(portmask, &end, 16);
412 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
413 		return 0;
414 
415 	return pm;
416 
417 }
418 
419 /*
420  * Parse num options at run time.
421  */
422 static int
423 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
424 {
425 	char *end = NULL;
426 	unsigned long num;
427 
428 	errno = 0;
429 
430 	/* parse unsigned int string */
431 	num = strtoul(q_arg, &end, 10);
432 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
433 		return -1;
434 
435 	if (num > max_valid_value)
436 		return -1;
437 
438 	return num;
439 
440 }
441 
442 /*
443  * Display usage
444  */
445 static void
446 us_vhost_usage(const char *prgname)
447 {
448 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
449 	"		--vm2vm [0|1|2]\n"
450 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
451 	"		--socket-file <path>\n"
452 	"		--nb-devices ND\n"
453 	"		-p PORTMASK: Set mask for ports to be used by application\n"
454 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
455 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if the destination queue is full\n"
456 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on rx are enabled\n"
457 	"		--rx-retry-num [0-N]: the number of retries on rx. Only takes effect if retries on rx are enabled\n"
458 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
459 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
460 	"		--socket-file: The path of the socket file.\n"
461 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
462 	"		--tso [0|1] disable/enable TCP segment offload.\n"
463 	"		--client register a vhost-user socket as client mode.\n"
464 	"		--dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
465 	"		--dmas register dma channel for specific vhost device.\n",
466 	       prgname);
467 }
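/*
 * Example invocation (illustrative only; the binary name and EAL options
 * depend on how the example is built and on your platform):
 *   ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 --mergeable 1 --stats 2
 */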
468 
469 /*
470  * Parse the arguments given in the command line of the application.
471  */
472 static int
473 us_vhost_parse_args(int argc, char **argv)
474 {
475 	int opt, ret;
476 	int option_index;
477 	unsigned i;
478 	const char *prgname = argv[0];
479 	static struct option long_option[] = {
480 		{"vm2vm", required_argument, NULL, 0},
481 		{"rx-retry", required_argument, NULL, 0},
482 		{"rx-retry-delay", required_argument, NULL, 0},
483 		{"rx-retry-num", required_argument, NULL, 0},
484 		{"mergeable", required_argument, NULL, 0},
485 		{"stats", required_argument, NULL, 0},
486 		{"socket-file", required_argument, NULL, 0},
487 		{"tx-csum", required_argument, NULL, 0},
488 		{"tso", required_argument, NULL, 0},
489 		{"client", no_argument, &client_mode, 1},
490 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
491 		{"dma-type", required_argument, NULL, 0},
492 		{"dmas", required_argument, NULL, 0},
493 		{NULL, 0, 0, 0},
494 	};
495 
496 	/* Parse command line */
497 	while ((opt = getopt_long(argc, argv, "p:P",
498 			long_option, &option_index)) != EOF) {
499 		switch (opt) {
500 		/* Portmask */
501 		case 'p':
502 			enabled_port_mask = parse_portmask(optarg);
503 			if (enabled_port_mask == 0) {
504 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
505 				us_vhost_usage(prgname);
506 				return -1;
507 			}
508 			break;
509 
510 		case 'P':
511 			promiscuous = 1;
512 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
513 				ETH_VMDQ_ACCEPT_BROADCAST |
514 				ETH_VMDQ_ACCEPT_MULTICAST;
515 
516 			break;
517 
518 		case 0:
519 			/* Enable/disable vm2vm comms. */
520 			if (!strncmp(long_option[option_index].name, "vm2vm",
521 				MAX_LONG_OPT_SZ)) {
522 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
523 				if (ret == -1) {
524 					RTE_LOG(INFO, VHOST_CONFIG,
525 						"Invalid argument for "
526 						"vm2vm [0|1|2]\n");
527 					us_vhost_usage(prgname);
528 					return -1;
529 				} else {
530 					vm2vm_mode = (vm2vm_type)ret;
531 				}
532 			}
533 
534 			/* Enable/disable retries on RX. */
535 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
536 				ret = parse_num_opt(optarg, 1);
537 				if (ret == -1) {
538 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
539 					us_vhost_usage(prgname);
540 					return -1;
541 				} else {
542 					enable_retry = ret;
543 				}
544 			}
545 
546 			/* Enable/disable TX checksum offload. */
547 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
548 				ret = parse_num_opt(optarg, 1);
549 				if (ret == -1) {
550 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
551 					us_vhost_usage(prgname);
552 					return -1;
553 				} else
554 					enable_tx_csum = ret;
555 			}
556 
557 			/* Enable/disable TSO offload. */
558 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
559 				ret = parse_num_opt(optarg, 1);
560 				if (ret == -1) {
561 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
562 					us_vhost_usage(prgname);
563 					return -1;
564 				} else
565 					enable_tso = ret;
566 			}
567 
568 			/* Specify the retry delay time (in microseconds) on RX. */
569 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
570 				ret = parse_num_opt(optarg, INT32_MAX);
571 				if (ret == -1) {
572 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
573 					us_vhost_usage(prgname);
574 					return -1;
575 				} else {
576 					burst_rx_delay_time = ret;
577 				}
578 			}
579 
580 			/* Specify the number of retries on RX. */
581 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
582 				ret = parse_num_opt(optarg, INT32_MAX);
583 				if (ret == -1) {
584 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
585 					us_vhost_usage(prgname);
586 					return -1;
587 				} else {
588 					burst_rx_retry_num = ret;
589 				}
590 			}
591 
592 			/* Enable/disable RX mergeable buffers. */
593 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
594 				ret = parse_num_opt(optarg, 1);
595 				if (ret == -1) {
596 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
597 					us_vhost_usage(prgname);
598 					return -1;
599 				} else {
600 					mergeable = !!ret;
601 					if (ret) {
602 						vmdq_conf_default.rxmode.offloads |=
603 							DEV_RX_OFFLOAD_JUMBO_FRAME;
604 						vmdq_conf_default.rxmode.max_rx_pkt_len
605 							= JUMBO_FRAME_MAX_SIZE;
606 					}
607 				}
608 			}
609 
610 			/* Enable/disable stats. */
611 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
612 				ret = parse_num_opt(optarg, INT32_MAX);
613 				if (ret == -1) {
614 					RTE_LOG(INFO, VHOST_CONFIG,
615 						"Invalid argument for stats [0..N]\n");
616 					us_vhost_usage(prgname);
617 					return -1;
618 				} else {
619 					enable_stats = ret;
620 				}
621 			}
622 
623 			/* Set socket file path. */
624 			if (!strncmp(long_option[option_index].name,
625 						"socket-file", MAX_LONG_OPT_SZ)) {
626 				if (us_vhost_parse_socket_path(optarg) == -1) {
627 					RTE_LOG(INFO, VHOST_CONFIG,
628 					"Invalid argument for socket name (Max %d characters)\n",
629 					PATH_MAX);
630 					us_vhost_usage(prgname);
631 					return -1;
632 				}
633 			}
634 
635 			if (!strncmp(long_option[option_index].name,
636 						"dma-type", MAX_LONG_OPT_SZ)) {
637 				if (strlen(optarg) >= MAX_LONG_OPT_SZ) {
638 					RTE_LOG(INFO, VHOST_CONFIG,
639 						"Wrong DMA type\n");
640 					us_vhost_usage(prgname);
641 					return -1;
642 				}
643 				strcpy(dma_type, optarg);
644 			}
645 
646 			if (!strncmp(long_option[option_index].name,
647 						"dmas", MAX_LONG_OPT_SZ)) {
648 				if (open_dma(optarg) == -1) {
649 					RTE_LOG(INFO, VHOST_CONFIG,
650 						"Wrong DMA args\n");
651 					us_vhost_usage(prgname);
652 					return -1;
653 				}
654 				async_vhost_driver = 1;
655 			}
656 
657 			break;
658 
659 			/* Invalid option - print options. */
660 		default:
661 			us_vhost_usage(prgname);
662 			return -1;
663 		}
664 	}
665 
666 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
667 		if (enabled_port_mask & (1 << i))
668 			ports[num_ports++] = i;
669 	}
670 
671 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
672 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
673 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
674 		return -1;
675 	}
676 
677 	return 0;
678 }
679 
680 /*
681  * Update the global variable num_ports and the ports array according to the
682  * number of ports in the system, and return the number of valid ports.
683  */
684 static unsigned check_ports_num(unsigned nb_ports)
685 {
686 	unsigned valid_num_ports = num_ports;
687 	unsigned portid;
688 
689 	if (num_ports > nb_ports) {
690 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
691 			num_ports, nb_ports);
692 		num_ports = nb_ports;
693 	}
694 
695 	for (portid = 0; portid < num_ports; portid ++) {
696 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
697 			RTE_LOG(INFO, VHOST_PORT,
698 				"\nSpecified port ID(%u) is not valid\n",
699 				ports[portid]);
700 			ports[portid] = INVALID_PORT_ID;
701 			valid_num_ports--;
702 		}
703 	}
704 	return valid_num_ports;
705 }
706 
707 static __rte_always_inline struct vhost_dev *
708 find_vhost_dev(struct rte_ether_addr *mac)
709 {
710 	struct vhost_dev *vdev;
711 
712 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
713 		if (vdev->ready == DEVICE_RX &&
714 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
715 			return vdev;
716 	}
717 
718 	return NULL;
719 }
720 
721 /*
722  * This function learns the MAC address of the device and registers it,
723  * along with a VLAN tag, with a VMDQ pool.
724  */
725 static int
726 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
727 {
728 	struct rte_ether_hdr *pkt_hdr;
729 	int i, ret;
730 
731 	/* Learn MAC address of guest device from packet */
732 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
733 
734 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
735 		RTE_LOG(ERR, VHOST_DATA,
736 			"(%d) device is using a registered MAC!\n",
737 			vdev->vid);
738 		return -1;
739 	}
740 
741 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
742 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
743 
744 	/* vlan_tag currently uses the device_id. */
745 	vdev->vlan_tag = vlan_tags[vdev->vid];
746 
747 	/* Print out VMDQ registration info. */
748 	RTE_LOG(INFO, VHOST_DATA,
749 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
750 		vdev->vid,
751 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
752 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
753 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
754 		vdev->vlan_tag);
755 
756 	/* Register the MAC address. */
757 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
758 				(uint32_t)vdev->vid + vmdq_pool_base);
759 	if (ret)
760 		RTE_LOG(ERR, VHOST_DATA,
761 			"(%d) failed to add device MAC address to VMDQ\n",
762 			vdev->vid);
763 
764 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
765 
766 	/* Set device as ready for RX. */
767 	vdev->ready = DEVICE_RX;
768 
769 	return 0;
770 }
771 
772 /*
773  * Removes the MAC address and VLAN tag from VMDQ. Ensures that nothing is
774  * adding buffers to the Rx queue before disabling Rx on the device.
775  */
776 static inline void
777 unlink_vmdq(struct vhost_dev *vdev)
778 {
779 	unsigned i = 0;
780 	unsigned rx_count;
781 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
782 
783 	if (vdev->ready == DEVICE_RX) {
784 		/* Clear MAC and VLAN settings. */
785 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
786 		for (i = 0; i < 6; i++)
787 			vdev->mac_address.addr_bytes[i] = 0;
788 
789 		vdev->vlan_tag = 0;
790 
791 		/* Clear out the receive buffers. */
792 		rx_count = rte_eth_rx_burst(ports[0],
793 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
794 
795 		while (rx_count) {
796 			for (i = 0; i < rx_count; i++)
797 				rte_pktmbuf_free(pkts_burst[i]);
798 
799 			rx_count = rte_eth_rx_burst(ports[0],
800 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
801 		}
802 
803 		vdev->ready = DEVICE_MAC_LEARNING;
804 	}
805 }
806 
807 static __rte_always_inline void
808 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
809 	    struct rte_mbuf *m)
810 {
811 	uint16_t ret;
812 	struct rte_mbuf *m_cpl[1], *comp_pkt;
813 	uint32_t nr_comp = 0;
814 
815 	if (builtin_net_driver) {
816 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
817 	} else if (async_vhost_driver) {
818 		ret = rte_vhost_submit_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ,
819 						&m, 1, &comp_pkt, &nr_comp);
820 		if (nr_comp == 1)
821 			goto done;
822 
823 		if (likely(ret))
824 			dst_vdev->nr_async_pkts++;
825 
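		/*
		 * For VM2VM delivery, wait for the async copy to complete before
		 * returning, so the caller can safely free the mbuf afterwards.
		 */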
826 		while (likely(dst_vdev->nr_async_pkts)) {
827 			if (rte_vhost_poll_enqueue_completed(dst_vdev->vid,
828 					VIRTIO_RXQ, m_cpl, 1))
829 				dst_vdev->nr_async_pkts--;
830 		}
831 	} else {
832 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
833 	}
834 
835 done:
836 	if (enable_stats) {
837 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
838 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
839 		src_vdev->stats.tx_total++;
840 		src_vdev->stats.tx += ret;
841 	}
842 }
843 
844 /*
845  * Check if the packet destination MAC address is for a local device. If so then put
846  * the packet on that device's Rx queue. If not then return.
847  */
848 static __rte_always_inline int
849 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
850 {
851 	struct rte_ether_hdr *pkt_hdr;
852 	struct vhost_dev *dst_vdev;
853 
854 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
855 
856 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
857 	if (!dst_vdev)
858 		return -1;
859 
860 	if (vdev->vid == dst_vdev->vid) {
861 		RTE_LOG_DP(DEBUG, VHOST_DATA,
862 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
863 			vdev->vid);
864 		return 0;
865 	}
866 
867 	RTE_LOG_DP(DEBUG, VHOST_DATA,
868 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
869 
870 	if (unlikely(dst_vdev->remove)) {
871 		RTE_LOG_DP(DEBUG, VHOST_DATA,
872 			"(%d) device is marked for removal\n", dst_vdev->vid);
873 		return 0;
874 	}
875 
876 	virtio_xmit(dst_vdev, vdev, m);
877 	return 0;
878 }
879 
880 /*
881  * Check if the destination MAC of a packet belongs to a local VM; if so,
882  * get its VLAN tag and the length offset.
883  */
884 static __rte_always_inline int
885 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
886 	uint32_t *offset, uint16_t *vlan_tag)
887 {
888 	struct vhost_dev *dst_vdev;
889 	struct rte_ether_hdr *pkt_hdr =
890 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
891 
892 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
893 	if (!dst_vdev)
894 		return 0;
895 
896 	if (vdev->vid == dst_vdev->vid) {
897 		RTE_LOG_DP(DEBUG, VHOST_DATA,
898 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
899 			vdev->vid);
900 		return -1;
901 	}
902 
903 	/*
904 	 * HW VLAN stripping shortens the packet by the length of the
905 	 * VLAN tag, so restore the packet length by adding it
906 	 * back here.
907 	 */
908 	*offset  = VLAN_HLEN;
909 	*vlan_tag = vlan_tags[vdev->vid];
910 
911 	RTE_LOG_DP(DEBUG, VHOST_DATA,
912 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
913 		vdev->vid, dst_vdev->vid, *vlan_tag);
914 
915 	return 0;
916 }
917 
918 static uint16_t
919 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
920 {
921 	if (ol_flags & PKT_TX_IPV4)
922 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
923 	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
924 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
925 }
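/*
 * Note: for TSO the hardware expects the TCP checksum field to be pre-filled
 * with the pseudo-header checksum (without payload); the NIC then completes
 * the checksum for each segment it generates.
 */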
926 
927 static void virtio_tx_offload(struct rte_mbuf *m)
928 {
929 	void *l3_hdr;
930 	struct rte_ipv4_hdr *ipv4_hdr = NULL;
931 	struct rte_tcp_hdr *tcp_hdr = NULL;
932 	struct rte_ether_hdr *eth_hdr =
933 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
934 
935 	l3_hdr = (char *)eth_hdr + m->l2_len;
936 
937 	if (m->ol_flags & PKT_TX_IPV4) {
938 		ipv4_hdr = l3_hdr;
939 		ipv4_hdr->hdr_checksum = 0;
940 		m->ol_flags |= PKT_TX_IP_CKSUM;
941 	}
942 
943 	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
944 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
945 }
946 
947 static inline void
948 free_pkts(struct rte_mbuf **pkts, uint16_t n)
949 {
950 	while (n--)
951 		rte_pktmbuf_free(pkts[n]);
952 }
953 
954 static __rte_always_inline void
955 do_drain_mbuf_table(struct mbuf_table *tx_q)
956 {
957 	uint16_t count;
958 
959 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
960 				 tx_q->m_table, tx_q->len);
961 	if (unlikely(count < tx_q->len))
962 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
963 
964 	tx_q->len = 0;
965 }
966 
967 /*
968  * This function routes the TX packet to the correct interface. This
969  * may be a local device or the physical port.
970  */
971 static __rte_always_inline void
972 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
973 {
974 	struct mbuf_table *tx_q;
975 	unsigned offset = 0;
976 	const uint16_t lcore_id = rte_lcore_id();
977 	struct rte_ether_hdr *nh;
978 
979 
980 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
981 	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
982 		struct vhost_dev *vdev2;
983 
984 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
985 			if (vdev2 != vdev)
986 				virtio_xmit(vdev2, vdev, m);
987 		}
988 		goto queue2nic;
989 	}
990 
991 	/* Check if the destination is a local VM. */
992 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
993 		rte_pktmbuf_free(m);
994 		return;
995 	}
996 
997 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
998 		if (unlikely(find_local_dest(vdev, m, &offset,
999 					     &vlan_tag) != 0)) {
1000 			rte_pktmbuf_free(m);
1001 			return;
1002 		}
1003 	}
1004 
1005 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1006 		"(%d) TX: MAC address is external\n", vdev->vid);
1007 
1008 queue2nic:
1009 
1010 	/* Add packet to the port Tx queue. */
1011 	tx_q = &lcore_tx_queue[lcore_id];
1012 
1013 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1014 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1015 		/* Guest has inserted the vlan tag. */
1016 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1017 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1018 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1019 			(vh->vlan_tci != vlan_tag_be))
1020 			vh->vlan_tci = vlan_tag_be;
1021 	} else {
1022 		m->ol_flags |= PKT_TX_VLAN_PKT;
1023 
1024 		/*
1025 		 * Find the right seg to adjust the data len when offset is
1026 		 * bigger than tail room size.
1027 		 */
1028 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1029 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1030 				m->data_len += offset;
1031 			else {
1032 				struct rte_mbuf *seg = m;
1033 
1034 				while ((seg->next != NULL) &&
1035 					(offset > rte_pktmbuf_tailroom(seg)))
1036 					seg = seg->next;
1037 
1038 				seg->data_len += offset;
1039 			}
1040 			m->pkt_len += offset;
1041 		}
1042 
1043 		m->vlan_tci = vlan_tag;
1044 	}
1045 
1046 	if (m->ol_flags & PKT_TX_TCP_SEG)
1047 		virtio_tx_offload(m);
1048 
1049 	tx_q->m_table[tx_q->len++] = m;
1050 	if (enable_stats) {
1051 		vdev->stats.tx_total++;
1052 		vdev->stats.tx++;
1053 	}
1054 
1055 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1056 		do_drain_mbuf_table(tx_q);
1057 }
1058 
1059 
1060 static __rte_always_inline void
1061 drain_mbuf_table(struct mbuf_table *tx_q)
1062 {
1063 	static uint64_t prev_tsc;
1064 	uint64_t cur_tsc;
1065 
1066 	if (tx_q->len == 0)
1067 		return;
1068 
1069 	cur_tsc = rte_rdtsc();
1070 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1071 		prev_tsc = cur_tsc;
1072 
1073 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1074 			"TX queue drained after timeout with burst size %u\n",
1075 			tx_q->len);
1076 		do_drain_mbuf_table(tx_q);
1077 	}
1078 }
1079 
1080 static __rte_always_inline void
1081 complete_async_pkts(struct vhost_dev *vdev, uint16_t qid)
1082 {
1083 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1084 	uint16_t complete_count;
1085 
1086 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1087 						qid, p_cpl, MAX_PKT_BURST);
1088 	vdev->nr_async_pkts -= complete_count;
1089 	if (complete_count)
1090 		free_pkts(p_cpl, complete_count);
1091 }
1092 
1093 static __rte_always_inline void
1094 drain_eth_rx(struct vhost_dev *vdev)
1095 {
1096 	uint16_t rx_count, enqueue_count;
1097 	struct rte_mbuf *pkts[MAX_PKT_BURST], *comp_pkts[MAX_PKT_BURST];
1098 	uint32_t nr_comp = 0;
1099 
1100 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1101 				    pkts, MAX_PKT_BURST);
1102 
1103 	while (likely(vdev->nr_async_pkts))
1104 		complete_async_pkts(vdev, VIRTIO_RXQ);
1105 
1106 	if (!rx_count)
1107 		return;
1108 
1109 	/*
1110 	 * When "enable_retry" is set, wait and retry when there are not
1111 	 * enough free slots in the queue to hold @rx_count packets,
1112 	 * to reduce packet loss.
1113 	 */
1114 	if (enable_retry &&
1115 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1116 			VIRTIO_RXQ))) {
1117 		uint32_t retry;
1118 
1119 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1120 			rte_delay_us(burst_rx_delay_time);
1121 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1122 					VIRTIO_RXQ))
1123 				break;
1124 		}
1125 	}
1126 
1127 	if (builtin_net_driver) {
1128 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1129 						pkts, rx_count);
1130 	} else if (async_vhost_driver) {
1131 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1132 					VIRTIO_RXQ, pkts, rx_count, comp_pkts,
1133 					&nr_comp);
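		/*
		 * Packets completed inline are freed here; the remaining in-flight
		 * packets are freed later in complete_async_pkts().
		 */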
1134 		if (nr_comp > 0) {
1135 			free_pkts(comp_pkts, nr_comp);
1136 			enqueue_count -= nr_comp;
1137 		}
1138 		vdev->nr_async_pkts += enqueue_count;
1139 	} else {
1140 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1141 						pkts, rx_count);
1142 	}
1143 
1144 	if (enable_stats) {
1145 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1146 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1147 	}
1148 
1149 	if (!async_vhost_driver)
1150 		free_pkts(pkts, rx_count);
1151 }
1152 
1153 static __rte_always_inline void
1154 drain_virtio_tx(struct vhost_dev *vdev)
1155 {
1156 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1157 	uint16_t count;
1158 	uint16_t i;
1159 
1160 	if (builtin_net_driver) {
1161 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1162 					pkts, MAX_PKT_BURST);
1163 	} else {
1164 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1165 					mbuf_pool, pkts, MAX_PKT_BURST);
1166 	}
1167 
1168 	/* setup VMDq for the first packet */
1169 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1170 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) {
1171 			free_pkts(pkts, count);
			/* The burst has been freed; do not route it below. */
			return;
		}
1172 	}
1173 
1174 	for (i = 0; i < count; ++i)
1175 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1176 }
1177 
1178 /*
1179  * Main function of vhost-switch. It basically does:
1180  *
1181  * for each vhost device {
1182  *    - drain_eth_rx()
1183  *
1184  *      Which drains the host eth Rx queue linked to the vhost device
1185  *      and delivers all packets to the guest virtio Rx ring associated
1186  *      with this vhost device.
1187  *
1188  *    - drain_virtio_tx()
1189  *
1190  *      Which drains the guest virtio Tx queue and delivers the packets
1191  *      to the target, which could be another vhost device or the
1192  *      physical eth dev. The routing is done in function "virtio_tx_route".
1193  * }
1194  */
1195 static int
1196 switch_worker(void *arg __rte_unused)
1197 {
1198 	unsigned i;
1199 	unsigned lcore_id = rte_lcore_id();
1200 	struct vhost_dev *vdev;
1201 	struct mbuf_table *tx_q;
1202 
1203 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1204 
1205 	tx_q = &lcore_tx_queue[lcore_id];
1206 	for (i = 0; i < rte_lcore_count(); i++) {
1207 		if (lcore_ids[i] == lcore_id) {
1208 			tx_q->txq_id = i;
1209 			break;
1210 		}
1211 	}
1212 
1213 	while(1) {
1214 		drain_mbuf_table(tx_q);
1215 
1216 		/*
1217 		 * If requested, inform the configuration core that we have finished
1218 		 * iterating the linked list and that no devices are in use.
1219 		 */
1220 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1221 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1222 
1223 		/*
1224 		 * Process vhost devices
1225 		 */
1226 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1227 			      lcore_vdev_entry) {
1228 			if (unlikely(vdev->remove)) {
1229 				unlink_vmdq(vdev);
1230 				vdev->ready = DEVICE_SAFE_REMOVE;
1231 				continue;
1232 			}
1233 
1234 			if (likely(vdev->ready == DEVICE_RX))
1235 				drain_eth_rx(vdev);
1236 
1237 			if (likely(!vdev->remove))
1238 				drain_virtio_tx(vdev);
1239 		}
1240 	}
1241 
1242 	return 0;
1243 }
1244 
1245 /*
1246  * Remove a device from the specific data core linked list and from the
1247  * main linked list. Synchronization occurs through the use of the
1248  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1249  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1250  */
1251 static void
1252 destroy_device(int vid)
1253 {
1254 	struct vhost_dev *vdev = NULL;
1255 	int lcore;
1256 
1257 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1258 		if (vdev->vid == vid)
1259 			break;
1260 	}
1261 	if (!vdev)
1262 		return;
1263 	/* Set the remove flag. */
1264 	vdev->remove = 1;
1265 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1266 		rte_pause();
1267 	}
1268 
1269 	if (builtin_net_driver)
1270 		vs_vhost_net_remove(vdev);
1271 
1272 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1273 		     lcore_vdev_entry);
1274 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1275 
1276 
1277 	/* Set the dev_removal_flag on each lcore. */
1278 	RTE_LCORE_FOREACH_WORKER(lcore)
1279 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1280 
1281 	/*
1282 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1283 	 * we can be sure that they can no longer access the device removed
1284 	 * from the linked lists and that the devices are no longer in use.
1285 	 */
1286 	RTE_LCORE_FOREACH_WORKER(lcore) {
1287 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1288 			rte_pause();
1289 	}
1290 
1291 	lcore_info[vdev->coreid].device_num--;
1292 
1293 	RTE_LOG(INFO, VHOST_DATA,
1294 		"(%d) device has been removed from data core\n",
1295 		vdev->vid);
1296 
1297 	if (async_vhost_driver)
1298 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1299 
1300 	rte_free(vdev);
1301 }
1302 
1303 /*
1304  * A new device is added to a data core. First the device is added to the main linked list
1305  * and then allocated to a specific data core.
1306  */
1307 static int
1308 new_device(int vid)
1309 {
1310 	int lcore, core_add = 0;
1311 	uint32_t device_num_min = num_devices;
1312 	struct vhost_dev *vdev;
1313 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1314 	if (vdev == NULL) {
1315 		RTE_LOG(INFO, VHOST_DATA,
1316 			"(%d) couldn't allocate memory for vhost dev\n",
1317 			vid);
1318 		return -1;
1319 	}
1320 	vdev->vid = vid;
1321 
1322 	if (builtin_net_driver)
1323 		vs_vhost_net_setup(vdev);
1324 
1325 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1326 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1327 
1328 	/* Reset the ready flag. */
1329 	vdev->ready = DEVICE_MAC_LEARNING;
1330 	vdev->remove = 0;
1331 
1332 	/* Find a suitable lcore to add the device. */
1333 	RTE_LCORE_FOREACH_WORKER(lcore) {
1334 		if (lcore_info[lcore].device_num < device_num_min) {
1335 			device_num_min = lcore_info[lcore].device_num;
1336 			core_add = lcore;
1337 		}
1338 	}
1339 	vdev->coreid = core_add;
1340 
1341 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1342 			  lcore_vdev_entry);
1343 	lcore_info[vdev->coreid].device_num++;
1344 
1345 	/* Disable notifications. */
1346 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1347 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1348 
1349 	RTE_LOG(INFO, VHOST_DATA,
1350 		"(%d) device has been added to data core %d\n",
1351 		vid, vdev->coreid);
1352 
1353 	if (async_vhost_driver) {
1354 		struct rte_vhost_async_features f;
1355 		struct rte_vhost_async_channel_ops channel_ops;
1356 		if (strncmp(dma_type, "ioat", 4) == 0) {
1357 			channel_ops.transfer_data = ioat_transfer_data_cb;
1358 			channel_ops.check_completed_copies =
1359 				ioat_check_completed_copies_cb;
1360 			f.async_inorder = 1;
1361 			f.async_threshold = 256;
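			/*
			 * Assumption based on the async API: packets shorter than
			 * async_threshold bytes are copied by the CPU instead of
			 * being offloaded to the DMA engine.
			 */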
1362 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1363 				f.intval, &channel_ops);
1364 		}
1365 	}
1366 
1367 	return 0;
1368 }
1369 
1370 /*
1371  * These callbacks allow devices to be added to the data cores when
1372  * configuration is fully complete.
1373  */
1374 static const struct vhost_device_ops virtio_net_device_ops =
1375 {
1376 	.new_device =  new_device,
1377 	.destroy_device = destroy_device,
1378 };
1379 
1380 /*
1381  * This thread wakes up periodically to print stats if the user has
1382  * enabled them.
1383  */
1384 static void *
1385 print_stats(__rte_unused void *arg)
1386 {
1387 	struct vhost_dev *vdev;
1388 	uint64_t tx_dropped, rx_dropped;
1389 	uint64_t tx, tx_total, rx, rx_total;
1390 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1391 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
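	/* ANSI escape sequences: ESC[2J clears the screen, ESC[1;1H moves the cursor to the top-left corner. */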
1392 
1393 	while(1) {
1394 		sleep(enable_stats);
1395 
1396 		/* Clear screen and move to top left */
1397 		printf("%s%s\n", clr, top_left);
1398 		printf("Device statistics =================================\n");
1399 
1400 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1401 			tx_total   = vdev->stats.tx_total;
1402 			tx         = vdev->stats.tx;
1403 			tx_dropped = tx_total - tx;
1404 
1405 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1406 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1407 			rx_dropped = rx_total - rx;
1408 
1409 			printf("Statistics for device %d\n"
1410 				"-----------------------\n"
1411 				"TX total:              %" PRIu64 "\n"
1412 				"TX dropped:            %" PRIu64 "\n"
1413 				"TX successful:         %" PRIu64 "\n"
1414 				"RX total:              %" PRIu64 "\n"
1415 				"RX dropped:            %" PRIu64 "\n"
1416 				"RX successful:         %" PRIu64 "\n",
1417 				vdev->vid,
1418 				tx_total, tx_dropped, tx,
1419 				rx_total, rx_dropped, rx);
1420 		}
1421 
1422 		printf("===================================================\n");
1423 
1424 		fflush(stdout);
1425 	}
1426 
1427 	return NULL;
1428 }
1429 
1430 static void
1431 unregister_drivers(int socket_num)
1432 {
1433 	int i, ret;
1434 
1435 	for (i = 0; i < socket_num; i++) {
1436 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1437 		if (ret != 0)
1438 			RTE_LOG(ERR, VHOST_CONFIG,
1439 				"Failed to unregister vhost driver for %s.\n",
1440 				socket_files + i * PATH_MAX);
1441 	}
1442 }
1443 
1444 /* When we receive a SIGINT signal, unregister the vhost driver. */
1445 static void
1446 sigint_handler(__rte_unused int signum)
1447 {
1448 	/* Unregister vhost driver. */
1449 	unregister_drivers(nb_sockets);
1450 
1451 	exit(0);
1452 }
1453 
1454 /*
1455  * While creating an mbuf pool, one key thing is to figure out how
1456  * While creating an mbuf pool, one key thing is to figure out how
1457  * many mbuf entries are enough for our use. FYI, here are some
1458  * guidelines:
1459  *
1460  * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1461  *
1462  * - For each switch core (a CPU core that does the packet switching),
1463  *   we also need to reserve some mbufs for receiving the packets from
1464  *   the virtio Tx queue. How many are enough depends on the usage; it
1465  *   is normally a simple calculation like the following:
1466  *
1467  *       MAX_PKT_BURST * max packet size / mbuf size
1468  *
1469  *   So we definitely need to allocate more mbufs when TSO is enabled.
1470  *
1471  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1472  *   mbufs for receiving the packets from the physical NIC device.
1473  *
1474  * - We also need to make sure each switch core allocates enough mbufs to fill up the mbuf cache.
1475  */
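/*
 * Worked example (illustrative, assuming MAX_PKT_BURST is 32 and the default
 * 2 KB + headroom data room of 2176 bytes): with mergeable buffers (mtu = 9000),
 * nr_mbufs_per_core ~= (9000 + 2176) * 32 / (2176 - 128) ~= 174, plus
 * nr_rx_desc (1024) per core, plus nr_queues * nr_rx_desc for the Rx rings.
 */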
1476 static void
1477 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1478 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1479 {
1480 	uint32_t nr_mbufs;
1481 	uint32_t nr_mbufs_per_core;
1482 	uint32_t mtu = 1500;
1483 
1484 	if (mergeable)
1485 		mtu = 9000;
1486 	if (enable_tso)
1487 		mtu = 64 * 1024;
1488 
1489 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1490 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1491 	nr_mbufs_per_core += nr_rx_desc;
1492 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1493 
1494 	nr_mbufs  = nr_queues * nr_rx_desc;
1495 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1496 	nr_mbufs *= nr_port;
1497 
1498 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1499 					    nr_mbuf_cache, 0, mbuf_size,
1500 					    rte_socket_id());
1501 	if (mbuf_pool == NULL)
1502 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1503 }
1504 
1505 /*
1506  * Main function, does initialisation and calls the per-lcore functions.
1507  */
1508 int
1509 main(int argc, char *argv[])
1510 {
1511 	unsigned lcore_id, core_id = 0;
1512 	unsigned nb_ports, valid_num_ports;
1513 	int ret, i;
1514 	uint16_t portid;
1515 	static pthread_t tid;
1516 	uint64_t flags = 0;
1517 
1518 	signal(SIGINT, sigint_handler);
1519 
1520 	/* init EAL */
1521 	ret = rte_eal_init(argc, argv);
1522 	if (ret < 0)
1523 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1524 	argc -= ret;
1525 	argv += ret;
1526 
1527 	/* parse app arguments */
1528 	ret = us_vhost_parse_args(argc, argv);
1529 	if (ret < 0)
1530 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1531 
1532 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1533 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1534 
1535 		if (rte_lcore_is_enabled(lcore_id))
1536 			lcore_ids[core_id++] = lcore_id;
1537 	}
1538 
1539 	if (rte_lcore_count() > RTE_MAX_LCORE)
1540 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1541 
1542 	/* Get the number of physical ports. */
1543 	nb_ports = rte_eth_dev_count_avail();
1544 
1545 	/*
1546 	 * Update the global variable num_ports and the global ports array,
1547 	 * and get the number of valid ports from the system port count.
1548 	 */
1549 	valid_num_ports = check_ports_num(nb_ports);
1550 
1551 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1552 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1553 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1554 		return -1;
1555 	}
1556 
1557 	/*
1558 	 * FIXME: here we are trying to allocate mbufs big enough for
1559 	 * @MAX_QUEUES, but the truth is we're never going to use that
1560 	 * many queues here. We probably should only do allocation for
1561 	 * those queues we are going to use.
1562 	 */
1563 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1564 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1565 
1566 	if (vm2vm_mode == VM2VM_HARDWARE) {
1567 		/* Enable VT loop back to let L2 switch to do it. */
1568 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1569 		RTE_LOG(DEBUG, VHOST_CONFIG,
1570 			"Enable loop back for L2 switch in vmdq.\n");
1571 	}
1572 
1573 	/* initialize all ports */
1574 	RTE_ETH_FOREACH_DEV(portid) {
1575 		/* skip ports that are not enabled */
1576 		if ((enabled_port_mask & (1 << portid)) == 0) {
1577 			RTE_LOG(INFO, VHOST_PORT,
1578 				"Skipping disabled port %d\n", portid);
1579 			continue;
1580 		}
1581 		if (port_init(portid) != 0)
1582 			rte_exit(EXIT_FAILURE,
1583 				"Cannot initialize network ports\n");
1584 	}
1585 
1586 	/* Enable stats if the user option is set. */
1587 	if (enable_stats) {
1588 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1589 					print_stats, NULL);
1590 		if (ret < 0)
1591 			rte_exit(EXIT_FAILURE,
1592 				"Cannot create print-stats thread\n");
1593 	}
1594 
1595 	/* Launch all data cores. */
1596 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1597 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1598 
1599 	if (client_mode)
1600 		flags |= RTE_VHOST_USER_CLIENT;
1601 
1602 	/* Register vhost user driver to handle vhost messages. */
1603 	for (i = 0; i < nb_sockets; i++) {
1604 		char *file = socket_files + i * PATH_MAX;
1605 		if (async_vhost_driver)
1606 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1607 
1608 		ret = rte_vhost_driver_register(file, flags);
1609 		if (ret != 0) {
1610 			unregister_drivers(i);
1611 			rte_exit(EXIT_FAILURE,
1612 				"vhost driver register failure.\n");
1613 		}
1614 
1615 		if (builtin_net_driver)
1616 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1617 
1618 		if (mergeable == 0) {
1619 			rte_vhost_driver_disable_features(file,
1620 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1621 		}
1622 
1623 		if (enable_tx_csum == 0) {
1624 			rte_vhost_driver_disable_features(file,
1625 				1ULL << VIRTIO_NET_F_CSUM);
1626 		}
1627 
1628 		if (enable_tso == 0) {
1629 			rte_vhost_driver_disable_features(file,
1630 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1631 			rte_vhost_driver_disable_features(file,
1632 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1633 			rte_vhost_driver_disable_features(file,
1634 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1635 			rte_vhost_driver_disable_features(file,
1636 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1637 		}
1638 
1639 		if (promiscuous) {
1640 			rte_vhost_driver_enable_features(file,
1641 				1ULL << VIRTIO_NET_F_CTRL_RX);
1642 		}
1643 
1644 		ret = rte_vhost_driver_callback_register(file,
1645 			&virtio_net_device_ops);
1646 		if (ret != 0) {
1647 			rte_exit(EXIT_FAILURE,
1648 				"failed to register vhost driver callbacks.\n");
1649 		}
1650 
1651 		if (rte_vhost_driver_start(file) < 0) {
1652 			rte_exit(EXIT_FAILURE,
1653 				"failed to start vhost driver.\n");
1654 		}
1655 	}
1656 
1657 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1658 		rte_eal_wait_lcore(lcore_id);
1659 
1660 	return 0;
1661 
1662 }
1663