xref: /dpdk/examples/vhost/main.c (revision 2808423a9ce42a748aed77a4b487be27d2b6acfa)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Max number of devices. Limited by vmdq. */
59 #define MAX_DEVICES 64
60 
61 /* Size of buffers used for snprintfs. */
62 #define MAX_PRINT_BUFF 6072
63 
64 /* Maximum long option length for option parsing. */
65 #define MAX_LONG_OPT_SZ 64
66 
67 /* mask of enabled ports */
68 static uint32_t enabled_port_mask = 0;
69 
70 /* Promiscuous mode */
71 static uint32_t promiscuous;
72 
73 /* number of devices/queues to support */
74 static uint32_t num_queues = 0;
75 static uint32_t num_devices;
76 
77 static struct rte_mempool *mbuf_pool;
78 static int mergeable;
79 
80 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
81 typedef enum {
82 	VM2VM_DISABLED = 0,
83 	VM2VM_SOFTWARE = 1,
84 	VM2VM_HARDWARE = 2,
85 	VM2VM_LAST
86 } vm2vm_type;
87 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
88 
89 /* Enable stats. */
90 static uint32_t enable_stats = 0;
91 /* Enable retries on RX. */
92 static uint32_t enable_retry = 1;
93 
94 /* Disable TX checksum offload */
95 static uint32_t enable_tx_csum;
96 
97 /* Disable TSO offload */
98 static uint32_t enable_tso;
99 
100 static int client_mode;
101 static int dequeue_zero_copy;
102 
103 static int builtin_net_driver;
104 
105 /* Specify timeout (in microseconds) between retries on RX. */
106 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
107 /* Specify the number of retries on RX. */
108 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
109 
110 /* Socket file paths. Can be set by user */
111 static char *socket_files;
112 static int nb_sockets;
113 
114 /* empty vmdq configuration structure. Filled in programmatically */
115 static struct rte_eth_conf vmdq_conf_default = {
116 	.rxmode = {
117 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
118 		.split_hdr_size = 0,
119 		/*
120 		 * VLAN strip is necessary for 1G NICs such as I350;
121 		 * it fixes a bug where IPv4 forwarding in the guest cannot
122 		 * forward packets from one virtio dev to another virtio dev.
123 		 */
124 		.offloads = (DEV_RX_OFFLOAD_CRC_STRIP |
125 			     DEV_RX_OFFLOAD_VLAN_STRIP),
126 	},
127 
128 	.txmode = {
129 		.mq_mode = ETH_MQ_TX_NONE,
130 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
131 			     DEV_TX_OFFLOAD_TCP_CKSUM |
132 			     DEV_TX_OFFLOAD_VLAN_INSERT |
133 			     DEV_TX_OFFLOAD_MULTI_SEGS |
134 			     DEV_TX_OFFLOAD_TCP_TSO),
135 	},
136 	.rx_adv_conf = {
137 		/*
138 		 * should be overridden separately in code with
139 		 * appropriate values
140 		 */
141 		.vmdq_rx_conf = {
142 			.nb_queue_pools = ETH_8_POOLS,
143 			.enable_default_pool = 0,
144 			.default_pool = 0,
145 			.nb_pool_maps = 0,
146 			.pool_map = {{0, 0},},
147 		},
148 	},
149 };
150 
151 
152 static unsigned lcore_ids[RTE_MAX_LCORE];
153 static uint16_t ports[RTE_MAX_ETHPORTS];
154 static unsigned num_ports = 0; /**< The number of ports specified in command line */
155 static uint16_t num_pf_queues, num_vmdq_queues;
156 static uint16_t vmdq_pool_base, vmdq_queue_base;
157 static uint16_t queues_per_pool;
158 
159 const uint16_t vlan_tags[] = {
160 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
161 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
162 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
163 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
164 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
165 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
166 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
167 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
168 };
169 
170 /* ethernet addresses of ports */
171 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
172 
173 static struct vhost_dev_tailq_list vhost_dev_list =
174 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
175 
176 static struct lcore_info lcore_info[RTE_MAX_LCORE];
177 
178 /* Used for queueing bursts of TX packets. */
179 struct mbuf_table {
180 	unsigned len;
181 	unsigned txq_id;
182 	struct rte_mbuf *m_table[MAX_PKT_BURST];
183 };
184 
185 /* TX queue for each data core. */
186 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
187 
188 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
189 				 / US_PER_S * BURST_TX_DRAIN_US)
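/*
 * Worked example (editorial illustration, not part of the original source):
 * with a 2.4 GHz TSC, MBUF_TABLE_DRAIN_TSC evaluates to
 * ceil(2,400,000,000 / 1,000,000) * 100 = 2400 * 100 = 240,000 cycles,
 * i.e. roughly 100 us, which is the drain period checked in
 * drain_mbuf_table() below.
 */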
190 #define VLAN_HLEN       4
191 
192 /*
193  * Builds up the correct configuration for VMDQ VLAN pool map
194  * according to the pool & queue limits.
195  */
196 static inline int
197 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
198 {
199 	struct rte_eth_vmdq_rx_conf conf;
200 	struct rte_eth_vmdq_rx_conf *def_conf =
201 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
202 	unsigned i;
203 
204 	memset(&conf, 0, sizeof(conf));
205 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
206 	conf.nb_pool_maps = num_devices;
207 	conf.enable_loop_back = def_conf->enable_loop_back;
208 	conf.rx_mode = def_conf->rx_mode;
209 
210 	for (i = 0; i < conf.nb_pool_maps; i++) {
211 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
212 		conf.pool_map[i].pools = (1UL << i);
213 	}
214 
215 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
216 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
217 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
218 	return 0;
219 }
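/*
 * Illustrative sketch (editorial addition): if dev_info were to report
 * 8 VMDQ pools, get_eth_conf() above would build a pool map like
 *
 *     pool_map[0] = { .vlan_id = 1000, .pools = 0x01 }
 *     pool_map[1] = { .vlan_id = 1001, .pools = 0x02 }
 *     ...
 *     pool_map[7] = { .vlan_id = 1007, .pools = 0x80 }
 *
 * i.e. each VLAN tag from vlan_tags[] maps to exactly one pool bit, so
 * every vhost device gets its own VMDQ pool.
 */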
220 
221 /*
222  * Validate the device number against the max pool number obtained from
223  * dev_info. If the device number is invalid, print an error message and
224  * return -1. Each device must have its own pool.
225  */
226 static inline int
227 validate_num_devices(uint32_t max_nb_devices)
228 {
229 	if (num_devices > max_nb_devices) {
230 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
231 		return -1;
232 	}
233 	return 0;
234 }
235 
236 /*
237  * Initialises a given port using global settings and with the rx buffers
238  * coming from the mbuf_pool passed as parameter
239  */
240 static inline int
241 port_init(uint16_t port)
242 {
243 	struct rte_eth_dev_info dev_info;
244 	struct rte_eth_conf port_conf;
245 	struct rte_eth_rxconf *rxconf;
246 	struct rte_eth_txconf *txconf;
247 	int16_t rx_rings, tx_rings;
248 	uint16_t rx_ring_size, tx_ring_size;
249 	int retval;
250 	uint16_t q;
251 
252 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
253 	rte_eth_dev_info_get (port, &dev_info);
254 
255 	rxconf = &dev_info.default_rxconf;
256 	txconf = &dev_info.default_txconf;
257 	rxconf->rx_drop_en = 1;
258 
259 	/* configure the number of supported virtio devices based on VMDQ limits */
260 	num_devices = dev_info.max_vmdq_pools;
261 
262 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
263 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
264 
265 	/*
266 	 * When dequeue zero copy is enabled, guest Tx used vring will be
267 	 * updated only when the corresponding mbuf is freed. Thus, nb_tx_desc
268 	 * (tx_ring_size here) must be small enough that the driver will
269 	 * hit the free threshold easily and free mbufs in a timely manner.
270 	 * Otherwise, the guest Tx vring would be starved.
271 	 */
272 	if (dequeue_zero_copy)
273 		tx_ring_size = 64;
274 
275 	tx_rings = (uint16_t)rte_lcore_count();
276 
277 	retval = validate_num_devices(MAX_DEVICES);
278 	if (retval < 0)
279 		return retval;
280 
281 	/* Get port configuration. */
282 	retval = get_eth_conf(&port_conf, num_devices);
283 	if (retval < 0)
284 		return retval;
285 	/* NIC queues are divided into pf queues and vmdq queues.  */
286 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
287 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
288 	num_vmdq_queues = num_devices * queues_per_pool;
289 	num_queues = num_pf_queues + num_vmdq_queues;
290 	vmdq_queue_base = dev_info.vmdq_queue_base;
291 	vmdq_pool_base  = dev_info.vmdq_pool_base;
292 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
293 		num_pf_queues, num_devices, queues_per_pool);
294 
295 	if (!rte_eth_dev_is_valid_port(port))
296 		return -1;
297 
298 	rx_rings = (uint16_t)dev_info.max_rx_queues;
299 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
300 		port_conf.txmode.offloads |=
301 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
302 	/* Configure ethernet device. */
303 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
304 	if (retval != 0) {
305 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
306 			port, strerror(-retval));
307 		return retval;
308 	}
309 
310 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
311 		&tx_ring_size);
312 	if (retval != 0) {
313 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
314 			"for port %u: %s.\n", port, strerror(-retval));
315 		return retval;
316 	}
317 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
318 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
319 			"for Rx queues on port %u.\n", port);
320 		return -1;
321 	}
322 
323 	/* Setup the queues. */
324 	rxconf->offloads = port_conf.rxmode.offloads;
325 	for (q = 0; q < rx_rings; q ++) {
326 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
327 						rte_eth_dev_socket_id(port),
328 						rxconf,
329 						mbuf_pool);
330 		if (retval < 0) {
331 			RTE_LOG(ERR, VHOST_PORT,
332 				"Failed to setup rx queue %u of port %u: %s.\n",
333 				q, port, strerror(-retval));
334 			return retval;
335 		}
336 	}
337 	txconf->offloads = port_conf.txmode.offloads;
338 	for (q = 0; q < tx_rings; q ++) {
339 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
340 						rte_eth_dev_socket_id(port),
341 						txconf);
342 		if (retval < 0) {
343 			RTE_LOG(ERR, VHOST_PORT,
344 				"Failed to setup tx queue %u of port %u: %s.\n",
345 				q, port, strerror(-retval));
346 			return retval;
347 		}
348 	}
349 
350 	/* Start the device. */
351 	retval  = rte_eth_dev_start(port);
352 	if (retval < 0) {
353 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
354 			port, strerror(-retval));
355 		return retval;
356 	}
357 
358 	if (promiscuous)
359 		rte_eth_promiscuous_enable(port);
360 
361 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
362 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
363 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
364 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
365 			port,
366 			vmdq_ports_eth_addr[port].addr_bytes[0],
367 			vmdq_ports_eth_addr[port].addr_bytes[1],
368 			vmdq_ports_eth_addr[port].addr_bytes[2],
369 			vmdq_ports_eth_addr[port].addr_bytes[3],
370 			vmdq_ports_eth_addr[port].addr_bytes[4],
371 			vmdq_ports_eth_addr[port].addr_bytes[5]);
372 
373 	return 0;
374 }
375 
376 /*
377  * Set socket file path.
378  */
379 static int
380 us_vhost_parse_socket_path(const char *q_arg)
381 {
382 	/* reject overly long socket paths */
383 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
384 		return -1;
385 
386 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
387 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
388 	nb_sockets++;
389 
390 	return 0;
391 }
392 
393 /*
394  * Parse the portmask provided at run time.
395  */
396 static int
397 parse_portmask(const char *portmask)
398 {
399 	char *end = NULL;
400 	unsigned long pm;
401 
402 	errno = 0;
403 
404 	/* parse hexadecimal string */
405 	pm = strtoul(portmask, &end, 16);
406 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
407 		return -1;
408 
409 	if (pm == 0)
410 		return -1;
411 
412 	return pm;
413 
414 }
415 
416 /*
417  * Parse num options at run time.
418  */
419 static int
420 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
421 {
422 	char *end = NULL;
423 	unsigned long num;
424 
425 	errno = 0;
426 
427 	/* parse unsigned int string */
428 	num = strtoul(q_arg, &end, 10);
429 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
430 		return -1;
431 
432 	if (num > max_valid_value)
433 		return -1;
434 
435 	return num;
436 
437 }
438 
439 /*
440  * Display usage
441  */
442 static void
443 us_vhost_usage(const char *prgname)
444 {
445 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
446 	"		--vm2vm [0|1|2]\n"
447 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
448 	"		--socket-file <path>\n"
449 	"		--nb-devices ND\n"
450 	"		-p PORTMASK: Set mask for ports to be used by application\n"
451 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
452 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
453 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
454 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
455 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
456 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
457 	"		--socket-file: The path of the socket file.\n"
458 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
459 	"		--tso [0|1] disable/enable TCP segment offload.\n"
460 	"		--client register a vhost-user socket as client mode.\n"
461 	"		--dequeue-zero-copy enables dequeue zero copy\n",
462 	       prgname);
463 }
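/*
 * Example invocation (editorial illustration; the binary name and the EAL
 * options are assumptions, adjust them for your build and platform):
 *
 *     ./vhost-switch -l 1-3 -n 4 -- -p 0x1 \
 *         --socket-file /tmp/sock0 --mergeable 1 --stats 2
 *
 * This enables port 0, creates one vhost-user socket at /tmp/sock0, enables
 * mergeable RX buffers and prints statistics every 2 seconds.
 */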
464 
465 /*
466  * Parse the arguments given in the command line of the application.
467  */
468 static int
469 us_vhost_parse_args(int argc, char **argv)
470 {
471 	int opt, ret;
472 	int option_index;
473 	unsigned i;
474 	const char *prgname = argv[0];
475 	static struct option long_option[] = {
476 		{"vm2vm", required_argument, NULL, 0},
477 		{"rx-retry", required_argument, NULL, 0},
478 		{"rx-retry-delay", required_argument, NULL, 0},
479 		{"rx-retry-num", required_argument, NULL, 0},
480 		{"mergeable", required_argument, NULL, 0},
481 		{"stats", required_argument, NULL, 0},
482 		{"socket-file", required_argument, NULL, 0},
483 		{"tx-csum", required_argument, NULL, 0},
484 		{"tso", required_argument, NULL, 0},
485 		{"client", no_argument, &client_mode, 1},
486 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
487 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
488 		{NULL, 0, 0, 0},
489 	};
490 
491 	/* Parse command line */
492 	while ((opt = getopt_long(argc, argv, "p:P",
493 			long_option, &option_index)) != EOF) {
494 		switch (opt) {
495 		/* Portmask */
496 		case 'p':
497 			enabled_port_mask = parse_portmask(optarg);
498 			if (enabled_port_mask == 0) {
499 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
500 				us_vhost_usage(prgname);
501 				return -1;
502 			}
503 			break;
504 
505 		case 'P':
506 			promiscuous = 1;
507 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
508 				ETH_VMDQ_ACCEPT_BROADCAST |
509 				ETH_VMDQ_ACCEPT_MULTICAST;
510 
511 			break;
512 
513 		case 0:
514 			/* Enable/disable vm2vm comms. */
515 			if (!strncmp(long_option[option_index].name, "vm2vm",
516 				MAX_LONG_OPT_SZ)) {
517 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
518 				if (ret == -1) {
519 					RTE_LOG(INFO, VHOST_CONFIG,
520 						"Invalid argument for "
521 						"vm2vm [0|1|2]\n");
522 					us_vhost_usage(prgname);
523 					return -1;
524 				} else {
525 					vm2vm_mode = (vm2vm_type)ret;
526 				}
527 			}
528 
529 			/* Enable/disable retries on RX. */
530 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
531 				ret = parse_num_opt(optarg, 1);
532 				if (ret == -1) {
533 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
534 					us_vhost_usage(prgname);
535 					return -1;
536 				} else {
537 					enable_retry = ret;
538 				}
539 			}
540 
541 			/* Enable/disable TX checksum offload. */
542 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
543 				ret = parse_num_opt(optarg, 1);
544 				if (ret == -1) {
545 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
546 					us_vhost_usage(prgname);
547 					return -1;
548 				} else
549 					enable_tx_csum = ret;
550 			}
551 
552 			/* Enable/disable TSO offload. */
553 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
554 				ret = parse_num_opt(optarg, 1);
555 				if (ret == -1) {
556 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
557 					us_vhost_usage(prgname);
558 					return -1;
559 				} else
560 					enable_tso = ret;
561 			}
562 
563 			/* Specify the retry delay time (in microseconds) on RX. */
564 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
565 				ret = parse_num_opt(optarg, INT32_MAX);
566 				if (ret == -1) {
567 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
568 					us_vhost_usage(prgname);
569 					return -1;
570 				} else {
571 					burst_rx_delay_time = ret;
572 				}
573 			}
574 
575 			/* Specify the number of retries on RX. */
576 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
577 				ret = parse_num_opt(optarg, INT32_MAX);
578 				if (ret == -1) {
579 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
580 					us_vhost_usage(prgname);
581 					return -1;
582 				} else {
583 					burst_rx_retry_num = ret;
584 				}
585 			}
586 
587 			/* Enable/disable RX mergeable buffers. */
588 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
589 				ret = parse_num_opt(optarg, 1);
590 				if (ret == -1) {
591 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
592 					us_vhost_usage(prgname);
593 					return -1;
594 				} else {
595 					mergeable = !!ret;
596 					if (ret) {
597 						vmdq_conf_default.rxmode.offloads |=
598 							DEV_RX_OFFLOAD_JUMBO_FRAME;
599 						vmdq_conf_default.rxmode.max_rx_pkt_len
600 							= JUMBO_FRAME_MAX_SIZE;
601 					}
602 				}
603 			}
604 
605 			/* Enable/disable stats. */
606 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
607 				ret = parse_num_opt(optarg, INT32_MAX);
608 				if (ret == -1) {
609 					RTE_LOG(INFO, VHOST_CONFIG,
610 						"Invalid argument for stats [0..N]\n");
611 					us_vhost_usage(prgname);
612 					return -1;
613 				} else {
614 					enable_stats = ret;
615 				}
616 			}
617 
618 			/* Set socket file path. */
619 			if (!strncmp(long_option[option_index].name,
620 						"socket-file", MAX_LONG_OPT_SZ)) {
621 				if (us_vhost_parse_socket_path(optarg) == -1) {
622 					RTE_LOG(INFO, VHOST_CONFIG,
623 					"Invalid argument for socket name (Max %d characters)\n",
624 					PATH_MAX);
625 					us_vhost_usage(prgname);
626 					return -1;
627 				}
628 			}
629 
630 			break;
631 
632 			/* Invalid option - print options. */
633 		default:
634 			us_vhost_usage(prgname);
635 			return -1;
636 		}
637 	}
638 
639 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
640 		if (enabled_port_mask & (1 << i))
641 			ports[num_ports++] = i;
642 	}
643 
644 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
645 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
646 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
647 		return -1;
648 	}
649 
650 	return 0;
651 }
652 
653 /*
654  * Update the global var NUM_PORTS and array PORTS according to the number of
655  * system ports, and return the number of valid ports.
656  */
657 static unsigned check_ports_num(unsigned nb_ports)
658 {
659 	unsigned valid_num_ports = num_ports;
660 	unsigned portid;
661 
662 	if (num_ports > nb_ports) {
663 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
664 			num_ports, nb_ports);
665 		num_ports = nb_ports;
666 	}
667 
668 	for (portid = 0; portid < num_ports; portid ++) {
669 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
670 			RTE_LOG(INFO, VHOST_PORT,
671 				"\nSpecified port ID(%u) is not valid\n",
672 				ports[portid]);
673 			ports[portid] = INVALID_PORT_ID;
674 			valid_num_ports--;
675 		}
676 	}
677 	return valid_num_ports;
678 }
679 
680 static __rte_always_inline struct vhost_dev *
681 find_vhost_dev(struct ether_addr *mac)
682 {
683 	struct vhost_dev *vdev;
684 
685 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
686 		if (vdev->ready == DEVICE_RX &&
687 		    is_same_ether_addr(mac, &vdev->mac_address))
688 			return vdev;
689 	}
690 
691 	return NULL;
692 }
693 
694 /*
695  * This function learns the MAC address of the device and registers it, along with a
696  * VLAN tag, with the VMDQ.
697  */
698 static int
699 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
700 {
701 	struct ether_hdr *pkt_hdr;
702 	int i, ret;
703 
704 	/* Learn MAC address of guest device from packet */
705 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
706 
707 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
708 		RTE_LOG(ERR, VHOST_DATA,
709 			"(%d) device is using a registered MAC!\n",
710 			vdev->vid);
711 		return -1;
712 	}
713 
714 	for (i = 0; i < ETHER_ADDR_LEN; i++)
715 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
716 
717 	/* vlan_tag currently uses the device_id. */
718 	vdev->vlan_tag = vlan_tags[vdev->vid];
719 
720 	/* Print out VMDQ registration info. */
721 	RTE_LOG(INFO, VHOST_DATA,
722 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
723 		vdev->vid,
724 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
725 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
726 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
727 		vdev->vlan_tag);
728 
729 	/* Register the MAC address. */
730 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
731 				(uint32_t)vdev->vid + vmdq_pool_base);
732 	if (ret)
733 		RTE_LOG(ERR, VHOST_DATA,
734 			"(%d) failed to add device MAC address to VMDQ\n",
735 			vdev->vid);
736 
737 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
738 
739 	/* Set device as ready for RX. */
740 	vdev->ready = DEVICE_RX;
741 
742 	return 0;
743 }
744 
745 /*
746  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
747  * queue before disabling RX on the device.
748  */
749 static inline void
750 unlink_vmdq(struct vhost_dev *vdev)
751 {
752 	unsigned i = 0;
753 	unsigned rx_count;
754 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
755 
756 	if (vdev->ready == DEVICE_RX) {
757 		/*clear MAC and VLAN settings*/
758 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
759 		for (i = 0; i < 6; i++)
760 			vdev->mac_address.addr_bytes[i] = 0;
761 
762 		vdev->vlan_tag = 0;
763 
764 		/*Clear out the receive buffers*/
765 		rx_count = rte_eth_rx_burst(ports[0],
766 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
767 
768 		while (rx_count) {
769 			for (i = 0; i < rx_count; i++)
770 				rte_pktmbuf_free(pkts_burst[i]);
771 
772 			rx_count = rte_eth_rx_burst(ports[0],
773 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
774 		}
775 
776 		vdev->ready = DEVICE_MAC_LEARNING;
777 	}
778 }
779 
780 static __rte_always_inline void
781 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
782 	    struct rte_mbuf *m)
783 {
784 	uint16_t ret;
785 
786 	if (builtin_net_driver) {
787 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
788 	} else {
789 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
790 	}
791 
792 	if (enable_stats) {
793 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
794 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
795 		src_vdev->stats.tx_total++;
796 		src_vdev->stats.tx += ret;
797 	}
798 }
799 
800 /*
801  * Check if the packet destination MAC address is for a local device. If so then put
802  * the packet on that device's RX queue. If not then return.
803  */
804 static __rte_always_inline int
805 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
806 {
807 	struct ether_hdr *pkt_hdr;
808 	struct vhost_dev *dst_vdev;
809 
810 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
811 
812 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
813 	if (!dst_vdev)
814 		return -1;
815 
816 	if (vdev->vid == dst_vdev->vid) {
817 		RTE_LOG_DP(DEBUG, VHOST_DATA,
818 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
819 			vdev->vid);
820 		return 0;
821 	}
822 
823 	RTE_LOG_DP(DEBUG, VHOST_DATA,
824 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
825 
826 	if (unlikely(dst_vdev->remove)) {
827 		RTE_LOG_DP(DEBUG, VHOST_DATA,
828 			"(%d) device is marked for removal\n", dst_vdev->vid);
829 		return 0;
830 	}
831 
832 	virtio_xmit(dst_vdev, vdev, m);
833 	return 0;
834 }
835 
836 /*
837  * Check if the destination MAC of a packet belongs to a local VM,
838  * and if so, get its VLAN tag and the offset.
839  */
840 static __rte_always_inline int
841 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
842 	uint32_t *offset, uint16_t *vlan_tag)
843 {
844 	struct vhost_dev *dst_vdev;
845 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
846 
847 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
848 	if (!dst_vdev)
849 		return 0;
850 
851 	if (vdev->vid == dst_vdev->vid) {
852 		RTE_LOG_DP(DEBUG, VHOST_DATA,
853 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
854 			vdev->vid);
855 		return -1;
856 	}
857 
858 	/*
859 	 * HW VLAN strip reduces the packet length by the
860 	 * length of the VLAN tag, so the packet length needs
861 	 * to be restored by adding it back.
862 	 */
863 	*offset  = VLAN_HLEN;
864 	*vlan_tag = vlan_tags[vdev->vid];
865 
866 	RTE_LOG_DP(DEBUG, VHOST_DATA,
867 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
868 		vdev->vid, dst_vdev->vid, *vlan_tag);
869 
870 	return 0;
871 }
872 
873 static uint16_t
874 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
875 {
876 	if (ol_flags & PKT_TX_IPV4)
877 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
878 	else /* assume ethertype == ETHER_TYPE_IPv6 */
879 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
880 }
881 
882 static void virtio_tx_offload(struct rte_mbuf *m)
883 {
884 	void *l3_hdr;
885 	struct ipv4_hdr *ipv4_hdr = NULL;
886 	struct tcp_hdr *tcp_hdr = NULL;
887 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
888 
889 	l3_hdr = (char *)eth_hdr + m->l2_len;
890 
891 	if (m->ol_flags & PKT_TX_IPV4) {
892 		ipv4_hdr = l3_hdr;
893 		ipv4_hdr->hdr_checksum = 0;
894 		m->ol_flags |= PKT_TX_IP_CKSUM;
895 	}
896 
897 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
898 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
899 }
900 
901 static inline void
902 free_pkts(struct rte_mbuf **pkts, uint16_t n)
903 {
904 	while (n--)
905 		rte_pktmbuf_free(pkts[n]);
906 }
907 
908 static __rte_always_inline void
909 do_drain_mbuf_table(struct mbuf_table *tx_q)
910 {
911 	uint16_t count;
912 
913 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
914 				 tx_q->m_table, tx_q->len);
915 	if (unlikely(count < tx_q->len))
916 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
917 
918 	tx_q->len = 0;
919 }
920 
921 /*
922  * This function routes the TX packet to the correct interface. This
923  * may be a local device or the physical port.
924  */
925 static __rte_always_inline void
926 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
927 {
928 	struct mbuf_table *tx_q;
929 	unsigned offset = 0;
930 	const uint16_t lcore_id = rte_lcore_id();
931 	struct ether_hdr *nh;
932 
933 
934 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
935 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
936 		struct vhost_dev *vdev2;
937 
938 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
939 			if (vdev2 != vdev)
940 				virtio_xmit(vdev2, vdev, m);
941 		}
942 		goto queue2nic;
943 	}
944 
945 	/*check if destination is local VM*/
946 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
947 		rte_pktmbuf_free(m);
948 		return;
949 	}
950 
951 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
952 		if (unlikely(find_local_dest(vdev, m, &offset,
953 					     &vlan_tag) != 0)) {
954 			rte_pktmbuf_free(m);
955 			return;
956 		}
957 	}
958 
959 	RTE_LOG_DP(DEBUG, VHOST_DATA,
960 		"(%d) TX: MAC address is external\n", vdev->vid);
961 
962 queue2nic:
963 
964 	/*Add packet to the port tx queue*/
965 	tx_q = &lcore_tx_queue[lcore_id];
966 
967 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
968 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
969 		/* Guest has inserted the vlan tag. */
970 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
971 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
972 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
973 			(vh->vlan_tci != vlan_tag_be))
974 			vh->vlan_tci = vlan_tag_be;
975 	} else {
976 		m->ol_flags |= PKT_TX_VLAN_PKT;
977 
978 		/*
979 		 * Find the right seg to adjust the data len when offset is
980 		 * bigger than tail room size.
981 		 */
982 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
983 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
984 				m->data_len += offset;
985 			else {
986 				struct rte_mbuf *seg = m;
987 
988 				while ((seg->next != NULL) &&
989 					(offset > rte_pktmbuf_tailroom(seg)))
990 					seg = seg->next;
991 
992 				seg->data_len += offset;
993 			}
994 			m->pkt_len += offset;
995 		}
996 
997 		m->vlan_tci = vlan_tag;
998 	}
999 
1000 	if (m->ol_flags & PKT_TX_TCP_SEG)
1001 		virtio_tx_offload(m);
1002 
1003 	tx_q->m_table[tx_q->len++] = m;
1004 	if (enable_stats) {
1005 		vdev->stats.tx_total++;
1006 		vdev->stats.tx++;
1007 	}
1008 
1009 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1010 		do_drain_mbuf_table(tx_q);
1011 }
1012 
1013 
1014 static __rte_always_inline void
1015 drain_mbuf_table(struct mbuf_table *tx_q)
1016 {
1017 	static uint64_t prev_tsc;
1018 	uint64_t cur_tsc;
1019 
1020 	if (tx_q->len == 0)
1021 		return;
1022 
1023 	cur_tsc = rte_rdtsc();
1024 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1025 		prev_tsc = cur_tsc;
1026 
1027 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1028 			"TX queue drained after timeout with burst size %u\n",
1029 			tx_q->len);
1030 		do_drain_mbuf_table(tx_q);
1031 	}
1032 }
1033 
1034 static __rte_always_inline void
1035 drain_eth_rx(struct vhost_dev *vdev)
1036 {
1037 	uint16_t rx_count, enqueue_count;
1038 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1039 
1040 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1041 				    pkts, MAX_PKT_BURST);
1042 	if (!rx_count)
1043 		return;
1044 
1045 	/*
1046 	 * When "enable_retry" is set, here we wait and retry when there
1047 	 * are not enough free slots in the queue to hold @rx_count packets,
1048 	 * to diminish packet loss.
1049 	 */
1050 	if (enable_retry &&
1051 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1052 			VIRTIO_RXQ))) {
1053 		uint32_t retry;
1054 
1055 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1056 			rte_delay_us(burst_rx_delay_time);
1057 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1058 					VIRTIO_RXQ))
1059 				break;
1060 		}
1061 	}
1062 
1063 	if (builtin_net_driver) {
1064 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1065 						pkts, rx_count);
1066 	} else {
1067 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1068 						pkts, rx_count);
1069 	}
1070 	if (enable_stats) {
1071 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1072 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1073 	}
1074 
1075 	free_pkts(pkts, rx_count);
1076 }
1077 
1078 static __rte_always_inline void
1079 drain_virtio_tx(struct vhost_dev *vdev)
1080 {
1081 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1082 	uint16_t count;
1083 	uint16_t i;
1084 
1085 	if (builtin_net_driver) {
1086 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1087 					pkts, MAX_PKT_BURST);
1088 	} else {
1089 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1090 					mbuf_pool, pkts, MAX_PKT_BURST);
1091 	}
1092 
1093 	/* setup VMDq for the first packet */
1094 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1095 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1096 			free_pkts(pkts, count);
1097 	}
1098 
1099 	for (i = 0; i < count; ++i)
1100 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1101 }
1102 
1103 /*
1104  * Main function of vhost-switch. It basically does:
1105  *
1106  * for each vhost device {
1107  *    - drain_eth_rx()
1108  *
1109  *      Which drains the host eth Rx queue linked to the vhost device,
1110  *      and delivers all of them to the guest virtio Rx ring associated with
1111  *      this vhost device.
1112  *
1113  *    - drain_virtio_tx()
1114  *
1115  *      Which drains the guest virtio Tx queue and delivers all of them
1116  *      to the target, which could be another vhost device, or the
1117  *      physical eth dev. Routing is done in function "virtio_tx_route".
1118  * }
1119  */
1120 static int
1121 switch_worker(void *arg __rte_unused)
1122 {
1123 	unsigned i;
1124 	unsigned lcore_id = rte_lcore_id();
1125 	struct vhost_dev *vdev;
1126 	struct mbuf_table *tx_q;
1127 
1128 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1129 
1130 	tx_q = &lcore_tx_queue[lcore_id];
1131 	for (i = 0; i < rte_lcore_count(); i++) {
1132 		if (lcore_ids[i] == lcore_id) {
1133 			tx_q->txq_id = i;
1134 			break;
1135 		}
1136 	}
1137 
1138 	while(1) {
1139 		drain_mbuf_table(tx_q);
1140 
1141 		/*
1142 		 * Inform the configuration core that we have exited the
1143 		 * linked list and that no devices are in use if requested.
1144 		 */
1145 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1146 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1147 
1148 		/*
1149 		 * Process vhost devices
1150 		 */
1151 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1152 			      lcore_vdev_entry) {
1153 			if (unlikely(vdev->remove)) {
1154 				unlink_vmdq(vdev);
1155 				vdev->ready = DEVICE_SAFE_REMOVE;
1156 				continue;
1157 			}
1158 
1159 			if (likely(vdev->ready == DEVICE_RX))
1160 				drain_eth_rx(vdev);
1161 
1162 			if (likely(!vdev->remove))
1163 				drain_virtio_tx(vdev);
1164 		}
1165 	}
1166 
1167 	return 0;
1168 }
1169 
1170 /*
1171  * Remove a device from the specific data core linked list and from the
1172  * main linked list. Synchronization occurs through the use of the
1173  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1174  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1175  */
1176 static void
1177 destroy_device(int vid)
1178 {
1179 	struct vhost_dev *vdev = NULL;
1180 	int lcore;
1181 
1182 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1183 		if (vdev->vid == vid)
1184 			break;
1185 	}
1186 	if (!vdev)
1187 		return;
1188 	/*set the remove flag. */
1189 	vdev->remove = 1;
1190 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1191 		rte_pause();
1192 	}
1193 
1194 	if (builtin_net_driver)
1195 		vs_vhost_net_remove(vdev);
1196 
1197 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1198 		     lcore_vdev_entry);
1199 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1200 
1201 
1202 	/* Set the dev_removal_flag on each lcore. */
1203 	RTE_LCORE_FOREACH_SLAVE(lcore)
1204 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1205 
1206 	/*
1207 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1208 	 * we can be sure that they can no longer access the device removed
1209 	 * from the linked lists and that the devices are no longer in use.
1210 	 */
1211 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1212 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1213 			rte_pause();
1214 	}
1215 
1216 	lcore_info[vdev->coreid].device_num--;
1217 
1218 	RTE_LOG(INFO, VHOST_DATA,
1219 		"(%d) device has been removed from data core\n",
1220 		vdev->vid);
1221 
1222 	rte_free(vdev);
1223 }
1224 
1225 /*
1226  * A new device is added to a data core. First the device is added to the main linked list
1227  * and then allocated to a specific data core.
1228  */
1229 static int
1230 new_device(int vid)
1231 {
1232 	int lcore, core_add = 0;
1233 	uint32_t device_num_min = num_devices;
1234 	struct vhost_dev *vdev;
1235 
1236 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1237 	if (vdev == NULL) {
1238 		RTE_LOG(INFO, VHOST_DATA,
1239 			"(%d) couldn't allocate memory for vhost dev\n",
1240 			vid);
1241 		return -1;
1242 	}
1243 	vdev->vid = vid;
1244 
1245 	if (builtin_net_driver)
1246 		vs_vhost_net_setup(vdev);
1247 
1248 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1249 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1250 
1251 	/*reset ready flag*/
1252 	vdev->ready = DEVICE_MAC_LEARNING;
1253 	vdev->remove = 0;
1254 
1255 	/* Find a suitable lcore to add the device. */
1256 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1257 		if (lcore_info[lcore].device_num < device_num_min) {
1258 			device_num_min = lcore_info[lcore].device_num;
1259 			core_add = lcore;
1260 		}
1261 	}
1262 	vdev->coreid = core_add;
1263 
1264 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1265 			  lcore_vdev_entry);
1266 	lcore_info[vdev->coreid].device_num++;
1267 
1268 	/* Disable notifications. */
1269 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1270 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1271 
1272 	RTE_LOG(INFO, VHOST_DATA,
1273 		"(%d) device has been added to data core %d\n",
1274 		vid, vdev->coreid);
1275 
1276 	return 0;
1277 }
1278 
1279 /*
1280  * These callbacks allow devices to be added to the data core when configuration
1281  * has been fully completed.
1282  */
1283 static const struct vhost_device_ops virtio_net_device_ops =
1284 {
1285 	.new_device =  new_device,
1286 	.destroy_device = destroy_device,
1287 };
1288 
1289 /*
1290  * This is a thread that wakes up periodically to print stats if the user has
1291  * enabled them.
1292  */
1293 static void *
1294 print_stats(__rte_unused void *arg)
1295 {
1296 	struct vhost_dev *vdev;
1297 	uint64_t tx_dropped, rx_dropped;
1298 	uint64_t tx, tx_total, rx, rx_total;
1299 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1300 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1301 
1302 	while(1) {
1303 		sleep(enable_stats);
1304 
1305 		/* Clear screen and move to top left */
1306 		printf("%s%s\n", clr, top_left);
1307 		printf("Device statistics =================================\n");
1308 
1309 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1310 			tx_total   = vdev->stats.tx_total;
1311 			tx         = vdev->stats.tx;
1312 			tx_dropped = tx_total - tx;
1313 
1314 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1315 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1316 			rx_dropped = rx_total - rx;
1317 
1318 			printf("Statistics for device %d\n"
1319 				"-----------------------\n"
1320 				"TX total:              %" PRIu64 "\n"
1321 				"TX dropped:            %" PRIu64 "\n"
1322 				"TX successful:         %" PRIu64 "\n"
1323 				"RX total:              %" PRIu64 "\n"
1324 				"RX dropped:            %" PRIu64 "\n"
1325 				"RX successful:         %" PRIu64 "\n",
1326 				vdev->vid,
1327 				tx_total, tx_dropped, tx,
1328 				rx_total, rx_dropped, rx);
1329 		}
1330 
1331 		printf("===================================================\n");
1332 	}
1333 
1334 	return NULL;
1335 }
1336 
1337 static void
1338 unregister_drivers(int socket_num)
1339 {
1340 	int i, ret;
1341 
1342 	for (i = 0; i < socket_num; i++) {
1343 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1344 		if (ret != 0)
1345 			RTE_LOG(ERR, VHOST_CONFIG,
1346 				"Fail to unregister vhost driver for %s.\n",
1347 				socket_files + i * PATH_MAX);
1348 	}
1349 }
1350 
1351 /* When we receive an INT signal, unregister the vhost driver */
1352 static void
1353 sigint_handler(__rte_unused int signum)
1354 {
1355 	/* Unregister vhost driver. */
1356 	unregister_drivers(nb_sockets);
1357 
1358 	exit(0);
1359 }
1360 
1361 /*
1362  * While creating an mbuf pool, one key thing is to figure out how
1363  * many mbuf entries are enough for our use. FYI, here are some
1364  * guidelines:
1365  *
1366  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1367  *
1368  * - For each switch core (a CPU core that does the packet switching), we
1369  *   also need to reserve some mbufs for receiving the packets from the virtio
1370  *   Tx queue. How many are enough depends on the usage. It's normally
1371  *   a simple calculation like the following (a worked example follows this comment):
1372  *
1373  *       MAX_PKT_BURST * max packet size / mbuf size
1374  *
1375  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1376  *
1377  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1378  *   mbufs for receiving the packets from physical NIC device.
1379  *
1380  * - We also need to make sure, for each switch core, we have allocated
1381  *   enough mbufs to fill up the mbuf cache.
1382  */
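/*
 * Worked example of the sizing below (editorial illustration; assumes the
 * default RTE_PKTMBUF_HEADROOM of 128 bytes and a MAX_PKT_BURST of 32, both
 * defined outside this file): with --mergeable 1 (mtu = 9000), one port,
 * three switch cores, mbuf_size = 2176, nr_rx_desc = 1024, nr_queues = 128
 * and a cache of 128 mbufs:
 *
 *     nr_mbufs_per_core = (9000 + 2176) * 32 / (2176 - 128) + 1024 = 1198
 *     nr_mbufs          = 128 * 1024 + 1198 * 3               = 134666
 *
 * so roughly 135k mbufs are requested from rte_pktmbuf_pool_create().
 */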
1383 static void
1384 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1385 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1386 {
1387 	uint32_t nr_mbufs;
1388 	uint32_t nr_mbufs_per_core;
1389 	uint32_t mtu = 1500;
1390 
1391 	if (mergeable)
1392 		mtu = 9000;
1393 	if (enable_tso)
1394 		mtu = 64 * 1024;
1395 
1396 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1397 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1398 	nr_mbufs_per_core += nr_rx_desc;
1399 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1400 
1401 	nr_mbufs  = nr_queues * nr_rx_desc;
1402 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1403 	nr_mbufs *= nr_port;
1404 
1405 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1406 					    nr_mbuf_cache, 0, mbuf_size,
1407 					    rte_socket_id());
1408 	if (mbuf_pool == NULL)
1409 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1410 }
1411 
1412 /*
1413  * Main function, does initialisation and calls the per-lcore functions.
1414  */
1415 int
1416 main(int argc, char *argv[])
1417 {
1418 	unsigned lcore_id, core_id = 0;
1419 	unsigned nb_ports, valid_num_ports;
1420 	int ret, i;
1421 	uint16_t portid;
1422 	static pthread_t tid;
1423 	uint64_t flags = 0;
1424 
1425 	signal(SIGINT, sigint_handler);
1426 
1427 	/* init EAL */
1428 	ret = rte_eal_init(argc, argv);
1429 	if (ret < 0)
1430 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1431 	argc -= ret;
1432 	argv += ret;
1433 
1434 	/* parse app arguments */
1435 	ret = us_vhost_parse_args(argc, argv);
1436 	if (ret < 0)
1437 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1438 
1439 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1440 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1441 
1442 		if (rte_lcore_is_enabled(lcore_id))
1443 			lcore_ids[core_id++] = lcore_id;
1444 	}
1445 
1446 	if (rte_lcore_count() > RTE_MAX_LCORE)
1447 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1448 
1449 	/* Get the number of physical ports. */
1450 	nb_ports = rte_eth_dev_count_avail();
1451 
1452 	/*
1453 	 * Update the global var NUM_PORTS and global array PORTS
1454 	 * and get the value of var VALID_NUM_PORTS according to the number of system ports
1455 	 */
1456 	valid_num_ports = check_ports_num(nb_ports);
1457 
1458 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1459 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1460 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1461 		return -1;
1462 	}
1463 
1464 	/*
1465 	 * FIXME: here we are trying to allocate mbufs big enough for
1466 	 * @MAX_QUEUES, but the truth is we're never going to use that
1467 	 * many queues here. We probably should only do allocation for
1468 	 * those queues we are going to use.
1469 	 */
1470 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1471 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1472 
1473 	if (vm2vm_mode == VM2VM_HARDWARE) {
1474 		/* Enable VT loop back to let L2 switch to do it. */
1475 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1476 		RTE_LOG(DEBUG, VHOST_CONFIG,
1477 			"Enable loop back for L2 switch in vmdq.\n");
1478 	}
1479 
1480 	/* initialize all ports */
1481 	RTE_ETH_FOREACH_DEV(portid) {
1482 		/* skip ports that are not enabled */
1483 		if ((enabled_port_mask & (1 << portid)) == 0) {
1484 			RTE_LOG(INFO, VHOST_PORT,
1485 				"Skipping disabled port %d\n", portid);
1486 			continue;
1487 		}
1488 		if (port_init(portid) != 0)
1489 			rte_exit(EXIT_FAILURE,
1490 				"Cannot initialize network ports\n");
1491 	}
1492 
1493 	/* Enable stats if the user option is set. */
1494 	if (enable_stats) {
1495 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1496 					print_stats, NULL);
1497 		if (ret < 0)
1498 			rte_exit(EXIT_FAILURE,
1499 				"Cannot create print-stats thread\n");
1500 	}
1501 
1502 	/* Launch all data cores. */
1503 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1504 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1505 
1506 	if (client_mode)
1507 		flags |= RTE_VHOST_USER_CLIENT;
1508 
1509 	if (dequeue_zero_copy)
1510 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1511 
1512 	/* Register vhost user driver to handle vhost messages. */
1513 	for (i = 0; i < nb_sockets; i++) {
1514 		char *file = socket_files + i * PATH_MAX;
1515 		ret = rte_vhost_driver_register(file, flags);
1516 		if (ret != 0) {
1517 			unregister_drivers(i);
1518 			rte_exit(EXIT_FAILURE,
1519 				"vhost driver register failure.\n");
1520 		}
1521 
1522 		if (builtin_net_driver)
1523 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1524 
1525 		if (mergeable == 0) {
1526 			rte_vhost_driver_disable_features(file,
1527 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1528 		}
1529 
1530 		if (enable_tx_csum == 0) {
1531 			rte_vhost_driver_disable_features(file,
1532 				1ULL << VIRTIO_NET_F_CSUM);
1533 		}
1534 
1535 		if (enable_tso == 0) {
1536 			rte_vhost_driver_disable_features(file,
1537 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1538 			rte_vhost_driver_disable_features(file,
1539 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1540 			rte_vhost_driver_disable_features(file,
1541 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1542 			rte_vhost_driver_disable_features(file,
1543 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1544 		}
1545 
1546 		if (promiscuous) {
1547 			rte_vhost_driver_enable_features(file,
1548 				1ULL << VIRTIO_NET_F_CTRL_RX);
1549 		}
1550 
1551 		ret = rte_vhost_driver_callback_register(file,
1552 			&virtio_net_device_ops);
1553 		if (ret != 0) {
1554 			rte_exit(EXIT_FAILURE,
1555 				"failed to register vhost driver callbacks.\n");
1556 		}
1557 
1558 		if (rte_vhost_driver_start(file) < 0) {
1559 			rte_exit(EXIT_FAILURE,
1560 				"failed to start vhost driver.\n");
1561 		}
1562 	}
1563 
1564 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1565 		rte_eal_wait_lcore(lcore_id);
1566 
1567 	return 0;
1568 
1569 }
1570